* [PATCH v3 3/9] md: superblock changes for PPL
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>
Include information about PPL location and size in mdp_superblock_1
and copy it to/from rdev. Because PPL is mutually exclusive with the bitmap,
put it in place of 'bitmap_offset'. Add a new flag MD_FEATURE_PPL for
'feature_map', analogous to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL
to mddev->flags to indicate that PPL is enabled on an array.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/md.c | 15 +++++++++++++++
drivers/md/md.h | 8 ++++++++
drivers/md/raid0.c | 3 ++-
drivers/md/raid1.c | 3 ++-
include/uapi/linux/raid/md_p.h | 18 ++++++++++++++----
5 files changed, 41 insertions(+), 6 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 85ac98417a08..e96f73572e23 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1566,6 +1566,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
} else if (sb->bblog_offset != 0)
rdev->badblocks.shift = 0;
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
+ rdev->ppl.size = le16_to_cpu(sb->ppl.size);
+ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
+ }
+
if (!refdev) {
ret = 1;
} else {
@@ -1678,6 +1684,9 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
+
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL)
+ set_bit(MD_HAS_PPL, &mddev->flags);
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
* spares (which don't need an event count) */
@@ -1891,6 +1900,12 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
+ if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+ sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
+ sb->ppl.size = cpu_to_le16(rdev->ppl.size);
+ }
+
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 968bbe72b237..abdb5f2ed2d3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,6 +122,13 @@ struct md_rdev {
* sysfs entry */
struct badblocks badblocks;
+
+ struct {
+ short offset; /* Offset from superblock to start of PPL.
+ * Not used by external metadata. */
+ unsigned int size; /* Size in sectors of the PPL space */
+ sector_t sector; /* First sector of the PPL space */
+ } ppl;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
@@ -229,6 +236,7 @@ enum mddev_flags {
* supported as calls to md_error() will
* never cause the array to become failed.
*/
+ MD_HAS_PPL, /* The raid array has PPL feature set */
};
enum mddev_sb_flags {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 5b3db367814a..37fc1f5185a9 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,7 +29,8 @@
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
- (1L << MD_FAILFAST_SUPPORTED))
+ (1L << MD_FAILFAST_SUPPORTED) |\
+ (1L << MD_HAS_PPL))
static int raid0_congested(struct mddev *mddev, int bits)
{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b0f647bcccb..53623a31b074 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -44,7 +44,8 @@
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
- (1L << MD_JOURNAL_CLEAN))
+ (1L << MD_JOURNAL_CLEAN) | \
+ (1L << MD_HAS_PPL))
/*
* Number of guaranteed r1bios in case of extreme VM load:
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 9930f3e9040f..fe2112810c43 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -242,10 +242,18 @@ struct mdp_superblock_1 {
__le32 chunksize; /* in 512byte sectors */
__le32 raid_disks;
- __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
- * NOTE: signed, so bitmap can be before superblock
- * only meaningful of feature_map[0] is set.
- */
+ union {
+ __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
+ * NOTE: signed, so bitmap can be before superblock
+ * only meaningful of feature_map[0] is set.
+ */
+
+ /* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+ struct {
+ __le16 offset; /* sectors from start of superblock that ppl starts (signed) */
+ __le16 size; /* ppl size in sectors */
+ } ppl;
+ };
/* These are only valid with feature bit '4' */
__le32 new_level; /* new level we are reshaping to */
@@ -318,6 +326,7 @@ struct mdp_superblock_1 {
*/
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
+#define MD_FEATURE_PPL 1024 /* support PPL */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@@ -328,6 +337,7 @@ struct mdp_superblock_1 {
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
+ |MD_FEATURE_PPL \
)
struct r5l_payload_header {
--
2.11.0
^ permalink raw reply related
* [PATCH v3 2/9] raid5-cache: add policy logic
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>
Add wrappers for public log functions from raid5-cache and a struct
r5l_policy containing handlers for the log operations. This allows
adding different policies for raid5 logging without changing the
mechanism - calls from the raid5 personality stay the same.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-cache.c | 116 +++++++++++++++++++++++++++++++++++++----------
drivers/md/raid5-cache.h | 14 ++++++
2 files changed, 106 insertions(+), 24 deletions(-)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 394d87b62efa..6fac581804a9 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -787,7 +787,7 @@ static inline void r5l_add_no_space_stripe(struct r5l_log *log,
* running in raid5d, where reclaim could wait for raid5d too (when it flushes
* data from log to raid disks), so we shouldn't wait for reclaim here
*/
-int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+static int __r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
int write_disks = 0;
@@ -797,8 +797,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
int ret = 0;
bool wake_reclaim = false;
- if (!log)
- return -EAGAIN;
/* Don't support stripe batch */
if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
test_bit(STRIPE_SYNCING, &sh->state)) {
@@ -885,19 +883,28 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
return 0;
}
-void r5l_write_stripe_run(struct r5l_log *log)
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+ if (log && log->policy->write_stripe)
+ return log->policy->write_stripe(log, sh);
+ return -EAGAIN;
+}
+
+static void __r5l_write_stripe_run(struct r5l_log *log)
{
- if (!log)
- return;
mutex_lock(&log->io_mutex);
r5l_submit_current_io(log);
mutex_unlock(&log->io_mutex);
}
-int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+void r5l_write_stripe_run(struct r5l_log *log)
+{
+ if (log && log->policy->write_stripe_run)
+ log->policy->write_stripe_run(log);
+}
+
+static int __r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
- if (!log)
- return -ENODEV;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
/*
@@ -929,6 +936,13 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
return -EAGAIN;
}
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+{
+ if (log && log->policy->handle_flush_request)
+ return log->policy->handle_flush_request(log, bio);
+ return -ENODEV;
+}
+
/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
@@ -1049,8 +1063,9 @@ void r5l_stripe_write_finished(struct stripe_head *sh)
io = sh->log_io;
sh->log_io = NULL;
- if (io && atomic_dec_and_test(&io->pending_stripe))
- __r5l_stripe_write_finished(io);
+ if (io && atomic_dec_and_test(&io->pending_stripe) &&
+ io->log->policy->stripe_write_finished)
+ io->log->policy->stripe_write_finished(io);
}
static void r5l_log_flush_endio(struct bio *bio)
@@ -1084,11 +1099,11 @@ static void r5l_log_flush_endio(struct bio *bio)
* only write stripes of an io_unit to raid disks till the io_unit is the first
* one whose data/parity is in log.
*/
-void r5l_flush_stripe_to_raid(struct r5l_log *log)
+static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
{
bool do_flush;
- if (!log || !log->need_cache_flush)
+ if (!log->need_cache_flush)
return;
spin_lock_irq(&log->io_list_lock);
@@ -1110,6 +1125,12 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log)
submit_bio(&log->flush_bio);
}
+void r5l_flush_stripe_to_raid(struct r5l_log *log)
+{
+ if (log && log->policy->flush_stripe_to_raid)
+ log->policy->flush_stripe_to_raid(log);
+}
+
static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
sector_t end)
@@ -1366,10 +1387,10 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
md_wakeup_thread(log->reclaim_thread);
}
-void r5l_quiesce(struct r5l_log *log, int state)
+static void __r5l_quiesce(struct r5l_log *log, int state)
{
struct mddev *mddev;
- if (!log || state == 2)
+ if (state == 2)
return;
if (state == 0)
kthread_unpark(log->reclaim_thread->tsk);
@@ -1383,6 +1404,12 @@ void r5l_quiesce(struct r5l_log *log, int state)
}
}
+void r5l_quiesce(struct r5l_log *log, int state)
+{
+ if (log && log->policy->quiesce)
+ log->policy->quiesce(log, state);
+}
+
bool r5l_log_disk_error(struct r5conf *conf)
{
struct r5l_log *log;
@@ -2626,11 +2653,10 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
schedule_work(&log->disable_writeback_work);
}
-int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+static int __r5l_init_log(struct r5l_log *log, struct r5conf *conf)
{
+ struct md_rdev *rdev = log->rdev;
struct request_queue *q = bdev_get_queue(rdev->bdev);
- struct r5l_log *log;
-
if (PAGE_SIZE != 4096)
return -EINVAL;
@@ -2649,10 +2675,6 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
return -EINVAL;
}
- log = kzalloc(sizeof(*log), GFP_KERNEL);
- if (!log)
- return -ENOMEM;
- log->rdev = rdev;
log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
@@ -2728,11 +2750,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
io_pool:
kmem_cache_destroy(log->io_kc);
io_kc:
- kfree(log);
return -EINVAL;
}
-void r5l_exit_log(struct r5l_log *log)
+static void __r5l_exit_log(struct r5l_log *log)
{
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
@@ -2740,5 +2761,52 @@ void r5l_exit_log(struct r5l_log *log)
bioset_free(log->bs);
mempool_destroy(log->io_pool);
kmem_cache_destroy(log->io_kc);
+}
+
+void r5l_exit_log(struct r5l_log *log)
+{
+ if (!log)
+ return;
+
+ if (log->policy->exit_log)
+ log->policy->exit_log(log);
+
kfree(log);
}
+
+struct r5l_policy r5l_journal = {
+ .init_log = __r5l_init_log,
+ .exit_log = __r5l_exit_log,
+ .write_stripe = __r5l_write_stripe,
+ .write_stripe_run = __r5l_write_stripe_run,
+ .flush_stripe_to_raid = __r5l_flush_stripe_to_raid,
+ .stripe_write_finished = __r5l_stripe_write_finished,
+ .handle_flush_request = __r5l_handle_flush_request,
+ .quiesce = __r5l_quiesce,
+};
+
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+{
+ int ret;
+ struct r5l_log *log;
+ struct mddev *mddev = conf->mddev;
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ return -ENOMEM;
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ log->policy = &r5l_journal;
+ } else {
+ kfree(log);
+ return -EINVAL;
+ }
+
+ log->rdev = rdev;
+
+ ret = log->policy->init_log(log, conf);
+ if (ret)
+ kfree(log);
+
+ return ret;
+}
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 96dc95d4a36c..97803f3ae0fe 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -84,6 +84,9 @@ struct r5l_log {
/* to for chunk_aligned_read in writeback mode, details below */
spinlock_t tree_lock;
struct radix_tree_root big_stripe_tree;
+
+ /* handlers for log operations */
+ struct r5l_policy *policy;
};
/*
@@ -133,6 +136,17 @@ enum r5l_io_unit_state {
IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
};
+struct r5l_policy {
+ int (*init_log)(struct r5l_log *log, struct r5conf *conf);
+ void (*exit_log)(struct r5l_log *log);
+ int (*write_stripe)(struct r5l_log *log, struct stripe_head *sh);
+ void (*write_stripe_run)(struct r5l_log *log);
+ void (*flush_stripe_to_raid)(struct r5l_log *log);
+ void (*stripe_write_finished)(struct r5l_io_unit *io);
+ int (*handle_flush_request)(struct r5l_log *log, struct bio *bio);
+ void (*quiesce)(struct r5l_log *log, int state);
+};
+
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
--
2.11.0
^ permalink raw reply related
* [PATCH v3 1/9] raid5-cache: move declarations to separate header
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>
Subsequent patches will reuse raid5-cache structures and functions, so
move their declarations into a separate header.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-cache.c | 133 +------------------------------------
drivers/md/raid5-cache.h | 166 +++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 1 +
drivers/md/raid5.h | 30 ---------
4 files changed, 168 insertions(+), 162 deletions(-)
create mode 100644 drivers/md/raid5-cache.h
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 76c0e5063f1b..394d87b62efa 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -24,6 +24,7 @@
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
+#include "raid5-cache.h"
/*
* metadata/data stored in disk with 4k size unit (a block) regardless
@@ -53,16 +54,6 @@
*/
#define R5L_POOL_SIZE 4
-/*
- * r5c journal modes of the array: write-back or write-through.
- * write-through mode has identical behavior as existing log only
- * implementation.
- */
-enum r5c_journal_mode {
- R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
- R5C_JOURNAL_MODE_WRITE_BACK = 1,
-};
-
static char *r5c_journal_mode_str[] = {"write-through",
"write-back"};
/*
@@ -96,81 +87,6 @@ static char *r5c_journal_mode_str[] = {"write-through",
* - return IO for pending writes
*/
-struct r5l_log {
- struct md_rdev *rdev;
-
- u32 uuid_checksum;
-
- sector_t device_size; /* log device size, round to
- * BLOCK_SECTORS */
- sector_t max_free_space; /* reclaim run if free space is at
- * this size */
-
- sector_t last_checkpoint; /* log tail. where recovery scan
- * starts from */
- u64 last_cp_seq; /* log tail sequence */
-
- sector_t log_start; /* log head. where new data appends */
- u64 seq; /* log head sequence */
-
- sector_t next_checkpoint;
-
- struct mutex io_mutex;
- struct r5l_io_unit *current_io; /* current io_unit accepting new data */
-
- spinlock_t io_list_lock;
- struct list_head running_ios; /* io_units which are still running,
- * and have not yet been completely
- * written to the log */
- struct list_head io_end_ios; /* io_units which have been completely
- * written to the log but not yet written
- * to the RAID */
- struct list_head flushing_ios; /* io_units which are waiting for log
- * cache flush */
- struct list_head finished_ios; /* io_units which settle down in log disk */
- struct bio flush_bio;
-
- struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
-
- struct kmem_cache *io_kc;
- mempool_t *io_pool;
- struct bio_set *bs;
- mempool_t *meta_pool;
-
- struct md_thread *reclaim_thread;
- unsigned long reclaim_target; /* number of space that need to be
- * reclaimed. if it's 0, reclaim spaces
- * used by io_units which are in
- * IO_UNIT_STRIPE_END state (eg, reclaim
- * dones't wait for specific io_unit
- * switching to IO_UNIT_STRIPE_END
- * state) */
- wait_queue_head_t iounit_wait;
-
- struct list_head no_space_stripes; /* pending stripes, log has no space */
- spinlock_t no_space_stripes_lock;
-
- bool need_cache_flush;
-
- /* for r5c_cache */
- enum r5c_journal_mode r5c_journal_mode;
-
- /* all stripes in r5cache, in the order of seq at sh->log_start */
- struct list_head stripe_in_journal_list;
-
- spinlock_t stripe_in_journal_lock;
- atomic_t stripe_in_journal_count;
-
- /* to submit async io_units, to fulfill ordering of flush */
- struct work_struct deferred_io_work;
- /* to disable write back during in degraded mode */
- struct work_struct disable_writeback_work;
-
- /* to for chunk_aligned_read in writeback mode, details below */
- spinlock_t tree_lock;
- struct radix_tree_root big_stripe_tree;
-};
-
/*
* Enable chunk_aligned_read() with write back cache.
*
@@ -218,53 +134,6 @@ static inline sector_t r5c_tree_index(struct r5conf *conf,
return sect;
}
-/*
- * an IO range starts from a meta data block and end at the next meta data
- * block. The io unit's the meta data block tracks data/parity followed it. io
- * unit is written to log disk with normal write, as we always flush log disk
- * first and then start move data to raid disks, there is no requirement to
- * write io unit with FLUSH/FUA
- */
-struct r5l_io_unit {
- struct r5l_log *log;
-
- struct page *meta_page; /* store meta block */
- int meta_offset; /* current offset in meta_page */
-
- struct bio *current_bio;/* current_bio accepting new data */
-
- atomic_t pending_stripe;/* how many stripes not flushed to raid */
- u64 seq; /* seq number of the metablock */
- sector_t log_start; /* where the io_unit starts */
- sector_t log_end; /* where the io_unit ends */
- struct list_head log_sibling; /* log->running_ios */
- struct list_head stripe_list; /* stripes added to the io_unit */
-
- int state;
- bool need_split_bio;
- struct bio *split_bio;
-
- unsigned int has_flush:1; /* include flush request */
- unsigned int has_fua:1; /* include fua request */
- unsigned int has_null_flush:1; /* include empty flush request */
- /*
- * io isn't sent yet, flush/fua request can only be submitted till it's
- * the first IO in running_ios list
- */
- unsigned int io_deferred:1;
-
- struct bio_list flush_barriers; /* size == 0 flush bios */
-};
-
-/* r5l_io_unit state */
-enum r5l_io_unit_state {
- IO_UNIT_RUNNING = 0, /* accepting new IO */
- IO_UNIT_IO_START = 1, /* io_unit bio start writing to log,
- * don't accepting new bio */
- IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
- IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
-};
-
bool r5c_is_writeback(struct r5l_log *log)
{
return (log != NULL &&
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
new file mode 100644
index 000000000000..96dc95d4a36c
--- /dev/null
+++ b/drivers/md/raid5-cache.h
@@ -0,0 +1,166 @@
+#ifndef _RAID5_CACHE_H
+#define _RAID5_CACHE_H
+
+/*
+ * r5c journal modes of the array: write-back or write-through.
+ * write-through mode has identical behavior as existing log only
+ * implementation.
+ */
+enum r5c_journal_mode {
+ R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
+ R5C_JOURNAL_MODE_WRITE_BACK = 1,
+};
+
+struct r5l_log {
+ struct md_rdev *rdev;
+
+ u32 uuid_checksum;
+
+ sector_t device_size; /* log device size, round to
+ * BLOCK_SECTORS */
+ sector_t max_free_space; /* reclaim run if free space is at
+ * this size */
+
+ sector_t last_checkpoint; /* log tail. where recovery scan
+ * starts from */
+ u64 last_cp_seq; /* log tail sequence */
+
+ sector_t log_start; /* log head. where new data appends */
+ u64 seq; /* log head sequence */
+
+ sector_t next_checkpoint;
+
+ struct mutex io_mutex;
+ struct r5l_io_unit *current_io; /* current io_unit accepting new data */
+
+ spinlock_t io_list_lock;
+ struct list_head running_ios; /* io_units which are still running,
+ * and have not yet been completely
+ * written to the log */
+ struct list_head io_end_ios; /* io_units which have been completely
+ * written to the log but not yet written
+ * to the RAID */
+ struct list_head flushing_ios; /* io_units which are waiting for log
+ * cache flush */
+ struct list_head finished_ios; /* io_units which settle down in log disk */
+ struct bio flush_bio;
+
+ struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
+
+ struct kmem_cache *io_kc;
+ mempool_t *io_pool;
+ struct bio_set *bs;
+ mempool_t *meta_pool;
+
+ struct md_thread *reclaim_thread;
+ unsigned long reclaim_target; /* number of space that need to be
+ * reclaimed. if it's 0, reclaim spaces
+ * used by io_units which are in
+ * IO_UNIT_STRIPE_END state (eg, reclaim
+ * dones't wait for specific io_unit
+ * switching to IO_UNIT_STRIPE_END
+ * state) */
+ wait_queue_head_t iounit_wait;
+
+ struct list_head no_space_stripes; /* pending stripes, log has no space */
+ spinlock_t no_space_stripes_lock;
+
+ bool need_cache_flush;
+
+ /* for r5c_cache */
+ enum r5c_journal_mode r5c_journal_mode;
+
+ /* all stripes in r5cache, in the order of seq at sh->log_start */
+ struct list_head stripe_in_journal_list;
+
+ spinlock_t stripe_in_journal_lock;
+ atomic_t stripe_in_journal_count;
+
+ /* to submit async io_units, to fulfill ordering of flush */
+ struct work_struct deferred_io_work;
+ /* to disable write back during in degraded mode */
+ struct work_struct disable_writeback_work;
+
+ /* to for chunk_aligned_read in writeback mode, details below */
+ spinlock_t tree_lock;
+ struct radix_tree_root big_stripe_tree;
+};
+
+/*
+ * an IO range starts from a meta data block and end at the next meta data
+ * block. The io unit's the meta data block tracks data/parity followed it. io
+ * unit is written to log disk with normal write, as we always flush log disk
+ * first and then start move data to raid disks, there is no requirement to
+ * write io unit with FLUSH/FUA
+ */
+struct r5l_io_unit {
+ struct r5l_log *log;
+
+ struct page *meta_page; /* store meta block */
+ int meta_offset; /* current offset in meta_page */
+
+ struct bio *current_bio;/* current_bio accepting new data */
+
+ atomic_t pending_stripe;/* how many stripes not flushed to raid */
+ u64 seq; /* seq number of the metablock */
+ sector_t log_start; /* where the io_unit starts */
+ sector_t log_end; /* where the io_unit ends */
+ struct list_head log_sibling; /* log->running_ios */
+ struct list_head stripe_list; /* stripes added to the io_unit */
+
+ int state;
+ bool need_split_bio;
+ struct bio *split_bio;
+
+ unsigned int has_flush:1; /* include flush request */
+ unsigned int has_fua:1; /* include fua request */
+ unsigned int has_null_flush:1; /* include empty flush request */
+ /*
+ * io isn't sent yet, flush/fua request can only be submitted till it's
+ * the first IO in running_ios list
+ */
+ unsigned int io_deferred:1;
+
+ struct bio_list flush_barriers; /* size == 0 flush bios */
+};
+
+/* r5l_io_unit state */
+enum r5l_io_unit_state {
+ IO_UNIT_RUNNING = 0, /* accepting new IO */
+ IO_UNIT_IO_START = 1, /* io_unit bio start writing to log,
+ * don't accepting new bio */
+ IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
+ IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
+};
+
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks);
+extern void
+r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s);
+extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+ struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+ struct stripe_head_state *s);
+extern void r5c_make_stripe_write_out(struct stripe_head *sh);
+extern void r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+#endif /* _RAID5_CACHE_H */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b62f671a93ab..d1cba941951e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -61,6 +61,7 @@
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
+#include "raid5-cache.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c0687df5ba06..0f64a58873de 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -763,34 +763,4 @@ extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
-extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
#endif
--
2.11.0
^ permalink raw reply related
* [PATCH v3 0/9] Partial Parity Log for MD RAID 5
From: Artur Paszkiewicz @ 2017-01-30 18:59 UTC (permalink / raw)
To: linux-raid; +Cc: shli, neilb, jes.sorensen, Artur Paszkiewicz
This series of patches implements the Partial Parity Log for RAID5 arrays. The
purpose of this feature is closing the RAID 5 Write Hole. It is a solution
alternative to the existing raid5-cache, but the implementation is based on it
and reuses some of the code by introducing support for interchangeable
policies. This allows decoupling policy from mechanism and not adding more
boilerplate code in raid5.c.
The main difference compared to raid5-cache is that PPL is a distributed log -
it is stored on array member drives in the metadata area and does not require a
dedicated journaling drive. Write performance is reduced by up to 30%-40% but
it scales with the number of drives in the array and the journaling drive does
not become a bottleneck or a single point of failure. PPL does not protect from
losing in-flight data, only from silent data corruption. More details about how
the log works can be found in patches 5 and 7.
This feature originated from Intel RSTe, which uses IMSM metadata. This
patchset implements PPL for external metadata (specifically IMSM) as well as
native MD v1.x metadata.
Changes in mdadm are also required to make this fully usable. Patches for mdadm
will be sent later.
v3:
- Rebased to latest md for-next.
- Fixed alignment issues in the metadata structures.
- Removed reading IMSM signature from superblock.
- Removed 'rwh_policy' and per-device JournalPpl flags, added
'consistency_policy', 'ppl_sector' and 'ppl_size' sysfs attributes.
- Reworked and simplified disk removal logic.
- Debug messages in raid5-ppl.c converted to pr_debug().
- Fixed some bugs in logging and recovery code.
- Improved descriptions and documentation.
v2:
- Rebased to latest md for-next.
- Fixed wrong PPL size calculation for IMSM.
- Simplified full stripe write case.
- Removed direct access to bi_io_vec.
- Handle failed bio_add_page().
Artur Paszkiewicz (9):
raid5-cache: move declarations to separate header
raid5-cache: add policy logic
md: superblock changes for PPL
raid5: calculate partial parity for a stripe
raid5-ppl: Partial Parity Log write logging implementation
md: add sysfs entries for PPL
raid5-ppl: load and recover the log
raid5-ppl: support disk hot add/remove with PPL
raid5-ppl: runtime PPL enabling or disabling
Documentation/admin-guide/md.rst | 85 ++-
drivers/md/Makefile | 2 +-
drivers/md/md.c | 136 +++++
drivers/md/md.h | 10 +
drivers/md/raid0.c | 3 +-
drivers/md/raid1.c | 3 +-
drivers/md/raid5-cache.c | 270 ++++------
drivers/md/raid5-cache.h | 197 +++++++
drivers/md/raid5-ppl.c | 1101 ++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 195 ++++++-
drivers/md/raid5.h | 32 +-
include/uapi/linux/raid/md_p.h | 44 +-
12 files changed, 1875 insertions(+), 203 deletions(-)
create mode 100644 drivers/md/raid5-cache.h
create mode 100644 drivers/md/raid5-ppl.c
--
2.11.0
^ permalink raw reply
* Re: drives failed during reshape, array won't even force-assemble
From: Phil Turmel @ 2017-01-30 18:13 UTC (permalink / raw)
To: Thomas Warntjen, linux-raid
In-Reply-To: <d08f3721-6a72-3fc3-58d6-29034a6962f1@warntjen.net>
Hi Thomas,
On 01/25/2017 08:27 AM, Thomas Warntjen wrote:
> On my new Ubuntu Server 16.4 LTS server I have an old RAID5 made from
> 5+1 WD Red 3TB drives which I wanted to upgrade first to RAID6 (5+2) and
> then to 6 data disks, so I added 2 new drives and started the reshape:
[trim /]
> This seems to be the same problem this guy had 5 years ago
> https://www.spinics.net/lists/raid/msg37483.html but he got enough disks
> going to start the array.
> What else is there I can do? This is my last hope :/
>
> kernel: 4.4.0-59-generic #80-Ubuntu SMP Fri Jan 6 17:47:47 UTC 2017
> x86_64 x86_64 x86_64 GNU/Linux
> mdadm: installed was "v3.3 - 3rd September 2013", now updated to "v3.4 -
> 28th January 2016"
>
> Thanks in advance!
Did you ever get any help? Or solve it on your own? This looks like a
missed mail in the list archives.
Phil
^ permalink raw reply
* Re: Errorneous detection of degraded array
From: Andrei Borzenkov @ 2017-01-30 7:29 UTC (permalink / raw)
To: NeilBrown; +Cc: linux-raid, systemd-devel@lists.freedesktop.org
In-Reply-To: <87k29drr3w.fsf@notabene.neil.brown.name>
On Mon, Jan 30, 2017 at 9:36 AM, NeilBrown <neilb@suse.com> wrote:
...
>>>>>
>>>>> systemd[1]: Created slice system-mdadm\x2dlast\x2dresort.slice.
>>>>> systemd[1]: Starting system-mdadm\x2dlast\x2dresort.slice.
>>>>> systemd[1]: Starting Activate md array even though degraded...
>>>>> systemd[1]: Stopped target Local File Systems.
>>>>> systemd[1]: Stopping Local File Systems.
>>>>> systemd[1]: Unmounting /share...
>>>>> systemd[1]: Stopped (with error) /dev/md0.
>>>
...
>
> The race is, I think, that one I mentioned. If the md device is started
> before udev tells systemd to start the timer, the Conflicts dependencies
> goes the "wrong" way and stops the wrong thing.
>
From the logs provided it is unclear whether it is *timer* or
*service*. If it is timer - I do not understand why it is started
exactly 30 seconds after device apparently appears. This would match
starting service.
Yet another case where system logging is hopelessly unfriendly for
troubleshooting :(
> It would be nice to be able to reliably stop the timer when the device
> starts, without risking having the device get stopped when the timer
> starts, but I don't think we can reliably do that.
>
Well, let's wait until we can get some more information about what happens.
> Changing the
> Conflicts=sys-devices-virtual-block-%i.device
> lines to
> ConditionPathExists=/sys/devices/virtual/block/%i
> might make the problem go away, without any negative consequences.
>
Ugly, but yes, maybe this is the only way using current systemd.
> The primary purpose of having the 'Conflicts' directives was so that
> systemd wouldn't log
> Starting Activate md array even though degraded
> after the array was successfully started.
This looks like a cosmetic problem. What will happen if the last resort
service is started when array is fully assembled? Will it do any harm?
> Hopefully it won't do that when the Condition fails.
>
_______________________________________________
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/systemd-devel
^ permalink raw reply
* Re: split scsi passthrough fields out of struct request V2
From: Hannes Reinecke @ 2017-01-30 6:58 UTC (permalink / raw)
To: Bart Van Assche, hch@lst.de, axboe@fb.com
Cc: linux-scsi@vger.kernel.org, linux-raid@vger.kernel.org,
dm-devel@redhat.com, linux-block@vger.kernel.org,
snitzer@redhat.com, j-nomura@ce.jp.nec.com
In-Reply-To: <1485552454.4267.29.camel@sandisk.com>
On 01/27/2017 10:27 PM, Bart Van Assche wrote:
> On Wed, 2017-01-25 at 18:25 +0100, Christoph Hellwig wrote:
>> this series splits the support for SCSI passthrough commands from the
>> main struct request used all over the block layer into a separate
>> scsi_request structure that drivers that want to support SCSI passthrough
>> need to embed as the first thing into their request-private data,
>> similar to how we handle NVMe passthrough commands.
>>
>> To support this I've added support for that the private data after
>> request structure to the legacy request path instead, so that it can
>> be treated the same way as the blk-mq path. Compared to the current
>> scsi_cmnd allocator that actually is a major simplification.
>>
>> Changes since V1:
>> - fix handling of a NULL sense pointer in __scsi_execute
>> - clean up handling of the flush flags in the block layer and MD
>> - additional small cleanup in dm-rq
>
> Hello Christoph,
>
> A general comment: patch "block: allow specifying size for extra
> command data" is a very welcome improvement but unfortunately also
> introduces an inconsistency among block drivers. This patch series
> namely creates two kinds of block drivers:
> - Block drivers that use the block layer core to allocate
> request-private data. These block drivers set request.cmd_size
> to a non-zero value and do not need request.special.
> - Block drivers that allocate request-private data themselves.
> These block drivers set request.cmd_size to zero and use
> request.special to translate a request pointer into the private
> data pointer.
>
> Have you considered to convert all block drivers to the new
> approach and to get rid of request.special? If so, do you already
> have plans to start working on this? I'm namely wondering whether I
> should start working on this myself.
>
I was actually looking into it, too.
Once scsi passthrough is removed from struct request there is no
reasonable need to rely on '->special' for anything, and we should just
ditch it.
Cheers,
Hannes
--
Dr. Hannes Reinecke Teamlead Storage & Networking
hare@suse.de +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)
^ permalink raw reply
* Re: Errorneous detection of degraded array
From: NeilBrown @ 2017-01-30 6:36 UTC (permalink / raw)
To: Andrei Borzenkov, Luke Pyzowski,
'systemd-devel@lists.freedesktop.org', linux-raid
In-Reply-To: <666d5a48-4c71-77ee-f71b-c32f334cf7cc@gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 4630 bytes --]
On Mon, Jan 30 2017, Andrei Borzenkov wrote:
> 30.01.2017 04:53, NeilBrown пишет:
>> On Fri, Jan 27 2017, Andrei Borzenkov wrote:
>>
>>> 26.01.2017 21:02, Luke Pyzowski пишет:
>>>> Hello,
>>>> I have a large RAID6 device with 24 local drives on CentOS7.3. Randomly (around 50% of the time) systemd will unmount my RAID device thinking it is degraded after the mdadm-last-resort@.timer expires, however the device is working normally by all accounts, and I can immediately mount it manually upon boot completion. In the logs below /share is the RAID device. I can increase the timer in /usr/lib/systemd/system/mdadm-last-resort@.timer from 30 to 60 seconds, but this problem can randomly still occur.
>>>>
>>>> systemd[1]: Created slice system-mdadm\x2dlast\x2dresort.slice.
>>>> systemd[1]: Starting system-mdadm\x2dlast\x2dresort.slice.
>>>> systemd[1]: Starting Activate md array even though degraded...
>>>> systemd[1]: Stopped target Local File Systems.
>>>> systemd[1]: Stopping Local File Systems.
>>>> systemd[1]: Unmounting /share...
>>>> systemd[1]: Stopped (with error) /dev/md0.
>>
>> This line perplexes me.
>>
>> The last-resort.service (and .timer) files have a Conflict= directive
>> against sys-devices-virtual-block-md$DEV.device
>> Normally a Conflicts= directive means that if this service starts, that
>> one is stopped, and if that one starts, this is stopped.
>> However .device units cannot be stopped:
>>
>> $ systemctl show sys-devices-virtual-block-md0.device | grep Can
>> CanStart=no
>> CanStop=no
>> CanReload=no
>> CanIsolate=no
>>
>> so presumably the attempt to stop the device fails, so the Conflict=
>> dependency cannot be met, so the last-resort service (or timer) doesn't
>> get started.
>
> As I explained in other mail, to me it looks like last-resort timer does
> get started, and then last-resort service is started which attempts to
> stop device and because mount point depends on device it also stops
> mount point. So somehow we have bad timing when both device and timer
> start without canceling each other.
>
> The fact that stopping of device itself fails is irrelevant here -
> dependencies are evaluated at the time job is submitted, so if
> share.mount Requires dev-md0.device and you attempt to Stop
> dev-md0.device, systemd still queues job to Stop share.mount.
>
>> At least, that is what I see happening in my tests.
>>
>
> Yes, we have race condition here, I cannot reproduce this either. It
> does not mean it does not exist :) Let's hope debug logging will show
> something more useful (it is entirely possible that with debugging logs
> turned on this race does not happen).
>
>> But your log doesn't mention sys-devices-virtual-block-md0, it
>> mentions /dev/md0.
>> How does systemd know about /dev/md0, or the connection it has with
>> sys-devices-virtual-block-md0 ??
>>
>
> By virtue of "Following" attribute. dev-md0.device is Following
> sys-devices-virtual-block-md0.device so stopping the latter will also
> stop the former.
Ahh.. I see why I never saw this now.
Two reasons.
1/ My /etc/fstab has UUID=d1711227-c9fa-4883-a904-7cd7a3eb865c rather
than /dev/md0
systemd doesn't manage to intuit a 'Following' dependency between
the UUID and the mount point.
2/ I use partitions of md arrays: that UUID is actually /dev/md0p3.
systemd doesn't intuit that md0p3.device is Following md0.device.
So you only hit a problem if you have "/dev/md0" or similar in
/etc/fstab.
The race is, I think, that one I mentioned. If the md device is started
before udev tells systemd to start the timer, the Conflicts dependencies
goes the "wrong" way and stops the wrong thing.
It would be nice to be able to reliably stop the timer when the device
starts, without risking having the device get stopped when the timer
starts, but I don't think we can reliably do that.
Changing the
Conflicts=sys-devices-virtual-block-%i.device
lines to
ConditionPathExists=/sys/devices/virtual/block/%i
might make the problem go away, without any negative consequences.
The primary purpose of having the 'Conflicts' directives was so that
systemd wouldn't log
Starting Activate md array even though degraded
after the array was successfully started.
Hopefully it won't do that when the Condition fails.
Thanks,
NeilBrown
>
>> Does
>> systemctl list-dependencies sys-devices-virtual-block-md0.device
>>
>> report anything interesting? I get
>>
>> sys-devices-virtual-block-md0.device
>> ● └─mdmonitor.service
>>
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
[-- Attachment #2: Type: text/plain, Size: 172 bytes --]
_______________________________________________
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/systemd-devel
^ permalink raw reply
* Re: Errorneous detection of degraded array
From: Andrei Borzenkov @ 2017-01-30 3:40 UTC (permalink / raw)
To: NeilBrown, Luke Pyzowski,
'systemd-devel@lists.freedesktop.org', linux-raid
In-Reply-To: <87vasxs47y.fsf@notabene.neil.brown.name>
[-- Attachment #1.1.1: Type: text/plain, Size: 3159 bytes --]
30.01.2017 04:53, NeilBrown пишет:
> On Fri, Jan 27 2017, Andrei Borzenkov wrote:
>
>> 26.01.2017 21:02, Luke Pyzowski пишет:
>>> Hello,
>>> I have a large RAID6 device with 24 local drives on CentOS7.3. Randomly (around 50% of the time) systemd will unmount my RAID device thinking it is degraded after the mdadm-last-resort@.timer expires, however the device is working normally by all accounts, and I can immediately mount it manually upon boot completion. In the logs below /share is the RAID device. I can increase the timer in /usr/lib/systemd/system/mdadm-last-resort@.timer from 30 to 60 seconds, but this problem can randomly still occur.
>>>
>>> systemd[1]: Created slice system-mdadm\x2dlast\x2dresort.slice.
>>> systemd[1]: Starting system-mdadm\x2dlast\x2dresort.slice.
>>> systemd[1]: Starting Activate md array even though degraded...
>>> systemd[1]: Stopped target Local File Systems.
>>> systemd[1]: Stopping Local File Systems.
>>> systemd[1]: Unmounting /share...
>>> systemd[1]: Stopped (with error) /dev/md0.
>
> This line perplexes me.
>
> The last-resort.service (and .timer) files have a Conflict= directive
> against sys-devices-virtual-block-md$DEV.device
> Normally a Conflicts= directive means that if this service starts, that
> one is stopped, and if that one starts, this is stopped.
> However .device units cannot be stopped:
>
> $ systemctl show sys-devices-virtual-block-md0.device | grep Can
> CanStart=no
> CanStop=no
> CanReload=no
> CanIsolate=no
>
> so presumably the attempt to stop the device fails, so the Conflict=
> dependency cannot be met, so the last-resort service (or timer) doesn't
> get started.
As I explained in other mail, to me it looks like last-resort timer does
get started, and then last-resort service is started which attempts to
stop device and because mount point depends on device it also stops
mount point. So somehow we have bad timing when both device and timer
start without canceling each other.
The fact that stopping of device itself fails is irrelevant here -
dependencies are evaluated at the time job is submitted, so if
share.mount Requires dev-md0.device and you attempt to Stop
dev-md0.device, systemd still queues job to Stop share.mount.
> At least, that is what I see happening in my tests.
>
Yes, we have race condition here, I cannot reproduce this either. It
does not mean it does not exist :) Let's hope debug logging will show
something more useful (it is entirely possible that with debugging logs
turned on this race does not happen).
> But your log doesn't mention sys-devices-virtual-block-md0, it
> mentions /dev/md0.
> How does systemd know about /dev/md0, or the connection it has with
> sys-devices-virtual-block-md0 ??
>
By virtue of "Following" attribute. dev-md0.device is Following
sys-devices-virtual-block-md0.device so stopping the latter will also
stop the former.
> Does
> systemctl list-dependencies sys-devices-virtual-block-md0.device
>
> report anything interesting? I get
>
> sys-devices-virtual-block-md0.device
> ● └─mdmonitor.service
>
[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
[-- Attachment #2: Type: text/plain, Size: 172 bytes --]
_______________________________________________
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/systemd-devel
^ permalink raw reply
* Re: Errorneous detection of degraded array
From: NeilBrown @ 2017-01-30 1:53 UTC (permalink / raw)
To: Andrei Borzenkov, Luke Pyzowski,
'systemd-devel@lists.freedesktop.org', linux-raid
In-Reply-To: <b842c2ce-7d32-a392-e9d9-9f330fa6a7cf@gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 2045 bytes --]
On Fri, Jan 27 2017, Andrei Borzenkov wrote:
> 26.01.2017 21:02, Luke Pyzowski пишет:
>> Hello,
>> I have a large RAID6 device with 24 local drives on CentOS7.3. Randomly (around 50% of the time) systemd will unmount my RAID device thinking it is degraded after the mdadm-last-resort@.timer expires, however the device is working normally by all accounts, and I can immediately mount it manually upon boot completion. In the logs below /share is the RAID device. I can increase the timer in /usr/lib/systemd/system/mdadm-last-resort@.timer from 30 to 60 seconds, but this problem can randomly still occur.
>>
>> systemd[1]: Created slice system-mdadm\x2dlast\x2dresort.slice.
>> systemd[1]: Starting system-mdadm\x2dlast\x2dresort.slice.
>> systemd[1]: Starting Activate md array even though degraded...
>> systemd[1]: Stopped target Local File Systems.
>> systemd[1]: Stopping Local File Systems.
>> systemd[1]: Unmounting /share...
>> systemd[1]: Stopped (with error) /dev/md0.
This line perplexes me.
The last-resort.service (and .timer) files have a Conflict= directive
against sys-devices-virtual-block-md$DEV.device
Normally a Conflicts= directive means that if this service starts, that
one is stopped, and if that one starts, this is stopped.
However .device units cannot be stopped:
$ systemctl show sys-devices-virtual-block-md0.device | grep Can
CanStart=no
CanStop=no
CanReload=no
CanIsolate=no
so presumably the attempt to stop the device fails, so the Conflict=
dependency cannot be met, so the last-resort service (or timer) doesn't
get started.
At least, that is what I see happening in my tests.
But your log doesn't mention sys-devices-virtual-block-md0, it
mentions /dev/md0.
How does systemd know about /dev/md0, or the connection it has with
sys-devices-virtual-block-md0 ??
Does
systemctl list-dependencies sys-devices-virtual-block-md0.device
report anything interesting? I get
sys-devices-virtual-block-md0.device
● └─mdmonitor.service
NeilBrown
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
[-- Attachment #2: Type: text/plain, Size: 172 bytes --]
_______________________________________________
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/systemd-devel
^ permalink raw reply
* Re: [PATCH] Fix oddity where mdadm did not recognise a relative path
From: Wols Lists @ 2017-01-29 18:24 UTC (permalink / raw)
Cc: linux-raid
In-Reply-To: <wrfjzii9iwpq.fsf@gmail.com>
On 29/01/17 17:48, jes.sorensen@gmail.com wrote:
> Wols Lists <antlists@youngman.org.uk> writes:
>> From 4ce784307a9004124392ce48432960d7ca94d0bf Mon Sep 17 00:00:00 2001
>> From: Wol <anthony@youngman.org.uk>
>> Date: Tue, 17 Jan 2017 17:47:05 +0000
>> Subject: [PATCH] Fix oddity where mdadm did not recognise a relative path
>>
>> mdadm assumed that a pathname started with a "/", while an array
>> name didn't. This alters the logic so that if the first character
>> is not a "/" it tries to open an array, and if that fails it drops
>> through to the pathname code rather than terminating immediately
>> with an error.
>>
>> Signed-off-by: Wol <anthony@youngman.org.uk>
>> ---
>> mdadm.c | 12 ++++++------
>> 1 file changed, 6 insertions(+), 6 deletions(-)
>
> Applied, thanks!
>
> Sorry I have been swamped the last couple of weeks finishing up my old
> job and starting the new one. Will be bogged down a lot the next couple
> of weeks, so please be patient.
>
Neil said you were changing jobs - that's fine. I just didn't want it to
get missed, and as there was no initial response I was trying to do as
gentle a "what's up" as I could.
That's fine. If it's in, then great :-) and I can wait for it to be
released whenever.
Cheers,
Wol
^ permalink raw reply
* Re: [PATCH] Fix oddity where mdadm did not recognise a relative path
From: jes.sorensen @ 2017-01-29 17:48 UTC (permalink / raw)
To: Wols Lists; +Cc: linux-raid
In-Reply-To: <587E6B56.8090704@youngman.org.uk>
Wols Lists <antlists@youngman.org.uk> writes:
> From 4ce784307a9004124392ce48432960d7ca94d0bf Mon Sep 17 00:00:00 2001
> From: Wol <anthony@youngman.org.uk>
> Date: Tue, 17 Jan 2017 17:47:05 +0000
> Subject: [PATCH] Fix oddity where mdadm did not recognise a relative path
>
> mdadm assumed that a pathname started with a "/", while an array
> name didn't. This alters the logic so that if the first character
> is not a "/" it tries to open an array, and if that fails it drops
> through to the pathname code rather than terminating immediately
> with an error.
>
> Signed-off-by: Wol <anthony@youngman.org.uk>
> ---
> mdadm.c | 12 ++++++------
> 1 file changed, 6 insertions(+), 6 deletions(-)
Applied, thanks!
Sorry I have been swamped the last couple of weeks finishing up my old
job and starting the new one. Will be bogged down a lot the next couple
of weeks, so please be patient.
Cheers,
Jes
^ permalink raw reply
* Re: [PATCH 0/2] Bad block notification
From: jes.sorensen @ 2017-01-29 17:44 UTC (permalink / raw)
To: Tomasz Majchrzak; +Cc: linux-raid, shli
In-Reply-To: <1485259419-2308-1-git-send-email-tomasz.majchrzak@intel.com>
Tomasz Majchrzak <tomasz.majchrzak@intel.com> writes:
> At the moment there is no way to be notified that bad blocks have been found on
> a disk. It is only possible to check manually with 'mdadm --examine-badblocks'.
> User might not be aware there is a bad block for a long period. If another disk
> in the array fails, data is lost.
>
> These patches add a new event to the kernel and mdadm in order to send
> notification on the first bad block on a disk. I have chosen to do it only for
> first bad block as I think it's sufficient indication that the drive requires
> replacement.
Tomasz,
Looks reasonable to me - I'll wait for Shaohua to respond on the kernel
part before I apply the mdadm part.
Cheers,
Jes
^ permalink raw reply
* Re: [PATCH v3] md linear: fix a race between linear_add() and linear_congested()
From: Coly Li @ 2017-01-29 6:39 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid, Shaohua Li, Neil Brown, stable
In-Reply-To: <20170129014555.lgmk2ivste2xttxq@kernel.org>
On 2017/1/29 上午9:45, Shaohua Li wrote:
> On Sat, Jan 28, 2017 at 09:11:49PM +0800, colyli@suse.de wrote:
>> Recently I received a bug report that on a Linux v3.0 based kernel, hot adding a
>> disk to a md linear device causes a kernel crash at linear_congested(). From
>> the crash image analysis, I find in linear_congested(), mddev->raid_disks
>> contains value N, but conf->disks[] only has N-1 pointers available. Then
>> a NULL pointer dereference crashes the kernel.
>>
>> There is a race between linear_add() and linear_congested(), RCU stuffs
>> used in these two functions cannot avoid the race. Since Linux v4.0
>> RCU code is replaced by introducing mddev_suspend(). After checking the
>> upstream code, it seems linear_congested() is not called in
>> generic_make_request() code path, so mddev_suspend() cannot prevent it
>> from being called. The possible race still exists.
>>
>> Here I explain how the race still exists in current code. For a machine
>> has many CPUs, on one CPU, linear_add() is called to add a hard disk to a
>> md linear device; at the same time on other CPU, linear_congested() is
>> called to detect whether this md linear device is congested before issuing
>> an I/O request onto it.
>>
>> Now I use a possible code execution time sequence to demo how the possible
>> race happens,
>>
>> seq    linear_add()                linear_congested()
>>  0                                 conf=mddev->private
>>  1   oldconf=mddev->private
>>  2   mddev->raid_disks++
>>  3                                 for (i=0; i<mddev->raid_disks;i++)
>>  4                                   bdev_get_queue(conf->disks[i].rdev->bdev)
>>  5   mddev->private=newconf
>>
>> In linear_add() mddev->raid_disks is increased in time seq 2, and on
>> another CPU in linear_congested() the for-loop iterates conf->disks[i] by
>> the increased mddev->raid_disks in time seq 3,4. But conf with one more
>> element (which is a pointer to struct dev_info type) to conf->disks[] is
>> not updated yet, accessing its structure member in time seq 4 will cause a
>> NULL pointer dereference fault.
>>
>> To fix this race, there are 2 parts of modification in the patch,
>> 1) Add 'int raid_disks' in struct linear_conf, as a copy of
>> mddev->raid_disks. It is initialized in linear_conf(), always being
>> consistent with pointers number of 'struct dev_info disks[]'. When
>> iterating conf->disks[] in linear_congested(), use conf->raid_disks to
>> replace mddev->raid_disks in the for-loop, then NULL pointer dereference
>> will not happen again.
>> 2) RCU stuffs are back again, and use kfree_rcu() in linear_add() to
>> free oldconf memory. Because oldconf may be referenced as mddev->private
>> in linear_congested(), kfree_rcu() makes sure that its memory will not
>> be released until no one uses it any more.
>> Also some code comments are added in this patch, to make this modification
>> to be easier understandable.
>>
>> This patch can be applied for kernels since v4.0 after commit:
>> 3be260cc18f8 ("md/linear: remove rcu protections in favour of
>> suspend/resume"). But this bug is reported on Linux v3.0 based kernel, for
>> people who maintain kernels before Linux v4.0, they need to do some
>> back port to this patch.
>>
>> Changelog:
>> - V3: add 'int raid_disks' in struct linear_conf, and use kfree_rcu() to
>> replace call_rcu() in linear_add().
>> - v2: add RCU stuffs by suggestion from Shaohua and Neil.
>> - v1: initial effort.
>>
>> Signed-off-by: Coly Li <colyli@suse.de>
>> Cc: Shaohua Li <shli@fb.com>
>> Cc: Neil Brown <neilb@suse.com>
>> Cc: stable@vger.kernel.org
> Hi,
>
> Happy new year! Applied, though I changed the format of comments. It should be:
> /*
> * comments
> */
I just used this comment format following the existing comments in linear.c
file. OK I will follow the above format in future.
Thank you! And Happy New Year :-)
Coly
^ permalink raw reply
* Re: [PATCH v3] md linear: fix a race between linear_add() and linear_congested()
From: Shaohua Li @ 2017-01-29 1:45 UTC (permalink / raw)
To: colyli; +Cc: linux-raid, Shaohua Li, Neil Brown, stable
In-Reply-To: <1485609109-23896-1-git-send-email-colyli@suse.de>
On Sat, Jan 28, 2017 at 09:11:49PM +0800, colyli@suse.de wrote:
> Recently I received a bug report that on a Linux v3.0 based kernel, hot adding a
> disk to a md linear device causes a kernel crash at linear_congested(). From
> the crash image analysis, I find in linear_congested(), mddev->raid_disks
> contains value N, but conf->disks[] only has N-1 pointers available. Then
> a NULL pointer dereference crashes the kernel.
>
> There is a race between linear_add() and linear_congested(), RCU stuffs
> used in these two functions cannot avoid the race. Since Linux v4.0
> RCU code is replaced by introducing mddev_suspend(). After checking the
> upstream code, it seems linear_congested() is not called in
> generic_make_request() code path, so mddev_suspend() cannot prevent it
> from being called. The possible race still exists.
>
> Here I explain how the race still exists in current code. For a machine
> has many CPUs, on one CPU, linear_add() is called to add a hard disk to a
> md linear device; at the same time on other CPU, linear_congested() is
> called to detect whether this md linear device is congested before issuing
> an I/O request onto it.
>
> Now I use a possible code execution time sequence to demo how the possible
> race happens,
>
> seq    linear_add()                linear_congested()
>  0                                 conf=mddev->private
>  1   oldconf=mddev->private
>  2   mddev->raid_disks++
>  3                                 for (i=0; i<mddev->raid_disks;i++)
>  4                                   bdev_get_queue(conf->disks[i].rdev->bdev)
>  5   mddev->private=newconf
>
> In linear_add() mddev->raid_disks is increased in time seq 2, and on
> another CPU in linear_congested() the for-loop iterates conf->disks[i] by
> the increased mddev->raid_disks in time seq 3,4. But conf with one more
> element (which is a pointer to struct dev_info type) to conf->disks[] is
> not updated yet, accessing its structure member in time seq 4 will cause a
> NULL pointer dereference fault.
>
> To fix this race, there are 2 parts of modification in the patch,
> 1) Add 'int raid_disks' in struct linear_conf, as a copy of
> mddev->raid_disks. It is initialized in linear_conf(), always being
> consistent with pointers number of 'struct dev_info disks[]'. When
> iterating conf->disks[] in linear_congested(), use conf->raid_disks to
> replace mddev->raid_disks in the for-loop, then NULL pointer dereference
> will not happen again.
> 2) RCU stuffs are back again, and use kfree_rcu() in linear_add() to
> free oldconf memory. Because oldconf may be referenced as mddev->private
> in linear_congested(), kfree_rcu() makes sure that its memory will not
> be released until no one uses it any more.
> Also some code comments are added in this patch, to make this modification
> to be easier understandable.
>
> This patch can be applied for kernels since v4.0 after commit:
> 3be260cc18f8 ("md/linear: remove rcu protections in favour of
> suspend/resume"). But this bug is reported on Linux v3.0 based kernel, for
> people who maintain kernels before Linux v4.0, they need to do some
> back port to this patch.
>
> Changelog:
> - V3: add 'int raid_disks' in struct linear_conf, and use kfree_rcu() to
> replace call_rcu() in linear_add().
> - v2: add RCU stuffs by suggestion from Shaohua and Neil.
> - v1: initial effort.
>
> Signed-off-by: Coly Li <colyli@suse.de>
> Cc: Shaohua Li <shli@fb.com>
> Cc: Neil Brown <neilb@suse.com>
> Cc: stable@vger.kernel.org
Hi,
Happy new year! Applied, though I changed the format of comments. It should be:
/*
* comments
*/
> ---
> drivers/md/linear.c | 38 +++++++++++++++++++++++++++++++++-----
> drivers/md/linear.h | 2 ++
> 2 files changed, 35 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/md/linear.c b/drivers/md/linear.c
> index 5975c99..767e4b8 100644
> --- a/drivers/md/linear.c
> +++ b/drivers/md/linear.c
> @@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
> return conf->disks + lo;
> }
>
> +/*
> + * In linear_congested() conf->raid_disks is used as a copy of
> + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
> + * and conf->disks[] are created in linear_conf(), they are always
> + * consitent with each other, but mddev->raid_disks dose not.
> + */
> static int linear_congested(struct mddev *mddev, int bits)
> {
> struct linear_conf *conf;
> int i, ret = 0;
>
> - conf = mddev->private;
> + rcu_read_lock();
> + conf = rcu_dereference(mddev->private);
>
> - for (i = 0; i < mddev->raid_disks && !ret ; i++) {
> + for (i = 0; i < conf->raid_disks && !ret ; i++) {
> struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
> ret |= bdi_congested(&q->backing_dev_info, bits);
> }
>
> + rcu_read_unlock();
> return ret;
> }
>
> @@ -144,6 +152,18 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
> conf->disks[i-1].end_sector +
> conf->disks[i].rdev->sectors;
>
> + /* conf->raid_disks is copy of mddev->raid_disks. The reason to
> + * keep a copy of mddev->raid_disks in struct linear_conf is,
> + * mddev->raid_disks may not be consistent with pointers number of
> + * conf->disks[] when it is updated in linear_add() and used to
> + * iterate old conf->disks[] earray in linear_congested().
> + * Here conf->raid_disks is always consitent with number of
> + * pointers in conf->disks[] array, and mddev->private is updated
> + * with rcu_assign_pointer() in linear_addr(), such race can be
> + * avoided.
> + */
> + conf->raid_disks = raid_disks;
> +
> return conf;
>
> out:
> @@ -196,15 +216,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
> if (!newconf)
> return -ENOMEM;
>
> + /* newconf->raid_disks already keeps a copy of * the increased
> + * value of mddev->raid_disks, WARN_ONCE() is just used to make
> + * sure of this. It is possible that oldconf is still referenced
> + * in linear_congested(), therefore kfree_rcu() is used to free
> + * oldconf until no one uses it anymore.
> + */
> mddev_suspend(mddev);
> - oldconf = mddev->private;
> + oldconf = rcu_dereference(mddev->private);
> mddev->raid_disks++;
> - mddev->private = newconf;
> + WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
> + "copied raid_disks doesn't match mddev->raid_disks");
> + rcu_assign_pointer(mddev->private, newconf);
> md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
> set_capacity(mddev->gendisk, mddev->array_sectors);
> mddev_resume(mddev);
> revalidate_disk(mddev->gendisk);
> - kfree(oldconf);
> + kfree_rcu(oldconf, rcu);
> return 0;
> }
>
> diff --git a/drivers/md/linear.h b/drivers/md/linear.h
> index b685ddd..07fd40c 100644
> --- a/drivers/md/linear.h
> +++ b/drivers/md/linear.h
> @@ -10,6 +10,8 @@ struct linear_conf
> {
> struct rcu_head rcu;
> sector_t array_sectors;
> + int raid_disks; /* a copy of
> + * mddev->raid_disks */
> struct dev_info disks[0];
> };
> #endif
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: RAID 5 reshape stalled at 77.5% - next steps??
From: George Rapp @ 2017-01-28 23:58 UTC (permalink / raw)
To: Roman Mamedov; +Cc: Linux-RAID, Matthew Krumwiede
In-Reply-To: <20170129043338.235723a7@natsu>
On Sat, Jan 28, 2017 at 6:33 PM, Roman Mamedov <rm@romanrm.net> wrote:
> On Sat, 28 Jan 2017 18:29:32 -0500
> George Rapp <george.rapp@gmail.com> wrote:
>
>> Attempting to re-add /dev/sdg4 to the array fails on a busy device:
>>
>> # mdadm --manage /dev/md4 --re-add /dev/sdg4
>> mdadm: Cannot open /dev/sdg4: Device or resource busy
>
> You need to remove it first
>
> mdadm --remove /dev/md4 /dev/sdg4
>
> or
>
> mdadm --remove /dev/md4 faulty
>
> But honestly I am not sure if simply removing and re-adding will bring your
> reshape back to its working order at this point.
>
> Also you should figure out why did it fail in the first place. Check
> SMART, check dmesg further back rather than a few lines only. Maybe the disk
> needs a replacement, not just a blind re-add.
Perhaps not surprisingly, the --remove command also hung.
/dev/sdg4 apparently suffered an uncorrectable read error. Entire
dmesg output (2172 lines) is at
https://app.box.com/s/7brp7c53a51zw4ez5to0m12oc5hxeq92 for your
reference.
Since none of the mdadm commands will respond, I'm thinking we need to
reboot the machine at this point to do any more diagnostics.
Thanks for your quick replies!
--
George Rapp (Pataskala, OH) Home: george.rapp -- at -- gmail.com
LinkedIn profile: https://www.linkedin.com/in/georgerapp
Phone: +1 740 936 RAPP (740 936 7277)
^ permalink raw reply
* Re: RAID 5 reshape stalled at 77.5% - next steps??
From: Roman Mamedov @ 2017-01-28 23:33 UTC (permalink / raw)
To: George Rapp; +Cc: Linux-RAID, Matthew Krumwiede
In-Reply-To: <CAF-KpgbmnOmbCXDi6_xr9d9Ms5rFt+D3WN=3gu=7xFv0G6U5vw@mail.gmail.com>
On Sat, 28 Jan 2017 18:29:32 -0500
George Rapp <george.rapp@gmail.com> wrote:
> Attempting to re-add /dev/sdg4 to the array fails on a busy device:
>
> # mdadm --manage /dev/md4 --re-add /dev/sdg4
> mdadm: Cannot open /dev/sdg4: Device or resource busy
You need to remove it first
mdadm --remove /dev/md4 /dev/sdg4
or
mdadm --remove /dev/md4 faulty
But honestly I am not sure if simply removing and re-adding will bring your
reshape back to its working order at this point.
Also you should figure out why it failed in the first place. Check
SMART, check dmesg further back rather than a few lines only. Maybe the disk
needs a replacement, not just a blind re-add.
--
With respect,
Roman
^ permalink raw reply
* Re: RAID 5 reshape stalled at 77.5% - next steps??
From: George Rapp @ 2017-01-28 23:29 UTC (permalink / raw)
To: Roman Mamedov; +Cc: Linux-RAID, Matthew Krumwiede
In-Reply-To: <20170129041559.723d8259@natsu>
On Sat, Jan 28, 2017 at 6:15 PM, Roman Mamedov <rm@romanrm.net> wrote:
> On Sat, 28 Jan 2017 18:01:30 -0500
> George Rapp <george.rapp@gmail.com> wrote:
>
>> The reshape proceeded normally until it hit 77.5%, where it has been
>> stuck for the last couple of days:
>>
>> # cat /proc/mdstat
>> Personalities : [raid1] [raid6] [raid5] [raid4]
>> md4 : active raid5 sdd4[13](R) sdb4[12] sdg4[10](F) sdi4[8] sdl4[9]
>> sdf4[1] sdj4[7] sdh4[2] sde4[0] sdk4[11]
>>
>> 13454923776 blocks super 1.1 level 5, 512k chunk, algorithm 2 [10/9]
>> [UUUU_UUUU_]
>> [===============>.....] reshape = 77.5% (1490403328/1922131968)
>> finish=2544246.9min speed=2K/sec
>
> It shows you have a failed device (sdg4) but you don't mention anything about
> that? Post your mdadm --detail /dev/md4, and what do you have in dmesg.
Roman -
Good catch. I didn't notice that.
# mdadm --detail /dev/md4
/dev/md4:
Version : 1.1
Creation Time : Thu Feb 17 14:54:06 2011
Raid Level : raid5
Array Size : 13454923776 (12831.62 GiB 13777.84 GB)
Used Dev Size : 1922131968 (1833.09 GiB 1968.26 GB)
Raid Devices : 10
Total Devices : 10
Persistence : Superblock is persistent
Update Time : Thu Jan 26 08:06:56 2017
State : active, FAILED, reshaping
Active Devices : 8
Working Devices : 9
Failed Devices : 1
Spare Devices : 1
Layout : left-symmetric
Chunk Size : 512K
Reshape Status : 77% complete
Delta Devices : 2, (8->10)
Name : localhost.localdomain:4
UUID : 359d41dc:a2e506e3:5e802a49:a84ef89c
Events : 3957775
Number Major Minor RaidDevice State
0 8 68 0 active sync /dev/sde4
1 8 84 1 active sync /dev/sdf4
2 8 116 2 active sync /dev/sdh4
9 8 180 3 active sync /dev/sdl4
10 8 100 4 faulty /dev/sdg4
13 8 52 4 spare rebuilding /dev/sdd4
11 8 164 5 active sync /dev/sdk4
8 8 132 6 active sync /dev/sdi4
7 8 148 7 active sync /dev/sdj4
12 8 20 8 active sync /dev/sdb4
18 0 0 18 removed
Relevant dmesg output:
[128702.154193] md: super_written gets error=-5
[128702.154197] md/raid:md4: Disk failure on sdg4, disabling device.
md/raid:md4: Operation continuing on 9 devices.
[128702.154205] md: super_written gets error=-5
[128702.254561] mvsas 0000:03:00.0: Phy2 : No sig fis
[128703.151620] md: md4: reshape interrupted.
[128706.343757] sas: sas_form_port: phy2 belongs to port2 already(1)!
Attempting to re-add /dev/sdg4 to the array fails on a busy device:
# mdadm --manage /dev/md4 --re-add /dev/sdg4
mdadm: Cannot open /dev/sdg4: Device or resource busy
To free up /dev/sdg4, I tried to stop the array. Not surprisingly,
this command hung as well:
# mdadm --stop /dev/md4
--
George Rapp (Pataskala, OH) Home: george.rapp -- at -- gmail.com
LinkedIn profile: https://www.linkedin.com/in/georgerapp
Phone: +1 740 936 RAPP (740 936 7277)
^ permalink raw reply
* Re: RAID 5 reshape stalled at 77.5% - next steps??
From: Roman Mamedov @ 2017-01-28 23:15 UTC (permalink / raw)
To: George Rapp; +Cc: Linux-RAID, Matthew Krumwiede
In-Reply-To: <CAF-Kpgbz_Ld5WVOid1SHopcOjdx63kM-zVDUScZowtQzxrtZHg@mail.gmail.com>
On Sat, 28 Jan 2017 18:01:30 -0500
George Rapp <george.rapp@gmail.com> wrote:
> The reshape proceeded normally until it hit 77.5%, where it has been
> stuck for the last couple of days:
>
> # cat /proc/mdstat
> Personalities : [raid1] [raid6] [raid5] [raid4]
> md4 : active raid5 sdd4[13](R) sdb4[12] sdg4[10](F) sdi4[8] sdl4[9]
> sdf4[1] sdj4[7] sdh4[2] sde4[0] sdk4[11]
>
> 13454923776 blocks super 1.1 level 5, 512k chunk, algorithm 2 [10/9]
> [UUUU_UUUU_]
> [===============>.....] reshape = 77.5% (1490403328/1922131968)
> finish=2544246.9min speed=2K/sec
It shows you have a failed device (sdg4) but you don't mention anything about
that? Post your mdadm --detail /dev/md4, and what do you have in dmesg.
--
With respect,
Roman
^ permalink raw reply
* RAID 5 reshape stalled at 77.5% - next steps??
From: George Rapp @ 2017-01-28 23:01 UTC (permalink / raw)
To: Linux-RAID; +Cc: Matthew Krumwiede
Hello linux-raid team. I have a reshape operation that is stuck and
refuses to respond to commands. I'm wondering what my options are to
safely get it moving again.
Background: I added two new partitions to a RAID 5 array, using a
backup-file on a
separate device:
# mdadm --add /dev/md4 /dev/sdb4 /dev/sdd4
mdadm: added /dev/sdb4
mdadm: added /dev/sdd4
# mdadm --grow --raid-devices=10
--backup-file=/home/gwr/c/md4_backup__2017-01-25 /dev/md4
mdadm: Need to backup 32256K of critical section..
# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4]
[...]
md4 : active raid5 sdd4[13](R) sdb4[12] sdg4[10] sdi4[8] sdl4[9]
sdf4[1] sdj4[7] sdh4[2] sde4[0] sdk4[11]
13454923776 blocks super 1.1 level 5, 512k chunk, algorithm 2 [10/9]
[UUUUUUUUU_]
[>....................] reshape = 0.8% (16715456/1922131968)
finish=965.4min speed=32892K/sec
The reshape proceeded normally until it hit 77.5%, where it has been
stuck for the last couple of days:
# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4]
md4 : active raid5 sdd4[13](R) sdb4[12] sdg4[10](F) sdi4[8] sdl4[9]
sdf4[1] sdj4[7] sdh4[2] sde4[0] sdk4[11]
13454923776 blocks super 1.1 level 5, 512k chunk, algorithm 2 [10/9]
[UUUU_UUUU_]
[===============>.....] reshape = 77.5% (1490403328/1922131968)
finish=2544246.9min speed=2K/sec
The backup file was last accessed at about the time I started the reshape:
-rw-------. 1 root root 33034240 Jan 25 11:52 md4_backup__2017-01-25
I tried to idle the RAID reshape, but the "echo" command just hung:
# cd /sys/block/md4/md
# echo idle > sync_action
I can get some data from the files in this directory, though:
# cat reshape_direction
forwards
# cat reshape_position
26825379840
I tried to pull mdadm data about this array to add to this post, but that
command also hung:
# mdadm --misc --examine /dev/md4
The server CPU load is pegged, with md4_raid5 as the top CPU hog.
What are my safe alternatives here? Can I safely reboot without corrupting
the reshape? How can I get the reshape unstuck?
--
George Rapp (Pataskala, OH) Home: george.rapp -- at -- gmail.com
LinkedIn profile: https://www.linkedin.com/in/georgerapp
Phone: +1 740 936 RAPP (740 936 7277)
^ permalink raw reply
* Re: [systemd-devel] Errorneous detection of degraded array
From: Andrei Borzenkov @ 2017-01-28 17:34 UTC (permalink / raw)
To: Luke Pyzowski, 'systemd-devel@lists.freedesktop.org',
linux-raid@vger.kernel.org
In-Reply-To: <96A26C8C6786C341B83BC4F2BC5419E4795DF1D8@SRF-EXCH1.corp.sunrisefutures.com>
27.01.2017 22:44, Luke Pyzowski пишет:
...
> Jan 27 11:33:14 lnxnfs01 kernel: md/raid:md0: raid level 6 active with 24 out of 24 devices, algorithm 2
...
> Jan 27 11:33:14 lnxnfs01 kernel: md0: detected capacity change from 0 to 45062020923392
> Jan 27 11:33:14 lnxnfs01 systemd[1]: Found device /dev/disk/by-uuid/2b9114be-3d5a-41d7-8d4b-e5047d223129.
> Jan 27 11:33:14 lnxnfs01 systemd[1]: Started udev Wait for Complete Device Initialization.
> Jan 27 11:33:14 lnxnfs01 systemd[1]: Started Timer to wait for more drives before activating degraded array..
> Jan 27 11:33:14 lnxnfs01 systemd[1]: Starting Timer to wait for more drives before activating degraded array..
...
>
> ... + 31 seconds from disk initialization, expiration of 30 second timer from mdadm-last-resort@.timer
>
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Created slice system-mdadm\x2dlast\x2dresort.slice.
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Starting system-mdadm\x2dlast\x2dresort.slice.
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Stopped target Local File Systems.
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Stopping Local File Systems.
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Unmounting Mount /share RAID partition explicitly...
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Starting Activate md array even though degraded...
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Stopped (with error) /dev/md0.
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Started Activate md array even though degraded.
> Jan 27 11:33:45 lnxnfs01 systemd[1]: Unmounted Mount /share RAID partition explicitly.
>
Here is my educated guess.
Both mdadm-last-resort@.timer and mdadm-last-resort@.service conflict
with MD device:
bor@bor-Latitude-E5450:~/src/systemd$ cat ../mdadm/systemd/
mdadm-grow-continue@.service mdadm.shutdown
SUSE-mdadm_env.sh
mdadm-last-resort@.service mdmonitor.service
mdadm-last-resort@.timer mdmon@.service
bor@bor-Latitude-E5450:~/src/systemd$ cat
../mdadm/systemd/mdadm-last-resort@.timer
[Unit]
Description=Timer to wait for more drives before activating degraded array.
DefaultDependencies=no
Conflicts=sys-devices-virtual-block-%i.device
[Timer]
OnActiveSec=30
bor@bor-Latitude-E5450:~/src/systemd$ cat
../mdadm/systemd/mdadm-last-resort@.service
[Unit]
Description=Activate md array even though degraded
DefaultDependencies=no
Conflicts=sys-devices-virtual-block-%i.device
[Service]
Type=oneshot
ExecStart=BINDIR/mdadm --run /dev/%i
I presume the intention is to stop these units when the MD device is finally
assembled as complete. This is indeed what happens on my (test) system:
Jan 28 14:18:04 linux-ffk5 kernel: md: bind<vda1>
Jan 28 14:18:04 linux-ffk5 kernel: md: bind<vdb1>
Jan 28 14:18:05 linux-ffk5 kernel: md/raid1:md0: active with 2 out of 2
mirrors
Jan 28 14:18:05 linux-ffk5 kernel: md0: detected capacity change from 0
to 5363466240
Jan 28 14:18:06 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer:
Installed new job mdadm-last-resort@md0.timer/start as 287
Jan 28 14:18:06 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer:
Enqueued job mdadm-last-resort@md0.timer/start as 287
Jan 28 14:18:06 linux-ffk5 systemd[1]: dev-ttyS9.device: Changed dead ->
plugged
Jan 28 14:18:07 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer:
Changed dead -> waiting
Jan 28 14:18:12 linux-ffk5 systemd[1]:
sys-devices-virtual-block-md0.device: Changed dead -> plugged
Jan 28 14:18:12 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer:
Trying to enqueue job mdadm-last-resort@md0.timer/stop/replace
Jan 28 14:18:12 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer:
Installed new job mdadm-last-resort@md0.timer/stop as 292
Jan 28 14:18:12 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer:
Enqueued job mdadm-last-resort@md0.timer/stop as 292
Jan 28 14:18:12 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer:
Changed waiting -> dead
Jan 28 14:18:12 linux-ffk5 systemd[1]: mdadm-last-resort@md0.timer: Job
mdadm-last-resort@md0.timer/stop finished, result=done
Jan 28 14:18:12 linux-ffk5 systemd[1]: Stopped Timer to wait for more
drives before activating degraded array..
Jan 28 14:19:34 10 systemd[1692]: dev-vda1.device: Changed dead -> plugged
Jan 28 14:19:34 10 systemd[1692]: dev-vdb1.device: Changed dead -> plugged
On your system the timer is apparently not stopped when the md device appears, so
when the last-resort service runs later, it causes an attempt to stop the md
device (due to the conflict) and, transitively, the mount on top of it.
Could you try running with systemd.log_level=debug on the kernel command line
and upload the journal again? We can only hope that it will not skew timings
enough to hide the problem, but it may prove my hypothesis.
^ permalink raw reply
* [PATCH v3] md linear: fix a race between linear_add() and linear_congested()
From: colyli @ 2017-01-28 13:11 UTC (permalink / raw)
To: linux-raid; +Cc: Coly Li, Shaohua Li, Neil Brown, stable
Recently I received a bug report that on a Linux v3.0 based kernel, hot adding
a disk to a md linear device causes a kernel crash at linear_congested(). From
the crash image analysis, I find in linear_congested(), mddev->raid_disks
contains value N, but conf->disks[] only has N-1 pointers available. Then
a NULL pointer dereference crashes the kernel.
There is a race between linear_add() and linear_congested(); the RCU code
used in these two functions cannot avoid the race. Since Linux v4.0 the
RCU code has been replaced by mddev_suspend(). After checking the
upstream code, it seems linear_congested() is not called in the
generic_make_request() code path, so mddev_suspend() cannot prevent it
from being called. The possible race still exists.
Here I explain how the race still exists in current code. For a machine
has many CPUs, on one CPU, linear_add() is called to add a hard disk to a
md linear device; at the same time on other CPU, linear_congested() is
called to detect whether this md linear device is congested before issuing
an I/O request onto it.
Now I use a possible code execution time sequence to demo how the possible
race happens,
seq linear_add() linear_congested()
0 conf=mddev->private
1 oldconf=mddev->private
2 mddev->raid_disks++
3 for (i=0; i<mddev->raid_disks;i++)
4 bdev_get_queue(conf->disks[i].rdev->bdev)
5 mddev->private=newconf
In linear_add() mddev->raid_disks is increased in time seq 2, and on
another CPU in linear_congested() the for-loop iterates conf->disks[i] by
the increased mddev->raid_disks in time seq 3,4. But conf with one more
element (which is a pointer to struct dev_info type) to conf->disks[] is
not updated yet, accessing its structure member in time seq 4 will cause a
NULL pointer dereference fault.
To fix this race, there are 2 parts of modification in the patch,
1) Add 'int raid_disks' in struct linear_conf, as a copy of
mddev->raid_disks. It is initialized in linear_conf(), always being
consistent with pointers number of 'struct dev_info disks[]'. When
iterating conf->disks[] in linear_congested(), use conf->raid_disks to
replace mddev->raid_disks in the for-loop, then NULL pointer dereference
will not happen again.
2) RCU stuffs are back again, and use kfree_rcu() in linear_add() to
free oldconf memory. Because oldconf may be referenced as mddev->private
in linear_congested(), kfree_rcu() makes sure that its memory will not
be released until no one uses it any more.
Also some code comments are added in this patch, to make this modification
easier to understand.
This patch can be applied for kernels since v4.0 after commit:
3be260cc18f8 ("md/linear: remove rcu protections in favour of
suspend/resume"). But this bug is reported on Linux v3.0 based kernel, for
people who maintain kernels before Linux v4.0, they need to do some
back port to this patch.
Changelog:
- V3: add 'int raid_disks' in struct linear_conf, and use kfree_rcu() to
replace call_rcu() in linear_add().
- v2: add RCU stuffs by suggestion from Shaohua and Neil.
- v1: initial effort.
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Neil Brown <neilb@suse.com>
Cc: stable@vger.kernel.org
---
drivers/md/linear.c | 38 +++++++++++++++++++++++++++++++++-----
drivers/md/linear.h | 2 ++
2 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5975c99..767e4b8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
return conf->disks + lo;
}
+/*
+ * In linear_congested() conf->raid_disks is used as a copy of
+ * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
+ * and conf->disks[] are created in linear_conf(), they are always
+ * consistent with each other, but mddev->raid_disks is not.
+ */
static int linear_congested(struct mddev *mddev, int bits)
{
struct linear_conf *conf;
int i, ret = 0;
- conf = mddev->private;
+ rcu_read_lock();
+ conf = rcu_dereference(mddev->private);
- for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ for (i = 0; i < conf->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
ret |= bdi_congested(&q->backing_dev_info, bits);
}
+ rcu_read_unlock();
return ret;
}
@@ -144,6 +152,18 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
+ /* conf->raid_disks is copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with pointers number of
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate old conf->disks[] array in linear_congested().
+ * Here conf->raid_disks is always consistent with number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_add(), so such a race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
return conf;
out:
@@ -196,15 +216,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
if (!newconf)
return -ENOMEM;
+ /* newconf->raid_disks already keeps a copy of the increased
+ * value of mddev->raid_disks, WARN_ONCE() is just used to make
+ * sure of this. It is possible that oldconf is still referenced
+ * in linear_congested(), therefore kfree_rcu() is used to defer
+ * freeing oldconf until no one uses it anymore.
+ */
mddev_suspend(mddev);
- oldconf = mddev->private;
+ oldconf = rcu_dereference(mddev->private);
mddev->raid_disks++;
- mddev->private = newconf;
+ WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+ "copied raid_disks doesn't match mddev->raid_disks");
+ rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev_resume(mddev);
revalidate_disk(mddev->gendisk);
- kfree(oldconf);
+ kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd..07fd40c 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,6 +10,8 @@ struct linear_conf
{
struct rcu_head rcu;
sector_t array_sectors;
+ int raid_disks; /* a copy of
+ * mddev->raid_disks */
struct dev_info disks[0];
};
#endif
^ permalink raw reply related
* Re: split scsi passthrough fields out of struct request V2
From: hch @ 2017-01-28 8:29 UTC (permalink / raw)
To: Bart Van Assche
Cc: hch@lst.de, axboe@fb.com, linux-scsi@vger.kernel.org,
linux-raid@vger.kernel.org, dm-devel@redhat.com,
linux-block@vger.kernel.org, snitzer@redhat.com,
j-nomura@ce.jp.nec.com
In-Reply-To: <1485552454.4267.29.camel@sandisk.com>
On Fri, Jan 27, 2017 at 09:27:53PM +0000, Bart Van Assche wrote:
> Have you considered to convert all block drivers to the new
> approach and to get rid of request.special? If so, do you already
> have plans to start working on this? I'm namely wondering wheter I
> should start working on this myself.
Hi Bart,
I'd love to have all drivers move off using .special (and thus reducing
request size further). I think the general way to do that is to convert
them to blk-mq and not using the legacy cmd_size field.
^ permalink raw reply
* Re: split scsi passthrough fields out of struct request V3
From: hch @ 2017-01-28 8:26 UTC (permalink / raw)
To: Bart Van Assche
Cc: linux-raid@vger.kernel.org, snitzer@redhat.com,
linux-scsi@vger.kernel.org, axboe@fb.com, dm-devel@redhat.com,
linux-block@vger.kernel.org, j-nomura@ce.jp.nec.com, hch@lst.de
In-Reply-To: <1485543514.4267.23.camel@sandisk.com>
On Fri, Jan 27, 2017 at 06:58:53PM +0000, Bart Van Assche wrote:
> Version 3 of the patch with title "block: split scsi_request out of
> struct request" (commit 3c30af6ebe12) differs significantly from v2
> of that patch that has been posted on several mailing lists. E.g. v2
> moves __cmd[], cmd and cmd_len from struct request into struct
> scsi_request but v3 not. Which version do you want us to review?
Hi Bart,
I tried to resend the whole updated v3 series, but the mail server
stopped accepting mails due to overload. Otherwise it would have
included all the patches. Jens instead took the updated version
straight from this git branch:
http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/block-pc-refactor
^ permalink raw reply
* Re: [PATCH 15/18] scsi: allocate scsi_cmnd structures as part of struct request
From: hch @ 2017-01-28 8:25 UTC (permalink / raw)
To: Bart Van Assche
Cc: linux-raid@vger.kernel.org, snitzer@redhat.com,
linux-scsi@vger.kernel.org, axboe@fb.com, dm-devel@redhat.com,
linux-block@vger.kernel.org, j-nomura@ce.jp.nec.com, hch@lst.de
In-Reply-To: <1485542367.4267.19.camel@sandisk.com>
On Fri, Jan 27, 2017 at 06:39:46PM +0000, Bart Van Assche wrote:
> Why have the scsi_release_buffers() and scsi_put_command(cmd) calls been
> moved up? I haven't found an explanation for this change in the patch
> description.
Because they reference the scsi_cmnd, which are now part of the request
and thus freed by blk_finish_request. And yes, I should have mentioned
it in the changelog, sorry.
> Please also consider to remove the cmd->request->special = NULL assignments
> via this patch. Since this patch makes the lifetime of struct scsi_cmnd and
> struct request identical these assignments are no longer needed.
True. If I had to resend again I would have fixed it up, but it's probably
not worth the churn now.
> This patch introduces the function scsi_exit_rq(). Having two functions
> for the single-queue path that release resources (scsi_release_buffers()
> and scsi_exit_rq()) is confusing. Since every scsi_release_buffers() call
> is followed by a blk_unprep_request() call, have you considered to move
> the scsi_release_buffers() call into scsi_unprep_fn() via an additional
> patch?
We could have done that. But it's just more change for a code path
that I hope won't survive this calendar year.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox