* [PATCH v5 4/7] md: add sysfs entries for PPL
From: Artur Paszkiewicz @ 2017-03-09 9:00 UTC (permalink / raw)
To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>
Add 'consistency_policy' attribute for array. It indicates how the array
maintains consistency in case of unexpected shutdown.
Add 'ppl_sector' and 'ppl_size' for rdev, which describe the location
and size of the PPL space on the device. They can't be changed for
active members if the array is started and PPL is enabled, so in the
setter functions only basic checks are performed. More checks are done
in ppl_validate_rdev() when starting the log.
These attributes are writable to allow enabling PPL for external
metadata arrays and (later) to enable/disable PPL for a running array.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
Documentation/admin-guide/md.rst | 32 ++++++++++-
drivers/md/md.c | 115 +++++++++++++++++++++++++++++++++++++++
2 files changed, 144 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index 1e61bf50595c..84de718f24a4 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -276,14 +276,14 @@ All md devices contain:
array creation it will default to 0, though starting the array as
``clean`` will set it much larger.
- new_dev
+ new_dev
This file can be written but not read. The value written should
be a block device number as major:minor. e.g. 8:0
This will cause that device to be attached to the array, if it is
available. It will then appear at md/dev-XXX (depending on the
name of the device) and further configuration is then possible.
- safe_mode_delay
+ safe_mode_delay
When an md array has seen no write requests for a certain period
of time, it will be marked as ``clean``. When another write
request arrives, the array is marked as ``dirty`` before the write
@@ -292,7 +292,7 @@ All md devices contain:
period as a number of seconds. The default is 200msec (0.200).
Writing a value of 0 disables safemode.
- array_state
+ array_state
This file contains a single word which describes the current
state of the array. In many cases, the state can be set by
writing the word for the desired state, however some states
@@ -401,7 +401,30 @@ All md devices contain:
once the array becomes non-degraded, and this fact has been
recorded in the metadata.
+ consistency_policy
+ This indicates how the array maintains consistency in case of unexpected
+ shutdown. It can be:
+ none
+ Array has no redundancy information, e.g. raid0, linear.
+
+ resync
+ Full resync is performed and all redundancy is regenerated when the
+ array is started after unclean shutdown.
+
+ bitmap
+ Resync assisted by a write-intent bitmap.
+
+ journal
+ For raid4/5/6, journal device is used to log transactions and replay
+ after unclean shutdown.
+
+ ppl
+ For raid5 only, Partial Parity Log is used to close the write hole and
+ eliminate resync.
+
+ The accepted values when writing to this file are ``ppl`` and ``resync``,
+ used to enable and disable PPL.
As component devices are added to an md array, they appear in the ``md``
@@ -563,6 +586,9 @@ Each directory contains:
adds bad blocks without acknowledging them. This is largely
for testing.
+ ppl_sector, ppl_size
+ Location and size (in sectors) of the space used for Partial Parity Log
+ on this device.
An active md device will also contain an entry for each active device
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 173550455c42..1df48f365b3c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3149,6 +3149,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+static ssize_t
+ppl_sector_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
+}
+
+/*
+ * Set the start sector of the PPL area on this device from the decimal
+ * string in @buf.
+ *
+ * Returns @len on success. Returns -EINVAL if the value cannot be parsed,
+ * does not survive conversion to sector_t, or (persistent metadata only)
+ * the array uses major_version 0 or the distance from sb_start does not fit
+ * in a signed 16-bit offset. Returns -EBUSY if the array has a personality,
+ * MD_HAS_PPL is set and this device has raid_disk >= 0, or if the array has
+ * neither persistent nor external metadata.
+ */
+static ssize_t
+ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	unsigned long long sector;
+
+	if (kstrtoull(buf, 10, &sector) < 0)
+		return -EINVAL;
+	/* reject values that would be truncated when stored as sector_t */
+	if (sector != (sector_t)sector)
+		return -EINVAL;
+
+	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+	    rdev->raid_disk >= 0)
+		return -EBUSY;
+
+	if (rdev->mddev->persistent) {
+		if (rdev->mddev->major_version == 0)
+			return -EINVAL;
+		/* the offset relative to sb_start must fit in an s16 */
+		if ((sector > rdev->sb_start &&
+		     sector - rdev->sb_start > S16_MAX) ||
+		    (sector < rdev->sb_start &&
+		     rdev->sb_start - sector > -S16_MIN))
+			return -EINVAL;
+		rdev->ppl.offset = sector - rdev->sb_start;
+	} else if (!rdev->mddev->external) {
+		return -EBUSY;
+	}
+	rdev->ppl.sector = sector;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_sector =
+__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
+
+static ssize_t
+ppl_size_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%u\n", rdev->ppl.size);
+}
+
+/*
+ * Set the size of the PPL area on this device from the decimal string in
+ * @buf. Returns @len on success, -EINVAL on a parse failure or (persistent
+ * metadata only) for major_version 0 or a size above U16_MAX, and -EBUSY
+ * when the array has a personality with MD_HAS_PPL set and this device has
+ * raid_disk >= 0, or when the metadata is neither persistent nor external.
+ */
+static ssize_t
+ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	struct mddev *mddev = rdev->mddev;
+	unsigned int new_size;
+
+	if (kstrtouint(buf, 10, &new_size) < 0)
+		return -EINVAL;
+
+	if (mddev->pers && test_bit(MD_HAS_PPL, &mddev->flags) &&
+	    rdev->raid_disk >= 0)
+		return -EBUSY;
+
+	if (!mddev->persistent) {
+		if (!mddev->external)
+			return -EBUSY;
+	} else if (mddev->major_version == 0 || new_size > U16_MAX) {
+		return -EINVAL;
+	}
+
+	rdev->ppl.size = new_size;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_size =
+__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
+
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
&rdev_errors.attr,
@@ -3159,6 +3231,8 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_recovery_start.attr,
&rdev_bad_blocks.attr,
&rdev_unack_bad_blocks.attr,
+ &rdev_ppl_sector.attr,
+ &rdev_ppl_size.attr,
NULL,
};
static ssize_t
@@ -4895,6 +4969,46 @@ static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
array_size_store);
+/*
+ * Print the array's consistency policy into @page and return the number of
+ * bytes written. The checks are ordered: journal flag, PPL flag, bitmap,
+ * then the personality's sync_request hook; "unknown" is printed when no
+ * personality is set.
+ */
+static ssize_t
+consistency_policy_show(struct mddev *mddev, char *page)
+{
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		return sprintf(page, "journal\n");
+
+	if (test_bit(MD_HAS_PPL, &mddev->flags))
+		return sprintf(page, "ppl\n");
+
+	if (mddev->bitmap)
+		return sprintf(page, "bitmap\n");
+
+	if (!mddev->pers)
+		return sprintf(page, "unknown\n");
+
+	if (mddev->pers->sync_request)
+		return sprintf(page, "resync\n");
+
+	return sprintf(page, "none\n");
+}
+
+/*
+ * Handle a write to consistency_policy. Returns -EBUSY when the array
+ * already has a personality. Otherwise only a value starting with "ppl" on
+ * an external-metadata array is accepted: MD_HAS_PPL is set and @len is
+ * returned. Anything else yields -EINVAL.
+ */
+static ssize_t
+consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	if (mddev->pers)
+		return -EBUSY;
+
+	if (!mddev->external || strncmp(buf, "ppl", 3) != 0)
+		return -EINVAL;
+
+	set_bit(MD_HAS_PPL, &mddev->flags);
+	return len;
+}
+
+static struct md_sysfs_entry md_consistency_policy =
+__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
+ consistency_policy_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -4910,6 +5024,7 @@ static struct attribute *md_default_attrs[] = {
&md_reshape_direction.attr,
&md_array_size.attr,
&max_corr_read_errors.attr,
+ &md_consistency_policy.attr,
NULL,
};
--
2.11.0
^ permalink raw reply related
* [PATCH v5 3/7] raid5-ppl: Partial Parity Log write logging implementation
From: Artur Paszkiewicz @ 2017-03-09 8:59 UTC (permalink / raw)
To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>
Implement the calculation of partial parity for a stripe and PPL write
logging functionality. The description of PPL is added to the
documentation. More details can be found in the comments in raid5-ppl.c.
Attach a page for holding the partial parity data to stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.
Partial parity is the xor of not modified data chunks of a stripe and is
calculated as follows:
- reconstruct-write case:
xor data from all not updated disks in a stripe
- read-modify-write case:
xor old data and parity from all updated disks in a stripe
Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.
Partial parity is not meaningful for full stripe write and is not stored
in the log or used for recovery, so don't attempt to calculate it when
stripe has STRIPE_FULL_WRITE.
Put the PPL metadata structures to md_p.h because userspace tools
(mdadm) will also need to read/write PPL.
Warn about using PPL with enabled disk volatile write-back cache for
now. It can be removed once disk cache flushing before writing PPL is
implemented.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
Documentation/md/raid5-ppl.txt | 44 +++
drivers/md/Makefile | 2 +-
drivers/md/raid5-log.h | 24 ++
drivers/md/raid5-ppl.c | 703 +++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 64 +++-
drivers/md/raid5.h | 10 +-
include/uapi/linux/raid/md_p.h | 27 ++
7 files changed, 869 insertions(+), 5 deletions(-)
create mode 100644 Documentation/md/raid5-ppl.txt
create mode 100644 drivers/md/raid5-ppl.c
diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
new file mode 100644
index 000000000000..127072b09363
--- /dev/null
+++ b/Documentation/md/raid5-ppl.txt
@@ -0,0 +1,44 @@
+Partial Parity Log
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the RAID5 Write Hole. Because of
+this, md by default does not allow starting a dirty degraded array.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe. It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
+
+Currently, volatile write-back cache should be disabled on all member drives
+when using PPL. Otherwise it cannot guarantee consistency in case of power
+failure.
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..4d48714ccc6b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o raid5-cache.o
+raid456-y += raid5.o raid5-cache.o raid5-ppl.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 2da4bd3bbd79..a67fb58513b9 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -31,6 +31,20 @@ extern struct md_sysfs_entry r5c_journal_mode;
extern void r5c_update_on_rdev_error(struct mddev *mddev);
extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+extern struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx);
+extern int ppl_init_log(struct r5conf *conf);
+extern void ppl_exit_log(struct r5conf *conf);
+extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+extern void ppl_write_stripe_run(struct r5conf *conf);
+extern void ppl_stripe_write_finished(struct stripe_head *sh);
+
+static inline bool raid5_has_ppl(struct r5conf *conf)
+{
+ return test_bit(MD_HAS_PPL, &conf->mddev->flags);
+}
+
static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
struct r5conf *conf = sh->raid_conf;
@@ -45,6 +59,8 @@ static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s
/* caching phase */
return r5c_cache_data(conf->log, sh);
}
+ } else if (raid5_has_ppl(conf)) {
+ return ppl_write_stripe(conf, sh);
}
return -EAGAIN;
@@ -56,24 +72,32 @@ static inline void log_stripe_write_finished(struct stripe_head *sh)
if (conf->log)
r5l_stripe_write_finished(sh);
+ else if (raid5_has_ppl(conf))
+ ppl_stripe_write_finished(sh);
}
static inline void log_write_stripe_run(struct r5conf *conf)
{
if (conf->log)
r5l_write_stripe_run(conf->log);
+ else if (raid5_has_ppl(conf))
+ ppl_write_stripe_run(conf);
}
static inline void log_exit(struct r5conf *conf)
{
if (conf->log)
r5l_exit_log(conf);
+ else if (raid5_has_ppl(conf))
+ ppl_exit_log(conf);
}
static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
{
if (journal_dev)
return r5l_init_log(conf, journal_dev);
+ else if (raid5_has_ppl(conf))
+ return ppl_init_log(conf);
return 0;
}
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 000000000000..92783586743d
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,703 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/flex_array.h>
+#include <linux/async_tx.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "raid5.h"
+
+/*
+ * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
+ * partial parity data. The header contains an array of entries
+ * (struct ppl_header_entry) which describe the logged write requests.
+ * Partial parity for the entries comes after the header, written in the same
+ * sequence as the entries:
+ *
+ * Header
+ * entry0
+ * ...
+ * entryN
+ * PP data
+ * PP for entry0
+ * ...
+ * PP for entryN
+ *
+ * An entry describes one or more consecutive stripe_heads, up to a full
+ * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
+ * number of stripe_heads in the entry and n is the number of modified data
+ * disks. Every stripe_head in the entry must write to the same data disks.
+ * An example of a valid case described by a single entry (writes to the first
+ * stripe of a 4 disk array, 16k chunk size):
+ *
+ * sh->sector dd0 dd1 dd2 ppl
+ * +-----+-----+-----+
+ * 0 | --- | --- | --- | +----+
+ * 8 | -W- | -W- | --- | | pp | data_sector = 8
+ * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k
+ * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k
+ * +-----+-----+-----+ +----+
+ *
+ * data_sector is the first raid sector of the modified data, data_size is the
+ * total size of modified data and pp_size is the size of partial parity for
+ * this entry. Entries for full stripe writes contain no partial parity
+ * (pp_size = 0), they only mark the stripes for which parity should be
+ * recalculated after an unclean shutdown. Every entry holds a checksum of its
+ * partial parity, the header also has a checksum of the header itself.
+ *
+ * A write request is always logged to the PPL instance stored on the parity
+ * disk of the corresponding stripe. For each member disk there is one ppl_log
+ * used to handle logging for this disk, independently from others. They are
+ * grouped in child_logs array in struct ppl_conf, which is assigned to
+ * r5conf->log_private.
+ *
+ * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
+ * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
+ * can be appended to the last entry if it meets the conditions for a valid
+ * entry described above, otherwise a new entry is added. Checksums of entries
+ * are calculated incrementally as stripes containing partial parity are being
+ * added. ppl_submit_iounit() calculates the checksum of the header and submits
+ * a bio containing the header page and partial parity pages (sh->ppl_page) for
+ * all stripes of the io_unit. When the PPL write completes, the stripes
+ * associated with the io_unit are released and raid5d starts writing their data
+ * and parity. When all stripes are written, the io_unit is freed and the next
+ * can be submitted.
+ *
+ * An io_unit is used to gather stripes until it is submitted or becomes full
+ * (if the maximum number of entries or size of PPL is reached). Another io_unit
+ * can't be submitted until the previous has completed (PPL and stripe
+ * data+parity is written). The log->io_list tracks all io_units of a log
+ * (for a single member disk). New io_units are added to the end of the list
+ * and the first io_unit is submitted, if it is not submitted already.
+ * The current io_unit accepting new stripes is always at the end of the list.
+ */
+
+struct ppl_conf {
+ struct mddev *mddev;
+
+ /* array of child logs, one for each raid disk */
+ struct ppl_log *child_logs;
+ int count;
+
+ int block_size; /* the logical block size used for data_sector
+ * in ppl_header_entry */
+ u32 signature; /* raid array identifier */
+ atomic64_t seq; /* current log write sequence number */
+
+ struct kmem_cache *io_kc;
+ mempool_t *io_pool;
+ struct bio_set *bs;
+ mempool_t *meta_pool;
+};
+
+struct ppl_log {
+ struct ppl_conf *ppl_conf; /* shared between all log instances */
+
+ struct md_rdev *rdev; /* array member disk associated with
+ * this log instance */
+ struct mutex io_mutex;
+ struct ppl_io_unit *current_io; /* current io_unit accepting new data
+ * always at the end of io_list */
+ spinlock_t io_list_lock;
+ struct list_head io_list; /* all io_units of this log */
+ struct list_head no_mem_stripes;/* stripes to retry if failed to
+ * allocate io_unit */
+};
+
+#define PPL_IO_INLINE_BVECS 32
+
+struct ppl_io_unit { /* one full PPL write: header page plus partial parity */
+ struct ppl_log *log;
+
+ struct page *header_page; /* page holding the ppl_header */
+
+ unsigned int entries_count; /* number of entries in ppl_header */
+ unsigned int pp_size; /* current total size of partial parity */
+
+ u64 seq; /* sequence number of this log write */
+ struct list_head log_sibling; /* link in log->io_list */
+
+ struct list_head stripe_list; /* stripes added to the io_unit */
+ atomic_t pending_stripes; /* stripes not yet written to raid */
+
+ bool submitted; /* true if write to log started */
+
+ /* inline bio and its biovec for submitting the iounit */
+ struct bio bio;
+ struct bio_vec biovec[PPL_IO_INLINE_BVECS];
+};
+
+struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ int disks = sh->disks;
+ struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+ int count = 0, pd_idx = sh->pd_idx, i;
+ struct async_submit_ctl submit;
+
+ pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+ /*
+ * Partial parity is the XOR of stripe data chunks that are not changed
+ * during the write request. Depending on available data
+ * (read-modify-write vs. reconstruct-write case) we calculate it
+ * differently.
+ */
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ /* rmw: xor old data and parity from updated disks */
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
+ xor_srcs[count++] = dev->page;
+ }
+ } else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+ /* rcw: xor data from all not updated disks */
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_UPTODATE, &dev->flags))
+ xor_srcs[count++] = dev->page;
+ }
+ } else {
+ return tx;
+ }
+
+ init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, tx, NULL, sh,
+ flex_array_get(percpu->scribble, 0)
+ + sizeof(struct page *) * (sh->disks + 2));
+
+ if (count == 1)
+ tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+ &submit);
+ else
+ tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+ &submit);
+
+ return tx;
+}
+
+static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
+ struct stripe_head *sh)
+{
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct ppl_io_unit *io;
+ struct ppl_header *pplhdr;
+
+ io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
+ if (!io)
+ return NULL;
+
+ memset(io, 0, sizeof(*io));
+ io->log = log;
+ INIT_LIST_HEAD(&io->log_sibling);
+ INIT_LIST_HEAD(&io->stripe_list);
+ atomic_set(&io->pending_stripes, 0);
+ bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
+
+ io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
+ pplhdr = page_address(io->header_page);
+ clear_page(pplhdr);
+ memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+ pplhdr->signature = cpu_to_le32(ppl_conf->signature);
+
+ io->seq = atomic64_add_return(1, &ppl_conf->seq);
+ pplhdr->generation = cpu_to_le64(io->seq);
+
+ return io;
+}
+
+static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
+{
+ struct ppl_io_unit *io = log->current_io;
+ struct ppl_header_entry *e = NULL;
+ struct ppl_header *pplhdr;
+ int i;
+ sector_t data_sector = 0;
+ int data_disks = 0;
+ unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
+ struct r5conf *conf = sh->raid_conf;
+
+ pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
+
+ /* check if current io_unit is full */
+ if (io && (io->pp_size == entry_space ||
+ io->entries_count == PPL_HDR_MAX_ENTRIES)) {
+ pr_debug("%s: add io_unit blocked by seq: %llu\n",
+ __func__, io->seq);
+ io = NULL;
+ }
+
+ /* add a new unit if there is none or the current is full */
+ if (!io) {
+ io = ppl_new_iounit(log, sh);
+ if (!io)
+ return -ENOMEM;
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&io->log_sibling, &log->io_list);
+ spin_unlock_irq(&log->io_list_lock);
+
+ log->current_io = io;
+ }
+
+ for (i = 0; i < sh->disks; i++) {
+ struct r5dev *dev = &sh->dev[i];
+
+ if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
+ if (!data_disks || dev->sector < data_sector)
+ data_sector = dev->sector;
+ data_disks++;
+ }
+ }
+ BUG_ON(!data_disks);
+
+ pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
+ io->seq, (unsigned long long)data_sector, data_disks);
+
+ pplhdr = page_address(io->header_page);
+
+ if (io->entries_count > 0) {
+ struct ppl_header_entry *last =
+ &pplhdr->entries[io->entries_count - 1];
+ struct stripe_head *sh_last = list_last_entry(
+ &io->stripe_list, struct stripe_head, log_list);
+ u64 data_sector_last = le64_to_cpu(last->data_sector);
+ u32 data_size_last = le32_to_cpu(last->data_size);
+
+ /*
+ * Check if we can append the stripe to the last entry. It must
+ * be just after the last logged stripe and write to the same
+ * disks. Use bit shift and logarithm to avoid 64-bit division.
+ */
+ if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
+ (data_sector >> ilog2(conf->chunk_sectors) ==
+ data_sector_last >> ilog2(conf->chunk_sectors)) &&
+ ((data_sector - data_sector_last) * data_disks ==
+ data_size_last >> 9))
+ e = last;
+ }
+
+ if (!e) {
+ e = &pplhdr->entries[io->entries_count++];
+ e->data_sector = cpu_to_le64(data_sector);
+ e->parity_disk = cpu_to_le32(sh->pd_idx);
+ e->checksum = cpu_to_le32(~0);
+ }
+
+ le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);
+
+ /* don't write any PP if full stripe write */
+ if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
+ le32_add_cpu(&e->pp_size, PAGE_SIZE);
+ io->pp_size += PAGE_SIZE;
+ e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
+ page_address(sh->ppl_page),
+ PAGE_SIZE));
+ }
+
+ list_add_tail(&sh->log_list, &io->stripe_list);
+ atomic_inc(&io->pending_stripes);
+ sh->ppl_io = io;
+
+ return 0;
+}
+
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+ struct ppl_conf *ppl_conf = conf->log_private;
+ struct ppl_io_unit *io = sh->ppl_io;
+ struct ppl_log *log;
+
+ if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
+ !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+ !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
+ clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ return -EAGAIN;
+ }
+
+ log = &ppl_conf->child_logs[sh->pd_idx];
+
+ mutex_lock(&log->io_mutex);
+
+ if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+ mutex_unlock(&log->io_mutex);
+ return -EAGAIN;
+ }
+
+ set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ atomic_inc(&sh->count);
+
+ if (ppl_log_stripe(log, sh)) {
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&sh->log_list, &log->no_mem_stripes);
+ spin_unlock_irq(&log->io_list_lock);
+ }
+
+ mutex_unlock(&log->io_mutex);
+
+ return 0;
+}
+
+static void ppl_log_endio(struct bio *bio)
+{
+ struct ppl_io_unit *io = bio->bi_private;
+ struct ppl_log *log = io->log;
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct stripe_head *sh, *next;
+
+ pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+ if (bio->bi_error)
+ md_error(ppl_conf->mddev, log->rdev);
+
+ mempool_free(io->header_page, ppl_conf->meta_pool);
+
+ list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
+ list_del_init(&sh->log_list);
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ raid5_release_stripe(sh);
+ }
+}
+
+static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
+{
+ char b[BDEVNAME_SIZE];
+
+ pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
+ __func__, io->seq, bio->bi_iter.bi_size,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bdevname(bio->bi_bdev, b));
+
+ submit_bio(bio);
+}
+
+static void ppl_submit_iounit(struct ppl_io_unit *io)
+{
+ struct ppl_log *log = io->log;
+ struct ppl_conf *ppl_conf = log->ppl_conf;
+ struct ppl_header *pplhdr = page_address(io->header_page);
+ struct bio *bio = &io->bio;
+ struct stripe_head *sh;
+ int i;
+
+ for (i = 0; i < io->entries_count; i++) {
+ struct ppl_header_entry *e = &pplhdr->entries[i];
+
+ pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
+ __func__, io->seq, i, le64_to_cpu(e->data_sector),
+ le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));
+
+ e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
+ ilog2(ppl_conf->block_size >> 9));
+ e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
+ }
+
+ pplhdr->entries_count = cpu_to_le32(io->entries_count);
+ pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
+
+ bio->bi_private = io;
+ bio->bi_end_io = ppl_log_endio;
+ bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
+ bio->bi_bdev = log->rdev->bdev;
+ bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+ bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
+
+ list_for_each_entry(sh, &io->stripe_list, log_list) {
+ /* entries for full stripe writes have no partial parity */
+ if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+ continue;
+
+ if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
+ struct bio *prev = bio;
+
+ bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
+ ppl_conf->bs);
+ bio->bi_opf = prev->bi_opf;
+ bio->bi_bdev = prev->bi_bdev;
+ bio->bi_iter.bi_sector = bio_end_sector(prev);
+ bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
+
+ bio_chain(bio, prev);
+ ppl_submit_iounit_bio(io, prev);
+ }
+ }
+
+ ppl_submit_iounit_bio(io, bio);
+}
+
+static void ppl_submit_current_io(struct ppl_log *log)
+{
+ struct ppl_io_unit *io;
+
+ spin_lock_irq(&log->io_list_lock);
+
+ io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
+ log_sibling);
+ if (io && io->submitted)
+ io = NULL;
+
+ spin_unlock_irq(&log->io_list_lock);
+
+ if (io) {
+ io->submitted = true;
+
+ if (io == log->current_io)
+ log->current_io = NULL;
+
+ ppl_submit_iounit(io);
+ }
+}
+
+void ppl_write_stripe_run(struct r5conf *conf)
+{
+ struct ppl_conf *ppl_conf = conf->log_private;
+ struct ppl_log *log;
+ int i;
+
+ for (i = 0; i < ppl_conf->count; i++) {
+ log = &ppl_conf->child_logs[i];
+
+ mutex_lock(&log->io_mutex);
+ ppl_submit_current_io(log);
+ mutex_unlock(&log->io_mutex);
+ }
+}
+
+static void ppl_io_unit_finished(struct ppl_io_unit *io)
+{
+ struct ppl_log *log = io->log;
+ unsigned long flags;
+
+ pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+
+ list_del(&io->log_sibling);
+ mempool_free(io, log->ppl_conf->io_pool);
+
+ if (!list_empty(&log->no_mem_stripes)) {
+ struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
+ struct stripe_head,
+ log_list);
+ list_del_init(&sh->log_list);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ raid5_release_stripe(sh);
+ }
+
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+void ppl_stripe_write_finished(struct stripe_head *sh)
+{
+ struct ppl_io_unit *io;
+
+ io = sh->ppl_io;
+ sh->ppl_io = NULL;
+
+ if (io && atomic_dec_and_test(&io->pending_stripes))
+ ppl_io_unit_finished(io);
+}
+
+static void __ppl_exit_log(struct ppl_conf *ppl_conf)
+{
+ clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
+
+ kfree(ppl_conf->child_logs);
+
+ mempool_destroy(ppl_conf->meta_pool);
+ if (ppl_conf->bs)
+ bioset_free(ppl_conf->bs);
+ mempool_destroy(ppl_conf->io_pool);
+ kmem_cache_destroy(ppl_conf->io_kc);
+
+ kfree(ppl_conf);
+}
+
+void ppl_exit_log(struct r5conf *conf)
+{
+ struct ppl_conf *ppl_conf = conf->log_private;
+
+ if (ppl_conf) {
+ __ppl_exit_log(ppl_conf);
+ conf->log_private = NULL;
+ }
+}
+
+static int ppl_validate_rdev(struct md_rdev *rdev)
+{
+ char b[BDEVNAME_SIZE];
+ int ppl_data_sectors;
+ int ppl_size_new;
+
+ /*
+ * The configured PPL size must be enough to store
+ * the header and (at the very least) partial parity
+ * for one stripe. Round it down to ensure the data
+ * space is cleanly divisible by stripe size.
+ */
+ ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
+
+ if (ppl_data_sectors > 0)
+ ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
+
+ if (ppl_data_sectors <= 0) {
+ pr_warn("md/raid:%s: PPL space too small on %s\n",
+ mdname(rdev->mddev), bdevname(rdev->bdev, b));
+ return -ENOSPC;
+ }
+
+ ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);
+
+ if ((rdev->ppl.sector < rdev->data_offset &&
+ rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
+ (rdev->ppl.sector >= rdev->data_offset &&
+ rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
+ pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
+ mdname(rdev->mddev), bdevname(rdev->bdev, b));
+ return -EINVAL;
+ }
+
+ if (!rdev->mddev->external &&
+ ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
+ (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
+ pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
+ mdname(rdev->mddev), bdevname(rdev->bdev, b));
+ return -EINVAL;
+ }
+
+ rdev->ppl.size = ppl_size_new;
+
+ return 0;
+}
+
+/*
+ * Allocate and set up the PPL state for an array.
+ *
+ * Refuses to enable PPL when PAGE_SIZE is not 4096, the raid level is not 5,
+ * a bitmap is configured or the array has a journal. Otherwise it allocates
+ * the ppl_conf with its io_unit cache/pool, bio set and header page pool,
+ * creates one child log per raid disk and runs ppl_validate_rdev() on every
+ * member that is present. On success the ppl_conf is stored in
+ * conf->log_private.
+ *
+ * Returns 0 on success or a negative errno. Once the ppl_conf has been
+ * allocated, any failure is unwound through __ppl_exit_log(), which also
+ * clears MD_HAS_PPL.
+ */
+int ppl_init_log(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf;
+	struct mddev *mddev = conf->mddev;
+	int ret = 0;
+	int i;
+	/* only ever set to true below, so it has to start out false */
+	bool need_cache_flush = false;
+
+	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
+		 mdname(conf->mddev));
+
+	if (PAGE_SIZE != 4096)
+		return -EINVAL;
+
+	if (mddev->level != 5) {
+		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
+			mdname(mddev), mddev->level);
+		return -EINVAL;
+	}
+
+	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
+		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		pr_warn("md/raid:%s PPL is not compatible with journal\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
+	if (!ppl_conf)
+		return -ENOMEM;
+
+	ppl_conf->mddev = mddev;
+
+	/* the allocations below fail for lack of memory, not bad arguments */
+	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
+	if (!ppl_conf->io_kc) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
+	if (!ppl_conf->io_pool) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
+	if (!ppl_conf->bs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
+	if (!ppl_conf->meta_pool) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ppl_conf->count = conf->raid_disks;
+	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
+				       GFP_KERNEL);
+	if (!ppl_conf->child_logs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	atomic64_set(&ppl_conf->seq, 0);
+
+	if (!mddev->external) {
+		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
+		ppl_conf->block_size = 512;
+	} else {
+		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
+	}
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct ppl_log *log = &ppl_conf->child_logs[i];
+		struct md_rdev *rdev = conf->disks[i].rdev;
+
+		mutex_init(&log->io_mutex);
+		spin_lock_init(&log->io_list_lock);
+		INIT_LIST_HEAD(&log->io_list);
+		INIT_LIST_HEAD(&log->no_mem_stripes);
+
+		log->ppl_conf = ppl_conf;
+		log->rdev = rdev;
+
+		if (rdev) {
+			struct request_queue *q;
+
+			ret = ppl_validate_rdev(rdev);
+			if (ret)
+				goto err;
+
+			/* remember if any member has the QUEUE_FLAG_WC bit set */
+			q = bdev_get_queue(rdev->bdev);
+			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+				need_cache_flush = true;
+		}
+	}
+
+	if (need_cache_flush)
+		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
+			mdname(mddev));
+
+	conf->log_private = ppl_conf;
+
+	return 0;
+err:
+	__ppl_exit_log(ppl_conf);
+	return ret;
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 043a509560c2..4ca8e555ae5c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -464,6 +464,11 @@ static void shrink_buffers(struct stripe_head *sh)
sh->dev[i].page = NULL;
put_page(p);
}
+
+ if (sh->ppl_page) {
+ put_page(sh->ppl_page);
+ sh->ppl_page = NULL;
+ }
}
static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -480,6 +485,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
+ if (raid5_has_ppl(sh->raid_conf)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page)
+ return 1;
+ }
+
return 0;
}
@@ -728,7 +740,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -1995,6 +2007,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
async_tx_ack(tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
if (level < 6)
tx = ops_run_prexor5(sh, percpu, tx);
@@ -3070,6 +3085,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
+ if (raid5_has_ppl(sh->raid_conf) &&
+ test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+ !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+ test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+ set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -3117,6 +3138,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (forwrite && raid5_has_ppl(conf)) {
+ /*
+ * With PPL only writes to consecutive data chunks within a
+ * stripe are allowed because for a single stripe_head we can
+ * only have one PPL entry at a time, which describes one data
+ * range. Not really an overlap, but wait_for_overlap can be
+ * used to handle this.
+ */
+ sector_t sector;
+ sector_t first = 0;
+ sector_t last = 0;
+ int count = 0;
+ int i;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (i != sh->pd_idx &&
+ (i == dd_idx || sh->dev[i].towrite)) {
+ sector = sh->dev[i].sector;
+ if (count == 0 || sector < first)
+ first = sector;
+ if (sector > last)
+ last = sector;
+ count++;
+ }
+ }
+
+ if (first + conf->chunk_sectors * (count - 1) != last)
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -7087,6 +7138,13 @@ static int raid5_run(struct mddev *mddev)
BUG_ON(mddev->delta_disks != 0);
}
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+ test_bit(MD_HAS_PPL, &mddev->flags)) {
+ pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+ mdname(mddev));
+ clear_bit(MD_HAS_PPL, &mddev->flags);
+ }
+
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
@@ -7568,7 +7626,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7619,7 +7677,7 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 749c6c496e7d..5a371fa9e782 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -224,10 +224,16 @@ struct stripe_head {
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/
- struct r5l_io_unit *log_io;
+ union {
+ struct r5l_io_unit *log_io;
+ struct ppl_io_unit *ppl_io;
+ };
+
struct list_head log_list;
sector_t log_start; /* first meta block on the journal */
struct list_head r5c; /* for r5c_cache->stripe_in_journal */
+
+ struct page *ppl_page; /* partial parity of this stripe */
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@@ -400,6 +406,7 @@ enum {
STRIPE_OP_BIODRAIN,
STRIPE_OP_RECONSTRUCT,
STRIPE_OP_CHECK,
+ STRIPE_OP_PARTIAL_PARITY,
};
/*
@@ -686,6 +693,7 @@ struct r5conf {
int group_cnt;
int worker_cnt_per_group;
struct r5l_log *log;
+ void *log_private;
struct bio_list pending_bios;
spinlock_t pending_bios_lock;
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index fe2112810c43..d9a1ead867b9 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -398,4 +398,31 @@ struct r5l_meta_block {
#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
+
+struct ppl_header_entry {
+ __le64 data_sector; /* raid sector of the new data */
+ __le32 pp_size; /* length of partial parity */
+ __le32 data_size; /* length of data */
+ __le32 parity_disk; /* member disk containing parity */
+ __le32 checksum; /* checksum of partial parity data for this
+ * entry (~crc32c) */
+} __attribute__ ((__packed__));
+
+#define PPL_HEADER_SIZE 4096
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+ (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64))
+#define PPL_HDR_MAX_ENTRIES \
+ (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+
+struct ppl_header {
+ __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */
+ __le32 signature; /* signature (family number of volume) */
+ __le32 padding; /* zero pad */
+ __le64 generation; /* generation number of the header */
+ __le32 entries_count; /* number of entries in entry array */
+ __le32 checksum; /* checksum of the header (~crc32c) */
+ struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
#endif
--
2.11.0
^ permalink raw reply related
* [PATCH v5 2/7] raid5: separate header for log functions
From: Artur Paszkiewicz @ 2017-03-09 8:59 UTC (permalink / raw)
To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>
Move raid5-cache declarations from raid5.h to raid5-log.h, add inline
wrappers for functions which will be shared with ppl and use them in
raid5 core instead of direct calls to raid5-cache.
Remove unused parameter from r5c_cache_data(), move two duplicated
pr_debug() calls to r5l_init_log().
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-cache.c | 22 ++++++++++---
drivers/md/raid5-log.h | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 48 ++++++++--------------------
drivers/md/raid5.h | 30 ------------------
4 files changed, 112 insertions(+), 69 deletions(-)
create mode 100644 drivers/md/raid5-log.h
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3f307be01b10..ac4e74ef1f7f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -343,6 +343,8 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
}
}
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+
/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
@@ -2620,9 +2622,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
}
}
-int
-r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
- struct stripe_head_state *s)
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
int pages = 0;
@@ -2785,6 +2785,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r5l_log *log;
+ char b[BDEVNAME_SIZE];
+
+ pr_debug("md/raid:%s: using device %s as journal\n",
+ mdname(conf->mddev), bdevname(rdev->bdev, b));
if (PAGE_SIZE != 4096)
return -EINVAL;
@@ -2887,7 +2891,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
return -EINVAL;
}
-void r5l_exit_log(struct r5l_log *log)
+static void __r5l_exit_log(struct r5l_log *log)
{
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
@@ -2897,3 +2901,13 @@ void r5l_exit_log(struct r5l_log *log)
kmem_cache_destroy(log->io_kc);
kfree(log);
}
+
+void r5l_exit_log(struct r5conf *conf)
+{
+ struct r5l_log *log = conf->log;
+
+ conf->log = NULL;
+ synchronize_rcu();
+
+ __r5l_exit_log(log);
+}
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
new file mode 100644
index 000000000000..2da4bd3bbd79
--- /dev/null
+++ b/drivers/md/raid5-log.h
@@ -0,0 +1,81 @@
+#ifndef _RAID5_LOG_H
+#define _RAID5_LOG_H
+
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5conf *conf);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks);
+extern void
+r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s);
+extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+ struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+extern void r5c_make_stripe_write_out(struct stripe_head *sh);
+extern void r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+
+static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s)
+{
+ struct r5conf *conf = sh->raid_conf;
+
+ if (conf->log) {
+ if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+ /* writing out phase */
+ if (s->waiting_extra_page)
+ return 0;
+ return r5l_write_stripe(conf->log, sh);
+ } else if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+ /* caching phase */
+ return r5c_cache_data(conf->log, sh);
+ }
+ }
+
+ return -EAGAIN;
+}
+
+static inline void log_stripe_write_finished(struct stripe_head *sh)
+{
+ struct r5conf *conf = sh->raid_conf;
+
+ if (conf->log)
+ r5l_stripe_write_finished(sh);
+}
+
+static inline void log_write_stripe_run(struct r5conf *conf)
+{
+ if (conf->log)
+ r5l_write_stripe_run(conf->log);
+}
+
+static inline void log_exit(struct r5conf *conf)
+{
+ if (conf->log)
+ r5l_exit_log(conf);
+}
+
+static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
+{
+ if (journal_dev)
+ return r5l_init_log(conf, journal_dev);
+
+ return 0;
+}
+
+#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5d9148125ec5..043a509560c2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -61,6 +61,7 @@
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
+#include "raid5-log.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
@@ -911,18 +912,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
- if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
- /* writing out phase */
- if (s->waiting_extra_page)
- return;
- if (r5l_write_stripe(conf->log, sh) == 0)
- return;
- } else { /* caching phase */
- if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
- r5c_cache_data(conf->log, sh, s);
- return;
- }
- }
+ if (log_stripe(sh, s) == 0)
+ return;
for (i = disks; i--; ) {
int op, op_flags = 0;
@@ -3247,7 +3238,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -3666,7 +3657,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
discard_pending = 1;
}
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -4656,7 +4647,7 @@ static void handle_stripe(struct stripe_head *sh)
if (s.just_cached)
r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
@@ -6057,7 +6048,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
- r5l_write_stripe_run(conf->log);
+ log_write_stripe_run(conf);
cond_resched();
@@ -6633,8 +6624,8 @@ static void free_conf(struct r5conf *conf)
{
int i;
- if (conf->log)
- r5l_exit_log(conf->log);
+ log_exit(conf);
+
if (conf->shrinker.nr_deferred)
unregister_shrinker(&conf->shrinker);
@@ -7315,14 +7306,8 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
- if (journal_dev) {
- char b[BDEVNAME_SIZE];
-
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(journal_dev->bdev, b));
- if (r5l_init_log(conf, journal_dev))
- goto abort;
- }
+ if (log_init(conf, journal_dev))
+ goto abort;
return 0;
abort:
@@ -7436,17 +7421,13 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
- struct r5l_log *log;
/*
* we can't wait pending write here, as this is called in
* raid5d, wait will deadlock.
*/
if (atomic_read(&mddev->writes_pending))
return -EBUSY;
- log = conf->log;
- conf->log = NULL;
- synchronize_rcu();
- r5l_exit_log(log);
+ log_exit(conf);
return 0;
}
if (rdev == p->rdev)
@@ -7515,7 +7496,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags)) {
- char b[BDEVNAME_SIZE];
if (conf->log)
return -EBUSY;
@@ -7524,9 +7504,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- r5l_init_log(conf, rdev);
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(rdev->bdev, b));
+ log_init(conf, rdev);
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4bb27b97bf6b..749c6c496e7d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -765,34 +765,4 @@ extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
-extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
#endif
--
2.11.0
^ permalink raw reply related
* [PATCH v5 1/7] md: superblock changes for PPL
From: Artur Paszkiewicz @ 2017-03-09 8:59 UTC (permalink / raw)
To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>
Include information about PPL location and size into mdp_superblock_1
and copy it to/from rdev. Because PPL is mutually exclusive with bitmap,
put it in place of 'bitmap_offset'. Add a new flag MD_FEATURE_PPL for
'feature_map', analogously to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL
to mddev->flags to indicate that PPL is enabled on an array.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/md.c | 19 +++++++++++++++++++
drivers/md/md.h | 8 ++++++++
drivers/md/raid0.c | 3 ++-
drivers/md/raid1.c | 3 ++-
include/uapi/linux/raid/md_p.h | 18 ++++++++++++++----
5 files changed, 45 insertions(+), 6 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 79a99a1c9ce7..173550455c42 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1506,6 +1506,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
} else if (sb->bblog_offset != 0)
rdev->badblocks.shift = 0;
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
+ rdev->ppl.size = le16_to_cpu(sb->ppl.size);
+ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
+ }
+
if (!refdev) {
ret = 1;
} else {
@@ -1618,6 +1624,13 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
+
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ if (le32_to_cpu(sb->feature_map) &
+ (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
+ return -EINVAL;
+ set_bit(MD_HAS_PPL, &mddev->flags);
+ }
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
* spares (which don't need an event count) */
@@ -1831,6 +1844,12 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
+ if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+ sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
+ sb->ppl.size = cpu_to_le16(rdev->ppl.size);
+ }
+
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c00160b09f9..a7b2f16452c4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,6 +122,13 @@ struct md_rdev {
* sysfs entry */
struct badblocks badblocks;
+
+ struct {
+ short offset; /* Offset from superblock to start of PPL.
+ * Not used by external metadata. */
+ unsigned int size; /* Size in sectors of the PPL space */
+ sector_t sector; /* First sector of the PPL space */
+ } ppl;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
@@ -226,6 +233,7 @@ enum mddev_flags {
* supported as calls to md_error() will
* never cause the array to become failed.
*/
+ MD_HAS_PPL, /* The raid array has PPL feature set */
};
enum mddev_sb_flags {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 93347ca7c7a6..56f70c3ad37c 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,7 +29,8 @@
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
- (1L << MD_FAILFAST_SUPPORTED))
+ (1L << MD_FAILFAST_SUPPORTED) |\
+ (1L << MD_HAS_PPL))
static int raid0_congested(struct mddev *mddev, int bits)
{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3c5933b1d8fb..31c7df2859a6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -44,7 +44,8 @@
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
- (1L << MD_JOURNAL_CLEAN))
+ (1L << MD_JOURNAL_CLEAN) | \
+ (1L << MD_HAS_PPL))
/*
* Number of guaranteed r1bios in case of extreme VM load:
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 9930f3e9040f..fe2112810c43 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -242,10 +242,18 @@ struct mdp_superblock_1 {
__le32 chunksize; /* in 512byte sectors */
__le32 raid_disks;
- __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
- * NOTE: signed, so bitmap can be before superblock
- * only meaningful of feature_map[0] is set.
- */
+ union {
+ __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
+ * NOTE: signed, so bitmap can be before superblock
+ * only meaningful of feature_map[0] is set.
+ */
+
+ /* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+ struct {
+ __le16 offset; /* sectors from start of superblock that ppl starts (signed) */
+ __le16 size; /* ppl size in sectors */
+ } ppl;
+ };
/* These are only valid with feature bit '4' */
__le32 new_level; /* new level we are reshaping to */
@@ -318,6 +326,7 @@ struct mdp_superblock_1 {
*/
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
+#define MD_FEATURE_PPL 1024 /* support PPL */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@@ -328,6 +337,7 @@ struct mdp_superblock_1 {
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
+ |MD_FEATURE_PPL \
)
struct r5l_payload_header {
--
2.11.0
^ permalink raw reply related
* [PATCH v5 0/7] Partial Parity Log for MD RAID 5
From: Artur Paszkiewicz @ 2017-03-09 8:59 UTC (permalink / raw)
To: shli; +Cc: linux-raid, Artur Paszkiewicz
This series of patches implements the Partial Parity Log for RAID5 arrays. The
purpose of this feature is closing the RAID 5 Write Hole. It is an alternative
solution to the existing raid5-cache, but the logging workflow and much of
the implementation are based on it.
The main difference compared to raid5-cache is that PPL is a distributed log -
it is stored on array member drives in the metadata area and does not require a
dedicated journaling drive. Write performance is reduced by up to 30%-40% but
it scales with the number of drives in the array and the journaling drive does
not become a bottleneck or a single point of failure. PPL does not protect from
losing in-flight data, only from silent data corruption. More details about how
the log works can be found in patches 3 and 5.
This feature originated from Intel RSTe, which uses IMSM metadata. PPL for IMSM
is going to be included in RSTe implementations starting with upcoming Xeon
platforms and Intel will continue supporting and maintaining it. This patchset
implements PPL for external metadata (specifically IMSM) as well as native MD
v1.x metadata.
Changes in mdadm are also required to make this fully usable. Patches for mdadm
will be sent later.
v5:
- Added a common raid5-cache and ppl interface in raid5-log.h.
- Moved ops_run_partial_parity() to raid5-ppl.c.
- Use an inline bio in struct ppl_io_unit, simplify ppl_submit_iounit() and fix
a potential bio allocation issue.
- Simplified condition for appending a stripe_head to ppl entry in
ppl_log_stripe().
- Flush disk cache after ppl recovery, write with FUA in
ppl_write_empty_header().
- Removed order > 0 page allocation in ppl_recover_entry().
- Put r5l_io_unit and ppl_io_unit in a union in struct stripe_head.
- struct ppl_conf *ppl in struct r5conf replaced with void *log_private.
- Improved comments and descriptions.
v4:
- Separated raid5-cache and ppl structures.
- Removed the policy logic from raid5-cache, ppl calls moved to raid5 core.
- Checking wrong configuration when validating superblock.
- Moved documentation to separate file.
- More checks for ppl sector/size.
- Some small fixes and improvements.
v3:
- Fixed alignment issues in the metadata structures.
- Removed reading IMSM signature from superblock.
- Removed 'rwh_policy' and per-device JournalPpl flags, added
'consistency_policy', 'ppl_sector' and 'ppl_size' sysfs attributes.
- Reworked and simplified disk removal logic.
- Debug messages in raid5-ppl.c converted to pr_debug().
- Fixed some bugs in logging and recovery code.
- Improved descriptions and documentation.
v2:
- Fixed wrong PPL size calculation for IMSM.
- Simplified full stripe write case.
- Removed direct access to bi_io_vec.
- Handle failed bio_add_page().
Artur Paszkiewicz (7):
md: superblock changes for PPL
raid5: separate header for log functions
raid5-ppl: Partial Parity Log write logging implementation
md: add sysfs entries for PPL
raid5-ppl: load and recover the log
raid5-ppl: support disk hot add/remove with PPL
raid5-ppl: runtime PPL enabling or disabling
Documentation/admin-guide/md.rst | 32 +-
Documentation/md/raid5-ppl.txt | 44 ++
drivers/md/Makefile | 2 +-
drivers/md/md.c | 140 +++++
drivers/md/md.h | 10 +
drivers/md/raid0.c | 3 +-
drivers/md/raid1.c | 3 +-
drivers/md/raid5-cache.c | 22 +-
drivers/md/raid5-log.h | 114 ++++
drivers/md/raid5-ppl.c | 1247 ++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 182 ++++--
drivers/md/raid5.h | 40 +-
include/uapi/linux/raid/md_p.h | 45 +-
13 files changed, 1799 insertions(+), 85 deletions(-)
create mode 100644 Documentation/md/raid5-ppl.txt
create mode 100644 drivers/md/raid5-log.h
create mode 100644 drivers/md/raid5-ppl.c
--
2.11.0
^ permalink raw reply
* Re: [PATCH 22/29] drivers, scsi: convert iscsi_task.refcount from atomic_t to refcount_t
From: Johannes Thumshirn @ 2017-03-09 8:43 UTC (permalink / raw)
To: Reshetova, Elena, Chris Leech
Cc: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
linux-bcache-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-raid-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
fcoe-devel-s9riP+hp16TNLxjTenLetw@public.gmane.org,
linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
open-iscsi-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org,
devel-gWbeCf7V1WCQmaza687I9g
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C569DC-kPTMFJFq+rFP9JyJpTNKArfspsVTdybXVpNB7YpNyf8@public.gmane.org>
On 03/09/2017 08:18 AM, Reshetova, Elena wrote:
>> On Mon, Mar 06, 2017 at 04:21:09PM +0200, Elena Reshetova wrote:
>>> refcount_t type and corresponding API should be
>>> used instead of atomic_t when the variable is used as
>>> a reference counter. This allows to avoid accidental
>>> refcounter overflows that might lead to use-after-free
>>> situations.
>>>
>>> Signed-off-by: Elena Reshetova <elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>>> Signed-off-by: Hans Liljestrand <ishkamiel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>> Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
>>> Signed-off-by: David Windsor <dwindsor-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>
>> This looks OK to me.
>>
>> Acked-by: Chris Leech <cleech-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>
> Thank you for review! Do you have a tree that can take this change?
Hi Elena,
iscsi like fcoe should go via the SCSI tree.
Byte,
Johannes
--
Johannes Thumshirn Storage
jthumshirn-l3A5Bk7waGM@public.gmane.org +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
--
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.
^ permalink raw reply
* Re: interesting case of a hung 'recovery'
From: Eyal Lebedinsky @ 2017-03-09 7:39 UTC (permalink / raw)
To: linux-raid@vger.kernel.org
In-Reply-To: <091a6513-1594-4a33-691a-cd1f7920d4a0@eyal.emu.id.au>
Bump.
On 18/02/17 23:14, Eyal Lebedinsky wrote:
> I should start by saying that this is an old fedora 19 system
>
> Executive summary: after '--add'ing a new member a 'recovery' starts but 'sync_max' is not reset.
>
> $ uname -a
> Linux e7.eyal.emu.id.au 3.14.27-100.fc19.x86_64 #1 SMP Wed Dec 17 19:36:34 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
$ sudo mdadm --version
mdadm - v4.0 - 2017-01-09
> so the issue may have been fixed since.
>
> I had a disk fail in a raid6. After some 'pending' sectors were logged I decided to do a 'check'
> around that location (set sync_min/max and echo 'check'). Sure enough it elicited disk errors,
> but the disk did not recover and it was kicked out of the array. Moreover it became unresponsive.
> It needed a power cycle so I shutdown and rebooted the machine.
>
> Not one to give up easily I tried the check again, with the same result.
> It was time to '--remove' this array member. I then '--add'ed a new disk which started a recovery.
>
> A few hours later I noticed that it slowed down. A lot. It actually did not progress at all for
> a few hours (I was away from the machine).
>
> As I was staring at the screen (for a long while) I realised that it stopped at 55.5%, and this
> number is exactly where the original 'check' failed (I still do not understand why with my bad
> memory I remembered this number).
>
> I checked 'sync_completed' and it was proper.
> I then examined 'sync_max' and it was wrong - it had the location where the very early 'check'
> failed earlier in the day. It was the same sector where it is now paused at - looks related.
>
> I decided to take a (small) risk and do
> # echo 'max' >/sys/block/md127/md/sync_max
> at which point the recovery moved on. It should be finished in about 5 hours.
>
> I do not think that it is correct for 'sync_max' to not be set to 'max' when a new member is
> added - it surely requires a full recovery.
>
> I really hope (and expect) that this was actually fixed, but this note may help others facing
> same predicament.
>
> cheers
>
--
Eyal Lebedinsky (eyal@eyal.emu.id.au)
^ permalink raw reply
* RE: [Xen-devel] [PATCH 29/29] drivers, xen: convert grant_map.users from atomic_t to refcount_t
From: Reshetova, Elena @ 2017-03-09 7:19 UTC (permalink / raw)
To: Boris Ostrovsky, gregkh@linuxfoundation.org
Cc: peterz@infradead.org, linux-pci@vger.kernel.org,
target-devel@vger.kernel.org,
linux1394-devel@lists.sourceforge.net, devel@driverdev.osuosl.org,
linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
linux-serial@vger.kernel.org, fcoe-devel@open-fcoe.org,
Hans Liljestrand, open-iscsi@googlegroups.com,
linux-media@vger.kernel.org, Kees Cook,
linux-raid@vger.kernel.org, linux-bcache@vger.kernel.org,
xen-devel
In-Reply-To: <c4ea3925-f505-3c5b-a9fc-c74ea5a7cbe9@oracle.com>
> On 03/08/2017 08:49 AM, Reshetova, Elena wrote:
> >> On 03/06/2017 09:21 AM, Elena Reshetova wrote:
> >>> refcount_t type and corresponding API should be
> >>> used instead of atomic_t when the variable is used as
> >>> a reference counter. This allows to avoid accidental
> >>> refcounter overflows that might lead to use-after-free
> >>> situations.
> >>>
> >>> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> >>> Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> >>> Signed-off-by: Kees Cook <keescook@chromium.org>
> >>> Signed-off-by: David Windsor <dwindsor@gmail.com>
> >>> ---
> >>> drivers/xen/gntdev.c | 11 ++++++-----
> >>> 1 file changed, 6 insertions(+), 5 deletions(-)
> >> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> > Is there a tree that can take this change? Turns out it is better to propagate
> changes via separate trees and only leftovers can be taken via Greg's tree.
> >
>
> Sure, we can take it via Xen tree for rc3.
Thank you very much!
Best Regards,
Elena.
>
> -boris
^ permalink raw reply
* RE: [PATCH 22/29] drivers, scsi: convert iscsi_task.refcount from atomic_t to refcount_t
From: Reshetova, Elena @ 2017-03-09 7:18 UTC (permalink / raw)
To: Chris Leech
Cc: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
linux-bcache-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-raid-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
fcoe-devel-s9riP+hp16TNLxjTenLetw@public.gmane.org,
linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
open-iscsi-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org,
devel-gWbeCf7V1WCQmaza687I9uG/Ez6ZCGd0
In-Reply-To: <20170308184740.4gueok5csdkt7u62-r8IHplWLGbA5tHQWs+pTeqPFFGjUI2lm2LY78lusg7I@public.gmane.org>
> On Mon, Mar 06, 2017 at 04:21:09PM +0200, Elena Reshetova wrote:
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
> >
> > Signed-off-by: Elena Reshetova <elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > Signed-off-by: Hans Liljestrand <ishkamiel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> > Signed-off-by: David Windsor <dwindsor-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>
> This looks OK to me.
>
> Acked-by: Chris Leech <cleech-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Thank you for review! Do you have a tree that can take this change?
Best Regards,
Elena.
>
> > ---
> > drivers/scsi/libiscsi.c | 8 ++++----
> > drivers/scsi/qedi/qedi_iscsi.c | 2 +-
> > include/scsi/libiscsi.h | 3 ++-
> > 3 files changed, 7 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
> > index 834d121..7eb1d2c 100644
> > --- a/drivers/scsi/libiscsi.c
> > +++ b/drivers/scsi/libiscsi.c
> > @@ -516,13 +516,13 @@ static void iscsi_free_task(struct iscsi_task *task)
> >
> > void __iscsi_get_task(struct iscsi_task *task)
> > {
> > - atomic_inc(&task->refcount);
> > + refcount_inc(&task->refcount);
> > }
> > EXPORT_SYMBOL_GPL(__iscsi_get_task);
> >
> > void __iscsi_put_task(struct iscsi_task *task)
> > {
> > - if (atomic_dec_and_test(&task->refcount))
> > + if (refcount_dec_and_test(&task->refcount))
> > iscsi_free_task(task);
> > }
> > EXPORT_SYMBOL_GPL(__iscsi_put_task);
> > @@ -744,7 +744,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct
> iscsi_hdr *hdr,
> > * released by the lld when it has transmitted the task for
> > * pdus we do not expect a response for.
> > */
> > - atomic_set(&task->refcount, 1);
> > + refcount_set(&task->refcount, 1);
> > task->conn = conn;
> > task->sc = NULL;
> > INIT_LIST_HEAD(&task->running);
> > @@ -1616,7 +1616,7 @@ static inline struct iscsi_task *iscsi_alloc_task(struct
> iscsi_conn *conn,
> > sc->SCp.phase = conn->session->age;
> > sc->SCp.ptr = (char *) task;
> >
> > - atomic_set(&task->refcount, 1);
> > + refcount_set(&task->refcount, 1);
> > task->state = ISCSI_TASK_PENDING;
> > task->conn = conn;
> > task->sc = sc;
> > diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
> > index b9f79d3..3895bd5 100644
> > --- a/drivers/scsi/qedi/qedi_iscsi.c
> > +++ b/drivers/scsi/qedi/qedi_iscsi.c
> > @@ -1372,7 +1372,7 @@ static void qedi_cleanup_task(struct iscsi_task *task)
> > {
> > if (!task->sc || task->state == ISCSI_TASK_PENDING) {
> > QEDI_INFO(NULL, QEDI_LOG_IO, "Returning
> ref_cnt=%d\n",
> > - atomic_read(&task->refcount));
> > + refcount_read(&task->refcount));
> > return;
> > }
> >
> > diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
> > index b0e275d..24d74b5 100644
> > --- a/include/scsi/libiscsi.h
> > +++ b/include/scsi/libiscsi.h
> > @@ -29,6 +29,7 @@
> > #include <linux/timer.h>
> > #include <linux/workqueue.h>
> > #include <linux/kfifo.h>
> > +#include <linux/refcount.h>
> > #include <scsi/iscsi_proto.h>
> > #include <scsi/iscsi_if.h>
> > #include <scsi/scsi_transport_iscsi.h>
> > @@ -139,7 +140,7 @@ struct iscsi_task {
> >
> > /* state set/tested under session->lock */
> > int state;
> > - atomic_t refcount;
> > + refcount_t refcount;
> > struct list_head running; /* running cmd list */
> > void *dd_data; /*
> driver/transport data */
> > };
> > --
> > 2.7.4
> >
--
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.
^ permalink raw reply
* Re: Auto replace disk
From: Edward Kuns @ 2017-03-09 2:08 UTC (permalink / raw)
To: Gandalf Corvotempesta; +Cc: Wols Lists, Linux-RAID
In-Reply-To: <CAJH6TXjjjnQ3_OJ87Gv8Spsd=7BZ7RGrfCJ0kqeRXTQN_1Q3KQ@mail.gmail.com>
On Wed, Mar 8, 2017 at 3:32 PM, Gandalf Corvotempesta
<gandalf.corvotempesta@gmail.com> wrote:
>> The last resort is to remove the broken drive and then replace it - this is
>> likely to trigger further failures and bring down the array.
>
> Why ? I've removed many, many, many disks before with no issue.
> Why removing a disk should bring the whole array down? This seems a bug to me.
In a perfect world where you do scrubbing and your timeouts are
properly configured and your other disks are all in good shape, you're
right. In the real world where people often don't do that, where
multiple bad sectors accumulate on multiple disks before you get a
failure, or where you're just unlucky, a rebuild triggered by removing
a bad disk and adding back a new one has a fair chance of experiencing
a failure during the rebuild.
If you've lost all redundancy due to a disk failure and while doing a
rebuild you experience another read failure or a second disk failure,
whether due to negligence or bad luck, you can lose the whole array.
If you have the option of doing so, therefore, better to --replace
while you still have redundancy.
Eddie
^ permalink raw reply
* Re: Auto replace disk
From: Brad Campbell @ 2017-03-09 1:31 UTC (permalink / raw)
To: Gandalf Corvotempesta, Wols Lists; +Cc: linux-raid
In-Reply-To: <CAJH6TXjjjnQ3_OJ87Gv8Spsd=7BZ7RGrfCJ0kqeRXTQN_1Q3KQ@mail.gmail.com>
On 09/03/17 05:32, Gandalf Corvotempesta wrote:
> Why ? I've removed many, many, many disks before with no issue.
> Why removing a disk should bring the whole array down? This seems a bug to me.
> If a disk will crash, the effect is the same as removing from the slot
> and RAID is
> meant to protect against this kind of failures.
In general a good number of "help me my RAID is dead" requests that hit
this list are due to not performing routine array or drive scrubs. So
one drive dies, and one of the others has a previously unknown bad
sector. When you put the new drive in, during the rebuild the bad sector
is hit and the whole array comes tumbling down.
Doing a proactive replacement reduces the possibility of this occurring.
Having said that, if your disk is dead then there's no other option
anyway. Regular array scrubs go a long way to mitigating this risk, but
it does happen frequently enough that you need to be warned against it.
Just because it hasn't happened to you *yet* does not mean you're
immune, and it's certainly not a *bug*.
^ permalink raw reply
* Re: Auto replace disk
From: Gandalf Corvotempesta @ 2017-03-08 21:32 UTC (permalink / raw)
To: Wols Lists; +Cc: linux-raid
In-Reply-To: <58C04AC9.9070801@youngman.org.uk>
2017-03-08 19:17 GMT+01:00 Wols Lists <antlists@youngman.org.uk>:
> Do you mean you remove an old disk, and put a new blank disk in?
Yes
> If that's what you mean, then no, it's not possible. mdadm doesn't have
> a clue about disks, what it sees is "block devices".
Ok but mdadm.conf man page seems to say the opposite:
https://linux.die.net/man/5/mdadm.conf
"POLICY
This is used to specify what automatic behavior is allowed on devices
newly appearing in the system and provides a way of marking spares
that can be moved to other arrays as well as the migration domains.
action=include, re-add, spare, spare-same-slot, or force-spare
auto= yes, no, or homehost.
The action item determines the automatic behavior allowed for devices
matching the path and type in the same line. If a device matches
several lines with different actions then the most permissive will
apply. The ordering of policy lines is irrelevant to the end result.
include: allows adding a disk to an array if metadata on that disk
matches that array
re-add: will include the device in the array if it appears to be a
current member or a member that was recently removed
spare: as above and additionally: if the device is bare it can
become a spare if there is any array that it is a candidate for based
on domains and metadata.
spare-same-slot: as above and additionally if given slot was used by
an array that went degraded recently and the device plugged in has no
metadata then it will be automatically added to that array (or its
container)
force-spare: as above and the disk will become a spare in remaining
cases
"
> You should not - if you can help it - ever remove a disk and then
> replace it. Yes in practice I know that's a luxury people often don't
> have ... at best you should have spares configured
If you have a server with only 4 slots configured in a RAID10,
this workflow would be impossible.
> if you have to you
> put the new drive in, use --replace, and then remove the old one. The
> last resort is to remove the broken drive and then replace it - this is
> likely to trigger further failures and bring down the array.
Why ? I've removed many, many, many disks before with no issue.
Why removing a disk should bring the whole array down? This seems a bug to me.
If a disk will crash, the effect is the same as removing from the slot
and RAID is
meant to protect against this kind of failures.
^ permalink raw reply
* RE: LSI RAID
From: Jared.Dominguez @ 2017-03-08 20:20 UTC (permalink / raw)
To: gandalf.corvotempesta, hare; +Cc: linux-raid
In-Reply-To: <CAJH6TXg1OPV42BUATcZ1T9b-LkV-LWVuKk6=HXAcM0J=GxC2VA@mail.gmail.com>
Dell Customer Communication
> 2017-02-28 10:44 GMT+01:00 Gandalf Corvotempesta
> <gandalf.corvotempesta@gmail.com>:
> > 2017-02-28 10:06 GMT+01:00 Hannes Reinecke <hare@suse.de>:
> >> Sure.
> >> The recent mdadm should be able to create DDF metadata.
> >
> > This means that i'll be able to import a configuration created with a
> > LSI MegaRaid controller and use them with mdadm ?
> > If yes, how ?
Support for DDF doesn't imply ability to use another vendor's DDF array. DDF allows for extensive vendor-specific metadata, which would impact the ability to import, say, a Broadcom array.
^ permalink raw reply
* Re: [RAID recovery] Unable to recover RAID5 array after disk failure
From: Olivier Swinkels @ 2017-03-08 19:01 UTC (permalink / raw)
To: Phil Turmel; +Cc: linux-raid
In-Reply-To: <f0536882-3a0b-b43c-317d-691d82c6923f@turmel.org>
On Tue, Mar 7, 2017 at 3:52 PM, Phil Turmel <philip@turmel.org> wrote:
> On 03/07/2017 03:39 AM, Olivier Swinkels wrote:
>
>> After I used the pvcreate command to recreate pv the vgcfgrestore
>> command succeeds and the lvm is available (after activating).
>>
>> However when I try to mount it I get the following error: sudo mount
>> -t ext4 /dev/lvm-raid/lvm0 /mnt/raid mount: mount
>> /dev/mapper/lvm--raid-lvm0 on /mnt/raid failed: Structure needs
>> cleaning
>>
>> So I guess the underlying RAID array is still not ok...
>
> No, your underlying array is very likely correct. But the intervening
> incorrect --create operation stomped on your filesystems. Run fsck
> while unmounted to deal with the corruption and recover what you can.
>
> Run fsck with "-n" first, to see just how extensive the problems are,
> then with "-y" to actually fix things. Based on your sequence of
> events, your corruptions should be at low sector addresses (first few
> Gigs) of your array. If that's what appears with "-n", proceed.
>
> If you are unlucky, the stompage hit one or more of your filesystems'
> superblocks, requiring access to backup superblocks. If you still see
> no progress with either of the above, you might need to search your
> array for ext2/3/4 superblocks. This grep would help:
>
> dd if=/dev/md0 bs=1M count=16k 2>/dev/null |hexdump -C |grep '30 .\+ 53 ef 0'
>
> (Not all hits from the grep will be superblocks, but they would be
> visually distinguishable, and would have decipherable timestamps.)
>
> Phil
Hi,
I ran fsck -n disk on the lvm and got the very large response below.
This didn't look promising, so I ran the ext2/3/4 superblock search.
I didn't recognize any obvious timestamps, but I'm not sure what to look for.
(I also pasted this output below)
Can you recommend any further action?
Olivier
===============================================================================
fsck /dev/lvm-raid/lvm0
fsck from util-linux 2.27.1
ext2fs_check_desc: Corrupt group descriptor: bad block for block bitmap
fsck.ext4: Group descriptors look bad... trying backup blocks...
Warning: skipping journal recovery because doing a read-only filesystem check.
/dev/mapper/lvm--raid-lvm0 has gone 1275 days without being checked,
check forced.
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
Free blocks count wrong for group #0 (23513, counted=5372).
Fix? no
Free blocks count wrong for group #1 (980, counted=485).
Fix? no
Free blocks count wrong for group #3 (1023, counted=151).
Fix? no
Free blocks count wrong for group #5 (1022, counted=142).
Fix? no
<<TRIMMED ~200000 lines >>>
Free inodes count wrong for group #60160 (8192, counted=8163).
Fix? no
Directories count wrong for group #60160 (0, counted=1).
Fix? no
Free inodes count wrong for group #60161 (8192, counted=8191).
Fix? no
Directories count wrong for group #60161 (0, counted=1).
Fix? no
Free inodes count wrong (488172170, counted=609035768).
Fix? no
/dev/mapper/lvm--raid-lvm0: ********** WARNING: Filesystem still has
errors **********
/dev/mapper/lvm--raid-lvm0: 122295670/610467840 files (0.0%
non-contiguous), 2291846165/2441871360 blocks
===============================================================================
dd if=/dev/md0 bs=1M count=16k 2>/dev/null |hexdump -C |grep '30 .\+ 53 ef 0'
00040430 3a 3b ab 58 17 00 19 00 53 ef 01 00 01 00 00 00 |:;.X....S.......|
08040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
18040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
1d220e30 2e 93 9c 90 c1 89 27 62 53 ef 03 12 d4 82 36 76 |......'bS.....6v|
27340430 bb 78 c4 e3 9e 56 62 0f 53 ef 04 b9 38 0c d0 26 |.x...Vb.S...8..&|
28040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
2f02ab30 5e d0 c9 98 83 ce 3b 92 53 ef 08 51 c4 4b dc af |^.....;.S..Q.K..|
38040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
44318c30 55 c1 19 f3 10 fb ab 2f 53 ef 04 0d fd c1 dc ed |U....../S.......|
48040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
50791730 6b 04 06 e8 97 88 9b 08 53 ef 0b 24 21 68 3d a5 |k.......S..$!h=.|
5aab3030 db 52 8d 5c 82 f4 80 cd 53 ef 08 4c f7 a7 c7 a9 |.R.\....S..L....|
98b69330 41 07 bb 64 c2 4a 00 5a 53 ef 08 71 88 d2 5a 42 |A..d.J.ZS..q..ZB|
c8040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
cefc4130 14 37 b9 ef 1f 89 14 ab 53 ef 02 1c ca 88 f0 c4 |.7......S.......|
d8040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
fe124d30 8a 6f 46 db 63 9b c5 9b 53 ef 07 97 74 24 2d e0 |.oF.c...S...t$-.|
106432f30 76 d0 fc 66 02 06 e9 50 53 ef 02 44 6c ab 89 ac |v..f...PS..Dl...|
111e92630 f8 59 df b2 af 5d 6b 8b 53 ef 0d 65 c6 29 66 0c |.Y...]k.S..e.)f.|
12883be30 88 63 b9 3a 6a 52 03 a6 53 ef 0c 12 67 dc ad 9e |.c.:jR..S...g...|
14d315530 30 92 ef 27 a0 dc 46 dd 53 ef 09 d7 89 5a ba 95 |0..'..F.S....Z..|
17222e430 aa 7d 4f 4a c5 88 fa f0 53 ef 0c 08 04 10 9e ad |.}OJ....S.......|
173717c30 3d fd 91 37 5a 0e 7b aa 53 ef 0b 08 d4 51 89 fa |=..7Z.{.S....Q..|
17665ae30 c9 a8 24 d6 ba 0a 50 f7 53 ef 08 13 da d7 52 0a |..$...P.S.....R.|
188040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
18ef3be30 41 c3 91 02 65 af 8d 27 53 ef 0c 1f 4b 47 f5 97 |A...e..'S...KG..|
1943f1d30 d6 87 e0 d7 c1 3b e7 53 53 ef 08 a5 88 14 b6 c8 |.....;.SS.......|
19bfba930 c3 e4 07 04 b4 d1 05 4a 53 ef 02 c4 69 2f e2 d0 |.......JS...i/..|
1a807c530 a1 12 db b0 77 9d c5 6c 53 ef 0d 6e 37 05 74 61 |....w..lS..n7.ta|
1b1976330 1a 2b 57 56 46 96 c2 9b 53 ef 03 78 0d 9e 4a e0 |.+WVF...S..x..J.|
1ebf5db30 1e 76 bf 9d 25 8f fd 16 53 ef 04 ba 23 14 2f 8d |.v..%...S...#./.|
1eec8b930 d5 dc 1e 90 b1 c8 f4 31 53 ef 06 10 3c 81 fb 37 |.......1S...<..7|
1f1392930 89 db 6c 92 6c 10 a5 f9 53 ef 05 31 0e 04 a2 d0 |..l.l...S..1....|
227cdff30 8d 8a 0a 61 91 04 c4 15 53 ef 03 5b 28 c4 52 59 |...a....S..[(.RY|
23000da30 1d e6 34 ca 2f 36 08 78 53 ef 0e 28 e4 94 5f 42 |..4./6.xS..(.._B|
245749e30 83 67 b7 d0 8b 40 0d f0 53 ef 0e ec 13 37 39 89 |.g...@..S....79.|
24f40e530 86 3b fc 43 ea ef 81 29 53 ef 09 6c fb 08 3b 6d |.;.C...)S..l..;m|
25a473430 94 24 8d 8c 6b 15 7b 8b 53 ef 00 ae 43 b2 64 a4 |.$..k.{.S...C.d.|
288040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
29966b530 00 ad 14 71 ea c5 c6 7a 53 ef 0c 11 d2 57 fb a0 |...q...zS....W..|
2a5924830 98 43 57 9b 2f 98 18 4f 53 ef 0d 27 72 c3 58 ba |.CW./..OS..'r.X.|
2a7bf9b30 72 f2 a7 7c 73 4f 2f a6 53 ef 02 14 5b 7c 93 9e |r..|sO/.S...[|..|
2aae14530 25 6b d5 ab b0 48 94 cd 53 ef 06 08 41 e0 0b ba |%k...H..S...A...|
2ac470a30 a4 a6 cb a0 40 a2 8f cd 53 ef 06 47 3d d5 e0 38 |....@...S..G=..8|
2aca3fb30 99 e4 1a cd b1 a8 be 6c 53 ef 01 03 68 6f 4c f4 |.......lS...hoL.|
2b19b6030 30 64 6a d9 08 f0 46 c9 53 ef 07 95 b2 14 fb aa |0dj...F.S.......|
2c53e0a30 59 1b 4b c8 25 a1 56 d9 53 ef 0b 26 3a 76 72 a5 |Y.K.%.V.S..&:vr.|
2cb48d730 0a d2 b4 d5 40 fc c5 d0 53 ef 0b 6d 90 9b db 22 |....@...S..m..."|
2dc925b30 16 18 5b 30 09 18 f8 2e 53 ef 0a d1 00 74 13 e7 |..[0....S....t..|
2dcc76230 e9 75 d0 2e 92 45 59 2c 53 ef 0e 84 63 9a aa b6 |.u...EY,S...c...|
2e6d28730 f4 81 47 6a 8f 9c ae 1a 53 ef 09 9d d2 5b e4 35 |..Gj....S....[.5|
2f9639f30 82 67 ae 49 2c a1 fc c0 53 ef 07 b1 1f 54 c5 c9 |.g.I,...S....T..|
2fa204230 78 42 16 12 e9 83 72 88 53 ef 00 08 df d9 8b 2a |xB....r.S......*|
31b7c1030 99 f0 43 99 1f 52 77 17 53 ef 0c 5f b9 51 a1 42 |..C..Rw.S.._.Q.B|
321ad8a30 4e 9b 9e 2e c6 8e 19 8d 53 ef 0e 1d 5d 20 c0 9d |N.......S...] ..|
32ff61630 72 9f 28 d2 9a 35 79 63 53 ef 09 e6 d6 e3 27 c5 |r.(..5ycS.....'.|
33a4b8230 58 9d 65 cb da 31 07 e7 53 ef 04 07 e2 4a 9b 17 |X.e..1..S....J..|
3472a7230 b4 a9 fa cf 3d 39 c3 95 53 ef 03 a4 07 2a ac 9a |....=9..S....*..|
3534f5130 54 d2 19 77 a4 c2 4a 2f 53 ef 07 b3 0f 60 be ee |T..w..J/S....`..|
356145130 df f7 f8 eb 6b 2a 8b fa 53 ef 09 cf 9d cd db 68 |....k*..S......h|
361b68830 53 ab 0c 7a 8e 4e 96 1b 53 ef 0f 3b 78 e9 d5 ce |S..z.N..S..;x...|
37381e630 24 63 4a ce 1b eb b0 df 53 ef 02 68 54 7d 7a ad |$cJ.....S..hT}z.|
375103330 3a 20 84 18 b6 6d f5 3b 53 ef 0a 86 8f ac b2 d8 |: ...m.;S.......|
37b3f7230 11 43 a4 46 fd c8 da ae 53 ef 04 f3 80 db 5c 4b |.C.F....S.....\K|
3a5adf530 7f c1 e1 75 26 cf 25 b2 53 ef 01 4d 64 71 10 bf |...u&.%.S..Mdq..|
3ccc7b830 55 cb 6f 68 e3 c0 0f 36 53 ef 06 00 85 8a 17 72 |U.oh...6S......r|
3d6743f30 fa ca 0e 36 a6 4e 02 6a 53 ef 05 6d 6f e5 31 78 |...6.N.jS..mo.1x|
3e8040030 0c 58 34 52 02 00 19 00 53 ef 01 00 01 00 00 00 |.X4R....S.......|
===============================================================================
^ permalink raw reply
* Re: [PATCH 22/29] drivers, scsi: convert iscsi_task.refcount from atomic_t to refcount_t
From: Chris Leech @ 2017-03-08 18:47 UTC (permalink / raw)
To: Elena Reshetova
Cc: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-raid-u79uwXL29TY76Z2rM5mHXA,
linux-media-u79uwXL29TY76Z2rM5mHXA,
devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X,
linux-pci-u79uwXL29TY76Z2rM5mHXA,
linux-s390-u79uwXL29TY76Z2rM5mHXA,
fcoe-devel-s9riP+hp16TNLxjTenLetw,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
open-iscsi-/JYPxA39Uh5TLH3MbocFFw,
devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
target-devel-u79uwXL29TY76Z2rM5mHXA,
linux-serial-u79uwXL29TY76Z2rM5mHXA,
linux-usb-u79uwXL29TY76Z2rM5mHXA, peterz-wEGCiKHe2LqWVfeAwA7xHQ,
Hans Liljestrand, Kees Cook, David Windsor
In-Reply-To: <1488810076-3754-23-git-send-email-elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
On Mon, Mar 06, 2017 at 04:21:09PM +0200, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> Signed-off-by: Elena Reshetova <elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> Signed-off-by: Hans Liljestrand <ishkamiel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> Signed-off-by: David Windsor <dwindsor-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
This looks OK to me.
Acked-by: Chris Leech <cleech-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> ---
> drivers/scsi/libiscsi.c | 8 ++++----
> drivers/scsi/qedi/qedi_iscsi.c | 2 +-
> include/scsi/libiscsi.h | 3 ++-
> 3 files changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
> index 834d121..7eb1d2c 100644
> --- a/drivers/scsi/libiscsi.c
> +++ b/drivers/scsi/libiscsi.c
> @@ -516,13 +516,13 @@ static void iscsi_free_task(struct iscsi_task *task)
>
> void __iscsi_get_task(struct iscsi_task *task)
> {
> - atomic_inc(&task->refcount);
> + refcount_inc(&task->refcount);
> }
> EXPORT_SYMBOL_GPL(__iscsi_get_task);
>
> void __iscsi_put_task(struct iscsi_task *task)
> {
> - if (atomic_dec_and_test(&task->refcount))
> + if (refcount_dec_and_test(&task->refcount))
> iscsi_free_task(task);
> }
> EXPORT_SYMBOL_GPL(__iscsi_put_task);
> @@ -744,7 +744,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
> * released by the lld when it has transmitted the task for
> * pdus we do not expect a response for.
> */
> - atomic_set(&task->refcount, 1);
> + refcount_set(&task->refcount, 1);
> task->conn = conn;
> task->sc = NULL;
> INIT_LIST_HEAD(&task->running);
> @@ -1616,7 +1616,7 @@ static inline struct iscsi_task *iscsi_alloc_task(struct iscsi_conn *conn,
> sc->SCp.phase = conn->session->age;
> sc->SCp.ptr = (char *) task;
>
> - atomic_set(&task->refcount, 1);
> + refcount_set(&task->refcount, 1);
> task->state = ISCSI_TASK_PENDING;
> task->conn = conn;
> task->sc = sc;
> diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
> index b9f79d3..3895bd5 100644
> --- a/drivers/scsi/qedi/qedi_iscsi.c
> +++ b/drivers/scsi/qedi/qedi_iscsi.c
> @@ -1372,7 +1372,7 @@ static void qedi_cleanup_task(struct iscsi_task *task)
> {
> if (!task->sc || task->state == ISCSI_TASK_PENDING) {
> QEDI_INFO(NULL, QEDI_LOG_IO, "Returning ref_cnt=%d\n",
> - atomic_read(&task->refcount));
> + refcount_read(&task->refcount));
> return;
> }
>
> diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
> index b0e275d..24d74b5 100644
> --- a/include/scsi/libiscsi.h
> +++ b/include/scsi/libiscsi.h
> @@ -29,6 +29,7 @@
> #include <linux/timer.h>
> #include <linux/workqueue.h>
> #include <linux/kfifo.h>
> +#include <linux/refcount.h>
> #include <scsi/iscsi_proto.h>
> #include <scsi/iscsi_if.h>
> #include <scsi/scsi_transport_iscsi.h>
> @@ -139,7 +140,7 @@ struct iscsi_task {
>
> /* state set/tested under session->lock */
> int state;
> - atomic_t refcount;
> + refcount_t refcount;
> struct list_head running; /* running cmd list */
> void *dd_data; /* driver/transport data */
> };
> --
> 2.7.4
>
--
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.
^ permalink raw reply
* Re: Auto replace disk
From: Wols Lists @ 2017-03-08 18:17 UTC (permalink / raw)
To: Gandalf Corvotempesta, linux-raid
In-Reply-To: <CAJH6TXgyOEE5A8TZVZy-db+tPD1QeHtnXXO3B-1vmKe4cAt6xA@mail.gmail.com>
On 08/03/17 11:28, Gandalf Corvotempesta wrote:
> Hi to all
> I'm trying to configure mdadm to do automatic replace/rebuild when a
> disk is physically removed and replaced in a slot but without success
Do you mean you remove an old disk, and put a new blank disk in?
>
> Is this possible? How?
> The new disk must be formatted or mdadm will replicate partition table
> on its own?
If that's what you mean, then no, it's not possible. mdadm doesn't have
a clue about disks, what it sees is "block devices".
If you stick a new disk in, you need to tell mdadm about it. At which
point you can add it as a spare (which means mdadm will use it to
replace a disk that fails), or you can tell mdadm to replace a failed disk.
You should not - if you can help it - ever remove a disk and then
replace it. Yes in practice I know that's a luxury people often don't
have ... at best you should have spares configured; if you have to you
put the new drive in, use --replace, and then remove the old one. The
last resort is to remove the broken drive and then replace it - this is
likely to trigger further failures and bring down the array.
Cheers,
Wol
^ permalink raw reply
* Re: [Xen-devel] [PATCH 29/29] drivers, xen: convert grant_map.users from atomic_t to refcount_t
From: Boris Ostrovsky @ 2017-03-08 17:45 UTC (permalink / raw)
To: Reshetova, Elena,
gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org
Cc: peterz-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
target-devel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org,
linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-serial-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
fcoe-devel-s9riP+hp16TNLxjTenLetw@public.gmane.org,
xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b@public.gmane.org,
open-iscsi-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org,
linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Kees Cook,
linux-raid-u79uwXL29TasMV2rI37PzA
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C56177-kPTMFJFq+rFP9JyJpTNKArfspsVTdybXVpNB7YpNyf8@public.gmane.org>
On 03/08/2017 08:49 AM, Reshetova, Elena wrote:
>> On 03/06/2017 09:21 AM, Elena Reshetova wrote:
>>> refcount_t type and corresponding API should be
>>> used instead of atomic_t when the variable is used as
>>> a reference counter. This allows to avoid accidental
>>> refcounter overflows that might lead to use-after-free
>>> situations.
>>>
>>> Signed-off-by: Elena Reshetova <elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>>> Signed-off-by: Hans Liljestrand <ishkamiel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>> Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
>>> Signed-off-by: David Windsor <dwindsor-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>> ---
>>> drivers/xen/gntdev.c | 11 ++++++-----
>>> 1 file changed, 6 insertions(+), 5 deletions(-)
>> Reviewed-by: Boris Ostrovsky <boris.ostrovsky-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
> Is there a tree that can take this change? Turns out it is better to propagate changes via separate trees and only leftovers can be taken via Greg's tree.
>
Sure, we can take it via Xen tree for rc3.
-boris
--
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.
^ permalink raw reply
* Re: RAID Recovery
From: Adam Goryachev @ 2017-03-08 15:25 UTC (permalink / raw)
To: Phil Turmel, linux-raid
In-Reply-To: <c9a70435-56a5-9295-2582-7b71a71619eb@websitemanagers.com.au>
On 8/3/17 20:08, Adam Goryachev wrote:
>
>
> On 8/3/17 02:00, Phil Turmel wrote:
>> Hi Adam,
>>
>> Search the devices for ext2/3/4 superblocks, like so:
>>
>> dd if=/dev/sdX bs=1M 2>/dev/null |hexdump -C |grep '30 .\+ 53 ef 0'
> What is the chance it is a ext2/3/4 based FS? I suppose most NAS would
> use these filesystems... I guess I'll find out soon enough.
>
>> This will take a very long time, and will generate false positives.
> Can you advise what to do to "verify" these and work out which ones
> are false positives?
>> You probably would want to use screen or tmux to run these in
>> parallel in separate processes.
> I'm not sure there is much of a point, since they are mostly
> duplicates of each other. I'm running it on sdd now, and copying sda
> and sdc to a spare drive. I may re-run the command on sdc (and skip
> the first 490GB...) if nothing useful is found on sdd.
>> But superblock locations will give you hints as to the rest of data,
>> and make it possible to create partitions that will let you copy
>> stuff off into a new array.
>>
>
OK, so the first disk has completed and found 85 matches, I don't think
there is any point in posting the raw output, but can you advise what I
should do with it? Here are the first few matches:
082f6c30 3a 5d f7 a2 d0 52 ab ba 53 ef 0d 6d 51 d0 2a 76
|:]...R..S..mQ.*v|
21146d30 cf 17 bf 15 bf 9e e2 67 53 ef 05 15 a7 89 ae 38
|.......gS......8|
217e2730 73 0f 3c 00 99 68 a4 ed 53 ef 0f b4 ed 8d a6 7b
|s.<..h..S......{|
4c64f430 a4 48 02 00 00 00 86 00 53 ef 00 00 03 7a 1f 09
|.H......S....z..|
4c688430 a4 48 02 00 00 00 86 00 53 ef 00 00 03 7a 1f 09
|.H......S....z..|
e8724430 00 2b 02 17 71 e3 27 3b 53 ef 0c f3 90 3b 0c 0e
|.+..q.';S....;..|
f1fd0d30 53 ef 04 cd 4d ef 04 cd 53 ef 04 cd 4d ef 04 cd
|S...M...S...M...|
fb9a2930 ce f0 97 69 a0 f2 1b 07 53 ef 0b 95 74 ff 98 c9
|...i....S...t...|
10a0c9d30 41 2c 9b 24 7f bf ec 76 53 ef 0f 07 3e 0a 3d 15
|A,.$...vS...>.=.|
15bd95430 e1 72 92 fb 73 64 a4 1a 53 ef 0b 6a 66 e1 ef ae
|.r..sd..S..jf...|
Regards,
Adam
--
Adam Goryachev
Website Managers
P: +61 2 8304 0000 adam@websitemanagers.com.au
F: +61 2 8304 0001 www.websitemanagers.com.au
^ permalink raw reply
* Re: [PATCH 21/29] drivers, s390: convert fc_fcp_pkt.ref_cnt from atomic_t to refcount_t
From: Johannes Thumshirn @ 2017-03-08 14:06 UTC (permalink / raw)
To: Reshetova, Elena,
gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
linux-bcache-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-raid-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
fcoe-devel-s9riP+hp16TNLxjTenLetw@public.gmane.org,
linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
open-iscsi-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org,
devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org,
target-devel-u79uwXL29Tb/PtFMR13I2A
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C5615F-kPTMFJFq+rFP9JyJpTNKArfspsVTdybXVpNB7YpNyf8@public.gmane.org>
On 03/08/2017 02:48 PM, Reshetova, Elena wrote:
>> On 03/06/2017 03:21 PM, Elena Reshetova wrote:
>>> refcount_t type and corresponding API should be
>>> used instead of atomic_t when the variable is used as
>>> a reference counter. This allows to avoid accidental
>>> refcounter overflows that might lead to use-after-free
>>> situations.
>>
>> The subject is wrong, should be something like "scsi: libfc convert
>> fc_fcp_pkt.ref_cnt from atomic_t to refcount_t" but not s390.
>>
>> Other than that
>> Acked-by: Johannes Thumshirn <jth-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
>
> Turns out that it is better that all these patches go through the respective maintainer trees, if present.
> If I send an updated patch (with subject fixed), could you merge it through your tree?
Yes, but this would be the normal scsi tree from Martin and James.
Please include my Ack in the re-sends.
Thanks a lot,
Johannes
--
Johannes Thumshirn Storage
jthumshirn-l3A5Bk7waGM@public.gmane.org +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
--
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.
^ permalink raw reply
* RE: [Xen-devel] [PATCH 29/29] drivers, xen: convert grant_map.users from atomic_t to refcount_t
From: Reshetova, Elena @ 2017-03-08 13:49 UTC (permalink / raw)
To: Boris Ostrovsky,
gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org
Cc: peterz-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
target-devel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org,
linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-serial-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
fcoe-devel-s9riP+hp16TNLxjTenLetw@public.gmane.org,
xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b@public.gmane.org,
open-iscsi-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org,
linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Kees Cook,
linux-raid-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-bcache-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <99270126-7751-eed0-5efa-fc695ff3be25-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
> On 03/06/2017 09:21 AM, Elena Reshetova wrote:
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
> >
> > Signed-off-by: Elena Reshetova <elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > Signed-off-by: Hans Liljestrand <ishkamiel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> > Signed-off-by: David Windsor <dwindsor-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > ---
> > drivers/xen/gntdev.c | 11 ++++++-----
> > 1 file changed, 6 insertions(+), 5 deletions(-)
>
> Reviewed-by: Boris Ostrovsky <boris.ostrovsky-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
Is there a tree that can take this change? Turns out it is better to propagate changes via separate trees and only leftovers can be taken via Greg's tree.
Best Regards,
Elena.
--
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.
^ permalink raw reply
* RE: [PATCH 21/29] drivers, s390: convert fc_fcp_pkt.ref_cnt from atomic_t to refcount_t
From: Reshetova, Elena @ 2017-03-08 13:48 UTC (permalink / raw)
To: Johannes Thumshirn, gregkh@linuxfoundation.org
Cc: devel@driverdev.osuosl.org, linux-s390@vger.kernel.org,
open-iscsi@googlegroups.com, Kees Cook,
linux-scsi@vger.kernel.org, David Windsor, peterz@infradead.org,
netdev@vger.kernel.org, linux-usb@vger.kernel.org,
linux-pci@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-raid@vger.kernel.org, linux-bcache@vger.kernel.org,
target-devel@vger.kernel.org, linux-serial@vger.kernel.org,
devel@linuxdriverproject.org
In-Reply-To: <536a58ba-8896-5639-cab9-bd2f13bed325@suse.de>
> On 03/06/2017 03:21 PM, Elena Reshetova wrote:
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
>
> The subject is wrong, should be something like "scsi: libfc convert
> fc_fcp_pkt.ref_cnt from atomic_t to refcount_t" but not s390.
>
> Other than that
> Acked-by: Johannes Thumshirn <jth@kernel.org>
Turns out that it is better that all these patches go through the respective maintainer trees, if present.
If I send an updated patch (with subject fixed), could you merge it through your tree?
Best Regards,
Elena.
>
> --
> Johannes Thumshirn Storage
> jthumshirn@suse.de +49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
^ permalink raw reply
* Re: LSI RAID
From: Gandalf Corvotempesta @ 2017-03-08 11:48 UTC (permalink / raw)
To: Hannes Reinecke; +Cc: linux-raid
In-Reply-To: <CAJH6TXhYDMP_Xm+nZyoGBfAR77_W4h18_3bdTKeymEKYpUu-bw@mail.gmail.com>
2017-02-28 10:44 GMT+01:00 Gandalf Corvotempesta
<gandalf.corvotempesta@gmail.com>:
> 2017-02-28 10:06 GMT+01:00 Hannes Reinecke <hare@suse.de>:
>> Sure.
>> The recent mdadm should be able to create DDF metadata.
>
> This means that i'll be able to import a configuration created with a
> LSI MegaRaid controller and use them with mdadm ?
> If yes, how ?
Bump
^ permalink raw reply
* Auto replace disk
From: Gandalf Corvotempesta @ 2017-03-08 11:28 UTC (permalink / raw)
To: linux-raid
Hi to all
I'm trying to configure mdadm to do an automatic replace/rebuild when a
disk is physically removed and replaced in a slot, but without success.
Is this possible? How?
Must the new disk be formatted, or will mdadm replicate the partition table
on its own?
^ permalink raw reply
* Re: [PATCH 08/29] drivers, md: convert mddev.active from atomic_t to refcount_t
From: gregkh @ 2017-03-08 10:19 UTC (permalink / raw)
To: Reshetova, Elena
Cc: peterz@infradead.org, linux-pci@vger.kernel.org,
target-devel@vger.kernel.org,
linux1394-devel@lists.sourceforge.net, devel@driverdev.osuosl.org,
linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
linux-serial@vger.kernel.org, fcoe-devel@open-fcoe.org,
Hans Liljestrand, open-iscsi@googlegroups.com, Shaohua Li,
linux-media@vger.kernel.org, Kees Cook,
linux-raid@vger.kernel.org, linux-bcache@vger.kernel.org
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C5606B@IRSMSX102.ger.corp.intel.com>
On Wed, Mar 08, 2017 at 09:42:09AM +0000, Reshetova, Elena wrote:
> > On Mon, Mar 06, 2017 at 04:20:55PM +0200, Elena Reshetova wrote:
> > > refcount_t type and corresponding API should be
> > > used instead of atomic_t when the variable is used as
> > > a reference counter. This allows to avoid accidental
> > > refcounter overflows that might lead to use-after-free
> > > situations.
> >
> > Looks good. Let me know how do you want to route the patch to upstream.
>
> Greg, you previously mentioned that driver's conversions can go via your tree. Does this still apply?
> Or should I be asking maintainers to merge these patches via their trees?
You should ask them to take them through their trees, if they have them.
I'll be glad to scoop up all of the remaining ones that get missed, or
for subsystems that do not have trees.
thanks,
greg k-h
^ permalink raw reply
* RE: [PATCH 08/29] drivers, md: convert mddev.active from atomic_t to refcount_t
From: Reshetova, Elena @ 2017-03-08 9:42 UTC (permalink / raw)
To: Shaohua Li,
gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
linux-bcache-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-raid-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
fcoe-devel-s9riP+hp16TNLxjTenLetw@public.gmane.org,
linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
open-iscsi-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org,
devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org,
target-devel-u79uwXL29TZNg+MwTxZMZA
In-Reply-To: <20170307190449.baceyzzngsz776x7-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> On Mon, Mar 06, 2017 at 04:20:55PM +0200, Elena Reshetova wrote:
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
>
> Looks good. Let me know how do you want to route the patch to upstream.
Greg, you previously mentioned that driver's conversions can go via your tree. Does this still apply?
Or should I be asking maintainers to merge these patches via their trees?
I am not sure about the correct (and easier for everyone) way, please suggest.
Best Regards,
Elena.
>
> > Signed-off-by: Elena Reshetova <elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > Signed-off-by: Hans Liljestrand <ishkamiel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> > Signed-off-by: David Windsor <dwindsor-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > ---
> > drivers/md/md.c | 6 +++---
> > drivers/md/md.h | 3 ++-
> > 2 files changed, 5 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 985374f..94c8ebf 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -449,7 +449,7 @@ EXPORT_SYMBOL(md_unplug);
> >
> > static inline struct mddev *mddev_get(struct mddev *mddev)
> > {
> > - atomic_inc(&mddev->active);
> > + refcount_inc(&mddev->active);
> > return mddev;
> > }
> >
> > @@ -459,7 +459,7 @@ static void mddev_put(struct mddev *mddev)
> > {
> > struct bio_set *bs = NULL;
> >
> > - if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
> > + if (!refcount_dec_and_lock(&mddev->active, &all_mddevs_lock))
> > return;
> > if (!mddev->raid_disks && list_empty(&mddev->disks) &&
> > mddev->ctime == 0 && !mddev->hold_active) {
> > @@ -495,7 +495,7 @@ void mddev_init(struct mddev *mddev)
> > INIT_LIST_HEAD(&mddev->all_mddevs);
> > setup_timer(&mddev->safemode_timer, md_safemode_timeout,
> > (unsigned long) mddev);
> > - atomic_set(&mddev->active, 1);
> > + refcount_set(&mddev->active, 1);
> > atomic_set(&mddev->openers, 0);
> > atomic_set(&mddev->active_io, 0);
> > spin_lock_init(&mddev->lock);
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index b8859cb..4811663 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -22,6 +22,7 @@
> > #include <linux/list.h>
> > #include <linux/mm.h>
> > #include <linux/mutex.h>
> > +#include <linux/refcount.h>
> > #include <linux/timer.h>
> > #include <linux/wait.h>
> > #include <linux/workqueue.h>
> > @@ -360,7 +361,7 @@ struct mddev {
> > */
> > struct mutex open_mutex;
> > struct mutex reconfig_mutex;
> > - atomic_t active; /* general refcount */
> > + refcount_t active; /* general refcount */
> > atomic_t openers; /* number of active opens */
> >
> > int changed; /* True if we might need to
> > --
> > 2.7.4
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
--
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox