Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH 3/5] r5cache: naive reclaim approach
From: Song Liu @ 2016-08-31 22:18 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, kernel-team, dan.j.williams, hch, Song Liu
In-Reply-To: <1472681902-1172317-1-git-send-email-songliubraving@fb.com>

This patch adds a naive reclaim for r5c cache.

There are two limited resources, stripe cache and journal disk space.
For better performance, we prioritize reclaim of stripes with more data
in cache. To free up more journal space, we free earliest data on
the journal.

In current implementation, reclaim decision is made in two places:
at the end of cached write, and from r5l_reclaim_thread.

At the end of every cached write, we check whether we should reclaim
this stripe. Specifically, the stripe is reclaimed if:
 1. it is full stripe
 2. more than 50% of stripe cache space is used by cached stripes
 3. it is occupying large chunk of journal space

The reclaim thread (r5l_reclaim_thread) wakes up every 5 seconds. In
this thread, r5c_do_reclaim reclaims stripe cache space, while
r5l_do_reclaim reclaims journal space.

When resource is not limited, r5c_do_reclaim will do nothing.
Otherwise, r5c_do_reclaim walks through r5c_cached_list and freezes
up to R5C_RECLAIM_STRIPE_GROUP (set to 8) stripes.

r5c_cache keeps all data in cache (not fully committed to RAID) in
a list (stripe_in_cache). These stripes are in the order of their
first appearance on the journal. So the log tail (last_checkpoint)
should point to the journal_start of the first item in the list.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 167 ++++++++++++++++++++++++++++++++++++++++++-----
 drivers/md/raid5.c       |  14 +++-
 drivers/md/raid5.h       |   2 +
 3 files changed, 166 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 78eeb6df..68f1470 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -34,6 +34,10 @@
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
 
+/* wake up reclaim thread periodically */
+#define R5C_RECLAIM_WAKEUP_INTERVAL (5 * HZ)
+/* reclaim stripes in groups */
+#define R5C_RECLAIM_STRIPE_GROUP  8
 /*
  * We only need 2 bios per I/O unit to make progress, but ensure we
  * have a few more available to not get too tight.
@@ -109,6 +113,9 @@ struct r5l_log {
 
 	/* for r5c_cache */
 	enum r5c_state r5c_state;
+	struct list_head stripe_in_cache; /* all stripes in the cache, with
+					   * sh->log_start in order */
+	spinlock_t stripe_in_cache_lock;  /* lock for stripe_in_cache */
 };
 
 /*
@@ -462,6 +469,7 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	int meta_size;
 	int ret;
 	struct r5l_io_unit *io;
+	unsigned long flags;
 
 	meta_size =
 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
@@ -505,6 +513,14 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	atomic_inc(&io->pending_stripe);
 	sh->log_io = io;
 
+	if (sh->log_start == MaxSector) {
+		BUG_ON(!list_empty(&sh->r5c));
+		sh->log_start = io->log_start;
+		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+		list_add_tail(&sh->r5c,
+			      &log->stripe_in_cache);
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+	}
 	return 0;
 }
 
@@ -705,15 +721,69 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 	wake_up(&log->iounit_wait);
 }
 
+/*
+ * Check whether we want to reclaim this stripe.
+ * Return true if the stripe should be freezed
+ *
+ * We would like to reclaim the stripe if
+ * 1. it is full stripe
+ * 2. 50% of stripe cache space are in cached
+ * 3. it is occupying large chunk of journal space
+ */
+static bool r5c_check_stripe_for_reclaim(struct stripe_head *sh,
+					 sector_t log_start)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5l_log *log = conf->log;
+	bool ret = false;
+
+	/* only check active stripe (STRIPE_ACTIVE) or
+	 * stripe in r5c_cached_list */
+	if (!test_bit(STRIPE_ACTIVE, &sh->state)) {
+		assert_spin_locked(&conf->device_lock);
+		WARN_ON(list_empty(&sh->r5c));
+	}
+
+	if (atomic_read(&sh->dev_in_cache) ==
+	    conf->raid_disks - conf->max_degraded) {
+		pr_debug("%s: freeze stripe for full stripe\n", __func__);
+		return true;
+	}
+
+	if (atomic_read(&conf->r5c_cached_stripes) * 2 >
+	    conf->min_nr_stripes) {
+		pr_debug("%s: freeze stripe for stripe cache\n", __func__);
+		return true;
+	}
+
+	/* TODO: do we need protection reading log->log_start? */
+	if (r5l_ring_distance(log, sh->log_start, log_start) >
+		   log->max_free_space) {
+		pr_debug("%s: freeze stripe for journal space\n", __func__);
+		ret = true;
+	}
+	return ret;
+}
+
 void r5l_stripe_write_finished(struct stripe_head *sh)
 {
+	struct r5conf *conf = sh->raid_conf;
+	struct r5l_log *log = conf->log;
 	struct r5l_io_unit *io;
+	sector_t log_start;
 
 	io = sh->log_io;
 	sh->log_io = NULL;
 
 	if (io && atomic_dec_and_test(&io->pending_stripe))
 		__r5l_stripe_write_finished(io);
+
+	mutex_lock(&log->io_mutex);
+	log_start = log->log_start;
+	mutex_unlock(&log->io_mutex);
+	if (!test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		if (r5c_check_stripe_for_reclaim(sh, log_start))
+			r5c_freeze_stripe_for_reclaim(sh);
 }
 
 static void r5l_log_flush_endio(struct bio *bio)
@@ -817,6 +887,10 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
 				GFP_NOIO, 0);
 	}
+	mutex_lock(&log->io_mutex);
+	log->last_checkpoint = end;
+	log->last_cp_seq = log->next_cp_seq;
+	mutex_unlock(&log->io_mutex);
 }
 
 static void r5l_do_reclaim(struct r5l_log *log)
@@ -855,19 +929,30 @@ static void r5l_do_reclaim(struct r5l_log *log)
 	if (reclaimable == 0)
 		return;
 
-	/*
-	 * write_super will flush cache of each raid disk. We must write super
-	 * here, because the log area might be reused soon and we don't want to
-	 * confuse recovery
-	 */
-	r5l_write_super_and_discard_space(log, next_checkpoint);
+	r5l_run_no_space_stripes(log);
+}
 
-	mutex_lock(&log->io_mutex);
-	log->last_checkpoint = next_checkpoint;
-	log->last_cp_seq = next_cp_seq;
-	mutex_unlock(&log->io_mutex);
+static void r5c_update_super(struct r5conf *conf)
+{
+	struct stripe_head *sh;
+	struct r5l_log *log = conf->log;
+	sector_t end = MaxSector;
+	unsigned long flags;
 
-	r5l_run_no_space_stripes(log);
+	spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+	if (list_empty(&conf->log->stripe_in_cache)) {
+		/* all stripes flushed */
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+		r5l_write_super_and_discard_space(log, log->next_checkpoint);
+		return;
+	}
+	sh = list_first_entry(&conf->log->stripe_in_cache,
+			      struct stripe_head, r5c);
+	end = sh->log_start;
+	spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+
+	if (end != log->last_checkpoint && end != MaxSector)
+		r5l_write_super_and_discard_space(log, end);
 }
 
 static void r5l_reclaim_thread(struct md_thread *thread)
@@ -878,7 +963,10 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 
 	if (!log)
 		return;
+	r5c_do_reclaim(conf);
 	r5l_do_reclaim(log);
+	r5c_update_super(conf);
+	md_wakeup_thread(mddev->thread);
 }
 
 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
@@ -913,9 +1001,10 @@ void r5l_quiesce(struct r5l_log *log, int state)
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
-		r5l_wake_reclaim(log, -1L);
+		r5l_wake_reclaim(log, MaxSector);
 		md_unregister_thread(&log->reclaim_thread);
 		r5l_do_reclaim(log);
+		r5c_update_super(log->rdev->mddev->private);
 	}
 }
 
@@ -1194,6 +1283,7 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 }
 
+
 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	list_del_init(&sh->lru);
@@ -1326,6 +1416,7 @@ void r5c_handle_stripe_written(struct r5conf *conf,
 			       struct stripe_head *sh) {
 	int i;
 	int do_wakeup = 0;
+	unsigned long flags;
 
 	if (test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
 		WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
@@ -1338,6 +1429,10 @@ void r5c_handle_stripe_written(struct r5conf *conf,
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 				do_wakeup = 1;
 		}
+		spin_lock_irqsave(&conf->log->stripe_in_cache_lock, flags);
+		list_del_init(&sh->r5c);
+		spin_unlock_irqrestore(&conf->log->stripe_in_cache_lock, flags);
+		sh->log_start = MaxSector;
 	}
 
 	if (do_wakeup)
@@ -1413,13 +1508,49 @@ void r5c_do_reclaim(struct r5conf *conf)
 {
 	struct stripe_head *sh, *next;
 	struct r5l_log *log = conf->log;
-
-	assert_spin_locked(&conf->device_lock);
+	int count = 0;
+	unsigned long flags;
+	bool skip_reclaim = true;
+	sector_t log_start;
 
 	if (!log)
 		return;
-	list_for_each_entry_safe(sh, next, &conf->r5c_cached_list, lru)
-		r5c_flush_stripe(conf, sh);
+	if (atomic_read(&conf->r5c_cached_stripes) +
+	    atomic_read(&conf->active_stripes) > conf->min_nr_stripes * 3 / 4)
+		skip_reclaim = false;
+	else {
+		struct list_head *l;
+
+		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+		if (!list_empty(&log->stripe_in_cache)) {
+			l = log->stripe_in_cache.next;
+			sh = list_entry(l, struct stripe_head, r5c);
+			if (r5l_ring_distance(log, sh->log_start, log->log_start) >
+			    log->max_free_space)
+				skip_reclaim = false;
+		}
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+	}
+	if (skip_reclaim)
+		return;
+
+	/* lock io_mutex and get log->log_start before holding device_lock*/
+	mutex_lock(&log->io_mutex);
+	log_start = log->log_start;
+	mutex_unlock(&log->io_mutex);
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	list_for_each_entry_safe(sh, next, &conf->r5c_cached_list, lru) {
+		if (r5c_check_stripe_for_reclaim(sh, log_start)) {
+			count++;
+			r5c_flush_stripe(conf, sh);
+		}
+		if (count >= R5C_RECLAIM_STRIPE_GROUP)
+			break;
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+		wake_up(&conf->wait_for_stripe);
 }
 
 static int r5l_load_log(struct r5l_log *log)
@@ -1534,6 +1665,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
 		goto reclaim_thread;
+	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
 	init_waitqueue_head(&log->iounit_wait);
 
 	INIT_LIST_HEAD(&log->no_mem_stripes);
@@ -1543,6 +1676,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	/* flush full stripe */
 	log->r5c_state = R5C_STATE_WRITE_BACK;
+	INIT_LIST_HEAD(&log->stripe_in_cache);
+	spin_lock_init(&log->stripe_in_cache_lock);
 
 	if (r5l_load_log(log))
 		goto error;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7956d13..af6875b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -691,6 +691,8 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			if (!sh) {
 				set_bit(R5_INACTIVE_BLOCKED,
 					&conf->cache_state);
+				if (conf->log)
+					r5l_wake_reclaim(conf->log, 0);
 				wait_event_lock_irq(
 					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
@@ -729,6 +731,15 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 	} while (sh == NULL);
 
 	spin_unlock_irq(conf->hash_locks + hash);
+
+	if (conf->log &&
+	    (atomic_read(&conf->active_stripes) +
+	     atomic_read(&conf->r5c_cached_stripes) >
+	     conf->max_nr_stripes * 3 / 4)) {
+		set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+		r5l_wake_reclaim(conf->log, 0);
+	}
+
 	return sh;
 }
 
@@ -2036,8 +2047,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		spin_lock_init(&sh->batch_lock);
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
+		INIT_LIST_HEAD(&sh->r5c);
 		atomic_set(&sh->count, 1);
 		atomic_set(&sh->dev_in_cache, 0);
+		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
@@ -6029,7 +6042,6 @@ static void raid5d(struct md_thread *thread)
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
 		}
-		r5c_do_reclaim(conf);
 	}
 	pr_debug("%d stripes handled\n", handled);
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index dbc128e..901fd41 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -227,6 +227,8 @@ struct stripe_head {
 	struct r5l_io_unit	*log_io;
 	struct list_head	log_list;
 	atomic_t		dev_in_cache;
+	sector_t		log_start; /* first meta block on the journal */
+	struct list_head	r5c; /* for r5c_cache->stripe_in_cache */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH 2/5] r5cache: sysfs entry r5c_state
From: Song Liu @ 2016-08-31 22:18 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, kernel-team, dan.j.williams, hch, Song Liu
In-Reply-To: <1472681902-1172317-1-git-send-email-songliubraving@fb.com>

r5c_state has 4 states:
* no-cache;
* write-through (write journal only);
* write-back (w/ write cache);
* cache-broken (journal missing or Faulty)

When there is functional write cache, r5c_state is a knob to
switch between write-back and write-through.

When the journal device is broken, the raid array is forced
in readonly mode. In this case, r5c_state can be used to
remove "journal feature", and thus make the array read-write
without journal. By writing into r5c_state, the array
can transition from cache-broken to no-cache, which removes
journal feature for the array.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       |  1 +
 drivers/md/raid5.h       |  2 ++
 3 files changed, 60 insertions(+)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index cc7b80d..78eeb6df 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -47,6 +47,9 @@ enum r5c_state {
 	R5C_STATE_CACHE_BROKEN = 3,
 };
 
+static char *r5c_state_str[] = {"no-cache", "write-through",
+				"write-back", "cache-broken"};
+
 struct r5l_log {
 	struct md_rdev *rdev;
 
@@ -1219,6 +1222,60 @@ int r5c_flush_cache(struct r5conf *conf)
 	return count;
 }
 
+ssize_t r5c_state_show(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	int val = 0;
+	int ret = 0;
+
+	if (conf->log)
+		val = conf->log->r5c_state;
+	else if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		val = R5C_STATE_CACHE_BROKEN;
+	ret += snprintf(page, PAGE_SIZE - ret, "%d: %s\n",
+			val, r5c_state_str[val]);
+	return ret;
+}
+
+ssize_t r5c_state_store(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	struct r5l_log *log = conf->log;
+	int val;
+
+	if (kstrtoint(page, 10, &val))
+		return -EINVAL;
+	if (!log && val != R5C_STATE_NO_CACHE)
+		return -EINVAL;
+
+	if (val < R5C_STATE_NO_CACHE || val > R5C_STATE_WRITE_BACK)
+		return -EINVAL;
+	if (val == R5C_STATE_NO_CACHE) {
+		if (conf->log &&
+		    !test_bit(Faulty, &log->rdev->flags)) {
+			pr_err("md/raid:%s: journal device is in use, cannot remove it\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+	}
+
+	spin_lock_irq(&conf->device_lock);
+	if (log)
+		conf->log->r5c_state = val;
+	if (val == R5C_STATE_NO_CACHE) {
+		clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+		set_bit(MD_UPDATE_SB_FLAGS, &mddev->flags);
+	}
+	spin_unlock_irq(&conf->device_lock);
+	pr_info("md/raid:%s: setting r5c cache mode to %d: %s\n",
+		mdname(mddev), val, r5c_state_str[val]);
+	return len;
+}
+
+struct md_sysfs_entry
+r5c_state = __ATTR(r5c_state, S_IRUGO | S_IWUSR,
+		   r5c_state_show, r5c_state_store);
+
 int r5c_handle_stripe_dirtying(struct r5conf *conf,
 			       struct stripe_head *sh,
 			       struct stripe_head_state *s,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed6efd0..7956d13 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6355,6 +6355,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
 	&raid5_rmw_level.attr,
+	&r5c_state.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e301d0e..dbc128e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -658,4 +658,6 @@ r5c_handle_stripe_written(struct r5conf *conf, struct stripe_head *sh);
 extern void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh);
 extern void r5c_do_reclaim(struct r5conf *conf);
 extern int r5c_flush_cache(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_state;
+
 #endif
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH 1/5] r5cache: write part of r5cache
From: Song Liu @ 2016-08-31 22:18 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, kernel-team, dan.j.williams, hch, Song Liu
In-Reply-To: <1472681902-1172317-1-git-send-email-songliubraving@fb.com>

This is the write part of r5cache. The cache is integrated with
stripe cache of raid456. It leverages code of r5l_log to write
data to journal device.

r5cache splits the current write path into 2 parts: the write path
and the reclaim path. The write path is as follows:
1. write data to journal
   (r5c_handle_stripe_dirtying, r5c_cache_data)
2. call bio_endio
   (r5c_handle_data_cached, r5c_return_dev_pending_writes).

Then the reclaim path is as:
1. Freeze the stripe (r5c_freeze_stripe_for_reclaim)
2. Calculate parity (reconstruct or RMW)
3. Write parity (and maybe some other data) to journal device
4. Write data and parity to RAID disks

Steps 3 and 4 of the reclaim path are very similar to the write path of
raid5 journal.

With r5cache, write operation does not wait for parity calculation
and write out, so the write latency is lower (1 write to journal
device vs. read and then write to raid disks). Also, r5cache will
reduce RAID overhead (multiple IO due to read-modify-write of
parity) and provide more opportunities of full stripe writes.

r5cache adds 2 flags to stripe_head.state: STRIPE_R5C_FROZEN and
STRIPE_R5C_WRITTEN. The write path runs w/ STRIPE_R5C_FROZEN == 0.
Cache writes start from r5c_handle_stripe_dirtying(), where bit
R5_Wantcache is set for devices with bio in towrite. Then, the
data is written to the journal through r5l_log implementation.
Once the data is in the journal, we set bit R5_InCache, and
call bio_endio for these writes.

The reclaim path starts by setting STRIPE_R5C_FROZEN. This makes
the stripe into reclaim. If some write operation arrives at this
time, it will be handled as raid5 journal (calculate parity,
write to journal, write to disks, bio_endio).

Once frozen, the stripe is sent back to raid5 state machine,
where handle_stripe_dirtying will evaluate the stripe for
reconstruct writes or RMW writes (read data and calculate parity).

For RMW, the code allocates an extra page for each data block
being updated.  This is stored in r5dev->page and the old data
is read into it.  Then the prexor calculation subtracts ->page
from the parity block, and the reconstruct calculation adds the
->orig_page data back into the parity block.

r5cache naturally excludes SkipCopy. With R5_Wantcache bit set,
async_copy_data will not skip copy.

Before writing data to RAID disks, the r5l_log logic stores
parity (and non-overwrite data) to the journal.

Instead of inactive_list, stripes with cached data are tracked in
r5conf->r5c_cached_list. r5conf->r5c_cached_stripes tracks how
many stripes have dirty data in the cache.

There are some known limitations of the cache implementation:

1. Write cache only covers full page writes (R5_OVERWRITE). Writes
   of smaller granularity are write through.
2. Only one log io (sh->log_io) for each stripe at anytime. Later
   writes for the same stripe have to wait. This can be improved by
   moving log_io to r5dev.
3. When used with bitmap, there is a deadlock in add_stripe_bio =>
   bitmap_startwrite.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 271 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid5.c       | 164 ++++++++++++++++++++++++----
 drivers/md/raid5.h       |  25 ++++-
 3 files changed, 429 insertions(+), 31 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 1b1ab4a..cc7b80d 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -40,6 +40,13 @@
  */
 #define R5L_POOL_SIZE	4
 
+enum r5c_state {
+	R5C_STATE_NO_CACHE = 0,
+	R5C_STATE_WRITE_THROUGH = 1,
+	R5C_STATE_WRITE_BACK = 2,
+	R5C_STATE_CACHE_BROKEN = 3,
+};
+
 struct r5l_log {
 	struct md_rdev *rdev;
 
@@ -96,6 +103,9 @@ struct r5l_log {
 	spinlock_t no_space_stripes_lock;
 
 	bool need_cache_flush;
+
+	/* for r5c_cache */
+	enum r5c_state r5c_state;
 };
 
 /*
@@ -168,12 +178,73 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 	io->state = state;
 }
 
+/*
+ * Freeze the stripe, thus send the stripe into reclaim path.
+ *
+ * This function should only be called from raid5d that handling this stripe,
+ * or when the stripe is on r5c_cached_list (hold conf->device_lock)
+ */
+void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+
+	if (!conf->log)
+		return;
+
+	WARN_ON(test_bit(STRIPE_R5C_FROZEN, &sh->state));
+	set_bit(STRIPE_R5C_FROZEN, &sh->state);
+
+	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		atomic_inc(&conf->preread_active_stripes);
+	if (test_and_clear_bit(STRIPE_IN_R5C_CACHE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_stripes);
+	}
+}
+
+static void r5c_handle_data_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_and_clear_bit(R5_Wantcache, &sh->dev[i].flags)) {
+			set_bit(R5_InCache, &sh->dev[i].flags);
+			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			atomic_inc(&sh->dev_in_cache);
+		}
+}
+
+/*
+ * this journal write must contain full parity,
+ * it may also contain some data pages
+ */
+static void r5c_handle_parity_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_bit(R5_InCache, &sh->dev[i].flags))
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+	set_bit(STRIPE_R5C_WRITTEN, &sh->state);
+}
+
+static void r5c_finish_cache_stripe(struct stripe_head *sh)
+{
+	if (test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		r5c_handle_parity_cached(sh);
+	else
+		r5c_handle_data_cached(sh);
+}
+
 static void r5l_io_run_stripes(struct r5l_io_unit *io)
 {
 	struct stripe_head *sh, *next;
 
 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
 		list_del_init(&sh->log_list);
+
+		r5c_finish_cache_stripe(sh);
+
 		set_bit(STRIPE_HANDLE, &sh->state);
 		raid5_release_stripe(sh);
 	}
@@ -402,7 +473,8 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	io = log->current_io;
 
 	for (i = 0; i < sh->disks; i++) {
-		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) &&
+		    !test_bit(R5_Wantcache, &sh->dev[i].flags))
 			continue;
 		if (i == sh->pd_idx || i == sh->qd_idx)
 			continue;
@@ -412,18 +484,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 		r5l_append_payload_page(log, sh->dev[i].page);
 	}
 
-	if (sh->qd_idx >= 0) {
+	if (parity_pages == 2) {
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
 					sh->dev[sh->qd_idx].log_checksum, true);
 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
-	} else {
+	} else if (parity_pages == 1) {
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
 					0, false);
 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
-	}
+	} else
+		BUG_ON(parity_pages != 0);
 
 	list_add_tail(&sh->log_list, &io->stripe_list);
 	atomic_inc(&io->pending_stripe);
@@ -432,7 +505,6 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	return 0;
 }
 
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 /*
  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
  * data from log to raid disks), so we shouldn't wait for reclaim here
@@ -456,11 +528,17 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 		return -EAGAIN;
 	}
 
+	WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
+
 	for (i = 0; i < sh->disks; i++) {
 		void *addr;
 
 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
 			continue;
+
+		if (test_bit(R5_InCache, &sh->dev[i].flags))
+			continue;
+
 		write_disks++;
 		/* checksum is already calculated in last run */
 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
@@ -473,6 +551,9 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	parity_pages = 1 + !!(sh->qd_idx >= 0);
 	data_pages = write_disks - parity_pages;
 
+	pr_debug("%s: write %d data_pages and %d parity_pages\n",
+		 __func__, data_pages, parity_pages);
+
 	meta_size =
 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
 		 * data_pages) +
@@ -735,7 +816,6 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	}
 }
 
-
 static void r5l_do_reclaim(struct r5l_log *log)
 {
 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
@@ -798,7 +878,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 	r5l_do_reclaim(log);
 }
 
-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 {
 	unsigned long target;
 	unsigned long new = (unsigned long)space; /* overflow in theory */
@@ -1111,6 +1191,180 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 }
 
+static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	list_del_init(&sh->lru);
+	r5c_freeze_stripe_for_reclaim(sh);
+	atomic_inc(&conf->active_stripes);
+	atomic_inc(&sh->count);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	raid5_release_stripe(sh);
+}
+
+int r5c_flush_cache(struct r5conf *conf)
+{
+	int count = 0;
+
+	assert_spin_locked(&conf->device_lock);
+	if (!conf->log)
+		return 0;
+	while (!list_empty(&conf->r5c_cached_list)) {
+		struct list_head *l = conf->r5c_cached_list.next;
+		struct stripe_head *sh;
+
+		sh = list_entry(l, struct stripe_head, lru);
+		r5c_flush_stripe(conf, sh);
+		++count;
+	}
+	return count;
+}
+
+int r5c_handle_stripe_dirtying(struct r5conf *conf,
+			       struct stripe_head *sh,
+			       struct stripe_head_state *s,
+			       int disks) {
+	struct r5l_log *log = conf->log;
+	int i;
+	struct r5dev *dev;
+
+	if (!log || test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		return -EAGAIN;
+
+	if (conf->log->r5c_state == R5C_STATE_WRITE_THROUGH ||
+	    conf->quiesce != 0 || conf->mddev->degraded != 0) {
+		/* write through mode */
+		r5c_freeze_stripe_for_reclaim(sh);
+		return -EAGAIN;
+	}
+
+	s->to_cache = 0;
+
+	for (i = disks; i--; ) {
+		dev = &sh->dev[i];
+		/* if none-overwrite, use the reclaim path (write through) */
+		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
+		    !test_bit(R5_InCache, &dev->flags)) {
+			r5c_freeze_stripe_for_reclaim(sh);
+			return -EAGAIN;
+		}
+	}
+
+	for (i = disks; i--; ) {
+		dev = &sh->dev[i];
+		if (dev->towrite) {
+			set_bit(R5_Wantcache, &dev->flags);
+			set_bit(R5_Wantdrain, &dev->flags);
+			set_bit(R5_LOCKED, &dev->flags);
+			s->to_cache++;
+		}
+	}
+
+	if (s->to_cache)
+		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+
+	return 0;
+}
+
+void r5c_handle_stripe_written(struct r5conf *conf,
+			       struct stripe_head *sh) {
+	int i;
+	int do_wakeup = 0;
+
+	if (test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
+		WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
+		clear_bit(STRIPE_R5C_FROZEN, &sh->state);
+
+		for (i = sh->disks; i--; ) {
+			if (test_and_clear_bit(R5_InCache, &sh->dev[i].flags))
+				atomic_dec(&sh->dev_in_cache);
+			clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				do_wakeup = 1;
+		}
+	}
+
+	if (do_wakeup)
+		wake_up(&conf->wait_for_overlap);
+}
+
+int
+r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+	       struct stripe_head_state *s)
+{
+	int pages;
+	int meta_size;
+	int reserve;
+	int i;
+	int ret = 0;
+	int page_count = 0;
+
+	BUG_ON(!log);
+	BUG_ON(s->to_cache == 0);
+
+	for (i = 0; i < sh->disks; i++) {
+		void *addr;
+
+		if (!test_bit(R5_Wantcache, &sh->dev[i].flags))
+			continue;
+		addr = kmap_atomic(sh->dev[i].page);
+		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
+						    addr, PAGE_SIZE);
+		kunmap_atomic(addr);
+		page_count++;
+	}
+	WARN_ON(page_count != s->to_cache);
+
+	pages = s->to_cache;
+
+	meta_size =
+		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
+		 * pages);
+	/* Doesn't work with very big raid array */
+	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
+		return -EINVAL;
+
+	/*
+	 * The stripe must enter state machine again to call endio, so
+	 * don't delay.
+	 */
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	atomic_inc(&sh->count);
+
+	mutex_lock(&log->io_mutex);
+	/* meta + data */
+	reserve = (1 + pages) << (PAGE_SHIFT - 9);
+	if (!r5l_has_free_space(log, reserve)) {
+		spin_lock(&log->no_space_stripes_lock);
+		list_add_tail(&sh->log_list, &log->no_space_stripes);
+		spin_unlock(&log->no_space_stripes_lock);
+
+		r5l_wake_reclaim(log, reserve);
+	} else {
+		ret = r5l_log_stripe(log, sh, pages, 0);
+		if (ret) {
+			spin_lock_irq(&log->io_list_lock);
+			list_add_tail(&sh->log_list, &log->no_mem_stripes);
+			spin_unlock_irq(&log->io_list_lock);
+		}
+	}
+
+	mutex_unlock(&log->io_mutex);
+	return 0;
+}
+
+void r5c_do_reclaim(struct r5conf *conf)
+{
+	struct stripe_head *sh, *next;
+	struct r5l_log *log = conf->log;
+
+	assert_spin_locked(&conf->device_lock);
+
+	if (!log)
+		return;
+	list_for_each_entry_safe(sh, next, &conf->r5c_cached_list, lru)
+		r5c_flush_stripe(conf, sh);
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
 	struct md_rdev *rdev = log->rdev;
@@ -1230,6 +1484,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
+	/* flush full stripe */
+	log->r5c_state = R5C_STATE_WRITE_BACK;
+
 	if (r5l_load_log(log))
 		goto error;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b95c54c..ed6efd0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -316,8 +316,16 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
-		if (!test_bit(STRIPE_EXPANDING, &sh->state))
-			list_add_tail(&sh->lru, temp_inactive_list);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			if (atomic_read(&sh->dev_in_cache) == 0) {
+				list_add_tail(&sh->lru, temp_inactive_list);
+			} else {
+				if (!test_and_set_bit(STRIPE_IN_R5C_CACHE,
+						      &sh->state))
+					atomic_inc(&conf->r5c_cached_stripes);
+				list_add_tail(&sh->lru, &conf->r5c_cached_list);
+			}
+		}
 	}
 }
 
@@ -901,6 +909,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
+	if (s->to_cache) {
+		if (r5c_cache_data(conf->log, sh, s) == 0)
+			return;
+		/* array is too big that meta data size > PAGE_SIZE  */
+		r5c_freeze_stripe_for_reclaim(sh);
+	}
+
 	if (r5l_write_stripe(conf->log, sh) == 0)
 		return;
 	for (i = disks; i--; ) {
@@ -1029,6 +1044,7 @@ again:
 
 			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
 				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+
 			sh->dev[i].vec.bv_page = sh->dev[i].page;
 			bi->bi_vcnt = 1;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1115,7 +1131,7 @@ again:
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
 	sector_t sector, struct dma_async_tx_descriptor *tx,
-	struct stripe_head *sh)
+	struct stripe_head *sh, int no_skipcopy)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
@@ -1155,7 +1171,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 			if (frombio) {
 				if (sh->raid_conf->skip_copy &&
 				    b_offset == 0 && page_offset == 0 &&
-				    clen == STRIPE_SIZE)
+				    clen == STRIPE_SIZE &&
+				    !no_skipcopy)
 					*page = bio_page;
 				else
 					tx = async_memcpy(*page, bio_page, page_offset,
@@ -1237,7 +1254,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, &dev->page,
-					dev->sector, tx, sh);
+						     dev->sector, tx, sh, 0);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1364,7 +1381,8 @@ static int set_syndrome_sources(struct page **srcs,
 		if (i == sh->qd_idx || i == sh->pd_idx ||
 		    (srctype == SYNDROME_SRC_ALL) ||
 		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
-		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		     (test_bit(R5_Wantdrain, &dev->flags) ||
+		      test_bit(R5_InCache, &dev->flags))) ||
 		    (srctype == SYNDROME_SRC_WRITTEN &&
 		     dev->written))
 			srcs[slot] = sh->dev[i].page;
@@ -1543,9 +1561,18 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 static void ops_complete_prexor(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
+
+	for (i = sh->disks; i--; )
+		if (sh->dev[i].page != sh->dev[i].orig_page) {
+			struct page *p = sh->dev[i].page;
+
+			sh->dev[i].page = sh->dev[i].orig_page;
+			put_page(p);
+		}
 }
 
 static struct dma_async_tx_descriptor *
@@ -1567,7 +1594,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
+		if (test_bit(R5_Wantdrain, &dev->flags) ||
+		    test_bit(R5_InCache, &dev->flags))
 			xor_srcs[count++] = dev->page;
 	}
 
@@ -1618,6 +1646,10 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 again:
 			dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_InCache, &dev->flags)) {
+				BUG_ON(atomic_read(&sh->dev_in_cache) == 0);
+				atomic_dec(&sh->dev_in_cache);
+			}
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
@@ -1625,7 +1657,8 @@ again:
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
-			WARN_ON(dev->page != dev->orig_page);
+			if (!test_bit(R5_Wantcache, &dev->flags))
+				WARN_ON(dev->page != dev->orig_page);
 
 			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1637,8 +1670,10 @@ again:
 					set_bit(R5_Discard, &dev->flags);
 				else {
 					tx = async_copy_data(1, wbi, &dev->page,
-						dev->sector, tx, sh);
-					if (dev->page != dev->orig_page) {
+							     dev->sector, tx, sh,
+							     test_bit(R5_Wantcache, &dev->flags));
+					if (dev->page != dev->orig_page &&
+					    !test_bit(R5_Wantcache, &dev->flags)) {
 						set_bit(R5_SkipCopy, &dev->flags);
 						clear_bit(R5_UPTODATE, &dev->flags);
 						clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1746,7 +1781,8 @@ again:
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (head_sh->dev[i].written)
+			if (head_sh->dev[i].written ||
+			    test_bit(R5_InCache, &head_sh->dev[i].flags))
 				xor_srcs[count++] = dev->page;
 		}
 	} else {
@@ -2001,6 +2037,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
 		atomic_set(&sh->count, 1);
+		atomic_set(&sh->dev_in_cache, 0);
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
@@ -2887,6 +2924,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InCache, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		/* if we are not expanding this is a proper write request, and
@@ -2926,6 +2966,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InCache, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		if (!s->locked)
@@ -3577,6 +3620,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
 
+	if (r5c_handle_stripe_dirtying(conf, sh, s, disks) == 0)
+		return;
+
 	/* Check whether resync is now happening or should start.
 	 * If yes, then the array is dirty (after unclean shutdown or
 	 * initial creation), so parity in some stripes might be inconsistent.
@@ -3597,9 +3643,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		     test_bit(R5_InCache, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    !((test_bit(R5_UPTODATE, &dev->flags) &&
+		       (!test_bit(R5_InCache, &dev->flags) ||
+			dev->page != dev->orig_page)) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
@@ -3611,13 +3660,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		    test_bit(R5_Wantcompute, &dev->flags))) {
+		      test_bit(R5_InCache, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rcw++;
 			else
 				rcw += 2*disks;
 		}
 	}
+
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -3629,10 +3680,18 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+			if (test_bit(R5_InCache, &dev->flags) &&
+			    dev->page == dev->orig_page)
+				dev->page = alloc_page(GFP_NOIO);  /* prexor */
+
+			if ((dev->towrite ||
+			     i == sh->pd_idx || i == sh->qd_idx ||
+			     test_bit(R5_InCache, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags)) &&
+			    !((test_bit(R5_UPTODATE, &dev->flags) &&
+			       (!test_bit(R5_InCache, &dev->flags) ||
+				dev->page != dev->orig_page)) ||
+			      test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (test_bit(STRIPE_PREREAD_ACTIVE,
 					     &sh->state)) {
@@ -3658,6 +3717,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			      test_bit(R5_InCache, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
 				if (test_bit(R5_Insync, &dev->flags) &&
@@ -3697,7 +3757,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	 */
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
@@ -4010,6 +4070,46 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 	async_tx_quiesce(&tx);
 }
 
+static void
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
+			      struct bio_list *return_bi)
+{
+	struct bio *wbi, *wbi2;
+
+	wbi = dev->written;
+	dev->written = NULL;
+	while (wbi && wbi->bi_iter.bi_sector <
+	       dev->sector + STRIPE_SECTORS) {
+		wbi2 = r5_next_bio(wbi, dev->sector);
+		if (!raid5_dec_bi_active_stripes(wbi)) {
+			md_write_end(conf->mddev);
+			bio_list_add(return_bi, wbi);
+		}
+		wbi = wbi2;
+	}
+}
+
+static void r5c_handle_cached_data_endio(struct r5conf *conf,
+	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
+{
+	int i;
+
+	for (i = sh->disks; i--; ) {
+		if (test_bit(R5_InCache, &sh->dev[i].flags) &&
+		    sh->dev[i].written) {
+			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			r5c_return_dev_pending_writes(conf, &sh->dev[i],
+						      return_bi);
+		}
+	}
+	r5l_stripe_write_finished(sh);
+
+	bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+			STRIPE_SECTORS,
+			!test_bit(STRIPE_DEGRADED, &sh->state),
+			0);
+}
+
 /*
  * handle_stripe - do things to a stripe.
  *
@@ -4188,6 +4288,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
 		}
+		if (test_bit(R5_InCache, &dev->flags) && dev->written)
+			s->just_cached++;
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
@@ -4416,7 +4518,7 @@ static void handle_stripe(struct stripe_head *sh)
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
 				(i == sh->pd_idx || i == sh->qd_idx ||
-				 dev->written)) {
+				 dev->written || test_bit(R5_InCache, &dev->flags))) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (prexor)
@@ -4456,6 +4558,12 @@ static void handle_stripe(struct stripe_head *sh)
 				 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
+	if (s.just_cached)
+		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+
+	if (test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		r5l_stripe_write_finished(sh);
+
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
@@ -4467,13 +4575,17 @@ static void handle_stripe(struct stripe_head *sh)
 	    || s.expanding)
 		handle_stripe_fill(sh, &s, disks);
 
-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
+	r5c_handle_stripe_written(conf, sh);
+
+	/* Now to consider new write requests, cache write back and what else,
+	 * if anything should be read.  We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
+	 * 3/ A r5c cache log write is in flight.
 	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
+	if ((s.to_write || test_bit(STRIPE_R5C_FROZEN, &sh->state)) &&
+	     !sh->reconstruct_state && !sh->check_state && !sh->log_io)
 		handle_stripe_dirtying(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
@@ -5192,7 +5304,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	 * later we might have to read it again in order to reconstruct
 	 * data on failed drives.
 	 */
-	if (rw == READ && mddev->degraded == 0 &&
+	if (rw == READ && mddev->degraded == 0 && conf->log == NULL &&
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
@@ -5917,6 +6029,7 @@ static void raid5d(struct md_thread *thread)
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
 		}
+		r5c_do_reclaim(conf);
 	}
 	pr_debug("%d stripes handled\n", handled);
 
@@ -6583,6 +6696,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
 		INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
+	atomic_set(&conf->r5c_cached_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_cached_list);
+
 	conf->level = mddev->new_level;
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	if (raid5_alloc_percpu(conf) != 0)
@@ -7655,8 +7771,10 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
+		r5c_flush_cache(conf);
 		conf->quiesce = 2;
 		wait_event_cmd(conf->wait_for_quiescent,
+				    atomic_read(&conf->r5c_cached_stripes) == 0 &&
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
 				    unlock_all_device_hash_locks_irq(conf),
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 517d4b6..e301d0e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -226,6 +226,7 @@ struct stripe_head {
 
 	struct r5l_io_unit	*log_io;
 	struct list_head	log_list;
+	atomic_t		dev_in_cache;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -263,6 +264,7 @@ struct stripe_head_state {
 	 */
 	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
+	int to_cache, just_cached;
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num[2];
 	int p_failed, q_failed;
@@ -313,6 +315,8 @@ enum r5dev_flags {
 			 */
 	R5_Discard,	/* Discard the stripe */
 	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
+	R5_Wantcache,	/* Want write data to write cache */
+	R5_InCache,	/* Data in cache */
 };
 
 /*
@@ -345,7 +349,11 @@ enum {
 	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
 				 * to batch yet.
 				 */
-	STRIPE_LOG_TRAPPED, /* trapped into log */
+	STRIPE_LOG_TRAPPED,	/* trapped into log */
+	STRIPE_IN_R5C_CACHE,	/* in r5c cache (to-be/being handled or
+				 * in conf->r5c_cached_list) */
+	STRIPE_R5C_FROZEN,      /* r5c_cache frozen and being written out */
+	STRIPE_R5C_WRITTEN,	/* ready for r5c_handle_stripe_written() */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAGS \
@@ -521,6 +529,8 @@ struct r5conf {
 	 */
 	atomic_t		active_stripes;
 	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
+	atomic_t		r5c_cached_stripes;
+	struct list_head	r5c_cached_list;
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_quiescent;
@@ -635,4 +645,17 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
 extern void r5l_quiesce(struct r5l_log *log, int state);
 extern bool r5l_log_disk_error(struct r5conf *conf);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern int
+r5c_handle_stripe_dirtying(struct r5conf *conf, struct stripe_head *sh,
+			   struct stripe_head_state *s, int disks);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+			  struct stripe_head_state *s);
+extern int r5c_cache_parity(struct r5l_log *log, struct stripe_head *sh,
+			    struct stripe_head_state *s);
+extern void
+r5c_handle_stripe_written(struct r5conf *conf, struct stripe_head *sh);
+extern void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh);
+extern void r5c_do_reclaim(struct r5conf *conf);
+extern int r5c_flush_cache(struct r5conf *conf);
 #endif
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH 0/5] raid5-cache: enabling cache features
From: Song Liu @ 2016-08-31 22:18 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, kernel-team, dan.j.williams, hch, Song Liu

These patches enable the write cache part of raid5-cache. The journal part
was released with kernel 4.4.

The caching part uses the same disk format as the raid456 journal, and
accelerates writes. Write operations are committed (bio_endio) once
the data is secured in the journal. Reconstruct and RMW are postponed to
the reclaim path, which is (hopefully) not on the critical path.

The changes are organized in 5 patches (details below). Current reclaim
approach can be improved. So far it works OK in my tests with spinning
disks as raid device and SSD as journal.

I removed the patch for chunk_aligned_read in earlier RFC
(http://marc.info/?l=linux-raid&m=146432700719277). But we may still need
some optimizations later, especially for SSD raid devices.

Thanks,
Song

Song Liu (5):
  r5cache: write part of r5cache
  r5cache: sysfs entry r5c_state
  r5cache: naive reclaim approach
  r5cache: r5c recovery
  r5cache: handle SYNC and FUA

 drivers/md/raid5-cache.c | 1299 ++++++++++++++++++++++++++++++++++++++++------
 drivers/md/raid5.c       |  185 ++++++-
 drivers/md/raid5.h       |   30 +-
 3 files changed, 1327 insertions(+), 187 deletions(-)

--
2.8.0.rc2

^ permalink raw reply

* Direct Cash Loans Offer!!!
From: Direct cash Loans pty @ 2016-08-31 17:20 UTC (permalink / raw)

In-Reply-To: <381783771.30728256.1472663702505.JavaMail.root@goo.jp>

[-- Attachment #1: Type: text/plain, Size: 527 bytes --]





Good News,

Direct cash Loans is currently giving out loan at 3.5% interest rate. 

Kindly forward the following details your ID Number, Full Names, Occupation, Monthly Income, Type of loan , telephone number and email address via email to this email:  directcashloans_dept@webmail.co.za 

View attached document for more information about our special loan offer or call : +27(0)622 539 768  for more information.

Commission earners can also apply. 


Regards, 

Lydia Smith (Mrs) 

Loan Application Unit/Risk Supervisor.

[-- Attachment #2: Direct Cash Loan Low Rate Offer.docx --]
[-- Type: application/vnd.openxmlformats-officedocument.wordprocessingml.document, Size: 49106 bytes --]

^ permalink raw reply

* Re: [PATCH] dm-bufio: Remove deprecated create_singlethread_workqueue
From: Tejun Heo @ 2016-08-31 14:21 UTC (permalink / raw)
  To: Bhaktipriya Shridhar
  Cc: Alasdair Kergon, Mike Snitzer, dm-devel, Shaohua Li, linux-raid,
	linux-kernel
In-Reply-To: <20160830164911.GA26756@Karyakshetra>

On Tue, Aug 30, 2016 at 10:19:11PM +0530, Bhaktipriya Shridhar wrote:
> The workqueue "dm_bufio_wq" queues a single work item &dm_bufio_work and
> hence it doesn't require execution ordering. Hence, alloc_workqueue has
> been used to replace the deprecated create_singlethread_workqueue instance.
> 
> The WQ_MEM_RECLAIM flag has been set since md / dm are block multi disk
> drivers and require forward progress under memory pressure.
> 
> Since there are fixed number of work items, explicit concurrency
> limit is unnecessary here.
> 
> Signed-off-by: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH] raid5: guarantee enough stripes to avoid reshape hang
From: NeilBrown @ 2016-08-31  8:05 UTC (permalink / raw)
  To: Shaohua Li, linux-raid; +Cc: Kernel-team
In-Reply-To: <e855bdf2ccb11e440191e9512de460f3c5355dd2.1472578078.git.shli@fb.com>

[-- Attachment #1: Type: text/plain, Size: 1713 bytes --]

On Wed, Aug 31 2016, Shaohua Li wrote:

> If there aren't enough stripes, reshape will hang. We have a check for
> this in new reshape, but miss it for reshape resume, hence we could see
> hang in reshape resume. This patch forces enough stripes existed if
> reshape resumes.
>
> Signed-off-by: Shaohua Li <shli@fb.com>

I'm really surprised that this was missing, but it certainly appears
that it was.

 Reviewed-by: NeilBrown <neilb@suse.com>

thanks!


NeilBrown


> ---
>  drivers/md/raid5.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 62febe8..2fdb97b 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6639,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
>  	}
>  
>  	conf->min_nr_stripes = NR_STRIPES;
> +	if (mddev->reshape_position != MaxSector) {
> +		int stripes = max_t(int,
> +			((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
> +			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
> +		conf->min_nr_stripes = max(NR_STRIPES, stripes);
> +		if (conf->min_nr_stripes != NR_STRIPES)
> +			printk(KERN_INFO
> +				"md/raid:%s: force stripe size %d for reshape\n",
> +				mdname(mddev), conf->min_nr_stripes);
> +	}
>  	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
>  		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
>  	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
> -- 
> 2.8.0.rc2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply

* lening bieden 3%
From: Lloyds TSB Bank PLC @ 2016-08-31  1:02 UTC (permalink / raw)




Goede dag, 


Dit is Lloyd's TSB Bank plc leningen aan te bieden.   


     Lloyds TSB biedt flexibele en betaalbare leningen voor welk doel u te helpen uw doelen te bereiken. we lening tegen lage rente van 3%. Hier zijn een aantal belangrijke kenmerken van de persoonlijke lening aangeboden door Lloyd's TSB. Hier zijn de Loan Factoren we werken met de toonaangevende Britse makelaars die toegang hebben tot de top kredietverstrekkers hebben en in staat zijn om de beste financiële oplossing tegen een betaalbare price.Please vinden als u geïnteresseerd bent vriendelijk contact met ons op via deze e-mail: lloyds26tsb@gmail.com

Na de reactie, zal u een aanvraag voor een lening te vullen ontvangen. Geen sociale zekerheid en geen credit check, 100% gegarandeerd.
Het zal ons een eer zijn als u ons toelaten om u van dienst zijn.

INFORMATIE NODIG


Jullie namen:
Adres: ...........
Telefoon: ...........
Benodigd ........
Duur: ...............
Bezetting: ...........
Maandelijks Inkomen Level: ........
Geslacht: ...............
Geboortedatum: ........
Staat: ..................
Land: ..........
Doel: .........
Ontmoeting uw financiële behoeften is onze trots.

Dr.John Mahama.

^ permalink raw reply

* [GIT PULL] MD update for 4.8-rc4
From: Shaohua Li @ 2016-08-30 17:48 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel, linux-raid, neilb

Hi,
please pull MD update for 4.8-rc4. This update includes several bug fixes:
- Alexey Obitotskiy fixes hang for faulty raid5 array with external management
- Song Liu fixes two raid5 journal related bugs
- Tomasz Majchrzak fixes a bad block recording issue and an accounting issue for raid10
- ZhengYuan Liu fixes an accounting issue for raid5
- I fix a potential race condition and memory leak with DIF/DIX enabled
- other trivial fixes

Thanks,
Shaohua

The following changes since commit d761f3ed6e71bcca724a6e9e39efcac65b7b4ac1:

  Merge branch 'x86-microcode-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (2016-07-30 13:18:33 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git tags/md/4.8-rc4

for you to fetch changes up to 45c91d808ff989d950e260dab9f89e8f4a3c9c2c:

  raid5: avoid unnecessary bio data set (2016-08-24 10:21:53 -0700)

----------------------------------------------------------------
Alexey Obitotskiy (1):
      md: Prevent IO hold during accessing to faulty raid5 array

Artur Paszkiewicz (1):
      md: don't print the same repeated messages about delayed sync operation

Guoqing Jiang (1):
      md: remove obsolete ret in md_start_sync

Shaohua Li (3):
      MD: hold mddev lock to change bitmap location
      raid5: fix memory leak of bio integrity data
      raid5: avoid unnecessary bio data set

Song Liu (2):
      md: do not count journal as spare in GET_ARRAY_INFO
      r5cache: set MD_JOURNAL_CLEAN correctly

Tomasz Majchrzak (2):
      raid10: increment write counter after bio is split
      raid10: record correct address of bad block

Wei Yongjun (1):
      md-cluster: fix error return code in join()

ZhengYuan Liu (1):
      raid5: fix incorrectly counter of conf->empty_inactive_list_nr

 drivers/md/bitmap.c     | 47 +++++++++++++++++++++++++-----------
 drivers/md/md-cluster.c | 12 +++++++---
 drivers/md/md.c         | 28 ++++++++++++----------
 drivers/md/raid10.c     | 13 +++++-----
 drivers/md/raid5.c      | 64 +++++++++++++++++++++++++++++++++----------------
 5 files changed, 107 insertions(+), 57 deletions(-)

^ permalink raw reply

* [PATCH] raid5: guarantee enough stripes to avoid reshape hang
From: Shaohua Li @ 2016-08-30 17:29 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team

If there aren't enough stripes, reshape will hang. We have a check for
this when starting a new reshape, but miss it when resuming a reshape,
hence we could see a hang on reshape resume. This patch ensures that
enough stripes exist when a reshape resumes.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 62febe8..2fdb97b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6639,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	}
 
 	conf->min_nr_stripes = NR_STRIPES;
+	if (mddev->reshape_position != MaxSector) {
+		int stripes = max_t(int,
+			((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
+			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+		conf->min_nr_stripes = max(NR_STRIPES, stripes);
+		if (conf->min_nr_stripes != NR_STRIPES)
+			printk(KERN_INFO
+				"md/raid:%s: force stripe size %d for reshape\n",
+				mdname(mddev), conf->min_nr_stripes);
+	}
 	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH] dm-bufio: Remove deprecated create_singlethread_workqueue
From: Bhaktipriya Shridhar @ 2016-08-30 16:49 UTC (permalink / raw)
  To: Alasdair Kergon, Mike Snitzer, dm-devel, Shaohua Li
  Cc: Tejun Heo, linux-raid, linux-kernel

The workqueue "dm_bufio_wq" queues a single work item &dm_bufio_work and
hence it doesn't require execution ordering. Hence, alloc_workqueue has
been used to replace the deprecated create_singlethread_workqueue instance.

The WQ_MEM_RECLAIM flag has been set since md / dm are block multi disk
drivers and require forward progress under memory pressure.

Since there are fixed number of work items, explicit concurrency
limit is unnecessary here.

Signed-off-by: Bhaktipriya Shridhar <bhaktipriya96@gmail.com>
---
 drivers/md/dm-bufio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cd77216..d073ec1 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1876,7 +1876,7 @@ static int __init dm_bufio_init(void)
 	__cache_size_refresh();
 	mutex_unlock(&dm_bufio_clients_lock);

-	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
+	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
 	if (!dm_bufio_wq)
 		return -ENOMEM;

--
2.1.4

^ permalink raw reply related

* [PATCH v3] raid6: fix the input of raid6 algorithm
From: liuzhengyuan @ 2016-08-30 13:48 UTC (permalink / raw)
  To: hpa; +Cc: shli, linux-raid, linux-kernel, liuzhengyuang521, ZhengYuan Liu

From: ZhengYuan Liu <liuzhengyuan@kylinos.cn>

To test and choose the best algorithm for raid6, the number of disks
and the disk data must be provided. These inputs currently depend on
the page size and the gfmul table. This causes the number of disks to
be less than 4 when the page size is more than 64KB. This patch
supports arbitrary page sizes by defining a macro for the number of
disks and using a PRNG based on a non-linear inversive congruential
algorithm to fill the disk data.

Signed-off-by: ZhengYuan Liu <liuzhengyuan@kylinos.cn>
---
 lib/raid6/algos.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 14 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0..ca227b7 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -30,6 +30,9 @@ EXPORT_SYMBOL(raid6_empty_zero_page);
 #endif
 #endif
 
+#define RAID6_DISKS	8
+#define RAID6_DISKS_SHIFT	3
+
 struct raid6_calls raid6_call;
 EXPORT_SYMBOL_GPL(raid6_call);
 
@@ -129,7 +132,7 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 }
 
 static inline const struct raid6_calls *raid6_choose_gen(
-	void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
+	void *(*const dptrs)[RAID6_DISKS], const int disks)
 {
 	unsigned long perf, bestgenperf, bestxorperf, j0, j1;
 	int start = (disks>>1)-1, stop = disks-3;	/* work on the second half of the disks */
@@ -200,33 +203,93 @@ static inline const struct raid6_calls *raid6_choose_gen(
 	return best;
 }
 
+/* Euclidean Algorithm is selected to realize inversion */
+static uint32_t invert_euclidian(uint32_t c)
+{
+	uint32_t l1 = 0;
+	uint32_t l2 = 1;
+	uint32_t n = c;
+	uint32_t modulus = 2147483647;
+	uint32_t p = modulus, q, q2;
+
+	for (;;) {
+		q = p / n;
+		l1 += q * l2;
+		p -= q * n;
+		if (p == 0)
+			return l2;
+		q2 = n / p;
+		l2 +=  q2 * l1;
+		n -= q2 * p;
+		if (n == 0)
+			return modulus - l1;
+	}
+}
+
+/* A pseudorandom number generator based on the non-linear inversive */
+/* congruential method; uses an initial state suggested by Peter Hellekalek */
+static uint32_t inversive_congruential(void)
+{
+	uint32_t multipler = 9102;
+	uint32_t increment = 2147483647-36884165;
+	uint32_t modulus = 2147483647; /* cycle length 2^31 - 1 */
+	static uint32_t seed = 1;
+	uint32_t add_ret, mult_ret, invert_ret;
+	uint32_t mult_q, mult_r, sub_q, sub_r;
+
+	invert_ret = invert_euclidian(seed);
+
+	/* use Schrage’s method to avoid multiplication overflow */
+	mult_q = modulus / multipler;
+	mult_r = modulus % multipler;
+	sub_q = multipler * (invert_ret % mult_q);
+	sub_r = mult_r * (invert_ret / mult_q);
+	if (sub_q < sub_r)
+		mult_ret = modulus - (sub_r - sub_q);
+	else
+		mult_ret = sub_q - sub_r;
+
+	/* avoid addition overflow */
+	if (mult_ret < modulus - increment)
+		add_ret = mult_ret + increment;
+	else
+		add_ret = mult_ret - (modulus - increment);
+
+	seed = add_ret;
+	return add_ret;
+}
 
 /* Try to pick the best algorithm */
 /* This code uses the gfmul table as convenient data set to abuse */
 
 int __init raid6_select_algo(void)
 {
-	const int disks = (65536/PAGE_SIZE)+2;
+	const int disks = RAID6_DISKS;
 
 	const struct raid6_calls *gen_best;
 	const struct raid6_recov_calls *rec_best;
-	char *syndromes;
-	void *dptrs[(65536/PAGE_SIZE)+2];
-	int i;
-
-	for (i = 0; i < disks-2; i++)
-		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
+	char *disk_ptr;
+	void *dptrs[RAID6_DISKS];
+	int i, j;
 
-	/* Normal code - use a 2-page allocation to avoid D$ conflict */
-	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
+	/* use a 8-page allocation, The first 6 pages for disks
+	   and the last 2 pages for syndromes */
+	disk_ptr = (void *) __get_free_pages(GFP_KERNEL, RAID6_DISKS_SHIFT);
 
-	if (!syndromes) {
+	if (!disk_ptr) {
 		pr_err("raid6: Yikes!  No memory available.\n");
 		return -ENOMEM;
 	}
 
-	dptrs[disks-2] = syndromes;
-	dptrs[disks-1] = syndromes + PAGE_SIZE;
+	for (i = 0; i < disks-2; i++) {
+		dptrs[i] = disk_ptr + PAGE_SIZE*i;
+		for (j = 0; j < PAGE_SIZE; j = j + 4) {
+			*(uint32_t *)(dptrs[i]+j) = inversive_congruential();
+		}
+	}
+
+	dptrs[disks-2] = disk_ptr + PAGE_SIZE*(disks-2);
+	dptrs[disks-1] = disk_ptr + PAGE_SIZE*(disks-1);
 
 	/* select raid gen_syndrome function */
 	gen_best = raid6_choose_gen(&dptrs, disks);
@@ -234,7 +297,7 @@ int __init raid6_select_algo(void)
 	/* select raid recover functions */
 	rec_best = raid6_choose_recov();
 
-	free_pages((unsigned long)syndromes, 1);
+	free_pages((unsigned long)disk_ptr, RAID6_DISKS_SHIFT);
 
 	return gen_best && rec_best ? 0 : -EINVAL;
 }
-- 
1.9.1





^ permalink raw reply related

* Re: Raid settings
From: Wols Lists @ 2016-08-29 18:43 UTC (permalink / raw)
  To: o1bigtenor; +Cc: Adam Goryachev, Linux-RAID
In-Reply-To: <CAPpdf58qnEqobw8q1LTOAbT0MGRssV4HF2NO7a1-Mra8XnADkA@mail.gmail.com>

On 29/08/16 19:13, o1bigtenor wrote:
>> Once I've got access, I can sort out access for you, and if it works out
>> > well, we can work on this together :-) It'd be nice to get an up-to-date
>> > resource out there :-) Times move on, and the two original guys have
>> > moved on in their lives - now's as good a time as any to pass the baton :-)
>> >
> I'm starting to think that documents should be checked for
> 'currentness' at least
> annually if not more often. The software is constantly being worked on and
> hardware stuff is changing likely on a semi-annual basis.

Good sentiment. Snag is, the web is full of cobweb sites - and given the
propensity of the web to never forget, I don't think you're going to win
... It's like documentation - how many programmers keep documentation up
to date :-)

But yes, if it's (allegedly) hosted at kernel.org, yes it looks official
and should be kept up-to-date. And the reality is, a lot of it still is
good relevant advice. It's just the technical stuff, which isn't
actually the majority of the site. Unfortunately, if the tech stuff is
out of date, techies will distrust the rest.

Cheers,
Wol

^ permalink raw reply

* Re: Raid settings
From: o1bigtenor @ 2016-08-29 18:13 UTC (permalink / raw)
  To: Wols Lists; +Cc: Adam Goryachev, Linux-RAID
In-Reply-To: <57C42840.6040103@youngman.org.uk>

On Mon, Aug 29, 2016 at 7:19 AM, Wols Lists <antlists@youngman.org.uk> wrote:
> On 29/08/16 13:11, o1bigtenor wrote:
>> Would be willing to assist in the document crafting if I were given good
>> information (see it as a way to 'help' knowing bupkis about code!).
>>
>> Wol, do you have any pointers to documents that discuss the variable
>>  to be considered?
>>
>> What kind of shape would you think the document should have?
>>
>> What should be included/excluded?
>
> Take a look at the website -
> https://raid.wiki.kernel.org/index.php/Linux_Raid
>
> In particular look at "Raid setup" and the "Advanced Options" section,
> which is where this should probably go. Something to fit in there, I guess.
>
> Once I've got access, I can sort out access for you, and if it works out
> well, we can work on this together :-) It'd be nice to get an up-to-date
> resource out there :-) Times move on, and the two original guys have
> moved on in their lives - now's as good a time as any to pass the baton :-)
>
I'm starting to think that documents should be checked for
'currentness' at least
annually if not more often. The software is constantly being worked on and
hardware stuff is changing likely on a semi-annual basis.

TTYL

Dee

^ permalink raw reply

* Re: Raid settings
From: Wols Lists @ 2016-08-29 12:19 UTC (permalink / raw)
  To: o1bigtenor; +Cc: Adam Goryachev, Linux-RAID
In-Reply-To: <CAPpdf5-UpzRxJXbs=Lt=QTZ2y_=GUv6fn57=hAXgXeDQ-H41uw@mail.gmail.com>

On 29/08/16 13:11, o1bigtenor wrote:
> Would be willing to assist in the document crafting if I were given good
> information (see it as a way to 'help' knowing bupkis about code!).
> 
> Wol, do you have any pointers to documents that discuss the variable
>  to be considered?
> 
> What kind of shape would you think the document should have?
> 
> What should be included/excluded?

Take a look at the website -
https://raid.wiki.kernel.org/index.php/Linux_Raid

In particular look at "Raid setup" and the "Advanced Options" section,
which is where this should probably go. Something to fit in there, I guess.

Once I've got access, I can sort out access for you, and if it works out
well, we can work on this together :-) It'd be nice to get an up-to-date
resource out there :-) Times move on, and the two original guys have
moved on in their lives - now's as good a time as any to pass the baton :-)

Cheers,
Wol

^ permalink raw reply

* Re: Raid settings
From: o1bigtenor @ 2016-08-29 12:11 UTC (permalink / raw)
  To: Wols Lists; +Cc: Adam Goryachev, Linux-RAID
In-Reply-To: <57C4199F.5030601@youngman.org.uk>

On Mon, Aug 29, 2016 at 6:16 AM, Wols Lists <antlists@youngman.org.uk> wrote:
> On 29/08/16 10:26, o1bigtenor wrote:
>> I've looked but haven't been able to find anything that discusses the
>> points that
>> have been raised so far. Would you point me to a, or some, documents that
>> discuss these options?
>> What are the options besides the 2 you mentioned?
>>
>> What I'm looking for is a methodology for the decision rather than a blanket
>> answer at this point. Thought initially that there might be a 'best' answer but
>> now its looking like a much much more fluid thing.
>
> Apropos all this, I should be getting write access to the linux raid
> wiki sometime soon. At the moment, as I mentioned before, it seems to be
> a cobweb site, with the admin MIA and stuck in the days of kernel 2.6,
> LILO, and parallel ATA drives ... :-)
>
> So I'm planning on a major update/rewrite, and I'll probably be bugging
> the list for info :-) but this is exactly the sort of thing that might
> well go on there, if I can find the info or get someone to write it for me.
>

Would be willing to assist in the document crafting if I were given good
information (see it as a way to 'help' knowing bupkis about code!).

Wol, do you have any pointers to documents that discuss the variables
to be considered?

What kind of shape would you think the document should have?

What should be included/excluded?

Regards

Dee

^ permalink raw reply

* Re: Need Help with crashed RAID5 (that was rebuilding and then had SATA error on another drive)
From: Wols Lists @ 2016-08-29 11:19 UTC (permalink / raw)
  To: Benjammin2068, linux-raid
In-Reply-To: <57C38EF5.7020005@gmail.com>

On 29/08/16 02:25, Benjammin2068 wrote:
> Right - I get that. But not knowing *for sure* I thought I would go look it up and google wasn't exactly helpful for a developer style description of what exactly the difference was.
> 
> again, no worries. I'll get me some of the right drives one way or another.

I don't know whether you can still get them, but there was a post about
a crashed raid1 array here not long ago, and the array contained a
couple of 1TB Seagate Constellations. Those DID support raid, but
they're probably discontinued now :-(

Cheers,
Wol

^ permalink raw reply

* Re: Raid settings
From: Wols Lists @ 2016-08-29 11:16 UTC (permalink / raw)
  To: o1bigtenor, Adam Goryachev; +Cc: Linux-RAID
In-Reply-To: <CAPpdf59b4Cd4CTa5Mfg=1qQk_Gvk4pr+Tm5YJ_5OD5STufzX2g@mail.gmail.com>

On 29/08/16 10:26, o1bigtenor wrote:
> I've looked but haven't been able to find anything that discusses the
> points that
> have been raised so far. Would you point me to a, or some, documents that
> discuss these options?
> What are the options besides the 2 you mentioned?
> 
> What I'm looking for is a methodology for the decision rather than a blanket
> answer at this point. Thought initially that there might be a 'best' answer but
> now its looking like a much much more fluid thing.

Apropos all this, I should be getting write access to the linux raid
wiki sometime soon. At the moment, as I mentioned before, it seems to be
a cobweb site, with the admin MIA and stuck in the days of kernel 2.6,
LILO, and parallel ATA drives ... :-)

So I'm planning on a major update/rewrite, and I'll probably be bugging
the list for info :-) but this is exactly the sort of thing that might
well go on there, if I can find the info or get someone to write it for me.

Cheers,
Wol

^ permalink raw reply

* Re: Raid settings
From: o1bigtenor @ 2016-08-29  9:26 UTC (permalink / raw)
  To: Adam Goryachev; +Cc: Wols Lists, Linux-RAID
In-Reply-To: <038ec076-aa03-993f-7c9a-15f09fac02b4@websitemanagers.com.au>

On Sun, Aug 28, 2016 at 9:46 PM, Adam Goryachev
<mailinglists@websitemanagers.com.au> wrote:
> On 29/08/16 12:28, o1bigtenor wrote:
snip
>>>
>>> Without knowing what you want, we can't know what's best for you.
>>
>> That's what it seems like - - - its possible to justify any setup.
>>
>> I have 2 - 4 disc setups both running raid 10 trying to get s fairly high
>> level
>> of security yet also some through put.
>>
>> I'm asking because what do I do if I need to have say 10 to 25 TB of
>> online
>> storage.
>>
>> Do I go for my raid 10 with 2 sets of 6 TB discs or is there a better way
>> to
>> achieve high levels of security AND throughput?
>>
>> What about for the next level of storage (my thinking here) or at 35 to
>> 100 TB?
>>
>> Maybe the better question is - - - how do I decide what I want?
>>
>
> Hi,
> I think you are forgetting that there are multiple factors, you are only
> looking at the storage capacity, and to some degree protection. You also
> need to consider what sort of performance you want to achieve, and this is
> usually the deciding factor.

Your examples also point to cost being a possible major factor.
>
> To get 25TB, you can easily use 3 x 10TB drives in RAID0 .... but if you
> want some level of protection, then you could choose 4 in RAID5, or 5 in
> RAID6, or 6 in RAID10.
> You of course have the same options when you want 35, or 100, or 1000TB
> etc...
> RAID10 will scale linearly, simply keep adding drives in pairs, and you will
> continue to have the similar level of protection (I guess the chance of a
> pair of drives failing increases as you have more pairs)...
> RAID5/6 you will likely want to use RAID50/60 with no more than X drives in
> each RAID5/6 part, where X is determined by your
> performance/storage/reliability decisions.
>
> After all that, you then need to look at another dozen options, (bitmap,
> chunk size, etc etc), which will also have a significant impact on
> performance (and reliability).

I've looked but haven't been able to find anything that discusses the
points that have been raised so far. Would you point me to one or more
documents that discuss these options?
What are the options besides the 2 you mentioned?

What I'm looking for is a methodology for the decision rather than a blanket
answer at this point. Thought initially that there might be a 'best' answer but
now it's looking like a much, much more fluid thing.
>
> To get 35TB, you might do either 16 x 8TB drives in RAID10 (40TB) or you can
> do 7 x 8TB drives in RAID6 (at these capacities, I'd strongly suggest you
> skip RAID5). You could also consider doing 14 x 8TB drives in RAID60 (two
> sets of 7 drives).
>
> PS, you will want to use this range of drives:
> http://www.wdc.com/en/products/internal/nas/

Thanks for the tip.
>
> Or equivalent, as long as it definitely supports SCT/ERC.
>

Regards

Dee

^ permalink raw reply

* Re: Raid settings
From: Brad Campbell @ 2016-08-29  6:18 UTC (permalink / raw)
  To: Wols Lists, Linux-RAID
In-Reply-To: <57C35ED0.2030107@youngman.org.uk>

On 29/08/16 05:59, Wols Lists wrote:

> For my home system I've got 2 x 3TB in a raid1 config. I had intended to
> add a 3rd drive and go raid5, but with two Barracudas I'd be an idiot
> :-( If I want to go that route, I need three new proper raid drives :-(
> I want maximum disk capacity with some redundancy, so raid 5 or 6 makes
> most sense for me.
>

I'll get roasted for suggesting this, but for a home RAID where 
potential response times in cases of errors are not an issue those 
Barracudas will be fine. Just make sure you set the appropriate 
timeouts. It's not like the drives are going to explode in a ball of 
flames because they don't support ERC, they just won't play nice with 
the default linux stack timeouts.

Sure, if you are buying new drives spend the extra 20 bucks and get 
drives that do support ERC, but if what you have works then just keep on 
keeping on. Add another drive, stretch it out to RAID5 and be happy.

I don't do RAID5 anymore, but for 3 disks with the right configuration 
it's not awful.

I have a couple of RAID10 here and a couple of RAID6. The RAID10 is an 
interesting case, because it'll survive a double drive failure some of 
the time. If the wrong pair fails though it's toast, whereas a RAID6 
will survive a double drive failure *all* of the time.

I like your "most suitable for the circumstances" quote though.

I saw some pretty interesting configurations on Sun X4500's a few years 
ago. 48 drives in varying configurations depending entirely on the 
projected workload. No right tool for every job.

^ permalink raw reply

* Re: Raid settings
From: Adam Goryachev @ 2016-08-29  2:46 UTC (permalink / raw)
  To: o1bigtenor, Wols Lists; +Cc: Linux-RAID
In-Reply-To: <CAPpdf5-pCNLQXmnNQGTVufCKMULcjFPKL8riTnd8Aba4ORtW2g@mail.gmail.com>

On 29/08/16 12:28, o1bigtenor wrote:
> On Sun, Aug 28, 2016 at 4:59 PM, Wols Lists <antlists@youngman.org.uk> wrote:
>> On 28/08/16 22:43, o1bigtenor wrote:
>>> Greetings
>>>
>>> I have been doing some research thinking toward the future.
>>>
>>> Is there a 'best' raid setup?
>> What do you want to achieve? There's no such thing as "best" - there's
>> only "most suitable for the circumstances".
>>> It seems to me (a noob) that each of the options carries some negatives with it.
>>>
>>> Is there a good option for say:
>>>
>>> 2 - 5 disks
>>> 4 - 8 disks
>>> 6 - 12 disks
>>> 10 - 30 disks
>>> etc.
>>>
>>> I looked at raid 5/6/10/50/60/100 and I am wondering where is the
>>> 'best' use of each of these options?
>>>
>> Ignoring linear or stripe (which you seem to have done), with 2 disks
>> the only option is raid 1 (mirror). 3 disks gives you raid 5, and 4
>> disks gives you raid 6.
>>
>> But do you want to make maximum use of the disk space (raid 6 is your
>> friend) or do you want maximum redundancy (raid 1)?
>>
>> For my home system I've got 2 x 3TB in a raid1 config. I had intended to
>> add a 3rd drive and go raid5, but with two Barracudas I'd be an idiot
>> :-( If I want to go that route, I need three new proper raid drives :-(
>> I want maximum disk capacity with some redundancy, so raid 5 or 6 makes
>> most sense for me.
>>
>> Without knowing what you want, we can't know what's best for you.
> That's what it seems like - - - its possible to justify any setup.
>
> I have 2 - 4 disc setups both running raid 10 trying to get s fairly high level
> of security yet also some through put.
>
> I'm asking because what do I do if I need to have say 10 to 25 TB of online
> storage.
>
> Do I go for my raid 10 with 2 sets of 6 TB discs or is there a better way to
> achieve high levels of security AND throughput?
>
> What about for the next level of storage (my thinking here) or at 35 to 100 TB?
>
> Maybe the better question is - - - how do I decide what I want?
>

Hi,
I think you are forgetting that there are multiple factors, you are only 
looking at the storage capacity, and to some degree protection. You also 
need to consider what sort of performance you want to achieve, and this 
is usually the deciding factor.

To get 25TB, you can easily use 3 x 10TB drives in RAID0 .... but if you 
want some level of protection, then you could choose 4 in RAID5, or 5 in 
RAID6, or 6 in RAID10.
You of course have the same options when you want 35, or 100, or 1000TB 
etc...
RAID10 will scale linearly: simply keep adding drives in pairs, and you
will continue to have a similar level of protection (I guess the
chance of a pair of drives failing increases as you have more pairs)...
RAID5/6 you will likely want to use RAID50/60 with no more than X drives 
in each RAID5/6 part, where X is determined by your 
performance/storage/reliability decisions.

After all that, you then need to look at another dozen options, (bitmap, 
chunk size, etc etc), which will also have a significant impact on 
performance (and reliability).

To get 35TB, you might do either 16 x 8TB drives in RAID10 (40TB) or you 
can do 7 x 8TB drives in RAID6 (at these capacities, I'd strongly 
suggest you skip RAID5). You could also consider doing 14 x 8TB drives 
in RAID60 (two sets of 7 drives).

PS, you will want to use this range of drives:
http://www.wdc.com/en/products/internal/nas/

Or equivalent, as long as it definitely supports SCT/ERC.

Regards,
Adam

-- 
Adam Goryachev Website Managers www.websitemanagers.com.au

^ permalink raw reply

* Re: Raid settings
From: o1bigtenor @ 2016-08-29  2:28 UTC (permalink / raw)
  To: Wols Lists; +Cc: Linux-RAID
In-Reply-To: <57C35ED0.2030107@youngman.org.uk>

On Sun, Aug 28, 2016 at 4:59 PM, Wols Lists <antlists@youngman.org.uk> wrote:
> On 28/08/16 22:43, o1bigtenor wrote:
>> Greetings
>>
>> I have been doing some research thinking toward the future.
>>
>> Is there a 'best' raid setup?
>
> What do you want to achieve? There's no such thing as "best" - there's
> only "most suitable for the circumstances".
>>
>> It seems to me (a noob) that each of the options carries some negatives with it.
>>
>> Is there a good option for say:
>>
>> 2 - 5 disks
>> 4 - 8 disks
>> 6 - 12 disks
>> 10 - 30 disks
>> etc.
>>
>> I looked at raid 5/6/10/50/60/100 and I am wondering where is the
>> 'best' use of each of these options?
>>
> Ignoring linear or stripe (which you seem to have done), with 2 disks
> the only option is raid 1 (mirror). 3 disks gives you raid 5, and 4
> disks gives you raid 6.
>
> But do you want to make maximum use of the disk space (raid 6 is your
> friend) or do you want maximum redundancy (raid 1)?
>
> For my home system I've got 2 x 3TB in a raid1 config. I had intended to
> add a 3rd drive and go raid5, but with two Barracudas I'd be an idiot
> :-( If I want to go that route, I need three new proper raid drives :-(
> I want maximum disk capacity with some redundancy, so raid 5 or 6 makes
> most sense for me.
>
> Without knowing what you want, we can't know what's best for you.

That's what it seems like - - - it's possible to justify any setup.

I have 2 - 4 disc setups both running raid 10 trying to get a fairly high level
of security yet also some throughput.

I'm asking because what do I do if I need to have say 10 to 25 TB of online
storage.

Do I go for my raid 10 with 2 sets of 6 TB discs or is there a better way to
achieve high levels of security AND throughput?

What about for the next level of storage (my thinking here) or at 35 to 100 TB?

Maybe the better question is - - - how do I decide what I want?

TIA

Dee

^ permalink raw reply

* Re: Need Help with crashed RAID5 (that was rebuilding and then had SATA error on another drive)
From: Benjammin2068 @ 2016-08-29  1:25 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <3b008fb0-1fb3-f12f-d973-3657de6e6923@websitemanagers.com.au>

On 08/28/2016 06:54 PM, Adam Goryachev wrote:
> When I saw this, I assume it means you can ask for the status, and it will tell you it is disabled, but there is no support to modify the status (ie, turn it on). Totally useless for all intents and purposes....
>
> Then again, I could be wrong... but compared to your other drive which showed additional supports, or on my one here:
> SCT capabilities:              (0x0039) SCT Status supported.
>                                         SCT Error Recovery Control supported.
>                                         SCT Feature Control supported.
>                                         SCT Data Table supported.
>
> ie, the second one is probably what you want, the third allows you to turn it on/off, and no idea about the last option....
>


Right - I get that. But not knowing *for sure* I thought I would go look it up and google wasn't exactly helpful for a developer style description of what exactly the difference was.

again, no worries. I'll get me some of the right drives one way or another.

 -Ben


^ permalink raw reply

* Re: Need Help with crashed RAID5 (that was rebuilding and then had SATA error on another drive)
From: Benjammin2068 @ 2016-08-29  1:23 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <5c02d44b-eb51-4388-eab4-6715760ee6be@youngman.org.uk>

On 08/28/2016 02:20 PM, Anthony Youngman wrote:
> On 28/08/16 19:29, Benjammin2068 wrote:
>> And this drive (sort of  - but not this sub model -- and that's the replacement that Seagate recommended.) is not going to stay in the array.
>
> If they knew you were using it in a raid, and recommended it, then I don't know about your laws but over here in the UK I'd send it back as "unfit for purpose". Under SOGA (Sale Of Goods Act) they've sold you a pup and it's their problem, not yours.
>
> (UK law assumes the salesman knows more than you, and so long as you tell them what you want, that forms part of the contract. Which means if they sell you something that does not meet the requirements you told them, they have to put matters right - either swap the drive for something that is suitable, or give you a refund. They can charge the difference if "suitable" means a more expensive drive, but a lot of UK shops would swallow the loss if they had recommended the wrong drive.)
>

In the US.

I'll have to look at my receipt. The recommendation was when I purchased the *last* drive, not this current set. But I copied and pasted part numbers. So I'll have to look to see what's up.

Like I said, I can find a use for them elsewhere. It's not a huge deal.

 -Ben


^ permalink raw reply

* Re: Need Help with crashed RAID5 (that was rebuilding and then had SATA error on another drive)
From: Adam Goryachev @ 2016-08-28 23:54 UTC (permalink / raw)
  To: Benjammin2068, linux-raid
In-Reply-To: <57C32D8E.9030102@gmail.com>

On 29/08/16 04:29, Benjammin2068 wrote:
>
> On 08/26/2016 01:07 PM, Wols Lists wrote:
>> On 26/08/16 02:20, Ben wrote:
>>> [root@quantum ~]# smartctl -a /dev/sde
>>> smartctl 5.43 2012-06-30 r3573 [x86_64-linux-2.6.32-642.el6.centos.plus.x86_64] (local build)
>>> Copyright (C) 2002-12 by Bruce Allen, http://smartmontools.sourceforge.net
>>>
>>> === START OF INFORMATION SECTION ===
>>> Model Family:     Seagate Barracuda (SATA 3Gb/s, 4K Sectors)
>>> Device Model:     ST1000DM003-1ER162
>>> Serial Number:    Z4YDLXWJ
>>> LU WWN Device Id: 5 000c50 091877801
>>> Firmware Version: CC45
>>> User Capacity:    1,000,204,886,016 bytes [1.00 TB]
>>> Sector Sizes:     512 bytes logical, 4096 bytes physical
>>> Device is:        In smartctl database [for details use: -P show]
>> Sorry Ben - that drive was NOT a smart buy !!! Seagate Barracuda :-(
>>
>> You MUST enable the timeout on this drive :-(
>>
>> Gut feel tells me most 1TB or less drives are okay in a raid - the
>> Barracudas are an exception :-( I've got two 3TB Barracudas mirrored,
>> and from reading the list, there's no way I'd go raid5 for more capacity
>> without ditching them.
>>
>> Most people seem to get WD Reds - I've asked about Seagate NAS but I've
>> not picked up on any reports about them - good or bad. Barracudas - the
>> news is pretty much all bad :-(
>>
>>
> Yea, I figured that out -- just couldn't find a decent detailed reference with what "SCT status supported" means versus the more fully featured.
When I saw this, I assume it means you can ask for the status, and it 
will tell you it is disabled, but there is no support to modify the 
status (ie, turn it on). Totally useless for all intents and purposes....

Then again, I could be wrong... but compared to your other drive which 
showed additional supports, or on my one here:
SCT capabilities:              (0x0039) SCT Status supported.
                                         SCT Error Recovery Control supported.
                                         SCT Feature Control supported.
                                         SCT Data Table supported.

ie, the second one is probably what you want, the third allows you to 
turn it on/off, and no idea about the last option....

Regards,
Adam

-- 
Adam Goryachev Website Managers www.websitemanagers.com.au

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox