From: Dan Williams <dan.j.williams@intel.com>
To: neilb@suse.de, linux-raid@vger.kernel.org
Cc: akpm@osdl.org, linux-kernel@vger.kernel.org, christopher.leech@intel.com
Subject: [PATCH 02/19] raid5: move write operations to a workqueue
Date: Mon, 11 Sep 2006 16:17:46 -0700 [thread overview]
Message-ID: <20060911231746.4737.82707.stgit@dwillia2-linux.ch.intel.com> (raw)
In-Reply-To: <1158015632.4241.31.camel@dwillia2-linux.ch.intel.com>
From: Dan Williams <dan.j.williams@intel.com>
Enable handle_stripe5 to pass off write operations to
raid5_do_soft_block_ops (which can be run as a workqueue). The operations
moved are reconstruct-writes and read-modify-writes formerly handled by
compute_parity5.
Changelog:
* moved raid5_do_soft_block_ops changes into a separate patch
* changed handle_write_operations5 to only initiate write operations, which
prevents new writes from being requested while the current one is in flight
* all blocks undergoing a write are now marked locked and !uptodate at the
beginning of the write operation
* blocks undergoing a read-modify-write need a request flag to distinguish
them from blocks that are locked for reading. Reconstruct-writes still use
the R5_LOCKED bit to select blocks for the operation
* integrated the work queue Kconfig option
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
drivers/md/Kconfig | 21 +++++
drivers/md/raid5.c | 192 ++++++++++++++++++++++++++++++++++++++------
include/linux/raid/raid5.h | 3 +
3 files changed, 190 insertions(+), 26 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf869ed..2a16b3b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -162,6 +162,27 @@ config MD_RAID5_RESHAPE
There should be enough spares already present to make the new
array workable.
+config MD_RAID456_WORKQUEUE
+ depends on MD_RAID456
+ bool "Offload raid work to a workqueue from raid5d"
+ ---help---
+ This option enables raid work (block copy and xor operations)
+ to run in a workqueue. If your platform has a high context
+ switch penalty say N. If you are using hardware offload or
+ are running on an SMP platform say Y.
+
+ If unsure, say Y.
+
+config MD_RAID456_WORKQUEUE_MULTITHREAD
+ depends on MD_RAID456_WORKQUEUE && SMP
+ bool "Enable multi-threaded raid processing"
+ default y
+ ---help---
+ This option controls whether the raid workqueue will be multi-
+ threaded or single threaded.
+
+ If unsure, say Y.
+
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8fde62b..e39d248 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -222,6 +222,8 @@ static void init_stripe(struct stripe_he
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
+ BUG_ON(sh->ops.state);
+ BUG_ON(sh->ops.pending);
CHECK_DEVLOCK();
PRINTK("init_stripe called, stripe %llu\n",
@@ -331,6 +333,9 @@ static int grow_one_stripe(raid5_conf_t
memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
sh->raid_conf = conf;
spin_lock_init(&sh->lock);
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ INIT_WORK(&sh->ops.work, conf->do_block_ops, sh);
+ #endif
if (grow_buffers(sh, conf->raid_disks)) {
shrink_buffers(sh, conf->raid_disks);
@@ -1266,7 +1271,72 @@ static void compute_block_2(struct strip
}
}
+static int handle_write_operations5(struct stripe_head *sh, int rcw)
+{ /* Initiate (but do not carry out) a parity-updating write on stripe sh.
+ *
+ * rcw: zero selects a reconstruct-write; any other value selects a
+ * read-modify-write.
+ *
+ * Only flags and counters are touched here: the matching STRIPE_OP_*
+ * bits are set in sh->state and sh->ops.state, the participating
+ * blocks and the parity block are marked R5_LOCKED, and
+ * sh->ops.pending is incremented.
+ *
+ * Returns the number of blocks this call counted as locked.
+ */
+ int i, pd_idx = sh->pd_idx, disks = sh->disks;
+ int locked=0;
+
+ if (rcw == 0) {
+ /* skip the drain operation on an expand: go straight to the
+ * parity stage and lock every block in the stripe.
+ * NOTE(review): this loop also locks and counts the parity
+ * block, which is counted a second time after the if/else
+ * below, so the returned total includes it twice on this
+ * path - confirm that the caller tolerates this
+ */
+ if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) {
+ set_bit(STRIPE_OP_RCW, &sh->state);
+ set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state);
+ for (i=disks ; i-- ;) {
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ locked++;
+ }
+ } else { /* enter stage 1 of reconstruct write operation:
+ * only blocks with a pending write (dev->towrite)
+ * are locked and marked !uptodate
+ */
+ set_bit(STRIPE_OP_RCW, &sh->state);
+ set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+
+ if (dev->towrite) {
+ set_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ locked++;
+ }
+ }
+ }
+ } else {
+ /* enter stage 1 of read modify write operation; the parity
+ * block must already be up to date at this point
+ */
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+
+ set_bit(STRIPE_OP_RMW, &sh->state);
+ set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
+ for (i=disks ; i-- ;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (i==pd_idx)
+ continue;
+
+ /* For a read-modify write there may be blocks that are
+ * locked for reading while others are ready to be written
+ * so we distinguish these blocks by the RMWReq bit.
+ * Only blocks that have a pending write and are currently
+ * up to date are selected.
+ */
+ if (dev->towrite &&
+ test_bit(R5_UPTODATE, &dev->flags)) {
+ set_bit(R5_RMWReq, &dev->flags);
+ set_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ locked++;
+ }
+ }
+ }
+
+ /* keep the parity disk locked while asynchronous operations
+ * are in flight; it is also marked !uptodate, counted as locked,
+ * and one more pending operation is recorded on the stripe
+ */
+ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+ locked++;
+ sh->ops.pending++;
+ PRINTK("%s: stripe %llu locked: %d op_state: %lx\n",
+ __FUNCTION__, (unsigned long long)sh->sector,
+ locked, sh->ops.state);
+
+ return locked;
+}
/*
* Each stripe/dev can have one or more bion attached.
@@ -1664,7 +1734,6 @@ static void raid5_do_soft_block_ops(void
* schedule a write of some buffers
* return confirmation of parity correctness
*
- * Parity calculations are done inside the stripe lock
* buffers are taken off read_list or write_list, and bh_cache buffers
* get BH_Lock set before the stripe lock is released.
*
@@ -1679,13 +1748,13 @@ static void handle_stripe5(struct stripe
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
- int non_overwrite = 0;
+ int non_overwrite=0, write_complete=0;
int failed_num=0;
struct r5dev *dev;
- PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
- (unsigned long long)sh->sector, atomic_read(&sh->count),
- sh->pd_idx);
+ PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d\n",
+ (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
+ sh->pd_idx);
spin_lock(&sh->lock);
clear_bit(STRIPE_HANDLE, &sh->state);
@@ -1926,8 +1995,56 @@ #endif
set_bit(STRIPE_HANDLE, &sh->state);
}
- /* now to consider writing and what else, if anything should be read */
- if (to_write) {
+ /* Now we check to see if any write operations have recently
+ * completed
+ */
+ if (test_bit(STRIPE_OP_RCW, &sh->state) &&
+ test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) {
+ clear_bit(STRIPE_OP_RCW, &sh->state);
+ clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state);
+ write_complete++;
+ }
+
+ if (test_bit(STRIPE_OP_RMW, &sh->state) &&
+ test_bit(STRIPE_OP_RMW_Done, &sh->ops.state)) {
+ clear_bit(STRIPE_OP_RMW, &sh->state);
+ clear_bit(STRIPE_OP_RMW_Done, &sh->ops.state);
+ BUG_ON(++write_complete > 1);
+ for (i=disks; i--;)
+ clear_bit(R5_RMWReq, &sh->dev[i].flags);
+ }
+
+ /* All the 'written' buffers and the parity block are ready to be
+ * written back to disk
+ */
+ if (write_complete) {
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+ for (i=disks; i--;) {
+ dev = &sh->dev[i];
+ if (test_bit(R5_LOCKED, &dev->flags) &&
+ (i == sh->pd_idx || dev->written)) {
+ PRINTK("Writing block %d\n", i);
+ set_bit(R5_Wantwrite, &dev->flags);
+ if (!test_bit(R5_Insync, &dev->flags)
+ || (i==sh->pd_idx && failed == 0))
+ set_bit(STRIPE_INSYNC, &sh->state);
+ }
+ }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ }
+ }
+
+ /* 1/ Now to consider new write requests and what else, if anything should be read
+ * 2/ Check operations clobber the parity block so do not start new writes while
+ * a check is in flight
+ * 3/ Write operations do not stack
+ */
+ if (to_write && !test_bit(STRIPE_OP_RCW, &sh->state) &&
+ !test_bit(STRIPE_OP_RMW, &sh->state) &&
+ !test_bit(STRIPE_OP_CHECK, &sh->state)) {
int rmw=0, rcw=0;
for (i=disks ; i--;) {
/* would I have to read this buffer for read_modify_write */
@@ -2000,25 +2117,8 @@ #endif
}
/* now if nothing is locked, and if we have enough data, we can start a write request */
if (locked == 0 && (rcw == 0 ||rmw == 0) &&
- !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
- PRINTK("Computing parity...\n");
- compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
- /* now every locked buffer is ready to be written */
- for (i=disks; i--;)
- if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
- PRINTK("Writing block %d\n", i);
- locked++;
- set_bit(R5_Wantwrite, &sh->dev[i].flags);
- if (!test_bit(R5_Insync, &sh->dev[i].flags)
- || (i==sh->pd_idx && failed == 0))
- set_bit(STRIPE_INSYNC, &sh->state);
- }
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
- }
+ !test_bit(STRIPE_BIT_DELAY, &sh->state))
+ locked += handle_write_operations5(sh, rcw);
}
/* maybe we need to check and possibly fix the parity for this stripe
@@ -2150,8 +2250,17 @@ #endif
}
}
+ queue_raid_work(sh);
+
spin_unlock(&sh->lock);
+ #ifndef CONFIG_MD_RAID456_WORKQUEUE
+ while (test_bit(STRIPE_OP_QUEUED, &sh->state)) {
+ PRINTK("run do_block_ops\n");
+ conf->do_block_ops(sh);
+ }
+ #endif
+
while ((bi=return_bi)) {
int bytes = bi->bi_size;
@@ -3439,6 +3548,30 @@ static int run(mddev_t *mddev)
if (!conf->spare_page)
goto abort;
}
+
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ sprintf(conf->workqueue_name, "%s_raid5_ops",
+ mddev->gendisk->disk_name);
+
+ /* The Kconfig entry is MD_RAID456_WORKQUEUE_MULTITHREAD, so the
+ * symbol to test carries the full _WORKQUEUE_MULTITHREAD suffix;
+ * testing a shorter name would always select the single-threaded
+ * queue below.
+ */
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE_MULTITHREAD
+ if ((conf->block_ops_queue = create_workqueue(conf->workqueue_name))
+ == NULL)
+ goto abort;
+ #else
+ if ((conf->block_ops_queue = create_singlethread_workqueue(
+ conf->workqueue_name)) == NULL)
+ goto abort;
+ #endif
+ #endif
+
+ /* To Do:
+ * 1/ Offload to asynchronous copy / xor engines
+ * 2/ Automated selection of optimal do_block_ops
+ * routine similar to the xor template selection
+ */
+ conf->do_block_ops = raid5_do_soft_block_ops;
+
+
spin_lock_init(&conf->device_lock);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
@@ -3598,6 +3731,10 @@ abort:
safe_put_page(conf->spare_page);
kfree(conf->disks);
kfree(conf->stripe_hashtbl);
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ if (conf->do_block_ops)
+ destroy_workqueue(conf->block_ops_queue);
+ #endif
kfree(conf);
}
mddev->private = NULL;
@@ -3618,6 +3755,9 @@ static int stop(mddev_t *mddev)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
kfree(conf->disks);
+ #ifdef CONFIG_MD_RAID456_WORKQUEUE
+ destroy_workqueue(conf->block_ops_queue);
+ #endif
kfree(conf);
mddev->private = NULL;
return 0;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index c8a315b..31ae55c 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -3,6 +3,7 @@ #define _RAID5_H
#include <linux/raid/md.h>
#include <linux/raid/xor.h>
+#include <linux/workqueue.h>
/*
*
@@ -333,6 +334,7 @@ struct raid5_private_data {
atomic_t preread_active_stripes; /* stripes with scheduled io */
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
+
#ifdef CONFIG_MD_RAID456_WORKQUEUE
struct workqueue_struct *block_ops_queue;
#endif
@@ -376,6 +378,7 @@ struct raid5_private_data {
typedef struct raid5_private_data raid5_conf_t;
#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
+
/* must be called under the stripe lock */
static inline void queue_raid_work(struct stripe_head *sh)
{
next prev parent reply other threads:[~2006-09-11 23:17 UTC|newest]
Thread overview: 57+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-09-11 23:00 [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction Dan Williams
2006-09-11 23:00 ` Dan Williams
2006-09-11 23:17 ` [PATCH 01/19] raid5: raid5_do_soft_block_ops Dan Williams
2006-09-11 23:34 ` Jeff Garzik
2006-09-11 23:17 ` Dan Williams [this message]
2006-09-11 23:36 ` [PATCH 02/19] raid5: move write operations to a workqueue Jeff Garzik
2006-09-11 23:17 ` [PATCH 03/19] raid5: move check parity " Dan Williams
2006-09-11 23:17 ` [PATCH 04/19] raid5: move compute block " Dan Williams
2006-09-11 23:18 ` [PATCH 05/19] raid5: move read completion copies " Dan Williams
2006-09-11 23:18 ` [PATCH 06/19] raid5: move the reconstruct write expansion operation " Dan Williams
2006-09-11 23:18 ` [PATCH 07/19] raid5: remove compute_block and compute_parity5 Dan Williams
2006-09-11 23:18 ` [PATCH 08/19] dmaengine: enable multiple clients and operations Dan Williams
2006-09-11 23:44 ` Jeff Garzik
2006-09-12 0:14 ` Dan Williams
2006-09-12 0:52 ` Roland Dreier
2006-09-12 6:18 ` Dan Williams
2006-09-12 9:15 ` Evgeniy Polyakov
2006-09-13 4:04 ` Jeff Garzik
2006-09-15 16:38 ` Olof Johansson
2006-09-15 19:44 ` [PATCH] dmaengine: clean up and abstract function types (was Re: [PATCH 08/19] dmaengine: enable multiple clients and operations) Olof Johansson
2006-09-15 20:02 ` [PATCH] [v2] " Olof Johansson
2006-09-18 22:56 ` [PATCH] " Dan Williams
2006-09-19 1:05 ` Olof Johansson
2006-09-19 11:20 ` Alan Cox
2006-09-19 16:32 ` Olof Johansson
2006-09-11 23:18 ` [PATCH 09/19] dmaengine: reduce backend address permutations Dan Williams
2006-09-15 14:46 ` Olof Johansson
2006-09-11 23:18 ` [PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to clients Dan Williams
2006-09-11 23:18 ` [PATCH 11/19] dmaengine: add memset as an asynchronous dma operation Dan Williams
2006-09-11 23:50 ` Jeff Garzik
2006-09-11 23:18 ` [PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not support memcpy Dan Williams
2006-09-11 23:51 ` Jeff Garzik
2006-09-11 23:18 ` [PATCH 13/19] dmaengine: add support for dma xor zero sum operations Dan Williams
2006-09-11 23:18 ` [PATCH 14/19] dmaengine: add dma_sync_wait Dan Williams
2006-09-11 23:52 ` Jeff Garzik
2006-09-11 23:18 ` [PATCH 15/19] dmaengine: raid5 dma client Dan Williams
2006-09-11 23:54 ` Jeff Garzik
2006-09-11 23:19 ` [PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID engines Dan Williams
2006-09-15 14:57 ` Olof Johansson
2006-09-11 23:19 ` [PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU defs Dan Williams
2006-09-11 23:55 ` Jeff Garzik
2006-09-11 23:19 ` [PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization Dan Williams
2006-09-11 23:56 ` Jeff Garzik
2006-09-11 23:19 ` [PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver Dan Williams
2006-09-11 23:38 ` [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction Jeff Garzik
2006-09-11 23:38 ` Jeff Garzik
2006-09-11 23:53 ` Dan Williams
2006-09-12 2:41 ` Jeff Garzik
2006-09-12 5:47 ` Dan Williams
2006-09-13 4:05 ` Jeff Garzik
2006-09-13 7:15 ` Jakob Oestergaard
2006-09-13 19:17 ` Dan Williams
2006-09-14 7:42 ` Jakob Oestergaard
2006-10-11 1:46 ` Dan Williams
2006-10-08 22:18 ` Neil Brown
2006-10-10 18:23 ` Dan Williams
2006-10-11 2:44 ` Neil Brown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060911231746.4737.82707.stgit@dwillia2-linux.ch.intel.com \
--to=dan.j.williams@intel.com \
--cc=akpm@osdl.org \
--cc=christopher.leech@intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-raid@vger.kernel.org \
--cc=neilb@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.