* [PATCH 001 of 006] raid5: Move write operations to a work queue
From: Dan Williams @ 2006-06-28 18:23 UTC
  To: NeilBrown; +Cc: linux-kernel, linux-raid

This patch moves write operations (reconstruct-write and read-modify-write)
to a work queue.  Note that the next patch in this series fixes some
incorrect assumptions about having multiple operations in flight (i.e. ignore
this version of 'queue_raid_work').
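
For context, a minimal sketch of the deferral pattern the patch builds on,
using the pre-2.6.20 three-argument INIT_WORK() API that the patch itself
uses; the identifiers below are illustrative only and do not appear in the
patch:

	#include <linux/errno.h>
	#include <linux/workqueue.h>

	/* Illustrative only: defer heavy per-object work to a dedicated queue. */
	struct demo_ctx {
		struct work_struct work;
		/* ... per-stripe state would live here ... */
	};

	static struct workqueue_struct *demo_wq;

	/* runs later in workqueue context, outside any spinlock */
	static void demo_block_ops(void *data)
	{
		struct demo_ctx *ctx = data;
		/* heavy copy/XOR work goes here */
		(void)ctx;
	}

	static int demo_setup(struct demo_ctx *ctx)
	{
		demo_wq = create_workqueue("demo_ops");
		if (!demo_wq)
			return -ENOMEM;
		INIT_WORK(&ctx->work, demo_block_ops, ctx);
		return 0;
	}

	static void demo_defer(struct demo_ctx *ctx)
	{
		/* demo_block_ops(ctx) will be invoked by a kernel thread */
		queue_work(demo_wq, &ctx->work);
	}

The patch applies the same pattern per stripe: the work item is initialized
when the stripe head is allocated, queue_raid_work() hands it to the
per-array workqueue, and raid5_do_soft_block_ops() performs the copy/XOR
work outside the stripe lock.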

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

 drivers/md/raid5.c         |  314 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/raid/raid5.h |   67 +++++++++
 2 files changed, 357 insertions(+), 24 deletions(-)

===================================================================
Index: linux-2.6-raid/drivers/md/raid5.c
===================================================================
--- linux-2.6-raid.orig/drivers/md/raid5.c	2006-06-28 08:44:11.000000000 -0700
+++ linux-2.6-raid/drivers/md/raid5.c	2006-06-28 09:52:07.000000000 -0700
@@ -305,6 +305,7 @@
 	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
+	INIT_WORK(&sh->ops.work, conf->do_block_ops, sh);
 
 	if (grow_buffers(sh, conf->raid_disks)) {
 		shrink_buffers(sh, conf->raid_disks);
@@ -1224,6 +1225,80 @@
 	}
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw, int locked)
+{
+	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	int complete=0;
+
+	if (test_bit(STRIPE_OP_RCW, &sh->state) &&
+			test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) {
+				clear_bit(STRIPE_OP_RCW, &sh->state);
+				clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state);
+				complete++;
+	}
+
+	if (test_bit(STRIPE_OP_RMW, &sh->state) &&
+			test_bit(STRIPE_OP_RMW_Done, &sh->ops.state)) {
+				clear_bit(STRIPE_OP_RMW, &sh->state);
+				clear_bit(STRIPE_OP_RMW_Done, &sh->ops.state);
+				BUG_ON(++complete == 2);
+	}
+
+
+	/* If no operation is currently in process then use the rcw flag to
+	 * select an operation
+	 */
+	if (locked == 0) {
+		if (rcw == 0) {
+			/* enter stage 1 of reconstruct write operation */
+			set_bit(STRIPE_OP_RCW, &sh->state);
+			set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+
+				if (i!=pd_idx && dev->towrite) {
+					set_bit(R5_LOCKED, &dev->flags);
+					locked++;
+				}
+			}
+		} else {
+			/* enter stage 1 of read modify write operation */
+			BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+			set_bit(STRIPE_OP_RMW, &sh->state);
+			set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				if (i==pd_idx)
+					continue;
+
+				if (dev->towrite &&
+				    test_bit(R5_UPTODATE, &dev->flags)) {
+					set_bit(R5_LOCKED, &dev->flags);
+					locked++;
+				}
+			}
+		}
+	} else if (locked && complete == 0) /* the queue has an operation in flight */
+		locked = -EBUSY;
+	else if (complete)
+		locked = 0;
+
+	/* keep the parity disk locked while asynchronous operations
+	 * are in flight
+	 */
+	if (locked > 0) {
+		set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+		clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+		sh->ops.queue_count++;
+	} else if (locked == 0)
+		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+
+	PRINTK("%s: stripe %llu locked: %d complete: %d op_state: %lx\n",
+		__FUNCTION__, (unsigned long long)sh->sector,
+		locked, complete, sh->ops.state);
+
+	return locked;
+}
 
 
 /*
@@ -1320,6 +1395,174 @@
 	return pd_idx;
 }
 
+static inline void drain_bio(struct bio *wbi, sector_t sector, struct page *page)
+{
+	while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+		copy_data(1, wbi, page, sector);
+		wbi = r5_next_bio(wbi, sector);
+	}
+}
+
+/* must be called under the stripe lock */
+static void queue_raid_work(struct stripe_head *sh)
+{
+	if (--sh->ops.queue_count == 0) {
+		atomic_inc(&sh->count);
+		queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work);
+	} else if (sh->ops.queue_count < 0)
+		sh->ops.queue_count = 0;
+}
+
+/*
+ * raid5_do_soft_block_ops - perform block memory operations on stripe data
+ * outside the spin lock.
+ */
+static void raid5_do_soft_block_ops(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1;
+	void *ptr[MAX_XOR_BLOCKS];
+	struct bio *chosen;
+	int overlap=0, new_work=0, written=0;
+	unsigned long state, ops_state;
+
+	/* take a snapshot of what needs to be done at this point in time */
+	spin_lock(&sh->lock);
+	state = sh->state;
+	ops_state = sh->ops.state;
+	spin_unlock(&sh->lock);
+
+	if (test_bit(STRIPE_OP_RMW, &state)) {
+		PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n",
+			__FUNCTION__, (unsigned long long)sh->sector,
+			ops_state);
+
+		ptr[0] = page_address(sh->dev[pd_idx].page);
+
+		if (test_and_clear_bit(STRIPE_OP_RMW_ParityPre, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				/* if it is locked then servicing
+				 * has been requested
+				 */
+				if (dev->towrite && test_bit(R5_LOCKED, &dev->flags)) {
+					ptr[count++] = page_address(dev->page);
+					/* ? is the device_lock necessary here, compute_parity
+					 * does not lock for this operation ?
+					 */
+					chosen = dev->towrite;
+					dev->towrite = NULL;
+
+					overlap++;
+
+					BUG_ON(dev->written);
+					dev->written = chosen;
+					check_xor();
+				}
+			}
+			if (count != 1)
+				xor_block(count, STRIPE_SIZE, ptr);
+			set_bit(STRIPE_OP_RMW_Drain, &ops_state);
+		}
+		if (test_and_clear_bit(STRIPE_OP_RMW_Drain, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				written++;
+				drain_bio(dev->written, dev->sector, dev->page);
+			}
+			set_bit(STRIPE_OP_RMW_ParityPost, &ops_state);
+		}
+		if (test_and_clear_bit(STRIPE_OP_RMW_ParityPost, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				if (dev->written) {
+					ptr[count++] = page_address(dev->page);
+					check_xor();
+				}
+			}
+			if (count != 1)
+				xor_block(count, STRIPE_SIZE, ptr);
+
+			/* signal completion and acknowledge the last state seen
+			 * by sh->ops.state
+			 */
+			set_bit(STRIPE_OP_RMW_Done, &ops_state);
+			set_bit(STRIPE_OP_RMW_ParityPre, &ops_state);
+		}
+
+	} else if (test_bit(STRIPE_OP_RCW, &state)) {
+		PRINTK("%s: stripe %llu STRIPE_OP_RCW op_state: %lx\n",
+			__FUNCTION__, (unsigned long long)sh->sector,
+			ops_state);
+
+		ptr[0] = page_address(sh->dev[pd_idx].page);
+
+		if (test_and_clear_bit(STRIPE_OP_RCW_Drain, &ops_state)) {
+			for (i=disks ; i-- ;) {
+				struct r5dev *dev = &sh->dev[i];
+				if (i!=pd_idx && dev->towrite &&
+					test_bit(R5_LOCKED, &dev->flags)) {
+					chosen = dev->towrite;
+					dev->towrite = NULL;
+
+					BUG_ON(dev->written);
+					dev->written = chosen;
+
+					overlap++;
+					written++;
+
+					drain_bio(dev->written, dev->sector,
+						dev->page);
+				} else if (i==pd_idx)
+					memset(ptr[0], 0, STRIPE_SIZE);
+			}
+			set_bit(STRIPE_OP_RCW_Parity, &ops_state);
+		}
+		if (test_and_clear_bit(STRIPE_OP_RCW_Parity, &ops_state)) {
+			for (i=disks; i--;)
+				if (i != pd_idx) {
+					ptr[count++] = page_address(sh->dev[i].page);
+					check_xor();
+				}
+			if (count != 1)
+				xor_block(count, STRIPE_SIZE, ptr);
+
+			/* signal completion and acknowledge the last state seen
+			 * by sh->ops.state
+			 */
+			set_bit(STRIPE_OP_RCW_Done, &ops_state);
+			set_bit(STRIPE_OP_RCW_Drain, &ops_state);
+
+		}
+	}
+
+	spin_lock(&sh->lock);
+	/* Update the state of operations, by XORing we clear the stage 1 requests
+	 * while preserving new requests.
+	 */
+	sh->ops.state ^= ops_state;
+
+	if (written)
+		for (i=disks ; i-- ;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (dev->written)
+				set_bit(R5_UPTODATE, &dev->flags);
+		}
+
+	if (overlap)
+		for (i= disks; i-- ;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&sh->raid_conf->wait_for_overlap);
+		}
+
+	sh->ops.queue_count += new_work;
+	set_bit(STRIPE_HANDLE, &sh->state);
+	queue_raid_work(sh);
+	spin_unlock(&sh->lock);
+
+	release_stripe(sh);
+}
 
 /*
  * handle_stripe - do things to a stripe.
@@ -1333,7 +1576,6 @@
  *    schedule a write of some buffers
  *    return confirmation of parity correctness
  *
- * Parity calculations are done inside the stripe lock
  * buffers are taken off read_list or write_list, and bh_cache buffers
  * get BH_Lock set before the stripe lock is released.
  *
@@ -1352,9 +1594,9 @@
 	int failed_num=0;
 	struct r5dev *dev;
 
-	PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
-		(unsigned long long)sh->sector, atomic_read(&sh->count),
-		sh->pd_idx);
+	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d\n",
+	       (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
+	       sh->pd_idx);
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
@@ -1596,7 +1838,8 @@
 	}
 
 	/* now to consider writing and what else, if anything should be read */
-	if (to_write) {
+	if (to_write || test_bit(STRIPE_OP_RCW, &sh->state) ||
+		test_bit(STRIPE_OP_RMW, &sh->state)) {
 		int rmw=0, rcw=0;
 		for (i=disks ; i--;) {
 			/* would I have to read this buffer for read_modify_write */
@@ -1668,25 +1911,29 @@
 				}
 			}
 		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		if (locked == 0 && (rcw == 0 ||rmw == 0) &&
-		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-			PRINTK("Computing parity...\n");
-			compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
-			/* now every locked buffer is ready to be written */
-			for (i=disks; i--;)
-				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-					PRINTK("Writing block %d\n", i);
-					locked++;
-					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-					if (!test_bit(R5_Insync, &sh->dev[i].flags)
-					    || (i==sh->pd_idx && failed == 0))
-						set_bit(STRIPE_INSYNC, &sh->state);
+		/* ...or, if we have previously started write operations we can now advance the state */
+		if ((locked == 0 && (rcw == 0 ||rmw == 0) &&
+		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) ||
+		    test_bit(STRIPE_OP_RCW, &sh->state) || test_bit(STRIPE_OP_RMW, &sh->state)) {
+			int work_queued = handle_write_operations5(sh, rcw, locked);
+			if (work_queued == 0) {
+				/* now every locked buffer is ready to be written */
+				for (i=disks; i--;)
+					if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+						PRINTK("Writing block %d\n", i);
+						locked++;
+						set_bit(R5_Wantwrite, &sh->dev[i].flags);
+						if (!test_bit(R5_Insync, &sh->dev[i].flags)
+						    || (i==sh->pd_idx && failed == 0))
+							set_bit(STRIPE_INSYNC, &sh->state);
+					}
+				if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					atomic_dec(&conf->preread_active_stripes);
+					if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+						md_wakeup_thread(conf->mddev->thread);
 				}
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
+			} else if (work_queued > 0)
+				locked += work_queued;
 		}
 	}
 
@@ -1819,6 +2066,8 @@
 			}
 	}
 
+	queue_raid_work(sh);
+
 	spin_unlock(&sh->lock);
 
 	while ((bi=return_bi)) {
@@ -1829,6 +2078,7 @@
 		bi->bi_size = 0;
 		bi->bi_end_io(bi, bytes, 0);
 	}
+
 	for (i=disks; i-- ;) {
 		int rw;
 		struct bio *bi;
@@ -3117,6 +3367,21 @@
 		if (!conf->spare_page)
 			goto abort;
 	}
+
+	sprintf(conf->workqueue_name, "%s_raid5_ops",
+		mddev->gendisk->disk_name);
+
+	if ((conf->block_ops_queue = create_workqueue(conf->workqueue_name))
+				     == NULL)
+		goto abort;
+
+	/* To Do:
+	 * 1/ Offload to asynchronous copy / xor engines
+	 * 2/ Automated selection of optimal do_block_ops
+	 *	routine similar to the xor template selection
+	 */
+	conf->do_block_ops = raid5_do_soft_block_ops;
+
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -3279,6 +3544,8 @@
 		safe_put_page(conf->spare_page);
 		kfree(conf->disks);
 		kfree(conf->stripe_hashtbl);
+		if (conf->do_block_ops)
+			destroy_workqueue(conf->block_ops_queue);
 		kfree(conf);
 	}
 	mddev->private = NULL;
@@ -3299,6 +3566,7 @@
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
 	kfree(conf->disks);
+	destroy_workqueue(conf->block_ops_queue);
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
Index: linux-2.6-raid/include/linux/raid/raid5.h
===================================================================
--- linux-2.6-raid.orig/include/linux/raid/raid5.h	2006-06-28 08:44:11.000000000 -0700
+++ linux-2.6-raid/include/linux/raid/raid5.h	2006-06-28 10:34:54.000000000 -0700
@@ -3,6 +3,8 @@
 
 #include <linux/raid/md.h>
 #include <linux/raid/xor.h>
+#include <linux/workqueue.h>
+#include <linux/dmaengine.h>
 
 /*
  *
@@ -123,6 +125,14 @@
  * The refcount counts each thread that have activated the stripe,
  * plus raid5d if it is handling it, plus one for each active request
  * on a cached buffer.
+ *
+ * Block operations (copy, xor, block fill, and block compare) are executed
+ * outside the spin lock.  A stripe can have multiple operations in flight provided
+ * that the operations do not have data dependencies.  For most cases data
+ * dependencies will be avoided by the 'overlap' protection logic in add_stripe_bio.
+ * A case that violates this rule is compute block operations where the work queue must
+ * guarantee that blocks are up to date before proceeding with a write or check
+ * parity operation.
  */
 
 struct stripe_head {
@@ -136,6 +146,16 @@
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;			/* disks in stripe */
+	struct stripe_operations {
+		int			queue_count;	/* if == 0 places stripe in the workqueue */
+		unsigned long		state;		/* state of block operations */
+		struct work_struct	work;		/* work queue descriptor */
+		#ifdef CONFIG_DMA_ENGINE
+		u32 			dma_result;	/* storage for dma engine results */
+		dma_cookie_t		dma_cookie;	/* last issued dma operation */
+		struct dma_chan		*dma_chan;	/* dma channel for ops offload */
+		#endif
+	} ops;
 	struct r5dev {
 		struct bio	req;
 		struct bio_vec	vec;
@@ -145,6 +165,7 @@
 		unsigned long	flags;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };
+
 /* Flags */
 #define	R5_UPTODATE	0	/* page contains current data */
 #define	R5_LOCKED	1	/* IO has been submitted on "req" */
@@ -156,8 +177,9 @@
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
 #define	R5_ReadError	8	/* seen a read error here recently */
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
-
 #define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Consistent	11	/* Block is HW DMA-able without a cache flush */
+
 /*
  * Write method
  */
@@ -179,6 +201,41 @@
 #define	STRIPE_EXPANDING	9
 #define	STRIPE_EXPAND_SOURCE	10
 #define	STRIPE_EXPAND_READY	11
+#define	STRIPE_OP_RCW		12
+#define	STRIPE_OP_RMW		13 /* RAID-5 only */
+#define	STRIPE_OP_UPDATE	14 /* RAID-6 only */
+#define	STRIPE_OP_CHECK		15
+#define	STRIPE_OP_COMPUTE	16
+#define	STRIPE_OP_COMPUTE2	17 /* RAID-6 only */
+#define	STRIPE_OP_BIOFILL	18
+
+/*
+ * These flags are communication markers between the handle_stripe[5|6]
+ * routine and the block operations work queue
+ * - The _End definitions are a signal from handle_stripe to the work queue
+ *   to ensure the completion of the operation so the results can be committed
+ *   to disk
+ * - The _Done definitions signal completion from work queue to handle_stripe
+ * - All other definitions are service requests for the work queue
+ */
+#define	STRIPE_OP_RCW_Drain		0
+#define	STRIPE_OP_RCW_Parity		1
+#define	STRIPE_OP_RCW_End		2
+#define	STRIPE_OP_RCW_Done		3
+#define	STRIPE_OP_RMW_ParityPre		4
+#define	STRIPE_OP_RMW_Drain		5
+#define	STRIPE_OP_RMW_ParityPost	6
+#define	STRIPE_OP_RMW_End		7
+#define	STRIPE_OP_RMW_Done		8
+#define	STRIPE_OP_CHECK_Gen   		9
+#define	STRIPE_OP_CHECK_Verify		10
+#define	STRIPE_OP_CHECK_End		11
+#define	STRIPE_OP_CHECK_Done		12
+#define	STRIPE_OP_COMPUTE_Prep		13
+#define	STRIPE_OP_COMPUTE_Parity	14
+#define	STRIPE_OP_COMPUTE_End		15
+#define	STRIPE_OP_COMPUTE_Done		16
+
 /*
  * Plugging:
  *
@@ -229,11 +286,16 @@
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
+
+	struct workqueue_struct *block_ops_queue;
+	void (*do_block_ops)(void *);
+
 	/* unfortunately we need two cache names as we temporarily have
 	 * two caches.
 	 */
 	int			active_name;
 	char			cache_name[2][20];
+	char			workqueue_name[20];
 	kmem_cache_t		*slab_cache; /* for allocating stripes */
 
 	int			seq_flush, seq_write;
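
As an aside for readers skimming the diff: below is a small, self-contained
userspace model (not kernel code) of how the new RCW flags are intended to
flow between handle_stripe5() and the work queue.  The flag names mirror the
patch (where they are bit numbers used with set_bit/test_bit); the function
names and everything else are simplified for illustration only.

	#include <stdio.h>

	#define OP_RCW		(1UL << 12)	/* models STRIPE_OP_RCW in sh->state */
	#define OP_RCW_DRAIN	(1UL << 0)	/* models STRIPE_OP_RCW_Drain in ops.state */
	#define OP_RCW_PARITY	(1UL << 1)	/* models STRIPE_OP_RCW_Parity */
	#define OP_RCW_DONE	(1UL << 3)	/* models STRIPE_OP_RCW_Done */

	struct model_stripe {
		unsigned long state;		/* models sh->state */
		unsigned long ops_state;	/* models sh->ops.state */
	};

	/* handle_stripe5(): request a reconstruct-write (stage 1) */
	static void request_rcw(struct model_stripe *sh)
	{
		sh->state |= OP_RCW;
		sh->ops_state |= OP_RCW_DRAIN;
	}

	/* work queue: drain the write bios, then compute parity */
	static void service_rcw(struct model_stripe *sh)
	{
		if (sh->ops_state & OP_RCW_DRAIN) {
			sh->ops_state &= ~OP_RCW_DRAIN;
			/* copy_data() of each dev->towrite would run here */
			sh->ops_state |= OP_RCW_PARITY;
		}
		if (sh->ops_state & OP_RCW_PARITY) {
			sh->ops_state &= ~OP_RCW_PARITY;
			/* xor_block() over the data blocks would run here */
			sh->ops_state |= OP_RCW_DONE;
		}
	}

	/* handle_stripe5(), a later pass: observe completion, issue the writes */
	static void complete_rcw(struct model_stripe *sh)
	{
		if ((sh->state & OP_RCW) && (sh->ops_state & OP_RCW_DONE)) {
			sh->state &= ~OP_RCW;
			sh->ops_state &= ~OP_RCW_DONE;
			printf("RCW done: locked blocks can be marked R5_Wantwrite\n");
		}
	}

	int main(void)
	{
		struct model_stripe sh = { 0, 0 };

		request_rcw(&sh);	/* under sh->lock in the real code */
		service_rcw(&sh);	/* runs from the per-array workqueue */
		complete_rcw(&sh);	/* next handle_stripe() pass */
		return 0;
	}

The RMW path is analogous, with its ParityPre/Drain/ParityPost stages; the
model deliberately omits the ops.state XOR-merge and the queue_count
bookkeeping that the next patch in the series reworks.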


* Re: [PATCH 001 of 006] raid5: Move write operations to a work queue
From: Yuri Tikhonov @ 2006-07-27 14:49 UTC
  To: Dan Williams; +Cc: linux-raid


 Hello, Dan.

 I've looked through your patches and have some suggestions about how write operations are processed.

 In the current implementation of the RAID-5 driver, the RMW operation does not begin until the old blocks in the
stripe cache, which are about to be rewritten, become UPTODATE.
 But if you have dedicated h/w DMA engines, then, while the IOC (input/output controller) transfers the old
strip data from disk into the stripe cache, it may make sense to start a DMA engine that transfers the new
strip data from the bio that requested the write. Then, when the IOC operation completes, we already have all the data
needed to compute the new parity value.

1) For the current implementation:

 Trmw = Tioc1 + Txor1 + Tdma + Txor2 + Tioc2,
 where Tioc1 is the time it takes to update the stripe cache with the old data, Txor1 is the time it takes to subtract
the old data from the old parity value, Tdma is the time it takes to update the strip with the new data, Txor2 is the
time it takes to compute the new parity, and Tioc2 is the time it takes to transfer the updated data to the disks.
 So, Trmw = 2*Tioc + 2*Txor + Tdma

2) If copying the old and the new data into the stripe cache is performed simultaneously, then the time to complete the
whole RMW operation becomes:

 T'rmw = max(Tioc1, Tdma) + 2*Txor + Tioc2,
 where Tioc1 is the time it takes to update the stripe cache with the old data, Tdma is the time it takes to update the
strip with the new data, 2*Txor is the time it takes to compute the new parity, and Tioc2 is the time it takes to
transfer the updated data to the disks.
 So, T'rmw = 2*Tioc + 2*Txor.
(In any case, I think that Tioc > Tdma, because Tioc corresponds to time spent reading from disk, while
Tdma corresponds to operations on SDRAM, which are faster.)

 Also, 2*Txor for (2) is less than 2*Txor for (1), because in approach (2) we have to prepare the XOR engine
descriptors only once, while in approach (1) we have to prepare them twice.
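
 As a rough illustration of the two formulas above (the latencies below are
made up for the example; only the structure of the formulas comes from the
analysis):

	#include <stdio.h>

	/* Hypothetical latencies, in microseconds; the numbers are invented
	 * purely for illustration.
	 */
	int main(void)
	{
		double t_ioc = 5000.0;	/* read or write one strip from/to disk */
		double t_xor = 200.0;	/* one XOR pass over the strip */
		double t_dma = 50.0;	/* memory-to-memory copy of the new data */

		double t_rmw  = 2 * t_ioc + 2 * t_xor + t_dma;		/* scheme (1) */
		double t_rmw2 = (t_ioc > t_dma ? t_ioc : t_dma)
				+ 2 * t_xor + t_ioc;			/* scheme (2) */

		printf("current RMW: %.0f us\n", t_rmw);
		printf("overlapped : %.0f us (saving %.0f us)\n", t_rmw2, t_rmw - t_rmw2);
		return 0;
	}

 With numbers of this shape the two schedules differ by roughly Tdma (plus
whatever is saved on XOR descriptor setup), which is the trade-off discussed
in the reply below.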

 Does it make sense to revise your RAID-5 driver implementation to allow the IOC and the DMA engine to have separate destination buffers? That is, some kind of stripe shadow: the IOC would copy into the regular buffer in the stripe cache, and the DMA engine into the shadow one.
 
 Regards, Yuri.



* Re: [PATCH 001 of 006] raid5: Move write operations to a work queue
From: Dan Williams @ 2006-07-27 18:53 UTC
  To: Yuri Tikhonov; +Cc: linux-raid

On 7/27/06, Yuri Tikhonov <yur_t@mail.ru> wrote:
>
>  Hello, Dan.
>
>  I've looked through your patches, and have some suggestions about write operations processing.

Thanks for reviewing the code.

>
>  In the current implementation of the RAID-5 driver, the RMW operation does not begin until the old blocks in the
> stripe cache, which are about to be rewritten, become UPTODATE.
>  But if you have dedicated h/w DMA engines, then, while the IOC (input/output controller) transfers the old
> strip data from disk into the stripe cache, it may make sense to start a DMA engine that transfers the new
> strip data from the bio that requested the write. Then, when the IOC operation completes, we already have all the data
> needed to compute the new parity value.
>
> 1) For the current implementation:
>
>  Trmw = Tioc1 + Txor1 + Tdma + Txor2 + Tioc2,
>  where Tioc1 is the time it takes to update the stripe cache with the old data, Txor1 is the time it takes to subtract
> the old data from the old parity value, Tdma is the time it takes to update the strip with the new data, Txor2 is the
> time it takes to compute the new parity, and Tioc2 is the time it takes to transfer the updated data to the disks.
>  So, Trmw = 2*Tioc + 2*Txor + Tdma
>
> 2) If copying the old and the new data into the stripe cache is performed simultaneously, then the time to complete the
> whole RMW operation becomes:
>
>  T'rmw = max(Tioc1, Tdma) + 2*Txor + Tioc2,
>  where Tioc1 is the time it takes to update the stripe cache with the old data, Tdma is the time it takes to update the
> strip with the new data, 2*Txor is the time it takes to compute the new parity, and Tioc2 is the time it takes to
> transfer the updated data to the disks.
>  So, T'rmw = 2*Tioc + 2*Txor.
> (In any case, I think that Tioc > Tdma, because Tioc corresponds to time spent reading from disk, while
> Tdma corresponds to operations on SDRAM, which are faster.)
>
>  Also, 2*Txor for (2) is less than 2*Txor for (1), because in approach (2) we have to prepare the XOR engine
> descriptors only once, while in approach (1) we have to prepare them twice.
>
>  Does it make sense to revise your RAID-5 driver implementation to allow the IOC and the DMA engine to have separate destination buffers? That is, some kind of stripe shadow: the IOC would copy into the regular buffer in the stripe cache, and the DMA engine into the shadow one.
>
The issue I see with this is that Tioc1 is orders of magnitude greater
than Tdma.  So while I agree there may be room to get some pre-work
done while the reads are in flight, I do not expect the performance
increase to be significant, and it is definitely not worth the design
complexity of adding a "shadow buffer".

>  Regards, Yuri.
>
Regards,

Dan

