[PATCH RFC 2/4] md: refactor raid5 cache policy code using 'struct stripe_cache_policy'

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Dan Williams <dan.j.williams@intel.com>
To: linux-raid@vger.kernel.org
Subject: [PATCH RFC 2/4] md: refactor raid5 cache policy code using 'struct stripe_cache_policy'
Date: Tue, 10 Apr 2007 23:00:31 -0700	[thread overview]
Message-ID: <20070411060031.15745.50795.stgit@dwillia2-linux.ch.intel.com> (raw)
In-Reply-To: <20070411055729.15745.51513.stgit@dwillia2-linux.ch.intel.com>

struct stripe_cache_policy is introduced as an interface to enable multiple
caching policies.  It adds several methods to be called when cache events
occur.  See the definition of stripe_cache_policy in
include/linux/raid/raid5.h.  This patch does not add any new caching
policies; it just moves the current code to a new location and calls it
through the struct stripe_cache_policy methods.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---

 drivers/md/raid5.c         |  644 +++++++++++++++++++++++++-------------------
 include/linux/raid/raid5.h |   82 +++++-
 2 files changed, 446 insertions(+), 280 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 684552a..3b32a19 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -112,11 +112,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 	if (atomic_dec_and_test(&sh->count)) {
 		BUG_ON(!list_empty(&sh->lru));
 		BUG_ON(atomic_read(&conf->active_stripes)==0);
+		if (conf->cache_policy->release_stripe(conf, sh,
+						test_bit(STRIPE_HANDLE, &sh->state)))
+			return; /* stripe was moved to a cache policy specific queue */
+
 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state)) {
-				list_add_tail(&sh->lru, &conf->delayed_list);
-				blk_plug_device(conf->mddev->queue);
-			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+			if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 				   sh->bm_seq - conf->seq_write > 0) {
 				list_add_tail(&sh->lru, &conf->bitmap_list);
 				blk_plug_device(conf->mddev->queue);
@@ -125,23 +126,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 				list_add_tail(&sh->lru, &conf->handle_list);
 			}
 			md_wakeup_thread(conf->mddev->thread);
-		} else {
-			BUG_ON(sh->ops.pending);
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-			atomic_dec(&conf->active_stripes);
-			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-				list_add_tail(&sh->lru, &conf->inactive_list);
-				wake_up(&conf->wait_for_stripe);
-				if (conf->retry_read_aligned)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-		}
+		} else
+			BUG();
 	}
 }
+
 static void release_stripe(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
@@ -724,39 +713,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	return tx;
 }
 
-static void ops_complete_postxor(void *stripe_head_ref)
-{
-	struct stripe_head *sh = stripe_head_ref;
-
-	PRINTK("%s: stripe %llu\n", __FUNCTION__,
-		(unsigned long long)sh->sector);
-
-	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-	set_bit(STRIPE_HANDLE, &sh->state);
-	release_stripe(sh);
-}
-
-static void ops_complete_write(void *stripe_head_ref)
-{
-	struct stripe_head *sh = stripe_head_ref;
-	int disks = sh->disks, i, pd_idx = sh->pd_idx;
-
-	PRINTK("%s: stripe %llu\n", __FUNCTION__,
-		(unsigned long long)sh->sector);
-
-	for (i=disks ; i-- ;) {
-		struct r5dev *dev = &sh->dev[i];
-		if (dev->written || i == pd_idx)
-			set_bit(R5_UPTODATE, &dev->flags);
-	}
-
-	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
-	set_bit(STRIPE_HANDLE, &sh->state);
-	release_stripe(sh);
-}
-
 static void
 ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
@@ -764,6 +720,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	int disks = sh->disks;
 	struct page *xor_srcs[disks];
 
+	raid5_conf_t *conf = sh->raid_conf;
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
 	int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -792,9 +749,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 		}
 	}
 
-	/* check whether this postxor is part of a write */
-	callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
-		ops_complete_write : ops_complete_postxor;
+	/* take cache policy specific action upon completion of the postxor */
+	callback = conf->cache_policy->complete_postxor_action;
 
 	/* 1/ if we prexor'd then the dest is reused as a source
 	 * 2/ if we did not prexor then we are redoing the parity
@@ -1683,7 +1639,8 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 	}
 }
 
-static int handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
+static int 
+raid5_wt_cache_handle_parity_updates(struct stripe_head *sh, int rcw, int expand)
 {
 	int i, pd_idx = sh->pd_idx, disks = sh->disks;
 	int locked=0;
@@ -1847,6 +1804,327 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 	return pd_idx;
 }
 
+static int
+raid5_wt_cache_release_stripe(raid5_conf_t *conf, struct stripe_head *sh,
+	int handle)
+{
+	struct stripe_cache_policy *cp = conf->cache_policy;
+
+	PRINTK("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	if (handle && test_bit(STRIPE_DELAYED, &sh->state)) {
+		list_add_tail(&sh->lru, &cp->delayed_list);
+		blk_plug_device(conf->mddev->queue);
+		return 1;
+	} else if (!handle) {
+		BUG_ON(sh->ops.pending);
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+			atomic_dec(&cp->preread_active_stripes);
+			if (atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+		atomic_dec(&conf->active_stripes);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			list_add_tail(&sh->lru, &conf->inactive_list);
+			wake_up(&conf->wait_for_stripe);
+			if (conf->retry_read_aligned)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+		return 1;
+	}
+
+	return 0;
+}
+
+static void raid5_wt_cache_complete_postxor_action(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+
+	PRINTK("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+
+	/* leaving prexor set until postxor is done allows us to distinguish
+	 * a rmw from a rcw during biodrain
+	 */
+	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete)) {
+		int i;
+		for (i=sh->disks; i--;)
+			clear_bit(R5_Wantprexor, &sh->dev[i].flags);
+
+		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
+		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
+		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+	}
+
+	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
+		int disks = sh->disks, i, pd_idx = sh->pd_idx;
+
+		for (i=disks ; i-- ;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (dev->written || i == pd_idx)
+				set_bit(R5_UPTODATE, &dev->flags);
+		}
+
+		set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
+	}
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static struct bio *
+raid5_wt_cache_handle_completed_writes(struct stripe_head *sh,
+	struct stripe_head_state *s)
+{
+	struct bio *return_bi = NULL;
+
+	/* might be able to return some write requests if the parity block
+	 * is safe, or on a failed drive
+	 */
+	struct r5dev *dev = &sh->dev[sh->pd_idx];
+	if ( s->written &&
+	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
+		test_bit(R5_UPTODATE, &dev->flags))
+	       || (s->failed == 1 && s->failed_num == sh->pd_idx))
+	    ) {
+	    raid5_conf_t *conf = sh->raid_conf;
+	    int i;
+	    /* any written block on an uptodate or failed drive can be returned.
+	     * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 
+	     * never LOCKED, so we don't need to test 'failed' directly.
+	     */
+	    for (i=sh->disks; i--; )
+		if (sh->dev[i].written) {
+		    dev = &sh->dev[i];
+		    if (!test_bit(R5_LOCKED, &dev->flags) &&
+			 test_bit(R5_UPTODATE, &dev->flags) ) {
+			/* We can return any write requests */
+			    struct bio *wbi, *wbi2;
+			    int bitmap_end = 0;
+			    PRINTK("%s: Return write for disc %d\n",
+			    	__FUNCTION__, i);
+			    spin_lock_irq(&conf->device_lock);
+			    wbi = dev->written;
+			    dev->written = NULL;
+			    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+				    wbi2 = r5_next_bio(wbi, dev->sector);
+				    if (--wbi->bi_phys_segments == 0) {
+					    md_write_end(conf->mddev);
+					    wbi->bi_next = return_bi;
+					    return_bi = wbi;
+				    }
+				    wbi = wbi2;
+			    }
+			    if (dev->towrite == NULL)
+				    bitmap_end = 1;
+			    spin_unlock_irq(&conf->device_lock);
+			    if (bitmap_end)
+				    bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+						    STRIPE_SECTORS,
+						    !test_bit(STRIPE_DEGRADED, &sh->state), 0);
+		    }
+		}
+	}
+
+	return return_bi;
+}
+
+static void
+raid5_wt_cache_submit_pending_writes(struct stripe_head *sh,
+	struct stripe_head_state *s)
+{
+	/* if only POSTXOR is set then this is an 'expand' postxor */
+	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
+		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+		raid5_conf_t *conf = sh->raid_conf;
+		struct stripe_cache_policy *cp = conf->cache_policy;
+		int i;
+
+		PRINTK("%s: stripe %llu\n", __FUNCTION__,
+			(unsigned long long)sh->sector);
+
+		/* All the 'written' buffers and the parity block are ready to be
+		 * written back to disk
+		 */
+		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+		for (i=sh->disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_LOCKED, &dev->flags) &&
+				(i == sh->pd_idx || dev->written)) {
+				PRINTK("Writing block %d\n", i);
+				set_bit(R5_Wantwrite, &dev->flags);
+				if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+					sh->ops.count++;
+				if (!test_bit(R5_Insync, &dev->flags)
+				    || (i==sh->pd_idx && s->failed == 0))
+					set_bit(STRIPE_INSYNC, &sh->state);
+			}
+		}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+			atomic_dec(&cp->preread_active_stripes);
+			if (atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+
+		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
+		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
+		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
+
+		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
+		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+	}
+
+}
+
+static void
+raid5_wt_cache_handle_new_writes(struct stripe_head *sh, struct stripe_head_state *s)
+{
+	/* 1/ Check operations clobber the parity block so do not start new writes while
+	 *    a check is in flight
+	 * 2/ Write operations do not stack
+	 */
+	if (s->to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
+		!test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+		int rmw=0, rcw=0, disks = sh->disks, i;
+		struct r5dev *dev;
+		for (i=disks ; i--;) {
+			/* would I have to read this buffer for read_modify_write */
+			dev = &sh->dev[i];
+			if ((dev->towrite || i == sh->pd_idx) &&
+			    (!test_bit(R5_LOCKED, &dev->flags) 
+				    ) &&
+			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
+				if (test_bit(R5_Insync, &dev->flags)
+/*				    && !(!mddev->insync && i == sh->pd_idx) */
+					)
+					rmw++;
+				else rmw += 2*disks;  /* cannot read it */
+			}
+			/* Would I have to read this buffer for reconstruct_write */
+			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+			    (!test_bit(R5_LOCKED, &dev->flags) 
+				    ) &&
+			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
+				if (test_bit(R5_Insync, &dev->flags)) rcw++;
+				else rcw += 2*disks;
+			}
+		}
+		PRINTK("for sector %llu, rmw=%d rcw=%d\n", 
+			(unsigned long long)sh->sector, rmw, rcw);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		if (rmw < rcw && rmw > 0)
+			/* prefer read-modify-write, but need to get some data */
+			for (i=disks; i--;) {
+				dev = &sh->dev[i];
+				if ((dev->towrite || i == sh->pd_idx) &&
+				    !test_bit(R5_LOCKED, &dev->flags) &&
+				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
+				    test_bit(R5_Insync, &dev->flags)) {
+					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+					{
+						PRINTK("Read_old block %d for r-m-w\n", i);
+						set_bit(R5_LOCKED, &dev->flags);
+						set_bit(R5_Wantread, &dev->flags);
+						if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+							sh->ops.count++;
+						s->locked++;
+					} else {
+						set_bit(STRIPE_DELAYED, &sh->state);
+						set_bit(STRIPE_HANDLE, &sh->state);
+					}
+				}
+			}
+		if (rcw <= rmw && rcw > 0)
+			/* want reconstruct write, but need to get some data */
+			for (i=disks; i--;) {
+				dev = &sh->dev[i];
+				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+				    !test_bit(R5_LOCKED, &dev->flags) &&
+				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
+				    test_bit(R5_Insync, &dev->flags)) {
+					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+					{
+						PRINTK("Read_old block %d for Reconstruct\n", i);
+						set_bit(R5_LOCKED, &dev->flags);
+						set_bit(R5_Wantread, &dev->flags);
+						if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+							sh->ops.count++;
+						s->locked++;
+					} else {
+						set_bit(STRIPE_DELAYED, &sh->state);
+						set_bit(STRIPE_HANDLE, &sh->state);
+					}
+				}
+			}
+		/* now if nothing is locked, and if we have enough data, we can start a write request */
+		/* since handle_stripe can be called at any time we need to handle the case
+		 * where a compute block operation has been submitted and then a subsequent
+		 * call wants to start a write request.  raid5_run_ops only handles the case where
+		 * compute block and postxor are requested simultaneously.  If this
+		 * is not the case then new writes need to be held off until the compute
+		 * completes.
+		 */
+		if ((s->req_compute || !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
+			(s->locked == 0 && (rcw == 0 ||rmw == 0) &&
+			!test_bit(STRIPE_BIT_DELAY, &sh->state)))
+			s->locked += raid5_wt_cache_handle_parity_updates(sh, rcw == 0, 0);
+			
+	}
+}
+
+static void raid5_wt_cache_activate_delayed(raid5_conf_t *conf)
+{
+	struct stripe_cache_policy *cp = conf->cache_policy;
+	if (atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD) {
+		while (!list_empty(&cp->delayed_list)) {
+			struct list_head *l = cp->delayed_list.next;
+			struct stripe_head *sh;
+			sh = list_entry(l, struct stripe_head, lru);
+			list_del_init(l);
+			clear_bit(STRIPE_DELAYED, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+				atomic_inc(&cp->preread_active_stripes);
+			list_add_tail(&sh->lru, &conf->handle_list);
+		}
+	}
+}
+
+static void raid5_wt_cache_raid5d(mddev_t *mddev, raid5_conf_t *conf)
+{
+	struct stripe_cache_policy *cp = conf->cache_policy;
+
+	if (list_empty(&conf->handle_list) &&
+	    atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD &&
+	    !blk_queue_plugged(mddev->queue) &&
+	    !list_empty(&cp->delayed_list))
+		raid5_wt_cache_activate_delayed(conf);
+}
+
+static void raid5_wt_cache_init(raid5_conf_t *conf)
+{
+	atomic_set(&conf->cache_policy->preread_active_stripes, 0);
+	INIT_LIST_HEAD(&conf->cache_policy->delayed_list);
+}
+
+static void raid5_wt_cache_unplug_device(raid5_conf_t *conf)
+{
+	raid5_wt_cache_activate_delayed(conf);
+}
+
+static struct stripe_cache_policy raid5_cache_policy_write_through = {
+	.release_stripe = raid5_wt_cache_release_stripe,
+	.complete_postxor_action = raid5_wt_cache_complete_postxor_action,
+	.submit_pending_writes = raid5_wt_cache_submit_pending_writes,
+	.handle_new_writes = raid5_wt_cache_handle_new_writes,
+	.handle_completed_writes = raid5_wt_cache_handle_completed_writes,
+	.raid5d = raid5_wt_cache_raid5d,
+	.init = raid5_wt_cache_init,
+	.unplug_device = raid5_wt_cache_unplug_device,
+};
 
 /*
  * handle_stripe - do things to a stripe.
@@ -1944,12 +2222,13 @@ static void handle_stripe5(struct stripe_head *sh)
 	}
 	rcu_read_unlock();
 
+	/* do we need to request a biofill operation? */
 	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
 		sh->ops.count++;
 
-	PRINTK("locked=%d uptodate=%d to_read=%d"
+	PRINTK("locked=%d dirty=%d uptodate=%d to_read=%d"
 		" to_write=%d to_fill=%d failed=%d failed_num=%d\n",
-		s.locked, s.uptodate, s.to_read, s.to_write, s.to_fill,
+		s.locked, s.dirty, s.uptodate, s.to_read, s.to_write, s.to_fill,
 		s.failed, s.failed_num);
 	/* check if the array has lost two devices and, if so, some requests might
 	 * need to be failed
@@ -2035,50 +2314,8 @@ static void handle_stripe5(struct stripe_head *sh)
 		s.syncing = 0;
 	}
 
-	/* might be able to return some write requests if the parity block
-	 * is safe, or on a failed drive
-	 */
-	dev = &sh->dev[sh->pd_idx];
-	if ( s.written &&
-	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
-		test_bit(R5_UPTODATE, &dev->flags))
-	       || (s.failed == 1 && s.failed_num == sh->pd_idx))
-	    ) {
-	    /* any written block on an uptodate or failed drive can be returned.
-	     * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 
-	     * never LOCKED, so we don't need to test 'failed' directly.
-	     */
-	    for (i=disks; i--; )
-		if (sh->dev[i].written) {
-		    dev = &sh->dev[i];
-		    if (!test_bit(R5_LOCKED, &dev->flags) &&
-			 test_bit(R5_UPTODATE, &dev->flags) ) {
-			/* We can return any write requests */
-			    struct bio *wbi, *wbi2;
-			    int bitmap_end = 0;
-			    PRINTK("Return write for disc %d\n", i);
-			    spin_lock_irq(&conf->device_lock);
-			    wbi = dev->written;
-			    dev->written = NULL;
-			    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-				    wbi2 = r5_next_bio(wbi, dev->sector);
-				    if (--wbi->bi_phys_segments == 0) {
-					    md_write_end(conf->mddev);
-					    wbi->bi_next = return_bi;
-					    return_bi = wbi;
-				    }
-				    wbi = wbi2;
-			    }
-			    if (dev->towrite == NULL)
-				    bitmap_end = 1;
-			    spin_unlock_irq(&conf->device_lock);
-			    if (bitmap_end)
-				    bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						    STRIPE_SECTORS,
-						    !test_bit(STRIPE_DEGRADED, &sh->state), 0);
-		    }
-		}
-	}
+	/* handle the completion of writes to the backing disks */
+	return_bi = conf->cache_policy->handle_completed_writes(sh, &s);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -2135,7 +2372,8 @@ static void handle_stripe5(struct stripe_head *sh)
 					 * 3/ We hold off parity block re-reads until check
 					 * operations have quiesced.
 					 */
-					if ((s.uptodate == disks-1) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+					if (((s.uptodate == disks-1) && !s.dirty) &&
+						!test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
 						set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
 						set_bit(R5_Wantcompute, &dev->flags);
 						sh->ops.target = i;
@@ -2148,7 +2386,8 @@ static void handle_stripe5(struct stripe_head *sh)
 						 */
 						s.uptodate++;
 						break; /* uptodate + compute == disks */
-					} else if ((s.uptodate < disks-1) && test_bit(R5_Insync, &dev->flags)) {
+					} else if (((s.uptodate < disks-1) || s.dirty) &&
+							test_bit(R5_Insync, &dev->flags)) {
 						/* Note: we hold off compute operations while checks are in flight,
 						 * but we still prefer 'compute' over 'read' hence we only read if
 						 * (uptodate < disks-1)
@@ -2167,158 +2406,20 @@ static void handle_stripe5(struct stripe_head *sh)
 		set_bit(STRIPE_HANDLE, &sh->state);
 	}
 
-	/* Now we check to see if any write operations have recently
-	 * completed
-	 */
-
-	/* leave prexor set until postxor is done, allows us to distinguish
-	 * a rmw from a rcw during biodrain
-	 */
-	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
-		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-
-		for (i=disks; i--;)
-			clear_bit(R5_Wantprexor, &sh->dev[i].flags);
-	}
-
-	/* if only POSTXOR is set then this is an 'expand' postxor */
-	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
-		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
+	/* Now we check to see if any blocks are ready to be written to disk */
+	conf->cache_policy->submit_pending_writes(sh, &s);
 
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-
-		/* All the 'written' buffers and the parity block are ready to be
-		 * written back to disk
-		 */
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
-		for (i=disks; i--;) {
-			dev = &sh->dev[i];
-			if (test_bit(R5_LOCKED, &dev->flags) &&
-				(i == sh->pd_idx || dev->written)) {
-				PRINTK("Writing block %d\n", i);
-				set_bit(R5_Wantwrite, &dev->flags);
-				if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-					sh->ops.count++;
-				if (!test_bit(R5_Insync, &dev->flags)
-				    || (i==sh->pd_idx && s.failed == 0))
-					set_bit(STRIPE_INSYNC, &sh->state);
-			}
-		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-			atomic_dec(&conf->preread_active_stripes);
-			if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-				md_wakeup_thread(conf->mddev->thread);
-		}
-	}
-
-	/* 1/ Now to consider new write requests and what else, if anything should be read
-	 * 2/ Check operations clobber the parity block so do not start new writes while
-	 *    a check is in flight
-	 * 3/ Write operations do not stack
-	 */
-	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-		!test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
-		int rmw=0, rcw=0;
-		for (i=disks ; i--;) {
-			/* would I have to read this buffer for read_modify_write */
-			dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
-			    (!test_bit(R5_LOCKED, &dev->flags) 
-				    ) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
-				if (test_bit(R5_Insync, &dev->flags)
-/*				    && !(!mddev->insync && i == sh->pd_idx) */
-					)
-					rmw++;
-				else rmw += 2*disks;  /* cannot read it */
-			}
-			/* Would I have to read this buffer for reconstruct_write */
-			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-			    (!test_bit(R5_LOCKED, &dev->flags) 
-				    ) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
-				if (test_bit(R5_Insync, &dev->flags)) rcw++;
-				else rcw += 2*disks;
-			}
-		}
-		PRINTK("for sector %llu, rmw=%d rcw=%d\n", 
-			(unsigned long long)sh->sector, rmw, rcw);
-		set_bit(STRIPE_HANDLE, &sh->state);
-		if (rmw < rcw && rmw > 0)
-			/* prefer read-modify-write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if ((dev->towrite || i == sh->pd_idx) &&
-				    !test_bit(R5_LOCKED, &dev->flags) &&
-				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old block %d for r-m-w\n", i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-							sh->ops.count++;
-						s.locked++;
-					} else {
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		if (rcw <= rmw && rcw > 0)
-			/* want reconstruct write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-				    !test_bit(R5_LOCKED, &dev->flags) &&
-				    !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old block %d for Reconstruct\n", i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-							sh->ops.count++;
-						s.locked++;
-					} else {
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		/* since handle_stripe can be called at any time we need to handle the case
-		 * where a compute block operation has been submitted and then a subsequent
-		 * call wants to start a write request.  raid5_run_ops only handles the case where
-		 * compute block and postxor are requested simultaneously.  If this
-		 * is not the case then new writes need to be held off until the compute
-		 * completes.
-		 */
-		if ((s.req_compute || !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
-			(s.locked == 0 && (rcw == 0 ||rmw == 0) &&
-			!test_bit(STRIPE_BIT_DELAY, &sh->state)))
-			s.locked += handle_write_operations5(sh, rcw == 0, 0);
-	}
+	/* Now to consider new write requests and what else, if anything should be read */
+	conf->cache_policy->handle_new_writes(sh, &s);
 
 	/* 1/ Maybe we need to check and possibly fix the parity for this stripe.
 	 *    Any reads will already have been scheduled, so we just see if enough data
 	 *    is available.
 	 * 2/ Hold off parity checks while parity dependent operations are in flight
-	 *    (conflicting writes are protected by the 'locked' variable)
+	 *    (conflicting writes are protected by the 'locked' and 'dirty' variables)
 	 */
-	if ((s.syncing && s.locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+	if ((s.syncing && s.locked == 0 && s.dirty == 0 &&
+		!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
 		!test_bit(STRIPE_INSYNC, &sh->state)) ||
 	    	test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
 	    	test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
@@ -2451,7 +2552,7 @@ static void handle_stripe5(struct stripe_head *sh)
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
-		s.locked += handle_write_operations5(sh, 0, 1);
+		s.locked += raid5_wt_cache_handle_parity_updates(sh, 0, 1);
 	} else if (s.expanded && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
@@ -2885,8 +2986,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 			set_bit(STRIPE_INSYNC, &sh->state);
 
 			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+				atomic_dec(&conf->cache_policy->preread_active_stripes);
+				if (atomic_read(&conf->cache_policy->preread_active_stripes)
+					< IO_THRESHOLD)
 					md_wakeup_thread(conf->mddev->thread);
 			}
 		}
@@ -3164,22 +3266,6 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 
 
 
-static void raid5_activate_delayed(raid5_conf_t *conf)
-{
-	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
-		while (!list_empty(&conf->delayed_list)) {
-			struct list_head *l = conf->delayed_list.next;
-			struct stripe_head *sh;
-			sh = list_entry(l, struct stripe_head, lru);
-			list_del_init(l);
-			clear_bit(STRIPE_DELAYED, &sh->state);
-			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				atomic_inc(&conf->preread_active_stripes);
-			list_add_tail(&sh->lru, &conf->handle_list);
-		}
-	}
-}
-
 static void activate_bit_delay(raid5_conf_t *conf)
 {
 	/* device_lock is held */
@@ -3222,14 +3308,17 @@ static void raid5_unplug_device(request_queue_t *q)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
+	struct stripe_cache_policy *cp = conf->cache_policy;
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 
 	if (blk_remove_plug(q)) {
 		conf->seq_flush++;
-		raid5_activate_delayed(conf);
+		if (cp->unplug_device)
+			cp->unplug_device(conf);
 	}
+
 	md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -3944,11 +4033,8 @@ static void raid5d (mddev_t *mddev)
 			activate_bit_delay(conf);
 		}
 
-		if (list_empty(&conf->handle_list) &&
-		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
-		    !blk_queue_plugged(mddev->queue) &&
-		    !list_empty(&conf->delayed_list))
-			raid5_activate_delayed(conf);
+		if (conf->cache_policy->raid5d)
+			conf->cache_policy->raid5d(mddev, conf);
 
 		while ((bio = remove_bio_from_retry(conf))) {
 			int ok;
@@ -4150,16 +4236,22 @@ static int run(mddev_t *mddev)
 		if (!conf->spare_page)
 			goto abort;
 	}
+
+	#ifdef CONFIG_RAID5_CACHE_POLICY_WRITE_BACK
+	conf->cache_policy = &raid5_cache_policy_write_back;
+	#else
+	conf->cache_policy = &raid5_cache_policy_write_through;
+	#endif
+	
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
-	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
 	INIT_LIST_HEAD(&conf->inactive_list);
 	atomic_set(&conf->active_stripes, 0);
-	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
+	conf->cache_policy->init(conf);
 
 	PRINTK("raid5: run(%s) called.\n", mdname(mddev));
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 54e2aa2..f00da23 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -224,8 +224,8 @@ struct stripe_head_state {
 #define STRIPE_HANDLE		2
 #define	STRIPE_SYNCING		3
 #define	STRIPE_INSYNC		4
-#define	STRIPE_PREREAD_ACTIVE	5
-#define	STRIPE_DELAYED		6
+#define	STRIPE_PREREAD_ACTIVE	5 /* wt cache state */
+#define	STRIPE_DELAYED		6 /* wt cache state */
 #define	STRIPE_DEGRADED		7
 #define	STRIPE_BIT_DELAY	8
 #define	STRIPE_EXPANDING	9
@@ -276,6 +276,81 @@ struct disk_info {
 	mdk_rdev_t	*rdev;
 };
 
+/**
+ * struct stripe_cache_policy - handle writethrough/writeback caching
+ * @post_run_biodrain: NOTE(review): no member with this name is declared below
+ *  wb: allows writes to be signalled complete once
+ *      they are in the stripe cache
+ *  wt: NULL
+ * @release_stripe:
+ *  wb: transition inactive stripes with pending data to a dirty list
+ *  rather than the inactive list
+ *  wt: handle delayed stripes and issue pre-read actions.
+ * @submit_pending_writes:
+ *  wb: only writeback when STRIPE_EVICT is set
+ *  wt: always writethrough after postxor completes
+ */
+
+/* wt = write through, wb = write back.  NOTE(review): run() stores the address
+ * of a single policy object in conf->cache_policy, so the list and counter
+ * members below are presumably shared by every array using it -- confirm */
+struct stripe_cache_policy {
+	/* release_stripe - returns '1' if stripe was moved to cache-private list
+	 *  else '0'
+	 * [ called from __release_stripe under spin_lock_irq(&conf->device_lock) ]
+	 * wt: catch 'delayed' stripes and poke the 'preread' state machine
+	 * if necessary.  'handle' is test_bit(STRIPE_HANDLE, &sh->state) at the call
+	 */
+	int (*release_stripe)(struct raid5_private_data *conf,
+		struct stripe_head *sh,	int handle);
+	/* complete_postxor_action
+	 * wt: check if this is the end of a rcw/rmw write request and set
+	 * the state bits accordingly.  set 'handle' and release.
+	 */
+	void (*complete_postxor_action)(void *stripe_head_ref);
+	/* submit_pending_writes
+	 * [ called from handle_stripe under spin_lock(&sh->lock) ]
+	 * wt: check if 'biodrain' and 'postxor' are complete and schedule writes
+	 * to the backing disks
+	 */
+	void (*submit_pending_writes)(struct stripe_head *sh,
+		struct stripe_head_state *s);
+	/* handle_new_writes
+	 * [ called from handle_stripe under spin_lock(&sh->lock) ]
+	 * wt: schedule reads to prepare for a rcw or rmw operation.  once preread
+	 * data is available lock the blocks and schedule '[prexor]+biodrain+postxor'
+	 */
+	void (*handle_new_writes)(struct stripe_head *sh,
+		struct stripe_head_state *s);
+	/* handle_completed_writes
+	 * [ called from handle_stripe under spin_lock(&sh->lock) ]
+	 * wt: call bi_end_io on all written blocks and perform general md/bitmap
+	 * post write housekeeping.
+	 */
+	struct bio *(*handle_completed_writes)(struct stripe_head *sh,
+		struct stripe_head_state *s);
+	/* raid5d (optional; raid5d() skips it when NULL)
+	 * wt: check for stripes that can be taken off the delayed list
+	 */
+	void (*raid5d)(mddev_t *mddev, struct raid5_private_data *conf);
+	/* init (called unconditionally from run())
+	 * wt: initialize 'delayed_list' and 'preread_active_stripes'
+	 * wb: initialize 'dirty_list' and 'dirty_stripes'
+	 */
+	void (*init)(struct raid5_private_data *conf);
+	/* unplug_device (optional; the caller skips it when NULL)
+	 * [ called from raid5_unplug_device under spin_lock_irqsave(&conf->device_lock) ]
+	 * wt: activate stripes on the delayed list
+	 */
+	void (*unplug_device)(struct raid5_private_data *conf);
+	union {
+		struct list_head delayed_list; /* wt: stripes that have plugged requests */
+	};
+	union {
+		atomic_t preread_active_stripes; /* wt: stripes with scheduled io */
+	};
+};
+
 struct raid5_private_data {
 	struct hlist_head	*stripe_hashtbl;
 	mddev_t			*mddev;
@@ -284,6 +359,7 @@ struct raid5_private_data {
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
+	struct stripe_cache_policy *cache_policy;
 
 	/* used during an expand */
 	sector_t		expand_progress;	/* MaxSector when no expand happening */
@@ -293,11 +369,9 @@ struct raid5_private_data {
 	int			previous_raid_disks;
 
 	struct list_head	handle_list; /* stripes needing handling */
-	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
 	struct bio		*retry_read_aligned; /* currently retrying aligned bios   */
 	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
-	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	atomic_t		active_aligned_reads;
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */

next prev parent reply	other threads:[~2007-04-11  6:00 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-04-11  6:00 [PATCH RFC 0/4] raid5: write-back caching policy and write performance Dan Williams
2007-04-11  6:00 ` [PATCH RFC 1/4] md: introduce struct stripe_head_state Dan Williams
2007-04-11  6:00 ` Dan Williams [this message]
2007-04-11  6:00 ` [PATCH RFC 3/4] md: writeback caching policy for raid5 [experimental] Dan Williams
2007-04-11 22:40   ` Mark Hahn
2007-04-12  0:08     ` Williams, Dan J
2007-04-12  6:21       ` Neil Brown
2007-04-12  5:37   ` Al Boldi
2007-04-11  6:00 ` [PATCH RFC 4/4] md: delayed stripe activation Dan Williams

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:684552a dfblob:3b32a19 dfblob:54e2aa2 dfblob:f00da23 )
 OR (
bs:"[PATCH RFC 2/4] md: refactor raid5 cache policy code using 'struct stripe_cache_policy'" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070411060031.15745.50795.stgit@dwillia2-linux.ch.intel.com \
    --to=dan.j.williams@intel.com \
    --cc=linux-raid@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.