* [RFC PATCH V1] raid1: rewrite the iobarrier
From: majianpeng @ 2013-01-24  6:02 UTC
  To: NeilBrown; +Cc: linux-raid

There is an iobarrier in raid1 for two reasons: resync/recovery, and reconfiguration of the array.
At present it suspends all normal IO while resync/recovery is running.
But normal IO that falls outside the resync/recovery window does not need the iobarrier,
so I rewrote the iobarrier.
Because there are two reasons for the barrier, I use two different methods.
First, for resync/recovery there is a resync window. Its end position is 'next_resync'. Because the resync depth is RESYNC_DEPTH (32),
its start is 'next_resync - RESYNC_SECTORS * RESYNC_DEPTH'.
Normal IO is divided into three categories by location:
a: before the start of the resync window
b: inside the resync window
c: after the end of the resync window
For a, no barrier is needed.
For b, a barrier is needed, using the original method.
For c, no barrier is needed, but the minimum position must be recorded. If the next resync step would pass that position, the resync is suspended; otherwise it proceeds.
I used an rbtree to keep those IOs ordered.

For reconfiguration of the array, I introduce the concept of a "force_barrier". While a force_barrier is raised, all normal IO must be suspended.

NOTE:
This problem also exists in raid10, but I have only implemented it for raid1. It is posted mainly to make sure it is
going in the right direction and in the hope of getting some helpful comments.
If the method is accepted, I will send a patch for raid10.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
---
 drivers/md/raid1.c |  218 +++++++++++++++++++++++++++++++++++++++++-----------
 drivers/md/raid1.h |   11 +++
 2 files changed, 184 insertions(+), 45 deletions(-)
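
In outline, the classification that the new wait_barrier() performs looks like
this (a condensed sketch for review only, not part of the patch; the names
match those introduced in the diff, and the 'position' values match the
patch's 0/1/2 convention):

	/* Sketch only: classify a normal IO bio against the resync window
	 * [next_resync - RESYNC_WINDOWS, next_resync).  Mirrors the logic
	 * added to wait_barrier() below.
	 * 0 = inside the window (barrier), 1 = before, 2 = after. */
	static int classify_bio(struct r1conf *conf, struct bio *bio)
	{
		sector_t start = conf->next_resync > RESYNC_WINDOWS ?
				 conf->next_resync - RESYNC_WINDOWS : 0;

		if (bio->bi_sector + bio_sectors(bio) < start)
			return 1;	/* a: before the window, no barrier */
		if (bio->bi_sector >= conf->next_resync)
			return 2;	/* c: after the window, record position */
		return 0;		/* b: barrier as before */
	}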

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc..3115fb8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,6 +37,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/rbtree.h>
 #include "md.h"
 #include "raid1.h"
 #include "bitmap.h"
@@ -66,8 +67,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
-static void lower_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, int position);
+static void lower_barrier(struct r1conf *conf, int force);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -207,7 +208,7 @@ static void put_buf(struct r1bio *r1_bio)
 
 	mempool_free(r1_bio, conf->r1buf_pool);
 
-	lower_barrier(conf);
+	lower_barrier(conf, 0);
 }
 
 static void reschedule_retry(struct r1bio *r1_bio)
@@ -235,6 +236,12 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	int position = 0;
+
+	if (test_and_clear_bit(R1BIO_BEFORE_RESYNC, &r1_bio->state))
+		position = 1;
+	else if (test_and_clear_bit(R1BIO_AFTER_RESYNC, &r1_bio->state))
+		position = 2;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -249,11 +256,18 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	if (done) {
 		bio_endio(bio, 0);
+
+		if (position == 2) {
+			unsigned long flags;
+			spin_lock_irqsave(&conf->resync_lock, flags);
+			rb_erase(&r1_bio->node, &conf->rb_root);
+			spin_unlock_irqrestore(&conf->resync_lock, flags);
+		}
 		/*
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf);
+		allow_barrier(conf, position);
 	}
 }
 
@@ -816,7 +830,7 @@ static void flush_pending_writes(struct r1conf *conf)
  */
 #define RESYNC_DEPTH 32
 
-static void raise_barrier(struct r1conf *conf)
+static void raise_barrier(struct r1conf *conf, int force)
 {
 	spin_lock_irq(&conf->resync_lock);
 
@@ -826,56 +840,125 @@ static void raise_barrier(struct r1conf *conf)
 
 	/* block any new IO from starting */
 	conf->barrier++;
+	if (force)
+		conf->force_barrier++;
 
 	/* Now wait for all pending IO to complete */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+	if (force)
+		wait_event_lock_irq(conf->wait_barrier,
+			    !(conf->nr_pending + conf->nr_before + conf->nr_after) &&
+			    (conf->barrier + conf->force_barrier) < RESYNC_DEPTH,
+			    conf->resync_lock);
+	else
+		wait_event_lock_irq(conf->wait_barrier,
+			    !conf->nr_pending &&
+			    (conf->barrier + conf->force_barrier) < RESYNC_DEPTH
+			    && !(conf->nr_after &&
+				(conf->next_resync + RESYNC_SECTORS < conf->near_position)),
 			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(struct r1conf *conf)
+static void lower_barrier(struct r1conf *conf, int force)
 {
 	unsigned long flags;
 	BUG_ON(conf->barrier <= 0);
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->barrier--;
+	if (force)
+		conf->force_barrier--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+/*max resync/recovery size*/
+#define RESYNC_WINDOWS (RESYNC_DEPTH * RESYNC_SECTORS)
+static int  wait_barrier(struct r1conf *conf, struct bio *bio)
 {
+	int position = 0;
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+
+	if (conf->force_barrier || (conf->barrier && bio == NULL)) {
 		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * pre-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to get the nr_pending
-		 * count down.
-		 */
 		wait_event_lock_irq(conf->wait_barrier,
+			!(conf->force_barrier + conf->barrier) ||
+			(conf->barrier &&
+			 (conf->nr_pending + conf->nr_before + conf->nr_after) &&
+			 current->bio_list &&
+			 !bio_list_empty(current->bio_list)),
+			conf->resync_lock);
+		conf->nr_waiting--;
+		conf->nr_pending++;
+	} else if (test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery)
+		&& unlikely(bio != NULL)) {
+		/*before the resync window*/
+		if ((bio->bi_sector + bio_sectors(bio))  <
+			(conf->next_resync ?
+			 (conf->next_resync - RESYNC_WINDOWS) : 0)) {
+			conf->nr_before++;
+			position = 1;
+		} else if (bio->bi_sector >= conf->next_resync) {
+			conf->nr_after++;
+			position = 2;
+			if (bio->bi_sector < conf->near_position)
+				conf->near_position = bio->bi_sector;
+		} else {
+			if (conf->barrier) {
+				conf->nr_waiting++;
+			/* Wait for the barrier to drop.
+			 * However if there are already pending
+			 * requests (preventing the barrier from
+			 * rising completely), and the
+			 * pre-process bio queue isn't empty,
+			 * then don't wait, as we need to empty
+			 * that queue to get the nr_pending
+			 * count down.
+			 */
+			wait_event_lock_irq(conf->wait_barrier,
 				    !conf->barrier ||
 				    (conf->nr_pending &&
 				     current->bio_list &&
 				     !bio_list_empty(current->bio_list)),
 				    conf->resync_lock);
-		conf->nr_waiting--;
-	}
-	conf->nr_pending++;
+			conf->nr_waiting--;
+			}
+			conf->nr_pending++;
+		}
+	} else
+		conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return position;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, int position)
 {
 	unsigned long flags;
+	struct rb_node *node;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->nr_pending--;
+	switch (position) {
+	case 0:
+		conf->nr_pending--;
+		break;
+	case 1:
+		conf->nr_before--;
+		break;
+	case 2:
+		node = rb_first(&conf->rb_root);
+		conf->nr_after--;
+		if (node == NULL) {
+			BUG_ON(conf->nr_after);
+			conf->near_position = 0;
+		} else {
+			struct r1bio *r1_bio =
+				container_of(node, struct r1bio, node);
+			conf->near_position = r1_bio->sector;
+		}
+		break;
+	default:
+		BUG();
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
@@ -895,19 +978,19 @@ static void freeze_array(struct r1conf *conf)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
+	conf->force_barrier++;
 	conf->nr_waiting++;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
-				conf->nr_pending == conf->nr_queued+1,
-				conf->resync_lock,
-				flush_pending_writes(conf));
+		(conf->nr_pending + conf->nr_before + conf->nr_after)
+		== conf->nr_queued+1, conf->resync_lock,
+		flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
+	conf->force_barrier--;
 	conf->nr_waiting--;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
@@ -986,6 +1069,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
+static bool raid1_rb_insert(struct rb_root *root, struct r1bio *bio)
+{
+	struct rb_node **new = &(root->rb_node);
+	struct rb_node *parent = NULL;
+	while (*new) {
+		struct r1bio *r1bio = container_of(*new, struct r1bio, node);
+
+		parent = *new;
+
+		if (bio->sector < r1bio->sector)
+			new = &((*new)->rb_left);
+		else if (bio->sector > r1bio->sector)
+			new = &((*new)->rb_right);
+		else
+			return false;
+	}
+	rb_link_node(&bio->node, parent, new);
+	rb_insert_color(&bio->node, root);
+
+	return true;
+
+}
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
@@ -1006,6 +1111,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	int position;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1035,7 +1141,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	position = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1052,6 +1158,18 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
 
+	if (position == 1)
+		set_bit(R1BIO_BEFORE_RESYNC, &r1_bio->state);
+	else if (position == 2) {
+		set_bit(R1BIO_AFTER_RESYNC, &r1_bio->state);
+
+		spin_lock_irq(&conf->resync_lock);
+		if (!raid1_rb_insert(&conf->rb_root, r1_bio))
+			BUG();
+		spin_unlock_irq(&conf->resync_lock);
+	}
+
+
 	/* We might need to issue multiple reads to different
 	 * devices if there are bad blocks around, so we keep
 	 * track of the number of reads in bio->bi_phys_segments.
@@ -1229,9 +1347,16 @@ read_again:
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 		r1_bio->state = 0;
-		allow_barrier(conf);
+		allow_barrier(conf, position);
+
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
+
+		position = wait_barrier(conf, bio);
+		if (position == 1)
+			set_bit(R1BIO_BEFORE_RESYNC, &r1_bio->state);
+		else if (position == 2)
+			set_bit(R1BIO_AFTER_RESYNC, &r1_bio->state);
+
 		goto retry_write;
 	}
 
@@ -1434,11 +1559,12 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_barrier(conf);
-	allow_barrier(conf);
+	wait_barrier(conf, NULL);
+	allow_barrier(conf, 0);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
+	conf->next_resync = 0;
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -1550,8 +1676,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * we wait for all outstanding requests to complete.
 		 */
 		synchronize_sched();
-		raise_barrier(conf);
-		lower_barrier(conf);
+		raise_barrier(conf, 1);
+		lower_barrier(conf, 1);
 		clear_bit(Unmerged, &rdev->flags);
 	}
 	md_integrity_add_rdev(rdev, mddev);
@@ -1601,11 +1727,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			 */
 			struct md_rdev *repl =
 				conf->mirrors[conf->raid_disks + number].rdev;
-			raise_barrier(conf);
+			raise_barrier(conf, 1);
 			clear_bit(Replacement, &repl->flags);
 			p->rdev = repl;
 			conf->mirrors[conf->raid_disks + number].rdev = NULL;
-			lower_barrier(conf);
+			lower_barrier(conf, 1);
 			clear_bit(WantReplacement, &rdev->flags);
 		} else
 			clear_bit(WantReplacement, &rdev->flags);
@@ -2434,7 +2560,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
-	raise_barrier(conf);
+	raise_barrier(conf, 0);
 
 	conf->next_resync = sector_nr;
 
@@ -2734,6 +2860,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->pending_count = 0;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	conf->rb_root = RB_ROOT;
+
 	err = -EIO;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2888,8 +3016,8 @@ static int stop(struct mddev *mddev)
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 
-	raise_barrier(conf);
-	lower_barrier(conf);
+	raise_barrier(conf, 1);
+	lower_barrier(conf, 1);
 
 	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
@@ -2998,7 +3126,7 @@ static int raid1_reshape(struct mddev *mddev)
 		return -ENOMEM;
 	}
 
-	raise_barrier(conf);
+	raise_barrier(conf, 1);
 
 	/* ok, everything is stopped */
 	oldpool = conf->r1bio_pool;
@@ -3029,7 +3157,7 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	lower_barrier(conf);
+	lower_barrier(conf, 1);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -3047,10 +3175,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
 		wake_up(&conf->wait_barrier);
 		break;
 	case 1:
-		raise_barrier(conf);
+		raise_barrier(conf, 1);
 		break;
 	case 0:
-		lower_barrier(conf);
+		lower_barrier(conf, 1);
 		break;
 	}
 }
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715..6611c17 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,6 +1,7 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
+
 struct raid1_info {
 	struct md_rdev	*rdev;
 	sector_t	head_position;
@@ -62,9 +63,15 @@ struct r1conf {
 	wait_queue_head_t	wait_barrier;
 	spinlock_t		resync_lock;
 	int			nr_pending;
+	int			nr_before;
+	int			nr_after;
 	int			nr_waiting;
 	int			nr_queued;
 	int			barrier;
+	int			force_barrier;
+	/* nearest position of normal IO after the resync window */
+	sector_t		near_position;
+	struct rb_root		rb_root;
 
 	/* Set to 1 if a full sync is needed, (fresh device added).
 	 * Cleared when a sync completes.
@@ -104,6 +111,7 @@ struct r1conf {
  */
 
 struct r1bio {
+	struct rb_node		node;
 	atomic_t		remaining; /* 'have we finished' count,
 					    * used from IRQ handlers
 					    */
@@ -158,6 +166,9 @@ struct r1bio {
 #define	R1BIO_MadeGood 7
 #define	R1BIO_WriteError 8
 
+#define R1BIO_BEFORE_RESYNC 9
+#define R1BIO_AFTER_RESYNC  10
+
 extern int md_raid1_congested(struct mddev *mddev, int bits);
 
 #endif
-- 
1.7.9.5


* Re: [RFC PATCH V1] raid1: rewrite the iobarrier
From: NeilBrown @ 2013-02-06  2:16 UTC
  To: majianpeng; +Cc: linux-raid

On Thu, 24 Jan 2013 14:02:52 +0800 majianpeng <majianpeng@gmail.com> wrote:

> There is an iobarrier in raid1 for two reasons: resync/recovery, and reconfiguration of the array.
> At present it suspends all normal IO while resync/recovery is running.
> But normal IO that falls outside the resync/recovery window does not need the iobarrier,
> so I rewrote the iobarrier.
> Because there are two reasons for the barrier, I use two different methods.
> First, for resync/recovery there is a resync window. Its end position is 'next_resync'. Because the resync depth is RESYNC_DEPTH (32),
> its start is 'next_resync - RESYNC_SECTORS * RESYNC_DEPTH'.
> Normal IO is divided into three categories by location:
> a: before the start of the resync window
> b: inside the resync window
> c: after the end of the resync window
> For a, no barrier is needed.
> For b, a barrier is needed, using the original method.
> For c, no barrier is needed, but the minimum position must be recorded. If the next resync step would pass that position, the resync is suspended; otherwise it proceeds.
> I used an rbtree to keep those IOs ordered.
> 
> For reconfiguration of the array, I introduce the concept of a "force_barrier". While a force_barrier is raised, all normal IO must be suspended.
> 
> NOTE:
> This problem also exists in raid10, but I have only implemented it for raid1. It is posted mainly to make sure it is
> going in the right direction and in the hope of getting some helpful comments.
> If the method is accepted, I will send a patch for raid10.
> 
> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>

Hi,
 thanks for this and sorry for the delay in replying.

The patch is reasonably good, but there is room for improvement.
I would break it up into several patches which are easier to review.

- Firstly, don't worry about the barrier for 'read' requests - it really
  isn't relevant for them (your patch didn't do this).

- Secondly, find those places where raise_barrier() is used to reconfigure
  the array and replace those with "freeze_array()".  I think that is safe,
  and it means that we don't need to pass a 'force' argument to
  'raise_barrier()' and 'lower_barrier()'.

- The rest might have to be all one patch, though if it could be done in a
  couple of distinct stages that would be good.
  For the different positions (before, during, after), which you currently
  call 0, 1, and 2, you should use an enum so they all have names.
 
  I don't really like the rbtree.  It adds an extra place where you take the
  spinlock and causes more work to be done inside a spinlock.  And it isn't
  really needed.  Instead, you can have two R1BIO flags for "AFTER_RESYNC":
  one for requests that are more than 2 windows after, one for requests that
  are less than 2 windows after (or maybe 3 windows or maybe 8..).  Each of
  these has a corresponding count (as you already have: nr_after).
  Then when resync gets to the end of a window you wait until the count of
  "less than 2 windows after" reaches zero, then atomically swap the meaning
  of the two bits (toggle a bit somewhere).
  This should give you nearly the same effect with constant (not log-n)
  effort.  A sketch of this idea follows below.
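
A rough sketch of that scheme (untested; 'nr_after[2]', 'after_idx' and the
R1BIO_AFTER_RESYNC_0/_1 bits are hypothetical names, not in your patch):

	/* Classify a request that lies after next_resync into one of two
	 * buckets: "near" (< 2 windows after) or "far" (>= 2 windows). */
	static int classify_after(struct r1conf *conf, struct r1bio *r1_bio)
	{
		int idx = conf->after_idx;	/* current "near" bucket */

		if (r1_bio->sector >= conf->next_resync + 2 * RESYNC_WINDOWS)
			idx ^= 1;		/* goes in the "far" bucket */
		set_bit(R1BIO_AFTER_RESYNC_0 + idx, &r1_bio->state);
		conf->nr_after[idx]++;
		return idx;
	}

	/* When resync finishes a window: wait for the near bucket to drain,
	 * then swap the meaning of the two buckets.  Called with
	 * resync_lock held. */
	static void advance_after_window(struct r1conf *conf)
	{
		wait_event_lock_irq(conf->wait_barrier,
				    conf->nr_after[conf->after_idx] == 0,
				    conf->resync_lock);
		conf->after_idx ^= 1;
	}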

Finally, have you done any testing, either to ensure there is no slow-down,
or to demonstrate some improvement?

Thanks,
NeilBrown




* Re: Re: [RFC PATCH V1] raid1: rewrite the iobarrier
From: majianpeng @ 2013-02-20  8:54 UTC
  To: NeilBrown; +Cc: linux-raid

>On Thu, 24 Jan 2013 14:02:52 +0800 majianpeng <majianpeng@gmail.com> wrote:
>
>> There is an iobarrier in raid1 for two reasons: resync/recovery, and reconfiguration of the array.
>> At present it suspends all normal IO while resync/recovery is running.
>> But normal IO that falls outside the resync/recovery window does not need the iobarrier,
>> so I rewrote the iobarrier.
>> Because there are two reasons for the barrier, I use two different methods.
>> First, for resync/recovery there is a resync window. Its end position is 'next_resync'. Because the resync depth is RESYNC_DEPTH (32),
>> its start is 'next_resync - RESYNC_SECTORS * RESYNC_DEPTH'.
>> Normal IO is divided into three categories by location:
>> a: before the start of the resync window
>> b: inside the resync window
>> c: after the end of the resync window
>> For a, no barrier is needed.
>> For b, a barrier is needed, using the original method.
>> For c, no barrier is needed, but the minimum position must be recorded. If the next resync step would pass that position, the resync is suspended; otherwise it proceeds.
>> I used an rbtree to keep those IOs ordered.
>> 
>> For reconfiguration of the array, I introduce the concept of a "force_barrier". While a force_barrier is raised, all normal IO must be suspended.
>> 
>> NOTE:
>> This problem also exists in raid10, but I have only implemented it for raid1. It is posted mainly to make sure it is
>> going in the right direction and in the hope of getting some helpful comments.
>> If the method is accepted, I will send a patch for raid10.
>> 
>> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
>
>Hi,
> thanks for this and sorry for the delay in replying.
>
Hi, sorry for the delay in replying. Thanks very much for your suggestions.
>The patch is reasonably good, but there is room for improvement.
>I would break it up into several patches which are easier to review.
>
>- Firstly, don't worry about the barrier for 'read' requests - it really
>  isn't relevant for them (your patch didn't do this).
>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bd6a188..2e5bf75 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -235,6 +235,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
        struct bio *bio = r1_bio->master_bio;
        int done;
        struct r1conf *conf = r1_bio->mddev->private;
+       int rw = bio_data_dir(bio);
 
        if (bio->bi_phys_segments) {
                unsigned long flags;
@@ -253,7 +254,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
                 * Wake up any possible resync thread that waits for the device
                 * to go idle.
                 */
-               allow_barrier(conf);
+               if (rw == WRITE)
+                       allow_barrier(conf);
        }
 }
 
@@ -1035,7 +1037,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
                finish_wait(&conf->wait_barrier, &w);
        }
 
-       wait_barrier(conf);
+       if (rw == WRITE)
+               wait_barrier(conf);
 
        bitmap = mddev->bitmap;

The code above does what you said. But when a read error occurs, raid1d blocks forever.
The reason is in freeze_array:
>       wait_event_lock_irq_cmd(conf->wait_barrier,
>                                conf->nr_pending == conf->nr_queued+1,
Read operations no longer increment nr_pending, so this condition can never become true.
Do you have a good way to handle this?

>- Secondly, find those places where raise_barrier() is used to reconfigure
>  the array and replace those with "freeze_array()".  I think that is safe,
>  and it means that we don't need to pass a 'force' argument to
>  'raise_barrier()' and 'lower_barrier()'.
But that would block forever.
The comment on freeze_array says:
> * This is called in the context of one normal IO request
>	 * that has failed.
and the condition it waits for is
>wait_event_lock_irq_cmd(conf->wait_barrier,
>				conf->nr_pending == conf->nr_queued+1,
Because this function is called from IO context, there is always at least one nr_pending.
But the places that reconfigure the array are not in IO context, so the condition
"conf->nr_pending == conf->nr_queued+1" never becomes true there.
What if we add an 'int iocontext' parameter to freeze_array, i.e.:
>static void freeze_array(struct r1conf *conf, int iocontext)
>	if (iocontext)
>		wait_event_lock_irq_cmd(conf->wait_barrier,
>				conf->nr_pending == conf->nr_queued+1, ...);
>	else
>		wait_event_lock_irq_cmd(conf->wait_barrier,
>				conf->nr_pending == conf->nr_queued, ...);

How about this method?
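
A fuller sketch of that change (based on the current freeze_array(), untested):

	static void freeze_array(struct r1conf *conf, int iocontext)
	{
		/* 'iocontext' means the caller itself accounts for one
		 * pending request (the failed-IO path); the array
		 * reconfiguration paths would pass 0. */
		spin_lock_irq(&conf->resync_lock);
		conf->barrier++;
		conf->nr_waiting++;
		wait_event_lock_irq_cmd(conf->wait_barrier,
					conf->nr_pending ==
					conf->nr_queued + (iocontext ? 1 : 0),
					conf->resync_lock,
					flush_pending_writes(conf));
		spin_unlock_irq(&conf->resync_lock);
	}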
>
>- The rest might have to be all one patch, though if it could be done in a
>  couple of distinct stages that would be good.
>  For the different positions (before, during, after), which you currently
>  call 0, 1, and 2, you should use an enum so they all have names.
> 
OK, thanks!
>  I don't really like the rbtree.  It adds an extra place where you take the
>  spinlock and causes more work to be done inside a spinlock.  And it isn't
>  really needed.  Instead, you can have two R1BIO flags for "AFTER_RESYNC":
>  one for requests that are more than 2 windows after, one for requests that
>  are less than 2 windows after (or maybe 3 windows or maybe 8..).  Each of
>  these has a corresponding count (as you already have: nr_after).
>  Then when resync gets to the end of a window you wait until the count of 
For the resync operation there is no resync window. How can this be done?
>  "less than 2 windows after" reaches zero, then atomically swap the meaning
>  of the two bits (toggle a bit somewhere).
We don't know the nearest position among the requests that are more than 2 windows after.
For example, suppose there are three requests more than 2 windows after:
A is 3 windows after, B is 6 windows after, and C is 11 windows after.
When the count of "less than 2 windows after" reaches zero, what should be done?
>  This should give you nearly the same effect with constant (not log-n)
>  effort.
>

>Finally, have you done any testing, either to ensure there is no slow-down,
>or to demonstrate some improvement?
>
I only tested with dd. There was no apparent performance degradation.

Thanks!
Jianpeng Ma


* Re: [RFC PATCH V1] raid1: rewrite the iobarrier
From: NeilBrown @ 2013-03-04  3:18 UTC
  To: majianpeng; +Cc: linux-raid

On Wed, 20 Feb 2013 16:54:12 +0800 majianpeng <majianpeng@gmail.com> wrote:

> >On Thu, 24 Jan 2013 14:02:52 +0800 majianpeng <majianpeng@gmail.com> wrote:
> >
> >> There is an iobarrier in raid1 for two reasons: resync/recovery, and reconfiguration of the array.
> >> At present it suspends all normal IO while resync/recovery is running.
> >> But normal IO that falls outside the resync/recovery window does not need the iobarrier,
> >> so I rewrote the iobarrier.
> >> Because there are two reasons for the barrier, I use two different methods.
> >> First, for resync/recovery there is a resync window. Its end position is 'next_resync'. Because the resync depth is RESYNC_DEPTH (32),
> >> its start is 'next_resync - RESYNC_SECTORS * RESYNC_DEPTH'.
> >> Normal IO is divided into three categories by location:
> >> a: before the start of the resync window
> >> b: inside the resync window
> >> c: after the end of the resync window
> >> For a, no barrier is needed.
> >> For b, a barrier is needed, using the original method.
> >> For c, no barrier is needed, but the minimum position must be recorded. If the next resync step would pass that position, the resync is suspended; otherwise it proceeds.
> >> I used an rbtree to keep those IOs ordered.
> >> 
> >> For reconfiguration of the array, I introduce the concept of a "force_barrier". While a force_barrier is raised, all normal IO must be suspended.
> >> 
> >> NOTE:
> >> This problem also exists in raid10, but I have only implemented it for raid1. It is posted mainly to make sure it is
> >> going in the right direction and in the hope of getting some helpful comments.
> >> If the method is accepted, I will send a patch for raid10.
> >> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
> >
> >Hi,
> > thanks for this and sorry for the delay in replying.
> >
> Hi, sorry for the delay in replying. Thanks very much for your suggestions.
> >The patch is reasonably good, but there is room for improvement.
> >I would break it up into several patches which are easier to review.
> >
> >- Firstly, don't worry about the barrier for 'read' requests - it really
> >  isn't relevant for them (your patch didn't do this).
> >
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index bd6a188..2e5bf75 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -235,6 +235,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
>         struct bio *bio = r1_bio->master_bio;
>         int done;
>         struct r1conf *conf = r1_bio->mddev->private;
> +       int rw = bio_data_dir(bio);
>  
>         if (bio->bi_phys_segments) {
>                 unsigned long flags;
> @@ -253,7 +254,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
>                  * Wake up any possible resync thread that waits for the device
>                  * to go idle.
>                  */
> -               allow_barrier(conf);
> +               if (rw == WRITE)
> +                       allow_barrier(conf);
>         }
>  }
>  
> @@ -1035,7 +1037,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
>                 finish_wait(&conf->wait_barrier, &w);
>         }
>  
> -       wait_barrier(conf);
> +       if (rw == WRITE)
> +               wait_barrier(conf);
>  
>         bitmap = mddev->bitmap;
> 
> The code above does what you said. But when a read error occurs, raid1d blocks forever.
> The reason is in freeze_array:
> >       wait_event_lock_irq_cmd(conf->wait_barrier,
> >                                conf->nr_pending == conf->nr_queued+1,
> Read operations no longer increment nr_pending, so this condition can never become true.
> Do you have a good way to handle this?

Only update nr_queued for Write requests, not for read requests?
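
i.e. something like this in reschedule_retry() (hypothetical, untested):

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r1_bio->retry_list, &conf->retry_list);
	/* Only writes go through the barrier now, so only writes should
	 * be counted against nr_pending in freeze_array(). */
	if (bio_data_dir(r1_bio->master_bio) == WRITE)
		conf->nr_queued++;
	spin_unlock_irqrestore(&conf->device_lock, flags);
	wake_up(&conf->wait_barrier);
	md_wakeup_thread(mddev->thread);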


> 
> >- Secondly, find those places where raise_barrier() is used to reconfigure
> >  the array and replace those with "freeze_array()".  I think that is safe,
> >  and it means that we don't need to pass a 'force' argument to
> >  'raise_barrier()' and 'lower_barrier()'.
> But that would block forever.
> The comment on freeze_array says:
> > * This is called in the context of one normal IO request
> >	 * that has failed.
> and the condition it waits for is
> >wait_event_lock_irq_cmd(conf->wait_barrier,
> >				conf->nr_pending == conf->nr_queued+1,
> Because this function is called from IO context, there is always at least one nr_pending.
> But the places that reconfigure the array are not in IO context, so the condition
> "conf->nr_pending == conf->nr_queued+1" never becomes true there.
> What if we add an 'int iocontext' parameter to freeze_array, i.e.:
> >static void freeze_array(struct r1conf *conf, int iocontext)
> >	if (iocontext)
> >		wait_event_lock_irq_cmd(conf->wait_barrier,
> >				conf->nr_pending == conf->nr_queued+1, ...);
> >	else
> >		wait_event_lock_irq_cmd(conf->wait_barrier,
> >				conf->nr_pending == conf->nr_queued, ...);
> 
> How about this method?

Probably makes sense - hard to tell without the full context of a complete
patch.


> >
> >- The rest might have to be all one patch, though if it could be done in a
> >  couple of distinct stages that would be good.
> >  For the different positions (before, during, after), which you currently
> >  call 0, 1, and 2, you should use an enum so they all have names.
> > 
> OK, thanks!

If you could put blank lines around the text which is your reply, it would make
it a lot easier to find and to read.

> >  I don't really like the rbtree.  It adds an extra place where you take the
> >  spinlock and causes more work to be done inside a spinlock.  And it isn't
> >  really needed.  Instead, you can have two R1BIO flags for "AFTER_RESYNC":
> >  one for requests that are more than 2 windows after, one for requests that
> >  are less than 2 windows after (or maybe 3 windows or maybe 8..).  Each of
> >  these has a corresponding count (as you already have: nr_after).
> >  Then when resync gets to the end of a window you wait until the count of 
> For the resync operation there is no resync window. How can this be done?

You aren't explaining yourself very well (again!).

> >  "less than 2 windows after" reaches zero, then atomically swap the meaning
> >  of the two bits (toggle a bit somewhere).
> We don't know the nearest position among the requests that are more than 2 windows after.
> For example, suppose there are three requests more than 2 windows after:
> A is 3 windows after, B is 6 windows after, and C is 11 windows after.
> When the count of "less than 2 windows after" reaches zero, what should be done?
> >  This should give you nearly the same effect with constant (not log-n)
> >  effort.
> >
> 
> >Finally, have you done any testing, either to ensure there is no slow-down,
> >or to demonstrate some improvement?
> >
> I only tested with dd. There was no apparent performance degradation.

There is no point reporting any results of a performance test without
actually giving numbers.


NeilBrown

> 
> Thanks!
> Jianpeng Ma
