linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC]raid5: adjust operation order of handle_stripe
@ 2014-05-12  1:16 Shaohua Li
  2014-05-19  2:21 ` Shaohua Li
  2014-05-20  7:21 ` NeilBrown
  0 siblings, 2 replies; 4+ messages in thread
From: Shaohua Li @ 2014-05-12  1:16 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb


For a full stripe write, the ideal operation order is handle_stripe_dirtying(),
raid_run_ops(), set R5_Wantwrite bit, and ops_run_io(). In this way, one
handle_stripe() will dispatch IO for the stripe, otherwise there are more extra
rounds of handle_stripe(). In a high speed raid5 array, handle_stripe()
consumes considered cpu time. Reducing its overhead has around 10% performance
boost.

This patch makes handle_stripe() operations follow the ideal order. It also
moves handle_stripe_clean_event() up, as it handles completed stripe. And if I
don't change handle_stripe_clean_event() order, I saw some states confused with
other changes.

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 drivers/md/raid5.c |   39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

Index: linux/drivers/md/raid5.c
===================================================================
--- linux.orig/drivers/md/raid5.c	2014-05-06 17:19:13.868225752 +0800
+++ linux/drivers/md/raid5.c	2014-05-06 17:20:25.367326852 +0800
@@ -1633,7 +1633,7 @@ static void ops_run_check_pq(struct stri
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long *ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -1644,12 +1644,12 @@ static void raid_run_ops(struct stripe_h
 
 	cpu = get_cpu();
 	percpu = per_cpu_ptr(conf->percpu, cpu);
-	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
+	if (test_and_clear_bit(STRIPE_OP_BIOFILL, ops_request)) {
 		ops_run_biofill(sh);
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
+	if (test_and_clear_bit(STRIPE_OP_COMPUTE_BLK, ops_request)) {
 		if (level < 6)
 			tx = ops_run_compute5(sh, percpu);
 		else {
@@ -1659,26 +1659,26 @@ static void raid_run_ops(struct stripe_h
 				tx = ops_run_compute6_2(sh, percpu);
 		}
 		/* terminate the chain if reconstruct is not set to be run */
-		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
+		if (tx && !test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request))
 			async_tx_ack(tx);
 	}
 
-	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
+	if (test_and_clear_bit(STRIPE_OP_PREXOR, ops_request))
 		tx = ops_run_prexor(sh, percpu, tx);
 
-	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
+	if (test_and_clear_bit(STRIPE_OP_BIODRAIN, ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
+	if (test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request)) {
 		if (level < 6)
 			ops_run_reconstruct5(sh, percpu, tx);
 		else
 			ops_run_reconstruct6(sh, percpu, tx);
 	}
 
-	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
+	if (test_and_clear_bit(STRIPE_OP_CHECK, ops_request)) {
 		if (sh->check_state == check_state_run)
 			ops_run_check_p(sh, percpu);
 		else if (sh->check_state == check_state_run_q)
@@ -3780,6 +3780,41 @@ static void handle_stripe(struct stripe_
 			handle_failed_sync(conf, sh, &s);
 	}
 
+	/*
+	 * might be able to return some write requests if the parity blocks
+	 * are safe, or on a failed drive
+	 */
+	pdev = &sh->dev[sh->pd_idx];
+	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+	qdev = &sh->dev[sh->qd_idx];
+	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+		|| conf->level < 6;
+
+	if (s.written &&
+	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+			     && !test_bit(R5_LOCKED, &pdev->flags)
+			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
+				 test_bit(R5_Discard, &pdev->flags))))) &&
+	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && !test_bit(R5_LOCKED, &qdev->flags)
+			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
+				 test_bit(R5_Discard, &qdev->flags))))))
+		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+	/* Now to consider new write requests and what else, if anything
+	 * should be read.  We do not handle new writes when:
+	 * 1/ A 'write' operation (copy+xor) is already in flight.
+	 * 2/ A 'check' operation is in flight, as it may clobber the parity
+	 *    block.
+	 */
+	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
+		handle_stripe_dirtying(conf, sh, &s, disks);
+
+	if (s.ops_request)
+		raid_run_ops(sh, &s.ops_request);
+
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
@@ -3817,29 +3852,6 @@ static void handle_stripe(struct stripe_
 			s.dec_preread_active = 1;
 	}
 
-	/*
-	 * might be able to return some write requests if the parity blocks
-	 * are safe, or on a failed drive
-	 */
-	pdev = &sh->dev[sh->pd_idx];
-	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
-	qdev = &sh->dev[sh->qd_idx];
-	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
-		|| conf->level < 6;
-
-	if (s.written &&
-	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
-			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
-				 test_bit(R5_Discard, &pdev->flags))))) &&
-	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
-			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
-				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
-
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
@@ -3851,15 +3863,6 @@ static void handle_stripe(struct stripe_
 	    || s.expanding)
 		handle_stripe_fill(sh, &s, disks);
 
-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
-	 * 1/ A 'write' operation (copy+xor) is already in flight.
-	 * 2/ A 'check' operation is in flight, as it may clobber the parity
-	 *    block.
-	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying(conf, sh, &s, disks);
-
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
 	 * data is available.  The parity check is held off while parity
@@ -4014,7 +4017,7 @@ finish:
 		}
 
 	if (s.ops_request)
-		raid_run_ops(sh, s.ops_request);
+		raid_run_ops(sh, &s.ops_request);
 
 	ops_run_io(sh, &s);
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC]raid5: adjust operation order of handle_stripe
  2014-05-12  1:16 [RFC]raid5: adjust operation order of handle_stripe Shaohua Li
@ 2014-05-19  2:21 ` Shaohua Li
  2014-05-19  2:42   ` NeilBrown
  2014-05-20  7:21 ` NeilBrown
  1 sibling, 1 reply; 4+ messages in thread
From: Shaohua Li @ 2014-05-19  2:21 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb

Ping!

On Mon, May 12, 2014 at 09:16:59AM +0800, Shaohua Li wrote:
> 
> For a full stripe write, the ideal operation order is handle_stripe_dirtying(),
> raid_run_ops(), set R5_Wantwrite bit, and ops_run_io(). In this way, one
> handle_stripe() will dispatch IO for the stripe, otherwise there are more extra
> rounds of handle_stripe(). In a high speed raid5 array, handle_stripe()
> consumes considered cpu time. Reducing its overhead has around 10% performance
> boost.
> 
> This patch makes handle_stripe() operations follow the ideal order. It also
> moves handle_stripe_clean_event() up, as it handles completed stripe. And if I
> don't change handle_stripe_clean_event() order, I saw some states confused with
> other changes.
> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>
> ---
>  drivers/md/raid5.c |   39 +++++++++++++++++++++------------------
>  1 file changed, 21 insertions(+), 18 deletions(-)
> 
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2014-05-06 17:19:13.868225752 +0800
> +++ linux/drivers/md/raid5.c	2014-05-06 17:20:25.367326852 +0800
> @@ -1633,7 +1633,7 @@ static void ops_run_check_pq(struct stri
>  			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
>  }
>  
> -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
> +static void raid_run_ops(struct stripe_head *sh, unsigned long *ops_request)
>  {
>  	int overlap_clear = 0, i, disks = sh->disks;
>  	struct dma_async_tx_descriptor *tx = NULL;
> @@ -1644,12 +1644,12 @@ static void raid_run_ops(struct stripe_h
>  
>  	cpu = get_cpu();
>  	percpu = per_cpu_ptr(conf->percpu, cpu);
> -	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_BIOFILL, ops_request)) {
>  		ops_run_biofill(sh);
>  		overlap_clear++;
>  	}
>  
> -	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_COMPUTE_BLK, ops_request)) {
>  		if (level < 6)
>  			tx = ops_run_compute5(sh, percpu);
>  		else {
> @@ -1659,26 +1659,26 @@ static void raid_run_ops(struct stripe_h
>  				tx = ops_run_compute6_2(sh, percpu);
>  		}
>  		/* terminate the chain if reconstruct is not set to be run */
> -		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
> +		if (tx && !test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request))
>  			async_tx_ack(tx);
>  	}
>  
> -	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
> +	if (test_and_clear_bit(STRIPE_OP_PREXOR, ops_request))
>  		tx = ops_run_prexor(sh, percpu, tx);
>  
> -	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_BIODRAIN, ops_request)) {
>  		tx = ops_run_biodrain(sh, tx);
>  		overlap_clear++;
>  	}
>  
> -	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request)) {
>  		if (level < 6)
>  			ops_run_reconstruct5(sh, percpu, tx);
>  		else
>  			ops_run_reconstruct6(sh, percpu, tx);
>  	}
>  
> -	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_CHECK, ops_request)) {
>  		if (sh->check_state == check_state_run)
>  			ops_run_check_p(sh, percpu);
>  		else if (sh->check_state == check_state_run_q)
> @@ -3780,6 +3780,41 @@ static void handle_stripe(struct stripe_
>  			handle_failed_sync(conf, sh, &s);
>  	}
>  
> +	/*
> +	 * might be able to return some write requests if the parity blocks
> +	 * are safe, or on a failed drive
> +	 */
> +	pdev = &sh->dev[sh->pd_idx];
> +	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
> +		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
> +	qdev = &sh->dev[sh->qd_idx];
> +	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
> +		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
> +		|| conf->level < 6;
> +
> +	if (s.written &&
> +	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
> +			     && !test_bit(R5_LOCKED, &pdev->flags)
> +			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
> +				 test_bit(R5_Discard, &pdev->flags))))) &&
> +	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
> +			     && !test_bit(R5_LOCKED, &qdev->flags)
> +			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
> +				 test_bit(R5_Discard, &qdev->flags))))))
> +		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
> +
> +	/* Now to consider new write requests and what else, if anything
> +	 * should be read.  We do not handle new writes when:
> +	 * 1/ A 'write' operation (copy+xor) is already in flight.
> +	 * 2/ A 'check' operation is in flight, as it may clobber the parity
> +	 *    block.
> +	 */
> +	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
> +		handle_stripe_dirtying(conf, sh, &s, disks);
> +
> +	if (s.ops_request)
> +		raid_run_ops(sh, &s.ops_request);
> +
>  	/* Now we check to see if any write operations have recently
>  	 * completed
>  	 */
> @@ -3817,29 +3852,6 @@ static void handle_stripe(struct stripe_
>  			s.dec_preread_active = 1;
>  	}
>  
> -	/*
> -	 * might be able to return some write requests if the parity blocks
> -	 * are safe, or on a failed drive
> -	 */
> -	pdev = &sh->dev[sh->pd_idx];
> -	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
> -		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
> -	qdev = &sh->dev[sh->qd_idx];
> -	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
> -		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
> -		|| conf->level < 6;
> -
> -	if (s.written &&
> -	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
> -			     && !test_bit(R5_LOCKED, &pdev->flags)
> -			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
> -				 test_bit(R5_Discard, &pdev->flags))))) &&
> -	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
> -			     && !test_bit(R5_LOCKED, &qdev->flags)
> -			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
> -				 test_bit(R5_Discard, &qdev->flags))))))
> -		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
> -
>  	/* Now we might consider reading some blocks, either to check/generate
>  	 * parity, or to satisfy requests
>  	 * or to load a block that is being partially written.
> @@ -3851,15 +3863,6 @@ static void handle_stripe(struct stripe_
>  	    || s.expanding)
>  		handle_stripe_fill(sh, &s, disks);
>  
> -	/* Now to consider new write requests and what else, if anything
> -	 * should be read.  We do not handle new writes when:
> -	 * 1/ A 'write' operation (copy+xor) is already in flight.
> -	 * 2/ A 'check' operation is in flight, as it may clobber the parity
> -	 *    block.
> -	 */
> -	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
> -		handle_stripe_dirtying(conf, sh, &s, disks);
> -
>  	/* maybe we need to check and possibly fix the parity for this stripe
>  	 * Any reads will already have been scheduled, so we just see if enough
>  	 * data is available.  The parity check is held off while parity
> @@ -4014,7 +4017,7 @@ finish:
>  		}
>  
>  	if (s.ops_request)
> -		raid_run_ops(sh, s.ops_request);
> +		raid_run_ops(sh, &s.ops_request);
>  
>  	ops_run_io(sh, &s);
>  

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC]raid5: adjust operation order of handle_stripe
  2014-05-19  2:21 ` Shaohua Li
@ 2014-05-19  2:42   ` NeilBrown
  0 siblings, 0 replies; 4+ messages in thread
From: NeilBrown @ 2014-05-19  2:42 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid

[-- Attachment #1: Type: text/plain, Size: 7800 bytes --]

On Mon, 19 May 2014 10:21:17 +0800 Shaohua Li <shli@kernel.org> wrote:

> Ping!

Sorry,  I've been busy.  I'll try to look at this this week.

Thanks,
NeilBrown

> 
> On Mon, May 12, 2014 at 09:16:59AM +0800, Shaohua Li wrote:
> > 
> > For a full stripe write, the ideal operation order is handle_stripe_dirtying(),
> > raid_run_ops(), set R5_Wantwrite bit, and ops_run_io(). In this way, one
> > handle_stripe() will dispatch IO for the stripe, otherwise there are more extra
> > rounds of handle_stripe(). In a high speed raid5 array, handle_stripe()
> > consumes considered cpu time. Reducing its overhead has around 10% performance
> > boost.
> > 
> > This patch makes handle_stripe() operations follow the ideal order. It also
> > moves handle_stripe_clean_event() up, as it handles completed stripe. And if I
> > don't change handle_stripe_clean_event() order, I saw some states confused with
> > other changes.
> > 
> > Signed-off-by: Shaohua Li <shli@fusionio.com>
> > ---
> >  drivers/md/raid5.c |   39 +++++++++++++++++++++------------------
> >  1 file changed, 21 insertions(+), 18 deletions(-)
> > 
> > Index: linux/drivers/md/raid5.c
> > ===================================================================
> > --- linux.orig/drivers/md/raid5.c	2014-05-06 17:19:13.868225752 +0800
> > +++ linux/drivers/md/raid5.c	2014-05-06 17:20:25.367326852 +0800
> > @@ -1633,7 +1633,7 @@ static void ops_run_check_pq(struct stri
> >  			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
> >  }
> >  
> > -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
> > +static void raid_run_ops(struct stripe_head *sh, unsigned long *ops_request)
> >  {
> >  	int overlap_clear = 0, i, disks = sh->disks;
> >  	struct dma_async_tx_descriptor *tx = NULL;
> > @@ -1644,12 +1644,12 @@ static void raid_run_ops(struct stripe_h
> >  
> >  	cpu = get_cpu();
> >  	percpu = per_cpu_ptr(conf->percpu, cpu);
> > -	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
> > +	if (test_and_clear_bit(STRIPE_OP_BIOFILL, ops_request)) {
> >  		ops_run_biofill(sh);
> >  		overlap_clear++;
> >  	}
> >  
> > -	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
> > +	if (test_and_clear_bit(STRIPE_OP_COMPUTE_BLK, ops_request)) {
> >  		if (level < 6)
> >  			tx = ops_run_compute5(sh, percpu);
> >  		else {
> > @@ -1659,26 +1659,26 @@ static void raid_run_ops(struct stripe_h
> >  				tx = ops_run_compute6_2(sh, percpu);
> >  		}
> >  		/* terminate the chain if reconstruct is not set to be run */
> > -		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
> > +		if (tx && !test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request))
> >  			async_tx_ack(tx);
> >  	}
> >  
> > -	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
> > +	if (test_and_clear_bit(STRIPE_OP_PREXOR, ops_request))
> >  		tx = ops_run_prexor(sh, percpu, tx);
> >  
> > -	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
> > +	if (test_and_clear_bit(STRIPE_OP_BIODRAIN, ops_request)) {
> >  		tx = ops_run_biodrain(sh, tx);
> >  		overlap_clear++;
> >  	}
> >  
> > -	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
> > +	if (test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request)) {
> >  		if (level < 6)
> >  			ops_run_reconstruct5(sh, percpu, tx);
> >  		else
> >  			ops_run_reconstruct6(sh, percpu, tx);
> >  	}
> >  
> > -	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
> > +	if (test_and_clear_bit(STRIPE_OP_CHECK, ops_request)) {
> >  		if (sh->check_state == check_state_run)
> >  			ops_run_check_p(sh, percpu);
> >  		else if (sh->check_state == check_state_run_q)
> > @@ -3780,6 +3780,41 @@ static void handle_stripe(struct stripe_
> >  			handle_failed_sync(conf, sh, &s);
> >  	}
> >  
> > +	/*
> > +	 * might be able to return some write requests if the parity blocks
> > +	 * are safe, or on a failed drive
> > +	 */
> > +	pdev = &sh->dev[sh->pd_idx];
> > +	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
> > +		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
> > +	qdev = &sh->dev[sh->qd_idx];
> > +	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
> > +		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
> > +		|| conf->level < 6;
> > +
> > +	if (s.written &&
> > +	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
> > +			     && !test_bit(R5_LOCKED, &pdev->flags)
> > +			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
> > +				 test_bit(R5_Discard, &pdev->flags))))) &&
> > +	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
> > +			     && !test_bit(R5_LOCKED, &qdev->flags)
> > +			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
> > +				 test_bit(R5_Discard, &qdev->flags))))))
> > +		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
> > +
> > +	/* Now to consider new write requests and what else, if anything
> > +	 * should be read.  We do not handle new writes when:
> > +	 * 1/ A 'write' operation (copy+xor) is already in flight.
> > +	 * 2/ A 'check' operation is in flight, as it may clobber the parity
> > +	 *    block.
> > +	 */
> > +	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
> > +		handle_stripe_dirtying(conf, sh, &s, disks);
> > +
> > +	if (s.ops_request)
> > +		raid_run_ops(sh, &s.ops_request);
> > +
> >  	/* Now we check to see if any write operations have recently
> >  	 * completed
> >  	 */
> > @@ -3817,29 +3852,6 @@ static void handle_stripe(struct stripe_
> >  			s.dec_preread_active = 1;
> >  	}
> >  
> > -	/*
> > -	 * might be able to return some write requests if the parity blocks
> > -	 * are safe, or on a failed drive
> > -	 */
> > -	pdev = &sh->dev[sh->pd_idx];
> > -	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
> > -		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
> > -	qdev = &sh->dev[sh->qd_idx];
> > -	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
> > -		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
> > -		|| conf->level < 6;
> > -
> > -	if (s.written &&
> > -	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
> > -			     && !test_bit(R5_LOCKED, &pdev->flags)
> > -			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
> > -				 test_bit(R5_Discard, &pdev->flags))))) &&
> > -	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
> > -			     && !test_bit(R5_LOCKED, &qdev->flags)
> > -			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
> > -				 test_bit(R5_Discard, &qdev->flags))))))
> > -		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
> > -
> >  	/* Now we might consider reading some blocks, either to check/generate
> >  	 * parity, or to satisfy requests
> >  	 * or to load a block that is being partially written.
> > @@ -3851,15 +3863,6 @@ static void handle_stripe(struct stripe_
> >  	    || s.expanding)
> >  		handle_stripe_fill(sh, &s, disks);
> >  
> > -	/* Now to consider new write requests and what else, if anything
> > -	 * should be read.  We do not handle new writes when:
> > -	 * 1/ A 'write' operation (copy+xor) is already in flight.
> > -	 * 2/ A 'check' operation is in flight, as it may clobber the parity
> > -	 *    block.
> > -	 */
> > -	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
> > -		handle_stripe_dirtying(conf, sh, &s, disks);
> > -
> >  	/* maybe we need to check and possibly fix the parity for this stripe
> >  	 * Any reads will already have been scheduled, so we just see if enough
> >  	 * data is available.  The parity check is held off while parity
> > @@ -4014,7 +4017,7 @@ finish:
> >  		}
> >  
> >  	if (s.ops_request)
> > -		raid_run_ops(sh, s.ops_request);
> > +		raid_run_ops(sh, &s.ops_request);
> >  
> >  	ops_run_io(sh, &s);
> >  


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC]raid5: adjust operation order of handle_stripe
  2014-05-12  1:16 [RFC]raid5: adjust operation order of handle_stripe Shaohua Li
  2014-05-19  2:21 ` Shaohua Li
@ 2014-05-20  7:21 ` NeilBrown
  1 sibling, 0 replies; 4+ messages in thread
From: NeilBrown @ 2014-05-20  7:21 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid

[-- Attachment #1: Type: text/plain, Size: 8477 bytes --]

On Mon, 12 May 2014 09:16:59 +0800 Shaohua Li <shli@kernel.org> wrote:

> 
> For a full stripe write, the ideal operation order is handle_stripe_dirtying(),
> raid_run_ops(), set R5_Wantwrite bit, and ops_run_io(). In this way, one
> handle_stripe() will dispatch IO for the stripe, otherwise there are more extra
> rounds of handle_stripe(). In a high speed raid5 array, handle_stripe()
> consumes considered cpu time. Reducing its overhead has around 10% performance
> boost.
> 
> This patch makes handle_stripe() operations follow the ideal order. It also
> moves handle_stripe_clean_event() up, as it handles completed stripe. And if I
> don't change handle_stripe_clean_event() order, I saw some states confused with
> other changes.
> 
> Signed-off-by: Shaohua Li <shli@fusionio.com>

handle_stripe() now calls raid_run_ops twice, which is quite a change.
I would need a clear statement of why that is OK.
If the xor etc operations are being performed asynchronously you will get
very different behaviour than if they are synchronous.  That might all work
perfectly, but again I'd like to be sure that question has been considered
and answered.

Also you seem to have about 3 changes in this patch.

- change raid_run_ops to be passed ops_request by reference instead of by
  value.  This would be easier to review in a separate patch.
- move handle_stripe_clean_event(), as you mention above.  Having this
  in a separate patch and explaining why it makes sense would be good.
- the main change that you want to make would be then a simple small patch
  with few distractions and so easier to review.

It might be a good idea to take this opportunity to split handle_stripe() up
a bit.  As you say, some of the code handles completed stripes, some flags
desired changes, and some executes those changes.  Having those pieces in
different functions would improve the code a lot.

Would you like to make that change (please) ?

Thanks,
NeilBrown


> ---
>  drivers/md/raid5.c |   39 +++++++++++++++++++++------------------
>  1 file changed, 21 insertions(+), 18 deletions(-)
> 
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2014-05-06 17:19:13.868225752 +0800
> +++ linux/drivers/md/raid5.c	2014-05-06 17:20:25.367326852 +0800
> @@ -1633,7 +1633,7 @@ static void ops_run_check_pq(struct stri
>  			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
>  }
>  
> -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
> +static void raid_run_ops(struct stripe_head *sh, unsigned long *ops_request)
>  {
>  	int overlap_clear = 0, i, disks = sh->disks;
>  	struct dma_async_tx_descriptor *tx = NULL;
> @@ -1644,12 +1644,12 @@ static void raid_run_ops(struct stripe_h
>  
>  	cpu = get_cpu();
>  	percpu = per_cpu_ptr(conf->percpu, cpu);
> -	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_BIOFILL, ops_request)) {
>  		ops_run_biofill(sh);
>  		overlap_clear++;
>  	}
>  
> -	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_COMPUTE_BLK, ops_request)) {
>  		if (level < 6)
>  			tx = ops_run_compute5(sh, percpu);
>  		else {
> @@ -1659,26 +1659,26 @@ static void raid_run_ops(struct stripe_h
>  				tx = ops_run_compute6_2(sh, percpu);
>  		}
>  		/* terminate the chain if reconstruct is not set to be run */
> -		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
> +		if (tx && !test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request))
>  			async_tx_ack(tx);
>  	}
>  
> -	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
> +	if (test_and_clear_bit(STRIPE_OP_PREXOR, ops_request))
>  		tx = ops_run_prexor(sh, percpu, tx);
>  
> -	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_BIODRAIN, ops_request)) {
>  		tx = ops_run_biodrain(sh, tx);
>  		overlap_clear++;
>  	}
>  
> -	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_RECONSTRUCT, ops_request)) {
>  		if (level < 6)
>  			ops_run_reconstruct5(sh, percpu, tx);
>  		else
>  			ops_run_reconstruct6(sh, percpu, tx);
>  	}
>  
> -	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
> +	if (test_and_clear_bit(STRIPE_OP_CHECK, ops_request)) {
>  		if (sh->check_state == check_state_run)
>  			ops_run_check_p(sh, percpu);
>  		else if (sh->check_state == check_state_run_q)
> @@ -3780,6 +3780,41 @@ static void handle_stripe(struct stripe_
>  			handle_failed_sync(conf, sh, &s);
>  	}
>  
> +	/*
> +	 * might be able to return some write requests if the parity blocks
> +	 * are safe, or on a failed drive
> +	 */
> +	pdev = &sh->dev[sh->pd_idx];
> +	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
> +		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
> +	qdev = &sh->dev[sh->qd_idx];
> +	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
> +		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
> +		|| conf->level < 6;
> +
> +	if (s.written &&
> +	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
> +			     && !test_bit(R5_LOCKED, &pdev->flags)
> +			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
> +				 test_bit(R5_Discard, &pdev->flags))))) &&
> +	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
> +			     && !test_bit(R5_LOCKED, &qdev->flags)
> +			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
> +				 test_bit(R5_Discard, &qdev->flags))))))
> +		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
> +
> +	/* Now to consider new write requests and what else, if anything
> +	 * should be read.  We do not handle new writes when:
> +	 * 1/ A 'write' operation (copy+xor) is already in flight.
> +	 * 2/ A 'check' operation is in flight, as it may clobber the parity
> +	 *    block.
> +	 */
> +	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
> +		handle_stripe_dirtying(conf, sh, &s, disks);
> +
> +	if (s.ops_request)
> +		raid_run_ops(sh, &s.ops_request);
> +
>  	/* Now we check to see if any write operations have recently
>  	 * completed
>  	 */
> @@ -3817,29 +3852,6 @@ static void handle_stripe(struct stripe_
>  			s.dec_preread_active = 1;
>  	}
>  
> -	/*
> -	 * might be able to return some write requests if the parity blocks
> -	 * are safe, or on a failed drive
> -	 */
> -	pdev = &sh->dev[sh->pd_idx];
> -	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
> -		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
> -	qdev = &sh->dev[sh->qd_idx];
> -	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
> -		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
> -		|| conf->level < 6;
> -
> -	if (s.written &&
> -	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
> -			     && !test_bit(R5_LOCKED, &pdev->flags)
> -			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
> -				 test_bit(R5_Discard, &pdev->flags))))) &&
> -	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
> -			     && !test_bit(R5_LOCKED, &qdev->flags)
> -			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
> -				 test_bit(R5_Discard, &qdev->flags))))))
> -		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
> -
>  	/* Now we might consider reading some blocks, either to check/generate
>  	 * parity, or to satisfy requests
>  	 * or to load a block that is being partially written.
> @@ -3851,15 +3863,6 @@ static void handle_stripe(struct stripe_
>  	    || s.expanding)
>  		handle_stripe_fill(sh, &s, disks);
>  
> -	/* Now to consider new write requests and what else, if anything
> -	 * should be read.  We do not handle new writes when:
> -	 * 1/ A 'write' operation (copy+xor) is already in flight.
> -	 * 2/ A 'check' operation is in flight, as it may clobber the parity
> -	 *    block.
> -	 */
> -	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
> -		handle_stripe_dirtying(conf, sh, &s, disks);
> -
>  	/* maybe we need to check and possibly fix the parity for this stripe
>  	 * Any reads will already have been scheduled, so we just see if enough
>  	 * data is available.  The parity check is held off while parity
> @@ -4014,7 +4017,7 @@ finish:
>  		}
>  
>  	if (s.ops_request)
> -		raid_run_ops(sh, s.ops_request);
> +		raid_run_ops(sh, &s.ops_request);
>  
>  	ops_run_io(sh, &s);
>  


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2014-05-20  7:21 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-05-12  1:16 [RFC]raid5: adjust operation order of handle_stripe Shaohua Li
2014-05-19  2:21 ` Shaohua Li
2014-05-19  2:42   ` NeilBrown
2014-05-20  7:21 ` NeilBrown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).