[PATCH] qla2xxx: Resolved a performance issue in interrupt handler

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
@ 2009-06-09 18:42 Anirban Chakraborty
  2009-06-09 19:32 ` James Bottomley
  0 siblings, 1 reply; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-09 18:42 UTC (permalink / raw)
  To: linux-scsi; +Cc: douglas.w.styner, James.Bottomley

Reverted back a change from spin_lock to spin_lock_irq* that was causing
the cycle times to go up.
The patch is based on scsi-misc-2.6.

Reported-and-checked-by: Douglas W. Styner <douglas.w.styner@intel.com>
Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 drivers/scsi/qla2xxx/qla_isr.c |   30 ++++++++++++++++++------------
 1 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index c8d0a17..bc92feb 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -37,6 +37,7 @@ qla2100_intr_handler(int irq, void *dev_id)
 	uint16_t	hccr;
 	uint16_t	mb[4];
 	struct rsp_que *rsp;
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -49,7 +50,7 @@ qla2100_intr_handler(int irq, void *dev_id)
 	reg = &ha->iobase->isp;
 	status = 0;
 
-	spin_lock(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	vha = pci_get_drvdata(ha->pdev);
 	for (iter = 50; iter--; ) {
 		hccr = RD_REG_WORD(&reg->hccr);
@@ -101,7 +102,7 @@ qla2100_intr_handler(int irq, void *dev_id)
 			RD_REG_WORD(&reg->hccr);
 		}
 	}
-	spin_unlock(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
@@ -133,6 +134,7 @@ qla2300_intr_handler(int irq, void *dev_id)
 	uint16_t	mb[4];
 	struct rsp_que *rsp;
 	struct qla_hw_data *ha;
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -145,7 +147,7 @@ qla2300_intr_handler(int irq, void *dev_id)
 	reg = &ha->iobase->isp;
 	status = 0;
 
-	spin_lock(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	vha = pci_get_drvdata(ha->pdev);
 	for (iter = 50; iter--; ) {
 		stat = RD_REG_DWORD(&reg->u.isp2300.host_status);
@@ -216,7 +218,7 @@ qla2300_intr_handler(int irq, void *dev_id)
 		WRT_REG_WORD(&reg->hccr, HCCR_CLR_RISC_INT);
 		RD_REG_WORD_RELAXED(&reg->hccr);
 	}
-	spin_unlock(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
@@ -1626,6 +1628,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
 	uint32_t	hccr;
 	uint16_t	mb[4];
 	struct rsp_que *rsp;
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -1638,7 +1641,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
 	reg = &ha->iobase->isp24;
 	status = 0;
 
-	spin_lock(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	vha = pci_get_drvdata(ha->pdev);
 	for (iter = 50; iter--; ) {
 		stat = RD_REG_DWORD(&reg->host_status);
@@ -1688,7 +1691,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
 		WRT_REG_DWORD(&reg->hccr, HCCRX_CLR_RISC_INT);
 		RD_REG_DWORD_RELAXED(&reg->hccr);
 	}
-	spin_unlock(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
@@ -1706,6 +1709,7 @@ qla24xx_msix_rsp_q(int irq, void *dev_id)
 	struct rsp_que *rsp;
 	struct device_reg_24xx __iomem *reg;
 	struct scsi_qla_host *vha;
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -1716,13 +1720,13 @@ qla24xx_msix_rsp_q(int irq, void *dev_id)
 	ha = rsp->hw;
 	reg = &ha->iobase->isp24;
 
-	spin_lock_irq(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 
 	vha = qla25xx_get_host(rsp);
 	qla24xx_process_response_queue(vha, rsp);
 	WRT_REG_DWORD(&reg->hccr, HCCRX_CLR_RISC_INT);
 
-	spin_unlock_irq(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	return IRQ_HANDLED;
 }
@@ -1757,6 +1761,7 @@ qla24xx_msix_default(int irq, void *dev_id)
 	uint32_t	stat;
 	uint32_t	hccr;
 	uint16_t	mb[4];
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -1768,7 +1773,7 @@ qla24xx_msix_default(int irq, void *dev_id)
 	reg = &ha->iobase->isp24;
 	status = 0;
 
-	spin_lock_irq(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	vha = pci_get_drvdata(ha->pdev);
 	do {
 		stat = RD_REG_DWORD(&reg->host_status);
@@ -1817,7 +1822,7 @@ qla24xx_msix_default(int irq, void *dev_id)
 		}
 		WRT_REG_DWORD(&reg->hccr, HCCRX_CLR_RISC_INT);
 	} while (0);
-	spin_unlock_irq(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
@@ -1940,6 +1945,7 @@ int
 qla2x00_request_irqs(struct qla_hw_data *ha, struct rsp_que *rsp)
 {
 	int ret;
+	unsigned long	flags;
 	device_reg_t __iomem *reg = ha->iobase;
 
 	/* If possible, enable MSI-X. */
@@ -2007,7 +2013,7 @@ clear_risc_ints:
 	 */
 	if (IS_QLA81XX(ha))
 		goto fail;
-	spin_lock_irq(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	if (IS_FWI2_CAPABLE(ha)) {
 		WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_CLR_HOST_INT);
 		WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_CLR_RISC_INT);
@@ -2016,7 +2022,7 @@ clear_risc_ints:
 		WRT_REG_WORD(&reg->isp.hccr, HCCR_CLR_RISC_INT);
 		WRT_REG_WORD(&reg->isp.hccr, HCCR_CLR_HOST_INT);
 	}
-	spin_unlock_irq(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 fail:
 	return ret;
-- 
1.5.5


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 18:42 [PATCH] qla2xxx: Resolved a performance issue in interrupt handler Anirban Chakraborty
@ 2009-06-09 19:32 ` James Bottomley
  2009-06-09 20:34   ` Anirban Chakraborty
  0 siblings, 1 reply; 24+ messages in thread
From: James Bottomley @ 2009-06-09 19:32 UTC (permalink / raw)
  To: Anirban Chakraborty; +Cc: linux-scsi, douglas.w.styner

On Tue, 2009-06-09 at 11:42 -0700, Anirban Chakraborty wrote:
> Reverted back a change from spin_lock to spin_lock_irq* that was causing
> the cycle times to go up.
> The patch is based on scsi-misc-2.6.

Some more explanation of this would be greatly appreciated.  The cause
looks to be that holding off interrupts in the isr could potentially
reduce latency (caused by taking a different interrupt while holding the
hardware lock) and increase the chance of coalescing (by holding off
interrupts).  However, if that's the case, possibly using an
IRQF_DISABLED isr rather than going back to spin_lock_irqsave() would be
a better fix?

> Reported-and-checked-by: Douglas W. Styner <douglas.w.styner@intel.com>
> Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
> ---
>  drivers/scsi/qla2xxx/qla_isr.c |   30 ++++++++++++++++++------------
>  1 files changed, 18 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
> index c8d0a17..bc92feb 100644
> --- a/drivers/scsi/qla2xxx/qla_isr.c
> +++ b/drivers/scsi/qla2xxx/qla_isr.c
> @@ -37,6 +37,7 @@ qla2100_intr_handler(int irq, void *dev_id)
>  	uint16_t	hccr;
>  	uint16_t	mb[4];
>  	struct rsp_que *rsp;
> +	unsigned long	flags;
>  
>  	rsp = (struct rsp_que *) dev_id;
>  	if (!rsp) {
> @@ -49,7 +50,7 @@ qla2100_intr_handler(int irq, void *dev_id)
>  	reg = &ha->iobase->isp;
>  	status = 0;
>  
> -	spin_lock(&ha->hardware_lock);
> +	spin_lock_irqsave(&ha->hardware_lock, flags);
>  	vha = pci_get_drvdata(ha->pdev);
>  	for (iter = 50; iter--; ) {
>  		hccr = RD_REG_WORD(&reg->hccr);
> @@ -101,7 +102,7 @@ qla2100_intr_handler(int irq, void *dev_id)
>  			RD_REG_WORD(&reg->hccr);
>  		}
>  	}
> -	spin_unlock(&ha->hardware_lock);
> +	spin_unlock_irqrestore(&ha->hardware_lock, flags);
>  
>  	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
>  	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
> @@ -133,6 +134,7 @@ qla2300_intr_handler(int irq, void *dev_id)
>  	uint16_t	mb[4];
>  	struct rsp_que *rsp;
>  	struct qla_hw_data *ha;
> +	unsigned long	flags;
>  
>  	rsp = (struct rsp_que *) dev_id;
>  	if (!rsp) {
> @@ -145,7 +147,7 @@ qla2300_intr_handler(int irq, void *dev_id)
>  	reg = &ha->iobase->isp;
>  	status = 0;
>  
> -	spin_lock(&ha->hardware_lock);
> +	spin_lock_irqsave(&ha->hardware_lock, flags);
>  	vha = pci_get_drvdata(ha->pdev);
>  	for (iter = 50; iter--; ) {
>  		stat = RD_REG_DWORD(&reg->u.isp2300.host_status);
> @@ -216,7 +218,7 @@ qla2300_intr_handler(int irq, void *dev_id)
>  		WRT_REG_WORD(&reg->hccr, HCCR_CLR_RISC_INT);
>  		RD_REG_WORD_RELAXED(&reg->hccr);
>  	}
> -	spin_unlock(&ha->hardware_lock);
> +	spin_unlock_irqrestore(&ha->hardware_lock, flags);
>  
>  	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
>  	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
> @@ -1626,6 +1628,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
>  	uint32_t	hccr;
>  	uint16_t	mb[4];
>  	struct rsp_que *rsp;
> +	unsigned long	flags;
>  
>  	rsp = (struct rsp_que *) dev_id;
>  	if (!rsp) {
> @@ -1638,7 +1641,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
>  	reg = &ha->iobase->isp24;
>  	status = 0;
>  
> -	spin_lock(&ha->hardware_lock);
> +	spin_lock_irqsave(&ha->hardware_lock, flags);
>  	vha = pci_get_drvdata(ha->pdev);
>  	for (iter = 50; iter--; ) {
>  		stat = RD_REG_DWORD(&reg->host_status);
> @@ -1688,7 +1691,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
>  		WRT_REG_DWORD(&reg->hccr, HCCRX_CLR_RISC_INT);
>  		RD_REG_DWORD_RELAXED(&reg->hccr);
>  	}
> -	spin_unlock(&ha->hardware_lock);
> +	spin_unlock_irqrestore(&ha->hardware_lock, flags);
>  
>  	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
>  	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
> @@ -1706,6 +1709,7 @@ qla24xx_msix_rsp_q(int irq, void *dev_id)
>  	struct rsp_que *rsp;
>  	struct device_reg_24xx __iomem *reg;
>  	struct scsi_qla_host *vha;
> +	unsigned long	flags;
>  
>  	rsp = (struct rsp_que *) dev_id;
>  	if (!rsp) {
> @@ -1716,13 +1720,13 @@ qla24xx_msix_rsp_q(int irq, void *dev_id)
>  	ha = rsp->hw;
>  	reg = &ha->iobase->isp24;
>  
> -	spin_lock_irq(&ha->hardware_lock);
> +	spin_lock_irqsave(&ha->hardware_lock, flags);

This doesn't make a lot of sense.  I can see in the ordinary isr above,
you were using spin_lock not spin_lock_irq, so you were going into the
spin with other interrupts enabled.  However, in the MSIX case, the net
effect of the code (since we enter with interrupts enabled) is nil plus
the small latency saving and restoring the flags.

I'm assuming Intel did the tests on a MSIX system, so I have a hard time
understanding how this could in any way fix their problem.

James



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 19:32 ` James Bottomley
@ 2009-06-09 20:34   ` Anirban Chakraborty
  2009-06-09 20:40     ` James Bottomley
  0 siblings, 1 reply; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-09 20:34 UTC (permalink / raw)
  To: James Bottomley; +Cc: linux-scsi@vger.kernel.org, douglas.w.styner@intel.com


On Jun 9, 2009, at 12:32 PM, James Bottomley wrote:

> On Tue, 2009-06-09 at 11:42 -0700, Anirban Chakraborty wrote:
>> Reverted back a change from spin_lock to spin_lock_irq* that was  
>> causing
>> the cycle times to go up.
>> The patch is based on scsi-misc-2.6.
>
> Some more explanation of this would be greatly appreciated.  The cause
> looks to be that holding off interrupts in the isr could potentially
> reduce latency (caused by taking a different interrupt while holding  
> the
> hardware lock) and increase the chance of coalescing (by holding off
> interrupts).  However, if that's the case, possibly using an
> IRQF_DISABLED isr rather than going back to spin_lock_irqsave()  
> would be
> a better fix?
The primary cause is, as you mentioned, the contention for the  
hardware lock in the isr.
I'll check it with IRQF_DISABLED too. However, I was wondering if  
there is any additional savings if we use IRQF_DISABLED vs. using the  
spin_lock_irqsave. In the former case, probably we'd enter the isr  
with interrupts disabled and in the latter case it would be done upon  
entering the isr.

>
>
>> Reported-and-checked-by: Douglas W. Styner <douglas.w.styner@intel.com 
>> >
>> Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
>> ---
>> drivers/scsi/qla2xxx/qla_isr.c |   30 ++++++++++++++++++------------
>> 1 files changed, 18 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/ 
>> qla_isr.c
>> index c8d0a17..bc92feb 100644
>> --- a/drivers/scsi/qla2xxx/qla_isr.c
>> +++ b/drivers/scsi/qla2xxx/qla_isr.c
>> @@ -37,6 +37,7 @@ qla2100_intr_handler(int irq, void *dev_id)
>> 	uint16_t	hccr;
>> 	uint16_t	mb[4];
>> 	struct rsp_que *rsp;
>> +	unsigned long	flags;
>>
>> 	rsp = (struct rsp_que *) dev_id;
>> 	if (!rsp) {
>> @@ -49,7 +50,7 @@ qla2100_intr_handler(int irq, void *dev_id)
>> 	reg = &ha->iobase->isp;
>> 	status = 0;
>>
>> -	spin_lock(&ha->hardware_lock);
>> +	spin_lock_irqsave(&ha->hardware_lock, flags);
>> 	vha = pci_get_drvdata(ha->pdev);
>> 	for (iter = 50; iter--; ) {
>> 		hccr = RD_REG_WORD(&reg->hccr);
>> @@ -101,7 +102,7 @@ qla2100_intr_handler(int irq, void *dev_id)
>> 			RD_REG_WORD(&reg->hccr);
>> 		}
>> 	}
>> -	spin_unlock(&ha->hardware_lock);
>> +	spin_unlock_irqrestore(&ha->hardware_lock, flags);
>>
>> 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
>> 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
>> @@ -133,6 +134,7 @@ qla2300_intr_handler(int irq, void *dev_id)
>> 	uint16_t	mb[4];
>> 	struct rsp_que *rsp;
>> 	struct qla_hw_data *ha;
>> +	unsigned long	flags;
>>
>> 	rsp = (struct rsp_que *) dev_id;
>> 	if (!rsp) {
>> @@ -145,7 +147,7 @@ qla2300_intr_handler(int irq, void *dev_id)
>> 	reg = &ha->iobase->isp;
>> 	status = 0;
>>
>> -	spin_lock(&ha->hardware_lock);
>> +	spin_lock_irqsave(&ha->hardware_lock, flags);
>> 	vha = pci_get_drvdata(ha->pdev);
>> 	for (iter = 50; iter--; ) {
>> 		stat = RD_REG_DWORD(&reg->u.isp2300.host_status);
>> @@ -216,7 +218,7 @@ qla2300_intr_handler(int irq, void *dev_id)
>> 		WRT_REG_WORD(&reg->hccr, HCCR_CLR_RISC_INT);
>> 		RD_REG_WORD_RELAXED(&reg->hccr);
>> 	}
>> -	spin_unlock(&ha->hardware_lock);
>> +	spin_unlock_irqrestore(&ha->hardware_lock, flags);
>>
>> 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
>> 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
>> @@ -1626,6 +1628,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
>> 	uint32_t	hccr;
>> 	uint16_t	mb[4];
>> 	struct rsp_que *rsp;
>> +	unsigned long	flags;
>>
>> 	rsp = (struct rsp_que *) dev_id;
>> 	if (!rsp) {
>> @@ -1638,7 +1641,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
>> 	reg = &ha->iobase->isp24;
>> 	status = 0;
>>
>> -	spin_lock(&ha->hardware_lock);
>> +	spin_lock_irqsave(&ha->hardware_lock, flags);
>> 	vha = pci_get_drvdata(ha->pdev);
>> 	for (iter = 50; iter--; ) {
>> 		stat = RD_REG_DWORD(&reg->host_status);
>> @@ -1688,7 +1691,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
>> 		WRT_REG_DWORD(&reg->hccr, HCCRX_CLR_RISC_INT);
>> 		RD_REG_DWORD_RELAXED(&reg->hccr);
>> 	}
>> -	spin_unlock(&ha->hardware_lock);
>> +	spin_unlock_irqrestore(&ha->hardware_lock, flags);
>>
>> 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
>> 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
>> @@ -1706,6 +1709,7 @@ qla24xx_msix_rsp_q(int irq, void *dev_id)
>> 	struct rsp_que *rsp;
>> 	struct device_reg_24xx __iomem *reg;
>> 	struct scsi_qla_host *vha;
>> +	unsigned long	flags;
>>
>> 	rsp = (struct rsp_que *) dev_id;
>> 	if (!rsp) {
>> @@ -1716,13 +1720,13 @@ qla24xx_msix_rsp_q(int irq, void *dev_id)
>> 	ha = rsp->hw;
>> 	reg = &ha->iobase->isp24;
>>
>> -	spin_lock_irq(&ha->hardware_lock);
>> +	spin_lock_irqsave(&ha->hardware_lock, flags);
>
> This doesn't make a lot of sense.  I can see in the ordinary isr  
> above,
> you were using spin_lock not spin_lock_irq, so you were going into the
> spin with other interrupts enabled.  However, in the MSIX case, the  
> net
> effect of the code (since we enter with interrupts enabled) is nil  
> plus
> the small latency saving and restoring the flags.
>
> I'm assuming Intel did the tests on a MSIX system, so I have a hard  
> time
> understanding how this could in any way fix their problem.
>
> James
>
They did their tests on a system with MSI enabled, for which the  
qla24xx_intr_handler gets invoked. So, the qla24xx_msix_rsp_q never  
got tested in that case. However, I do agree that spin_lock_irqsave is  
not needed in the MSI-X case as you mentioned that the interrupts are  
already disabled here.
Thanks,
-Anirban


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 20:34   ` Anirban Chakraborty
@ 2009-06-09 20:40     ` James Bottomley
  2009-06-09 21:28       ` Giridhar Malavali
  2009-06-09 21:40       ` Anirban Chakraborty
  0 siblings, 2 replies; 24+ messages in thread
From: James Bottomley @ 2009-06-09 20:40 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: linux-scsi@vger.kernel.org, douglas.w.styner@intel.com

On Tue, 2009-06-09 at 13:34 -0700, Anirban Chakraborty wrote:
> On Jun 9, 2009, at 12:32 PM, James Bottomley wrote:
> 
> > On Tue, 2009-06-09 at 11:42 -0700, Anirban Chakraborty wrote:
> >> Reverted back a change from spin_lock to spin_lock_irq* that was  
> >> causing
> >> the cycle times to go up.
> >> The patch is based on scsi-misc-2.6.
> >
> > Some more explanation of this would be greatly appreciated.  The cause
> > looks to be that holding off interrupts in the isr could potentially
> > reduce latency (caused by taking a different interrupt while holding  
> > the
> > hardware lock) and increase the chance of coalescing (by holding off
> > interrupts).  However, if that's the case, possibly using an
> > IRQF_DISABLED isr rather than going back to spin_lock_irqsave()  
> > would be
> > a better fix?
> The primary cause is, as you mentioned, the contention for the  
> hardware lock in the isr.
> I'll check it with IRQF_DISABLED too. However, I was wondering if  
> there is any additional savings if we use IRQF_DISABLED vs. using the  
> spin_lock_irqsave. In the former case, probably we'd enter the isr  
> with interrupts disabled and in the latter case it would be done upon  
> entering the isr.

It depends what the root cause is ... if it's really latency introduced
by other interrupts, then IRQF_DISABLED might be the better course.  If
it's purely interrupt problems in the spin locked critical sections,
then spin_lock_irq might be the better solution ... what would be useful
is to have the test rig at intel which turned up the problem see what
happens to the results for each case.

James



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 20:40     ` James Bottomley
@ 2009-06-09 21:28       ` Giridhar Malavali
  2009-06-09 21:41         ` Matthew Wilcox
  2009-06-09 21:46         ` James Bottomley
  2009-06-09 21:40       ` Anirban Chakraborty
  1 sibling, 2 replies; 24+ messages in thread
From: Giridhar Malavali @ 2009-06-09 21:28 UTC (permalink / raw)
  To: Anirban Chakraborty; +Cc: linux-scsi, douglas.w.styner, James Bottomley


On Jun 9, 2009, at 1:40 PM, James Bottomley wrote:

> On Tue, 2009-06-09 at 13:34 -0700, Anirban Chakraborty wrote:
>> On Jun 9, 2009, at 12:32 PM, James Bottomley wrote:
>>
>>> On Tue, 2009-06-09 at 11:42 -0700, Anirban Chakraborty wrote:
>>>> Reverted back a change from spin_lock to spin_lock_irq* that was
>>>> causing
>>>> the cycle times to go up.
>>>> The patch is based on scsi-misc-2.6.
>>>
>>> Some more explanation of this would be greatly appreciated.  The  
>>> cause
>>> looks to be that holding off interrupts in the isr could potentially
>>> reduce latency (caused by taking a different interrupt while holding
>>> the
>>> hardware lock) and increase the chance of coalescing (by holding off
>>> interrupts).  However, if that's the case, possibly using an
>>> IRQF_DISABLED isr rather than going back to spin_lock_irqsave()
>>> would be
>>> a better fix?
>> The primary cause is, as you mentioned, the contention for the
>> hardware lock in the isr.
>> I'll check it with IRQF_DISABLED too. However, I was wondering if
>> there is any additional savings if we use IRQF_DISABLED vs. using the
>> spin_lock_irqsave. In the former case, probably we'd enter the isr
>> with interrupts disabled and in the latter case it would be done upon
>> entering the isr.
>
> It depends what the root cause is ... if it's really latency  
> introduced
> by other interrupts, then IRQF_DISABLED might be the better course.   
> If
> it's purely interrupt problems in the spin locked critical sections,
> then spin_lock_irq might be the better solution ... what would be  
> useful
> is to have the test rig at intel which turned up the problem see what
> happens to the results for each case.
>
> James
>
>
Earlier, I have seen that when IRQ's are shared across multiple  
controllers and if the first one to register (among shared  
controllers) does not disable the IRQ with IRQF_DISABLED flag,then  
irrespective of the IRQ registration from other controllers, the IRQ  
will be enabled by default. With this behavior and qla2xxx sharing the  
IRQ, just disabling the IRQ may not be sufficient.

Giridhar.M.B



> --
> To unsubscribe from this list: send the line "unsubscribe linux- 
> scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 21:28       ` Giridhar Malavali
@ 2009-06-09 21:41         ` Matthew Wilcox
  2009-06-09 21:46         ` James Bottomley
  1 sibling, 0 replies; 24+ messages in thread
From: Matthew Wilcox @ 2009-06-09 21:41 UTC (permalink / raw)
  To: Giridhar Malavali
  Cc: Anirban Chakraborty, linux-scsi, douglas.w.styner,
	James Bottomley, Peter Zijlstra

On Tue, Jun 09, 2009 at 02:28:01PM -0700, Giridhar Malavali wrote:
> On Jun 9, 2009, at 1:40 PM, James Bottomley wrote:
>> It depends what the root cause is ... if it's really latency  
>> introduced
>> by other interrupts, then IRQF_DISABLED might be the better course.   
>> If
>> it's purely interrupt problems in the spin locked critical sections,
>> then spin_lock_irq might be the better solution ... what would be  
>> useful
>> is to have the test rig at intel which turned up the problem see what
>> happens to the results for each case.
>
> Earlier, I have seen that when IRQ's are shared across multiple  
> controllers and if the first one to register (among shared controllers) 
> does not disable the IRQ with IRQF_DISABLED flag,then irrespective of the 
> IRQ registration from other controllers, the IRQ will be enabled by 
> default. With this behavior and qla2xxx sharing the IRQ, just disabling 
> the IRQ may not be sufficient.

But MSI interrupts are never shared.  Also, almost every driver *should*
be using IRQF_DISABLED these days (there was a proposal to eliminate
IRQF_DISABLED and force every driver to explicitly re-enable interrupts
if it needed them on back in March).

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 21:28       ` Giridhar Malavali
  2009-06-09 21:41         ` Matthew Wilcox
@ 2009-06-09 21:46         ` James Bottomley
  1 sibling, 0 replies; 24+ messages in thread
From: James Bottomley @ 2009-06-09 21:46 UTC (permalink / raw)
  To: Giridhar Malavali; +Cc: Anirban Chakraborty, linux-scsi, douglas.w.styner

On Tue, 2009-06-09 at 14:28 -0700, Giridhar Malavali wrote:
> On Jun 9, 2009, at 1:40 PM, James Bottomley wrote:
> 
> > On Tue, 2009-06-09 at 13:34 -0700, Anirban Chakraborty wrote:
> >> On Jun 9, 2009, at 12:32 PM, James Bottomley wrote:
> >>
> >>> On Tue, 2009-06-09 at 11:42 -0700, Anirban Chakraborty wrote:
> >>>> Reverted back a change from spin_lock to spin_lock_irq* that was
> >>>> causing
> >>>> the cycle times to go up.
> >>>> The patch is based on scsi-misc-2.6.
> >>>
> >>> Some more explanation of this would be greatly appreciated.  The  
> >>> cause
> >>> looks to be that holding off interrupts in the isr could potentially
> >>> reduce latency (caused by taking a different interrupt while holding
> >>> the
> >>> hardware lock) and increase the chance of coalescing (by holding off
> >>> interrupts).  However, if that's the case, possibly using an
> >>> IRQF_DISABLED isr rather than going back to spin_lock_irqsave()
> >>> would be
> >>> a better fix?
> >> The primary cause is, as you mentioned, the contention for the
> >> hardware lock in the isr.
> >> I'll check it with IRQF_DISABLED too. However, I was wondering if
> >> there is any additional savings if we use IRQF_DISABLED vs. using the
> >> spin_lock_irqsave. In the former case, probably we'd enter the isr
> >> with interrupts disabled and in the latter case it would be done upon
> >> entering the isr.
> >
> > It depends what the root cause is ... if it's really latency  
> > introduced
> > by other interrupts, then IRQF_DISABLED might be the better course.   
> > If
> > it's purely interrupt problems in the spin locked critical sections,
> > then spin_lock_irq might be the better solution ... what would be  
> > useful
> > is to have the test rig at intel which turned up the problem see what
> > happens to the results for each case.
> >
> > James
> >
> >
> Earlier, I have seen that when IRQ's are shared across multiple  
> controllers and if the first one to register (among shared  
> controllers) does not disable the IRQ with IRQF_DISABLED flag,then  
> irrespective of the IRQ registration from other controllers, the IRQ  
> will be enabled by default. With this behavior and qla2xxx sharing the  
> IRQ, just disabling the IRQ may not be sufficient.

Well, they're MSI interrupts, so unshared.  However, an equivalent trick
is just to do local_irq_disable() at the top of the interrupt routine.
There was a thread about making IRQF_DISABLED the default, which is what
made me think about it.

Anyway, I think the point is it would be interesting to see if we can
improve performance by holding off interrupts for all of the qla isr ...
if this is validated, we can move on to implementation discussions.

James



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 20:40     ` James Bottomley
  2009-06-09 21:28       ` Giridhar Malavali
@ 2009-06-09 21:40       ` Anirban Chakraborty
  2009-06-09 21:43         ` Matthew Wilcox
       [not found]         ` <D7C42C27E6CB1E4D8CBDF2F81EA92A26035402EBA0@azsmsx501.amr.corp.intel.com>
  1 sibling, 2 replies; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-09 21:40 UTC (permalink / raw)
  To: James Bottomley
  Cc: Anirban Chakraborty, linux-scsi@vger.kernel.org,
	douglas.w.styner@intel.com



On Tue, 9 Jun 2009, James Bottomley wrote:

> On Tue, 2009-06-09 at 13:34 -0700, Anirban Chakraborty wrote:
> > On Jun 9, 2009, at 12:32 PM, James Bottomley wrote:
> > 
> > > On Tue, 2009-06-09 at 11:42 -0700, Anirban Chakraborty wrote:
> > >> Reverted back a change from spin_lock to spin_lock_irq* that was  
> > >> causing
> > >> the cycle times to go up.
> > >> The patch is based on scsi-misc-2.6.
> > >
> > > Some more explanation of this would be greatly appreciated.  The cause
> > > looks to be that holding off interrupts in the isr could potentially
> > > reduce latency (caused by taking a different interrupt while holding  
> > > the
> > > hardware lock) and increase the chance of coalescing (by holding off
> > > interrupts).  However, if that's the case, possibly using an
> > > IRQF_DISABLED isr rather than going back to spin_lock_irqsave()  
> > > would be
> > > a better fix?
> > The primary cause is, as you mentioned, the contention for the  
> > hardware lock in the isr.
> > I'll check it with IRQF_DISABLED too. However, I was wondering if  
> > there is any additional savings if we use IRQF_DISABLED vs. using the  
> > spin_lock_irqsave. In the former case, probably we'd enter the isr  
> > with interrupts disabled and in the latter case it would be done upon  
> > entering the isr.
> 
> It depends what the root cause is ... if it's really latency introduced
> by other interrupts, then IRQF_DISABLED might be the better course.  If
> it's purely interrupt problems in the spin locked critical sections,
> then spin_lock_irq might be the better solution ... what would be useful
> is to have the test rig at intel which turned up the problem see what
> happens to the results for each case.
> 
> James

I have attached the patch with IRQF_DISABLED. In my setup, I didn't find any 
significant difference in performance numbers between IRQF_DISABLED and 
spin_lock_irqsave. It would be interesting to see the numbers from Douglas's 
rig.
Doug, could you please run it once more with this patch and let us know the 
numbers vs. the one you already tested with spin_lock_irqsave earlier.
Thanks much,
Anirban

diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index c8d0a17..d07bd97 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -1991,7 +1991,7 @@ skip_msix:
 skip_msi:
 
 	ret = request_irq(ha->pdev->irq, ha->isp_ops->intr_handler,
-	    IRQF_SHARED, QLA2XXX_DRIVER_NAME, rsp);
+	    IRQF_DISABLED, QLA2XXX_DRIVER_NAME, rsp);
 	if (ret) {
 		qla_printk(KERN_WARNING, ha,
 		    "Failed to reserve interrupt %d already in use.\n",

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 21:40       ` Anirban Chakraborty
@ 2009-06-09 21:43         ` Matthew Wilcox
  2009-06-09 21:46           ` Anirban Chakraborty
       [not found]         ` <D7C42C27E6CB1E4D8CBDF2F81EA92A26035402EBA0@azsmsx501.amr.corp.intel.com>
  1 sibling, 1 reply; 24+ messages in thread
From: Matthew Wilcox @ 2009-06-09 21:43 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: James Bottomley, linux-scsi@vger.kernel.org,
	douglas.w.styner@intel.com

On Tue, Jun 09, 2009 at 02:40:44PM -0700, Anirban Chakraborty wrote:
> On Tue, 9 Jun 2009, James Bottomley wrote:
> > It depends what the root cause is ... if it's really latency introduced
> > by other interrupts, then IRQF_DISABLED might be the better course.  If
> > it's purely interrupt problems in the spin locked critical sections,
> > then spin_lock_irq might be the better solution ... what would be useful
> > is to have the test rig at intel which turned up the problem see what
> > happens to the results for each case.
> 
> I have attached the patch with IRQF_DISABLED. In my setup, I didn't find any 
> significant difference in performance numbers between IRQF_DISABLED and 
> spin_lock_irqsave. It would be interesting to see the numbers from Douglas's 
> rig.
> Doug, could you please run it once more with this patch and let us know the 
> numbers vs. the one you already tested with spin_lock_irqsave earlier.
> Thanks much,
> Anirban
> 
> diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
> index c8d0a17..d07bd97 100644
> --- a/drivers/scsi/qla2xxx/qla_isr.c
> +++ b/drivers/scsi/qla2xxx/qla_isr.c
> @@ -1991,7 +1991,7 @@ skip_msix:
>  skip_msi:
>  
>  	ret = request_irq(ha->pdev->irq, ha->isp_ops->intr_handler,
> -	    IRQF_SHARED, QLA2XXX_DRIVER_NAME, rsp);
> +	    IRQF_DISABLED, QLA2XXX_DRIVER_NAME, rsp);
>  	if (ret) {
>  		qla_printk(KERN_WARNING, ha,
>  		    "Failed to reserve interrupt %d already in use.\n",

While I don't think Doug has MSI-X adapters, a complete patch would change
all request_irq() calls to use IRQF_DISABLED, right?

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 21:43         ` Matthew Wilcox
@ 2009-06-09 21:46           ` Anirban Chakraborty
  2009-06-09 21:47             ` Matthew Wilcox
  0 siblings, 1 reply; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-09 21:46 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: James Bottomley, linux-scsi@vger.kernel.org,
	douglas.w.styner@intel.com


On Jun 9, 2009, at 2:43 PM, Matthew Wilcox wrote:

> On Tue, Jun 09, 2009 at 02:40:44PM -0700, Anirban Chakraborty wrote:
>> On Tue, 9 Jun 2009, James Bottomley wrote:
>>> It depends what the root cause is ... if it's really latency  
>>> introduced
>>> by other interrupts, then IRQF_DISABLED might be the better  
>>> course.  If
>>> it's purely interrupt problems in the spin locked critical sections,
>>> then spin_lock_irq might be the better solution ... what would be  
>>> useful
>>> is to have the test rig at intel which turned up the problem see  
>>> what
>>> happens to the results for each case.
>>
>> I have attached the patch with IRQF_DISABLED. In my setup, I didn't  
>> find any
>> significant difference in performance numbers between IRQF_DISABLED  
>> and
>> spin_lock_irqsave. It would be interesting to see the numbers from  
>> Douglas's
>> rig.
>> Doug, could you please run it once more with this patch and let us  
>> know the
>> numbers vs. the one you already tested with spin_lock_irqsave  
>> earlier.
>> Thanks much,
>> Anirban
>>
>> diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/ 
>> qla_isr.c
>> index c8d0a17..d07bd97 100644
>> --- a/drivers/scsi/qla2xxx/qla_isr.c
>> +++ b/drivers/scsi/qla2xxx/qla_isr.c
>> @@ -1991,7 +1991,7 @@ skip_msix:
>> skip_msi:
>>
>> 	ret = request_irq(ha->pdev->irq, ha->isp_ops->intr_handler,
>> -	    IRQF_SHARED, QLA2XXX_DRIVER_NAME, rsp);
>> +	    IRQF_DISABLED, QLA2XXX_DRIVER_NAME, rsp);
>> 	if (ret) {
>> 		qla_printk(KERN_WARNING, ha,
>> 		    "Failed to reserve interrupt %d already in use.\n",
>
> While I don't think Doug has MSI-X adapters, a complete patch would  
> change
> all request_irq() calls to use IRQF_DISABLED, right?
>
> --

The above patch takes care of replacing all the IRQF_SHARED flags in  
irq registrations with IRQF_DISABLED.
Thanks,
Anirban


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 21:46           ` Anirban Chakraborty
@ 2009-06-09 21:47             ` Matthew Wilcox
  2009-06-09 22:30               ` Anirban Chakraborty
  0 siblings, 1 reply; 24+ messages in thread
From: Matthew Wilcox @ 2009-06-09 21:47 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: James Bottomley, linux-scsi@vger.kernel.org,
	douglas.w.styner@intel.com

On Tue, Jun 09, 2009 at 02:46:12PM -0700, Anirban Chakraborty wrote:
>
> On Jun 9, 2009, at 2:43 PM, Matthew Wilcox wrote:
>> While I don't think Doug has MSI-X adapters, a complete patch would  
>> change
>> all request_irq() calls to use IRQF_DISABLED, right?
>
> The above patch takes care of replacing all the IRQF_SHARED flags in irq 
> registrations with IRQF_DISABLED.

Yes ... but even for the MSI-X case, you want the handler to be run with
interrupts disabled, don't you?  If not, why not?

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-09 21:47             ` Matthew Wilcox
@ 2009-06-09 22:30               ` Anirban Chakraborty
  0 siblings, 0 replies; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-09 22:30 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: James Bottomley, linux-scsi@vger.kernel.org,
	douglas.w.styner@intel.com


On Jun 9, 2009, at 2:47 PM, Matthew Wilcox wrote:

> On Tue, Jun 09, 2009 at 02:46:12PM -0700, Anirban Chakraborty wrote:
>>
>> On Jun 9, 2009, at 2:43 PM, Matthew Wilcox wrote:
>>> While I don't think Doug has MSI-X adapters, a complete patch would
>>> change
>>> all request_irq() calls to use IRQF_DISABLED, right?
>>
>> The above patch takes care of replacing all the IRQF_SHARED flags  
>> in irq
>> registrations with IRQF_DISABLED.
>
> Yes ... but even for the MSI-X case, you want the handler to be run  
> with
> interrupts disabled, don't you?  If not, why not?

Yeah you're right. For some reason, request_irq for MSI-X passes 0 as  
the irq flag. We need to take care of that.
Thanks,
Anirban


^ permalink raw reply	[flat|nested] 24+ messages in thread

[parent not found: <D7C42C27E6CB1E4D8CBDF2F81EA92A26035402EBA0@azsmsx501.amr.corp.intel.com>]

[parent not found: <2796FDDF-31EA-44E2-8856-84A22F31A01F@qlogic.com>]

* RE: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
       [not found]           ` <2796FDDF-31EA-44E2-8856-84A22F31A01F@qlogic.com>
@ 2009-06-10 15:52             ` Styner, Douglas W
  2009-06-10 16:09               ` Anirban Chakraborty
  0 siblings, 1 reply; 24+ messages in thread
From: Styner, Douglas W @ 2009-06-10 15:52 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: James.Bottomley@HansenPartnership.com, linux-scsi@vger.kernel.org,
	Tripathi, Sharad C, Ma, Chinang, Prickett, Terry O,
	Wilcox, Matthew R


Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] write:
>
>You got it right. Basically we are doing:
>1. baseline scsi-misc + earlier patch
>2. baseline scsi-misc + this patch
>And then compare 1 and 2.

Results are as follows:
Total interrupts are down slightly (-0.8%), QLA interrupts are flat (-0.1%) Cycles for qla2xxx_intr_handler up from .77% to .92%.

Linux OLTP Performance summary
Kernel#                   Speedup(x)   Intr/s  CtxSw/s us%   sy%   id%  wa%
2.6.30-rc6_scsi-misc_0001      1.000   29538   43374   75    25     0    0
2.6.30-rc6_scsi-misc_irqf-d    1.000   29286   43125   74    25     0    1

Server configurations:
Intel Xeon Quad-core 2.0GHz  2 cpus/8 cores/8 threads
64GB memory, 3 qle2462 FC HBA, 450 spindles (30 logical units)


======oprofile CPU_CLK_UNHALTED for top 30 functions
Cycles% 2.6.30-rc6_scsi-misc_0001   Cycles% 2.6.30-rc6_scsi-misc_irqf-d
68.1402 <database>                  66.1875 <database>
0.9125 qla24xx_start_scsi            0.9552 qla24xx_start_scsi
0.8729 __schedule                    0.9272 __schedule
0.7739 qla24xx_intr_handler          0.9178 qla24xx_intr_handler
0.7161 kmem_cache_alloc              0.7963 kmem_cache_alloc
0.4686 __sigsetjmp                   0.4580 __sigsetjmp
0.4043 __blockdev_direct_IO          0.4206 __switch_to
0.3894 __switch_to                   0.4169 __blockdev_direct_IO
0.3878 rb_get_reader_page            0.3907 scsi_request_fn
0.3762 task_rq_lock                  0.3776 task_rq_lock
0.3729 scsi_request_fn               0.3701 rb_get_reader_page
0.3696 __list_add                    0.3664 __list_add
0.3168 <bash>                        0.3664 copy_user_generic_string
0.3135 try_to_wake_up                0.3122 lock_timer_base
0.2921 ring_buffer_consume           0.2972 ring_buffer_consume
0.2822 lock_timer_base               0.2879 <bash>
0.2624 memset_c                      0.2785 aio_complete
0.2607 aio_complete                  0.2524 blk_queue_end_tag
0.2508 copy_user_generic_string      0.2468 try_to_wake_up
0.2294 qla2x00_process_completed_re  0.2430 tcp_sendmsg
0.2294 tcp_sendmsg                   0.2393 mod_timer
0.2211 kfree                         0.2374 qla2x00_process_completed_re
0.2195 memmove                       0.2355 memset_c
0.2195 blk_queue_end_tag             0.2225 e1000_xmit_frame
0.2145 scsi_softirq_done             0.2094 generic_make_request
0.2145 mod_timer                     0.2075 scsi_device_unbusy
0.2129 generic_file_aio_read         0.2056 kref_get
0.2096 kref_get                      0.2056 mempool_free
0.1997 e1000_xmit_frame              0.2038 qla2xxx_queuecommand
0.1997 scsi_device_unbusy            0.2000 kfree

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-10 15:52             ` Styner, Douglas W
@ 2009-06-10 16:09               ` Anirban Chakraborty
  2009-06-10 16:17                 ` James Bottomley
  2009-06-10 16:18                 ` Matthew Wilcox
  0 siblings, 2 replies; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-10 16:09 UTC (permalink / raw)
  To: Styner, Douglas W
  Cc: James.Bottomley@HansenPartnership.com, linux-scsi@vger.kernel.org,
	Tripathi, Sharad C, Ma, Chinang, Prickett, Terry O,
	Wilcox, Matthew R


On Jun 10, 2009, at 8:52 AM, Styner, Douglas W wrote:

>
> Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] write:
>>
>> You got it right. Basically we are doing:
>> 1. baseline scsi-misc + earlier patch
>> 2. baseline scsi-misc + this patch
>> And then compare 1 and 2.
>
> Results are as follows:
> Total interrupts are down slightly (-0.8%), QLA interrupts are flat  
> (-0.1%) Cycles for qla2xxx_intr_handler up from .77% to .92%.

So, it looks like using spin_lock_irqsave is doing a better job than  
globally disabling the sharing of the interrupt via IRQF_DISABLED.
-Anirban

>
>
> Linux OLTP Performance summary
> Kernel#                   Speedup(x)   Intr/s  CtxSw/s us%   sy%   id 
> %  wa%
> 2.6.30-rc6_scsi-misc_0001      1.000   29538   43374   75    25      
> 0    0
> 2.6.30-rc6_scsi-misc_irqf-d    1.000   29286   43125   74    25      
> 0    1
>
> Server configurations:
> Intel Xeon Quad-core 2.0GHz  2 cpus/8 cores/8 threads
> 64GB memory, 3 qle2462 FC HBA, 450 spindles (30 logical units)
>
>
> ======oprofile CPU_CLK_UNHALTED for top 30 functions
> Cycles% 2.6.30-rc6_scsi-misc_0001   Cycles% 2.6.30-rc6_scsi- 
> misc_irqf-d
> 68.1402 <database>                  66.1875 <database>
> 0.9125 qla24xx_start_scsi            0.9552 qla24xx_start_scsi
> 0.8729 __schedule                    0.9272 __schedule
> 0.7739 qla24xx_intr_handler          0.9178 qla24xx_intr_handler
> 0.7161 kmem_cache_alloc              0.7963 kmem_cache_alloc
> 0.4686 __sigsetjmp                   0.4580 __sigsetjmp
> 0.4043 __blockdev_direct_IO          0.4206 __switch_to
> 0.3894 __switch_to                   0.4169 __blockdev_direct_IO
> 0.3878 rb_get_reader_page            0.3907 scsi_request_fn
> 0.3762 task_rq_lock                  0.3776 task_rq_lock
> 0.3729 scsi_request_fn               0.3701 rb_get_reader_page
> 0.3696 __list_add                    0.3664 __list_add
> 0.3168 <bash>                        0.3664 copy_user_generic_string
> 0.3135 try_to_wake_up                0.3122 lock_timer_base
> 0.2921 ring_buffer_consume           0.2972 ring_buffer_consume
> 0.2822 lock_timer_base               0.2879 <bash>
> 0.2624 memset_c                      0.2785 aio_complete
> 0.2607 aio_complete                  0.2524 blk_queue_end_tag
> 0.2508 copy_user_generic_string      0.2468 try_to_wake_up
> 0.2294 qla2x00_process_completed_re  0.2430 tcp_sendmsg
> 0.2294 tcp_sendmsg                   0.2393 mod_timer
> 0.2211 kfree                         0.2374  
> qla2x00_process_completed_re
> 0.2195 memmove                       0.2355 memset_c
> 0.2195 blk_queue_end_tag             0.2225 e1000_xmit_frame
> 0.2145 scsi_softirq_done             0.2094 generic_make_request
> 0.2145 mod_timer                     0.2075 scsi_device_unbusy
> 0.2129 generic_file_aio_read         0.2056 kref_get
> 0.2096 kref_get                      0.2056 mempool_free
> 0.1997 e1000_xmit_frame              0.2038 qla2xxx_queuecommand
> 0.1997 scsi_device_unbusy            0.2000 kfree


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-10 16:09               ` Anirban Chakraborty
@ 2009-06-10 16:17                 ` James Bottomley
  2009-06-10 16:18                 ` Matthew Wilcox
  1 sibling, 0 replies; 24+ messages in thread
From: James Bottomley @ 2009-06-10 16:17 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: Styner, Douglas W, linux-scsi@vger.kernel.org, Tripathi, Sharad C,
	Ma, Chinang, Prickett, Terry O, Wilcox, Matthew R

On Wed, 2009-06-10 at 09:09 -0700, Anirban Chakraborty wrote:
> On Jun 10, 2009, at 8:52 AM, Styner, Douglas W wrote:
> 
> >
> > Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] write:
> >>
> >> You got it right. Basically we are doing:
> >> 1. baseline scsi-misc + earlier patch
> >> 2. baseline scsi-misc + this patch
> >> And then compare 1 and 2.
> >
> > Results are as follows:
> > Total interrupts are down slightly (-0.8%), QLA interrupts are flat  
> > (-0.1%) Cycles for qla2xxx_intr_handler up from .77% to .92%.
> 
> So, it looks like using spin_lock_irqsave is doing a better job than  
> globally disabling the sharing of the interrupt via IRQF_DISABLED.

OK, so just drop the msix changes and resubmit ... it'll have to go in
SCSI misc, but once the merge window is over, you can request a backport
to stable.

James



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-10 16:09               ` Anirban Chakraborty
  2009-06-10 16:17                 ` James Bottomley
@ 2009-06-10 16:18                 ` Matthew Wilcox
  2009-06-10 18:29                   ` Anirban Chakraborty
  2009-06-10 18:32                   ` [PATCH] qla2xxx: Resolved a performance issue in interrupt Andrew Vasquez
  1 sibling, 2 replies; 24+ messages in thread
From: Matthew Wilcox @ 2009-06-10 16:18 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: Styner, Douglas W, James.Bottomley@HansenPartnership.com,
	linux-scsi@vger.kernel.org, Tripathi, Sharad C, Ma, Chinang,
	Prickett, Terry O, Wilcox, Matthew R

On Wed, Jun 10, 2009 at 09:09:14AM -0700, Anirban Chakraborty wrote:
>
> On Jun 10, 2009, at 8:52 AM, Styner, Douglas W wrote:
>
>>
>> Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] write:
>>>
>>> You got it right. Basically we are doing:
>>> 1. baseline scsi-misc + earlier patch
>>> 2. baseline scsi-misc + this patch
>>> And then compare 1 and 2.
>>
>> Results are as follows:
>> Total interrupts are down slightly (-0.8%), QLA interrupts are flat  
>> (-0.1%) Cycles for qla2xxx_intr_handler up from .77% to .92%.
>
> So, it looks like using spin_lock_irqsave is doing a better job than  
> globally disabling the sharing of the interrupt via IRQF_DISABLED.

The margin of error for this setup is about 0.3% for the performance
as a whole.  I don't know what the margin of error is for the number
of cycles consumed by a given function, but I would suspect it's
significantly higher than the 0.15% difference seen.

>> ======oprofile CPU_CLK_UNHALTED for top 30 functions
>> Cycles% 2.6.30-rc6_scsi-misc_0001   Cycles% 2.6.30-rc6_scsi-misc_irqf-d
>> 68.1402 <database>                  66.1875 <database>
>> 0.9125 qla24xx_start_scsi            0.9552 qla24xx_start_scsi
>> 0.8729 __schedule                    0.9272 __schedule
>> 0.7739 qla24xx_intr_handler          0.9178 qla24xx_intr_handler
>> 0.7161 kmem_cache_alloc              0.7963 kmem_cache_alloc
>> 0.4686 __sigsetjmp                   0.4580 __sigsetjmp
>> 0.4043 __blockdev_direct_IO          0.4206 __switch_to
>> 0.3894 __switch_to                   0.4169 __blockdev_direct_IO
>> 0.3878 rb_get_reader_page            0.3907 scsi_request_fn
>> 0.3762 task_rq_lock                  0.3776 task_rq_lock
>> 0.3729 scsi_request_fn               0.3701 rb_get_reader_page
>> 0.3696 __list_add                    0.3664 __list_add
>> 0.3168 <bash>                        0.3664 copy_user_generic_string
>> 0.3135 try_to_wake_up                0.3122 lock_timer_base
>> 0.2921 ring_buffer_consume           0.2972 ring_buffer_consume
>> 0.2822 lock_timer_base               0.2879 <bash>
>> 0.2624 memset_c                      0.2785 aio_complete
>> 0.2607 aio_complete                  0.2524 blk_queue_end_tag
>> 0.2508 copy_user_generic_string      0.2468 try_to_wake_up
>> 0.2294 qla2x00_process_completed_re  0.2430 tcp_sendmsg
>> 0.2294 tcp_sendmsg                   0.2393 mod_timer
>> 0.2211 kfree                         0.2374 qla2x00_process_completed_re
>> 0.2195 memmove                       0.2355 memset_c
>> 0.2195 blk_queue_end_tag             0.2225 e1000_xmit_frame
>> 0.2145 scsi_softirq_done             0.2094 generic_make_request
>> 0.2145 mod_timer                     0.2075 scsi_device_unbusy
>> 0.2129 generic_file_aio_read         0.2056 kref_get
>> 0.2096 kref_get                      0.2056 mempool_free
>> 0.1997 e1000_xmit_frame              0.2038 qla2xxx_queuecommand
>> 0.1997 scsi_device_unbusy            0.2000 kfree
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt handler
  2009-06-10 16:18                 ` Matthew Wilcox
@ 2009-06-10 18:29                   ` Anirban Chakraborty
  2009-06-10 18:32                   ` [PATCH] qla2xxx: Resolved a performance issue in interrupt Andrew Vasquez
  1 sibling, 0 replies; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-10 18:29 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Styner, Douglas W, James.Bottomley@HansenPartnership.com,
	linux-scsi@vger.kernel.org, Tripathi, Sharad C, Ma, Chinang,
	Prickett, Terry O, Wilcox, Matthew R


On Jun 10, 2009, at 9:18 AM, Matthew Wilcox wrote:

> On Wed, Jun 10, 2009 at 09:09:14AM -0700, Anirban Chakraborty wrote:
>>
>> On Jun 10, 2009, at 8:52 AM, Styner, Douglas W wrote:
>>
>>>
>>> Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] write:
>>>>
>>>> You got it right. Basically we are doing:
>>>> 1. baseline scsi-misc + earlier patch
>>>> 2. baseline scsi-misc + this patch
>>>> And then compare 1 and 2.
>>>
>>> Results are as follows:
>>> Total interrupts are down slightly (-0.8%), QLA interrupts are flat
>>> (-0.1%) Cycles for qla2xxx_intr_handler up from .77% to .92%.
>>
>> So, it looks like using spin_lock_irqsave is doing a better job than
>> globally disabling the sharing of the interrupt via IRQF_DISABLED.
>
> The margin of error for this setup is about 0.3% for the performance
> as a whole.  I don't know what the margin of error is for the number
> of cycles consumed by a given function, but I would suspect it's
> significantly higher than the 0.15% difference seen.
>
One way to confirm this would be to run several times and see what  
comes up. If we get consistently same order of magnitude difference  
then we'd know for sure. The thing to note here is the overall  
difference in database cycle %s which is more than the margin of error.
Thanks,
Anirban


>>> ======oprofile CPU_CLK_UNHALTED for top 30 functions
>>> Cycles% 2.6.30-rc6_scsi-misc_0001   Cycles% 2.6.30-rc6_scsi- 
>>> misc_irqf-d
>>> 68.1402 <database>                  66.1875 <database>
>>> 0.9125 qla24xx_start_scsi            0.9552 qla24xx_start_scsi
>>> 0.8729 __schedule                    0.9272 __schedule
>>> 0.7739 qla24xx_intr_handler          0.9178 qla24xx_intr_handler
>>> 0.7161 kmem_cache_alloc              0.7963 kmem_cache_alloc
>>> 0.4686 __sigsetjmp                   0.4580 __sigsetjmp
>>> 0.4043 __blockdev_direct_IO          0.4206 __switch_to
>>> 0.3894 __switch_to                   0.4169 __blockdev_direct_IO
>>> 0.3878 rb_get_reader_page            0.3907 scsi_request_fn
>>> 0.3762 task_rq_lock                  0.3776 task_rq_lock
>>> 0.3729 scsi_request_fn               0.3701 rb_get_reader_page
>>> 0.3696 __list_add                    0.3664 __list_add
>>> 0.3168 <bash>                        0.3664 copy_user_generic_string
>>> 0.3135 try_to_wake_up                0.3122 lock_timer_base
>>> 0.2921 ring_buffer_consume           0.2972 ring_buffer_consume
>>> 0.2822 lock_timer_base               0.2879 <bash>
>>> 0.2624 memset_c                      0.2785 aio_complete
>>> 0.2607 aio_complete                  0.2524 blk_queue_end_tag
>>> 0.2508 copy_user_generic_string      0.2468 try_to_wake_up
>>> 0.2294 qla2x00_process_completed_re  0.2430 tcp_sendmsg
>>> 0.2294 tcp_sendmsg                   0.2393 mod_timer
>>> 0.2211 kfree                         0.2374  
>>> qla2x00_process_completed_re
>>> 0.2195 memmove                       0.2355 memset_c
>>> 0.2195 blk_queue_end_tag             0.2225 e1000_xmit_frame
>>> 0.2145 scsi_softirq_done             0.2094 generic_make_request
>>> 0.2145 mod_timer                     0.2075 scsi_device_unbusy
>>> 0.2129 generic_file_aio_read         0.2056 kref_get
>>> 0.2096 kref_get                      0.2056 mempool_free
>>> 0.1997 e1000_xmit_frame              0.2038 qla2xxx_queuecommand
>>> 0.1997 scsi_device_unbusy            0.2000 kfree
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux- 
>> scsi" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> -- 
> Matthew Wilcox				Intel Open Source Technology Centre
> "Bill, look, we understand that you're interested in selling us this
> operating system, but compare it to ours.  We can't possibly take such
> a retrograde step."


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt
  2009-06-10 16:18                 ` Matthew Wilcox
  2009-06-10 18:29                   ` Anirban Chakraborty
@ 2009-06-10 18:32                   ` Andrew Vasquez
  2009-06-10 19:10                     ` James Bottomley
  1 sibling, 1 reply; 24+ messages in thread
From: Andrew Vasquez @ 2009-06-10 18:32 UTC (permalink / raw)
  To: Wilcox, Matthew R
  Cc: Anirban Chakraborty, David Wagner, James Bottomley,
	Linux SCSI Mailing List, douglas.w.styner, sharad.c.tripathi,
	chinang.ma, terry.o.prickett

On Wed, Jun 10, 2009, wrote:

> On Wed, Jun 10, 2009 at 09:09:14AM -0700, Anirban Chakraborty wrote:
> >
> > On Jun 10, 2009, at 8:52 AM, Styner, Douglas W wrote:
> >> Anirban Chakraborty [mailto:anirban.chakraborty@xxxxxxxxxx] write:
> >>>
> >>> You got it right. Basically we are doing:
> >>> 1. baseline scsi-misc + earlier patch
> >>> 2. baseline scsi-misc + this patch
> >>> And then compare 1 and 2.
> >>
> >> Results are as follows:
> >> Total interrupts are down slightly (-0.8%), QLA interrupts are flat  
> >> (-0.1%) Cycles for qla2xxx_intr_handler up from .77% to .92%.
> >
> > So, it looks like using spin_lock_irqsave is doing a better job than  
> > globally disabling the sharing of the interrupt via IRQF_DISABLED.
> 
> The margin of error for this setup is about 0.3% for the performance
> as a whole.  I don't know what the margin of error is for the number
> of cycles consumed by a given function, but I would suspect it's
> significantly higher than the 0.15% difference seen.
> 
> >> ======oprofile CPU_CLK_UNHALTED for top 30 functions
> >> Cycles% 2.6.30-rc6_scsi-misc_0001   Cycles% 2.6.30-rc6_scsi-misc_irqf-d
> >> 68.1402 <database>                  66.1875 <database>
> >> 0.9125 qla24xx_start_scsi            0.9552 qla24xx_start_scsi
> >> 0.8729 __schedule                    0.9272 __schedule
> >> 0.7739 qla24xx_intr_handler          0.9178 qla24xx_intr_handler
> >> 0.7161 kmem_cache_alloc              0.7963 kmem_cache_alloc

That's a curious observation...  I'm just trying to understand the
numbers here, but, are we sure that this spin_lock() ->
spin_lock_irqsave() conversion is in fact the mitigating factor.

Thanks, AV

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt
  2009-06-10 18:32                   ` [PATCH] qla2xxx: Resolved a performance issue in interrupt Andrew Vasquez
@ 2009-06-10 19:10                     ` James Bottomley
  2009-06-10 19:48                       ` Matthew Wilcox
  2009-06-10 20:55                       ` Anirban Chakraborty
  0 siblings, 2 replies; 24+ messages in thread
From: James Bottomley @ 2009-06-10 19:10 UTC (permalink / raw)
  To: Andrew Vasquez
  Cc: Wilcox, Matthew R, Anirban Chakraborty, David Wagner,
	Linux SCSI Mailing List, douglas.w.styner, sharad.c.tripathi,
	chinang.ma, terry.o.prickett

On Wed, 2009-06-10 at 11:32 -0700, Andrew Vasquez wrote:
> On Wed, Jun 10, 2009, wrote:
> 
> > On Wed, Jun 10, 2009 at 09:09:14AM -0700, Anirban Chakraborty wrote:
> > >
> > > On Jun 10, 2009, at 8:52 AM, Styner, Douglas W wrote:
> > >> Anirban Chakraborty [mailto:anirban.chakraborty@xxxxxxxxxx] write:
> > >>>
> > >>> You got it right. Basically we are doing:
> > >>> 1. baseline scsi-misc + earlier patch
> > >>> 2. baseline scsi-misc + this patch
> > >>> And then compare 1 and 2.
> > >>
> > >> Results are as follows:
> > >> Total interrupts are down slightly (-0.8%), QLA interrupts are flat  
> > >> (-0.1%) Cycles for qla2xxx_intr_handler up from .77% to .92%.
> > >
> > > So, it looks like using spin_lock_irqsave is doing a better job than  
> > > globally disabling the sharing of the interrupt via IRQF_DISABLED.
> > 
> > The margin of error for this setup is about 0.3% for the performance
> > as a whole.  I don't know what the margin of error is for the number
> > of cycles consumed by a given function, but I would suspect it's
> > significantly higher than the 0.15% difference seen.
> > 
> > >> ======oprofile CPU_CLK_UNHALTED for top 30 functions
> > >> Cycles% 2.6.30-rc6_scsi-misc_0001   Cycles% 2.6.30-rc6_scsi-misc_irqf-d
> > >> 68.1402 <database>                  66.1875 <database>
> > >> 0.9125 qla24xx_start_scsi            0.9552 qla24xx_start_scsi
> > >> 0.8729 __schedule                    0.9272 __schedule
> > >> 0.7739 qla24xx_intr_handler          0.9178 qla24xx_intr_handler
> > >> 0.7161 kmem_cache_alloc              0.7963 kmem_cache_alloc
> 
> That's a curious observation...  I'm just trying to understand the
> numbers here, but, are we sure that this spin_lock() ->
> spin_lock_irqsave() conversion is in fact the mitigating factor.

I think so ... the performance of both fixes is actually nearly
identical showing that the base reason (interrupt while holding hardware
spinlock adding to latency) is the correct one.

The curiosity I had is whether we can do even better by disabling
interrupts for the whole of the ISR rather than only over the sections
where we take the hw lock, and I don't think we have conclusive evidence
either way on that.

James



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt
  2009-06-10 19:10                     ` James Bottomley
@ 2009-06-10 19:48                       ` Matthew Wilcox
  2009-06-10 20:55                       ` Anirban Chakraborty
  1 sibling, 0 replies; 24+ messages in thread
From: Matthew Wilcox @ 2009-06-10 19:48 UTC (permalink / raw)
  To: James Bottomley
  Cc: Andrew Vasquez, Wilcox, Matthew R, Anirban Chakraborty,
	David Wagner, Linux SCSI Mailing List, douglas.w.styner,
	sharad.c.tripathi, chinang.ma, terry.o.prickett

On Wed, Jun 10, 2009 at 07:10:18PM +0000, James Bottomley wrote:
> > That's a curious observation...  I'm just trying to understand the
> > numbers here, but, are we sure that this spin_lock() ->
> > spin_lock_irqsave() conversion is in fact the mitigating factor.
> 
> I think so ... the performance of both fixes is actually nearly
> identical showing that the base reason (interrupt while holding hardware
> spinlock adding to latency) is the correct one.
> 
> The curiosity I had is whether we can do even better by disabling
> interrupts for the whole of the ISR rather than only over the sections
> where we take the hw lock, and I don't think we have conclusive evidence
> either way on that.

It probably doesn't matter much either way.  At some point, I think
Peter will be successful in forcing IRQF_DISABLED for all interrupts,
and this decision will go away.

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt
  2009-06-10 19:10                     ` James Bottomley
  2009-06-10 19:48                       ` Matthew Wilcox
@ 2009-06-10 20:55                       ` Anirban Chakraborty
  2009-06-12 22:48                         ` Styner, Douglas W
  2009-06-12 22:52                         ` [PATCH] qla2xxx: Resolved a performance issue in interrupt - this time with data Styner, Douglas W
  1 sibling, 2 replies; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-10 20:55 UTC (permalink / raw)
  To: James Bottomley
  Cc: Andrew Vasquez, Wilcox, Matthew R, Anirban Chakraborty,
	David Wagner, Linux SCSI Mailing List, douglas.w.styner@intel.com,
	sharad.c.tripathi@intel.com, chinang.ma@intel.com,
	terry.o.prickett@intel.com

This is the updated patch. Please apply.
Thanks,
Anirban

Reverted back a change in qla*_intr_handler code that caused
an increase in cpu cycles.

Reported-and-tested-by: Douglas W. Styner <douglas.w.styner@intel.com>
Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 drivers/scsi/qla2xxx/qla_isr.c |   15 +++++++++------
 1 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index c8d0a17..245e7af 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -37,6 +37,7 @@ qla2100_intr_handler(int irq, void *dev_id)
 	uint16_t	hccr;
 	uint16_t	mb[4];
 	struct rsp_que *rsp;
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -49,7 +50,7 @@ qla2100_intr_handler(int irq, void *dev_id)
 	reg = &ha->iobase->isp;
 	status = 0;
 
-	spin_lock(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	vha = pci_get_drvdata(ha->pdev);
 	for (iter = 50; iter--; ) {
 		hccr = RD_REG_WORD(&reg->hccr);
@@ -101,7 +102,7 @@ qla2100_intr_handler(int irq, void *dev_id)
 			RD_REG_WORD(&reg->hccr);
 		}
 	}
-	spin_unlock(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
@@ -133,6 +134,7 @@ qla2300_intr_handler(int irq, void *dev_id)
 	uint16_t	mb[4];
 	struct rsp_que *rsp;
 	struct qla_hw_data *ha;
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -145,7 +147,7 @@ qla2300_intr_handler(int irq, void *dev_id)
 	reg = &ha->iobase->isp;
 	status = 0;
 
-	spin_lock(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	vha = pci_get_drvdata(ha->pdev);
 	for (iter = 50; iter--; ) {
 		stat = RD_REG_DWORD(&reg->u.isp2300.host_status);
@@ -216,7 +218,7 @@ qla2300_intr_handler(int irq, void *dev_id)
 		WRT_REG_WORD(&reg->hccr, HCCR_CLR_RISC_INT);
 		RD_REG_WORD_RELAXED(&reg->hccr);
 	}
-	spin_unlock(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
@@ -1626,6 +1628,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
 	uint32_t	hccr;
 	uint16_t	mb[4];
 	struct rsp_que *rsp;
+	unsigned long	flags;
 
 	rsp = (struct rsp_que *) dev_id;
 	if (!rsp) {
@@ -1638,7 +1641,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
 	reg = &ha->iobase->isp24;
 	status = 0;
 
-	spin_lock(&ha->hardware_lock);
+	spin_lock_irqsave(&ha->hardware_lock, flags);
 	vha = pci_get_drvdata(ha->pdev);
 	for (iter = 50; iter--; ) {
 		stat = RD_REG_DWORD(&reg->host_status);
@@ -1688,7 +1691,7 @@ qla24xx_intr_handler(int irq, void *dev_id)
 		WRT_REG_DWORD(&reg->hccr, HCCRX_CLR_RISC_INT);
 		RD_REG_DWORD_RELAXED(&reg->hccr);
 	}
-	spin_unlock(&ha->hardware_lock);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
 	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-- 
1.5.5



^ permalink raw reply related	[flat|nested] 24+ messages in thread

* RE: [PATCH] qla2xxx: Resolved a performance issue in interrupt
  2009-06-10 20:55                       ` Anirban Chakraborty
@ 2009-06-12 22:48                         ` Styner, Douglas W
  2009-06-12 22:52                         ` [PATCH] qla2xxx: Resolved a performance issue in interrupt - this time with data Styner, Douglas W
  1 sibling, 0 replies; 24+ messages in thread
From: Styner, Douglas W @ 2009-06-12 22:48 UTC (permalink / raw)
  To: Anirban Chakraborty, James Bottomley
  Cc: Andrew Vasquez, Wilcox, Matthew R, David Wagner,
	Linux SCSI Mailing List, Tripathi, Sharad C, Ma, Chinang,
	Prickett, Terry O

Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] writes:
>
>This is the updated patch. Please apply.

Performance results from this patch compared to 2.6.30-rc6_scsi-misc.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] qla2xxx: Resolved a performance issue in interrupt - this time with data
  2009-06-10 20:55                       ` Anirban Chakraborty
  2009-06-12 22:48                         ` Styner, Douglas W
@ 2009-06-12 22:52                         ` Styner, Douglas W
  2009-06-12 23:32                           ` Anirban Chakraborty
  1 sibling, 1 reply; 24+ messages in thread
From: Styner, Douglas W @ 2009-06-12 22:52 UTC (permalink / raw)
  To: Anirban Chakraborty, James Bottomley
  Cc: Andrew Vasquez, Wilcox, Matthew R, David Wagner,
	Linux SCSI Mailing List, Tripathi, Sharad C, Ma, Chinang,
	Prickett, Terry O

Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] writes:
>
>This is the updated patch. Please apply.

Performance results from this patch compared to 2.6.30-rc6_scsi-misc.

Linux OLTP Performance summary
Kernel#            Speedup(x)   Intr/s  CtxSw/s us%    sys%   idle%  iowait%
scsi-misc               1.000   29481   43570   74      25      0       0
scsi-misc_qla-irqsave   1.004   29471   43290   75      25      0       0

Server configurations:
Intel Xeon Quad-core 2.0GHz  2 cpus/8 cores/8 threads
64GB memory, 3 qle2462 FC HBA, 450 spindles (30 logical units)


======oprofile CPU_CLK_UNHALTED for top 30 functions
Cycles% 2.6.30-rc6_scsi-misc       Cycles% 2.6.30-rc6_scsi-misc_qla-irqsave
67.3266 <database>                 66.6218 <database>
1.0062 qla24xx_start_scsi          1.0247 qla24xx_start_scsi
0.9246 qla24xx_intr_handler        0.9731 qla24xx_intr_handler
0.8158 __schedule                  0.8588 kmem_cache_alloc
0.7469 kmem_cache_alloc            0.8515 __schedule
0.4188 __blockdev_direct_IO        0.4663 __sigsetjmp
0.4097 __sigsetjmp                 0.4331 __blockdev_direct_IO
0.3989 scsi_request_fn             0.3852 scsi_request_fn
0.3916 __switch_to                 0.3852 task_rq_lock
0.3717 __list_add                  0.3612 rb_get_reader_page
0.3499 task_rq_lock                0.3336 __switch_to
0.3408 try_to_wake_up              0.3336 ring_buffer_consume
0.3281 aio_complete                0.3299 copy_user_generic_string
0.3281 rb_get_reader_page          0.3262 __list_add
0.3227 ring_buffer_consume         0.3262 lock_timer_base
0.2901 copy_user_generic_string    0.3225 try_to_wake_up
0.2883 <bash>                      0.3059 aio_complete
0.2611 blk_queue_end_tag           0.2783 mod_timer
0.2611 memset_c                    0.2488 kmem_cache_free
0.2448 kmem_cache_free             0.2451 blk_queue_end_tag
0.2357 qla2x00_process_completed_re0.2451 tcp_sendmsg
0.2321 lock_timer_base             0.2396 <bash>
0.2321 mod_timer                   0.2359 kref_get
0.2230 kfree                       0.2359 memset_c
0.2230 tcp_sendmsg                 0.2304 qla2x00_process_completed_re
0.2176 generic_make_request        0.2285 memmove
0.2085 scsi_dispatch_cmd           0.2285 mempool_free
0.2085 kref_get                    0.2248 generic_make_request
0.2085 sched_clock_cpu             0.2156 sched_clock_cpu
0.2067 scsi_device_unbusy          0.2119 e1000_xmit_frame

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] qla2xxx: Resolved a performance issue in interrupt - this time with data
  2009-06-12 22:52                         ` [PATCH] qla2xxx: Resolved a performance issue in interrupt - this time with data Styner, Douglas W
@ 2009-06-12 23:32                           ` Anirban Chakraborty
  0 siblings, 0 replies; 24+ messages in thread
From: Anirban Chakraborty @ 2009-06-12 23:32 UTC (permalink / raw)
  To: Styner, Douglas W
  Cc: James Bottomley, Andrew Vasquez, Wilcox, Matthew R, David Wagner,
	Linux SCSI Mailing List, Tripathi, Sharad C, Ma, Chinang,
	Prickett, Terry O


On Jun 12, 2009, at 3:52 PM, Styner, Douglas W wrote:

> Anirban Chakraborty [mailto:anirban.chakraborty@qlogic.com] writes:
>>
>> This is the updated patch. Please apply.
>
> Performance results from this patch compared to 2.6.30-rc6_scsi-misc.
>
> Linux OLTP Performance summary
> Kernel#            Speedup(x)   Intr/s  CtxSw/s us%    sys%   idle%   
> iowait%
> scsi-misc               1.000   29481   43570   74      25       
> 0       0
> scsi-misc_qla-irqsave   1.004   29471   43290   75      25       
> 0       0
>
> Server configurations:
> Intel Xeon Quad-core 2.0GHz  2 cpus/8 cores/8 threads
> 64GB memory, 3 qle2462 FC HBA, 450 spindles (30 logical units)
>
>
> ======oprofile CPU_CLK_UNHALTED for top 30 functions
> Cycles% 2.6.30-rc6_scsi-misc       Cycles% 2.6.30-rc6_scsi-misc_qla- 
> irqsave
> 67.3266 <database>                 66.6218 <database>
> 1.0062 qla24xx_start_scsi          1.0247 qla24xx_start_scsi
> 0.9246 qla24xx_intr_handler        0.9731 qla24xx_intr_handler
> 0.8158 __schedule                  0.8588 kmem_cache_alloc
> 0.7469 kmem_cache_alloc            0.8515 __schedule
> 0.4188 __blockdev_direct_IO        0.4663 __sigsetjmp
> 0.4097 __sigsetjmp                 0.4331 __blockdev_direct_IO
> 0.3989 scsi_request_fn             0.3852 scsi_request_fn
> 0.3916 __switch_to                 0.3852 task_rq_lock
> 0.3717 __list_add                  0.3612 rb_get_reader_page
> 0.3499 task_rq_lock                0.3336 __switch_to
> 0.3408 try_to_wake_up              0.3336 ring_buffer_consume
> 0.3281 aio_complete                0.3299 copy_user_generic_string
> 0.3281 rb_get_reader_page          0.3262 __list_add
> 0.3227 ring_buffer_consume         0.3262 lock_timer_base
> 0.2901 copy_user_generic_string    0.3225 try_to_wake_up
> 0.2883 <bash>                      0.3059 aio_complete
> 0.2611 blk_queue_end_tag           0.2783 mod_timer
> 0.2611 memset_c                    0.2488 kmem_cache_free
> 0.2448 kmem_cache_free             0.2451 blk_queue_end_tag
> 0.2357 qla2x00_process_completed_re0.2451 tcp_sendmsg
> 0.2321 lock_timer_base             0.2396 <bash>
> 0.2321 mod_timer                   0.2359 kref_get
> 0.2230 kfree                       0.2359 memset_c
> 0.2230 tcp_sendmsg                 0.2304 qla2x00_process_completed_re
> 0.2176 generic_make_request        0.2285 memmove
> 0.2085 scsi_dispatch_cmd           0.2285 mempool_free
> 0.2085 kref_get                    0.2248 generic_make_request
> 0.2085 sched_clock_cpu             0.2156 sched_clock_cpu
> 0.2067 scsi_device_unbusy          0.2119 e1000_xmit_frame

So, the data that you posted validated earlier didn't match with this  
run.
I did a similar testing several times before I posted the patch. Maybe  
I should share that data here.

Sever: Intel Xeon X7350, 4 core, 16GB memory, 2 dual port qla2432 and  
1 single port qla2432 (total 5 controllers).
Target: EMC Clarion (100+ luns per target, total no. of luns 512).
IO pumping tool: Orion with cold cache setting. Two Orion procs  
running each pumping to 256 devices.
Profiling tool: vtune.

And my results are as follows:

2.6.30-rc6 (without the patch)
function                                CPU_CLK_UNHALTED.CORE_P
qla24xx_intr_handler            1.18
qla2x00_process_completed_requ  0.18
qla24xx_start_scsi              0.15
qla24xx_process_response_queue  0.12
qla2xxx_queuecommand            0.06
qla2x00_sp_compl                0.06
qla2x00_status_entry            0.02
qla2x00_async_event             0.01

and 2.6.30-rc6 (with the irqsave patch)
function                        CPU_CLK_UNHALTED.CORE_P
qla24xx_start_scsi              0.11
qla2xxx_queuecommand            0.06
qla24xx_intr_handler            0.01
qla2x00_timer                   0.00

The difference in qla24xx_intr_handler is significant in my setup.

Thanks,
Anirban


^ permalink raw reply	[flat|nested] 24+ messages in thread

[parent not found: <B8134F80-B547-4E04-890A-6B646D2BA3E8@qlogic.com>]

end of thread, other threads:[~2009-06-12 23:32 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-06-09 18:42 [PATCH] qla2xxx: Resolved a performance issue in interrupt handler Anirban Chakraborty
2009-06-09 19:32 ` James Bottomley
2009-06-09 20:34   ` Anirban Chakraborty
2009-06-09 20:40     ` James Bottomley
2009-06-09 21:28       ` Giridhar Malavali
2009-06-09 21:41         ` Matthew Wilcox
2009-06-09 21:46         ` James Bottomley
2009-06-09 21:40       ` Anirban Chakraborty
2009-06-09 21:43         ` Matthew Wilcox
2009-06-09 21:46           ` Anirban Chakraborty
2009-06-09 21:47             ` Matthew Wilcox
2009-06-09 22:30               ` Anirban Chakraborty
     [not found]         ` <D7C42C27E6CB1E4D8CBDF2F81EA92A26035402EBA0@azsmsx501.amr.corp.intel.com>
     [not found]           ` <2796FDDF-31EA-44E2-8856-84A22F31A01F@qlogic.com>
2009-06-10 15:52             ` Styner, Douglas W
2009-06-10 16:09               ` Anirban Chakraborty
2009-06-10 16:17                 ` James Bottomley
2009-06-10 16:18                 ` Matthew Wilcox
2009-06-10 18:29                   ` Anirban Chakraborty
2009-06-10 18:32                   ` [PATCH] qla2xxx: Resolved a performance issue in interrupt Andrew Vasquez
2009-06-10 19:10                     ` James Bottomley
2009-06-10 19:48                       ` Matthew Wilcox
2009-06-10 20:55                       ` Anirban Chakraborty
2009-06-12 22:48                         ` Styner, Douglas W
2009-06-12 22:52                         ` [PATCH] qla2xxx: Resolved a performance issue in interrupt - this time with data Styner, Douglas W
2009-06-12 23:32                           ` Anirban Chakraborty
     [not found] <B8134F80-B547-4E04-890A-6B646D2BA3E8@qlogic.com>

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox