public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] fusion:  Implement generic interrupt misroute handling
@ 2008-08-03 18:05 James Bottomley
  2008-08-03 22:59 ` Moore, Eric
  0 siblings, 1 reply; 7+ messages in thread
From: James Bottomley @ 2008-08-03 18:05 UTC (permalink / raw)
  To: linux-scsi, linux-kernel, linux-pci; +Cc: Moore, Eric

This patch uses the new pci_lost_interrupt() callback to note the loss
of an interrupt, and if the reason is MSI, to work around the problem.

I used the manufacturer config page for this, because every fusion type
has that one and its loss means that mpt_config() which is interrupt
driven, failed.

James

---
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index d6a0074..2d75c58 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -168,7 +168,7 @@ static int	mpt_readScsiDevicePageHeaders(MPT_ADAPTER *ioc, int portnum);
 static void 	mpt_read_ioc_pg_1(MPT_ADAPTER *ioc);
 static void 	mpt_read_ioc_pg_4(MPT_ADAPTER *ioc);
 static void	mpt_timer_expired(unsigned long data);
-static void	mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
+static int	mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
 static int	SendEventNotification(MPT_ADAPTER *ioc, u8 EvSwitch);
 static int	SendEventAck(MPT_ADAPTER *ioc, EventNotificationReply_t *evnp);
 static int	mpt_host_page_access_control(MPT_ADAPTER *ioc, u8 access_control_value, int sleepFlag);
@@ -2052,6 +2052,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 	int	 irq_allocated = 0;
 	u8	*a;
 
+ retry:
 	printk(MYIOC_s_INFO_FMT "Initiating %s\n", ioc->name,
 	    reason == MPT_HOSTEVENT_IOC_BRINGUP ? "bringup" : "recovery");
 
@@ -2268,6 +2269,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 	 *	and we try GetLanConfigPages again...
 	 */
 	if ((ret == 0) && (reason == MPT_HOSTEVENT_IOC_BRINGUP)) {
+		int rc;
 
 		/*
 		 * Initalize link list for inactive raid volumes.
@@ -2275,6 +2277,22 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 		mutex_init(&ioc->raid_data.inactive_list_mutex);
 		INIT_LIST_HEAD(&ioc->raid_data.inactive_list);
 
+		/* May fail becuase of IRQ misrouting */
+		rc = mpt_get_manufacturing_pg_0(ioc);
+		if (rc) {
+			if (pci_lost_interrupt(ioc->pcidev) ==
+			    PCI_LOST_IRQ_DISABLE_MSI) {
+				free_irq(ioc->pci_irq, ioc);
+				ioc->msi_enable = 0;
+				pci_disable_msi(ioc->pcidev);
+				goto retry;
+			}
+			printk(MYIOC_s_ERR_FMT "Cannot recover IRQ routing\n",
+			       ioc->name);
+			return -1;
+		}
+
+
 		if (ioc->bus_type == SAS) {
 
 			/* clear persistency table */
@@ -2326,7 +2344,6 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 		}
 
 		GetIoUnitPage2(ioc);
-		mpt_get_manufacturing_pg_0(ioc);
 	}
 
 	/*
@@ -5697,13 +5714,14 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc)
 	return;
 }
 
-static void
+static int
 mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
 {
 	CONFIGPARMS		cfg;
 	ConfigPageHeader_t	hdr;
 	dma_addr_t		buf_dma;
 	ManufacturingPage0_t	*pbuf = NULL;
+	int			ret;
 
 	memset(&cfg, 0 , sizeof(CONFIGPARMS));
 	memset(&hdr, 0 , sizeof(ConfigPageHeader_t));
@@ -5714,20 +5732,23 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
 	cfg.action = MPI_CONFIG_ACTION_PAGE_HEADER;
 	cfg.timeout = 10;
 
-	if (mpt_config(ioc, &cfg) != 0)
+	ret = mpt_config(ioc, &cfg);
+	if (ret != 0)
 		goto out;
 
 	if (!cfg.cfghdr.hdr->PageLength)
 		goto out;
 
 	cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT;
+	ret = -ENOMEM;
 	pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma);
 	if (!pbuf)
 		goto out;
 
 	cfg.physAddr = buf_dma;
 
-	if (mpt_config(ioc, &cfg) != 0)
+	ret = mpt_config(ioc, &cfg);
+	if (ret != 0)
 		goto out;
 
 	memcpy(ioc->board_name, pbuf->BoardName, sizeof(ioc->board_name));
@@ -5738,6 +5759,7 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
 
 	if (pbuf)
 		pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma);
+	return ret;
 }
 
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH 2/2] fusion:  Implement generic interrupt misroute handling
  2008-08-03 18:05 [PATCH 2/2] fusion: Implement generic interrupt misroute handling James Bottomley
@ 2008-08-03 22:59 ` Moore, Eric
  2008-08-03 23:07   ` James Bottomley
  2008-08-04 13:55   ` David Vrabel
  0 siblings, 2 replies; 7+ messages in thread
From: Moore, Eric @ 2008-08-03 22:59 UTC (permalink / raw)
  To: James Bottomley, linux-scsi, linux-kernel,
	linux-pci@vger.kernel.org
  Cc: Prakash, Sathya

Thanks, I will try this out.   However I thought I saw lost interrupts occurring randomly, meaning it was not necessarily the first config page access.  I'm back in the office on 8/11, I will test it out then and provide feedback.

Eric

________________________________________
From: James Bottomley [James.Bottomley@HansenPartnership.com]
Sent: Sunday, August 03, 2008 12:05 PM
To: linux-scsi; linux-kernel; linux-pci@vger.kernel.org
Cc: Moore, Eric
Subject: [PATCH 2/2] fusion:  Implement generic interrupt misroute handling

This patch uses the new pci_lost_interrupt() callback to note the loss
of an interrupt, and if the reason is MSI, to work around the problem.

I used the manufacturer config page for this, because every fusion type
has that one and its loss means that mpt_config() which is interrupt
driven, failed.

James

---
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index d6a0074..2d75c58 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -168,7 +168,7 @@ static int  mpt_readScsiDevicePageHeaders(MPT_ADAPTER *ioc, int portnum);
 static void    mpt_read_ioc_pg_1(MPT_ADAPTER *ioc);
 static void    mpt_read_ioc_pg_4(MPT_ADAPTER *ioc);
 static void    mpt_timer_expired(unsigned long data);
-static void    mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
+static int     mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc);
 static int     SendEventNotification(MPT_ADAPTER *ioc, u8 EvSwitch);
 static int     SendEventAck(MPT_ADAPTER *ioc, EventNotificationReply_t *evnp);
 static int     mpt_host_page_access_control(MPT_ADAPTER *ioc, u8 access_control_value, int sleepFlag);
@@ -2052,6 +2052,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
        int      irq_allocated = 0;
        u8      *a;

+ retry:
        printk(MYIOC_s_INFO_FMT "Initiating %s\n", ioc->name,
            reason == MPT_HOSTEVENT_IOC_BRINGUP ? "bringup" : "recovery");

@@ -2268,6 +2269,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
         *      and we try GetLanConfigPages again...
         */
        if ((ret == 0) && (reason == MPT_HOSTEVENT_IOC_BRINGUP)) {
+               int rc;

                /*
                 * Initalize link list for inactive raid volumes.
@@ -2275,6 +2277,22 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
                mutex_init(&ioc->raid_data.inactive_list_mutex);
                INIT_LIST_HEAD(&ioc->raid_data.inactive_list);

+               /* May fail becuase of IRQ misrouting */
+               rc = mpt_get_manufacturing_pg_0(ioc);
+               if (rc) {
+                       if (pci_lost_interrupt(ioc->pcidev) ==
+                           PCI_LOST_IRQ_DISABLE_MSI) {
+                               free_irq(ioc->pci_irq, ioc);
+                               ioc->msi_enable = 0;
+                               pci_disable_msi(ioc->pcidev);
+                               goto retry;
+                       }
+                       printk(MYIOC_s_ERR_FMT "Cannot recover IRQ routing\n",
+                              ioc->name);
+                       return -1;
+               }
+
+
                if (ioc->bus_type == SAS) {

                        /* clear persistency table */
@@ -2326,7 +2344,6 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
                }

                GetIoUnitPage2(ioc);
-               mpt_get_manufacturing_pg_0(ioc);
        }

        /*
@@ -5697,13 +5714,14 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc)
        return;
 }

-static void
+static int
 mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
 {
        CONFIGPARMS             cfg;
        ConfigPageHeader_t      hdr;
        dma_addr_t              buf_dma;
        ManufacturingPage0_t    *pbuf = NULL;
+       int                     ret;

        memset(&cfg, 0 , sizeof(CONFIGPARMS));
        memset(&hdr, 0 , sizeof(ConfigPageHeader_t));
@@ -5714,20 +5732,23 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)
        cfg.action = MPI_CONFIG_ACTION_PAGE_HEADER;
        cfg.timeout = 10;

-       if (mpt_config(ioc, &cfg) != 0)
+       ret = mpt_config(ioc, &cfg);
+       if (ret != 0)
                goto out;

        if (!cfg.cfghdr.hdr->PageLength)
                goto out;

        cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT;
+       ret = -ENOMEM;
        pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma);
        if (!pbuf)
                goto out;

        cfg.physAddr = buf_dma;

-       if (mpt_config(ioc, &cfg) != 0)
+       ret = mpt_config(ioc, &cfg);
+       if (ret != 0)
                goto out;

        memcpy(ioc->board_name, pbuf->BoardName, sizeof(ioc->board_name));
@@ -5738,6 +5759,7 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc)

        if (pbuf)
                pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma);
+       return ret;
 }

 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH 2/2] fusion:  Implement generic interrupt misroute handling
  2008-08-03 22:59 ` Moore, Eric
@ 2008-08-03 23:07   ` James Bottomley
  2008-08-04 13:55   ` David Vrabel
  1 sibling, 0 replies; 7+ messages in thread
From: James Bottomley @ 2008-08-03 23:07 UTC (permalink / raw)
  To: Moore, Eric
  Cc: linux-scsi, linux-kernel, linux-pci@vger.kernel.org,
	Prakash, Sathya

On Sun, 2008-08-03 at 16:59 -0600, Moore, Eric wrote:
> Thanks, I will try this out.   However I thought I saw lost interrupts
> occurring randomly, meaning it was not necessarily the first config
> page access.  I'm back in the office on 8/11, I will test it out then
> and provide feedback.

Well, this is the usual trouble.  I don't have a misbehaving
motherboard, so I artificially induce MSI problems on my platform.  For
me, it works, but obviously I'm losing every MSI interrupt.

James



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] fusion:  Implement generic interrupt misroute handling
  2008-08-03 22:59 ` Moore, Eric
  2008-08-03 23:07   ` James Bottomley
@ 2008-08-04 13:55   ` David Vrabel
  2008-08-05 16:36     ` James Bottomley
  1 sibling, 1 reply; 7+ messages in thread
From: David Vrabel @ 2008-08-04 13:55 UTC (permalink / raw)
  To: Moore, Eric
  Cc: James Bottomley, linux-scsi, linux-kernel,
	linux-pci@vger.kernel.org, Prakash, Sathya

Moore, Eric wrote:
> Thanks, I will try this out.   However I thought I saw lost
> interrupts occurring randomly, meaning it was not necessarily the
> first config page access.  I'm back in the office on 8/11, I will
> test it out then and provide feedback.

Is this using MSI on a device without per-vector mask bits?  If so, then
this patch may help.

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd

David
-- 
David Vrabel, Senior Software Engineer, Drivers
CSR, Churchill House, Cambridge Business Park,  Tel: +44 (0)1223 692562
Cowley Road, Cambridge, CB4 0WZ                 http://www.csr.com/

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] fusion:  Implement generic interrupt misroute handling
  2008-08-04 13:55   ` David Vrabel
@ 2008-08-05 16:36     ` James Bottomley
  2008-08-05 17:38       ` Jesse Barnes
  0 siblings, 1 reply; 7+ messages in thread
From: James Bottomley @ 2008-08-05 16:36 UTC (permalink / raw)
  To: David Vrabel
  Cc: Moore, Eric, linux-scsi, linux-kernel, linux-pci@vger.kernel.org,
	Prakash, Sathya

On Mon, 2008-08-04 at 14:55 +0100, David Vrabel wrote:
> Moore, Eric wrote:
> > Thanks, I will try this out.   However I thought I saw lost
> > interrupts occurring randomly, meaning it was not necessarily the
> > first config page access.  I'm back in the office on 8/11, I will
> > test it out then and provide feedback.
> 
> Is this using MSI on a device without per-vector mask bits?  If so, then
> this patch may help.
> 
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd

We don't really know what the problem is.  MSI interrupts get lost on
older motherboards (the ones most likely to contain a 1030).  Why is
anybody's guess although the clever money is on the motherboard bridge
having issues.

James



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] fusion:  Implement generic interrupt misroute handling
  2008-08-05 16:36     ` James Bottomley
@ 2008-08-05 17:38       ` Jesse Barnes
  2008-08-05 20:45         ` James Bottomley
  0 siblings, 1 reply; 7+ messages in thread
From: Jesse Barnes @ 2008-08-05 17:38 UTC (permalink / raw)
  To: James Bottomley
  Cc: David Vrabel, Moore, Eric, linux-scsi, linux-kernel,
	linux-pci@vger.kernel.org, Prakash, Sathya

On Tuesday, August 05, 2008 9:36 am James Bottomley wrote:
> On Mon, 2008-08-04 at 14:55 +0100, David Vrabel wrote:
> > Moore, Eric wrote:
> > > Thanks, I will try this out.   However I thought I saw lost
> > > interrupts occurring randomly, meaning it was not necessarily the
> > > first config page access.  I'm back in the office on 8/11, I will
> > > test it out then and provide feedback.
> >
> > Is this using MSI on a device without per-vector mask bits?  If so, then
> > this patch may help.
> >
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit
> >;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd
>
> We don't really know what the problem is.  MSI interrupts get lost on
> older motherboards (the ones most likely to contain a 1030).  Why is
> anybody's guess although the clever money is on the motherboard bridge
> having issues.

David just got us to fix an MSI masking bug recently, which could be related.

The issue is that the PCI & interrupt handling code was disabling MSI during 
interrupt handling, which could end up causing missed interrupts.  When MSI 
is disabled, devices can still generate interrupts, but they'll go out the 
interrupt line instead, so unless your IRQ handler is also registered with 
that IRQ number, you'll probably lose them.

As of the last PCI upstream merge, we work around this issue by never masking 
MSI interrupts unless the device supports the MSI mask bit (as opposed to 
just the big hammer enable/disable flag).

Jesse



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] fusion:  Implement generic interrupt misroute handling
  2008-08-05 17:38       ` Jesse Barnes
@ 2008-08-05 20:45         ` James Bottomley
  0 siblings, 0 replies; 7+ messages in thread
From: James Bottomley @ 2008-08-05 20:45 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: David Vrabel, Moore, Eric, linux-scsi, linux-kernel,
	linux-pci@vger.kernel.org, Prakash, Sathya

On Tue, 2008-08-05 at 10:38 -0700, Jesse Barnes wrote:
> On Tuesday, August 05, 2008 9:36 am James Bottomley wrote:
> > On Mon, 2008-08-04 at 14:55 +0100, David Vrabel wrote:
> > > Moore, Eric wrote:
> > > > Thanks, I will try this out.   However I thought I saw lost
> > > > interrupts occurring randomly, meaning it was not necessarily the
> > > > first config page access.  I'm back in the office on 8/11, I will
> > > > test it out then and provide feedback.
> > >
> > > Is this using MSI on a device without per-vector mask bits?  If so, then
> > > this patch may help.
> > >
> > > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit
> > >;h=ce6fce4295ba727b36fdc73040e444bd1aae64cd
> >
> > We don't really know what the problem is.  MSI interrupts get lost on
> > older motherboards (the ones most likely to contain a 1030).  Why is
> > anybody's guess although the clever money is on the motherboard bridge
> > having issues.
> 
> David just got us to fix an MSI masking bug recently, which could be related.
> 
> The issue is that the PCI & interrupt handling code was disabling MSI during 
> interrupt handling, which could end up causing missed interrupts.  When MSI 
> is disabled, devices can still generate interrupts, but they'll go out the 
> interrupt line instead, so unless your IRQ handler is also registered with 
> that IRQ number, you'll probably lose them.
> 
> As of the last PCI upstream merge, we work around this issue by never masking 
> MSI interrupts unless the device supports the MSI mask bit (as opposed to 
> just the big hammer enable/disable flag).

True, but this is orthogonal.

I'm trying to put together a diagnosing tool.  Once we know there's a
problem and who has it, then we can try seeing if patches like this fix
it.

James

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2008-08-05 20:45 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-08-03 18:05 [PATCH 2/2] fusion: Implement generic interrupt misroute handling James Bottomley
2008-08-03 22:59 ` Moore, Eric
2008-08-03 23:07   ` James Bottomley
2008-08-04 13:55   ` David Vrabel
2008-08-05 16:36     ` James Bottomley
2008-08-05 17:38       ` Jesse Barnes
2008-08-05 20:45         ` James Bottomley

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox