All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI
@ 2004-07-26 22:15 Bjorn Helgaas
  2004-07-26 22:39 ` Roland Dreier
  0 siblings, 1 reply; 8+ messages in thread
From: Bjorn Helgaas @ 2004-07-26 22:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Tom L Nguyen

Rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI.  The "vector"
terminology is architecture-dependent.  The PCI MSI interface
actually deals with Linux IRQ numbers (i.e., things you can
pass to request_irq()), and we shouldn't confuse things by
calling them "vectors" just because we're using MSI rather
than an IOSAPIC.

A similar patch was discussed a few months ago:
    http://groups.google.com/groups?hl=en&lr=&ie=UTF-8&th=92003e5ae290e1de&rnum=1
but it didn't seem to go anywhere.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>

===== Documentation/MSI-HOWTO.txt 1.2 vs edited =====
--- 1.2/Documentation/MSI-HOWTO.txt	2004-02-20 08:44:42 -07:00
+++ edited/Documentation/MSI-HOWTO.txt	2004-07-26 15:24:15 -06:00
@@ -92,17 +92,17 @@
 5. Configuring a driver to use MSI/MSI-X
 
 By default, the kernel will not enable MSI/MSI-X on all devices that
-support this capability. The CONFIG_PCI_USE_VECTOR kernel option
+support this capability. The CONFIG_PCI_MSI kernel option
 must be selected to enable MSI/MSI-X support.
 
 5.1 Including MSI support into the kernel
 
 To allow MSI-Capable device drivers to selectively enable MSI (using
 pci_enable_msi as described below), the VECTOR based scheme needs to
-be enabled by setting CONFIG_PCI_USE_VECTOR.
+be enabled by setting CONFIG_PCI_MSI.
 
 Since the target of the inbound message is the local APIC, providing
-CONFIG_PCI_USE_VECTOR is dependent on whether CONFIG_X86_LOCAL_APIC
+CONFIG_PCI_MSI is dependent on whether CONFIG_X86_LOCAL_APIC
 is enabled or not.
 
 int pci_enable_msi(struct pci_dev *)
@@ -229,7 +229,7 @@
 In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set;
 however, in UP environment, users must manually set
 CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting
-CONFIG_PCI_USE_VECTOR enables the VECTOR based scheme and
+CONFIG_PCI_MSI enables the VECTOR based scheme and
 the option for MSI-capable device drivers to selectively enable
 MSI (using pci_enable_msi as described below).
 
===== arch/i386/kernel/io_apic.c 1.103 vs edited =====
--- 1.103/arch/i386/kernel/io_apic.c	2004-06-24 02:56:14 -06:00
+++ edited/arch/i386/kernel/io_apic.c	2004-07-26 15:26:50 -06:00
@@ -73,7 +73,7 @@
 } irq_2_pin[PIN_MAP_SIZE];
 
 int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 #define vector_to_irq(vector) 	\
 	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
 #else
@@ -1114,7 +1114,7 @@
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 int assign_irq_vector(int irq)
 #else
 int __init assign_irq_vector(int irq)
@@ -1868,7 +1868,7 @@
 	}
 }
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
===== arch/i386/pci/irq.c 1.46 vs edited =====
--- 1.46/arch/i386/pci/irq.c	2004-06-27 01:19:28 -06:00
+++ edited/arch/i386/pci/irq.c	2004-07-26 15:26:51 -06:00
@@ -817,7 +817,7 @@
 		    	if ( dev2->irq && dev2->irq != irq && \
 			(!(pci_probe & PCI_USE_PIRQ_MASK) || \
 			((1 << dev2->irq) & mask)) ) {
-#ifndef CONFIG_PCI_USE_VECTOR
+#ifndef CONFIG_PCI_MSI
 		    		printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
 				       pci_name(dev2), dev2->irq, irq);
 #endif
@@ -1034,7 +1034,7 @@
 				}
 				dev = temp_dev;
 				if (irq >= 0) {
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 					if (!platform_legacy_irq(irq))
 						irq = IO_APIC_VECTOR(irq);
 #endif
===== arch/x86_64/kernel/i8259.c 1.15 vs edited =====
--- 1.15/arch/x86_64/kernel/i8259.c	2004-07-13 07:08:43 -06:00
+++ edited/arch/x86_64/kernel/i8259.c	2004-07-26 15:26:52 -06:00
@@ -75,7 +75,7 @@
 BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
 BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 	BUILD_14_IRQS(0xe)
 #endif
 
@@ -110,7 +110,7 @@
 	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
 	IRQLIST_16(0xc), IRQLIST_16(0xd)
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 	, IRQLIST_14(0xe)
 #endif
 
===== arch/x86_64/kernel/io_apic.c 1.29 vs edited =====
--- 1.29/arch/x86_64/kernel/io_apic.c	2004-06-24 02:55:54 -06:00
+++ edited/arch/x86_64/kernel/io_apic.c	2004-07-26 15:26:52 -06:00
@@ -68,7 +68,7 @@
 } irq_2_pin[PIN_MAP_SIZE];
 
 int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 #define vector_to_irq(vector) 	\
 	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
 #else
@@ -656,7 +656,7 @@
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 int assign_irq_vector(int irq)
 #else
 int __init assign_irq_vector(int irq)
@@ -1406,7 +1406,7 @@
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
===== drivers/pci/Kconfig 1.5 vs edited =====
--- 1.5/drivers/pci/Kconfig	2004-06-24 02:55:54 -06:00
+++ edited/drivers/pci/Kconfig	2004-07-26 15:26:53 -06:00
@@ -1,22 +1,15 @@
 #
 # PCI configuration
 #
-config PCI_USE_VECTOR
-	bool "Vector-based interrupt indexing (MSI)"
+config PCI_MSI
+	bool "Message-signalled interrupts (MSI and MSI-X)"
 	depends on (X86_LOCAL_APIC && X86_IO_APIC) || IA64
 	default n
 	help
-	   This replaces the current existing IRQ-based index interrupt scheme
-	   with the vector-base index scheme. The advantages of vector base
-	   over IRQ base are listed below:
-	   1) Support MSI implementation.
-	   2) Support future IOxAPIC hotplug
-
-	   Note that this allows the device drivers to enable MSI, Message
-	   Signaled Interrupt, on all MSI capable device functions detected.
-	   Message Signal Interrupt enables an MSI-capable hardware device to
-	   send an inbound Memory Write on its PCI bus instead of asserting
-	   IRQ signal on device IRQ pin.
+	   This allows device drivers to enable MSI (Message Signalled
+	   Interrupts).  Message Signalled Interrupts enable a device to
+	   generate an interrupt using an inbound Memory Write on its
+	   PCI bus instead of asserting a device IRQ pin.
 
 	   If you don't know what to do here, say N.
 
===== drivers/pci/Makefile 1.38 vs edited =====
--- 1.38/drivers/pci/Makefile	2004-04-22 02:40:35 -06:00
+++ edited/drivers/pci/Makefile	2004-07-26 15:27:23 -06:00
@@ -26,7 +26,7 @@
 obj-$(CONFIG_PPC64) += setup-bus.o
 obj-$(CONFIG_MIPS) += setup-bus.o setup-irq.o
 obj-$(CONFIG_X86_VISWS) += setup-irq.o
-obj-$(CONFIG_PCI_USE_VECTOR) += msi.o
+obj-$(CONFIG_PCI_MSI) += msi.o
 
 # Cardbus & CompactPCI use setup-bus
 obj-$(CONFIG_HOTPLUG) += setup-bus.o
===== include/asm-i386/io_apic.h 1.16 vs edited =====
--- 1.16/include/asm-i386/io_apic.h	2004-06-18 00:49:30 -06:00
+++ edited/include/asm-i386/io_apic.h	2004-07-26 15:26:53 -06:00
@@ -13,7 +13,7 @@
 
 #ifdef CONFIG_X86_IO_APIC
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 static inline int use_pci_vector(void)	{return 1;}
 static inline void disable_edge_ioapic_vector(unsigned int vector) { }
 static inline void mask_and_ack_level_ioapic_vector(unsigned int vector) { }
===== include/asm-i386/mach-default/irq_vectors_limits.h 1.1 vs edited =====
--- 1.1/include/asm-i386/mach-default/irq_vectors_limits.h	2004-04-12 11:54:29 -06:00
+++ edited/include/asm-i386/mach-default/irq_vectors_limits.h	2004-07-26 15:26:54 -06:00
@@ -1,7 +1,7 @@
 #ifndef _ASM_IRQ_VECTORS_LIMITS_H
 #define _ASM_IRQ_VECTORS_LIMITS_H
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 #define NR_IRQS FIRST_SYSTEM_VECTOR
 #define NR_IRQ_VECTORS NR_IRQS
 #else
===== include/asm-x86_64/io_apic.h 1.10 vs edited =====
--- 1.10/include/asm-x86_64/io_apic.h	2003-12-31 22:27:45 -07:00
+++ edited/include/asm-x86_64/io_apic.h	2004-07-26 15:26:55 -06:00
@@ -13,7 +13,7 @@
 
 #ifdef CONFIG_X86_IO_APIC
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 static inline int use_pci_vector(void)	{return 1;}
 static inline void disable_edge_ioapic_vector(unsigned int vector) { }
 static inline void mask_and_ack_level_ioapic_vector(unsigned int vector) { }
===== include/asm-x86_64/irq.h 1.6 vs edited =====
--- 1.6/include/asm-x86_64/irq.h	2004-04-12 11:54:45 -06:00
+++ edited/include/asm-x86_64/irq.h	2004-07-26 15:26:55 -06:00
@@ -31,7 +31,7 @@
 
 #define FIRST_SYSTEM_VECTOR	0xef   /* duplicated in hw_irq.h */
 
-#ifdef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_MSI
 #define NR_IRQS FIRST_SYSTEM_VECTOR
 #define NR_IRQ_VECTORS NR_IRQS
 #else
===== include/linux/pci.h 1.130 vs edited =====
--- 1.130/include/linux/pci.h	2004-06-30 12:21:27 -06:00
+++ edited/include/linux/pci.h	2004-07-26 15:26:56 -06:00
@@ -831,7 +831,7 @@
 extern struct pci_dev *isa_bridge;
 #endif
 
-#ifndef CONFIG_PCI_USE_VECTOR
+#ifndef CONFIG_PCI_MSI
 static inline void pci_scan_msi_device(struct pci_dev *dev) {}
 static inline int pci_enable_msi(struct pci_dev *dev) {return -1;}
 static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev) {}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI
  2004-07-26 22:15 [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI Bjorn Helgaas
@ 2004-07-26 22:39 ` Roland Dreier
  2004-07-26 22:45   ` Roland Dreier
  2004-07-26 23:34   ` Bjorn Helgaas
  0 siblings, 2 replies; 8+ messages in thread
From: Roland Dreier @ 2004-07-26 22:39 UTC (permalink / raw)
  To: Bjorn Helgaas; +Cc: Andrew Morton, linux-kernel, Tom L Nguyen

    Bjorn> Rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI.  The
    Bjorn> "vector" terminology is architecture-dependent.  The PCI
    Bjorn> MSI interface actually deals with Linux IRQ numbers (i.e.,
    Bjorn> things you can pass to request_irq()), and we shouldn't
    Bjorn> confuse things by calling them "vectors" just because we're
    Bjorn> using MSI rather than an IOSAPIC.

Seems reasonable... however CONFIG_PCI_USE_VECTOR really has two
overloaded meanings (at least on i386).  First of all, as you say, it
does enable drivers to request MSI/MSI-X.  However, on i386
CONFIG_PCI_USE_VECTOR also changes how the APIC is set up (the most
visible effect of which is different interrupt numbers).

I would propose the following course of action:

 1) Merge Long's latest MSI/MSI-X patches (updated patches in
    http://gmane.linux.kernel/218830).  Without the new semantics of
    pci_disable_msi()/pci_disable_msix(), it's very difficult to use
    MSI/MSI-X in a device driver.
 2) Split the config options so we have an i386-specific
    CONFIG_PCI_USE_VECTOR and a generic CONFIG_PCI_MSI (with
    CONFIG_PCI_MSI depending on something like !I386 || CONFIG_PCI_USE_VECTOR)
    This would be an updated version of your patch.
 3) Make the code in drivers/pci/msi.c less Intel-specific -- instead
    of hard-coding Intel-specific addresses for vectors have the
    computation call into arch code.  This would be a fair amount of
    work and depends on documentation for non-Intel platforms that
    implement MSI/MSI-X -- should be easier as PCI Express comes out.

 - Roland

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI
  2004-07-26 22:39 ` Roland Dreier
@ 2004-07-26 22:45   ` Roland Dreier
  2004-07-26 23:34   ` Bjorn Helgaas
  1 sibling, 0 replies; 8+ messages in thread
From: Roland Dreier @ 2004-07-26 22:45 UTC (permalink / raw)
  To: Bjorn Helgaas; +Cc: Andrew Morton, linux-kernel, Tom L Nguyen

    Roland>  1) Merge Long's latest MSI/MSI-X patches (updated patches
    Roland> in http://gmane.linux.kernel/218830).

err... http://article.gmane.org/gmane.linux.kernel/218830

Sorry...

 - R.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI
  2004-07-26 22:39 ` Roland Dreier
  2004-07-26 22:45   ` Roland Dreier
@ 2004-07-26 23:34   ` Bjorn Helgaas
  2004-07-27  1:03     ` Roland Dreier
       [not found]     ` <20040726164324.683ff471.akpm@osdl.org>
  1 sibling, 2 replies; 8+ messages in thread
From: Bjorn Helgaas @ 2004-07-26 23:34 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Andrew Morton, linux-kernel, Tom L Nguyen

On Monday 26 July 2004 4:39 pm, Roland Dreier wrote:
> I would propose the following course of action:
> 
>  1) Merge Long's latest MSI/MSI-X patches (updated patches in
>     http://gmane.linux.kernel/218830).  Without the new semantics of
>     pci_disable_msi()/pci_disable_msix(), it's very difficult to use
>     MSI/MSI-X in a device driver.

That sounds fine to me.  There's nobody really using MSI yet, so
it can't break too much.

>  2) Split the config options so we have an i386-specific
>     CONFIG_PCI_USE_VECTOR and a generic CONFIG_PCI_MSI (with
>     CONFIG_PCI_MSI depending on something like !I386 || CONFIG_PCI_USE_VECTOR)
>     This would be an updated version of your patch.

Yup.  Nothing in MSI has changed since April, so I thought my patch
would be a reasonable no-risk first step.

>  3) Make the code in drivers/pci/msi.c less Intel-specific -- instead
>     of hard-coding Intel-specific addresses for vectors have the
>     computation call into arch code.  This would be a fair amount of
>     work and depends documentation for non-Intel platforms that
>     implement MSI/MSI-X -- should be easier as PCI Express comes out.

This is the bit I really want to get to.  In particular, I want to
support multiple interrupt vector spaces on ia64, because we're
running out of vectors.  I can't do that as long as MSI mucks
around with the arch-specific vector allocation.  (There's plenty
of ia64 code that needs to be cleaned up, too; it's not just MSI.)

I think there needs to be some arch interface to allocate/deallocate
Linux IRQ numbers (not interrupt vectors).  Then MSI can allocate
as many as it needs, and use yet another arch interface to translate
the Linux IRQ numbers to the appropriate address/data info to program
the device.

(A side note on this -- the MSI code in the tree uses "vector" where
it should use "irq".  For example, msi_alloc_vectors() really allocates
Linux IRQs, not vectors, because you can pass them to request_irq() and
friends.  Maybe Long's latest patch cleans this up a bit.)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI
  2004-07-26 23:34   ` Bjorn Helgaas
@ 2004-07-27  1:03     ` Roland Dreier
  2004-07-27  5:48       ` Zwane Mwaikambo
       [not found]     ` <20040726164324.683ff471.akpm@osdl.org>
  1 sibling, 1 reply; 8+ messages in thread
From: Roland Dreier @ 2004-07-27  1:03 UTC (permalink / raw)
  To: Bjorn Helgaas; +Cc: Andrew Morton, linux-kernel, Tom L Nguyen

    Roland> 1) Merge Long's latest MSI/MSI-X patches (updated patches
    Roland> in http://gmane.linux.kernel/218830).  Without the new
    Roland> semantics of pci_disable_msi()/pci_disable_msix(), it's
    Roland> very difficult to use MSI/MSI-X in a device driver.

    Bjorn> That sounds fine to me.  There's nobody really using MSI
    Bjorn> yet, so it can't break too much.

Yup... as far as I can tell there are no in-kernel users, and nobody
noticed that the MSI-X code didn't enable MSI-X properly until my
patch from last month.  My mthca driver:

    https://openib.org/svn/gen2/branches/roland-merge/src/linux-kernel/infiniband/hw/mthca

seems to be one of the first attempts to use MSI/MSI-X.  I have some
uncommitted changes to match Long's patch -- without the patch the
semantics of free_irq() releasing an MSI-X vector make my driver code
very awkward.

    Roland> 2) Split the config options so we have an i386-specific
    Roland> CONFIG_PCI_USE_VECTOR and a generic CONFIG_PCI_MSI (with
    Roland> CONFIG_PCI_MSI depending on something like !I386 ||
    Roland> CONFIG_PCI_USE_VECTOR) This would be an updated version of
    Roland> your patch.

    Bjorn> Yup.  Nothing in MSI has changed since April, so I thought
    Bjorn> my patch would be a reasonable no-risk first step.

I agree... I'd just really, really like to see Long's patch merged
first, since I've been waiting for it for a long time and my driver is
broken without it :)

    Roland> 3) Make the code in drivers/pci/msi.c less Intel-specific
    Roland> -- instead of hard-coding Intel-specific addresses for
    Roland> vectors have the computation call into arch code.  This
    Roland> would be a fair amount of work and depends documentation
    Roland> for non-Intel platforms that implement MSI/MSI-X -- should
    Roland> be easier as PCI Express comes out.

    Bjorn> This is the bit I really want to get to.  In particular, I
    Bjorn> want to support multiple interrupt vector spaces on ia64,
    Bjorn> because we're running out of vectors.  I can't do that as
    Bjorn> long as MSI mucks around with the arch-specific vector
    Bjorn> allocation.  (There's plenty of ia64 code that needs to be
    Bjorn> cleaned up, too; it's not just MSI.)

    Bjorn> I think there needs to be some arch interface to
    Bjorn> allocate/deallocate Linux IRQ numbers (not interrupt
    Bjorn> vectors).  Then MSI can allocate as many as it needs, and
    Bjorn> use yet another arch interface to translate the Linux IRQ
    Bjorn> numbers to the appropriate address/data info to program the
    Bjorn> device.

Sounds good, although I don't know much about the low-level details of
interrupt vectors on either i386 or ia64.  Some way of exposing which
interrupts are "closest" to which CPUs would be a good thing too.

One thing that I would be a little concerned about is making the
numbers in /proc/interrupts too divorced from the underlying platform
interrupt code -- it seems that ACPI debugging is hard enough as it
is.

 - Roland

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI
  2004-07-27  1:03     ` Roland Dreier
@ 2004-07-27  5:48       ` Zwane Mwaikambo
  0 siblings, 0 replies; 8+ messages in thread
From: Zwane Mwaikambo @ 2004-07-27  5:48 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Bjorn Helgaas, Andrew Morton, Linux Kernel, Tom L Nguyen

On Mon, 26 Jul 2004, Roland Dreier wrote:

>     Bjorn> This is the bit I really want to get to.  In particular, I
>     Bjorn> want to support multiple interrupt vector spaces on ia64,
>     Bjorn> because we're running out of vectors.  I can't do that as
>     Bjorn> long as MSI mucks around with the arch-specific vector
>     Bjorn> allocation.  (There's plenty of ia64 code that needs to be
>     Bjorn> cleaned up, too; it's not just MSI.)

Agreed, this was discussed earlier and shouldn't be too hard to work into
the current MSI code. Something akin to arrays of msi_desc indexed by
irq handling node depending on the source irq/bus information.

>     Bjorn> I think there needs to be some arch interface to
>     Bjorn> allocate/deallocate Linux IRQ numbers (not interrupt
>     Bjorn> vectors).  Then MSI can allocate as many as it needs, and
>     Bjorn> use yet another arch interface to translate the Linux IRQ
>     Bjorn> numbers to the appropriate address/data info to program the
>     Bjorn> device.
>
> Sounds good, although I don't know much about the low-level details of
> interrupt vectors on either i386 or ia64.  Some way of exposing which
> interrupts are "closest" to which CPUs would be a good thing too.

We can do this right now using the topology information, it's been done
before on NUMAQ.

> One thing that I would be a little concerned about is making the
> numbers in /proc/interrupts too divorced from the underlying platform
> interrupt code -- it seems that ACPI debugging is hard enough as it
> is.

Ok, some of those really are vectors; the thing is, for irqs > 15
(non-legacy) we pass the real vector around. This is done by setting
pci_dev->irq to the vector assigned to that irq line. It can get quite
confusing in places so variable naming is indeed important. But in
general, on i386, all irqs with CONFIG_PCI_MSI are vectors.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH][1/2] Stop using dev->bus->ops directly in msi.c
       [not found]           ` <20040727023927.GB24599@kroah.com>
@ 2004-07-28 17:08             ` Roland Dreier
  2004-07-28 17:11             ` [PATCH][2/2] MSI/MSI-X API updates Roland Dreier
  1 sibling, 0 replies; 8+ messages in thread
From: Roland Dreier @ 2004-07-28 17:08 UTC (permalink / raw)
  To: Greg KH; +Cc: Andrew Morton, bjorn.helgaas, tom.l.nguyen, linux-kernel

First half of the MSI rewrite: pure cleanup.  Use proper
pci_read_config_xxx() and pci_write_config_xxx() functions instead of
accessing raw dev->bus->ops.

Index: linux-2.6.8-rc2/drivers/pci/msi.c
===================================================================
--- linux-2.6.8-rc2.orig/drivers/pci/msi.c
+++ linux-2.6.8-rc2/drivers/pci/msi.c
@@ -64,15 +64,13 @@
 	case PCI_CAP_ID_MSI:
 	{
 		int		pos;
-		unsigned int	mask_bits;
+		u32		mask_bits;
 
 		pos = entry->mask_base;
-	        entry->dev->bus->ops->read(entry->dev->bus, entry->dev->devfn,
-				pos, 4, &mask_bits);
+		pci_read_config_dword(entry->dev, pos, &mask_bits);
 		mask_bits &= ~(1);
 		mask_bits |= flag;
-	        entry->dev->bus->ops->write(entry->dev->bus, entry->dev->devfn,
-				pos, 4, mask_bits);
+		pci_write_config_dword(entry->dev, pos, mask_bits);
 		break;
 	}
 	case PCI_CAP_ID_MSIX:
@@ -105,15 +103,13 @@
    		if (!(pos = pci_find_capability(entry->dev, PCI_CAP_ID_MSI)))
 			return;
 
-	        entry->dev->bus->ops->read(entry->dev->bus, entry->dev->devfn,
-			msi_lower_address_reg(pos), 4,
+		pci_read_config_dword(entry->dev, msi_lower_address_reg(pos),
 			&address.lo_address.value);
 		address.lo_address.value &= MSI_ADDRESS_DEST_ID_MASK;
 		address.lo_address.value |= (cpu_mask_to_apicid(cpu_mask) <<
 			MSI_TARGET_CPU_SHIFT);
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
-		entry->dev->bus->ops->write(entry->dev->bus, entry->dev->devfn,
-			msi_lower_address_reg(pos), 4,
+		pci_write_config_dword(entry->dev, msi_lower_address_reg(pos),
 			address.lo_address.value);
 		break;
 	}
@@ -383,51 +379,45 @@
 
 static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
 {
-	u32 control;
+	u16 control;
 
-	dev->bus->ops->read(dev->bus, dev->devfn,
-		msi_control_reg(pos), 2, &control);
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	if (type == PCI_CAP_ID_MSI) {
 		/* Set enabled bits to single MSI & enable MSI_enable bit */
 		msi_enable(control, 1);
-	        dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_control_reg(pos), 2, control);
+		pci_write_config_word(dev, msi_control_reg(pos), control);
 	} else {
 		msix_enable(control);
-	        dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_control_reg(pos), 2, control);
+		pci_write_config_word(dev, msi_control_reg(pos), control);
 	}
     	if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
 		/* PCI Express Endpoint device detected */
-		u32 cmd;
-	        dev->bus->ops->read(dev->bus, dev->devfn, PCI_COMMAND, 2, &cmd);
+		u16 cmd;
+		pci_read_config_word(dev, PCI_COMMAND, &cmd);
 		cmd |= PCI_COMMAND_INTX_DISABLE;
-	        dev->bus->ops->write(dev->bus, dev->devfn, PCI_COMMAND, 2, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
 }
 
 static void disable_msi_mode(struct pci_dev *dev, int pos, int type)
 {
-	u32 control;
+	u16 control;
 
-	dev->bus->ops->read(dev->bus, dev->devfn,
-		msi_control_reg(pos), 2, &control);
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	if (type == PCI_CAP_ID_MSI) {
 		/* Set enabled bits to single MSI & enable MSI_enable bit */
 		msi_disable(control);
-	        dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_control_reg(pos), 2, control);
+		pci_write_config_word(dev, msi_control_reg(pos), control);
 	} else {
 		msix_disable(control);
-	        dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_control_reg(pos), 2, control);
+		pci_write_config_word(dev, msi_control_reg(pos), control);
 	}
     	if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
 		/* PCI Express Endpoint device detected */
-		u32 cmd;
-	        dev->bus->ops->read(dev->bus, dev->devfn, PCI_COMMAND, 2, &cmd);
+		u16 cmd;
+		pci_read_config_word(dev, PCI_COMMAND, &cmd);
 		cmd &= ~PCI_COMMAND_INTX_DISABLE;
-	        dev->bus->ops->write(dev->bus, dev->devfn, PCI_COMMAND, 2, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
 }
 
@@ -480,14 +470,13 @@
 	struct msg_address address;
 	struct msg_data data;
 	int pos, vector;
-	u32 control;
+	u16 control;
 
    	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
 	if (!pos)
 		return -EINVAL;
 
-	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos),
-		2, &control);
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	if (control & PCI_MSI_FLAGS_ENABLE)
 		return 0;
 
@@ -521,27 +510,27 @@
 	msi_data_init(&data, vector);
 	entry->msi_attrib.current_cpu = ((address.lo_address.u.dest_id >>
 				MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK);
-	dev->bus->ops->write(dev->bus, dev->devfn, msi_lower_address_reg(pos),
-				4, address.lo_address.value);
+	pci_write_config_dword(dev, msi_lower_address_reg(pos),
+			address.lo_address.value);
 	if (is_64bit_address(control)) {
-		dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_upper_address_reg(pos), 4, address.hi_address);
-		dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_data_reg(pos, 1), 2, *((u32*)&data));
+		pci_write_config_dword(dev, 
+			msi_upper_address_reg(pos), address.hi_address);
+		pci_write_config_word(dev, 
+			msi_data_reg(pos, 1), *((u32*)&data));
 	} else
-		dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_data_reg(pos, 0), 2, *((u32*)&data));
+		pci_write_config_word(dev, 
+			msi_data_reg(pos, 0), *((u32*)&data));
 	if (entry->msi_attrib.maskbit) {
 		unsigned int maskbits, temp;
 		/* All MSIs are unmasked by default, Mask them all */
-	        dev->bus->ops->read(dev->bus, dev->devfn,
-			msi_mask_bits_reg(pos, is_64bit_address(control)), 4,
+		pci_read_config_dword(dev, 
+			msi_mask_bits_reg(pos, is_64bit_address(control)),
 			&maskbits);
 		temp = (1 << multi_msi_capable(control));
 		temp = ((temp - 1) & ~temp);
 		maskbits |= temp;
-		dev->bus->ops->write(dev->bus, dev->devfn,
-			msi_mask_bits_reg(pos, is_64bit_address(control)), 4,
+		pci_write_config_dword(dev, 
+			msi_mask_bits_reg(pos, is_64bit_address(control)),
 			maskbits);
 	}
 	attach_msi_entry(entry, vector);
@@ -571,7 +560,7 @@
 	struct msg_data data;
 	int vector = 0, pos, dev_msi_cap, i;
 	u32 phys_addr, table_offset;
-	u32 control;
+	u16 control;
 	u8 bir;
 	void *base;
 
@@ -580,8 +569,7 @@
 		return -EINVAL;
 
 	/* Request & Map MSI-X table region */
-	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos), 2,
-		&control);
+ 	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	if (control & PCI_MSIX_FLAGS_ENABLE)
 		return 0;
 
@@ -592,8 +580,8 @@
 	}
 
 	dev_msi_cap = multi_msix_capable(control);
-	dev->bus->ops->read(dev->bus, dev->devfn,
-		msix_table_offset_reg(pos), 4, &table_offset);
+ 	pci_read_config_dword(dev, msix_table_offset_reg(pos),
+ 		&table_offset);
 	bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
 	phys_addr = pci_resource_start (dev, bir);
 	phys_addr += (u32)(table_offset & ~PCI_MSIX_FLAGS_BIRMASK);
@@ -728,7 +716,8 @@
 	struct msg_address address;
 	struct msg_data data;
 	int i, offset, pos, dev_msi_cap, vector;
-	u32 low_address, control;
+	u32 low_address;
+	u16 control;
 	unsigned long base = 0L;
 	unsigned long flags;
 
@@ -742,8 +731,7 @@
 	spin_unlock_irqrestore(&msi_lock, flags);
 
    	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos),
-		2, &control);
+ 	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	dev_msi_cap = multi_msix_capable(control);
 	for (i = 1; i < dev_msi_cap; i++) {
 		if (!(low_address = readl(base + i * PCI_MSIX_ENTRY_SIZE)))
@@ -838,7 +826,7 @@
 	struct msi_desc *entry;
 	int i, head, pos, vec, free_vectors, alloc_vectors;
 	int *vectors = (int *)vector;
-	u32 control;
+	u16 control;
 	unsigned long flags;
 
 	if (!pci_msi_enable || !dev)
@@ -847,7 +835,7 @@
    	if (!(pos = pci_find_capability(dev, PCI_CAP_ID_MSIX)))
  		return -EINVAL;
 
-	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos), 			2, &control);
+ 	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	if (nvec > multi_msix_capable(control))
 		return -EINVAL;
 
@@ -980,14 +968,13 @@
 	if (type == PCI_CAP_ID_MSIX) {
 		int i, pos, dev_msi_cap;
 		u32 phys_addr, table_offset;
-		u32 control;
+		u16 control;
 		u8 bir;
 
    		pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-		dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos), 			2, &control);
+		pci_read_config_word(dev, msi_control_reg(pos), &control);
 		dev_msi_cap = multi_msix_capable(control);
-		dev->bus->ops->read(dev->bus, dev->devfn,
-			msix_table_offset_reg(pos), 4, &table_offset);
+		pci_read_config_dword(dev, msix_table_offset_reg(pos), &table_offset);
 		bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
 		phys_addr = pci_resource_start (dev, bir);
 		phys_addr += (u32)(table_offset & ~PCI_MSIX_FLAGS_BIRMASK);

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH][2/2] MSI/MSI-X API updates
       [not found]           ` <20040727023927.GB24599@kroah.com>
  2004-07-28 17:08             ` [PATCH][1/2] Stop using dev->bus->ops directly in msi.c Roland Dreier
@ 2004-07-28 17:11             ` Roland Dreier
  1 sibling, 0 replies; 8+ messages in thread
From: Roland Dreier @ 2004-07-28 17:11 UTC (permalink / raw)
  To: Greg KH; +Cc: Andrew Morton, bjorn.helgaas, tom.l.nguyen, linux-kernel

Second half of MSI rewrite: fix the API and update documentation.
Split enabling MSI and MSI-X to separate pci_enable_msi()/pci_disable_msi()
and pci_enable_msix()/pci_disable_msix() functions.  free_irq() no
longer has the side effect of freeing interrupt vectors (so a device
driver can do multiple request_irq()/free_irq() cycles on the same
MSI/MSI-X vector).

From: Tom L. Nguyen <tom.l.nguyen@intel.com>
Signed-off-by: Roland Dreier <roland@topspin.com>

Index: linux-2.6.8-rc2/Documentation/MSI-HOWTO.txt
===================================================================
--- linux-2.6.8-rc2.orig/Documentation/MSI-HOWTO.txt
+++ linux-2.6.8-rc2/Documentation/MSI-HOWTO.txt
@@ -3,13 +3,14 @@
 			10/03/2003
 	Revised Feb 12, 2004 by Martine Silbermann
 		email: Martine.Silbermann@hp.com
+	Revised Jun 25, 2004 by Tom L Nguyen
 
 1. About this guide
 
-This guide describes the basics of Message Signaled Interrupts(MSI), the
-advantages of using MSI over traditional interrupt mechanisms, and how
-to enable your driver to use MSI or MSI-X. Also included is a Frequently
-Asked Questions.
+This guide describes the basics of Message Signaled Interrupts (MSI), 
+the advantages of using MSI over traditional interrupt mechanisms, 
+and how to enable your driver to use MSI or MSI-X. Also included is 
+a Frequently Asked Questions.
 
 2. Copyright 2003 Intel Corporation
 
@@ -35,7 +36,7 @@
 the MSI/MSI-X capability structure in its PCI capability list. The
 device function may implement both the MSI capability structure and
 the MSI-X capability structure; however, the bus driver should not
-enable both, but instead enable only the MSI-X capability structure.
+enable both.
 
 The MSI capability structure contains Message Control register,
 Message Address register and Message Data register. These registers
@@ -86,7 +87,7 @@
 support for better interrupt performance.
 
 Using MSI enables the device functions to support two or more
-vectors, which can be configure to target different CPU's to
+vectors, which can be configured to target different CPU's to
 increase scalability.
 
 5. Configuring a driver to use MSI/MSI-X
@@ -95,26 +96,53 @@
 support this capability. The CONFIG_PCI_USE_VECTOR kernel option
 must be selected to enable MSI/MSI-X support.
 
-5.1 Including MSI support into the kernel
+5.1 Including MSI/MSI-X support into the kernel
 
-To allow MSI-Capable device drivers to selectively enable MSI (using
-pci_enable_msi as described below), the VECTOR based scheme needs to
-be enabled by setting CONFIG_PCI_USE_VECTOR.
+To allow MSI/MSI-X capable device drivers to selectively enable 
+MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described 
+below), the VECTOR based scheme needs to be enabled by setting 
+CONFIG_PCI_USE_VECTOR during kernel config.
 
 Since the target of the inbound message is the local APIC, providing
-CONFIG_PCI_USE_VECTOR is dependent on whether CONFIG_X86_LOCAL_APIC
-is enabled or not.
+CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_USE_VECTOR.
 
-int pci_enable_msi(struct pci_dev *)
+5.2 Configuring for MSI support
+
+Due to the non-contiguous fashion in vector assignment of the
+existing Linux kernel, this version does not support multiple
+messages regardless of whether a device function is capable of supporting
+more than one vector. To enable MSI on a device function's MSI
+capability structure requires a device driver to call the function 
+pci_enable_msi() explicitly. 
+
+5.2.1 API pci_enable_msi
+
+int pci_enable_msi(struct pci_dev *dev)
 
 With this new API, any existing device driver, which like to have
-MSI enabled on its device function, must call this explicitly. A
-successful call will initialize the MSI/MSI-X capability structure
-with ONE vector, regardless of whether the device function is
+MSI enabled on its device function, must call this API to enable MSI.
+A successful call will initialize the MSI capability structure
+with ONE vector, regardless of whether a device function is
 capable of supporting multiple messages. This vector replaces the
 pre-assigned dev->irq with a new MSI vector. To avoid the conflict
 of new assigned vector with existing pre-assigned vector requires
-the device driver to call this API before calling request_irq(...).
+a device driver to call this API before calling request_irq().
+
+5.2.2 API pci_disable_msi
+
+void pci_disable_msi(struct pci_dev *dev)
+
+This API should always be used to undo the effect of pci_enable_msi()
+when a device driver is unloading. This API restores dev->irq with
+the pre-assigned IOAPIC vector and switches a device's interrupt 
+mode to PCI pin-irq assertion/INTx emulation mode.    
+
+Note that a device driver should always call free_irq() on MSI vector
+it has done request_irq() on before calling this API. Failure to do  
+so results in a BUG_ON() and a device will be left with MSI enabled and
+leaks its vector. 
+
+5.2.3 MSI mode vs. legacy mode diagram
 
 The below diagram shows the events, which switches the interrupt
 mode on the MSI-capable device function between MSI mode and
@@ -124,105 +152,258 @@
 	|	     | <===============	| 			 |
 	| MSI MODE   |	  	     	| PIN-IRQ ASSERTION MODE |
 	| 	     | ===============>	|			 |
- 	 ------------	free_irq      	 ------------------------
+ 	 ------------	pci_disable_msi  ------------------------
 
-5.2 Configuring for MSI support
 
-Due to the non-contiguous fashion in vector assignment of the
-existing Linux kernel, this version does not support multiple
-messages regardless of the device function is capable of supporting
-more than one vector. The bus driver initializes only entry 0 of
-this capability if pci_enable_msi(...) is called successfully by
-the device driver.
+Figure 1.0 MSI Mode vs. Legacy Mode
+
+In Figure 1.0, a device operates by default in legacy mode. Legacy
+in this context means PCI pin-irq assertion or PCI-Express INTx 
+emulation. A successful MSI request (using pci_enable_msi()) switches 
+a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector
+stored in dev->irq will be saved by the PCI subsystem and a new 
+assigned MSI vector will replace dev->irq. 
+
+To return back to its default mode, a device driver should always call
+pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a
+device driver should always call free_irq() on MSI vector it has done
+request_irq() on before calling pci_disable_msi(). Failure to do so 
+results in a BUG_ON() and a device will be left with MSI enabled and
+leaks its vector. Otherwise, the PCI subsystem restores a device's
+dev->irq with a pre-assigned IOAPIC vector and marks released
+MSI vector as unused. 
+
+Once being marked as unused, there is no guarantee that the PCI 
+subsystem will reserve this MSI vector for a device. Depending on 
+the availability of current PCI vector resources and the number of 
+MSI/MSI-X requests from other drivers, this MSI may be re-assigned. 
+
+For the case where the PCI subsystem re-assigned this MSI vector to
+another driver, a request to switch back to MSI mode may result
+in being assigned a different MSI vector or a failure if no more 
+vectors are available.  
 
 5.3 Configuring for MSI-X support
 
-Both the MSI capability structure and the MSI-X capability structure
-share the same above semantics; however, due to the ability of the
-system software to configure each vector of the MSI-X capability
-structure with an independent message address and message data, the
-non-contiguous fashion in vector assignment of the existing Linux
-kernel has no impact on supporting multiple messages on an MSI-X
-capable device functions. By default, as mentioned above, ONE vector
-should be always allocated to the MSI-X capability structure at
-entry 0. The bus driver does not initialize other entries of the
-MSI-X table.
-
-Note that the PCI subsystem should have full control of a MSI-X
-table that resides in Memory Space. The software device driver
-should not access this table.
-
-To request for additional vectors, the device software driver should
-call function msi_alloc_vectors(). It is recommended that the
-software driver should call this function once during the
+Due to the ability of the system software to configure each vector of
+the MSI-X capability structure with an independent message address 
+and message data, the non-contiguous fashion in vector assignment of
+the existing Linux kernel has no impact on supporting multiple 
+messages on an MSI-X capable device functions. To enable MSI-X on 
+a device function's MSI-X capability structure requires its device 
+driver to call the function pci_enable_msix() explicitly.
+
+The function pci_enable_msix(), once invoked, enables either
+all or nothing, depending on the current availability of PCI vector
+resources. If the PCI vector resources are available for the number 
+of vectors requested by a device driver, this function will configure 
+the MSI-X table of the MSI-X capability structure of a device with
+requested messages. To emphasize this reason, for example, a device 
+may be capable for supporting the maximum of 32 vectors while its 
+software driver usually may request 4 vectors. It is recommended
+that the device driver should call this function once during the 
 initialization phase of the device driver.
 
-The function msi_alloc_vectors(), once invoked, enables either
-all or nothing, depending on the current availability of vector
-resources. If no vector resources are available, the device function
-still works with ONE vector. If the vector resources are available
-for the number of vectors requested by the driver, this function
-will reconfigure the MSI-X capability structure of the device with
-additional messages, starting from entry 1. To emphasize this
-reason, for example, the device may be capable for supporting the
-maximum of 32 vectors while its software driver usually may request
-4 vectors.
-
-For each vector, after this successful call, the device driver is
-responsible to call other functions like request_irq(), enable_irq(),
-etc. to enable this vector with its corresponding interrupt service
-handler. It is the device driver's choice to have all vectors shared
-the same interrupt service handler or each vector with a unique
-interrupt service handler.
-
-In addition to the function msi_alloc_vectors(), another function
-msi_free_vectors() is provided to allow the software driver to
-release a number of vectors back to the vector resources. Once
-invoked, the PCI subsystem disables (masks) each vector released.
-These vectors are no longer valid for the hardware device and its
-software driver to use. Like free_irq, it recommends that the
-device driver should also call msi_free_vectors to release all
-additional vectors previously requested.
-
-int msi_alloc_vectors(struct pci_dev *dev, int *vector, int nvec)
-
-This API enables the software driver to request the PCI subsystem
-for additional messages. Depending on the number of vectors
-available, the PCI subsystem enables either all or nothing.
+Unlike the function pci_enable_msi(), the function pci_enable_msix() 
+does not replace the pre-assigned IOAPIC dev->irq with a new MSI 
+vector because the PCI subsystem writes the 1:1 vector-to-entry mapping
+into the field vector of each element contained in a second argument. 
+Note that the pre-assigned IO-APIC dev->irq is valid only if the device
+operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt of
+using dev->irq by the device driver to request for interrupt service
+may result in unpredictable behavior.
+
+For each MSI-X vector granted, a device driver is responsible to call 
+other functions like request_irq(), enable_irq(), etc. to enable
+this vector with its corresponding interrupt service handler. It is 
+a device driver's choice to assign all vectors with the same 
+interrupt service handler or each vector with a unique interrupt 
+service handler. 
+
+5.3.1 Handling MMIO address space of MSI-X Table
+
+The PCI 3.0 specification has implementation notes that MMIO address
+space for a device's MSI-X structure should be isolated so that the 
+software system can set different page for controlling accesses to 
+the MSI-X structure. The implementation of MSI patch requires the PCI
+subsystem, not a device driver, to maintain full control of the MSI-X
+table/MSI-X PBA and MMIO address space of the MSI-X table/MSI-X PBA. 
+A device driver is prohibited from requesting the MMIO address space 
+of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem will fail 
+enabling MSI-X on its hardware device when it calls the function 
+pci_enable_msix().
+
+5.3.2 Handling MSI-X allocation
+
+Determining the number of MSI-X vectors allocated to a function is 
+dependent on the number of MSI capable devices and MSI-X capable
+devices populated in the system. The policy of allocating MSI-X 
+vectors to a function is defined as the following:
+
+#of MSI-X vectors allocated to a function = (x - y)/z where
+
+x = 	The number of available PCI vector resources by the time 
+	the device driver calls pci_enable_msix(). The PCI vector
+	resources is the sum of the number of unassigned vectors
+	(new) and the number of released vectors when any MSI/MSI-X
+	device driver switches its hardware device back to a legacy
+	mode or is hot-removed.	The number of unassigned vectors
+	may exclude some vectors reserved, as defined in parameter
+	NR_HP_RESERVED_VECTORS, for the case where the system is 
+	capable of supporting hot-add/hot-remove operations. Users
+	may change the value defined in NR_HP_RESERVED_VECTORS to
+	meet their specific needs. 
+
+y =	The number of MSI capable devices populated in the system.
+	This policy ensures that each MSI capable device has its
+	vector reserved to avoid the case where some MSI-X capable
+	drivers may attempt to claim all available vector resources.
+
+z =	The number of MSI-X capable devices populated in the system.
+	This policy ensures that maximum (x - y) is distributed 
+	evenly among MSI-X capable devices.	
+    
+Note that the PCI subsystem scans y and z during a bus enumeration.
+When the PCI subsystem completes configuring MSI/MSI-X capability
+structure of a device as requested by its device driver, y/z is 
+decremented accordingly.  
+
+5.3.3 Handling MSI-X shortages
+
+For the case where fewer MSI-X vectors are allocated to a function 
+than requested, the function pci_enable_msix() will return the
+maximum number of MSI-X vectors available to the caller. A device 
+driver may re-send its request with fewer or equal vectors indicated
+in a return. For example, if a device driver requests 5 vectors, but 
+the number of available vectors is 3 vectors, a value of 3 will be a 
+return as a result of pci_enable_msix() call. A function could be 
+designed for its driver to use only 3 MSI-X table entries as 
+different combinations as ABC--, A-B-C, A--CB, etc. Note that this 
+patch does not support multiple entries with the same vector. Such 
+attempt by a device driver to use 5 MSI-X table entries with 3 vectors
+as ABBCC, AABCC, BCCBA, etc will result as a failure by the function
+pci_enable_msix(). Below are the reasons why supporting multiple 
+entries with the same vector is an undesirable solution.
+	
+	- The PCI subsystem can not determine which entry, which
+	  generated the message, to mask/unmask MSI while handling
+	  software driver ISR. Attempting to walk through all MSI-X 
+	  table entries (2048 max) to mask/unmask any match vector 
+	  is an undesirable solution. 
+
+	- Walk through all MSI-X table entries (2048 max) to handle
+	  SMP affinity of any match vector is an undesirable solution. 
+
+5.3.4 API pci_enable_msix
+
+int pci_enable_msix(struct pci_dev *dev, u32 *entries, int nvec)
+
+This API enables a device driver to request the PCI subsystem
+for enabling MSI-X messages on its hardware device. Depending on 
+the availability of PCI vectors resources, the PCI subsystem enables
+either all or nothing.
 
 Argument dev points to the device (pci_dev) structure.
-Argument vector is a pointer of integer type. The number of
-elements is indicated in argument nvec.
+
+Argument entries is a pointer of unsigned integer type. The number of
+elements is indicated in argument nvec. The content of each element 
+will be mapped to the following struct defined in drivers/pci/msi.h.
+
+struct msix_entry {
+	u16 	vector; /* kernel uses to write alloc vector */
+	u16	entry; /* driver uses to specify entry */
+};
+
+A device driver is responsible for initializing the field entry of 
+each element with unique entry supported by MSI-X table. Otherwise, 
+-EINVAL will be returned as a result. A successful return of zero 
+indicates the PCI subsystem completes initializing each of requested 
+entries of the MSI-X table with message address and message data. 
+Last but not least, the PCI subsystem will write the 1:1 
+vector-to-entry mapping into the field vector of each element. A 
+device driver is responsible for keeping track of allocated MSI-X
+vectors in its internal data structure.
+
 Argument nvec is an integer indicating the number of messages
 requested.
-A return of zero indicates that the number of allocated vector is
-successfully allocated. Otherwise, indicate resources not
-available.
-
-int msi_free_vectors(struct pci_dev* dev, int *vector, int nvec)
-
-This API enables the software driver to inform the PCI subsystem
-that it is willing to release a number of vectors back to the
-MSI resource pool. Once invoked, the PCI subsystem disables each
-MSI-X entry associated with each vector stored in the argument 2.
-These vectors are no longer valid for the hardware device and
-its software driver to use.
 
-Argument dev points to the device (pci_dev) structure.
-Argument vector is a pointer of integer type. The number of
-elements is indicated in argument nvec.
-Argument nvec is an integer indicating the number of messages
-released.
-A return of zero indicates that the number of allocated vectors
-is successfully released. Otherwise, indicates a failure.
+A return of zero indicates that the number of MSI-X vectors is
+successfully allocated. A return of greater than zero indicates
+MSI-X vector shortage. Or a return of less than zero indicates
+a failure. This failure may be a result of duplicate entries 
+specified in second argument, or a result of no available vector,
+or a result of failing to initialize MSI-X table entries.
+
+5.3.5 API pci_disable_msix
+
+void pci_disable_msix(struct pci_dev *dev)
+
+This API should always be used to undo the effect of pci_enable_msix()
+when a device driver is unloading. Note that a device driver should 
+always call free_irq() on all MSI-X vectors it has done request_irq() 
+on before calling this API. Failure to do so results in a BUG_ON() and
+a device will be left with MSI-X enabled and leaks its vectors. 
+
+5.3.6 MSI-X mode vs. legacy mode diagram
+
+The below diagram shows the events, which switches the interrupt
+mode on the MSI-X capable device function between MSI-X mode and
+PIN-IRQ assertion mode (legacy).
+
+	 ------------   pci_enable_msix(,,n) ------------------------
+	|	     | <===============	    | 			     |
+	| MSI-X MODE |	  	     	    | PIN-IRQ ASSERTION MODE |
+	| 	     | ===============>	    |			     |
+ 	 ------------	pci_disable_msix     ------------------------
+
+Figure 2.0 MSI-X Mode vs. Legacy Mode
+
+In Figure 2.0, a device operates by default in legacy mode. A 
+successful MSI-X request (using pci_enable_msix()) switches a 
+device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector
+stored in dev->irq will be saved by the PCI subsystem; however, 
+unlike MSI mode, the PCI subsystem will not replace dev->irq with 
+assigned MSI-X vector because the PCI subsystem already writes the 1:1 
+vector-to-entry mapping into the field vector of each element 
+specified in second argument.
+
+To return back to its default mode, a device driver should always call
+pci_disable_msix() to undo the effect of pci_enable_msix(). Note that 
+a device driver should always call free_irq() on all MSI-X vectors it 
+has done request_irq() on before calling pci_disable_msix(). Failure 
+to do so results in a BUG_ON() and a device will be left with MSI-X
+enabled and leaks its vectors. Otherwise, the PCI subsystem switches a
+device function's interrupt mode from MSI-X mode to legacy mode and 
+marks all allocated MSI-X vectors as unused. 
+
+Once being marked as unused, there is no guarantee that the PCI 
+subsystem will reserve these MSI-X vectors for a device. Depending on 
+the availability of current PCI vector resources and the number of 
+MSI/MSI-X requests from other drivers, these MSI-X vectors may be 
+re-assigned. 
+
+For the case where the PCI subsystem re-assigned these MSI-X vectors
+to another driver, a request to switch back to MSI-X mode may result
+in being assigned another set of MSI-X vectors or a failure if no
+more vectors are available.  
+
+5.4 Handling function implementing both MSI and MSI-X capabilities
+
+For the case where a function implements both MSI and MSI-X 
+capabilities, the PCI subsystem enables a device to run either in MSI
+mode or MSI-X mode but not both. A device driver determines whether it
+wants MSI or MSI-X enabled on its hardware device. Once a device 
+driver requests for MSI, for example, it is prohibited to request for
+MSI-X; in other words, a device driver is not permitted to ping-pong
+between MSI mode and MSI-X mode during run-time.
 
-5.4 Hardware requirements for MSI support
-MSI support requires support from both system hardware and
+5.5 Hardware requirements for MSI/MSI-X support
+MSI/MSI-X support requires support from both system hardware and
 individual hardware device functions.
 
-5.4.1 System hardware support
+5.5.1 System hardware support
 Since the target of MSI address is the local APIC CPU, enabling
-MSI support in Linux kernel is dependent on whether existing
+MSI/MSI-X support in Linux kernel is dependent on whether existing
 system hardware supports local APIC. Users should verify their
 system whether it runs when CONFIG_X86_LOCAL_APIC=y.
 
@@ -231,14 +412,14 @@
 CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting
 CONFIG_PCI_USE_VECTOR enables the VECTOR based scheme and
 the option for MSI-capable device drivers to selectively enable
-MSI (using pci_enable_msi as described below).
+MSI/MSI-X.
 
-Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI
-vector is allocated new during runtime and MSI support does not
-depend on BIOS support. This key independency enables MSI support
-on future IOxAPIC free platform.
+Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X
+vector is allocated new during runtime and MSI/MSI-X support does not
+depend on BIOS support. This key independency enables MSI/MSI-X 
+support on future IOxAPIC free platform.
 
-5.4.2 Device hardware support
+5.5.2 Device hardware support
 The hardware device function supports MSI by indicating the
 MSI/MSI-X capability structure on its PCI capability list. By
 default, this capability structure will not be initialized by
@@ -249,17 +430,19 @@
 MSI-capable hardware is responsible for whether calling
 pci_enable_msi or not. A return of zero indicates the kernel
 successfully initializes the MSI/MSI-X capability structure of the
-device funtion. The device function is now running on MSI mode.
+device funtion. The device function is now running on MSI/MSI-X mode.
 
-5.5 How to tell whether MSI is enabled on device function
+5.6 How to tell whether MSI/MSI-X is enabled on device function
 
-At the driver level, a return of zero from pci_enable_msi(...)
-indicates to the device driver that its device function is
-initialized successfully and ready to run in MSI mode.
+At the driver level, a return of zero from the function call of 
+pci_enable_msi()/pci_enable_msix() indicates to a device driver that
+its device function is initialized successfully and ready to run in 
+MSI/MSI-X mode.
 
 At the user level, users can use command 'cat /proc/interrupts'
-to display the vector allocated for the device and its interrupt
-mode, as shown below.
+to display the vector allocated for a device and its interrupt
+MSI/MSI-X mode ("PCI-MSI"/"PCI-MSI-X"). The output below shows MSI mode is
+enabled on a SCSI Adaptec 39320D Ultra320.  
 
            CPU0       CPU1
   0:     324639          0    IO-APIC-edge  timer
Index: linux-2.6.8-rc2/drivers/pci/msi.c
===================================================================
--- linux-2.6.8-rc2.orig/drivers/pci/msi.c
+++ linux-2.6.8-rc2/drivers/pci/msi.c
@@ -154,13 +154,25 @@
 
 static unsigned int startup_msi_irq_wo_maskbit(unsigned int vector)
 {
+	struct msi_desc *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[vector];
+	if (!entry || !entry->dev) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return 0;
+	}
+	entry->msi_attrib.state = 1;	/* Mark it active */
+	spin_unlock_irqrestore(&msi_lock, flags);
+	
 	return 0;	/* never anything pending */
 }
 
-static void pci_disable_msi(unsigned int vector);
+static void release_msi(unsigned int vector);
 static void shutdown_msi_irq(unsigned int vector)
 {
-	pci_disable_msi(vector);
+	release_msi(vector);
 }
 
 #define shutdown_msi_irq_wo_maskbit	shutdown_msi_irq
@@ -175,6 +187,18 @@
 
 static unsigned int startup_msi_irq_w_maskbit(unsigned int vector)
 {
+	struct msi_desc *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[vector];
+	if (!entry || !entry->dev) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return 0;
+	}
+	entry->msi_attrib.state = 1;	/* Mark it active */
+	spin_unlock_irqrestore(&msi_lock, flags);
+	
 	unmask_MSI_irq(vector);
 	return 0;	/* never anything pending */
 }
@@ -196,7 +220,7 @@
  * which implement the MSI-X Capability Structure.
  */
 static struct hw_interrupt_type msix_irq_type = {
-	.typename	= "PCI MSI-X",
+	.typename	= "PCI-MSI-X",
 	.startup	= startup_msi_irq_w_maskbit,
 	.shutdown	= shutdown_msi_irq_w_maskbit,
 	.enable		= enable_msi_irq_w_maskbit,
@@ -212,7 +236,7 @@
  * Mask-and-Pending Bits.
  */
 static struct hw_interrupt_type msi_irq_w_maskbit_type = {
-	.typename	= "PCI MSI",
+	.typename	= "PCI-MSI",
 	.startup	= startup_msi_irq_w_maskbit,
 	.shutdown	= shutdown_msi_irq_w_maskbit,
 	.enable		= enable_msi_irq_w_maskbit,
@@ -228,7 +252,7 @@
  * Mask-and-Pending Bits.
  */
 static struct hw_interrupt_type msi_irq_wo_maskbit_type = {
-	.typename	= "PCI MSI",
+	.typename	= "PCI-MSI",
 	.startup	= startup_msi_irq_wo_maskbit,
 	.shutdown	= shutdown_msi_irq_wo_maskbit,
 	.enable		= enable_msi_irq_wo_maskbit,
@@ -261,6 +285,7 @@
 	msi_address->lo_address.value |= (MSI_TARGET_CPU << MSI_TARGET_CPU_SHIFT);
 }
 
+static int msi_free_vector(struct pci_dev* dev, int vector, int reassign);
 static int assign_msi_vector(void)
 {
 	static int new_vector_avail = 1;
@@ -274,6 +299,8 @@
 	spin_lock_irqsave(&msi_lock, flags);
 
 	if (!new_vector_avail) {
+		int free_vector = 0;
+		
 		/*
 	 	 * vector_irq[] = -1 indicates that this specific vector is:
 	 	 * - assigned for MSI (since MSI have no associated IRQ) or
@@ -290,13 +317,34 @@
 		for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) {
 			if (vector_irq[vector] != 0)
 				continue;
-			vector_irq[vector] = -1;
-			nr_released_vectors--;
-			spin_unlock_irqrestore(&msi_lock, flags);
-			return vector;
+			free_vector = vector;
+			if (!msi_desc[vector]) 
+			      	break;	
+			else
+				continue;
 		}
+		if (!free_vector) {
+			spin_unlock_irqrestore(&msi_lock, flags);
+			return -EBUSY;
+		}	
+		vector_irq[free_vector] = -1;
+		nr_released_vectors--;
 		spin_unlock_irqrestore(&msi_lock, flags);
-		return -EBUSY;
+		if (msi_desc[free_vector] != NULL) {
+			struct pci_dev *dev;
+			int tail;
+			
+			/* free all linked vectors before re-assign */
+			do {
+				spin_lock_irqsave(&msi_lock, flags);
+				dev = msi_desc[free_vector]->dev;
+				tail = msi_desc[free_vector]->link.tail;
+				spin_unlock_irqrestore(&msi_lock, flags);
+				msi_free_vector(dev, tail, 1);
+			} while (free_vector != tail);
+		}
+	       	
+		return free_vector;
 	}
 	vector = assign_irq_vector(AUTO_ASSIGN);
 	last_alloc_vector = vector;
@@ -329,6 +377,15 @@
 		printk(KERN_INFO "WARNING: MSI INIT FAILURE\n");
 		return status;
 	}
+	last_alloc_vector = assign_irq_vector(AUTO_ASSIGN);
+	if (last_alloc_vector < 0) {
+		pci_msi_enable = 0;
+		printk(KERN_INFO "WARNING: ALL VECTORS ARE BUSY\n");
+		status = -EBUSY;
+		return status;
+	}
+	vector_irq[last_alloc_vector] = 0;
+	nr_released_vectors++;
 	printk(KERN_INFO "MSI INIT SUCCESS\n");
 
 	return status;
@@ -421,7 +478,7 @@
 	}
 }
 
-static int msi_lookup_vector(struct pci_dev *dev)
+static int msi_lookup_vector(struct pci_dev *dev, int type)
 {
 	int vector;
 	unsigned long flags;
@@ -429,11 +486,11 @@
 	spin_lock_irqsave(&msi_lock, flags);
 	for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) {
 		if (!msi_desc[vector] || msi_desc[vector]->dev != dev ||
-			msi_desc[vector]->msi_attrib.entry_nr ||
+			msi_desc[vector]->msi_attrib.type != type ||
 			msi_desc[vector]->msi_attrib.default_vector != dev->irq)
-			continue;	/* not entry 0, skip */
+			continue;	
 		spin_unlock_irqrestore(&msi_lock, flags);
-		/* This pre-assigned entry-0 MSI vector for this device
+		/* This pre-assigned MSI vector for this device
 		   already exits. Override dev->irq with this vector */
 		dev->irq = vector;
 		return 0;
@@ -448,10 +505,9 @@
 	if (!dev)
 		return;
 
-   	if (pci_find_capability(dev, PCI_CAP_ID_MSIX) > 0) {
-		nr_reserved_vectors++;
+   	if (pci_find_capability(dev, PCI_CAP_ID_MSIX) > 0) 
 		nr_msix_devices++;
-	} else if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0)
+	else if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0)
 		nr_reserved_vectors++;
 }
 
@@ -473,18 +529,7 @@
 	u16 control;
 
    	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
-	if (!pos)
-		return -EINVAL;
-
 	pci_read_config_word(dev, msi_control_reg(pos), &control);
-	if (control & PCI_MSI_FLAGS_ENABLE)
-		return 0;
-
-	if (!msi_lookup_vector(dev)) {
-		/* Lookup Sucess */
-		enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
-		return 0;
-	}
 	/* MSI Entry Initialization */
 	if (!(entry = alloc_msi_entry()))
 		return -ENOMEM;
@@ -493,11 +538,14 @@
 		kmem_cache_free(msi_cachep, entry);
 		return -EBUSY;
 	}
+	entry->link.head = vector;
+	entry->link.tail = vector;
 	entry->msi_attrib.type = PCI_CAP_ID_MSI;
+	entry->msi_attrib.state = 0;			/* Mark it not active */
 	entry->msi_attrib.entry_nr = 0;
 	entry->msi_attrib.maskbit = is_mask_bit_support(control);
-	entry->msi_attrib.default_vector = dev->irq;
-	dev->irq = vector;	/* save default pre-assigned ioapic vector */
+	entry->msi_attrib.default_vector = dev->irq;	/* Save IOAPIC IRQ */
+	dev->irq = vector;	
 	entry->dev = dev;
 	if (is_mask_bit_support(control)) {
 		entry->mask_base = msi_mask_bits_reg(pos,
@@ -545,237 +593,219 @@
  * @dev: pointer to the pci_dev data structure of MSI-X device function
  *
  * Setup the MSI-X capability structure of device funtion with a
- * single MSI-X vector. A return of zero indicates the successful setup
- * of an entry zero with the new MSI-X vector or non-zero for otherwise.
- * To request for additional MSI-X vectors, the device drivers are
- * required to utilize the following supported APIs:
- * 1) msi_alloc_vectors(...) for requesting one or more MSI-X vectors
- * 2) msi_free_vectors(...) for releasing one or more MSI-X vectors
- *    back to PCI subsystem before calling free_irq(...)
+ * set of requested MSI-X vectors. A return of zero indicates the successful
+ * setup of the requested entries with allocated vectors; non-zero otherwise.
  **/
-static int msix_capability_init(struct pci_dev	*dev)
+static int msix_capability_init(struct pci_dev *dev, 
+				struct msix_entry *entries, int nvec)
 {
-	struct msi_desc *entry;
+	struct msi_desc *head = NULL, *tail = NULL, *entry = NULL;
 	struct msg_address address;
 	struct msg_data data;
-	int vector = 0, pos, dev_msi_cap, i;
+	int vector, pos, i, j, nr_entries, temp = 0;
 	u32 phys_addr, table_offset;
-	u16 control;
+ 	u16 control;
 	u8 bir;
 	void *base;
-
+	
    	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	if (!pos)
-		return -EINVAL;
-
 	/* Request & Map MSI-X table region */
  	pci_read_config_word(dev, msi_control_reg(pos), &control);
-	if (control & PCI_MSIX_FLAGS_ENABLE)
-		return 0;
-
-	if (!msi_lookup_vector(dev)) {
-		/* Lookup Sucess */
-		enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
-		return 0;
-	}
-
-	dev_msi_cap = multi_msix_capable(control);
+	nr_entries = multi_msix_capable(control);
  	pci_read_config_dword(dev, msix_table_offset_reg(pos),
  		&table_offset);
 	bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
 	phys_addr = pci_resource_start (dev, bir);
 	phys_addr += (u32)(table_offset & ~PCI_MSIX_FLAGS_BIRMASK);
 	if (!request_mem_region(phys_addr,
-		dev_msi_cap * PCI_MSIX_ENTRY_SIZE,
-		"MSI-X iomap Failure"))
+		nr_entries * PCI_MSIX_ENTRY_SIZE,
+		"MSI-X vector table"))
 		return -ENOMEM;
-	base = ioremap_nocache(phys_addr, dev_msi_cap * PCI_MSIX_ENTRY_SIZE);
-	if (base == NULL)
-		goto free_region;
-	/* MSI Entry Initialization */
-	entry = alloc_msi_entry();
-	if (!entry)
-		goto free_iomap;
-	if ((vector = get_msi_vector(dev)) < 0)
-		goto free_entry;
-
-	entry->msi_attrib.type = PCI_CAP_ID_MSIX;
-	entry->msi_attrib.entry_nr = 0;
-	entry->msi_attrib.maskbit = 1;
-	entry->msi_attrib.default_vector = dev->irq;
-	dev->irq = vector;	/* save default pre-assigned ioapic vector */
-	entry->dev = dev;
-	entry->mask_base = (unsigned long)base;
-	/* Replace with MSI handler */
-	irq_handler_init(PCI_CAP_ID_MSIX, vector, 1);
-	/* Configure MSI-X capability structure */
-	msi_address_init(&address);
-	msi_data_init(&data, vector);
-	entry->msi_attrib.current_cpu = ((address.lo_address.u.dest_id >>
-				MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK);
-	writel(address.lo_address.value, base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-	writel(address.hi_address, base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-	writel(*(u32*)&data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
-	/* Initialize all entries from 1 up to 0 */
-	for (i = 1; i < dev_msi_cap; i++) {
-		writel(0, base + i * PCI_MSIX_ENTRY_SIZE +
+	base = ioremap_nocache(phys_addr, nr_entries * PCI_MSIX_ENTRY_SIZE);
+	if (base == NULL) {
+		release_mem_region(phys_addr, nr_entries * PCI_MSIX_ENTRY_SIZE);
+		return -ENOMEM;
+	}
+	/* MSI-X Table Initialization */
+	for (i = 0; i < nvec; i++) {
+		entry = alloc_msi_entry();
+		if (!entry)
+			break;	
+		if ((vector = get_msi_vector(dev)) < 0)
+			break; 	
+
+ 		j = entries[i].entry;
+ 		entries[i].vector = vector;
+		entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+ 		entry->msi_attrib.state = 0;		/* Mark it not active */
+		entry->msi_attrib.entry_nr = j;
+		entry->msi_attrib.maskbit = 1;
+		entry->msi_attrib.default_vector = dev->irq;
+		entry->dev = dev;
+		entry->mask_base = (unsigned long)base;
+		if (!head) {
+			entry->link.head = vector;
+			entry->link.tail = vector;
+			head = entry;
+		} else {
+			entry->link.head = temp;
+			entry->link.tail = tail->link.tail;
+			tail->link.tail = vector;
+			head->link.head = vector;
+		}
+		temp = vector;
+		tail = entry;
+		/* Replace with MSI-X handler */
+		irq_handler_init(PCI_CAP_ID_MSIX, vector, 1);
+		/* Configure MSI-X capability structure */
+		msi_address_init(&address);
+		msi_data_init(&data, vector);
+		entry->msi_attrib.current_cpu = 
+			((address.lo_address.u.dest_id >>
+			MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK);
+		writel(address.lo_address.value, 
+			base + j * PCI_MSIX_ENTRY_SIZE +
 			PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-		writel(0, base + i * PCI_MSIX_ENTRY_SIZE +
+		writel(address.hi_address, 
+			base + j * PCI_MSIX_ENTRY_SIZE +
 			PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-		writel(0, base + i * PCI_MSIX_ENTRY_SIZE +
+		writel(*(u32*)&data, 
+			base + j * PCI_MSIX_ENTRY_SIZE +
 			PCI_MSIX_ENTRY_DATA_OFFSET);
+		attach_msi_entry(entry, vector);
 	}
-	attach_msi_entry(entry, vector);
-	/* Set MSI enabled bits	 */
+	if (i != nvec) {
+		i--;
+		for (; i >= 0; i--) {
+			vector = (entries + i)->vector;
+			msi_free_vector(dev, vector, 0);
+			(entries + i)->vector = 0;
+		}
+		return -EBUSY;
+	}
+	/* Set MSI-X enabled bits */
 	enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
-
+	
 	return 0;
-
-free_entry:
-	kmem_cache_free(msi_cachep, entry);
-free_iomap:
-	iounmap(base);
-free_region:
-	release_mem_region(phys_addr, dev_msi_cap * PCI_MSIX_ENTRY_SIZE);
-
-	return ((vector < 0) ? -EBUSY : -ENOMEM);
 }
 
 /**
- * pci_enable_msi - configure device's MSI(X) capability structure
- * @dev: pointer to the pci_dev data structure of MSI(X) device function
+ * pci_enable_msi - configure device's MSI capability structure
+ * @dev: pointer to the pci_dev data structure of MSI device function
  *
- * Setup the MSI/MSI-X capability structure of device function with
- * a single MSI(X) vector upon its software driver call to request for
- * MSI(X) mode enabled on its hardware device function. A return of zero
- * indicates the successful setup of an entry zero with the new MSI(X)
+ * Setup the MSI capability structure of device function with
+ * a single MSI vector upon its software driver call to request for
+ * MSI mode enabled on its hardware device function. A return of zero
+ * indicates the successful setup of an entry zero with the new MSI
  * vector or non-zero for otherwise.
  **/
 int pci_enable_msi(struct pci_dev* dev)
 {
-	int status = -EINVAL;
+	int pos, temp = dev->irq, status = -EINVAL;
+	u16 control;
 
 	if (!pci_msi_enable || !dev)
  		return status;
 
-	if (msi_init() < 0)
-		return -ENOMEM;
+	if ((status = msi_init()) < 0)
+		return status;
 
-	if ((status = msix_capability_init(dev)) == -EINVAL)
-		status = msi_capability_init(dev);
-	if (!status)
-		nr_reserved_vectors--;
+   	if (!(pos = pci_find_capability(dev, PCI_CAP_ID_MSI)))
+		return -EINVAL;
+
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
+	if (control & PCI_MSI_FLAGS_ENABLE)
+		return 0;			/* Already in MSI mode */
+
+	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) {
+		/* Lookup Success */
+		unsigned long flags;
+
+		spin_lock_irqsave(&msi_lock, flags);
+		if (!vector_irq[dev->irq]) {
+			msi_desc[dev->irq]->msi_attrib.state = 0; 	
+			vector_irq[dev->irq] = -1;			
+			nr_released_vectors--;
+			spin_unlock_irqrestore(&msi_lock, flags);
+			enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
+			return 0;
+		}
+		spin_unlock_irqrestore(&msi_lock, flags);
+		dev->irq = temp;
+	}
+	/* Check whether driver already requested for MSI-X vectors */
+   	if ((pos = pci_find_capability(dev, PCI_CAP_ID_MSIX)) > 0 && 
+		!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
+			printk(KERN_INFO "Can't enable MSI. Device already had MSI-X vectors assigned\n");
+			dev->irq = temp;
+			return -EINVAL;	
+	}		
+	status = msi_capability_init(dev);
+	if (!status) {
+   		if (!pos) 
+			nr_reserved_vectors--;	/* Only MSI capable */
+		else if (nr_msix_devices > 0) 
+			nr_msix_devices--;	/* Both MSI and MSI-X capable, 
+						   but choose enabling MSI */
+	}
 
 	return status;
 }
 
-static int msi_free_vector(struct pci_dev* dev, int vector);
-static void pci_disable_msi(unsigned int vector)
+void pci_disable_msi(struct pci_dev* dev)
 {
-	int head, tail, type, default_vector;
 	struct msi_desc *entry;
-	struct pci_dev *dev;
+	int pos, default_vector;
+	u16 control;
 	unsigned long flags;
 
+   	if (!dev || !(pos = pci_find_capability(dev, PCI_CAP_ID_MSI)))
+		return;
+
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
+	if (!(control & PCI_MSI_FLAGS_ENABLE)) 
+		return;
+	
 	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[vector];
-	if (!entry || !entry->dev) {
+	entry = msi_desc[dev->irq];
+	if (!entry || !entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) {
 		spin_unlock_irqrestore(&msi_lock, flags);
 		return;
 	}
-	dev = entry->dev;
-	type = entry->msi_attrib.type;
-	head = entry->link.head;
-	tail = entry->link.tail;
-	default_vector = entry->msi_attrib.default_vector;
-	spin_unlock_irqrestore(&msi_lock, flags);
-
-	disable_msi_mode(dev, pci_find_capability(dev, type), type);
-	/* Restore dev->irq to its default pin-assertion vector */
-	dev->irq = default_vector;
-	if (type == PCI_CAP_ID_MSIX && head != tail) {
-		/* Bad driver, which do not call msi_free_vectors before exit.
-		   We must do a cleanup here */
-		while (1) {
-			spin_lock_irqsave(&msi_lock, flags);
-			entry = msi_desc[vector];
-			head = entry->link.head;
-			tail = entry->link.tail;
-			spin_unlock_irqrestore(&msi_lock, flags);
-			if (tail == head)
-				break;
-			if (msi_free_vector(dev, entry->link.tail))
-				break;
-		}
+	if (entry->msi_attrib.state) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		printk(KERN_DEBUG "Driver[%d:%d:%d] unloaded wo doing free_irq on vector->%d\n", 
+		dev->bus->number, PCI_SLOT(dev->devfn),	PCI_FUNC(dev->devfn),
+		dev->irq);
+		BUG_ON(entry->msi_attrib.state > 0);
+	} else {
+		vector_irq[dev->irq] = 0; /* free it */
+		nr_released_vectors++;
+		default_vector = entry->msi_attrib.default_vector;
+		spin_unlock_irqrestore(&msi_lock, flags);
+		/* Restore dev->irq to its default pin-assertion vector */
+		dev->irq = default_vector;
+		disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI),
+					PCI_CAP_ID_MSI);
 	}
 }
 
-static int msi_alloc_vector(struct pci_dev* dev, int head)
+static void release_msi(unsigned int vector)
 {
 	struct msi_desc *entry;
-	struct msg_address address;
-	struct msg_data data;
-	int i, offset, pos, dev_msi_cap, vector;
-	u32 low_address;
-	u16 control;
-	unsigned long base = 0L;
 	unsigned long flags;
 
 	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[dev->irq];
-	if (!entry) {
-		spin_unlock_irqrestore(&msi_lock, flags);
-		return -EINVAL;
-	}
-	base = entry->mask_base;
+	entry = msi_desc[vector];
+	if (entry && entry->dev) 
+		entry->msi_attrib.state = 0;	/* Mark it not active */
 	spin_unlock_irqrestore(&msi_lock, flags);
-
-   	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
- 	pci_read_config_word(dev, msi_control_reg(pos), &control);
-	dev_msi_cap = multi_msix_capable(control);
-	for (i = 1; i < dev_msi_cap; i++) {
-		if (!(low_address = readl(base + i * PCI_MSIX_ENTRY_SIZE)))
-			 break;
-	}
-	if (i >= dev_msi_cap)
-		return -EINVAL;
-
-	/* MSI Entry Initialization */
-	if (!(entry = alloc_msi_entry()))
-		return -ENOMEM;
-
-	if ((vector = get_new_vector()) < 0) {
-		kmem_cache_free(msi_cachep, entry);
-		return vector;
-	}
-	entry->msi_attrib.type = PCI_CAP_ID_MSIX;
-	entry->msi_attrib.entry_nr = i;
-	entry->msi_attrib.maskbit = 1;
-	entry->dev = dev;
-	entry->link.head = head;
-	entry->mask_base = base;
-	irq_handler_init(PCI_CAP_ID_MSIX, vector, 1);
-	/* Configure MSI-X capability structure */
-	msi_address_init(&address);
-	msi_data_init(&data, vector);
-	entry->msi_attrib.current_cpu = ((address.lo_address.u.dest_id >>
-				MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK);
-	offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
-	writel(address.lo_address.value, base + offset +
-		PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-	writel(address.hi_address, base + offset +
-		PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-	writel(*(u32*)&data, base + offset + PCI_MSIX_ENTRY_DATA_OFFSET);
-	writel(1, base + offset + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-	attach_msi_entry(entry, vector);
-
-	return vector;
 }
 
-static int msi_free_vector(struct pci_dev* dev, int vector)
+static int msi_free_vector(struct pci_dev* dev, int vector, int reassign)
 {
 	struct msi_desc *entry;
-	int entry_nr, type;
+	int head, entry_nr, type;
 	unsigned long base = 0L;
 	unsigned long flags;
 
@@ -787,66 +817,177 @@
 	}
 	type = entry->msi_attrib.type;
 	entry_nr = entry->msi_attrib.entry_nr;
+	head = entry->link.head;
 	base = entry->mask_base;
-	if (entry->link.tail != entry->link.head) {
-		msi_desc[entry->link.head]->link.tail = entry->link.tail;
-		if (entry->link.tail)
-			msi_desc[entry->link.tail]->link.head = entry->link.head;
-	}
+	msi_desc[entry->link.head]->link.tail = entry->link.tail;
+	msi_desc[entry->link.tail]->link.head = entry->link.head;
 	entry->dev = NULL;
-	vector_irq[vector] = 0;
-	nr_released_vectors++;
+	if (!reassign) {
+		vector_irq[vector] = 0;
+		nr_released_vectors++;
+	}
 	msi_desc[vector] = NULL;
 	spin_unlock_irqrestore(&msi_lock, flags);
 
 	kmem_cache_free(msi_cachep, entry);
-	if (type == PCI_CAP_ID_MSIX) {
-		int offset;
 
-		offset = entry_nr * PCI_MSIX_ENTRY_SIZE;
-		writel(1, base + offset + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-		writel(0, base + offset + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+	if (type == PCI_CAP_ID_MSIX) {
+		if (!reassign) 
+			writel(1, base + 
+				entry_nr * PCI_MSIX_ENTRY_SIZE +
+				PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+		
+		if (head == vector) {
+			/* 
+			 * Detect last MSI-X vector to be released.
+			 * Release the MSI-X memory-mapped table.
+			 */
+			int pos, nr_entries;
+			u32 phys_addr, table_offset;
+			u16 control;
+			u8 bir;
+
+   			pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+			pci_read_config_word(dev, msi_control_reg(pos), 
+				&control);
+			nr_entries = multi_msix_capable(control);
+			pci_read_config_dword(dev, msix_table_offset_reg(pos),
+				&table_offset);
+			bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+			phys_addr = pci_resource_start (dev, bir);
+			phys_addr += (u32)(table_offset & 
+				~PCI_MSIX_FLAGS_BIRMASK);
+			iounmap((void*)base);
+			release_mem_region(phys_addr, 
+				nr_entries * PCI_MSIX_ENTRY_SIZE);
+		}
 	}
 
 	return 0;
 }
 
+static int reroute_msix_table(int head, struct msix_entry *entries, int *nvec)
+{
+	int vector = head, tail = 0;
+	int i = 0, j = 0, nr_entries = 0;
+	unsigned long base = 0L;
+	unsigned long flags;
+		
+	spin_lock_irqsave(&msi_lock, flags);
+	while (head != tail) {
+		nr_entries++;
+		tail = msi_desc[vector]->link.tail;
+		if (entries[0].entry == msi_desc[vector]->msi_attrib.entry_nr)
+			j = vector;
+		vector = tail;
+	}
+	if (*nvec > nr_entries) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		*nvec = nr_entries;
+		return -EINVAL;
+	}
+	vector = ((j > 0) ? j : head);
+	for (i = 0; i < *nvec; i++) {
+		j = msi_desc[vector]->msi_attrib.entry_nr;
+		msi_desc[vector]->msi_attrib.state = 0;	/* Mark it not active */
+		vector_irq[vector] = -1;		/* Mark it busy */	
+		nr_released_vectors--;
+		entries[i].vector = vector;
+		if (j != (entries + i)->entry) {
+			base = msi_desc[vector]->mask_base;
+			msi_desc[vector]->msi_attrib.entry_nr =	
+				(entries + i)->entry;
+			writel( readl(base + j * PCI_MSIX_ENTRY_SIZE +
+				PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET), base + 
+				(entries + i)->entry * PCI_MSIX_ENTRY_SIZE +
+				PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+			writel(	readl(base + j * PCI_MSIX_ENTRY_SIZE +
+				PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET), base + 
+				(entries + i)->entry * PCI_MSIX_ENTRY_SIZE +
+				PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+			writel( (readl(base + j * PCI_MSIX_ENTRY_SIZE +
+				PCI_MSIX_ENTRY_DATA_OFFSET) & 0xff00) | vector,
+				base + (entries+i)->entry*PCI_MSIX_ENTRY_SIZE + 
+				PCI_MSIX_ENTRY_DATA_OFFSET);
+		}
+		vector = msi_desc[vector]->link.tail;
+	}
+	spin_unlock_irqrestore(&msi_lock, flags);
+	
+	return 0;
+}
+
 /**
- * msi_alloc_vectors - allocate additional MSI-X vectors
+ * pci_enable_msix - configure device's MSI-X capability structure
  * @dev: pointer to the pci_dev data structure of MSI-X device function
- * @vector: pointer to an array of new allocated MSI-X vectors
+ * @entries: pointer to an array of MSI-X entries
  * @nvec: number of MSI-X vectors requested for allocation by device driver
  *
- * Allocate additional MSI-X vectors requested by device driver. A
- * return of zero indicates the successful setup of MSI-X capability
- * structure with new allocated MSI-X vectors or non-zero for otherwise.
+ * Setup the MSI-X capability structure of device function with the number
+ * of requested vectors upon its software driver call to request for
+ * MSI-X mode enabled on its hardware device function. A return of zero
+ * indicates the successful configuration of MSI-X capability structure 
+ * with new allocated MSI-X vectors. A return of < 0 indicates a failure. 
+ * Or a return of > 0 indicates that driver request is exceeding the number
+ * of vectors available. Driver should use the returned value to re-send 
+ * its request.
  **/
-int msi_alloc_vectors(struct pci_dev* dev, int *vector, int nvec)
+int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
 {
-	struct msi_desc *entry;
-	int i, head, pos, vec, free_vectors, alloc_vectors;
-	int *vectors = (int *)vector;
+	int status, pos, nr_entries, free_vectors;
+	int i, j, temp;
 	u16 control;
 	unsigned long flags;
 
-	if (!pci_msi_enable || !dev)
+	if (!pci_msi_enable || !dev || !entries)
  		return -EINVAL;
-
+	
+	if ((status = msi_init()) < 0)
+		return status;
+	
    	if (!(pos = pci_find_capability(dev, PCI_CAP_ID_MSIX)))
  		return -EINVAL;
-
- 	pci_read_config_word(dev, msi_control_reg(pos), &control);
-	if (nvec > multi_msix_capable(control))
-		return -EINVAL;
-
-	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[dev->irq];
-	if (!entry || entry->dev != dev ||		/* legal call */
-	   entry->msi_attrib.type != PCI_CAP_ID_MSIX || /* must be MSI-X */
-	   entry->link.head != entry->link.tail) {	/* already multi */
-		spin_unlock_irqrestore(&msi_lock, flags);
+		
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
+	if (control & PCI_MSIX_FLAGS_ENABLE)
+		return -EINVAL;			/* Already in MSI-X mode */
+	
+	nr_entries = multi_msix_capable(control);
+	if (nvec > nr_entries)
 		return -EINVAL;
+	
+	/* Check for any invalid entries */
+	for (i = 0; i < nvec; i++) {
+		if (entries[i].entry >= nr_entries)
+			return -EINVAL;		/* invalid entry */
+		for (j = i + 1; j < nvec; j++) {
+			if (entries[i].entry == entries[j].entry)
+				return -EINVAL;	/* duplicate entry */
+		}
+	}
+	temp = dev->irq;
+	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
+		/* Lookup Success */
+		nr_entries = nvec;	
+		/* Reroute MSI-X table */
+		if (reroute_msix_table(dev->irq, entries, &nr_entries)) {
+			/* #requested > #previous-assigned */
+			dev->irq = temp;
+			return nr_entries;
+		}
+		dev->irq = temp;
+		enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
+		return 0;
 	}
+	/* Check whether driver already requested for MSI vector */
+   	if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0 &&
+		!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) {
+		printk(KERN_INFO "Can't enable MSI-X. Device already had MSI vector assigned\n");
+		dev->irq = temp;
+		return -EINVAL;	
+	}
+	
+	spin_lock_irqsave(&msi_lock, flags);
 	/*
 	 * msi_lock is provided to ensure that enough vectors resources are
 	 * available before granting.
@@ -862,71 +1003,65 @@
 		free_vectors /= nr_msix_devices;
 	spin_unlock_irqrestore(&msi_lock, flags);
 
-	if (nvec > free_vectors)
-		return -EBUSY;
+	if (nvec > free_vectors) {
+		if (free_vectors > 0)
+			return free_vectors;
+		else
+			return -EBUSY;
+	}	
 
-	alloc_vectors = 0;
-	head = dev->irq;
-	for (i = 0; i < nvec; i++) {
-		if ((vec = msi_alloc_vector(dev, head)) < 0)
-			break;
-		*(vectors + i) = vec;
-		head = vec;
-		alloc_vectors++;
-	}
-	if (alloc_vectors != nvec) {
-		for (i = 0; i < alloc_vectors; i++) {
-			vec = *(vectors + i);
-			msi_free_vector(dev, vec);
-		}
-		spin_lock_irqsave(&msi_lock, flags);
-		msi_desc[dev->irq]->link.tail = msi_desc[dev->irq]->link.head;
-		spin_unlock_irqrestore(&msi_lock, flags);
-		return -EBUSY;
-	}
-	if (nr_msix_devices > 0)
+	status = msix_capability_init(dev, entries, nvec);
+	if (!status && nr_msix_devices > 0)
 		nr_msix_devices--;
-
-	return 0;
+	
+	return status;
 }
 
-/**
- * msi_free_vectors - reclaim MSI-X vectors to unused state
- * @dev: pointer to the pci_dev data structure of MSI-X device function
- * @vector: pointer to an array of released MSI-X vectors
- * @nvec: number of MSI-X vectors requested for release by device driver
- *
- * Reclaim MSI-X vectors released by device driver to unused state,
- * which may be used later on. A return of zero indicates the
- * success or non-zero for otherwise. Device driver should call this
- * before calling function free_irq.
- **/
-int msi_free_vectors(struct pci_dev* dev, int *vector, int nvec)
+void pci_disable_msix(struct pci_dev* dev)
 {
-	struct msi_desc *entry;
-	int i;
-	unsigned long flags;
+	int pos, temp;
+	u16 control;
+	
+   	if (!dev || !(pos = pci_find_capability(dev, PCI_CAP_ID_MSIX)))
+		return;
 
-	if (!pci_msi_enable)
- 		return -EINVAL;
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
+	if (!(control & PCI_MSIX_FLAGS_ENABLE)) 
+		return;
+	
+	temp = dev->irq;
+	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
+		int state, vector, head, tail = 0, warning = 0;
+		unsigned long flags;
 
-	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[dev->irq];
-	if (!entry || entry->dev != dev ||
-	   	entry->msi_attrib.type != PCI_CAP_ID_MSIX ||
-		entry->link.head == entry->link.tail) {	/* Nothing to free */
+		vector = head = dev->irq;
+		spin_lock_irqsave(&msi_lock, flags);
+		while (head != tail) {
+			state = msi_desc[vector]->msi_attrib.state;
+			if (state) 
+				warning = 1;
+			else {
+				vector_irq[vector] = 0; /* free it */
+				nr_released_vectors++;
+			}
+			tail = msi_desc[vector]->link.tail;
+			vector = tail;
+		}
 		spin_unlock_irqrestore(&msi_lock, flags);
-		return -EINVAL;
-	}
-	spin_unlock_irqrestore(&msi_lock, flags);
+		if (warning) {
+			dev->irq = temp;
+			printk(KERN_DEBUG "Driver[%d:%d:%d] unloaded wo doing free_irq on all vectors\n", 
+			dev->bus->number, PCI_SLOT(dev->devfn),	
+			PCI_FUNC(dev->devfn));
+			BUG_ON(warning > 0);
+		} else {
+			dev->irq = temp;
+			disable_msi_mode(dev, 
+				pci_find_capability(dev, PCI_CAP_ID_MSIX),
+				PCI_CAP_ID_MSIX);
 
-	for (i = 0; i < nvec; i++) {
-		if (*(vector + i) == dev->irq)
-			continue;/* Don't free entry 0 if mistaken by driver */
-		msi_free_vector(dev, *(vector + i));
+		}
 	}
-
-	return 0;
 }
 
 /**
@@ -940,61 +1075,73 @@
  **/
 void msi_remove_pci_irq_vectors(struct pci_dev* dev)
 {
-	struct msi_desc *entry;
-	int type, temp;
+	int state, pos, temp;
 	unsigned long flags;
-
+	
 	if (!pci_msi_enable || !dev)
  		return;
-
-   	if (!pci_find_capability(dev, PCI_CAP_ID_MSI)) {
-   		if (!pci_find_capability(dev, PCI_CAP_ID_MSIX))
-			return;
-	}
-	temp = dev->irq;
-	if (msi_lookup_vector(dev))
-		return;
-
-	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[dev->irq];
-	if (!entry || entry->dev != dev) {
+	
+	temp = dev->irq;		/* Save IOAPIC IRQ */
+   	if ((pos = pci_find_capability(dev, PCI_CAP_ID_MSI)) > 0 &&
+		!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) {
+		spin_lock_irqsave(&msi_lock, flags);
+		state = msi_desc[dev->irq]->msi_attrib.state;
 		spin_unlock_irqrestore(&msi_lock, flags);
-		return;
-	}
-	type = entry->msi_attrib.type;
-	spin_unlock_irqrestore(&msi_lock, flags);
-
-	msi_free_vector(dev, dev->irq);
-	if (type == PCI_CAP_ID_MSIX) {
-		int i, pos, dev_msi_cap;
-		u32 phys_addr, table_offset;
-		u16 control;
-		u8 bir;
-
-   		pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-		pci_read_config_word(dev, msi_control_reg(pos), &control);
-		dev_msi_cap = multi_msix_capable(control);
-		pci_read_config_dword(dev, msix_table_offset_reg(pos), &table_offset);
-		bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
-		phys_addr = pci_resource_start (dev, bir);
-		phys_addr += (u32)(table_offset & ~PCI_MSIX_FLAGS_BIRMASK);
-		for (i = FIRST_DEVICE_VECTOR; i < NR_IRQS; i++) {
+		if (state) { 	
+			printk(KERN_DEBUG "Driver[%d:%d:%d] unloaded wo doing free_irq on vector->%d\n", 
+			dev->bus->number, PCI_SLOT(dev->devfn),	
+			PCI_FUNC(dev->devfn), dev->irq);
+			BUG_ON(state > 0);
+		} else /* Release MSI vector assigned to this device */
+			msi_free_vector(dev, dev->irq, 0);
+		dev->irq = temp;		/* Restore IOAPIC IRQ */
+	}
+   	if ((pos = pci_find_capability(dev, PCI_CAP_ID_MSIX)) > 0 &&
+		!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
+		int vector, head, tail = 0, warning = 0;
+		unsigned long base = 0L;
+		
+		vector = head = dev->irq;
+		while (head != tail) {
 			spin_lock_irqsave(&msi_lock, flags);
-			if (!msi_desc[i] || msi_desc[i]->dev != dev) {
-				spin_unlock_irqrestore(&msi_lock, flags);
-				continue;
-			}
+			state = msi_desc[vector]->msi_attrib.state;
+			tail = msi_desc[vector]->link.tail;
+			base = msi_desc[vector]->mask_base;
 			spin_unlock_irqrestore(&msi_lock, flags);
-			msi_free_vector(dev, i);
+			if (state)  	
+				warning = 1;
+			else if (vector != head) /* Release MSI-X vector */
+				msi_free_vector(dev, vector, 0);
+			vector = tail;
+		}
+		msi_free_vector(dev, vector, 0);
+		if (warning) {
+			/* Force to release the MSI-X memory-mapped table */
+			u32 phys_addr, table_offset;
+			u16 control;
+			u8 bir;
+
+			pci_read_config_word(dev, msi_control_reg(pos), 
+				&control);
+			pci_read_config_dword(dev, msix_table_offset_reg(pos),
+				&table_offset);
+			bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+			phys_addr = pci_resource_start (dev, bir);
+			phys_addr += (u32)(table_offset & 
+				~PCI_MSIX_FLAGS_BIRMASK);
+			iounmap((void*)base);
+			release_mem_region(phys_addr, PCI_MSIX_ENTRY_SIZE *
+				multi_msix_capable(control));
+			printk(KERN_DEBUG "Driver[%d:%d:%d] unloaded wo doing free_irq on all vectors\n", 
+				dev->bus->number, PCI_SLOT(dev->devfn),	
+				PCI_FUNC(dev->devfn));
+			BUG_ON(warning > 0);
 		}
-		writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-		iounmap((void*)entry->mask_base);
-		release_mem_region(phys_addr, dev_msi_cap * PCI_MSIX_ENTRY_SIZE);
+		dev->irq = temp;		/* Restore IOAPIC IRQ */
 	}
-	dev->irq = temp;
-	nr_reserved_vectors++;
 }
 
 EXPORT_SYMBOL(pci_enable_msi);
-EXPORT_SYMBOL(msi_alloc_vectors);
-EXPORT_SYMBOL(msi_free_vectors);
+EXPORT_SYMBOL(pci_disable_msi);
+EXPORT_SYMBOL(pci_enable_msix);
+EXPORT_SYMBOL(pci_disable_msix);
Index: linux-2.6.8-rc2/drivers/pci/msi.h
===================================================================
--- linux-2.6.8-rc2.orig/drivers/pci/msi.h
+++ linux-2.6.8-rc2/drivers/pci/msi.h
@@ -140,7 +140,8 @@
 	struct {
 		__u8	type	: 5; 	/* {0: unused, 5h:MSI, 11h:MSI-X} */
 		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
-		__u8	reserved: 2; 	/* reserved			  */
+		__u8	state	: 1; 	/* {0: free, 1: busy}		  */
+		__u8	reserved: 1; 	/* reserved			  */
 		__u8	entry_nr;    	/* specific enabled entry 	  */
 		__u8	default_vector; /* default pre-assigned vector    */
 		__u8	current_cpu; 	/* current destination cpu	  */
Index: linux-2.6.8-rc2/include/linux/pci.h
===================================================================
--- linux-2.6.8-rc2.orig/include/linux/pci.h
+++ linux-2.6.8-rc2/include/linux/pci.h
@@ -831,16 +831,27 @@
 extern struct pci_dev *isa_bridge;
 #endif
 
+struct msix_entry {
+	u16 	vector;	/* kernel uses to write allocated vector */
+	u16	entry;	/* driver uses to specify entry, OS writes */
+};
+
 #ifndef CONFIG_PCI_USE_VECTOR
 static inline void pci_scan_msi_device(struct pci_dev *dev) {}
 static inline int pci_enable_msi(struct pci_dev *dev) {return -1;}
+static inline void pci_disable_msi(struct pci_dev *dev) {}
+static inline int pci_enable_msix(struct pci_dev* dev, 
+	struct msix_entry *entries, int nvec) {return -1;}
+static inline void pci_disable_msix(struct pci_dev *dev) {}
 static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev) {}
 #else
 extern void pci_scan_msi_device(struct pci_dev *dev);
 extern int pci_enable_msi(struct pci_dev *dev);
+extern void pci_disable_msi(struct pci_dev *dev);
+extern int pci_enable_msix(struct pci_dev* dev, 
+	struct msix_entry *entries, int nvec);
+extern void pci_disable_msix(struct pci_dev *dev);
 extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
-extern int msi_alloc_vectors(struct pci_dev* dev, int *vector, int nvec);
-extern int msi_free_vectors(struct pci_dev* dev, int *vector, int nvec);
 #endif
 
 #endif /* CONFIG_PCI */

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2004-07-28 17:18 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-07-26 22:15 [PATCH] rename CONFIG_PCI_USE_VECTOR to CONFIG_PCI_MSI Bjorn Helgaas
2004-07-26 22:39 ` Roland Dreier
2004-07-26 22:45   ` Roland Dreier
2004-07-26 23:34   ` Bjorn Helgaas
2004-07-27  1:03     ` Roland Dreier
2004-07-27  5:48       ` Zwane Mwaikambo
     [not found]     ` <20040726164324.683ff471.akpm@osdl.org>
     [not found]       ` <524qnu5j8l.fsf@topspin.com>
     [not found]         ` <20040726183917.65927925.akpm@osdl.org>
     [not found]           ` <20040727023927.GB24599@kroah.com>
2004-07-28 17:08             ` [PATCH][1/2] Stop using dev->bus->ops directly in msi.c Roland Dreier
2004-07-28 17:11             ` [PATCH][2/2] MSI/MSI-X API updates Roland Dreier

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.