* [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-05 13:27 ` Matthew Wilcox
@ 2008-07-05 13:34 ` Matthew Wilcox
2008-07-07 2:05 ` Michael Ellerman
2008-07-05 13:34 ` [PATCH 2/4] PCI: Support multiple MSI Matthew Wilcox
` (3 subsequent siblings)
4 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-05 13:34 UTC (permalink / raw)
To: linux-pci
Cc: kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, michael, linux-kernel, Matthew Wilcox,
Matthew Wilcox
This first part simply changes the msi_attrib data structure to store
how many vectors have been allocated. In order to do this, I shrink the
'type' from 5 bits to 2 and rename it to _type to catch any unsuspecting
users.
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
drivers/pci/msi.c | 41 ++++++++++++++++++++++++-----------------
drivers/pci/msi.h | 4 ----
include/linux/msi.h | 6 +++++-
3 files changed, 29 insertions(+), 22 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 8c61304..92992a8 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -106,11 +106,11 @@ static void msix_flush_writes(unsigned int irq)
entry = get_irq_msi(irq);
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
+ switch (entry->msi_attrib._type) {
+ case MSI_ATTRIB:
/* nothing to do */
break;
- case PCI_CAP_ID_MSIX:
+ case MSIX_ATTRIB:
{
int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
@@ -129,8 +129,8 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
entry = get_irq_msi(irq);
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
+ switch (entry->msi_attrib._type) {
+ case MSI_ATTRIB:
if (entry->msi_attrib.maskbit) {
int pos;
u32 mask_bits;
@@ -144,7 +144,7 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
msi_set_enable(entry->dev, !flag);
}
break;
- case PCI_CAP_ID_MSIX:
+ case MSIX_ATTRIB:
{
int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
@@ -162,8 +162,8 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
void read_msi_msg(unsigned int irq, struct msi_msg *msg)
{
struct msi_desc *entry = get_irq_msi(irq);
- switch(entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
+ switch(entry->msi_attrib._type) {
+ case MSI_ATTRIB:
{
struct pci_dev *dev = entry->dev;
int pos = entry->msi_attrib.pos;
@@ -182,7 +182,7 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
msg->data = data;
break;
}
- case PCI_CAP_ID_MSIX:
+ case MSIX_ATTRIB:
{
void __iomem *base;
base = entry->mask_base +
@@ -201,11 +201,17 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
void write_msi_msg(unsigned int irq, struct msi_msg *msg)
{
struct msi_desc *entry = get_irq_msi(irq);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
+ switch (entry->msi_attrib._type) {
+ case MSI_ATTRIB:
{
struct pci_dev *dev = entry->dev;
int pos = entry->msi_attrib.pos;
+ u16 msgctl;
+
+ pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
+ msgctl &= ~PCI_MSI_FLAGS_QSIZE;
+ msgctl |= entry->msi_attrib.multiple << 4;
+ pci_write_config_word(dev, msi_control_reg(pos), msgctl);
pci_write_config_dword(dev, msi_lower_address_reg(pos),
msg->address_lo);
@@ -220,7 +226,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
}
break;
}
- case PCI_CAP_ID_MSIX:
+ case MSIX_ATTRIB:
{
void __iomem *base;
base = entry->mask_base +
@@ -359,7 +365,7 @@ static int msi_capability_init(struct pci_dev *dev)
if (!entry)
return -ENOMEM;
- entry->msi_attrib.type = PCI_CAP_ID_MSI;
+ entry->msi_attrib._type = MSI_ATTRIB;
entry->msi_attrib.is_64 = is_64bit_address(control);
entry->msi_attrib.entry_nr = 0;
entry->msi_attrib.maskbit = is_mask_bit_support(control);
@@ -446,7 +452,7 @@ static int msix_capability_init(struct pci_dev *dev,
break;
j = entries[i].entry;
- entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+ entry->msi_attrib._type = MSIX_ATTRIB;
entry->msi_attrib.is_64 = 1;
entry->msi_attrib.entry_nr = j;
entry->msi_attrib.maskbit = 1;
@@ -589,12 +595,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
u32 mask = entry->msi_attrib.maskbits_mask;
msi_set_mask_bits(dev->irq, mask, ~mask);
}
- if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+ if (!entry->dev || entry->msi_attrib._type != MSI_ATTRIB)
return;
/* Restore dev->irq to its default pin-assertion irq */
dev->irq = entry->msi_attrib.default_irq;
}
+
void pci_disable_msi(struct pci_dev* dev)
{
struct msi_desc *entry;
@@ -605,7 +612,7 @@ void pci_disable_msi(struct pci_dev* dev)
pci_msi_shutdown(dev);
entry = list_entry(dev->msi_list.next, struct msi_desc, list);
- if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+ if (!entry->dev || entry->msi_attrib._type != MSI_ATTRIB)
return;
msi_free_irqs(dev);
@@ -624,7 +631,7 @@ static int msi_free_irqs(struct pci_dev* dev)
arch_teardown_msi_irqs(dev);
list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
- if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) {
+ if (entry->msi_attrib._type == MSIX_ATTRIB) {
writel(1, entry->mask_base + entry->msi_attrib.entry_nr
* PCI_MSIX_ENTRY_SIZE
+ PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 3898f52..b72e0bd 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -22,12 +22,8 @@
#define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE
#define multi_msi_capable(control) \
(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
-#define multi_msi_enable(control, num) \
- control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
#define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT))
#define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT))
-#define msi_enable(control, num) multi_msi_enable(control, num); \
- control |= PCI_MSI_FLAGS_ENABLE
#define msix_table_offset_reg(base) (base + 0x04)
#define msix_pba_offset_reg(base) (base + 0x08)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 8f29392..d322148 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -15,9 +15,13 @@ extern void unmask_msi_irq(unsigned int irq);
extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
+#define MSI_ATTRIB 1
+#define MSIX_ATTRIB 2
+
struct msi_desc {
struct {
- __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
+ __u8 _type : 2; /* {0: unused, 1:MSI, 2:MSI-X} */
+ __u8 multiple: 3; /* log2 number of messages */
__u8 maskbit : 1; /* mask-pending bit supported ? */
__u8 masked : 1;
__u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
--
1.5.5.4
^ permalink raw reply related [flat|nested] 43+ messages in thread* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-05 13:34 ` [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc Matthew Wilcox
@ 2008-07-07 2:05 ` Michael Ellerman
2008-07-07 2:41 ` Matthew Wilcox
0 siblings, 1 reply; 43+ messages in thread
From: Michael Ellerman @ 2008-07-07 2:05 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
[-- Attachment #1: Type: text/plain, Size: 7435 bytes --]
On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> This first part simply changes the msi_attrib data structure to store
> how many vectors have been allocated. In order to do this, I shrink the
> 'type' from 5 bits to 2 and rename it to _type to catch any unsuspecting
> users.
Please don't, it significantly uglifies the code IMHO. Just add a new
field for the size, I'd rather call it qsize to match the register.
If you're worried about bloating msi_desc, there's several fields in
there that are per-device not per-desc, so we could do another patch to
move them into pci_dev or something hanging off it, eg.
pci_dev->msi_info rather than storing them in every desc.
cheers
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 8c61304..92992a8 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -106,11 +106,11 @@ static void msix_flush_writes(unsigned int irq)
>
> entry = get_irq_msi(irq);
> BUG_ON(!entry || !entry->dev);
> - switch (entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch (entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> /* nothing to do */
> break;
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
> PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
> @@ -129,8 +129,8 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
>
> entry = get_irq_msi(irq);
> BUG_ON(!entry || !entry->dev);
> - switch (entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch (entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> if (entry->msi_attrib.maskbit) {
> int pos;
> u32 mask_bits;
> @@ -144,7 +144,7 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
> msi_set_enable(entry->dev, !flag);
> }
> break;
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
> PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
> @@ -162,8 +162,8 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
> void read_msi_msg(unsigned int irq, struct msi_msg *msg)
> {
> struct msi_desc *entry = get_irq_msi(irq);
> - switch(entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch(entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> {
> struct pci_dev *dev = entry->dev;
> int pos = entry->msi_attrib.pos;
> @@ -182,7 +182,7 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
> msg->data = data;
> break;
> }
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> void __iomem *base;
> base = entry->mask_base +
> @@ -201,11 +201,17 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
> void write_msi_msg(unsigned int irq, struct msi_msg *msg)
> {
> struct msi_desc *entry = get_irq_msi(irq);
> - switch (entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch (entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> {
> struct pci_dev *dev = entry->dev;
> int pos = entry->msi_attrib.pos;
> + u16 msgctl;
> +
> + pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
> + msgctl &= ~PCI_MSI_FLAGS_QSIZE;
> + msgctl |= entry->msi_attrib.multiple << 4;
> + pci_write_config_word(dev, msi_control_reg(pos), msgctl);
>
> pci_write_config_dword(dev, msi_lower_address_reg(pos),
> msg->address_lo);
> @@ -220,7 +226,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
> }
> break;
> }
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> void __iomem *base;
> base = entry->mask_base +
> @@ -359,7 +365,7 @@ static int msi_capability_init(struct pci_dev *dev)
> if (!entry)
> return -ENOMEM;
>
> - entry->msi_attrib.type = PCI_CAP_ID_MSI;
> + entry->msi_attrib._type = MSI_ATTRIB;
> entry->msi_attrib.is_64 = is_64bit_address(control);
> entry->msi_attrib.entry_nr = 0;
> entry->msi_attrib.maskbit = is_mask_bit_support(control);
> @@ -446,7 +452,7 @@ static int msix_capability_init(struct pci_dev *dev,
> break;
>
> j = entries[i].entry;
> - entry->msi_attrib.type = PCI_CAP_ID_MSIX;
> + entry->msi_attrib._type = MSIX_ATTRIB;
> entry->msi_attrib.is_64 = 1;
> entry->msi_attrib.entry_nr = j;
> entry->msi_attrib.maskbit = 1;
> @@ -589,12 +595,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
> u32 mask = entry->msi_attrib.maskbits_mask;
> msi_set_mask_bits(dev->irq, mask, ~mask);
> }
> - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
> + if (!entry->dev || entry->msi_attrib._type != MSI_ATTRIB)
> return;
>
> /* Restore dev->irq to its default pin-assertion irq */
> dev->irq = entry->msi_attrib.default_irq;
> }
> +
> void pci_disable_msi(struct pci_dev* dev)
> {
> struct msi_desc *entry;
> @@ -605,7 +612,7 @@ void pci_disable_msi(struct pci_dev* dev)
> pci_msi_shutdown(dev);
>
> entry = list_entry(dev->msi_list.next, struct msi_desc, list);
> - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
> + if (!entry->dev || entry->msi_attrib._type != MSI_ATTRIB)
> return;
>
> msi_free_irqs(dev);
> @@ -624,7 +631,7 @@ static int msi_free_irqs(struct pci_dev* dev)
> arch_teardown_msi_irqs(dev);
>
> list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
> - if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) {
> + if (entry->msi_attrib._type == MSIX_ATTRIB) {
> writel(1, entry->mask_base + entry->msi_attrib.entry_nr
> * PCI_MSIX_ENTRY_SIZE
> + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
> index 3898f52..b72e0bd 100644
> --- a/drivers/pci/msi.h
> +++ b/drivers/pci/msi.h
> @@ -22,12 +22,8 @@
> #define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE
> #define multi_msi_capable(control) \
> (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
> -#define multi_msi_enable(control, num) \
> - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
> #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT))
> #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT))
> -#define msi_enable(control, num) multi_msi_enable(control, num); \
> - control |= PCI_MSI_FLAGS_ENABLE
>
> #define msix_table_offset_reg(base) (base + 0x04)
> #define msix_pba_offset_reg(base) (base + 0x08)
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index 8f29392..d322148 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -15,9 +15,13 @@ extern void unmask_msi_irq(unsigned int irq);
> extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
> extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
>
> +#define MSI_ATTRIB 1
> +#define MSIX_ATTRIB 2
> +
> struct msi_desc {
> struct {
> - __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
> + __u8 _type : 2; /* {0: unused, 1:MSI, 2:MSI-X} */
> + __u8 multiple: 3; /* log2 number of messages */
> __u8 maskbit : 1; /* mask-pending bit supported ? */
> __u8 masked : 1;
> __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
--
Michael Ellerman
OzLabs, IBM Australia Development Lab
wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)
We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-07 2:05 ` Michael Ellerman
@ 2008-07-07 2:41 ` Matthew Wilcox
2008-07-07 3:26 ` Benjamin Herrenschmidt
2008-07-07 3:48 ` Michael Ellerman
0 siblings, 2 replies; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-07 2:41 UTC (permalink / raw)
To: Michael Ellerman
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
On Mon, Jul 07, 2008 at 12:05:24PM +1000, Michael Ellerman wrote:
> On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> > This first part simply changes the msi_attrib data structure to store
> > how many vectors have been allocated. In order to do this, I shrink the
> > 'type' from 5 bits to 2 and rename it to _type to catch any unsuspecting
> > users.
>
> Please don't, it significantly uglifies the code IMHO. Just add a new
> field for the size, I'd rather call it qsize to match the register.
Uglifies the code? Seriously? Other than the _ addition (which really
I just did to be sure I didn't miss a case), how is MSI_ATTRIB uglier
than PCI_CAP_ID_MSI?
I'd like to rename the register definition from QSIZE. It's _not_ a
queue. I don't know where this misunderstanding came from, but I
certainly don't want to spread it any further.
> If you're worried about bloating msi_desc, there's several fields in
> there that are per-device not per-desc, so we could do another patch to
> move them into pci_dev or something hanging off it, eg.
> pci_dev->msi_info rather than storing them in every desc.
Might be worth it anyway for devices with lots of MSI-X interrupts.
I think the MSI-X implementation is a bit poorly written anyway. If we
had an array of msi_desc for each device, we could avoid the list_head
in the msi_desc, for example. That'd save two pointers (8 or 16 bytes),
plus the overhead of allocating each one individually.
I also think that MSI-X could be improved by changing the interface to
do away with this msix_entry list passed in -- just allocate the irqs
consecutively.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-07 2:41 ` Matthew Wilcox
@ 2008-07-07 3:26 ` Benjamin Herrenschmidt
2008-07-07 3:48 ` Michael Ellerman
1 sibling, 0 replies; 43+ messages in thread
From: Benjamin Herrenschmidt @ 2008-07-07 3:26 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Michael Ellerman, linux-pci, kaneshige.kenji, mingo, tglx, davem,
dan.j.williams, Martine.Silbermann, linux-kernel, Matthew Wilcox
On Sun, 2008-07-06 at 20:41 -0600, Matthew Wilcox wrote:
> I also think that MSI-X could be improved by changing the interface to
> do away with this msix_entry list passed in -- just allocate the irqs
> consecutively.
Apparently some drivers rely on scattered allocation of MSI-X
Ben.
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-07 2:41 ` Matthew Wilcox
2008-07-07 3:26 ` Benjamin Herrenschmidt
@ 2008-07-07 3:48 ` Michael Ellerman
2008-07-07 12:04 ` Matthew Wilcox
1 sibling, 1 reply; 43+ messages in thread
From: Michael Ellerman @ 2008-07-07 3:48 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
[-- Attachment #1: Type: text/plain, Size: 2780 bytes --]
On Sun, 2008-07-06 at 20:41 -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 12:05:24PM +1000, Michael Ellerman wrote:
> > On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> > > This first part simply changes the msi_attrib data structure to store
> > > how many vectors have been allocated. In order to do this, I shrink the
> > > 'type' from 5 bits to 2 and rename it to _type to catch any unsuspecting
> > > users.
> >
> > Please don't, it significantly uglifies the code IMHO. Just add a new
> > field for the size, I'd rather call it qsize to match the register.
>
> Uglifies the code? Seriously? Other than the _ addition (which really
> I just did to be sure I didn't miss a case), how is MSI_ATTRIB uglier
> than PCI_CAP_ID_MSI?
Yeah seriously :) The _ is part of it, but MSI_ATTRIB is uglier than
PCI_CAP_ID_MSI exactly because it's not PCI_CAP_ID_MSI, which exists and
is well defined and is used in the rest of the code.
> I'd like to rename the register definition from QSIZE. It's _not_ a
> queue. I don't know where this misunderstanding came from, but I
> certainly don't want to spread it any further.
I didn't say it was a queue, but a Q ;) But I agree it's not a good
name, the spec calls it "multiple message enable", nvec would match the
existing code best, or log_nvec.
> > If you're worried about bloating msi_desc, there's several fields in
> > there that are per-device not per-desc, so we could do another patch to
> > move them into pci_dev or something hanging off it, eg.
> > pci_dev->msi_info rather than storing them in every desc.
>
> Might be worth it anyway for devices with lots of MSI-X interrupts.
Eventually yeah, last I looked we didn't have any drivers using more
than a few MSI-X, but at some point it will happen.
> I think the MSI-X implementation is a bit poorly written anyway. If we
> had an array of msi_desc for each device, we could avoid the list_head
> in the msi_desc, for example. That'd save two pointers (8 or 16 bytes),
> plus the overhead of allocating each one individually.
Yeah that would be nice.
> I also think that MSI-X could be improved by changing the interface to
> do away with this msix_entry list passed in -- just allocate the irqs
> consecutively.
It would be nice, but as I said the other day we have at least one
driver (s2io) which asks for non-consecutive entries. That doesn't
effect the irq allocation, but you need some way for the driver to
express it.
cheers
--
Michael Ellerman
OzLabs, IBM Australia Development Lab
wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)
We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-07 3:48 ` Michael Ellerman
@ 2008-07-07 12:04 ` Matthew Wilcox
2008-07-07 16:02 ` Grant Grundler
2008-07-10 1:32 ` Michael Ellerman
0 siblings, 2 replies; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-07 12:04 UTC (permalink / raw)
To: Michael Ellerman
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
On Mon, Jul 07, 2008 at 01:48:32PM +1000, Michael Ellerman wrote:
> Yeah seriously :) The _ is part of it, but MSI_ATTRIB is uglier than
> PCI_CAP_ID_MSI exactly because it's not PCI_CAP_ID_MSI, which exists and
> is well defined and is used in the rest of the code.
Here's an improvement over both the status quo and my patch -- simply
use a single bit called is_msix.
> I didn't say it was a queue, but a Q ;) But I agree it's not a good
> name, the spec calls it "multiple message enable", nvec would match the
> existing code best, or log_nvec.
I don't see what's wrong with 'multiple'. log_nvec is clunky, and
'multiple' works well as a boolean (since 0 means 1 interrupt).
> > > If you're worried about bloating msi_desc, there's several fields in
> > > there that are per-device not per-desc, so we could do another patch to
> > > move them into pci_dev or something hanging off it, eg.
> > > pci_dev->msi_info rather than storing them in every desc.
Ouch. I just used pahole and discovered we were using 72 bytes on
64-bit. A swift rearrangement of a u16 gets us back down to 64.
Here's the replacement patch:
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 8c61304..8f7e483 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -106,20 +106,10 @@ static void msix_flush_writes(unsigned int irq)
entry = get_irq_msi(irq);
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
- /* nothing to do */
- break;
- case PCI_CAP_ID_MSIX:
- {
+ if (entry->msi_attrib.is_msix) {
int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
readl(entry->mask_base + offset);
- break;
- }
- default:
- BUG();
- break;
}
}
@@ -129,8 +119,12 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
entry = get_irq_msi(irq);
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
+ if (entry->msi_attrib.is_msix) {
+ int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+ PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+ writel(flag, entry->mask_base + offset);
+ readl(entry->mask_base + offset);
+ } else {
if (entry->msi_attrib.maskbit) {
int pos;
u32 mask_bits;
@@ -143,18 +137,6 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
} else {
msi_set_enable(entry->dev, !flag);
}
- break;
- case PCI_CAP_ID_MSIX:
- {
- int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
- PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
- writel(flag, entry->mask_base + offset);
- readl(entry->mask_base + offset);
- break;
- }
- default:
- BUG();
- break;
}
entry->msi_attrib.masked = !!flag;
}
@@ -162,9 +144,14 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
void read_msi_msg(unsigned int irq, struct msi_msg *msg)
{
struct msi_desc *entry = get_irq_msi(irq);
- switch(entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
- {
+ if (entry->msi_attrib.is_msix) {
+ void __iomem *base = entry->mask_base +
+ entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+ msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+ msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+ msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
+ } else {
struct pci_dev *dev = entry->dev;
int pos = entry->msi_attrib.pos;
u16 data;
@@ -180,32 +167,31 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
pci_read_config_word(dev, msi_data_reg(pos, 0), &data);
}
msg->data = data;
- break;
- }
- case PCI_CAP_ID_MSIX:
- {
- void __iomem *base;
- base = entry->mask_base +
- entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
-
- msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
- msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
- msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
- break;
- }
- default:
- BUG();
}
}
void write_msi_msg(unsigned int irq, struct msi_msg *msg)
{
struct msi_desc *entry = get_irq_msi(irq);
- switch (entry->msi_attrib.type) {
- case PCI_CAP_ID_MSI:
- {
+ if (entry->msi_attrib.is_msix) {
+ void __iomem *base;
+ base = entry->mask_base +
+ entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+ writel(msg->address_lo,
+ base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+ writel(msg->address_hi,
+ base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+ writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
+ } else {
struct pci_dev *dev = entry->dev;
int pos = entry->msi_attrib.pos;
+ u16 msgctl;
+
+ pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
+ msgctl &= ~PCI_MSI_FLAGS_QSIZE;
+ msgctl |= entry->msi_attrib.multiple << 4;
+ pci_write_config_word(dev, msi_control_reg(pos), msgctl);
pci_write_config_dword(dev, msi_lower_address_reg(pos),
msg->address_lo);
@@ -218,23 +204,6 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
pci_write_config_word(dev, msi_data_reg(pos, 0),
msg->data);
}
- break;
- }
- case PCI_CAP_ID_MSIX:
- {
- void __iomem *base;
- base = entry->mask_base +
- entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
-
- writel(msg->address_lo,
- base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
- writel(msg->address_hi,
- base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
- writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
- break;
- }
- default:
- BUG();
}
entry->msg = *msg;
}
@@ -359,7 +328,7 @@ static int msi_capability_init(struct pci_dev *dev)
if (!entry)
return -ENOMEM;
- entry->msi_attrib.type = PCI_CAP_ID_MSI;
+ entry->msi_attrib.is_msix = 0;
entry->msi_attrib.is_64 = is_64bit_address(control);
entry->msi_attrib.entry_nr = 0;
entry->msi_attrib.maskbit = is_mask_bit_support(control);
@@ -446,7 +415,7 @@ static int msix_capability_init(struct pci_dev *dev,
break;
j = entries[i].entry;
- entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+ entry->msi_attrib.is_msix = 1;
entry->msi_attrib.is_64 = 1;
entry->msi_attrib.entry_nr = j;
entry->msi_attrib.maskbit = 1;
@@ -589,12 +558,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
u32 mask = entry->msi_attrib.maskbits_mask;
msi_set_mask_bits(dev->irq, mask, ~mask);
}
- if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+ if (!entry->dev || entry->msi_attrib.is_msix)
return;
/* Restore dev->irq to its default pin-assertion irq */
dev->irq = entry->msi_attrib.default_irq;
}
+
void pci_disable_msi(struct pci_dev* dev)
{
struct msi_desc *entry;
@@ -605,7 +575,7 @@ void pci_disable_msi(struct pci_dev* dev)
pci_msi_shutdown(dev);
entry = list_entry(dev->msi_list.next, struct msi_desc, list);
- if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
+ if (!entry->dev || entry->msi_attrib.is_msix)
return;
msi_free_irqs(dev);
@@ -624,7 +594,7 @@ static int msi_free_irqs(struct pci_dev* dev)
arch_teardown_msi_irqs(dev);
list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
- if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) {
+ if (entry->msi_attrib.is_msix) {
writel(1, entry->mask_base + entry->msi_attrib.entry_nr
* PCI_MSIX_ENTRY_SIZE
+ PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 3898f52..b72e0bd 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -22,12 +22,8 @@
#define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE
#define multi_msi_capable(control) \
(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
-#define multi_msi_enable(control, num) \
- control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
#define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT))
#define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT))
-#define msi_enable(control, num) multi_msi_enable(control, num); \
- control |= PCI_MSI_FLAGS_ENABLE
#define msix_table_offset_reg(base) (base + 0x04)
#define msix_pba_offset_reg(base) (base + 0x08)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 8f29392..70fcebc 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -17,13 +17,14 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
struct msi_desc {
struct {
- __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
+ __u8 is_msix : 1;
+ __u8 multiple: 3; /* log2 number of messages */
__u8 maskbit : 1; /* mask-pending bit supported ? */
__u8 masked : 1;
__u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
__u8 pos; /* Location of the msi capability */
- __u32 maskbits_mask; /* mask bits mask */
__u16 entry_nr; /* specific enabled entry */
+ __u32 maskbits_mask; /* mask bits mask */
unsigned default_irq; /* default pre-assigned irq */
}msi_attrib;
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply related [flat|nested] 43+ messages in thread* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-07 12:04 ` Matthew Wilcox
@ 2008-07-07 16:02 ` Grant Grundler
2008-07-07 16:19 ` Matthew Wilcox
2008-07-10 1:32 ` Michael Ellerman
1 sibling, 1 reply; 43+ messages in thread
From: Grant Grundler @ 2008-07-07 16:02 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Michael Ellerman, linux-pci, kaneshige.kenji, mingo, tglx, davem,
dan.j.williams, Martine.Silbermann, benh, linux-kernel,
Matthew Wilcox
On Mon, Jul 07, 2008 at 06:04:18AM -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 01:48:32PM +1000, Michael Ellerman wrote:
> > Yeah seriously :) The _ is part of it, but MSI_ATTRIB is uglier than
> > PCI_CAP_ID_MSI exactly because it's not PCI_CAP_ID_MSI, which exists and
> > is well defined and is used in the rest of the code.
>
> Here's an improvement over both the status quo and my patch -- simply
> use a single bit called is_msix.
I prefer the "is_msix" for readability though I'm not particularly fond
of bit fields (in any form).
...
> @@ -589,12 +558,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
> u32 mask = entry->msi_attrib.maskbits_mask;
> msi_set_mask_bits(dev->irq, mask, ~mask);
> }
> - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
> + if (!entry->dev || entry->msi_attrib.is_msix)
> return;
This is why I don't like bit fields.
"uninitialized" (3rd state) doesn't exist.
Is there something else in place to catch that state?
(It's clearly a bug if someone did that and maybe I'm
being too paranoid.)
hth,
grant
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-07 16:02 ` Grant Grundler
@ 2008-07-07 16:19 ` Matthew Wilcox
0 siblings, 0 replies; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-07 16:19 UTC (permalink / raw)
To: Grant Grundler
Cc: Michael Ellerman, linux-pci, kaneshige.kenji, mingo, tglx, davem,
dan.j.williams, Martine.Silbermann, benh, linux-kernel,
Matthew Wilcox
On Mon, Jul 07, 2008 at 10:02:03AM -0600, Grant Grundler wrote:
> > - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
> > + if (!entry->dev || entry->msi_attrib.is_msix)
> > return;
>
> This is why I don't like bit fields.
> "uninitialized" (3rd state) doesn't exist.
> Is there something else in place to catch that state?
>
> (It's clearly a bug if someone did that and maybe I'm
> being too paranoid.)
msi_descs are only allocated in two places, one of which sets is_msix
and the other doesn't. I wouldn't object to passing parameters to
alloc_msi_entry() so there's only one place that fills in is_msix, but
yes, I do think the current code is too paranoid.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-07 12:04 ` Matthew Wilcox
2008-07-07 16:02 ` Grant Grundler
@ 2008-07-10 1:32 ` Michael Ellerman
2008-07-10 1:35 ` Matthew Wilcox
1 sibling, 1 reply; 43+ messages in thread
From: Michael Ellerman @ 2008-07-10 1:32 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
[-- Attachment #1: Type: text/plain, Size: 2485 bytes --]
On Mon, 2008-07-07 at 06:04 -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 01:48:32PM +1000, Michael Ellerman wrote:
> > Yeah seriously :) The _ is part of it, but MSI_ATTRIB is uglier than
> > PCI_CAP_ID_MSI exactly because it's not PCI_CAP_ID_MSI, which exists and
> > is well defined and is used in the rest of the code.
>
> Here's an improvement over both the status quo and my patch -- simply
> use a single bit called is_msix.
That is cleaner, you get to fix it when they create MSIXX though ;)
> > I didn't say it was a queue, but a Q ;) But I agree it's not a good
> > name, the spec calls it "multiple message enable", nvec would match the
> > existing code best, or log_nvec.
>
> I don't see what's wrong with 'multiple'. log_nvec is clunky, and
> 'multiple' works well as a boolean (since 0 means 1 interrupt).
For me 'multiple' only makes sense as a boolean, but whatever.
> > > > If you're worried about bloating msi_desc, there's several fields in
> > > > there that are per-device not per-desc, so we could do another patch to
> > > > move them into pci_dev or something hanging off it, eg.
> > > > pci_dev->msi_info rather than storing them in every desc.
>
> Ouch. I just used pahole and discovered we were using 72 bytes on
> 64-bit. A swift rearrangement of a u16 gets us back down to 64.
pahole is awesome, nice find.
> Here's the replacement patch:
Perhaps I'm pedantic, but I'd rather it was two patches, one to change
type to is_msix and one to add the multiple flag.
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 8c61304..8f7e483 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -180,32 +167,31 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
...
> struct pci_dev *dev = entry->dev;
> int pos = entry->msi_attrib.pos;
> + u16 msgctl;
> +
> + pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
> + msgctl &= ~PCI_MSI_FLAGS_QSIZE;
> + msgctl |= entry->msi_attrib.multiple << 4;
> + pci_write_config_word(dev, msi_control_reg(pos), msgctl);
A #define for "<< 4" would be nice. And should we be paranoid about
potentially writing 0b110 or 0b111 which are reserved?
cheers
--
Michael Ellerman
OzLabs, IBM Australia Development Lab
wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)
We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc
2008-07-10 1:32 ` Michael Ellerman
@ 2008-07-10 1:35 ` Matthew Wilcox
0 siblings, 0 replies; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-10 1:35 UTC (permalink / raw)
To: Michael Ellerman
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
On Thu, Jul 10, 2008 at 11:32:43AM +1000, Michael Ellerman wrote:
> That is cleaner, you get to fix it when they create MSIXX though ;)
Heh, I've been calling it MSI-Y in my internal monologue ;-) Maybe
it'll be MSIe though.
> Perhaps I'm pedantic, but I'd rather it was two patches, one to change
> type to is_msix and one to add the multiple flag.
I'll make that change.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread
* [PATCH 2/4] PCI: Support multiple MSI
2008-07-05 13:27 ` Matthew Wilcox
2008-07-05 13:34 ` [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc Matthew Wilcox
@ 2008-07-05 13:34 ` Matthew Wilcox
2008-07-07 2:05 ` Michael Ellerman
2008-07-05 13:34 ` [PATCH 3/4] AHCI: Request multiple MSIs Matthew Wilcox
` (2 subsequent siblings)
4 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-05 13:34 UTC (permalink / raw)
To: linux-pci
Cc: kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, michael, linux-kernel, Matthew Wilcox,
Matthew Wilcox
Add the new API pci_enable_msi_block() to allow drivers to
request multiple MSIs. Reimplement pci_enable_msi in terms
of pci_enable_msi_block. Add a default implementation of
arch_setup_msi_block() that only allows one MSI to be requested.
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
arch/powerpc/kernel/msi.c | 2 +-
drivers/pci/msi.c | 109 +++++++++++++++++++++++++++++----------------
include/linux/msi.h | 3 +-
include/linux/pci.h | 6 ++-
4 files changed, 77 insertions(+), 43 deletions(-)
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index c62d101..317c7c8 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -32,7 +32,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return ppc_md.setup_msi_irqs(dev, nvec, type);
}
-void arch_teardown_msi_irqs(struct pci_dev *dev)
+void arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
{
return ppc_md.teardown_msi_irqs(dev);
}
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 92992a8..6cbdf11 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -40,18 +40,31 @@ arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
}
int __attribute__ ((weak))
+arch_setup_msi_block(struct pci_dev *pdev, struct msi_desc *desc, int nvec)
+{
+ if (nvec > 1)
+ return 1;
+ return arch_setup_msi_irq(pdev, desc);
+}
+
+int __attribute__ ((weak))
arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- struct msi_desc *entry;
+ struct msi_desc *desc;
int ret;
- list_for_each_entry(entry, &dev->msi_list, list) {
- ret = arch_setup_msi_irq(dev, entry);
- if (ret)
- return ret;
+ if (type == PCI_CAP_ID_MSI) {
+ desc = list_first_entry(&dev->msi_list, struct msi_desc, list);
+ ret = arch_setup_msi_block(dev, desc, nvec);
+ } else {
+ list_for_each_entry(desc, &dev->msi_list, list) {
+ ret = arch_setup_msi_irq(dev, desc);
+ if (ret)
+ break;
+ }
}
- return 0;
+ return ret;
}
void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
@@ -60,13 +73,16 @@ void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
}
void __attribute__ ((weak))
-arch_teardown_msi_irqs(struct pci_dev *dev)
+arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
{
struct msi_desc *entry;
list_for_each_entry(entry, &dev->msi_list, list) {
- if (entry->irq != 0)
- arch_teardown_msi_irq(entry->irq);
+ int i;
+ if (entry->irq == 0)
+ continue;
+ for (i = 0; i < nvec; i++)
+ arch_teardown_msi_irq(entry->irq + i);
}
}
@@ -350,7 +366,7 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state);
* multiple messages. A return of zero indicates the successful setup
* of an entry zero with the new MSI irq or non-zero for otherwise.
**/
-static int msi_capability_init(struct pci_dev *dev)
+static int msi_capability_init(struct pci_dev *dev, int nr_irqs)
{
struct msi_desc *entry;
int pos, ret;
@@ -394,7 +410,7 @@ static int msi_capability_init(struct pci_dev *dev)
list_add_tail(&entry->list, &dev->msi_list);
/* Configure MSI capability structure */
- ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI);
+ ret = arch_setup_msi_irqs(dev, nr_irqs, PCI_CAP_ID_MSI);
if (ret) {
msi_free_irqs(dev);
return ret;
@@ -546,36 +562,47 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type)
}
/**
- * pci_enable_msi - configure device's MSI capability structure
- * @dev: pointer to the pci_dev data structure of MSI device function
+ * pci_enable_msi_block - configure device's MSI capability structure
+ * @pdev: Device to configure
+ * @nr_irqs: Number of IRQs requested
+ *
+ * Allocate IRQs for a device with the MSI capability.
+ * This function returns a negative errno if an error occurs. On success,
+ * this function returns the number of IRQs actually allocated. Since
+ * MSIs are required to be a power of two, the number of IRQs allocated
+ * may be rounded up to the next power of two (if the number requested is
+ * not a power of two). Fewer IRQs than requested may be allocated if the
+ * system does not have the resources for the full number.
*
- * Setup the MSI capability structure of device function with
- * a single MSI irq upon its software driver call to request for
- * MSI mode enabled on its hardware device function. A return of zero
- * indicates the successful setup of an entry zero with the new MSI
- * irq or non-zero for otherwise.
+ * If successful, the @pdev's irq member will be updated to the lowest new
+ * IRQ allocated; the other IRQs allocated to this device will be consecutive.
**/
-int pci_enable_msi(struct pci_dev* dev)
+int pci_enable_msi_block(struct pci_dev *pdev, unsigned int nr_irqs)
{
int status;
- status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
+ /* MSI only supports up to 32 interrupts */
+ if (nr_irqs > 32)
+ return 32;
+
+ status = pci_msi_check_device(pdev, nr_irqs, PCI_CAP_ID_MSI);
if (status)
return status;
- WARN_ON(!!dev->msi_enabled);
+ WARN_ON(!!pdev->msi_enabled);
- /* Check whether driver already requested for MSI-X irqs */
- if (dev->msix_enabled) {
+ /* Check whether driver already requested MSI-X irqs */
+ if (pdev->msix_enabled) {
printk(KERN_INFO "PCI: %s: Can't enable MSI. "
"Device already has MSI-X enabled\n",
- pci_name(dev));
+ pci_name(pdev));
return -EINVAL;
}
- status = msi_capability_init(dev);
+
+ status = msi_capability_init(pdev, nr_irqs);
return status;
}
-EXPORT_SYMBOL(pci_enable_msi);
+EXPORT_SYMBOL(pci_enable_msi_block);
void pci_msi_shutdown(struct pci_dev* dev)
{
@@ -621,26 +648,30 @@ EXPORT_SYMBOL(pci_disable_msi);
static int msi_free_irqs(struct pci_dev* dev)
{
- struct msi_desc *entry, *tmp;
+ int i, nvec = 1;
+ struct msi_desc *desc, *tmp;
- list_for_each_entry(entry, &dev->msi_list, list) {
- if (entry->irq)
- BUG_ON(irq_has_action(entry->irq));
+ list_for_each_entry(desc, &dev->msi_list, list) {
+ nvec = 1 << desc->msi_attrib.multiple;
+ if (!desc->irq)
+ continue;
+ for (i = 0; i < nvec; i++)
+ BUG_ON(irq_has_action(desc->irq + i));
}
- arch_teardown_msi_irqs(dev);
+ arch_teardown_msi_irqs(dev, nvec);
- list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
- if (entry->msi_attrib._type == MSIX_ATTRIB) {
- writel(1, entry->mask_base + entry->msi_attrib.entry_nr
+ list_for_each_entry_safe(desc, tmp, &dev->msi_list, list) {
+ if (desc->msi_attrib._type == MSIX_ATTRIB) {
+ writel(1, desc->mask_base + desc->msi_attrib.entry_nr
* PCI_MSIX_ENTRY_SIZE
+ PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
- if (list_is_last(&entry->list, &dev->msi_list))
- iounmap(entry->mask_base);
+ if (list_is_last(&desc->list, &dev->msi_list))
+ iounmap(desc->mask_base);
}
- list_del(&entry->list);
- kfree(entry);
+ list_del(&desc->list);
+ kfree(desc);
}
return 0;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index d322148..4731fe7 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -45,9 +45,10 @@ struct msi_desc {
* The arch hook for setup up msi irqs
*/
int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
+int arch_setup_msi_block(struct pci_dev *dev, struct msi_desc *desc, int nvec);
void arch_teardown_msi_irq(unsigned int irq);
extern int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
-extern void arch_teardown_msi_irqs(struct pci_dev *dev);
+extern void arch_teardown_msi_irqs(struct pci_dev *dev, int nvec);
extern int arch_msi_check_device(struct pci_dev* dev, int nvec, int type);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d18b1dd..f7ca7f8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -699,7 +699,7 @@ struct msix_entry {
#ifndef CONFIG_PCI_MSI
-static inline int pci_enable_msi(struct pci_dev *dev)
+static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int count)
{
return -1;
}
@@ -726,7 +726,7 @@ static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev)
static inline void pci_restore_msi_state(struct pci_dev *dev)
{ }
#else
-extern int pci_enable_msi(struct pci_dev *dev);
+extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int count);
extern void pci_msi_shutdown(struct pci_dev *dev);
extern void pci_disable_msi(struct pci_dev *dev);
extern int pci_enable_msix(struct pci_dev *dev,
@@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
extern void pci_restore_msi_state(struct pci_dev *dev);
#endif
+#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
+
#ifdef CONFIG_HT_IRQ
/* The functions a driver should call */
int ht_create_irq(struct pci_dev *dev, int idx);
--
1.5.5.4
^ permalink raw reply related [flat|nested] 43+ messages in thread* Re: [PATCH 2/4] PCI: Support multiple MSI
2008-07-05 13:34 ` [PATCH 2/4] PCI: Support multiple MSI Matthew Wilcox
@ 2008-07-07 2:05 ` Michael Ellerman
2008-07-07 2:45 ` Matthew Wilcox
0 siblings, 1 reply; 43+ messages in thread
From: Michael Ellerman @ 2008-07-07 2:05 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
[-- Attachment #1: Type: text/plain, Size: 9758 bytes --]
On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> Add the new API pci_enable_msi_block() to allow drivers to
> request multiple MSIs. Reimplement pci_enable_msi in terms
> of pci_enable_msi_block. Add a default implementation of
> arch_setup_msi_block() that only allows one MSI to be requested.
I don't think you need arch_setup_msi_block() at all.
We already have an arch hook that takes a number of irqs, it's
arch_setup_msi_irqs(), plural. It also has the type passed to it (MSI or
MSI-X), so it can decide if it needs to allocate the irq numbers
contiguously.
Or am I missing something?
cheers
> diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
> index c62d101..317c7c8 100644
> --- a/arch/powerpc/kernel/msi.c
> +++ b/arch/powerpc/kernel/msi.c
> @@ -32,7 +32,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> return ppc_md.setup_msi_irqs(dev, nvec, type);
> }
>
> -void arch_teardown_msi_irqs(struct pci_dev *dev)
> +void arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
> {
> return ppc_md.teardown_msi_irqs(dev);
> }
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 92992a8..6cbdf11 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -40,18 +40,31 @@ arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
> }
>
> int __attribute__ ((weak))
> +arch_setup_msi_block(struct pci_dev *pdev, struct msi_desc *desc, int nvec)
> +{
> + if (nvec > 1)
> + return 1;
> + return arch_setup_msi_irq(pdev, desc);
> +}
> +
> +int __attribute__ ((weak))
> arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> {
> - struct msi_desc *entry;
> + struct msi_desc *desc;
> int ret;
>
> - list_for_each_entry(entry, &dev->msi_list, list) {
> - ret = arch_setup_msi_irq(dev, entry);
> - if (ret)
> - return ret;
> + if (type == PCI_CAP_ID_MSI) {
> + desc = list_first_entry(&dev->msi_list, struct msi_desc, list);
> + ret = arch_setup_msi_block(dev, desc, nvec);
> + } else {
> + list_for_each_entry(desc, &dev->msi_list, list) {
> + ret = arch_setup_msi_irq(dev, desc);
> + if (ret)
> + break;
> + }
> }
>
> - return 0;
> + return ret;
> }
>
> void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
> @@ -60,13 +73,16 @@ void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
> }
>
> void __attribute__ ((weak))
> -arch_teardown_msi_irqs(struct pci_dev *dev)
> +arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
> {
> struct msi_desc *entry;
>
> list_for_each_entry(entry, &dev->msi_list, list) {
> - if (entry->irq != 0)
> - arch_teardown_msi_irq(entry->irq);
> + int i;
> + if (entry->irq == 0)
> + continue;
> + for (i = 0; i < nvec; i++)
> + arch_teardown_msi_irq(entry->irq + i);
> }
> }
>
> @@ -350,7 +366,7 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state);
> * multiple messages. A return of zero indicates the successful setup
> * of an entry zero with the new MSI irq or non-zero for otherwise.
> **/
> -static int msi_capability_init(struct pci_dev *dev)
> +static int msi_capability_init(struct pci_dev *dev, int nr_irqs)
> {
> struct msi_desc *entry;
> int pos, ret;
> @@ -394,7 +410,7 @@ static int msi_capability_init(struct pci_dev *dev)
> list_add_tail(&entry->list, &dev->msi_list);
>
> /* Configure MSI capability structure */
> - ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI);
> + ret = arch_setup_msi_irqs(dev, nr_irqs, PCI_CAP_ID_MSI);
> if (ret) {
> msi_free_irqs(dev);
> return ret;
> @@ -546,36 +562,47 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type)
> }
>
> /**
> - * pci_enable_msi - configure device's MSI capability structure
> - * @dev: pointer to the pci_dev data structure of MSI device function
> + * pci_enable_msi_block - configure device's MSI capability structure
> + * @pdev: Device to configure
> + * @nr_irqs: Number of IRQs requested
> + *
> + * Allocate IRQs for a device with the MSI capability.
> + * This function returns a negative errno if an error occurs. On success,
> + * this function returns the number of IRQs actually allocated. Since
> + * MSIs are required to be a power of two, the number of IRQs allocated
> + * may be rounded up to the next power of two (if the number requested is
> + * not a power of two). Fewer IRQs than requested may be allocated if the
> + * system does not have the resources for the full number.
> *
> - * Setup the MSI capability structure of device function with
> - * a single MSI irq upon its software driver call to request for
> - * MSI mode enabled on its hardware device function. A return of zero
> - * indicates the successful setup of an entry zero with the new MSI
> - * irq or non-zero for otherwise.
> + * If successful, the @pdev's irq member will be updated to the lowest new
> + * IRQ allocated; the other IRQs allocated to this device will be consecutive.
> **/
> -int pci_enable_msi(struct pci_dev* dev)
> +int pci_enable_msi_block(struct pci_dev *pdev, unsigned int nr_irqs)
> {
> int status;
>
> - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
> + /* MSI only supports up to 32 interrupts */
> + if (nr_irqs > 32)
> + return 32;
> +
> + status = pci_msi_check_device(pdev, nr_irqs, PCI_CAP_ID_MSI);
> if (status)
> return status;
>
> - WARN_ON(!!dev->msi_enabled);
> + WARN_ON(!!pdev->msi_enabled);
>
> - /* Check whether driver already requested for MSI-X irqs */
> - if (dev->msix_enabled) {
> + /* Check whether driver already requested MSI-X irqs */
> + if (pdev->msix_enabled) {
> printk(KERN_INFO "PCI: %s: Can't enable MSI. "
> "Device already has MSI-X enabled\n",
> - pci_name(dev));
> + pci_name(pdev));
> return -EINVAL;
> }
> - status = msi_capability_init(dev);
> +
> + status = msi_capability_init(pdev, nr_irqs);
> return status;
> }
> -EXPORT_SYMBOL(pci_enable_msi);
> +EXPORT_SYMBOL(pci_enable_msi_block);
>
> void pci_msi_shutdown(struct pci_dev* dev)
> {
> @@ -621,26 +648,30 @@ EXPORT_SYMBOL(pci_disable_msi);
>
> static int msi_free_irqs(struct pci_dev* dev)
> {
> - struct msi_desc *entry, *tmp;
> + int i, nvec = 1;
> + struct msi_desc *desc, *tmp;
>
> - list_for_each_entry(entry, &dev->msi_list, list) {
> - if (entry->irq)
> - BUG_ON(irq_has_action(entry->irq));
> + list_for_each_entry(desc, &dev->msi_list, list) {
> + nvec = 1 << desc->msi_attrib.multiple;
> + if (!desc->irq)
> + continue;
> + for (i = 0; i < nvec; i++)
> + BUG_ON(irq_has_action(desc->irq + i));
> }
>
> - arch_teardown_msi_irqs(dev);
> + arch_teardown_msi_irqs(dev, nvec);
>
> - list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
> - if (entry->msi_attrib._type == MSIX_ATTRIB) {
> - writel(1, entry->mask_base + entry->msi_attrib.entry_nr
> + list_for_each_entry_safe(desc, tmp, &dev->msi_list, list) {
> + if (desc->msi_attrib._type == MSIX_ATTRIB) {
> + writel(1, desc->mask_base + desc->msi_attrib.entry_nr
> * PCI_MSIX_ENTRY_SIZE
> + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
>
> - if (list_is_last(&entry->list, &dev->msi_list))
> - iounmap(entry->mask_base);
> + if (list_is_last(&desc->list, &dev->msi_list))
> + iounmap(desc->mask_base);
> }
> - list_del(&entry->list);
> - kfree(entry);
> + list_del(&desc->list);
> + kfree(desc);
> }
>
> return 0;
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index d322148..4731fe7 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -45,9 +45,10 @@ struct msi_desc {
> * The arch hook for setup up msi irqs
> */
> int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
> +int arch_setup_msi_block(struct pci_dev *dev, struct msi_desc *desc, int nvec);
> void arch_teardown_msi_irq(unsigned int irq);
> extern int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
> -extern void arch_teardown_msi_irqs(struct pci_dev *dev);
> +extern void arch_teardown_msi_irqs(struct pci_dev *dev, int nvec);
> extern int arch_msi_check_device(struct pci_dev* dev, int nvec, int type);
>
>
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index d18b1dd..f7ca7f8 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -699,7 +699,7 @@ struct msix_entry {
>
>
> #ifndef CONFIG_PCI_MSI
> -static inline int pci_enable_msi(struct pci_dev *dev)
> +static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int count)
> {
> return -1;
> }
> @@ -726,7 +726,7 @@ static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev)
> static inline void pci_restore_msi_state(struct pci_dev *dev)
> { }
> #else
> -extern int pci_enable_msi(struct pci_dev *dev);
> +extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int count);
> extern void pci_msi_shutdown(struct pci_dev *dev);
> extern void pci_disable_msi(struct pci_dev *dev);
> extern int pci_enable_msix(struct pci_dev *dev,
> @@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
> extern void pci_restore_msi_state(struct pci_dev *dev);
> #endif
>
> +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
> +
> #ifdef CONFIG_HT_IRQ
> /* The functions a driver should call */
> int ht_create_irq(struct pci_dev *dev, int idx);
--
Michael Ellerman
OzLabs, IBM Australia Development Lab
wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)
We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread* Re: [PATCH 2/4] PCI: Support multiple MSI
2008-07-07 2:05 ` Michael Ellerman
@ 2008-07-07 2:45 ` Matthew Wilcox
2008-07-07 3:56 ` Michael Ellerman
0 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-07 2:45 UTC (permalink / raw)
To: Michael Ellerman
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
On Mon, Jul 07, 2008 at 12:05:25PM +1000, Michael Ellerman wrote:
> On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> > Add the new API pci_enable_msi_block() to allow drivers to
> > request multiple MSIs. Reimplement pci_enable_msi in terms
> > of pci_enable_msi_block. Add a default implementation of
> > arch_setup_msi_block() that only allows one MSI to be requested.
>
> I don't think you need arch_setup_msi_block() at all.
>
> We already have an arch hook that takes a number of irqs, it's
> arch_setup_msi_irqs(), plural. It also has the type passed to it (MSI or
> MSI-X), so it can decide if it needs to allocate the irq numbers
> contiguously.
>
> Or am I missing something?
I suppose I should audit the current implementors of arch_setup_msi_irqs
(er, maybe that's just you?) to be sure that there's no assumption that
MSI -> asked for one. I'll look into doing it your way tomorrow (my
timezone ;-)
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 2/4] PCI: Support multiple MSI
2008-07-07 2:45 ` Matthew Wilcox
@ 2008-07-07 3:56 ` Michael Ellerman
2008-07-07 11:31 ` Matthew Wilcox
0 siblings, 1 reply; 43+ messages in thread
From: Michael Ellerman @ 2008-07-07 3:56 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
[-- Attachment #1: Type: text/plain, Size: 1833 bytes --]
On Sun, 2008-07-06 at 20:45 -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 12:05:25PM +1000, Michael Ellerman wrote:
> > On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> > > Add the new API pci_enable_msi_block() to allow drivers to
> > > request multiple MSIs. Reimplement pci_enable_msi in terms
> > > of pci_enable_msi_block. Add a default implementation of
> > > arch_setup_msi_block() that only allows one MSI to be requested.
> >
> > I don't think you need arch_setup_msi_block() at all.
> >
> > We already have an arch hook that takes a number of irqs, it's
> > arch_setup_msi_irqs(), plural. It also has the type passed to it (MSI or
> > MSI-X), so it can decide if it needs to allocate the irq numbers
> > contiguously.
> >
> > Or am I missing something?
>
> I suppose I should audit the current implementors of arch_setup_msi_irqs
> (er, maybe that's just you?) to be sure that there's no assumption that
> MSI -> asked for one.
Yeah I think it's just us.
But there's also the default implementation, which will happily use the
singular arch hook to setup multiple MSIs without any constraint on the
irq numbers - which will break.
So I think you want to make the default arch_msi_check_device() return
an error if you ask for MSI & nvec > 1. Then on powerpc we'll probably
add the same check to our version (at least until we can test it), but
on x86 you can let MSI & nvec > 1 pass.
> I'll look into doing it your way tomorrow (my timezone ;-)
Sure, although that'll be today in my timezone :D
cheers
--
Michael Ellerman
OzLabs, IBM Australia Development Lab
wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)
We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 2/4] PCI: Support multiple MSI
2008-07-07 3:56 ` Michael Ellerman
@ 2008-07-07 11:31 ` Matthew Wilcox
2008-07-10 1:32 ` Michael Ellerman
0 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-07 11:31 UTC (permalink / raw)
To: Michael Ellerman
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
On Mon, Jul 07, 2008 at 01:56:52PM +1000, Michael Ellerman wrote:
> Yeah I think it's just us.
Grep agrees.
> But there's also the default implementation, which will happily use the
> singular arch hook to setup multiple MSIs without any constraint on the
> irq numbers - which will break.
Yup.
> So I think you want to make the default arch_msi_check_device() return
> an error if you ask for MSI & nvec > 1. Then on powerpc we'll probably
> add the same check to our version (at least until we can test it), but
> on x86 you can let MSI & nvec > 1 pass.
That was my intent ... something like this:
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index c62d101..79ff21f 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -29,10 +29,12 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
return ppc_md.setup_msi_irqs(dev, nvec, type);
}
-void arch_teardown_msi_irqs(struct pci_dev *dev)
+void arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
{
return ppc_md.teardown_msi_irqs(dev);
}
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 92992a8..4f7b31f 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -42,11 +42,14 @@ arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
int __attribute__ ((weak))
arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- struct msi_desc *entry;
+ struct msi_desc *desc;
int ret;
- list_for_each_entry(entry, &dev->msi_list, list) {
- ret = arch_setup_msi_irq(dev, entry);
+ if ((type == PCI_CAP_ID_MSI) && (nvec > 1))
+ return 1;
+
+ list_for_each_entry(desc, &dev->msi_list, list) {
+ ret = arch_setup_msi_irq(dev, desc);
if (ret)
return ret;
}
@@ -60,13 +63,16 @@ void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
}
void __attribute__ ((weak))
-arch_teardown_msi_irqs(struct pci_dev *dev)
+arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
{
struct msi_desc *entry;
list_for_each_entry(entry, &dev->msi_list, list) {
- if (entry->irq != 0)
- arch_teardown_msi_irq(entry->irq);
+ int i;
+ if (entry->irq == 0)
+ continue;
+ for (i = 0; i < nvec; i++)
+ arch_teardown_msi_irq(entry->irq + i);
}
}
@@ -350,7 +356,7 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state);
* multiple messages. A return of zero indicates the successful setup
* of an entry zero with the new MSI irq or non-zero for otherwise.
**/
-static int msi_capability_init(struct pci_dev *dev)
+static int msi_capability_init(struct pci_dev *dev, int nr_irqs)
{
struct msi_desc *entry;
int pos, ret;
@@ -394,7 +400,7 @@ static int msi_capability_init(struct pci_dev *dev)
list_add_tail(&entry->list, &dev->msi_list);
/* Configure MSI capability structure */
- ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI);
+ ret = arch_setup_msi_irqs(dev, nr_irqs, PCI_CAP_ID_MSI);
if (ret) {
msi_free_irqs(dev);
return ret;
@@ -546,36 +552,47 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type)
}
/**
- * pci_enable_msi - configure device's MSI capability structure
- * @dev: pointer to the pci_dev data structure of MSI device function
+ * pci_enable_msi_block - configure device's MSI capability structure
+ * @pdev: Device to configure
+ * @nr_irqs: Number of IRQs requested
*
- * Setup the MSI capability structure of device function with
- * a single MSI irq upon its software driver call to request for
- * MSI mode enabled on its hardware device function. A return of zero
- * indicates the successful setup of an entry zero with the new MSI
- * irq or non-zero for otherwise.
+ * Allocate IRQs for a device with the MSI capability.
+ * This function returns a negative errno if an error occurs. On success,
+ * this function returns the number of IRQs actually allocated. Since
+ * MSIs are required to be a power of two, the number of IRQs allocated
+ * may be rounded up to the next power of two (if the number requested is
+ * not a power of two). Fewer IRQs than requested may be allocated if the
+ * system does not have the resources for the full number.
+ *
+ * If successful, the @pdev's irq member will be updated to the lowest new
+ * IRQ allocated; the other IRQs allocated to this device will be consecutive.
**/
-int pci_enable_msi(struct pci_dev* dev)
+int pci_enable_msi_block(struct pci_dev *pdev, unsigned int nr_irqs)
{
int status;
- status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
+ /* MSI only supports up to 32 interrupts */
+ if (nr_irqs > 32)
+ return 32;
+
+ status = pci_msi_check_device(pdev, nr_irqs, PCI_CAP_ID_MSI);
if (status)
return status;
- WARN_ON(!!dev->msi_enabled);
+ WARN_ON(!!pdev->msi_enabled);
- /* Check whether driver already requested for MSI-X irqs */
- if (dev->msix_enabled) {
+ /* Check whether driver already requested MSI-X irqs */
+ if (pdev->msix_enabled) {
printk(KERN_INFO "PCI: %s: Can't enable MSI. "
"Device already has MSI-X enabled\n",
- pci_name(dev));
+ pci_name(pdev));
return -EINVAL;
}
- status = msi_capability_init(dev);
+
+ status = msi_capability_init(pdev, nr_irqs);
return status;
}
-EXPORT_SYMBOL(pci_enable_msi);
+EXPORT_SYMBOL(pci_enable_msi_block);
void pci_msi_shutdown(struct pci_dev* dev)
{
@@ -621,26 +638,30 @@ EXPORT_SYMBOL(pci_disable_msi);
static int msi_free_irqs(struct pci_dev* dev)
{
- struct msi_desc *entry, *tmp;
+ int i, nvec = 1;
+ struct msi_desc *desc, *tmp;
- list_for_each_entry(entry, &dev->msi_list, list) {
- if (entry->irq)
- BUG_ON(irq_has_action(entry->irq));
+ list_for_each_entry(desc, &dev->msi_list, list) {
+ nvec = 1 << desc->msi_attrib.multiple;
+ if (!desc->irq)
+ continue;
+ for (i = 0; i < nvec; i++)
+ BUG_ON(irq_has_action(desc->irq + i));
}
- arch_teardown_msi_irqs(dev);
+ arch_teardown_msi_irqs(dev, nvec);
- list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
- if (entry->msi_attrib._type == MSIX_ATTRIB) {
- writel(1, entry->mask_base + entry->msi_attrib.entry_nr
+ list_for_each_entry_safe(desc, tmp, &dev->msi_list, list) {
+ if (desc->msi_attrib._type == MSIX_ATTRIB) {
+ writel(1, desc->mask_base + desc->msi_attrib.entry_nr
* PCI_MSIX_ENTRY_SIZE
+ PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
- if (list_is_last(&entry->list, &dev->msi_list))
- iounmap(entry->mask_base);
+ if (list_is_last(&desc->list, &dev->msi_list))
+ iounmap(desc->mask_base);
}
- list_del(&entry->list);
- kfree(entry);
+ list_del(&desc->list);
+ kfree(desc);
}
return 0;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index d322148..f2400cc 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -42,12 +42,14 @@ struct msi_desc {
};
/*
- * The arch hook for setup up msi irqs
+ * These functions should be implemented by the CPU architecture.
+ * Note that you need to implement only one of arch_setup_msi_irq() and
+ * arch_teardown_msi_irqs()
*/
int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
void arch_teardown_msi_irq(unsigned int irq);
extern int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
-extern void arch_teardown_msi_irqs(struct pci_dev *dev);
+extern void arch_teardown_msi_irqs(struct pci_dev *dev, int nvec);
extern int arch_msi_check_device(struct pci_dev* dev, int nvec, int type);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d18b1dd..f7ca7f8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -699,7 +699,7 @@ struct msix_entry {
#ifndef CONFIG_PCI_MSI
-static inline int pci_enable_msi(struct pci_dev *dev)
+static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int count)
{
return -1;
}
@@ -726,7 +726,7 @@ static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev)
static inline void pci_restore_msi_state(struct pci_dev *dev)
{ }
#else
-extern int pci_enable_msi(struct pci_dev *dev);
+extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int count);
extern void pci_msi_shutdown(struct pci_dev *dev);
extern void pci_disable_msi(struct pci_dev *dev);
extern int pci_enable_msix(struct pci_dev *dev,
@@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
extern void pci_restore_msi_state(struct pci_dev *dev);
#endif
+#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
+
#ifdef CONFIG_HT_IRQ
/* The functions a driver should call */
int ht_create_irq(struct pci_dev *dev, int idx);
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply related [flat|nested] 43+ messages in thread* Re: [PATCH 2/4] PCI: Support multiple MSI
2008-07-07 11:31 ` Matthew Wilcox
@ 2008-07-10 1:32 ` Michael Ellerman
2008-07-10 1:43 ` Matthew Wilcox
0 siblings, 1 reply; 43+ messages in thread
From: Michael Ellerman @ 2008-07-10 1:32 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
[-- Attachment #1: Type: text/plain, Size: 8037 bytes --]
On Mon, 2008-07-07 at 05:31 -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 01:56:52PM +1000, Michael Ellerman wrote:
> > So I think you want to make the default arch_msi_check_device() return
> > an error if you ask for MSI & nvec > 1. Then on powerpc we'll probably
> > add the same check to our version (at least until we can test it), but
> > on x86 you can let MSI & nvec > 1 pass.
>
> That was my intent ... something like this:
Sorry for the slow response, comments inline ...
> diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
> index c62d101..79ff21f 100644
> --- a/arch/powerpc/kernel/msi.c
> +++ b/arch/powerpc/kernel/msi.c
> @@ -29,10 +29,12 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
>
> int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> {
> + if (type == PCI_CAP_ID_MSI && nvec > 1)
> + return 1;
This should go in arch_msi_check_device(). We might move it into a
ppc_md routine eventually.
> return ppc_md.setup_msi_irqs(dev, nvec, type);
> }
>
> -void arch_teardown_msi_irqs(struct pci_dev *dev)
> +void arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
> {
> return ppc_md.teardown_msi_irqs(dev);
> }
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 92992a8..4f7b31f 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -42,11 +42,14 @@ arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
> int __attribute__ ((weak))
> arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> {
> - struct msi_desc *entry;
> + struct msi_desc *desc;
> int ret;
>
> - list_for_each_entry(entry, &dev->msi_list, list) {
> - ret = arch_setup_msi_irq(dev, entry);
> + if ((type == PCI_CAP_ID_MSI) && (nvec > 1))
> + return 1;
I think the check should be in the generic arch_msi_check_device(), so
archs can override just the check.
> +
> + list_for_each_entry(desc, &dev->msi_list, list) {
> + ret = arch_setup_msi_irq(dev, desc);
> if (ret)
> return ret;
> }
> @@ -60,13 +63,16 @@ void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
> }
>
> void __attribute__ ((weak))
> -arch_teardown_msi_irqs(struct pci_dev *dev)
> +arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
> {
> struct msi_desc *entry;
>
> list_for_each_entry(entry, &dev->msi_list, list) {
> - if (entry->irq != 0)
> - arch_teardown_msi_irq(entry->irq);
> + int i;
> + if (entry->irq == 0)
> + continue;
> + for (i = 0; i < nvec; i++)
> + arch_teardown_msi_irq(entry->irq + i);
This looks wrong. You're looping through all MSIs for the device, and
then for each one you're looping through all MSIs for the device. And
you're assuming they're contiguous, which they won't be for MSI-X.
AFAICS this code should work for you as it was.
> @@ -546,36 +552,47 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type)
> }
>
> /**
> - * pci_enable_msi - configure device's MSI capability structure
> - * @dev: pointer to the pci_dev data structure of MSI device function
> + * pci_enable_msi_block - configure device's MSI capability structure
> + * @pdev: Device to configure
> + * @nr_irqs: Number of IRQs requested
> *
> - * Setup the MSI capability structure of device function with
> - * a single MSI irq upon its software driver call to request for
> - * MSI mode enabled on its hardware device function. A return of zero
> - * indicates the successful setup of an entry zero with the new MSI
> - * irq or non-zero for otherwise.
> + * Allocate IRQs for a device with the MSI capability.
> + * This function returns a negative errno if an error occurs. On success,
> + * this function returns the number of IRQs actually allocated. Since
> + * MSIs are required to be a power of two, the number of IRQs allocated
> + * may be rounded up to the next power of two (if the number requested is
> + * not a power of two). Fewer IRQs than requested may be allocated if the
> + * system does not have the resources for the full number.
> + *
> + * If successful, the @pdev's irq member will be updated to the lowest new
> + * IRQ allocated; the other IRQs allocated to this device will be consecutive.
> **/
> -int pci_enable_msi(struct pci_dev* dev)
> +int pci_enable_msi_block(struct pci_dev *pdev, unsigned int nr_irqs)
> {
> int status;
>
> - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
> + /* MSI only supports up to 32 interrupts */
> + if (nr_irqs > 32)
> + return 32;
You don't describe this behaviour in the doco. I'm a bit lukewarm on it,
ie. returning the number that /could/ be allocated and having drivers
use that, I think it's likely drivers will be poorly tested in the case
where they get fewer irqs than they ask for. But I suppose that's a
separate problem.
> +
> + status = pci_msi_check_device(pdev, nr_irqs, PCI_CAP_ID_MSI);
> if (status)
> return status;
>
> - WARN_ON(!!dev->msi_enabled);
> + WARN_ON(!!pdev->msi_enabled);
Your patches would be easier to read if you didn't keep renaming to
entry to desc and dev to pdev :)
> - /* Check whether driver already requested for MSI-X irqs */
> - if (dev->msix_enabled) {
> + /* Check whether driver already requested MSI-X irqs */
> + if (pdev->msix_enabled) {
> printk(KERN_INFO "PCI: %s: Can't enable MSI. "
> "Device already has MSI-X enabled\n",
> - pci_name(dev));
> + pci_name(pdev));
> return -EINVAL;
> }
> - status = msi_capability_init(dev);
> +
> + status = msi_capability_init(pdev, nr_irqs);
> return status;
> }
> -EXPORT_SYMBOL(pci_enable_msi);
> +EXPORT_SYMBOL(pci_enable_msi_block);
>
> void pci_msi_shutdown(struct pci_dev* dev)
> {
> @@ -621,26 +638,30 @@ EXPORT_SYMBOL(pci_disable_msi);
>
> static int msi_free_irqs(struct pci_dev* dev)
> {
> - struct msi_desc *entry, *tmp;
> + int i, nvec = 1;
> + struct msi_desc *desc, *tmp;
>
> - list_for_each_entry(entry, &dev->msi_list, list) {
> - if (entry->irq)
> - BUG_ON(irq_has_action(entry->irq));
> + list_for_each_entry(desc, &dev->msi_list, list) {
> + nvec = 1 << desc->msi_attrib.multiple;
> + if (!desc->irq)
> + continue;
> + for (i = 0; i < nvec; i++)
> + BUG_ON(irq_has_action(desc->irq + i));
This looks wrong in the same way arch_teardown_msi_irqs() does.
> }
>
> - arch_teardown_msi_irqs(dev);
> + arch_teardown_msi_irqs(dev, nvec);
>
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index d18b1dd..f7ca7f8 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -699,7 +699,7 @@ struct msix_entry {
>
>
> #ifndef CONFIG_PCI_MSI
> -static inline int pci_enable_msi(struct pci_dev *dev)
> +static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int count)
> {
> return -1;
> }
> @@ -726,7 +726,7 @@ static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev)
> static inline void pci_restore_msi_state(struct pci_dev *dev)
> { }
> #else
> -extern int pci_enable_msi(struct pci_dev *dev);
> +extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int count);
Here you have "count", the implementation uses "nr_irqs", and the rest
of the code uses "nvec".
> extern void pci_msi_shutdown(struct pci_dev *dev);
> extern void pci_disable_msi(struct pci_dev *dev);
> extern int pci_enable_msix(struct pci_dev *dev,
> @@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
> extern void pci_restore_msi_state(struct pci_dev *dev);
> #endif
>
> +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
Someone will probably say this should be a static inline.
cheers
--
Michael Ellerman
OzLabs, IBM Australia Development Lab
wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)
We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread* Re: [PATCH 2/4] PCI: Support multiple MSI
2008-07-10 1:32 ` Michael Ellerman
@ 2008-07-10 1:43 ` Matthew Wilcox
2008-07-10 4:00 ` Michael Ellerman
0 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-10 1:43 UTC (permalink / raw)
To: Michael Ellerman
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
On Thu, Jul 10, 2008 at 11:32:44AM +1000, Michael Ellerman wrote:
> > int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> > {
> > + if (type == PCI_CAP_ID_MSI && nvec > 1)
> > + return 1;
>
> This should go in arch_msi_check_device(). We might move it into a
> ppc_md routine eventually.
I'm OK with that, but ...
> > int __attribute__ ((weak))
> > arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> > {
> > - struct msi_desc *entry;
> > + struct msi_desc *desc;
> > int ret;
> >
> > - list_for_each_entry(entry, &dev->msi_list, list) {
> > - ret = arch_setup_msi_irq(dev, entry);
> > + if ((type == PCI_CAP_ID_MSI) && (nvec > 1))
> > + return 1;
>
> I think the check should be in the generic arch_msi_check_device(), so
> archs can override just the check.
... then x86 has to implement arch_msi_check_device in order to _not_
perform the check, which feels a bit bass-ackwards to me.
> >
> > void __attribute__ ((weak))
> > -arch_teardown_msi_irqs(struct pci_dev *dev)
> > +arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
> > {
> > struct msi_desc *entry;
> >
> > list_for_each_entry(entry, &dev->msi_list, list) {
> > - if (entry->irq != 0)
> > - arch_teardown_msi_irq(entry->irq);
> > + int i;
> > + if (entry->irq == 0)
> > + continue;
> > + for (i = 0; i < nvec; i++)
> > + arch_teardown_msi_irq(entry->irq + i);
>
> This looks wrong. You're looping through all MSIs for the device, and
> then for each one you're looping through all MSIs for the device. And
> you're assuming they're contiguous, which they won't be for MSI-X.
>
> AFAICS this code should work for you as it was.
For MSI-X, nvec will be = 1. Maybe I should call it something else to
avoid confusion. The code won't work for me as-was because it won't
call arch_teardown_msi_irq() for all entries.
> > + * Allocate IRQs for a device with the MSI capability.
> > + * This function returns a negative errno if an error occurs. On success,
> > + * this function returns the number of IRQs actually allocated. Since
> > + * MSIs are required to be a power of two, the number of IRQs allocated
> > + * may be rounded up to the next power of two (if the number requested is
> > + * not a power of two). Fewer IRQs than requested may be allocated if the
> > + * system does not have the resources for the full number.
> > + *
> > + * If successful, the @pdev's irq member will be updated to the lowest new
> > + * IRQ allocated; the other IRQs allocated to this device will be consecutive.
> > **/
> > -int pci_enable_msi(struct pci_dev* dev)
> > +int pci_enable_msi_block(struct pci_dev *pdev, unsigned int nr_irqs)
> > {
> > int status;
> >
> > - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
> > + /* MSI only supports up to 32 interrupts */
> > + if (nr_irqs > 32)
> > + return 32;
>
> You don't describe this behaviour in the doco. I'm a bit lukewarm on it,
> ie. returning the number that /could/ be allocated and having drivers
> use that, I think it's likely drivers will be poorly tested in the case
> where they get fewer irqs than they ask for. But I suppose that's a
> separate problem.
Ah, I changed the bahviour (to match msix) and forgot to update the
comment. Thanks, I'll fix that. By the way I have an updated version
of MSI-HOWTO available from http://www.parisc-linux.org/~willy/MSI-HOWTO.txt
> > - WARN_ON(!!dev->msi_enabled);
> > + WARN_ON(!!pdev->msi_enabled);
>
> Your patches would be easier to read if you didn't keep renaming to
> entry to desc and dev to pdev :)
True ... I should do those in separate patches.
> > #else
> > -extern int pci_enable_msi(struct pci_dev *dev);
> > +extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int count);
>
> Here you have "count", the implementation uses "nr_irqs", and the rest
> of the code uses "nvec".
There's inconsistency between the various implementations too. I got
confused with where I was.
> > extern void pci_msi_shutdown(struct pci_dev *dev);
> > extern void pci_disable_msi(struct pci_dev *dev);
> > extern int pci_enable_msix(struct pci_dev *dev,
> > @@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
> > extern void pci_restore_msi_state(struct pci_dev *dev);
> > #endif
> >
> > +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
>
> Someone will probably say this should be a static inline.
Not quite sure why. You don't get any better typechecking by making it
a static inline.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread* Re: [PATCH 2/4] PCI: Support multiple MSI
2008-07-10 1:43 ` Matthew Wilcox
@ 2008-07-10 4:00 ` Michael Ellerman
0 siblings, 0 replies; 43+ messages in thread
From: Michael Ellerman @ 2008-07-10 4:00 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, linux-kernel, Matthew Wilcox
[-- Attachment #1: Type: text/plain, Size: 3187 bytes --]
On Wed, 2008-07-09 at 19:43 -0600, Matthew Wilcox wrote:
> On Thu, Jul 10, 2008 at 11:32:44AM +1000, Michael Ellerman wrote:
> > > int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> > > {
> > > + if (type == PCI_CAP_ID_MSI && nvec > 1)
> > > + return 1;
> >
> > This should go in arch_msi_check_device(). We might move it into a
> > ppc_md routine eventually.
>
> I'm OK with that, but ...
>
> > > int __attribute__ ((weak))
> > > arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> > > {
> > > - struct msi_desc *entry;
> > > + struct msi_desc *desc;
> > > int ret;
> > >
> > > - list_for_each_entry(entry, &dev->msi_list, list) {
> > > - ret = arch_setup_msi_irq(dev, entry);
> > > + if ((type == PCI_CAP_ID_MSI) && (nvec > 1))
> > > + return 1;
> >
> > I think the check should be in the generic arch_msi_check_device(), so
> > archs can override just the check.
>
> ... then x86 has to implement arch_msi_check_device in order to _not_
> perform the check, which feels a bit bass-ackwards to me.
Agreed, but I think that's still better. You might have alignment
constraints or whatever you need to check as well.
> > >
> > > void __attribute__ ((weak))
> > > -arch_teardown_msi_irqs(struct pci_dev *dev)
> > > +arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
> > > {
> > > struct msi_desc *entry;
> > >
> > > list_for_each_entry(entry, &dev->msi_list, list) {
> > > - if (entry->irq != 0)
> > > - arch_teardown_msi_irq(entry->irq);
> > > + int i;
> > > + if (entry->irq == 0)
> > > + continue;
> > > + for (i = 0; i < nvec; i++)
> > > + arch_teardown_msi_irq(entry->irq + i);
> >
> > This looks wrong. You're looping through all MSIs for the device, and
> > then for each one you're looping through all MSIs for the device. And
> > you're assuming they're contiguous, which they won't be for MSI-X.
> >
> > AFAICS this code should work for you as it was.
>
> For MSI-X, nvec will be = 1. Maybe I should call it something else to
> avoid confusion. The code won't work for me as-was because it won't
> call arch_teardown_msi_irq() for all entries.
It will call arch_teardown_msi_irq() for all entries, unless they were
never allocated (entry->irq == 0). Or are we talking about different
things?
If you mean that you're allocating more irqs than there are entries then
you need to deal with that in arch_teardown_msi_irqs().
> > > @@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
> > > extern void pci_restore_msi_state(struct pci_dev *dev);
> > > #endif
> > >
> > > +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
> >
> > Someone will probably say this should be a static inline.
>
> Not quite sure why. You don't get any better typechecking by making it
> a static inline.
Yeah I agree, just pointing it out.
cheers
--
Michael Ellerman
OzLabs, IBM Australia Development Lab
wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)
We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread
* [PATCH 3/4] AHCI: Request multiple MSIs
2008-07-05 13:27 ` Matthew Wilcox
2008-07-05 13:34 ` [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc Matthew Wilcox
2008-07-05 13:34 ` [PATCH 2/4] PCI: Support multiple MSI Matthew Wilcox
@ 2008-07-05 13:34 ` Matthew Wilcox
2008-07-07 16:45 ` Grant Grundler
2008-07-05 13:34 ` [PATCH 4/4] x86-64: Support for " Matthew Wilcox
2008-07-05 13:43 ` Multiple MSI Matthew Wilcox
4 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-05 13:34 UTC (permalink / raw)
To: linux-pci
Cc: kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, michael, linux-kernel, Matthew Wilcox,
Matthew Wilcox
AHCI controllers can support up to 16 interrupts, one per port. This
saves us a readl() in the interrupt path to determine which port has
generated the interrupt.
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
drivers/ata/ahci.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++---
1 files changed, 79 insertions(+), 5 deletions(-)
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 061817a..4b2f90a 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -102,6 +102,7 @@ enum {
/* HOST_CTL bits */
HOST_RESET = (1 << 0), /* reset controller; self-clear */
HOST_IRQ_EN = (1 << 1), /* global IRQ enable */
+ HOST_MSI_RSM = (1 << 2), /* Revert to Single Message */
HOST_AHCI_EN = (1 << 31), /* AHCI enabled */
/* HOST_CAP bits */
@@ -1771,6 +1772,13 @@ static void ahci_port_intr(struct ata_port *ap)
}
}
+static irqreturn_t ahci_msi_interrupt(int irq, void *dev_instance)
+{
+ struct ata_port *ap = dev_instance;
+ ahci_port_intr(ap);
+ return IRQ_HANDLED;
+}
+
static irqreturn_t ahci_interrupt(int irq, void *dev_instance)
{
struct ata_host *host = dev_instance;
@@ -2210,6 +2218,66 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
}
}
+static int ahci_request_irqs(struct pci_dev *pdev, struct ata_host *host)
+{
+ int i, n_irqs, rc;
+ struct ahci_host_priv *hpriv = host->private_data;
+
+ if (hpriv->flags & AHCI_HFLAG_NO_MSI) {
+ n_irqs = 1;
+ } else {
+ u16 control;
+ int pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
+ pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control);
+ n_irqs = 1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1);
+
+ for (;;) {
+ rc = pci_enable_msi_block(pdev, n_irqs);
+ if (rc == 0) {
+ void __iomem *mmio = host->iomap[AHCI_PCI_BAR];
+ u32 host_ctl = readl(mmio + HOST_CTL);
+ if ((host_ctl & HOST_MSI_RSM) == 0)
+ break;
+ pci_disable_msi(pdev);
+ n_irqs = 1;
+ } else if (rc < 0) {
+ n_irqs = 1;
+ pci_intx(pdev, 1);
+ break;
+ } else {
+ n_irqs = rc;
+ }
+ }
+ }
+
+ /* FIXME: Add support for CCC */
+ if (n_irqs > host->n_ports) {
+ n_irqs = host->n_ports;
+ } else if (n_irqs < host->n_ports) {
+ n_irqs--;
+ rc = devm_request_irq(host->dev, pdev->irq + n_irqs,
+ ahci_interrupt, IRQF_SHARED,
+ dev_driver_string(host->dev), host);
+ if (rc)
+ goto release_block;
+ }
+
+ for (i = 0; i < n_irqs; i++) {
+ rc = devm_request_irq(host->dev, pdev->irq + i,
+ ahci_msi_interrupt, IRQF_SHARED,
+ dev_driver_string(host->dev), host->ports[i]);
+ if (rc)
+ goto release_block;
+ }
+
+ return 0;
+
+ release_block:
+ pci_disable_msi(pdev);
+ pci_intx(pdev, 1);
+ return rc;
+}
+
static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
static int printed_version;
@@ -2268,9 +2336,6 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
(pdev->revision == 0xa1 || pdev->revision == 0xa2))
hpriv->flags |= AHCI_HFLAG_NO_MSI;
- if ((hpriv->flags & AHCI_HFLAG_NO_MSI) || pci_enable_msi(pdev))
- pci_intx(pdev, 1);
-
/* save initial config */
ahci_save_initial_config(pdev, hpriv);
@@ -2325,8 +2390,17 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
ahci_print_info(host);
pci_set_master(pdev);
- return ata_host_activate(host, pdev->irq, ahci_interrupt, IRQF_SHARED,
- &ahci_sht);
+
+ rc = ata_host_start(host);
+ if (rc)
+ return rc;
+
+ rc = ahci_request_irqs(pdev, host);
+ if (rc)
+ return rc;
+
+ rc = ata_host_register(host, &ahci_sht);
+ return rc;
}
static int __init ahci_init(void)
--
1.5.5.4
^ permalink raw reply related [flat|nested] 43+ messages in thread* Re: [PATCH 3/4] AHCI: Request multiple MSIs
2008-07-05 13:34 ` [PATCH 3/4] AHCI: Request multiple MSIs Matthew Wilcox
@ 2008-07-07 16:45 ` Grant Grundler
2008-07-07 17:48 ` Matthew Wilcox
0 siblings, 1 reply; 43+ messages in thread
From: Grant Grundler @ 2008-07-07 16:45 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, michael, linux-kernel, Matthew Wilcox
On Sat, Jul 05, 2008 at 09:34:14AM -0400, Matthew Wilcox wrote:
> AHCI controllers can support up to 16 interrupts, one per port. This
> saves us a readl() in the interrupt path to determine which port has
> generated the interrupt.
If the system is busy, the readl is the cost of coalescing the
interrupts. I suspect it's cheaper to take one readl than
handle 16 individual interrupts.
I'm just pointing out the only upside of the existing code and not trying
to argue against this patch.
Can you confirm the upside for the use case of when multiple SSDs are attached?
Avoiding the readl _and_ the loop will work much better if interrupts
are getting redirected to multiple CPUs. Latency for each drive will
be substantially lower and handle many more IOPS.
Hrm...without targeting a particular socket on a multi-socket machine,
spinlocks and control data cachelines are going to bounce around alot.
*shrug*
BTW, one more downside of the regular IRQ is it's possibly shared.
Using MSI guaratees exclusive IRQ and avoids spurious readl's
when AHCI is not busy but the other device is. This would be worth
noting (or as a reminder) in the change log or as a comment in
the code.
hth,
grant
>
> Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
> ---
> drivers/ata/ahci.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++---
> 1 files changed, 79 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
> index 061817a..4b2f90a 100644
> --- a/drivers/ata/ahci.c
> +++ b/drivers/ata/ahci.c
> @@ -102,6 +102,7 @@ enum {
> /* HOST_CTL bits */
> HOST_RESET = (1 << 0), /* reset controller; self-clear */
> HOST_IRQ_EN = (1 << 1), /* global IRQ enable */
> + HOST_MSI_RSM = (1 << 2), /* Revert to Single Message */
> HOST_AHCI_EN = (1 << 31), /* AHCI enabled */
>
> /* HOST_CAP bits */
> @@ -1771,6 +1772,13 @@ static void ahci_port_intr(struct ata_port *ap)
> }
> }
>
> +static irqreturn_t ahci_msi_interrupt(int irq, void *dev_instance)
> +{
> + struct ata_port *ap = dev_instance;
> + ahci_port_intr(ap);
> + return IRQ_HANDLED;
> +}
> +
> static irqreturn_t ahci_interrupt(int irq, void *dev_instance)
> {
> struct ata_host *host = dev_instance;
> @@ -2210,6 +2218,66 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
> }
> }
>
> +static int ahci_request_irqs(struct pci_dev *pdev, struct ata_host *host)
> +{
> + int i, n_irqs, rc;
> + struct ahci_host_priv *hpriv = host->private_data;
> +
> + if (hpriv->flags & AHCI_HFLAG_NO_MSI) {
> + n_irqs = 1;
> + } else {
> + u16 control;
> + int pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
> + pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control);
> + n_irqs = 1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1);
> +
> + for (;;) {
> + rc = pci_enable_msi_block(pdev, n_irqs);
> + if (rc == 0) {
> + void __iomem *mmio = host->iomap[AHCI_PCI_BAR];
> + u32 host_ctl = readl(mmio + HOST_CTL);
> + if ((host_ctl & HOST_MSI_RSM) == 0)
> + break;
> + pci_disable_msi(pdev);
> + n_irqs = 1;
> + } else if (rc < 0) {
> + n_irqs = 1;
> + pci_intx(pdev, 1);
> + break;
> + } else {
> + n_irqs = rc;
> + }
> + }
> + }
> +
> + /* FIXME: Add support for CCC */
> + if (n_irqs > host->n_ports) {
> + n_irqs = host->n_ports;
> + } else if (n_irqs < host->n_ports) {
> + n_irqs--;
> + rc = devm_request_irq(host->dev, pdev->irq + n_irqs,
> + ahci_interrupt, IRQF_SHARED,
> + dev_driver_string(host->dev), host);
> + if (rc)
> + goto release_block;
> + }
> +
> + for (i = 0; i < n_irqs; i++) {
> + rc = devm_request_irq(host->dev, pdev->irq + i,
> + ahci_msi_interrupt, IRQF_SHARED,
> + dev_driver_string(host->dev), host->ports[i]);
> + if (rc)
> + goto release_block;
> + }
> +
> + return 0;
> +
> + release_block:
> + pci_disable_msi(pdev);
> + pci_intx(pdev, 1);
> + return rc;
> +}
> +
> static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
> {
> static int printed_version;
> @@ -2268,9 +2336,6 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
> (pdev->revision == 0xa1 || pdev->revision == 0xa2))
> hpriv->flags |= AHCI_HFLAG_NO_MSI;
>
> - if ((hpriv->flags & AHCI_HFLAG_NO_MSI) || pci_enable_msi(pdev))
> - pci_intx(pdev, 1);
> -
> /* save initial config */
> ahci_save_initial_config(pdev, hpriv);
>
> @@ -2325,8 +2390,17 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
> ahci_print_info(host);
>
> pci_set_master(pdev);
> - return ata_host_activate(host, pdev->irq, ahci_interrupt, IRQF_SHARED,
> - &ahci_sht);
> +
> + rc = ata_host_start(host);
> + if (rc)
> + return rc;
> +
> + rc = ahci_request_irqs(pdev, host);
> + if (rc)
> + return rc;
> +
> + rc = ata_host_register(host, &ahci_sht);
> + return rc;
> }
>
> static int __init ahci_init(void)
> --
> 1.5.5.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 43+ messages in thread* Re: [PATCH 3/4] AHCI: Request multiple MSIs
2008-07-07 16:45 ` Grant Grundler
@ 2008-07-07 17:48 ` Matthew Wilcox
2008-07-20 7:49 ` Grant Grundler
0 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-07 17:48 UTC (permalink / raw)
To: Grant Grundler
Cc: linux-pci, kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, michael, linux-kernel, Matthew Wilcox
On Mon, Jul 07, 2008 at 10:45:34AM -0600, Grant Grundler wrote:
> If the system is busy, the readl is the cost of coalescing the
> interrupts. I suspect it's cheaper to take one readl than
> handle 16 individual interrupts.
16 would be a maximum imposed by the AHCI spec. My ICH9 board has 6
ports, but requests all 16 interrupts
> I'm just pointing out the only upside of the existing code and not trying
> to argue against this patch.
There may well be an upside to the existing code, but it's pretty slim.
The oprofile shows clearly that ahci_interrupt is the largest consumer of
time during an iozone run. The only thing that routine does is read the
HOST_IRQ_STAT register, acquire the spinlock and loop calling
ahci_port_intr().
I don't have a profile for this new code yet. Hopefully we'll have one
by the end of the day.
> BTW, one more downside of the regular IRQ is it's possibly shared.
> Using MSI guaratees exclusive IRQ and avoids spurious readl's
> when AHCI is not busy but the other device is. This would be worth
> noting (or as a reminder) in the change log or as a comment in
> the code.
AHCI already allocates itself a new MSI if the machine supports MSI.
This change merely extends AHCI to use multiple MSIs.
Thanks.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [PATCH 3/4] AHCI: Request multiple MSIs
2008-07-07 17:48 ` Matthew Wilcox
@ 2008-07-20 7:49 ` Grant Grundler
0 siblings, 0 replies; 43+ messages in thread
From: Grant Grundler @ 2008-07-20 7:49 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Grant Grundler, linux-pci, kaneshige.kenji, mingo, tglx, davem,
dan.j.williams, Martine.Silbermann, benh, michael, linux-kernel,
Matthew Wilcox
On Mon, Jul 07, 2008 at 11:48:03AM -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 10:45:34AM -0600, Grant Grundler wrote:
> > If the system is busy, the readl is the cost of coalescing the
> > interrupts. I suspect it's cheaper to take one readl than
> > handle 16 individual interrupts.
>
> 16 would be a maximum imposed by the AHCI spec. My ICH9 board has 6
> ports, but requests all 16 interrupts
>
> > I'm just pointing out the only upside of the existing code and not trying
> > to argue against this patch.
>
> There may well be an upside to the existing code, but it's pretty slim.
> The oprofile shows clearly that ahci_interrupt is the largest consumer of
> time during an iozone run. The only thing that routine does is read the
> HOST_IRQ_STAT register, acquire the spinlock and loop calling
> ahci_port_intr().
>
> I don't have a profile for this new code yet. Hopefully we'll have one
> by the end of the day.
Willy,
where you able to get this profile?
I'm still curious.
>
> > BTW, one more downside of the regular IRQ is it's possibly shared.
> > Using MSI guaratees exclusive IRQ and avoids spurious readl's
> > when AHCI is not busy but the other device is. This would be worth
> > noting (or as a reminder) in the change log or as a comment in
> > the code.
>
> AHCI already allocates itself a new MSI if the machine supports MSI.
> This change merely extends AHCI to use multiple MSIs.
ok.
thanks,
grant
>
> Thanks.
>
> --
> Intel are signing my paycheques ... these opinions are still mine
> "Bill, look, we understand that you're interested in selling us this
> operating system, but compare it to ours. We can't possibly take such
> a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread
* [PATCH 4/4] x86-64: Support for multiple MSIs
2008-07-05 13:27 ` Matthew Wilcox
` (2 preceding siblings ...)
2008-07-05 13:34 ` [PATCH 3/4] AHCI: Request multiple MSIs Matthew Wilcox
@ 2008-07-05 13:34 ` Matthew Wilcox
2008-07-05 13:43 ` Multiple MSI Matthew Wilcox
4 siblings, 0 replies; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-05 13:34 UTC (permalink / raw)
To: linux-pci
Cc: kaneshige.kenji, mingo, tglx, davem, dan.j.williams,
Martine.Silbermann, benh, michael, linux-kernel, Matthew Wilcox,
Matthew Wilcox
Implement the arch_setup_msi_block() interface. Rewrite create_irq()
into create_irq_block() and call create_irq_block() from create_irq().
Implement __assign_irq_vector_block() based closely on __assign_irq_vector().
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
arch/x86/kernel/io_apic_64.c | 199 ++++++++++++++++++++++++++++++++++++++----
1 files changed, 183 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8df..44e942a 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -683,6 +683,8 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
+static int current_vector = FIRST_DEVICE_VECTOR;
+
static int __assign_irq_vector(int irq, cpumask_t mask)
{
/*
@@ -696,7 +698,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_offset = 0;
unsigned int old_vector;
int cpu;
struct irq_cfg *cfg;
@@ -769,6 +771,97 @@ static int assign_irq_vector(int irq, cpumask_t mask)
return err;
}
+static int __assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+ unsigned int old_vector;
+ int i, cpu;
+ struct irq_cfg *cfg;
+
+ /*
+ * We've got to be careful not to trash gate 0x80,
+ * because int 0x80 is hm, kind of importantish. ;)
+ */
+ BUG_ON((unsigned)irq + count > NR_IRQS);
+
+ /* Only try and allocate irqs on cpus that are present */
+ cpus_and(mask, mask, cpu_online_map);
+
+ for (i = 0; i < count; i++) {
+ cfg = &irq_cfg[irq + i];
+ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ return -EBUSY;
+ }
+
+ cfg = &irq_cfg[irq];
+ old_vector = cfg->vector;
+ if (old_vector) {
+ cpumask_t tmp;
+ cpus_and(tmp, cfg->domain, mask);
+ if (!cpus_empty(tmp))
+ return 0;
+ }
+
+ for_each_cpu_mask(cpu, mask) {
+ cpumask_t domain, new_mask;
+ int new_cpu;
+ int vector;
+
+ domain = vector_allocation_domain(cpu);
+ cpus_and(new_mask, domain, cpu_online_map);
+
+ vector = current_vector & ~(count - 1);
+ next:
+ vector += count;
+ if (vector + count >= FIRST_SYSTEM_VECTOR) {
+ vector = FIRST_DEVICE_VECTOR & ~(count - 1);
+ if (vector < FIRST_DEVICE_VECTOR)
+ vector += count;
+ }
+ if (unlikely(vector == (current_vector & ~(count - 1))))
+ continue;
+ if ((IA32_SYSCALL_VECTOR >= vector) &&
+ (IA32_SYSCALL_VECTOR < vector + count))
+ goto next;
+ for_each_cpu_mask(new_cpu, new_mask) {
+ for (i = 0; i < count; i++) {
+ if (per_cpu(vector_irq, new_cpu)[vector + i]
+ != -1)
+ goto next;
+ }
+ }
+ /* Found one! */
+ current_vector = vector + count - 1;
+ for (i = 0; i < count; i++) {
+ cfg = &irq_cfg[irq + i];
+ if (old_vector) {
+ cfg->move_in_progress = 1;
+ cfg->old_domain = cfg->domain;
+ }
+ for_each_cpu_mask(new_cpu, new_mask) {
+ per_cpu(vector_irq, new_cpu)[vector + i] =
+ irq + i;
+ }
+ cfg->vector = vector;
+ cfg->domain = domain;
+ }
+ return 0;
+ }
+ return -ENOSPC;
+}
+
+/* Assumes that count is a power of two and aligns to that power of two */
+static int assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+ int result;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ result = __assign_irq_vector_block(irq, count, mask);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return result;
+}
+
static void __clear_irq_vector(int irq)
{
struct irq_cfg *cfg;
@@ -788,6 +881,14 @@ static void __clear_irq_vector(int irq)
cpus_clear(cfg->domain);
}
+static void __clear_irq_vector_block(int irq, int count)
+{
+ while (count > 0) {
+ count--;
+ __clear_irq_vector(irq + count);
+ }
+}
+
void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
@@ -1895,30 +1996,56 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-int create_irq(void)
+
+/*
+ * On success, returns the interrupt number of the lowest numbered irq
+ * in the block. If it can't find a block of the right size, it returns
+ * -1 - (length of the longest run).
+ */
+static int create_irq_block(int count)
{
- /* Allocate an unused irq */
- int irq;
- int new;
+ /* Allocate 'count' consecutive unused irqs */
+ int i, new, longest;
unsigned long flags;
- irq = -ENOSPC;
+ longest = 0;
spin_lock_irqsave(&vector_lock, flags);
for (new = (NR_IRQS - 1); new >= 0; new--) {
if (platform_legacy_irq(new))
- continue;
+ goto clear;
if (irq_cfg[new].vector != 0)
+ goto clear;
+ longest++;
+ if (longest < count)
continue;
- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
- irq = new;
+
+ while (__assign_irq_vector_block(new, longest, TARGET_CPUS))
+ longest /= 2;
+ if (longest < count)
+ __clear_irq_vector_block(new, longest);
break;
+ clear:
+ __clear_irq_vector_block(new + 1, longest);
+ longest = 0;
}
spin_unlock_irqrestore(&vector_lock, flags);
- if (irq >= 0) {
- dynamic_irq_init(irq);
+ if (longest < count)
+ return -1 - longest;
+
+ for (i = 0; i < count; i++) {
+ dynamic_irq_init(new + i);
}
- return irq;
+
+ return new;
+}
+
+int create_irq(void)
+{
+ int ret = create_irq_block(1);
+ if (ret < 0)
+ return -ENOSPC;
+ return ret;
}
void destroy_irq(unsigned int irq)
@@ -1936,7 +2063,8 @@ void destroy_irq(unsigned int irq)
* MSI message composition
*/
#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+ unsigned int count, struct msi_msg *msg)
{
struct irq_cfg *cfg = irq_cfg + irq;
int err;
@@ -1944,7 +2072,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
cpumask_t tmp;
tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ if (count == 1)
+ err = assign_irq_vector(irq, tmp);
+ else
+ err = assign_irq_vector_block(irq, count, tmp);
if (!err) {
cpus_and(tmp, cfg->domain, tmp);
dest = cpu_mask_to_apicid(tmp);
@@ -1975,6 +2106,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
struct irq_cfg *cfg = irq_cfg + irq;
+ struct msi_desc *desc = get_irq_msi(irq);
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
@@ -1983,6 +2115,10 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
if (cpus_empty(tmp))
return;
+ /* XXX: Figure out how to do CPU affinity for multiple MSIs */
+ if (desc->msi_attrib.multiple)
+ return;
+
if (assign_irq_vector(irq, mask))
return;
@@ -2024,7 +2160,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
if (irq < 0)
return irq;
- ret = msi_compose_msg(dev, irq, &msg);
+ ret = msi_compose_msg(dev, irq, 1, &msg);
if (ret < 0) {
destroy_irq(irq);
return ret;
@@ -2038,6 +2174,37 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
return 0;
}
+int arch_setup_msi_block(struct pci_dev *pdev, struct msi_desc *desc, int count)
+{
+ struct msi_msg msg;
+ int i, ret, base_irq, alloc;
+
+ /* MSI can only allocate a power-of-two */
+ alloc = roundup_pow_of_two(count);
+
+ base_irq = create_irq_block(alloc);
+ if (base_irq < 0)
+ return rounddown_pow_of_two(-base_irq - 1);
+
+ ret = msi_compose_msg(pdev, base_irq, alloc, &msg);
+ if (ret)
+ return ret;
+
+ desc->msi_attrib.multiple = order_base_2(alloc);
+
+ /* Do loop in reverse so set_irq_msi ends up setting
+ * desc->irq to base_irq
+ */
+ for (i = count - 1; i >= 0; i--) {
+ set_irq_msi(base_irq + i, desc);
+ set_irq_chip_and_handler_name(base_irq + i, &msi_chip,
+ handle_edge_irq, "edge");
+ }
+ write_msi_msg(base_irq, &msg);
+
+ return 0;
+}
+
void arch_teardown_msi_irq(unsigned int irq)
{
destroy_irq(irq);
@@ -2090,7 +2257,7 @@ int arch_setup_dmar_msi(unsigned int irq)
int ret;
struct msi_msg msg;
- ret = msi_compose_msg(NULL, irq, &msg);
+ ret = msi_compose_msg(NULL, irq, 1, &msg);
if (ret < 0)
return ret;
dmar_msi_write(irq, &msg);
--
1.5.5.4
^ permalink raw reply related [flat|nested] 43+ messages in thread* Re: Multiple MSI
2008-07-05 13:27 ` Matthew Wilcox
` (3 preceding siblings ...)
2008-07-05 13:34 ` [PATCH 4/4] x86-64: Support for " Matthew Wilcox
@ 2008-07-05 13:43 ` Matthew Wilcox
2008-07-05 22:38 ` Matthew Wilcox
4 siblings, 1 reply; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-05 13:43 UTC (permalink / raw)
To: linux-pci
Cc: Kenji Kaneshige, Ingo Molnar, Thomas Gleixner, David Miller,
Dan Williams, Martine.Silbermann, Benjamin Herrenschmidt,
linux-kernel, Michael Ellerman
On Sat, Jul 05, 2008 at 07:27:28AM -0600, Matthew Wilcox wrote:
> Here's some code. It's four patches, the first two are to the PCI MSI
> code, the third is for the AHCI driver and the fourth is for the x86-64
> interrupt code.
>
> It's had some light testing; no performance testing yet. I'd value some
> review.
It's taking a while to come through ... patches are also at:
http://www.parisc-linux.org/~willy/multiple-msi/
I'll put up the git tree if there's demand.
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread* Re: Multiple MSI
2008-07-05 13:43 ` Multiple MSI Matthew Wilcox
@ 2008-07-05 22:38 ` Matthew Wilcox
0 siblings, 0 replies; 43+ messages in thread
From: Matthew Wilcox @ 2008-07-05 22:38 UTC (permalink / raw)
To: linux-pci
Cc: Kenji Kaneshige, Ingo Molnar, Thomas Gleixner, David Miller,
Dan Williams, Martine.Silbermann, Benjamin Herrenschmidt,
linux-kernel, Michael Ellerman, Jeff Garzik, linux-ide
On Sat, Jul 05, 2008 at 07:43:42AM -0600, Matthew Wilcox wrote:
> On Sat, Jul 05, 2008 at 07:27:28AM -0600, Matthew Wilcox wrote:
> > Here's some code. It's four patches, the first two are to the PCI MSI
> > code, the third is for the AHCI driver and the fourth is for the x86-64
> > interrupt code.
> >
> > It's had some light testing; no performance testing yet. I'd value some
> > review.
>
> It's taking a while to come through ... patches are also at:
>
> http://www.parisc-linux.org/~willy/multiple-msi/
>
> I'll put up the git tree if there's demand.
I found and fixed a couple of bugs ... and implemented CPU affinity.
As the comment says, when you move one MSI, you move them all (at least
as far as I can figure out the x86 APIC architecture ... other
architectures may not have this problem). Git tree now available:
git://git.kernel.org/pub/scm/linux/kernel/git/willy/misc.git multiple-msi
(will be updated as and when)
git://git.kernel.org/pub/scm/linux/kernel/git/willy/misc.git multiple-msi-20080705
(semi-permanent)
I don't intend to submit these patches to Linus myself; I'd like the
first two to go in through the PCI tree. The third patch does depend on
the first two, but should go in through the IDE tree. The fourth patch
is entirely independent of the first three and can go in through the x86
tree at any time.
I love these cross-responsibility patch sets ;-)
--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 43+ messages in thread