* [PATCH v8 1/7] PCI: initialize and release SR-IOV capability
2009-02-10 8:59 [PATCH v8 0/7] PCI: Linux kernel SR-IOV support Yu Zhao
@ 2009-02-10 8:59 ` Yu Zhao
2009-02-13 16:56 ` Andi Kleen
2009-02-10 8:59 ` [PATCH v8 2/7] PCI: restore saved SR-IOV state Yu Zhao
` (5 subsequent siblings)
6 siblings, 1 reply; 12+ messages in thread
From: Yu Zhao @ 2009-02-10 8:59 UTC (permalink / raw)
To: jbarnes; +Cc: linux-pci, kvm, linux-kernel, Yu Zhao
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
drivers/pci/Kconfig | 13 ++++
drivers/pci/Makefile | 3 +
drivers/pci/iov.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++
drivers/pci/pci.c | 7 ++
drivers/pci/pci.h | 37 ++++++++++
drivers/pci/probe.c | 4 +
include/linux/pci.h | 8 ++
include/linux/pci_regs.h | 33 +++++++++
8 files changed, 283 insertions(+), 0 deletions(-)
create mode 100644 drivers/pci/iov.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2a4501d..2d0ca01 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -59,3 +59,16 @@ config HT_IRQ
This allows native hypertransport devices to use interrupts.
If unsure say Y.
+
+config PCI_IOV
+ bool "PCI IOV support"
+ depends on PCI
+ select PCI_MSI
+ default n
+ help
+ PCI-SIG I/O Virtualization (IOV) Specifications support.
+ Single Root IOV: allows the Physical Function driver to enable
+ the hardware capability, so the Virtual Function is accessible
+ via the PCI Configuration Space using its own Bus, Device and
+ Function Numbers. Each Virtual Function also has the PCI Memory
+ Space to map the device specific register set.
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 3d07ce2..ba99282 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -29,6 +29,9 @@ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
+# PCI IOV support
+obj-$(CONFIG_PCI_IOV) += iov.o
+
#
# Some architectures use the generic PCI setup functions
#
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
new file mode 100644
index 0000000..9a1fabd
--- /dev/null
+++ b/drivers/pci/iov.c
@@ -0,0 +1,178 @@
+/*
+ * drivers/pci/iov.c
+ *
+ * Copyright (C) 2009 Intel Corporation, Yu Zhao <yu.zhao@intel.com>
+ *
+ * PCI Express I/O Virtualization (IOV) support.
+ * Single Root IOV 1.0
+ */
+
+#include <linux/pci.h>
+#include "pci.h"
+
+
+static int sriov_init(struct pci_dev *dev, int pos)
+{
+ int i;
+ int rc;
+ int nres;
+ u32 pgsz;
+ u16 ctrl, total, offset, stride;
+ struct pci_sriov *iov;
+ struct resource *res;
+ struct pci_dev *pdev;
+
+ if (dev->pcie_type != PCI_EXP_TYPE_RC_END &&
+ dev->pcie_type != PCI_EXP_TYPE_ENDPOINT)
+ return -ENODEV;
+
+ pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
+ if (ctrl & PCI_SRIOV_CTRL_VFE) {
+ pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
+ msleep(100);
+ }
+
+ pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
+ if (!total)
+ return 0;
+
+ list_for_each_entry(pdev, &dev->bus->devices, bus_list)
+ if (pdev->sriov)
+ break;
+ if (list_empty(&dev->bus->devices) || !pdev->sriov)
+ pdev = NULL;
+
+ ctrl = 0;
+ if (!pdev && pci_ari_enabled(dev->bus))
+ ctrl |= PCI_SRIOV_CTRL_ARI;
+
+ pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
+ pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
+ pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
+ pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
+ if (!offset || (total > 1 && !stride))
+ return -EIO;
+
+ pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
+ i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
+ pgsz &= ~((1 << i) - 1);
+ if (!pgsz)
+ return -EIO;
+
+ pgsz &= ~(pgsz - 1);
+ pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
+
+ nres = 0;
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = dev->resource + PCI_SRIOV_RESOURCES + i;
+ i += __pci_read_base(dev, pci_bar_unknown, res,
+ pos + PCI_SRIOV_BAR + i * 4);
+ if (!res->flags)
+ continue;
+ if (resource_size(res) & (PAGE_SIZE - 1)) {
+ rc = -EIO;
+ goto failed;
+ }
+ res->end = res->start + resource_size(res) * total - 1;
+ nres++;
+ }
+
+ iov = kzalloc(sizeof(*iov), GFP_KERNEL);
+ if (!iov) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ iov->pos = pos;
+ iov->nres = nres;
+ iov->ctrl = ctrl;
+ iov->total = total;
+ iov->offset = offset;
+ iov->stride = stride;
+ iov->pgsz = pgsz;
+ iov->self = dev;
+ pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
+ pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
+
+ if (pdev)
+ iov->pdev = pci_dev_get(pdev);
+ else {
+ iov->pdev = dev;
+ mutex_init(&iov->lock);
+ }
+
+ dev->sriov = iov;
+
+ return 0;
+
+failed:
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = dev->resource + PCI_SRIOV_RESOURCES + i;
+ res->flags = 0;
+ }
+
+ return rc;
+}
+
+static void sriov_release(struct pci_dev *dev)
+{
+ if (dev == dev->sriov->pdev)
+ mutex_destroy(&dev->sriov->lock);
+ else
+ pci_dev_put(dev->sriov->pdev);
+
+ kfree(dev->sriov);
+ dev->sriov = NULL;
+}
+
+/**
+ * pci_iov_init - initialize the IOV capability
+ * @dev: the PCI device
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_iov_init(struct pci_dev *dev)
+{
+ int pos;
+
+ if (!dev->is_pcie)
+ return -ENODEV;
+
+ pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV);
+ if (pos)
+ return sriov_init(dev, pos);
+
+ return -ENODEV;
+}
+
+/**
+ * pci_iov_release - release resources used by the IOV capability
+ * @dev: the PCI device
+ */
+void pci_iov_release(struct pci_dev *dev)
+{
+ if (dev->sriov)
+ sriov_release(dev);
+}
+
+/**
+ * pci_iov_resource_bar - get position of the SR-IOV BAR
+ * @dev: the PCI device
+ * @resno: the resource number
+ * @type: the BAR type to be filled in
+ *
+ * Returns position of the BAR encapsulated in the SR-IOV capability.
+ */
+int pci_iov_resource_bar(struct pci_dev *dev, int resno,
+ enum pci_bar_type *type)
+{
+ if (resno < PCI_SRIOV_RESOURCES || resno > PCI_SRIOV_RESOURCE_END)
+ return 0;
+
+ BUG_ON(!dev->sriov);
+
+ *type = pci_bar_unknown;
+
+ return dev->sriov->pos + PCI_SRIOV_BAR +
+ 4 * (resno - PCI_SRIOV_RESOURCES);
+}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e3efe6b..c4f14f3 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2341,12 +2341,19 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags)
*/
int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type)
{
+ int reg;
+
if (resno < PCI_ROM_RESOURCE) {
*type = pci_bar_unknown;
return PCI_BASE_ADDRESS_0 + 4 * resno;
} else if (resno == PCI_ROM_RESOURCE) {
*type = pci_bar_mem32;
return dev->rom_base_reg;
+ } else if (resno < PCI_BRIDGE_RESOURCES) {
+ /* device specific resource */
+ reg = pci_iov_resource_bar(dev, resno, type);
+ if (reg)
+ return reg;
}
dev_err(&dev->dev, "BAR: invalid resource #%d\n", resno);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 26ddf78..d2dc6b7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -195,4 +195,41 @@ static inline int pci_ari_enabled(struct pci_bus *bus)
return bus->self && bus->self->ari_enabled;
}
+/* Single Root I/O Virtualization */
+struct pci_sriov {
+ int pos; /* capability position */
+ int nres; /* number of resources */
+ u32 cap; /* SR-IOV Capabilities */
+ u16 ctrl; /* SR-IOV Control */
+ u16 total; /* total VFs associated with the PF */
+ u16 offset; /* first VF Routing ID offset */
+ u16 stride; /* following VF stride */
+ u32 pgsz; /* page size for BAR alignment */
+ u8 link; /* Function Dependency Link */
+ struct pci_dev *pdev; /* lowest numbered PF */
+ struct pci_dev *self; /* this PF */
+ struct mutex lock; /* lock for VF bus */
+};
+
+#ifdef CONFIG_PCI_IOV
+extern int pci_iov_init(struct pci_dev *dev);
+extern void pci_iov_release(struct pci_dev *dev);
+extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
+ enum pci_bar_type *type);
+#else
+static inline int pci_iov_init(struct pci_dev *dev)
+{
+ return -ENODEV;
+}
+static inline void pci_iov_release(struct pci_dev *dev)
+
+{
+}
+static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno,
+ enum pci_bar_type *type)
+{
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 55ec44a..03b6f29 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -785,6 +785,7 @@ static int pci_setup_device(struct pci_dev * dev)
static void pci_release_capabilities(struct pci_dev *dev)
{
pci_vpd_release(dev);
+ pci_iov_release(dev);
}
/**
@@ -972,6 +973,9 @@ static void pci_init_capabilities(struct pci_dev *dev)
/* Alternative Routing-ID Forwarding */
pci_enable_ari(dev);
+
+ /* Single Root I/O Virtualization */
+ pci_iov_init(dev);
}
void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7bd624b..f4d740e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -93,6 +93,12 @@ enum {
/* #6: expansion ROM resource */
PCI_ROM_RESOURCE,
+ /* device specific resources */
+#ifdef CONFIG_PCI_IOV
+ PCI_SRIOV_RESOURCES,
+ PCI_SRIOV_RESOURCE_END = PCI_SRIOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1,
+#endif
+
/* resources assigned to buses behind the bridge */
#define PCI_BRIDGE_RESOURCE_NUM 4
@@ -180,6 +186,7 @@ struct pci_cap_saved_state {
struct pcie_link_state;
struct pci_vpd;
+struct pci_sriov;
/*
* The pci_dev structure is used to describe PCI devices.
@@ -270,6 +277,7 @@ struct pci_dev {
struct list_head msi_list;
#endif
struct pci_vpd *vpd;
+ struct pci_sriov *sriov; /* SR-IOV capability related */
};
extern struct pci_dev *alloc_pci_dev(void);
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index 027815b..4ce5eb0 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -375,6 +375,7 @@
#define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */
#define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */
#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */
+#define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */
#define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */
#define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */
#define PCI_EXP_DEVCAP 4 /* Device capabilities */
@@ -498,6 +499,7 @@
#define PCI_EXT_CAP_ID_DSN 3
#define PCI_EXT_CAP_ID_PWR 4
#define PCI_EXT_CAP_ID_ARI 14
+#define PCI_EXT_CAP_ID_SRIOV 16
/* Advanced Error Reporting */
#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */
@@ -615,4 +617,35 @@
#define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */
#define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */
+/* Single Root I/O Virtualization */
+#define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */
+#define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */
+#define PCI_SRIOV_CAP_INTR(x) ((x) >> 21) /* Interrupt Message Number */
+#define PCI_SRIOV_CTRL 0x08 /* SR-IOV Control */
+#define PCI_SRIOV_CTRL_VFE 0x01 /* VF Enable */
+#define PCI_SRIOV_CTRL_VFM 0x02 /* VF Migration Enable */
+#define PCI_SRIOV_CTRL_INTR 0x04 /* VF Migration Interrupt Enable */
+#define PCI_SRIOV_CTRL_MSE 0x08 /* VF Memory Space Enable */
+#define PCI_SRIOV_CTRL_ARI 0x10 /* ARI Capable Hierarchy */
+#define PCI_SRIOV_STATUS 0x0a /* SR-IOV Status */
+#define PCI_SRIOV_STATUS_VFM 0x01 /* VF Migration Status */
+#define PCI_SRIOV_INITIAL_VF 0x0c /* Initial VFs */
+#define PCI_SRIOV_TOTAL_VF 0x0e /* Total VFs */
+#define PCI_SRIOV_NUM_VF 0x10 /* Number of VFs */
+#define PCI_SRIOV_FUNC_LINK 0x12 /* Function Dependency Link */
+#define PCI_SRIOV_VF_OFFSET 0x14 /* First VF Offset */
+#define PCI_SRIOV_VF_STRIDE 0x16 /* Following VF Stride */
+#define PCI_SRIOV_VF_DID 0x1a /* VF Device ID */
+#define PCI_SRIOV_SUP_PGSIZE 0x1c /* Supported Page Sizes */
+#define PCI_SRIOV_SYS_PGSIZE 0x20 /* System Page Size */
+#define PCI_SRIOV_BAR 0x24 /* VF BAR0 */
+#define PCI_SRIOV_NUM_BARS 6 /* Number of VF BARs */
+#define PCI_SRIOV_VFM 0x3c /* VF Migration State Array Offset*/
+#define PCI_SRIOV_VFM_BIR(x) ((x) & 7) /* State BIR */
+#define PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7) /* State Offset */
+#define PCI_SRIOV_VFM_UA 0x0 /* Inactive.Unavailable */
+#define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */
+#define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */
+#define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */
+
#endif /* LINUX_PCI_REGS_H */
--
1.5.6.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v8 1/7] PCI: initialize and release SR-IOV capability
2009-02-10 8:59 ` [PATCH v8 1/7] PCI: initialize and release SR-IOV capability Yu Zhao
@ 2009-02-13 16:56 ` Andi Kleen
2009-02-13 12:30 ` Yu Zhao
2009-02-13 17:49 ` Matthew Wilcox
0 siblings, 2 replies; 12+ messages in thread
From: Andi Kleen @ 2009-02-13 16:56 UTC (permalink / raw)
To: Yu Zhao; +Cc: jbarnes, linux-pci, kvm, linux-kernel
Yu Zhao <yu.zhao@intel.com> writes:
> +
> +
> +static int sriov_init(struct pci_dev *dev, int pos)
> +{
> + int i;
> + int rc;
> + int nres;
> + u32 pgsz;
> + u16 ctrl, total, offset, stride;
> + struct pci_sriov *iov;
> + struct resource *res;
> + struct pci_dev *pdev;
> +
> + if (dev->pcie_type != PCI_EXP_TYPE_RC_END &&
> + dev->pcie_type != PCI_EXP_TYPE_ENDPOINT)
> + return -ENODEV;
> +
It would be a good idea to put a might_sleep() here just in
case the msleep happens below and drivers call it incorrectly.
> + pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
> + if (ctrl & PCI_SRIOV_CTRL_VFE) {
> + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
> + msleep(100);
That's really long. Hopefully that's really needed.
> +
> + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
> + pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
> + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
> + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
> + if (!offset || (total > 1 && !stride))
> + return -EIO;
> +
> + pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
> + i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
> + pgsz &= ~((1 << i) - 1);
> + if (!pgsz)
> + return -EIO;
All the error paths don't seem to undo the config space writes.
How will the devices behave with half initialized context?
-Andi
--
ak@linux.intel.com -- Speaking for myself only.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v8 1/7] PCI: initialize and release SR-IOV capability
2009-02-13 16:56 ` Andi Kleen
@ 2009-02-13 12:30 ` Yu Zhao
2009-02-13 17:49 ` Matthew Wilcox
1 sibling, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-13 12:30 UTC (permalink / raw)
To: Andi Kleen
Cc: jbarnes@virtuousgeek.org, linux-pci@vger.kernel.org,
kvm@vger.kernel.org, linux-kernel@vger.kernel.org
On Sat, Feb 14, 2009 at 12:56:44AM +0800, Andi Kleen wrote:
> Yu Zhao <yu.zhao@intel.com> writes:
> > +
> > +
> > +static int sriov_init(struct pci_dev *dev, int pos)
> > +{
> > + int i;
> > + int rc;
> > + int nres;
> > + u32 pgsz;
> > + u16 ctrl, total, offset, stride;
> > + struct pci_sriov *iov;
> > + struct resource *res;
> > + struct pci_dev *pdev;
> > +
> > + if (dev->pcie_type != PCI_EXP_TYPE_RC_END &&
> > + dev->pcie_type != PCI_EXP_TYPE_ENDPOINT)
> > + return -ENODEV;
> > +
>
> It would be a good idea to put a might_sleep() here just in
> case the msleep happens below and drivers call it incorrectly.
Yes, will do.
> > + pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
> > + if (ctrl & PCI_SRIOV_CTRL_VFE) {
> > + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
> > + msleep(100);
>
> That's really long. Hopefully that's really needed.
It's needed according to SR-IOV spec, however, these lines clear
the VF Enable bit if the BIOS or something else has set it. So it
doesn't always run into this.
> > +
> > + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
> > + pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
> > + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
> > + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
> > + if (!offset || (total > 1 && !stride))
> > + return -EIO;
> > +
> > + pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
> > + i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
> > + pgsz &= ~((1 << i) - 1);
> > + if (!pgsz)
> > + return -EIO;
>
> All the error paths don't seem to undo the config space writes.
> How will the devices behave with half initialized context?
Since the VF Enable bit is cleared before the initialization, setting
other SR-IOV registers won't change the state of the device. So it should
be OK even without undoing these writes as long as the VF Enable bit is
not set.
Thanks,
Yu
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v8 1/7] PCI: initialize and release SR-IOV capability
2009-02-13 16:56 ` Andi Kleen
2009-02-13 12:30 ` Yu Zhao
@ 2009-02-13 17:49 ` Matthew Wilcox
2009-02-13 12:47 ` Yu Zhao
1 sibling, 1 reply; 12+ messages in thread
From: Matthew Wilcox @ 2009-02-13 17:49 UTC (permalink / raw)
To: Andi Kleen; +Cc: Yu Zhao, jbarnes, linux-pci, kvm, linux-kernel
On Fri, Feb 13, 2009 at 05:56:44PM +0100, Andi Kleen wrote:
> > + pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
> > + if (ctrl & PCI_SRIOV_CTRL_VFE) {
> > + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
> > + msleep(100);
>
> That's really long. Hopefully that's really needed.
Yes and no. The spec says:
To allow components to perform internal initialization, system software
must wait for at least 100 ms after changing the VF Enable bit from
a 0 to a 1, before it is permitted to issue Configuration Requests to
the VFs which are enabled by that VF Enable bit.
So we don't have to wait here, but we do have to wait before exposing
all these virtual functions to the rest of the system. Should we add
more complexity, perhaps spawn a thread to do it asynchronously, or add
0.1 seconds to device initialisation? A question without an easy
answer, IMO.
> > +
> > + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
> > + pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
> > + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
> > + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
> > + if (!offset || (total > 1 && !stride))
> > + return -EIO;
> > +
> > + pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
> > + i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
> > + pgsz &= ~((1 << i) - 1);
> > + if (!pgsz)
> > + return -EIO;
>
> All the error paths don't seem to undo the config space writes.
> How will the devices behave with half initialized context?
I think we should clear the VF_ENABLE bit. That action is also fraught
with danger:
If software Clears VF Enable, software must allow 1 second after VF
Enable is Cleared before reading any field in the SR-IOV Extended
Capability or the VF Migration State Array (see Section 3.3.15.1).
Another msleep(1000) here? Not pretty, but what else can we do?
Not to mention the danger of something else innocently using lspci -xxxx
to read a field in the extended capability -- I suspect we also need to
block user config accesses before clearing this bit.
--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v8 1/7] PCI: initialize and release SR-IOV capability
2009-02-13 17:49 ` Matthew Wilcox
@ 2009-02-13 12:47 ` Yu Zhao
0 siblings, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-13 12:47 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Andi Kleen, jbarnes@virtuousgeek.org, linux-pci@vger.kernel.org,
kvm@vger.kernel.org, linux-kernel@vger.kernel.org
On Sat, Feb 14, 2009 at 01:49:59AM +0800, Matthew Wilcox wrote:
> On Fri, Feb 13, 2009 at 05:56:44PM +0100, Andi Kleen wrote:
> > > + pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
> > > + if (ctrl & PCI_SRIOV_CTRL_VFE) {
> > > + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
> > > + msleep(100);
> >
> > That's really long. Hopefully that's really needed.
>
> Yes and no. The spec says:
>
> To allow components to perform internal initialization, system software
> must wait for at least 100 ms after changing the VF Enable bit from
> a 0 to a 1, before it is permitted to issue Configuration Requests to
> the VFs which are enabled by that VF Enable bit.
>
> So we don't have to wait here, but we do have to wait before exposing
> all these virtual functions to the rest of the system. Should we add
> more complexity, perhaps spawn a thread to do it asynchronously, or add
> 0.1 seconds to device initialisation? A question without an easy
> answer, IMO.
This clears the VF Enable bit only if the BIOS has set it, so it doesn't
always happen. Actually the `msleep(100)' should be `ssleep(1)' here,
according to the spec you showed us below. I remembered the waiting time
incorrectly as 100ms which is the requirement for setting the VF Enable
bit rather than clearing it.
> > > +
> > > + pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
> > > + pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, total);
> > > + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
> > > + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
> > > + if (!offset || (total > 1 && !stride))
> > > + return -EIO;
> > > +
> > > + pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
> > > + i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
> > > + pgsz &= ~((1 << i) - 1);
> > > + if (!pgsz)
> > > + return -EIO;
> >
> > All the error paths don't seem to undo the config space writes.
> > How will the devices behave with half initialized context?
>
> I think we should clear the VF_ENABLE bit. That action is also fraught
> with danger:
The VF Enable bit hasn't been set yet :-) Actually the spec forbids the
s/w to write those registers (NumVFs, Supported Page Size, etc.) when the
enabling bit is set.
>
> If software Clears VF Enable, software must allow 1 second after VF
> Enable is Cleared before reading any field in the SR-IOV Extended
> Capability or the VF Migration State Array (see Section 3.3.15.1).
>
> Another msleep(1000) here? Not pretty, but what else can we do?
>
> Not to mention the danger of something else innocently using lspci -xxxx
> to read a field in the extended capability -- I suspect we also need to
> block user config accesses before clearing this bit.
Yes, we should block user config access.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v8 2/7] PCI: restore saved SR-IOV state
2009-02-10 8:59 [PATCH v8 0/7] PCI: Linux kernel SR-IOV support Yu Zhao
2009-02-10 8:59 ` [PATCH v8 1/7] PCI: initialize and release SR-IOV capability Yu Zhao
@ 2009-02-10 8:59 ` Yu Zhao
2009-02-10 8:59 ` [PATCH v8 3/7] PCI: reserve bus range for SR-IOV device Yu Zhao
` (4 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-10 8:59 UTC (permalink / raw)
To: jbarnes; +Cc: linux-pci, kvm, linux-kernel, Yu Zhao
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
drivers/pci/iov.c | 25 +++++++++++++++++++++++++
drivers/pci/pci.c | 1 +
drivers/pci/pci.h | 4 ++++
3 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 9a1fabd..bd389b4 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -125,6 +125,21 @@ static void sriov_release(struct pci_dev *dev)
dev->sriov = NULL;
}
+static void sriov_restore_state(struct pci_dev *dev)
+{
+ u16 ctrl;
+ struct pci_sriov *iov = dev->sriov;
+
+ pci_read_config_word(dev, iov->pos + PCI_SRIOV_CTRL, &ctrl);
+ if (ctrl & PCI_SRIOV_CTRL_VFE)
+ return;
+
+ pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
+ msleep(100);
+}
+
/**
* pci_iov_init - initialize the IOV capability
* @dev: the PCI device
@@ -176,3 +191,13 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno,
return dev->sriov->pos + PCI_SRIOV_BAR +
4 * (resno - PCI_SRIOV_RESOURCES);
}
+
+/**
+ * pci_restore_iov_state - restore the state of the IOV capability
+ * @dev: the PCI device
+ */
+void pci_restore_iov_state(struct pci_dev *dev)
+{
+ if (dev->sriov)
+ sriov_restore_state(dev);
+}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index c4f14f3..f791dcf 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -773,6 +773,7 @@ pci_restore_state(struct pci_dev *dev)
}
pci_restore_pcix_state(dev);
pci_restore_msi_state(dev);
+ pci_restore_iov_state(dev);
return 0;
}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d2dc6b7..9d76737 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -216,6 +216,7 @@ extern int pci_iov_init(struct pci_dev *dev);
extern void pci_iov_release(struct pci_dev *dev);
extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
enum pci_bar_type *type);
+extern void pci_restore_iov_state(struct pci_dev *dev);
#else
static inline int pci_iov_init(struct pci_dev *dev)
{
@@ -230,6 +231,9 @@ static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno,
{
return 0;
}
+static inline void pci_restore_iov_state(struct pci_dev *dev)
+{
+}
#endif /* CONFIG_PCI_IOV */
#endif /* DRIVERS_PCI_H */
--
1.5.6.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v8 3/7] PCI: reserve bus range for SR-IOV device
2009-02-10 8:59 [PATCH v8 0/7] PCI: Linux kernel SR-IOV support Yu Zhao
2009-02-10 8:59 ` [PATCH v8 1/7] PCI: initialize and release SR-IOV capability Yu Zhao
2009-02-10 8:59 ` [PATCH v8 2/7] PCI: restore saved SR-IOV state Yu Zhao
@ 2009-02-10 8:59 ` Yu Zhao
2009-02-10 8:59 ` [PATCH v8 4/7] PCI: add SR-IOV API for Physical Function driver Yu Zhao
` (3 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-10 8:59 UTC (permalink / raw)
To: jbarnes; +Cc: linux-pci, kvm, linux-kernel, Yu Zhao
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
drivers/pci/iov.c | 34 ++++++++++++++++++++++++++++++++++
drivers/pci/pci.h | 5 +++++
drivers/pci/probe.c | 3 +++
3 files changed, 42 insertions(+), 0 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index bd389b4..1cf13be 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -11,6 +11,16 @@
#include "pci.h"
+static inline void virtfn_bdf(struct pci_dev *dev, int id, u8 *busnr, u8 *devfn)
+{
+ u16 bdf;
+
+ bdf = (dev->bus->number << 8) + dev->devfn +
+ dev->sriov->offset + dev->sriov->stride * id;
+ *busnr = bdf >> 8;
+ *devfn = bdf & 0xff;
+}
+
static int sriov_init(struct pci_dev *dev, int pos)
{
int i;
@@ -201,3 +211,27 @@ void pci_restore_iov_state(struct pci_dev *dev)
if (dev->sriov)
sriov_restore_state(dev);
}
+
+/**
+ * pci_iov_bus_range - find bus range used by Virtual Function
+ * @bus: the PCI bus
+ *
+ * Returns max number of buses (exclude current one) used by Virtual
+ * Functions.
+ */
+int pci_iov_bus_range(struct pci_bus *bus)
+{
+ int max = 0;
+ u8 busnr, devfn;
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ if (!dev->sriov)
+ continue;
+ virtfn_bdf(dev, dev->sriov->total - 1, &busnr, &devfn);
+ if (busnr > max)
+ max = busnr;
+ }
+
+ return max ? max - bus->number : 0;
+}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9d76737..fdfc476 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -217,6 +217,7 @@ extern void pci_iov_release(struct pci_dev *dev);
extern int pci_iov_resource_bar(struct pci_dev *dev, int resno,
enum pci_bar_type *type);
extern void pci_restore_iov_state(struct pci_dev *dev);
+extern int pci_iov_bus_range(struct pci_bus *bus);
#else
static inline int pci_iov_init(struct pci_dev *dev)
{
@@ -234,6 +235,10 @@ static inline int pci_iov_resource_bar(struct pci_dev *dev, int resno,
static inline void pci_restore_iov_state(struct pci_dev *dev)
{
}
+static inline int pci_iov_bus_range(struct pci_bus *bus)
+{
+ return 0;
+}
#endif /* CONFIG_PCI_IOV */
#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 03b6f29..4c8abd0 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1078,6 +1078,9 @@ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
for (devfn = 0; devfn < 0x100; devfn += 8)
pci_scan_slot(bus, devfn);
+ /* Reserve buses for SR-IOV capability. */
+ max += pci_iov_bus_range(bus);
+
/*
* After performing arch-dependent fixup of the bus, look behind
* all PCI-to-PCI bridges on this bus.
--
1.5.6.4
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v8 4/7] PCI: add SR-IOV API for Physical Function driver
2009-02-10 8:59 [PATCH v8 0/7] PCI: Linux kernel SR-IOV support Yu Zhao
` (2 preceding siblings ...)
2009-02-10 8:59 ` [PATCH v8 3/7] PCI: reserve bus range for SR-IOV device Yu Zhao
@ 2009-02-10 8:59 ` Yu Zhao
2009-02-10 8:59 ` [PATCH v8 5/7] PCI: handle SR-IOV Virtual Function Migration Yu Zhao
` (2 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-10 8:59 UTC (permalink / raw)
To: jbarnes; +Cc: linux-pci, kvm, linux-kernel, Yu Zhao
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
drivers/pci/iov.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++
drivers/pci/pci.h | 3 +
include/linux/pci.h | 14 ++
3 files changed, 353 insertions(+), 0 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 1cf13be..d576160 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -10,6 +10,8 @@
#include <linux/pci.h>
#include "pci.h"
+#define VIRTFN_ID_LEN 8
+
static inline void virtfn_bdf(struct pci_dev *dev, int id, u8 *busnr, u8 *devfn)
{
@@ -21,6 +23,311 @@ static inline void virtfn_bdf(struct pci_dev *dev, int id, u8 *busnr, u8 *devfn)
*devfn = bdf & 0xff;
}
+static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
+{
+ int rc;
+ struct pci_bus *child;
+
+ if (bus->number == busnr)
+ return bus;
+
+ child = pci_find_bus(pci_domain_nr(bus), busnr);
+ if (child)
+ return child;
+
+ child = pci_add_new_bus(bus, NULL, busnr);
+ if (!child)
+ return NULL;
+
+ child->subordinate = busnr;
+ child->dev.parent = bus->bridge;
+ rc = pci_bus_add_child(child);
+ if (rc) {
+ pci_remove_bus(child);
+ return NULL;
+ }
+
+ return child;
+}
+
+static void virtfn_remove_bus(struct pci_bus *bus, int busnr)
+{
+ struct pci_bus *child;
+
+ if (bus->number == busnr)
+ return;
+
+ child = pci_find_bus(pci_domain_nr(bus), busnr);
+ BUG_ON(!child);
+
+ if (list_empty(&child->devices))
+ pci_remove_bus(child);
+}
+
+static int virtfn_add(struct pci_dev *dev, int id, int reset)
+{
+ int i;
+ int rc;
+ u64 size;
+ u8 busnr, devfn;
+ char buf[VIRTFN_ID_LEN];
+ struct pci_dev *virtfn;
+ struct resource *res;
+ struct pci_sriov *iov = dev->sriov;
+
+ virtfn = alloc_pci_dev();
+ if (!virtfn)
+ return -ENOMEM;
+
+ virtfn_bdf(dev, id, &busnr, &devfn);
+ mutex_lock(&iov->pdev->sriov->lock);
+ virtfn->bus = virtfn_add_bus(dev->bus, busnr);
+ if (!virtfn->bus) {
+ kfree(virtfn);
+ mutex_unlock(&iov->pdev->sriov->lock);
+ return -ENOMEM;
+ }
+
+ virtfn->sysdata = dev->bus->sysdata;
+ virtfn->dev.parent = dev->dev.parent;
+ virtfn->dev.bus = dev->dev.bus;
+ virtfn->devfn = devfn;
+ virtfn->hdr_type = PCI_HEADER_TYPE_NORMAL;
+ virtfn->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
+ virtfn->error_state = pci_channel_io_normal;
+ virtfn->current_state = PCI_UNKNOWN;
+ virtfn->is_pcie = 1;
+ virtfn->pcie_type = PCI_EXP_TYPE_ENDPOINT;
+ virtfn->dma_mask = 0xffffffff;
+ virtfn->vendor = dev->vendor;
+ virtfn->subsystem_vendor = dev->subsystem_vendor;
+ virtfn->class = dev->class;
+ pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
+ pci_read_config_byte(virtfn, PCI_REVISION_ID, &virtfn->revision);
+ pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
+ &virtfn->subsystem_device);
+
+ dev_set_name(&virtfn->dev, "%04x:%02x:%02x.%d",
+ pci_domain_nr(virtfn->bus), busnr,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = dev->resource + PCI_SRIOV_RESOURCES + i;
+ if (!res->parent)
+ continue;
+ virtfn->resource[i].name = pci_name(virtfn);
+ virtfn->resource[i].flags = res->flags;
+ size = resource_size(res);
+ do_div(size, iov->total);
+ virtfn->resource[i].start = res->start + size * id;
+ virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
+ rc = request_resource(res, &virtfn->resource[i]);
+ BUG_ON(rc);
+ }
+
+ if (reset)
+ pci_execute_reset_function(virtfn);
+
+ pci_device_add(virtfn, virtfn->bus);
+ mutex_unlock(&iov->pdev->sriov->lock);
+
+ virtfn->physfn = pci_dev_get(dev);
+
+ rc = pci_bus_add_device(virtfn);
+ if (rc)
+ goto failed1;
+ sprintf(buf, "%d", id);
+ rc = sysfs_create_link(&iov->dev.kobj, &virtfn->dev.kobj, buf);
+ if (rc)
+ goto failed1;
+ rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
+ if (rc)
+ goto failed2;
+
+ kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
+
+ return 0;
+
+failed2:
+ sysfs_remove_link(&iov->dev.kobj, buf);
+failed1:
+ pci_dev_put(dev);
+ mutex_lock(&iov->pdev->sriov->lock);
+ pci_remove_bus_device(virtfn);
+ virtfn_remove_bus(dev->bus, busnr);
+ mutex_unlock(&iov->pdev->sriov->lock);
+
+ return rc;
+}
+
+static void virtfn_remove(struct pci_dev *dev, int id, int reset)
+{
+ u8 busnr, devfn;
+ char buf[VIRTFN_ID_LEN];
+ struct pci_bus *bus;
+ struct pci_dev *virtfn;
+ struct pci_sriov *iov = dev->sriov;
+
+ virtfn_bdf(dev, id, &busnr, &devfn);
+ bus = pci_find_bus(pci_domain_nr(dev->bus), busnr);
+ if (!bus)
+ return;
+
+ virtfn = pci_get_slot(bus, devfn);
+ if (!virtfn)
+ return;
+
+ pci_dev_put(virtfn);
+
+ if (reset) {
+ device_release_driver(&virtfn->dev);
+ pci_execute_reset_function(virtfn);
+ }
+
+ sprintf(buf, "%d", id);
+ sysfs_remove_link(&iov->dev.kobj, buf);
+ sysfs_remove_link(&virtfn->dev.kobj, "physfn");
+
+ mutex_lock(&iov->pdev->sriov->lock);
+ pci_remove_bus_device(virtfn);
+ virtfn_remove_bus(dev->bus, busnr);
+ mutex_unlock(&iov->pdev->sriov->lock);
+
+ pci_dev_put(dev);
+}
+
+static void sriov_release_dev(struct device *dev)
+{
+ struct pci_sriov *iov = container_of(dev, struct pci_sriov, dev);
+
+ iov->nr_virtfn = 0;
+}
+
+static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
+{
+ int rc;
+ int i, j;
+ int nres;
+ u8 busnr, devfn;
+ u16 offset, stride, initial;
+ struct resource *res;
+ struct pci_dev *link;
+ struct pci_sriov *iov = dev->sriov;
+
+ if (!nr_virtfn)
+ return 0;
+
+ if (iov->nr_virtfn)
+ return -EINVAL;
+
+ pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
+ if (initial > iov->total ||
+ (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total)))
+ return -EIO;
+
+ if (nr_virtfn < 0 || nr_virtfn > iov->total ||
+ (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
+ return -EINVAL;
+
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
+ pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
+ pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
+ if (!offset || (nr_virtfn > 1 && !stride))
+ return -EIO;
+
+ nres = 0;
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = dev->resource + PCI_SRIOV_RESOURCES + i;
+ if (!res->parent)
+ continue;
+ nres++;
+ }
+ if (nres != iov->nres)
+ return -ENOMEM;
+
+ iov->offset = offset;
+ iov->stride = stride;
+
+ virtfn_bdf(dev, nr_virtfn - 1, &busnr, &devfn);
+ if (busnr > dev->bus->subordinate) {
+ dev_err(&dev->dev, "no enough bus range for SR-IOV\n");
+ return -EIO;
+ }
+
+ memset(&iov->dev, 0, sizeof(iov->dev));
+ strcpy(iov->dev.bus_id, "virtfn");
+ iov->dev.parent = &dev->dev;
+ iov->dev.release = sriov_release_dev;
+ rc = device_register(&iov->dev);
+ if (rc)
+ return rc;
+
+ if (iov->link != dev->devfn) {
+ rc = -ENODEV;
+ list_for_each_entry(link, &dev->bus->devices, bus_list) {
+ if (link->sriov && link->devfn == iov->link)
+ rc = sysfs_create_link(&iov->dev.kobj,
+ &link->dev.kobj, "dep_link");
+ }
+ if (rc)
+ goto failed1;
+ }
+
+ iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ msleep(100);
+
+ iov->initial = initial;
+ if (nr_virtfn < initial)
+ initial = nr_virtfn;
+
+ for (i = 0; i < initial; i++) {
+ rc = virtfn_add(dev, i, 0);
+ if (rc)
+ goto failed2;
+ }
+
+ kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
+ iov->nr_virtfn = nr_virtfn;
+
+ return 0;
+
+failed2:
+ for (j = 0; j < i; j++)
+ virtfn_remove(dev, j, 0);
+
+ iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ msleep(100);
+
+ if (iov->link != dev->devfn)
+ sysfs_remove_link(&iov->dev.kobj, "dep_link");
+failed1:
+ device_unregister(&iov->dev);
+
+ return rc;
+}
+
+static void sriov_disable(struct pci_dev *dev)
+{
+ int i;
+ struct pci_sriov *iov = dev->sriov;
+
+ if (!iov->nr_virtfn)
+ return;
+
+ for (i = 0; i < iov->nr_virtfn; i++)
+ virtfn_remove(dev, i, 0);
+
+ iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ msleep(100);
+
+ if (iov->link != dev->devfn)
+ sysfs_remove_link(&iov->dev.kobj, "dep_link");
+ device_unregister(&iov->dev);
+}
+
static int sriov_init(struct pci_dev *dev, int pos)
{
int i;
@@ -126,6 +433,8 @@ failed:
static void sriov_release(struct pci_dev *dev)
{
+ BUG_ON(dev->sriov->nr_virtfn);
+
if (dev == dev->sriov->pdev)
mutex_destroy(&dev->sriov->lock);
else
@@ -145,6 +454,7 @@ static void sriov_restore_state(struct pci_dev *dev)
return;
pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
msleep(100);
@@ -235,3 +545,29 @@ int pci_iov_bus_range(struct pci_bus *bus)
return max ? max - bus->number : 0;
}
+
+/**
+ * pci_enable_sriov - enable the SR-IOV capability
+ * @dev: the PCI device
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+{
+ if (!dev->sriov)
+ return -ENODEV;
+
+ return sriov_enable(dev, nr_virtfn);
+}
+EXPORT_SYMBOL_GPL(pci_enable_sriov);
+
+/**
+ * pci_disable_sriov - disable the SR-IOV capability
+ * @dev: the PCI device
+ */
+void pci_disable_sriov(struct pci_dev *dev)
+{
+ if (dev->sriov)
+ sriov_disable(dev);
+}
+EXPORT_SYMBOL_GPL(pci_disable_sriov);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fdfc476..328a611 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -202,6 +202,8 @@ struct pci_sriov {
u32 cap; /* SR-IOV Capabilities */
u16 ctrl; /* SR-IOV Control */
u16 total; /* total VFs associated with the PF */
+ u16 initial; /* initial VFs associated with the PF */
+ u16 nr_virtfn; /* number of VFs available */
u16 offset; /* first VF Routing ID offset */
u16 stride; /* following VF stride */
u32 pgsz; /* page size for BAR alignment */
@@ -209,6 +211,7 @@ struct pci_sriov {
struct pci_dev *pdev; /* lowest numbered PF */
struct pci_dev *self; /* this PF */
struct mutex lock; /* lock for VF bus */
+ struct device dev;
};
#ifdef CONFIG_PCI_IOV
diff --git a/include/linux/pci.h b/include/linux/pci.h
index f4d740e..b14ecf0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -278,6 +278,7 @@ struct pci_dev {
#endif
struct pci_vpd *vpd;
struct pci_sriov *sriov; /* SR-IOV capability related */
+ struct pci_dev *physfn; /* Physical Function the device belongs to */
};
extern struct pci_dev *alloc_pci_dev(void);
@@ -1202,5 +1203,18 @@ int pci_ext_cfg_avail(struct pci_dev *dev);
void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
+#ifdef CONFIG_PCI_IOV
+extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+extern void pci_disable_sriov(struct pci_dev *dev);
+#else
+static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+{
+ return -ENODEV;
+}
+static inline void pci_disable_sriov(struct pci_dev *dev)
+{
+}
+#endif
+
#endif /* __KERNEL__ */
#endif /* LINUX_PCI_H */
--
1.5.6.4
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v8 5/7] PCI: handle SR-IOV Virtual Function Migration
2009-02-10 8:59 [PATCH v8 0/7] PCI: Linux kernel SR-IOV support Yu Zhao
` (3 preceding siblings ...)
2009-02-10 8:59 ` [PATCH v8 4/7] PCI: add SR-IOV API for Physical Function driver Yu Zhao
@ 2009-02-10 8:59 ` Yu Zhao
2009-02-10 8:59 ` [PATCH v8 6/7] PCI: document SR-IOV sysfs entries Yu Zhao
2009-02-10 8:59 ` [PATCH v8 7/7] PCI: manual for SR-IOV user and driver developer Yu Zhao
6 siblings, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-10 8:59 UTC (permalink / raw)
To: jbarnes; +Cc: linux-pci, kvm, linux-kernel, Yu Zhao
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
drivers/pci/iov.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++
drivers/pci/pci.h | 4 ++
include/linux/pci.h | 6 +++
3 files changed, 129 insertions(+), 0 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index d576160..d622167 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -203,6 +203,97 @@ static void sriov_release_dev(struct device *dev)
iov->nr_virtfn = 0;
}
+static int sriov_migration(struct pci_dev *dev)
+{
+ u16 status;
+ struct pci_sriov *iov = dev->sriov;
+
+ if (!iov->nr_virtfn)
+ return 0;
+
+ if (!(iov->cap & PCI_SRIOV_CAP_VFM))
+ return 0;
+
+ pci_read_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, &status);
+ if (!(status & PCI_SRIOV_STATUS_VFM))
+ return 0;
+
+ schedule_work(&iov->mtask);
+
+ return 1;
+}
+
+static void sriov_migration_task(struct work_struct *work)
+{
+ int i;
+ u8 state;
+ u16 status;
+ struct pci_sriov *iov = container_of(work, struct pci_sriov, mtask);
+
+ for (i = iov->initial; i < iov->nr_virtfn; i++) {
+ state = readb(iov->mstate + i);
+ if (state == PCI_SRIOV_VFM_MI) {
+ writeb(PCI_SRIOV_VFM_AV, iov->mstate + i);
+ state = readb(iov->mstate + i);
+ if (state == PCI_SRIOV_VFM_AV)
+ virtfn_add(iov->self, i, 1);
+ } else if (state == PCI_SRIOV_VFM_MO) {
+ virtfn_remove(iov->self, i, 1);
+ writeb(PCI_SRIOV_VFM_UA, iov->mstate + i);
+ state = readb(iov->mstate + i);
+ if (state == PCI_SRIOV_VFM_AV)
+ virtfn_add(iov->self, i, 0);
+ }
+ }
+
+ pci_read_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, &status);
+ status &= ~PCI_SRIOV_STATUS_VFM;
+ pci_write_config_word(iov->self, iov->pos + PCI_SRIOV_STATUS, status);
+}
+
+static int sriov_enable_migration(struct pci_dev *dev, int nr_virtfn)
+{
+ int bir;
+ u32 table;
+ resource_size_t pa;
+ struct pci_sriov *iov = dev->sriov;
+
+ if (nr_virtfn <= iov->initial)
+ return 0;
+
+ pci_read_config_dword(dev, iov->pos + PCI_SRIOV_VFM, &table);
+ bir = PCI_SRIOV_VFM_BIR(table);
+ if (bir > PCI_STD_RESOURCE_END)
+ return -EIO;
+
+ table = PCI_SRIOV_VFM_OFFSET(table);
+ if (table + nr_virtfn > pci_resource_len(dev, bir))
+ return -EIO;
+
+ pa = pci_resource_start(dev, bir) + table;
+ iov->mstate = ioremap(pa, nr_virtfn);
+ if (!iov->mstate)
+ return -ENOMEM;
+
+ INIT_WORK(&iov->mtask, sriov_migration_task);
+
+ iov->ctrl |= PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR;
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+
+ return 0;
+}
+
+static void sriov_disable_migration(struct pci_dev *dev)
+{
+ struct pci_sriov *iov = dev->sriov;
+
+ iov->ctrl &= ~(PCI_SRIOV_CTRL_VFM | PCI_SRIOV_CTRL_INTR);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+
+ cancel_work_sync(&iov->mtask);
+ iounmap(iov->mstate);
+}
+
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
int rc;
@@ -287,6 +378,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
goto failed2;
}
+ if (iov->cap & PCI_SRIOV_CAP_VFM) {
+ rc = sriov_enable_migration(dev, nr_virtfn);
+ if (rc)
+ goto failed2;
+ }
+
kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
iov->nr_virtfn = nr_virtfn;
@@ -316,6 +413,9 @@ static void sriov_disable(struct pci_dev *dev)
if (!iov->nr_virtfn)
return;
+ if (iov->cap & PCI_SRIOV_CAP_VFM)
+ sriov_disable_migration(dev);
+
for (i = 0; i < iov->nr_virtfn; i++)
virtfn_remove(dev, i, 0);
@@ -571,3 +671,22 @@ void pci_disable_sriov(struct pci_dev *dev)
sriov_disable(dev);
}
EXPORT_SYMBOL_GPL(pci_disable_sriov);
+
+/**
+ * pci_sriov_migration - notify SR-IOV core of Virtual Function Migration
+ * @dev: the PCI device
+ *
+ * Returns IRQ_HANDLED if the IRQ is handled, or IRQ_NONE if not.
+ *
+ * The Physical Function driver is responsible for registering an IRQ
+ * handler using the VF Migration Interrupt Message Number, and for calling
+ * this function when the interrupt is generated by the hardware.
+ */
+irqreturn_t pci_sriov_migration(struct pci_dev *dev)
+{
+ if (!dev->sriov)
+ return IRQ_NONE;
+
+ return sriov_migration(dev) ? IRQ_HANDLED : IRQ_NONE;
+}
+EXPORT_SYMBOL_GPL(pci_sriov_migration);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 328a611..51bebb2 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1,6 +1,8 @@
#ifndef DRIVERS_PCI_H
#define DRIVERS_PCI_H
+#include <linux/workqueue.h>
+
#define PCI_CFG_SPACE_SIZE 256
#define PCI_CFG_SPACE_EXP_SIZE 4096
@@ -211,6 +213,8 @@ struct pci_sriov {
struct pci_dev *pdev; /* lowest numbered PF */
struct pci_dev *self; /* this PF */
struct mutex lock; /* lock for VF bus */
+ struct work_struct mtask; /* VF Migration task */
+ u8 __iomem *mstate; /* VF Migration State Array */
struct device dev;
};
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b14ecf0..84caf8c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -52,6 +52,7 @@
#include <asm/atomic.h>
#include <linux/device.h>
#include <linux/io.h>
+#include <linux/irqreturn.h>
/* Include the ID list */
#include <linux/pci_ids.h>
@@ -1206,6 +1207,7 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
#ifdef CONFIG_PCI_IOV
extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
extern void pci_disable_sriov(struct pci_dev *dev);
+extern irqreturn_t pci_sriov_migration(struct pci_dev *dev);
#else
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{
@@ -1214,6 +1216,10 @@ static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
static inline void pci_disable_sriov(struct pci_dev *dev)
{
}
+static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev)
+{
+ return IRQ_NONE;
+}
#endif
#endif /* __KERNEL__ */
--
1.5.6.4
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v8 6/7] PCI: document SR-IOV sysfs entries
2009-02-10 8:59 [PATCH v8 0/7] PCI: Linux kernel SR-IOV support Yu Zhao
` (4 preceding siblings ...)
2009-02-10 8:59 ` [PATCH v8 5/7] PCI: handle SR-IOV Virtual Function Migration Yu Zhao
@ 2009-02-10 8:59 ` Yu Zhao
2009-02-10 8:59 ` [PATCH v8 7/7] PCI: manual for SR-IOV user and driver developer Yu Zhao
6 siblings, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-10 8:59 UTC (permalink / raw)
To: jbarnes; +Cc: linux-pci, kvm, linux-kernel, Yu Zhao
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
Documentation/ABI/testing/sysfs-bus-pci | 27 +++++++++++++++++++++++++++
1 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index ceddcff..84dc100 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -9,3 +9,30 @@ Description:
that some devices may have malformatted data. If the
underlying VPD has a writable section then the
corresponding section of this file will be writable.
+
+What: /sys/bus/pci/devices/.../virtfn/N
+Date: February 2009
+Contact: Yu Zhao <yu.zhao@intel.com>
+Description:
+ This symbolic link appears when the hardware supports the
+ SR-IOV capability and the Physical Function driver has enabled it.
+ The symbolic link points to the PCI device sysfs entry of the
+ Virtual Function whose index is N (0...MaxVFs-1).
+
+What: /sys/bus/pci/devices/.../virtfn/dep_link
+Date: February 2009
+Contact: Yu Zhao <yu.zhao@intel.com>
+Description:
+ This symbolic link appears when the hardware supports the
+ SR-IOV capability, the Physical Function driver has enabled it,
+ and this device has vendor-specific dependencies on
+ others. The symbolic link points to the PCI device sysfs
+ entry of the Physical Function this device depends on.
+
+What: /sys/bus/pci/devices/.../physfn
+Date: February 2009
+Contact: Yu Zhao <yu.zhao@intel.com>
+Description:
+ This symbolic link appears when a device is a Virtual Function.
+ The symbolic link points to the PCI device sysfs entry of the
+ Physical Function this device is associated with.
--
1.5.6.4
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v8 7/7] PCI: manual for SR-IOV user and driver developer
2009-02-10 8:59 [PATCH v8 0/7] PCI: Linux kernel SR-IOV support Yu Zhao
` (5 preceding siblings ...)
2009-02-10 8:59 ` [PATCH v8 6/7] PCI: document SR-IOV sysfs entries Yu Zhao
@ 2009-02-10 8:59 ` Yu Zhao
6 siblings, 0 replies; 12+ messages in thread
From: Yu Zhao @ 2009-02-10 8:59 UTC (permalink / raw)
To: jbarnes; +Cc: linux-pci, kvm, linux-kernel, Yu Zhao
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
Documentation/DocBook/kernel-api.tmpl | 1 +
Documentation/PCI/pci-iov-howto.txt | 106 +++++++++++++++++++++++++++++++++
2 files changed, 107 insertions(+), 0 deletions(-)
create mode 100644 Documentation/PCI/pci-iov-howto.txt
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 5818ff7..506e611 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -251,6 +251,7 @@ X!Edrivers/pci/hotplug.c
-->
!Edrivers/pci/probe.c
!Edrivers/pci/rom.c
+!Edrivers/pci/iov.c
</sect1>
<sect1><title>PCI Hotplug Support Library</title>
!Edrivers/pci/hotplug/pci_hotplug_core.c
diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt
new file mode 100644
index 0000000..9029369
--- /dev/null
+++ b/Documentation/PCI/pci-iov-howto.txt
@@ -0,0 +1,106 @@
+ PCI Express I/O Virtualization Howto
+ Copyright (C) 2009 Intel Corporation
+ Yu Zhao <yu.zhao@intel.com>
+
+
+1. Overview
+
+1.1 What is SR-IOV
+
+Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
+capability which makes one physical device appear as multiple virtual
+devices. The physical device is referred to as Physical Function (PF)
+while the virtual devices are referred to as Virtual Functions (VF).
+Allocation of the VFs can be dynamically controlled by the PF via
+registers encapsulated in the capability. By default, this feature is
+not enabled and the PF behaves as a traditional PCIe device. Once it is
+turned on, each VF's PCI configuration space can be accessed by its own
+Bus, Device and Function Number (Routing ID). Each VF also has a PCI
+Memory Space, which is used to map its register set. The VF device driver
+operates on the register set so that the VF is functional and appears as
+a real PCI device.
+
+2. User Guide
+
+2.1 How can I enable SR-IOV capability
+
+The device driver (PF driver) will control the enabling and disabling
+of the capability via the API provided by the SR-IOV core. If the
+hardware has the SR-IOV capability, loading its PF driver would enable
+it and all VFs associated with the PF.
+
+2.2 How can I use the Virtual Functions
+
+The VFs are treated as hot-plugged PCI devices in the kernel, so they
+should be able to work in the same way as real PCI devices. A VF
+requires a device driver in the same way a normal PCI device does.
+
+3. Developer Guide
+
+3.1 SR-IOV API
+
+To enable SR-IOV capability:
+ int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+ 'nr_virtfn' is the number of VFs to be enabled.
+
+To disable SR-IOV capability:
+ void pci_disable_sriov(struct pci_dev *dev);
+
+To notify SR-IOV core of Virtual Function Migration:
+ irqreturn_t pci_sriov_migration(struct pci_dev *dev);
+
+3.2 Usage example
+
+The following piece of code illustrates the usage of the SR-IOV API.
+
+static int __devinit dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+
+ dev->current_state = PCI_D0;
+
+ pci_enable_sriov(dev, NR_VIRTFN);
+
+ ...
+
+ return 0;
+}
+
+static void __devexit dev_remove(struct pci_dev *dev)
+{
+ pci_disable_sriov(dev);
+
+ ...
+}
+
+static int dev_suspend(struct pci_dev *dev, pm_message_t state)
+{
+ ...
+
+ return 0;
+}
+
+static int dev_resume(struct pci_dev *dev)
+{
+ pci_restore_state(dev);
+
+ ...
+
+ return 0;
+}
+
+static void dev_shutdown(struct pci_dev *dev)
+{
+ ...
+}
+
+static struct pci_driver dev_driver = {
+ .name = "SR-IOV Physical Function driver",
+ .id_table = dev_id_table,
+ .probe = dev_probe,
+ .remove = __devexit_p(dev_remove),
+#ifdef CONFIG_PM
+ .suspend = dev_suspend,
+ .resume = dev_resume,
+#endif
+ .shutdown = dev_shutdown,
+};
--
1.5.6.4
^ permalink raw reply related [flat|nested] 12+ messages in thread