* [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride
2015-10-27 20:52 [PATCH 0/5] Various of SR-IOV fixes and cleanup Alexander Duyck
@ 2015-10-27 20:52 ` Alexander Duyck
2015-10-28 16:32 ` Bjorn Helgaas
2015-10-27 20:52 ` [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI Alexander Duyck
` (3 subsequent siblings)
4 siblings, 1 reply; 20+ messages in thread
From: Alexander Duyck @ 2015-10-27 20:52 UTC (permalink / raw)
To: bhelgaas; +Cc: linux-pci, linux-kernel
This patch pulls the validation of offset and stride into virtfn_max_buses.
The general idea is to validate offset and stride for each possible value
of numvfs in addition to still determining the maximum bus value for the
VFs.
I also reversed the loop as the most likely maximum will be when numvfs is
set to total_VFs. In addition this makes it so that we loop down to a
value of 0 for numvfs which should be the resting state for the register.
Fixes: 8e20e89658f2 ("PCI: Set SR-IOV NumVFs to zero after enumeration")
Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
drivers/pci/iov.c | 42 ++++++++++++++++++++++++------------------
1 file changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index f8bfc1d39845..099050d78a39 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -54,24 +54,33 @@ static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
* The PF consumes one bus number. NumVFs, First VF Offset, and VF Stride
* determine how many additional bus numbers will be consumed by VFs.
*
- * Iterate over all valid NumVFs and calculate the maximum number of bus
- * numbers that could ever be required.
+ * Iterate over all valid NumVFs, validate offset and stride, and calculate
+ * the maximum number of bus numbers that could ever be required.
*/
-static inline u8 virtfn_max_buses(struct pci_dev *dev)
+static int virtfn_max_buses(struct pci_dev *dev)
{
struct pci_sriov *iov = dev->sriov;
- int nr_virtfn;
- u8 max = 0;
+ int nr_virtfn = iov->total_VFs;
int busnr;
- for (nr_virtfn = 1; nr_virtfn <= iov->total_VFs; nr_virtfn++) {
- pci_iov_set_numvfs(dev, nr_virtfn);
+ pci_iov_set_numvfs(dev, nr_virtfn);
+
+ while (nr_virtfn--) {
+ if (!iov->offset || !iov->stride)
+ goto err;
+
busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
- if (busnr > max)
- max = busnr;
+ if (busnr > iov->max_VF_buses)
+ iov->max_VF_buses = busnr;
+
+ pci_iov_set_numvfs(dev, nr_virtfn);
}
- return max;
+ return 0;
+err:
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, 0);
+
+ return -EIO;
}
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
@@ -467,22 +476,19 @@ found:
dev->sriov = iov;
dev->is_physfn = 1;
- iov->max_VF_buses = virtfn_max_buses(dev);
- pci_iov_set_numvfs(dev, 0);
- if (!iov->offset || (total > 1 && !iov->stride)) {
- rc = -EIO;
- goto failed;
- }
- return 0;
+ rc = virtfn_max_buses(dev);
+ if (!rc)
+ return 0;
+ dev->sriov = NULL;
+ dev->is_physfn = 0;
failed:
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = &dev->resource[i + PCI_IOV_RESOURCES];
res->flags = 0;
}
- dev->sriov = NULL;
kfree(iov);
return rc;
}
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride
2015-10-27 20:52 ` [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride Alexander Duyck
@ 2015-10-28 16:32 ` Bjorn Helgaas
2015-10-28 17:57 ` Alexander Duyck
2015-10-28 18:43 ` Bjorn Helgaas
0 siblings, 2 replies; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-28 16:32 UTC (permalink / raw)
To: Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
Hi Alex,
Thanks a lot for cleaning this up. I think this is a great
improvement over what I did.
On Tue, Oct 27, 2015 at 01:52:15PM -0700, Alexander Duyck wrote:
> This patch pulls the validation of offset and stride into virtfn_max_buses.
> The general idea is to validate offset and stride for each possible value
> of numvfs in addition to still determining the maximum bus value for the
> VFs.
>
> I also reversed the loop as the most likely maximum will be when numvfs is
> set to total_VFs. In addition this makes it so that we loop down to a
> value of 0 for numvfs which should be the resting state for the register.
>
> Fixes: 8e20e89658f2 ("PCI: Set SR-IOV NumVFs to zero after enumeration")
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
I'd like to squash this together with my patch instead of having fixes
on top of fixes. What do you think of the following? (This applies
on top of 70675e0b6a1a ("PCI: Don't try to restore VF BARs")).
commit c20e11b572c5d4e4f01c86580a133122fbd13cfa
Author: Alexander Duyck <aduyck@mirantis.com>
Date: Wed Oct 28 10:54:32 2015 -0500
PCI: Set SR-IOV NumVFs to zero after enumeration
The enumeration path should leave NumVFs set to zero. But after
4449f079722c ("PCI: Calculate maximum number of buses required for VFs"),
we call virtfn_max_buses() in the enumeration path, which changes NumVFs.
This NumVFs change is visible via lspci and sysfs until a driver enables
SR-IOV.
Iterate from TotalVFs down to zero so NumVFs is zero when we're finished
computing the maximum number of buses. Validate offset and stride in
the loop, so we can test it at every possible NumVFs setting. Rename
virtfn_max_buses() to compute_max_vf_buses() to hint that it does have a
side effect of updating iov->max_VF_buses.
[bhelgaas: changelog, rename, reverse sense of error path]
Fixes: 4449f079722c ("PCI: Calculate maximum number of buses required for VFs")
Based-on-patch-by: Ethan Zhao <ethan.zhao@oracle.com>
Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index ee0ebff..120cfb3 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -54,24 +54,33 @@ static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
* The PF consumes one bus number. NumVFs, First VF Offset, and VF Stride
* determine how many additional bus numbers will be consumed by VFs.
*
- * Iterate over all valid NumVFs and calculate the maximum number of bus
- * numbers that could ever be required.
+ * Iterate over all valid NumVFs, validate offset and stride, and calculate
+ * the maximum number of bus numbers that could ever be required.
*/
-static inline u8 virtfn_max_buses(struct pci_dev *dev)
+static int compute_max_vf_buses(struct pci_dev *dev)
{
struct pci_sriov *iov = dev->sriov;
- int nr_virtfn;
- u8 max = 0;
+ int nr_virtfn = iov->total_VFs;
int busnr;
- for (nr_virtfn = 1; nr_virtfn <= iov->total_VFs; nr_virtfn++) {
- pci_iov_set_numvfs(dev, nr_virtfn);
+ pci_iov_set_numvfs(dev, nr_virtfn);
+
+ while (nr_virtfn--) {
+ if (!iov->offset || !iov->stride)
+ goto err;
+
busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
- if (busnr > max)
- max = busnr;
+ if (busnr > iov->max_VF_buses)
+ iov->max_VF_buses = busnr;
+
+ pci_iov_set_numvfs(dev, nr_virtfn);
}
- return max;
+ return 0;
+
+err:
+ pci_iov_set_numvfs(dev, 0);
+ return -EIO;
}
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
@@ -384,7 +393,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
int rc;
int nres;
u32 pgsz;
- u16 ctrl, total, offset, stride;
+ u16 ctrl, total;
struct pci_sriov *iov;
struct resource *res;
struct pci_dev *pdev;
@@ -414,11 +423,6 @@ static int sriov_init(struct pci_dev *dev, int pos)
found:
pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
- pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0);
- pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
- pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
- if (!offset || (total > 1 && !stride))
- return -EIO;
pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
@@ -456,8 +460,6 @@ found:
iov->nres = nres;
iov->ctrl = ctrl;
iov->total_VFs = total;
- iov->offset = offset;
- iov->stride = stride;
iov->pgsz = pgsz;
iov->self = dev;
pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
@@ -474,10 +476,15 @@ found:
dev->sriov = iov;
dev->is_physfn = 1;
- iov->max_VF_buses = virtfn_max_buses(dev);
+ rc = compute_max_vf_buses(dev);
+ if (rc)
+ goto fail_max_buses;
return 0;
+fail_max_buses:
+ dev->sriov = NULL;
+ dev->is_physfn = 0;
failed:
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = &dev->resource[i + PCI_IOV_RESOURCES];
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride
2015-10-28 16:32 ` Bjorn Helgaas
@ 2015-10-28 17:57 ` Alexander Duyck
2015-10-28 18:43 ` Bjorn Helgaas
1 sibling, 0 replies; 20+ messages in thread
From: Alexander Duyck @ 2015-10-28 17:57 UTC (permalink / raw)
To: Bjorn Helgaas, Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
On 10/28/2015 09:32 AM, Bjorn Helgaas wrote:
> Hi Alex,
>
> Thanks a lot for cleaning this up. I think this is a great
> improvement over what I did.
>
> On Tue, Oct 27, 2015 at 01:52:15PM -0700, Alexander Duyck wrote:
>> This patch pulls the validation of offset and stride into virtfn_max_buses.
>> The general idea is to validate offset and stride for each possible value
>> of numvfs in addition to still determining the maximum bus value for the
>> VFs.
>>
>> I also reversed the loop as the most likely maximum will be when numvfs is
>> set to total_VFs. In addition this makes it so that we loop down to a
>> value of 0 for numvfs which should be the resting state for the register.
>>
>> Fixes: 8e20e89658f2 ("PCI: Set SR-IOV NumVFs to zero after enumeration")
>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>
> I'd like to squash this together with my patch instead of having fixes
> on top of fixes. What do you think of the following? (This applies
> on top of 70675e0b6a1a ("PCI: Don't try to restore VF BARs")).
>
>
> commit c20e11b572c5d4e4f01c86580a133122fbd13cfa
> Author: Alexander Duyck <aduyck@mirantis.com>
> Date: Wed Oct 28 10:54:32 2015 -0500
>
> PCI: Set SR-IOV NumVFs to zero after enumeration
>
> The enumeration path should leave NumVFs set to zero. But after
> 4449f079722c ("PCI: Calculate maximum number of buses required for VFs"),
> we call virtfn_max_buses() in the enumeration path, which changes NumVFs.
> This NumVFs change is visible via lspci and sysfs until a driver enables
> SR-IOV.
>
> Iterate from TotalVFs down to zero so NumVFs is zero when we're finished
> computing the maximum number of buses. Validate offset and stride in
> the loop, so we can test it at every possible NumVFs setting. Rename
> virtfn_max_buses() to compute_max_vf_buses() to hint that it does have a
> side effect of updating iov->max_VF_buses.
>
> [bhelgaas: changelog, rename, reverse sense of error path]
> Fixes: 4449f079722c ("PCI: Calculate maximum number of buses required for VFs")
> Based-on-patch-by: Ethan Zhao <ethan.zhao@oracle.com>
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>
This looks fine to me.
Acked-by: Alexander Duyck <aduyck@mirantis.com>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride
2015-10-28 16:32 ` Bjorn Helgaas
2015-10-28 17:57 ` Alexander Duyck
@ 2015-10-28 18:43 ` Bjorn Helgaas
2015-10-28 21:46 ` Alexander Duyck
1 sibling, 1 reply; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-28 18:43 UTC (permalink / raw)
To: Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
On Wed, Oct 28, 2015 at 11:32:16AM -0500, Bjorn Helgaas wrote:
> Hi Alex,
>
> Thanks a lot for cleaning this up. I think this is a great
> improvement over what I did.
>
> On Tue, Oct 27, 2015 at 01:52:15PM -0700, Alexander Duyck wrote:
> > This patch pulls the validation of offset and stride into virtfn_max_buses.
> > The general idea is to validate offset and stride for each possible value
> > of numvfs in addition to still determining the maximum bus value for the
> > VFs.
> >
> > I also reversed the loop as the most likely maximum will be when numvfs is
> > set to total_VFs. In addition this makes it so that we loop down to a
> > value of 0 for numvfs which should be the resting state for the register.
> >
> > Fixes: 8e20e89658f2 ("PCI: Set SR-IOV NumVFs to zero after enumeration")
> > Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>
> I'd like to squash this together with my patch instead of having fixes
> on top of fixes. What do you think of the following? (This applies
> on top of 70675e0b6a1a ("PCI: Don't try to restore VF BARs")).
>
>
> commit c20e11b572c5d4e4f01c86580a133122fbd13cfa
> Author: Alexander Duyck <aduyck@mirantis.com>
> Date: Wed Oct 28 10:54:32 2015 -0500
>
> PCI: Set SR-IOV NumVFs to zero after enumeration
>
> The enumeration path should leave NumVFs set to zero. But after
> 4449f079722c ("PCI: Calculate maximum number of buses required for VFs"),
> we call virtfn_max_buses() in the enumeration path, which changes NumVFs.
> This NumVFs change is visible via lspci and sysfs until a driver enables
> SR-IOV.
>
> Iterate from TotalVFs down to zero so NumVFs is zero when we're finished
> computing the maximum number of buses. Validate offset and stride in
> the loop, so we can test it at every possible NumVFs setting. Rename
> virtfn_max_buses() to compute_max_vf_buses() to hint that it does have a
> side effect of updating iov->max_VF_buses.
>
> [bhelgaas: changelog, rename, reverse sense of error path]
> Fixes: 4449f079722c ("PCI: Calculate maximum number of buses required for VFs")
> Based-on-patch-by: Ethan Zhao <ethan.zhao@oracle.com>
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index ee0ebff..120cfb3 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -54,24 +54,33 @@ static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
> * The PF consumes one bus number. NumVFs, First VF Offset, and VF Stride
> * determine how many additional bus numbers will be consumed by VFs.
> *
> - * Iterate over all valid NumVFs and calculate the maximum number of bus
> - * numbers that could ever be required.
> + * Iterate over all valid NumVFs, validate offset and stride, and calculate
> + * the maximum number of bus numbers that could ever be required.
> */
> -static inline u8 virtfn_max_buses(struct pci_dev *dev)
> +static int compute_max_vf_buses(struct pci_dev *dev)
> {
> struct pci_sriov *iov = dev->sriov;
> - int nr_virtfn;
> - u8 max = 0;
> + int nr_virtfn = iov->total_VFs;
> int busnr;
>
> - for (nr_virtfn = 1; nr_virtfn <= iov->total_VFs; nr_virtfn++) {
> - pci_iov_set_numvfs(dev, nr_virtfn);
> + pci_iov_set_numvfs(dev, nr_virtfn);
> +
> + while (nr_virtfn--) {
> + if (!iov->offset || !iov->stride)
> + goto err;
I think we have a minor problem here. In sriov_enable(), we return an
error if "nr_virtfn > 1 && !iov->stride", so it's legal for stride to
be zero if NumVF is 1. Here we don't allow that. Sec 3.3.10 says:
Note: VF Stride is unused if NumVFs is 0 or 1. If NumVFs is greater
than 1, VF Stride must not be zero."
So I think we should allow "stride == 0" here when NumVFs is 1.
> +
> busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
I think this loop management is slightly wrong: I don't think we ever
compute busnr for the highest VF because we always decrement nr_virtfn
after calling pci_iov_set_numvfs(), and then we subtract one again.
E.g., if Total VFs is 8, the VFs are numbered VF0..VF7, and we have
this, which doesn't check VF7:
nr_virtfn = iov->total_VFs # nr_virtfn == 8
pci_iov_set_numvfs(..., nr_virtfn) # passes 8 (correct)
while (nr_virtfn--) {
# nr_virtfn == 7 in loop body
pci_iov_virtfn_bus(..., nr_virtfn - 1) # passes 6 (wrong)
> - if (busnr > max)
> - max = busnr;
> + if (busnr > iov->max_VF_buses)
> + iov->max_VF_buses = busnr;
> +
> + pci_iov_set_numvfs(dev, nr_virtfn);
> }
>
> - return max;
> + return 0;
> +
> +err:
> + pci_iov_set_numvfs(dev, 0);
> + return -EIO;
> }
Here's my new proposal:
static int compute_max_vf_buses(struct pci_dev *dev)
{
struct pci_sriov *iov = dev->sriov;
int nr_virtfn, busnr, rc = 0;
for (nr_virtfn = iov->total_VFs; nr_virtfn; nr_virtfn--) {
pci_iov_set_numvfs(dev, nr_virtfn);
if (!iov->offset || (nr_virtfn > 1 && !iov->stride)) {
rc = -EIO;
goto out;
}
busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
if (busnr > iov->max_VF_buses)
iov->max_VF_buses = busnr;
}
out:
pci_iov_set_numvfs(dev, 0);
return rc;
}
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride
2015-10-28 18:43 ` Bjorn Helgaas
@ 2015-10-28 21:46 ` Alexander Duyck
2015-10-29 19:50 ` Bjorn Helgaas
0 siblings, 1 reply; 20+ messages in thread
From: Alexander Duyck @ 2015-10-28 21:46 UTC (permalink / raw)
To: Bjorn Helgaas, Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
On 10/28/2015 11:43 AM, Bjorn Helgaas wrote:
> On Wed, Oct 28, 2015 at 11:32:16AM -0500, Bjorn Helgaas wrote:
>> Hi Alex,
>>
>> Thanks a lot for cleaning this up. I think this is a great
>> improvement over what I did.
>>
>> On Tue, Oct 27, 2015 at 01:52:15PM -0700, Alexander Duyck wrote:
>>> This patch pulls the validation of offset and stride into virtfn_max_buses.
>>> The general idea is to validate offset and stride for each possible value
>>> of numvfs in addition to still determining the maximum bus value for the
>>> VFs.
>>>
>>> I also reversed the loop as the most likely maximum will be when numvfs is
>>> set to total_VFs. In addition this makes it so that we loop down to a
>>> value of 0 for numvfs which should be the resting state for the register.
>>>
>>> Fixes: 8e20e89658f2 ("PCI: Set SR-IOV NumVFs to zero after enumeration")
>>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>>
>> I'd like to squash this together with my patch instead of having fixes
>> on top of fixes. What do you think of the following? (This applies
>> on top of 70675e0b6a1a ("PCI: Don't try to restore VF BARs")).
>>
>>
>> commit c20e11b572c5d4e4f01c86580a133122fbd13cfa
>> Author: Alexander Duyck <aduyck@mirantis.com>
>> Date: Wed Oct 28 10:54:32 2015 -0500
>>
>> PCI: Set SR-IOV NumVFs to zero after enumeration
>>
>> The enumeration path should leave NumVFs set to zero. But after
>> 4449f079722c ("PCI: Calculate maximum number of buses required for VFs"),
>> we call virtfn_max_buses() in the enumeration path, which changes NumVFs.
>> This NumVFs change is visible via lspci and sysfs until a driver enables
>> SR-IOV.
>>
>> Iterate from TotalVFs down to zero so NumVFs is zero when we're finished
>> computing the maximum number of buses. Validate offset and stride in
>> the loop, so we can test it at every possible NumVFs setting. Rename
>> virtfn_max_buses() to compute_max_vf_buses() to hint that it does have a
>> side effect of updating iov->max_VF_buses.
>>
>> [bhelgaas: changelog, rename, reverse sense of error path]
>> Fixes: 4449f079722c ("PCI: Calculate maximum number of buses required for VFs")
>> Based-on-patch-by: Ethan Zhao <ethan.zhao@oracle.com>
>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
>>
>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>> index ee0ebff..120cfb3 100644
>> --- a/drivers/pci/iov.c
>> +++ b/drivers/pci/iov.c
>> @@ -54,24 +54,33 @@ static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
>> * The PF consumes one bus number. NumVFs, First VF Offset, and VF Stride
>> * determine how many additional bus numbers will be consumed by VFs.
>> *
>> - * Iterate over all valid NumVFs and calculate the maximum number of bus
>> - * numbers that could ever be required.
>> + * Iterate over all valid NumVFs, validate offset and stride, and calculate
>> + * the maximum number of bus numbers that could ever be required.
>> */
>> -static inline u8 virtfn_max_buses(struct pci_dev *dev)
>> +static int compute_max_vf_buses(struct pci_dev *dev)
>> {
>> struct pci_sriov *iov = dev->sriov;
>> - int nr_virtfn;
>> - u8 max = 0;
>> + int nr_virtfn = iov->total_VFs;
>> int busnr;
>>
>> - for (nr_virtfn = 1; nr_virtfn <= iov->total_VFs; nr_virtfn++) {
>> - pci_iov_set_numvfs(dev, nr_virtfn);
>> + pci_iov_set_numvfs(dev, nr_virtfn);
>> +
>> + while (nr_virtfn--) {
>> + if (!iov->offset || !iov->stride)
>> + goto err;
>
> I think we have a minor problem here. In sriov_enable(), we return an
> error if "nr_virtfn > 1 && !iov->stride", so it's legal for stride to
> be zero if NumVF is 1. Here we don't allow that. Sec 3.3.10 says:
>
> Note: VF Stride is unused if NumVFs is 0 or 1. If NumVFs is greater
> than 1, VF Stride must not be zero."
>
> So I think we should allow "stride == 0" here when NumVFs is 1.
Right, we shouldn't be testing it if NumVFs is 1 or less.
>> +
>> busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
>
> I think this loop management is slightly wrong: I don't think we ever
> compute busnr for the highest VF because we always decrement nr_virtfn
> after calling pci_iov_set_numvfs(), and then we subtract one again.
> E.g., if Total VFs is 8, the VFs are numbered VF0..VF7, and we have
> this, which doesn't check VF7:
>
> nr_virtfn = iov->total_VFs # nr_virtfn == 8
> pci_iov_set_numvfs(..., nr_virtfn) # passes 8 (correct)
> while (nr_virtfn--) {
> # nr_virtfn == 7 in loop body
> pci_iov_virtfn_bus(..., nr_virtfn - 1) # passes 6 (wrong)
>
Yeah, that was supposed to just be nr_virtfn.
>> - if (busnr > max)
>> - max = busnr;
>> + if (busnr > iov->max_VF_buses)
>> + iov->max_VF_buses = busnr;
>> +
>> + pci_iov_set_numvfs(dev, nr_virtfn);
>> }
>>
>> - return max;
>> + return 0;
>> +
>> +err:
>> + pci_iov_set_numvfs(dev, 0);
>> + return -EIO;
>> }
>
> Here's my new proposal:
>
> static int compute_max_vf_buses(struct pci_dev *dev)
> {
> struct pci_sriov *iov = dev->sriov;
> int nr_virtfn, busnr, rc = 0;
>
> for (nr_virtfn = iov->total_VFs; nr_virtfn; nr_virtfn--) {
> pci_iov_set_numvfs(dev, nr_virtfn);
> if (!iov->offset || (nr_virtfn > 1 && !iov->stride)) {
> rc = -EIO;
> goto out;
> }
>
> busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
> if (busnr > iov->max_VF_buses)
> iov->max_VF_buses = busnr;
> }
>
> out:
> pci_iov_set_numvfs(dev, 0);
> return rc;
> }
>
This looks good to me. In theory you could save yourself a pair of MMIO
reads at the end of the loop by just writing numvfs without the offset
and stride read, but this should work.
- Alex
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride
2015-10-28 21:46 ` Alexander Duyck
@ 2015-10-29 19:50 ` Bjorn Helgaas
0 siblings, 0 replies; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-29 19:50 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Alexander Duyck, bhelgaas, linux-pci, linux-kernel
On Wed, Oct 28, 2015 at 02:46:50PM -0700, Alexander Duyck wrote:
> On 10/28/2015 11:43 AM, Bjorn Helgaas wrote:
> >On Wed, Oct 28, 2015 at 11:32:16AM -0500, Bjorn Helgaas wrote:
> >Here's my new proposal:
> >
> > static int compute_max_vf_buses(struct pci_dev *dev)
> > {
> > struct pci_sriov *iov = dev->sriov;
> > int nr_virtfn, busnr, rc = 0;
> >
> > for (nr_virtfn = iov->total_VFs; nr_virtfn; nr_virtfn--) {
> > pci_iov_set_numvfs(dev, nr_virtfn);
> > if (!iov->offset || (nr_virtfn > 1 && !iov->stride)) {
> > rc = -EIO;
> > goto out;
> > }
> >
> > busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
> > if (busnr > iov->max_VF_buses)
> > iov->max_VF_buses = busnr;
> > }
> >
> > out:
> > pci_iov_set_numvfs(dev, 0);
> > return rc;
> > }
> >
>
> This looks good to me. In theory you could save yourself a pair of
> MMIO reads at the end of the loop by just writing numvfs without the
> offset and stride read, but this should work.
True. It's called once per device at boot-time, so it's not a hot path,
and I think it's worth two unnecessary MMIO reads to make it easier to
analyze: there's exactly one place that updates PCI_SRIOV_NUM_VF, and that
place always updates our caches of PCI_SRIOV_VF_OFFSET and
PCI_SRIOV_VF_STRIDE.
Bjorn
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI
2015-10-27 20:52 [PATCH 0/5] Various of SR-IOV fixes and cleanup Alexander Duyck
2015-10-27 20:52 ` [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride Alexander Duyck
@ 2015-10-27 20:52 ` Alexander Duyck
2015-10-28 16:37 ` Bjorn Helgaas
2015-10-27 20:52 ` [PATCH 3/5] iov: Fix sriov_enable exception handling path Alexander Duyck
` (2 subsequent siblings)
4 siblings, 1 reply; 20+ messages in thread
From: Alexander Duyck @ 2015-10-27 20:52 UTC (permalink / raw)
To: bhelgaas; +Cc: linux-pci, linux-kernel
This patch forces us to reallocate VF BARs if the totalVFs value has
increased after enabling ARI. This normally shouldn't occur, however I
have seen some non-spec devices that shift between 7 and some value greater
than 7 based on the ARI value and we want to avoid triggering any issues
with such devices.
Fixes: 3aa71da412fe ("PCI: Enable SR-IOV ARI Capable Hierarchy before reading TotalVFs")
Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
drivers/pci/iov.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 099050d78a39..238950412de0 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -393,7 +393,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
int rc;
int nres;
u32 pgsz;
- u16 ctrl, total;
+ u16 ctrl, total, orig_total;
struct pci_sriov *iov;
struct resource *res;
struct pci_dev *pdev;
@@ -402,6 +402,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
pci_pcie_type(dev) != PCI_EXP_TYPE_ENDPOINT)
return -ENODEV;
+ pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &orig_total);
pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
if (ctrl & PCI_SRIOV_CTRL_VFE) {
pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
@@ -450,6 +451,14 @@ found:
}
iov->barsz[i] = resource_size(res);
res->end = res->start + resource_size(res) * total - 1;
+
+ /* force reallocation of BARs if total VFs increased */
+ if (orig_total < total) {
+ res->flags |= IORESOURCE_UNSET;
+ res->end -= res->start;
+ res->start = 0;
+ }
+
dev_info(&dev->dev, "VF(n) BAR%d space: %pR (contains BAR%d for %d VFs)\n",
i, res, i, total);
i += bar64;
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI
2015-10-27 20:52 ` [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI Alexander Duyck
@ 2015-10-28 16:37 ` Bjorn Helgaas
2015-10-28 18:32 ` Alexander Duyck
0 siblings, 1 reply; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-28 16:37 UTC (permalink / raw)
To: Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
Hi Alex,
On Tue, Oct 27, 2015 at 01:52:21PM -0700, Alexander Duyck wrote:
> This patch forces us to reallocate VF BARs if the totalVFs value has
> increased after enabling ARI. This normally shouldn't occur, however I
> have seen some non-spec devices that shift between 7 and some value greater
> than 7 based on the ARI value and we want to avoid triggering any issues
> with such devices.
Can you include specifics about the devices? The value "7" is pretty
specific, so if we're going to include that level of detail, we should
have the actual device info to go with it.
I guess the problem is:
- Device supports 7 TotalVFs with ARI disabled, >7 with ARI enabled
- Firmware leaves ARI disabled in SRIOV_CTRL
- Firmware computes size based on 7 VFs
- Firmware allocates space and programs BARs for 7 VFs
- Linux enables ARI, reads >7 TotalVFs
- Linux computes size based on >7 VFs
- Increased size may overlap other resources
Right?
> Fixes: 3aa71da412fe ("PCI: Enable SR-IOV ARI Capable Hierarchy before reading TotalVFs")
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> ---
> drivers/pci/iov.c | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 099050d78a39..238950412de0 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -393,7 +393,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
> int rc;
> int nres;
> u32 pgsz;
> - u16 ctrl, total;
> + u16 ctrl, total, orig_total;
> struct pci_sriov *iov;
> struct resource *res;
> struct pci_dev *pdev;
> @@ -402,6 +402,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
> pci_pcie_type(dev) != PCI_EXP_TYPE_ENDPOINT)
> return -ENODEV;
>
> + pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &orig_total);
> pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
> if (ctrl & PCI_SRIOV_CTRL_VFE) {
> pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
> @@ -450,6 +451,14 @@ found:
> }
> iov->barsz[i] = resource_size(res);
> res->end = res->start + resource_size(res) * total - 1;
> +
> + /* force reallocation of BARs if total VFs increased */
> + if (orig_total < total) {
> + res->flags |= IORESOURCE_UNSET;
> + res->end -= res->start;
> + res->start = 0;
> + }
Two thoughts here:
1) Even if the required space increased, it's possible that firmware
placed the BAR somewhere where the extra space is available. In that
case, this forces reallocation unnecessarily.
2) This *feels* like something the PCI core should be doing anyway,
even without any help here. Shouldn't we fail in pci_claim_resource()
and set IORESOURCE_UNSET there?
OK, and a third: re-reading TotalVF is (I think) completely
unnecessary per spec, so if we're going to do it we should probably
have a one-line comment about why the code is doing something that
appears unnecessary, and really should have a concrete example device
in the changelog.
> +
> dev_info(&dev->dev, "VF(n) BAR%d space: %pR (contains BAR%d for %d VFs)\n",
> i, res, i, total);
> i += bar64;
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI
2015-10-28 16:37 ` Bjorn Helgaas
@ 2015-10-28 18:32 ` Alexander Duyck
2015-10-28 19:52 ` Bjorn Helgaas
0 siblings, 1 reply; 20+ messages in thread
From: Alexander Duyck @ 2015-10-28 18:32 UTC (permalink / raw)
To: Bjorn Helgaas, Alexander Duyck
Cc: bhelgaas, linux-pci, linux-kernel, Ben Shelton
On 10/28/2015 09:37 AM, Bjorn Helgaas wrote:
> Hi Alex,
>
> On Tue, Oct 27, 2015 at 01:52:21PM -0700, Alexander Duyck wrote:
>> This patch forces us to reallocate VF BARs if the totalVFs value has
>> increased after enabling ARI. This normally shouldn't occur, however I
>> have seen some non-spec devices that shift between 7 and some value greater
>> than 7 based on the ARI value and we want to avoid triggering any issues
>> with such devices.
>
> Can you include specifics about the devices? The value "7" is pretty
> specific, so if we're going to include that level of detail, we should
> have the actual device info to go with it.
I referenced 7 as that is the largest number of VFs a single function
can support assuming a single function without ARI and without the
ability to handle Type 1 configuration requests. The Intel fm10k driver
has logic in it that does a check for ARI and if it is supported it
reports via sysfs a totalVFs of 64, otherwise it limits the totalVFs
reported to 7. However, I don't believe it exposes the limitation via
the configuration space.
> I guess the problem is:
>
> - Device supports 7 TotalVFs with ARI disabled, >7 with ARI enabled
> - Firmware leaves ARI disabled in SRIOV_CTRL
> - Firmware computes size based on 7 VFs
> - Firmware allocates space and programs BARs for 7 VFs
> - Linux enables ARI, reads >7 TotalVFs
> - Linux computes size based on >7 VFs
> - Increased size may overlap other resources
>
> Right?
Right. More than likely what will happen is that you will see overlap
of the device on itself if it has multiple base address registers
assigned to the VFs.
>> Fixes: 3aa71da412fe ("PCI: Enable SR-IOV ARI Capable Hierarchy before reading TotalVFs")
>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>> ---
>> drivers/pci/iov.c | 11 ++++++++++-
>> 1 file changed, 10 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>> index 099050d78a39..238950412de0 100644
>> --- a/drivers/pci/iov.c
>> +++ b/drivers/pci/iov.c
>> @@ -393,7 +393,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>> int rc;
>> int nres;
>> u32 pgsz;
>> - u16 ctrl, total;
>> + u16 ctrl, total, orig_total;
>> struct pci_sriov *iov;
>> struct resource *res;
>> struct pci_dev *pdev;
>> @@ -402,6 +402,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>> pci_pcie_type(dev) != PCI_EXP_TYPE_ENDPOINT)
>> return -ENODEV;
>>
>> + pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &orig_total);
>> pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
>> if (ctrl & PCI_SRIOV_CTRL_VFE) {
>> pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
>> @@ -450,6 +451,14 @@ found:
>> }
>> iov->barsz[i] = resource_size(res);
>> res->end = res->start + resource_size(res) * total - 1;
>> +
>> + /* force reallocation of BARs if total VFs increased */
>> + if (orig_total < total) {
>> + res->flags |= IORESOURCE_UNSET;
>> + res->end -= res->start;
>> + res->start = 0;
>> + }
>
> Two thoughts here:
>
> 1) Even if the required space increased, it's possible that firmware
> placed the BAR somewhere where the extra space is available. In that
> case, this forces reallocation unnecessarily.
I'd say it is possible, but not likely. From past experience I have
seen BIOSes do some very dumb things when it comes to SR-IOV, assuming
they even support it.
In addition many of the VF devices out there support more than one base
address register per function. The Intel NICs for example have one for
device registers and one for MSI-X registers. And most BIOSes usually
pack one right after the other from what I have seen. So while there
may be more space there what usually happens is that the MSI-X region
will have to be relocated in order to make room for expanding the other
base address register.
My last bit on all this is that VFs are meant to be assigned into
guests. I would argue that for the sake of security we are much better
off invalidating the VF base address registers and forcing a
reallocation if there is even a risk of the VF base address register
space overlapping with some other piece of host memory. We don't want
to risk possibly exposing any bits of the host that we didn't intend on.
> 2) This *feels* like something the PCI core should be doing anyway,
> even without any help here. Shouldn't we fail in pci_claim_resource()
> and set IORESOURCE_UNSET there?
>
> OK, and a third: re-reading TotalVF is (I think) completely
> unnecessary per spec, so if we're going to do it we should probably
> have a one-line comment about why the code is doing something that
> appears unnecessary, and really should have a concrete example device
> in the changelog.
I'm perfectly fine with us adding a comment about this being needed for
out-of-spec devices on BIOSes that may not fully support SR-IOV.
However, I wasn't the one that had the device that needed the workaround
for totalVFs and so I cannot provide a concrete example myself.
I've added Ben to the Cc as he was the author for the original patch
that introduced this possible issue. Perhaps he can provide us with
more details on the hardware that needs this.
- Alex
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI
2015-10-28 18:32 ` Alexander Duyck
@ 2015-10-28 19:52 ` Bjorn Helgaas
2015-10-28 21:37 ` Alexander Duyck
0 siblings, 1 reply; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-28 19:52 UTC (permalink / raw)
To: Alexander Duyck
Cc: Alexander Duyck, bhelgaas, linux-pci, linux-kernel, Ben Shelton
On Wed, Oct 28, 2015 at 11:32:14AM -0700, Alexander Duyck wrote:
> On 10/28/2015 09:37 AM, Bjorn Helgaas wrote:
> >Hi Alex,
> >
> >On Tue, Oct 27, 2015 at 01:52:21PM -0700, Alexander Duyck wrote:
> >>This patch forces us to reallocate VF BARs if the totalVFs value has
> >>increased after enabling ARI. This normally shouldn't occur, however I
> >>have seen some non-spec devices that shift between 7 and some value greater
> >>than 7 based on the ARI value and we want to avoid triggering any issues
> >>with such devices.
> >
> >Can you include specifics about the devices? The value "7" is pretty
> >specific, so if we're going to include that level of detail, we should
> >have the actual device info to go with it.
>
> I referenced 7 as that is the largest number of VFs a single
> function can support assuming a single function without ARI and
> without the ability to handle Type 1 configuration requests. The
> Intel fm10k driver has logic in it that does a check for ARI and if
> it is supported it reports via sysfs a totalVFs of 64, otherwise it
> limits the totalVFs reported to 7. However, I don't believe it
> exposes the limitation via the configuration space.
Ah, OK, that makes sense.
> >I guess the problem is:
> >
> > - Device supports 7 TotalVFs with ARI disabled, >7 with ARI enabled
> > - Firmware leaves ARI disabled in SRIOV_CTRL
> > - Firmware computes size based on 7 VFs
> > - Firmware allocates space and programs BARs for 7 VFs
> > - Linux enables ARI, reads >7 TotalVFs
> > - Linux computes size based on >7 VFs
> > - Increased size may overlap other resources
> >
> >Right?
>
> Right. More than likely what will happen is that you will see
> overlap of the device on itself if it has multiple base address
> registers assigned to the VFs.
>
> >>Fixes: 3aa71da412fe ("PCI: Enable SR-IOV ARI Capable Hierarchy before reading TotalVFs")
> >>Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> >>---
> >> drivers/pci/iov.c | 11 ++++++++++-
> >> 1 file changed, 10 insertions(+), 1 deletion(-)
> >>
> >>diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> >>index 099050d78a39..238950412de0 100644
> >>--- a/drivers/pci/iov.c
> >>+++ b/drivers/pci/iov.c
> >>@@ -393,7 +393,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
> >> int rc;
> >> int nres;
> >> u32 pgsz;
> >>- u16 ctrl, total;
> >>+ u16 ctrl, total, orig_total;
> >> struct pci_sriov *iov;
> >> struct resource *res;
> >> struct pci_dev *pdev;
> >>@@ -402,6 +402,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
> >> pci_pcie_type(dev) != PCI_EXP_TYPE_ENDPOINT)
> >> return -ENODEV;
> >>
> >>+ pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &orig_total);
> >> pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
> >> if (ctrl & PCI_SRIOV_CTRL_VFE) {
> >> pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
> >>@@ -450,6 +451,14 @@ found:
> >> }
> >> iov->barsz[i] = resource_size(res);
> >> res->end = res->start + resource_size(res) * total - 1;
> >>+
> >>+ /* force reallocation of BARs if total VFs increased */
> >>+ if (orig_total < total) {
> >>+ res->flags |= IORESOURCE_UNSET;
> >>+ res->end -= res->start;
> >>+ res->start = 0;
> >>+ }
> >
> >Two thoughts here:
> >
> >1) Even if the required space increased, it's possible that firmware
> >placed the BAR somewhere where the extra space is available. In that
> >case, this forces reallocation unnecessarily.
>
> I'd say it is possible, but not likely. From past experience I have
> seen BIOSes do some very dumb things when it comes to SR-IOV,
> assuming they even support it.
>
> In addition many of the VF devices out there support more than one
> base address register per function. The Intel NICs for example have
> one for device registers and one for MSI-X registers. And most
> BIOSes usually pack one right after the other from what I have seen.
> So while there may be more space there what usually happens is that
> the MSI-X region will have to be relocated in order to make room for
> expanding the other base address register.
>
> My last bit on all this is that VFs are meant to be assigned into
> guests. I would argue that for the sake of security we are much
> better off invalidating the VF base address registers and forcing a
> reallocation if there is even a risk of the VF base address register
> space overlapping with some other piece of host memory. We don't
> want to risk possibly exposing any bits of the host that we didn't
> intend on.
Agreed, not likely for several reasons.
> >2) This *feels* like something the PCI core should be doing anyway,
> >even without any help here. Shouldn't we fail in pci_claim_resource()
> >and set IORESOURCE_UNSET there?
This is really the core of my question -- what problem does this patch
solve? I'm trying to figure out if delaying the read of TotalVFs
until after we set ARI Capable Hierarchy is sufficient, and if it's
not sufficient, *why* not?
> >OK, and a third: re-reading TotalVF is (I think) completely
> >unnecessary per spec, so if we're going to do it we should probably
> >have a one-line comment about why the code is doing something that
> >appears unnecessary, and really should have a concrete example device
> >in the changelog.
>
> I'm perfectly fine with us adding a comment about this being needed
> for out-of-spec devices on BIOSes that may not fully support SR-IOV.
> However, I wasn't the one that had the device that needed the
> workaround for totalVFs and so I cannot provide a concrete example
> myself.
>
> I've added Ben to the Cc as he was the author for the original patch
> that introduced this possible issue. Perhaps he can provide us with
> more details on the hardware that needs this.
>
> - Alex
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI
2015-10-28 19:52 ` Bjorn Helgaas
@ 2015-10-28 21:37 ` Alexander Duyck
0 siblings, 0 replies; 20+ messages in thread
From: Alexander Duyck @ 2015-10-28 21:37 UTC (permalink / raw)
To: Bjorn Helgaas
Cc: Alexander Duyck, bhelgaas, linux-pci, linux-kernel, Ben Shelton
On 10/28/2015 12:52 PM, Bjorn Helgaas wrote:
> On Wed, Oct 28, 2015 at 11:32:14AM -0700, Alexander Duyck wrote:
>> On 10/28/2015 09:37 AM, Bjorn Helgaas wrote:
>>> Hi Alex,
>>>
>>> On Tue, Oct 27, 2015 at 01:52:21PM -0700, Alexander Duyck wrote:
>>>> This patch forces us to reallocate VF BARs if the totalVFs value has
>>>> increased after enabling ARI. This normally shouldn't occur, however I
>>>> have seen some non-spec devices that shift between 7 and some value greater
>>>> than 7 based on the ARI value and we want to avoid triggering any issues
>>>> with such devices.
>>>
>>> Can you include specifics about the devices? The value "7" is pretty
>>> specific, so if we're going to include that level of detail, we should
>>> have the actual device info to go with it.
>>
>> I referenced 7 as that is the largest number of VFs a single
>> function can support assuming a single function without ARI and
>> without the ability to handle Type 1 configuration requests. The
>> Intel fm10k driver has logic in it that does a check for ARI and if
>> it is supported it reports via sysfs a totalVFs of 64, otherwise it
>> limits the totalVFs reported to 7. However, I don't believe it
>> exposes the limitation via the configuration space.
>
> Ah, OK, that makes sense.
>
>>> I guess the problem is:
>>>
>>> - Device supports 7 TotalVFs with ARI disabled, >7 with ARI enabled
>>> - Firmware leaves ARI disabled in SRIOV_CTRL
>>> - Firmware computes size based on 7 VFs
>>> - Firmware allocates space and programs BARs for 7 VFs
>>> - Linux enables ARI, reads >7 TotalVFs
>>> - Linux computes size based on >7 VFs
>>> - Increased size may overlap other resources
>>>
>>> Right?
>>
>> Right. More than likely what will happen is that you will see
>> overlap of the device on itself if it has multiple base address
>> registers assigned to the VFs.
>>
>>>> Fixes: 3aa71da412fe ("PCI: Enable SR-IOV ARI Capable Hierarchy before reading TotalVFs")
>>>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>>>> ---
>>>> drivers/pci/iov.c | 11 ++++++++++-
>>>> 1 file changed, 10 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>>>> index 099050d78a39..238950412de0 100644
>>>> --- a/drivers/pci/iov.c
>>>> +++ b/drivers/pci/iov.c
>>>> @@ -393,7 +393,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>>>> int rc;
>>>> int nres;
>>>> u32 pgsz;
>>>> - u16 ctrl, total;
>>>> + u16 ctrl, total, orig_total;
>>>> struct pci_sriov *iov;
>>>> struct resource *res;
>>>> struct pci_dev *pdev;
>>>> @@ -402,6 +402,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>>>> pci_pcie_type(dev) != PCI_EXP_TYPE_ENDPOINT)
>>>> return -ENODEV;
>>>>
>>>> + pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &orig_total);
>>>> pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
>>>> if (ctrl & PCI_SRIOV_CTRL_VFE) {
>>>> pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
>>>> @@ -450,6 +451,14 @@ found:
>>>> }
>>>> iov->barsz[i] = resource_size(res);
>>>> res->end = res->start + resource_size(res) * total - 1;
>>>> +
>>>> + /* force reallocation of BARs if total VFs increased */
>>>> + if (orig_total < total) {
>>>> + res->flags |= IORESOURCE_UNSET;
>>>> + res->end -= res->start;
>>>> + res->start = 0;
>>>> + }
>>>
>>> Two thoughts here:
>>>
>>> 1) Even if the required space increased, it's possible that firmware
>>> placed the BAR somewhere where the extra space is available. In that
>>> case, this forces reallocation unnecessarily.
>>
>> I'd say it is possible, but not likely. From past experience I have
>> seen BIOSes do some very dumb things when it comes to SR-IOV,
>> assuming they even support it.
>>
>> In addition many of the VF devices out there support more than one
>> base address register per function. The Intel NICs for example have
>> one for device registers and one for MSI-X registers. And most
>> BIOSes usually pack one right after the other from what I have seen.
>> So while there may be more space there what usually happens is that
>> the MSI-X region will have to be relocated in order to make room for
>> expanding the other base address register.
>>
>> My last bit on all this is that VFs are meant to be assigned into
>> guests. I would argue that for the sake of security we are much
>> better off invalidating the VF base address registers and forcing a
>> reallocation if there is even a risk of the VF base address register
>> space overlapping with some other piece of host memory. We don't
>> want to risk possibly exposing any bits of the host that we didn't
>> intend on.
>
> Agreed, not likely for several reasons.
>
>>> 2) This *feels* like something the PCI core should be doing anyway,
>>> even without any help here. Shouldn't we fail in pci_claim_resource()
>>> and set IORESOURCE_UNSET there?
>
> This is really the core of my question -- what problem does this patch
> solve? I'm trying to figure out if delaying the read of TotalVFs
> until after we set ARI Capable Hierarchy is sufficient, and if it's
> not sufficient, *why* not?
I suppose you have a point. As long as the PCI core is taking care of
any overlaps in pci_claim_resource that is probably good enough.
- Alex
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 3/5] iov: Fix sriov_enable exception handling path
2015-10-27 20:52 [PATCH 0/5] Various of SR-IOV fixes and cleanup Alexander Duyck
2015-10-27 20:52 ` [PATCH 1/5] iov: Update virtfn_max_buses to validate offset and stride Alexander Duyck
2015-10-27 20:52 ` [PATCH 2/5] iov: Reset resources to 0 if totalVFs increases after enabling ARI Alexander Duyck
@ 2015-10-27 20:52 ` Alexander Duyck
2015-10-29 16:32 ` Bjorn Helgaas
2015-10-27 20:52 ` [PATCH 4/5] iov: Variable and loop cleanup for sriov_disable and sriov_enable Alexander Duyck
2015-10-27 20:52 ` [PATCH 5/5] iov: Update sriov_enable to correctly handle offset and stride Alexander Duyck
4 siblings, 1 reply; 20+ messages in thread
From: Alexander Duyck @ 2015-10-27 20:52 UTC (permalink / raw)
To: bhelgaas; +Cc: linux-pci, linux-kernel
>From what I can tell there were several errors in the sriov_enable
exception handling path. Below is a brief list of what I believe I am
fixing:
1. If pcibios_enable_sriov failed, we returned without disabling SR-IOV on
the device.
2. If virtfn_add failed we didn't call pcibios_disable_sriov to undo
pcibios_enable_sriov.
3. We were resetting numvfs to 0 before a second had passed for the VFs to
quiesce.
4. Minor coding style issues for white space and for assignment in
conditional check.
Beyond addressing these 4 issues there were also 2 other minor issues in
that retval was a redundant variable with rc, and j wasn't actually needed
as we could simply reverse the loop we were running when setting up i. As
such I have updated the code to address those two items.
Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
drivers/pci/iov.c | 31 +++++++++++++++++--------------
1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 238950412de0..cecc242c1af0 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -231,13 +231,18 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
{
- return 0;
+ return 0;
+}
+
+int __weak pcibios_sriov_disable(struct pci_dev *pdev)
+{
+ return 0;
}
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
int rc;
- int i, j;
+ int i;
int nres;
u16 offset, stride, initial;
struct resource *res;
@@ -245,7 +250,6 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
struct pci_sriov *iov = dev->sriov;
int bars = 0;
int bus;
- int retval;
if (!nr_virtfn)
return 0;
@@ -322,10 +326,11 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
if (nr_virtfn < initial)
initial = nr_virtfn;
- if ((retval = pcibios_sriov_enable(dev, initial))) {
+ rc = pcibios_sriov_enable(dev, initial);
+ if (rc) {
dev_err(&dev->dev, "failure %d from pcibios_sriov_enable()\n",
- retval);
- return retval;
+ rc);
+ goto err_pcibios;
}
for (i = 0; i < initial; i++) {
@@ -340,25 +345,23 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
return 0;
failed:
- for (j = 0; j < i; j++)
- virtfn_remove(dev, j, 0);
+ while (i--)
+ virtfn_remove(dev, i, 0);
+ pcibios_sriov_disable(dev);
+err_pcibios:
iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
pci_cfg_access_lock(dev);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
- pci_iov_set_numvfs(dev, 0);
ssleep(1);
pci_cfg_access_unlock(dev);
if (iov->link != dev->devfn)
sysfs_remove_link(&dev->dev.kobj, "dep_link");
- return rc;
-}
+ pci_iov_set_numvfs(dev, 0);
-int __weak pcibios_sriov_disable(struct pci_dev *pdev)
-{
- return 0;
+ return rc;
}
static void sriov_disable(struct pci_dev *dev)
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 3/5] iov: Fix sriov_enable exception handling path
2015-10-27 20:52 ` [PATCH 3/5] iov: Fix sriov_enable exception handling path Alexander Duyck
@ 2015-10-29 16:32 ` Bjorn Helgaas
2015-10-29 16:54 ` Alex Duyck
0 siblings, 1 reply; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-29 16:32 UTC (permalink / raw)
To: Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
Hi Alex,
Thanks, this definitely clears up some problems. Two minor questions
below.
On Tue, Oct 27, 2015 at 01:52:27PM -0700, Alexander Duyck wrote:
> >From what I can tell there were several errors in the sriov_enable
> exception handling path. Below is a brief list of what I believe I am
> fixing:
>
> 1. If pcibios_enable_sriov failed, we returned without disabling SR-IOV on
> the device.
> 2. If virtfn_add failed we didn't call pcibios_disable_sriov to undo
> pcibios_enable_sriov.
> 3. We were resetting numvfs to 0 before a second had passed for the VFs to
> quiesce.
> 4. Minor coding style issues for white space and for assignment in
> conditional check.
>
> Beyond addressing these 4 issues there were also 2 other minor issues in
> that retval was a redundant variable with rc, and j wasn't actually needed
> as we could simply reverse the loop we were running when setting up i. As
> such I have updated the code to address those two items.
>
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> ---
> drivers/pci/iov.c | 31 +++++++++++++++++--------------
> 1 file changed, 17 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 238950412de0..cecc242c1af0 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -231,13 +231,18 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
>
> int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> {
> - return 0;
> + return 0;
> +}
> +
> +int __weak pcibios_sriov_disable(struct pci_dev *pdev)
> +{
> + return 0;
> }
>
> static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> {
> int rc;
> - int i, j;
> + int i;
> int nres;
> u16 offset, stride, initial;
> struct resource *res;
> @@ -245,7 +250,6 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> struct pci_sriov *iov = dev->sriov;
> int bars = 0;
> int bus;
> - int retval;
>
> if (!nr_virtfn)
> return 0;
> @@ -322,10 +326,11 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> if (nr_virtfn < initial)
> initial = nr_virtfn;
>
> - if ((retval = pcibios_sriov_enable(dev, initial))) {
> + rc = pcibios_sriov_enable(dev, initial);
> + if (rc) {
> dev_err(&dev->dev, "failure %d from pcibios_sriov_enable()\n",
> - retval);
> - return retval;
> + rc);
> + goto err_pcibios;
> }
>
> for (i = 0; i < initial; i++) {
> @@ -340,25 +345,23 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> return 0;
>
> failed:
> - for (j = 0; j < i; j++)
> - virtfn_remove(dev, j, 0);
> + while (i--)
> + virtfn_remove(dev, i, 0);
>
> + pcibios_sriov_disable(dev);
> +err_pcibios:
> iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
> pci_cfg_access_lock(dev);
> pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
> - pci_iov_set_numvfs(dev, 0);
> ssleep(1);
> pci_cfg_access_unlock(dev);
>
> if (iov->link != dev->devfn)
> sysfs_remove_link(&dev->dev.kobj, "dep_link");
>
> - return rc;
> -}
> + pci_iov_set_numvfs(dev, 0);
Do you have a spec pointer for the 1 sec delay before clearing NumVFs?
Does we need to clear NumVFs while holding the cfg access lock?
> -int __weak pcibios_sriov_disable(struct pci_dev *pdev)
> -{
> - return 0;
> + return rc;
> }
>
> static void sriov_disable(struct pci_dev *dev)
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/5] iov: Fix sriov_enable exception handling path
2015-10-29 16:32 ` Bjorn Helgaas
@ 2015-10-29 16:54 ` Alex Duyck
2015-10-29 20:41 ` Bjorn Helgaas
0 siblings, 1 reply; 20+ messages in thread
From: Alex Duyck @ 2015-10-29 16:54 UTC (permalink / raw)
To: Bjorn Helgaas; +Cc: bhelgaas, linux-pci, linux-kernel
On Thu, Oct 29, 2015 at 9:32 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> Hi Alex,
>
> Thanks, this definitely clears up some problems. Two minor questions
> below.
>
> On Tue, Oct 27, 2015 at 01:52:27PM -0700, Alexander Duyck wrote:
>> >From what I can tell there were several errors in the sriov_enable
>> exception handling path. Below is a brief list of what I believe I am
>> fixing:
>>
>> 1. If pcibios_enable_sriov failed, we returned without disabling SR-IOV on
>> the device.
>> 2. If virtfn_add failed we didn't call pcibios_disable_sriov to undo
>> pcibios_enable_sriov.
>> 3. We were resetting numvfs to 0 before a second had passed for the VFs to
>> quiesce.
>> 4. Minor coding style issues for white space and for assignment in
>> conditional check.
>>
>> Beyond addressing these 4 issues there were also 2 other minor issues in
>> that retval was a redundant variable with rc, and j wasn't actually needed
>> as we could simply reverse the loop we were running when setting up i. As
>> such I have updated the code to address those two items.
>>
>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>> ---
>> drivers/pci/iov.c | 31 +++++++++++++++++--------------
>> 1 file changed, 17 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>> index 238950412de0..cecc242c1af0 100644
>> --- a/drivers/pci/iov.c
>> +++ b/drivers/pci/iov.c
>> @@ -231,13 +231,18 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
>>
>> int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> {
>> - return 0;
>> + return 0;
>> +}
>> +
>> +int __weak pcibios_sriov_disable(struct pci_dev *pdev)
>> +{
>> + return 0;
>> }
>>
>> static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>> {
>> int rc;
>> - int i, j;
>> + int i;
>> int nres;
>> u16 offset, stride, initial;
>> struct resource *res;
>> @@ -245,7 +250,6 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>> struct pci_sriov *iov = dev->sriov;
>> int bars = 0;
>> int bus;
>> - int retval;
>>
>> if (!nr_virtfn)
>> return 0;
>> @@ -322,10 +326,11 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>> if (nr_virtfn < initial)
>> initial = nr_virtfn;
>>
>> - if ((retval = pcibios_sriov_enable(dev, initial))) {
>> + rc = pcibios_sriov_enable(dev, initial);
>> + if (rc) {
>> dev_err(&dev->dev, "failure %d from pcibios_sriov_enable()\n",
>> - retval);
>> - return retval;
>> + rc);
>> + goto err_pcibios;
>> }
>>
>> for (i = 0; i < initial; i++) {
>> @@ -340,25 +345,23 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>> return 0;
>>
>> failed:
>> - for (j = 0; j < i; j++)
>> - virtfn_remove(dev, j, 0);
>> + while (i--)
>> + virtfn_remove(dev, i, 0);
>>
>> + pcibios_sriov_disable(dev);
>> +err_pcibios:
>> iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
>> pci_cfg_access_lock(dev);
>> pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
>> - pci_iov_set_numvfs(dev, 0);
>> ssleep(1);
>> pci_cfg_access_unlock(dev);
>>
>> if (iov->link != dev->devfn)
>> sysfs_remove_link(&dev->dev.kobj, "dep_link");
>>
>> - return rc;
>> -}
>> + pci_iov_set_numvfs(dev, 0);
>
> Do you have a spec pointer for the 1 sec delay before clearing NumVFs?
The text from the SR-IOV spec v1.1 in relation to clearing VF enable reads:
If software Clears VF Enable, software must allow 1.0 s second after
VF Enable is Cleared before
reading any field in the SR-IOV Extended Capability or the VF
Migration State Array (see
Section 3.3.15.1).
I'm assuming the same would apply to writing to the region after VFE
has been cleared.
> Does we need to clear NumVFs while holding the cfg access lock?
I don't think so.
Earlier in the function pci_iov_set_numvfs was getting set before
without taking the lock. I think the lock is being used to enforce
the required grace period on configuration space access following
setting or clearing the VFE bit. The code as it is now matches what
we have in sriov_disable so I suspect it likely works this way as that
path has likely seen much more validation than the exception handling
path for sriov_enable has.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 3/5] iov: Fix sriov_enable exception handling path
2015-10-29 16:54 ` Alex Duyck
@ 2015-10-29 20:41 ` Bjorn Helgaas
0 siblings, 0 replies; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-29 20:41 UTC (permalink / raw)
To: Alex Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
On Thu, Oct 29, 2015 at 09:54:00AM -0700, Alex Duyck wrote:
> On Thu, Oct 29, 2015 at 9:32 AM, Bjorn Helgaas <helgaas@kernel.org> wrote:
> > Hi Alex,
> >
> > Thanks, this definitely clears up some problems. Two minor questions
> > below.
> >
> > On Tue, Oct 27, 2015 at 01:52:27PM -0700, Alexander Duyck wrote:
> >> >From what I can tell there were several errors in the sriov_enable
> >> exception handling path. Below is a brief list of what I believe I am
> >> fixing:
> >>
> >> 1. If pcibios_enable_sriov failed, we returned without disabling SR-IOV on
> >> the device.
> >> 2. If virtfn_add failed we didn't call pcibios_disable_sriov to undo
> >> pcibios_enable_sriov.
> >> 3. We were resetting numvfs to 0 before a second had passed for the VFs to
> >> quiesce.
> >> 4. Minor coding style issues for white space and for assignment in
> >> conditional check.
> >>
> >> Beyond addressing these 4 issues there were also 2 other minor issues in
> >> that retval was a redundant variable with rc, and j wasn't actually needed
> >> as we could simply reverse the loop we were running when setting up i. As
> >> such I have updated the code to address those two items.
> >>
> >> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> >> ---
> >> drivers/pci/iov.c | 31 +++++++++++++++++--------------
> >> 1 file changed, 17 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> >> index 238950412de0..cecc242c1af0 100644
> >> --- a/drivers/pci/iov.c
> >> +++ b/drivers/pci/iov.c
> >> @@ -231,13 +231,18 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
> >>
> >> int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> >> {
> >> - return 0;
> >> + return 0;
> >> +}
> >> +
> >> +int __weak pcibios_sriov_disable(struct pci_dev *pdev)
> >> +{
> >> + return 0;
> >> }
> >>
> >> static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> >> {
> >> int rc;
> >> - int i, j;
> >> + int i;
> >> int nres;
> >> u16 offset, stride, initial;
> >> struct resource *res;
> >> @@ -245,7 +250,6 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> >> struct pci_sriov *iov = dev->sriov;
> >> int bars = 0;
> >> int bus;
> >> - int retval;
> >>
> >> if (!nr_virtfn)
> >> return 0;
> >> @@ -322,10 +326,11 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> >> if (nr_virtfn < initial)
> >> initial = nr_virtfn;
> >>
> >> - if ((retval = pcibios_sriov_enable(dev, initial))) {
> >> + rc = pcibios_sriov_enable(dev, initial);
> >> + if (rc) {
> >> dev_err(&dev->dev, "failure %d from pcibios_sriov_enable()\n",
> >> - retval);
> >> - return retval;
> >> + rc);
> >> + goto err_pcibios;
> >> }
> >>
> >> for (i = 0; i < initial; i++) {
> >> @@ -340,25 +345,23 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> >> return 0;
> >>
> >> failed:
> >> - for (j = 0; j < i; j++)
> >> - virtfn_remove(dev, j, 0);
> >> + while (i--)
> >> + virtfn_remove(dev, i, 0);
> >>
> >> + pcibios_sriov_disable(dev);
> >> +err_pcibios:
> >> iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
> >> pci_cfg_access_lock(dev);
> >> pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
> >> - pci_iov_set_numvfs(dev, 0);
> >> ssleep(1);
> >> pci_cfg_access_unlock(dev);
> >>
> >> if (iov->link != dev->devfn)
> >> sysfs_remove_link(&dev->dev.kobj, "dep_link");
> >>
> >> - return rc;
> >> -}
> >> + pci_iov_set_numvfs(dev, 0);
> >
> > Do you have a spec pointer for the 1 sec delay before clearing NumVFs?
>
> The text from the SR-IOV spec v1.1 in relation to clearing VF enable reads:
>
> If software Clears VF Enable, software must allow 1.0 s second after
> VF Enable is Cleared before
> reading any field in the SR-IOV Extended Capability or the VF
> Migration State Array (see
> Section 3.3.15.1).
>
> I'm assuming the same would apply to writing to the region after VFE
> has been cleared.
Yep, thanks. Sec 3.3.3.1 clearly says we have to wait 1.0s after
clearing VF Enable before reading anything in the capability. And
pci_iov_set_numvfs() *does* read PCI_SRIOV_VF_OFFSET and
PCI_SRIOV_VF_STRIDE.
> > Does we need to clear NumVFs while holding the cfg access lock?
>
> I don't think so.
>
> Earlier in the function pci_iov_set_numvfs was getting set before
> without taking the lock. I think the lock is being used to enforce
> the required grace period on configuration space access following
> setting or clearing the VFE bit. The code as it is now matches what
> we have in sriov_disable so I suspect it likely works this way as that
> path has likely seen much more validation than the exception handling
> path for sriov_enable has.
Right. I think the important part is that we hold the lock during the
ssleep(1).
Slightly different problem: I'm a little worried about the places in
sriov_enable() and sriov_restore_state() where we set VF Enable and
msleep for 100ms. Sec. 3.3.3.1 requires the 100ms before we issue
config requests to the VFs, the msleep satisfies that.
But 3.3.3.1 goes on to say the new VFs can return CRS status for up to
1.0s, and they can silently drop Memory Requests for up to 1.0s. I
don't think the VF add path checks for CRS status: it doesn't call
pci_bus_read_dev_vendor_id(). So I'm not sure we're quite covered
here.
Bjorn
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 4/5] iov: Variable and loop cleanup for sriov_disable and sriov_enable
2015-10-27 20:52 [PATCH 0/5] Various of SR-IOV fixes and cleanup Alexander Duyck
` (2 preceding siblings ...)
2015-10-27 20:52 ` [PATCH 3/5] iov: Fix sriov_enable exception handling path Alexander Duyck
@ 2015-10-27 20:52 ` Alexander Duyck
2015-10-29 21:43 ` Bjorn Helgaas
2015-10-27 20:52 ` [PATCH 5/5] iov: Update sriov_enable to correctly handle offset and stride Alexander Duyck
4 siblings, 1 reply; 20+ messages in thread
From: Alexander Duyck @ 2015-10-27 20:52 UTC (permalink / raw)
To: bhelgaas; +Cc: linux-pci, linux-kernel
This patch is just a minor cleanup to go through and group all of the
variables into one declaration instead of a long string of single
declarations for each int. It also changes the direction for a couple
loops as we are able to loop with less code this way as testing against 0
can be done as a part of the decrement operation.
Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
drivers/pci/iov.c | 13 ++++---------
1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index cecc242c1af0..c0fc88fa7c4d 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -241,15 +241,11 @@ int __weak pcibios_sriov_disable(struct pci_dev *pdev)
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
- int rc;
- int i;
- int nres;
u16 offset, stride, initial;
struct resource *res;
struct pci_dev *pdev;
struct pci_sriov *iov = dev->sriov;
- int bars = 0;
- int bus;
+ int rc, i, nres, bars, bus;
if (!nr_virtfn)
return 0;
@@ -271,8 +267,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
if (!offset || (nr_virtfn > 1 && !stride))
return -EIO;
- nres = 0;
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ for (nres = 0, bars = 0, i = PCI_SRIOV_NUM_BARS; i--;) {
bars |= (1 << (i + PCI_IOV_RESOURCES));
res = &dev->resource[i + PCI_IOV_RESOURCES];
if (res->parent)
@@ -366,13 +361,13 @@ err_pcibios:
static void sriov_disable(struct pci_dev *dev)
{
- int i;
struct pci_sriov *iov = dev->sriov;
+ int i = iov->num_VFs;
if (!iov->num_VFs)
return;
- for (i = 0; i < iov->num_VFs; i++)
+ while (i--)
virtfn_remove(dev, i, 0);
pcibios_sriov_disable(dev);
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH 4/5] iov: Variable and loop cleanup for sriov_disable and sriov_enable
2015-10-27 20:52 ` [PATCH 4/5] iov: Variable and loop cleanup for sriov_disable and sriov_enable Alexander Duyck
@ 2015-10-29 21:43 ` Bjorn Helgaas
2015-10-29 23:19 ` Alexander Duyck
0 siblings, 1 reply; 20+ messages in thread
From: Bjorn Helgaas @ 2015-10-29 21:43 UTC (permalink / raw)
To: Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
Hi Alex,
On Tue, Oct 27, 2015 at 01:52:33PM -0700, Alexander Duyck wrote:
> This patch is just a minor cleanup to go through and group all of the
> variables into one declaration instead of a long string of single
> declarations for each int. It also changes the direction for a couple
> loops as we are able to loop with less code this way as testing against 0
> can be done as a part of the decrement operation.
>
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> ---
> drivers/pci/iov.c | 13 ++++---------
> 1 file changed, 4 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index cecc242c1af0..c0fc88fa7c4d 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -241,15 +241,11 @@ int __weak pcibios_sriov_disable(struct pci_dev *pdev)
>
> static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> {
> - int rc;
> - int i;
> - int nres;
> u16 offset, stride, initial;
> struct resource *res;
> struct pci_dev *pdev;
> struct pci_sriov *iov = dev->sriov;
> - int bars = 0;
> - int bus;
> + int rc, i, nres, bars, bus;
I don't have a strong opinion on combining the declarations to one line,
and I would apply it if you wanted to do the same for the whole file
at once, in a patch by itself.
> if (!nr_virtfn)
> return 0;
> @@ -271,8 +267,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
> if (!offset || (nr_virtfn > 1 && !stride))
> return -EIO;
>
> - nres = 0;
> - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> + for (nres = 0, bars = 0, i = PCI_SRIOV_NUM_BARS; i--;) {
But I don't agree that this is easier to read. I suppose it could be
a tiny bit more efficient, but I think the benefit to the reader of
the usual "for (i = 0; i < limit; i++)" loop is larger.
> bars |= (1 << (i + PCI_IOV_RESOURCES));
> res = &dev->resource[i + PCI_IOV_RESOURCES];
> if (res->parent)
> @@ -366,13 +361,13 @@ err_pcibios:
>
> static void sriov_disable(struct pci_dev *dev)
> {
> - int i;
> struct pci_sriov *iov = dev->sriov;
> + int i = iov->num_VFs;
>
> if (!iov->num_VFs)
> return;
>
> - for (i = 0; i < iov->num_VFs; i++)
> + while (i--)
> virtfn_remove(dev, i, 0);
I do like the change to remove devices in the reverse order as we
added them. But I'm really partial to the way a "for" loop keeps all
the loop control in one spot. So I would apply a patch that made it
look like this:
for (i = iov->num_VFs - 1; i >= 0; i--)
virtfn_remove(dev, i, 0);
> pcibios_sriov_disable(dev);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 4/5] iov: Variable and loop cleanup for sriov_disable and sriov_enable
2015-10-29 21:43 ` Bjorn Helgaas
@ 2015-10-29 23:19 ` Alexander Duyck
0 siblings, 0 replies; 20+ messages in thread
From: Alexander Duyck @ 2015-10-29 23:19 UTC (permalink / raw)
To: Bjorn Helgaas, Alexander Duyck; +Cc: bhelgaas, linux-pci, linux-kernel
On 10/29/2015 02:43 PM, Bjorn Helgaas wrote:
> Hi Alex,
>
> On Tue, Oct 27, 2015 at 01:52:33PM -0700, Alexander Duyck wrote:
>> This patch is just a minor cleanup to go through and group all of the
>> variables into one declaration instead of a long string of single
>> declarations for each int. It also changes the direction for a couple
>> loops as we are able to loop with less code this way as testing against 0
>> can be done as a part of the decrement operation.
>>
>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>> ---
>> drivers/pci/iov.c | 13 ++++---------
>> 1 file changed, 4 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>> index cecc242c1af0..c0fc88fa7c4d 100644
>> --- a/drivers/pci/iov.c
>> +++ b/drivers/pci/iov.c
>> @@ -241,15 +241,11 @@ int __weak pcibios_sriov_disable(struct pci_dev *pdev)
>>
>> static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>> {
>> - int rc;
>> - int i;
>> - int nres;
>> u16 offset, stride, initial;
>> struct resource *res;
>> struct pci_dev *pdev;
>> struct pci_sriov *iov = dev->sriov;
>> - int bars = 0;
>> - int bus;
>> + int rc, i, nres, bars, bus;
> I don't have a strong opinion on combining the declarations to one line,
> and I would apply it if you wanted to do the same for the whole file
> at once, in a patch by itself.
Maybe I will work on that tonight. It doesn't look like it would be
much work.
>
>> if (!nr_virtfn)
>> return 0;
>> @@ -271,8 +267,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>> if (!offset || (nr_virtfn > 1 && !stride))
>> return -EIO;
>>
>> - nres = 0;
>> - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> + for (nres = 0, bars = 0, i = PCI_SRIOV_NUM_BARS; i--;) {
> But I don't agree that this is easier to read. I suppose it could be
> a tiny bit more efficient, but I think the benefit to the reader of
> the usual "for (i = 0; i < limit; i++)" loop is larger.
I agree with you. Pulling nres and bars into the loop was probably a
bad idea on my part.
As far as reordering the loops that is just a bad habit I have kind of
developed from doing driver performance tuning. Running the loop
backwards you are able to combine the test and decrement so it saves a
few instructions since compare against 0 or signed is usually built in
for free with the decrement instructions. For something like this it
really isn't needed.
>> bars |= (1 << (i + PCI_IOV_RESOURCES));
>> res = &dev->resource[i + PCI_IOV_RESOURCES];
>> if (res->parent)
>> @@ -366,13 +361,13 @@ err_pcibios:
>>
>> static void sriov_disable(struct pci_dev *dev)
>> {
>> - int i;
>> struct pci_sriov *iov = dev->sriov;
>> + int i = iov->num_VFs;
>>
>> if (!iov->num_VFs)
>> return;
>>
>> - for (i = 0; i < iov->num_VFs; i++)
>> + while (i--)
>> virtfn_remove(dev, i, 0);
> I do like the change to remove devices in the reverse order as we
> added them. But I'm really partial to the way a "for" loop keeps all
> the loop control in one spot. So I would apply a patch that made it
> look like this:
>
> for (i = iov->num_VFs - 1; i >= 0; i--)
> virtfn_remove(dev, i, 0);
>
Yeah, this was a section I had gone back and forth on. I originally had
it doing a '!i' check at the start instead of '!iov->num_VFs'. I think
that was why I pulled it out like that. I started to undo parts of it
for readability sake, but I probably should have undone the move.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 5/5] iov: Update sriov_enable to correctly handle offset and stride
2015-10-27 20:52 [PATCH 0/5] Various of SR-IOV fixes and cleanup Alexander Duyck
` (3 preceding siblings ...)
2015-10-27 20:52 ` [PATCH 4/5] iov: Variable and loop cleanup for sriov_disable and sriov_enable Alexander Duyck
@ 2015-10-27 20:52 ` Alexander Duyck
4 siblings, 0 replies; 20+ messages in thread
From: Alexander Duyck @ 2015-10-27 20:52 UTC (permalink / raw)
To: bhelgaas; +Cc: linux-pci, linux-kernel
As per the SR-IOV spec the values for offset and stride are undefined until
ARI and numVFs have been set. As such we should not be reading them until
the values have been assigned to the hardware. In addition if numVFs is 0
the values for offset and stride are defined as unused.
This change corrects a spot where we were reading the offset and stride
even though we had left numVFs set to 0.
Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
drivers/pci/iov.c | 28 +++++++++++-----------------
1 file changed, 11 insertions(+), 17 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index c0fc88fa7c4d..030568520772 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -241,11 +241,11 @@ int __weak pcibios_sriov_disable(struct pci_dev *pdev)
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
- u16 offset, stride, initial;
struct resource *res;
struct pci_dev *pdev;
struct pci_sriov *iov = dev->sriov;
int rc, i, nres, bars, bus;
+ u16 initial;
if (!nr_virtfn)
return 0;
@@ -262,11 +262,6 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
(!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
return -EINVAL;
- pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
- pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
- if (!offset || (nr_virtfn > 1 && !stride))
- return -EIO;
-
for (nres = 0, bars = 0, i = PCI_SRIOV_NUM_BARS; i--;) {
bars |= (1 << (i + PCI_IOV_RESOURCES));
res = &dev->resource[i + PCI_IOV_RESOURCES];
@@ -278,16 +273,6 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
return -ENOMEM;
}
- iov->offset = offset;
- iov->stride = stride;
-
- bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
- if (bus > dev->bus->busn_res.end) {
- dev_err(&dev->dev, "can't enable %d VFs (bus %02x out of range of %pR)\n",
- nr_virtfn, bus, &dev->bus->busn_res);
- return -ENOMEM;
- }
-
if (pci_enable_resources(dev, bars)) {
dev_err(&dev->dev, "SR-IOV: IOV BARS not allocated\n");
return -ENOMEM;
@@ -311,6 +296,15 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
}
pci_iov_set_numvfs(dev, nr_virtfn);
+
+ bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
+ if (bus > dev->bus->busn_res.end) {
+ dev_err(&dev->dev, "can't enable %d VFs (bus %02x out of range of %pR)\n",
+ nr_virtfn, bus, &dev->bus->busn_res);
+ rc = -ENOMEM;
+ goto err_bus;
+ }
+
iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
pci_cfg_access_lock(dev);
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
@@ -350,7 +344,7 @@ err_pcibios:
pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
ssleep(1);
pci_cfg_access_unlock(dev);
-
+err_bus:
if (iov->link != dev->devfn)
sysfs_remove_link(&dev->dev.kobj, "dep_link");
^ permalink raw reply related [flat|nested] 20+ messages in thread