linuxppc-dev.lists.ozlabs.org archive mirror
* [PATCH] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
@ 2015-07-29  7:22 Wei Yang
  2015-07-30  1:15 ` Gavin Shan
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-07-29  7:22 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

In the current implementation, when a VF BAR is bigger than 64MB, it uses 4 M64
BARs in Single PE mode to cover the number of VFs required to be enabled.
By doing so, several VFs would be in one VF group, which leads to interference
between VFs in the same group.

This patch changes the design by using one M64 BAR in Single PE mode for
one VF BAR. This gives absolute isolation for VFs.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pci-bridge.h     |    5 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  104 +++++------------------------
 2 files changed, 18 insertions(+), 91 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 712add5..1997e5d 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -214,10 +214,9 @@ struct pci_dn {
 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
 	u16     num_vfs;		/* number of VFs enabled*/
 	int     offset;			/* PE# for the first VF PE */
-#define M64_PER_IOV 4
-	int     m64_per_iov;
+#define MAX_M64_WINDOW  16
 #define IODA_INVALID_M64        (-1)
-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
+	int     m64_wins[PCI_SRIOV_NUM_BARS][MAX_M64_WINDOW];
 #endif /* CONFIG_PCI_IOV */
 #endif
 	struct list_head child_list;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5738d31..b3e7909 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1168,7 +1168,7 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
 	pdn = pci_get_pdn(pdev);
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < M64_PER_IOV; j++) {
+		for (j = 0; j < MAX_M64_WINDOW; j++) {
 			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
 				continue;
 			opal_pci_phb_mmio_enable(phb->opal_id,
@@ -1193,8 +1193,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	int                    total_vfs;
 	resource_size_t        size, start;
 	int                    pe_num;
-	int                    vf_groups;
-	int                    vf_per_group;
+	int                    m64s;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1204,17 +1203,13 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 
 	/* Initialize the m64_wins to IODA_INVALID_M64 */
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < M64_PER_IOV; j++)
+		for (j = 0; j < MAX_M64_WINDOW; j++)
 			pdn->m64_wins[i][j] = IODA_INVALID_M64;
 
-	if (pdn->m64_per_iov == M64_PER_IOV) {
-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-	} else {
-		vf_groups = 1;
-		vf_per_group = 1;
-	}
+	if (pdn->vfs_expanded != phb->ioda.total_pe)
+		m64s = num_vfs;
+	else
+		m64s = 1;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
@@ -1224,7 +1219,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 		if (!pnv_pci_is_mem_pref_64(res->flags))
 			continue;
 
-		for (j = 0; j < vf_groups; j++) {
+		for (j = 0; j < m64s; j++) {
 			do {
 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
 						phb->ioda.m64_bar_idx + 1, 0);
@@ -1235,10 +1230,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 
 			pdn->m64_wins[i][j] = win;
 
-			if (pdn->m64_per_iov == M64_PER_IOV) {
+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
 				size = pci_iov_resource_size(pdev,
 							PCI_IOV_RESOURCES + i);
-				size = size * vf_per_group;
 				start = res->start + size * j;
 			} else {
 				size = resource_size(res);
@@ -1246,7 +1240,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 			}
 
 			/* Map the M64 here */
-			if (pdn->m64_per_iov == M64_PER_IOV) {
+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
 				pe_num = pdn->offset + j;
 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
 						pe_num, OPAL_M64_WINDOW_TYPE,
@@ -1267,7 +1261,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 				goto m64_failed;
 			}
 
-			if (pdn->m64_per_iov == M64_PER_IOV)
+			if (pdn->vfs_expanded != phb->ioda.total_pe)
 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
 				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
 			else
@@ -1311,15 +1305,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 }
 
-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 {
 	struct pci_bus        *bus;
 	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pnv_ioda_pe    *pe, *pe_n;
 	struct pci_dn         *pdn;
-	u16                    vf_index;
-	int64_t                rc;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1329,35 +1321,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 	if (!pdev->is_physfn)
 		return;
 
-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
-		int   vf_group;
-		int   vf_per_group;
-		int   vf_index1;
-
-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-
-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
-			for (vf_index = vf_group * vf_per_group;
-				vf_index < (vf_group + 1) * vf_per_group &&
-				vf_index < num_vfs;
-				vf_index++)
-				for (vf_index1 = vf_group * vf_per_group;
-					vf_index1 < (vf_group + 1) * vf_per_group &&
-					vf_index1 < num_vfs;
-					vf_index1++){
-
-					rc = opal_pci_set_peltv(phb->opal_id,
-						pdn->offset + vf_index,
-						pdn->offset + vf_index1,
-						OPAL_REMOVE_PE_FROM_DOMAIN);
-
-					if (rc)
-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
-						__func__,
-						pdn->offset + vf_index1, rc);
-				}
-	}
-
 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
 		if (pe->parent_dev != pdev)
 			continue;
@@ -1392,10 +1355,10 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 	num_vfs = pdn->num_vfs;
 
 	/* Release VF PEs */
-	pnv_ioda_release_vf_PE(pdev, num_vfs);
+	pnv_ioda_release_vf_PE(pdev);
 
 	if (phb->type == PNV_PHB_IODA2) {
-		if (pdn->m64_per_iov == 1)
+		if (pdn->vfs_expanded == phb->ioda.total_pe)
 			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
 
 		/* Release M64 windows */
@@ -1418,7 +1381,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 	int                    pe_num;
 	u16                    vf_index;
 	struct pci_dn         *pdn;
-	int64_t                rc;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1463,37 +1425,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
 	}
-
-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
-		int   vf_group;
-		int   vf_per_group;
-		int   vf_index1;
-
-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-
-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
-			for (vf_index = vf_group * vf_per_group;
-			     vf_index < (vf_group + 1) * vf_per_group &&
-			     vf_index < num_vfs;
-			     vf_index++) {
-				for (vf_index1 = vf_group * vf_per_group;
-				     vf_index1 < (vf_group + 1) * vf_per_group &&
-				     vf_index1 < num_vfs;
-				     vf_index1++) {
-
-					rc = opal_pci_set_peltv(phb->opal_id,
-						pdn->offset + vf_index,
-						pdn->offset + vf_index1,
-						OPAL_ADD_PE_TO_DOMAIN);
-
-					if (rc)
-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
-						__func__,
-						pdn->offset + vf_index1, rc);
-				}
-			}
-		}
-	}
 }
 
 int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
@@ -1537,7 +1468,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 		 * the IOV BAR according to the PE# allocated to the VFs.
 		 * Otherwise, the PE# for the VF will conflict with others.
 		 */
-		if (pdn->m64_per_iov == 1) {
+		if (pdn->vfs_expanded == phb->ioda.total_pe) {
 			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
 			if (ret)
 				goto m64_failed;
@@ -1570,8 +1501,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	/* Allocate PCI data */
 	add_dev_pci_data(pdev);
 
-	pnv_pci_sriov_enable(pdev, num_vfs);
-	return 0;
+	return pnv_pci_sriov_enable(pdev, num_vfs);
 }
 #endif /* CONFIG_PCI_IOV */
 
@@ -2766,7 +2696,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 	pdn->vfs_expanded = 0;
 
 	total_vfs = pci_sriov_get_totalvfs(pdev);
-	pdn->m64_per_iov = 1;
 	mul = phb->ioda.total_pe;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -2785,7 +2714,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		if (size > (1 << 26)) {
 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
 				 i, res);
-			pdn->m64_per_iov = M64_PER_IOV;
 			mul = roundup_pow_of_two(total_vfs);
 			break;
 		}
-- 
1.7.9.5


* Re: [PATCH] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-07-29  7:22 [PATCH] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR Wei Yang
@ 2015-07-30  1:15 ` Gavin Shan
  2015-07-30  5:43   ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-07-30  1:15 UTC (permalink / raw)
  To: Wei Yang; +Cc: aik, gwshan, benh, linuxppc-dev

On Wed, Jul 29, 2015 at 03:22:07PM +0800, Wei Yang wrote:
>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>BAR in Single PE mode to cover the number of VFs required to be enabled.
>By doing so, several VFs would be in one VF Group and leads to interference
>between VFs in the same group.
>
>This patch changes the design by using one M64 BAR in Single PE mode for
>one VF BAR. This gives absolute isolation for VFs.
>
>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>---
> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
> arch/powerpc/platforms/powernv/pci-ioda.c |  104 +++++------------------------
> 2 files changed, 18 insertions(+), 91 deletions(-)
>

questions regarding this:

(1) When an M64 BAR is running in single-PE mode for VFs, does the alignment for
    one particular IOV BAR still have to be (IOV_BAR_size * max_vf_number), or
    is the M64 segment size of the last BAR (0x10000000) fine? If the latter is
    fine, more M64 space would be saved. On the other hand, if the IOV BAR size
    (for all VFs) is less than 256MB, will the allocated resource conflict
    with the M64 segments in the last BAR?
(2) When M64 BARs are in single-PE mode, do the PE numbers allocated for VFs
    need to be continuous or not?
(3) Each PF could have 6 IOV BARs and there are 15 available M64 BARs. It means
    only two VFs can be enabled in the extreme case (rough arithmetic below).
    Would it be a problem?
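
The arithmetic behind that "two VFs" estimate, assuming one single-PE-mode M64
BAR is consumed per IOV BAR per enabled VF:

	M64 BARs consumed = 6 (IOV BARs) * num_vfs
	15 available BARs => num_vfs <= 15 / 6 = 2 (rounding down)

so at most two VFs fit when all six IOV BARs need M64 space.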

>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>index 712add5..1997e5d 100644
>--- a/arch/powerpc/include/asm/pci-bridge.h
>+++ b/arch/powerpc/include/asm/pci-bridge.h
>@@ -214,10 +214,9 @@ struct pci_dn {
> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
> 	u16     num_vfs;		/* number of VFs enabled*/
> 	int     offset;			/* PE# for the first VF PE */
>-#define M64_PER_IOV 4
>-	int     m64_per_iov;
>+#define MAX_M64_WINDOW  16
> #define IODA_INVALID_M64        (-1)
>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>+	int     m64_wins[PCI_SRIOV_NUM_BARS][MAX_M64_WINDOW];
> #endif /* CONFIG_PCI_IOV */
> #endif

The "m64_wins" would be renamed to "m64_map". Also, it would have dynamic size:

- When the IOV BAR is extended to 256 segments, its size is sizeof(int) * PCI_SRIOV_NUM_BARS;
- When the IOV BAR is extended to max_vf_num, its size is sizeof(int) * max_vf_num;
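
A rough sketch of that dynamic allocation in pnv_pci_vf_assign_m64(), assuming
"m64_wins" becomes a plain "int *m64_map" in struct pci_dn; "m64_bars" and
"single_pe_mode" are placeholder locals for whatever final mode check is used,
and keeping one slot per IOV BAR per window is just one way to size it:

	/* one window slot per IOV BAR in shared mode, one per VF per IOV BAR
	 * in single-PE mode */
	m64_bars = single_pe_mode ? num_vfs : 1;

	pdn->m64_map = kmalloc_array(PCI_SRIOV_NUM_BARS * m64_bars,
				     sizeof(*pdn->m64_map), GFP_KERNEL);
	if (!pdn->m64_map)
		return -ENOMEM;

	/* same IODA_INVALID_M64 initialization the fixed-size array had */
	for (i = 0; i < PCI_SRIOV_NUM_BARS * m64_bars; i++)
		pdn->m64_map[i] = IODA_INVALID_M64;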

> 	struct list_head child_list;
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 5738d31..b3e7909 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1168,7 +1168,7 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
> 	pdn = pci_get_pdn(pdev);
>
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>-		for (j = 0; j < M64_PER_IOV; j++) {
>+		for (j = 0; j < MAX_M64_WINDOW; j++) {
> 			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
> 				continue;
> 			opal_pci_phb_mmio_enable(phb->opal_id,
>@@ -1193,8 +1193,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 	int                    total_vfs;
> 	resource_size_t        size, start;
> 	int                    pe_num;
>-	int                    vf_groups;
>-	int                    vf_per_group;
>+	int                    m64s;

"m64s" could have better name. For example, "vfs_per_m64_bar"...

>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1204,17 +1203,13 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>
> 	/* Initialize the m64_wins to IODA_INVALID_M64 */
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>-		for (j = 0; j < M64_PER_IOV; j++)
>+		for (j = 0; j < MAX_M64_WINDOW; j++)
> 			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>
>-	if (pdn->m64_per_iov == M64_PER_IOV) {
>-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>-	} else {
>-		vf_groups = 1;
>-		vf_per_group = 1;
>-	}
>+	if (pdn->vfs_expanded != phb->ioda.total_pe)
>+		m64s = num_vfs;
>+	else
>+		m64s = 1;

The condition (pdn->vfs_expanded != phb->ioda.total_pe) isn't precise enough as
explained below.

>
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>@@ -1224,7 +1219,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 		if (!pnv_pci_is_mem_pref_64(res->flags))
> 			continue;
>
>-		for (j = 0; j < vf_groups; j++) {
>+		for (j = 0; j < m64s; j++) {
> 			do {
> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
> 						phb->ioda.m64_bar_idx + 1, 0);
>@@ -1235,10 +1230,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>
> 			pdn->m64_wins[i][j] = win;
>
>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
> 				size = pci_iov_resource_size(pdev,
> 							PCI_IOV_RESOURCES + i);
>-				size = size * vf_per_group;
> 				start = res->start + size * j;
> 			} else {
> 				size = resource_size(res);
>@@ -1246,7 +1240,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 			}
>
> 			/* Map the M64 here */
>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
> 				pe_num = pdn->offset + j;
> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> 						pe_num, OPAL_M64_WINDOW_TYPE,
>@@ -1267,7 +1261,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 				goto m64_failed;
> 			}
>
>-			if (pdn->m64_per_iov == M64_PER_IOV)
>+			if (pdn->vfs_expanded != phb->ioda.total_pe)
> 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> 				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
> 			else
>@@ -1311,15 +1305,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> }
>
>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> {
> 	struct pci_bus        *bus;
> 	struct pci_controller *hose;
> 	struct pnv_phb        *phb;
> 	struct pnv_ioda_pe    *pe, *pe_n;
> 	struct pci_dn         *pdn;
>-	u16                    vf_index;
>-	int64_t                rc;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1329,35 +1321,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> 	if (!pdev->is_physfn)
> 		return;
>
>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>-		int   vf_group;
>-		int   vf_per_group;
>-		int   vf_index1;
>-
>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>-
>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>-			for (vf_index = vf_group * vf_per_group;
>-				vf_index < (vf_group + 1) * vf_per_group &&
>-				vf_index < num_vfs;
>-				vf_index++)
>-				for (vf_index1 = vf_group * vf_per_group;
>-					vf_index1 < (vf_group + 1) * vf_per_group &&
>-					vf_index1 < num_vfs;
>-					vf_index1++){
>-
>-					rc = opal_pci_set_peltv(phb->opal_id,
>-						pdn->offset + vf_index,
>-						pdn->offset + vf_index1,
>-						OPAL_REMOVE_PE_FROM_DOMAIN);
>-
>-					if (rc)
>-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>-						__func__,
>-						pdn->offset + vf_index1, rc);
>-				}
>-	}
>-
> 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
> 		if (pe->parent_dev != pdev)
> 			continue;
>@@ -1392,10 +1355,10 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
> 	num_vfs = pdn->num_vfs;
>
> 	/* Release VF PEs */
>-	pnv_ioda_release_vf_PE(pdev, num_vfs);
>+	pnv_ioda_release_vf_PE(pdev);
>
> 	if (phb->type == PNV_PHB_IODA2) {
>-		if (pdn->m64_per_iov == 1)
>+		if (pdn->vfs_expanded == phb->ioda.total_pe)
> 			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>
> 		/* Release M64 windows */
>@@ -1418,7 +1381,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> 	int                    pe_num;
> 	u16                    vf_index;
> 	struct pci_dn         *pdn;
>-	int64_t                rc;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1463,37 +1425,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>
> 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
> 	}
>-
>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>-		int   vf_group;
>-		int   vf_per_group;
>-		int   vf_index1;
>-
>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>-
>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>-			for (vf_index = vf_group * vf_per_group;
>-			     vf_index < (vf_group + 1) * vf_per_group &&
>-			     vf_index < num_vfs;
>-			     vf_index++) {
>-				for (vf_index1 = vf_group * vf_per_group;
>-				     vf_index1 < (vf_group + 1) * vf_per_group &&
>-				     vf_index1 < num_vfs;
>-				     vf_index1++) {
>-
>-					rc = opal_pci_set_peltv(phb->opal_id,
>-						pdn->offset + vf_index,
>-						pdn->offset + vf_index1,
>-						OPAL_ADD_PE_TO_DOMAIN);
>-
>-					if (rc)
>-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>-						__func__,
>-						pdn->offset + vf_index1, rc);
>-				}
>-			}
>-		}
>-	}
> }
>
> int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>@@ -1537,7 +1468,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 		 * the IOV BAR according to the PE# allocated to the VFs.
> 		 * Otherwise, the PE# for the VF will conflict with others.
> 		 */
>-		if (pdn->m64_per_iov == 1) {
>+		if (pdn->vfs_expanded == phb->ioda.total_pe) {

This condition isn't precise enough. When a PF happens to support 256 VFs
and the summed size of all IOV BARs (explained below) exceeds 64MB, we're
expecting to use single-PE-mode M64 BARs, not shared mode.

> 			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
> 			if (ret)
> 				goto m64_failed;
>@@ -1570,8 +1501,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 	/* Allocate PCI data */
> 	add_dev_pci_data(pdev);
>
>-	pnv_pci_sriov_enable(pdev, num_vfs);
>-	return 0;
>+	return pnv_pci_sriov_enable(pdev, num_vfs);
> }
> #endif /* CONFIG_PCI_IOV */
>
>@@ -2766,7 +2696,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 	pdn->vfs_expanded = 0;
>
> 	total_vfs = pci_sriov_get_totalvfs(pdev);
>-	pdn->m64_per_iov = 1;
> 	mul = phb->ioda.total_pe;
>
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>@@ -2785,7 +2714,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 		if (size > (1 << 26)) {

Actually, the condition isn't precise enough. In theory, every PF can have 6 IOV BARs.
If each of them is 64MB, we will have 256 extended VFs. The total MMIO size needed
is 96GB = (6 * 64MB * 256), which exceeds 64GB. The original idea was to use the
other scheme (not extending to 256 VFs) when the sum of all IOV BARs is bigger
than 64MB, rather than keying on a single VF BAR. It's a different issue and you
can fix it up in another patch if you want.

> 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
> 				 i, res);
>-			pdn->m64_per_iov = M64_PER_IOV;
> 			mul = roundup_pow_of_two(total_vfs);
> 			break;
> 		}
>-- 
>1.7.9.5
>


* Re: [PATCH] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-07-30  1:15 ` Gavin Shan
@ 2015-07-30  5:43   ` Wei Yang
  2015-07-31  0:13     ` Gavin Shan
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-07-30  5:43 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Thu, Jul 30, 2015 at 11:15:01AM +1000, Gavin Shan wrote:
>On Wed, Jul 29, 2015 at 03:22:07PM +0800, Wei Yang wrote:
>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>By doing so, several VFs would be in one VF Group and leads to interference
>>between VFs in the same group.
>>
>>This patch changes the design by using one M64 BAR in Single PE mode for
>>one VF BAR. This gives absolute isolation for VFs.
>>
>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>---
>> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>> arch/powerpc/platforms/powernv/pci-ioda.c |  104 +++++------------------------
>> 2 files changed, 18 insertions(+), 91 deletions(-)
>>
>
>questions regarding this:
>
>(1) When M64 BAR is running in single-PE-mode for VFs, the alignment for one
>    particular IOV BAR still have to be (IOV_BAR_size * max_vf_number), or
>    M64 segment size of last BAR (0x10000000) is fine? If the later one is fine,
>    more M64 space would be saved. On the other hand, if the IOV BAR size
>    (for all VFs) is less than 256MB, will the allocated resource conflict
>    with the M64 segments in last BAR?

It does not need to be IOV BAR size aligned; being aligned to the individual VF BAR
size is fine.

IOV BAR size = VF BAR size * expanded_num_vfs

>(2) When M64 BAR is in single-PE-mode, the PE numbers allocated for VFs need
>    continuous or not.

No, they don't need to be continuous.

>(3) Each PF could have 6 IOV BARs and there're 15 available M64 BAR. It means
>    only two VFs can be enabled in the extreme case. Would it be a problem?
>

Yes, you are right.

Based on Alexey's mail, full isolation is more important than more VFs.

>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>index 712add5..1997e5d 100644
>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>@@ -214,10 +214,9 @@ struct pci_dn {
>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>> 	u16     num_vfs;		/* number of VFs enabled*/
>> 	int     offset;			/* PE# for the first VF PE */
>>-#define M64_PER_IOV 4
>>-	int     m64_per_iov;
>>+#define MAX_M64_WINDOW  16
>> #define IODA_INVALID_M64        (-1)
>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>+	int     m64_wins[PCI_SRIOV_NUM_BARS][MAX_M64_WINDOW];
>> #endif /* CONFIG_PCI_IOV */
>> #endif
>
>The "m64_wins" would be renamed to "m64_map". Also, it would have dynamic size:
>
>- When the IOV BAR is extended to 256 segments, its size is sizeof(int) * PCI_SRIOV_NUM_BARS;
>- When the IOV BAR is extended to max_vf_num, its size is sizeof(int) * max_vf_num;
>
>> 	struct list_head child_list;
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 5738d31..b3e7909 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -1168,7 +1168,7 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
>> 	pdn = pci_get_pdn(pdev);
>>
>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>-		for (j = 0; j < M64_PER_IOV; j++) {
>>+		for (j = 0; j < MAX_M64_WINDOW; j++) {
>> 			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
>> 				continue;
>> 			opal_pci_phb_mmio_enable(phb->opal_id,
>>@@ -1193,8 +1193,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>> 	int                    total_vfs;
>> 	resource_size_t        size, start;
>> 	int                    pe_num;
>>-	int                    vf_groups;
>>-	int                    vf_per_group;
>>+	int                    m64s;
>
>"m64s" could have better name. For example, "vfs_per_m64_bar"...
>

m64s is used to represent the number of M64 BARs necessary to enable num_vfs.
vfs_per_m64_bar may be misleading.

How about "m64_bars"?

>>
>> 	bus = pdev->bus;
>> 	hose = pci_bus_to_host(bus);
>>@@ -1204,17 +1203,13 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>
>> 	/* Initialize the m64_wins to IODA_INVALID_M64 */
>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>-		for (j = 0; j < M64_PER_IOV; j++)
>>+		for (j = 0; j < MAX_M64_WINDOW; j++)
>> 			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>
>>-	if (pdn->m64_per_iov == M64_PER_IOV) {
>>-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>>-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>>-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>-	} else {
>>-		vf_groups = 1;
>>-		vf_per_group = 1;
>>-	}
>>+	if (pdn->vfs_expanded != phb->ioda.total_pe)
>>+		m64s = num_vfs;
>>+	else
>>+		m64s = 1;
>
>The condition (pdn->vfs_expanded != phb->ioda.total_pe) isn't precise enough as
>explained below.
>
>>
>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>@@ -1224,7 +1219,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>> 		if (!pnv_pci_is_mem_pref_64(res->flags))
>> 			continue;
>>
>>-		for (j = 0; j < vf_groups; j++) {
>>+		for (j = 0; j < m64s; j++) {
>> 			do {
>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>> 						phb->ioda.m64_bar_idx + 1, 0);
>>@@ -1235,10 +1230,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>
>> 			pdn->m64_wins[i][j] = win;
>>
>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
>> 				size = pci_iov_resource_size(pdev,
>> 							PCI_IOV_RESOURCES + i);
>>-				size = size * vf_per_group;
>> 				start = res->start + size * j;
>> 			} else {
>> 				size = resource_size(res);
>>@@ -1246,7 +1240,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>> 			}
>>
>> 			/* Map the M64 here */
>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
>> 				pe_num = pdn->offset + j;
>> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>> 						pe_num, OPAL_M64_WINDOW_TYPE,
>>@@ -1267,7 +1261,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>> 				goto m64_failed;
>> 			}
>>
>>-			if (pdn->m64_per_iov == M64_PER_IOV)
>>+			if (pdn->vfs_expanded != phb->ioda.total_pe)
>> 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>> 				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
>> 			else
>>@@ -1311,15 +1305,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>> }
>>
>>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>> {
>> 	struct pci_bus        *bus;
>> 	struct pci_controller *hose;
>> 	struct pnv_phb        *phb;
>> 	struct pnv_ioda_pe    *pe, *pe_n;
>> 	struct pci_dn         *pdn;
>>-	u16                    vf_index;
>>-	int64_t                rc;
>>
>> 	bus = pdev->bus;
>> 	hose = pci_bus_to_host(bus);
>>@@ -1329,35 +1321,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>> 	if (!pdev->is_physfn)
>> 		return;
>>
>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>-		int   vf_group;
>>-		int   vf_per_group;
>>-		int   vf_index1;
>>-
>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>-
>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>>-			for (vf_index = vf_group * vf_per_group;
>>-				vf_index < (vf_group + 1) * vf_per_group &&
>>-				vf_index < num_vfs;
>>-				vf_index++)
>>-				for (vf_index1 = vf_group * vf_per_group;
>>-					vf_index1 < (vf_group + 1) * vf_per_group &&
>>-					vf_index1 < num_vfs;
>>-					vf_index1++){
>>-
>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>-						pdn->offset + vf_index,
>>-						pdn->offset + vf_index1,
>>-						OPAL_REMOVE_PE_FROM_DOMAIN);
>>-
>>-					if (rc)
>>-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>>-						__func__,
>>-						pdn->offset + vf_index1, rc);
>>-				}
>>-	}
>>-
>> 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>> 		if (pe->parent_dev != pdev)
>> 			continue;
>>@@ -1392,10 +1355,10 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>> 	num_vfs = pdn->num_vfs;
>>
>> 	/* Release VF PEs */
>>-	pnv_ioda_release_vf_PE(pdev, num_vfs);
>>+	pnv_ioda_release_vf_PE(pdev);
>>
>> 	if (phb->type == PNV_PHB_IODA2) {
>>-		if (pdn->m64_per_iov == 1)
>>+		if (pdn->vfs_expanded == phb->ioda.total_pe)
>> 			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>
>> 		/* Release M64 windows */
>>@@ -1418,7 +1381,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>> 	int                    pe_num;
>> 	u16                    vf_index;
>> 	struct pci_dn         *pdn;
>>-	int64_t                rc;
>>
>> 	bus = pdev->bus;
>> 	hose = pci_bus_to_host(bus);
>>@@ -1463,37 +1425,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>
>> 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
>> 	}
>>-
>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>-		int   vf_group;
>>-		int   vf_per_group;
>>-		int   vf_index1;
>>-
>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>-
>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>>-			for (vf_index = vf_group * vf_per_group;
>>-			     vf_index < (vf_group + 1) * vf_per_group &&
>>-			     vf_index < num_vfs;
>>-			     vf_index++) {
>>-				for (vf_index1 = vf_group * vf_per_group;
>>-				     vf_index1 < (vf_group + 1) * vf_per_group &&
>>-				     vf_index1 < num_vfs;
>>-				     vf_index1++) {
>>-
>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>-						pdn->offset + vf_index,
>>-						pdn->offset + vf_index1,
>>-						OPAL_ADD_PE_TO_DOMAIN);
>>-
>>-					if (rc)
>>-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>>-						__func__,
>>-						pdn->offset + vf_index1, rc);
>>-				}
>>-			}
>>-		}
>>-	}
>> }
>>
>> int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>@@ -1537,7 +1468,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 		 * the IOV BAR according to the PE# allocated to the VFs.
>> 		 * Otherwise, the PE# for the VF will conflict with others.
>> 		 */
>>-		if (pdn->m64_per_iov == 1) {
>>+		if (pdn->vfs_expanded == phb->ioda.total_pe) {
>
>This condition isn't precise enough. When PF occasionally supports 256 VFs
>and the summed size of all IOV BARs (explained below) exceeds 64MB, we're
>expecting to use singole-pe-mode M64 BARs, not shared-mode.
>

Yes, you are right. The vfs_expanded is not reliable.

>> 			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>> 			if (ret)
>> 				goto m64_failed;
>>@@ -1570,8 +1501,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 	/* Allocate PCI data */
>> 	add_dev_pci_data(pdev);
>>
>>-	pnv_pci_sriov_enable(pdev, num_vfs);
>>-	return 0;
>>+	return pnv_pci_sriov_enable(pdev, num_vfs);
>> }
>> #endif /* CONFIG_PCI_IOV */
>>
>>@@ -2766,7 +2696,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 	pdn->vfs_expanded = 0;
>>
>> 	total_vfs = pci_sriov_get_totalvfs(pdev);
>>-	pdn->m64_per_iov = 1;
>> 	mul = phb->ioda.total_pe;
>>
>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>@@ -2785,7 +2714,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 		if (size > (1 << 26)) {
>
>Actually, the condition isn't precise enough. In theory, every PF can have 6 IOV BARs.
>If all of their size are 64MB, we will have 256 extended VFs. The total MMIO size needed
>is: 96GB = (6 * 64MB * 256), which exceeds 64GB. The original idea would be to have
>the scheme other than extending to 256 VFs when the sum of all IOV BARs is bigger
>than 64MB, not single M64 BAR. It's different issue and you can fix it up in another
>patch if you want.
>

I didn't get your point here.

You mean it is necessary to check the sum of the IOV BARs instead of a single one?

>> 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>> 				 i, res);
>>-			pdn->m64_per_iov = M64_PER_IOV;
>> 			mul = roundup_pow_of_two(total_vfs);
>> 			break;
>> 		}
>>-- 
>>1.7.9.5
>>

-- 
Richard Yang
Help you, Help me


* Re: [PATCH] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-07-30  5:43   ` Wei Yang
@ 2015-07-31  0:13     ` Gavin Shan
  2015-07-31  2:01       ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-07-31  0:13 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Thu, Jul 30, 2015 at 01:43:59PM +0800, Wei Yang wrote:
>On Thu, Jul 30, 2015 at 11:15:01AM +1000, Gavin Shan wrote:
>>On Wed, Jul 29, 2015 at 03:22:07PM +0800, Wei Yang wrote:
>>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>>By doing so, several VFs would be in one VF Group and leads to interference
>>>between VFs in the same group.
>>>
>>>This patch changes the design by using one M64 BAR in Single PE mode for
>>>one VF BAR. This gives absolute isolation for VFs.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>---
>>> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>> arch/powerpc/platforms/powernv/pci-ioda.c |  104 +++++------------------------
>>> 2 files changed, 18 insertions(+), 91 deletions(-)
>>>
>>
>>questions regarding this:
>>
>>(1) When M64 BAR is running in single-PE-mode for VFs, the alignment for one
>>    particular IOV BAR still have to be (IOV_BAR_size * max_vf_number), or
>>    M64 segment size of last BAR (0x10000000) is fine? If the later one is fine,
>>    more M64 space would be saved. On the other hand, if the IOV BAR size
>>    (for all VFs) is less than 256MB, will the allocated resource conflict
>>    with the M64 segments in last BAR?
>
>Not need to be IOV BAR size aligned, be individual VF BAR size aligned is fine.
>
>IOV BAR size = VF BAR size * expended_num_vfs
>

The last (15th) PHB M64 BAR is divided into 256 segments and the size of each
of them is 256MB. Let's have an example: the PF has one M64 BAR (128MB) and it
supports 8 VFs. The VF BAR size is 128MB and the IOV BAR size is (128MB * 8).
If we take the VF BAR size (128MB) as the alignment, the MMIO might be assigned
to have the following layout, but the PF and the VFs have to be put into different
PE#s. So I think the correct alignment would be max{VF_bar_size, M64_segment_size},
or did I miss something?

   +---------------+----------------------------+
   |  PF's M64 BAR |     VF BARs                |
   +---------------+----------------------------+
   0               128MB                     (128MB *9)
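
In code, the max{} rule above might look roughly like the fragment below in the
IOV resource alignment hook ("single_pe_mode" is a placeholder for whatever final
condition selects the mode, and I'm assuming the segment size is reachable as
phb->ioda.m64_segsize):

	resource_size_t align = pci_iov_resource_size(pdev, resno);

	if (single_pe_mode)
		/* VF BAR size alignment, but never below one M64 segment */
		return max(align, (resource_size_t)phb->ioda.m64_segsize);

	/* shared mode keeps the current (vfs_expanded * VF BAR size) alignment */
	return pdn->vfs_expanded * align;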

>>(2) When M64 BAR is in single-PE-mode, the PE numbers allocated for VFs need
>>    continuous or not.
>
>No, not need.
>

Ok. If you like, you can improve it to use discrete PE numbers when the PHB's
M64 BARs for VFs run in single-PE mode, in a separate patch.
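
Something like the sketch below, where pnv_ioda_alloc_pe() is the existing PE
allocator and the "pe_num_map" array plus the "release_pes" label are made up
for illustration:

	/* give each VF its own, possibly non-contiguous, PE number instead of
	 * one contiguous block starting at pdn->offset */
	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
		pe_num = pnv_ioda_alloc_pe(phb);
		if (pe_num == IODA_INVALID_PE)
			goto release_pes;
		pdn->pe_num_map[vf_index] = pe_num;
	}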

>>(3) Each PF could have 6 IOV BARs and there're 15 available M64 BAR. It means
>>    only two VFs can be enabled in the extreme case. Would it be a problem?
>>
>
>Yes, you are right.
>
>Based on Alexey's mail, full isolation is more important than more VFs.
>

Ok. Let's ignore this issue for now. Maybe it has to be considered in the future.
Here's another problem:

(4) In pnv_pci_sriov_enable(), we can bail out early when num_vfs >= the number
    of available PHB M64 BARs (see the sketch below); there is no need to allocate
    PE numbers and PHB M64 BARs, hit a failure, and then release the allocated
    resources.
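
A rough sketch of such an early check ("avail" is just a local here, and the real
check may need to be stricter since the number of windows consumed also depends
on how many IOV BARs the PF has):

	long avail = phb->ioda.m64_bar_idx + 1 -
		     bitmap_weight(&phb->ioda.m64_bar_alloc,
				   phb->ioda.m64_bar_idx + 1);

	/* single-PE mode needs at least one M64 BAR per VF, so give up before
	 * allocating PE numbers rather than rolling back afterwards */
	if (num_vfs > avail)
		return -EBUSY;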

>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>index 712add5..1997e5d 100644
>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>@@ -214,10 +214,9 @@ struct pci_dn {
>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>> 	int     offset;			/* PE# for the first VF PE */
>>>-#define M64_PER_IOV 4
>>>-	int     m64_per_iov;
>>>+#define MAX_M64_WINDOW  16
>>> #define IODA_INVALID_M64        (-1)
>>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>>+	int     m64_wins[PCI_SRIOV_NUM_BARS][MAX_M64_WINDOW];
>>> #endif /* CONFIG_PCI_IOV */
>>> #endif
>>
>>The "m64_wins" would be renamed to "m64_map". Also, it would have dynamic size:
>>
>>- When the IOV BAR is extended to 256 segments, its size is sizeof(int) * PCI_SRIOV_NUM_BARS;
>>- When the IOV BAR is extended to max_vf_num, its size is sizeof(int) * max_vf_num;
>>
>>> 	struct list_head child_list;
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index 5738d31..b3e7909 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -1168,7 +1168,7 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
>>> 	pdn = pci_get_pdn(pdev);
>>>
>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>-		for (j = 0; j < M64_PER_IOV; j++) {
>>>+		for (j = 0; j < MAX_M64_WINDOW; j++) {
>>> 			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
>>> 				continue;
>>> 			opal_pci_phb_mmio_enable(phb->opal_id,
>>>@@ -1193,8 +1193,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>> 	int                    total_vfs;
>>> 	resource_size_t        size, start;
>>> 	int                    pe_num;
>>>-	int                    vf_groups;
>>>-	int                    vf_per_group;
>>>+	int                    m64s;
>>
>>"m64s" could have better name. For example, "vfs_per_m64_bar"...
>>
>
>m64s is used to represent number of M64 BARs necessary to enable num_vfs.
>vfs_per_m64_bar may be misleading.
>
>How about "m64_bars" ?
>

Actually, "m64s" represents the number of PHB's M64 BARs required for the
number of VF BARs, not "enabled num_vfs", isn't it? Yes, "m64_bars_per_iov_bar"
or "m64_bars" are better.

>>>
>>> 	bus = pdev->bus;
>>> 	hose = pci_bus_to_host(bus);
>>>@@ -1204,17 +1203,13 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>
>>> 	/* Initialize the m64_wins to IODA_INVALID_M64 */
>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>-		for (j = 0; j < M64_PER_IOV; j++)
>>>+		for (j = 0; j < MAX_M64_WINDOW; j++)
>>> 			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>>
>>>-	if (pdn->m64_per_iov == M64_PER_IOV) {
>>>-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>>>-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>>>-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>-	} else {
>>>-		vf_groups = 1;
>>>-		vf_per_group = 1;
>>>-	}
>>>+	if (pdn->vfs_expanded != phb->ioda.total_pe)
>>>+		m64s = num_vfs;
>>>+	else
>>>+		m64s = 1;
>>
>>The condition (pdn->vfs_expanded != phb->ioda.total_pe) isn't precise enough as
>>explained below.
>>
>>>
>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>@@ -1224,7 +1219,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>> 		if (!pnv_pci_is_mem_pref_64(res->flags))
>>> 			continue;
>>>
>>>-		for (j = 0; j < vf_groups; j++) {
>>>+		for (j = 0; j < m64s; j++) {
>>> 			do {
>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>> 						phb->ioda.m64_bar_idx + 1, 0);
>>>@@ -1235,10 +1230,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>
>>> 			pdn->m64_wins[i][j] = win;
>>>
>>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
>>> 				size = pci_iov_resource_size(pdev,
>>> 							PCI_IOV_RESOURCES + i);
>>>-				size = size * vf_per_group;
>>> 				start = res->start + size * j;
>>> 			} else {
>>> 				size = resource_size(res);
>>>@@ -1246,7 +1240,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>> 			}
>>>
>>> 			/* Map the M64 here */
>>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
>>> 				pe_num = pdn->offset + j;
>>> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>> 						pe_num, OPAL_M64_WINDOW_TYPE,
>>>@@ -1267,7 +1261,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>> 				goto m64_failed;
>>> 			}
>>>
>>>-			if (pdn->m64_per_iov == M64_PER_IOV)
>>>+			if (pdn->vfs_expanded != phb->ioda.total_pe)
>>> 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>> 				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
>>> 			else
>>>@@ -1311,15 +1305,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>>> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>> }
>>>
>>>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>> {
>>> 	struct pci_bus        *bus;
>>> 	struct pci_controller *hose;
>>> 	struct pnv_phb        *phb;
>>> 	struct pnv_ioda_pe    *pe, *pe_n;
>>> 	struct pci_dn         *pdn;
>>>-	u16                    vf_index;
>>>-	int64_t                rc;
>>>
>>> 	bus = pdev->bus;
>>> 	hose = pci_bus_to_host(bus);
>>>@@ -1329,35 +1321,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>> 	if (!pdev->is_physfn)
>>> 		return;
>>>
>>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>>-		int   vf_group;
>>>-		int   vf_per_group;
>>>-		int   vf_index1;
>>>-
>>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>-
>>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>>>-			for (vf_index = vf_group * vf_per_group;
>>>-				vf_index < (vf_group + 1) * vf_per_group &&
>>>-				vf_index < num_vfs;
>>>-				vf_index++)
>>>-				for (vf_index1 = vf_group * vf_per_group;
>>>-					vf_index1 < (vf_group + 1) * vf_per_group &&
>>>-					vf_index1 < num_vfs;
>>>-					vf_index1++){
>>>-
>>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>>-						pdn->offset + vf_index,
>>>-						pdn->offset + vf_index1,
>>>-						OPAL_REMOVE_PE_FROM_DOMAIN);
>>>-
>>>-					if (rc)
>>>-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>>>-						__func__,
>>>-						pdn->offset + vf_index1, rc);
>>>-				}
>>>-	}
>>>-
>>> 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>>> 		if (pe->parent_dev != pdev)
>>> 			continue;
>>>@@ -1392,10 +1355,10 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>> 	num_vfs = pdn->num_vfs;
>>>
>>> 	/* Release VF PEs */
>>>-	pnv_ioda_release_vf_PE(pdev, num_vfs);
>>>+	pnv_ioda_release_vf_PE(pdev);
>>>
>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>-		if (pdn->m64_per_iov == 1)
>>>+		if (pdn->vfs_expanded == phb->ioda.total_pe)
>>> 			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>>
>>> 		/* Release M64 windows */
>>>@@ -1418,7 +1381,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>> 	int                    pe_num;
>>> 	u16                    vf_index;
>>> 	struct pci_dn         *pdn;
>>>-	int64_t                rc;
>>>
>>> 	bus = pdev->bus;
>>> 	hose = pci_bus_to_host(bus);
>>>@@ -1463,37 +1425,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>
>>> 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
>>> 	}
>>>-
>>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>>-		int   vf_group;
>>>-		int   vf_per_group;
>>>-		int   vf_index1;
>>>-
>>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>-
>>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>>>-			for (vf_index = vf_group * vf_per_group;
>>>-			     vf_index < (vf_group + 1) * vf_per_group &&
>>>-			     vf_index < num_vfs;
>>>-			     vf_index++) {
>>>-				for (vf_index1 = vf_group * vf_per_group;
>>>-				     vf_index1 < (vf_group + 1) * vf_per_group &&
>>>-				     vf_index1 < num_vfs;
>>>-				     vf_index1++) {
>>>-
>>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>>-						pdn->offset + vf_index,
>>>-						pdn->offset + vf_index1,
>>>-						OPAL_ADD_PE_TO_DOMAIN);
>>>-
>>>-					if (rc)
>>>-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>>>-						__func__,
>>>-						pdn->offset + vf_index1, rc);
>>>-				}
>>>-			}
>>>-		}
>>>-	}
>>> }
>>>
>>> int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>@@ -1537,7 +1468,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> 		 * the IOV BAR according to the PE# allocated to the VFs.
>>> 		 * Otherwise, the PE# for the VF will conflict with others.
>>> 		 */
>>>-		if (pdn->m64_per_iov == 1) {
>>>+		if (pdn->vfs_expanded == phb->ioda.total_pe) {
>>
>>This condition isn't precise enough. When PF occasionally supports 256 VFs
>>and the summed size of all IOV BARs (explained below) exceeds 64MB, we're
>>expecting to use singole-pe-mode M64 BARs, not shared-mode.
>>
>
>Yes, you are right. The vfs_expanded is not reliable.
>
>>> 			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>>> 			if (ret)
>>> 				goto m64_failed;
>>>@@ -1570,8 +1501,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> 	/* Allocate PCI data */
>>> 	add_dev_pci_data(pdev);
>>>
>>>-	pnv_pci_sriov_enable(pdev, num_vfs);
>>>-	return 0;
>>>+	return pnv_pci_sriov_enable(pdev, num_vfs);
>>> }
>>> #endif /* CONFIG_PCI_IOV */
>>>
>>>@@ -2766,7 +2696,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> 	pdn->vfs_expanded = 0;
>>>
>>> 	total_vfs = pci_sriov_get_totalvfs(pdev);
>>>-	pdn->m64_per_iov = 1;
>>> 	mul = phb->ioda.total_pe;
>>>
>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>>@@ -2785,7 +2714,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> 		if (size > (1 << 26)) {
>>
>>Actually, the condition isn't precise enough. In theory, every PF can have 6 IOV BARs.
>>If all of their size are 64MB, we will have 256 extended VFs. The total MMIO size needed
>>is: 96GB = (6 * 64MB * 256), which exceeds 64GB. The original idea would be to have
>>the scheme other than extending to 256 VFs when the sum of all IOV BARs is bigger
>>than 64MB, not single M64 BAR. It's different issue and you can fix it up in another
>>patch if you want.
>>
>
>I didn't get your point here.
>
>You mean it is necessary to check the sum of IOV BAR instead of a single one?
>

I mean to check the sum of all VF BARs. For example, the VFs attached to a PF have two
VF BARs and each of them is 64MB. For this case, the MMIO resource can't be allocated
once they are extended to 256 VFs. So we have to try "single-pe-mode" for this situation.
So the check becomes something like below:

	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
	struct pnv_phb *phb = hose->private_data;
	resource_size_t total_vf_bar_sz = 0;
	resource_size_t gate;

	/* some comment here to explain how the "gate" was chosen */
	gate = phb->ioda.m64_segsize / 2;
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
		total_vf_bar_sz += pci_iov_resource_size(pdev,
						PCI_IOV_RESOURCES + i);

	if (total_vf_bar_sz >= gate)
		mul = roundup_pow_of_two(total_vfs);	/* single-PE mode */
	else
		mul = phb->ioda.total_pe;		/* shared mode */

Also, the gate value (1 << 26) should be a variable that depends on the PHB's M64 capacity.

>>> 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>>> 				 i, res);
>>>-			pdn->m64_per_iov = M64_PER_IOV;
>>> 			mul = roundup_pow_of_two(total_vfs);
>>> 			break;
>>> 		}
>>>-- 
>>>1.7.9.5
>>>
>
>-- 
>Richard Yang
>Help you, Help me


* Re: [PATCH] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-07-31  0:13     ` Gavin Shan
@ 2015-07-31  2:01       ` Wei Yang
  2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-07-31  2:01 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Fri, Jul 31, 2015 at 10:13:26AM +1000, Gavin Shan wrote:
>On Thu, Jul 30, 2015 at 01:43:59PM +0800, Wei Yang wrote:
>>On Thu, Jul 30, 2015 at 11:15:01AM +1000, Gavin Shan wrote:
>>>On Wed, Jul 29, 2015 at 03:22:07PM +0800, Wei Yang wrote:
>>>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>>>By doing so, several VFs would be in one VF Group and leads to interference
>>>>between VFs in the same group.
>>>>
>>>>This patch changes the design by using one M64 BAR in Single PE mode for
>>>>one VF BAR. This gives absolute isolation for VFs.
>>>>
>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>---
>>>> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>>> arch/powerpc/platforms/powernv/pci-ioda.c |  104 +++++------------------------
>>>> 2 files changed, 18 insertions(+), 91 deletions(-)
>>>>
>>>
>>>questions regarding this:
>>>
>>>(1) When M64 BAR is running in single-PE-mode for VFs, the alignment for one
>>>    particular IOV BAR still have to be (IOV_BAR_size * max_vf_number), or
>>>    M64 segment size of last BAR (0x10000000) is fine? If the later one is fine,
>>>    more M64 space would be saved. On the other hand, if the IOV BAR size
>>>    (for all VFs) is less than 256MB, will the allocated resource conflict
>>>    with the M64 segments in last BAR?
>>
>>Not need to be IOV BAR size aligned, be individual VF BAR size aligned is fine.
>>
>>IOV BAR size = VF BAR size * expended_num_vfs
>>
>
>The (15th) last PHB's M64 BAR is divided into 256 segments and the size for
>each of them is 256MB. Lets have an example: PF has one M64 BAR (128MB) and it
>supports 8 VFs. The VF BAR size is 128MB and the IOV BAR size is (128MB * 8).
>If we take the VF BAR size (128MB) as the alignment, the MMIO might be assigned
>to have following layout. PF and VF will be put into different PE#. So I think
>the correct alignment would be max{VF_bar_size, M64_segment_size}, or I missed
>something?
>
>   +---------------+----------------------------+
>   |  PF's M64 BAR |     VF BARs                |
>   +---------------+----------------------------+
>   0               128MB                     (128MB *9)
>

Ok, got your point. So the layout should be

   +----------------------------+---------------+
   |     VF BARs                |  PF's M64 BAR |
   +----------------------------+---------------+
   0MB                         (128MB * 8)

>>>(2) When M64 BAR is in single-PE-mode, the PE numbers allocated for VFs need
>>>    continuous or not.
>>
>>No, not need.
>>
>
>Ok. If you like, you can improve it to have discrete PE numbers when the PHB's
>M64 BARs for VFs runs in single-mode in separate patch.
>

Yep, good suggestion.

>>>(3) Each PF could have 6 IOV BARs and there're 15 available M64 BAR. It means
>>>    only two VFs can be enabled in the extreme case. Would it be a problem?
>>>
>>
>>Yes, you are right.
>>
>>Based on Alexey's mail, full isolation is more important than more VFs.
>>
>
>Ok. Lets ignore this issue for now. Maybe it has to be considered in future.
>Here's another problem:
>
>(4) In pnv_pci_sriov_enable(), we can bail early when num_vfs >= phb_avaiable_M64_BARs.
>    no need to allocate PE number and PHB's M64 BARs, then hit failure and release
>    the allocated resources.
>

Yep, good suggestion.

>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>index 712add5..1997e5d 100644
>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>@@ -214,10 +214,9 @@ struct pci_dn {
>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>> 	int     offset;			/* PE# for the first VF PE */
>>>>-#define M64_PER_IOV 4
>>>>-	int     m64_per_iov;
>>>>+#define MAX_M64_WINDOW  16
>>>> #define IODA_INVALID_M64        (-1)
>>>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>>>+	int     m64_wins[PCI_SRIOV_NUM_BARS][MAX_M64_WINDOW];
>>>> #endif /* CONFIG_PCI_IOV */
>>>> #endif
>>>
>>>The "m64_wins" would be renamed to "m64_map". Also, it would have dynamic size:
>>>
>>>- When the IOV BAR is extended to 256 segments, its size is sizeof(int) * PCI_SRIOV_NUM_BARS;
>>>- When the IOV BAR is extended to max_vf_num, its size is sizeof(int) * max_vf_num;
>>>
>>>> 	struct list_head child_list;
>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>index 5738d31..b3e7909 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>@@ -1168,7 +1168,7 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
>>>> 	pdn = pci_get_pdn(pdev);
>>>>
>>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>>-		for (j = 0; j < M64_PER_IOV; j++) {
>>>>+		for (j = 0; j < MAX_M64_WINDOW; j++) {
>>>> 			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
>>>> 				continue;
>>>> 			opal_pci_phb_mmio_enable(phb->opal_id,
>>>>@@ -1193,8 +1193,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>> 	int                    total_vfs;
>>>> 	resource_size_t        size, start;
>>>> 	int                    pe_num;
>>>>-	int                    vf_groups;
>>>>-	int                    vf_per_group;
>>>>+	int                    m64s;
>>>
>>>"m64s" could have better name. For example, "vfs_per_m64_bar"...
>>>
>>
>>m64s is used to represent number of M64 BARs necessary to enable num_vfs.
>>vfs_per_m64_bar may be misleading.
>>
>>How about "m64_bars" ?
>>
>
>Actually, "m64s" represents the number of PHB's M64 BARs required for the
>number of VF BARs, not "enabled num_vfs", isn't it? Yes, "m64_bars_per_iov_bar"
>or "m64_bars" are better.
>
>>>>
>>>> 	bus = pdev->bus;
>>>> 	hose = pci_bus_to_host(bus);
>>>>@@ -1204,17 +1203,13 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>
>>>> 	/* Initialize the m64_wins to IODA_INVALID_M64 */
>>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>>-		for (j = 0; j < M64_PER_IOV; j++)
>>>>+		for (j = 0; j < MAX_M64_WINDOW; j++)
>>>> 			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>>>
>>>>-	if (pdn->m64_per_iov == M64_PER_IOV) {
>>>>-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>>>>-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>>>>-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>>-	} else {
>>>>-		vf_groups = 1;
>>>>-		vf_per_group = 1;
>>>>-	}
>>>>+	if (pdn->vfs_expanded != phb->ioda.total_pe)
>>>>+		m64s = num_vfs;
>>>>+	else
>>>>+		m64s = 1;
>>>
>>>The condition (pdn->vfs_expanded != phb->ioda.total_pe) isn't precise enough as
>>>explained below.
>>>
>>>>
>>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>@@ -1224,7 +1219,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>> 		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>> 			continue;
>>>>
>>>>-		for (j = 0; j < vf_groups; j++) {
>>>>+		for (j = 0; j < m64s; j++) {
>>>> 			do {
>>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>> 						phb->ioda.m64_bar_idx + 1, 0);
>>>>@@ -1235,10 +1230,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>
>>>> 			pdn->m64_wins[i][j] = win;
>>>>
>>>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>>>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
>>>> 				size = pci_iov_resource_size(pdev,
>>>> 							PCI_IOV_RESOURCES + i);
>>>>-				size = size * vf_per_group;
>>>> 				start = res->start + size * j;
>>>> 			} else {
>>>> 				size = resource_size(res);
>>>>@@ -1246,7 +1240,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>> 			}
>>>>
>>>> 			/* Map the M64 here */
>>>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>>>+			if (pdn->vfs_expanded != phb->ioda.total_pe) {
>>>> 				pe_num = pdn->offset + j;
>>>> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>>> 						pe_num, OPAL_M64_WINDOW_TYPE,
>>>>@@ -1267,7 +1261,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>> 				goto m64_failed;
>>>> 			}
>>>>
>>>>-			if (pdn->m64_per_iov == M64_PER_IOV)
>>>>+			if (pdn->vfs_expanded != phb->ioda.total_pe)
>>>> 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>>> 				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
>>>> 			else
>>>>@@ -1311,15 +1305,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>>>> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>>> }
>>>>
>>>>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>>> {
>>>> 	struct pci_bus        *bus;
>>>> 	struct pci_controller *hose;
>>>> 	struct pnv_phb        *phb;
>>>> 	struct pnv_ioda_pe    *pe, *pe_n;
>>>> 	struct pci_dn         *pdn;
>>>>-	u16                    vf_index;
>>>>-	int64_t                rc;
>>>>
>>>> 	bus = pdev->bus;
>>>> 	hose = pci_bus_to_host(bus);
>>>>@@ -1329,35 +1321,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>> 	if (!pdev->is_physfn)
>>>> 		return;
>>>>
>>>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>>>-		int   vf_group;
>>>>-		int   vf_per_group;
>>>>-		int   vf_index1;
>>>>-
>>>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>>-
>>>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>>>>-			for (vf_index = vf_group * vf_per_group;
>>>>-				vf_index < (vf_group + 1) * vf_per_group &&
>>>>-				vf_index < num_vfs;
>>>>-				vf_index++)
>>>>-				for (vf_index1 = vf_group * vf_per_group;
>>>>-					vf_index1 < (vf_group + 1) * vf_per_group &&
>>>>-					vf_index1 < num_vfs;
>>>>-					vf_index1++){
>>>>-
>>>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>>>-						pdn->offset + vf_index,
>>>>-						pdn->offset + vf_index1,
>>>>-						OPAL_REMOVE_PE_FROM_DOMAIN);
>>>>-
>>>>-					if (rc)
>>>>-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>>>>-						__func__,
>>>>-						pdn->offset + vf_index1, rc);
>>>>-				}
>>>>-	}
>>>>-
>>>> 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>>>> 		if (pe->parent_dev != pdev)
>>>> 			continue;
>>>>@@ -1392,10 +1355,10 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>>> 	num_vfs = pdn->num_vfs;
>>>>
>>>> 	/* Release VF PEs */
>>>>-	pnv_ioda_release_vf_PE(pdev, num_vfs);
>>>>+	pnv_ioda_release_vf_PE(pdev);
>>>>
>>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>>-		if (pdn->m64_per_iov == 1)
>>>>+		if (pdn->vfs_expanded == phb->ioda.total_pe)
>>>> 			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>>>
>>>> 		/* Release M64 windows */
>>>>@@ -1418,7 +1381,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>> 	int                    pe_num;
>>>> 	u16                    vf_index;
>>>> 	struct pci_dn         *pdn;
>>>>-	int64_t                rc;
>>>>
>>>> 	bus = pdev->bus;
>>>> 	hose = pci_bus_to_host(bus);
>>>>@@ -1463,37 +1425,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>
>>>> 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
>>>> 	}
>>>>-
>>>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>>>-		int   vf_group;
>>>>-		int   vf_per_group;
>>>>-		int   vf_index1;
>>>>-
>>>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>>-
>>>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>>>>-			for (vf_index = vf_group * vf_per_group;
>>>>-			     vf_index < (vf_group + 1) * vf_per_group &&
>>>>-			     vf_index < num_vfs;
>>>>-			     vf_index++) {
>>>>-				for (vf_index1 = vf_group * vf_per_group;
>>>>-				     vf_index1 < (vf_group + 1) * vf_per_group &&
>>>>-				     vf_index1 < num_vfs;
>>>>-				     vf_index1++) {
>>>>-
>>>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>>>-						pdn->offset + vf_index,
>>>>-						pdn->offset + vf_index1,
>>>>-						OPAL_ADD_PE_TO_DOMAIN);
>>>>-
>>>>-					if (rc)
>>>>-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>>>>-						__func__,
>>>>-						pdn->offset + vf_index1, rc);
>>>>-				}
>>>>-			}
>>>>-		}
>>>>-	}
>>>> }
>>>>
>>>> int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>@@ -1537,7 +1468,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>> 		 * the IOV BAR according to the PE# allocated to the VFs.
>>>> 		 * Otherwise, the PE# for the VF will conflict with others.
>>>> 		 */
>>>>-		if (pdn->m64_per_iov == 1) {
>>>>+		if (pdn->vfs_expanded == phb->ioda.total_pe) {
>>>
>>>This condition isn't precise enough. When PF occasionally supports 256 VFs
>>>and the summed size of all IOV BARs (explained below) exceeds 64MB, we're
>>>expecting to use singole-pe-mode M64 BARs, not shared-mode.
>>>
>>
>>Yes, you are right. The vfs_expanded is not reliable.
>>
>>>> 			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>>>> 			if (ret)
>>>> 				goto m64_failed;
>>>>@@ -1570,8 +1501,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>> 	/* Allocate PCI data */
>>>> 	add_dev_pci_data(pdev);
>>>>
>>>>-	pnv_pci_sriov_enable(pdev, num_vfs);
>>>>-	return 0;
>>>>+	return pnv_pci_sriov_enable(pdev, num_vfs);
>>>> }
>>>> #endif /* CONFIG_PCI_IOV */
>>>>
>>>>@@ -2766,7 +2696,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 	pdn->vfs_expanded = 0;
>>>>
>>>> 	total_vfs = pci_sriov_get_totalvfs(pdev);
>>>>-	pdn->m64_per_iov = 1;
>>>> 	mul = phb->ioda.total_pe;
>>>>
>>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>>>@@ -2785,7 +2714,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 		if (size > (1 << 26)) {
>>>
>>>Actually, the condition isn't precise enough. In theory, every PF can have 6 IOV BARs.
>>>If all of their size are 64MB, we will have 256 extended VFs. The total MMIO size needed
>>>is: 96GB = (6 * 64MB * 256), which exceeds 64GB. The original idea would be to have
>>>the scheme other than extending to 256 VFs when the sum of all IOV BARs is bigger
>>>than 64MB, not single M64 BAR. It's different issue and you can fix it up in another
>>>patch if you want.
>>>
>>
>>I didn't get your point here.
>>
>>You mean it is necessary to check the sum of IOV BAR instead of a single one?
>>
>
>I mean to check the sum of all VF BARs. For example, the VFs attached to its PF has two
>VF BARs and each of them is 64MB. For this case, the MMIO resource can't be allocated
>once extending them to 256 VFs. So we have to try "single-pe-mode" for this situation.
>So the check becomes as below:
>
>	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>	struct pnv_phb *phb = hose->private_data;
>	resource_size_t total_vf_bar_sz = 0;
>	resource_size_t gate;
>
>	/* Some comments to explain the "gate" */
>	gate = phb->m64_segsize / 2;
>	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>		total_vf_bar_sz += pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i);
>	}
>
>	if (total_vf_bar_sz >= gate)
>		/* single-pe-mode */
>	else
>		/* shared-mode */
>
>Also, the gate value (1 << 26) should be variable depends on the PHB's M64 capacity.
>

Got your point.
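
Roughly, what I have in mind for the next version is something like below
(just a sketch based on your pseudo code; m64_single_mode would be a new
flag in pci_dn, and the exact gate value is not final):

	resource_size_t gate, total_vf_bar_sz = 0;

	/* Sketch: take half of the PHB's M64 segment size as the gate */
	gate = phb->ioda.m64_segsize >> 1;

	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = &pdev->resource[i + PCI_IOV_RESOURCES];
		if (!res->flags || res->parent)
			continue;

		total_vf_bar_sz += pci_iov_resource_size(pdev,
				i + PCI_IOV_RESOURCES);
	}

	if (total_vf_bar_sz >= gate) {
		/* One M64 BAR in Single PE mode covers one VF BAR */
		mul = roundup_pow_of_two(total_vfs);
		pdn->m64_single_mode = true;
	}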

>>>> 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>>>> 				 i, res);
>>>>-			pdn->m64_per_iov = M64_PER_IOV;
>>>> 			mul = roundup_pow_of_two(total_vfs);
>>>> 			break;
>>>> 		}
>>>>-- 
>>>>1.7.9.5
>>>>
>>
>>-- 
>>Richard Yang
>>Help you, Help me

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [PATCH V2 0/6] Redesign SR-IOV on PowerNV
  2015-07-31  2:01       ` Wei Yang
@ 2015-08-05  1:24         ` Wei Yang
  2015-08-05  1:24           ` [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR Wei Yang
                             ` (5 more replies)
  0 siblings, 6 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-05  1:24 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

In the original design, VFs are grouped so that more VFs can be enabled in the
system when a VF BAR is bigger than 64MB. This design has a flaw: an error on
one VF will interfere with other VFs in the same group.

This patch series changes the design by using one M64 BAR in Single PE mode to
cover only one VF BAR. By doing so, it gives absolute isolation between VFs.

v2:
   * clean up iov bar alignment calculation
   * change m64s to m64_bars
   * add a field to indicate that M64 Single PE mode will be used
   * change m64_wins to m64_map
   * calculate the gate instead of hard coded
   * dynamically allocate m64_map
   * dynamically allocate PE#
   * add a case to calculate iov bar alignment when M64 Single PE is used
   * when M64 Single PE is used, first compare num_vfs with the number of
     M64 BARs available in the system

Wei Yang (6):
  powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  powerpc/powernv: simplify the calculation of iov resource
  powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  powerpc/powernv: replace the hard coded boundary with gate
  powerpc/powernv: boundary the total vf bar size instead of the
    individual one
  powerpc/powernv: allocate discrete PE# when using M64 BAR in Single
    PE mode

 arch/powerpc/include/asm/pci-bridge.h     |    7 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  311 +++++++++++++++--------------
 2 files changed, 163 insertions(+), 155 deletions(-)

-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 56+ messages in thread

* [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
@ 2015-08-05  1:24           ` Wei Yang
  2015-08-06  4:35             ` Gavin Shan
  2015-08-05  1:24           ` [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource Wei Yang
                             ` (4 subsequent siblings)
  5 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-05  1:24 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
M64 windwo, which means M64 BAR can't work on it.

This patch makes this explicit.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5738d31..9b41dba 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		if (!res->flags || !res->parent)
 			continue;
 
-		if (!pnv_pci_is_mem_pref_64(res->flags))
-			continue;
-
 		/*
 		 * The actual IOV BAR range is determined by the start address
 		 * and the actual size for num_vfs VFs BAR.  This check is to
@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		if (!res->flags || !res->parent)
 			continue;
 
-		if (!pnv_pci_is_mem_pref_64(res->flags))
-			continue;
-
 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
 		res2 = *res;
 		res->start += size * offset;
@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 		if (!res->flags || !res->parent)
 			continue;
 
-		if (!pnv_pci_is_mem_pref_64(res->flags))
-			continue;
-
 		for (j = 0; j < vf_groups; j++) {
 			do {
 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	pdn = pci_get_pdn(pdev);
 
 	if (phb->type == PNV_PHB_IODA2) {
+		if (!pdn->vfs_expanded) {
+			dev_info(&pdev->dev, "don't support this SRIOV device"
+				" with non M64 VF BAR\n");
+			return -EBUSY;
+		}
+
 		/* Calculate available PE for required VFs */
 		mutex_lock(&phb->ioda.pe_alloc_mutex);
 		pdn->offset = bitmap_find_next_zero_area(
@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		if (!res->flags || res->parent)
 			continue;
 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
+					" non M64 VF BAR%d: %pR. \n",
 				 i, res);
-			continue;
+			return;
 		}
 
 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || res->parent)
 			continue;
-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
-				 i, res);
-			continue;
-		}
 
 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
  2015-08-05  1:24           ` [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR Wei Yang
@ 2015-08-05  1:24           ` Wei Yang
  2015-08-06  4:51             ` Gavin Shan
  2015-08-05  1:25           ` [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR Wei Yang
                             ` (3 subsequent siblings)
  5 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-05  1:24 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

The alignment of IOV BAR on PowerNV platform is the total size of the IOV
BAR. No matter whether the IOV BAR is truncated or not, the total size
could be calculated by (vfs_expanded * VF size).

This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
first case.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9b41dba..7192e62 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 						      int resno)
 {
 	struct pci_dn *pdn = pci_get_pdn(pdev);
-	resource_size_t align, iov_align;
-
-	iov_align = resource_size(&pdev->resource[resno]);
-	if (iov_align)
-		return iov_align;
+	resource_size_t align;
 
+	/*
+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
+	 * SR-IOV. While from hardware perspective, the range mapped by M64
+	 * BAR should be size aligned.
+	 *
+	 * This function return the total IOV BAR size if expanded or just the
+	 * individual size if not.
+	 */
 	align = pci_iov_resource_size(pdev, resno);
 	if (pdn->vfs_expanded)
 		return pdn->vfs_expanded * align;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
  2015-08-05  1:24           ` [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR Wei Yang
  2015-08-05  1:24           ` [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource Wei Yang
@ 2015-08-05  1:25           ` Wei Yang
  2015-08-06  5:20             ` Gavin Shan
  2015-08-06 10:04             ` Alexey Kardashevskiy
  2015-08-05  1:25           ` [PATCH V2 4/6] powerpc/powernv: replace the hard coded boundary with gate Wei Yang
                             ` (2 subsequent siblings)
  5 siblings, 2 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-05  1:25 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
BAR in Single PE mode to cover the number of VFs required to be enabled.
By doing so, several VFs would be in one VF Group and leads to interference
between VFs in the same group.

This patch changes the design by using one M64 BAR in Single PE mode for
one VF BAR. This gives absolute isolation for VFs.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pci-bridge.h     |    5 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
 2 files changed, 76 insertions(+), 109 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 712add5..8aeba4c 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -214,10 +214,9 @@ struct pci_dn {
 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
 	u16     num_vfs;		/* number of VFs enabled*/
 	int     offset;			/* PE# for the first VF PE */
-#define M64_PER_IOV 4
-	int     m64_per_iov;
+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
 #define IODA_INVALID_M64        (-1)
-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
 #endif /* CONFIG_PCI_IOV */
 #endif
 	struct list_head child_list;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7192e62..f5d110c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void)
 }
 
 #ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 {
 	struct pci_bus        *bus;
 	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	int                    i, j;
+	int                    m64_bars;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
 	phb = hose->private_data;
 	pdn = pci_get_pdn(pdev);
 
+	if (pdn->m64_single_mode)
+		m64_bars = num_vfs;
+	else
+		m64_bars = 1;
+
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < M64_PER_IOV; j++) {
-			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
+		for (j = 0; j < m64_bars; j++) {
+			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
 				continue;
 			opal_pci_phb_mmio_enable(phb->opal_id,
-				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
-			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
+			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
+			pdn->m64_map[j][i] = IODA_INVALID_M64;
 		}
 
+	kfree(pdn->m64_map);
 	return 0;
 }
 
@@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	int                    total_vfs;
 	resource_size_t        size, start;
 	int                    pe_num;
-	int                    vf_groups;
-	int                    vf_per_group;
+	int                    m64_bars;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	pdn = pci_get_pdn(pdev);
 	total_vfs = pci_sriov_get_totalvfs(pdev);
 
-	/* Initialize the m64_wins to IODA_INVALID_M64 */
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < M64_PER_IOV; j++)
-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+	if (pdn->m64_single_mode)
+		m64_bars = num_vfs;
+	else
+		m64_bars = 1;
+
+	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
+	if (!pdn->m64_map)
+		return -ENOMEM;
+	/* Initialize the m64_map to IODA_INVALID_M64 */
+	for (i = 0; i < m64_bars ; i++)
+		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
+			pdn->m64_map[i][j] = IODA_INVALID_M64;
 
-	if (pdn->m64_per_iov == M64_PER_IOV) {
-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-	} else {
-		vf_groups = 1;
-		vf_per_group = 1;
-	}
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || !res->parent)
 			continue;
 
-		for (j = 0; j < vf_groups; j++) {
+		for (j = 0; j < m64_bars; j++) {
 			do {
 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
 						phb->ioda.m64_bar_idx + 1, 0);
@@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 					goto m64_failed;
 			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
 
-			pdn->m64_wins[i][j] = win;
+			pdn->m64_map[j][i] = win;
 
-			if (pdn->m64_per_iov == M64_PER_IOV) {
+			if (pdn->m64_single_mode) {
 				size = pci_iov_resource_size(pdev,
 							PCI_IOV_RESOURCES + i);
-				size = size * vf_per_group;
 				start = res->start + size * j;
 			} else {
 				size = resource_size(res);
@@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 			}
 
 			/* Map the M64 here */
-			if (pdn->m64_per_iov == M64_PER_IOV) {
+			if (pdn->m64_single_mode) {
 				pe_num = pdn->offset + j;
 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
 						pe_num, OPAL_M64_WINDOW_TYPE,
-						pdn->m64_wins[i][j], 0);
+						pdn->m64_map[j][i], 0);
 			}
 
 			rc = opal_pci_set_phb_mem_window(phb->opal_id,
 						 OPAL_M64_WINDOW_TYPE,
-						 pdn->m64_wins[i][j],
+						 pdn->m64_map[j][i],
 						 start,
 						 0, /* unused */
 						 size);
@@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 				goto m64_failed;
 			}
 
-			if (pdn->m64_per_iov == M64_PER_IOV)
+			if (pdn->m64_single_mode)
 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
 			else
 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
 
 			if (rc != OPAL_SUCCESS) {
 				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
@@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	return 0;
 
 m64_failed:
-	pnv_pci_vf_release_m64(pdev);
+	pnv_pci_vf_release_m64(pdev, num_vfs);
 	return -EBUSY;
 }
 
@@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 }
 
-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 {
 	struct pci_bus        *bus;
 	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pnv_ioda_pe    *pe, *pe_n;
 	struct pci_dn         *pdn;
-	u16                    vf_index;
-	int64_t                rc;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 	if (!pdev->is_physfn)
 		return;
 
-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
-		int   vf_group;
-		int   vf_per_group;
-		int   vf_index1;
-
-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-
-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
-			for (vf_index = vf_group * vf_per_group;
-				vf_index < (vf_group + 1) * vf_per_group &&
-				vf_index < num_vfs;
-				vf_index++)
-				for (vf_index1 = vf_group * vf_per_group;
-					vf_index1 < (vf_group + 1) * vf_per_group &&
-					vf_index1 < num_vfs;
-					vf_index1++){
-
-					rc = opal_pci_set_peltv(phb->opal_id,
-						pdn->offset + vf_index,
-						pdn->offset + vf_index1,
-						OPAL_REMOVE_PE_FROM_DOMAIN);
-
-					if (rc)
-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
-						__func__,
-						pdn->offset + vf_index1, rc);
-				}
-	}
-
 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
 		if (pe->parent_dev != pdev)
 			continue;
@@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 	num_vfs = pdn->num_vfs;
 
 	/* Release VF PEs */
-	pnv_ioda_release_vf_PE(pdev, num_vfs);
+	pnv_ioda_release_vf_PE(pdev);
 
 	if (phb->type == PNV_PHB_IODA2) {
-		if (pdn->m64_per_iov == 1)
+		if (!pdn->m64_single_mode)
 			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
 
 		/* Release M64 windows */
-		pnv_pci_vf_release_m64(pdev);
+		pnv_pci_vf_release_m64(pdev, num_vfs);
 
 		/* Release PE numbers */
 		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
@@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 	int                    pe_num;
 	u16                    vf_index;
 	struct pci_dn         *pdn;
-	int64_t                rc;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
 	}
-
-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
-		int   vf_group;
-		int   vf_per_group;
-		int   vf_index1;
-
-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-
-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
-			for (vf_index = vf_group * vf_per_group;
-			     vf_index < (vf_group + 1) * vf_per_group &&
-			     vf_index < num_vfs;
-			     vf_index++) {
-				for (vf_index1 = vf_group * vf_per_group;
-				     vf_index1 < (vf_group + 1) * vf_per_group &&
-				     vf_index1 < num_vfs;
-				     vf_index1++) {
-
-					rc = opal_pci_set_peltv(phb->opal_id,
-						pdn->offset + vf_index,
-						pdn->offset + vf_index1,
-						OPAL_ADD_PE_TO_DOMAIN);
-
-					if (rc)
-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
-						__func__,
-						pdn->offset + vf_index1, rc);
-				}
-			}
-		}
-	}
 }
 
 int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
@@ -1507,6 +1449,18 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 			return -EBUSY;
 		}
 
+		/*
+		 * On PNV_PHB_IODA2, We just have 16 M64 BARs and M64 BAR #15
+		 * is used to cover the whole system, which leaves only 15 M64
+		 * BAR usable for VFs.
+		 * When M64 BAR functions in Single PE mode, this means it
+		 * just could enable 15 VFs.
+		 */
+		if (pdn->m64_single_mode && num_vfs >= 16) {
+			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
+			return -EBUSY;
+		}
+
 		/* Calculate available PE for required VFs */
 		mutex_lock(&phb->ioda.pe_alloc_mutex);
 		pdn->offset = bitmap_find_next_zero_area(
@@ -1534,7 +1488,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 		 * the IOV BAR according to the PE# allocated to the VFs.
 		 * Otherwise, the PE# for the VF will conflict with others.
 		 */
-		if (pdn->m64_per_iov == 1) {
+		if (!pdn->m64_single_mode) {
 			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
 			if (ret)
 				goto m64_failed;
@@ -1567,8 +1521,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	/* Allocate PCI data */
 	add_dev_pci_data(pdev);
 
-	pnv_pci_sriov_enable(pdev, num_vfs);
-	return 0;
+	return pnv_pci_sriov_enable(pdev, num_vfs);
 }
 #endif /* CONFIG_PCI_IOV */
 
@@ -2761,9 +2714,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 
 	pdn = pci_get_pdn(pdev);
 	pdn->vfs_expanded = 0;
+	pdn->m64_single_mode = false;
 
 	total_vfs = pci_sriov_get_totalvfs(pdev);
-	pdn->m64_per_iov = 1;
 	mul = phb->ioda.total_pe;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -2783,8 +2736,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		if (size > (1 << 26)) {
 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
 				 i, res);
-			pdn->m64_per_iov = M64_PER_IOV;
 			mul = roundup_pow_of_two(total_vfs);
+			pdn->m64_single_mode = true;
 			break;
 		}
 	}
@@ -2986,6 +2939,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 						      int resno)
 {
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	resource_size_t align;
 
@@ -2994,12 +2949,25 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 	 * SR-IOV. While from hardware perspective, the range mapped by M64
 	 * BAR should be size aligned.
 	 *
+	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the hardware
+	 * restriction to alignment is gone. But if just use the VF BAR size
+	 * as the alignment, PF BAR / VF BAR may be allocated with in one M64
+	 * segment, which introduces the PE conflict between PF and VF. Based
+	 * on this the minimum alignment of an IOV BAR is m64_segsize.
+	 *
 	 * This function return the total IOV BAR size if expanded or just the
-	 * individual size if not.
+	 * individual size if not, when M64 BAR is in Shared PE mode.
+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
+	 * m64_size if IOV BAR size is less.
 	 */
 	align = pci_iov_resource_size(pdev, resno);
-	if (pdn->vfs_expanded)
-		return pdn->vfs_expanded * align;
+	if (pdn->vfs_expanded) {
+		if (pdn->m64_single_mode)
+			return max(align,
+				(resource_size_t)phb->ioda.m64_segsize);
+		else
+			return pdn->vfs_expanded * align;
+	}
 
 	return align;
 }
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH V2 4/6] powerpc/powernv: replace the hard coded boundary with gate
  2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
                             ` (2 preceding siblings ...)
  2015-08-05  1:25           ` [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR Wei Yang
@ 2015-08-05  1:25           ` Wei Yang
  2015-08-06  5:26             ` Gavin Shan
  2015-08-05  1:25           ` [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one Wei Yang
  2015-08-05  1:25           ` [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode Wei Yang
  5 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-05  1:25 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

Based on the limitation of the M64 window size, when the VF BAR size is bigger
than 64MB, the IOV BAR is just rounded up to a power of two of total_vfs. The
64MB here is a magic boundary in the code, which is hard to maintain.

This patch replaces the hard-coded boundary with a gate, which is calculated
from m64_segsize, and adds a comment to explain the reason for it.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c |   22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f5d110c..31dcedc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 	struct pnv_phb *phb;
 	struct resource *res;
 	int i;
-	resource_size_t size;
+	resource_size_t size, gate;
 	struct pci_dn *pdn;
 	int mul, total_vfs;
 
@@ -2718,6 +2718,17 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 
 	total_vfs = pci_sriov_get_totalvfs(pdev);
 	mul = phb->ioda.total_pe;
+	/*
+	 * If bigger than or equal to half of m64_segsize, just round up power
+	 * of two.
+	 *
+	 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict with
+	 * other devices, IOV BAR size is expanded to be (total_pe * VF size).
+	 * When VF size is half of m64_segsize , the expanded size would equal
+	 * to half of the whole M64 Window size, which will exhaust the M64
+	 * Window and limit the system flexibility.
+	 */
+	gate = phb->ioda.m64_segsize >> 1;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
@@ -2732,10 +2743,11 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 
 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 
-		/* bigger than 64M */
-		if (size > (1 << 26)) {
-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
-				 i, res);
+		/* bigger than or equal to gate */
+		if (size >= gate) {
+			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
+				"is bigger than %lld, roundup power2\n",
+				 i, res, gate);
 			mul = roundup_pow_of_two(total_vfs);
 			pdn->m64_single_mode = true;
 			break;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one
  2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
                             ` (3 preceding siblings ...)
  2015-08-05  1:25           ` [PATCH V2 4/6] powerpc/powernv: replace the hard coded boundary with gate Wei Yang
@ 2015-08-05  1:25           ` Wei Yang
  2015-08-06  5:28             ` Gavin Shan
  2015-08-05  1:25           ` [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode Wei Yang
  5 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-05  1:25 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

Each VF can have 6 BARs at most. When the total VF BAR size exceeds the
gate, expanding it will also exhaust the M64 window.

This patch applies the boundary check to the total VF BAR size instead of
the individual BAR size.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c |   13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 31dcedc..4042303 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 	struct pnv_phb *phb;
 	struct resource *res;
 	int i;
-	resource_size_t size, gate;
+	resource_size_t size, gate, total_vf_bar_sz;
 	struct pci_dn *pdn;
 	int mul, total_vfs;
 
@@ -2729,6 +2729,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 	 * Window and limit the system flexibility.
 	 */
 	gate = phb->ioda.m64_segsize >> 1;
+	total_vf_bar_sz = 0;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
@@ -2741,13 +2742,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 			return;
 		}
 
-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+		total_vf_bar_sz += pci_iov_resource_size(pdev,
+				i + PCI_IOV_RESOURCES);
 
 		/* bigger than or equal to gate */
-		if (size >= gate) {
-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
-				"is bigger than %lld, roundup power2\n",
-				 i, res, gate);
+		if (total_vf_bar_sz >= gate) {
+			dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size "
+				"is bigger than %lld, roundup power2\n", gate);
 			mul = roundup_pow_of_two(total_vfs);
 			pdn->m64_single_mode = true;
 			break;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
                             ` (4 preceding siblings ...)
  2015-08-05  1:25           ` [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one Wei Yang
@ 2015-08-05  1:25           ` Wei Yang
  2015-08-06  5:36             ` Gavin Shan
  5 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-05  1:25 UTC (permalink / raw)
  To: aik, gwshan, benh; +Cc: linuxppc-dev, Wei Yang

When the M64 BAR is set to Single PE mode, the PE#s assigned to VFs could be
discrete.

This patch restructures the code to allocate discrete PE#s for VFs when the
M64 BAR is set to Single PE mode.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pci-bridge.h     |    2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 8aeba4c..72415c7 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -213,7 +213,7 @@ struct pci_dn {
 #ifdef CONFIG_PCI_IOV
 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
 	u16     num_vfs;		/* number of VFs enabled*/
-	int     offset;			/* PE# for the first VF PE */
+	int     *offset;		/* PE# for the first VF PE or array */
 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
 #define IODA_INVALID_M64        (-1)
 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 4042303..9953829 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1243,7 +1243,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 
 			/* Map the M64 here */
 			if (pdn->m64_single_mode) {
-				pe_num = pdn->offset + j;
+				pe_num = pdn->offset[j];
 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
 						pe_num, OPAL_M64_WINDOW_TYPE,
 						pdn->m64_map[j][i], 0);
@@ -1347,7 +1347,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	struct pci_sriov      *iov;
-	u16 num_vfs;
+	u16                    num_vfs, i;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1361,14 +1361,18 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 
 	if (phb->type == PNV_PHB_IODA2) {
 		if (!pdn->m64_single_mode)
-			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+			pnv_pci_vf_resource_shift(pdev, -*pdn->offset);
 
 		/* Release M64 windows */
 		pnv_pci_vf_release_m64(pdev, num_vfs);
 
 		/* Release PE numbers */
-		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
-		pdn->offset = 0;
+		if (pdn->m64_single_mode) {
+			for (i = 0; i < num_vfs; i++)
+				pnv_ioda_free_pe(phb, pdn->offset[i]);
+		} else
+			bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
+		kfree(pdn->offset);
 	}
 }
 
@@ -1394,7 +1398,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 	/* Reserve PE for each VF */
 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
-		pe_num = pdn->offset + vf_index;
+		if (pdn->m64_single_mode)
+			pe_num = pdn->offset[vf_index];
+		else
+			pe_num = *pdn->offset + vf_index;
 
 		pe = &phb->ioda.pe_array[pe_num];
 		pe->pe_number = pe_num;
@@ -1436,6 +1443,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	int                    ret;
+	u16                    i;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1462,19 +1470,38 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 		}
 
 		/* Calculate available PE for required VFs */
-		mutex_lock(&phb->ioda.pe_alloc_mutex);
-		pdn->offset = bitmap_find_next_zero_area(
-			phb->ioda.pe_alloc, phb->ioda.total_pe,
-			0, num_vfs, 0);
-		if (pdn->offset >= phb->ioda.total_pe) {
+		if (pdn->m64_single_mode) {
+			pdn->offset = kmalloc(sizeof(*pdn->offset) * num_vfs,
+					GFP_KERNEL);
+			if (!pdn->offset)
+				return -ENOMEM;
+			for (i = 0; i < num_vfs; i++)
+				pdn->offset[i] = IODA_INVALID_PE;
+			for (i = 0; i < num_vfs; i++) {
+				pdn->offset[i] = pnv_ioda_alloc_pe(phb);
+				if (pdn->offset[i] == IODA_INVALID_PE) {
+					ret = -EBUSY;
+					goto m64_failed;
+				}
+			}
+		} else {
+			pdn->offset = kmalloc(sizeof(*pdn->offset), GFP_KERNEL);
+			if (!pdn->offset)
+				return -ENOMEM;
+			mutex_lock(&phb->ioda.pe_alloc_mutex);
+			*pdn->offset = bitmap_find_next_zero_area(
+				phb->ioda.pe_alloc, phb->ioda.total_pe,
+				0, num_vfs, 0);
+			if (*pdn->offset >= phb->ioda.total_pe) {
+				mutex_unlock(&phb->ioda.pe_alloc_mutex);
+				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
+				kfree(pdn->offset);
+				return -EBUSY;
+			}
+			bitmap_set(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
 			mutex_unlock(&phb->ioda.pe_alloc_mutex);
-			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
-			pdn->offset = 0;
-			return -EBUSY;
 		}
-		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 		pdn->num_vfs = num_vfs;
-		mutex_unlock(&phb->ioda.pe_alloc_mutex);
 
 		/* Assign M64 window accordingly */
 		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
@@ -1489,7 +1516,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 		 * Otherwise, the PE# for the VF will conflict with others.
 		 */
 		if (!pdn->m64_single_mode) {
-			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
+			ret = pnv_pci_vf_resource_shift(pdev, *pdn->offset);
 			if (ret)
 				goto m64_failed;
 		}
@@ -1501,8 +1528,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	return 0;
 
 m64_failed:
-	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
-	pdn->offset = 0;
+	if (pdn->m64_single_mode) {
+		for (i = 0; i < num_vfs; i++)
+			pnv_ioda_free_pe(phb, pdn->offset[i]);
+	} else
+		bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
+	kfree(pdn->offset);
 
 	return ret;
 }
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-05  1:24           ` [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR Wei Yang
@ 2015-08-06  4:35             ` Gavin Shan
  2015-08-06  6:10               ` Alexey Kardashevskiy
  2015-08-06 14:10               ` Wei Yang
  0 siblings, 2 replies; 56+ messages in thread
From: Gavin Shan @ 2015-08-06  4:35 UTC (permalink / raw)
  To: Wei Yang; +Cc: aik, gwshan, benh, linuxppc-dev

On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>M64 windwo, which means M64 BAR can't work on it.
>

s/PHB_IODA2/PHB3
s/windwo/window

>This patch makes this explicit.
>
>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>

The idea sounds right, but there is one question as below.

>---
> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
> 1 file changed, 9 insertions(+), 16 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 5738d31..9b41dba 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
> 		if (!res->flags || !res->parent)
> 			continue;
>
>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>-			continue;
>-
> 		/*
> 		 * The actual IOV BAR range is determined by the start address
> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
> 		if (!res->flags || !res->parent)
> 			continue;
>
>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>-			continue;
>-
> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
> 		res2 = *res;
> 		res->start += size * offset;
>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 		if (!res->flags || !res->parent)
> 			continue;
>
>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>-			continue;
>-
> 		for (j = 0; j < vf_groups; j++) {
> 			do {
> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 	pdn = pci_get_pdn(pdev);
>
> 	if (phb->type == PNV_PHB_IODA2) {
>+		if (!pdn->vfs_expanded) {
>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>+				" with non M64 VF BAR\n");
>+			return -EBUSY;
>+		}
>+

It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temporarily
unavailable. For this case, the VFs are permanently unavailable because of
running out of space to accommodate M64 and non-M64 VF BARs.

The error message could be printed with dev_warn() and would be more precise
as below, or something else you prefer:

	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");


> 		/* Calculate available PE for required VFs */
> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
> 		pdn->offset = bitmap_find_next_zero_area(
>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 		if (!res->flags || res->parent)
> 			continue;
> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>+					" non M64 VF BAR%d: %pR. \n",
> 				 i, res);
>-			continue;
>+			return;
> 		}
>
> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> 		if (!res->flags || res->parent)
> 			continue;
>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>-				 i, res);
>-			continue;
>-		}

When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
I think it can be avoided.
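
For example (untested, just to illustrate the idea; a second loop index, say
"j", is assumed here), the fixup could truncate all IOV BARs once a non-M64
one is found, so the PCI core won't try to assign MMIO space for them:

	if (!pnv_pci_is_mem_pref_64(res->flags)) {
		dev_warn(&pdev->dev, "Don't support SR-IOV with non-M64 VF BAR%d: %pR\n",
			 i, res);
		/* Sketch: zap every IOV resource so nothing gets assigned */
		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) {
			res = &pdev->resource[j + PCI_IOV_RESOURCES];
			res->flags = 0;
			res->end = res->start - 1;
		}
		return;
	}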

>
> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>-- 
>1.7.9.5
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-05  1:24           ` [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource Wei Yang
@ 2015-08-06  4:51             ` Gavin Shan
  2015-08-06  9:00               ` Alexey Kardashevskiy
  2015-08-06 13:49               ` Wei Yang
  0 siblings, 2 replies; 56+ messages in thread
From: Gavin Shan @ 2015-08-06  4:51 UTC (permalink / raw)
  To: Wei Yang; +Cc: aik, gwshan, benh, linuxppc-dev

On Wed, Aug 05, 2015 at 09:24:59AM +0800, Wei Yang wrote:
>The alignment of IOV BAR on PowerNV platform is the total size of the IOV
>BAR. No matter whether the IOV BAR is truncated or not, the total size
>could be calculated by (vfs_expanded * VF size).
>

s/VF size/VF BAR size

I think the changelog would be more explicit:

The alignment of the IOV BAR on the PowerNV platform is the total size of the
IOV BAR, no matter whether the IOV BAR is extended with the number of max
VFs or with the max PE number (256). The alignment can be calculated by
(vfs_expanded * VF_BAR_size).
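
For instance (just an illustrative number, not from the patch), a 1MB VF BAR
expanded with the max PE number (256) would give a 256MB alignment.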

>This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
>first case.
>
>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>

Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>

>---
> arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
> 1 file changed, 9 insertions(+), 5 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 9b41dba..7192e62 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
> 						      int resno)
> {
> 	struct pci_dn *pdn = pci_get_pdn(pdev);
>-	resource_size_t align, iov_align;
>-
>-	iov_align = resource_size(&pdev->resource[resno]);
>-	if (iov_align)
>-		return iov_align;
>+	resource_size_t align;
>
>+	/*
>+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
>+	 * SR-IOV. While from hardware perspective, the range mapped by M64
>+	 * BAR should be size aligned.
>+	 *
>+	 * This function return the total IOV BAR size if expanded or just the
>+	 * individual size if not.
>+	 */
> 	align = pci_iov_resource_size(pdev, resno);
> 	if (pdn->vfs_expanded)
> 		return pdn->vfs_expanded * align;
>-- 
>1.7.9.5
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-05  1:25           ` [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR Wei Yang
@ 2015-08-06  5:20             ` Gavin Shan
  2015-08-06  9:36               ` Wei Yang
  2015-08-06 10:04             ` Alexey Kardashevskiy
  1 sibling, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-06  5:20 UTC (permalink / raw)
  To: Wei Yang; +Cc: aik, gwshan, benh, linuxppc-dev

On Wed, Aug 05, 2015 at 09:25:00AM +0800, Wei Yang wrote:
>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>BAR in Single PE mode to cover the number of VFs required to be enabled.
>By doing so, several VFs would be in one VF Group and leads to interference
>between VFs in the same group.
>
>This patch changes the design by using one M64 BAR in Single PE mode for
>one VF BAR. This gives absolute isolation for VFs.
>
>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>---
> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
> arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
> 2 files changed, 76 insertions(+), 109 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>index 712add5..8aeba4c 100644
>--- a/arch/powerpc/include/asm/pci-bridge.h
>+++ b/arch/powerpc/include/asm/pci-bridge.h
>@@ -214,10 +214,9 @@ struct pci_dn {
> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
> 	u16     num_vfs;		/* number of VFs enabled*/
> 	int     offset;			/* PE# for the first VF PE */
>-#define M64_PER_IOV 4
>-	int     m64_per_iov;
>+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
> #define IODA_INVALID_M64        (-1)
>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];

Can it be more explicit? For example:

	int	*m64_map;

	/* Initialization */
	size_t size = sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_of_max_VFs;
	pdn->m64_map = kmalloc(size, GFP_KERNEL);
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
		for (j = 0; j < num_of_max_VFs; j++)
			pdn->m64_map[i * num_of_max_VFs + j] = PNV_INVALID_M64;

	/* Destroy */
	int step = 1;

	if (!pdn->m64_single_mode)
		step = phb->ioda.total_pe;
	for (i = 0; i < PCI_SRIOV_NUM_BARS * num_of_max_VFs; i += step) {
		if (pdn->m64_map[i] == PNV_INVALID_M64)
			continue;

		/* Unmap the window */
	}

> #endif /* CONFIG_PCI_IOV */
> #endif
> 	struct list_head child_list;
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 7192e62..f5d110c 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void)
> }
>
> #ifdef CONFIG_PCI_IOV
>-static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
>+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
> {
> 	struct pci_bus        *bus;
> 	struct pci_controller *hose;
> 	struct pnv_phb        *phb;
> 	struct pci_dn         *pdn;
> 	int                    i, j;
>+	int                    m64_bars;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
> 	phb = hose->private_data;
> 	pdn = pci_get_pdn(pdev);
>
>+	if (pdn->m64_single_mode)
>+		m64_bars = num_vfs;
>+	else
>+		m64_bars = 1;
>+
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>-		for (j = 0; j < M64_PER_IOV; j++) {
>-			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
>+		for (j = 0; j < m64_bars; j++) {
>+			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
> 				continue;
> 			opal_pci_phb_mmio_enable(phb->opal_id,
>-				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
>-			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
>-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>+				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
>+			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
>+			pdn->m64_map[j][i] = IODA_INVALID_M64;
> 		}
>
>+	kfree(pdn->m64_map);
> 	return 0;
> }
>
>@@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 	int                    total_vfs;
> 	resource_size_t        size, start;
> 	int                    pe_num;
>-	int                    vf_groups;
>-	int                    vf_per_group;
>+	int                    m64_bars;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 	pdn = pci_get_pdn(pdev);
> 	total_vfs = pci_sriov_get_totalvfs(pdev);
>
>-	/* Initialize the m64_wins to IODA_INVALID_M64 */
>-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>-		for (j = 0; j < M64_PER_IOV; j++)
>-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>+	if (pdn->m64_single_mode)
>+		m64_bars = num_vfs;
>+	else
>+		m64_bars = 1;
>+
>+	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
>+	if (!pdn->m64_map)
>+		return -ENOMEM;
>+	/* Initialize the m64_map to IODA_INVALID_M64 */
>+	for (i = 0; i < m64_bars ; i++)
>+		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
>+			pdn->m64_map[i][j] = IODA_INVALID_M64;
>
>-	if (pdn->m64_per_iov == M64_PER_IOV) {
>-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>-	} else {
>-		vf_groups = 1;
>-		vf_per_group = 1;
>-	}
>
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
> 		if (!res->flags || !res->parent)
> 			continue;
>
>-		for (j = 0; j < vf_groups; j++) {
>+		for (j = 0; j < m64_bars; j++) {
> 			do {
> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
> 						phb->ioda.m64_bar_idx + 1, 0);
>@@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 					goto m64_failed;
> 			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
>
>-			pdn->m64_wins[i][j] = win;
>+			pdn->m64_map[j][i] = win;
>
>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>+			if (pdn->m64_single_mode) {
> 				size = pci_iov_resource_size(pdev,
> 							PCI_IOV_RESOURCES + i);
>-				size = size * vf_per_group;
> 				start = res->start + size * j;
> 			} else {
> 				size = resource_size(res);
>@@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 			}
>
> 			/* Map the M64 here */
>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>+			if (pdn->m64_single_mode) {
> 				pe_num = pdn->offset + j;
> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> 						pe_num, OPAL_M64_WINDOW_TYPE,
>-						pdn->m64_wins[i][j], 0);
>+						pdn->m64_map[j][i], 0);
> 			}
>
> 			rc = opal_pci_set_phb_mem_window(phb->opal_id,
> 						 OPAL_M64_WINDOW_TYPE,
>-						 pdn->m64_wins[i][j],
>+						 pdn->m64_map[j][i],
> 						 start,
> 						 0, /* unused */
> 						 size);
>@@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 				goto m64_failed;
> 			}
>
>-			if (pdn->m64_per_iov == M64_PER_IOV)
>+			if (pdn->m64_single_mode)
> 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
>+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
> 			else
> 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
>+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
>
> 			if (rc != OPAL_SUCCESS) {
> 				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
>@@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
> 	return 0;
>
> m64_failed:
>-	pnv_pci_vf_release_m64(pdev);
>+	pnv_pci_vf_release_m64(pdev, num_vfs);
> 	return -EBUSY;
> }
>
>@@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> }
>
>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
> {
> 	struct pci_bus        *bus;
> 	struct pci_controller *hose;
> 	struct pnv_phb        *phb;
> 	struct pnv_ioda_pe    *pe, *pe_n;
> 	struct pci_dn         *pdn;
>-	u16                    vf_index;
>-	int64_t                rc;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> 	if (!pdev->is_physfn)
> 		return;
>
>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>-		int   vf_group;
>-		int   vf_per_group;
>-		int   vf_index1;
>-
>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>-
>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>-			for (vf_index = vf_group * vf_per_group;
>-				vf_index < (vf_group + 1) * vf_per_group &&
>-				vf_index < num_vfs;
>-				vf_index++)
>-				for (vf_index1 = vf_group * vf_per_group;
>-					vf_index1 < (vf_group + 1) * vf_per_group &&
>-					vf_index1 < num_vfs;
>-					vf_index1++){
>-
>-					rc = opal_pci_set_peltv(phb->opal_id,
>-						pdn->offset + vf_index,
>-						pdn->offset + vf_index1,
>-						OPAL_REMOVE_PE_FROM_DOMAIN);
>-
>-					if (rc)
>-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>-						__func__,
>-						pdn->offset + vf_index1, rc);
>-				}
>-	}
>-
> 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
> 		if (pe->parent_dev != pdev)
> 			continue;
>@@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
> 	num_vfs = pdn->num_vfs;
>
> 	/* Release VF PEs */
>-	pnv_ioda_release_vf_PE(pdev, num_vfs);
>+	pnv_ioda_release_vf_PE(pdev);
>
> 	if (phb->type == PNV_PHB_IODA2) {
>-		if (pdn->m64_per_iov == 1)
>+		if (!pdn->m64_single_mode)
> 			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>
> 		/* Release M64 windows */
>-		pnv_pci_vf_release_m64(pdev);
>+		pnv_pci_vf_release_m64(pdev, num_vfs);
>
> 		/* Release PE numbers */
> 		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>@@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> 	int                    pe_num;
> 	u16                    vf_index;
> 	struct pci_dn         *pdn;
>-	int64_t                rc;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>
> 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
> 	}
>-
>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>-		int   vf_group;
>-		int   vf_per_group;
>-		int   vf_index1;
>-
>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>-
>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>-			for (vf_index = vf_group * vf_per_group;
>-			     vf_index < (vf_group + 1) * vf_per_group &&
>-			     vf_index < num_vfs;
>-			     vf_index++) {
>-				for (vf_index1 = vf_group * vf_per_group;
>-				     vf_index1 < (vf_group + 1) * vf_per_group &&
>-				     vf_index1 < num_vfs;
>-				     vf_index1++) {
>-
>-					rc = opal_pci_set_peltv(phb->opal_id,
>-						pdn->offset + vf_index,
>-						pdn->offset + vf_index1,
>-						OPAL_ADD_PE_TO_DOMAIN);
>-
>-					if (rc)
>-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>-						__func__,
>-						pdn->offset + vf_index1, rc);
>-				}
>-			}
>-		}
>-	}
> }
>
> int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>@@ -1507,6 +1449,18 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 			return -EBUSY;
> 		}
>
>+		/*
>+		 * On PNV_PHB_IODA2, We just have 16 M64 BARs and M64 BAR #15
>+		 * is used to cover the whole system, which leaves only 15 M64
>+		 * BAR usable for VFs.
>+		 * When M64 BAR functions in Single PE mode, this means it
>+		 * just could enable 15 VFs.
>+		 */

s/PNV_PHB_IODA2/PHB3
s/15 VFs/(15 / num_of_IOV_BARs)

>+		if (pdn->m64_single_mode && num_vfs >= 16) {
>+			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
>+			return -EBUSY;
>+		}
>+

		if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx)

> 		/* Calculate available PE for required VFs */
> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
> 		pdn->offset = bitmap_find_next_zero_area(
>@@ -1534,7 +1488,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 		 * the IOV BAR according to the PE# allocated to the VFs.
> 		 * Otherwise, the PE# for the VF will conflict with others.
> 		 */
>-		if (pdn->m64_per_iov == 1) {
>+		if (!pdn->m64_single_mode) {
> 			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
> 			if (ret)
> 				goto m64_failed;
>@@ -1567,8 +1521,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 	/* Allocate PCI data */
> 	add_dev_pci_data(pdev);
>
>-	pnv_pci_sriov_enable(pdev, num_vfs);
>-	return 0;
>+	return pnv_pci_sriov_enable(pdev, num_vfs);
> }
> #endif /* CONFIG_PCI_IOV */
>
>@@ -2761,9 +2714,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>
> 	pdn = pci_get_pdn(pdev);
> 	pdn->vfs_expanded = 0;
>+	pdn->m64_single_mode = false;
>
> 	total_vfs = pci_sriov_get_totalvfs(pdev);
>-	pdn->m64_per_iov = 1;
> 	mul = phb->ioda.total_pe;
>
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>@@ -2783,8 +2736,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 		if (size > (1 << 26)) {
> 			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
> 				 i, res);
>-			pdn->m64_per_iov = M64_PER_IOV;
> 			mul = roundup_pow_of_two(total_vfs);
>+			pdn->m64_single_mode = true;
> 			break;
> 		}
> 	}
>@@ -2986,6 +2939,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
> static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
> 						      int resno)
> {
>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>+	struct pnv_phb *phb = hose->private_data;
> 	struct pci_dn *pdn = pci_get_pdn(pdev);
> 	resource_size_t align;
>
>@@ -2994,12 +2949,25 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
> 	 * SR-IOV. While from hardware perspective, the range mapped by M64
> 	 * BAR should be size aligned.
> 	 *
>+	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the hardware
>+	 * restriction to alignment is gone. But if just use the VF BAR size
>+	 * as the alignment, PF BAR / VF BAR may be allocated with in one M64
>+	 * segment, which introduces the PE conflict between PF and VF. Based
>+	 * on this the minimum alignment of an IOV BAR is m64_segsize.
>+	 *
> 	 * This function return the total IOV BAR size if expanded or just the
>-	 * individual size if not.
>+	 * individual size if not, when M64 BAR is in Shared PE mode.
>+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
>+	 * m64_size if IOV BAR size is less.
> 	 */
> 	align = pci_iov_resource_size(pdev, resno);
>-	if (pdn->vfs_expanded)
>-		return pdn->vfs_expanded * align;
>+	if (pdn->vfs_expanded) {
>+		if (pdn->m64_single_mode)
>+			return max(align,
>+				(resource_size_t)phb->ioda.m64_segsize);
>+		else
>+			return pdn->vfs_expanded * align;
>+	}
>
> 	return align;
> }
>-- 
>1.7.9.5
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 4/6] powerpc/powernv: replace the hard coded boundary with gate
  2015-08-05  1:25           ` [PATCH V2 4/6] powerpc/powernv: replace the hard coded boundary with gate Wei Yang
@ 2015-08-06  5:26             ` Gavin Shan
  2015-08-07  9:11               ` Alexey Kardashevskiy
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-06  5:26 UTC (permalink / raw)
  To: Wei Yang; +Cc: aik, gwshan, benh, linuxppc-dev

On Wed, Aug 05, 2015 at 09:25:01AM +0800, Wei Yang wrote:
>Based on the limitation of M64 Window size, when VF BAR size is bigger than
>64MB, IOV BAR just round up power of 2 of the total_vfs. While the 64MB is
>a magic boundary in code, which is hard to maintain.
>
>This patch replaces the hard coded boundary with gate, which is calculated
>from m64_segsize and adds comment to explain the reason for it.
>
>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>---
> arch/powerpc/platforms/powernv/pci-ioda.c |   22 +++++++++++++++++-----
> 1 file changed, 17 insertions(+), 5 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index f5d110c..31dcedc 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 	struct pnv_phb *phb;
> 	struct resource *res;
> 	int i;
>-	resource_size_t size;
>+	resource_size_t size, gate;
> 	struct pci_dn *pdn;
> 	int mul, total_vfs;
>
>@@ -2718,6 +2718,17 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>
> 	total_vfs = pci_sriov_get_totalvfs(pdev);
> 	mul = phb->ioda.total_pe;
>+	/*
>+	 * If bigger than or equal to half of m64_segsize, just round up power
>+	 * of two.
>+	 *
>+	 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict with
>+	 * other devices, IOV BAR size is expanded to be (total_pe * VF size).
>+	 * When VF size is half of m64_segsize , the expanded size would equal
>+	 * to half of the whole M64 Window size, which will exhaust the M64
>+	 * Window and limit the system flexibility.
>+	 */

s/VF size/VF BAR size
s/m64_segsize/M64 segment size
s/M64 Window/M64 space

>+	gate = phb->ioda.m64_segsize >> 1;
>
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>@@ -2732,10 +2743,11 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>
> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>
>-		/* bigger than 64M */
>-		if (size > (1 << 26)) {
>-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>-				 i, res);
>+		/* bigger than or equal to gate */
>+		if (size >= gate) {
>+			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
>+				"is bigger than %lld, roundup power2\n",
>+				 i, res, gate);

If I understand the changes correctly, the single VF BAR size is still checked against
the "gate" (128MB), not the total VF BAR size. To recap the comments I gave last time:

I mean to check the sum of all VF BARs. For example, the VFs attached to a PF have two
VF BARs and each of them is 64MB. For this case, the MMIO resource can't be allocated
once they are extended to 256 VFs. So we have to try "single-pe-mode" in this situation.
So the check becomes as below:

        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
        struct pnv_phb *phb = hose->private_data;
        resource_size_t total_vf_bar_sz = 0;
        resource_size_t gate;

        /* Some comments to explain the "gate" */
        gate = phb->m64_segsize / 2;
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
                total_vf_bar_sz += pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i);

        if (total_vf_bar_sz >= gate)
                /* single-pe-mode */
        else
                /* shared-mode */

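To put rough numbers on it (illustrative only, taking the 128MB gate mentioned
above, i.e. m64_segsize = 256MB and 256 PEs): the two 64MB VF BARs sum to 128MB,
which hits the gate, so single PE mode is chosen. Had they stayed in shared mode,
each IOV BAR would be expanded to 64MB * 256 = 16GB, and the two BARs together
would take 32GB out of the 64GB of M64 space.
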
> 			mul = roundup_pow_of_two(total_vfs);
> 			pdn->m64_single_mode = true;
> 			break;
>-- 
>1.7.9.5
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one
  2015-08-05  1:25           ` [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one Wei Yang
@ 2015-08-06  5:28             ` Gavin Shan
  2015-08-06 14:03               ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-06  5:28 UTC (permalink / raw)
  To: Wei Yang; +Cc: aik, gwshan, benh, linuxppc-dev

On Wed, Aug 05, 2015 at 09:25:02AM +0800, Wei Yang wrote:
>Each VF could have 6 BARs at most. When the total BAR size exceeds the
>gate, after expanding it will also exhaust the M64 Window.
>
>This patch limits the boundary by checking the total VF BAR size instead of
>the individual BAR.
>
>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>

Ok. I didn't look at this when giving comments on the last patch. It turns
out you have the change in this patch. Please merge it with the previous
patch.

>---
> arch/powerpc/platforms/powernv/pci-ioda.c |   13 +++++++------
> 1 file changed, 7 insertions(+), 6 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 31dcedc..4042303 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 	struct pnv_phb *phb;
> 	struct resource *res;
> 	int i;
>-	resource_size_t size, gate;
>+	resource_size_t size, gate, total_vf_bar_sz;
> 	struct pci_dn *pdn;
> 	int mul, total_vfs;
>
>@@ -2729,6 +2729,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 	 * Window and limit the system flexibility.
> 	 */
> 	gate = phb->ioda.m64_segsize >> 1;
>+	total_vf_bar_sz = 0;
>
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>@@ -2741,13 +2742,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
> 			return;
> 		}
>
>-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>+		total_vf_bar_sz += pci_iov_resource_size(pdev,
>+				i + PCI_IOV_RESOURCES);
>
> 		/* bigger than or equal to gate */
>-		if (size >= gate) {
>-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
>-				"is bigger than %lld, roundup power2\n",
>-				 i, res, gate);
>+		if (total_vf_bar_sz >= gate) {
>+			dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size "
>+				"is bigger than %lld, roundup power2\n", gate);
> 			mul = roundup_pow_of_two(total_vfs);
> 			pdn->m64_single_mode = true;
> 			break;
>-- 
>1.7.9.5
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-05  1:25           ` [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode Wei Yang
@ 2015-08-06  5:36             ` Gavin Shan
  2015-08-06 13:41               ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-06  5:36 UTC (permalink / raw)
  To: Wei Yang; +Cc: aik, gwshan, benh, linuxppc-dev

On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>discrete.
>
>This patch restructures the patch to allocate discrete PE# for VFs when M64
>BAR is set to Single PE mode.
>
>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>---
> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
> 2 files changed, 51 insertions(+), 20 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>index 8aeba4c..72415c7 100644
>--- a/arch/powerpc/include/asm/pci-bridge.h
>+++ b/arch/powerpc/include/asm/pci-bridge.h
>@@ -213,7 +213,7 @@ struct pci_dn {
> #ifdef CONFIG_PCI_IOV
> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
> 	u16     num_vfs;		/* number of VFs enabled*/
>-	int     offset;			/* PE# for the first VF PE */
>+	int     *offset;		/* PE# for the first VF PE or array */
> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
> #define IODA_INVALID_M64        (-1)
> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];

How about renaming "offset" to "pe_num_map" or "pe_map"? Similar to the comments
I gave on the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
all of them will be used, not too much memory will be wasted.
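
A rough sketch of what I mean (the name pe_num_map and the sizing are
illustrative, not taken from the patch):

	int *pe_num_map;	/* PE# for each possible VF */

	pe_num_map = kmalloc(sizeof(*pe_num_map) * pci_sriov_get_totalvfs(pdev),
			     GFP_KERNEL);
	if (!pe_num_map)
		return -ENOMEM;
	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++)
		pe_num_map[i] = IODA_INVALID_PE;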

>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 4042303..9953829 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1243,7 +1243,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>
> 			/* Map the M64 here */
> 			if (pdn->m64_single_mode) {
>-				pe_num = pdn->offset + j;
>+				pe_num = pdn->offset[j];
> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> 						pe_num, OPAL_M64_WINDOW_TYPE,
> 						pdn->m64_map[j][i], 0);
>@@ -1347,7 +1347,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
> 	struct pnv_phb        *phb;
> 	struct pci_dn         *pdn;
> 	struct pci_sriov      *iov;
>-	u16 num_vfs;
>+	u16                    num_vfs, i;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1361,14 +1361,18 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>
> 	if (phb->type == PNV_PHB_IODA2) {
> 		if (!pdn->m64_single_mode)
>-			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>+			pnv_pci_vf_resource_shift(pdev, -*pdn->offset);
>
> 		/* Release M64 windows */
> 		pnv_pci_vf_release_m64(pdev, num_vfs);
>
> 		/* Release PE numbers */
>-		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>-		pdn->offset = 0;
>+		if (pdn->m64_single_mode) {
>+			for (i = 0; i < num_vfs; i++)
>+				pnv_ioda_free_pe(phb, pdn->offset[i]);
>+		} else
>+			bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>+		kfree(pdn->offset);

Can pnv_ioda_free_pe() be reused to release the PEs here as well?

> 	}
> }
>
>@@ -1394,7 +1398,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>
> 	/* Reserve PE for each VF */
> 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
>-		pe_num = pdn->offset + vf_index;
>+		if (pdn->m64_single_mode)
>+			pe_num = pdn->offset[vf_index];
>+		else
>+			pe_num = *pdn->offset + vf_index;
>
> 		pe = &phb->ioda.pe_array[pe_num];
> 		pe->pe_number = pe_num;
>@@ -1436,6 +1443,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 	struct pnv_phb        *phb;
> 	struct pci_dn         *pdn;
> 	int                    ret;
>+	u16                    i;
>
> 	bus = pdev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1462,19 +1470,38 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 		}
>
> 		/* Calculate available PE for required VFs */
>-		mutex_lock(&phb->ioda.pe_alloc_mutex);
>-		pdn->offset = bitmap_find_next_zero_area(
>-			phb->ioda.pe_alloc, phb->ioda.total_pe,
>-			0, num_vfs, 0);
>-		if (pdn->offset >= phb->ioda.total_pe) {
>+		if (pdn->m64_single_mode) {
>+			pdn->offset = kmalloc(sizeof(*pdn->offset) * num_vfs,
>+					GFP_KERNEL);
>+			if (!pdn->offset)
>+				return -ENOMEM;
>+			for (i = 0; i < num_vfs; i++)
>+				pdn->offset[i] = IODA_INVALID_PE;
>+			for (i = 0; i < num_vfs; i++) {
>+				pdn->offset[i] = pnv_ioda_alloc_pe(phb);
>+				if (pdn->offset[i] == IODA_INVALID_PE) {
>+					ret = -EBUSY;
>+					goto m64_failed;
>+				}
>+			}
>+		} else {
>+			pdn->offset = kmalloc(sizeof(*pdn->offset), GFP_KERNEL);
>+			if (!pdn->offset)
>+				return -ENOMEM;
>+			mutex_lock(&phb->ioda.pe_alloc_mutex);
>+			*pdn->offset = bitmap_find_next_zero_area(
>+				phb->ioda.pe_alloc, phb->ioda.total_pe,
>+				0, num_vfs, 0);
>+			if (*pdn->offset >= phb->ioda.total_pe) {
>+				mutex_unlock(&phb->ioda.pe_alloc_mutex);
>+				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
>+				kfree(pdn->offset);
>+				return -EBUSY;
>+			}
>+			bitmap_set(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
> 			mutex_unlock(&phb->ioda.pe_alloc_mutex);
>-			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
>-			pdn->offset = 0;
>-			return -EBUSY;
> 		}
>-		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
> 		pdn->num_vfs = num_vfs;
>-		mutex_unlock(&phb->ioda.pe_alloc_mutex);
>
> 		/* Assign M64 window accordingly */
> 		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
>@@ -1489,7 +1516,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 		 * Otherwise, the PE# for the VF will conflict with others.
> 		 */
> 		if (!pdn->m64_single_mode) {
>-			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>+			ret = pnv_pci_vf_resource_shift(pdev, *pdn->offset);
> 			if (ret)
> 				goto m64_failed;
> 		}
>@@ -1501,8 +1528,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> 	return 0;
>
> m64_failed:
>-	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>-	pdn->offset = 0;
>+	if (pdn->m64_single_mode) {
>+		for (i = 0; i < num_vfs; i++)
>+			pnv_ioda_free_pe(phb, pdn->offset[i]);
>+	} else
>+		bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>+	kfree(pdn->offset);
>
> 	return ret;
> }
>-- 
>1.7.9.5
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06  4:35             ` Gavin Shan
@ 2015-08-06  6:10               ` Alexey Kardashevskiy
  2015-08-06  6:57                 ` Gavin Shan
  2015-08-06 14:10               ` Wei Yang
  1 sibling, 1 reply; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-06  6:10 UTC (permalink / raw)
  To: Gavin Shan, Wei Yang; +Cc: benh, linuxppc-dev

On 08/06/2015 02:35 PM, Gavin Shan wrote:
> On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>> On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>> a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>> M64 windwo, which means M64 BAR can't work on it.
>>
>
> s/PHB_IODA2/PHB3


No, it is IODA2. OPAL does the PHB3-specific bits; the host kernel just uses OPAL.


> s/windwo/window
>
>> This patch makes this explicit.
>>
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>
> The idea sounds right, but there is one question as below.
>
>> ---
>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 5738d31..9b41dba 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>> 		if (!res->flags || !res->parent)
>> 			continue;
>>
>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>> -			continue;
>> -
>> 		/*
>> 		 * The actual IOV BAR range is determined by the start address
>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>> @@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>> 		if (!res->flags || !res->parent)
>> 			continue;
>>
>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>> -			continue;
>> -
>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> 		res2 = *res;
>> 		res->start += size * offset;
>> @@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>> 		if (!res->flags || !res->parent)
>> 			continue;
>>
>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>> -			continue;
>> -
>> 		for (j = 0; j < vf_groups; j++) {
>> 			do {
>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>> @@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 	pdn = pci_get_pdn(pdev);
>>
>> 	if (phb->type == PNV_PHB_IODA2) {
>> +		if (!pdn->vfs_expanded) {
>> +			dev_info(&pdev->dev, "don't support this SRIOV device"
>> +				" with non M64 VF BAR\n");
>> +			return -EBUSY;
>> +		}
>> +
>
> It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temporarily
> unavailable. For this case, the VFs are permanently unavailable because of
> running out of space to accommodate M64 and non-M64 VF BARs.
>
> The error message could be printed with dev_warn() and it would be precise
> as below or something else you prefer:
>
> 	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");


Both messages are cryptic.

If it is not M64 BAR, then what is it? It is always in one of M64 BARs (in 
the worst case - BAR#15?), the difference is if it is segmented or not, no?



>
>
>> 		/* Calculate available PE for required VFs */
>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>> 		pdn->offset = bitmap_find_next_zero_area(
>> @@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 		if (!res->flags || res->parent)
>> 			continue;
>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>> -			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>> +			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>> +					" non M64 VF BAR%d: %pR. \n",
>> 				 i, res);
>> -			continue;
>> +			return;
>> 		}
>>
>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>> @@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>> 		if (!res->flags || res->parent)
>> 			continue;
>> -		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>> -			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>> -				 i, res);
>> -			continue;
>> -		}
>
> When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
> Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
> I think it can be avoided.
>
>>
>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>> --
>> 1.7.9.5
>>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06  6:10               ` Alexey Kardashevskiy
@ 2015-08-06  6:57                 ` Gavin Shan
  2015-08-06  7:47                   ` Alexey Kardashevskiy
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-06  6:57 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Gavin Shan, Wei Yang, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 04:10:21PM +1000, Alexey Kardashevskiy wrote:
>On 08/06/2015 02:35 PM, Gavin Shan wrote:
>>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>M64 windwo, which means M64 BAR can't work on it.
>>>
>>
>>s/PHB_IODA2/PHB3
>
>
>No, it is IODA2. OPAL does the PHB3-specific bits; the host kernel just uses OPAL.
>

Ok.

>
>>s/windwo/window
>>
>>>This patch makes this explicit.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>
>>The idea sounds right, but there is one question as below.
>>
>>>---
>>>arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>1 file changed, 9 insertions(+), 16 deletions(-)
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index 5738d31..9b41dba 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>		if (!res->flags || !res->parent)
>>>			continue;
>>>
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>-			continue;
>>>-
>>>		/*
>>>		 * The actual IOV BAR range is determined by the start address
>>>		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>		if (!res->flags || !res->parent)
>>>			continue;
>>>
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>-			continue;
>>>-
>>>		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>		res2 = *res;
>>>		res->start += size * offset;
>>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>		if (!res->flags || !res->parent)
>>>			continue;
>>>
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>-			continue;
>>>-
>>>		for (j = 0; j < vf_groups; j++) {
>>>			do {
>>>				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>	pdn = pci_get_pdn(pdev);
>>>
>>>	if (phb->type == PNV_PHB_IODA2) {
>>>+		if (!pdn->vfs_expanded) {
>>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>+				" with non M64 VF BAR\n");
>>>+			return -EBUSY;
>>>+		}
>>>+
>>
>>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temporarily
>>unavailable. For this case, the VFs are permanently unavailable because of
>>running out of space to accommodate M64 and non-M64 VF BARs.
>>
>>The error message could be printed with dev_warn() and it would be precise
>>as below or something else you prefer:
>>
>>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>
>
>Both messages are cryptic.
>
>If it is not M64 BAR, then what is it? It is always in one of M64 BARs (in
>the worst case - BAR#15?), the difference is if it is segmented or not, no?
>

The VF BAR could be one of IO, M32, or M64. If it's not M64, the VFs are supposed
to be disabled and the (IO and M32) resources won't be allocated, but for sure,
the IO and M32 resources can't be put into any of the PHB's 16 M64 BARs.
Would you recommend a better message then?

>>
>>
>>>		/* Calculate available PE for required VFs */
>>>		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>		pdn->offset = bitmap_find_next_zero_area(
>>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>		if (!res->flags || res->parent)
>>>			continue;
>>>		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>+					" non M64 VF BAR%d: %pR. \n",
>>>				 i, res);
>>>-			continue;
>>>+			return;
>>>		}
>>>
>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>		if (!res->flags || res->parent)
>>>			continue;
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>-				 i, res);
>>>-			continue;
>>>-		}
>>
>>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>I think it can be avoided.
>>
>>>
>>>		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>--
>>>1.7.9.5
>>>
>>
>
>
>-- 
>Alexey
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06  6:57                 ` Gavin Shan
@ 2015-08-06  7:47                   ` Alexey Kardashevskiy
  2015-08-06 11:07                     ` Gavin Shan
  2015-08-06 14:13                     ` Wei Yang
  0 siblings, 2 replies; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-06  7:47 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, benh, linuxppc-dev

On 08/06/2015 04:57 PM, Gavin Shan wrote:
> On Thu, Aug 06, 2015 at 04:10:21PM +1000, Alexey Kardashevskiy wrote:
>> On 08/06/2015 02:35 PM, Gavin Shan wrote:
>>> On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>> On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>> a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>> M64 windwo, which means M64 BAR can't work on it.


The proper text would be something like this:

===
SRIOV only supports 64bit MMIO. So if we fail to assign 64bit BAR, we 
cannot enable the device.
===


>>>>
>>>
>>> s/PHB_IODA2/PHB3
>>
>>
>> No, it is IODA2. OPAL does the PHB3-specific bits; the host kernel just uses OPAL.
>>
>
> Ok.
>
>>
>>> s/windwo/window
>>>
>>>> This patch makes this explicit.
>>>>
>>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>
>>> The idea sounds right, but there is one question as below.
>>>
>>>> ---
>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> index 5738d31..9b41dba 100644
>>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> @@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>> 		if (!res->flags || !res->parent)
>>>> 			continue;
>>>>
>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>> -			continue;
>>>> -
>>>> 		/*
>>>> 		 * The actual IOV BAR range is determined by the start address
>>>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>> @@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>> 		if (!res->flags || !res->parent)
>>>> 			continue;
>>>>
>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>> -			continue;
>>>> -
>>>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>> 		res2 = *res;
>>>> 		res->start += size * offset;
>>>> @@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>> 		if (!res->flags || !res->parent)
>>>> 			continue;
>>>>
>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>> -			continue;
>>>> -
>>>> 		for (j = 0; j < vf_groups; j++) {
>>>> 			do {
>>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>> @@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>> 	pdn = pci_get_pdn(pdev);
>>>>
>>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>> +		if (!pdn->vfs_expanded) {
>>>> +			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>> +				" with non M64 VF BAR\n");
>>>> +			return -EBUSY;
>>>> +		}
>>>> +
>>>
>>> It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temporarily
>>> unavailable. For this case, the VFs are permanently unavailable because of
>>> running out of space to accommodate M64 and non-M64 VF BARs.
>>>
>>> The error message could be printed with dev_warn() and it would be precise
>>> as below or something else you prefer:
>>>
>>> 	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>
>>
>> Both messages are cryptic.
>>
>> If it is not M64 BAR, then what is it? It is always in one of M64 BARs (in
>> the worst case - BAR#15?), the difference is if it is segmented or not, no?
>>
>
> The VF BAR could be one of IO, M32, or M64. If it's not M64, the VFs are supposed
> to be disabled and the (IO and M32) resources won't be allocated, but for sure,
> the IO and M32 resources can't be put into any of the PHB's 16 M64 BARs.
> Would you recommend a better message then?



dev_warn(&pdev->dev, "SRIOV is disabled as no space is left in 64bit MMIO 
window\n");

Or is it not an "MMIO window"?



>
>>>
>>>
>>>> 		/* Calculate available PE for required VFs */
>>>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>> 		pdn->offset = bitmap_find_next_zero_area(
>>>> @@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 		if (!res->flags || res->parent)
>>>> 			continue;
>>>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>> -			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>> +			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>> +					" non M64 VF BAR%d: %pR. \n",
>>>> 				 i, res);
>>>> -			continue;
>>>> +			return;
>>>> 		}
>>>>
>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>> @@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>> 		if (!res->flags || res->parent)
>>>> 			continue;
>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>> -			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>> -				 i, res);
>>>> -			continue;
>>>> -		}
>>>
>>> When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>> Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>> I think it can be avoided.
>>>
>>>>
>>>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);


-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-06  4:51             ` Gavin Shan
@ 2015-08-06  9:00               ` Alexey Kardashevskiy
  2015-08-06  9:41                 ` Wei Yang
  2015-08-06 13:49               ` Wei Yang
  1 sibling, 1 reply; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-06  9:00 UTC (permalink / raw)
  To: Gavin Shan, Wei Yang; +Cc: benh, linuxppc-dev

On 08/06/2015 02:51 PM, Gavin Shan wrote:
> On Wed, Aug 05, 2015 at 09:24:59AM +0800, Wei Yang wrote:
>> The alignment of IOV BAR on PowerNV platform is the total size of the IOV
>> BAR. No matter whether the IOV BAR is truncated or not, the total size
>> could be calculated by (vfs_expanded * VF size).
>>
>
> s/VF size/VF BAR size
>
> I think the changelog would be more explicit:
>
> The alignment of IOV BAR on PowerNV platform is the total size of the
> IOV BAR, no matter whether the IOV BAR is extended with number of max
> VFs or number of max PE number (256). The alignment can be calculated
> by (vfs_expanded * VF_BAR_size).



Is that really a PowerNV-specific requirement or is it valid for every
platform (I suspect this is the case here)?


Also, what is the exact meaning of "expanded" in @vfs_expanded? It is 
either 255 (if individual VF BARs are <= 64MB) or 
roundup_pow_of_two(total_vfs) (which is something like 4 or 16). What is 
expanded here?


>
>> This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
>> first case.
>>
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
>> 1 file changed, 9 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 9b41dba..7192e62 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>> 						      int resno)
>> {
>> 	struct pci_dn *pdn = pci_get_pdn(pdev);
>> -	resource_size_t align, iov_align;
>> -
>> -	iov_align = resource_size(&pdev->resource[resno]);
>> -	if (iov_align)
>> -		return iov_align;
>> +	resource_size_t align;
>>
>> +	/*
>> +	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
>> +	 * SR-IOV. While from hardware perspective, the range mapped by M64
>> +	 * BAR should be size aligned.
>> +	 *
>> +	 * This function return the total IOV BAR size if expanded or just the
>> +	 * individual size if not.
>> +	 */
>> 	align = pci_iov_resource_size(pdev, resno);
>> 	if (pdn->vfs_expanded)
>> 		return pdn->vfs_expanded * align;
>> --
>> 1.7.9.5
>>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-06  5:20             ` Gavin Shan
@ 2015-08-06  9:36               ` Wei Yang
  2015-08-06 10:07                 ` Gavin Shan
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-06  9:36 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 03:20:25PM +1000, Gavin Shan wrote:
>On Wed, Aug 05, 2015 at 09:25:00AM +0800, Wei Yang wrote:
>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>By doing so, several VFs would be in one VF Group and leads to interference
>>between VFs in the same group.
>>
>>This patch changes the design by using one M64 BAR in Single PE mode for
>>one VF BAR. This gives absolute isolation for VFs.
>>
>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>---
>> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>> arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>> 2 files changed, 76 insertions(+), 109 deletions(-)
>>
>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>index 712add5..8aeba4c 100644
>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>@@ -214,10 +214,9 @@ struct pci_dn {
>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>> 	u16     num_vfs;		/* number of VFs enabled*/
>> 	int     offset;			/* PE# for the first VF PE */
>>-#define M64_PER_IOV 4
>>-	int     m64_per_iov;
>>+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>> #define IODA_INVALID_M64        (-1)
>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>
>It can be explicit? For example:
>
>	int	*m64_map;
>
>	/* Initialization */
>	size_t size = sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_of_max_VFs;
>	pdn->m64_map = kmalloc(size, GFP_KERNEL);
>	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>		for (j = 0; j < num_of_max_VFs; j++)
>			pdn->m64_map[i * PCI_SRIOV_NUM_BARS + j] = PNV_INVALID_M64;
>
>	/* Destroy */
>	int step = 1;
>
>	if (!pdn->m64_single_mode)
>		step = phb->ioda.total_pe;
>	for (i = 0; i < PCI_SRIOV_NUM_BARS * num_of_max_VFs; i += step)
>		if (pdn->m64_map[i] == PNV_INVALID_M64)
>			continue;
>
>		/* Unmap the window */
>	

The m64_map is a pointer to an array of 6 elements, which hold the M64 BAR
indices for the (up to) 6 VF BARs.

    When we use Shared Mode, one such array is allocated. Its six elements
    hold the (at most) six M64 BARs used to map the whole IOV BAR.

    When we use Single Mode, num_vfs arrays are allocated. Each array holds
    the mapping between one VF's BARs and the M64 BAR indices.

During map and unmap, the M64 BARs are assigned one by one in VF BAR order.
So I think the code is explicit.

In your code, you allocate one big one-dimensional array to hold the M64 BAR
indices. It works, but I don't think it is more explicit than the original code.
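
To restate the layout in code (just a sketch mirroring the declaration in the
patch, nothing new):

    int (*m64_map)[PCI_SRIOV_NUM_BARS];

    /* m64_bars rows, each with PCI_SRIOV_NUM_BARS entries */
    m64_map = kmalloc(sizeof(*m64_map) * m64_bars, GFP_KERNEL);

    /* Shared mode:  m64_map[0][bar]  holds the M64 BAR index for IOV BAR 'bar' */
    /* Single mode:  m64_map[vf][bar] holds the M64 BAR index for VF 'vf', BAR 'bar' */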

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-06  9:00               ` Alexey Kardashevskiy
@ 2015-08-06  9:41                 ` Wei Yang
  2015-08-06 10:15                   ` Alexey Kardashevskiy
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-06  9:41 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Gavin Shan, Wei Yang, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 07:00:00PM +1000, Alexey Kardashevskiy wrote:
>On 08/06/2015 02:51 PM, Gavin Shan wrote:
>>On Wed, Aug 05, 2015 at 09:24:59AM +0800, Wei Yang wrote:
>>>The alignment of IOV BAR on PowerNV platform is the total size of the IOV
>>>BAR. No matter whether the IOV BAR is truncated or not, the total size
>>>could be calculated by (vfs_expanded * VF size).
>>>
>>
>>s/VF size/VF BAR size
>>
>>I think the changelog would be more explicit:
>>
>>The alignment of IOV BAR on PowerNV platform is the total size of the
>>IOV BAR, no matter whether the IOV BAR is extended with number of max
>>VFs or number of max PE number (256). The alignment can be calculated
>>by (vfs_expanded * VF_BAR_size).
>
>
>
>Is that really a PowerNV-specific requirement or is it valid for
>every platform (I suspect this is the case here)?
>

Currently, it is PowerNV-specific.

>
>Also, what is the exact meaning of "expanded" in @vfs_expanded? It is
>either 255 (if individual VF BARs are <= 64MB) or
>roundup_pow_of_two(total_vfs) (which is something like 4 or 16). What
>is expanded here?
>

The original size of the PF's IOV BAR is (VF BAR size * total_vfs).

After expanding, the IOV BAR size is (VF BAR size * 256) or (VF BAR size *
roundup_pow_of_two(total_vfs)).
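
For example (numbers are illustrative only): a 4MB VF BAR with total_vfs = 10
gives a 40MB IOV BAR originally, which is expanded to 4MB * 256 = 1GB; a 128MB
VF BAR (bigger than 64MB) with total_vfs = 10 gives 1.25GB originally, which is
expanded to 128MB * roundup_pow_of_two(10) = 128MB * 16 = 2GB.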

>
>>
>>>This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
>>>first case.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>
>>Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>
>>>---
>>>arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
>>>1 file changed, 9 insertions(+), 5 deletions(-)
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index 9b41dba..7192e62 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>>						      int resno)
>>>{
>>>	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>-	resource_size_t align, iov_align;
>>>-
>>>-	iov_align = resource_size(&pdev->resource[resno]);
>>>-	if (iov_align)
>>>-		return iov_align;
>>>+	resource_size_t align;
>>>
>>>+	/*
>>>+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
>>>+	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>>+	 * BAR should be size aligned.
>>>+	 *
>>>+	 * This function return the total IOV BAR size if expanded or just the
>>>+	 * individual size if not.
>>>+	 */
>>>	align = pci_iov_resource_size(pdev, resno);
>>>	if (pdn->vfs_expanded)
>>>		return pdn->vfs_expanded * align;
>>>--
>>>1.7.9.5
>>>
>>
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-05  1:25           ` [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR Wei Yang
  2015-08-06  5:20             ` Gavin Shan
@ 2015-08-06 10:04             ` Alexey Kardashevskiy
  2015-08-07  2:01               ` Wei Yang
  1 sibling, 1 reply; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-06 10:04 UTC (permalink / raw)
  To: Wei Yang, gwshan, benh; +Cc: linuxppc-dev

On 08/05/2015 11:25 AM, Wei Yang wrote:
> In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
> BAR in Single PE mode to cover the number of VFs required to be enabled.
> By doing so, several VFs would be in one VF Group and leads to interference
> between VFs in the same group.
>
> This patch changes the design by using one M64 BAR in Single PE mode for
> one VF BAR. This gives absolute isolation for VFs.
>
> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>   arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>   2 files changed, 76 insertions(+), 109 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index 712add5..8aeba4c 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -214,10 +214,9 @@ struct pci_dn {
>   	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>   	u16     num_vfs;		/* number of VFs enabled*/
>   	int     offset;			/* PE# for the first VF PE */
> -#define M64_PER_IOV 4
> -	int     m64_per_iov;
> +	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>   #define IODA_INVALID_M64        (-1)
> -	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
> +	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>   #endif /* CONFIG_PCI_IOV */
>   #endif
>   	struct list_head child_list;
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 7192e62..f5d110c 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void)
>   }
>
>   #ifdef CONFIG_PCI_IOV
> -static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
> +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>   {
>   	struct pci_bus        *bus;
>   	struct pci_controller *hose;
>   	struct pnv_phb        *phb;
>   	struct pci_dn         *pdn;
>   	int                    i, j;
> +	int                    m64_bars;
>
>   	bus = pdev->bus;
>   	hose = pci_bus_to_host(bus);
>   	phb = hose->private_data;
>   	pdn = pci_get_pdn(pdev);
>
> +	if (pdn->m64_single_mode)
> +		m64_bars = num_vfs;
> +	else
> +		m64_bars = 1;
> +
>   	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
> -		for (j = 0; j < M64_PER_IOV; j++) {
> -			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
> +		for (j = 0; j < m64_bars; j++) {
> +			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
>   				continue;
>   			opal_pci_phb_mmio_enable(phb->opal_id,
> -				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
> -			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
> -			pdn->m64_wins[i][j] = IODA_INVALID_M64;
> +				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
> +			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
> +			pdn->m64_map[j][i] = IODA_INVALID_M64;
>   		}
>
> +	kfree(pdn->m64_map);
>   	return 0;
>   }
>
> @@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>   	int                    total_vfs;
>   	resource_size_t        size, start;
>   	int                    pe_num;
> -	int                    vf_groups;
> -	int                    vf_per_group;
> +	int                    m64_bars;
>
>   	bus = pdev->bus;
>   	hose = pci_bus_to_host(bus);
> @@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>   	pdn = pci_get_pdn(pdev);
>   	total_vfs = pci_sriov_get_totalvfs(pdev);
>
> -	/* Initialize the m64_wins to IODA_INVALID_M64 */
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
> -		for (j = 0; j < M64_PER_IOV; j++)
> -			pdn->m64_wins[i][j] = IODA_INVALID_M64;
> +	if (pdn->m64_single_mode)


This is a physical function's @pdn, right?


> +		m64_bars = num_vfs;
> +	else
> +		m64_bars = 1;
> +
> +	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);


Assume we have an SRIOV device with 16 VFs.
So it was m64_wins[6][4], now it is (roughly speaking) m64_map[6][16] (for 
a single PE mode) or m64_map[6][1]. I believe m64_bars cannot be bigger 
than 16 on PHB3, right? Is this checked anywhere (does it have to)?

This m64_wins -> m64_map change - it was not a map before (what was it?),
and now it is, isn't it?

What does it store? An index of M64 BAR (0..15)?



> +	if (!pdn->m64_map)
> +		return -ENOMEM;
> +	/* Initialize the m64_map to IODA_INVALID_M64 */
> +	for (i = 0; i < m64_bars ; i++)
> +		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
> +			pdn->m64_map[i][j] = IODA_INVALID_M64;
>
> -	if (pdn->m64_per_iov == M64_PER_IOV) {
> -		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
> -		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
> -			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
> -	} else {
> -		vf_groups = 1;
> -		vf_per_group = 1;
> -	}
>
>   	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>   		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>   		if (!res->flags || !res->parent)
>   			continue;
>
> -		for (j = 0; j < vf_groups; j++) {
> +		for (j = 0; j < m64_bars; j++) {
>   			do {
>   				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>   						phb->ioda.m64_bar_idx + 1, 0);
> @@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>   					goto m64_failed;
>   			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
>
> -			pdn->m64_wins[i][j] = win;
> +			pdn->m64_map[j][i] = win;
>
> -			if (pdn->m64_per_iov == M64_PER_IOV) {
> +			if (pdn->m64_single_mode) {
>   				size = pci_iov_resource_size(pdev,
>   							PCI_IOV_RESOURCES + i);
> -				size = size * vf_per_group;
>   				start = res->start + size * j;
>   			} else {
>   				size = resource_size(res);
> @@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>   			}
>
>   			/* Map the M64 here */
> -			if (pdn->m64_per_iov == M64_PER_IOV) {
> +			if (pdn->m64_single_mode) {
>   				pe_num = pdn->offset + j;
>   				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>   						pe_num, OPAL_M64_WINDOW_TYPE,
> -						pdn->m64_wins[i][j], 0);
> +						pdn->m64_map[j][i], 0);
>   			}
>
>   			rc = opal_pci_set_phb_mem_window(phb->opal_id,
>   						 OPAL_M64_WINDOW_TYPE,
> -						 pdn->m64_wins[i][j],
> +						 pdn->m64_map[j][i],
>   						 start,
>   						 0, /* unused */
>   						 size);
> @@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>   				goto m64_failed;
>   			}
>
> -			if (pdn->m64_per_iov == M64_PER_IOV)
> +			if (pdn->m64_single_mode)
>   				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
> +				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
>   			else
>   				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
> +				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
>
>   			if (rc != OPAL_SUCCESS) {
>   				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
> @@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>   	return 0;
>
>   m64_failed:
> -	pnv_pci_vf_release_m64(pdev);
> +	pnv_pci_vf_release_m64(pdev, num_vfs);
>   	return -EBUSY;
>   }
>
> @@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>   	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>   }
>
> -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>   {
>   	struct pci_bus        *bus;
>   	struct pci_controller *hose;
>   	struct pnv_phb        *phb;
>   	struct pnv_ioda_pe    *pe, *pe_n;
>   	struct pci_dn         *pdn;
> -	u16                    vf_index;
> -	int64_t                rc;
>
>   	bus = pdev->bus;
>   	hose = pci_bus_to_host(bus);
> @@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>   	if (!pdev->is_physfn)
>   		return;
>
> -	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
> -		int   vf_group;
> -		int   vf_per_group;
> -		int   vf_index1;
> -
> -		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
> -
> -		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
> -			for (vf_index = vf_group * vf_per_group;
> -				vf_index < (vf_group + 1) * vf_per_group &&
> -				vf_index < num_vfs;
> -				vf_index++)
> -				for (vf_index1 = vf_group * vf_per_group;
> -					vf_index1 < (vf_group + 1) * vf_per_group &&
> -					vf_index1 < num_vfs;
> -					vf_index1++){
> -
> -					rc = opal_pci_set_peltv(phb->opal_id,
> -						pdn->offset + vf_index,
> -						pdn->offset + vf_index1,
> -						OPAL_REMOVE_PE_FROM_DOMAIN);
> -
> -					if (rc)
> -					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
> -						__func__,
> -						pdn->offset + vf_index1, rc);
> -				}
> -	}
> -
>   	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>   		if (pe->parent_dev != pdev)
>   			continue;
> @@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>   	num_vfs = pdn->num_vfs;
>
>   	/* Release VF PEs */
> -	pnv_ioda_release_vf_PE(pdev, num_vfs);
> +	pnv_ioda_release_vf_PE(pdev);
>
>   	if (phb->type == PNV_PHB_IODA2) {
> -		if (pdn->m64_per_iov == 1)
> +		if (!pdn->m64_single_mode)
>   			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>
>   		/* Release M64 windows */
> -		pnv_pci_vf_release_m64(pdev);
> +		pnv_pci_vf_release_m64(pdev, num_vfs);
>
>   		/* Release PE numbers */
>   		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
> @@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>   	int                    pe_num;
>   	u16                    vf_index;
>   	struct pci_dn         *pdn;
> -	int64_t                rc;
>
>   	bus = pdev->bus;
>   	hose = pci_bus_to_host(bus);
> @@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>
>   		pnv_pci_ioda2_setup_dma_pe(phb, pe);
>   	}
> -
> -	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
> -		int   vf_group;
> -		int   vf_per_group;
> -		int   vf_index1;
> -
> -		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
> -
> -		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
> -			for (vf_index = vf_group * vf_per_group;
> -			     vf_index < (vf_group + 1) * vf_per_group &&
> -			     vf_index < num_vfs;
> -			     vf_index++) {
> -				for (vf_index1 = vf_group * vf_per_group;
> -				     vf_index1 < (vf_group + 1) * vf_per_group &&
> -				     vf_index1 < num_vfs;
> -				     vf_index1++) {
> -
> -					rc = opal_pci_set_peltv(phb->opal_id,
> -						pdn->offset + vf_index,
> -						pdn->offset + vf_index1,
> -						OPAL_ADD_PE_TO_DOMAIN);
> -
> -					if (rc)
> -					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
> -						__func__,
> -						pdn->offset + vf_index1, rc);
> -				}
> -			}
> -		}
> -	}
>   }
>
>   int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
> @@ -1507,6 +1449,18 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>   			return -EBUSY;
>   		}
>
> +		/*
> +		 * On PNV_PHB_IODA2, We just have 16 M64 BARs and M64 BAR #15
> +		 * is used to cover the whole system, which leaves only 15 M64
> +		 * BAR usable for VFs.
> +		 * When M64 BAR functions in Single PE mode, this means it
> +		 * just could enable 15 VFs.
> +		 */
> +		if (pdn->m64_single_mode && num_vfs >= 16) {

Magic constant 16. Where did this 16 come from? My understanding is it 
could come from

1) hostboot or
2) OPAL or
3) architected on IODA2
4) defined in PHB3 (actually it has to be 2))

Which one is it? If 1) or 2), make it a variable; if 3), add a macro for it.
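
For illustration, if the answer is 3), a macro along these lines would do (the
name here is made up for the example; pick whatever fits the existing naming):

	/* Hypothetical name; assumes the 16 M64 BARs per PHB are an IODA2 limit */
	#define PNV_IODA2_M64_BARS	16

	if (pdn->m64_single_mode && num_vfs >= PNV_IODA2_M64_BARS) {
		dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
		return -EBUSY;
	}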


> +			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
> +			return -EBUSY;
> +		}
> +
>   		/* Calculate available PE for required VFs */
>   		mutex_lock(&phb->ioda.pe_alloc_mutex);
>   		pdn->offset = bitmap_find_next_zero_area(
> @@ -1534,7 +1488,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>   		 * the IOV BAR according to the PE# allocated to the VFs.
>   		 * Otherwise, the PE# for the VF will conflict with others.
>   		 */
> -		if (pdn->m64_per_iov == 1) {
> +		if (!pdn->m64_single_mode) {
>   			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>   			if (ret)
>   				goto m64_failed;
> @@ -1567,8 +1521,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>   	/* Allocate PCI data */
>   	add_dev_pci_data(pdev);
>
> -	pnv_pci_sriov_enable(pdev, num_vfs);
> -	return 0;
> +	return pnv_pci_sriov_enable(pdev, num_vfs);
>   }
>   #endif /* CONFIG_PCI_IOV */
>
> @@ -2761,9 +2714,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>
>   	pdn = pci_get_pdn(pdev);
>   	pdn->vfs_expanded = 0;
> +	pdn->m64_single_mode = false;
>
>   	total_vfs = pci_sriov_get_totalvfs(pdev);
> -	pdn->m64_per_iov = 1;
>   	mul = phb->ioda.total_pe;
>
>   	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> @@ -2783,8 +2736,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>   		if (size > (1 << 26)) {
>   			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>   				 i, res);
> -			pdn->m64_per_iov = M64_PER_IOV;
>   			mul = roundup_pow_of_two(total_vfs);
> +			pdn->m64_single_mode = true;
>   			break;
>   		}
>   	}
> @@ -2986,6 +2939,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
>   static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>   						      int resno)
>   {
> +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> +	struct pnv_phb *phb = hose->private_data;
>   	struct pci_dn *pdn = pci_get_pdn(pdev);
>   	resource_size_t align;
>
> @@ -2994,12 +2949,25 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>   	 * SR-IOV. While from hardware perspective, the range mapped by M64
>   	 * BAR should be size aligned.
>   	 *
> +	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the hardware
> +	 * restriction to alignment is gone.


Gone? Doesn't the BAR still have to be aligned to its size?


> But if just use the VF BAR size
> +	 * as the alignment, PF BAR / VF BAR may be allocated with in one M64
> +	 * segment,


I thought each VF gets its own _segment_, am I wrong?


> which introduces the PE conflict between PF and VF. Based
> +	 * on this the minimum alignment of an IOV BAR is m64_segsize.
 >
> +	 *
>   	 * This function return the total IOV BAR size if expanded or just the
> -	 * individual size if not.
> +	 * individual size if not, when M64 BAR is in Shared PE mode.
> +	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
> +	 * m64_size if IOV BAR size is less.
>   	 */
>   	align = pci_iov_resource_size(pdev, resno);
> -	if (pdn->vfs_expanded)
> -		return pdn->vfs_expanded * align;
> +	if (pdn->vfs_expanded) {
> +		if (pdn->m64_single_mode)
> +			return max(align,
> +				(resource_size_t)phb->ioda.m64_segsize);
> +		else
> +			return pdn->vfs_expanded * align;
> +	}
>
>   	return align;
>   }
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-06  9:36               ` Wei Yang
@ 2015-08-06 10:07                 ` Gavin Shan
  2015-08-07  1:48                   ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-06 10:07 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 05:36:02PM +0800, Wei Yang wrote:
>On Thu, Aug 06, 2015 at 03:20:25PM +1000, Gavin Shan wrote:
>>On Wed, Aug 05, 2015 at 09:25:00AM +0800, Wei Yang wrote:
>>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>>By doing so, several VFs would be in one VF Group and leads to interference
>>>between VFs in the same group.
>>>
>>>This patch changes the design by using one M64 BAR in Single PE mode for
>>>one VF BAR. This gives absolute isolation for VFs.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>---
>>> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>> arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>>> 2 files changed, 76 insertions(+), 109 deletions(-)
>>>
>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>index 712add5..8aeba4c 100644
>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>@@ -214,10 +214,9 @@ struct pci_dn {
>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>> 	int     offset;			/* PE# for the first VF PE */
>>>-#define M64_PER_IOV 4
>>>-	int     m64_per_iov;
>>>+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>> #define IODA_INVALID_M64        (-1)
>>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>>+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>
>>It can be explicit? For example:
>>
>>	int	*m64_map;
>>
>>	/* Initialization */
>>	size_t size = sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_of_max_VFs;
>>	pdn->m64_map = kmalloc(size, GFP_KERNEL);
>>	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>		for (j = 0; j < num_of_max_VFs; j++)
>>			pdn->m64_map[i * PCI_SRIOV_NUM_BARS + j] = PNV_INVALID_M64;
>>
>>	/* Destroy */
>>	int step = 1;
>>
>>	if (!pdn->m64_single_mode)
>>		step = phb->ioda.total_pe;
>>	for (i = 0; i < PCI_SRIOV_NUM_BARS * num_of_max_VFs; i += step)
>>		if (pdn->m64_map[i] == PNV_INVALID_M64)
>>			continue;
>>
>>		/* Unmap the window */
>>	
>
>The m64_map is a pointer to an array with 6 elements, which represents the 6
>M64 BAR index for the 6 VF BARs.
>
>    When we use Shared Mode, one array is allocated. The six elements
>    represents the six M64 BAR(at most) used to map the whole IOV BAR.
>
>    When we use Single Mode, num_vfs array is allocate. Each array represents
>    the map between one VF's BAR and M64 BAR index.
>
>During the map and un-map, M64 BAR is assigned one by one in VF BAR's order.
>So I think the code is explicit.
>
>In your code, you allocate a big one dimension array to hold the M64 BAR
>index. It works, while I don't think this is more explicit than original code.
>

When M64 is in Single Mode, an array with (num_vfs * 6) entries is allocated
because every VF BAR (6 at most) will have one corresponding PHB M64 BAR.
Anything I missed?

The point of my code is that you needn't worry about the mode (single vs shared).
As I said, not too much memory is wasted. However, it's up to you.

I'm not a fan of "int (*m64_map)[PCI_SRIOV_NUM_BARS]". Instead, you can replace
it with "int *m64_map" and calculate its size with one of the following
formulas, depending on the mode:

	sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS;		/* Shared mode */

	sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_vfs;	/* Single mode */
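
For illustration only, the Single mode case would then be set up roughly like
this (untested sketch, reusing IODA_INVALID_M64 from your patch):

	/* Sketch: the entry for (vf, bar) lives at vf * PCI_SRIOV_NUM_BARS + bar */
	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) *
			       PCI_SRIOV_NUM_BARS * num_vfs, GFP_KERNEL);
	if (!pdn->m64_map)
		return -ENOMEM;

	for (i = 0; i < num_vfs; i++)
		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
			pdn->m64_map[i * PCI_SRIOV_NUM_BARS + j] = IODA_INVALID_M64;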

>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-06  9:41                 ` Wei Yang
@ 2015-08-06 10:15                   ` Alexey Kardashevskiy
  2015-08-07  1:36                     ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-06 10:15 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, linuxppc-dev

On 08/06/2015 07:41 PM, Wei Yang wrote:
> On Thu, Aug 06, 2015 at 07:00:00PM +1000, Alexey Kardashevskiy wrote:
>> On 08/06/2015 02:51 PM, Gavin Shan wrote:
>>> On Wed, Aug 05, 2015 at 09:24:59AM +0800, Wei Yang wrote:
>>>> The alignment of IOV BAR on PowerNV platform is the total size of the IOV
>>>> BAR. No matter whether the IOV BAR is truncated or not, the total size
>>>> could be calculated by (vfs_expanded * VF size).
>>>>
>>>
>>> s/VF size/VF BAR size
>>>
>>> I think the changelog would be more explicit:
>>>
>>> The alignment of IOV BAR on PowerNV platform is the total size of the
>>> IOV BAR, no matter whether the IOV BAR is extended with number of max
>>> VFs or number of max PE number (256). The alignment can be calculated
>>> by (vfs_expaned * VF_BAR_size).
>>
>>
>>
>> Is that really a PowerNV-specific requirement or it is valid for
>> every platform (I suspect this is the case here)?
>>
>
> Currently, it is PowerNV-specific.


How is x86 different on this matter?
Why would we need this extra alignment rather than just the VF BAR's alignment?


>>
>> Also, what is the exact meaning of "expanded" in @vfs_expanded? It is
>> either 255 (if individual VF BARs are <= 64MB) or
>> roundup_pow_of_two(total_vfs) (which is something like 4 or 16). What
>> is expanded here?
>>
>
> PF's IOV BAR original size is (VF BAR size * total_vfs).
>
> After expanding, the IOV BAR size  is (VF BAR size * 256) or (VF BAR size *
> roundup_pow_of_two(total_vfs)).


Ufff, got it now. I'd store just the expanded IOV BAR size (not some magic
VF count) because that is what it actually is:
pdn->vfs_expanded * align
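
To make the arithmetic concrete (numbers picked only for illustration): with a
1MB VF BAR and total_vfs = 4, align = pci_iov_resource_size() = 1MB,
vfs_expanded is 256 in shared mode or roundup_pow_of_two(4) = 4 in single mode,
so pdn->vfs_expanded * align is 256MB or 4MB respectively.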


>>
>>>
>>>> This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
>>>> first case.
>>>>
>>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>
>>> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>>
>>>> ---
>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
>>>> 1 file changed, 9 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> index 9b41dba..7192e62 100644
>>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> @@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>>> 						      int resno)
>>>> {
>>>> 	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>> -	resource_size_t align, iov_align;
>>>> -
>>>> -	iov_align = resource_size(&pdev->resource[resno]);
>>>> -	if (iov_align)
>>>> -		return iov_align;
>>>> +	resource_size_t align;
>>>>
>>>> +	/*
>>>> +	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
>>>> +	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>>> +	 * BAR should be size aligned.
>>>> +	 *
>>>> +	 * This function return the total IOV BAR size if expanded or just the
>>>> +	 * individual size if not.
>>>> +	 */
>>>> 	align = pci_iov_resource_size(pdev, resno);
>>>> 	if (pdn->vfs_expanded)
>>>> 		return pdn->vfs_expanded * align;
>>>> --
>>>> 1.7.9.5
>>>>
>>>
>>
>>
>> --
>> Alexey
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06  7:47                   ` Alexey Kardashevskiy
@ 2015-08-06 11:07                     ` Gavin Shan
  2015-08-06 14:13                     ` Wei Yang
  1 sibling, 0 replies; 56+ messages in thread
From: Gavin Shan @ 2015-08-06 11:07 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Gavin Shan, Wei Yang, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 05:47:42PM +1000, Alexey Kardashevskiy wrote:
>On 08/06/2015 04:57 PM, Gavin Shan wrote:
>>On Thu, Aug 06, 2015 at 04:10:21PM +1000, Alexey Kardashevskiy wrote:
>>>On 08/06/2015 02:35 PM, Gavin Shan wrote:
>>>>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>>>M64 windwo, which means M64 BAR can't work on it.
>
>
>The proper text would be something like this:
>
>===
>SRIOV only supports 64bit MMIO. So if we fail to assign 64bit BAR, we cannot
>enable the device.
>===
>
>
>>>>>
>>>>
>>>>s/PHB_IODA2/PHB3
>>>
>>>
>>>No, it is IODA2. OPEL does PHB3-specific bits, the host kernel just uses OPAL.
>>>
>>
>>Ok.
>>
>>>
>>>>s/windwo/window
>>>>
>>>>>This patch makes this explicit.
>>>>>
>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>
>>>>The idea sounds right, but there is one question as below.
>>>>
>>>>>---
>>>>>arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>>>1 file changed, 9 insertions(+), 16 deletions(-)
>>>>>
>>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>index 5738d31..9b41dba 100644
>>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>		if (!res->flags || !res->parent)
>>>>>			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>>		/*
>>>>>		 * The actual IOV BAR range is determined by the start address
>>>>>		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>		if (!res->flags || !res->parent)
>>>>>			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>>		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>>>		res2 = *res;
>>>>>		res->start += size * offset;
>>>>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>>		if (!res->flags || !res->parent)
>>>>>			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>>		for (j = 0; j < vf_groups; j++) {
>>>>>			do {
>>>>>				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>>	pdn = pci_get_pdn(pdev);
>>>>>
>>>>>	if (phb->type == PNV_PHB_IODA2) {
>>>>>+		if (!pdn->vfs_expanded) {
>>>>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>>>+				" with non M64 VF BAR\n");
>>>>>+			return -EBUSY;
>>>>>+		}
>>>>>+
>>>>
>>>>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>>>>unavailable. For this case, the VFs are permanently unavailable because of
>>>>running out of space to accomodate M64 and non-M64 VF BARs.
>>>>
>>>>The error message could be printed with dev_warn() and it would be precise
>>>>as below or something else you prefer:
>>>>
>>>>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>>
>>>
>>>Both messages are cryptic.
>>>
>>>If it is not M64 BAR, then what is it? It is always in one of M64 BARs (in
>>>the worst case - BAR#15?), the difference is if it is segmented or not, no?
>>>
>>
>>The VF BAR could be one of IO, M32, M64. If it's not M64, the VFs are supposed
>>to be disabled and the (IO and M32) resources won't be allocted, but for sure,
>>the IO and M32 resources can't be put into any one of the 16 PHB's M64 BARs.
>>would you recommend one better message then?
>
>
>
>dev_warn(&pdev->dev, "SRIOV is disabled as no space is left in 64bit MMIO
>window\n");
>
>Or it is not "MMIO window"?
>

It's a confusing message: it's not about space or the M64 window.
When any VF BAR is IO or M32, we just give up attempting to allocate
resources for it. I still think my original message is enough, or something
like the one below:

	dev_warn(&pdev->dev, "Disabled SRIOV because of non-M64 BAR\n");

>>
>>>>
>>>>
>>>>>		/* Calculate available PE for required VFs */
>>>>>		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>>>		pdn->offset = bitmap_find_next_zero_area(
>>>>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>		if (!res->flags || res->parent)
>>>>>			continue;
>>>>>		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>>>+					" non M64 VF BAR%d: %pR. \n",
>>>>>				 i, res);
>>>>>-			continue;
>>>>>+			return;
>>>>>		}
>>>>>
>>>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>>		if (!res->flags || res->parent)
>>>>>			continue;
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>>>-				 i, res);
>>>>>-			continue;
>>>>>-		}
>>>>
>>>>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>>>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>>>I think it can be avoided.
>>>>
>>>>>
>>>>>		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>
>
>-- 
>Alexey
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-06  5:36             ` Gavin Shan
@ 2015-08-06 13:41               ` Wei Yang
  2015-08-07  1:36                 ` Gavin Shan
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-06 13:41 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>discrete.
>>
>>This patch restructures the patch to allocate discrete PE# for VFs when M64
>>BAR is set to Single PE mode.
>>
>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>---
>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>
>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>index 8aeba4c..72415c7 100644
>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>@@ -213,7 +213,7 @@ struct pci_dn {
>> #ifdef CONFIG_PCI_IOV
>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>> 	u16     num_vfs;		/* number of VFs enabled*/
>>-	int     offset;			/* PE# for the first VF PE */
>>+	int     *offset;		/* PE# for the first VF PE or array */
>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>> #define IODA_INVALID_M64        (-1)
>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>
>how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>all of them will be used, not too much memory will be wasted.
>

Thanks for your comment.

I have thought about changing the name to make it more self-explanatory. But
another fact I want to take into account is that this field is also used to
reflect the shift offset when the M64 BAR is used in Shared Mode. So I kept
the name.

How about using an "enum": one keeps the name "offset", the other is renamed to
"pe_num_map", and we use the meaningful name in the proper place?

>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 4042303..9953829 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -1243,7 +1243,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>
>> 			/* Map the M64 here */
>> 			if (pdn->m64_single_mode) {
>>-				pe_num = pdn->offset + j;
>>+				pe_num = pdn->offset[j];
>> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>> 						pe_num, OPAL_M64_WINDOW_TYPE,
>> 						pdn->m64_map[j][i], 0);
>>@@ -1347,7 +1347,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>> 	struct pnv_phb        *phb;
>> 	struct pci_dn         *pdn;
>> 	struct pci_sriov      *iov;
>>-	u16 num_vfs;
>>+	u16                    num_vfs, i;
>>
>> 	bus = pdev->bus;
>> 	hose = pci_bus_to_host(bus);
>>@@ -1361,14 +1361,18 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>
>> 	if (phb->type == PNV_PHB_IODA2) {
>> 		if (!pdn->m64_single_mode)
>>-			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>+			pnv_pci_vf_resource_shift(pdev, -*pdn->offset);
>>
>> 		/* Release M64 windows */
>> 		pnv_pci_vf_release_m64(pdev, num_vfs);
>>
>> 		/* Release PE numbers */
>>-		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>-		pdn->offset = 0;
>>+		if (pdn->m64_single_mode) {
>>+			for (i = 0; i < num_vfs; i++)
>>+				pnv_ioda_free_pe(phb, pdn->offset[i]);
>>+		} else
>>+			bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>>+		kfree(pdn->offset);
>
>Can pnv_ioda_free_pe() be reused to release PE ?

You mean use it for the similar thing done in pnv_ioda_deconfigure_pe()?

>
>> 	}
>> }
>>
>>@@ -1394,7 +1398,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>
>> 	/* Reserve PE for each VF */
>> 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
>>-		pe_num = pdn->offset + vf_index;
>>+		if (pdn->m64_single_mode)
>>+			pe_num = pdn->offset[vf_index];
>>+		else
>>+			pe_num = *pdn->offset + vf_index;
>>
>> 		pe = &phb->ioda.pe_array[pe_num];
>> 		pe->pe_number = pe_num;
>>@@ -1436,6 +1443,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 	struct pnv_phb        *phb;
>> 	struct pci_dn         *pdn;
>> 	int                    ret;
>>+	u16                    i;
>>
>> 	bus = pdev->bus;
>> 	hose = pci_bus_to_host(bus);
>>@@ -1462,19 +1470,38 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 		}
>>
>> 		/* Calculate available PE for required VFs */
>>-		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>-		pdn->offset = bitmap_find_next_zero_area(
>>-			phb->ioda.pe_alloc, phb->ioda.total_pe,
>>-			0, num_vfs, 0);
>>-		if (pdn->offset >= phb->ioda.total_pe) {
>>+		if (pdn->m64_single_mode) {
>>+			pdn->offset = kmalloc(sizeof(*pdn->offset) * num_vfs,
>>+					GFP_KERNEL);
>>+			if (!pdn->offset)
>>+				return -ENOMEM;
>>+			for (i = 0; i < num_vfs; i++)
>>+				pdn->offset[i] = IODA_INVALID_PE;
>>+			for (i = 0; i < num_vfs; i++) {
>>+				pdn->offset[i] = pnv_ioda_alloc_pe(phb);
>>+				if (pdn->offset[i] == IODA_INVALID_PE) {
>>+					ret = -EBUSY;
>>+					goto m64_failed;
>>+				}
>>+			}
>>+		} else {
>>+			pdn->offset = kmalloc(sizeof(*pdn->offset), GFP_KERNEL);
>>+			if (!pdn->offset)
>>+				return -ENOMEM;
>>+			mutex_lock(&phb->ioda.pe_alloc_mutex);
>>+			*pdn->offset = bitmap_find_next_zero_area(
>>+				phb->ioda.pe_alloc, phb->ioda.total_pe,
>>+				0, num_vfs, 0);
>>+			if (*pdn->offset >= phb->ioda.total_pe) {
>>+				mutex_unlock(&phb->ioda.pe_alloc_mutex);
>>+				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
>>+				kfree(pdn->offset);
>>+				return -EBUSY;
>>+			}
>>+			bitmap_set(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>> 			mutex_unlock(&phb->ioda.pe_alloc_mutex);
>>-			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
>>-			pdn->offset = 0;
>>-			return -EBUSY;
>> 		}
>>-		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>> 		pdn->num_vfs = num_vfs;
>>-		mutex_unlock(&phb->ioda.pe_alloc_mutex);
>>
>> 		/* Assign M64 window accordingly */
>> 		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
>>@@ -1489,7 +1516,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 		 * Otherwise, the PE# for the VF will conflict with others.
>> 		 */
>> 		if (!pdn->m64_single_mode) {
>>-			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>>+			ret = pnv_pci_vf_resource_shift(pdev, *pdn->offset);
>> 			if (ret)
>> 				goto m64_failed;
>> 		}
>>@@ -1501,8 +1528,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 	return 0;
>>
>> m64_failed:
>>-	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>-	pdn->offset = 0;
>>+	if (pdn->m64_single_mode) {
>>+		for (i = 0; i < num_vfs; i++)
>>+			pnv_ioda_free_pe(phb, pdn->offset[i]);
>>+	} else
>>+		bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>>+	kfree(pdn->offset);
>>
>> 	return ret;
>> }
>>-- 
>>1.7.9.5
>>

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-06  4:51             ` Gavin Shan
  2015-08-06  9:00               ` Alexey Kardashevskiy
@ 2015-08-06 13:49               ` Wei Yang
  2015-08-07  1:08                 ` Gavin Shan
  1 sibling, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-06 13:49 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 02:51:40PM +1000, Gavin Shan wrote:
>On Wed, Aug 05, 2015 at 09:24:59AM +0800, Wei Yang wrote:
>>The alignment of IOV BAR on PowerNV platform is the total size of the IOV
>>BAR. No matter whether the IOV BAR is truncated or not, the total size
>>could be calculated by (vfs_expanded * VF size).
>>
>
>s/VF size/VF BAR size
>
>I think the changelog would be more explicit:
>
>The alignment of IOV BAR on PowerNV platform is the total size of the
>IOV BAR, no matter whether the IOV BAR is extended with number of max
>VFs or number of max PE number (256). The alignment can be calculated

number of max VFs is not correct. This should be
roundup_pow_of_two(total_vfs).
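
(For example, a device with total_vfs = 6 gets its IOV BAR expanded by
roundup_pow_of_two(6) = 8 VF BAR sizes, not 6.)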

The others look good to me.

>by (vfs_expaned * VF_BAR_size).
>
>>This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
>>first case.
>>
>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>
>Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>>---
>> arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
>> 1 file changed, 9 insertions(+), 5 deletions(-)
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 9b41dba..7192e62 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>> 						      int resno)
>> {
>> 	struct pci_dn *pdn = pci_get_pdn(pdev);
>>-	resource_size_t align, iov_align;
>>-
>>-	iov_align = resource_size(&pdev->resource[resno]);
>>-	if (iov_align)
>>-		return iov_align;
>>+	resource_size_t align;
>>
>>+	/*
>>+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
>>+	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>+	 * BAR should be size aligned.
>>+	 *
>>+	 * This function return the total IOV BAR size if expanded or just the
>>+	 * individual size if not.
>>+	 */
>> 	align = pci_iov_resource_size(pdev, resno);
>> 	if (pdn->vfs_expanded)
>> 		return pdn->vfs_expanded * align;
>>-- 
>>1.7.9.5
>>

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one
  2015-08-06  5:28             ` Gavin Shan
@ 2015-08-06 14:03               ` Wei Yang
  2015-08-07  1:23                 ` Gavin Shan
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-06 14:03 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 03:28:51PM +1000, Gavin Shan wrote:
>On Wed, Aug 05, 2015 at 09:25:02AM +0800, Wei Yang wrote:
>>Each VF could have 6 BARs at most. When the total BAR size exceeds the
>>gate, after expanding it will also exhaust the M64 Window.
>>
>>This patch limits the boundary by checking the total VF BAR size instead of
>>the individual BAR.
>>
>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>
>Ok. I didn't look at this when giving comments to last patch. It turns
>you have the change in this patch. Please merge it with the previous
>patch.
>

Hmm... I prefer to have them in two patches. One focuses on the calculation of
the gate and the other focuses on checking the total VF BAR size. This makes
the change easier to track.

>>---
>> arch/powerpc/platforms/powernv/pci-ioda.c |   13 +++++++------
>> 1 file changed, 7 insertions(+), 6 deletions(-)
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 31dcedc..4042303 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 	struct pnv_phb *phb;
>> 	struct resource *res;
>> 	int i;
>>-	resource_size_t size, gate;
>>+	resource_size_t size, gate, total_vf_bar_sz;
>> 	struct pci_dn *pdn;
>> 	int mul, total_vfs;
>>
>>@@ -2729,6 +2729,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 	 * Window and limit the system flexibility.
>> 	 */
>> 	gate = phb->ioda.m64_segsize >> 1;
>>+	total_vf_bar_sz = 0;
>>
>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>@@ -2741,13 +2742,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 			return;
>> 		}
>>
>>-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>+		total_vf_bar_sz += pci_iov_resource_size(pdev,
>>+				i + PCI_IOV_RESOURCES);
>>
>> 		/* bigger than or equal to gate */
>>-		if (size >= gate) {
>>-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
>>-				"is bigger than %lld, roundup power2\n",
>>-				 i, res, gate);
>>+		if (total_vf_bar_sz >= gate) {
>>+			dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size "
>>+				"is bigger than %lld, roundup power2\n", gate);
>> 			mul = roundup_pow_of_two(total_vfs);
>> 			pdn->m64_single_mode = true;
>> 			break;
>>-- 
>>1.7.9.5
>>

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06  4:35             ` Gavin Shan
  2015-08-06  6:10               ` Alexey Kardashevskiy
@ 2015-08-06 14:10               ` Wei Yang
  2015-08-07  1:20                 ` Gavin Shan
  1 sibling, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-06 14:10 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 02:35:57PM +1000, Gavin Shan wrote:
>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>M64 windwo, which means M64 BAR can't work on it.
>>
>
>s/PHB_IODA2/PHB3
>s/windwo/window
>
>>This patch makes this explicit.
>>
>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>
>The idea sounds right, but there is one question as below.
>
>>---
>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 5738d31..9b41dba 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>> 		if (!res->flags || !res->parent)
>> 			continue;
>>
>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>-			continue;
>>-
>> 		/*
>> 		 * The actual IOV BAR range is determined by the start address
>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>> 		if (!res->flags || !res->parent)
>> 			continue;
>>
>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>-			continue;
>>-
>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>> 		res2 = *res;
>> 		res->start += size * offset;
>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>> 		if (!res->flags || !res->parent)
>> 			continue;
>>
>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>-			continue;
>>-
>> 		for (j = 0; j < vf_groups; j++) {
>> 			do {
>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>> 	pdn = pci_get_pdn(pdev);
>>
>> 	if (phb->type == PNV_PHB_IODA2) {
>>+		if (!pdn->vfs_expanded) {
>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>+				" with non M64 VF BAR\n");
>>+			return -EBUSY;
>>+		}
>>+
>
>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>unavailable. For this case, the VFs are permanently unavailable because of
>running out of space to accomodate M64 and non-M64 VF BARs.
>
>The error message could be printed with dev_warn() and it would be precise
>as below or something else you prefer:
>
>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>

Thanks for the comment, will change accordingly.

>
>> 		/* Calculate available PE for required VFs */
>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>> 		pdn->offset = bitmap_find_next_zero_area(
>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 		if (!res->flags || res->parent)
>> 			continue;
>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>+					" non M64 VF BAR%d: %pR. \n",
>> 				 i, res);
>>-			continue;
>>+			return;
>> 		}
>>
>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>> 		if (!res->flags || res->parent)
>> 			continue;
>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>-				 i, res);
>>-			continue;
>>-		}
>
>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>I think it can be avoided.
>

I don't get your point. Do you mean to avoid this function?

Or to clear the IOV BARs when we find one of them is non-M64?

>>
>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>-- 
>>1.7.9.5
>>

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06  7:47                   ` Alexey Kardashevskiy
  2015-08-06 11:07                     ` Gavin Shan
@ 2015-08-06 14:13                     ` Wei Yang
  2015-08-07  1:24                       ` Alexey Kardashevskiy
  1 sibling, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-06 14:13 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Gavin Shan, Wei Yang, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 05:47:42PM +1000, Alexey Kardashevskiy wrote:
>On 08/06/2015 04:57 PM, Gavin Shan wrote:
>>On Thu, Aug 06, 2015 at 04:10:21PM +1000, Alexey Kardashevskiy wrote:
>>>On 08/06/2015 02:35 PM, Gavin Shan wrote:
>>>>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>>>M64 windwo, which means M64 BAR can't work on it.
>
>
>The proper text would be something like this:
>
>===
>SRIOV only supports 64bit MMIO. So if we fail to assign 64bit BAR, we
>cannot enable the device.
>===
>
>
>>>>>
>>>>
>>>>s/PHB_IODA2/PHB3
>>>
>>>
>>>No, it is IODA2. OPEL does PHB3-specific bits, the host kernel just uses OPAL.
>>>
>>
>>Ok.
>>
>>>
>>>>s/windwo/window
>>>>
>>>>>This patch makes this explicit.
>>>>>
>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>
>>>>The idea sounds right, but there is one question as below.
>>>>
>>>>>---
>>>>>arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>>>1 file changed, 9 insertions(+), 16 deletions(-)
>>>>>
>>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>index 5738d31..9b41dba 100644
>>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>		if (!res->flags || !res->parent)
>>>>>			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>>		/*
>>>>>		 * The actual IOV BAR range is determined by the start address
>>>>>		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>		if (!res->flags || !res->parent)
>>>>>			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>>		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>>>		res2 = *res;
>>>>>		res->start += size * offset;
>>>>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>>		if (!res->flags || !res->parent)
>>>>>			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>>		for (j = 0; j < vf_groups; j++) {
>>>>>			do {
>>>>>				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>>	pdn = pci_get_pdn(pdev);
>>>>>
>>>>>	if (phb->type == PNV_PHB_IODA2) {
>>>>>+		if (!pdn->vfs_expanded) {
>>>>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>>>+				" with non M64 VF BAR\n");
>>>>>+			return -EBUSY;
>>>>>+		}
>>>>>+
>>>>
>>>>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>>>>unavailable. For this case, the VFs are permanently unavailable because of
>>>>running out of space to accomodate M64 and non-M64 VF BARs.
>>>>
>>>>The error message could be printed with dev_warn() and it would be precise
>>>>as below or something else you prefer:
>>>>
>>>>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>>
>>>
>>>Both messages are cryptic.
>>>
>>>If it is not M64 BAR, then what is it? It is always in one of M64 BARs (in
>>>the worst case - BAR#15?), the difference is if it is segmented or not, no?
>>>
>>
>>The VF BAR could be one of IO, M32, M64. If it's not M64, the VFs are supposed
>>to be disabled and the (IO and M32) resources won't be allocted, but for sure,
>>the IO and M32 resources can't be put into any one of the 16 PHB's M64 BARs.
>>would you recommend one better message then?
>
>
>
>dev_warn(&pdev->dev, "SRIOV is disabled as no space is left in 64bit
>MMIO window\n");
>
>Or it is not "MMIO window"?
>

The reason is not "no space left in 64bit MMIO window".

The reason is that the IOV BAR is not 64-bit prefetchable; in that case the Linux
kernel can't allocate it from the M64 space, so we can't use an M64 BAR to cover it.
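
For reference, the check being discussed is just the flag test below (quoted
from memory from pci-ioda.c, so please double-check the exact form there):

static bool pnv_pci_is_mem_pref_64(unsigned long flags)
{
	return (flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
}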

>
>
>>
>>>>
>>>>
>>>>>		/* Calculate available PE for required VFs */
>>>>>		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>>>		pdn->offset = bitmap_find_next_zero_area(
>>>>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>		if (!res->flags || res->parent)
>>>>>			continue;
>>>>>		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>>>+					" non M64 VF BAR%d: %pR. \n",
>>>>>				 i, res);
>>>>>-			continue;
>>>>>+			return;
>>>>>		}
>>>>>
>>>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>>		if (!res->flags || res->parent)
>>>>>			continue;
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>>>-				 i, res);
>>>>>-			continue;
>>>>>-		}
>>>>
>>>>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>>>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>>>I think it can be avoided.
>>>>
>>>>>
>>>>>		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-06 13:49               ` Wei Yang
@ 2015-08-07  1:08                 ` Gavin Shan
  0 siblings, 0 replies; 56+ messages in thread
From: Gavin Shan @ 2015-08-07  1:08 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 09:49:02PM +0800, Wei Yang wrote:
>On Thu, Aug 06, 2015 at 02:51:40PM +1000, Gavin Shan wrote:
>>On Wed, Aug 05, 2015 at 09:24:59AM +0800, Wei Yang wrote:
>>>The alignment of IOV BAR on PowerNV platform is the total size of the IOV
>>>BAR. No matter whether the IOV BAR is truncated or not, the total size
>>>could be calculated by (vfs_expanded * VF size).
>>>
>>
>>s/VF size/VF BAR size
>>
>>I think the changelog would be more explicit:
>>
>>The alignment of IOV BAR on PowerNV platform is the total size of the
>>IOV BAR, no matter whether the IOV BAR is extended with number of max
>>VFs or number of max PE number (256). The alignment can be calculated
>
>number of max VFs is not correct. This should be
>roundup_pow_of_two(total_vfs).
>
>Others looks good to me.
>

Yes, you're correct.

>>by (vfs_expaned * VF_BAR_size).
>>
>>>This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
>>>first case.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>
>>Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>
>>>---
>>> arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
>>> 1 file changed, 9 insertions(+), 5 deletions(-)
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index 9b41dba..7192e62 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>> 						      int resno)
>>> {
>>> 	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>-	resource_size_t align, iov_align;
>>>-
>>>-	iov_align = resource_size(&pdev->resource[resno]);
>>>-	if (iov_align)
>>>-		return iov_align;
>>>+	resource_size_t align;
>>>
>>>+	/*
>>>+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
>>>+	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>>+	 * BAR should be size aligned.
>>>+	 *
>>>+	 * This function return the total IOV BAR size if expanded or just the
>>>+	 * individual size if not.
>>>+	 */
>>> 	align = pci_iov_resource_size(pdev, resno);
>>> 	if (pdn->vfs_expanded)
>>> 		return pdn->vfs_expanded * align;
>>>-- 
>>>1.7.9.5
>>>
>
>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06 14:10               ` Wei Yang
@ 2015-08-07  1:20                 ` Gavin Shan
  2015-08-07  2:24                   ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-07  1:20 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 10:10:10PM +0800, Wei Yang wrote:
>On Thu, Aug 06, 2015 at 02:35:57PM +1000, Gavin Shan wrote:
>>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>M64 windwo, which means M64 BAR can't work on it.
>>>
>>
>>s/PHB_IODA2/PHB3
>>s/windwo/window
>>
>>>This patch makes this explicit.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>
>>The idea sounds right, but there is one question as below.
>>
>>>---
>>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index 5738d31..9b41dba 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>> 		if (!res->flags || !res->parent)
>>> 			continue;
>>>
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>-			continue;
>>>-
>>> 		/*
>>> 		 * The actual IOV BAR range is determined by the start address
>>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>> 		if (!res->flags || !res->parent)
>>> 			continue;
>>>
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>-			continue;
>>>-
>>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>> 		res2 = *res;
>>> 		res->start += size * offset;
>>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>> 		if (!res->flags || !res->parent)
>>> 			continue;
>>>
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>-			continue;
>>>-
>>> 		for (j = 0; j < vf_groups; j++) {
>>> 			do {
>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> 	pdn = pci_get_pdn(pdev);
>>>
>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>+		if (!pdn->vfs_expanded) {
>>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>+				" with non M64 VF BAR\n");
>>>+			return -EBUSY;
>>>+		}
>>>+
>>
>>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>>unavailable. For this case, the VFs are permanently unavailable because of
>>running out of space to accomodate M64 and non-M64 VF BARs.
>>
>>The error message could be printed with dev_warn() and it would be precise
>>as below or something else you prefer:
>>
>>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>
>
>Thanks for the comment, will change accordingly.
>
>>
>>> 		/* Calculate available PE for required VFs */
>>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>> 		pdn->offset = bitmap_find_next_zero_area(
>>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> 		if (!res->flags || res->parent)
>>> 			continue;
>>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>+					" non M64 VF BAR%d: %pR. \n",
>>> 				 i, res);
>>>-			continue;
>>>+			return;
>>> 		}
>>>
>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>> 		if (!res->flags || res->parent)
>>> 			continue;
>>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>-				 i, res);
>>>-			continue;
>>>-		}
>>
>>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>I think it can be avoided.
>>
>
>Don't get your point. You mean to avoid this function?
>
>Or clear the IOV BAR when we found one of it is non-M64?
>

I mean to clear all IOV BARs in case any of them are IO or M32. In this
case, the SRIOV capability won't be enabled. Otherwise, the resources for
all IOV BARs are assigned and allocated by the PCI subsystem, but they won't
be used. Does that make sense to you?
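
Something along these lines in pnv_pci_ioda_fixup_iov_resources() is what I
have in mind (just an untested sketch):

	/* Sketch: truncate every IOV BAR so the PCI core won't assign them */
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = &pdev->resource[i + PCI_IOV_RESOURCES];
		res->flags = 0;
		res->end = res->start - 1;
	}
	return;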

>>>
>>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>-- 
>>>1.7.9.5
>>>
>
>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one
  2015-08-06 14:03               ` Wei Yang
@ 2015-08-07  1:23                 ` Gavin Shan
  2015-08-07  2:25                   ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-07  1:23 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 10:03:04PM +0800, Wei Yang wrote:
>On Thu, Aug 06, 2015 at 03:28:51PM +1000, Gavin Shan wrote:
>>On Wed, Aug 05, 2015 at 09:25:02AM +0800, Wei Yang wrote:
>>>Each VF could have 6 BARs at most. When the total BAR size exceeds the
>>>gate, after expanding it will also exhaust the M64 Window.
>>>
>>>This patch limits the boundary by checking the total VF BAR size instead of
>>>the individual BAR.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>
>>Ok. I didn't look at this when giving comments to last patch. It turns
>>you have the change in this patch. Please merge it with the previous
>>patch.
>>
>
>Hmm... I prefer to have them in two patches. One focus on the calculation of
>gate and the other focus on checking the total VF BAR size. This would help
>record the change.
>

It's fine with me as well. I'll take a close look at your next revision since
you have to refresh the whole series. Is that fine with you?

>>>---
>>> arch/powerpc/platforms/powernv/pci-ioda.c |   13 +++++++------
>>> 1 file changed, 7 insertions(+), 6 deletions(-)
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index 31dcedc..4042303 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> 	struct pnv_phb *phb;
>>> 	struct resource *res;
>>> 	int i;
>>>-	resource_size_t size, gate;
>>>+	resource_size_t size, gate, total_vf_bar_sz;
>>> 	struct pci_dn *pdn;
>>> 	int mul, total_vfs;
>>>
>>>@@ -2729,6 +2729,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> 	 * Window and limit the system flexibility.
>>> 	 */
>>> 	gate = phb->ioda.m64_segsize >> 1;
>>>+	total_vf_bar_sz = 0;
>>>
>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>@@ -2741,13 +2742,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>> 			return;
>>> 		}
>>>
>>>-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>+		total_vf_bar_sz += pci_iov_resource_size(pdev,
>>>+				i + PCI_IOV_RESOURCES);
>>>
>>> 		/* bigger than or equal to gate */
>>>-		if (size >= gate) {
>>>-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
>>>-				"is bigger than %lld, roundup power2\n",
>>>-				 i, res, gate);
>>>+		if (total_vf_bar_sz >= gate) {
>>>+			dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size "
>>>+				"is bigger than %lld, roundup power2\n", gate);
>>> 			mul = roundup_pow_of_two(total_vfs);
>>> 			pdn->m64_single_mode = true;
>>> 			break;
>>>-- 
>>>1.7.9.5
>>>
>
>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-06 14:13                     ` Wei Yang
@ 2015-08-07  1:24                       ` Alexey Kardashevskiy
  0 siblings, 0 replies; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-07  1:24 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, benh, linuxppc-dev

On 08/07/2015 12:13 AM, Wei Yang wrote:
> On Thu, Aug 06, 2015 at 05:47:42PM +1000, Alexey Kardashevskiy wrote:
>> On 08/06/2015 04:57 PM, Gavin Shan wrote:
>>> On Thu, Aug 06, 2015 at 04:10:21PM +1000, Alexey Kardashevskiy wrote:
>>>> On 08/06/2015 02:35 PM, Gavin Shan wrote:
>>>>> On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>>>> On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>>>> a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>>>> M64 windwo, which means M64 BAR can't work on it.
>>
>>
>> The proper text would be something like this:
>>
>> ===
>> SRIOV only supports 64bit MMIO. So if we fail to assign 64bit BAR, we
>> cannot enable the device.
>> ===
>>
>>
>>>>>>
>>>>>
>>>>> s/PHB_IODA2/PHB3
>>>>
>>>>
>>>> No, it is IODA2. OPEL does PHB3-specific bits, the host kernel just uses OPAL.
>>>>
>>>
>>> Ok.
>>>
>>>>
>>>>> s/windwo/window
>>>>>
>>>>>> This patch makes this explicit.
>>>>>>
>>>>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>>
>>>>> The idea sounds right, but there is one question as below.
>>>>>
>>>>>> ---
>>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>>>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>> index 5738d31..9b41dba 100644
>>>>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>> @@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>> 		if (!res->flags || !res->parent)
>>>>>> 			continue;
>>>>>>
>>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>> -			continue;
>>>>>> -
>>>>>> 		/*
>>>>>> 		 * The actual IOV BAR range is determined by the start address
>>>>>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>>>> @@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>> 		if (!res->flags || !res->parent)
>>>>>> 			continue;
>>>>>>
>>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>> -			continue;
>>>>>> -
>>>>>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>>>> 		res2 = *res;
>>>>>> 		res->start += size * offset;
>>>>>> @@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>>> 		if (!res->flags || !res->parent)
>>>>>> 			continue;
>>>>>>
>>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>> -			continue;
>>>>>> -
>>>>>> 		for (j = 0; j < vf_groups; j++) {
>>>>>> 			do {
>>>>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>>> @@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>>> 	pdn = pci_get_pdn(pdev);
>>>>>>
>>>>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>>>> +		if (!pdn->vfs_expanded) {
>>>>>> +			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>>>> +				" with non M64 VF BAR\n");
>>>>>> +			return -EBUSY;
>>>>>> +		}
>>>>>> +
>>>>>
>>>>> It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>>>>> unavailable. For this case, the VFs are permanently unavailable because of
>>>>> running out of space to accomodate M64 and non-M64 VF BARs.
>>>>>
>>>>> The error message could be printed with dev_warn() and it would be precise
>>>>> as below or something else you prefer:
>>>>>
>>>>> 	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>>>
>>>>
>>>> Both messages are cryptic.
>>>>
>>>> If it is not M64 BAR, then what is it? It is always in one of M64 BARs (in
>>>> the worst case - BAR#15?), the difference is if it is segmented or not, no?
>>>>
>>>
>>> The VF BAR could be one of IO, M32, M64. If it's not M64, the VFs are supposed
>>> to be disabled and the (IO and M32) resources won't be allocted, but for sure,
>>> the IO and M32 resources can't be put into any one of the 16 PHB's M64 BARs.
>>> would you recommend one better message then?
>>
>>
>>
>> dev_warn(&pdev->dev, "SRIOV is disabled as no space is left in 64bit
>> MMIO window\n");
>>
>> Or it is not "MMIO window"?
>>
>
> The reason is not "no space left in 64bit MMIO window".
>
> The reason is the IOV BAR is not 64bit prefetchable, then in linux kernel this
> can't be allocated from M64 Space, then we can't use M64 BAR to cover it.

Oh. So now it is not M64 vs. M32 and IO, it is about prefetchable vs. 
non-prefetchable. Please choose one.

Should it be this then?
dev_warn(&pdev->dev, "Non-prefetchable BARs are not supported for SRIOV")


But Gavin keeps insisting on mentioning "non-M64 BAR" - this part I do not 
understand.

And where does this limit come from? Is it POWER8, IODA2, PHB3, SRIOV or
something else? Is it all about isolation, or can a host _without_ KVM but
with SRIOV also not use VFs if they have non-prefetchable BARs? Is this
because of POWER8 or IODA2 or PHB3 or SRIOV or something else?




>>
>>
>>>
>>>>>
>>>>>
>>>>>> 		/* Calculate available PE for required VFs */
>>>>>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>>>> 		pdn->offset = bitmap_find_next_zero_area(
>>>>>> @@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>> 		if (!res->flags || res->parent)
>>>>>> 			continue;
>>>>>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>> -			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>>>> +			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>>>> +					" non M64 VF BAR%d: %pR. \n",
>>>>>> 				 i, res);
>>>>>> -			continue;
>>>>>> +			return;
>>>>>> 		}
>>>>>>
>>>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>>> @@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>>> 		if (!res->flags || res->parent)
>>>>>> 			continue;
>>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>> -			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>>>> -				 i, res);
>>>>>> -			continue;
>>>>>> -		}
>>>>>
>>>>> When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>>>> Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>>>> I think it can be avoided.
>>>>>
>>>>>>
>>>>>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>
>>
>> --
>> Alexey
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource
  2015-08-06 10:15                   ` Alexey Kardashevskiy
@ 2015-08-07  1:36                     ` Wei Yang
  0 siblings, 0 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-07  1:36 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Wei Yang, Gavin Shan, linuxppc-dev

On Thu, Aug 06, 2015 at 08:15:55PM +1000, Alexey Kardashevskiy wrote:
>On 08/06/2015 07:41 PM, Wei Yang wrote:
>>On Thu, Aug 06, 2015 at 07:00:00PM +1000, Alexey Kardashevskiy wrote:
>>>On 08/06/2015 02:51 PM, Gavin Shan wrote:
>>>>On Wed, Aug 05, 2015 at 09:24:59AM +0800, Wei Yang wrote:
>>>>>The alignment of IOV BAR on PowerNV platform is the total size of the IOV
>>>>>BAR. No matter whether the IOV BAR is truncated or not, the total size
>>>>>could be calculated by (vfs_expanded * VF size).
>>>>>
>>>>
>>>>s/VF size/VF BAR size
>>>>
>>>>I think the changelog would be more explicit:
>>>>
>>>>The alignment of IOV BAR on PowerNV platform is the total size of the
>>>>IOV BAR, no matter whether the IOV BAR is extended with number of max
>>>>VFs or number of max PE number (256). The alignment can be calculated
>>>>by (vfs_expaned * VF_BAR_size).
>>>
>>>
>>>
>>>Is that really a PowerNV-specific requirement or it is valid for
>>>every platform (I suspect this is the case here)?
>>>
>>
>>Currently, it is PowerNV-specific.
>
>
>How is x86 different on this matter?
>Why would we need this extra alignment, not just VF's BAR alignment?
>
>

The difference lies in the "PE" isolation on Power.

From the MMIO perspective, we use the M32 BAR and M64 BARs to map an MMIO
range to a PE#. M32 is controlled with one M32 BAR, while M64 has 16 BARs.

Now let's go back to the pure PCI world. Every PCI BAR is required to be size
aligned. For example, if BAR#0 is 1MB, it must be 1MB aligned. Then let's look
at the SRIOV case. Each of the PF's IOV BARs (6 at most) holds the start
address of the total_vfs copies of the corresponding VF BAR. For example, say
VF BAR#0 is 1MB, the PF's IOV BAR#0 is assigned to address 0 and total_vfs is
8. The IOV BAR then covers the range [0 - (8MB - 1)]. When the PCI core
allocates the IOV BAR, it only makes sure the IOV BAR is 1MB aligned, since by
doing so each VF BAR is size aligned. That is what x86 (and probably other
platforms) does. I believe you understand this part.

Now back to our platform: we want to use an M64 BAR to map the IOV BAR. Take
the example above. IOV BAR#0 is 8MB in size; if we still let the PCI core
allocate it with 1MB alignment, our M64 BAR can't work, since the M64 BAR
itself must be size aligned (8MB in this case), the same concept as a PCI BAR.
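
To put rough numbers on it (the same example as above, VF BAR#0 = 1MB and
total_vfs = 8, so IOV BAR#0 = 8MB):

	alignment the generic PCI core enforces:        1MB (the VF BAR size)
	alignment we need to map it with one M64 BAR:   8MB (the whole IOV BAR size)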

>>>
>>>Also, what is the exact meaning of "expanded" in @vfs_expanded? It is
>>>either 255 (if individual VF BARs are <= 64MB) or
>>>roundup_pow_of_two(total_vfs) (which is something like 4 or 16). What
>>>is expanded here?
>>>
>>
>>PF's IOV BAR original size is (VF BAR size * total_vfs).
>>
>>After expanding, the IOV BAR size  is (VF BAR size * 256) or (VF BAR size *
>>roundup_pow_of_two(total_vfs)).
>
>
>Ufff, got it now. I'd store just an expanded IOV BAR size (not some
>magic VFs number) because this is what it actually is:
>pdn->vfs_expanded * align
>

This would change the idea, since a VF can still have 6 BARs. In your
proposal, we would need 6 variables to store the expanded size, one for each
BAR.

>
>>>
>>>>
>>>>>This patch simplifies the pnv_pci_iov_resource_alignment() by removing the
>>>>>first case.
>>>>>
>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>
>>>>Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>>>
>>>>>---
>>>>>arch/powerpc/platforms/powernv/pci-ioda.c |   14 +++++++++-----
>>>>>1 file changed, 9 insertions(+), 5 deletions(-)
>>>>>
>>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>index 9b41dba..7192e62 100644
>>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>@@ -2987,12 +2987,16 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>>>>						      int resno)
>>>>>{
>>>>>	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>>-	resource_size_t align, iov_align;
>>>>>-
>>>>>-	iov_align = resource_size(&pdev->resource[resno]);
>>>>>-	if (iov_align)
>>>>>-		return iov_align;
>>>>>+	resource_size_t align;
>>>>>
>>>>>+	/*
>>>>>+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
>>>>>+	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>>>>+	 * BAR should be size aligned.
>>>>>+	 *
>>>>>+	 * This function return the total IOV BAR size if expanded or just the
>>>>>+	 * individual size if not.
>>>>>+	 */
>>>>>	align = pci_iov_resource_size(pdev, resno);
>>>>>	if (pdn->vfs_expanded)
>>>>>		return pdn->vfs_expanded * align;
>>>>>--
>>>>>1.7.9.5
>>>>>
>>>>
>>>
>>>
>>>--
>>>Alexey
>>
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-06 13:41               ` Wei Yang
@ 2015-08-07  1:36                 ` Gavin Shan
  2015-08-07  2:33                   ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-07  1:36 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 09:41:41PM +0800, Wei Yang wrote:
>On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>>On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>>discrete.
>>>
>>>This patch restructures the patch to allocate discrete PE# for VFs when M64
>>>BAR is set to Single PE mode.
>>>
>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>---
>>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>>
>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>index 8aeba4c..72415c7 100644
>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>@@ -213,7 +213,7 @@ struct pci_dn {
>>> #ifdef CONFIG_PCI_IOV
>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>-	int     offset;			/* PE# for the first VF PE */
>>>+	int     *offset;		/* PE# for the first VF PE or array */
>>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>> #define IODA_INVALID_M64        (-1)
>>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>
>>how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>>I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>>all of them will be used, not too much memory will be wasted.
>>
>
>Thanks for your comment.
>
>I have thought about change the name to make it more self explain. While
>another fact I want to take in is this field is also used to be reflect the
>shift offset when M64 BAR is used in the Shared Mode. So I maintain the name.
>
>How about use "enum", one maintain the name "offset", and another one rename to
>"pe_num_map". And use the meaningful name at proper place?
>

Ok. I'm explaining it in more detail. There are two cases: single vs shared
mode. When the PHB M64 BARs run in single mode, you need an array to track the
allocated discrete PE#s; the VF index is the index into the array. When the
PHB M64 BARs run in shared mode, you need continuous PE#s and no array is
required; instead, the starting PE# can simply be stored in pdn->offset[0].

So when allocating memory for this array, simply allocate (sizeof(*pdn->offset)
* max_vf_num) no matter what mode the PHB's M64 BARs will run in. The point is
that nobody can enable (max_vf_num + 1) VFs.

With the above approach, the arrays for PE# and M64 BAR remapping needn't be
allocated when enabling the SRIOV capability and released when disabling it.
Instead, those two arrays can be allocated at resource fixup time and freed
when the pdn is destroyed.
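
A minimal sketch of what I have in mind (helper names as in your patch;
"max_vf_num" is just shorthand for whatever pci_sriov_get_totalvfs() returns,
so treat this as an illustration rather than the final code):

	/* once, at resource fixup time, regardless of mode */
	pdn->offset = kcalloc(max_vf_num, sizeof(*pdn->offset), GFP_KERNEL);

	/* later, when enabling SRIOV */
	if (pdn->m64_single_mode) {
		/* single mode: one discrete PE# per VF index */
		for (i = 0; i < num_vfs; i++)
			pdn->offset[i] = pnv_ioda_alloc_pe(phb);
	} else {
		/* shared mode: only the starting PE# is recorded */
		pdn->offset[0] = bitmap_find_next_zero_area(
				phb->ioda.pe_alloc, phb->ioda.total_pe,
				0, num_vfs, 0);
		bitmap_set(phb->ioda.pe_alloc, pdn->offset[0], num_vfs);
	}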

>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index 4042303..9953829 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -1243,7 +1243,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>
>>> 			/* Map the M64 here */
>>> 			if (pdn->m64_single_mode) {
>>>-				pe_num = pdn->offset + j;
>>>+				pe_num = pdn->offset[j];
>>> 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>> 						pe_num, OPAL_M64_WINDOW_TYPE,
>>> 						pdn->m64_map[j][i], 0);
>>>@@ -1347,7 +1347,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>> 	struct pnv_phb        *phb;
>>> 	struct pci_dn         *pdn;
>>> 	struct pci_sriov      *iov;
>>>-	u16 num_vfs;
>>>+	u16                    num_vfs, i;
>>>
>>> 	bus = pdev->bus;
>>> 	hose = pci_bus_to_host(bus);
>>>@@ -1361,14 +1361,18 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>>
>>> 	if (phb->type == PNV_PHB_IODA2) {
>>> 		if (!pdn->m64_single_mode)
>>>-			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>>+			pnv_pci_vf_resource_shift(pdev, -*pdn->offset);
>>>
>>> 		/* Release M64 windows */
>>> 		pnv_pci_vf_release_m64(pdev, num_vfs);
>>>
>>> 		/* Release PE numbers */
>>>-		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>>-		pdn->offset = 0;
>>>+		if (pdn->m64_single_mode) {
>>>+			for (i = 0; i < num_vfs; i++)
>>>+				pnv_ioda_free_pe(phb, pdn->offset[i]);
>>>+		} else
>>>+			bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>>>+		kfree(pdn->offset);
>>
>>Can pnv_ioda_free_pe() be reused to release PE ?
>
>You mean use it to similar thing in pnv_ioda_deconfigure_pe()?
>

Forget it please. You can clean it up later, not in this patchset...

>>
>>> 	}
>>> }
>>>
>>>@@ -1394,7 +1398,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>
>>> 	/* Reserve PE for each VF */
>>> 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
>>>-		pe_num = pdn->offset + vf_index;
>>>+		if (pdn->m64_single_mode)
>>>+			pe_num = pdn->offset[vf_index];
>>>+		else
>>>+			pe_num = *pdn->offset + vf_index;
>>>
>>> 		pe = &phb->ioda.pe_array[pe_num];
>>> 		pe->pe_number = pe_num;
>>>@@ -1436,6 +1443,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> 	struct pnv_phb        *phb;
>>> 	struct pci_dn         *pdn;
>>> 	int                    ret;
>>>+	u16                    i;
>>>
>>> 	bus = pdev->bus;
>>> 	hose = pci_bus_to_host(bus);
>>>@@ -1462,19 +1470,38 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> 		}
>>>
>>> 		/* Calculate available PE for required VFs */
>>>-		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>-		pdn->offset = bitmap_find_next_zero_area(
>>>-			phb->ioda.pe_alloc, phb->ioda.total_pe,
>>>-			0, num_vfs, 0);
>>>-		if (pdn->offset >= phb->ioda.total_pe) {
>>>+		if (pdn->m64_single_mode) {
>>>+			pdn->offset = kmalloc(sizeof(*pdn->offset) * num_vfs,
>>>+					GFP_KERNEL);
>>>+			if (!pdn->offset)
>>>+				return -ENOMEM;
>>>+			for (i = 0; i < num_vfs; i++)
>>>+				pdn->offset[i] = IODA_INVALID_PE;
>>>+			for (i = 0; i < num_vfs; i++) {
>>>+				pdn->offset[i] = pnv_ioda_alloc_pe(phb);
>>>+				if (pdn->offset[i] == IODA_INVALID_PE) {
>>>+					ret = -EBUSY;
>>>+					goto m64_failed;
>>>+				}
>>>+			}
>>>+		} else {
>>>+			pdn->offset = kmalloc(sizeof(*pdn->offset), GFP_KERNEL);
>>>+			if (!pdn->offset)
>>>+				return -ENOMEM;
>>>+			mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>+			*pdn->offset = bitmap_find_next_zero_area(
>>>+				phb->ioda.pe_alloc, phb->ioda.total_pe,
>>>+				0, num_vfs, 0);
>>>+			if (*pdn->offset >= phb->ioda.total_pe) {
>>>+				mutex_unlock(&phb->ioda.pe_alloc_mutex);
>>>+				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
>>>+				kfree(pdn->offset);
>>>+				return -EBUSY;
>>>+			}
>>>+			bitmap_set(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>>> 			mutex_unlock(&phb->ioda.pe_alloc_mutex);
>>>-			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
>>>-			pdn->offset = 0;
>>>-			return -EBUSY;
>>> 		}
>>>-		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>> 		pdn->num_vfs = num_vfs;
>>>-		mutex_unlock(&phb->ioda.pe_alloc_mutex);
>>>
>>> 		/* Assign M64 window accordingly */
>>> 		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
>>>@@ -1489,7 +1516,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> 		 * Otherwise, the PE# for the VF will conflict with others.
>>> 		 */
>>> 		if (!pdn->m64_single_mode) {
>>>-			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>>>+			ret = pnv_pci_vf_resource_shift(pdev, *pdn->offset);
>>> 			if (ret)
>>> 				goto m64_failed;
>>> 		}
>>>@@ -1501,8 +1528,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> 	return 0;
>>>
>>> m64_failed:
>>>-	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>>-	pdn->offset = 0;
>>>+	if (pdn->m64_single_mode) {
>>>+		for (i = 0; i < num_vfs; i++)
>>>+			pnv_ioda_free_pe(phb, pdn->offset[i]);
>>>+	} else
>>>+		bitmap_clear(phb->ioda.pe_alloc, *pdn->offset, num_vfs);
>>>+	kfree(pdn->offset);
>>>
>>> 	return ret;
>>> }
>>>-- 
>>>1.7.9.5
>>>
>
>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-06 10:07                 ` Gavin Shan
@ 2015-08-07  1:48                   ` Wei Yang
  2015-08-07  8:13                     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-07  1:48 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 08:07:01PM +1000, Gavin Shan wrote:
>On Thu, Aug 06, 2015 at 05:36:02PM +0800, Wei Yang wrote:
>>On Thu, Aug 06, 2015 at 03:20:25PM +1000, Gavin Shan wrote:
>>>On Wed, Aug 05, 2015 at 09:25:00AM +0800, Wei Yang wrote:
>>>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>>>By doing so, several VFs would be in one VF Group and leads to interference
>>>>between VFs in the same group.
>>>>
>>>>This patch changes the design by using one M64 BAR in Single PE mode for
>>>>one VF BAR. This gives absolute isolation for VFs.
>>>>
>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>---
>>>> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>>> arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>>>> 2 files changed, 76 insertions(+), 109 deletions(-)
>>>>
>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>index 712add5..8aeba4c 100644
>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>@@ -214,10 +214,9 @@ struct pci_dn {
>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>> 	int     offset;			/* PE# for the first VF PE */
>>>>-#define M64_PER_IOV 4
>>>>-	int     m64_per_iov;
>>>>+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>> #define IODA_INVALID_M64        (-1)
>>>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>>>+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>
>>>It can be explicit? For example:
>>>
>>>	int	*m64_map;
>>>
>>>	/* Initialization */
>>>	size_t size = sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_of_max_VFs;
>>>	pdn->m64_map = kmalloc(size, GFP_KERNEL);
>>>	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>		for (j = 0; j < num_of_max_VFs; j++)
>>>			pdn->m64_map[i * PCI_SRIOV_NUM_BARS + j] = PNV_INVALID_M64;
>>>
>>>	/* Destroy */
>>>	int step = 1;
>>>
>>>	if (!pdn->m64_single_mode)
>>>		step = phb->ioda.total_pe;
>>>	for (i = 0; i < PCI_SRIOV_NUM_BARS * num_of_max_VFs; i += step)
>>>		if (pdn->m64_map[i] == PNV_INVALID_M64)
>>>			continue;
>>>
>>>		/* Unmap the window */
>>>	
>>
>>The m64_map is a pointer to an array with 6 elements, which represents the 6
>>M64 BAR index for the 6 VF BARs.
>>
>>    When we use Shared Mode, one array is allocated. The six elements
>>    represents the six M64 BAR(at most) used to map the whole IOV BAR.
>>
>>    When we use Single Mode, num_vfs array is allocate. Each array represents
>>    the map between one VF's BAR and M64 BAR index.
>>
>>During the map and un-map, M64 BAR is assigned one by one in VF BAR's order.
>>So I think the code is explicit.
>>
>>In your code, you allocate a big one dimension array to hold the M64 BAR
>>index. It works, while I don't think this is more explicit than original code.
>>
>
>When M64 is in Single Mode, array with (num_vfs * 6) entries is allocated
>because every VF BAR (6 at most) will have one corresponding PHB M64 BAR.
>Anything I missed?
>
>The point in my code is you needn't worry about the mode (single vs shared)
>As I said, not too much memory wasted. However, it's up to you.
>

If we don't care about saving some memory, how about just defining them
statically instead of allocating them dynamically?

>I'm not fan of "int (*m64_map)[PCI_SRIOV_NUM_BARS]". Instead, you can replace
>it with "int *m64_map" and calculate its size using following formula:
>
>	sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS;
>
>	sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_vfs;
>
>>-- 
>>Richard Yang
>>Help you, Help me

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-06 10:04             ` Alexey Kardashevskiy
@ 2015-08-07  2:01               ` Wei Yang
  2015-08-07  8:59                 ` Alexey Kardashevskiy
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-07  2:01 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Wei Yang, gwshan, benh, linuxppc-dev

On Thu, Aug 06, 2015 at 08:04:58PM +1000, Alexey Kardashevskiy wrote:
>On 08/05/2015 11:25 AM, Wei Yang wrote:
>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>By doing so, several VFs would be in one VF Group and leads to interference
>>between VFs in the same group.
>>
>>This patch changes the design by using one M64 BAR in Single PE mode for
>>one VF BAR. This gives absolute isolation for VFs.
>>
>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>---
>>  arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>  arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>>  2 files changed, 76 insertions(+), 109 deletions(-)
>>
>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>index 712add5..8aeba4c 100644
>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>@@ -214,10 +214,9 @@ struct pci_dn {
>>  	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>  	u16     num_vfs;		/* number of VFs enabled*/
>>  	int     offset;			/* PE# for the first VF PE */
>>-#define M64_PER_IOV 4
>>-	int     m64_per_iov;
>>+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>  #define IODA_INVALID_M64        (-1)
>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>  #endif /* CONFIG_PCI_IOV */
>>  #endif
>>  	struct list_head child_list;
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index 7192e62..f5d110c 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void)
>>  }
>>
>>  #ifdef CONFIG_PCI_IOV
>>-static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
>>+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>>  {
>>  	struct pci_bus        *bus;
>>  	struct pci_controller *hose;
>>  	struct pnv_phb        *phb;
>>  	struct pci_dn         *pdn;
>>  	int                    i, j;
>>+	int                    m64_bars;
>>
>>  	bus = pdev->bus;
>>  	hose = pci_bus_to_host(bus);
>>  	phb = hose->private_data;
>>  	pdn = pci_get_pdn(pdev);
>>
>>+	if (pdn->m64_single_mode)
>>+		m64_bars = num_vfs;
>>+	else
>>+		m64_bars = 1;
>>+
>>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>-		for (j = 0; j < M64_PER_IOV; j++) {
>>-			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
>>+		for (j = 0; j < m64_bars; j++) {
>>+			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
>>  				continue;
>>  			opal_pci_phb_mmio_enable(phb->opal_id,
>>-				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
>>-			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
>>-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>+				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
>>+			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
>>+			pdn->m64_map[j][i] = IODA_INVALID_M64;
>>  		}
>>
>>+	kfree(pdn->m64_map);
>>  	return 0;
>>  }
>>
>>@@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>  	int                    total_vfs;
>>  	resource_size_t        size, start;
>>  	int                    pe_num;
>>-	int                    vf_groups;
>>-	int                    vf_per_group;
>>+	int                    m64_bars;
>>
>>  	bus = pdev->bus;
>>  	hose = pci_bus_to_host(bus);
>>@@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>  	pdn = pci_get_pdn(pdev);
>>  	total_vfs = pci_sriov_get_totalvfs(pdev);
>>
>>-	/* Initialize the m64_wins to IODA_INVALID_M64 */
>>-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>-		for (j = 0; j < M64_PER_IOV; j++)
>>-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>+	if (pdn->m64_single_mode)
>
>
>This is a physical function's @pdn, right?

Yes

>
>
>>+		m64_bars = num_vfs;
>>+	else
>>+		m64_bars = 1;
>>+
>>+	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
>
>
>Assume we have SRIOV device with 16VF.
>So it was m64_wins[6][4], now it is (roughly speaking) m64_map[6][16]
>(for a single PE mode) or m64_map[6][1]. I believe m64_bars cannot be
>bigger than 16 on PHB3, right? Is this checked anywhere (does it have
>to)?

In pnv_pci_vf_assign_m64(), we call find_next_zero_bit() and check the
return value. If it exceeds m64_bar_idx, the allocation fails.
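
That is roughly the existing logic (quoting from memory, so it may not match
the tree exactly):

	do {
		win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
				phb->ioda.m64_bar_idx + 1, 0);
		if (win >= phb->ioda.m64_bar_idx + 1)
			goto m64_failed;
	} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));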

>
>This m64_wins -> m64_map change - is was not a map (what was it?),
>and it is, is not it?

Hmm... Gavin likes this name.

>
>What does it store? An index of M64 BAR (0..15)?
>

Yes.

>
>
>>+	if (!pdn->m64_map)
>>+		return -ENOMEM;
>>+	/* Initialize the m64_map to IODA_INVALID_M64 */
>>+	for (i = 0; i < m64_bars ; i++)
>>+		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
>>+			pdn->m64_map[i][j] = IODA_INVALID_M64;
>>
>>-	if (pdn->m64_per_iov == M64_PER_IOV) {
>>-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>>-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>>-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>-	} else {
>>-		vf_groups = 1;
>>-		vf_per_group = 1;
>>-	}
>>
>>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>  		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>  		if (!res->flags || !res->parent)
>>  			continue;
>>
>>-		for (j = 0; j < vf_groups; j++) {
>>+		for (j = 0; j < m64_bars; j++) {
>>  			do {
>>  				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>  						phb->ioda.m64_bar_idx + 1, 0);
>>@@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>  					goto m64_failed;
>>  			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
>>
>>-			pdn->m64_wins[i][j] = win;
>>+			pdn->m64_map[j][i] = win;
>>
>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>+			if (pdn->m64_single_mode) {
>>  				size = pci_iov_resource_size(pdev,
>>  							PCI_IOV_RESOURCES + i);
>>-				size = size * vf_per_group;
>>  				start = res->start + size * j;
>>  			} else {
>>  				size = resource_size(res);
>>@@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>  			}
>>
>>  			/* Map the M64 here */
>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>+			if (pdn->m64_single_mode) {
>>  				pe_num = pdn->offset + j;
>>  				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>  						pe_num, OPAL_M64_WINDOW_TYPE,
>>-						pdn->m64_wins[i][j], 0);
>>+						pdn->m64_map[j][i], 0);
>>  			}
>>
>>  			rc = opal_pci_set_phb_mem_window(phb->opal_id,
>>  						 OPAL_M64_WINDOW_TYPE,
>>-						 pdn->m64_wins[i][j],
>>+						 pdn->m64_map[j][i],
>>  						 start,
>>  						 0, /* unused */
>>  						 size);
>>@@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>  				goto m64_failed;
>>  			}
>>
>>-			if (pdn->m64_per_iov == M64_PER_IOV)
>>+			if (pdn->m64_single_mode)
>>  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
>>+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
>>  			else
>>  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
>>+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
>>
>>  			if (rc != OPAL_SUCCESS) {
>>  				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
>>@@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>  	return 0;
>>
>>  m64_failed:
>>-	pnv_pci_vf_release_m64(pdev);
>>+	pnv_pci_vf_release_m64(pdev, num_vfs);
>>  	return -EBUSY;
>>  }
>>
>>@@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>>  	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>  }
>>
>>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>  {
>>  	struct pci_bus        *bus;
>>  	struct pci_controller *hose;
>>  	struct pnv_phb        *phb;
>>  	struct pnv_ioda_pe    *pe, *pe_n;
>>  	struct pci_dn         *pdn;
>>-	u16                    vf_index;
>>-	int64_t                rc;
>>
>>  	bus = pdev->bus;
>>  	hose = pci_bus_to_host(bus);
>>@@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>  	if (!pdev->is_physfn)
>>  		return;
>>
>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>-		int   vf_group;
>>-		int   vf_per_group;
>>-		int   vf_index1;
>>-
>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>-
>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>>-			for (vf_index = vf_group * vf_per_group;
>>-				vf_index < (vf_group + 1) * vf_per_group &&
>>-				vf_index < num_vfs;
>>-				vf_index++)
>>-				for (vf_index1 = vf_group * vf_per_group;
>>-					vf_index1 < (vf_group + 1) * vf_per_group &&
>>-					vf_index1 < num_vfs;
>>-					vf_index1++){
>>-
>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>-						pdn->offset + vf_index,
>>-						pdn->offset + vf_index1,
>>-						OPAL_REMOVE_PE_FROM_DOMAIN);
>>-
>>-					if (rc)
>>-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>>-						__func__,
>>-						pdn->offset + vf_index1, rc);
>>-				}
>>-	}
>>-
>>  	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>>  		if (pe->parent_dev != pdev)
>>  			continue;
>>@@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>  	num_vfs = pdn->num_vfs;
>>
>>  	/* Release VF PEs */
>>-	pnv_ioda_release_vf_PE(pdev, num_vfs);
>>+	pnv_ioda_release_vf_PE(pdev);
>>
>>  	if (phb->type == PNV_PHB_IODA2) {
>>-		if (pdn->m64_per_iov == 1)
>>+		if (!pdn->m64_single_mode)
>>  			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>
>>  		/* Release M64 windows */
>>-		pnv_pci_vf_release_m64(pdev);
>>+		pnv_pci_vf_release_m64(pdev, num_vfs);
>>
>>  		/* Release PE numbers */
>>  		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>@@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>  	int                    pe_num;
>>  	u16                    vf_index;
>>  	struct pci_dn         *pdn;
>>-	int64_t                rc;
>>
>>  	bus = pdev->bus;
>>  	hose = pci_bus_to_host(bus);
>>@@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>
>>  		pnv_pci_ioda2_setup_dma_pe(phb, pe);
>>  	}
>>-
>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>-		int   vf_group;
>>-		int   vf_per_group;
>>-		int   vf_index1;
>>-
>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>-
>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>>-			for (vf_index = vf_group * vf_per_group;
>>-			     vf_index < (vf_group + 1) * vf_per_group &&
>>-			     vf_index < num_vfs;
>>-			     vf_index++) {
>>-				for (vf_index1 = vf_group * vf_per_group;
>>-				     vf_index1 < (vf_group + 1) * vf_per_group &&
>>-				     vf_index1 < num_vfs;
>>-				     vf_index1++) {
>>-
>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>-						pdn->offset + vf_index,
>>-						pdn->offset + vf_index1,
>>-						OPAL_ADD_PE_TO_DOMAIN);
>>-
>>-					if (rc)
>>-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>>-						__func__,
>>-						pdn->offset + vf_index1, rc);
>>-				}
>>-			}
>>-		}
>>-	}
>>  }
>>
>>  int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>@@ -1507,6 +1449,18 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>  			return -EBUSY;
>>  		}
>>
>>+		/*
>>+		 * On PNV_PHB_IODA2, We just have 16 M64 BARs and M64 BAR #15
>>+		 * is used to cover the whole system, which leaves only 15 M64
>>+		 * BAR usable for VFs.
>>+		 * When M64 BAR functions in Single PE mode, this means it
>>+		 * just could enable 15 VFs.
>>+		 */
>>+		if (pdn->m64_single_mode && num_vfs >= 16) {
>
>Magic constant 16. Where did this 16 come from? My understanding is
>it could come from
>
>1) hostboot or
>2) OPAL or
>3) architected on IODA2
>4) defined in PHB3 (actually it has to be 2))
>
>which one is it? If 1) and 2) - make it a variable; if 3) - add a macro for it.
>

As Gavin indicated, this will change to "num_vfs > phb->ioda.m64_bar_idx"
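
i.e. something along these lines (a sketch, not the final patch):

	if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
		dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
		return -EBUSY;
	}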

>
>>+			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
>>+			return -EBUSY;
>>+		}
>>+
>>  		/* Calculate available PE for required VFs */
>>  		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>  		pdn->offset = bitmap_find_next_zero_area(
>>@@ -1534,7 +1488,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>  		 * the IOV BAR according to the PE# allocated to the VFs.
>>  		 * Otherwise, the PE# for the VF will conflict with others.
>>  		 */
>>-		if (pdn->m64_per_iov == 1) {
>>+		if (!pdn->m64_single_mode) {
>>  			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>>  			if (ret)
>>  				goto m64_failed;
>>@@ -1567,8 +1521,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>  	/* Allocate PCI data */
>>  	add_dev_pci_data(pdev);
>>
>>-	pnv_pci_sriov_enable(pdev, num_vfs);
>>-	return 0;
>>+	return pnv_pci_sriov_enable(pdev, num_vfs);
>>  }
>>  #endif /* CONFIG_PCI_IOV */
>>
>>@@ -2761,9 +2714,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>
>>  	pdn = pci_get_pdn(pdev);
>>  	pdn->vfs_expanded = 0;
>>+	pdn->m64_single_mode = false;
>>
>>  	total_vfs = pci_sriov_get_totalvfs(pdev);
>>-	pdn->m64_per_iov = 1;
>>  	mul = phb->ioda.total_pe;
>>
>>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>@@ -2783,8 +2736,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>  		if (size > (1 << 26)) {
>>  			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>>  				 i, res);
>>-			pdn->m64_per_iov = M64_PER_IOV;
>>  			mul = roundup_pow_of_two(total_vfs);
>>+			pdn->m64_single_mode = true;
>>  			break;
>>  		}
>>  	}
>>@@ -2986,6 +2939,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
>>  static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>  						      int resno)
>>  {
>>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>+	struct pnv_phb *phb = hose->private_data;
>>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>>  	resource_size_t align;
>>
>>@@ -2994,12 +2949,25 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>  	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>  	 * BAR should be size aligned.
>>  	 *
>>+	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the hardware
>>+	 * restriction to alignment is gone.
>
>
>Gone? Does not BAR still have to be aligned to its size?
>

Yes, the M64 BAR is always size aligned. But in Single PE mode the M64 BAR
size is the same as a VF BAR size, which means they have the same alignment
now. What I want to say is that the extra hardware restriction is gone.

Let me explain this in more detail.

>
>>But if just use the VF BAR size
>>+	 * as the alignment, PF BAR / VF BAR may be allocated with in one M64
>>+	 * segment,
>
>
>I thought each VF gets its own _segment_, am I wrong?
>

From the point of view of the M64 BAR that maps the VF BAR, yes.

But we also have M64 BAR#15 covering the whole 64-bit MMIO space, and its
segment size is bigger than that of the BAR mapping the VF BAR. When not
properly aligned, VF and PF may sit in the same segment of M64 BAR#15.
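
To make that concrete with made-up numbers: say m64_segsize is 256MB and the
VF BAR is 1MB. If the IOV BAR were only 1MB aligned, it could land in the
same 256MB segment of BAR#15 as one of the PF's own BARs, i.e. PF and VF in
one PE from BAR#15's point of view. Using max(align, m64_segsize), as in the
hunk further down, guarantees the VFs never share a BAR#15 segment with the
PF.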

>
>>which introduces the PE conflict between PF and VF. Based
>>+	 * on this the minimum alignment of an IOV BAR is m64_segsize.
>>
>>+	 *
>>  	 * This function return the total IOV BAR size if expanded or just the
>>-	 * individual size if not.
>>+	 * individual size if not, when M64 BAR is in Shared PE mode.
>>+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
>>+	 * m64_size if IOV BAR size is less.
>>  	 */
>>  	align = pci_iov_resource_size(pdev, resno);
>>-	if (pdn->vfs_expanded)
>>-		return pdn->vfs_expanded * align;
>>+	if (pdn->vfs_expanded) {
>>+		if (pdn->m64_single_mode)
>>+			return max(align,
>>+				(resource_size_t)phb->ioda.m64_segsize);
>>+		else
>>+			return pdn->vfs_expanded * align;
>>+	}
>>
>>  	return align;
>>  }
>>
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-07  1:20                 ` Gavin Shan
@ 2015-08-07  2:24                   ` Wei Yang
  2015-08-07  3:50                     ` Gavin Shan
  2015-08-07  7:14                     ` Alexey Kardashevskiy
  0 siblings, 2 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-07  2:24 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 11:20:10AM +1000, Gavin Shan wrote:
>On Thu, Aug 06, 2015 at 10:10:10PM +0800, Wei Yang wrote:
>>On Thu, Aug 06, 2015 at 02:35:57PM +1000, Gavin Shan wrote:
>>>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>>M64 windwo, which means M64 BAR can't work on it.
>>>>
>>>
>>>s/PHB_IODA2/PHB3
>>>s/windwo/window
>>>
>>>>This patch makes this explicit.
>>>>
>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>
>>>The idea sounds right, but there is one question as below.
>>>
>>>>---
>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>>>
>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>index 5738d31..9b41dba 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>> 		if (!res->flags || !res->parent)
>>>> 			continue;
>>>>
>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>-			continue;
>>>>-
>>>> 		/*
>>>> 		 * The actual IOV BAR range is determined by the start address
>>>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>> 		if (!res->flags || !res->parent)
>>>> 			continue;
>>>>
>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>-			continue;
>>>>-
>>>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>> 		res2 = *res;
>>>> 		res->start += size * offset;
>>>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>> 		if (!res->flags || !res->parent)
>>>> 			continue;
>>>>
>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>-			continue;
>>>>-
>>>> 		for (j = 0; j < vf_groups; j++) {
>>>> 			do {
>>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>> 	pdn = pci_get_pdn(pdev);
>>>>
>>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>>+		if (!pdn->vfs_expanded) {
>>>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>>+				" with non M64 VF BAR\n");
>>>>+			return -EBUSY;
>>>>+		}
>>>>+
>>>
>>>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>>>unavailable. For this case, the VFs are permanently unavailable because of
>>>running out of space to accomodate M64 and non-M64 VF BARs.
>>>
>>>The error message could be printed with dev_warn() and it would be precise
>>>as below or something else you prefer:
>>>
>>>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>>
>>
>>Thanks for the comment, will change accordingly.
>>
>>>
>>>> 		/* Calculate available PE for required VFs */
>>>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>> 		pdn->offset = bitmap_find_next_zero_area(
>>>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 		if (!res->flags || res->parent)
>>>> 			continue;
>>>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>>+					" non M64 VF BAR%d: %pR. \n",
>>>> 				 i, res);
>>>>-			continue;
>>>>+			return;
>>>> 		}
>>>>
>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>> 		if (!res->flags || res->parent)
>>>> 			continue;
>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>>-				 i, res);
>>>>-			continue;
>>>>-		}
>>>
>>>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>>I think it can be avoided.
>>>
>>
>>Don't get your point. You mean to avoid this function?
>>
>>Or clear the IOV BAR when we found one of it is non-M64?
>>
>
>I mean to clear all IOV BARs in case any more of them are IO or M32. In this
>case, the SRIOV capability won't be enabled. Otherwise, the resources for
>all IOV BARs are assigned and allocated by PCI subsystem, but they won't
>be used. Does it make sense to you?
>

If the goal is to save MMIO space, this is not necessary.

The IOV BARs are put into the optional list in the assignment stage, so when
there is not enough MMIO space they simply won't be assigned.

In the long term, maybe on P9/P10, we will eventually adjust the solution to
support SRIOV devices with M32 MMIO. So I suggest leaving it as it is.

>>>>
>>>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>-- 
>>>>1.7.9.5
>>>>
>>
>>-- 
>>Richard Yang
>>Help you, Help me

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one
  2015-08-07  1:23                 ` Gavin Shan
@ 2015-08-07  2:25                   ` Wei Yang
  0 siblings, 0 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-07  2:25 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 11:23:54AM +1000, Gavin Shan wrote:
>On Thu, Aug 06, 2015 at 10:03:04PM +0800, Wei Yang wrote:
>>On Thu, Aug 06, 2015 at 03:28:51PM +1000, Gavin Shan wrote:
>>>On Wed, Aug 05, 2015 at 09:25:02AM +0800, Wei Yang wrote:
>>>>Each VF could have 6 BARs at most. When the total BAR size exceeds the
>>>>gate, after expanding it will also exhaust the M64 Window.
>>>>
>>>>This patch limits the boundary by checking the total VF BAR size instead of
>>>>the individual BAR.
>>>>
>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>
>>>Ok. I didn't look at this when giving comments to last patch. It turns
>>>you have the change in this patch. Please merge it with the previous
>>>patch.
>>>
>>
>>Hmm... I prefer to have them in two patches. One focus on the calculation of
>>gate and the other focus on checking the total VF BAR size. This would help
>>record the change.
>>
>
>It's fine to me as well. I'll take close look on your next revision since
>you have to refresh the whole series. Is that fine to you?
>

Fine and thanks for your comments.

>>>>---
>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   13 +++++++------
>>>> 1 file changed, 7 insertions(+), 6 deletions(-)
>>>>
>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>index 31dcedc..4042303 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>@@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 	struct pnv_phb *phb;
>>>> 	struct resource *res;
>>>> 	int i;
>>>>-	resource_size_t size, gate;
>>>>+	resource_size_t size, gate, total_vf_bar_sz;
>>>> 	struct pci_dn *pdn;
>>>> 	int mul, total_vfs;
>>>>
>>>>@@ -2729,6 +2729,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 	 * Window and limit the system flexibility.
>>>> 	 */
>>>> 	gate = phb->ioda.m64_segsize >> 1;
>>>>+	total_vf_bar_sz = 0;
>>>>
>>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>@@ -2741,13 +2742,13 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>> 			return;
>>>> 		}
>>>>
>>>>-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>+		total_vf_bar_sz += pci_iov_resource_size(pdev,
>>>>+				i + PCI_IOV_RESOURCES);
>>>>
>>>> 		/* bigger than or equal to gate */
>>>>-		if (size >= gate) {
>>>>-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
>>>>-				"is bigger than %lld, roundup power2\n",
>>>>-				 i, res, gate);
>>>>+		if (total_vf_bar_sz >= gate) {
>>>>+			dev_info(&pdev->dev, "PowerNV: VF BAR Total IOV size "
>>>>+				"is bigger than %lld, roundup power2\n", gate);
>>>> 			mul = roundup_pow_of_two(total_vfs);
>>>> 			pdn->m64_single_mode = true;
>>>> 			break;
>>>>-- 
>>>>1.7.9.5
>>>>
>>
>>-- 
>>Richard Yang
>>Help you, Help me

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-07  1:36                 ` Gavin Shan
@ 2015-08-07  2:33                   ` Wei Yang
  2015-08-07  3:43                     ` Gavin Shan
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-07  2:33 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 11:36:56AM +1000, Gavin Shan wrote:
>On Thu, Aug 06, 2015 at 09:41:41PM +0800, Wei Yang wrote:
>>On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>>>On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>>>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>>>discrete.
>>>>
>>>>This patch restructures the patch to allocate discrete PE# for VFs when M64
>>>>BAR is set to Single PE mode.
>>>>
>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>---
>>>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>>>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>>>
>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>index 8aeba4c..72415c7 100644
>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>@@ -213,7 +213,7 @@ struct pci_dn {
>>>> #ifdef CONFIG_PCI_IOV
>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>>-	int     offset;			/* PE# for the first VF PE */
>>>>+	int     *offset;		/* PE# for the first VF PE or array */
>>>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>> #define IODA_INVALID_M64        (-1)
>>>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>
>>>how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>>>I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>>>all of them will be used, not too much memory will be wasted.
>>>
>>
>>Thanks for your comment.
>>
>>I have thought about change the name to make it more self explain. While
>>another fact I want to take in is this field is also used to be reflect the
>>shift offset when M64 BAR is used in the Shared Mode. So I maintain the name.
>>
>>How about use "enum", one maintain the name "offset", and another one rename to
>>"pe_num_map". And use the meaningful name at proper place?
>>

So I suppose you agree with my naming proposal.

>
>Ok. I'm explaining it with more details. There are two cases: single vs shared
>mode. When PHB M64 BARs run in single mode, you need an array to track the
>allocated discrete PE#. The VF_index is the index to the array. When PHB M64
>BARs run in shared mode, you need continuous PE#. No array required for this
>case. Instead, the starting PE# should be stored to somewhere, which can
>be pdn->offset[0] simply.
>
>So when allocating memory for this array, you just simply allocate (sizeof(*pdn->offset)
>*max_vf_num) no matter what mode PHB's M64 BARs will run in. The point is nobody
>can enable (max_vf_num + 1) VFs.

The max_vf_num is 15?

>
>With above way, the arrays for PE# and M64 BAR remapping needn't be allocated
>when enabling SRIOV capability and releasing on disabling SRIOV capability.
>Instead, those two arrays can be allocated during resource fixup time and free'ed
>when destroying the pdn.
>

Same point of view as before: if memory is not the concern, how about
defining them statically?

And in the long term we may support more VFs; at that point we would need to
restructure the code to handle it.

So I suggest that if we allocate it dynamically, we allocate exactly the
space needed.

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-07  2:33                   ` Wei Yang
@ 2015-08-07  3:43                     ` Gavin Shan
  2015-08-07  5:44                       ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Gavin Shan @ 2015-08-07  3:43 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 10:33:33AM +0800, Wei Yang wrote:
>On Fri, Aug 07, 2015 at 11:36:56AM +1000, Gavin Shan wrote:
>>On Thu, Aug 06, 2015 at 09:41:41PM +0800, Wei Yang wrote:
>>>On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>>>>On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>>>>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>>>>discrete.
>>>>>
>>>>>This patch restructures the patch to allocate discrete PE# for VFs when M64
>>>>>BAR is set to Single PE mode.
>>>>>
>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>>---
>>>>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>>>>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>>>>
>>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>>index 8aeba4c..72415c7 100644
>>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>>@@ -213,7 +213,7 @@ struct pci_dn {
>>>>> #ifdef CONFIG_PCI_IOV
>>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>>>-	int     offset;			/* PE# for the first VF PE */
>>>>>+	int     *offset;		/* PE# for the first VF PE or array */
>>>>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>>> #define IODA_INVALID_M64        (-1)
>>>>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>>
>>>>how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>>>>I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>>>>all of them will be used, not too much memory will be wasted.
>>>>
>>>
>>>Thanks for your comment.
>>>
>>>I have thought about change the name to make it more self explain. While
>>>another fact I want to take in is this field is also used to be reflect the
>>>shift offset when M64 BAR is used in the Shared Mode. So I maintain the name.
>>>
>>>How about use "enum", one maintain the name "offset", and another one rename to
>>>"pe_num_map". And use the meaningful name at proper place?
>>>
>
>So I suppose you agree with my naming proposal.
>

No, I dislike the "enum" things.

>>
>>Ok. I'm explaining it with more details. There are two cases: single vs shared
>>mode. When PHB M64 BARs run in single mode, you need an array to track the
>>allocated discrete PE#. The VF_index is the index to the array. When PHB M64
>>BARs run in shared mode, you need continuous PE#. No array required for this
>>case. Instead, the starting PE# should be stored to somewhere, which can
>>be pdn->offset[0] simply.
>>
>>So when allocating memory for this array, you just simply allocate (sizeof(*pdn->offset)
>>*max_vf_num) no matter what mode PHB's M64 BARs will run in. The point is nobody
>>can enable (max_vf_num + 1) VFs.
>
>The max_vf_num is 15?
>

I don't understand why you said the max_vf_num is 15. Since max_vf_num varies
between PFs, how can it be a fixed value of 15?

>>
>>With above way, the arrays for PE# and M64 BAR remapping needn't be allocated
>>when enabling SRIOV capability and releasing on disabling SRIOV capability.
>>Instead, those two arrays can be allocated during resource fixup time and free'ed
>>when destroying the pdn.
>>
>
>My same point of view like previous, if the memory is not in the concern, how
>about define them static?
>

It's a bad idea in my view. How many entries is this array going to have?
256 * NUM_OF_MAX_VF_BARS?
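
Just to put a number on it: with 256 PEs and 6 IOV BARs, a static int array
would be 256 * 6 * 4 bytes = 6KB in every pci_dn (assuming a 4-byte int),
whether or not the device even has SRIOV.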

>And for the long term, we may support more VFs. Then at that moment, we need
>to restructure the code to meet it.
>
>So I suggest if we want to allocate it dynamically, we allocate the exact
>number of space.
>

Fine... it can be improved when it has to be, as you said.

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-07  2:24                   ` Wei Yang
@ 2015-08-07  3:50                     ` Gavin Shan
  2015-08-07  7:14                     ` Alexey Kardashevskiy
  1 sibling, 0 replies; 56+ messages in thread
From: Gavin Shan @ 2015-08-07  3:50 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 10:24:05AM +0800, Wei Yang wrote:
>On Fri, Aug 07, 2015 at 11:20:10AM +1000, Gavin Shan wrote:
>>On Thu, Aug 06, 2015 at 10:10:10PM +0800, Wei Yang wrote:
>>>On Thu, Aug 06, 2015 at 02:35:57PM +1000, Gavin Shan wrote:
>>>>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>>>M64 windwo, which means M64 BAR can't work on it.
>>>>>
>>>>
>>>>s/PHB_IODA2/PHB3
>>>>s/windwo/window
>>>>
>>>>>This patch makes this explicit.
>>>>>
>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>
>>>>The idea sounds right, but there is one question as below.
>>>>
>>>>>---
>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>>>>
>>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>index 5738d31..9b41dba 100644
>>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>> 		if (!res->flags || !res->parent)
>>>>> 			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>> 		/*
>>>>> 		 * The actual IOV BAR range is determined by the start address
>>>>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>> 		if (!res->flags || !res->parent)
>>>>> 			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>>> 		res2 = *res;
>>>>> 		res->start += size * offset;
>>>>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>> 		if (!res->flags || !res->parent)
>>>>> 			continue;
>>>>>
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>-			continue;
>>>>>-
>>>>> 		for (j = 0; j < vf_groups; j++) {
>>>>> 			do {
>>>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>> 	pdn = pci_get_pdn(pdev);
>>>>>
>>>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>>>+		if (!pdn->vfs_expanded) {
>>>>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>>>+				" with non M64 VF BAR\n");
>>>>>+			return -EBUSY;
>>>>>+		}
>>>>>+
>>>>
>>>>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>>>>unavailable. For this case, the VFs are permanently unavailable because of
>>>>running out of space to accomodate M64 and non-M64 VF BARs.
>>>>
>>>>The error message could be printed with dev_warn() and it would be precise
>>>>as below or something else you prefer:
>>>>
>>>>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>>>
>>>
>>>Thanks for the comment, will change accordingly.
>>>
>>>>
>>>>> 		/* Calculate available PE for required VFs */
>>>>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>>> 		pdn->offset = bitmap_find_next_zero_area(
>>>>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>> 		if (!res->flags || res->parent)
>>>>> 			continue;
>>>>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>>>+					" non M64 VF BAR%d: %pR. \n",
>>>>> 				 i, res);
>>>>>-			continue;
>>>>>+			return;
>>>>> 		}
>>>>>
>>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>> 		if (!res->flags || res->parent)
>>>>> 			continue;
>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>>>-				 i, res);
>>>>>-			continue;
>>>>>-		}
>>>>
>>>>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>>>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>>>I think it can be avoided.
>>>>
>>>
>>>Don't get your point. You mean to avoid this function?
>>>
>>>Or clear the IOV BAR when we found one of it is non-M64?
>>>
>>
>>I mean to clear all IOV BARs in case any more of them are IO or M32. In this
>>case, the SRIOV capability won't be enabled. Otherwise, the resources for
>>all IOV BARs are assigned and allocated by PCI subsystem, but they won't
>>be used. Does it make sense to you?
>>
>
>If we want to save MMIO space, this is not necessary.
>
>The IOV BAR will be put into the optional list in assignment stage. So when
>there is not enough MMIO space, they will not be assigned.
>

Why is it not necessary? The problem isn't related to MMIO space - enough or
not enough - as I explained above. Let's take an example here: the PF has two
IOV BARs. One of them is an M32 BAR and the other is an M64 BAR. In this case,
SRIOV won't be enabled on the PF. Will the code still assign M32 and M64
resources for those VF BARs (or IOV BARs)? If so, those VF BARs are never
used. Aren't we wasting resources? And how much MMIO space is wasted in this
case?
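
Roughly, what I mean by clearing them is something like the following in
pnv_pci_ioda_fixup_iov_resources() - an untested sketch only, reusing the
variables already in that function (the inner index j is new):

	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = &pdev->resource[i + PCI_IOV_RESOURCES];
		if (!res->flags || res->parent)
			continue;
		if (pnv_pci_is_mem_pref_64(res->flags))
			continue;

		dev_warn(&pdev->dev, "Disabling SR-IOV due to non-M64 VF BAR%d: %pR\n",
			 i, res);
		/* Clear every IOV BAR so the PCI core never assigns space to them */
		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) {
			res = &pdev->resource[j + PCI_IOV_RESOURCES];
			res->flags = 0;
			res->end = res->start - 1;
		}
		return;
	}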

>For the long term, maybe P9/P10, we will finally adjust the solution to
>support SRIOV devices with M32 MMIO. So I suggest to leave as it is.
>

I don't see how this problem is specific to P8. However, if you don't want
to improve this case currently, I'm fine. Somebody can improve it in the future.

>>>>>
>>>>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>>-- 
>>>>>1.7.9.5
>>>>>
>>>
>>>-- 
>>>Richard Yang
>>>Help you, Help me
>
>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-07  3:43                     ` Gavin Shan
@ 2015-08-07  5:44                       ` Wei Yang
  2015-08-07  5:54                         ` Gavin Shan
  0 siblings, 1 reply; 56+ messages in thread
From: Wei Yang @ 2015-08-07  5:44 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 01:43:01PM +1000, Gavin Shan wrote:
>On Fri, Aug 07, 2015 at 10:33:33AM +0800, Wei Yang wrote:
>>On Fri, Aug 07, 2015 at 11:36:56AM +1000, Gavin Shan wrote:
>>>On Thu, Aug 06, 2015 at 09:41:41PM +0800, Wei Yang wrote:
>>>>On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>>>>>On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>>>>>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>>>>>discrete.
>>>>>>
>>>>>>This patch restructures the patch to allocate discrete PE# for VFs when M64
>>>>>>BAR is set to Single PE mode.
>>>>>>
>>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>>>---
>>>>>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>>>>>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>>>>>
>>>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>index 8aeba4c..72415c7 100644
>>>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>@@ -213,7 +213,7 @@ struct pci_dn {
>>>>>> #ifdef CONFIG_PCI_IOV
>>>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>>>>-	int     offset;			/* PE# for the first VF PE */
>>>>>>+	int     *offset;		/* PE# for the first VF PE or array */
>>>>>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>>>> #define IODA_INVALID_M64        (-1)
>>>>>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>>>
>>>>>how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>>>>>I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>>>>>all of them will be used, not too much memory will be wasted.
>>>>>
>>>>
>>>>Thanks for your comment.
>>>>
>>>>I have thought about change the name to make it more self explain. While
>>>>another fact I want to take in is this field is also used to be reflect the
>>>>shift offset when M64 BAR is used in the Shared Mode. So I maintain the name.
>>>>
>>>>How about use "enum", one maintain the name "offset", and another one rename to
>>>>"pe_num_map". And use the meaningful name at proper place?
>>>>
>>
>>So I suppose you agree with my naming proposal.
>>
>
>No, I dislike the "enum" things.
>

OK, then do you suggest renaming it to pe_num_map or keeping it as offset?

>>>
>>>Ok. I'm explaining it with more details. There are two cases: single vs shared
>>>mode. When PHB M64 BARs run in single mode, you need an array to track the
>>>allocated discrete PE#. The VF_index is the index to the array. When PHB M64
>>>BARs run in shared mode, you need continuous PE#. No array required for this
>>>case. Instead, the starting PE# should be stored to somewhere, which can
>>>be pdn->offset[0] simply.
>>>
>>>So when allocating memory for this array, you just simply allocate (sizeof(*pdn->offset)
>>>*max_vf_num) no matter what mode PHB's M64 BARs will run in. The point is nobody
>>>can enable (max_vf_num + 1) VFs.
>>
>>The max_vf_num is 15?
>>
>
>I don't understand why you said: the max_vf_num is 15. Since max_vf_num is variable
>on different PFs, how can it be fixed value - 15 ?
>

In the Shared PE case, only one int to indicate the starting PE# is fine.
In Single PE mode, we could enable 15 VFs in total, each with its own PE,
which is limited by the number of M64 BARs we have in the system.

If not, is the number you expected total_vfs?

>>>
>>>With above way, the arrays for PE# and M64 BAR remapping needn't be allocated
>>>when enabling SRIOV capability and releasing on disabling SRIOV capability.
>>>Instead, those two arrays can be allocated during resource fixup time and free'ed
>>>when destroying the pdn.
>>>
>>
>>My same point of view like previous, if the memory is not in the concern, how
>>about define them static?
>>
>
>It's a bad idea from my review. How many entries this array is going to have?
>256 * NUM_OF_MAX_VF_BARS ?
>

No.

It has 15 * 6 entries: at most 15 VFs we could enable, and at most 6 VF BARs
a VF could have.
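
If it stayed static, it would be just the following (purely to illustrate the
size, not a real patch):

	/* 15 = max VFs in Single PE mode, PCI_SRIOV_NUM_BARS = 6 */
	int	m64_map[15][PCI_SRIOV_NUM_BARS];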

>>And for the long term, we may support more VFs. Then at that moment, we need
>>to restructure the code to meet it.
>>
>>So I suggest if we want to allocate it dynamically, we allocate the exact
>>number of space.
>>
>
>Fine... it can be improved when it has to be, as you said.
>

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-07  5:44                       ` Wei Yang
@ 2015-08-07  5:54                         ` Gavin Shan
  2015-08-07  6:25                           ` Wei Yang
  2015-08-07 10:00                           ` Alexey Kardashevskiy
  0 siblings, 2 replies; 56+ messages in thread
From: Gavin Shan @ 2015-08-07  5:54 UTC (permalink / raw)
  To: Wei Yang; +Cc: Gavin Shan, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 01:44:33PM +0800, Wei Yang wrote:
>On Fri, Aug 07, 2015 at 01:43:01PM +1000, Gavin Shan wrote:
>>On Fri, Aug 07, 2015 at 10:33:33AM +0800, Wei Yang wrote:
>>>On Fri, Aug 07, 2015 at 11:36:56AM +1000, Gavin Shan wrote:
>>>>On Thu, Aug 06, 2015 at 09:41:41PM +0800, Wei Yang wrote:
>>>>>On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>>>>>>On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>>>>>>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>>>>>>discrete.
>>>>>>>
>>>>>>>This patch restructures the patch to allocate discrete PE# for VFs when M64
>>>>>>>BAR is set to Single PE mode.
>>>>>>>
>>>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>>>>---
>>>>>>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>>>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>>>>>>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>>>>>>
>>>>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>index 8aeba4c..72415c7 100644
>>>>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>@@ -213,7 +213,7 @@ struct pci_dn {
>>>>>>> #ifdef CONFIG_PCI_IOV
>>>>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>>>>>-	int     offset;			/* PE# for the first VF PE */
>>>>>>>+	int     *offset;		/* PE# for the first VF PE or array */
>>>>>>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>>>>> #define IODA_INVALID_M64        (-1)
>>>>>>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>>>>
>>>>>>how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>>>>>>I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>>>>>>all of them will be used, not too much memory will be wasted.
>>>>>>
>>>>>
>>>>>Thanks for your comment.
>>>>>
>>>>>I have thought about change the name to make it more self explain. While
>>>>>another fact I want to take in is this field is also used to be reflect the
>>>>>shift offset when M64 BAR is used in the Shared Mode. So I maintain the name.
>>>>>
>>>>>How about use "enum", one maintain the name "offset", and another one rename to
>>>>>"pe_num_map". And use the meaningful name at proper place?
>>>>>
>>>
>>>So I suppose you agree with my naming proposal.
>>>
>>
>>No, I dislike the "enum" things.
>>
>
>OK, then you suggest to rename it pe_num_map or keep it as offset?
>

pe_num_map would be better.

>>>>
>>>>Ok. I'm explaining it with more details. There are two cases: single vs shared
>>>>mode. When PHB M64 BARs run in single mode, you need an array to track the
>>>>allocated discrete PE#. The VF_index is the index to the array. When PHB M64
>>>>BARs run in shared mode, you need continuous PE#. No array required for this
>>>>case. Instead, the starting PE# should be stored to somewhere, which can
>>>>be pdn->offset[0] simply.
>>>>
>>>>So when allocating memory for this array, you just simply allocate (sizeof(*pdn->offset)
>>>>*max_vf_num) no matter what mode PHB's M64 BARs will run in. The point is nobody
>>>>can enable (max_vf_num + 1) VFs.
>>>
>>>The max_vf_num is 15?
>>>
>>
>>I don't understand why you said: the max_vf_num is 15. Since max_vf_num is variable
>>on different PFs, how can it be fixed value - 15 ?
>>
>
>In Shared PE case, only one int to indicate the start PE# is fine.
>In Single PE mode, we totally could enable 15 VF, the same number of PEs for
>each VF, which is limited by the number M64 BARs we have in the system.
>
>If not, the number you expected is total_vfs?
>

Then it should be min(total_vfs, phb->ioda.m64_bar_idx), shouldn't it?

>>>>
>>>>With above way, the arrays for PE# and M64 BAR remapping needn't be allocated
>>>>when enabling SRIOV capability and releasing on disabling SRIOV capability.
>>>>Instead, those two arrays can be allocated during resource fixup time and free'ed
>>>>when destroying the pdn.
>>>>
>>>
>>>My same point of view like previous, if the memory is not in the concern, how
>>>about define them static?
>>>
>>
>>It's a bad idea from my review. How many entries this array is going to have?
>>256 * NUM_OF_MAX_VF_BARS ?
>>
>
>No.
>
>It has 15 * 6, 15 VFs we could enable at most and 6 VF BARs a VF could have at
>most.
>

It's min(total_vfs, phb->ioda.m64_bar_idx) VFs that can be enabled at most,
no?

>>>And for the long term, we may support more VFs. Then at that moment, we need
>>>to restructure the code to meet it.
>>>
>>>So I suggest if we want to allocate it dynamically, we allocate the exact
>>>number of space.
>>>
>>
>>Fine... it can be improved when it has to be, as you said.
>>
>
>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-07  5:54                         ` Gavin Shan
@ 2015-08-07  6:25                           ` Wei Yang
  2015-08-07 10:00                           ` Alexey Kardashevskiy
  1 sibling, 0 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-07  6:25 UTC (permalink / raw)
  To: Gavin Shan; +Cc: Wei Yang, aik, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 03:54:48PM +1000, Gavin Shan wrote:
>On Fri, Aug 07, 2015 at 01:44:33PM +0800, Wei Yang wrote:
>>On Fri, Aug 07, 2015 at 01:43:01PM +1000, Gavin Shan wrote:
>>>On Fri, Aug 07, 2015 at 10:33:33AM +0800, Wei Yang wrote:
>>>>On Fri, Aug 07, 2015 at 11:36:56AM +1000, Gavin Shan wrote:
>>>>>On Thu, Aug 06, 2015 at 09:41:41PM +0800, Wei Yang wrote:
>>>>>>On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>>>>>>>On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>>>>>>>When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>>>>>>>discrete.
>>>>>>>>
>>>>>>>>This patch restructures the patch to allocate discrete PE# for VFs when M64
>>>>>>>>BAR is set to Single PE mode.
>>>>>>>>
>>>>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>>>>>---
>>>>>>>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>>>>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>>>>>>>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>>>>>>>
>>>>>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>>index 8aeba4c..72415c7 100644
>>>>>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>>@@ -213,7 +213,7 @@ struct pci_dn {
>>>>>>>> #ifdef CONFIG_PCI_IOV
>>>>>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>>>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>>>>>>-	int     offset;			/* PE# for the first VF PE */
>>>>>>>>+	int     *offset;		/* PE# for the first VF PE or array */
>>>>>>>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>>>>>> #define IODA_INVALID_M64        (-1)
>>>>>>>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>>>>>
>>>>>>>how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>>>>>>>I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>>>>>>>all of them will be used, not too much memory will be wasted.
>>>>>>>
>>>>>>
>>>>>>Thanks for your comment.
>>>>>>
>>>>>>I have thought about change the name to make it more self explain. While
>>>>>>another fact I want to take in is this field is also used to be reflect the
>>>>>>shift offset when M64 BAR is used in the Shared Mode. So I maintain the name.
>>>>>>
>>>>>>How about use "enum", one maintain the name "offset", and another one rename to
>>>>>>"pe_num_map". And use the meaningful name at proper place?
>>>>>>
>>>>
>>>>So I suppose you agree with my naming proposal.
>>>>
>>>
>>>No, I dislike the "enum" things.
>>>
>>
>>OK, then you suggest to rename it pe_num_map or keep it as offset?
>>
>
>pe_num_map would be better.
>
>>>>>
>>>>>Ok. I'm explaining it with more details. There are two cases: single vs shared
>>>>>mode. When PHB M64 BARs run in single mode, you need an array to track the
>>>>>allocated discrete PE#. The VF_index is the index to the array. When PHB M64
>>>>>BARs run in shared mode, you need continuous PE#. No array required for this
>>>>>case. Instead, the starting PE# should be stored to somewhere, which can
>>>>>be pdn->offset[0] simply.
>>>>>
>>>>>So when allocating memory for this array, you just simply allocate (sizeof(*pdn->offset)
>>>>>*max_vf_num) no matter what mode PHB's M64 BARs will run in. The point is nobody
>>>>>can enable (max_vf_num + 1) VFs.
>>>>
>>>>The max_vf_num is 15?
>>>>
>>>
>>>I don't understand why you said: the max_vf_num is 15. Since max_vf_num is variable
>>>on different PFs, how can it be fixed value - 15 ?
>>>
>>
>>In Shared PE case, only one int to indicate the start PE# is fine.
>>In Single PE mode, we totally could enable 15 VF, the same number of PEs for
>>each VF, which is limited by the number M64 BARs we have in the system.
>>
>>If not, the number you expected is total_vfs?
>>
>
>then it should be min(total_vfs, phb->ioda.m64_bar_idx), isn't it? 
>
>>>>>
>>>>>With above way, the arrays for PE# and M64 BAR remapping needn't be allocated
>>>>>when enabling SRIOV capability and releasing on disabling SRIOV capability.
>>>>>Instead, those two arrays can be allocated during resource fixup time and free'ed
>>>>>when destroying the pdn.
>>>>>
>>>>
>>>>My same point of view like previous, if the memory is not in the concern, how
>>>>about define them static?
>>>>
>>>
>>>It's a bad idea from my review. How many entries this array is going to have?
>>>256 * NUM_OF_MAX_VF_BARS ?
>>>
>>
>>No.
>>
>>It has 15 * 6, 15 VFs we could enable at most and 6 VF BARs a VF could have at
>>most.
>>
>
>It's min(total_vfs, phb->ioda.m64_bar_idx) VFs that can be enabled at maximal
>degree, no?
>

Yes, you are right. The number 15 is the one I used when the field is static.
If we want to allocate it dynamically, we need to choose the smaller one.

However, I suggest improving this formula further to min(num_vfs, m64_bar_idx),
since num_vfs <= total_vfs always. That's why num_vfs entries are allocated in
the code.
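
So the allocation would look roughly like this (a sketch only, using the
pe_num_map name proposed above):

	int max_pe_num = min_t(int, num_vfs, phb->ioda.m64_bar_idx);

	pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * max_pe_num,
				  GFP_KERNEL);
	if (!pdn->pe_num_map)
		return -ENOMEM;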

>>>>And for the long term, we may support more VFs. Then at that moment, we need
>>>>to restructure the code to meet it.
>>>>
>>>>So I suggest if we want to allocate it dynamically, we allocate the exact
>>>>number of space.
>>>>
>>>
>>>Fine... it can be improved when it has to be, as you said.
>>>
>>
>>-- 
>>Richard Yang
>>Help you, Help me

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-07  2:24                   ` Wei Yang
  2015-08-07  3:50                     ` Gavin Shan
@ 2015-08-07  7:14                     ` Alexey Kardashevskiy
  2015-08-10  1:40                       ` Wei Yang
  1 sibling, 1 reply; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-07  7:14 UTC (permalink / raw)
  To: Wei Yang, Gavin Shan; +Cc: benh, linuxppc-dev

On 08/07/2015 12:24 PM, Wei Yang wrote:
> On Fri, Aug 07, 2015 at 11:20:10AM +1000, Gavin Shan wrote:
>> On Thu, Aug 06, 2015 at 10:10:10PM +0800, Wei Yang wrote:
>>> On Thu, Aug 06, 2015 at 02:35:57PM +1000, Gavin Shan wrote:
>>>> On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>>> On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>>> a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>>> M64 windwo, which means M64 BAR can't work on it.
>>>>>
>>>>
>>>> s/PHB_IODA2/PHB3
>>>> s/windwo/window
>>>>
>>>>> This patch makes this explicit.
>>>>>
>>>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>
>>>> The idea sounds right, but there is one question as below.
>>>>
>>>>> ---
>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>>> 1 file changed, 9 insertions(+), 16 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>> index 5738d31..9b41dba 100644
>>>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>> @@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>> 		if (!res->flags || !res->parent)
>>>>> 			continue;
>>>>>
>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>> -			continue;
>>>>> -
>>>>> 		/*
>>>>> 		 * The actual IOV BAR range is determined by the start address
>>>>> 		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>>> @@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>> 		if (!res->flags || !res->parent)
>>>>> 			continue;
>>>>>
>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>> -			continue;
>>>>> -
>>>>> 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>>> 		res2 = *res;
>>>>> 		res->start += size * offset;
>>>>> @@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>> 		if (!res->flags || !res->parent)
>>>>> 			continue;
>>>>>
>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>> -			continue;
>>>>> -
>>>>> 		for (j = 0; j < vf_groups; j++) {
>>>>> 			do {
>>>>> 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>> @@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>> 	pdn = pci_get_pdn(pdev);
>>>>>
>>>>> 	if (phb->type == PNV_PHB_IODA2) {
>>>>> +		if (!pdn->vfs_expanded) {
>>>>> +			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>>> +				" with non M64 VF BAR\n");
>>>>> +			return -EBUSY;
>>>>> +		}
>>>>> +
>>>>
>>>> It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temparily
>>>> unavailable. For this case, the VFs are permanently unavailable because of
>>>> running out of space to accomodate M64 and non-M64 VF BARs.
>>>>
>>>> The error message could be printed with dev_warn() and it would be precise
>>>> as below or something else you prefer:
>>>>
>>>> 	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>>>
>>>
>>> Thanks for the comment, will change accordingly.
>>>
>>>>
>>>>> 		/* Calculate available PE for required VFs */
>>>>> 		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>>> 		pdn->offset = bitmap_find_next_zero_area(
>>>>> @@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>> 		if (!res->flags || res->parent)
>>>>> 			continue;
>>>>> 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>> -			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>>> +			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>>> +					" non M64 VF BAR%d: %pR. \n",
>>>>> 				 i, res);
>>>>> -			continue;
>>>>> +			return;
>>>>> 		}
>>>>>
>>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>> @@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>> 		if (!res->flags || res->parent)
>>>>> 			continue;
>>>>> -		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>> -			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>>> -				 i, res);
>>>>> -			continue;
>>>>> -		}
>>>>
>>>> When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>>> Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>>> I think it can be avoided.
>>>>
>>>
>>> Don't get your point. You mean to avoid this function?
>>>
>>> Or clear the IOV BAR when we found one of it is non-M64?
>>>
>>
>> I mean to clear all IOV BARs in case any more of them are IO or M32. In this
>> case, the SRIOV capability won't be enabled. Otherwise, the resources for
>> all IOV BARs are assigned and allocated by PCI subsystem, but they won't
>> be used. Does it make sense to you?
>>
>
> If we want to save MMIO space, this is not necessary.
>
> The IOV BAR will be put into the optional list in assignment stage. So when
> there is not enough MMIO space, they will not be assigned.


If we are not going to use a non-64bit IOV BAR, why would we assign anything 
to it in the first place? Or is it common PCI code which does it?




> For the long term, maybe P9/P10, we will finally adjust the solution to
> support SRIOV devices with M32 MMIO. So I suggest to leave as it is.
>
>>>>>
>>>>> 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);



-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-07  1:48                   ` Wei Yang
@ 2015-08-07  8:13                     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-07  8:13 UTC (permalink / raw)
  To: Wei Yang, Gavin Shan; +Cc: benh, linuxppc-dev

On 08/07/2015 11:48 AM, Wei Yang wrote:
> On Thu, Aug 06, 2015 at 08:07:01PM +1000, Gavin Shan wrote:
>> On Thu, Aug 06, 2015 at 05:36:02PM +0800, Wei Yang wrote:
>>> On Thu, Aug 06, 2015 at 03:20:25PM +1000, Gavin Shan wrote:
>>>> On Wed, Aug 05, 2015 at 09:25:00AM +0800, Wei Yang wrote:
>>>>> In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>>>> BAR in Single PE mode to cover the number of VFs required to be enabled.
>>>>> By doing so, several VFs would be in one VF Group and leads to interference
>>>>> between VFs in the same group.
>>>>>
>>>>> This patch changes the design by using one M64 BAR in Single PE mode for
>>>>> one VF BAR. This gives absolute isolation for VFs.
>>>>>
>>>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>> ---
>>>>> arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>>>>> 2 files changed, 76 insertions(+), 109 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>> index 712add5..8aeba4c 100644
>>>>> --- a/arch/powerpc/include/asm/pci-bridge.h
>>>>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>> @@ -214,10 +214,9 @@ struct pci_dn {
>>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>>> 	int     offset;			/* PE# for the first VF PE */
>>>>> -#define M64_PER_IOV 4
>>>>> -	int     m64_per_iov;
>>>>> +	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>>> #define IODA_INVALID_M64        (-1)
>>>>> -	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>>>> +	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>>
>>>> It can be explicit? For example:
>>>>
>>>> 	int	*m64_map;
>>>>
>>>> 	/* Initialization */
>>>> 	size_t size = sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_of_max_VFs;
>>>> 	pdn->m64_map = kmalloc(size, GFP_KERNEL);
>>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>> 		for (j = 0; j < num_of_max_VFs; j++)
>>>> 			pdn->m64_map[i * PCI_SRIOV_NUM_BARS + j] = PNV_INVALID_M64;
>>>>
>>>> 	/* Destroy */
>>>> 	int step = 1;
>>>>
>>>> 	if (!pdn->m64_single_mode)
>>>> 		step = phb->ioda.total_pe;
>>>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS * num_of_max_VFs; i += step)
>>>> 		if (pdn->m64_map[i] == PNV_INVALID_M64)
>>>> 			continue;
>>>>
>>>> 		/* Unmap the window */
>>>> 	
>>>
>>> The m64_map is a pointer to an array with 6 elements, which represents the 6
>>> M64 BAR index for the 6 VF BARs.
>>>
>>>     When we use Shared Mode, one array is allocated. The six elements
>>>     represents the six M64 BAR(at most) used to map the whole IOV BAR.
>>>
>>>     When we use Single Mode, num_vfs array is allocate. Each array represents
>>>     the map between one VF's BAR and M64 BAR index.
>>>
>>> During the map and un-map, M64 BAR is assigned one by one in VF BAR's order.
>>> So I think the code is explicit.
>>>
>>> In your code, you allocate a big one dimension array to hold the M64 BAR
>>> index. It works, while I don't think this is more explicit than original code.
>>>
>>
>> When M64 is in Single Mode, array with (num_vfs * 6) entries is allocated
>> because every VF BAR (6 at most) will have one corresponding PHB M64 BAR.
>> Anything I missed?
>>
>> The point in my code is you needn't worry about the mode (single vs shared)
>> As I said, not too much memory wasted. However, it's up to you.
>>
>
> If we don't want to save some memory, how about just define them static
> instead of dynamically allocate?


I like static, and you can make it uint8_t[][] (or char[][]) as these 
indexes are not going to be bigger than 255 anyway.
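
Roughly like this (a sketch only; the 15 comes from the Single PE mode limit
discussed elsewhere, and IODA_INVALID_M64 would need to become something that
fits in a byte, e.g. 0xff):

	u8	m64_map[15][PCI_SRIOV_NUM_BARS];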



>> I'm not fan of "int (*m64_map)[PCI_SRIOV_NUM_BARS]". Instead, you can replace
>> it with "int *m64_map" and calculate its size using following formula:
>>
>> 	sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS;
>>
>> 	sizeof(*pdn->m64_map) * PCI_SRIOV_NUM_BARS * num_vfs;




-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-07  2:01               ` Wei Yang
@ 2015-08-07  8:59                 ` Alexey Kardashevskiy
  2015-08-10  1:48                   ` Wei Yang
  0 siblings, 1 reply; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-07  8:59 UTC (permalink / raw)
  To: Wei Yang; +Cc: gwshan, benh, linuxppc-dev

On 08/07/2015 12:01 PM, Wei Yang wrote:
> On Thu, Aug 06, 2015 at 08:04:58PM +1000, Alexey Kardashevskiy wrote:
>> On 08/05/2015 11:25 AM, Wei Yang wrote:
>>> In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>> BAR in Single PE mode to cover the number of VFs required to be enabled.
>>> By doing so, several VFs would be in one VF Group and leads to interference
>>> between VFs in the same group.
>>>
>>> This patch changes the design by using one M64 BAR in Single PE mode for
>>> one VF BAR. This gives absolute isolation for VFs.
>>>
>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>> ---
>>>   arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>>   arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>>>   2 files changed, 76 insertions(+), 109 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>> index 712add5..8aeba4c 100644
>>> --- a/arch/powerpc/include/asm/pci-bridge.h
>>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>>> @@ -214,10 +214,9 @@ struct pci_dn {
>>>   	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>   	u16     num_vfs;		/* number of VFs enabled*/
>>>   	int     offset;			/* PE# for the first VF PE */
>>> -#define M64_PER_IOV 4
>>> -	int     m64_per_iov;
>>> +	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>   #define IODA_INVALID_M64        (-1)
>>> -	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>> +	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>   #endif /* CONFIG_PCI_IOV */
>>>   #endif
>>>   	struct list_head child_list;
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index 7192e62..f5d110c 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void)
>>>   }
>>>
>>>   #ifdef CONFIG_PCI_IOV
>>> -static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
>>> +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>>>   {
>>>   	struct pci_bus        *bus;
>>>   	struct pci_controller *hose;
>>>   	struct pnv_phb        *phb;
>>>   	struct pci_dn         *pdn;
>>>   	int                    i, j;
>>> +	int                    m64_bars;
>>>
>>>   	bus = pdev->bus;
>>>   	hose = pci_bus_to_host(bus);
>>>   	phb = hose->private_data;
>>>   	pdn = pci_get_pdn(pdev);
>>>
>>> +	if (pdn->m64_single_mode)
>>> +		m64_bars = num_vfs;
>>> +	else
>>> +		m64_bars = 1;
>>> +
>>>   	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>> -		for (j = 0; j < M64_PER_IOV; j++) {
>>> -			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
>>> +		for (j = 0; j < m64_bars; j++) {
>>> +			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
>>>   				continue;
>>>   			opal_pci_phb_mmio_enable(phb->opal_id,
>>> -				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
>>> -			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
>>> -			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>> +				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
>>> +			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
>>> +			pdn->m64_map[j][i] = IODA_INVALID_M64;
>>>   		}
>>>
>>> +	kfree(pdn->m64_map);
>>>   	return 0;
>>>   }
>>>
>>> @@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>   	int                    total_vfs;
>>>   	resource_size_t        size, start;
>>>   	int                    pe_num;
>>> -	int                    vf_groups;
>>> -	int                    vf_per_group;
>>> +	int                    m64_bars;
>>>
>>>   	bus = pdev->bus;
>>>   	hose = pci_bus_to_host(bus);
>>> @@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>   	pdn = pci_get_pdn(pdev);
>>>   	total_vfs = pci_sriov_get_totalvfs(pdev);
>>>
>>> -	/* Initialize the m64_wins to IODA_INVALID_M64 */
>>> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>> -		for (j = 0; j < M64_PER_IOV; j++)
>>> -			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>> +	if (pdn->m64_single_mode)
>>
>>
>> This is a physical function's @pdn, right?
>
> Yes
>
>>
>>
>>> +		m64_bars = num_vfs;
>>> +	else
>>> +		m64_bars = 1;
>>> +
>>> +	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
>>
>>
>> Assume we have SRIOV device with 16VF.
>> So it was m64_wins[6][4], now it is (roughly speaking) m64_map[6][16]
>> (for a single PE mode) or m64_map[6][1]. I believe m64_bars cannot be
>> bigger than 16 on PHB3, right? Is this checked anywhere (does it have
>> to)?
>
> In pnv_pci_vf_assign_m64(), we need to find_next_zero_bit() and check the
> return value. If exceed m64_bar_idx, means fail.
>
>>
>> This m64_wins -> m64_map change - is was not a map (what was it?),
>> and it is, is not it?
>
> Hmm... Gavin like this name.
>
>>
>> What does it store? An index of M64 BAR (0..15)?
>>
>
> Yes.
>
>>
>>
>>> +	if (!pdn->m64_map)
>>> +		return -ENOMEM;
>>> +	/* Initialize the m64_map to IODA_INVALID_M64 */
>>> +	for (i = 0; i < m64_bars ; i++)
>>> +		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
>>> +			pdn->m64_map[i][j] = IODA_INVALID_M64;
>>>
>>> -	if (pdn->m64_per_iov == M64_PER_IOV) {
>>> -		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>>> -		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>>> -			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>> -	} else {
>>> -		vf_groups = 1;
>>> -		vf_per_group = 1;
>>> -	}
>>>
>>>   	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>>   		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>   		if (!res->flags || !res->parent)
>>>   			continue;
>>>
>>> -		for (j = 0; j < vf_groups; j++) {
>>> +		for (j = 0; j < m64_bars; j++) {
>>>   			do {
>>>   				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>   						phb->ioda.m64_bar_idx + 1, 0);
>>> @@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>   					goto m64_failed;
>>>   			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
>>>
>>> -			pdn->m64_wins[i][j] = win;
>>> +			pdn->m64_map[j][i] = win;
>>>
>>> -			if (pdn->m64_per_iov == M64_PER_IOV) {
>>> +			if (pdn->m64_single_mode) {
>>>   				size = pci_iov_resource_size(pdev,
>>>   							PCI_IOV_RESOURCES + i);
>>> -				size = size * vf_per_group;
>>>   				start = res->start + size * j;
>>>   			} else {
>>>   				size = resource_size(res);
>>> @@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>   			}
>>>
>>>   			/* Map the M64 here */
>>> -			if (pdn->m64_per_iov == M64_PER_IOV) {
>>> +			if (pdn->m64_single_mode) {
>>>   				pe_num = pdn->offset + j;
>>>   				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>>   						pe_num, OPAL_M64_WINDOW_TYPE,
>>> -						pdn->m64_wins[i][j], 0);
>>> +						pdn->m64_map[j][i], 0);
>>>   			}
>>>
>>>   			rc = opal_pci_set_phb_mem_window(phb->opal_id,
>>>   						 OPAL_M64_WINDOW_TYPE,
>>> -						 pdn->m64_wins[i][j],
>>> +						 pdn->m64_map[j][i],
>>>   						 start,
>>>   						 0, /* unused */
>>>   						 size);
>>> @@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>   				goto m64_failed;
>>>   			}
>>>
>>> -			if (pdn->m64_per_iov == M64_PER_IOV)
>>> +			if (pdn->m64_single_mode)
>>>   				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>> -				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
>>> +				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
>>>   			else
>>>   				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>> -				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
>>> +				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
>>>
>>>   			if (rc != OPAL_SUCCESS) {
>>>   				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
>>> @@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>   	return 0;
>>>
>>>   m64_failed:
>>> -	pnv_pci_vf_release_m64(pdev);
>>> +	pnv_pci_vf_release_m64(pdev, num_vfs);
>>>   	return -EBUSY;
>>>   }
>>>
>>> @@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>>>   	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>>   }
>>>
>>> -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>> +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>>   {
>>>   	struct pci_bus        *bus;
>>>   	struct pci_controller *hose;
>>>   	struct pnv_phb        *phb;
>>>   	struct pnv_ioda_pe    *pe, *pe_n;
>>>   	struct pci_dn         *pdn;
>>> -	u16                    vf_index;
>>> -	int64_t                rc;
>>>
>>>   	bus = pdev->bus;
>>>   	hose = pci_bus_to_host(bus);
>>> @@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>   	if (!pdev->is_physfn)
>>>   		return;
>>>
>>> -	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>> -		int   vf_group;
>>> -		int   vf_per_group;
>>> -		int   vf_index1;
>>> -
>>> -		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>> -
>>> -		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>>> -			for (vf_index = vf_group * vf_per_group;
>>> -				vf_index < (vf_group + 1) * vf_per_group &&
>>> -				vf_index < num_vfs;
>>> -				vf_index++)
>>> -				for (vf_index1 = vf_group * vf_per_group;
>>> -					vf_index1 < (vf_group + 1) * vf_per_group &&
>>> -					vf_index1 < num_vfs;
>>> -					vf_index1++){
>>> -
>>> -					rc = opal_pci_set_peltv(phb->opal_id,
>>> -						pdn->offset + vf_index,
>>> -						pdn->offset + vf_index1,
>>> -						OPAL_REMOVE_PE_FROM_DOMAIN);
>>> -
>>> -					if (rc)
>>> -					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>>> -						__func__,
>>> -						pdn->offset + vf_index1, rc);
>>> -				}
>>> -	}
>>> -
>>>   	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>>>   		if (pe->parent_dev != pdev)
>>>   			continue;
>>> @@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>>   	num_vfs = pdn->num_vfs;
>>>
>>>   	/* Release VF PEs */
>>> -	pnv_ioda_release_vf_PE(pdev, num_vfs);
>>> +	pnv_ioda_release_vf_PE(pdev);
>>>
>>>   	if (phb->type == PNV_PHB_IODA2) {
>>> -		if (pdn->m64_per_iov == 1)
>>> +		if (!pdn->m64_single_mode)
>>>   			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>>
>>>   		/* Release M64 windows */
>>> -		pnv_pci_vf_release_m64(pdev);
>>> +		pnv_pci_vf_release_m64(pdev, num_vfs);
>>>
>>>   		/* Release PE numbers */
>>>   		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>> @@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>   	int                    pe_num;
>>>   	u16                    vf_index;
>>>   	struct pci_dn         *pdn;
>>> -	int64_t                rc;
>>>
>>>   	bus = pdev->bus;
>>>   	hose = pci_bus_to_host(bus);
>>> @@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>
>>>   		pnv_pci_ioda2_setup_dma_pe(phb, pe);
>>>   	}
>>> -
>>> -	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>> -		int   vf_group;
>>> -		int   vf_per_group;
>>> -		int   vf_index1;
>>> -
>>> -		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>> -
>>> -		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>>> -			for (vf_index = vf_group * vf_per_group;
>>> -			     vf_index < (vf_group + 1) * vf_per_group &&
>>> -			     vf_index < num_vfs;
>>> -			     vf_index++) {
>>> -				for (vf_index1 = vf_group * vf_per_group;
>>> -				     vf_index1 < (vf_group + 1) * vf_per_group &&
>>> -				     vf_index1 < num_vfs;
>>> -				     vf_index1++) {
>>> -
>>> -					rc = opal_pci_set_peltv(phb->opal_id,
>>> -						pdn->offset + vf_index,
>>> -						pdn->offset + vf_index1,
>>> -						OPAL_ADD_PE_TO_DOMAIN);
>>> -
>>> -					if (rc)
>>> -					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>>> -						__func__,
>>> -						pdn->offset + vf_index1, rc);
>>> -				}
>>> -			}
>>> -		}
>>> -	}
>>>   }
>>>
>>>   int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>> @@ -1507,6 +1449,18 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>   			return -EBUSY;
>>>   		}
>>>
>>> +		/*
>>> +		 * On PNV_PHB_IODA2, We just have 16 M64 BARs and M64 BAR #15
>>> +		 * is used to cover the whole system, which leaves only 15 M64
>>> +		 * BAR usable for VFs.
>>> +		 * When M64 BAR functions in Single PE mode, this means it
>>> +		 * just could enable 15 VFs.
>>> +		 */
>>> +		if (pdn->m64_single_mode && num_vfs >= 16) {
>>
>> Magic constant 16. Where did this 16 come from? My understanding is
>> it could come from
>>
>> 1) hostboot or
>> 2) OPAL or
>> 3) architected on IODA2
>> 4) defined in PHB3 (actually it has to be 2))
>>
>> which one is it? If 1) and 2) - make it a variable; if 3) - add a macro for it.
>>
>
> As Gavin indicated, this will change to "num_vfs > phb->ioda.m64_bar_idx"


This does not really answer my question ;) But I believe it is 4), as PHB3
(IODA2 does not mention M64 at all) has only 16 M64s per PHB.

Still, pnv_ioda_parse_m64_window() puts 15 into m64_bar_idx with no
explanation why. It would help if there were a
"#define PNV_IODA2_PHB3_M64_MAX_NUMBER 16" somewhere, or some call to OPAL
which returned this "16" on PHB3, but there is none.
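
Something along these lines would do (a sketch; the macro name is just an
example):

	/* PHB3 provides 16 M64 BARs; the last one maps the whole M64 space */
	#define PHB3_M64_BAR_COUNT	16

	phb->ioda.m64_bar_idx = PHB3_M64_BAR_COUNT - 1;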



>
>>
>>> +			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
>>> +			return -EBUSY;
>>> +		}
>>> +
>>>   		/* Calculate available PE for required VFs */
>>>   		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>   		pdn->offset = bitmap_find_next_zero_area(
>>> @@ -1534,7 +1488,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>   		 * the IOV BAR according to the PE# allocated to the VFs.
>>>   		 * Otherwise, the PE# for the VF will conflict with others.
>>>   		 */
>>> -		if (pdn->m64_per_iov == 1) {
>>> +		if (!pdn->m64_single_mode) {
>>>   			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>>>   			if (ret)
>>>   				goto m64_failed;
>>> @@ -1567,8 +1521,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>   	/* Allocate PCI data */
>>>   	add_dev_pci_data(pdev);
>>>
>>> -	pnv_pci_sriov_enable(pdev, num_vfs);
>>> -	return 0;
>>> +	return pnv_pci_sriov_enable(pdev, num_vfs);
>>>   }
>>>   #endif /* CONFIG_PCI_IOV */
>>>
>>> @@ -2761,9 +2714,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>
>>>   	pdn = pci_get_pdn(pdev);
>>>   	pdn->vfs_expanded = 0;
>>> +	pdn->m64_single_mode = false;
>>>
>>>   	total_vfs = pci_sriov_get_totalvfs(pdev);
>>> -	pdn->m64_per_iov = 1;
>>>   	mul = phb->ioda.total_pe;
>>>
>>>   	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>> @@ -2783,8 +2736,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>   		if (size > (1 << 26)) {
>>>   			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>>>   				 i, res);
>>> -			pdn->m64_per_iov = M64_PER_IOV;
>>>   			mul = roundup_pow_of_two(total_vfs);
>>> +			pdn->m64_single_mode = true;
>>>   			break;
>>>   		}
>>>   	}
>>> @@ -2986,6 +2939,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
>>>   static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>>   						      int resno)
>>>   {
>>> +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>> +	struct pnv_phb *phb = hose->private_data;
>>>   	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>   	resource_size_t align;
>>>
>>> @@ -2994,12 +2949,25 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>>   	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>>   	 * BAR should be size aligned.
>>>   	 *
>>> +	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the hardware
>>> +	 * restriction to alignment is gone.
>>
>>
>> Gone? Does not BAR still have to be aligned to its size?
>>
>
> Yes, M64 BAR is always size aligned. While since in Single PE mode, the M64
> BAR size is the same as a VF BAR size, which means they have the same
> alignment now.  What I want to say is the extra hardware restriction is gone.
>
> Let me put more to explain this.

Sure. Just add "extra powernv-specific" before "hardware restriction" (or 
something like that).



>>
>>> But if just use the VF BAR size
>>> +	 * as the alignment, PF BAR / VF BAR may be allocated with in one M64
>>> +	 * segment,
>>
>>
>> I thought each VF gets its own _segment_, am I wrong?
>>
>
>  From the one M64 BAR map the VF BAR, yes.
>
> While we have M64 BAR#15 to cover the whole 64bit MMIO space, whose segment
> size is bigger then the one map the VF BARA. When not properly aligned, VF and
> PF may sit in the same segment of the M64 BAR#15.


When is M64 BAR #15 not in Single PE mode? Always?



>>
>>> which introduces the PE conflict between PF and VF. Based
>>> +	 * on this the minimum alignment of an IOV BAR is m64_segsize.
>>>
>>> +	 *
>>>   	 * This function return the total IOV BAR size if expanded or just the
>>> -	 * individual size if not.
>>> +	 * individual size if not, when M64 BAR is in Shared PE mode.
>>> +	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
>>> +	 * m64_size if IOV BAR size is less.
>>>   	 */
>>>   	align = pci_iov_resource_size(pdev, resno);
>>> -	if (pdn->vfs_expanded)
>>> -		return pdn->vfs_expanded * align;
>>> +	if (pdn->vfs_expanded) {
>>> +		if (pdn->m64_single_mode)
>>> +			return max(align,
>>> +				(resource_size_t)phb->ioda.m64_segsize);
>>> +		else
>>> +			return pdn->vfs_expanded * align;
>>> +	}
>>>
>>>   	return align;
>>>   }



-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 4/6] powerpc/powernv: replace the hard coded boundary with gate
  2015-08-06  5:26             ` Gavin Shan
@ 2015-08-07  9:11               ` Alexey Kardashevskiy
  0 siblings, 0 replies; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-07  9:11 UTC (permalink / raw)
  To: Gavin Shan, Wei Yang; +Cc: benh, linuxppc-dev

On 08/06/2015 03:26 PM, Gavin Shan wrote:
> On Wed, Aug 05, 2015 at 09:25:01AM +0800, Wei Yang wrote:
>> Based on the limitation of M64 Window size, when VF BAR size is bigger than
>> 64MB, IOV BAR just round up power of 2 of the total_vfs. While the 64MB is
>> a magic boundary in code, which is hard to maintain.
>>
>> This patch replaces the hard coded boundary with gate, which is calculated
>>from m64_segsize and adds comment to explain the reason for it.
>>
>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/platforms/powernv/pci-ioda.c |   22 +++++++++++++++++-----
>> 1 file changed, 17 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index f5d110c..31dcedc 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -2702,7 +2702,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>> 	struct pnv_phb *phb;
>> 	struct resource *res;
>> 	int i;
>> -	resource_size_t size;
>> +	resource_size_t size, gate;
>> 	struct pci_dn *pdn;
>> 	int mul, total_vfs;
>>
>> @@ -2718,6 +2718,17 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>
>> 	total_vfs = pci_sriov_get_totalvfs(pdev);
>> 	mul = phb->ioda.total_pe;
>> +	/*
>> +	 * If bigger than or equal to half of m64_segsize, just round up power
>> +	 * of two.
>> +	 *
>> +	 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict with
>> +	 * other devices, IOV BAR size is expanded to be (total_pe * VF size).
>> +	 * When VF size is half of m64_segsize , the expanded size would equal
>> +	 * to half of the whole M64 Window size, which will exhaust the M64
>> +	 * Window and limit the system flexibility.
>> +	 */
>
> s/VF size/VF BAR size
> s/m64_segsize/M64 segment size
> s/M64 Window/M64 space

I thought I had started understanding the stuff, and you just introduced a
new term - "M64 space". Not "64bit MMIO space" but "M64 space" - what is
this? Is that the 64GB 64bit MMIO window which we get from hostboot?


>
>> +	gate = phb->ioda.m64_segsize >> 1;
>>
>> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>> 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>> @@ -2732,10 +2743,11 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>
>> 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>
>> -		/* bigger than 64M */
>> -		if (size > (1 << 26)) {
>> -			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>> -				 i, res);
>> +		/* bigger than or equal to gate */
>> +		if (size >= gate) {
>> +			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size "
>> +				"is bigger than %lld, roundup power2\n",
>> +				 i, res, gate);
>
> If I understand the changes correctly, single VF BAR size is still checked against
> the "gate" (128MB), not the total VF BAR size. Recap the comments I gave last time:
>
> I mean to check the sum of all VF BARs. For example, the VFs attached to its PF has two
> VF BARs and each of them is 64MB. For this case, the MMIO resource can't be allocated
> once extending them to 256 VFs. So we have to try "single-pe-mode" for this situation.
> So the check becomes as below:
>
>          struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>          struct pnv_phb *phb = hose->private_data;
>          resource_size_t total_vf_bar_sz = 0;
>          resource_size_t gate;
>
>          /* Some comments to explain the "gate" */
>          gate = phb->m64_segsize / 2;
>          for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>                  total_vf_bar_sz += pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i);
>
>          if (total_vf_bar_sz >= gate)


Why would we compare against the total size of the BARs? If VFs have three
64MB BARs each (these are 64bit BARs, so up to 3 per VF, right?), which is
192MB in total per VF, we can use 3 M64s, each in segmented mode (1 segment
== 64MB), and cover many VFs.



>                  /* single-pe-mode */
>          else
>                  /* shared-mode */
>
>> 			mul = roundup_pow_of_two(total_vfs);
>> 			pdn->m64_single_mode = true;
>> 			break;
>> --
>> 1.7.9.5
>>
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode
  2015-08-07  5:54                         ` Gavin Shan
  2015-08-07  6:25                           ` Wei Yang
@ 2015-08-07 10:00                           ` Alexey Kardashevskiy
  1 sibling, 0 replies; 56+ messages in thread
From: Alexey Kardashevskiy @ 2015-08-07 10:00 UTC (permalink / raw)
  To: Gavin Shan, Wei Yang; +Cc: benh, linuxppc-dev

On 08/07/2015 03:54 PM, Gavin Shan wrote:
> On Fri, Aug 07, 2015 at 01:44:33PM +0800, Wei Yang wrote:
>> On Fri, Aug 07, 2015 at 01:43:01PM +1000, Gavin Shan wrote:
>>> On Fri, Aug 07, 2015 at 10:33:33AM +0800, Wei Yang wrote:
>>>> On Fri, Aug 07, 2015 at 11:36:56AM +1000, Gavin Shan wrote:
>>>>> On Thu, Aug 06, 2015 at 09:41:41PM +0800, Wei Yang wrote:
>>>>>> On Thu, Aug 06, 2015 at 03:36:01PM +1000, Gavin Shan wrote:
>>>>>>> On Wed, Aug 05, 2015 at 09:25:03AM +0800, Wei Yang wrote:
>>>>>>>> When M64 BAR is set to Single PE mode, the PE# assigned to VF could be
>>>>>>>> discrete.

s/discrete/sparse/ maybe?


>>>>>>>>
>>>>>>>> This patch restructures the patch to allocate discrete PE# for VFs when M64
>>>>>>>> BAR is set to Single PE mode.
>>>>>>>>
>>>>>>>> Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>>>>> ---
>>>>>>>> arch/powerpc/include/asm/pci-bridge.h     |    2 +-
>>>>>>>> arch/powerpc/platforms/powernv/pci-ioda.c |   69 +++++++++++++++++++++--------
>>>>>>>> 2 files changed, 51 insertions(+), 20 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>> index 8aeba4c..72415c7 100644
>>>>>>>> --- a/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>>>>> @@ -213,7 +213,7 @@ struct pci_dn {
>>>>>>>> #ifdef CONFIG_PCI_IOV
>>>>>>>> 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>>>>>> 	u16     num_vfs;		/* number of VFs enabled*/
>>>>>>>> -	int     offset;			/* PE# for the first VF PE */
>>>>>>>> +	int     *offset;		/* PE# for the first VF PE or array */
>>>>>>>> 	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>>>>>> #define IODA_INVALID_M64        (-1)
>>>>>>>> 	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>>>>>
>>>>>>> how about renaming "offset" to "pe_num_map", or "pe_map" ? Similar to the comments
>>>>>>> I gave to the "m64_bar_map", num_of_max_vfs entries can be allocated. Though not
>>>>>>> all of them will be used, not too much memory will be wasted.
>>>>>>>
>>>>>>
>>>>>> Thanks for your comment.
>>>>>>
>>>>>> I have thought about change the name to make it more self explain. While
>>>>>> another fact I want to take in is this field is also used to be reflect the
>>>>>> shift offset when M64 BAR is used in the Shared Mode. So I maintain the name.
>>>>>>
>>>>>> How about use "enum", one maintain the name "offset", and another one rename to
>>>>>> "pe_num_map". And use the meaningful name at proper place?
>>>>>>
>>>>
>>>> So I suppose you agree with my naming proposal.
>>>>
>>>
>>> No, I dislike the "enum" things.
>>>
>>
>> OK, then you suggest to rename it pe_num_map or keep it as offset?
>>
>
> pe_num_map would be better.


+1. @offset is very confusing.

It could actually be a linked list: a "struct list_head pe_list" in pci_dn
and a "struct list_head next" in struct pnv_ioda_pe. I could not quickly
spot any place where you would access this array outside a
for (i = 0; i < num_vfs; ++i) loop.
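
Roughly (a sketch of the alternative; the field names are just suggestions):

	struct pci_dn {
		...
		struct list_head	pe_list;	/* VF PEs owned by this PF */
	};

	struct pnv_ioda_pe {
		...
		struct list_head	next;		/* link in pdn->pe_list */
	};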



-- 
Alexey

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR
  2015-08-07  7:14                     ` Alexey Kardashevskiy
@ 2015-08-10  1:40                       ` Wei Yang
  0 siblings, 0 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-10  1:40 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Wei Yang, Gavin Shan, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 05:14:41PM +1000, Alexey Kardashevskiy wrote:
>On 08/07/2015 12:24 PM, Wei Yang wrote:
>>On Fri, Aug 07, 2015 at 11:20:10AM +1000, Gavin Shan wrote:
>>>On Thu, Aug 06, 2015 at 10:10:10PM +0800, Wei Yang wrote:
>>>>On Thu, Aug 06, 2015 at 02:35:57PM +1000, Gavin Shan wrote:
>>>>>On Wed, Aug 05, 2015 at 09:24:58AM +0800, Wei Yang wrote:
>>>>>>On PHB_IODA2, we enable SRIOV devices by mapping IOV BAR with M64 BARs. If
>>>>>>a SRIOV device's BAR is not 64-bit prefetchable, this is not assigned from
>>>>>>M64 windwo, which means M64 BAR can't work on it.
>>>>>>
>>>>>
>>>>>s/PHB_IODA2/PHB3
>>>>>s/windwo/window
>>>>>
>>>>>>This patch makes this explicit.
>>>>>>
>>>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>>
>>>>>The idea sounds right, but there is one question as below.
>>>>>
>>>>>>---
>>>>>>arch/powerpc/platforms/powernv/pci-ioda.c |   25 +++++++++----------------
>>>>>>1 file changed, 9 insertions(+), 16 deletions(-)
>>>>>>
>>>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>>index 5738d31..9b41dba 100644
>>>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>>>@@ -908,9 +908,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>>		if (!res->flags || !res->parent)
>>>>>>			continue;
>>>>>>
>>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>>-			continue;
>>>>>>-
>>>>>>		/*
>>>>>>		 * The actual IOV BAR range is determined by the start address
>>>>>>		 * and the actual size for num_vfs VFs BAR.  This check is to
>>>>>>@@ -939,9 +936,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>>>>>>		if (!res->flags || !res->parent)
>>>>>>			continue;
>>>>>>
>>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>>-			continue;
>>>>>>-
>>>>>>		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
>>>>>>		res2 = *res;
>>>>>>		res->start += size * offset;
>>>>>>@@ -1221,9 +1215,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>>>		if (!res->flags || !res->parent)
>>>>>>			continue;
>>>>>>
>>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags))
>>>>>>-			continue;
>>>>>>-
>>>>>>		for (j = 0; j < vf_groups; j++) {
>>>>>>			do {
>>>>>>				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>>>@@ -1510,6 +1501,12 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>>>	pdn = pci_get_pdn(pdev);
>>>>>>
>>>>>>	if (phb->type == PNV_PHB_IODA2) {
>>>>>>+		if (!pdn->vfs_expanded) {
>>>>>>+			dev_info(&pdev->dev, "don't support this SRIOV device"
>>>>>>+				" with non M64 VF BAR\n");
>>>>>>+			return -EBUSY;
>>>>>>+		}
>>>>>>+
>>>>>
>>>>>It would be -ENOSPC since -EBUSY indicates the devices (VFs) are temporarily
>>>>>unavailable. In this case, the VFs are permanently unavailable because of
>>>>>running out of space to accommodate M64 and non-M64 VF BARs.
>>>>>
>>>>>The error message could be printed with dev_warn() and it would be precise
>>>>>as below or something else you prefer:
>>>>>
>>>>>	dev_warn(&pdev->dev, "SRIOV not supported because of non-M64 VF BAR\n");
>>>>>
>>>>
>>>>Thanks for the comment, will change accordingly.
>>>>
>>>>>
>>>>>>		/* Calculate available PE for required VFs */
>>>>>>		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>>>>		pdn->offset = bitmap_find_next_zero_area(
>>>>>>@@ -2774,9 +2771,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>>		if (!res->flags || res->parent)
>>>>>>			continue;
>>>>>>		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>>-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
>>>>>>+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>>>>>>+					" non M64 VF BAR%d: %pR. \n",
>>>>>>				 i, res);
>>>>>>-			continue;
>>>>>>+			return;
>>>>>>		}
>>>>>>
>>>>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>>>>>>@@ -2795,11 +2793,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>>>		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>>>		if (!res->flags || res->parent)
>>>>>>			continue;
>>>>>>-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
>>>>>>-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
>>>>>>-				 i, res);
>>>>>>-			continue;
>>>>>>-		}
>>>>>
>>>>>When any one IOV BAR on the PF is non-M64, none of the VFs can be enabled.
>>>>>Will we still allocate/assign M64 or M32 resources for the IOV BARs? If so,
>>>>>I think it can be avoided.
>>>>>
>>>>
>>>>I don't get your point. Do you mean to avoid this function?
>>>>
>>>>Or to clear the IOV BAR when we find one of them is non-M64?
>>>>
>>>
>>>I mean to clear all IOV BARs in case any one of them is IO or M32. In this
>>>case, the SRIOV capability won't be enabled. Otherwise, the resources for
>>>all IOV BARs are assigned and allocated by the PCI subsystem, but they won't
>>>be used. Does that make sense to you?
>>>
>>
>>If we want to save MMIO space, this is not necessary.
>>
>>The IOV BARs will be put into the optional list in the assignment stage. So when
>>there is not enough MMIO space, they will not be assigned.
>
>
>If we are not going to use a non-64bit IOV BAR, why would we assign
>anything to it in the first place? Or is it common PCI code which
>does it?
>

Yes.

First skiboot allocates the range, then the kernel reads it. The kernel has two
choices: use the addresses the firmware allocated, or re-assign them as we do on
the powernv platform.
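
If I remember correctly, the re-assignment on powernv comes from a flag set
at boot time, something like this (a sketch from memory, not an exact quote
of the code):

	/* arch/powerpc/platforms/powernv/pci.c, pnv_pci_init() */
	pci_add_flags(PCI_REASSIGN_ALL_RSRC);

With that flag set, the generic powerpc PCI code discards the firmware-assigned
addresses and re-assigns all resources itself, including the IOV BARs.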

>
>
>
>>In the long term, maybe on P9/P10, we will finally adjust the solution to
>>support SRIOV devices with M32 MMIO. So I suggest leaving it as it is.
>>
>>>>>>
>>>>>>		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>>>>>>		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
>
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR
  2015-08-07  8:59                 ` Alexey Kardashevskiy
@ 2015-08-10  1:48                   ` Wei Yang
  0 siblings, 0 replies; 56+ messages in thread
From: Wei Yang @ 2015-08-10  1:48 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Wei Yang, gwshan, benh, linuxppc-dev

On Fri, Aug 07, 2015 at 06:59:58PM +1000, Alexey Kardashevskiy wrote:
>On 08/07/2015 12:01 PM, Wei Yang wrote:
>>On Thu, Aug 06, 2015 at 08:04:58PM +1000, Alexey Kardashevskiy wrote:
>>>On 08/05/2015 11:25 AM, Wei Yang wrote:
>>>>In current implementation, when VF BAR is bigger than 64MB, it uses 4 M64
>>>>BAR in Single PE mode to cover the number of VFs required to be enabled.
>>>>By doing so, several VFs would be in one VF Group and leads to interference
>>>>between VFs in the same group.
>>>>
>>>>This patch changes the design by using one M64 BAR in Single PE mode for
>>>>one VF BAR. This gives absolute isolation for VFs.
>>>>
>>>>Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
>>>>---
>>>>  arch/powerpc/include/asm/pci-bridge.h     |    5 +-
>>>>  arch/powerpc/platforms/powernv/pci-ioda.c |  180 ++++++++++++-----------------
>>>>  2 files changed, 76 insertions(+), 109 deletions(-)
>>>>
>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>>>>index 712add5..8aeba4c 100644
>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>@@ -214,10 +214,9 @@ struct pci_dn {
>>>>  	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>>>>  	u16     num_vfs;		/* number of VFs enabled*/
>>>>  	int     offset;			/* PE# for the first VF PE */
>>>>-#define M64_PER_IOV 4
>>>>-	int     m64_per_iov;
>>>>+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
>>>>  #define IODA_INVALID_M64        (-1)
>>>>-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
>>>>+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
>>>>  #endif /* CONFIG_PCI_IOV */
>>>>  #endif
>>>>  	struct list_head child_list;
>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>index 7192e62..f5d110c 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>@@ -1148,29 +1148,36 @@ static void pnv_pci_ioda_setup_PEs(void)
>>>>  }
>>>>
>>>>  #ifdef CONFIG_PCI_IOV
>>>>-static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
>>>>+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>  {
>>>>  	struct pci_bus        *bus;
>>>>  	struct pci_controller *hose;
>>>>  	struct pnv_phb        *phb;
>>>>  	struct pci_dn         *pdn;
>>>>  	int                    i, j;
>>>>+	int                    m64_bars;
>>>>
>>>>  	bus = pdev->bus;
>>>>  	hose = pci_bus_to_host(bus);
>>>>  	phb = hose->private_data;
>>>>  	pdn = pci_get_pdn(pdev);
>>>>
>>>>+	if (pdn->m64_single_mode)
>>>>+		m64_bars = num_vfs;
>>>>+	else
>>>>+		m64_bars = 1;
>>>>+
>>>>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>>-		for (j = 0; j < M64_PER_IOV; j++) {
>>>>-			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
>>>>+		for (j = 0; j < m64_bars; j++) {
>>>>+			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
>>>>  				continue;
>>>>  			opal_pci_phb_mmio_enable(phb->opal_id,
>>>>-				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
>>>>-			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
>>>>-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>>>+				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
>>>>+			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
>>>>+			pdn->m64_map[j][i] = IODA_INVALID_M64;
>>>>  		}
>>>>
>>>>+	kfree(pdn->m64_map);
>>>>  	return 0;
>>>>  }
>>>>
>>>>@@ -1187,8 +1194,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>  	int                    total_vfs;
>>>>  	resource_size_t        size, start;
>>>>  	int                    pe_num;
>>>>-	int                    vf_groups;
>>>>-	int                    vf_per_group;
>>>>+	int                    m64_bars;
>>>>
>>>>  	bus = pdev->bus;
>>>>  	hose = pci_bus_to_host(bus);
>>>>@@ -1196,26 +1202,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>  	pdn = pci_get_pdn(pdev);
>>>>  	total_vfs = pci_sriov_get_totalvfs(pdev);
>>>>
>>>>-	/* Initialize the m64_wins to IODA_INVALID_M64 */
>>>>-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
>>>>-		for (j = 0; j < M64_PER_IOV; j++)
>>>>-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
>>>>+	if (pdn->m64_single_mode)
>>>
>>>
>>>This is a physical function's @pdn, right?
>>
>>Yes
>>
>>>
>>>
>>>>+		m64_bars = num_vfs;
>>>>+	else
>>>>+		m64_bars = 1;
>>>>+
>>>>+	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
>>>
>>>
>>>Assume we have SRIOV device with 16VF.
>>>So it was m64_wins[6][4], now it is (roughly speaking) m64_map[6][16]
>>>(for a single PE mode) or m64_map[6][1]. I believe m64_bars cannot be
>>>bigger than 16 on PHB3, right? Is this checked anywhere (does it have
>>>to)?
>>
>>In pnv_pci_vf_assign_m64(), we call find_next_zero_bit() and check the
>>return value. If it exceeds m64_bar_idx, that means failure.
>>
>>>
>>>This m64_wins -> m64_map change - it was not a map before (what was it?),
>>>and now it is, isn't it?
>>
>>Hmm... Gavin likes this name.
>>
>>>
>>>What does it store? An index of M64 BAR (0..15)?
>>>
>>
>>Yes.
>>
>>>
>>>
>>>>+	if (!pdn->m64_map)
>>>>+		return -ENOMEM;
>>>>+	/* Initialize the m64_map to IODA_INVALID_M64 */
>>>>+	for (i = 0; i < m64_bars ; i++)
>>>>+		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
>>>>+			pdn->m64_map[i][j] = IODA_INVALID_M64;
>>>>
>>>>-	if (pdn->m64_per_iov == M64_PER_IOV) {
>>>>-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
>>>>-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
>>>>-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>>-	} else {
>>>>-		vf_groups = 1;
>>>>-		vf_per_group = 1;
>>>>-	}
>>>>
>>>>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>>>  		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>>>>  		if (!res->flags || !res->parent)
>>>>  			continue;
>>>>
>>>>-		for (j = 0; j < vf_groups; j++) {
>>>>+		for (j = 0; j < m64_bars; j++) {
>>>>  			do {
>>>>  				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>>>>  						phb->ioda.m64_bar_idx + 1, 0);
>>>>@@ -1224,12 +1230,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>  					goto m64_failed;
>>>>  			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
>>>>
>>>>-			pdn->m64_wins[i][j] = win;
>>>>+			pdn->m64_map[j][i] = win;
>>>>
>>>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>>>+			if (pdn->m64_single_mode) {
>>>>  				size = pci_iov_resource_size(pdev,
>>>>  							PCI_IOV_RESOURCES + i);
>>>>-				size = size * vf_per_group;
>>>>  				start = res->start + size * j;
>>>>  			} else {
>>>>  				size = resource_size(res);
>>>>@@ -1237,16 +1242,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>  			}
>>>>
>>>>  			/* Map the M64 here */
>>>>-			if (pdn->m64_per_iov == M64_PER_IOV) {
>>>>+			if (pdn->m64_single_mode) {
>>>>  				pe_num = pdn->offset + j;
>>>>  				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>>>>  						pe_num, OPAL_M64_WINDOW_TYPE,
>>>>-						pdn->m64_wins[i][j], 0);
>>>>+						pdn->m64_map[j][i], 0);
>>>>  			}
>>>>
>>>>  			rc = opal_pci_set_phb_mem_window(phb->opal_id,
>>>>  						 OPAL_M64_WINDOW_TYPE,
>>>>-						 pdn->m64_wins[i][j],
>>>>+						 pdn->m64_map[j][i],
>>>>  						 start,
>>>>  						 0, /* unused */
>>>>  						 size);
>>>>@@ -1258,12 +1263,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>  				goto m64_failed;
>>>>  			}
>>>>
>>>>-			if (pdn->m64_per_iov == M64_PER_IOV)
>>>>+			if (pdn->m64_single_mode)
>>>>  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>>>-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
>>>>+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
>>>>  			else
>>>>  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
>>>>-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
>>>>+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
>>>>
>>>>  			if (rc != OPAL_SUCCESS) {
>>>>  				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
>>>>@@ -1275,7 +1280,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>>>>  	return 0;
>>>>
>>>>  m64_failed:
>>>>-	pnv_pci_vf_release_m64(pdev);
>>>>+	pnv_pci_vf_release_m64(pdev, num_vfs);
>>>>  	return -EBUSY;
>>>>  }
>>>>
>>>>@@ -1302,15 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>>>>  	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>>>>  }
>>>>
>>>>-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>>>>  {
>>>>  	struct pci_bus        *bus;
>>>>  	struct pci_controller *hose;
>>>>  	struct pnv_phb        *phb;
>>>>  	struct pnv_ioda_pe    *pe, *pe_n;
>>>>  	struct pci_dn         *pdn;
>>>>-	u16                    vf_index;
>>>>-	int64_t                rc;
>>>>
>>>>  	bus = pdev->bus;
>>>>  	hose = pci_bus_to_host(bus);
>>>>@@ -1320,35 +1323,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>  	if (!pdev->is_physfn)
>>>>  		return;
>>>>
>>>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>>>-		int   vf_group;
>>>>-		int   vf_per_group;
>>>>-		int   vf_index1;
>>>>-
>>>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>>-
>>>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
>>>>-			for (vf_index = vf_group * vf_per_group;
>>>>-				vf_index < (vf_group + 1) * vf_per_group &&
>>>>-				vf_index < num_vfs;
>>>>-				vf_index++)
>>>>-				for (vf_index1 = vf_group * vf_per_group;
>>>>-					vf_index1 < (vf_group + 1) * vf_per_group &&
>>>>-					vf_index1 < num_vfs;
>>>>-					vf_index1++){
>>>>-
>>>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>>>-						pdn->offset + vf_index,
>>>>-						pdn->offset + vf_index1,
>>>>-						OPAL_REMOVE_PE_FROM_DOMAIN);
>>>>-
>>>>-					if (rc)
>>>>-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
>>>>-						__func__,
>>>>-						pdn->offset + vf_index1, rc);
>>>>-				}
>>>>-	}
>>>>-
>>>>  	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>>>>  		if (pe->parent_dev != pdev)
>>>>  			continue;
>>>>@@ -1383,14 +1357,14 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>>>>  	num_vfs = pdn->num_vfs;
>>>>
>>>>  	/* Release VF PEs */
>>>>-	pnv_ioda_release_vf_PE(pdev, num_vfs);
>>>>+	pnv_ioda_release_vf_PE(pdev);
>>>>
>>>>  	if (phb->type == PNV_PHB_IODA2) {
>>>>-		if (pdn->m64_per_iov == 1)
>>>>+		if (!pdn->m64_single_mode)
>>>>  			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
>>>>
>>>>  		/* Release M64 windows */
>>>>-		pnv_pci_vf_release_m64(pdev);
>>>>+		pnv_pci_vf_release_m64(pdev, num_vfs);
>>>>
>>>>  		/* Release PE numbers */
>>>>  		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
>>>>@@ -1409,7 +1383,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>  	int                    pe_num;
>>>>  	u16                    vf_index;
>>>>  	struct pci_dn         *pdn;
>>>>-	int64_t                rc;
>>>>
>>>>  	bus = pdev->bus;
>>>>  	hose = pci_bus_to_host(bus);
>>>>@@ -1454,37 +1427,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>>>>
>>>>  		pnv_pci_ioda2_setup_dma_pe(phb, pe);
>>>>  	}
>>>>-
>>>>-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
>>>>-		int   vf_group;
>>>>-		int   vf_per_group;
>>>>-		int   vf_index1;
>>>>-
>>>>-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
>>>>-
>>>>-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
>>>>-			for (vf_index = vf_group * vf_per_group;
>>>>-			     vf_index < (vf_group + 1) * vf_per_group &&
>>>>-			     vf_index < num_vfs;
>>>>-			     vf_index++) {
>>>>-				for (vf_index1 = vf_group * vf_per_group;
>>>>-				     vf_index1 < (vf_group + 1) * vf_per_group &&
>>>>-				     vf_index1 < num_vfs;
>>>>-				     vf_index1++) {
>>>>-
>>>>-					rc = opal_pci_set_peltv(phb->opal_id,
>>>>-						pdn->offset + vf_index,
>>>>-						pdn->offset + vf_index1,
>>>>-						OPAL_ADD_PE_TO_DOMAIN);
>>>>-
>>>>-					if (rc)
>>>>-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
>>>>-						__func__,
>>>>-						pdn->offset + vf_index1, rc);
>>>>-				}
>>>>-			}
>>>>-		}
>>>>-	}
>>>>  }
>>>>
>>>>  int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>@@ -1507,6 +1449,18 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>  			return -EBUSY;
>>>>  		}
>>>>
>>>>+		/*
>>>>+		 * On PNV_PHB_IODA2, We just have 16 M64 BARs and M64 BAR #15
>>>>+		 * is used to cover the whole system, which leaves only 15 M64
>>>>+		 * BAR usable for VFs.
>>>>+		 * When M64 BAR functions in Single PE mode, this means it
>>>>+		 * just could enable 15 VFs.
>>>>+		 */
>>>>+		if (pdn->m64_single_mode && num_vfs >= 16) {
>>>
>>>Magic constant 16. Where did this 16 come from? My understanding is
>>>it could come from
>>>
>>>1) hostboot or
>>>2) OPAL or
>>>3) architected on IODA2
>>>4) defined in PHB3 (actually it has to be 2))
>>>
>>>which one is it? If 1) and 2) - make it a variable; if 3) - add a macro for it.
>>>
>>
>>As Gavin indicated, this will change to "num_vfs > phb->ioda.m64_bar_idx"
>
>
>This does not really answer my question ;) But I believe it is 4) as
>PHB3 (IODA2 does not mention M64 at all) has only 16 M64's per PHB.
>

Yep, I think it is 4).

>Still, pnv_ioda_parse_m64_window() puts 15 into m64_bar_idx with no
>explanation why. It would help if there was a "#define PNV_IODA2_PHB3_M64_MAX_NUMBER
>16" somewhere OR some call to OPAL which returns this "16" on PHB3,
>but there is none.
>

Nothing defines it.

We can leverage m64_bar_idx in this patch, and we need another patch to fix
the problem mentioned above.
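
Something along these lines would make it explicit (the macro name is made up
here just for illustration, it does not exist anywhere today):

	/* hypothetical name, only to show the idea */
	#define PNV_IODA2_MAX_M64_BARS	16

	/* pnv_ioda_parse_m64_window() */
	phb->ioda.m64_bar_idx = PNV_IODA2_MAX_M64_BARS - 1;

	/* pnv_pci_sriov_enable(), instead of the literal 16 */
	if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
		dev_info(&pdev->dev, "Not enough M64 BARs for %d VFs\n",
			 num_vfs);
		return -EBUSY;
	}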

>
>
>>
>>>
>>>>+			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
>>>>+			return -EBUSY;
>>>>+		}
>>>>+
>>>>  		/* Calculate available PE for required VFs */
>>>>  		mutex_lock(&phb->ioda.pe_alloc_mutex);
>>>>  		pdn->offset = bitmap_find_next_zero_area(
>>>>@@ -1534,7 +1488,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>  		 * the IOV BAR according to the PE# allocated to the VFs.
>>>>  		 * Otherwise, the PE# for the VF will conflict with others.
>>>>  		 */
>>>>-		if (pdn->m64_per_iov == 1) {
>>>>+		if (!pdn->m64_single_mode) {
>>>>  			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
>>>>  			if (ret)
>>>>  				goto m64_failed;
>>>>@@ -1567,8 +1521,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>>>>  	/* Allocate PCI data */
>>>>  	add_dev_pci_data(pdev);
>>>>
>>>>-	pnv_pci_sriov_enable(pdev, num_vfs);
>>>>-	return 0;
>>>>+	return pnv_pci_sriov_enable(pdev, num_vfs);
>>>>  }
>>>>  #endif /* CONFIG_PCI_IOV */
>>>>
>>>>@@ -2761,9 +2714,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>
>>>>  	pdn = pci_get_pdn(pdev);
>>>>  	pdn->vfs_expanded = 0;
>>>>+	pdn->m64_single_mode = false;
>>>>
>>>>  	total_vfs = pci_sriov_get_totalvfs(pdev);
>>>>-	pdn->m64_per_iov = 1;
>>>>  	mul = phb->ioda.total_pe;
>>>>
>>>>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>>>>@@ -2783,8 +2736,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>>>>  		if (size > (1 << 26)) {
>>>>  			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
>>>>  				 i, res);
>>>>-			pdn->m64_per_iov = M64_PER_IOV;
>>>>  			mul = roundup_pow_of_two(total_vfs);
>>>>+			pdn->m64_single_mode = true;
>>>>  			break;
>>>>  		}
>>>>  	}
>>>>@@ -2986,6 +2939,8 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
>>>>  static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>>>  						      int resno)
>>>>  {
>>>>+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>>>+	struct pnv_phb *phb = hose->private_data;
>>>>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>  	resource_size_t align;
>>>>
>>>>@@ -2994,12 +2949,25 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>>>>  	 * SR-IOV. While from hardware perspective, the range mapped by M64
>>>>  	 * BAR should be size aligned.
>>>>  	 *
>>>>+	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the hardware
>>>>+	 * restriction to alignment is gone.
>>>
>>>
>>>Gone? Does not BAR still have to be aligned to its size?
>>>
>>
>>Yes, the M64 BAR is always size-aligned. But since in Single PE mode the M64
>>BAR size is the same as a VF BAR size, they now have the same
>>alignment. What I want to say is that the extra hardware restriction is gone.
>>
>>Let me explain this in more detail.
>
>Sure. Just add "extra powernv-specific" before "hardware restriction"
>(or something like that).
>
>
>
>>>
>>>>But if just use the VF BAR size
>>>>+	 * as the alignment, PF BAR / VF BAR may be allocated with in one M64
>>>>+	 * segment,
>>>
>>>
>>>I thought each VF gets its own _segment_, am I wrong?
>>>
>>
>> From the M64 BAR that maps the VF BAR, yes.
>>
>>But we also have M64 BAR#15 covering the whole 64bit MMIO space, whose segment
>>size is bigger than that of the one mapping the VF BAR. When not properly aligned,
>>VF and PF may sit in the same segment of M64 BAR#15.
>
>
>When is M64 #15 not in a single mode? Always?
>

It is always in Shared mode. When we want to use the 64bit MMIO range, we need one
M64 BAR to segment it.
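
To put numbers on it (these are illustrative only, not guaranteed to match
any particular PHB):

	m64_segsize = M64 window size / total_pe
	            = 64GB / 256 = 256MB per segment

If a 2MB VF BAR were aligned only to its own 2MB size, the PF BAR and the
IOV BAR could end up in the same 256MB segment of the shared M64 BAR#15,
i.e. in the same PE. Aligning the IOV BAR to m64_segsize keeps the PF and
the VFs in separate segments, which is why the patch returns m64_segsize
as the minimum alignment in Single PE mode.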

>
>
>>>
>>>>which introduces the PE conflict between PF and VF. Based
>>>>+	 * on this the minimum alignment of an IOV BAR is m64_segsize.
>>>>
>>>>+	 *
>>>>  	 * This function return the total IOV BAR size if expanded or just the
>>>>-	 * individual size if not.
>>>>+	 * individual size if not, when M64 BAR is in Shared PE mode.
>>>>+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
>>>>+	 * m64_size if IOV BAR size is less.
>>>>  	 */
>>>>  	align = pci_iov_resource_size(pdev, resno);
>>>>-	if (pdn->vfs_expanded)
>>>>-		return pdn->vfs_expanded * align;
>>>>+	if (pdn->vfs_expanded) {
>>>>+		if (pdn->m64_single_mode)
>>>>+			return max(align,
>>>>+				(resource_size_t)phb->ioda.m64_segsize);
>>>>+		else
>>>>+			return pdn->vfs_expanded * align;
>>>>+	}
>>>>
>>>>  	return align;
>>>>  }
>
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

^ permalink raw reply	[flat|nested] 56+ messages in thread

end of thread, other threads:[~2015-08-10  1:49 UTC | newest]

Thread overview: 56+ messages
2015-07-29  7:22 [PATCH] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR Wei Yang
2015-07-30  1:15 ` Gavin Shan
2015-07-30  5:43   ` Wei Yang
2015-07-31  0:13     ` Gavin Shan
2015-07-31  2:01       ` Wei Yang
2015-08-05  1:24         ` [PATCH V2 0/6] Redesign SR-IOV on PowerNV Wei Yang
2015-08-05  1:24           ` [PATCH V2 1/6] powerpc/powernv: don't enable SRIOV when VF BAR contains non M64 BAR Wei Yang
2015-08-06  4:35             ` Gavin Shan
2015-08-06  6:10               ` Alexey Kardashevskiy
2015-08-06  6:57                 ` Gavin Shan
2015-08-06  7:47                   ` Alexey Kardashevskiy
2015-08-06 11:07                     ` Gavin Shan
2015-08-06 14:13                     ` Wei Yang
2015-08-07  1:24                       ` Alexey Kardashevskiy
2015-08-06 14:10               ` Wei Yang
2015-08-07  1:20                 ` Gavin Shan
2015-08-07  2:24                   ` Wei Yang
2015-08-07  3:50                     ` Gavin Shan
2015-08-07  7:14                     ` Alexey Kardashevskiy
2015-08-10  1:40                       ` Wei Yang
2015-08-05  1:24           ` [PATCH V2 2/6] powerpc/powernv: simplify the calculation of iov resource Wei Yang
2015-08-06  4:51             ` Gavin Shan
2015-08-06  9:00               ` Alexey Kardashevskiy
2015-08-06  9:41                 ` Wei Yang
2015-08-06 10:15                   ` Alexey Kardashevskiy
2015-08-07  1:36                     ` Wei Yang
2015-08-06 13:49               ` Wei Yang
2015-08-07  1:08                 ` Gavin Shan
2015-08-05  1:25           ` [PATCH V2 3/6] powerpc/powernv: use one M64 BAR in Single PE mode for one VF BAR Wei Yang
2015-08-06  5:20             ` Gavin Shan
2015-08-06  9:36               ` Wei Yang
2015-08-06 10:07                 ` Gavin Shan
2015-08-07  1:48                   ` Wei Yang
2015-08-07  8:13                     ` Alexey Kardashevskiy
2015-08-06 10:04             ` Alexey Kardashevskiy
2015-08-07  2:01               ` Wei Yang
2015-08-07  8:59                 ` Alexey Kardashevskiy
2015-08-10  1:48                   ` Wei Yang
2015-08-05  1:25           ` [PATCH V2 4/6] powerpc/powernv: replace the hard coded boundary with gate Wei Yang
2015-08-06  5:26             ` Gavin Shan
2015-08-07  9:11               ` Alexey Kardashevskiy
2015-08-05  1:25           ` [PATCH V2 5/6] powerpc/powernv: boundary the total vf bar size instead of the individual one Wei Yang
2015-08-06  5:28             ` Gavin Shan
2015-08-06 14:03               ` Wei Yang
2015-08-07  1:23                 ` Gavin Shan
2015-08-07  2:25                   ` Wei Yang
2015-08-05  1:25           ` [PATCH V2 6/6] powerpc/powernv: allocate discrete PE# when using M64 BAR in Single PE mode Wei Yang
2015-08-06  5:36             ` Gavin Shan
2015-08-06 13:41               ` Wei Yang
2015-08-07  1:36                 ` Gavin Shan
2015-08-07  2:33                   ` Wei Yang
2015-08-07  3:43                     ` Gavin Shan
2015-08-07  5:44                       ` Wei Yang
2015-08-07  5:54                         ` Gavin Shan
2015-08-07  6:25                           ` Wei Yang
2015-08-07 10:00                           ` Alexey Kardashevskiy
