From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>,
Gavin Shan <gwshan@linux.vnet.ibm.com>,
linux-kernel@vger.kernel.org,
Alex Williamson <alex.williamson@redhat.com>,
Paul Mackerras <paulus@samba.org>,
David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH kernel v9 22/32] powerpc/powernv: Implement multilevel TCE tables
Date: Sat, 25 Apr 2015 22:14:46 +1000 [thread overview]
Message-ID: <1429964096-11524-23-git-send-email-aik@ozlabs.ru> (raw)
In-Reply-To: <1429964096-11524-1-git-send-email-aik@ozlabs.ru>
TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM) so the kernel might be unable to
allocate a contiguous chunk of physical memory to store the TCE table.
To address this, the POWER8 CPU (actually, IODA2) supports multi-level TCE tables,
up to 5 levels, which split the table into a tree of smaller subtables.
This adds multi-level TCE table support to the pnv_pci_create_table()
and pnv_pci_free_table() helpers.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v9:
* moved from ioda2 to common powernv pci code
* fixed cleanup if allocation fails in the middle
* removed check for the size - all boundary checks happen in the calling code
anyway
---
arch/powerpc/include/asm/iommu.h | 2 +
arch/powerpc/platforms/powernv/pci-ioda.c | 15 +++--
arch/powerpc/platforms/powernv/pci.c | 94 +++++++++++++++++++++++++++++--
arch/powerpc/platforms/powernv/pci.h | 4 +-
4 files changed, 104 insertions(+), 11 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 7e7ca0a..0f50ee2 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -96,6 +96,8 @@ struct iommu_pool {
struct iommu_table {
unsigned long it_busno; /* Bus number this table belongs to */
unsigned long it_size; /* Size of iommu table in entries */
+ unsigned long it_indirect_levels;
+ unsigned long it_level_size;
unsigned long it_offset; /* Offset into global table */
unsigned long it_base; /* mapped address of tce table */
unsigned long it_index; /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 59baa15..cc1d09c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1967,13 +1967,17 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
table_group);
struct pnv_phb *phb = pe->phb;
int64_t rc;
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
pe_info(pe, "Setting up window at %llx..%llx "
- "pgsize=0x%x tablesize=0x%lx\n",
+ "pgsize=0x%x tablesize=0x%lx "
+ "levels=%d levelsize=%x\n",
start_addr, start_addr + win_size - 1,
- 1UL << tbl->it_page_shift, tbl->it_size << 3);
+ 1UL << tbl->it_page_shift, tbl->it_size << 3,
+ tbl->it_indirect_levels + 1, tbl->it_level_size << 3);
tbl->it_table_group = &pe->table_group;
@@ -1984,9 +1988,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
rc = opal_pci_map_pe_dma_window(phb->opal_id,
pe->pe_number,
pe->pe_number << 1,
- 1,
+ tbl->it_indirect_levels + 1,
__pa(tbl->it_base),
- tbl->it_size << 3,
+ size << 3,
1ULL << tbl->it_page_shift);
if (rc) {
pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
@@ -2099,7 +2103,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
phb->ioda.m32_pci_base);
rc = pnv_pci_create_table(&pe->table_group, pe->phb->hose->node,
- 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
+ 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
+ POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
if (rc) {
pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
return;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 6bcfad5..fc129c4 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -46,6 +46,8 @@
#define cfg_dbg(fmt...) do { } while(0)
//#define cfg_dbg(fmt...) printk(fmt)
+#define ROUND_UP(x, n) (((x) + (n) - 1ULL) & ~((n) - 1ULL))
+
#ifdef CONFIG_PCI_MSI
static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
{
@@ -577,6 +579,19 @@ struct pci_ops pnv_pci_ops = {
static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
{
__be64 *tmp = ((__be64 *)tbl->it_base);
+ int level = tbl->it_indirect_levels;
+ const long shift = ilog2(tbl->it_level_size);
+ unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+ while (level) {
+ int n = (idx & mask) >> (level * shift);
+ unsigned long tce = be64_to_cpu(tmp[n]);
+
+ tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+ idx &= ~mask;
+ mask >>= shift;
+ --level;
+ }
return tmp + idx;
}
@@ -648,12 +663,18 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
}
static __be64 *pnv_alloc_tce_table_pages(int nid, unsigned shift,
+ unsigned levels, unsigned long limit,
unsigned long *tce_table_allocated)
{
struct page *tce_mem = NULL;
- __be64 *addr;
+ __be64 *addr, *tmp;
unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
unsigned long local_allocated = 1UL << (order + PAGE_SHIFT);
+ unsigned entries = 1UL << (shift - 3);
+ long i;
+
+ if (limit == *tce_table_allocated)
+ return NULL;
tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
if (!tce_mem) {
@@ -662,14 +683,33 @@ static __be64 *pnv_alloc_tce_table_pages(int nid, unsigned shift,
}
addr = page_address(tce_mem);
memset(addr, 0, local_allocated);
- *tce_table_allocated = local_allocated;
+
+ --levels;
+ if (!levels) {
+ /* Update tce_table_allocated with bottom level table size only */
+ *tce_table_allocated += local_allocated;
+ return addr;
+ }
+
+ for (i = 0; i < entries; ++i) {
+ tmp = pnv_alloc_tce_table_pages(nid, shift, levels, limit,
+ tce_table_allocated);
+ if (!tmp)
+ break;
+
+ addr[i] = cpu_to_be64(__pa(tmp) |
+ TCE_PCI_READ | TCE_PCI_WRITE);
+ }
return addr;
}
+static void pnv_free_tce_table_pages(unsigned long addr, unsigned long size,
+ unsigned level);
+
long pnv_pci_create_table(struct iommu_table_group *table_group, int nid,
__u64 bus_offset, __u32 page_shift, __u64 window_size,
- struct iommu_table *tbl)
+ __u32 levels, struct iommu_table *tbl)
{
void *addr;
unsigned long tce_table_allocated = 0;
@@ -678,16 +718,34 @@ long pnv_pci_create_table(struct iommu_table_group *table_group, int nid,
unsigned table_shift = entries_shift + 3;
const unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+ if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+ return -EINVAL;
+
if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
return -EINVAL;
+ /* Adjust direct table size from window_size and levels */
+ entries_shift = ROUND_UP(entries_shift, levels) / levels;
+ table_shift = entries_shift + 3;
+ table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+
/* Allocate TCE table */
addr = pnv_alloc_tce_table_pages(nid, table_shift,
- &tce_table_allocated);
+ levels, tce_table_size, &tce_table_allocated);
+ if (!addr)
+ return -ENOMEM;
+
+ if (tce_table_size != tce_table_allocated) {
+ pnv_free_tce_table_pages((unsigned long) addr,
+ tbl->it_level_size, tbl->it_indirect_levels);
+ return -ENOMEM;
+ }
/* Setup linux iommu table */
pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
page_shift);
+ tbl->it_level_size = 1ULL << (table_shift - 3);
+ tbl->it_indirect_levels = levels - 1;
pr_info("Created TCE table: window size = %08llx, "
"tablesize = %lx (%lx), start @%08llx\n",
@@ -697,12 +755,38 @@ long pnv_pci_create_table(struct iommu_table_group *table_group, int nid,
return 0;
}
+static void pnv_free_tce_table_pages(unsigned long addr, unsigned long size,
+ unsigned level)
+{
+ addr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+ if (level) {
+ long i;
+ u64 *tmp = (u64 *) addr;
+
+ for (i = 0; i < size; ++i) {
+ unsigned long hpa = be64_to_cpu(tmp[i]);
+
+ if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+ continue;
+
+ pnv_free_tce_table_pages((unsigned long) __va(hpa),
+ size, level - 1);
+ }
+ }
+
+ free_pages(addr, get_order(size << 3));
+}
+
void pnv_pci_free_table(struct iommu_table *tbl)
{
+ const unsigned long size = tbl->it_indirect_levels ?
+ tbl->it_level_size : tbl->it_size;
+
if (!tbl->it_size)
return;
- free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+ pnv_free_tce_table_pages(tbl->it_base, size, tbl->it_indirect_levels);
iommu_reset_table(tbl, "pnv");
}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e6cbbec..3d1ff584 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -218,9 +218,11 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned page_shift);
+#define POWERNV_IOMMU_DEFAULT_LEVELS 1
+#define POWERNV_IOMMU_MAX_LEVELS 5
extern long pnv_pci_create_table(struct iommu_table_group *table_group, int nid,
__u64 bus_offset, __u32 page_shift, __u64 window_size,
- struct iommu_table *tbl);
+ __u32 levels, struct iommu_table *tbl);
extern void pnv_pci_free_table(struct iommu_table *tbl);
extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
extern void pnv_pci_init_ioda_hub(struct device_node *np);
--
2.0.0
next prev parent reply other threads:[~2015-04-25 12:16 UTC|newest]
Thread overview: 110+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-04-25 12:14 [PATCH kernel v9 00/32] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 01/32] powerpc/iommu: Split iommu_free_table into 2 helpers Alexey Kardashevskiy
2015-04-29 2:03 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 02/32] Revert "powerpc/powernv: Allocate struct pnv_ioda_pe iommu_table dynamically" Alexey Kardashevskiy
2015-04-27 21:05 ` Alex Williamson
2015-04-29 2:05 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 03/32] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 04/32] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 05/32] vfio: powerpc/spapr: Use it_page_size Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 06/32] vfio: powerpc/spapr: Move locked_vm accounting to helpers Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 07/32] vfio: powerpc/spapr: Disable DMA mappings on disabled container Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 08/32] vfio: powerpc/spapr: Moving pinning/unpinning to helpers Alexey Kardashevskiy
2015-04-29 2:14 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 09/32] vfio: powerpc/spapr: Rework groups attaching Alexey Kardashevskiy
2015-04-29 2:16 ` David Gibson
2015-04-30 2:29 ` Alexey Kardashevskiy
2015-04-30 4:05 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 10/32] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 11/32] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 12/32] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group Alexey Kardashevskiy
2015-04-29 2:49 ` David Gibson
2015-04-30 2:30 ` Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 13/32] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control Alexey Kardashevskiy
2015-04-29 3:02 ` David Gibson
2015-04-29 9:19 ` Alexey Kardashevskiy
2015-04-30 4:08 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 14/32] powerpc/iommu: Fix IOMMU ownership control functions Alexey Kardashevskiy
2015-04-29 3:08 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 15/32] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free() Alexey Kardashevskiy
2015-04-29 3:18 ` David Gibson
2015-04-30 2:58 ` Alexey Kardashevskiy
2015-04-30 4:16 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 16/32] powerpc/powernv/ioda: Move TCE kill register address to PE Alexey Kardashevskiy
2015-04-27 21:05 ` Alex Williamson
2015-04-29 3:25 ` David Gibson
2015-04-29 9:00 ` Alexey Kardashevskiy
2015-04-30 4:18 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 17/32] powerpc/powernv: Implement accessor to TCE entry Alexey Kardashevskiy
2015-04-29 4:04 ` David Gibson
2015-04-29 9:02 ` Alexey Kardashevskiy
2015-04-30 0:13 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 18/32] powerpc/iommu/powernv: Release replaced TCE Alexey Kardashevskiy
2015-04-29 4:18 ` David Gibson
2015-04-29 9:51 ` Alexey Kardashevskiy
2015-04-30 4:21 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 19/32] powerpc/powernv/ioda2: Rework iommu_table creation Alexey Kardashevskiy
2015-04-29 4:27 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 20/32] powerpc/powernv/ioda2: Introduce pnv_pci_create_table/pnv_pci_free_table Alexey Kardashevskiy
2015-04-29 4:39 ` David Gibson
2015-04-29 9:12 ` Alexey Kardashevskiy
2015-04-30 4:24 ` David Gibson
2015-05-01 10:13 ` Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 21/32] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window Alexey Kardashevskiy
2015-04-29 4:45 ` David Gibson
2015-04-29 9:26 ` Alexey Kardashevskiy
2015-04-30 4:32 ` David Gibson
2015-04-25 12:14 ` Alexey Kardashevskiy [this message]
2015-04-29 5:04 ` [PATCH kernel v9 22/32] powerpc/powernv: Implement multilevel TCE tables David Gibson
2015-05-01 9:48 ` Alexey Kardashevskiy
2015-05-05 12:05 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 23/32] powerpc/powernv/ioda: Define and implement DMA table/window management callbacks Alexey Kardashevskiy
2015-04-29 5:30 ` David Gibson
2015-04-29 9:44 ` Alexey Kardashevskiy
2015-04-30 4:37 ` David Gibson
2015-04-30 9:56 ` Alexey Kardashevskiy
2015-05-01 3:36 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 24/32] powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 25/32] vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework ownership Alexey Kardashevskiy
2015-04-29 5:39 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 26/32] powerpc/iommu: Add userspace view of TCE table Alexey Kardashevskiy
2015-04-29 6:31 ` David Gibson
2015-05-01 4:01 ` Alexey Kardashevskiy
2015-05-01 4:23 ` David Gibson
2015-05-01 7:12 ` Alexey Kardashevskiy
2015-05-05 12:02 ` David Gibson
2015-05-11 2:11 ` Alexey Kardashevskiy
2015-05-11 4:52 ` Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 27/32] powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table Alexey Kardashevskiy
2015-04-29 6:40 ` David Gibson
2015-05-01 4:10 ` Alexey Kardashevskiy
2015-05-01 5:12 ` David Gibson
2015-05-01 6:53 ` Alexey Kardashevskiy
2015-05-05 11:58 ` David Gibson
2015-05-11 2:24 ` Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache Alexey Kardashevskiy
2015-04-29 7:01 ` David Gibson
2015-05-01 11:26 ` Alexey Kardashevskiy
2015-05-05 12:12 ` David Gibson
2015-04-30 6:34 ` David Gibson
2015-04-30 8:25 ` Paul Mackerras
2015-05-01 3:39 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 29/32] vfio: powerpc/spapr: Register memory and define IOMMU v2 Alexey Kardashevskiy
2015-04-30 6:55 ` David Gibson
2015-05-01 4:35 ` Alexey Kardashevskiy
2015-05-01 5:23 ` David Gibson
2015-05-01 6:27 ` Alexey Kardashevskiy
2015-05-05 11:53 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 30/32] vfio: powerpc/spapr: Use 32bit DMA window properties from table_group Alexey Kardashevskiy
2015-04-27 22:18 ` Alex Williamson
2015-04-30 6:58 ` David Gibson
2015-04-25 12:14 ` [PATCH kernel v9 31/32] vfio: powerpc/spapr: Support multiple groups in one container if possible Alexey Kardashevskiy
2015-04-30 7:22 ` David Gibson
2015-04-30 9:33 ` Alexey Kardashevskiy
2015-05-01 0:46 ` Benjamin Herrenschmidt
2015-05-01 4:44 ` David Gibson
2015-05-01 4:33 ` David Gibson
2015-05-01 6:05 ` Alexey Kardashevskiy
2015-05-05 11:50 ` David Gibson
2015-05-11 2:26 ` Alexey Kardashevskiy
2015-04-25 12:14 ` [PATCH kernel v9 32/32] vfio: powerpc/spapr: Support Dynamic DMA windows Alexey Kardashevskiy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1429964096-11524-23-git-send-email-aik@ozlabs.ru \
--to=aik@ozlabs.ru \
--cc=alex.williamson@redhat.com \
--cc=david@gibson.dropbear.id.au \
--cc=gwshan@linux.vnet.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=paulus@samba.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).