linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>,
	Alex Williamson <alex.williamson@redhat.com>,
	Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	David Gibson <david@gibson.dropbear.id.au>,
	Gavin Shan <gwshan@linux.vnet.ibm.com>,
	Paul Mackerras <paulus@samba.org>,
	Wei Yang <weiyang@linux.vnet.ibm.com>,
	linux-kernel@vger.kernel.org
Subject: [PATCH kernel v12 27/34] powerpc/powernv: Implement multilevel TCE tables
Date: Fri,  5 Jun 2015 16:35:19 +1000	[thread overview]
Message-ID: <1433486126-23551-28-git-send-email-aik@ozlabs.ru> (raw)
In-Reply-To: <1433486126-23551-1-git-send-email-aik@ozlabs.ru>

TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM) so the kernel might be unable to
allocate contiguous chunk of physical memory to store the TCE table.

To address this, POWER8 CPU (actually, IODA2) supports multi-level
TCE tables, up to 5 levels which splits the table into a tree of
smaller subtables.

This adds multi-level TCE tables support to
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
helpers.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v12:
* changed pnv_pci_ioda2_table_do_alloc_pages() to return NULL to
pnv_pci_ioda2_table_alloc_pages() only if the first level allocation
failed, otherwise it always returns non zero value
* pnv_pci_ioda2_table_do_free_pages() now takes __be64* rather than
uinsigned long
* s/tce_table_allocated/current_offset/

v10:
* fixed multiple comments received for v9

v9:
* moved from ioda2 to common powernv pci code
* fixed cleanup if allocation fails in a middle
* removed check for the size - all boundary checks happen in the calling code
anyway
---
 arch/powerpc/include/asm/iommu.h          |   2 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 105 +++++++++++++++++++++++++++---
 arch/powerpc/platforms/powernv/pci.c      |  13 ++++
 3 files changed, 111 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4636734..706cfc0 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -96,6 +96,8 @@ struct iommu_pool {
 struct iommu_table {
 	unsigned long  it_busno;     /* Bus number this table belongs to */
 	unsigned long  it_size;      /* Size of iommu table in entries */
+	unsigned long  it_indirect_levels;
+	unsigned long  it_level_size;
 	unsigned long  it_offset;    /* Offset into global table */
 	unsigned long  it_base;      /* mapped address of tce table */
 	unsigned long  it_index;     /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index da14043..a253dda 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -50,6 +50,9 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_MAX_LEVELS	5
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -1976,6 +1979,8 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 			table_group);
 	struct pnv_phb *phb = pe->phb;
 	int64_t rc;
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
 	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
 	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
 
@@ -1990,9 +1995,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	rc = opal_pci_map_pe_dma_window(phb->opal_id,
 			pe->pe_number,
 			pe->pe_number << 1,
-			1,
+			tbl->it_indirect_levels + 1,
 			__pa(tbl->it_base),
-			tbl->it_size << 3,
+			size << 3,
 			IOMMU_PAGE_SIZE(tbl));
 	if (rc) {
 		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
@@ -2072,11 +2077,16 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
 	phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
 }
 
-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+		unsigned levels, unsigned long limit,
+		unsigned long *current_offset)
 {
 	struct page *tce_mem = NULL;
-	__be64 *addr;
+	__be64 *addr, *tmp;
 	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+	unsigned long allocated = 1UL << (order + PAGE_SHIFT);
+	unsigned entries = 1UL << (shift - 3);
+	long i;
 
 	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
 	if (!tce_mem) {
@@ -2084,31 +2094,79 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
 		return NULL;
 	}
 	addr = page_address(tce_mem);
-	memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+	memset(addr, 0, allocated);
+
+	--levels;
+	if (!levels) {
+		*current_offset += allocated;
+		return addr;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+				levels, limit, current_offset);
+		if (!tmp)
+			break;
+
+		addr[i] = cpu_to_be64(__pa(tmp) |
+				TCE_PCI_READ | TCE_PCI_WRITE);
+
+		if (*current_offset >= limit)
+			break;
+	}
 
 	return addr;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned level);
+
 static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
-		__u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl)
 {
 	void *addr;
+	unsigned long offset = 0, level_shift;
 	const unsigned window_shift = ilog2(window_size);
 	unsigned entries_shift = window_shift - page_shift;
 	unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
 
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+		return -EINVAL;
+
 	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
 		return -EINVAL;
 
+	/* Adjust direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	level_shift = entries_shift + 3;
+	level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
+
 	/* Allocate TCE table */
-	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+			levels, tce_table_size, &offset);
+
+	/* addr==NULL means that the first level allocation failed */
 	if (!addr)
 		return -ENOMEM;
 
+	/*
+	 * First level was allocated but some lower level failed as
+	 * we did not allocate as much as we wanted,
+	 * release partially allocated table.
+	 */
+	if (offset < tce_table_size) {
+		pnv_pci_ioda2_table_do_free_pages(addr,
+				1ULL << (level_shift - 3), levels - 1);
+		return -ENOMEM;
+	}
+
 	/* Setup linux iommu table */
 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
 			page_shift);
+	tbl->it_level_size = 1ULL << (level_shift - 3);
+	tbl->it_indirect_levels = levels - 1;
 
 	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
 			window_size, tce_table_size, bus_offset);
@@ -2116,12 +2174,40 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	return 0;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned level)
+{
+	const unsigned long addr_ul = (unsigned long) addr &
+			~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+	if (level) {
+		long i;
+		u64 *tmp = (u64 *) addr_ul;
+
+		for (i = 0; i < size; ++i) {
+			unsigned long hpa = be64_to_cpu(tmp[i]);
+
+			if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+				continue;
+
+			pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
+					level - 1);
+		}
+	}
+
+	free_pages(addr_ul, get_order(size << 3));
+}
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
 {
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
+
 	if (!tbl->it_size)
 		return;
 
-	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+	pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
+			tbl->it_indirect_levels);
 }
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
@@ -2149,7 +2235,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* Setup linux iommu table */
 	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
-			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
+			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
+			POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
 		goto fail;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index dce3bfd..d4e59f7 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -575,6 +575,19 @@ struct pci_ops pnv_pci_ops = {
 static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
 {
 	__be64 *tmp = ((__be64 *)tbl->it_base);
+	int  level = tbl->it_indirect_levels;
+	const long shift = ilog2(tbl->it_level_size);
+	unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+	while (level) {
+		int n = (idx & mask) >> (level * shift);
+		unsigned long tce = be64_to_cpu(tmp[n]);
+
+		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+		idx &= ~mask;
+		mask >>= shift;
+		--level;
+	}
 
 	return tmp + idx;
 }
-- 
2.4.0.rc3.8.gfb3e7d5

  parent reply	other threads:[~2015-06-05  6:38 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-05  6:34 [PATCH kernel v12 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
2015-06-05  6:34 ` [PATCH kernel v12 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group Alexey Kardashevskiy
2015-06-05  6:34 ` [PATCH kernel v12 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group Alexey Kardashevskiy
2015-06-05  6:34 ` [PATCH kernel v12 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration Alexey Kardashevskiy
2015-06-05  6:34 ` [PATCH kernel v12 04/34] powerpc/iommu: Put IOMMU group explicitly Alexey Kardashevskiy
2015-06-05  6:34 ` [PATCH kernel v12 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table() Alexey Kardashevskiy
2015-06-05  6:34 ` [PATCH kernel v12 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver Alexey Kardashevskiy
2015-06-05  6:34 ` [PATCH kernel v12 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 08/34] vfio: powerpc/spapr: Use it_page_size Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers Alexey Kardashevskiy
2015-06-09  4:22   ` David Gibson
2015-06-05  6:35 ` [PATCH kernel v12 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 12/34] vfio: powerpc/spapr: Rework groups attaching Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 13/34] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free() Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group Alexey Kardashevskiy
2015-06-09  2:36   ` David Gibson
2015-06-09  5:59   ` [PATCH kernel] powerpc/pseries: Fix compile error when CONFIG_IOMMU_API if off Alexey Kardashevskiy
2015-06-09 12:23   ` [PATCH kernel v12 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group Michael Ellerman
2015-06-10  3:08     ` [PATCH kernel] powerpc/powernv: Fix crash when CONFIG_IOMMU_API is off Alexey Kardashevskiy
2015-06-10  7:33   ` [kernel, v12, 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group Michael Ellerman
2015-06-11  4:28     ` [PATCH kernel v12.2] powerpc/powernv: Fix crash when CONFIG_IOMMU_API is off Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 19/34] powerpc/iommu: Fix IOMMU ownership control functions Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 22/34] powerpc/powernv: Implement accessor to TCE entry Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 23/34] powerpc/iommu/powernv: Release replaced TCE Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 24/34] powerpc/powernv/ioda2: Rework iommu_table creation Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window Alexey Kardashevskiy
2015-06-09  4:23   ` David Gibson
2015-06-05  6:35 ` Alexey Kardashevskiy [this message]
2015-06-09  3:57   ` [PATCH kernel v12 27/34] powerpc/powernv: Implement multilevel TCE tables David Gibson
2015-06-05  6:35 ` [PATCH kernel v12 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 29/34] powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 30/34] powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 31/34] vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control Alexey Kardashevskiy
2015-06-05  6:35 ` [PATCH kernel v12 32/34] powerpc/mmu: Add userspace-to-physical addresses translation cache Alexey Kardashevskiy
2015-06-09  4:05   ` David Gibson
2015-06-05  6:35 ` [PATCH kernel v12 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2 Alexey Kardashevskiy
2015-06-09  4:21   ` David Gibson
2015-06-05  6:35 ` [PATCH kernel v12 34/34] vfio: powerpc/spapr: Support Dynamic DMA windows Alexey Kardashevskiy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1433486126-23551-28-git-send-email-aik@ozlabs.ru \
    --to=aik@ozlabs.ru \
    --cc=alex.williamson@redhat.com \
    --cc=benh@kernel.crashing.org \
    --cc=david@gibson.dropbear.id.au \
    --cc=gwshan@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=paulus@samba.org \
    --cc=weiyang@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).