* [RFC]: map 4K iommu pages even on 64K largepage systems.
@ 2006-10-24 0:25 Linas Vepstas
2006-10-24 0:50 ` Geoff Levand
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Linas Vepstas @ 2006-10-24 0:25 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Olof Johansson
Subject: [RFC]: map 4K iommu pages even on 64K largepage systems.
The 10Gigabit ethernet device drivers appear to be able to chew
up all 256MB of TCE mappings on pSeries systems, as evidenced by
numerous error messages:
iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1
Some experimentation indicates that this is essentially because
one 1500 byte ethernet MTU gets mapped as a 64K DMA region when
the large 64K pages are enabled. Thus, it doesn't take much to
exhaust all of the available DMA mappings for a high-speed card.
This patch changes the iommu allocator to work with its own
unique, distinct page size. Although the patch is long, it's
actually quite simple: it just #defines distinct IOMMU_PAGE_SIZE
and then uses this in all the places that matter.
The patch boots on pseries, untested in other places.
Haven't yet thought if this is a good long-term solution or not,
whether this kind of thing is desirable or not. That's why it's
an RFC. Comments?
--linas
Cc: Olof Johansson <olof@lixom.net>
----
arch/powerpc/kernel/iommu.c | 63 ++++++++++++++++++---------------
arch/powerpc/kernel/vio.c | 4 +-
arch/powerpc/platforms/pseries/iommu.c | 13 +++---
include/asm-powerpc/iommu.h | 27 +++++++++++++-
include/asm-powerpc/tce.h | 4 +-
5 files changed, 74 insertions(+), 37 deletions(-)
Index: linux-2.6.19-rc1-git11/arch/powerpc/kernel/iommu.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/arch/powerpc/kernel/iommu.c 2006-10-13 11:54:00.000000000 -0500
+++ linux-2.6.19-rc1-git11/arch/powerpc/kernel/iommu.c 2006-10-23 18:41:38.000000000 -0500
@@ -47,6 +47,15 @@ static int novmerge = 0;
static int novmerge = 1;
#endif
+static inline unsigned long
+iommu_num_pages(unsigned long vaddr, unsigned long slen)
+{
+ unsigned long npages;
+ npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK);
+ npages >>= IOMMU_PAGE_SHIFT;
+ return npages;
+}
+
static int __init setup_iommu(char *str)
{
if (!strcmp(str, "novmerge"))
@@ -178,10 +187,10 @@ static dma_addr_t iommu_alloc(struct iom
}
entry += tbl->it_offset; /* Offset into real TCE table */
- ret = entry << PAGE_SHIFT; /* Set the return dma address */
+ ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */
/* Put the TCEs in the HW table */
- ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK,
+ ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
direction);
@@ -203,7 +212,7 @@ static void __iommu_free(struct iommu_ta
unsigned long entry, free_entry;
unsigned long i;
- entry = dma_addr >> PAGE_SHIFT;
+ entry = dma_addr >> IOMMU_PAGE_SHIFT;
free_entry = entry - tbl->it_offset;
if (((free_entry + npages) > tbl->it_size) ||
@@ -270,7 +279,7 @@ int iommu_map_sg(struct device *dev, str
/* Init first segment length for backout at failure */
outs->dma_length = 0;
- DBG("mapping %d elements:\n", nelems);
+ DBG("sg mapping %d elements:\n", nelems);
spin_lock_irqsave(&(tbl->it_lock), flags);
@@ -285,9 +294,8 @@ int iommu_map_sg(struct device *dev, str
}
/* Allocate iommu entries for that segment */
vaddr = (unsigned long)page_address(s->page) + s->offset;
- npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK);
- npages >>= PAGE_SHIFT;
- entry = iommu_range_alloc(tbl, npages, &handle, mask >> PAGE_SHIFT, 0);
+ npages = iommu_num_pages(vaddr, slen);
+ entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0);
DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);
@@ -301,14 +309,14 @@ int iommu_map_sg(struct device *dev, str
/* Convert entry to a dma_addr_t */
entry += tbl->it_offset;
- dma_addr = entry << PAGE_SHIFT;
- dma_addr |= s->offset;
+ dma_addr = entry << IOMMU_PAGE_SHIFT;
+ dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);
- DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n",
+ DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n",
npages, entry, dma_addr);
/* Insert into HW table */
- ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction);
+ ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction);
/* If we are in an open segment, try merging */
if (segstart != s) {
@@ -323,7 +331,7 @@ int iommu_map_sg(struct device *dev, str
DBG(" can't merge, new segment.\n");
} else {
outs->dma_length += s->length;
- DBG(" merged, new len: %lx\n", outs->dma_length);
+ DBG(" merged, new len: %ux\n", outs->dma_length);
}
}
@@ -367,9 +375,8 @@ int iommu_map_sg(struct device *dev, str
if (s->dma_length != 0) {
unsigned long vaddr, npages;
- vaddr = s->dma_address & PAGE_MASK;
- npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr)
- >> PAGE_SHIFT;
+ vaddr = s->dma_address & IOMMU_PAGE_MASK;
+ npages = iommu_num_pages(s->dma_address, s->dma_length);
__iommu_free(tbl, vaddr, npages);
s->dma_address = DMA_ERROR_CODE;
s->dma_length = 0;
@@ -398,8 +405,7 @@ void iommu_unmap_sg(struct iommu_table *
if (sglist->dma_length == 0)
break;
- npages = (PAGE_ALIGN(dma_handle + sglist->dma_length)
- - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT;
+ npages = iommu_num_pages(dma_handle,sglist->dma_length);
__iommu_free(tbl, dma_handle, npages);
sglist++;
}
@@ -532,12 +538,11 @@ dma_addr_t iommu_map_single(struct iommu
BUG_ON(direction == DMA_NONE);
uaddr = (unsigned long)vaddr;
- npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
- npages >>= PAGE_SHIFT;
+ npages = iommu_num_pages(uaddr, size);
if (tbl) {
dma_handle = iommu_alloc(tbl, vaddr, npages, direction,
- mask >> PAGE_SHIFT, 0);
+ mask >> IOMMU_PAGE_SHIFT, 0);
if (dma_handle == DMA_ERROR_CODE) {
if (printk_ratelimit()) {
printk(KERN_INFO "iommu_alloc failed, "
@@ -545,7 +550,7 @@ dma_addr_t iommu_map_single(struct iommu
tbl, vaddr, npages);
}
} else
- dma_handle |= (uaddr & ~PAGE_MASK);
+ dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
}
return dma_handle;
@@ -557,8 +562,8 @@ void iommu_unmap_single(struct iommu_tab
BUG_ON(direction == DMA_NONE);
if (tbl)
- iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
- (dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
+ iommu_free(tbl, dma_handle, (IOMMU_PAGE_ALIGN(dma_handle + size) -
+ (dma_handle & IOMMU_PAGE_MASK)) >> IOMMU_PAGE_SHIFT);
}
/* Allocates a contiguous real buffer and creates mappings over it.
@@ -571,6 +576,7 @@ void *iommu_alloc_coherent(struct iommu_
void *ret = NULL;
dma_addr_t mapping;
unsigned int npages, order;
+ unsigned int nio_pages, io_order;
struct page *page;
size = PAGE_ALIGN(size);
@@ -598,8 +604,10 @@ void *iommu_alloc_coherent(struct iommu_
memset(ret, 0, size);
/* Set up tces to cover the allocated range */
- mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL,
- mask >> PAGE_SHIFT, order);
+ nio_pages = size >> IOMMU_PAGE_SHIFT;
+ io_order = get_iommu_order(size);
+ mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
+ mask >> IOMMU_PAGE_SHIFT, io_order);
if (mapping == DMA_ERROR_CODE) {
free_pages((unsigned long)ret, order);
return NULL;
@@ -614,9 +622,10 @@ void iommu_free_coherent(struct iommu_ta
unsigned int npages;
if (tbl) {
- size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
+ size = IOMMU_PAGE_ALIGN(size);
+ npages = size >> IOMMU_PAGE_SHIFT;
iommu_free(tbl, dma_handle, npages);
+ size = PAGE_ALIGN(size);
free_pages((unsigned long)vaddr, get_order(size));
}
}
Index: linux-2.6.19-rc1-git11/include/asm-powerpc/iommu.h
===================================================================
--- linux-2.6.19-rc1-git11.orig/include/asm-powerpc/iommu.h 2006-09-19 22:42:06.000000000 -0500
+++ linux-2.6.19-rc1-git11/include/asm-powerpc/iommu.h 2006-10-23 18:28:21.000000000 -0500
@@ -23,16 +23,41 @@
#ifdef __KERNEL__
#include <asm/types.h>
+#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
+#define IOMMU_PAGE_SHIFT 12
+#define IOMMU_PAGE_SIZE (ASM_CONST(1) << IOMMU_PAGE_SHIFT)
+#define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1))
+#define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE)
+
+#ifndef __ASSEMBLY__
+
+/* Pure 2^n version of get_order */
+static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
+{
+ int order;
+
+ size = (size - 1) >> (IOMMU_PAGE_SHIFT - 1);
+ order = -1;
+ do {
+ size >>= 1;
+ order++;
+ } while (size);
+ return order;
+}
+
+#endif /* __ASSEMBLY__ */
+
+
/*
* IOMAP_MAX_ORDER defines the largest contiguous block
* of dma space we can get. IOMAP_MAX_ORDER = 13
* allows up to 2**12 pages (4096 * 4096) = 16 MB
*/
-#define IOMAP_MAX_ORDER 13
+#define IOMAP_MAX_ORDER (25 - PAGE_SHIFT)
struct iommu_table {
unsigned long it_busno; /* Bus number this table belongs to */
Index: linux-2.6.19-rc1-git11/include/asm-powerpc/tce.h
===================================================================
--- linux-2.6.19-rc1-git11.orig/include/asm-powerpc/tce.h 2006-09-19 22:42:06.000000000 -0500
+++ linux-2.6.19-rc1-git11/include/asm-powerpc/tce.h 2006-10-23 15:46:57.000000000 -0500
@@ -22,6 +22,8 @@
#define _ASM_POWERPC_TCE_H
#ifdef __KERNEL__
+#include <asm/iommu.h>
+
/*
* Tces come in two formats, one for the virtual bus and a different
* format for PCI
@@ -33,7 +35,7 @@
#define TCE_SHIFT 12
#define TCE_PAGE_SIZE (1 << TCE_SHIFT)
-#define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT)
+#define TCE_PAGE_FACTOR (IOMMU_PAGE_SHIFT - TCE_SHIFT)
#define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */
Index: linux-2.6.19-rc1-git11/arch/powerpc/kernel/vio.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/arch/powerpc/kernel/vio.c 2006-10-13 11:52:51.000000000 -0500
+++ linux-2.6.19-rc1-git11/arch/powerpc/kernel/vio.c 2006-10-23 16:06:43.000000000 -0500
@@ -92,9 +92,9 @@ static struct iommu_table *vio_build_iom
&tbl->it_index, &offset, &size);
/* TCE table size - measured in tce entries */
- tbl->it_size = size >> PAGE_SHIFT;
+ tbl->it_size = size >> IOMMU_PAGE_SHIFT;
/* offset for VIO should always be 0 */
- tbl->it_offset = offset >> PAGE_SHIFT;
+ tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
tbl->it_busno = 0;
tbl->it_type = TCE_VB;
Index: linux-2.6.19-rc1-git11/arch/powerpc/platforms/pseries/iommu.c
===================================================================
--- linux-2.6.19-rc1-git11.orig/arch/powerpc/platforms/pseries/iommu.c 2006-10-13 11:54:00.000000000 -0500
+++ linux-2.6.19-rc1-git11/arch/powerpc/platforms/pseries/iommu.c 2006-10-23 18:42:03.000000000 -0500
@@ -289,7 +289,7 @@ static void iommu_table_setparms(struct
tbl->it_busno = phb->bus->number;
/* Units of tce entries */
- tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
+ tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
/* Test if we are going over 2GB of DMA space */
if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
@@ -300,7 +300,7 @@ static void iommu_table_setparms(struct
phb->dma_window_base_cur += phb->dma_window_size;
/* Set the tce table size - measured in entries */
- tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
+ tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
tbl->it_index = 0;
tbl->it_blocksize = 16;
@@ -325,8 +325,8 @@ static void iommu_table_setparms_lpar(st
tbl->it_base = 0;
tbl->it_blocksize = 16;
tbl->it_type = TCE_PCI;
- tbl->it_offset = offset >> PAGE_SHIFT;
- tbl->it_size = size >> PAGE_SHIFT;
+ tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
+ tbl->it_size = size >> IOMMU_PAGE_SHIFT;
}
static void iommu_bus_setup_pSeries(struct pci_bus *bus)
@@ -522,8 +522,6 @@ static void iommu_dev_setup_pSeriesLP(st
const void *dma_window = NULL;
struct pci_dn *pci;
- DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev));
-
/* dev setup for LPAR is a little tricky, since the device tree might
* contain the dma-window properties per-device and not neccesarily
* for the bus. So we need to search upwards in the tree until we
@@ -532,6 +530,9 @@ static void iommu_dev_setup_pSeriesLP(st
*/
dn = pci_device_to_OF_node(dev);
+ DBG("iommu_dev_setup_pSeriesLP, dev %p (%s) %s\n",
+ dev, pci_name(dev), dn->full_name);
+
for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
pdn = pdn->parent) {
dma_window = get_property(pdn, "ibm,dma-window", NULL);
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [RFC]: map 4K iommu pages even on 64K largepage systems.
2006-10-24 0:25 [RFC]: map 4K iommu pages even on 64K largepage systems Linas Vepstas
@ 2006-10-24 0:50 ` Geoff Levand
2006-10-24 2:23 ` Benjamin Herrenschmidt
2006-10-24 2:22 ` Benjamin Herrenschmidt
2006-10-24 4:43 ` Benjamin Herrenschmidt
2 siblings, 1 reply; 13+ messages in thread
From: Geoff Levand @ 2006-10-24 0:50 UTC (permalink / raw)
To: Linas Vepstas; +Cc: Olof Johansson, linuxppc-dev
Linas Vepstas wrote:
> Subject: [RFC]: map 4K iommu pages even on 64K largepage systems.
...
> Some experimentaiton indicates that this is essentially because
> one 1500 byte ethernet MTU gets mapped as a 64K DMA region when
> the large 64K pages are enabled. Thus, it doesn't take much to
> exhaust all of the available DMA mappings for a high-speed card.
>
> This patch changes the iommu allocator to work with its own
> unique, distinct page size. Although the patch is long, its
> actually quite simple: it just #defines distinct IOMMU_PAGE_SIZE
> and then uses this in al the places tha matter.
This is a step in the right direction. Cell allows each device to
have its own io pagesize, from among those that the io controller
supports (4k, 64k, 1m,16m). This limitation of the current iommu
code that you try to address here has caused me to use platform
specific dma alloc routines.
-Geoff
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC]: map 4K iommu pages even on 64K largepage systems.
2006-10-24 0:50 ` Geoff Levand
@ 2006-10-24 2:23 ` Benjamin Herrenschmidt
0 siblings, 0 replies; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2006-10-24 2:23 UTC (permalink / raw)
To: Geoff Levand; +Cc: Olof Johansson, linuxppc-dev
> This is a step in the right direction. Cell allows each device to
> have its own io pagesize, from among those that the io controller
> supports (4k, 64k, 1m,16m). This limitation of the current iommu
> code that you try to address here has caused me to use platform
> specific dma alloc routines.
Are devices -actually- using different page sizes ? AFAIK, for your
platform, you are using a 4k base page size, thus there is very little
point (in fact, it's more like a problem) to have an IO mapping using
larger page sizes.
In any case, read my reply to Linas, I'm proposing a scheme where each
iommu_table instance could have its own page size, in which case you can
just create a table instance per device.
Ben.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC]: map 4K iommu pages even on 64K largepage systems.
2006-10-24 0:25 [RFC]: map 4K iommu pages even on 64K largepage systems Linas Vepstas
2006-10-24 0:50 ` Geoff Levand
@ 2006-10-24 2:22 ` Benjamin Herrenschmidt
2006-10-24 3:00 ` Geoff Levand
2006-10-24 4:43 ` Benjamin Herrenschmidt
2 siblings, 1 reply; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2006-10-24 2:22 UTC (permalink / raw)
To: Linas Vepstas; +Cc: Olof Johansson, linuxppc-dev
On Mon, 2006-10-23 at 19:25 -0500, Linas Vepstas wrote:
> Subject: [RFC]: map 4K iommu pages even on 64K largepage systems.
>
> The 10Gigabit ethernet device drivers appear to be able to chew
> up all 256MB of TCE mappings on pSeries systems, as evidenced by
> numerous error messages:
>
> iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1
>
> Some experimentaiton indicates that this is essentially because
> one 1500 byte ethernet MTU gets mapped as a 64K DMA region when
> the large 64K pages are enabled. Thus, it doesn't take much to
> exhaust all of the available DMA mappings for a high-speed card.
There is much to be said about using a 1500 MTU and no TSO on a 10G
link :) But apart from that, I agree, we have a problem.
> This patch changes the iommu allocator to work with its own
> unique, distinct page size. Although the patch is long, its
> actually quite simple: it just #defines distinct IOMMU_PAGE_SIZE
> and then uses this in al the places tha matter.
>
> The patch boots on pseries, untested in other places.
>
> Haven't yet thought if this is a good long-term solution or not,
> whether this kind of thing is desirable or not. That's why its
> an RFC. Comments?
It's probably a good enough solution for RHEL, but we should do
something different long term. There are a few things I have in mind:
- We could have a page size field in the iommu_table and have the iommu
allocator use that. Thus we can have a per iommu table instance page
size. That would allow Geoff to deal with his crazy hypervisor by
basically having one iommu table instance per device. It would also
allow us to keep using large iommu page sizes on platform where the
system gives us more than a pinhole for iommu space :)
- In the long run, I'm thinking about the interest in supporting two
page sizes for the fine and coarse allocation regions of the table. We
would need to get a bit more infos from the HW backend to do that, but
for example, on native Cell, we can have a page size per 256Mb region,
thus we could have the iommu space divided in 4k pages for small
mappings and 64k pages for full page or more mappings.
So I reckon we should first audit and make sure your current patch works
fine on everything as a crash-fix for 2.6.19 and backportable to RHEL.
Then, we can implement my first option for 2.6.20 and possibly debate
about the interest of my second option, unless somebody else comes up
with better ideas of course :)
Ben.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC]: map 4K iommu pages even on 64K largepage systems.
2006-10-24 2:22 ` Benjamin Herrenschmidt
@ 2006-10-24 3:00 ` Geoff Levand
0 siblings, 0 replies; 13+ messages in thread
From: Geoff Levand @ 2006-10-24 3:00 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: Olof Johansson, linuxppc-dev
Benjamin Herrenschmidt wrote:
> On Mon, 2006-10-23 at 19:25 -0500, Linas Vepstas wrote:
>> Subject: [RFC]: map 4K iommu pages even on 64K largepage systems.
>>
>> The 10Gigabit ethernet device drivers appear to be able to chew
>> up all 256MB of TCE mappings on pSeries systems, as evidenced by
>> numerous error messages:
>>
>> iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1
>>
>> Some experimentaiton indicates that this is essentially because
>> one 1500 byte ethernet MTU gets mapped as a 64K DMA region when
>> the large 64K pages are enabled. Thus, it doesn't take much to
>> exhaust all of the available DMA mappings for a high-speed card.
>
> There is much to be said about using a 1500MTU and no TSO on a 10G
> link :) But appart from that, I agree, we have a problem.
>
>> This patch changes the iommu allocator to work with its own
>> unique, distinct page size. Although the patch is long, its
>> actually quite simple: it just #defines distinct IOMMU_PAGE_SIZE
>> and then uses this in al the places tha matter.
>>
>> The patch boots on pseries, untested in other places.
>>
>> Haven't yet thought if this is a good long-term solution or not,
>> whether this kind of thing is desirable or not. That's why its
>> an RFC. Comments?
>
> It's probably a good enough solution for RHEL, but we should do
> something different long term. There are a few things I have in mind:
>
> - We could have a page size field in the iommu_table and have the iommu
> allocator use that. Thus we can have a per iommu table instance page
> size. That would allow Geoff to deal with his crazy hypervisor by
> basically having one iommu table instance per device. It would also
> allow us to keep using large iommu page sizes on platform where the
> system gives us more than a pinhole for iommu space :)
Actually, its not so important for me, since for performance, most users
will just want to map the whole of ram for every device and essentially
not use iommu pages.
For those that want to use per-device dynamic mapping, I believe there
are enough entries to support 4K io pages.
-Geoff
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC]: map 4K iommu pages even on 64K largepage systems.
2006-10-24 0:25 [RFC]: map 4K iommu pages even on 64K largepage systems Linas Vepstas
2006-10-24 0:50 ` Geoff Levand
2006-10-24 2:22 ` Benjamin Herrenschmidt
@ 2006-10-24 4:43 ` Benjamin Herrenschmidt
2006-10-24 5:14 ` [PATCH] powerpc: " Benjamin Herrenschmidt
2 siblings, 1 reply; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2006-10-24 4:43 UTC (permalink / raw)
To: Linas Vepstas; +Cc: Olof Johansson, linuxppc-dev
> Haven't yet thought if this is a good long-term solution or not,
> whether this kind of thing is desirable or not. That's why its
> an RFC. Comments?
Ok, after some time reading the details of the patch, it looks fairly
good, just a few nits. Also, we need to fix the DART code still :)
Ben
> +static inline unsigned long
> +iommu_num_pages(unsigned long vaddr, unsigned long slen)
Coding style in that file is to do:
static inline unsigned long iommu_num_pages(unsigned long vaddr,
unsigned long slen)
> @@ -557,8 +562,8 @@ void iommu_unmap_single(struct iommu_tab
> BUG_ON(direction == DMA_NONE);
>
> if (tbl)
> - iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
> - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
> + iommu_free(tbl, dma_handle, (IOMMU_PAGE_ALIGN(dma_handle + size) -
> + (dma_handle & IOMMU_PAGE_MASK)) >> IOMMU_PAGE_SHIFT);
> }
Use iommu_num_pages ?
> /* Allocates a contiguous real buffer and creates mappings over it.
> @@ -571,6 +576,7 @@ void *iommu_alloc_coherent(struct iommu_
> void *ret = NULL;
> dma_addr_t mapping;
> unsigned int npages, order;
> + unsigned int nio_pages, io_order;
> struct page *page;
>
> size = PAGE_ALIGN(size);
> @@ -598,8 +604,10 @@ void *iommu_alloc_coherent(struct iommu_
> memset(ret, 0, size);
>
> /* Set up tces to cover the allocated range */
> - mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL,
> - mask >> PAGE_SHIFT, order);
> + nio_pages = size >> IOMMU_PAGE_SHIFT;
> + io_order = get_iommu_order(size);
Not directly related to your patch, but do we really have this
requirement of allocating with an alignment of the order of the
allocation size here ?
Also, your code assumes that PAGE_SHIFT >= IOMMU_PAGE_SHIFT... it might
be useful to not have this restriction in fact. Just maybe use
iommu_num_pages(size) instead of size >> IOMMU_PAGE_SHIFT here ...
> + mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
> + mask >> IOMMU_PAGE_SHIFT, io_order);
> if (mapping == DMA_ERROR_CODE) {
> free_pages((unsigned long)ret, order);
> return NULL;
> @@ -614,9 +622,10 @@ void iommu_free_coherent(struct iommu_ta
> unsigned int npages;
>
> if (tbl) {
> - size = PAGE_ALIGN(size);
> - npages = size >> PAGE_SHIFT;
> + size = IOMMU_PAGE_ALIGN(size);
> + npages = size >> IOMMU_PAGE_SHIFT;
>
> iommu_free(tbl, dma_handle, npages);
> + size = PAGE_ALIGN(size);
> free_pages((unsigned long)vaddr, get_order(size));
> }
All those repeated alignments on size and one loses track... Are you
sure you'll always end up with the proper values ? I'd rather use
iommu_num_pages() for the iommu_free and only align size once with
PAGE_ALIGN for free_pages().
> }
> Index: linux-2.6.19-rc1-git11/include/asm-powerpc/iommu.h
> ===================================================================
> --- linux-2.6.19-rc1-git11.orig/include/asm-powerpc/iommu.h 2006-09-19 22:42:06.000000000 -0500
> +++ linux-2.6.19-rc1-git11/include/asm-powerpc/iommu.h 2006-10-23 18:28:21.000000000 -0500
> @@ -23,16 +23,41 @@
> #ifdef __KERNEL__
>
> #include <asm/types.h>
> +#include <linux/compiler.h>
> #include <linux/spinlock.h>
> #include <linux/device.h>
> #include <linux/dma-mapping.h>
>
> +#define IOMMU_PAGE_SHIFT 12
> +#define IOMMU_PAGE_SIZE (ASM_CONST(1) << IOMMU_PAGE_SHIFT)
> +#define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1))
> +#define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE)
> +
> +#ifndef __ASSEMBLY__
> +
> +/* Pure 2^n version of get_order */
> +static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
> +{
> + int order;
> +
> + size = (size - 1) >> (IOMMU_PAGE_SHIFT - 1);
> + order = -1;
> + do {
> + size >>= 1;
> + order++;
> + } while (size);
> + return order;
> +}
> +
> +#endif /* __ASSEMBLY__ */
Use cntlzw... or ilog2, which would give something like :
return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT) + 1;
I think Dave Howells has patches fixing get_order generically to be
implemented in terms of ilog2. (I just discovered that on ppc64, we
didn't use cntlzw/ilog2 for it, sucks, but let's wait for Dave patches
rather than fixing it ourselves now).
> +
> /*
> * IOMAP_MAX_ORDER defines the largest contiguous block
> * of dma space we can get. IOMAP_MAX_ORDER = 13
> * allows up to 2**12 pages (4096 * 4096) = 16 MB
> */
> -#define IOMAP_MAX_ORDER 13
> +#define IOMAP_MAX_ORDER (25 - PAGE_SHIFT)
I'm not completely sure we want the above to be defined as a function of
the page shift...
> struct iommu_table {
> unsigned long it_busno; /* Bus number this table belongs to */
> Index: linux-2.6.19-rc1-git11/include/asm-powerpc/tce.h
> ===================================================================
> --- linux-2.6.19-rc1-git11.orig/include/asm-powerpc/tce.h 2006-09-19 22:42:06.000000000 -0500
> +++ linux-2.6.19-rc1-git11/include/asm-powerpc/tce.h 2006-10-23 15:46:57.000000000 -0500
> @@ -22,6 +22,8 @@
> #define _ASM_POWERPC_TCE_H
> #ifdef __KERNEL__
>
> +#include <asm/iommu.h>
> +
> /*
> * Tces come in two formats, one for the virtual bus and a different
> * format for PCI
> @@ -33,7 +35,7 @@
>
> #define TCE_SHIFT 12
> #define TCE_PAGE_SIZE (1 << TCE_SHIFT)
> -#define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT)
> +#define TCE_PAGE_FACTOR (IOMMU_PAGE_SHIFT - TCE_SHIFT)
>
> #define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */
We can actually remove TCE_PAGE_FACTOR and code using it completely.
With your patch, TCE_SHIFT == IOMMU_PAGE_SHIFT, and in a second step, we
can start having the iommu_page_shift be a member of iommu_table. Thus
there will no longer be any need for a correction factor in the backend.
> Index: linux-2.6.19-rc1-git11/arch/powerpc/kernel/vio.c
> ===================================================================
> --- linux-2.6.19-rc1-git11.orig/arch/powerpc/kernel/vio.c 2006-10-13 11:52:51.000000000 -0500
> +++ linux-2.6.19-rc1-git11/arch/powerpc/kernel/vio.c 2006-10-23 16:06:43.000000000 -0500
> @@ -92,9 +92,9 @@ static struct iommu_table *vio_build_iom
> &tbl->it_index, &offset, &size);
>
> /* TCE table size - measured in tce entries */
> - tbl->it_size = size >> PAGE_SHIFT;
> + tbl->it_size = size >> IOMMU_PAGE_SHIFT;
> /* offset for VIO should always be 0 */
> - tbl->it_offset = offset >> PAGE_SHIFT;
> + tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
> tbl->it_busno = 0;
> tbl->it_type = TCE_VB;
>
> Index: linux-2.6.19-rc1-git11/arch/powerpc/platforms/pseries/iommu.c
> ===================================================================
> --- linux-2.6.19-rc1-git11.orig/arch/powerpc/platforms/pseries/iommu.c 2006-10-13 11:54:00.000000000 -0500
> +++ linux-2.6.19-rc1-git11/arch/powerpc/platforms/pseries/iommu.c 2006-10-23 18:42:03.000000000 -0500
> @@ -289,7 +289,7 @@ static void iommu_table_setparms(struct
> tbl->it_busno = phb->bus->number;
>
> /* Units of tce entries */
> - tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
> + tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
>
> /* Test if we are going over 2GB of DMA space */
> if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
> @@ -300,7 +300,7 @@ static void iommu_table_setparms(struct
> phb->dma_window_base_cur += phb->dma_window_size;
>
> /* Set the tce table size - measured in entries */
> - tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
> + tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
>
> tbl->it_index = 0;
> tbl->it_blocksize = 16;
> @@ -325,8 +325,8 @@ static void iommu_table_setparms_lpar(st
> tbl->it_base = 0;
> tbl->it_blocksize = 16;
> tbl->it_type = TCE_PCI;
> - tbl->it_offset = offset >> PAGE_SHIFT;
> - tbl->it_size = size >> PAGE_SHIFT;
> + tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
> + tbl->it_size = size >> IOMMU_PAGE_SHIFT;
> }
>
> static void iommu_bus_setup_pSeries(struct pci_bus *bus)
> @@ -522,8 +522,6 @@ static void iommu_dev_setup_pSeriesLP(st
> const void *dma_window = NULL;
> struct pci_dn *pci;
>
> - DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev));
> -
> /* dev setup for LPAR is a little tricky, since the device tree might
> * contain the dma-window properties per-device and not neccesarily
> * for the bus. So we need to search upwards in the tree until we
> @@ -532,6 +530,9 @@ static void iommu_dev_setup_pSeriesLP(st
> */
> dn = pci_device_to_OF_node(dev);
>
> + DBG("iommu_dev_setup_pSeriesLP, dev %p (%s) %s\n",
> + dev, pci_name(dev), dn->full_name);
> +
> for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
> pdn = pdn->parent) {
> dma_window = get_property(pdn, "ibm,dma-window", NULL);
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/linuxppc-dev
^ permalink raw reply [flat|nested] 13+ messages in thread* [PATCH] powerpc: map 4K iommu pages even on 64K largepage systems
2006-10-24 4:43 ` Benjamin Herrenschmidt
@ 2006-10-24 5:14 ` Benjamin Herrenschmidt
2006-10-24 20:08 ` Linas Vepstas
0 siblings, 1 reply; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2006-10-24 5:14 UTC (permalink / raw)
To: Linas Vepstas; +Cc: Olof Johansson, linuxppc-dev, Paul Mackerras
The 10Gigabit ethernet device drivers appear to be able to chew
up all 256MB of TCE mappings on pSeries systems, as evidenced by
numerous error messages:
iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1
Some experimentation indicates that this is essentially because
one 1500 byte ethernet MTU gets mapped as a 64K DMA region when
the large 64K pages are enabled. Thus, it doesn't take much to
exhaust all of the available DMA mappings for a high-speed card.
This patch changes the iommu allocator to work with its own
unique, distinct page size. Although the patch is long, it's
actually quite simple: it just #defines distinct IOMMU_PAGE_SIZE
and then uses this in all the places that matter.
The patch boots on pseries, untested in other places.
Haven't yet thought if this is a good long-term solution or not,
whether this kind of thing is desirable or not. That's why it's
an RFC. Comments?
Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
----
And here's a slightly modified version that fixes dart_iommu too and
changes things according to some of my comments.
(I tested on the G5 with and without 64K pages.
arch/powerpc/kernel/iommu.c | 77 +++++++++++++++++++--------------
arch/powerpc/kernel/vio.c | 4 -
arch/powerpc/platforms/pseries/iommu.c | 35 +++------------
arch/powerpc/sysdev/dart.h | 1
arch/powerpc/sysdev/dart_iommu.c | 8 ---
include/asm-powerpc/iommu.h | 22 ++++++++-
include/asm-powerpc/tce.h | 3 -
7 files changed, 78 insertions(+), 72 deletions(-)
Index: linux-cell/arch/powerpc/kernel/iommu.c
===================================================================
--- linux-cell.orig/arch/powerpc/kernel/iommu.c 2006-10-24 14:44:26.000000000 +1000
+++ linux-cell/arch/powerpc/kernel/iommu.c 2006-10-24 14:55:59.000000000 +1000
@@ -47,6 +47,17 @@ static int novmerge = 0;
static int novmerge = 1;
#endif
+static inline unsigned long iommu_num_pages(unsigned long vaddr,
+ unsigned long slen)
+{
+ unsigned long npages;
+
+ npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK);
+ npages >>= IOMMU_PAGE_SHIFT;
+
+ return npages;
+}
+
static int __init setup_iommu(char *str)
{
if (!strcmp(str, "novmerge"))
@@ -178,10 +189,10 @@ static dma_addr_t iommu_alloc(struct iom
}
entry += tbl->it_offset; /* Offset into real TCE table */
- ret = entry << PAGE_SHIFT; /* Set the return dma address */
+ ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */
/* Put the TCEs in the HW table */
- ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK,
+ ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
direction);
@@ -203,7 +214,7 @@ static void __iommu_free(struct iommu_ta
unsigned long entry, free_entry;
unsigned long i;
- entry = dma_addr >> PAGE_SHIFT;
+ entry = dma_addr >> IOMMU_PAGE_SHIFT;
free_entry = entry - tbl->it_offset;
if (((free_entry + npages) > tbl->it_size) ||
@@ -270,7 +281,7 @@ int iommu_map_sg(struct device *dev, str
/* Init first segment length for backout at failure */
outs->dma_length = 0;
- DBG("mapping %d elements:\n", nelems);
+ DBG("sg mapping %d elements:\n", nelems);
spin_lock_irqsave(&(tbl->it_lock), flags);
@@ -285,9 +296,8 @@ int iommu_map_sg(struct device *dev, str
}
/* Allocate iommu entries for that segment */
vaddr = (unsigned long)page_address(s->page) + s->offset;
- npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK);
- npages >>= PAGE_SHIFT;
- entry = iommu_range_alloc(tbl, npages, &handle, mask >> PAGE_SHIFT, 0);
+ npages = iommu_num_pages(vaddr, slen);
+ entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0);
DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);
@@ -301,14 +311,14 @@ int iommu_map_sg(struct device *dev, str
/* Convert entry to a dma_addr_t */
entry += tbl->it_offset;
- dma_addr = entry << PAGE_SHIFT;
- dma_addr |= s->offset;
+ dma_addr = entry << IOMMU_PAGE_SHIFT;
+ dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);
- DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n",
+ DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n",
npages, entry, dma_addr);
/* Insert into HW table */
- ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction);
+ ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction);
/* If we are in an open segment, try merging */
if (segstart != s) {
@@ -323,7 +333,7 @@ int iommu_map_sg(struct device *dev, str
DBG(" can't merge, new segment.\n");
} else {
outs->dma_length += s->length;
- DBG(" merged, new len: %lx\n", outs->dma_length);
+ DBG(" merged, new len: %ux\n", outs->dma_length);
}
}
@@ -367,9 +377,8 @@ int iommu_map_sg(struct device *dev, str
if (s->dma_length != 0) {
unsigned long vaddr, npages;
- vaddr = s->dma_address & PAGE_MASK;
- npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr)
- >> PAGE_SHIFT;
+ vaddr = s->dma_address & IOMMU_PAGE_MASK;
+ npages = iommu_num_pages(s->dma_address, s->dma_length);
__iommu_free(tbl, vaddr, npages);
s->dma_address = DMA_ERROR_CODE;
s->dma_length = 0;
@@ -398,8 +407,7 @@ void iommu_unmap_sg(struct iommu_table *
if (sglist->dma_length == 0)
break;
- npages = (PAGE_ALIGN(dma_handle + sglist->dma_length)
- - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT;
+ npages = iommu_num_pages(dma_handle,sglist->dma_length);
__iommu_free(tbl, dma_handle, npages);
sglist++;
}
@@ -532,12 +540,11 @@ dma_addr_t iommu_map_single(struct iommu
BUG_ON(direction == DMA_NONE);
uaddr = (unsigned long)vaddr;
- npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
- npages >>= PAGE_SHIFT;
+ npages = iommu_num_pages(uaddr, size);
if (tbl) {
dma_handle = iommu_alloc(tbl, vaddr, npages, direction,
- mask >> PAGE_SHIFT, 0);
+ mask >> IOMMU_PAGE_SHIFT, 0);
if (dma_handle == DMA_ERROR_CODE) {
if (printk_ratelimit()) {
printk(KERN_INFO "iommu_alloc failed, "
@@ -545,7 +552,7 @@ dma_addr_t iommu_map_single(struct iommu
tbl, vaddr, npages);
}
} else
- dma_handle |= (uaddr & ~PAGE_MASK);
+ dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
}
return dma_handle;
@@ -554,11 +561,14 @@ dma_addr_t iommu_map_single(struct iommu
void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction)
{
+ unsigned int npages;
+
BUG_ON(direction == DMA_NONE);
- if (tbl)
- iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
- (dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
+ if (tbl) {
+ npages = iommu_num_pages(dma_handle, size);
+ iommu_free(tbl, dma_handle, npages);
+ }
}
/* Allocates a contiguous real buffer and creates mappings over it.
@@ -570,11 +580,11 @@ void *iommu_alloc_coherent(struct iommu_
{
void *ret = NULL;
dma_addr_t mapping;
- unsigned int npages, order;
+ unsigned int order;
+ unsigned int nio_pages, io_order;
struct page *page;
size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
order = get_order(size);
/*
@@ -598,8 +608,10 @@ void *iommu_alloc_coherent(struct iommu_
memset(ret, 0, size);
/* Set up tces to cover the allocated range */
- mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL,
- mask >> PAGE_SHIFT, order);
+ nio_pages = size >> IOMMU_PAGE_SHIFT;
+ io_order = get_iommu_order(size);
+ mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
+ mask >> IOMMU_PAGE_SHIFT, io_order);
if (mapping == DMA_ERROR_CODE) {
free_pages((unsigned long)ret, order);
return NULL;
@@ -611,12 +623,13 @@ void *iommu_alloc_coherent(struct iommu_
void iommu_free_coherent(struct iommu_table *tbl, size_t size,
void *vaddr, dma_addr_t dma_handle)
{
- unsigned int npages;
-
if (tbl) {
+ unsigned int nio_pages;
+
+ size = PAGE_ALIGN(size);
+ nio_pages = size >> IOMMU_PAGE_SHIFT;
+ iommu_free(tbl, dma_handle, nio_pages);
size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
- iommu_free(tbl, dma_handle, npages);
free_pages((unsigned long)vaddr, get_order(size));
}
}
Index: linux-cell/include/asm-powerpc/iommu.h
===================================================================
--- linux-cell.orig/include/asm-powerpc/iommu.h 2006-10-24 14:44:26.000000000 +1000
+++ linux-cell/include/asm-powerpc/iommu.h 2006-10-24 14:58:45.000000000 +1000
@@ -22,17 +22,35 @@
#define _ASM_IOMMU_H
#ifdef __KERNEL__
-#include <asm/types.h>
+#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
+#include <asm/types.h>
+#include <asm/bitops.h>
+
+#define IOMMU_PAGE_SHIFT 12
+#define IOMMU_PAGE_SIZE (ASM_CONST(1) << IOMMU_PAGE_SHIFT)
+#define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1))
+#define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE)
+
+#ifndef __ASSEMBLY__
+
+/* Pure 2^n version of get_order */
+static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
+{
+ return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT) + 1;
+}
+
+#endif /* __ASSEMBLY__ */
+
/*
* IOMAP_MAX_ORDER defines the largest contiguous block
* of dma space we can get. IOMAP_MAX_ORDER = 13
* allows up to 2**12 pages (4096 * 4096) = 16 MB
*/
-#define IOMAP_MAX_ORDER 13
+#define IOMAP_MAX_ORDER 13
struct iommu_table {
unsigned long it_busno; /* Bus number this table belongs to */
Index: linux-cell/include/asm-powerpc/tce.h
===================================================================
--- linux-cell.orig/include/asm-powerpc/tce.h 2006-10-06 13:45:32.000000000 +1000
+++ linux-cell/include/asm-powerpc/tce.h 2006-10-24 14:59:20.000000000 +1000
@@ -22,6 +22,8 @@
#define _ASM_POWERPC_TCE_H
#ifdef __KERNEL__
+#include <asm/iommu.h>
+
/*
* Tces come in two formats, one for the virtual bus and a different
* format for PCI
@@ -33,7 +35,6 @@
#define TCE_SHIFT 12
#define TCE_PAGE_SIZE (1 << TCE_SHIFT)
-#define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT)
#define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */
Index: linux-cell/arch/powerpc/kernel/vio.c
===================================================================
--- linux-cell.orig/arch/powerpc/kernel/vio.c 2006-10-24 14:44:26.000000000 +1000
+++ linux-cell/arch/powerpc/kernel/vio.c 2006-10-24 14:44:55.000000000 +1000
@@ -92,9 +92,9 @@ static struct iommu_table *vio_build_iom
&tbl->it_index, &offset, &size);
/* TCE table size - measured in tce entries */
- tbl->it_size = size >> PAGE_SHIFT;
+ tbl->it_size = size >> IOMMU_PAGE_SHIFT;
/* offset for VIO should always be 0 */
- tbl->it_offset = offset >> PAGE_SHIFT;
+ tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
tbl->it_busno = 0;
tbl->it_type = TCE_VB;
Index: linux-cell/arch/powerpc/platforms/pseries/iommu.c
===================================================================
--- linux-cell.orig/arch/powerpc/platforms/pseries/iommu.c 2006-10-24 14:44:26.000000000 +1000
+++ linux-cell/arch/powerpc/platforms/pseries/iommu.c 2006-10-24 15:00:07.000000000 +1000
@@ -57,9 +57,6 @@ static void tce_build_pSeries(struct iom
u64 *tcep;
u64 rpn;
- index <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
proto_tce = TCE_PCI_READ; // Read allowed
if (direction != DMA_TO_DEVICE)
@@ -82,9 +79,6 @@ static void tce_free_pSeries(struct iomm
{
u64 *tcep;
- npages <<= TCE_PAGE_FACTOR;
- index <<= TCE_PAGE_FACTOR;
-
tcep = ((u64 *)tbl->it_base) + index;
while (npages--)
@@ -95,7 +89,6 @@ static unsigned long tce_get_pseries(str
{
u64 *tcep;
- index <<= TCE_PAGE_FACTOR;
tcep = ((u64 *)tbl->it_base) + index;
return *tcep;
@@ -109,9 +102,6 @@ static void tce_build_pSeriesLP(struct i
u64 proto_tce, tce;
u64 rpn;
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
proto_tce = TCE_PCI_READ;
if (direction != DMA_TO_DEVICE)
@@ -146,7 +136,7 @@ static void tce_buildmulti_pSeriesLP(str
u64 rpn;
long l, limit;
- if (TCE_PAGE_FACTOR == 0 && npages == 1)
+ if (npages == 1)
return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
direction);
@@ -164,9 +154,6 @@ static void tce_buildmulti_pSeriesLP(str
__get_cpu_var(tce_page) = tcep;
}
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
proto_tce = TCE_PCI_READ;
if (direction != DMA_TO_DEVICE)
@@ -207,9 +194,6 @@ static void tce_free_pSeriesLP(struct io
{
u64 rc;
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
while (npages--) {
rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);
@@ -229,9 +213,6 @@ static void tce_freemulti_pSeriesLP(stru
{
u64 rc;
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
if (rc && printk_ratelimit()) {
@@ -248,7 +229,6 @@ static unsigned long tce_get_pSeriesLP(s
u64 rc;
unsigned long tce_ret;
- tcenum <<= TCE_PAGE_FACTOR;
rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
if (rc && printk_ratelimit()) {
@@ -289,7 +269,7 @@ static void iommu_table_setparms(struct
tbl->it_busno = phb->bus->number;
/* Units of tce entries */
- tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
+ tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
/* Test if we are going over 2GB of DMA space */
if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
@@ -300,7 +280,7 @@ static void iommu_table_setparms(struct
phb->dma_window_base_cur += phb->dma_window_size;
/* Set the tce table size - measured in entries */
- tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
+ tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
tbl->it_index = 0;
tbl->it_blocksize = 16;
@@ -325,8 +305,8 @@ static void iommu_table_setparms_lpar(st
tbl->it_base = 0;
tbl->it_blocksize = 16;
tbl->it_type = TCE_PCI;
- tbl->it_offset = offset >> PAGE_SHIFT;
- tbl->it_size = size >> PAGE_SHIFT;
+ tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
+ tbl->it_size = size >> IOMMU_PAGE_SHIFT;
}
static void iommu_bus_setup_pSeries(struct pci_bus *bus)
@@ -522,8 +502,6 @@ static void iommu_dev_setup_pSeriesLP(st
const void *dma_window = NULL;
struct pci_dn *pci;
- DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev));
-
/* dev setup for LPAR is a little tricky, since the device tree might
* contain the dma-window properties per-device and not neccesarily
* for the bus. So we need to search upwards in the tree until we
@@ -532,6 +510,9 @@ static void iommu_dev_setup_pSeriesLP(st
*/
dn = pci_device_to_OF_node(dev);
+ DBG("iommu_dev_setup_pSeriesLP, dev %p (%s) %s\n",
+ dev, pci_name(dev), dn->full_name);
+
for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
pdn = pdn->parent) {
dma_window = get_property(pdn, "ibm,dma-window", NULL);
Index: linux-cell/arch/powerpc/sysdev/dart.h
===================================================================
--- linux-cell.orig/arch/powerpc/sysdev/dart.h 2006-10-06 13:47:54.000000000 +1000
+++ linux-cell/arch/powerpc/sysdev/dart.h 2006-10-24 15:01:16.000000000 +1000
@@ -72,7 +72,6 @@
#define DART_PAGE_SHIFT 12
#define DART_PAGE_SIZE (1 << DART_PAGE_SHIFT)
-#define DART_PAGE_FACTOR (PAGE_SHIFT - DART_PAGE_SHIFT)
#endif /* _POWERPC_SYSDEV_DART_H */
Index: linux-cell/arch/powerpc/sysdev/dart_iommu.c
===================================================================
--- linux-cell.orig/arch/powerpc/sysdev/dart_iommu.c 2006-10-24 14:44:26.000000000 +1000
+++ linux-cell/arch/powerpc/sysdev/dart_iommu.c 2006-10-24 15:01:47.000000000 +1000
@@ -156,9 +156,6 @@ static void dart_build(struct iommu_tabl
DBG("dart: build at: %lx, %lx, addr: %x\n", index, npages, uaddr);
- index <<= DART_PAGE_FACTOR;
- npages <<= DART_PAGE_FACTOR;
-
dp = ((unsigned int*)tbl->it_base) + index;
/* On U3, all memory is contigous, so we can move this
@@ -199,9 +196,6 @@ static void dart_free(struct iommu_table
DBG("dart: free at: %lx, %lx\n", index, npages);
- index <<= DART_PAGE_FACTOR;
- npages <<= DART_PAGE_FACTOR;
-
dp = ((unsigned int *)tbl->it_base) + index;
while (npages--)
@@ -281,7 +275,7 @@ static void iommu_table_dart_setup(void)
iommu_table_dart.it_busno = 0;
iommu_table_dart.it_offset = 0;
/* it_size is in number of entries */
- iommu_table_dart.it_size = (dart_tablesize / sizeof(u32)) >> DART_PAGE_FACTOR;
+ iommu_table_dart.it_size = dart_tablesize / sizeof(u32);
/* Initialize the common IOMMU code */
iommu_table_dart.it_base = (unsigned long)dart_vbase;
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH] powerpc: map 4K iommu pages even on 64K largepage systems
2006-10-24 5:14 ` [PATCH] powerpc: " Benjamin Herrenschmidt
@ 2006-10-24 20:08 ` Linas Vepstas
2006-10-24 21:41 ` Benjamin Herrenschmidt
0 siblings, 1 reply; 13+ messages in thread
From: Linas Vepstas @ 2006-10-24 20:08 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: Olof Johansson, linuxppc-dev, Paul Mackerras
Hi Ben,
The patch tests well; but there was one change I didn't understand ...
> +++ linux-cell/include/asm-powerpc/iommu.h 2006-10-24 14:58:45.000000000 +1000
> +
> +#define IOMMU_PAGE_SHIFT 12
> +++ linux-cell/include/asm-powerpc/tce.h 2006-10-24 14:59:20.000000000 +1000
>
> #define TCE_SHIFT 12
> #define TCE_PAGE_SIZE (1 << TCE_SHIFT)
> -#define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT)
This is zero now, but if anyone ever changes IOMMU_PAGE_SHIFT to a
value other than 12, then the below will break:
> +++ linux-cell/arch/powerpc/platforms/pseries/iommu.c 2006-10-24 15:00:07.000000000 +1000
>
> - index <<= TCE_PAGE_FACTOR;
> - npages <<= TCE_PAGE_FACTOR;
since this shift does need to be made, if IOMMU_PAGE_SHIFT != TCE_SHIFT
> +++ linux-cell/arch/powerpc/sysdev/dart.h 2006-10-24 15:01:16.000000000 +1000
>
> #define DART_PAGE_SHIFT 12
> #define DART_PAGE_SIZE (1 << DART_PAGE_SHIFT)
> -#define DART_PAGE_FACTOR (PAGE_SHIFT - DART_PAGE_SHIFT)
I'd argue that the right fix would have been
> +#define DART_PAGE_FACTOR (IOMMU_PAGE_SHIFT - DART_PAGE_SHIFT)
> +++ linux-cell/arch/powerpc/sysdev/dart_iommu.c 2006-10-24 15:01:47.000000000 +1000
>
> - index <<= DART_PAGE_FACTOR;
> - npages <<= DART_PAGE_FACTOR;
And do *not* remove these lines... certainly, they would have to be
put back in if we made iommu_page_size to be a variable in the iommu
table..
--linas
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] powerpc: map 4K iommu pages even on 64K largepage systems
2006-10-24 20:08 ` Linas Vepstas
@ 2006-10-24 21:41 ` Benjamin Herrenschmidt
2006-10-24 23:17 ` Linas Vepstas
0 siblings, 1 reply; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2006-10-24 21:41 UTC (permalink / raw)
To: Linas Vepstas; +Cc: Olof Johansson, linuxppc-dev, Paul Mackerras
On Tue, 2006-10-24 at 15:08 -0500, Linas Vepstas wrote:
> Hi Ben,
>
> The patch tests well; but there was one change I didn't understand ...
>
> > +++ linux-cell/include/asm-powerpc/iommu.h 2006-10-24 14:58:45.000000000 +1000
> > +
> > +#define IOMMU_PAGE_SHIFT 12
>
>
> > +++ linux-cell/include/asm-powerpc/tce.h 2006-10-24 14:59:20.000000000 +1000
> >
> > #define TCE_SHIFT 12
> > #define TCE_PAGE_SIZE (1 << TCE_SHIFT)
> > -#define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT)
>
> This is zero now, but if anyone ever changes IOMMU_PAGE_SHIFT to a
> value other than 12, then the below will break:
My point is that we'll probably not change IOMMU_PAGE_SHIFT. The only
thing we might do is to move the shift into the iommu_table structure to
make it per-iommu (in which case the TCE backend will use 4k or 64k
depending on actual HW/FW support for those sizes).
> > +++ linux-cell/arch/powerpc/platforms/pseries/iommu.c 2006-10-24 15:00:07.000000000 +1000
> >
> > - index <<= TCE_PAGE_FACTOR;
> > - npages <<= TCE_PAGE_FACTOR;
>
> since this shift does need to be made, if IOMMU_PAGE_SHIFT != TCE_SHIFT
Which will not happen.
> > +++ linux-cell/arch/powerpc/sysdev/dart.h 2006-10-24 15:01:16.000000000 +1000
> >
> > #define DART_PAGE_SHIFT 12
> > #define DART_PAGE_SIZE (1 << DART_PAGE_SHIFT)
> > -#define DART_PAGE_FACTOR (PAGE_SHIFT - DART_PAGE_SHIFT)
>
> I'd argue that the right fix would have been
>
> > +#define DART_PAGE_FACTOR (IOMMU_PAGE_SHIFT - DART_PAGE_SHIFT)
See my above comment :)
> > +++ linux-cell/arch/powerpc/sysdev/dart_iommu.c 2006-10-24 15:01:47.000000000 +1000
> >
> > - index <<= DART_PAGE_FACTOR;
> > - npages <<= DART_PAGE_FACTOR;
>
> And do *not* remove these lines... certainly, they would have to be
> put back in if we made iommu_page_size to be a variable in the iommu
> table..
No. DART would set the table shift to DART_PAGE_SHIFT, TCE would set it
to TCE_PAGE_SHIFT and the backend would always get natively sized
addresses/counts.
Ben.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] powerpc: map 4K iommu pages even on 64K largepage systems
2006-10-24 21:41 ` Benjamin Herrenschmidt
@ 2006-10-24 23:17 ` Linas Vepstas
0 siblings, 0 replies; 13+ messages in thread
From: Linas Vepstas @ 2006-10-24 23:17 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: Olof Johansson, linuxppc-dev, Paul Mackerras
On Wed, Oct 25, 2006 at 07:41:23AM +1000, Benjamin Herrenschmidt wrote:
>
> My point is that we'll probably not change IOMMU_PAGE_SHIFT. The only
> thing we might do is to move the shift into the iommu_table structure to
> make it per-iommu (in which case the TCE backend will use 4k or 64k
> depending on actual HW/FW support for those sizes).
OK, right; I don't know why, I didn't see this right away.
--linas
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH] powerpc: map 4K iommu pages even on 64K largepage systems
@ 2006-10-30 5:15 Benjamin Herrenschmidt
2006-10-30 21:33 ` Olof Johansson
2006-10-31 3:06 ` Stephen Rothwell
0 siblings, 2 replies; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2006-10-30 5:15 UTC (permalink / raw)
To: linuxppc-dev list; +Cc: Olof Johansson, Paul Mackerras
The 10Gigabit ethernet device drivers appear to be able to chew
up all 256MB of TCE mappings on pSeries systems, as evidenced by
numerous error messages:
iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1
Some experimentation indicates that this is essentially because
one 1500 byte ethernet MTU gets mapped as a 64K DMA region when
the large 64K pages are enabled. Thus, it doesn't take much to
exhaust all of the available DMA mappings for a high-speed card.
This patch changes the iommu allocator to work with its own
unique, distinct page size. Although the patch is long, its
actually quite simple: it just #defines distinct IOMMU_PAGE_SIZE
and then uses this in all the places that matter.
As a side effect, it also dramatically improves network performances
on platforms with H-calls on iommu translation inserts/removes (since
we no longer call it 16 times for a 1500 bytes packet when the iommu HW
is still 4k).
In the future, we might want to make the IOMMU_PAGE_SIZE a variable
in the iommu_table instance, thus allowing support for different HW
page sizes in the iommu itself.
Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
iSeries friendly version of it. Needs to go to RHEL5 too.
Index: linux-cell/arch/powerpc/kernel/iommu.c
===================================================================
--- linux-cell.orig/arch/powerpc/kernel/iommu.c 2006-10-09 12:03:33.000000000 +1000
+++ linux-cell/arch/powerpc/kernel/iommu.c 2006-10-30 14:04:38.000000000 +1100
@@ -47,6 +47,17 @@ static int novmerge = 0;
static int novmerge = 1;
#endif
+static inline unsigned long iommu_num_pages(unsigned long vaddr,
+ unsigned long slen)
+{
+ unsigned long npages;
+
+ npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK);
+ npages >>= IOMMU_PAGE_SHIFT;
+
+ return npages;
+}
+
static int __init setup_iommu(char *str)
{
if (!strcmp(str, "novmerge"))
@@ -178,10 +189,10 @@ static dma_addr_t iommu_alloc(struct iom
}
entry += tbl->it_offset; /* Offset into real TCE table */
- ret = entry << PAGE_SHIFT; /* Set the return dma address */
+ ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */
/* Put the TCEs in the HW table */
- ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK,
+ ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
direction);
@@ -203,7 +214,7 @@ static void __iommu_free(struct iommu_ta
unsigned long entry, free_entry;
unsigned long i;
- entry = dma_addr >> PAGE_SHIFT;
+ entry = dma_addr >> IOMMU_PAGE_SHIFT;
free_entry = entry - tbl->it_offset;
if (((free_entry + npages) > tbl->it_size) ||
@@ -270,7 +281,7 @@ int iommu_map_sg(struct device *dev, str
/* Init first segment length for backout at failure */
outs->dma_length = 0;
- DBG("mapping %d elements:\n", nelems);
+ DBG("sg mapping %d elements:\n", nelems);
spin_lock_irqsave(&(tbl->it_lock), flags);
@@ -285,9 +296,8 @@ int iommu_map_sg(struct device *dev, str
}
/* Allocate iommu entries for that segment */
vaddr = (unsigned long)page_address(s->page) + s->offset;
- npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK);
- npages >>= PAGE_SHIFT;
- entry = iommu_range_alloc(tbl, npages, &handle, mask >> PAGE_SHIFT, 0);
+ npages = iommu_num_pages(vaddr, slen);
+ entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0);
DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);
@@ -301,14 +311,14 @@ int iommu_map_sg(struct device *dev, str
/* Convert entry to a dma_addr_t */
entry += tbl->it_offset;
- dma_addr = entry << PAGE_SHIFT;
- dma_addr |= s->offset;
+ dma_addr = entry << IOMMU_PAGE_SHIFT;
+ dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);
- DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n",
+ DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n",
npages, entry, dma_addr);
/* Insert into HW table */
- ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction);
+ ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction);
/* If we are in an open segment, try merging */
if (segstart != s) {
@@ -323,7 +333,7 @@ int iommu_map_sg(struct device *dev, str
DBG(" can't merge, new segment.\n");
} else {
outs->dma_length += s->length;
- DBG(" merged, new len: %lx\n", outs->dma_length);
+ DBG(" merged, new len: %ux\n", outs->dma_length);
}
}
@@ -367,9 +377,8 @@ int iommu_map_sg(struct device *dev, str
if (s->dma_length != 0) {
unsigned long vaddr, npages;
- vaddr = s->dma_address & PAGE_MASK;
- npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr)
- >> PAGE_SHIFT;
+ vaddr = s->dma_address & IOMMU_PAGE_MASK;
+ npages = iommu_num_pages(s->dma_address, s->dma_length);
__iommu_free(tbl, vaddr, npages);
s->dma_address = DMA_ERROR_CODE;
s->dma_length = 0;
@@ -398,8 +407,7 @@ void iommu_unmap_sg(struct iommu_table *
if (sglist->dma_length == 0)
break;
- npages = (PAGE_ALIGN(dma_handle + sglist->dma_length)
- - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT;
+ npages = iommu_num_pages(dma_handle,sglist->dma_length);
__iommu_free(tbl, dma_handle, npages);
sglist++;
}
@@ -532,12 +540,11 @@ dma_addr_t iommu_map_single(struct iommu
BUG_ON(direction == DMA_NONE);
uaddr = (unsigned long)vaddr;
- npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
- npages >>= PAGE_SHIFT;
+ npages = iommu_num_pages(uaddr, size);
if (tbl) {
dma_handle = iommu_alloc(tbl, vaddr, npages, direction,
- mask >> PAGE_SHIFT, 0);
+ mask >> IOMMU_PAGE_SHIFT, 0);
if (dma_handle == DMA_ERROR_CODE) {
if (printk_ratelimit()) {
printk(KERN_INFO "iommu_alloc failed, "
@@ -545,7 +552,7 @@ dma_addr_t iommu_map_single(struct iommu
tbl, vaddr, npages);
}
} else
- dma_handle |= (uaddr & ~PAGE_MASK);
+ dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
}
return dma_handle;
@@ -554,11 +561,14 @@ dma_addr_t iommu_map_single(struct iommu
void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction)
{
+ unsigned int npages;
+
BUG_ON(direction == DMA_NONE);
- if (tbl)
- iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
- (dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
+ if (tbl) {
+ npages = iommu_num_pages(dma_handle, size);
+ iommu_free(tbl, dma_handle, npages);
+ }
}
/* Allocates a contiguous real buffer and creates mappings over it.
@@ -570,11 +580,11 @@ void *iommu_alloc_coherent(struct iommu_
{
void *ret = NULL;
dma_addr_t mapping;
- unsigned int npages, order;
+ unsigned int order;
+ unsigned int nio_pages, io_order;
struct page *page;
size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
order = get_order(size);
/*
@@ -598,8 +608,10 @@ void *iommu_alloc_coherent(struct iommu_
memset(ret, 0, size);
/* Set up tces to cover the allocated range */
- mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL,
- mask >> PAGE_SHIFT, order);
+ nio_pages = size >> IOMMU_PAGE_SHIFT;
+ io_order = get_iommu_order(size);
+ mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
+ mask >> IOMMU_PAGE_SHIFT, io_order);
if (mapping == DMA_ERROR_CODE) {
free_pages((unsigned long)ret, order);
return NULL;
@@ -611,12 +623,13 @@ void *iommu_alloc_coherent(struct iommu_
void iommu_free_coherent(struct iommu_table *tbl, size_t size,
void *vaddr, dma_addr_t dma_handle)
{
- unsigned int npages;
-
if (tbl) {
+ unsigned int nio_pages;
+
+ size = PAGE_ALIGN(size);
+ nio_pages = size >> IOMMU_PAGE_SHIFT;
+ iommu_free(tbl, dma_handle, nio_pages);
size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
- iommu_free(tbl, dma_handle, npages);
free_pages((unsigned long)vaddr, get_order(size));
}
}
Index: linux-cell/include/asm-powerpc/iommu.h
===================================================================
--- linux-cell.orig/include/asm-powerpc/iommu.h 2006-10-06 13:45:32.000000000 +1000
+++ linux-cell/include/asm-powerpc/iommu.h 2006-10-30 14:04:38.000000000 +1100
@@ -22,17 +22,35 @@
#define _ASM_IOMMU_H
#ifdef __KERNEL__
-#include <asm/types.h>
+#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
+#include <asm/types.h>
+#include <asm/bitops.h>
+
+#define IOMMU_PAGE_SHIFT 12
+#define IOMMU_PAGE_SIZE (ASM_CONST(1) << IOMMU_PAGE_SHIFT)
+#define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1))
+#define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE)
+
+#ifndef __ASSEMBLY__
+
+/* Pure 2^n version of get_order */
+static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
+{
+ return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT) + 1;
+}
+
+#endif /* __ASSEMBLY__ */
+
/*
* IOMAP_MAX_ORDER defines the largest contiguous block
* of dma space we can get. IOMAP_MAX_ORDER = 13
* allows up to 2**12 pages (4096 * 4096) = 16 MB
*/
-#define IOMAP_MAX_ORDER 13
+#define IOMAP_MAX_ORDER 13
struct iommu_table {
unsigned long it_busno; /* Bus number this table belongs to */
Index: linux-cell/include/asm-powerpc/tce.h
===================================================================
--- linux-cell.orig/include/asm-powerpc/tce.h 2006-10-06 13:45:32.000000000 +1000
+++ linux-cell/include/asm-powerpc/tce.h 2006-10-27 15:41:49.000000000 +1000
@@ -22,6 +22,8 @@
#define _ASM_POWERPC_TCE_H
#ifdef __KERNEL__
+#include <asm/iommu.h>
+
/*
* Tces come in two formats, one for the virtual bus and a different
* format for PCI
@@ -33,7 +35,6 @@
#define TCE_SHIFT 12
#define TCE_PAGE_SIZE (1 << TCE_SHIFT)
-#define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT)
#define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */
Index: linux-cell/arch/powerpc/kernel/vio.c
===================================================================
--- linux-cell.orig/arch/powerpc/kernel/vio.c 2006-10-06 13:47:54.000000000 +1000
+++ linux-cell/arch/powerpc/kernel/vio.c 2006-10-30 14:04:38.000000000 +1100
@@ -92,9 +92,9 @@ static struct iommu_table *vio_build_iom
&tbl->it_index, &offset, &size);
/* TCE table size - measured in tce entries */
- tbl->it_size = size >> PAGE_SHIFT;
+ tbl->it_size = size >> IOMMU_PAGE_SHIFT;
/* offset for VIO should always be 0 */
- tbl->it_offset = offset >> PAGE_SHIFT;
+ tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
tbl->it_busno = 0;
tbl->it_type = TCE_VB;
Index: linux-cell/arch/powerpc/platforms/pseries/iommu.c
===================================================================
--- linux-cell.orig/arch/powerpc/platforms/pseries/iommu.c 2006-10-09 12:03:33.000000000 +1000
+++ linux-cell/arch/powerpc/platforms/pseries/iommu.c 2006-10-30 14:20:45.000000000 +1100
@@ -57,9 +57,6 @@ static void tce_build_pSeries(struct iom
u64 *tcep;
u64 rpn;
- index <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
proto_tce = TCE_PCI_READ; // Read allowed
if (direction != DMA_TO_DEVICE)
@@ -82,9 +79,6 @@ static void tce_free_pSeries(struct iomm
{
u64 *tcep;
- npages <<= TCE_PAGE_FACTOR;
- index <<= TCE_PAGE_FACTOR;
-
tcep = ((u64 *)tbl->it_base) + index;
while (npages--)
@@ -95,7 +89,6 @@ static unsigned long tce_get_pseries(str
{
u64 *tcep;
- index <<= TCE_PAGE_FACTOR;
tcep = ((u64 *)tbl->it_base) + index;
return *tcep;
@@ -109,9 +102,6 @@ static void tce_build_pSeriesLP(struct i
u64 proto_tce, tce;
u64 rpn;
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
proto_tce = TCE_PCI_READ;
if (direction != DMA_TO_DEVICE)
@@ -146,7 +136,7 @@ static void tce_buildmulti_pSeriesLP(str
u64 rpn;
long l, limit;
- if (TCE_PAGE_FACTOR == 0 && npages == 1)
+ if (npages == 1)
return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
direction);
@@ -164,9 +154,6 @@ static void tce_buildmulti_pSeriesLP(str
__get_cpu_var(tce_page) = tcep;
}
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
proto_tce = TCE_PCI_READ;
if (direction != DMA_TO_DEVICE)
@@ -207,9 +194,6 @@ static void tce_free_pSeriesLP(struct io
{
u64 rc;
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
while (npages--) {
rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);
@@ -229,9 +213,6 @@ static void tce_freemulti_pSeriesLP(stru
{
u64 rc;
- tcenum <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
if (rc && printk_ratelimit()) {
@@ -248,7 +229,6 @@ static unsigned long tce_get_pSeriesLP(s
u64 rc;
unsigned long tce_ret;
- tcenum <<= TCE_PAGE_FACTOR;
rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
if (rc && printk_ratelimit()) {
@@ -289,7 +269,7 @@ static void iommu_table_setparms(struct
tbl->it_busno = phb->bus->number;
/* Units of tce entries */
- tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
+ tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
/* Test if we are going over 2GB of DMA space */
if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
@@ -300,7 +280,7 @@ static void iommu_table_setparms(struct
phb->dma_window_base_cur += phb->dma_window_size;
/* Set the tce table size - measured in entries */
- tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
+ tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
tbl->it_index = 0;
tbl->it_blocksize = 16;
@@ -325,8 +305,8 @@ static void iommu_table_setparms_lpar(st
tbl->it_base = 0;
tbl->it_blocksize = 16;
tbl->it_type = TCE_PCI;
- tbl->it_offset = offset >> PAGE_SHIFT;
- tbl->it_size = size >> PAGE_SHIFT;
+ tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
+ tbl->it_size = size >> IOMMU_PAGE_SHIFT;
}
static void iommu_bus_setup_pSeries(struct pci_bus *bus)
@@ -522,8 +502,6 @@ static void iommu_dev_setup_pSeriesLP(st
const void *dma_window = NULL;
struct pci_dn *pci;
- DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev));
-
/* dev setup for LPAR is a little tricky, since the device tree might
* contain the dma-window properties per-device and not neccesarily
* for the bus. So we need to search upwards in the tree until we
@@ -532,6 +510,9 @@ static void iommu_dev_setup_pSeriesLP(st
*/
dn = pci_device_to_OF_node(dev);
+ DBG("iommu_dev_setup_pSeriesLP, dev %p (%s) %s\n",
+ dev, pci_name(dev), dn->full_name);
+
for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
pdn = pdn->parent) {
dma_window = get_property(pdn, "ibm,dma-window", NULL);
Index: linux-cell/arch/powerpc/sysdev/dart.h
===================================================================
--- linux-cell.orig/arch/powerpc/sysdev/dart.h 2006-10-06 13:47:54.000000000 +1000
+++ linux-cell/arch/powerpc/sysdev/dart.h 2006-10-27 15:41:49.000000000 +1000
@@ -72,7 +72,6 @@
#define DART_PAGE_SHIFT 12
#define DART_PAGE_SIZE (1 << DART_PAGE_SHIFT)
-#define DART_PAGE_FACTOR (PAGE_SHIFT - DART_PAGE_SHIFT)
#endif /* _POWERPC_SYSDEV_DART_H */
Index: linux-cell/arch/powerpc/sysdev/dart_iommu.c
===================================================================
--- linux-cell.orig/arch/powerpc/sysdev/dart_iommu.c 2006-10-06 13:47:54.000000000 +1000
+++ linux-cell/arch/powerpc/sysdev/dart_iommu.c 2006-10-30 14:04:38.000000000 +1100
@@ -156,9 +156,6 @@ static void dart_build(struct iommu_tabl
DBG("dart: build at: %lx, %lx, addr: %x\n", index, npages, uaddr);
- index <<= DART_PAGE_FACTOR;
- npages <<= DART_PAGE_FACTOR;
-
dp = ((unsigned int*)tbl->it_base) + index;
/* On U3, all memory is contigous, so we can move this
@@ -199,9 +196,6 @@ static void dart_free(struct iommu_table
DBG("dart: free at: %lx, %lx\n", index, npages);
- index <<= DART_PAGE_FACTOR;
- npages <<= DART_PAGE_FACTOR;
-
dp = ((unsigned int *)tbl->it_base) + index;
while (npages--)
@@ -281,7 +275,7 @@ static void iommu_table_dart_setup(void)
iommu_table_dart.it_busno = 0;
iommu_table_dart.it_offset = 0;
/* it_size is in number of entries */
- iommu_table_dart.it_size = (dart_tablesize / sizeof(u32)) >> DART_PAGE_FACTOR;
+ iommu_table_dart.it_size = dart_tablesize / sizeof(u32);
/* Initialize the common IOMMU code */
iommu_table_dart.it_base = (unsigned long)dart_vbase;
Index: linux-cell/arch/powerpc/platforms/iseries/iommu.c
===================================================================
--- linux-cell.orig/arch/powerpc/platforms/iseries/iommu.c 2006-10-06 13:47:54.000000000 +1000
+++ linux-cell/arch/powerpc/platforms/iseries/iommu.c 2006-10-30 14:10:44.000000000 +1100
@@ -43,9 +43,6 @@ static void tce_build_iSeries(struct iom
u64 rc;
u64 tce, rpn;
- index <<= TCE_PAGE_FACTOR;
- npages <<= TCE_PAGE_FACTOR;
-
while (npages--) {
rpn = virt_to_abs(uaddr) >> TCE_SHIFT;
tce = (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
@@ -75,9 +72,6 @@ static void tce_free_iSeries(struct iomm
{
u64 rc;
- npages <<= TCE_PAGE_FACTOR;
- index <<= TCE_PAGE_FACTOR;
-
while (npages--) {
rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index, 0);
if (rc)
@@ -136,10 +130,9 @@ void iommu_table_getparms_iSeries(unsign
panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms);
/* itc_size is in pages worth of table, it_size is in # of entries */
- tbl->it_size = ((parms->itc_size * TCE_PAGE_SIZE) /
- TCE_ENTRY_SIZE) >> TCE_PAGE_FACTOR;
+ tbl->it_size = (parms->itc_size * TCE_PAGE_SIZE) / TCE_ENTRY_SIZE;
tbl->it_busno = parms->itc_busno;
- tbl->it_offset = parms->itc_offset >> TCE_PAGE_FACTOR;
+ tbl->it_offset = parms->itc_offset;
tbl->it_index = parms->itc_index;
tbl->it_blocksize = 1;
tbl->it_type = virtbus ? TCE_VB : TCE_PCI;
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] powerpc: map 4K iommu pages even on 64K largepage systems
2006-10-30 5:15 Benjamin Herrenschmidt
@ 2006-10-30 21:33 ` Olof Johansson
2006-10-31 3:06 ` Stephen Rothwell
1 sibling, 0 replies; 13+ messages in thread
From: Olof Johansson @ 2006-10-30 21:33 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev list, Paul Mackerras
Thanks for doing this, Linas and Ben.
On Mon, 30 Oct 2006 16:15:59 +1100 Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Olof Johansson <olof@lixom.net>
-Olof
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] powerpc: map 4K iommu pages even on 64K largepage systems
2006-10-30 5:15 Benjamin Herrenschmidt
2006-10-30 21:33 ` Olof Johansson
@ 2006-10-31 3:06 ` Stephen Rothwell
1 sibling, 0 replies; 13+ messages in thread
From: Stephen Rothwell @ 2006-10-31 3:06 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: Olof Johansson, linuxppc-dev list, Paul Mackerras
[-- Attachment #1: Type: text/plain, Size: 436 bytes --]
On Mon, 30 Oct 2006 16:15:59 +1100 Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
>
> Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>
> iSeries friendly version of it. Needs to go to RHEL5 too.
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/
[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2006-10-31 3:06 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-10-24 0:25 [RFC]: map 4K iommu pages even on 64K largepage systems Linas Vepstas
2006-10-24 0:50 ` Geoff Levand
2006-10-24 2:23 ` Benjamin Herrenschmidt
2006-10-24 2:22 ` Benjamin Herrenschmidt
2006-10-24 3:00 ` Geoff Levand
2006-10-24 4:43 ` Benjamin Herrenschmidt
2006-10-24 5:14 ` [PATCH] powerpc: " Benjamin Herrenschmidt
2006-10-24 20:08 ` Linas Vepstas
2006-10-24 21:41 ` Benjamin Herrenschmidt
2006-10-24 23:17 ` Linas Vepstas
-- strict thread matches above, loose matches on Subject: below --
2006-10-30 5:15 Benjamin Herrenschmidt
2006-10-30 21:33 ` Olof Johansson
2006-10-31 3:06 ` Stephen Rothwell
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).