Re: [PATCH -mm 0/3] convert IOMMUs to use iova

All of lore.kernel.org
 help / color / mirror / Atom feed

From: FUJITA Tomonori <tomof@acm.org>
To: muli@il.ibm.com
Cc: tomof@acm.org, linux-kernel@vger.kernel.org,
	linux-scsi@vger.kernel.org, James.Bottomley@SteelEye.com,
	jens.axboe@oracle.com, jeff@garzik.org,
	anil.s.keshavamurthy@intel.com, paulus@samba.org,
	anton@samba.org, olof@lixom.net, tony.luck@intel.com,
	davem@davemloft.net, kyle@parisc-linux.org,
	fujita.tomonori@lab.ntt.co.jpfujita.tomonori@lab.ntt.co.jp
Subject: Re: [PATCH -mm 0/3] convert IOMMUs to use iova
Date: Wed, 7 Nov 2007 22:33:44 +0900	[thread overview]
Message-ID: <20071107223521D.tomof@acm.org> (raw)
In-Reply-To: <20071102171227.GH7975@rhun.haifa.ibm.com>

On Fri, 2 Nov 2007 19:12:27 +0200
Muli Ben-Yehuda <muli@il.ibm.com> wrote:

> On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:
> 
> > This patchset convert the PPC64 IOMMU to use the iova code for free
> > area management.
> > 
> > The IOMMUs ignores low level drivers' restrictions, the maximum
> > segment size and segment boundary.
> > 
> > I fixed the former:
> > 
> > http://thread.gmane.org/gmane.linux.scsi/35602
> > 
> > The latter makes the free area management complicated. I'd like to
> > convert IOMMUs to use the iova code (that intel-iommu introduced)
> > for free area management and enable iova to handle segment boundary
> > restrictions, rather than fixing all the IOMMUs' free area
> > management,
> 
> In general it sounds like a great idea, but have you looked at what
> impact this has on the performance of the IO path?

I converted swiotlb to use iova and compared it with the original
algorithm (better than the simple bit map one that most of the IOMMUs
use, I think).

I use 'swiotlb=force' boot option and run netperf with "-m 128 -D"
(lead to tons of dma_map_single).

The original produced 281.8 Mb/s and the iova produced 77.2 Mb/s.

Seems that it would be better to generalization the swiotlb algorithm
(at least for small I/O area)? Or my patch might have bugs.

Here's the patch to convert swiotlb to use iova against my iova
patchset:

http://marc.info/?l=linux-kernel&m=119402340801254&w=2

diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index c10d3f0..8735822 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -524,6 +524,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
+	select IOVA
 	help
 	  Support for software bounce buffers used on x86-64 systems
 	  which don't have a hardware IOMMU (e.g. the current generation
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index aa805b1..8507402 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -325,6 +325,9 @@ static int __init pci_iommu_init(void)
 	gart_iommu_init();
 #endif
 
+#ifdef CONFIG_SWIOTLB
+	swiotlb_alloc();
+#endif
 	no_iommu_init();
 	return 0;
 }
diff --git a/include/asm-x86/swiotlb.h b/include/asm-x86/swiotlb.h
index f9c5895..f00d20c 100644
--- a/include/asm-x86/swiotlb.h
+++ b/include/asm-x86/swiotlb.h
@@ -40,6 +40,7 @@ extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
 				   void *vaddr, dma_addr_t dma_handle);
 extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
 extern void swiotlb_init(void);
+extern void swiotlb_alloc(void);
 
 extern int swiotlb_force;
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1a8050a..54ecb87 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
+#include <linux/iova.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -103,10 +104,7 @@ static unsigned int io_tlb_index;
  */
 static unsigned char **io_tlb_orig_addr;
 
-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
+static struct iova_domain swiotlb_iovad;
 
 static int __init
 setup_io_tlb_npages(char *str)
@@ -272,6 +270,19 @@ cleanup1:
 	return -ENOMEM;
 }
 
+static struct kmem_cache *iova_cachep;
+
+void __init
+swiotlb_alloc(void)
+{
+	if (!swiotlb)
+		return;
+
+	iova_cachep = KMEM_CACHE(iova, 0);
+	init_iova_domain(&swiotlb_iovad, DMA_32BIT_MASK >> IO_TLB_SHIFT,
+			 iova_cachep);
+}
+
 static int
 address_needs_mapping(struct device *hwdev, dma_addr_t addr)
 {
@@ -288,70 +299,20 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr)
 static void *
 map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 {
-	unsigned long flags;
 	char *dma_addr;
-	unsigned int nslots, stride, index, wrap;
+	unsigned int nslots, index;
 	int i;
+	struct iova *iova;
 
-	/*
-	 * For mappings greater than a page, we limit the stride (and
-	 * hence alignment) to a page size.
-	 */
 	nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
-	if (size > PAGE_SIZE)
-		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
-	else
-		stride = 1;
-
 	BUG_ON(!nslots);
 
-	/*
-	 * Find suitable number of IO TLB entries size that will fit this
-	 * request and allocate a buffer from that IO TLB pool.
-	 */
-	spin_lock_irqsave(&io_tlb_lock, flags);
-	{
-		wrap = index = ALIGN(io_tlb_index, stride);
-
-		if (index >= io_tlb_nslabs)
-			wrap = index = 0;
-
-		do {
-			/*
-			 * If we find a slot that indicates we have 'nslots'
-			 * number of contiguous buffers, we allocate the
-			 * buffers from that slot and mark the entries as '0'
-			 * indicating unavailable.
-			 */
-			if (io_tlb_list[index] >= nslots) {
-				int count = 0;
-
-				for (i = index; i < (int) (index + nslots); i++)
-					io_tlb_list[i] = 0;
-				for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
-					io_tlb_list[i] = ++count;
-				dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
-				/*
-				 * Update the indices to avoid searching in
-				 * the next round.
-				 */
-				io_tlb_index = ((index + nslots) < io_tlb_nslabs
-						? (index + nslots) : 0);
-
-				goto found;
-			}
-			index += stride;
-			if (index >= io_tlb_nslabs)
-				index = 0;
-		} while (index != wrap);
-
-		spin_unlock_irqrestore(&io_tlb_lock, flags);
+	iova = alloc_iova(&swiotlb_iovad, nslots, io_tlb_nslabs - 1, 1);
+	if (!iova)
 		return NULL;
-	}
-  found:
-	spin_unlock_irqrestore(&io_tlb_lock, flags);
 
+	index = iova->pfn_lo;
+	dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
 	/*
 	 * Save away the mapping from the original address to the DMA address.
 	 * This is needed when we sync the memory.  Then we sync the buffer if
@@ -371,8 +332,6 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 static void
 unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 {
-	unsigned long flags;
-	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
 	char *buffer = io_tlb_orig_addr[index];
 
@@ -386,30 +345,7 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 		 */
 		memcpy(buffer, dma_addr, size);
 
-	/*
-	 * Return the buffer to the free list by setting the corresponding
-	 * entries to indicate the number of contigous entries available.
-	 * While returning the entries to the free list, we merge the entries
-	 * with slots below and above the pool being returned.
-	 */
-	spin_lock_irqsave(&io_tlb_lock, flags);
-	{
-		count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
-			 io_tlb_list[index + nslots] : 0);
-		/*
-		 * Step 1: return the slots to the free list, merging the
-		 * slots with superceeding slots
-		 */
-		for (i = index + nslots - 1; i >= index; i--)
-			io_tlb_list[i] = ++count;
-		/*
-		 * Step 2: merge the returned slots with the preceding slots,
-		 * if available (non zero)
-		 */
-		for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
-			io_tlb_list[i] = ++count;
-	}
-	spin_unlock_irqrestore(&io_tlb_lock, flags);
+	free_iova(&swiotlb_iovad, index);
 }
 
 static void
-- 
1.5.3.4

WARNING: multiple messages have this Message-ID (diff)

From: FUJITA Tomonori <tomof@acm.org>
To: muli@il.ibm.com
Cc: tomof@acm.org, linux-kernel@vger.kernel.org,
	linux-scsi@vger.kernel.org, James.Bottomley@SteelEye.com,
	jens.axboe@oracle.com, jeff@garzik.org,
	anil.s.keshavamurthy@intel.com, paulus@samba.org,
	anton@samba.org, olof@lixom.net, tony.luck@intel.com,
	davem@davemloft.net, kyle@parisc-linux.org,
	fujita.tomonori@lab.ntt.co.jp
Cc: fujita.tomonori@lab.ntt.co.jp
Subject: Re: [PATCH -mm 0/3] convert IOMMUs to use iova
Date: Wed, 7 Nov 2007 22:33:44 +0900	[thread overview]
Message-ID: <20071107223521D.tomof@acm.org> (raw)
In-Reply-To: <20071102171227.GH7975@rhun.haifa.ibm.com>

On Fri, 2 Nov 2007 19:12:27 +0200
Muli Ben-Yehuda <muli@il.ibm.com> wrote:

> On Sat, Nov 03, 2007 at 02:05:39AM +0900, FUJITA Tomonori wrote:
> 
> > This patchset convert the PPC64 IOMMU to use the iova code for free
> > area management.
> > 
> > The IOMMUs ignores low level drivers' restrictions, the maximum
> > segment size and segment boundary.
> > 
> > I fixed the former:
> > 
> > http://thread.gmane.org/gmane.linux.scsi/35602
> > 
> > The latter makes the free area management complicated. I'd like to
> > convert IOMMUs to use the iova code (that intel-iommu introduced)
> > for free area management and enable iova to handle segment boundary
> > restrictions, rather than fixing all the IOMMUs' free area
> > management,
> 
> In general it sounds like a great idea, but have you looked at what
> impact this has on the performance of the IO path?

I converted swiotlb to use iova and compared it with the original
algorithm (better than the simple bit map one that most of the IOMMUs
use, I think).

I use 'swiotlb=force' boot option and run netperf with "-m 128 -D"
(lead to tons of dma_map_single).

The original produced 281.8 Mb/s and the iova produced 77.2 Mb/s.

Seems that it would be better to generalization the swiotlb algorithm
(at least for small I/O area)? Or my patch might have bugs.

Here's the patch to convert swiotlb to use iova against my iova
patchset:

http://marc.info/?l=linux-kernel&m=119402340801254&w=2

diff --git a/arch/x86/Kconfig.x86_64 b/arch/x86/Kconfig.x86_64
index c10d3f0..8735822 100644
--- a/arch/x86/Kconfig.x86_64
+++ b/arch/x86/Kconfig.x86_64
@@ -524,6 +524,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
+	select IOVA
 	help
 	  Support for software bounce buffers used on x86-64 systems
 	  which don't have a hardware IOMMU (e.g. the current generation
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index aa805b1..8507402 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -325,6 +325,9 @@ static int __init pci_iommu_init(void)
 	gart_iommu_init();
 #endif
 
+#ifdef CONFIG_SWIOTLB
+	swiotlb_alloc();
+#endif
 	no_iommu_init();
 	return 0;
 }
diff --git a/include/asm-x86/swiotlb.h b/include/asm-x86/swiotlb.h
index f9c5895..f00d20c 100644
--- a/include/asm-x86/swiotlb.h
+++ b/include/asm-x86/swiotlb.h
@@ -40,6 +40,7 @@ extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
 				   void *vaddr, dma_addr_t dma_handle);
 extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
 extern void swiotlb_init(void);
+extern void swiotlb_alloc(void);
 
 extern int swiotlb_force;
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1a8050a..54ecb87 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
+#include <linux/iova.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -103,10 +104,7 @@ static unsigned int io_tlb_index;
  */
 static unsigned char **io_tlb_orig_addr;
 
-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
+static struct iova_domain swiotlb_iovad;
 
 static int __init
 setup_io_tlb_npages(char *str)
@@ -272,6 +270,19 @@ cleanup1:
 	return -ENOMEM;
 }
 
+static struct kmem_cache *iova_cachep;
+
+void __init
+swiotlb_alloc(void)
+{
+	if (!swiotlb)
+		return;
+
+	iova_cachep = KMEM_CACHE(iova, 0);
+	init_iova_domain(&swiotlb_iovad, DMA_32BIT_MASK >> IO_TLB_SHIFT,
+			 iova_cachep);
+}
+
 static int
 address_needs_mapping(struct device *hwdev, dma_addr_t addr)
 {
@@ -288,70 +299,20 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr)
 static void *
 map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 {
-	unsigned long flags;
 	char *dma_addr;
-	unsigned int nslots, stride, index, wrap;
+	unsigned int nslots, index;
 	int i;
+	struct iova *iova;
 
-	/*
-	 * For mappings greater than a page, we limit the stride (and
-	 * hence alignment) to a page size.
-	 */
 	nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
-	if (size > PAGE_SIZE)
-		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
-	else
-		stride = 1;
-
 	BUG_ON(!nslots);
 
-	/*
-	 * Find suitable number of IO TLB entries size that will fit this
-	 * request and allocate a buffer from that IO TLB pool.
-	 */
-	spin_lock_irqsave(&io_tlb_lock, flags);
-	{
-		wrap = index = ALIGN(io_tlb_index, stride);
-
-		if (index >= io_tlb_nslabs)
-			wrap = index = 0;
-
-		do {
-			/*
-			 * If we find a slot that indicates we have 'nslots'
-			 * number of contiguous buffers, we allocate the
-			 * buffers from that slot and mark the entries as '0'
-			 * indicating unavailable.
-			 */
-			if (io_tlb_list[index] >= nslots) {
-				int count = 0;
-
-				for (i = index; i < (int) (index + nslots); i++)
-					io_tlb_list[i] = 0;
-				for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
-					io_tlb_list[i] = ++count;
-				dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
-
-				/*
-				 * Update the indices to avoid searching in
-				 * the next round.
-				 */
-				io_tlb_index = ((index + nslots) < io_tlb_nslabs
-						? (index + nslots) : 0);
-
-				goto found;
-			}
-			index += stride;
-			if (index >= io_tlb_nslabs)
-				index = 0;
-		} while (index != wrap);
-
-		spin_unlock_irqrestore(&io_tlb_lock, flags);
+	iova = alloc_iova(&swiotlb_iovad, nslots, io_tlb_nslabs - 1, 1);
+	if (!iova)
 		return NULL;
-	}
-  found:
-	spin_unlock_irqrestore(&io_tlb_lock, flags);
 
+	index = iova->pfn_lo;
+	dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
 	/*
 	 * Save away the mapping from the original address to the DMA address.
 	 * This is needed when we sync the memory.  Then we sync the buffer if
@@ -371,8 +332,6 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 static void
 unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 {
-	unsigned long flags;
-	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
 	char *buffer = io_tlb_orig_addr[index];
 
@@ -386,30 +345,7 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 		 */
 		memcpy(buffer, dma_addr, size);
 
-	/*
-	 * Return the buffer to the free list by setting the corresponding
-	 * entries to indicate the number of contigous entries available.
-	 * While returning the entries to the free list, we merge the entries
-	 * with slots below and above the pool being returned.
-	 */
-	spin_lock_irqsave(&io_tlb_lock, flags);
-	{
-		count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
-			 io_tlb_list[index + nslots] : 0);
-		/*
-		 * Step 1: return the slots to the free list, merging the
-		 * slots with superceeding slots
-		 */
-		for (i = index + nslots - 1; i >= index; i--)
-			io_tlb_list[i] = ++count;
-		/*
-		 * Step 2: merge the returned slots with the preceding slots,
-		 * if available (non zero)
-		 */
-		for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
-			io_tlb_list[i] = ++count;
-	}
-	spin_unlock_irqrestore(&io_tlb_lock, flags);
+	free_iova(&swiotlb_iovad, index);
 }
 
 static void
-- 
1.5.3.4

next prev parent reply	other threads:[~2007-11-07 13:39 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-11-02 17:05 [PATCH -mm 0/3] convert IOMMUs to use iova FUJITA Tomonori
2007-11-02 17:05 ` FUJITA Tomonori
2007-11-02 17:05 ` [PATCH 1/3] move iova from drivers/pci/ to lib/ FUJITA Tomonori
2007-11-02 17:05   ` FUJITA Tomonori
2007-11-02 17:05 ` [PATCH 2/3] move iova cache code to iova.c FUJITA Tomonori
2007-11-02 17:05   ` FUJITA Tomonori
2007-11-02 17:05 ` [PATCH 3/3] POWERPC: convert the IOMMU to use iova FUJITA Tomonori
2007-11-02 17:05   ` FUJITA Tomonori
2007-11-02 17:12 ` [PATCH -mm 0/3] convert IOMMUs " Muli Ben-Yehuda
2007-11-02 18:11   ` FUJITA Tomonori
2007-11-02 18:11     ` FUJITA Tomonori
2007-11-07 13:33   ` FUJITA Tomonori [this message]
2007-11-07 13:33     ` FUJITA Tomonori

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c10d3f0 dfblob:8735822 dfblob:aa805b1 dfblob:8507402
dfblob:f9c5895 dfblob:f00d20c dfblob:1a8050a dfblob:54ecb87
dfblob:c10d3f0 dfblob:8735822 dfblob:aa805b1 dfblob:8507402
dfblob:f9c5895 dfblob:f00d20c dfblob:1a8050a dfblob:54ecb87 )
 OR (
bs:"Re: [PATCH -mm 0/3] convert IOMMUs to use iova" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071107223521D.tomof@acm.org \
    --to=tomof@acm.org \
    --cc=James.Bottomley@SteelEye.com \
    --cc=anil.s.keshavamurthy@intel.com \
    --cc=anton@samba.org \
    --cc=davem@davemloft.net \
    --cc=fujita.tomonori@lab.ntt.co.jpfujita.tomonori \
    --cc=jeff@garzik.org \
    --cc=jens.axboe@oracle.com \
    --cc=kyle@parisc-linux.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=muli@il.ibm.com \
    --cc=olof@lixom.net \
    --cc=paulus@samba.org \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.