From: mark gross <mgross@linux.intel.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: lkml <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH]iommu-iotlb-flushing
Date: Mon, 25 Feb 2008 08:28:51 -0800 [thread overview]
Message-ID: <20080225162851.GC22018@linux.intel.com> (raw)
In-Reply-To: <20080223000517.232704f3.akpm@linux-foundation.org>
On Sat, Feb 23, 2008 at 12:05:17AM -0800, Andrew Morton wrote:
> On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <mgross@linux.intel.com> wrote:
>
> > The following patch is for batching up the flushing of the IOTLB for
> > the DMAR implementation found in the Intel VT-d hardware. It works by
> > building a list of to be flushed IOTLB entries and a bitmap list of
> > which DMAR engine they are from.
> >
> > After either a high water mark (250 accessible via debugfs) or 10ms the
> > list of iova's will be reclaimed and the DMAR engines associated are
> > IOTLB-flushed.
> >
> > This approach recovers 15 to 20% of the performance lost when using the
> > IOMMU for my netperf udp stream benchmark with small packets. It can be
> > disabled with a kernel boot parameter
> > "intel_iommu=strict".
> >
> > Its use does weaken the IOMMU protections a bit.
> >
> > I would like to see this go into MM for a while and then onto mainline.
> >
> > ...
> >
> > +static struct timer_list unmap_timer =
> > + TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
>
> Could use DEFINE_TIMER here.
ok
>
> > +struct unmap_list {
> > + struct list_head list;
> > + struct dmar_domain *domain;
> > + struct iova *iova;
> > +};
>
> unmap_list doens't seem to be used anywhere?
oops, left over from earlier version.
>
> > +static struct intel_iommu *g_iommus;
> > +/* bitmap for indexing intel_iommus */
> > +static unsigned long *g_iommus_to_flush;
> > +static int g_num_of_iommus;
> > +
> > +static DEFINE_SPINLOCK(async_umap_flush_lock);
> > +static LIST_HEAD(unmaps_to_do);
> > +
> > +static int timer_on;
> > +static long list_size;
> > +static int high_watermark;
> > +
> > +static struct dentry *intel_iommu_debug, *debug;
> > +
> > +
> > static void domain_remove_dev_info(struct dmar_domain *domain);
> >
> > static int dmar_disabled;
> > static int __initdata dmar_map_gfx = 1;
> > static int dmar_forcedac;
> > +static int intel_iommu_strict;
> >
> > #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> > static DEFINE_SPINLOCK(device_domain_lock);
> > @@ -73,9 +103,13 @@
> > printk(KERN_INFO
> > "Intel-IOMMU: disable GFX device mapping\n");
> > } else if (!strncmp(str, "forcedac", 8)) {
> > - printk (KERN_INFO
> > + printk(KERN_INFO
> > "Intel-IOMMU: Forcing DAC for PCI devices\n");
> > dmar_forcedac = 1;
> > + } else if (!strncmp(str, "strict", 8)) {
>
> s/8/6/
ack.
>
> > + printk(KERN_INFO
> > + "Intel-IOMMU: disable batched IOTLB flush\n");
> > + intel_iommu_strict = 1;
> > }
> >
> > str += strcspn(str, ",");
> > @@ -965,17 +999,13 @@
> > set_bit(0, iommu->domain_ids);
> > return 0;
> > }
> > -
> > -static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
> > +static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
> > + struct dmar_drhd_unit *drhd)
> > {
> > - struct intel_iommu *iommu;
> > int ret;
> > int map_size;
> > u32 ver;
> >
> > - iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> > - if (!iommu)
> > - return NULL;
> > iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
> > if (!iommu->reg) {
> > printk(KERN_ERR "IOMMU: can't map the region\n");
> > @@ -1396,7 +1426,7 @@
> > int index;
> >
> > while (dev) {
> > - for (index = 0; index < cnt; index ++)
> > + for (index = 0; index < cnt; index++)
> > if (dev == devices[index])
> > return 1;
> >
> > @@ -1661,7 +1691,7 @@
> > struct dmar_rmrr_unit *rmrr;
> > struct pci_dev *pdev;
> > struct intel_iommu *iommu;
> > - int ret, unit = 0;
> > + int nlongs, i, ret, unit = 0;
> >
> > /*
> > * for each drhd
> > @@ -1672,7 +1702,30 @@
> > for_each_drhd_unit(drhd) {
> > if (drhd->ignored)
> > continue;
> > - iommu = alloc_iommu(drhd);
> > + g_num_of_iommus++;
>
> No locking needed for g_num_of_iommus?
>
I'll double check if its needed, but it wouldn't hurt. This code is on
the kernel startup / init path.
> > + }
> > +
> > + nlongs = BITS_TO_LONGS(g_num_of_iommus);
>
> Would this code be neater if it used the <linux/bitmap.h> stuff?
I'll look into it.
>
> > + g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> > + if (!g_iommus_to_flush) {
> > + printk(KERN_ERR "Intel-IOMMU: "
> > + "Allocating bitmap array failed\n");
> > + return -ENOMEM;
>
> Are you sure we aren't leaking anything here? Like the alloc_iommu() above?
Once you set up the IOMMU's you never take them down or re-set them up.
This code runs one and one time at boot up.
>
> > + }
> > +
> > + g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> > + if (!g_iommus) {
> > + kfree(g_iommus_to_flush);
> > + ret = -ENOMEM;
> > + goto error;
> > + }
> > +
> > + i = 0;
> > + for_each_drhd_unit(drhd) {
> > + if (drhd->ignored)
> > + continue;
> > + iommu = alloc_iommu(&g_iommus[i], drhd);
> > + i++;
> > if (!iommu) {
> > ret = -ENOMEM;
> > goto error;
> > @@ -1705,7 +1758,6 @@
> > * endfor
> > */
> > for_each_rmrr_units(rmrr) {
> > - int i;
> > for (i = 0; i < rmrr->devices_cnt; i++) {
> > pdev = rmrr->devices[i];
> > /* some BIOS lists non-exist devices in DMAR table */
> > @@ -1909,6 +1961,54 @@
> > return 0;
> > }
> >
> > +static void flush_unmaps(void)
> > +{
> > + struct iova *node, *n;
> > + unsigned long flags;
> > + int i;
> > +
> > + spin_lock_irqsave(&async_umap_flush_lock, flags);
> > + timer_on = 0;
> > +
> > + /*just flush them all*/
>
> I'm surprised that checkpatch didn't grump about the odd commenting style.
It didn't. What's odd about the comment style here?
>
> > + for (i = 0; i < g_num_of_iommus; i++) {
> > + if (test_and_clear_bit(i, g_iommus_to_flush))
> > + iommu_flush_iotlb_global(&g_iommus[i], 0);
> > + }
> > +
> > + list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> > + /* free iova */
> > + list_del(&node->list);
> > + __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> > +
> > + }
> > + list_size = 0;
> > + spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> > +static void flush_unmaps_timeout(unsigned long data)
> > +{
> > + flush_unmaps();
> > +}
> > +
> > +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> > +{
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&async_umap_flush_lock, flags);
>
> How scalable is this?
Not very. But, its better than blocking on the hardware poll IOTLB
flush operation on every unmap. Is there a lock less way to insert
into this list? I suppose I could have one lock per DMAR-engine but,
that would still have the scalability issue. (perhaps a list per IOVA?
/me needs to think on this a bit)
The best way is to get the network stack to reuse dma buffers when using
an iommu and avoid the unmap operation altogether. But thats a longer
term goal.
>
> > + iova->dmar = dom;
> > + list_add(&iova->list, &unmaps_to_do);
> > + set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> > +
> > + if (!timer_on) {
> > + unmap_timer.expires = jiffies + msecs_to_jiffies(10);
> > + mod_timer(&unmap_timer, unmap_timer.expires);
>
> No, this modifies unmap_timer.expires twice. Might be racy too. You want
>
> mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
ack
>
> > + timer_on = 1;
> > + }
> > + list_size++;
> > + spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> > static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
> > size_t size, int dir)
> > {
> > @@ -1936,13 +2036,21 @@
> > dma_pte_clear_range(domain, start_addr, start_addr + size);
> > /* free page tables */
> > dma_pte_free_pagetable(domain, start_addr, start_addr + size);
> > -
> > - if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
> > - size >> PAGE_SHIFT_4K, 0))
> > - iommu_flush_write_buffer(domain->iommu);
> > -
> > - /* free iova */
> > - __free_iova(&domain->iovad, iova);
> > + if (intel_iommu_strict) {
> > + if (iommu_flush_iotlb_psi(domain->iommu,
> > + domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
> > + iommu_flush_write_buffer(domain->iommu);
> > + /* free iova */
> > + __free_iova(&domain->iovad, iova);
> > + } else {
> > + add_unmap(domain, iova);
> > + /*
> > + * queue up the release of the unmap to save the 1/6th of the
> > + * cpu used up by the iotlb flush operation...
> > + */
> > + if (list_size > high_watermark)
> > + flush_unmaps();
> > + }
> > }
> >
> > static void * intel_alloc_coherent(struct device *hwdev, size_t size,
> > @@ -2266,6 +2374,10 @@
> > if (dmar_table_init())
> > return -ENODEV;
> >
> > + high_watermark = 250;
> > + intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
> > + debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
> > + intel_iommu_debug, &high_watermark);
> > iommu_init_mempool();
> > dmar_init_reserved_ranges();
> >
> > @@ -2281,6 +2393,7 @@
> > printk(KERN_INFO
> > "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
> >
> > + init_timer(&unmap_timer);
>
> I see timers being added, but I see no del_timer_sync()s added on cleanup
> paths. Are you sure that we don't have races on various teardown paths?
>
This code doesn't really tear down well. However; at this point in
intel_iommu_init there is no further error paths to put tear down code.
> > force_iommu = 1;
> > dma_ops = &intel_dma_ops;
> > return 0;
> > Index: linux-2.6.24-mm1/drivers/pci/iova.h
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/drivers/pci/iova.h 2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/drivers/pci/iova.h 2008-02-12 07:39:53.000000000 -0800
> > @@ -23,6 +23,8 @@
> > struct rb_node node;
> > unsigned long pfn_hi; /* IOMMU dish out addr hi */
> > unsigned long pfn_lo; /* IOMMU dish out addr lo */
> > + struct list_head list;
> > + void *dmar;
> > };
> >
> > /* holds all the iova translations for a domain */
> > Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-13 11:17:22.000000000 -0800
> > @@ -822,6 +822,10 @@
> > than 32 bit addressing. The default is to look
> > for translation below 32 bit and if not available
> > then look in the higher range.
> > + strict [Default Off]
> > + With this option on every unmap_single operation will
> > + result in a hardware IOTLB flush operation as opposed
> > + to batching them for performance.
>
> boot-time options suck. Is it not possible to tweak this at runtime?
Yes they do. There may be a way to enable / disable this behavior at
runtime. Let me think on it a bit.
Thank you for looking at this!
--mgross
next prev parent reply other threads:[~2008-02-25 16:40 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-02-21 0:06 [PATCH]iommu-iotlb-flushing mark gross
2008-02-23 8:05 ` [PATCH]iommu-iotlb-flushing Andrew Morton
2008-02-25 16:28 ` mark gross [this message]
2008-02-25 18:40 ` [PATCH]iommu-iotlb-flushing Andrew Morton
2008-02-29 23:18 ` [PATCH]iommu-iotlb-flushing mark gross
2008-03-01 5:54 ` [PATCH]iommu-iotlb-flushing Greg KH
2008-03-01 7:10 ` [PATCH]iommu-iotlb-flushing Grant Grundler
2008-03-03 18:34 ` [PATCH]iommu-iotlb-flushing mark gross
2008-03-05 18:23 ` [PATCH]iommu-iotlb-flushing Grant Grundler
2008-03-05 23:01 ` [PATCH] Use an array instead of a list for deffered intel-iommu iotlb flushing [PATCH]iommu-iotlb-flushing mark gross
2008-03-08 17:20 ` Grant Grundler
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080225162851.GC22018@linux.intel.com \
--to=mgross@linux.intel.com \
--cc=akpm@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.