* [PATCH] iommu/amd: Fix for L2 race with VM invalidation
From: suravee.suthikulpanit @ 2014-05-14 6:34 UTC
To: iommu, joro
Cc: sherry.hurwitz, kim.naru, linux-kernel, Jay Cornwall,
Suravee Suthikulpanit
From: Jay Cornwall <jay.cornwall@amd.com>
Do not disassociate the process page tables from a PASID during VM
invalidation. Instead, invalidate the IOMMU TLB and IOTLBs when the VM
invalidation begins (from the invalidate_range_start notifier).

L2 translations may fail during VM range invalidation. The current
implementation associates an empty page table with the PASID for the
duration of the critical section to avoid racing with the VM. This causes
all translations for that PASID to fail unconditionally during this period.
A low probability race exists with this fix. Translations received
within the critical section to PTEs which are concurrently being
invalidated may resolve to stale mappings.
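Schematically, the remaining window is (illustrative timeline, callback
names as in the mmu_notifier API):

  CPU (VM invalidation)                Peripheral (through IOMMU)
  ---------------------                --------------------------
  invalidate_range_start()
    -> IOMMU TLB/IOTLBs flushed
                                       translation request walks the
                                       still-live PTEs and caches the
                                       result in the IOTLB
  VM clears/updates the PTEs           stale IOTLB entry persists
  invalidate_range_end()               DMA may use the stale mapping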
Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd_iommu.c | 40 +++++++++++++++++++++++++++++++++++----
drivers/iommu/amd_iommu_proto.h | 2 ++
drivers/iommu/amd_iommu_v2.c | 40 ++++++---------------------------------
3 files changed, 44 insertions(+), 38 deletions(-)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index c949520..da43985 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3667,12 +3667,12 @@ out:
return ret;
}
-static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
- u64 address)
+static int __amd_iommu_flush_pages(struct protection_domain *domain, int pasid,
+ u64 address, bool size)
{
INC_STATS_COUNTER(invalidate_iotlb);
- return __flush_pasid(domain, pasid, address, false);
+ return __flush_pasid(domain, pasid, address, size);
}
int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
@@ -3683,13 +3683,45 @@ int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
int ret;
spin_lock_irqsave(&domain->lock, flags);
- ret = __amd_iommu_flush_page(domain, pasid, address);
+ ret = __amd_iommu_flush_pages(domain, pasid, address, false);
spin_unlock_irqrestore(&domain->lock, flags);
return ret;
}
EXPORT_SYMBOL(amd_iommu_flush_page);
+int amd_iommu_flush_page_range(struct iommu_domain *dom, int pasid,
+ u64 start, u64 end)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long flags;
+ unsigned long pages;
+ int ret;
+ u64 addr;
+ bool size;
+
+ pages = iommu_num_pages(start, end - start, PAGE_SIZE);
+
+ /*
+ * If we have to flush more than one page, flush all
+ * pages for this PASID
+ */
+ if (pages > 1) {
+ addr = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ size = true;
+ } else {
+ addr = start;
+ size = false;
+ }
+
+ spin_lock_irqsave(&domain->lock, flags);
+ ret = __amd_iommu_flush_pages(domain, pasid, addr, size);
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(amd_iommu_flush_page_range);
+
static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
{
INC_STATS_COUNTER(invalidate_iotlb_all);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 95ed6de..3919dfc 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -50,6 +50,8 @@ extern void amd_iommu_domain_direct_map(struct iommu_domain *dom);
extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids);
extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
u64 address);
+extern int amd_iommu_flush_page_range(struct iommu_domain *dom, int pasid,
+ u64 start, u64 end);
extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid);
extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
unsigned long cr3);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 5208828..592de33f 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -90,13 +90,6 @@ static DEFINE_SPINLOCK(ps_lock);
static struct workqueue_struct *iommu_wq;
-/*
- * Empty page table - Used between
- * mmu_notifier_invalidate_range_start and
- * mmu_notifier_invalidate_range_end
- */
-static u64 *empty_page_table;
-
static void free_pasid_states(struct device_state *dev_state);
static void unbind_pasid(struct device_state *dev_state, int pasid);
static int task_exit(struct notifier_block *nb, unsigned long e, void *data);
@@ -443,22 +436,12 @@ static void mn_invalidate_range_start(struct mmu_notifier *mn,
pasid_state = mn_to_state(mn);
dev_state = pasid_state->device_state;
- amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid,
- __pa(empty_page_table));
-}
-
-static void mn_invalidate_range_end(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct pasid_state *pasid_state;
- struct device_state *dev_state;
-
- pasid_state = mn_to_state(mn);
- dev_state = pasid_state->device_state;
-
- amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid,
- __pa(pasid_state->mm->pgd));
+ /*
+ * FIXME: A low-probability race remains if the IOTLB is
+ * actively using translations for the page range being invalidated
+ */
+ amd_iommu_flush_page_range(dev_state->domain,
+ pasid_state->pasid, start, end);
}
static struct mmu_notifier_ops iommu_mn = {
@@ -466,7 +449,6 @@ static struct mmu_notifier_ops iommu_mn = {
.change_pte = mn_change_pte,
.invalidate_page = mn_invalidate_page,
.invalidate_range_start = mn_invalidate_range_start,
- .invalidate_range_end = mn_invalidate_range_end,
};
static void set_pri_tag_status(struct pasid_state *pasid_state,
@@ -947,19 +929,11 @@ static int __init amd_iommu_v2_init(void)
if (iommu_wq == NULL)
goto out_free;
- ret = -ENOMEM;
- empty_page_table = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (empty_page_table == NULL)
- goto out_destroy_wq;
-
amd_iommu_register_ppr_notifier(&ppr_nb);
profile_event_register(PROFILE_TASK_EXIT, &profile_nb);
return 0;
-out_destroy_wq:
- destroy_workqueue(iommu_wq);
-
out_free:
free_pages((unsigned long)state_table, get_order(state_table_size));
@@ -1000,8 +974,6 @@ static void __exit amd_iommu_v2_exit(void)
state_table_size = MAX_DEVICES * sizeof(struct device_state *);
free_pages((unsigned long)state_table, get_order(state_table_size));
-
- free_page((unsigned long)empty_page_table);
}
module_init(amd_iommu_v2_init);
--
1.7.10.4
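For reference, the size argument added above selects between the IOMMU's
single-page and all-pages invalidation encodings; a minimal sketch of the
two call shapes as used inside amd_iommu.c above
(CMD_INV_IOMMU_ALL_PAGES_ADDRESS is the existing magic address from
amd_iommu_types.h):

	/* One page: size bit clear, address selects the page. */
	ret = __amd_iommu_flush_pages(domain, pasid, address, false);

	/* All pages for the PASID: size bit set, combined with the
	 * all-pages magic address. */
	ret = __amd_iommu_flush_pages(domain, pasid,
				      CMD_INV_IOMMU_ALL_PAGES_ADDRESS, true);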
* Re: [PATCH] iommu/amd: Fix for L2 race with VM invalidation
From: Joerg Roedel @ 2014-05-14 9:29 UTC
To: suravee.suthikulpanit
Cc: iommu, sherry.hurwitz, kim.naru, linux-kernel, Jay Cornwall
On Wed, May 14, 2014 at 01:34:12AM -0500, suravee.suthikulpanit@amd.com wrote:
> A low probability race exists with this fix. Translations received
> within the critical section to PTEs which are concurrently being
> invalidated may resolve to stale mappings.
Sorry, no. This patch can cause silent data corruption when the pages in
these stale mappings get reused. I understand that the current way does
not work either and can cause failures in the GPU, but silent data
corruption is worse than that.
How about putting the page-fault requests on hold between
invalidate_range_start/end()? The GPU will not see the failures, just a
longer delay until the page faults are handled.
Joerg
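A minimal sketch of that suggestion, assuming a simple global counter and
wait queue (all names below are illustrative, not from any posted patch;
a real implementation would also need to serialize against a new
range_start arriving after the check):

	static atomic_t inv_count = ATOMIC_INIT(0);
	static DECLARE_WAIT_QUEUE_HEAD(inv_wq);

	/* From mn_invalidate_range_start(): open the hold window. */
	static void hold_ppr_completions(void)
	{
		atomic_inc(&inv_count);
	}

	/* From mn_invalidate_range_end(): close it and wake waiters. */
	static void release_ppr_completions(void)
	{
		if (atomic_dec_and_test(&inv_count))
			wake_up(&inv_wq);
	}

	/* In the PPR fault worker, before resolving the fault and
	 * sending the completion back to the device: */
	wait_event(inv_wq, atomic_read(&inv_count) == 0);

A per-PASID counter would narrow the stall to the address space actually
being invalidated.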
* RE: [PATCH] iommu/amd: Fix for L2 race with VM invalidation
From: Cornwall, Jay @ 2014-05-14 11:38 UTC
To: Joerg Roedel, Suthikulpanit, Suravee
Cc: iommu@lists.linux-foundation.org, Hurwitz, Sherry, Naru, Kim,
linux-kernel@vger.kernel.org
Hi,
I'm not sure why you're submitting this, Suravee?
We already agreed that we need the extra mmu_notifier point, proposed by Joerg back in 2011, to eliminate the race. I had thought we were waiting on that to be implemented.
* Re: [PATCH] iommu/amd: Fix for L2 race with VM invalidation
From: Joerg Roedel @ 2014-05-20 13:32 UTC
To: Cornwall, Jay
Cc: Suthikulpanit, Suravee, iommu@lists.linux-foundation.org,
Hurwitz, Sherry, Naru, Kim, linux-kernel@vger.kernel.org
On Wed, May 14, 2014 at 11:38:25AM +0000, Cornwall, Jay wrote:
> Hi,
>
> I'm not sure why you're submitting this, Suravee?
>
> We already agreed that we need the extra mmu_notifier point, proposed
> by Joerg back in 2011, to eliminate the race. I had thought we were
> waiting on that to be implemented.
Speaking of this, would it also be possible to hold back all faults (so
not sending back ppr-completions) between invalidate_range_start/end()?
Or will the hardware run into a timeout when it takes too long to
process a fault, or when it has issued the maximum number of parallel faults?
Regards,
Joerg
* RE: [PATCH] iommu/amd: Fix for L2 race with VM invalidation
From: Cornwall, Jay @ 2014-05-20 13:53 UTC
To: Joerg Roedel
Cc: Suthikulpanit, Suravee, iommu@lists.linux-foundation.org,
Hurwitz, Sherry, Naru, Kim, linux-kernel@vger.kernel.org
> From: Joerg Roedel [mailto:joro@8bytes.org]
> Sent: Tuesday, May 20, 2014 08:33
>
> Speaking of this, would it also be possible to hold back all faults (so not
> sending back ppr-completions) between invalidate_range_start/end()?
> Or will the hardware run into a timeout when it takes too long to process a
> fault, or when it has issued the maximum number of parallel faults?
I haven't located a comment on this in the HW documentation. The ATC will
set an MMIO register flag if a response takes more than 2^32 cycles (I'm
not certain of the clock frequency). I don't know if this is fatal. I have
experimented with adding significant delays into the PPR handler in the
past without affecting test cases.

My larger concern with this design is that the only situations in which MM
and IOTLB activity conflict are those where the IOTLB client is racing
with munmap() or physical page relocation. Both are low-probability
scenarios. However, blocking PPR completions for all MM invalidations
would impact all IOTLB client activity. This is particularly true for
heterogeneous applications.