* [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
@ 2015-10-20 15:52 David Woodhouse
2015-10-20 16:03 ` Joerg Roedel
0 siblings, 1 reply; 10+ messages in thread
From: David Woodhouse @ 2015-10-20 15:52 UTC (permalink / raw)
To: linux-mm@kvack.org; +Cc: iommu, Sudeep Dutt
[-- Attachment #1: Type: text/plain, Size: 6764 bytes --]
On top of the tree at git.infradead.org/users/dwmw2/linux-svm.git
(http:// or git://).
For userspace addresses, we use the MMU notifiers and flush the IOTLB
as appropriate.
However, we need to do it for kernel addresses too — which basically
means adding a hook to tlb_flush_kernel_range(). Does this look
reasonable? I was trying to avoid it and insist on supporting addresses
within the kernel's static mapping only. But it doesn't look like
that's a reasonable thing to require.
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
arch/x86/mm/tlb.c | 2 ++
drivers/iommu/intel-svm.c | 37 ++++++++++++++++++++++++++++++++++---
include/linux/intel-iommu.h | 6 +++++-
include/linux/intel-svm.h | 13 +++++--------
4 files changed, 46 insertions(+), 12 deletions(-)
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 0a48ccf..61d9533 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -44,14 +44,11 @@ struct svm_dev_ops {
/*
* The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only
- * for access to kernel addresses. No IOTLB flushes are automatically done
- * for kernel mappings; it is valid only for access to the kernel's static
- * 1:1 mapping of physical memory — not to vmalloc or even module mappings.
- * A future API addition may permit the use of such ranges, by means of an
- * explicit IOTLB flush call (akin to the DMA API's unmap method).
- *
- * It is unlikely that we will ever hook into flush_tlb_kernel_range() to
- * do such IOTLB flushes automatically.
+ * for access to kernel addresses. IOTLB flushes are performed as required
+ * by means of a hook from flush_tlb_kernel_range(). This flag is mutually
+ * exclusive with the SVM_FLAG_PRIVATE_PASID flag — there can be only one
+ * PASID used for kernel mode, to keep the performance implications of the
+ * IOTLB flush hook relatively sane.
*/
#define SVM_FLAG_SUPERVISOR_MODE (1<<1)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0..40ebe83 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,6 +6,7 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
+#include <linux/intel-iommu.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -266,6 +267,7 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
+ intel_iommu_flush_kernel_pasid(start, end);
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index a584df0..f8ca3c1 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -23,6 +23,7 @@
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
+#include <asm/tlbflush.h>
static irqreturn_t prq_event_thread(int irq, void *d);
@@ -264,6 +265,26 @@ static const struct mmu_notifier_ops intel_mmuops = {
.invalidate_range = intel_invalidate_range,
};
+void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ unsigned long pages;
+
+ if (end == TLB_FLUSH_ALL)
+ pages = end;
+ else
+ pages = (end - start) >> VTD_PAGE_SHIFT;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ struct intel_svm *svm = rcu_dereference(iommu->kernel_svm);
+ if (svm)
+ intel_flush_svm_range(svm, start, pages, 0, 1);
+ }
+ rcu_read_unlock();
+}
+
static DEFINE_MUTEX(pasid_mutex);
int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
@@ -286,6 +307,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
pasid_max = 1 << 20;
if ((flags & SVM_FLAG_SUPERVISOR_MODE)) {
+ if (flags & SVM_FLAG_PRIVATE_PASID)
+ return -EINVAL;
if (!ecap_srs(iommu->ecap))
return -EINVAL;
} else if (pasid) {
@@ -294,7 +317,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
}
mutex_lock(&pasid_mutex);
- if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
+ if (SVM_FLAG_SUPERVISOR_MODE)
+ svm = iommu->kernel_svm;
+ else if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
int i;
idr_for_each_entry(&iommu->pasid_idr, svm, i) {
@@ -378,8 +403,10 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
}
iommu->pasid_table[svm->pasid].val = (u64)__pa(mm->pgd) | 1;
mm = NULL;
- } else
+ } else {
iommu->pasid_table[svm->pasid].val = (u64)__pa(init_mm.pgd) | 1 | (1ULL << 11);
+ rcu_assign_pointer(iommu->kernel_svm, svm);
+ }
wmb();
}
list_add_rcu(&sdev->list, &svm->devs);
@@ -432,8 +459,12 @@ int intel_svm_unbind_mm(struct device *dev, int pasid)
mmu_notifier_unregister(&svm->notifier, svm->mm);
idr_remove(&svm->iommu->pasid_idr, svm->pasid);
- if (svm->mm)
+ if (svm->mm) {
mmput(svm->mm);
+ } else {
+ rcu_assign_pointer(iommu->kernel_svm, NULL);
+ synchronize_rcu();
+ }
/* We mandate that no page faults may be outstanding
* for the PASID when intel_svm_unbind_mm() is called.
* If that is not obeyed, subtle errors will happen.
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 821273c..169bc84 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -391,6 +391,7 @@ enum {
struct pasid_entry;
struct pasid_state_entry;
struct page_req_dsc;
+struct intel_svm;
struct intel_iommu {
void __iomem *reg; /* Pointer to hardware regs, virtual addr */
@@ -426,6 +427,7 @@ struct intel_iommu {
struct page_req_dsc *prq;
unsigned char prq_name[16]; /* Name for PRQ interrupt */
struct idr pasid_idr;
+ struct intel_svm __rcu *kernel_svm;
#endif
struct q_inval *qi; /* Queued invalidation info */
u32 *iommu_state; /* Store iommu states between suspend and resume.*/
@@ -496,8 +498,10 @@ struct intel_svm {
extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev);
extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev);
+extern void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end);
+#else
+#define intel_iommu_flush_kernel_pasid(start, end) do { ; } while(0)
#endif
-
extern const struct attribute_group *intel_iommu_groups[];
#endif
--
David Woodhouse Open Source Technology Centre
David.Woodhouse@intel.com Intel Corporation
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5691 bytes --]
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
2015-10-20 15:52 [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses David Woodhouse
@ 2015-10-20 16:03 ` Joerg Roedel
2015-10-20 16:17 ` David Woodhouse
0 siblings, 1 reply; 10+ messages in thread
From: Joerg Roedel @ 2015-10-20 16:03 UTC (permalink / raw)
To: David Woodhouse; +Cc: linux-mm@kvack.org, iommu, Sudeep Dutt
Hi David,
On Tue, Oct 20, 2015 at 04:52:59PM +0100, David Woodhouse wrote:
> void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> {
> + intel_iommu_flush_kernel_pasid(start, end);
A more generic naming would be good, and probably expose it through a
function in the IOMMU-API.
> +void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end)
> +{
> + struct dmar_drhd_unit *drhd;
> + struct intel_iommu *iommu;
> + unsigned long pages;
And I think, as a performance optimiztion, we should bail out early here
if the pasid has no users.
Joerg
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
2015-10-20 16:03 ` Joerg Roedel
@ 2015-10-20 16:17 ` David Woodhouse
2015-10-23 10:20 ` Joerg Roedel
0 siblings, 1 reply; 10+ messages in thread
From: David Woodhouse @ 2015-10-20 16:17 UTC (permalink / raw)
To: Joerg Roedel; +Cc: linux-mm@kvack.org, iommu, Sudeep Dutt
[-- Attachment #1: Type: text/plain, Size: 2049 bytes --]
On Tue, 2015-10-20 at 18:03 +0200, Joerg Roedel wrote:
> Hi David,
>
> On Tue, Oct 20, 2015 at 04:52:59PM +0100, David Woodhouse wrote:
> > void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> > {
> > +> > > > intel_iommu_flush_kernel_pasid(start, end);
>
> A more generic naming would be good, and probably expose it through a
> function in the IOMMU-API.
Yeah. *All* the SVM functionality needs to be exposed through the DMA
API (for native drivers like i915) and the IOMMU API (so VM guests can
see an IOMMU and do SVM on assigned devices).
Can we assume that only one type of SVM-capable IOMMU will be present
in the system at a time? Perhaps we could just register a single
function (intel_iommu_flush_kernel_pasid in the VT-d case) to be used
as a notifier callback from tlb_flush_kernel_range()? Rather than the
overhead of a *list* of notifiers?
> > +void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end)
> > +{
> > +> > > > struct dmar_drhd_unit *drhd;
> > +> > > > struct intel_iommu *iommu;
> > +> > > > unsigned long pages;
>
> And I think, as a performance optimiztion, we should bail out early here
> if the pasid has no users.
If the PASID has no users, it won't exist (iommu->kernel_svm will be
NULL). We still have to walk each IOMMU to see if it has one.
But... that's because the PASID-space is currently per-IOMMU. The plan
is to have a *single* PASID-space system-wide, And then I still want to
retain the property that there can be only *one* kernel PASID.
I have forbidden the use of a given PASID to access *both* kernel and
user addresses. I'm hoping we can get away with putting that
restriction into the generic SVM APIs.
So yeah, perhaps we can set the notifier pointer to NULL when there's
no kernel PASID assigned, and only set it to point to
${MY_IOMMU}_flush_kernel_pasid() if/when there *is* one?
That way, tlb_flush_kernel_range() doesn't even need to make the call
when there's no work to do...
--
dwmw2
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5691 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
2015-10-20 16:17 ` David Woodhouse
@ 2015-10-23 10:20 ` Joerg Roedel
2015-10-23 10:33 ` David Woodhouse
0 siblings, 1 reply; 10+ messages in thread
From: Joerg Roedel @ 2015-10-23 10:20 UTC (permalink / raw)
To: David Woodhouse; +Cc: linux-mm@kvack.org, iommu, Sudeep Dutt
On Tue, Oct 20, 2015 at 05:17:04PM +0100, David Woodhouse wrote:
> Can we assume that only one type of SVM-capable IOMMU will be present
> in the system at a time? Perhaps we could just register a single
> function (intel_iommu_flush_kernel_pasid in the VT-d case) to be used
> as a notifier callback from tlb_flush_kernel_range()? Rather than the
> overhead of a *list* of notifiers?
Yes, a single notifier is certainly preferable to a list. It is just
too easy for others to attach to this list silently and adding more
overhead to kernel TLB flushing.
> But... that's because the PASID-space is currently per-IOMMU. The plan
> is to have a *single* PASID-space system-wide, And then I still want to
> retain the property that there can be only *one* kernel PASID.
That makes a lot of sense. Then we can check in the call-back simply if
this pasid has users and bail out early when not.
> I have forbidden the use of a given PASID to access *both* kernel and
> user addresses. I'm hoping we can get away with putting that
> restriction into the generic SVM APIs.
We have to, having kernel-pasids already nullifies all protection the
IOMMU provides, giving kernel-access to a process-pasid is security wise
equivalent to accessing /dev/mem.
> So yeah, perhaps we can set the notifier pointer to NULL when there's
> no kernel PASID assigned, and only set it to point to
> ${MY_IOMMU}_flush_kernel_pasid() if/when there *is* one?
That sounds like it needs some clever locking. Instead of checking the
function pointer it is probably easier to put the check for pasid-users
into an inline function and just do the real flush-call only when
necessary.
Joerg
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
2015-10-23 10:20 ` Joerg Roedel
@ 2015-10-23 10:33 ` David Woodhouse
[not found] ` <1445596413.4113.175.camel-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: David Woodhouse @ 2015-10-23 10:33 UTC (permalink / raw)
To: Joerg Roedel; +Cc: linux-mm@kvack.org, iommu, Sudeep Dutt
[-- Attachment #1: Type: text/plain, Size: 5388 bytes --]
On Fri, 2015-10-23 at 12:20 +0200, Joerg Roedel wrote:
> On Tue, Oct 20, 2015 at 05:17:04PM +0100, David Woodhouse wrote:
> > Can we assume that only one type of SVM-capable IOMMU will be present
> > in the system at a time? Perhaps we could just register a single
> > function (intel_iommu_flush_kernel_pasid in the VT-d case) to be used
> > as a notifier callback from tlb_flush_kernel_range()? Rather than the
> > overhead of a *list* of notifiers?
>
> Yes, a single notifier is certainly preferable to a list. It is just
> too easy for others to attach to this list silently and adding more
> overhead to kernel TLB flushing.
Yeah. It's easy enough to add that in the x86 tlb_flush_kernel_range()
but I think we actually want it to be cross-platform.
Which means I'm pondering *renaming* tlb_flush_kernel_range() to
something like arch_tlb_flush_kernel_range() everywhere, then having a
tlb_flush_kernel_range() inline function which optionally calls
iommu_flush_kernel_range() first.
Or I could reduce the churn by adding explicit calls to
iommu_flush_kernel_range() at every location that calls
tlb_flush_kernel_range(), but that's going to lead to some callers
missing the IOMMU flush.
> > But... that's because the PASID-space is currently per-IOMMU. The plan
> > is to have a *single* PASID-space system-wide, And then I still want to
> > retain the property that there can be only *one* kernel PASID.
>
> That makes a lot of sense. Then we can check in the call-back simply if
> this pasid has users and bail out early when not.
>
> > I have forbidden the use of a given PASID to access *both* kernel and
> > user addresses. I'm hoping we can get away with putting that
> > restriction into the generic SVM APIs.
>
> We have to, having kernel-pasids already nullifies all protection the
> IOMMU provides, giving kernel-access to a process-pasid is security wise
> equivalent to accessing /dev/mem.
Not entirely. The device still gets to specify whether it's doing
supervisor or user mode access, for each request it makes. It doesn't
open the door to users just using kernel addresses and getting away
with it!
Sure, we need to trust the *device* — but we need to trust it to
provide the correct PASID too. Which basically means in the VFIO case
where the user gets *full* control of the device, we have to ensure
that it gets its own PASID table with only the *one* PASID in it, and
*that* PASID can't have supervisor mode.
But in the general case, apart from the fact that it makes life hard
for us, there's no fundamental security reason why we couldn't set the
bit which allows supervisor mode access to happen in *any* PASID.
> > So yeah, perhaps we can set the notifier pointer to NULL when there's
> > no kernel PASID assigned, and only set it to point to
> > ${MY_IOMMU}_flush_kernel_pasid() if/when there *is* one?
>
> That sounds like it needs some clever locking. Instead of checking the
> function pointer it is probably easier to put the check for pasid-users
> into an inline function and just do the real flush-call only when
> necessary.
The locking isn't hard; it's just RCU. Which in the VT-d case is just
the same as the handling of the kernel PASID structure which tells us
if we have any work to do anyway.
What I have in my working tree right now (but will probably throw away)
looks something like...
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6df2029..c9e0b6c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,22 @@
#include <asm/processor.h>
#include <asm/special_insns.h>
+#ifdef CONFIG_IOMMU_TLB_FLUSH
+#include <linux/rcupdate.h>
+typedef void (*iommu_flush_ktlb_fn)(unsigned long start, unsigned long end);
+extern iommu_flush_ktlb_fn __rcu iommu_flush_ktlb;
+
+static inline void do_iommu_flush_ktlb(unsigned long start, unsigned long end)
+{
+ iommu_flush_ktlb_fn *fn;
+ rcu_read_lock();
+ fn = rcu_dereference(iommu_flush_ktlb);
+ if (fn)
+ (*fn)(start, end);
+ rcu_read_unlock();
+}
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -223,6 +239,9 @@ static inline void reset_lazy_tlbstate(void)
static inline void flush_tlb_kernel_range(unsigned long start,
unsigned long end)
{
+#ifdef CONFIG_IOMMU_FLUSH_TLB
+ do_iommu_flush_ktlb(start, end);
+#endif
flush_tlb_all();
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0..4122b49 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -266,6 +267,9 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
+#ifdef CONFIG_IOMMU_FLUSH_TLB
+ do_iommu_flush_ktlb(start, end);
+#endif
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
Maybe we could keep it simple and just declare that once the function pointer is set, it may never be cleared? But I think we really do want to avoid the out-of-line function call altogether in the case where kernel PASIDs are not being used. Or at *least* the case where SVM isn't being used at all.
--
--
David Woodhouse Open Source Technology Centre
David.Woodhouse@intel.com Intel Corporation
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5691 bytes --]
^ permalink raw reply related [flat|nested] 10+ messages in thread
end of thread, other threads:[~2015-10-23 12:43 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-10-20 15:52 [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses David Woodhouse
2015-10-20 16:03 ` Joerg Roedel
2015-10-20 16:17 ` David Woodhouse
2015-10-23 10:20 ` Joerg Roedel
2015-10-23 10:33 ` David Woodhouse
[not found] ` <1445596413.4113.175.camel-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-10-23 11:03 ` Joerg Roedel
2015-10-23 11:03 ` Joerg Roedel
2015-10-23 11:37 ` David Woodhouse
[not found] ` <1445600226.4113.196.camel-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-10-23 12:42 ` Joerg Roedel
2015-10-23 12:42 ` Joerg Roedel
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.