* [RFC PATCH 01/30] iommu/arm-smmu-v3: Link groups and devices
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm, Catalin Marinas, Sinan Kaya, Will Deacon,
	iommu, Harv Abdulhamid, linux-pci, Bjorn Helgaas, David Woodhouse,
	linux-arm-kernel, Nate Watterson
Reintroduce smmu_group. This structure was removed during the generic DT
bindings rework, but will be needed when implementing PCIe ATS, to look up
devices attached to a given domain.
When unmapping from a domain, we need to send an invalidation to all
devices that could have stored the mapping in their ATC. It would be nice
to use the IOMMU API's iommu_group_for_each_dev, but the list it walks is
protected by group->mutex, which we can't take because atc_invalidate won't
be allowed to sleep. So add a list of devices, protected by a spinlock.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
drivers/iommu/arm-smmu-v3.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 5806a6acc94e..ce8b68fe254b 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -625,6 +625,9 @@ struct arm_smmu_device {
struct arm_smmu_master_data {
struct arm_smmu_device *smmu;
struct arm_smmu_strtab_ent ste;
+
+ struct device *dev;
+ struct list_head group_head;
};
/* SMMU private data for an IOMMU domain */
@@ -650,6 +653,11 @@ struct arm_smmu_domain {
struct iommu_domain domain;
};
+struct arm_smmu_group {
+ struct list_head devices;
+ spinlock_t devices_lock;
+};
+
struct arm_smmu_option_prop {
u32 opt;
const char *prop;
@@ -665,6 +673,8 @@ static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
return container_of(dom, struct arm_smmu_domain, domain);
}
+#define to_smmu_group iommu_group_get_iommudata
+
static void parse_driver_options(struct arm_smmu_device *smmu)
{
int i = 0;
@@ -1595,6 +1605,30 @@ static int arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
return 0;
}
+static void arm_smmu_group_release(void *smmu_group)
+{
+ kfree(smmu_group);
+}
+
+static struct arm_smmu_group *arm_smmu_group_alloc(struct iommu_group *group)
+{
+ struct arm_smmu_group *smmu_group = to_smmu_group(group);
+
+ if (smmu_group)
+ return smmu_group;
+
+ smmu_group = kzalloc(sizeof(*smmu_group), GFP_KERNEL);
+ if (!smmu_group)
+ return NULL;
+
+ INIT_LIST_HEAD(&smmu_group->devices);
+ spin_lock_init(&smmu_group->devices_lock);
+
+ iommu_group_set_iommudata(group, smmu_group, arm_smmu_group_release);
+
+ return smmu_group;
+}
+
static void arm_smmu_detach_dev(struct device *dev)
{
struct arm_smmu_master_data *master = dev->iommu_fwspec->iommu_priv;
@@ -1607,7 +1641,9 @@ static void arm_smmu_detach_dev(struct device *dev)
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
{
int ret = 0;
+ struct iommu_group *group;
struct arm_smmu_device *smmu;
+ struct arm_smmu_group *smmu_group;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_master_data *master;
struct arm_smmu_strtab_ent *ste;
@@ -1619,6 +1655,17 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
smmu = master->smmu;
ste = &master->ste;
+ /*
+ * When adding devices, this is the first occasion we have to create the
+ * smmu_group and attach it to iommu_group.
+ */
+ group = iommu_group_get(dev);
+ smmu_group = arm_smmu_group_alloc(group);
+ if (!smmu_group) {
+ iommu_group_put(group);
+ return -ENOMEM;
+ }
+
/* Already attached to a different domain? */
if (!ste->bypass)
arm_smmu_detach_dev(dev);
@@ -1659,6 +1706,9 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
out_unlock:
mutex_unlock(&smmu_domain->init_mutex);
+
+ iommu_group_put(group);
+
return ret;
}
@@ -1745,7 +1795,9 @@ static struct iommu_ops arm_smmu_ops;
static int arm_smmu_add_device(struct device *dev)
{
int i, ret;
+ unsigned long flags;
struct arm_smmu_device *smmu;
+ struct arm_smmu_group *smmu_group;
struct arm_smmu_master_data *master;
struct iommu_fwspec *fwspec = dev->iommu_fwspec;
struct iommu_group *group;
@@ -1769,6 +1821,7 @@ static int arm_smmu_add_device(struct device *dev)
return -ENOMEM;
master->smmu = smmu;
+ master->dev = dev;
fwspec->iommu_priv = master;
}
@@ -1789,6 +1842,12 @@ static int arm_smmu_add_device(struct device *dev)
group = iommu_group_get_for_dev(dev);
if (!IS_ERR(group)) {
+ smmu_group = to_smmu_group(group);
+
+ spin_lock_irqsave(&smmu_group->devices_lock, flags);
+ list_add(&master->group_head, &smmu_group->devices);
+ spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
+
iommu_group_put(group);
iommu_device_link(&smmu->iommu, dev);
}
@@ -1800,7 +1859,10 @@ static void arm_smmu_remove_device(struct device *dev)
{
struct iommu_fwspec *fwspec = dev->iommu_fwspec;
struct arm_smmu_master_data *master;
+ struct arm_smmu_group *smmu_group;
struct arm_smmu_device *smmu;
+ struct iommu_group *group;
+ unsigned long flags;
if (!fwspec || fwspec->ops != &arm_smmu_ops)
return;
@@ -1809,6 +1871,18 @@ static void arm_smmu_remove_device(struct device *dev)
smmu = master->smmu;
if (master && master->ste.valid)
arm_smmu_detach_dev(dev);
+
+ if (master) {
+ group = iommu_group_get(dev);
+ smmu_group = to_smmu_group(group);
+
+ spin_lock_irqsave(&smmu_group->devices_lock, flags);
+ list_del(&master->group_head);
+ spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
+
+ iommu_group_put(group);
+ }
+
iommu_group_remove_device(dev);
iommu_device_unlink(&smmu->iommu, dev);
kfree(master);
--
2.11.0
* [RFC PATCH 02/30] iommu/arm-smmu-v3: Link groups and domains
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm, Catalin Marinas, Sinan Kaya, Will Deacon,
	iommu, Harv Abdulhamid, linux-pci, Bjorn Helgaas, David Woodhouse,
	linux-arm-kernel, Nate Watterson
When removing a mapping from a domain, we need to send an invalidation to
all devices that might have stored it in their Address Translation Cache
(ATC). This requires a lookup from smmu_domain to smmu_group. Add a list
of groups to each domain.
Although this domain-group link is already protected by iommu_group->mutex,
the domain-to-group lookup will be done outside of the sections protected
by that mutex. Add a spinlock for this case.
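To illustrate the intended use of this link, a walk from a domain to its
masters could look like the sketch below. The helper name and callback are
made up for illustration only; the real traversal is added later in the
series:

	/* Sketch: iterate over all masters attached to a domain */
	static void for_each_master_in_domain(struct arm_smmu_domain *smmu_domain,
					      void (*fn)(struct arm_smmu_master_data *master))
	{
		unsigned long flags;
		struct arm_smmu_group *smmu_group;
		struct arm_smmu_master_data *master;

		/* groups_lock and devices_lock are the spinlocks added by patches 1 and 2 */
		spin_lock_irqsave(&smmu_domain->groups_lock, flags);
		list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
			spin_lock(&smmu_group->devices_lock);
			list_for_each_entry(master, &smmu_group->devices, group_head)
				fn(master);
			spin_unlock(&smmu_group->devices_lock);
		}
		spin_unlock_irqrestore(&smmu_domain->groups_lock, flags);
	}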
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
drivers/iommu/arm-smmu-v3.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index ce8b68fe254b..69d00416990d 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -651,9 +651,15 @@ struct arm_smmu_domain {
};
struct iommu_domain domain;
+
+ struct list_head groups;
+ spinlock_t groups_lock;
};
struct arm_smmu_group {
+ struct arm_smmu_domain *domain;
+ struct list_head domain_head;
+
struct list_head devices;
spinlock_t devices_lock;
};
@@ -1408,6 +1414,9 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
mutex_init(&smmu_domain->init_mutex);
spin_lock_init(&smmu_domain->pgtbl_lock);
+ INIT_LIST_HEAD(&smmu_domain->groups);
+ spin_lock_init(&smmu_domain->groups_lock);
+
return &smmu_domain->domain;
}
@@ -1641,6 +1650,7 @@ static void arm_smmu_detach_dev(struct device *dev)
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
{
int ret = 0;
+ unsigned long flags;
struct iommu_group *group;
struct arm_smmu_device *smmu;
struct arm_smmu_group *smmu_group;
@@ -1666,9 +1676,31 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
return -ENOMEM;
}
- /* Already attached to a different domain? */
- if (!ste->bypass)
+ /*
+ * Already attached to a different domain? This happens when we're
+ * switching from default domain to unmanaged domain, and back. We
+ * assume here that, when switching from old domain to new domain, old
+ * domain doesn't have any live mapping anymore. This is an important
+ * requirement because here we remove the group-domain link when we
+ * re-attach the first device in a group. Other devices in that group
+ * might still be attached to the old domain, and will be reattached in
+ * a moment.
+ *
+ * We also take this path when attaching for the very first time, just
+ * after the STE is initialized.
+ */
+ if (!ste->bypass) {
+ struct arm_smmu_domain *other_domain = smmu_group->domain;
+
+ if (other_domain) {
+ spin_lock_irqsave(&other_domain->groups_lock, flags);
+ list_del(&smmu_group->domain_head);
+ spin_unlock_irqrestore(&other_domain->groups_lock, flags);
+
+ smmu_group->domain = NULL;
+ }
arm_smmu_detach_dev(dev);
+ }
mutex_lock(&smmu_domain->init_mutex);
@@ -1688,6 +1720,14 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
goto out_unlock;
}
+ if (!smmu_group->domain) {
+ smmu_group->domain = smmu_domain;
+
+ spin_lock_irqsave(&smmu_domain->groups_lock, flags);
+ list_add(&smmu_group->domain_head, &smmu_domain->groups);
+ spin_unlock_irqrestore(&smmu_domain->groups_lock, flags);
+ }
+
ste->bypass = false;
ste->valid = true;
--
2.11.0
* [RFC PATCH 03/30] PCI: Move ATS declarations outside of CONFIG_PCI
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm, Catalin Marinas, Sinan Kaya, Will Deacon,
	iommu, Harv Abdulhamid, linux-pci, Bjorn Helgaas, David Woodhouse,
	linux-arm-kernel, Nate Watterson
Currently, ATS helpers such as pci_enable_ats are only defined when
CONFIG_PCI is enabled. The ARM SMMU driver might be built with CONFIG_PCI
disabled, and would then have to wrap every use of the ATS helpers in
#ifdef CONFIG_PCI, which isn't ideal.
A nicer solution is to always define these helpers. Since CONFIG_PCI_ATS
can only be enabled together with CONFIG_PCI, move the declarations outside
of the CONFIG_PCI section to prevent build failures when PCI is disabled.
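With the declarations always visible, a call site no longer needs any
#ifdef. A sketch of what such a caller looks like (variable names are
illustrative; the stubs return -ENODEV when CONFIG_PCI_ATS is disabled):

	ret = pci_enable_ats(pdev, stu);
	if (ret)
		return ret;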
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
include/linux/pci.h | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 282ed32244ce..e606f289bf5f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1418,19 +1418,6 @@ int ht_create_irq(struct pci_dev *dev, int idx);
void ht_destroy_irq(unsigned int irq);
#endif /* CONFIG_HT_IRQ */
-#ifdef CONFIG_PCI_ATS
-/* Address Translation Service */
-void pci_ats_init(struct pci_dev *dev);
-int pci_enable_ats(struct pci_dev *dev, int ps);
-void pci_disable_ats(struct pci_dev *dev);
-int pci_ats_queue_depth(struct pci_dev *dev);
-#else
-static inline void pci_ats_init(struct pci_dev *d) { }
-static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; }
-static inline void pci_disable_ats(struct pci_dev *d) { }
-static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; }
-#endif
-
#ifdef CONFIG_PCIE_PTM
int pci_enable_ptm(struct pci_dev *dev, u8 *granularity);
#else
@@ -1616,6 +1603,19 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; }
#define dev_is_pf(d) (false)
#endif /* CONFIG_PCI */
+#ifdef CONFIG_PCI_ATS
+/* Address Translation Service */
+void pci_ats_init(struct pci_dev *dev);
+int pci_enable_ats(struct pci_dev *dev, int ps);
+void pci_disable_ats(struct pci_dev *dev);
+int pci_ats_queue_depth(struct pci_dev *dev);
+#else
+static inline void pci_ats_init(struct pci_dev *d) { }
+static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; }
+static inline void pci_disable_ats(struct pci_dev *d) { }
+static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; }
+#endif
+
/* Include architecture-dependent settings and functions */
#include <asm/pci.h>
--
2.11.0
* [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm, Catalin Marinas, Sinan Kaya, Will Deacon,
	iommu, Harv Abdulhamid, linux-pci, Bjorn Helgaas, David Woodhouse,
	linux-arm-kernel, Nate Watterson
PCIe devices can implement their own TLB, named Address Translation Cache
(ATC). Steps involved in the use and maintenance of such caches are:
* Device sends an Address Translation Request for a given IOVA to the
IOMMU. If the translation succeeds, the IOMMU returns the corresponding
physical address, which is stored in the device's ATC.
* Device can then use the physical address directly in a transaction.
A PCIe device does so by setting the TLP AT field to 0b10 - translated.
The SMMU might check that the device is allowed to send translated
transactions, and let it pass through.
* When an address is unmapped, the CPU sends a CMD_ATC_INV command to the
SMMU, which relays it to the device.
In theory, this doesn't require a lot of software intervention. The IOMMU
driver needs to enable ATS when adding a PCI device, and send an
invalidation request when unmapping. Note that this invalidation is
allowed to take up to a minute, according to the PCIe spec. In
addition, the invalidation queue on the ATC side is fairly small, 32 by
default, so we cannot keep many invalidations in flight (see ATS spec
section 3.5, Invalidate Flow Control).
Handling these constraints properly would require postponing invalidations
and keeping the stale mappings until we're certain that all devices have
forgotten about them. This requires major work in the page table managers,
and is therefore not done by this patch.
Range calculation
-----------------
The invalidation packet itself is a bit awkward: the range must be naturally
aligned, which means that the start address is a multiple of the range
size. In addition, the size must be a power-of-two number of 4k pages. We
have a few options to enforce this constraint:
(1) Find the smallest naturally aligned region that covers the requested
range. This is simple to compute and only takes one ATC_INV, but it
will spill on lots of neighbouring ATC entries.
(2) Align the start address to the region size (rounded up to a power of
two), and send a second invalidation for the next range of the same
size. Still not great, but reduces spilling.
(3) Cover the range exactly with the smallest number of naturally aligned
regions. This would be interesting to implement but as for (2),
requires multiple ATC_INV.
As I suspect ATC invalidation packets will be a very scarce resource,
we'll go with option (1) for now, and only send one big invalidation.
Note that with io-pgtable, the unmap function is called for each page, so
this doesn't matter. The problem shows up when sharing page tables with
the MMU.
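A minimal standalone sketch of the option (1) computation, using a
hypothetical helper name (the driver's actual version is
arm_smmu_atc_invalidate_to_cmd in the diff below):

	/*
	 * Sketch only: compute the smallest naturally aligned power-of-two
	 * range of 4k pages covering [iova, iova + size).
	 */
	static void atc_inv_range(unsigned long iova, size_t size,
				  u64 *addr, u8 *log2span)
	{
		unsigned long first = iova >> 12;
		unsigned long last = (iova + size - 1) >> 12;

		/* Highest differing bit between first and last page numbers */
		*log2span = fls_long(first ^ last);
		*addr = (u64)(first & ~((1UL << *log2span) - 1)) << 12;
	}

For instance, invalidating pages [7; 10] gives log2span = 4 and addr = 0,
i.e. a single command covering pages [0; 15].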
Locking
-------
The atc_invalidate function is called from arm_smmu_unmap, with pgtbl_lock
held (hardirq-safe). When sharing page tables with the MMU, we will have a
few more call sites:
* When unbinding an address space from a device, to invalidate the whole
address space.
* When a task bound to a device does an mlock, munmap, etc. This comes
from an MMU notifier, with mmap_sem and pte_lock held.
Given this, all locks taken on the ATC invalidation path must be hardirq-
safe.
Timeout
-------
Some SMMU implementations will raise a CERROR_ATC_INV_SYNC when a CMD_SYNC
fails because of an ATC invalidation. Some will just fail the CMD_SYNC.
Others might let CMD_SYNC complete and have an asynchronous IMPDEF
mechanism to record the error. When we receive a CERROR_ATC_INV_SYNC, we
could retry sending all ATC_INV since last successful CMD_SYNC. When a
CMD_SYNC fails without CERROR_ATC_INV_SYNC, we could retry sending *all*
commands since last successful CMD_SYNC. This patch doesn't properly
handle timeout, and ignores devices that don't behave. It might lead to
memory corruption.
Optional support
----------------
For the moment, enable ATS whenever a device advertises it. Later, we
might want to allow users to opt-in for the whole system or individual
devices via sysfs or cmdline. Some firmware interfaces also provide a
description of ATS capabilities in the root complex, and we might want to
add a similar capability in DT. For instance, the following could be added
to bindings/pci/pci-iommu.txt, as an optional property to PCI RC:
- ats-map: describe Address Translation Service support by the root
complex. This property is an arbitrary number of tuples of
(rid-base,length). Any RID in this interval is allowed to issue address
translation requests.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
drivers/iommu/arm-smmu-v3.c | 262 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 250 insertions(+), 12 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 69d00416990d..e7b940146ae3 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -35,6 +35,7 @@
#include <linux/of_iommu.h>
#include <linux/of_platform.h>
#include <linux/pci.h>
+#include <linux/pci-ats.h>
#include <linux/platform_device.h>
#include <linux/amba/bus.h>
@@ -102,6 +103,7 @@
#define IDR5_OAS_48_BIT (5 << IDR5_OAS_SHIFT)
#define ARM_SMMU_CR0 0x20
+#define CR0_ATSCHK (1 << 4)
#define CR0_CMDQEN (1 << 3)
#define CR0_EVTQEN (1 << 2)
#define CR0_PRIQEN (1 << 1)
@@ -343,6 +345,7 @@
#define CMDQ_ERR_CERROR_NONE_IDX 0
#define CMDQ_ERR_CERROR_ILL_IDX 1
#define CMDQ_ERR_CERROR_ABT_IDX 2
+#define CMDQ_ERR_CERROR_ATC_INV_IDX 3
#define CMDQ_0_OP_SHIFT 0
#define CMDQ_0_OP_MASK 0xffUL
@@ -364,6 +367,15 @@
#define CMDQ_TLBI_1_VA_MASK ~0xfffUL
#define CMDQ_TLBI_1_IPA_MASK 0xfffffffff000UL
+#define CMDQ_ATC_0_SSID_SHIFT 12
+#define CMDQ_ATC_0_SSID_MASK 0xfffffUL
+#define CMDQ_ATC_0_SID_SHIFT 32
+#define CMDQ_ATC_0_SID_MASK 0xffffffffUL
+#define CMDQ_ATC_0_GLOBAL (1UL << 9)
+#define CMDQ_ATC_1_SIZE_SHIFT 0
+#define CMDQ_ATC_1_SIZE_MASK 0x3fUL
+#define CMDQ_ATC_1_ADDR_MASK ~0xfffUL
+
#define CMDQ_PRI_0_SSID_SHIFT 12
#define CMDQ_PRI_0_SSID_MASK 0xfffffUL
#define CMDQ_PRI_0_SID_SHIFT 32
@@ -417,6 +429,11 @@ module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO);
MODULE_PARM_DESC(disable_bypass,
"Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
+static bool disable_ats_check;
+module_param_named(disable_ats_check, disable_ats_check, bool, S_IRUGO);
+MODULE_PARM_DESC(disable_ats_check,
+ "By default, the SMMU checks whether each incoming transaction marked as translated is allowed by the stream configuration. This option disables the check.");
+
enum pri_resp {
PRI_RESP_DENY,
PRI_RESP_FAIL,
@@ -485,6 +502,15 @@ struct arm_smmu_cmdq_ent {
u64 addr;
} tlbi;
+ #define CMDQ_OP_ATC_INV 0x40
+ struct {
+ u32 sid;
+ u32 ssid;
+ u64 addr;
+ u8 size;
+ bool global;
+ } atc;
+
#define CMDQ_OP_PRI_RESP 0x41
struct {
u32 sid;
@@ -662,6 +688,8 @@ struct arm_smmu_group {
struct list_head devices;
spinlock_t devices_lock;
+
+ bool ats_enabled;
};
struct arm_smmu_option_prop {
@@ -839,6 +867,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
case CMDQ_OP_TLBI_S12_VMALL:
cmd[0] |= (u64)ent->tlbi.vmid << CMDQ_TLBI_0_VMID_SHIFT;
break;
+ case CMDQ_OP_ATC_INV:
+ cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0;
+ cmd[0] |= ent->atc.global ? CMDQ_ATC_0_GLOBAL : 0;
+ cmd[0] |= ent->atc.ssid << CMDQ_ATC_0_SSID_SHIFT;
+ cmd[0] |= (u64)ent->atc.sid << CMDQ_ATC_0_SID_SHIFT;
+ cmd[1] |= ent->atc.size << CMDQ_ATC_1_SIZE_SHIFT;
+ cmd[1] |= ent->atc.addr & CMDQ_ATC_1_ADDR_MASK;
+ break;
case CMDQ_OP_PRI_RESP:
cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0;
cmd[0] |= ent->pri.ssid << CMDQ_PRI_0_SSID_SHIFT;
@@ -874,6 +910,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
[CMDQ_ERR_CERROR_NONE_IDX] = "No error",
[CMDQ_ERR_CERROR_ILL_IDX] = "Illegal command",
[CMDQ_ERR_CERROR_ABT_IDX] = "Abort on command fetch",
+ [CMDQ_ERR_CERROR_ATC_INV_IDX] = "ATC invalidate timeout",
};
int i;
@@ -893,6 +930,13 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
dev_err(smmu->dev, "retrying command fetch\n");
case CMDQ_ERR_CERROR_NONE_IDX:
return;
+ case CMDQ_ERR_CERROR_ATC_INV_IDX:
+ /*
+ * CMD_SYNC failed because of ATC Invalidation completion
+ * timeout. CONS is still pointing at the CMD_SYNC. Ensure other
+ * operations complete by re-submitting the CMD_SYNC, cowardly
+ * ignoring the ATC error.
+ */
case CMDQ_ERR_CERROR_ILL_IDX:
/* Fallthrough */
default:
@@ -1084,9 +1128,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
STRTAB_STE_1_S1C_CACHE_WBRA
<< STRTAB_STE_1_S1COR_SHIFT |
STRTAB_STE_1_S1C_SH_ISH << STRTAB_STE_1_S1CSH_SHIFT |
-#ifdef CONFIG_PCI_ATS
- STRTAB_STE_1_EATS_TRANS << STRTAB_STE_1_EATS_SHIFT |
-#endif
STRTAB_STE_1_STRW_NSEL1 << STRTAB_STE_1_STRW_SHIFT);
if (smmu->features & ARM_SMMU_FEAT_STALLS)
@@ -1115,6 +1156,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
val |= STRTAB_STE_0_CFG_S2_TRANS;
}
+ if (IS_ENABLED(CONFIG_PCI_ATS) && !ste_live)
+ dst[1] |= cpu_to_le64(STRTAB_STE_1_EATS_TRANS
+ << STRTAB_STE_1_EATS_SHIFT);
+
arm_smmu_sync_ste_for_sid(smmu, sid);
dst[0] = cpu_to_le64(val);
arm_smmu_sync_ste_for_sid(smmu, sid);
@@ -1377,6 +1422,120 @@ static const struct iommu_gather_ops arm_smmu_gather_ops = {
.tlb_sync = arm_smmu_tlb_sync,
};
+static void arm_smmu_atc_invalidate_to_cmd(struct arm_smmu_device *smmu,
+ unsigned long iova, size_t size,
+ struct arm_smmu_cmdq_ent *cmd)
+{
+ size_t log2_span;
+ size_t span_mask;
+ size_t smmu_grain;
+ /* ATC invalidates are always on 4096 bytes pages */
+ size_t inval_grain_shift = 12;
+ unsigned long iova_start, iova_end;
+ unsigned long page_start, page_end;
+
+ smmu_grain = 1ULL << __ffs(smmu->pgsize_bitmap);
+
+ /* In case parameters are not aligned on PAGE_SIZE */
+ iova_start = round_down(iova, smmu_grain);
+ iova_end = round_up(iova + size, smmu_grain) - 1;
+
+ page_start = iova_start >> inval_grain_shift;
+ page_end = iova_end >> inval_grain_shift;
+
+ /*
+ * Find the smallest power of two that covers the range. Most
+ * significant differing bit between start and end address indicates the
+ * required span, ie. fls(start ^ end). For example:
+ *
+ * We want to invalidate pages [8; 11]. This is already the ideal range:
+ * x = 0b1000 ^ 0b1011 = 0b11
+ * span = 1 << fls(x) = 4
+ *
+ * To invalidate pages [7; 10], we need to invalidate [0; 15]:
+ * x = 0b0111 ^ 0b1010 = 0b1101
+ * span = 1 << fls(x) = 16
+ */
+ log2_span = fls_long(page_start ^ page_end);
+ span_mask = (1ULL << log2_span) - 1;
+
+ page_start &= ~span_mask;
+
+ *cmd = (struct arm_smmu_cmdq_ent) {
+ .opcode = CMDQ_OP_ATC_INV,
+ .atc = {
+ .addr = page_start << inval_grain_shift,
+ .size = log2_span,
+ }
+ };
+}
+
+static int arm_smmu_atc_invalidate_master(struct arm_smmu_master_data *master,
+ struct arm_smmu_cmdq_ent *cmd)
+{
+ int i;
+ struct iommu_fwspec *fwspec = master->dev->iommu_fwspec;
+ struct pci_dev *pdev = to_pci_dev(master->dev);
+
+ if (!pdev->ats_enabled)
+ return 0;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ cmd->atc.sid = fwspec->ids[i];
+
+ dev_dbg(master->smmu->dev,
+ "ATC invalidate %#x:%#x:%#llx-%#llx, esz=%d\n",
+ cmd->atc.sid, cmd->atc.ssid, cmd->atc.addr,
+ cmd->atc.addr + (1 << (cmd->atc.size + 12)) - 1,
+ cmd->atc.size);
+
+ arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
+ }
+
+ return 0;
+}
+
+static size_t arm_smmu_atc_invalidate_domain(struct arm_smmu_domain *smmu_domain,
+ unsigned long iova, size_t size)
+{
+ unsigned long flags;
+ struct arm_smmu_cmdq_ent cmd = {0};
+ struct arm_smmu_group *smmu_group;
+ struct arm_smmu_master_data *master;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cmdq_ent sync_cmd = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ };
+
+ spin_lock_irqsave(&smmu_domain->groups_lock, flags);
+
+ list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
+ if (!smmu_group->ats_enabled)
+ continue;
+
+ /* Initialise command lazily */
+ if (!cmd.opcode)
+ arm_smmu_atc_invalidate_to_cmd(smmu, iova, size, &cmd);
+
+ spin_lock(&smmu_group->devices_lock);
+
+ list_for_each_entry(master, &smmu_group->devices, group_head)
+ arm_smmu_atc_invalidate_master(master, &cmd);
+
+ /*
+ * TODO: ensure we do a sync whenever we have sent ats_queue_depth
+ * invalidations to the same device.
+ */
+ arm_smmu_cmdq_issue_cmd(smmu, &sync_cmd);
+
+ spin_unlock(&smmu_group->devices_lock);
+ }
+
+ spin_unlock_irqrestore(&smmu_domain->groups_lock, flags);
+
+ return size;
+}
+
/* IOMMU API */
static bool arm_smmu_capable(enum iommu_cap cap)
{
@@ -1782,7 +1941,10 @@ arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags);
ret = ops->unmap(ops, iova, size);
+ if (ret)
+ ret = arm_smmu_atc_invalidate_domain(smmu_domain, iova, size);
spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags);
+
return ret;
}
@@ -1830,11 +1992,63 @@ static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
return sid < limit;
}
+/*
+ * Returns -ENOSYS if ATS is not supported either by the device or by the SMMU
+ */
+static int arm_smmu_enable_ats(struct arm_smmu_master_data *master)
+{
+ int ret;
+ size_t stu;
+ struct pci_dev *pdev;
+ struct arm_smmu_device *smmu = master->smmu;
+
+ if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev))
+ return -ENOSYS;
+
+ pdev = to_pci_dev(master->dev);
+
+#ifdef CONFIG_PCI_ATS
+ if (!pdev->ats_cap)
+ return -ENOSYS;
+#else
+ return -ENOSYS;
+#endif
+
+ /* Smallest Translation Unit: log2 of the smallest supported granule */
+ stu = __ffs(smmu->pgsize_bitmap);
+
+ ret = pci_enable_ats(pdev, stu);
+ if (ret) {
+ dev_err(&pdev->dev, "cannot enable ATS: %d\n", ret);
+ return ret;
+ }
+
+ dev_dbg(&pdev->dev, "enabled ATS with STU = %zu\n", stu);
+
+ return 0;
+}
+
+static void arm_smmu_disable_ats(struct arm_smmu_master_data *master)
+{
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(master->dev))
+ return;
+
+ pdev = to_pci_dev(master->dev);
+
+ if (!pdev->ats_enabled)
+ return;
+
+ pci_disable_ats(pdev);
+}
+
static struct iommu_ops arm_smmu_ops;
static int arm_smmu_add_device(struct device *dev)
{
int i, ret;
+ bool ats_enabled;
unsigned long flags;
struct arm_smmu_device *smmu;
struct arm_smmu_group *smmu_group;
@@ -1880,19 +2094,31 @@ static int arm_smmu_add_device(struct device *dev)
}
}
+ ats_enabled = !arm_smmu_enable_ats(master);
+
group = iommu_group_get_for_dev(dev);
- if (!IS_ERR(group)) {
- smmu_group = to_smmu_group(group);
+ if (IS_ERR(group)) {
+ ret = PTR_ERR(group);
+ goto err_disable_ats;
+ }
- spin_lock_irqsave(&smmu_group->devices_lock, flags);
- list_add(&master->group_head, &smmu_group->devices);
- spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
+ smmu_group = to_smmu_group(group);
- iommu_group_put(group);
- iommu_device_link(&smmu->iommu, dev);
- }
+ smmu_group->ats_enabled |= ats_enabled;
- return PTR_ERR_OR_ZERO(group);
+ spin_lock_irqsave(&smmu_group->devices_lock, flags);
+ list_add(&master->group_head, &smmu_group->devices);
+ spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
+
+ iommu_group_put(group);
+ iommu_device_link(&smmu->iommu, dev);
+
+ return 0;
+
+err_disable_ats:
+ arm_smmu_disable_ats(master);
+
+ return ret;
}
static void arm_smmu_remove_device(struct device *dev)
@@ -1921,6 +2147,8 @@ static void arm_smmu_remove_device(struct device *dev)
spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
iommu_group_put(group);
+
+ arm_smmu_disable_ats(master);
}
iommu_group_remove_device(dev);
@@ -2485,6 +2713,16 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
}
}
+ if (smmu->features & ARM_SMMU_FEAT_ATS && !disable_ats_check) {
+ enables |= CR0_ATSCHK;
+ ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
+ ARM_SMMU_CR0ACK);
+ if (ret) {
+ dev_err(smmu->dev, "failed to enable ATS check\n");
+ return ret;
+ }
+ }
+
ret = arm_smmu_setup_irqs(smmu);
if (ret) {
dev_err(smmu->dev, "failed to setup irqs\n");
--
2.11.0
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Sinan Kaya @ 2017-03-08 15:26 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Harv Abdulhamid, Will Deacon, Shanker Donthineni, Bjorn Helgaas,
Sinan Kaya, Lorenzo Pieralisi, Catalin Marinas, Robin Murphy,
Joerg Roedel, Nate Watterson, Alex Williamson, David Woodhouse,
linux-arm-kernel, linux-pci, iommu, kvm
On 2/27/2017 2:54 PM, Jean-Philippe Brucker wrote:
> + ats_enabled = !arm_smmu_enable_ats(master);
> +
You should make the ats_supported field of the IORT table part of the
decision process for when to enable ATS.
--
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Sunil Kovvuri @ 2017-04-03 8:34 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Harv Abdulhamid, Will Deacon, Shanker Donthineni, Bjorn Helgaas,
Sinan Kaya, Lorenzo Pieralisi, Catalin Marinas, Robin Murphy,
Joerg Roedel, Nate Watterson, Alex Williamson, David Woodhouse,
LAKML, linux-pci, iommu, kvm
> +static size_t arm_smmu_atc_invalidate_domain(struct arm_smmu_domain *smmu_domain,
> + unsigned long iova, size_t size)
> +{
> + unsigned long flags;
> + struct arm_smmu_cmdq_ent cmd = {0};
> + struct arm_smmu_group *smmu_group;
> + struct arm_smmu_master_data *master;
> + struct arm_smmu_device *smmu = smmu_domain->smmu;
> + struct arm_smmu_cmdq_ent sync_cmd = {
> + .opcode = CMDQ_OP_CMD_SYNC,
> + };
> +
> + spin_lock_irqsave(&smmu_domain->groups_lock, flags);
> +
> + list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
> + if (!smmu_group->ats_enabled)
> + continue;
If ATS is not supported, this seems to increase the number of cycles spent
under pgtbl_lock.
Can we return early from this function by checking ARM_SMMU_FEAT_ATS in
smmu->features?
Thanks,
Sunil.
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Jean-Philippe Brucker @ 2017-04-03 10:14 UTC (permalink / raw)
To: Sunil Kovvuri
Cc: Harv Abdulhamid, Will Deacon, Shanker Donthineni, Bjorn Helgaas,
Sinan Kaya, Lorenzo Pieralisi, Catalin Marinas, Robin Murphy,
Joerg Roedel, Nate Watterson, Alex Williamson, David Woodhouse,
LAKML, linux-pci, iommu, kvm
On 03/04/17 09:34, Sunil Kovvuri wrote:
>> +static size_t arm_smmu_atc_invalidate_domain(struct arm_smmu_domain *smmu_domain,
>> + unsigned long iova, size_t size)
>> +{
>> + unsigned long flags;
>> + struct arm_smmu_cmdq_ent cmd = {0};
>> + struct arm_smmu_group *smmu_group;
>> + struct arm_smmu_master_data *master;
>> + struct arm_smmu_device *smmu = smmu_domain->smmu;
>> + struct arm_smmu_cmdq_ent sync_cmd = {
>> + .opcode = CMDQ_OP_CMD_SYNC,
>> + };
>> +
>> + spin_lock_irqsave(&smmu_domain->groups_lock, flags);
>> +
>> + list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
>> + if (!smmu_group->ats_enabled)
>> + continue;
>
> If ATS is not supported, this seems to increase no of cycles spent in
> pgtbl_lock.
> Can we return from this API by checking 'ARM_SMMU_FEAT_ATS' in smmu->features ?
Sure, I can add a check before taking the lock. Have you been able to
observe a significant difference in cycles between checking FEAT_ATS,
checking group->ats_enabled after taking the lock, and removing this
function call altogether?
Thanks,
Jean-Philippe
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Sunil Kovvuri @ 2017-04-03 11:42 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Harv Abdulhamid, Will Deacon, Shanker Donthineni, Bjorn Helgaas,
Sinan Kaya, Lorenzo Pieralisi, Catalin Marinas, Robin Murphy,
Joerg Roedel, Nate Watterson, Alex Williamson, David Woodhouse,
LAKML, linux-pci, iommu, kvm
On Mon, Apr 3, 2017 at 3:44 PM, Jean-Philippe Brucker
<jean-philippe.brucker@arm.com> wrote:
> On 03/04/17 09:34, Sunil Kovvuri wrote:
>>> +static size_t arm_smmu_atc_invalidate_domain(struct arm_smmu_domain *smmu_domain,
>>> + unsigned long iova, size_t size)
>>> +{
>>> + unsigned long flags;
>>> + struct arm_smmu_cmdq_ent cmd = {0};
>>> + struct arm_smmu_group *smmu_group;
>>> + struct arm_smmu_master_data *master;
>>> + struct arm_smmu_device *smmu = smmu_domain->smmu;
>>> + struct arm_smmu_cmdq_ent sync_cmd = {
>>> + .opcode = CMDQ_OP_CMD_SYNC,
>>> + };
>>> +
>>> + spin_lock_irqsave(&smmu_domain->groups_lock, flags);
>>> +
>>> + list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
>>> + if (!smmu_group->ats_enabled)
>>> + continue;
>>
>> If ATS is not supported, this seems to increase no of cycles spent in
>> pgtbl_lock.
>> Can we return from this API by checking 'ARM_SMMU_FEAT_ATS' in smmu->features ?
>
> Sure, I can add a check before taking the lock. Have you been able to
> observe a significant difference in cycles between checking FEAT_ATS,
> checking group->ats_enabled after taking the lock, and removing this
> function call altogether?
>
> Thanks,
> Jean-Philippe
No, I haven't verified, was just making an observation.
Thanks,
Sunil.
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Jean-Philippe Brucker @ 2017-04-03 11:56 UTC (permalink / raw)
To: Sunil Kovvuri
Cc: Harv Abdulhamid, Will Deacon, Shanker Donthineni, Bjorn Helgaas,
Sinan Kaya, Lorenzo Pieralisi, Catalin Marinas, Robin Murphy,
Joerg Roedel, Nate Watterson, Alex Williamson, David Woodhouse,
LAKML, linux-pci, iommu, kvm
On 03/04/17 12:42, Sunil Kovvuri wrote:
> On Mon, Apr 3, 2017 at 3:44 PM, Jean-Philippe Brucker
> <jean-philippe.brucker@arm.com> wrote:
>> On 03/04/17 09:34, Sunil Kovvuri wrote:
>>>> +static size_t arm_smmu_atc_invalidate_domain(struct arm_smmu_domain *smmu_domain,
>>>> + unsigned long iova, size_t size)
>>>> +{
>>>> + unsigned long flags;
>>>> + struct arm_smmu_cmdq_ent cmd = {0};
>>>> + struct arm_smmu_group *smmu_group;
>>>> + struct arm_smmu_master_data *master;
>>>> + struct arm_smmu_device *smmu = smmu_domain->smmu;
>>>> + struct arm_smmu_cmdq_ent sync_cmd = {
>>>> + .opcode = CMDQ_OP_CMD_SYNC,
>>>> + };
>>>> +
>>>> + spin_lock_irqsave(&smmu_domain->groups_lock, flags);
>>>> +
>>>> + list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
>>>> + if (!smmu_group->ats_enabled)
>>>> + continue;
>>>
>>> If ATS is not supported, this seems to increase no of cycles spent in
>>> pgtbl_lock.
>>> Can we return from this API by checking 'ARM_SMMU_FEAT_ATS' in smmu->features ?
>>
>> Sure, I can add a check before taking the lock. Have you been able to
>> observe a significant difference in cycles between checking FEAT_ATS,
>> checking group->ats_enabled after taking the lock, and removing this
>> function call altogether?
>>
>> Thanks,
>> Jean-Philippe
>
> No, I haven't verified, was just making an observation.
Fair enough, I think avoiding the lock when ATS isn't in use makes sense.
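Something like the following at the top of arm_smmu_atc_invalidate_domain(),
I suppose (just a sketch of the early return being discussed):

	if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
		return size;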
Thanks,
Jean-Philippe
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Sinan Kaya @ 2017-03-01 19:24 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Harb Abdulhamid, Shanker Donthineni, kvm, Catalin Marinas, Will Deacon,
	iommu, Sinan Kaya, linux-pci, Bjorn Helgaas, David Woodhouse,
	linux-arm-kernel, Nate Watterson
On 2/27/2017 2:54 PM, Jean-Philippe Brucker wrote:
> /* Initialise command lazily */
> + if (!cmd.opcode)
> + arm_smmu_atc_invalidate_to_cmd(smmu, iova, size, &cmd);
> +
> + spin_lock(&smmu_group->devices_lock);
> +
> + list_for_each_entry(master, &smmu_group->devices, group_head)
> + arm_smmu_atc_invalidate_master(master, &cmd);
> +
> + /*
> + * TODO: ensure we do a sync whenever we have sent ats_queue_depth
> + * invalidations to the same device.
> + */
> + arm_smmu_cmdq_issue_cmd(smmu, &sync_cmd);
> +
According to the PCIe spec, an ATS invalidation can take up to 90 seconds
to complete. How does the current code deal with this?
--
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Tomasz Nowicki @ 2017-05-10 12:54 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm, Catalin Marinas, Sinan Kaya, Will Deacon,
	Harv Abdulhamid, iommu, linux-pci, Bjorn Helgaas, David Woodhouse,
	linux-arm-kernel, Nate Watterson
Hi Jean,
On 27.02.2017 20:54, Jean-Philippe Brucker wrote:
> +/*
> + * Returns -ENOSYS if ATS is not supported either by the device or by the SMMU
> + */
> +static int arm_smmu_enable_ats(struct arm_smmu_master_data *master)
> +{
> + int ret;
> + size_t stu;
> + struct pci_dev *pdev;
> + struct arm_smmu_device *smmu = master->smmu;
> +
> + if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev))
> + return -ENOSYS;
> +
> + pdev = to_pci_dev(master->dev);
> +
> +#ifdef CONFIG_PCI_ATS
> + if (!pdev->ats_cap)
> + return -ENOSYS;
> +#else
> + return -ENOSYS;
> +#endif
Nit: This deserves to be another helper in ats.c like:
int pci_ats_supported(struct pci_dev *pdev)
{
	if (!pdev->ats_cap)
		return 0;

	return 1;
}
Thanks,
Tomasz
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Jean-Philippe Brucker @ 2017-05-10 13:35 UTC (permalink / raw)
To: Tomasz Nowicki
Cc: Lorenzo Pieralisi, Shanker Donthineni, kvm, Catalin Marinas,
Joerg Roedel, Sinan Kaya, Will Deacon, iommu, Harv Abdulhamid,
Alex Williamson, linux-pci, Bjorn Helgaas, Robin Murphy,
David Woodhouse, linux-arm-kernel, Nate Watterson
On 10/05/17 13:54, Tomasz Nowicki wrote:
> Hi Jean,
>
> On 27.02.2017 20:54, Jean-Philippe Brucker wrote:
>> +/*
>> + * Returns -ENOSYS if ATS is not supported either by the device or by
>> the SMMU
>> + */
>> +static int arm_smmu_enable_ats(struct arm_smmu_master_data *master)
>> +{
>> + int ret;
>> + size_t stu;
>> + struct pci_dev *pdev;
>> + struct arm_smmu_device *smmu = master->smmu;
>> +
>> + if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev))
>> + return -ENOSYS;
>> +
>> + pdev = to_pci_dev(master->dev);
>> +
>> +#ifdef CONFIG_PCI_ATS
>> + if (!pdev->ats_cap)
>> + return -ENOSYS;
>> +#else
>> + return -ENOSYS;
>> +#endif
>
> Nit: This deserves to be another helper in ats.c like:
>
> int pci_ats_supported(struct pci_dev *dev) {
> if (!pdev->ats_cap)
> return 0;
>
> return 1;
> }
Indeed, although in my next version I'll remove this check altogether.
Instead I now rely on pci_enable_ats to check for ats_cap (as discussed in
patch 3). The downside is that we can't distinguish between absence of ATS
and an error while enabling ATS, so we no longer print a message in the
latter case; we expect device drivers to check whether ATS is enabled.
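Roughly along these lines (a sketch of the direction only, not the actual
next version; it assumes pci_enable_ats() rejects devices that lack an ATS
capability):

	/*
	 * pci_enable_ats() fails if the device has no ATS capability; either
	 * way we stay silent and let drivers check pdev->ats_enabled.
	 */
	pci_enable_ats(pdev, stu);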
Thanks,
Jean
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
From: Leizhen (ThunderTown) @ 2017-05-23 8:41 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Lorenzo Pieralisi, Shanker Donthineni, kvm, Catalin Marinas,
Joerg Roedel, Sinan Kaya, Will Deacon, iommu, Harv Abdulhamid,
Alex Williamson, linux-pci, Bjorn Helgaas, Robin Murphy,
David Woodhouse, linux-arm-kernel, Nate Watterson, LinuxArm
On 2017/2/28 3:54, Jean-Philippe Brucker wrote:
> PCIe devices can implement their own TLB, named Address Translation Cache
> (ATC). Steps involved in the use and maintenance of such caches are:
>
> * Device sends an Address Translation Request for a given IOVA to the
> IOMMU. If the translation succeeds, the IOMMU returns the corresponding
> physical address, which is stored in the device's ATC.
>
> * Device can then use the physical address directly in a transaction.
> A PCIe device does so by setting the TLP AT field to 0b10 - translated.
> The SMMU might check that the device is allowed to send translated
> transactions, and let it pass through.
>
> * When an address is unmapped, CPU sends a CMD_ATC_INV command to the
> SMMU, that is relayed to the device.
>
> In theory, this doesn't require a lot of software intervention. The IOMMU
> driver needs to enable ATS when adding a PCI device, and send an
> invalidation request when unmapping. Note that this invalidation is
> allowed to take up to a minute, according to the PCIe spec. In
> addition, the invalidation queue on the ATC side is fairly small, 32 by
> default, so we cannot keep many invalidations in flight (see ATS spec
> section 3.5, Invalidate Flow Control).
>
> Handling these constraints properly would require to postpone
> invalidations, and keep the stale mappings until we're certain that all
> devices forgot about them. This requires major work in the page table
> managers, and is therefore not done by this patch.
>
> Range calculation
> -----------------
>
> The invalidation packet itself is a bit awkward: range must be naturally
> aligned, which means that the start address is a multiple of the range
> size. In addition, the size must be a power of two number of 4k pages. We
> have a few options to enforce this constraint:
>
> (1) Find the smallest naturally aligned region that covers the requested
> range. This is simple to compute and only takes one ATC_INV, but it
> will spill on lots of neighbouring ATC entries.
>
> (2) Align the start address to the region size (rounded up to a power of
> two), and send a second invalidation for the next range of the same
> size. Still not great, but reduces spilling.
>
> (3) Cover the range exactly with the smallest number of naturally aligned
> regions. This would be interesting to implement but as for (2),
> requires multiple ATC_INV.
>
> As I suspect ATC invalidation packets will be a very scarce resource,
> we'll go with option (1) for now, and only send one big invalidation.
>
> Note that with io-pgtable, the unmap function is called for each page, so
> this doesn't matter. The problem shows up when sharing page tables with
> the MMU.
Supposing this is true, I'd like to choose option (2): the worst cases of
both (1) and (2) will not happen in practice, and the code for (2) will look
clearer. (2) is also technically more acceptable.
>
> Locking
> -------
>
> The atc_invalidate function is called from arm_smmu_unmap, with pgtbl_lock
> held (hardirq-safe). When sharing page tables with the MMU, we will have a
> few more call sites:
>
> * When unbinding an address space from a device, to invalidate the whole
> address space.
> * When a task bound to a device does an mlock, munmap, etc. This comes
> from an MMU notifier, with mmap_sem and pte_lock held.
>
> Given this, all locks take on the ATC invalidation path must be hardirq-
> safe.
>
> Timeout
> -------
>
> Some SMMU implementations will raise a CERROR_ATC_INV_SYNC when a CMD_SYNC
> fails because of an ATC invalidation. Some will just fail the CMD_SYNC.
> Others might let CMD_SYNC complete and have an asynchronous IMPDEF
> mechanism to record the error. When we receive a CERROR_ATC_INV_SYNC, we
> could retry sending all ATC_INV since last successful CMD_SYNC. When a
> CMD_SYNC fails without CERROR_ATC_INV_SYNC, we could retry sending *all*
> commands since last successful CMD_SYNC. This patch doesn't properly
> handle timeout, and ignores devices that don't behave. It might lead to
> memory corruption.
>
> Optional support
> ----------------
>
> For the moment, enable ATS whenever a device advertises it. Later, we
> might want to allow users to opt-in for the whole system or individual
> devices via sysfs or cmdline. Some firmware interfaces also provide a
> description of ATS capabilities in the root complex, and we might want to
> add a similar capability in DT. For instance, the following could be added
> to bindings/pci/pci-iommu.txt, as an optional property to PCI RC:
>
> - ats-map: describe Address Translation Service support by the root
> complex. This property is an arbitrary number of tuples of
> (rid-base,length). Any RID in this interval is allowed to issue address
> translation requests.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> ---
> drivers/iommu/arm-smmu-v3.c | 262 ++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 250 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 69d00416990d..e7b940146ae3 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -35,6 +35,7 @@
> #include <linux/of_iommu.h>
> #include <linux/of_platform.h>
> #include <linux/pci.h>
> +#include <linux/pci-ats.h>
> #include <linux/platform_device.h>
>
> #include <linux/amba/bus.h>
> @@ -102,6 +103,7 @@
> #define IDR5_OAS_48_BIT (5 << IDR5_OAS_SHIFT)
>
> #define ARM_SMMU_CR0 0x20
> +#define CR0_ATSCHK (1 << 4)
> #define CR0_CMDQEN (1 << 3)
> #define CR0_EVTQEN (1 << 2)
> #define CR0_PRIQEN (1 << 1)
> @@ -343,6 +345,7 @@
> #define CMDQ_ERR_CERROR_NONE_IDX 0
> #define CMDQ_ERR_CERROR_ILL_IDX 1
> #define CMDQ_ERR_CERROR_ABT_IDX 2
> +#define CMDQ_ERR_CERROR_ATC_INV_IDX 3
>
> #define CMDQ_0_OP_SHIFT 0
> #define CMDQ_0_OP_MASK 0xffUL
> @@ -364,6 +367,15 @@
> #define CMDQ_TLBI_1_VA_MASK ~0xfffUL
> #define CMDQ_TLBI_1_IPA_MASK 0xfffffffff000UL
>
> +#define CMDQ_ATC_0_SSID_SHIFT 12
> +#define CMDQ_ATC_0_SSID_MASK 0xfffffUL
> +#define CMDQ_ATC_0_SID_SHIFT 32
> +#define CMDQ_ATC_0_SID_MASK 0xffffffffUL
> +#define CMDQ_ATC_0_GLOBAL (1UL << 9)
> +#define CMDQ_ATC_1_SIZE_SHIFT 0
> +#define CMDQ_ATC_1_SIZE_MASK 0x3fUL
> +#define CMDQ_ATC_1_ADDR_MASK ~0xfffUL
> +
> #define CMDQ_PRI_0_SSID_SHIFT 12
> #define CMDQ_PRI_0_SSID_MASK 0xfffffUL
> #define CMDQ_PRI_0_SID_SHIFT 32
> @@ -417,6 +429,11 @@ module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO);
> MODULE_PARM_DESC(disable_bypass,
> "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU.");
>
> +static bool disable_ats_check;
> +module_param_named(disable_ats_check, disable_ats_check, bool, S_IRUGO);
> +MODULE_PARM_DESC(disable_ats_check,
> + "By default, the SMMU checks whether each incoming transaction marked as translated is allowed by the stream configuration. This option disables the check.");
> +
> enum pri_resp {
> PRI_RESP_DENY,
> PRI_RESP_FAIL,
> @@ -485,6 +502,15 @@ struct arm_smmu_cmdq_ent {
> u64 addr;
> } tlbi;
>
> + #define CMDQ_OP_ATC_INV 0x40
> + struct {
> + u32 sid;
> + u32 ssid;
> + u64 addr;
> + u8 size;
> + bool global;
> + } atc;
> +
> #define CMDQ_OP_PRI_RESP 0x41
> struct {
> u32 sid;
> @@ -662,6 +688,8 @@ struct arm_smmu_group {
>
> struct list_head devices;
> spinlock_t devices_lock;
> +
> + bool ats_enabled;
> };
>
> struct arm_smmu_option_prop {
> @@ -839,6 +867,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
> case CMDQ_OP_TLBI_S12_VMALL:
> cmd[0] |= (u64)ent->tlbi.vmid << CMDQ_TLBI_0_VMID_SHIFT;
> break;
> + case CMDQ_OP_ATC_INV:
> + cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0;
> + cmd[0] |= ent->atc.global ? CMDQ_ATC_0_GLOBAL : 0;
> + cmd[0] |= ent->atc.ssid << CMDQ_ATC_0_SSID_SHIFT;
> + cmd[0] |= (u64)ent->atc.sid << CMDQ_ATC_0_SID_SHIFT;
> + cmd[1] |= ent->atc.size << CMDQ_ATC_1_SIZE_SHIFT;
> + cmd[1] |= ent->atc.addr & CMDQ_ATC_1_ADDR_MASK;
> + break;
> case CMDQ_OP_PRI_RESP:
> cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0;
> cmd[0] |= ent->pri.ssid << CMDQ_PRI_0_SSID_SHIFT;
> @@ -874,6 +910,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
> [CMDQ_ERR_CERROR_NONE_IDX] = "No error",
> [CMDQ_ERR_CERROR_ILL_IDX] = "Illegal command",
> [CMDQ_ERR_CERROR_ABT_IDX] = "Abort on command fetch",
> + [CMDQ_ERR_CERROR_ATC_INV_IDX] = "ATC invalidate timeout",
> };
>
> int i;
> @@ -893,6 +930,13 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
> dev_err(smmu->dev, "retrying command fetch\n");
> case CMDQ_ERR_CERROR_NONE_IDX:
> return;
> + case CMDQ_ERR_CERROR_ATC_INV_IDX:
> + /*
> + * CMD_SYNC failed because of ATC Invalidation completion
> + * timeout. CONS is still pointing at the CMD_SYNC. Ensure other
> + * operations complete by re-submitting the CMD_SYNC, cowardly
> + * ignoring the ATC error.
> + */
> case CMDQ_ERR_CERROR_ILL_IDX:
> /* Fallthrough */
> default:
> @@ -1084,9 +1128,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
> STRTAB_STE_1_S1C_CACHE_WBRA
> << STRTAB_STE_1_S1COR_SHIFT |
> STRTAB_STE_1_S1C_SH_ISH << STRTAB_STE_1_S1CSH_SHIFT |
> -#ifdef CONFIG_PCI_ATS
> - STRTAB_STE_1_EATS_TRANS << STRTAB_STE_1_EATS_SHIFT |
> -#endif
> STRTAB_STE_1_STRW_NSEL1 << STRTAB_STE_1_STRW_SHIFT);
>
> if (smmu->features & ARM_SMMU_FEAT_STALLS)
> @@ -1115,6 +1156,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
> val |= STRTAB_STE_0_CFG_S2_TRANS;
> }
>
> + if (IS_ENABLED(CONFIG_PCI_ATS) && !ste_live)
> + dst[1] |= cpu_to_le64(STRTAB_STE_1_EATS_TRANS
> + << STRTAB_STE_1_EATS_SHIFT);
> +
> arm_smmu_sync_ste_for_sid(smmu, sid);
> dst[0] = cpu_to_le64(val);
> arm_smmu_sync_ste_for_sid(smmu, sid);
> @@ -1377,6 +1422,120 @@ static const struct iommu_gather_ops arm_smmu_gather_ops = {
> .tlb_sync = arm_smmu_tlb_sync,
> };
>
> +static void arm_smmu_atc_invalidate_to_cmd(struct arm_smmu_device *smmu,
> + unsigned long iova, size_t size,
> + struct arm_smmu_cmdq_ent *cmd)
> +{
> + size_t log2_span;
> + size_t span_mask;
> + size_t smmu_grain;
> + /* ATC invalidates are always on 4096 bytes pages */
> + size_t inval_grain_shift = 12;
> + unsigned long iova_start, iova_end;
> + unsigned long page_start, page_end;
> +
> + smmu_grain = 1ULL << __ffs(smmu->pgsize_bitmap);
> +
> + /* In case parameters are not aligned on PAGE_SIZE */
> + iova_start = round_down(iova, smmu_grain);
> + iova_end = round_up(iova + size, smmu_grain) - 1;
> +
> + page_start = iova_start >> inval_grain_shift;
> + page_end = iova_end >> inval_grain_shift;
> +
> + /*
> + * Find the smallest power of two that covers the range. Most
> + * significant differing bit between start and end address indicates the
> + * required span, ie. fls(start ^ end). For example:
> + *
> + * We want to invalidate pages [8; 11]. This is already the ideal range:
> + * x = 0b1000 ^ 0b1011 = 0b11
> + * span = 1 << fls(x) = 4
> + *
> + * To invalidate pages [7; 10], we need to invalidate [0; 15]:
> + * x = 0b0111 ^ 0b1010 = 0b1101
> + * span = 1 << fls(x) = 16
> + */
> + log2_span = fls_long(page_start ^ page_end);
> + span_mask = (1ULL << log2_span) - 1;
> +
> + page_start &= ~span_mask;
In my opinion, the below (option 2) is more readable:

	end = iova + size;
	size = max(size, smmu_grain);
	size = roundup_pow_of_two(size);
	start = iova & ~(size - 1);

	if (end < (start + size))
		; /* all included in (start, size) */
	else if (!(start & (2 * size - 1)))
		size <<= 1; /* start aligned on a (2 * size) boundary: double size */
	else
		; /* send two invalidate commands: (start, size) and (start + size, size) */
> +
> + *cmd = (struct arm_smmu_cmdq_ent) {
> + .opcode = CMDQ_OP_ATC_INV,
> + .atc = {
> + .addr = page_start << inval_grain_shift,
> + .size = log2_span,
> + }
> + };
> +}
> +
> +static int arm_smmu_atc_invalidate_master(struct arm_smmu_master_data *master,
> + struct arm_smmu_cmdq_ent *cmd)
> +{
> + int i;
> + struct iommu_fwspec *fwspec = master->dev->iommu_fwspec;
> + struct pci_dev *pdev = to_pci_dev(master->dev);
> +
> + if (!pdev->ats_enabled)
> + return 0;
> +
> + for (i = 0; i < fwspec->num_ids; i++) {
> + cmd->atc.sid = fwspec->ids[i];
> +
> + dev_dbg(master->smmu->dev,
> + "ATC invalidate %#x:%#x:%#llx-%#llx, esz=%d\n",
> + cmd->atc.sid, cmd->atc.ssid, cmd->atc.addr,
> + cmd->atc.addr + (1 << (cmd->atc.size + 12)) - 1,
> + cmd->atc.size);
> +
> + arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
> + }
> +
> + return 0;
> +}
> +
> +static size_t arm_smmu_atc_invalidate_domain(struct arm_smmu_domain *smmu_domain,
> + unsigned long iova, size_t size)
> +{
> + unsigned long flags;
> + struct arm_smmu_cmdq_ent cmd = {0};
> + struct arm_smmu_group *smmu_group;
> + struct arm_smmu_master_data *master;
> + struct arm_smmu_device *smmu = smmu_domain->smmu;
> + struct arm_smmu_cmdq_ent sync_cmd = {
> + .opcode = CMDQ_OP_CMD_SYNC,
> + };
> +
> + spin_lock_irqsave(&smmu_domain->groups_lock, flags);
> +
> + list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
> + if (!smmu_group->ats_enabled)
> + continue;
> +
> + /* Initialise command lazily */
> + if (!cmd.opcode)
> + arm_smmu_atc_invalidate_to_cmd(smmu, iova, size, &cmd);
> +
> + spin_lock(&smmu_group->devices_lock);
> +
> + list_for_each_entry(master, &smmu_group->devices, group_head)
> + arm_smmu_atc_invalidate_master(master, &cmd);
> +
> + /*
> + * TODO: ensure we do a sync whenever we have sent ats_queue_depth
> + * invalidations to the same device.
> + */
> + arm_smmu_cmdq_issue_cmd(smmu, &sync_cmd);
> +
> + spin_unlock(&smmu_group->devices_lock);
> + }
> +
> + spin_unlock_irqrestore(&smmu_domain->groups_lock, flags);
> +
> + return size;
> +}
> +
> /* IOMMU API */
> static bool arm_smmu_capable(enum iommu_cap cap)
> {
> @@ -1782,7 +1941,10 @@ arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
>
> spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags);
> ret = ops->unmap(ops, iova, size);
> + if (ret)
> + ret = arm_smmu_atc_invalidate_domain(smmu_domain, iova, size);
> spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags);
> +
> return ret;
> }
>
> @@ -1830,11 +1992,63 @@ static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
> return sid < limit;
> }
>
> +/*
> + * Returns -ENOSYS if ATS is not supported either by the device or by the SMMU
> + */
> +static int arm_smmu_enable_ats(struct arm_smmu_master_data *master)
> +{
> + int ret;
> + size_t stu;
> + struct pci_dev *pdev;
> + struct arm_smmu_device *smmu = master->smmu;
> +
> + if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev))
> + return -ENOSYS;
> +
> + pdev = to_pci_dev(master->dev);
> +
> +#ifdef CONFIG_PCI_ATS
> + if (!pdev->ats_cap)
> + return -ENOSYS;
> +#else
> + return -ENOSYS;
> +#endif
> +
> + /* Smallest Translation Unit: log2 of the smallest supported granule */
> + stu = __ffs(smmu->pgsize_bitmap);
> +
> + ret = pci_enable_ats(pdev, stu);
> + if (ret) {
> + dev_err(&pdev->dev, "cannot enable ATS: %d\n", ret);
> + return ret;
> + }
> +
> + dev_dbg(&pdev->dev, "enabled ATS with STU = %zu\n", stu);
> +
> + return 0;
> +}
> +
> +static void arm_smmu_disable_ats(struct arm_smmu_master_data *master)
> +{
> + struct pci_dev *pdev;
> +
> + if (!dev_is_pci(master->dev))
> + return;
> +
> + pdev = to_pci_dev(master->dev);
> +
> + if (!pdev->ats_enabled)
> + return;
> +
> + pci_disable_ats(pdev);
> +}
> +
> static struct iommu_ops arm_smmu_ops;
>
> static int arm_smmu_add_device(struct device *dev)
> {
> int i, ret;
> + bool ats_enabled;
> unsigned long flags;
> struct arm_smmu_device *smmu;
> struct arm_smmu_group *smmu_group;
> @@ -1880,19 +2094,31 @@ static int arm_smmu_add_device(struct device *dev)
> }
> }
>
> + ats_enabled = !arm_smmu_enable_ats(master);
> +
> group = iommu_group_get_for_dev(dev);
> - if (!IS_ERR(group)) {
> - smmu_group = to_smmu_group(group);
> + if (IS_ERR(group)) {
> + ret = PTR_ERR(group);
> + goto err_disable_ats;
> + }
>
> - spin_lock_irqsave(&smmu_group->devices_lock, flags);
> - list_add(&master->group_head, &smmu_group->devices);
> - spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
> + smmu_group = to_smmu_group(group);
>
> - iommu_group_put(group);
> - iommu_device_link(&smmu->iommu, dev);
> - }
> + smmu_group->ats_enabled |= ats_enabled;
>
> - return PTR_ERR_OR_ZERO(group);
> + spin_lock_irqsave(&smmu_group->devices_lock, flags);
> + list_add(&master->group_head, &smmu_group->devices);
> + spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
> +
> + iommu_group_put(group);
> + iommu_device_link(&smmu->iommu, dev);
> +
> + return 0;
> +
> +err_disable_ats:
> + arm_smmu_disable_ats(master);
> +
> + return ret;
> }
>
> static void arm_smmu_remove_device(struct device *dev)
> @@ -1921,6 +2147,8 @@ static void arm_smmu_remove_device(struct device *dev)
> spin_unlock_irqrestore(&smmu_group->devices_lock, flags);
>
> iommu_group_put(group);
> +
> + arm_smmu_disable_ats(master);
> }
>
> iommu_group_remove_device(dev);
> @@ -2485,6 +2713,16 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
> }
> }
>
> + if (smmu->features & ARM_SMMU_FEAT_ATS && !disable_ats_check) {
> + enables |= CR0_ATSCHK;
> + ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0,
> + ARM_SMMU_CR0ACK);
> + if (ret) {
> + dev_err(smmu->dev, "failed to enable ATS check\n");
> + return ret;
> + }
> + }
> +
> ret = arm_smmu_setup_irqs(smmu);
> if (ret) {
> dev_err(smmu->dev, "failed to setup irqs\n");
>
--
Thanks!
Best Regards
^ permalink raw reply [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
2017-05-23 8:41 ` Leizhen (ThunderTown)
@ 2017-05-23 11:21 ` Jean-Philippe Brucker
2017-05-25 18:27 ` Roy Franz (Cavium)
0 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-05-23 11:21 UTC (permalink / raw)
To: Leizhen (ThunderTown)
Cc: Lorenzo Pieralisi, Shanker Donthineni, kvm, Catalin Marinas,
Joerg Roedel, Sinan Kaya, Will Deacon, iommu, Harv Abdulhamid,
Alex Williamson, linux-pci, Bjorn Helgaas, Robin Murphy,
David Woodhouse, linux-arm-kernel, Nate Watterson, LinuxArm
On 23/05/17 09:41, Leizhen (ThunderTown) wrote:
> On 2017/2/28 3:54, Jean-Philippe Brucker wrote:
>> PCIe devices can implement their own TLB, named Address Translation Cache
>> (ATC). Steps involved in the use and maintenance of such caches are:
>>
>> * Device sends an Address Translation Request for a given IOVA to the
>> IOMMU. If the translation succeeds, the IOMMU returns the corresponding
>> physical address, which is stored in the device's ATC.
>>
>> * Device can then use the physical address directly in a transaction.
>> A PCIe device does so by setting the TLP AT field to 0b10 - translated.
>> The SMMU might check that the device is allowed to send translated
>> transactions, and let it pass through.
>>
>> * When an address is unmapped, CPU sends a CMD_ATC_INV command to the
>> SMMU, that is relayed to the device.
>>
>> In theory, this doesn't require a lot of software intervention. The IOMMU
>> driver needs to enable ATS when adding a PCI device, and send an
>> invalidation request when unmapping. Note that this invalidation is
>> allowed to take up to a minute, according to the PCIe spec. In
>> addition, the invalidation queue on the ATC side is fairly small, 32 by
>> default, so we cannot keep many invalidations in flight (see ATS spec
>> section 3.5, Invalidate Flow Control).
>>
>> Handling these constraints properly would require to postpone
>> invalidations, and keep the stale mappings until we're certain that all
>> devices forgot about them. This requires major work in the page table
>> managers, and is therefore not done by this patch.
>>
>> Range calculation
>> -----------------
>>
>> The invalidation packet itself is a bit awkward: range must be naturally
>> aligned, which means that the start address is a multiple of the range
>> size. In addition, the size must be a power of two number of 4k pages. We
>> have a few options to enforce this constraint:
>>
>> (1) Find the smallest naturally aligned region that covers the requested
>> range. This is simple to compute and only takes one ATC_INV, but it
>> will spill on lots of neighbouring ATC entries.
>>
>> (2) Align the start address to the region size (rounded up to a power of
>> two), and send a second invalidation for the next range of the same
>> size. Still not great, but reduces spilling.
>>
>> (3) Cover the range exactly with the smallest number of naturally aligned
>> regions. This would be interesting to implement but as for (2),
>> requires multiple ATC_INV.
>>
>> As I suspect ATC invalidation packets will be a very scarce resource,
>> we'll go with option (1) for now, and only send one big invalidation.
>>
>> Note that with io-pgtable, the unmap function is called for each page, so
>> this doesn't matter. The problem shows up when sharing page tables with
>> the MMU.
> Supposing this is true, I'd like to choose option (2). The worst cases of
> both (1) and (2) will not happen, but the code for (2) will look clearer,
> and (2) is technically more acceptable.
I agree that (2) is a bit clearer, but the question is one of performance
rather than readability. I'd like to see some benchmarks, or experiment on
my own, before switching to a two-invalidation system.
Intuitively, one big invalidation will result in more ATC thrashing and will
bring overall device performance down. But then, according to the PCI spec,
ATC invalidations are grossly expensive: they have an upper bound of a
minute. I agree that this is highly improbable and might depend on the
range size, but purely from an architectural standpoint, reducing the
number of ATC invalidation requests is the priority, because their cost is
much worse than any performance slow-down incurred by ATC thrashing. And
for the moment I can only base my decisions on the architecture.
So I'd like to keep (1) for now, and update it to (2) (or even (3)) once
we have more hardware to experiment with.
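To put numbers on it, take the example from the commit message: to
invalidate pages [7; 10], option (1) computes fls(7 ^ 10) = 4 and sends one
command covering the 16 pages [0; 15], while option (2) rounds the size up
to 4 pages, aligns the start down to page 4, and sends two commands covering
[4; 7] and [8; 11] - half the spill, at the price of an extra ATC_INV.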
Thanks,
Jean
^ permalink raw reply [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS
2017-05-23 11:21 ` Jean-Philippe Brucker
@ 2017-05-25 18:27 ` Roy Franz (Cavium)
0 siblings, 0 replies; 103+ messages in thread
From: Roy Franz (Cavium) @ 2017-05-25 18:27 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Leizhen (ThunderTown), Lorenzo Pieralisi, Shanker Donthineni, kvm,
Catalin Marinas, Joerg Roedel, Sinan Kaya, Will Deacon,
Alex Williamson, Harv Abdulhamid, LinuxArm, iommu, linux-pci,
Bjorn Helgaas, Robin Murphy, David Woodhouse, linux-arm-kernel,
Nate Watterson
On Tue, May 23, 2017 at 4:21 AM, Jean-Philippe Brucker
<jean-philippe.brucker@arm.com> wrote:
> On 23/05/17 09:41, Leizhen (ThunderTown) wrote:
>> On 2017/2/28 3:54, Jean-Philippe Brucker wrote:
>>> PCIe devices can implement their own TLB, named Address Translation Cache
>>> (ATC). Steps involved in the use and maintenance of such caches are:
>>>
>>> * Device sends an Address Translation Request for a given IOVA to the
>>> IOMMU. If the translation succeeds, the IOMMU returns the corresponding
>>> physical address, which is stored in the device's ATC.
>>>
>>> * Device can then use the physical address directly in a transaction.
>>> A PCIe device does so by setting the TLP AT field to 0b10 - translated.
>>> The SMMU might check that the device is allowed to send translated
>>> transactions, and let it pass through.
>>>
>>> * When an address is unmapped, CPU sends a CMD_ATC_INV command to the
>>> SMMU, that is relayed to the device.
>>>
>>> In theory, this doesn't require a lot of software intervention. The IOMMU
>>> driver needs to enable ATS when adding a PCI device, and send an
>>> invalidation request when unmapping. Note that this invalidation is
>>> allowed to take up to a minute, according to the PCIe spec. In
>>> addition, the invalidation queue on the ATC side is fairly small, 32 by
>>> default, so we cannot keep many invalidations in flight (see ATS spec
>>> section 3.5, Invalidate Flow Control).
>>>
>>> Handling these constraints properly would require to postpone
>>> invalidations, and keep the stale mappings until we're certain that all
>>> devices forgot about them. This requires major work in the page table
>>> managers, and is therefore not done by this patch.
>>>
>>> Range calculation
>>> -----------------
>>>
>>> The invalidation packet itself is a bit awkward: range must be naturally
>>> aligned, which means that the start address is a multiple of the range
>>> size. In addition, the size must be a power of two number of 4k pages. We
>>> have a few options to enforce this constraint:
>>>
>>> (1) Find the smallest naturally aligned region that covers the requested
>>> range. This is simple to compute and only takes one ATC_INV, but it
>>> will spill on lots of neighbouring ATC entries.
>>>
>>> (2) Align the start address to the region size (rounded up to a power of
>>> two), and send a second invalidation for the next range of the same
>>> size. Still not great, but reduces spilling.
>>>
>>> (3) Cover the range exactly with the smallest number of naturally aligned
>>> regions. This would be interesting to implement but as for (2),
>>> requires multiple ATC_INV.
>>>
>>> As I suspect ATC invalidation packets will be a very scarce resource,
>>> we'll go with option (1) for now, and only send one big invalidation.
>>>
>>> Note that with io-pgtable, the unmap function is called for each page, so
>>> this doesn't matter. The problem shows up when sharing page tables with
>>> the MMU.
>> Supposing this is true, I'd like to choose option (2). The worst cases of
>> both (1) and (2) will not happen, but the code for (2) will look clearer,
>> and (2) is technically more acceptable.
>
> I agree that (2) is a bit clearer, but the question is of performance
> rather than readability. I'd like to see some benchmarks or experiment on
> my own before switching to a two-invalidation system.
>
> Intuitively one big invalidation will result in more ATC trashing and will
> bring overall device performance down. But then according to the PCI spec,
> ATC invalidations are grossly expensive, they have an upper bound of a
> minute. I agree that this is highly improbable and might depend on the
> range size, but purely from an architectural standpoint, reducing the
> number of ATC invalidation requests is the priority, because this is much
> worse than any performance slow-down incurred by ATC trashing. And for the
> moment I can only base my decisions on the architecture.
>
> So I'd like to keep (1) for now, and update it to (2) (or even (3)) once
> we have more hardware to experiment with.
>
> Thanks,
> Jean
>
I think (1) is a good place to start, as the same restricted encoding that
is used in the invalidations is also used in the translation responses - all
of the ATC entries were created with regions described this way. We may
still end up with nothing but STU-sized ATC entries, as TAs are free to
respond to large translation requests with multiple STU-sized translations,
and in some cases this is the best that they can do. Picking the optimal
strategy will depend on hardware, and maybe workload as well.
Thanks,
Roy
^ permalink raw reply [flat|nested] 103+ messages in thread
* [RFC PATCH 05/30] iommu/arm-smmu-v3: Disable tagged pointers when ATS is in use
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (3 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-05-22 6:27 ` Leizhen (ThunderTown)
2017-02-27 19:54 ` [RFC PATCH 06/30] iommu/arm-smmu-v3: Add support for Substream IDs Jean-Philippe Brucker
` (25 subsequent siblings)
30 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
The ARM architecture has a "Top Byte Ignore" (TBI) option that makes the
MMU mask out bits [63:56] of an address, allowing a userspace application
to store data in its pointers.
The ATS doesn't have an architected mechanism to enable TBI, and might
create ATC entries for addresses that include a tag. Software would then
have to send ATC invalidation packets for each of the 255 possible aliases of an
address, or just wipe the whole address space. This is not a viable
option, so disable TBI when ATS is in use.
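For illustration, the tag in question is just the top byte of the virtual
address. A minimal sketch of the masking that TBI performs (the helper name
is made up, this is not part of the patch):

	/* With TBI, the MMU ignores VA bits [63:56] when translating */
	#define TAG_MASK	(0xffULL << 56)

	static inline u64 va_strip_tag(u64 va)
	{
		return va & ~TAG_MASK;
	}

The ATC has no equivalent mechanism, so an entry created for a tagged
address would not be hit by an invalidation of the untagged one, hence the
255 aliases mentioned above.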
It is unclear for the moment how this restriction will affect user
applications. One example I can imagine (with my complete lack of
knowledge about JIT engines and their use of tagged pointers) is a JIT
translating a WebCL application that uses SVM. Since this kind of
interpreted language doesn't expose addresses, the interpreter and SVM
implementations will be given the opportunity to do the right thing and
remove tags before handing pointers to devices.
Ideally we should remove TBI only for domains that are likely to use
ATS. But at the point we're writing the context descriptor of a domain, we
are still probing the first device in the group. If that device doesn't
have an ATC but the next one does, we would then have to clear the TBI
bit, invalidate the ATCs of previous devices, and notify the drivers that
their devices cannot use tagged pointers anymore. Such a level of
complexity doesn't make any sense here since it is unlikely devices will
use tagged pointers at all.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index e7b940146ae3..06b29d4fcf65 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -987,7 +987,7 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
}
/* Context descriptor manipulation functions */
-static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
+static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
{
u64 val = 0;
@@ -1000,7 +1000,8 @@ static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
val |= ARM_SMMU_TCR2CD(tcr, EPD0);
val |= ARM_SMMU_TCR2CD(tcr, EPD1);
val |= ARM_SMMU_TCR2CD(tcr, IPS);
- val |= ARM_SMMU_TCR2CD(tcr, TBI0);
+ if (!(smmu->features & ARM_SMMU_FEAT_ATS))
+ val |= ARM_SMMU_TCR2CD(tcr, TBI0);
return val;
}
@@ -1014,7 +1015,7 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu,
* We don't need to issue any invalidation here, as we'll invalidate
* the STE when installing the new entry anyway.
*/
- val = arm_smmu_cpu_tcr_to_cd(cfg->cd.tcr) |
+ val = arm_smmu_cpu_tcr_to_cd(smmu, cfg->cd.tcr) |
#ifdef __BIG_ENDIAN
CTXDESC_CD_0_ENDI |
#endif
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 05/30] iommu/arm-smmu-v3: Disable tagged pointers when ATS is in use
2017-02-27 19:54 ` [RFC PATCH 05/30] iommu/arm-smmu-v3: Disable tagged pointers when ATS is in use Jean-Philippe Brucker
@ 2017-05-22 6:27 ` Leizhen (ThunderTown)
2017-05-22 14:02 ` Jean-Philippe Brucker
0 siblings, 1 reply; 103+ messages in thread
From: Leizhen (ThunderTown) @ 2017-05-22 6:27 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Lorenzo Pieralisi, Shanker Donthineni, kvm, Catalin Marinas,
Joerg Roedel, Sinan Kaya, Will Deacon, iommu, Harv Abdulhamid,
Alex Williamson, linux-pci, Bjorn Helgaas, Robin Murphy,
David Woodhouse, linux-arm-kernel, Nate Watterson, LinuxArm
On 2017/2/28 3:54, Jean-Philippe Brucker wrote:
> The ARM architecture has a "Top Byte Ignore" (TBI) option that makes the
> MMU mask out bits [63:56] of an address, allowing a userspace application
> to store data in its pointers.
>
> The ATS doesn't have an architected mechanism to enable TBI, and might
> create ATC entries for addresses that include a tag. Software would then
> have to send ATC invalidation packets for each 255 possible alias of an
> address, or just wipe the whole address space. This is not a viable
> option, so disable TBI when ATS is in use.
>
> It is unclear for the moment how this restriction will affect user
> applications. One example I can imagine (with my complete lack of
> knowledge about JIT engines and their use of tagged pointers) is a JIT
> translating a WebCL applications that uses SVM. Since this kind of
> interpreted language doesn't expose addresses, the interpreter and SVM
> implementations will be given the opportunity to do the right thing and
> remove tags before handing pointers to devices.
>
> Ideally we should remove TBI only for domains that are susceptible to use
> ATS. But at the point we're writing the context descriptor of a domain, we
> are still probing the first device in the group. If that device doesn't
> have an ATC but the next one does, we would then have to clear the TBI
> bit, invalidate the ATCs of previous devices, and notify the drivers that
> their devices cannot use tagged pointers anymore. Such a level of
> complexity doesn't make any sense here since it is unlikely devices will
> use tagged pointers at all.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> ---
> drivers/iommu/arm-smmu-v3.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index e7b940146ae3..06b29d4fcf65 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -987,7 +987,7 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> }
>
> /* Context descriptor manipulation functions */
> -static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
> +static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
> {
> u64 val = 0;
>
> @@ -1000,7 +1000,8 @@ static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
> val |= ARM_SMMU_TCR2CD(tcr, EPD0);
> val |= ARM_SMMU_TCR2CD(tcr, EPD1);
> val |= ARM_SMMU_TCR2CD(tcr, IPS);
> - val |= ARM_SMMU_TCR2CD(tcr, TBI0);
> + if (!(smmu->features & ARM_SMMU_FEAT_ATS))
> + val |= ARM_SMMU_TCR2CD(tcr, TBI0);
Maybe we should always disable TBI. Otherwise, a device behind an
ATS-supporting SMMU and one behind a non-supporting SMMU would have to use
different strategies.
>
> return val;
> }
> @@ -1014,7 +1015,7 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu,
> * We don't need to issue any invalidation here, as we'll invalidate
> * the STE when installing the new entry anyway.
> */
> - val = arm_smmu_cpu_tcr_to_cd(cfg->cd.tcr) |
> + val = arm_smmu_cpu_tcr_to_cd(smmu, cfg->cd.tcr) |
> #ifdef __BIG_ENDIAN
> CTXDESC_CD_0_ENDI |
> #endif
>
--
Thanks!
Best Regards
^ permalink raw reply [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 05/30] iommu/arm-smmu-v3: Disable tagged pointers when ATS is in use
2017-05-22 6:27 ` Leizhen (ThunderTown)
@ 2017-05-22 14:02 ` Jean-Philippe Brucker
0 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-05-22 14:02 UTC (permalink / raw)
To: Leizhen (ThunderTown)
Cc: Lorenzo Pieralisi, Shanker Donthineni, kvm, Catalin Marinas,
Joerg Roedel, Sinan Kaya, Will Deacon, iommu, Harv Abdulhamid,
Alex Williamson, linux-pci, Bjorn Helgaas, Robin Murphy,
David Woodhouse, linux-arm-kernel, Nate Watterson, LinuxArm
On 22/05/17 07:27, Leizhen (ThunderTown) wrote:
> On 2017/2/28 3:54, Jean-Philippe Brucker wrote:
>> The ARM architecture has a "Top Byte Ignore" (TBI) option that makes the
>> MMU mask out bits [63:56] of an address, allowing a userspace application
>> to store data in its pointers.
>>
>> The ATS doesn't have an architected mechanism to enable TBI, and might
>> create ATC entries for addresses that include a tag. Software would then
>> have to send ATC invalidation packets for each 255 possible alias of an
>> address, or just wipe the whole address space. This is not a viable
>> option, so disable TBI when ATS is in use.
>>
>> It is unclear for the moment how this restriction will affect user
>> applications. One example I can imagine (with my complete lack of
>> knowledge about JIT engines and their use of tagged pointers) is a JIT
>> translating a WebCL applications that uses SVM. Since this kind of
>> interpreted language doesn't expose addresses, the interpreter and SVM
>> implementations will be given the opportunity to do the right thing and
>> remove tags before handing pointers to devices.
>>
>> Ideally we should remove TBI only for domains that are susceptible to use
>> ATS. But at the point we're writing the context descriptor of a domain, we
>> are still probing the first device in the group. If that device doesn't
>> have an ATC but the next one does, we would then have to clear the TBI
>> bit, invalidate the ATCs of previous devices, and notify the drivers that
>> their devices cannot use tagged pointers anymore. Such a level of
>> complexity doesn't make any sense here since it is unlikely devices will
>> use tagged pointers at all.
>>
>> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
>> ---
>> drivers/iommu/arm-smmu-v3.c | 7 ++++---
>> 1 file changed, 4 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index e7b940146ae3..06b29d4fcf65 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -987,7 +987,7 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>> }
>>
>> /* Context descriptor manipulation functions */
>> -static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
>> +static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
>> {
>> u64 val = 0;
>>
>> @@ -1000,7 +1000,8 @@ static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr)
>> val |= ARM_SMMU_TCR2CD(tcr, EPD0);
>> val |= ARM_SMMU_TCR2CD(tcr, EPD1);
>> val |= ARM_SMMU_TCR2CD(tcr, IPS);
>> - val |= ARM_SMMU_TCR2CD(tcr, TBI0);
>> + if (!(smmu->features & ARM_SMMU_FEAT_ATS))
>> + val |= ARM_SMMU_TCR2CD(tcr, TBI0);
> Maybe we should always disable TBI. Otherwise, a device behind a ATS supported or
> non-supported SMMU should use different strategies.
I don't have any objection to this. Since ATS state shouldn't be visible
to userspace, the responsibility of sanitizing tagged pointers for DMA
already lies with the rare applications that use them. I haven't thought
about the stall handling code yet, but I'm sure there are creative ways to
break it using tagged pointers, and I'd rather not go down that rabbit hole.
I'll update this patch accordingly in next version.
Thanks,
Jean
>> return val;
>> }
>> @@ -1014,7 +1015,7 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu,
>> * We don't need to issue any invalidation here, as we'll invalidate
>> * the STE when installing the new entry anyway.
>> */
>> - val = arm_smmu_cpu_tcr_to_cd(cfg->cd.tcr) |
>> + val = arm_smmu_cpu_tcr_to_cd(smmu, cfg->cd.tcr) |
>> #ifdef __BIG_ENDIAN
>> CTXDESC_CD_0_ENDI |
>> #endif
>>
>
^ permalink raw reply [flat|nested] 103+ messages in thread
* [RFC PATCH 06/30] iommu/arm-smmu-v3: Add support for Substream IDs
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (4 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 05/30] iommu/arm-smmu-v3: Disable tagged pointers when ATS is in use Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 07/30] iommu/arm-smmu-v3: Add second level of context descriptor table Jean-Philippe Brucker
` (24 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
At the moment, the SMMUv3 driver offers only one stage-1 or stage-2
address space to each device, while the SMMUv3 architecture allows multiple
address spaces to be associated with a device. In addition to the Stream ID
(SID), which identifies a device, we can now have Substream IDs (SSIDs)
identifying an address space. In PCIe lingo, SID is called Requester ID
(RID) and SSID is called Process Address-Space ID (PASID).
Prepare the driver for SSID support by adding context descriptor tables
in STEs (previously a single static context descriptor). A complete
stage-1 walk is now performed like this by the SMMU:
         Stream tables         Ctx. tables          Page tables
          +--------+   ,------->+-------+   ,------->+-------+
          :        :   |        :       :   |        :       :
          +--------+   |        +-------+   |        +-------+
     SID->|  STE   |---'  SSID->|  CD   |---'  IOVA->|  PTE  |--> IPA
          +--------+            +-------+            +-------+
          :        :            :       :            :       :
          +--------+            +-------+            +-------+
Note that we only implement one level of context descriptor table for now,
but as with stream and page tables, an SSID can be split to target
multiple levels of tables.
In all stream table entries, we set S1DSS=SSID0 mode, which forces all
traffic lacking an SSID to be routed to context descriptor 0. Since we
allocate a single context descriptor per group for the moment, this patch
doesn't introduce any change in behavior.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 314 ++++++++++++++++++++++++++++++++++----------
1 file changed, 243 insertions(+), 71 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 06b29d4fcf65..f88d62025efa 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -246,6 +246,12 @@
#define STRTAB_STE_0_S1CDMAX_SHIFT 59
#define STRTAB_STE_0_S1CDMAX_MASK 0x1fUL
+#define STRTAB_STE_1_S1DSS_SHIFT 0
+#define STRTAB_STE_1_S1DSS_MASK 0x3UL
+#define STRTAB_STE_1_S1DSS_TERMINATE (0x0 << STRTAB_STE_1_S1DSS_SHIFT)
+#define STRTAB_STE_1_S1DSS_BYPASS (0x1 << STRTAB_STE_1_S1DSS_SHIFT)
+#define STRTAB_STE_1_S1DSS_SSID0 (0x2 << STRTAB_STE_1_S1DSS_SHIFT)
+
#define STRTAB_STE_1_S1C_CACHE_NC 0UL
#define STRTAB_STE_1_S1C_CACHE_WBRA 1UL
#define STRTAB_STE_1_S1C_CACHE_WT 2UL
@@ -351,10 +357,14 @@
#define CMDQ_0_OP_MASK 0xffUL
#define CMDQ_0_SSV (1UL << 11)
+#define CMDQ_PREFETCH_0_SSID_SHIFT 12
+#define CMDQ_PREFETCH_0_SSID_MASK 0xfffffUL
#define CMDQ_PREFETCH_0_SID_SHIFT 32
#define CMDQ_PREFETCH_1_SIZE_SHIFT 0
#define CMDQ_PREFETCH_1_ADDR_MASK ~0xfffUL
+#define CMDQ_CFGI_0_SSID_SHIFT 12
+#define CMDQ_CFGI_0_SSID_MASK 0xfffffUL
#define CMDQ_CFGI_0_SID_SHIFT 32
#define CMDQ_CFGI_0_SID_MASK 0xffffffffUL
#define CMDQ_CFGI_1_LEAF (1UL << 0)
@@ -475,14 +485,18 @@ struct arm_smmu_cmdq_ent {
#define CMDQ_OP_PREFETCH_CFG 0x1
struct {
u32 sid;
+ u32 ssid;
u8 size;
u64 addr;
} prefetch;
#define CMDQ_OP_CFGI_STE 0x3
#define CMDQ_OP_CFGI_ALL 0x4
+ #define CMDQ_OP_CFGI_CD 0x5
+ #define CMDQ_OP_CFGI_CD_ALL 0x6
struct {
u32 sid;
+ u32 ssid;
union {
bool leaf;
u8 span;
@@ -562,15 +576,10 @@ struct arm_smmu_strtab_l1_desc {
};
struct arm_smmu_s1_cfg {
- __le64 *cdptr;
- dma_addr_t cdptr_dma;
-
- struct arm_smmu_ctx_desc {
- u16 asid;
- u64 ttbr;
- u64 tcr;
- u64 mair;
- } cd;
+ u16 asid;
+ u64 ttbr;
+ u64 tcr;
+ u64 mair;
};
struct arm_smmu_s2_cfg {
@@ -579,10 +588,19 @@ struct arm_smmu_s2_cfg {
u64 vtcr;
};
+struct arm_smmu_cd_cfg {
+ __le64 *cdptr;
+ dma_addr_t cdptr_dma;
+
+ unsigned long *context_map;
+ size_t num_entries;
+};
+
struct arm_smmu_strtab_ent {
bool valid;
bool bypass; /* Overrides s1/s2 config */
+ struct arm_smmu_cd_cfg cd_cfg;
struct arm_smmu_s1_cfg *s1_cfg;
struct arm_smmu_s2_cfg *s2_cfg;
};
@@ -723,6 +741,24 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
} while (arm_smmu_options[++i].opt);
}
+static int arm_smmu_bitmap_alloc(unsigned long *map, int span)
+{
+ int idx, size = 1 << span;
+
+ do {
+ idx = find_first_zero_bit(map, size);
+ if (idx == size)
+ return -ENOSPC;
+ } while (test_and_set_bit(idx, map));
+
+ return idx;
+}
+
+static void arm_smmu_bitmap_free(unsigned long *map, int idx)
+{
+ clear_bit(idx, map);
+}
+
/* Low-level queue manipulation functions */
static bool queue_full(struct arm_smmu_queue *q)
{
@@ -839,14 +875,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
case CMDQ_OP_TLBI_NSNH_ALL:
break;
case CMDQ_OP_PREFETCH_CFG:
+ cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0;
cmd[0] |= (u64)ent->prefetch.sid << CMDQ_PREFETCH_0_SID_SHIFT;
+ cmd[0] |= ent->prefetch.ssid << CMDQ_PREFETCH_0_SSID_SHIFT;
cmd[1] |= ent->prefetch.size << CMDQ_PREFETCH_1_SIZE_SHIFT;
cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK;
break;
+ case CMDQ_OP_CFGI_CD:
+ cmd[0] |= ent->cfgi.ssid << CMDQ_CFGI_0_SSID_SHIFT;
+ /* pass through */
case CMDQ_OP_CFGI_STE:
cmd[0] |= (u64)ent->cfgi.sid << CMDQ_CFGI_0_SID_SHIFT;
cmd[1] |= ent->cfgi.leaf ? CMDQ_CFGI_1_LEAF : 0;
break;
+ case CMDQ_OP_CFGI_CD_ALL:
+ cmd[0] |= (u64)ent->cfgi.sid << CMDQ_CFGI_0_SID_SHIFT;
+ break;
case CMDQ_OP_CFGI_ALL:
/* Cover the entire SID range */
cmd[1] |= CMDQ_CFGI_1_RANGE_MASK << CMDQ_CFGI_1_RANGE_SHIFT;
@@ -987,6 +1031,29 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
}
/* Context descriptor manipulation functions */
+static void arm_smmu_sync_cd(struct arm_smmu_master_data *master, u32 ssid,
+ bool leaf)
+{
+ size_t i;
+ struct arm_smmu_device *smmu = master->smmu;
+ struct iommu_fwspec *fwspec = master->dev->iommu_fwspec;
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = CMDQ_OP_CFGI_CD,
+ .cfgi = {
+ .ssid = ssid,
+ .leaf = leaf,
+ },
+ };
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ cmd.cfgi.sid = fwspec->ids[i];
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ }
+
+ cmd.opcode = CMDQ_OP_CMD_SYNC;
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+}
+
static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
{
u64 val = 0;
@@ -1006,28 +1073,157 @@ static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
return val;
}
-static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu,
- struct arm_smmu_s1_cfg *cfg)
+static void arm_smmu_write_ctx_desc(struct arm_smmu_master_data *master,
+ u32 ssid, struct arm_smmu_s1_cfg *cfg)
{
u64 val;
+ bool cd_live;
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_cd_cfg *descs_cfg = &master->ste.cd_cfg;
+ __u64 *cdptr = (__u64 *)descs_cfg->cdptr + ssid * CTXDESC_CD_DWORDS;
/*
- * We don't need to issue any invalidation here, as we'll invalidate
- * the STE when installing the new entry anyway.
+ * This function handles the following cases:
+ *
+ * (1) Install primary CD, for normal DMA traffic (SSID = 0). In this
+ * case, invalidation is performed when installing the STE.
+ * (2) Install a secondary CD, for SID+SSID traffic, followed by an
+ * invalidation.
+ * (3) Update ASID of primary CD. This is allowed by atomically writing
+ * the first 64 bits of the CD, followed by invalidation of the old
+ * entry and mappings.
+ * (4) Remove a secondary CD and invalidate it.
+ * (5) Remove primary CD. The STE is cleared and invalidated beforehand,
+ * so this CD is already unreachable and invalidated.
*/
- val = arm_smmu_cpu_tcr_to_cd(smmu, cfg->cd.tcr) |
+
+ val = le64_to_cpu(cdptr[0]);
+ cd_live = !!(val & CTXDESC_CD_0_V);
+
+ if (!cfg) {
+ /* (4) and (5) */
+ cdptr[0] = 0;
+ if (ssid && cd_live)
+ arm_smmu_sync_cd(master, ssid, true);
+ return;
+ }
+
+ if (cd_live) {
+ /* (3) */
+ val &= ~(CTXDESC_CD_0_ASID_MASK << CTXDESC_CD_0_ASID_SHIFT);
+ val |= (u64)cfg->asid << CTXDESC_CD_0_ASID_SHIFT;
+
+ cdptr[0] = cpu_to_le64(val);
+ /*
+ * Until CD+TLB invalidation, both ASIDs may be used for tagging
+ * this substream's traffic
+ */
+
+ } else {
+ /* (1) and (2) */
+ cdptr[1] = cpu_to_le64(cfg->ttbr & CTXDESC_CD_1_TTB0_MASK
+ << CTXDESC_CD_1_TTB0_SHIFT);
+ cdptr[2] = 0;
+ cdptr[3] = cpu_to_le64(cfg->mair << CTXDESC_CD_3_MAIR_SHIFT);
+
+ if (ssid)
+ /*
+ * STE is live, and the SMMU might fetch this CD at any
+ * time. Ensure it observes the rest of the CD before we
+ * enable it.
+ */
+ arm_smmu_sync_cd(master, ssid, true);
+
+ val = arm_smmu_cpu_tcr_to_cd(smmu, cfg->tcr) |
#ifdef __BIG_ENDIAN
- CTXDESC_CD_0_ENDI |
+ CTXDESC_CD_0_ENDI |
#endif
- CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET_PRIVATE |
- CTXDESC_CD_0_AA64 | (u64)cfg->cd.asid << CTXDESC_CD_0_ASID_SHIFT |
- CTXDESC_CD_0_V;
- cfg->cdptr[0] = cpu_to_le64(val);
+ CTXDESC_CD_0_R | CTXDESC_CD_0_A |
+ CTXDESC_CD_0_ASET_PRIVATE |
+ CTXDESC_CD_0_AA64 |
+ (u64)cfg->asid << CTXDESC_CD_0_ASID_SHIFT |
+ CTXDESC_CD_0_V;
+
+ cdptr[0] = cpu_to_le64(val);
+
+ }
+
+ if (ssid || cd_live)
+ arm_smmu_sync_cd(master, ssid, true);
+}
+
+static int arm_smmu_alloc_cd_tables(struct arm_smmu_master_data *master,
+ size_t nr_ssids)
+{
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
+
+ if (cfg->num_entries) {
+ /*
+ * Messy master initialization. arm_smmu_add_device already
+ * moaned about it, let's ignore it.
+ */
+ return nr_ssids;
+ }
+
+ nr_ssids = clamp_val(nr_ssids, 1, 1 << smmu->ssid_bits);
+ if (WARN_ON_ONCE(!is_power_of_2(nr_ssids)))
+ nr_ssids = 1;
+
+ cfg->num_entries = nr_ssids;
+
+ cfg->context_map = devm_kzalloc(smmu->dev,
+ BITS_TO_LONGS(nr_ssids) * sizeof(long),
+ GFP_KERNEL);
+ if (!cfg->context_map)
+ return -ENOMEM;
- val = cfg->cd.ttbr & CTXDESC_CD_1_TTB0_MASK << CTXDESC_CD_1_TTB0_SHIFT;
- cfg->cdptr[1] = cpu_to_le64(val);
+ /* SSID 0 corresponds to default context */
+ set_bit(0, cfg->context_map);
- cfg->cdptr[3] = cpu_to_le64(cfg->cd.mair << CTXDESC_CD_3_MAIR_SHIFT);
+ cfg->cdptr = dmam_alloc_coherent(smmu->dev,
+ nr_ssids * (CTXDESC_CD_DWORDS << 3),
+ &cfg->cdptr_dma,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!cfg->cdptr) {
+ devm_kfree(smmu->dev, cfg->context_map);
+ return -ENOMEM;
+ }
+
+ return nr_ssids;
+}
+
+static void arm_smmu_free_cd_tables(struct arm_smmu_master_data *master)
+{
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
+
+ if (!cfg->num_entries)
+ return;
+
+ dmam_free_coherent(smmu->dev,
+ cfg->num_entries * (CTXDESC_CD_DWORDS << 3),
+ cfg->cdptr, cfg->cdptr_dma);
+
+ devm_kfree(smmu->dev, cfg->context_map);
+
+ cfg->num_entries = 0;
+}
+
+__maybe_unused
+static int arm_smmu_alloc_cd(struct arm_smmu_master_data *master)
+{
+ struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
+
+ return arm_smmu_bitmap_alloc(cfg->context_map, ilog2(cfg->num_entries));
+}
+
+__maybe_unused
+static void arm_smmu_free_cd(struct arm_smmu_master_data *master, u32 ssid)
+{
+ struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
+
+ arm_smmu_bitmap_free(cfg->context_map, ssid);
}
/* Stream table manipulation functions */
@@ -1122,8 +1318,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
}
if (ste->s1_cfg) {
+ unsigned int s1cdmax = ilog2(ste->cd_cfg.num_entries);
BUG_ON(ste_live);
+
dst[1] = cpu_to_le64(
+ STRTAB_STE_1_S1DSS_SSID0 |
STRTAB_STE_1_S1C_CACHE_WBRA
<< STRTAB_STE_1_S1CIR_SHIFT |
STRTAB_STE_1_S1C_CACHE_WBRA
@@ -1134,8 +1333,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
if (smmu->features & ARM_SMMU_FEAT_STALLS)
dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
- val |= (ste->s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK
+ val |= (ste->cd_cfg.cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK
<< STRTAB_STE_0_S1CTXPTR_SHIFT) |
+ (u64)(s1cdmax & STRTAB_STE_0_S1CDMAX_MASK)
+ << STRTAB_STE_0_S1CDMAX_SHIFT |
+ STRTAB_STE_0_S1FMT_LINEAR |
STRTAB_STE_0_CFG_S1_TRANS;
}
@@ -1380,7 +1582,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode = CMDQ_OP_TLBI_NH_ASID;
- cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid;
+ cmd.tlbi.asid = smmu_domain->s1_cfg.asid;
cmd.tlbi.vmid = 0;
} else {
cmd.opcode = CMDQ_OP_TLBI_S12_VMALL;
@@ -1405,7 +1607,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode = CMDQ_OP_TLBI_NH_VA;
- cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid;
+ cmd.tlbi.asid = smmu_domain->s1_cfg.asid;
} else {
cmd.opcode = CMDQ_OP_TLBI_S2_IPA;
cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
@@ -1580,24 +1782,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
return &smmu_domain->domain;
}
-static int arm_smmu_bitmap_alloc(unsigned long *map, int span)
-{
- int idx, size = 1 << span;
-
- do {
- idx = find_first_zero_bit(map, size);
- if (idx == size)
- return -ENOSPC;
- } while (test_and_set_bit(idx, map));
-
- return idx;
-}
-
-static void arm_smmu_bitmap_free(unsigned long *map, int idx)
-{
- clear_bit(idx, map);
-}
-
static void arm_smmu_domain_free(struct iommu_domain *domain)
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
@@ -1606,18 +1790,10 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
iommu_put_dma_cookie(domain);
free_io_pgtable_ops(smmu_domain->pgtbl_ops);
- /* Free the CD and ASID, if we allocated them */
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
-
- if (cfg->cdptr) {
- dmam_free_coherent(smmu_domain->smmu->dev,
- CTXDESC_CD_DWORDS << 3,
- cfg->cdptr,
- cfg->cdptr_dma);
-
- arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid);
- }
+ if (cfg->asid)
+ arm_smmu_bitmap_free(smmu->asid_map, cfg->asid);
} else {
struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
if (cfg->vmid)
@@ -1630,7 +1806,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
struct io_pgtable_cfg *pgtbl_cfg)
{
- int ret;
int asid;
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
@@ -1639,24 +1814,12 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
if (asid < 0)
return asid;
- cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
- &cfg->cdptr_dma,
- GFP_KERNEL | __GFP_ZERO);
- if (!cfg->cdptr) {
- dev_warn(smmu->dev, "failed to allocate context descriptor\n");
- ret = -ENOMEM;
- goto out_free_asid;
- }
+ cfg->asid = (u16)asid;
+ cfg->ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
+ cfg->tcr = pgtbl_cfg->arm_lpae_s1_cfg.tcr;
+ cfg->mair = pgtbl_cfg->arm_lpae_s1_cfg.mair[0];
- cfg->cd.asid = (u16)asid;
- cfg->cd.ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
- cfg->cd.tcr = pgtbl_cfg->arm_lpae_s1_cfg.tcr;
- cfg->cd.mair = pgtbl_cfg->arm_lpae_s1_cfg.mair[0];
return 0;
-
-out_free_asid:
- arm_smmu_bitmap_free(smmu->asid_map, asid);
- return ret;
}
static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain,
@@ -1805,6 +1968,8 @@ static void arm_smmu_detach_dev(struct device *dev)
master->ste.bypass = true;
if (arm_smmu_install_ste_for_dev(dev->iommu_fwspec) < 0)
dev_warn(dev, "failed to install bypass STE\n");
+
+ arm_smmu_write_ctx_desc(master, 0, NULL);
}
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
@@ -1894,7 +2059,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
ste->s1_cfg = &smmu_domain->s1_cfg;
ste->s2_cfg = NULL;
- arm_smmu_write_ctx_desc(smmu, ste->s1_cfg);
+ arm_smmu_write_ctx_desc(master, 0, ste->s1_cfg);
} else {
ste->s1_cfg = NULL;
ste->s2_cfg = &smmu_domain->s2_cfg;
@@ -2095,6 +2260,10 @@ static int arm_smmu_add_device(struct device *dev)
}
}
+ ret = arm_smmu_alloc_cd_tables(master, 1);
+ if (ret < 0)
+ return ret;
+
ats_enabled = !arm_smmu_enable_ats(master);
group = iommu_group_get_for_dev(dev);
@@ -2119,6 +2288,8 @@ static int arm_smmu_add_device(struct device *dev)
err_disable_ats:
arm_smmu_disable_ats(master);
+ arm_smmu_free_cd_tables(master);
+
return ret;
}
@@ -2150,6 +2321,7 @@ static void arm_smmu_remove_device(struct device *dev)
iommu_group_put(group);
arm_smmu_disable_ats(master);
+ arm_smmu_free_cd_tables(master);
}
iommu_group_remove_device(dev);
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 07/30] iommu/arm-smmu-v3: Add second level of context descriptor table
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (5 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 06/30] iommu/arm-smmu-v3: Add support for Substream IDs Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
[not found] ` <20170227195441.5170-8-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-27 19:54 ` [RFC PATCH 08/30] iommu/arm-smmu-v3: Add support for VHE Jean-Philippe Brucker
` (23 subsequent siblings)
30 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
The SMMU can support up to 20 bits of SSID. Add a second level of context
descriptor tables to accommodate this. Devices without SSID support still
have a single context descriptor; the others now have a first-level table
of 1024 entries (8kB), pointing to leaf tables of 1024 context descriptors
(64kB) each, allocated on demand.
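For illustration only (the helper names are made up and simply mirror the
CTXDESC_SPLIT of 10 used by the patch), the SSID is split into a
first-level index and a leaf index:

	/* Sketch: how a 20-bit SSID indexes the two levels of CD tables */
	#define CTXDESC_SPLIT		10
	#define CTXDESC_NUM_L2_ENTRIES	(1 << CTXDESC_SPLIT)

	static unsigned int ssid_l1_index(u32 ssid)
	{
		return ssid >> CTXDESC_SPLIT;			/* which leaf table */
	}

	static unsigned int ssid_l2_index(u32 ssid)
	{
		return ssid & (CTXDESC_NUM_L2_ENTRIES - 1);	/* CD within the leaf */
	}

With 8-byte first-level descriptors and 64-byte context descriptors, this
gives the 8kB first-level table and 64kB leaf tables mentioned above.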
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 251 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 223 insertions(+), 28 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f88d62025efa..cebbc8a22ec6 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -241,6 +241,8 @@
#define STRTAB_STE_0_S1FMT_SHIFT 4
#define STRTAB_STE_0_S1FMT_LINEAR (0UL << STRTAB_STE_0_S1FMT_SHIFT)
+#define STRTAB_STE_0_S1FMT_4K_L2 (1UL << STRTAB_STE_0_S1FMT_SHIFT)
+#define STRTAB_STE_0_S1FMT_64K_L2 (2UL << STRTAB_STE_0_S1FMT_SHIFT)
#define STRTAB_STE_0_S1CTXPTR_SHIFT 6
#define STRTAB_STE_0_S1CTXPTR_MASK 0x3ffffffffffUL
#define STRTAB_STE_0_S1CDMAX_SHIFT 59
@@ -289,7 +291,21 @@
#define STRTAB_STE_3_S2TTB_SHIFT 4
#define STRTAB_STE_3_S2TTB_MASK 0xfffffffffffUL
-/* Context descriptor (stage-1 only) */
+/*
+ * Context descriptor
+ *
+ * Linear: when less than 1024 SSIDs are supported
+ * 2lvl: at most 1024 L1 entries,
+ * 1024 lazy entries per table.
+ */
+#define CTXDESC_SPLIT 10
+#define CTXDESC_NUM_L2_ENTRIES (1 << CTXDESC_SPLIT)
+
+#define CTXDESC_L1_DESC_DWORD 1
+#define CTXDESC_L1_DESC_VALID 1
+#define CTXDESC_L1_DESC_L2PTR_SHIFT 12
+#define CTXDESC_L1_DESC_L2PTR_MASK 0xfffffffffUL
+
#define CTXDESC_CD_DWORDS 8
#define CTXDESC_CD_0_TCR_T0SZ_SHIFT 0
#define ARM64_TCR_T0SZ_SHIFT 0
@@ -588,11 +604,27 @@ struct arm_smmu_s2_cfg {
u64 vtcr;
};
-struct arm_smmu_cd_cfg {
+struct arm_smmu_cd_table {
__le64 *cdptr;
dma_addr_t cdptr_dma;
unsigned long *context_map;
+};
+
+struct arm_smmu_cd_cfg {
+ bool linear;
+
+ union {
+ struct arm_smmu_cd_table table;
+ struct {
+ __le64 *ptr;
+ dma_addr_t ptr_dma;
+
+ struct arm_smmu_cd_table *tables;
+ unsigned long cur_table;
+ } l1;
+ };
+
size_t num_entries;
};
@@ -1054,6 +1086,27 @@ static void arm_smmu_sync_cd(struct arm_smmu_master_data *master, u32 ssid,
arm_smmu_cmdq_issue_cmd(smmu, &cmd);
}
+static __u64 *arm_smmu_get_cd_ptr(struct arm_smmu_cd_cfg *cfg, u32 ssid)
+{
+ unsigned long idx;
+ struct arm_smmu_cd_table *l1_desc;
+
+ if (cfg->linear)
+ return cfg->table.cdptr + ssid * CTXDESC_CD_DWORDS;
+
+ idx = ssid >> CTXDESC_SPLIT;
+ if (idx >= cfg->num_entries)
+ return NULL;
+
+ l1_desc = &cfg->l1.tables[idx];
+ if (!l1_desc->cdptr)
+ return NULL;
+
+ idx = ssid & ((1 << CTXDESC_SPLIT) - 1);
+
+ return l1_desc->cdptr + idx * CTXDESC_CD_DWORDS;
+}
+
static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
{
u64 val = 0;
@@ -1073,6 +1126,15 @@ static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
return val;
}
+static void arm_smmu_write_cd_l1_desc(__le64 *dst,
+ struct arm_smmu_cd_table *table)
+{
+ u64 val = (table->cdptr_dma & CTXDESC_L1_DESC_L2PTR_MASK
+ << CTXDESC_L1_DESC_L2PTR_SHIFT) | CTXDESC_L1_DESC_VALID;
+
+ *dst = cpu_to_le64(val);
+}
+
static void arm_smmu_write_ctx_desc(struct arm_smmu_master_data *master,
u32 ssid, struct arm_smmu_s1_cfg *cfg)
{
@@ -1080,7 +1142,7 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_master_data *master,
bool cd_live;
struct arm_smmu_device *smmu = master->smmu;
struct arm_smmu_cd_cfg *descs_cfg = &master->ste.cd_cfg;
- __u64 *cdptr = (__u64 *)descs_cfg->cdptr + ssid * CTXDESC_CD_DWORDS;
+ __u64 *cdptr = arm_smmu_get_cd_ptr(descs_cfg, ssid);
/*
* This function handles the following cases:
@@ -1097,6 +1159,9 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_master_data *master,
* so this CD is already unreachable and invalidated.
*/
+ if (WARN_ON(!cdptr))
+ return;
+
val = le64_to_cpu(cdptr[0]);
cd_live = !!(val & CTXDESC_CD_0_V);
@@ -1152,9 +1217,43 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_master_data *master,
arm_smmu_sync_cd(master, ssid, true);
}
+static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
+ struct arm_smmu_cd_table *desc,
+ size_t num_entries)
+{
+ size_t size = num_entries * (CTXDESC_CD_DWORDS << 3);
+
+ desc->context_map = devm_kzalloc(smmu->dev, BITS_TO_LONGS(num_entries) *
+ sizeof(long), GFP_ATOMIC);
+ if (!desc->context_map)
+ return -ENOMEM;
+
+ desc->cdptr = dmam_alloc_coherent(smmu->dev, size, &desc->cdptr_dma,
+ GFP_ATOMIC | __GFP_ZERO);
+ if (!desc->cdptr) {
+ devm_kfree(smmu->dev, desc->context_map);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void arm_smmu_free_cd_leaf_table(struct arm_smmu_device *smmu,
+ struct arm_smmu_cd_table *desc,
+ size_t num_entries)
+{
+ size_t size = num_entries * (CTXDESC_CD_DWORDS << 3);
+
+ dmam_free_coherent(smmu->dev, size, desc->cdptr, desc->cdptr_dma);
+ devm_kfree(smmu->dev, desc->context_map);
+}
+
static int arm_smmu_alloc_cd_tables(struct arm_smmu_master_data *master,
- size_t nr_ssids)
+ int nr_ssids)
{
+ int ret;
+ size_t num_leaf_entries, size = 0;
+ struct arm_smmu_cd_table *leaf_table;
struct arm_smmu_device *smmu = master->smmu;
struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
@@ -1170,42 +1269,86 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master_data *master,
if (WARN_ON_ONCE(!is_power_of_2(nr_ssids)))
nr_ssids = 1;
- cfg->num_entries = nr_ssids;
+ if (nr_ssids <= (1 << CTXDESC_SPLIT)) {
+ /* Fits in a single table */
+ cfg->linear = true;
+ cfg->num_entries = num_leaf_entries = nr_ssids;
+ leaf_table = &cfg->table;
+ } else {
+ /*
+ * SSID[S1CDmax-1:10] indexes 1st-level table, SSID[9:0] indexes
+ * 2nd-level
+ */
+ cfg->linear = false;
+ cfg->num_entries = nr_ssids / CTXDESC_NUM_L2_ENTRIES;
- cfg->context_map = devm_kzalloc(smmu->dev,
- BITS_TO_LONGS(nr_ssids) * sizeof(long),
- GFP_KERNEL);
- if (!cfg->context_map)
- return -ENOMEM;
+ cfg->l1.tables = devm_kzalloc(smmu->dev,
+ sizeof(struct arm_smmu_cd_table) *
+ cfg->num_entries, GFP_KERNEL);
+ if (!cfg->l1.tables)
+ return -ENOMEM;
- /* SSID 0 corresponds to default context */
- set_bit(0, cfg->context_map);
-
- cfg->cdptr = dmam_alloc_coherent(smmu->dev,
- nr_ssids * (CTXDESC_CD_DWORDS << 3),
- &cfg->cdptr_dma,
- GFP_KERNEL | __GFP_ZERO);
- if (!cfg->cdptr) {
- devm_kfree(smmu->dev, cfg->context_map);
- return -ENOMEM;
+ size = cfg->num_entries * (CTXDESC_L1_DESC_DWORD << 3);
+ cfg->l1.ptr = dmam_alloc_coherent(smmu->dev, size,
+ &cfg->l1.ptr_dma,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!cfg->l1.ptr) {
+ devm_kfree(smmu->dev, cfg->l1.tables);
+ return -ENOMEM;
+ }
+
+ num_leaf_entries = CTXDESC_NUM_L2_ENTRIES;
+ leaf_table = cfg->l1.tables;
}
+ ret = arm_smmu_alloc_cd_leaf_table(smmu, leaf_table, num_leaf_entries);
+ if (ret) {
+ if (!cfg->linear) {
+ dmam_free_coherent(smmu->dev, size, cfg->l1.ptr,
+ cfg->l1.ptr_dma);
+ devm_kfree(smmu->dev, cfg->l1.tables);
+ }
+
+ cfg->num_entries = 0;
+ return ret;
+ }
+
+ if (!cfg->linear)
+ arm_smmu_write_cd_l1_desc(cfg->l1.ptr, leaf_table);
+
+ /* SSID 0 corresponds to default context */
+ set_bit(0, leaf_table->context_map);
+
return nr_ssids;
}
static void arm_smmu_free_cd_tables(struct arm_smmu_master_data *master)
{
+ size_t i, size;
struct arm_smmu_device *smmu = master->smmu;
struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
if (!cfg->num_entries)
return;
- dmam_free_coherent(smmu->dev,
- cfg->num_entries * (CTXDESC_CD_DWORDS << 3),
- cfg->cdptr, cfg->cdptr_dma);
+ if (cfg->linear) {
+ arm_smmu_free_cd_leaf_table(smmu, &cfg->table, cfg->num_entries);
+ } else {
+ for (i = 0; i < cfg->num_entries; i++) {
+ struct arm_smmu_cd_table *desc = &cfg->l1.tables[i];
+
+ if (!desc->cdptr)
+ continue;
+
+ arm_smmu_free_cd_leaf_table(smmu, desc,
+ CTXDESC_NUM_L2_ENTRIES);
+ }
+
+ size = cfg->num_entries * (CTXDESC_L1_DESC_DWORD << 3);
+ dmam_free_coherent(smmu->dev, size, cfg->l1.ptr, cfg->l1.ptr_dma);
- devm_kfree(smmu->dev, cfg->context_map);
+ devm_kfree(smmu->dev, cfg->l1.tables);
+ }
cfg->num_entries = 0;
}
@@ -1213,17 +1356,59 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master_data *master)
__maybe_unused
static int arm_smmu_alloc_cd(struct arm_smmu_master_data *master)
{
+ int ssid;
+ int i, ret;
struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
- return arm_smmu_bitmap_alloc(cfg->context_map, ilog2(cfg->num_entries));
+ if (cfg->linear)
+ return arm_smmu_bitmap_alloc(cfg->table.context_map,
+ ilog2(cfg->num_entries));
+
+ /* Find first leaf table with an empty slot, or allocate a new leaf */
+ for (i = cfg->l1.cur_table; i < cfg->num_entries; i++) {
+ struct arm_smmu_cd_table *table = &cfg->l1.tables[i];
+
+ if (!table->cdptr) {
+ __le64 *l1ptr = cfg->l1.ptr + i * CTXDESC_L1_DESC_DWORD;
+
+ ret = arm_smmu_alloc_cd_leaf_table(master->smmu, table,
+ CTXDESC_NUM_L2_ENTRIES);
+ if (ret)
+ return ret;
+
+ arm_smmu_write_cd_l1_desc(l1ptr, table);
+ arm_smmu_sync_cd(master, i << CTXDESC_SPLIT, false);
+ }
+
+ ssid = arm_smmu_bitmap_alloc(table->context_map, CTXDESC_SPLIT);
+ if (ssid < 0)
+ continue;
+
+ cfg->l1.cur_table = i;
+ return i << CTXDESC_SPLIT | ssid;
+ }
+
+ return -ENOSPC;
}
__maybe_unused
static void arm_smmu_free_cd(struct arm_smmu_master_data *master, u32 ssid)
{
+ unsigned long l1_idx, idx;
struct arm_smmu_cd_cfg *cfg = &master->ste.cd_cfg;
- arm_smmu_bitmap_free(cfg->context_map, ssid);
+ if (cfg->linear) {
+ arm_smmu_bitmap_free(cfg->table.context_map, ssid);
+ return;
+ }
+
+ l1_idx = ssid >> CTXDESC_SPLIT;
+ idx = ssid & ((1 << CTXDESC_SPLIT) - 1);
+ arm_smmu_bitmap_free(cfg->l1.tables[l1_idx].context_map, idx);
+
+ /* Prepare next allocation */
+ if (cfg->l1.cur_table > idx)
+ cfg->l1.cur_table = idx;
}
/* Stream table manipulation functions */
@@ -1318,7 +1503,16 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
}
if (ste->s1_cfg) {
+ dma_addr_t s1ctxptr;
unsigned int s1cdmax = ilog2(ste->cd_cfg.num_entries);
+
+ if (ste->cd_cfg.linear) {
+ s1ctxptr = ste->cd_cfg.table.cdptr_dma;
+ } else {
+ s1cdmax += CTXDESC_SPLIT;
+ s1ctxptr = ste->cd_cfg.l1.ptr_dma;
+ }
+
BUG_ON(ste_live);
dst[1] = cpu_to_le64(
@@ -1333,11 +1527,12 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
if (smmu->features & ARM_SMMU_FEAT_STALLS)
dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
- val |= (ste->cd_cfg.cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK
+ val |= (s1ctxptr & STRTAB_STE_0_S1CTXPTR_MASK
<< STRTAB_STE_0_S1CTXPTR_SHIFT) |
(u64)(s1cdmax & STRTAB_STE_0_S1CDMAX_MASK)
<< STRTAB_STE_0_S1CDMAX_SHIFT |
- STRTAB_STE_0_S1FMT_LINEAR |
+ (ste->cd_cfg.linear ? STRTAB_STE_0_S1FMT_LINEAR :
+ STRTAB_STE_0_S1FMT_64K_L2) |
STRTAB_STE_0_CFG_S1_TRANS;
}
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 08/30] iommu/arm-smmu-v3: Add support for VHE
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (6 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 07/30] iommu/arm-smmu-v3: Add second level of context descriptor table Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 09/30] iommu/arm-smmu-v3: Support broadcast TLB maintenance Jean-Philippe Brucker
` (22 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
The ARMv8.1 extensions added the Virtualization Host Extensions (VHE), which
allow the host kernel to run at EL2. When using normal DMA, device and CPU
address spaces are orthogonal and do not need to implement the same
capabilities, so VHE hasn't been used on the SMMU side until now.
With shared address spaces however, ASIDs are shared between MMU and SMMU,
and broadcast TLB invalidations issued by a CPU are taken into account by
the SMMU. TLB entries on both sides need to have identical exception levels
in order to be shot down with a single invalidation.
When the CPU is using VHE, enable VHE in the SMMU and for all streams.
Normal DMA mappings will need to use TLBI_EL2 commands instead of TLBI_NH,
but shouldn't be otherwise affected by this change.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++++++-----
1 file changed, 24 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index cebbc8a22ec6..0981159ada04 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -22,6 +22,7 @@
#include <linux/acpi.h>
#include <linux/acpi_iort.h>
+#include <linux/cpufeature.h>
#include <linux/delay.h>
#include <linux/dma-iommu.h>
#include <linux/err.h>
@@ -522,6 +523,8 @@ struct arm_smmu_cmdq_ent {
#define CMDQ_OP_TLBI_NH_ASID 0x11
#define CMDQ_OP_TLBI_NH_VA 0x12
#define CMDQ_OP_TLBI_EL2_ALL 0x20
+ #define CMDQ_OP_TLBI_EL2_ASID 0x21
+ #define CMDQ_OP_TLBI_EL2_VA 0x22
#define CMDQ_OP_TLBI_S12_VMALL 0x28
#define CMDQ_OP_TLBI_S2_IPA 0x2a
#define CMDQ_OP_TLBI_NSNH_ALL 0x30
@@ -665,6 +668,7 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_TRANS_S2 (1 << 10)
#define ARM_SMMU_FEAT_STALLS (1 << 11)
#define ARM_SMMU_FEAT_HYP (1 << 12)
+#define ARM_SMMU_FEAT_E2H (1 << 13)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -928,6 +932,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
cmd[1] |= CMDQ_CFGI_1_RANGE_MASK << CMDQ_CFGI_1_RANGE_SHIFT;
break;
case CMDQ_OP_TLBI_NH_VA:
+ case CMDQ_OP_TLBI_EL2_VA:
cmd[0] |= (u64)ent->tlbi.asid << CMDQ_TLBI_0_ASID_SHIFT;
cmd[1] |= ent->tlbi.leaf ? CMDQ_TLBI_1_LEAF : 0;
cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
@@ -943,6 +948,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
case CMDQ_OP_TLBI_S12_VMALL:
cmd[0] |= (u64)ent->tlbi.vmid << CMDQ_TLBI_0_VMID_SHIFT;
break;
+ case CMDQ_OP_TLBI_EL2_ASID:
+ cmd[0] |= (u64)ent->tlbi.asid << CMDQ_TLBI_0_ASID_SHIFT;
+ break;
case CMDQ_OP_ATC_INV:
cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0;
cmd[0] |= ent->atc.global ? CMDQ_ATC_0_GLOBAL : 0;
@@ -1522,7 +1530,9 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
STRTAB_STE_1_S1C_CACHE_WBRA
<< STRTAB_STE_1_S1COR_SHIFT |
STRTAB_STE_1_S1C_SH_ISH << STRTAB_STE_1_S1CSH_SHIFT |
- STRTAB_STE_1_STRW_NSEL1 << STRTAB_STE_1_STRW_SHIFT);
+ (smmu->features & ARM_SMMU_FEAT_E2H ?
+ STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1) <<
+ STRTAB_STE_1_STRW_SHIFT);
if (smmu->features & ARM_SMMU_FEAT_STALLS)
dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
@@ -1776,7 +1786,8 @@ static void arm_smmu_tlb_inv_context(void *cookie)
struct arm_smmu_cmdq_ent cmd;
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
- cmd.opcode = CMDQ_OP_TLBI_NH_ASID;
+ cmd.opcode = smmu->features & ARM_SMMU_FEAT_E2H ?
+ CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID;
cmd.tlbi.asid = smmu_domain->s1_cfg.asid;
cmd.tlbi.vmid = 0;
} else {
@@ -1801,7 +1812,8 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
};
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
- cmd.opcode = CMDQ_OP_TLBI_NH_VA;
+ cmd.opcode = smmu->features & ARM_SMMU_FEAT_E2H ?
+ CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA;
cmd.tlbi.asid = smmu_domain->s1_cfg.asid;
} else {
cmd.opcode = CMDQ_OP_TLBI_S2_IPA;
@@ -3011,7 +3023,11 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
writel_relaxed(reg, smmu->base + ARM_SMMU_CR1);
/* CR2 (random crap) */
- reg = CR2_PTM | CR2_RECINVSID | CR2_E2H;
+ reg = CR2_PTM | CR2_RECINVSID;
+
+ if (smmu->features & ARM_SMMU_FEAT_E2H)
+ reg |= CR2_E2H;
+
writel_relaxed(reg, smmu->base + ARM_SMMU_CR2);
/* Stream table */
@@ -3169,8 +3185,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
if (reg & IDR0_MSI)
smmu->features |= ARM_SMMU_FEAT_MSI;
- if (reg & IDR0_HYP)
+ if (reg & IDR0_HYP) {
smmu->features |= ARM_SMMU_FEAT_HYP;
+ if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
+ smmu->features |= ARM_SMMU_FEAT_E2H;
+ }
/*
* The coherency feature as set by FW is used in preference to the ID
--
2.11.0
* [RFC PATCH 09/30] iommu/arm-smmu-v3: Support broadcast TLB maintenance
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (7 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 08/30] iommu/arm-smmu-v3: Add support for VHE Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 10/30] iommu/arm-smmu-v3: Add task contexts Jean-Philippe Brucker
` (21 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
The SMMUv3 can handle invalidations targeted at TLB entries with shared
ASIDs. If the implementation supports broadcast TLB maintenance, enable
it and keep track of it in a feature bit. The SMMU will then take into
account the following CPU instructions for ASIDs in the shared set:
* TLBI VAE1IS(ASID, VA)
* TLBI ASIDE1IS(ASID)
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 0981159ada04..2724788157a5 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -65,6 +65,7 @@
#define IDR0_ASID16 (1 << 12)
#define IDR0_ATS (1 << 10)
#define IDR0_HYP (1 << 9)
+#define IDR0_BTM (1 << 5)
#define IDR0_COHACC (1 << 4)
#define IDR0_TTF_SHIFT 2
#define IDR0_TTF_MASK 0x3
@@ -669,6 +670,7 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_STALLS (1 << 11)
#define ARM_SMMU_FEAT_HYP (1 << 12)
#define ARM_SMMU_FEAT_E2H (1 << 13)
+#define ARM_SMMU_FEAT_BTM (1 << 14)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -3023,11 +3025,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
writel_relaxed(reg, smmu->base + ARM_SMMU_CR1);
/* CR2 (random crap) */
- reg = CR2_PTM | CR2_RECINVSID;
+ reg = CR2_RECINVSID;
if (smmu->features & ARM_SMMU_FEAT_E2H)
reg |= CR2_E2H;
+ if (!(smmu->features & ARM_SMMU_FEAT_BTM))
+ reg |= CR2_PTM;
+
writel_relaxed(reg, smmu->base + ARM_SMMU_CR2);
/* Stream table */
@@ -3138,6 +3143,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
{
u32 reg;
bool coherent = smmu->features & ARM_SMMU_FEAT_COHERENCY;
+ bool vhe = cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN);
/* IDR0 */
reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0);
@@ -3187,11 +3193,20 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
if (reg & IDR0_HYP) {
smmu->features |= ARM_SMMU_FEAT_HYP;
- if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
+ if (vhe)
smmu->features |= ARM_SMMU_FEAT_E2H;
}
/*
+ * If the CPU is using VHE, but the SMMU doesn't support it, the SMMU
+ * will create TLB entries for NH-EL1 world and will miss the
+ * broadcasted TLB invalidations that target EL2-E2H world. Don't enable
+ * BTM in that case.
+ */
+ if (reg & IDR0_BTM && (!vhe || reg & IDR0_HYP))
+ smmu->features |= ARM_SMMU_FEAT_BTM;
+
+ /*
* The coherency feature as set by FW is used in preference to the ID
* register, but warn on mismatch.
*/
--
2.11.0
* [RFC PATCH 10/30] iommu/arm-smmu-v3: Add task contexts
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (8 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 09/30] iommu/arm-smmu-v3: Support broadcast TLB maintenance Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 11/30] arm64: mm: Pin down ASIDs for sharing contexts with devices Jean-Philippe Brucker
` (20 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Now that we support substreams, add the required infrastructure to use
them. Each device can be attached to multiple address spaces. The default
address space is the domain, and additional ones are tasks. Each task can
be attached to multiple devices as well.
[Diagram: two masters, each in its own IOMMU group, attached to a domain
and, through numbered contexts, to tasks. (a) marks the SMMU's lookup of
masters by stream ID, (b) a master's contexts indexed by substream ID,
(c) the task attached to a context, (d) the SMMU's list of tasks, and
(e) a group's default domain.]
(a) Add an rbtree of streams in each SMMU instance, indexed by stream ID.
Each stream points to a single master, and masters can be identified by
multiple stream IDs. PCIe endpoints are, for the moment, expected to have
a unique stream ID (RID), but see the discussion about groups below.
Platform devices may issue multiple stream IDs.
(b) Add an rbtree of contexts in each SVM-capable master, indexed by
substream ID. A context is the link between a device and a task. It is
bidirectional since we need to look up all devices attached to a task when
invalidating a mapping, and all tasks attached to a device when handling a
device fault (PRI).
Both these rbtrees will allow fast lookup of an address space when
handling a fault.
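As a rough illustration of how the two rbtrees are meant to be walked on
the fault path, a lookup goes from stream ID to master, then from substream
ID to context, under contexts_lock. The helper below is only a sketch and
not part of this patch:

static struct arm_smmu_context *
arm_smmu_find_context(struct arm_smmu_device *smmu, u32 sid, u32 ssid)
{
	struct arm_smmu_stream *stream = NULL;
	struct arm_smmu_context *ctx;
	struct rb_node *node = smmu->streams.rb_node;

	/* Stream ID -> master */
	while (node) {
		stream = rb_entry(node, struct arm_smmu_stream, node);
		if (stream->id > sid)
			node = node->rb_left;
		else if (stream->id < sid)
			node = node->rb_right;
		else
			break;
	}
	if (!node)
		return NULL;

	/* Substream ID -> context */
	node = stream->master->contexts.rb_node;
	while (node) {
		ctx = rb_entry(node, struct arm_smmu_context, master_node);
		if (ctx->ssid > ssid)
			node = node->rb_left;
		else if (ctx->ssid < ssid)
			node = node->rb_right;
		else
			return ctx;
	}
	return NULL;
}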
(c) Add a task in each context. We need tasks and contexts to be separate
entities with their own refcounting for at least two reasons. To reduce
the number of mmu notifiers and simplify ASID tracking, tasks are allowed
to be attached to multiple masters. It is also necessary for sane PASID
invalidation but we won't go into that now.
(d) Add a list of tasks in each SMMU instance. This allows us to find
existing tasks when binding to a device. We could make it an rbtree, but
it's not clear whether we need this complication yet.
The lifetime of contexts and tasks is as follows:
* arm_smmu_alloc_task creates smmu_task and adds it to the list.
* arm_smmu_attach_task allocates an SSID, creates smmu_context and
connects task to master. Context descriptor is written, so the SSID is
active and can be used in transactions.
* users may get krefs to task and contexts temporarily, and release them
with put_task/put_context.
* arm_smmu_detach_task severs the link between context and task, and drops
the task's refcount. If the task was only attached to one master, it is
freed and removed from the list. arm_smmu_detach_task also clears the
context descriptor, after which any transaction with this SSID would
result in a fault.
* arm_smmu_put_context is called later, freeing the dangling context.
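Put together, binding a device to a task would look roughly like the
following sketch (a hypothetical caller, ignoring the lookup of an existing
task in smmu->tasks; not part of this patch):

static int arm_smmu_bind_task_example(struct device *dev,
				      struct task_struct *task)
{
	struct arm_smmu_master_data *master = dev->iommu_fwspec->iommu_priv;
	struct arm_smmu_device *smmu = master->smmu;
	struct arm_smmu_task *smmu_task;
	struct arm_smmu_context *smmu_context;

	smmu_task = arm_smmu_alloc_task(smmu, task);
	if (IS_ERR(smmu_task))
		return PTR_ERR(smmu_task);

	smmu_context = arm_smmu_attach_task(smmu_task, master);
	if (IS_ERR(smmu_context)) {
		arm_smmu_put_task(smmu, smmu_task);
		return PTR_ERR(smmu_context);
	}

	/* smmu_context->ssid is the PASID to hand to the device */
	return 0;
}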
All these structures are protected by a single contexts_lock spinlock in
each SMMU instance, which therefore also serializes modifications of the
context descriptors. Any update of the krefs must be done while holding the
lock, and so must reads of the lists and of context->task. As we expect a
lot more reads than writes on the structures introduced by this patch,
especially when handling page requests, we hope to make use of RCU in the
future.
(e) A group still has a default DMA domain, represented by context 0 (no
SSID). When attaching to a different domain, for example a VFIO domain, we
also want to destroy all active contexts. If VFIO is handing control of a
device to a guest for instance, we do not want the guest to have access to
host tasks through bound contexts. So contexts are conceptually contained
in the domain, although we do not need to maintain any structure for this.
When detaching a domain from a group, all contexts attached to this group
are also detached.
An IOMMU group contains all devices that can't be distinguished from one
another by their IOMMU. For PCIe this may be due to buggy devices, lack of
ACS isolation, legacy or non-transparent bridges. Since PCIe SVM platforms
on ARM have yet to be seen in the wild, legacy issues don't matter and SVM
through NTB is pretty far off.
So for prototyping, we currently bypass the concept of IOMMU groups, since
we expect SVM-capable devices to be properly integrated. This might change
as soon as someone starts taping out devices. There will then be some
complication in attaching a task to a group instead of an individual device
(for instance, agreeing on a PASID size supported by all devices in the
group, which may change when a device is hot-plugged into the group), but
the current design doesn't prevent this change.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 316 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 314 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 2724788157a5..5b4d1f265194 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -701,6 +701,16 @@ struct arm_smmu_device {
/* IOMMU core code handle */
struct iommu_device iommu;
+
+ spinlock_t contexts_lock;
+ struct rb_root streams;
+ struct list_head tasks;
+};
+
+struct arm_smmu_stream {
+ u32 id;
+ struct arm_smmu_master_data *master;
+ struct rb_node node;
};
/* SMMU private data for each master */
@@ -710,6 +720,9 @@ struct arm_smmu_master_data {
struct device *dev;
struct list_head group_head;
+
+ struct arm_smmu_stream *streams;
+ struct rb_root contexts;
};
/* SMMU private data for an IOMMU domain */
@@ -738,6 +751,31 @@ struct arm_smmu_domain {
spinlock_t groups_lock;
};
+struct arm_smmu_task {
+ struct pid *pid;
+
+ struct arm_smmu_device *smmu;
+ struct list_head smmu_head;
+
+ struct list_head contexts;
+
+ struct arm_smmu_s1_cfg s1_cfg;
+
+ struct kref kref;
+};
+
+struct arm_smmu_context {
+ u32 ssid;
+
+ struct arm_smmu_task *task;
+ struct arm_smmu_master_data *master;
+
+ struct list_head task_head;
+ struct rb_node master_node;
+
+ struct kref kref;
+};
+
struct arm_smmu_group {
struct arm_smmu_domain *domain;
struct list_head domain_head;
@@ -1363,7 +1401,6 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master_data *master)
cfg->num_entries = 0;
}
-__maybe_unused
static int arm_smmu_alloc_cd(struct arm_smmu_master_data *master)
{
int ssid;
@@ -1401,7 +1438,6 @@ static int arm_smmu_alloc_cd(struct arm_smmu_master_data *master)
return -ENOSPC;
}
-__maybe_unused
static void arm_smmu_free_cd(struct arm_smmu_master_data *master, u32 ssid)
{
unsigned long l1_idx, idx;
@@ -1961,6 +1997,195 @@ static bool arm_smmu_capable(enum iommu_cap cap)
}
}
+__maybe_unused
+static struct arm_smmu_context *
+arm_smmu_attach_task(struct arm_smmu_task *smmu_task,
+ struct arm_smmu_master_data *master)
+{
+ int ssid;
+ int ret = 0;
+ struct arm_smmu_context *smmu_context, *ctx;
+ struct arm_smmu_device *smmu = master->smmu;
+ struct rb_node **new_node, *parent_node = NULL;
+
+ smmu_context = kzalloc(sizeof(*smmu_context), GFP_KERNEL);
+ if (!smmu_context)
+ return ERR_PTR(-ENOMEM);
+
+ smmu_context->task = smmu_task;
+ smmu_context->master = master;
+ kref_init(&smmu_context->kref);
+
+ spin_lock(&smmu->contexts_lock);
+
+ /* Allocate a context descriptor and SSID */
+ ssid = arm_smmu_alloc_cd(master);
+ if (ssid <= 0) {
+ if (WARN_ON_ONCE(ssid == 0))
+ ret = -EEXIST;
+ else
+ ret = ssid;
+ goto err_free_context;
+ }
+
+ smmu_context->ssid = ssid;
+
+ arm_smmu_write_ctx_desc(master, ssid, &smmu_task->s1_cfg);
+
+ list_add(&smmu_context->task_head, &smmu_task->contexts);
+
+ /* Insert into master context list */
+ new_node = &(master->contexts.rb_node);
+ while (*new_node) {
+ ctx = rb_entry(*new_node, struct arm_smmu_context,
+ master_node);
+ parent_node = *new_node;
+ if (ctx->ssid > ssid) {
+ new_node = &((*new_node)->rb_left);
+ } else if (ctx->ssid < ssid) {
+ new_node = &((*new_node)->rb_right);
+ } else {
+ dev_warn(master->dev, "context %u already exists\n",
+ ctx->ssid);
+ ret = -EEXIST;
+ goto err_remove_context;
+ }
+ }
+
+ rb_link_node(&smmu_context->master_node, parent_node, new_node);
+ rb_insert_color(&smmu_context->master_node, &master->contexts);
+
+ spin_unlock(&smmu->contexts_lock);
+
+ return smmu_context;
+
+err_remove_context:
+ list_del(&smmu_context->task_head);
+ arm_smmu_write_ctx_desc(master, ssid, NULL);
+ arm_smmu_free_cd(master, ssid);
+
+err_free_context:
+ spin_unlock(&smmu->contexts_lock);
+
+ kfree(smmu_context);
+
+ return ERR_PTR(ret);
+}
+
+/* Caller must hold contexts_lock */
+static void arm_smmu_free_context(struct kref *kref)
+{
+ struct arm_smmu_master_data *master;
+ struct arm_smmu_context *smmu_context;
+
+ smmu_context = container_of(kref, struct arm_smmu_context, kref);
+
+ WARN_ON_ONCE(smmu_context->task);
+
+ master = smmu_context->master;
+
+ arm_smmu_free_cd(master, smmu_context->ssid);
+
+ rb_erase(&smmu_context->master_node, &master->contexts);
+
+ kfree(smmu_context);
+}
+
+static void _arm_smmu_put_context(struct arm_smmu_context *smmu_context)
+{
+ kref_put(&smmu_context->kref, arm_smmu_free_context);
+}
+
+__maybe_unused
+static void arm_smmu_put_context(struct arm_smmu_device *smmu,
+ struct arm_smmu_context *smmu_context)
+{
+ spin_lock(&smmu->contexts_lock);
+ _arm_smmu_put_context(smmu_context);
+ spin_unlock(&smmu->contexts_lock);
+}
+
+__maybe_unused
+static struct arm_smmu_task *arm_smmu_alloc_task(struct arm_smmu_device *smmu,
+ struct task_struct *task)
+{
+ struct arm_smmu_task *smmu_task;
+
+ smmu_task = kzalloc(sizeof(*smmu_task), GFP_KERNEL);
+ if (!smmu_task)
+ return ERR_PTR(-ENOMEM);
+
+ smmu_task->smmu = smmu;
+ smmu_task->pid = get_task_pid(task, PIDTYPE_PID);
+ INIT_LIST_HEAD(&smmu_task->contexts);
+ kref_init(&smmu_task->kref);
+
+ spin_lock(&smmu->contexts_lock);
+ list_add(&smmu_task->smmu_head, &smmu->tasks);
+ spin_unlock(&smmu->contexts_lock);
+
+ return smmu_task;
+}
+
+/* Caller must hold contexts_lock */
+static void arm_smmu_free_task(struct kref *kref)
+{
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_task *smmu_task;
+ struct arm_smmu_master_data *master;
+ struct arm_smmu_context *smmu_context, *next;
+
+ smmu_task = container_of(kref, struct arm_smmu_task, kref);
+ smmu = smmu_task->smmu;
+
+ if (WARN_ON_ONCE(!list_empty(&smmu_task->contexts))) {
+ list_for_each_entry_safe(smmu_context, next,
+ &smmu_task->contexts, task_head) {
+ master = smmu_context->master;
+
+ arm_smmu_write_ctx_desc(master, smmu_context->ssid, NULL);
+ smmu_context->task = NULL;
+ list_del(&smmu_context->task_head);
+ }
+ }
+
+ list_del(&smmu_task->smmu_head);
+
+ put_pid(smmu_task->pid);
+ kfree(smmu_task);
+}
+
+static void _arm_smmu_put_task(struct arm_smmu_task *smmu_task)
+{
+ kref_put(&smmu_task->kref, arm_smmu_free_task);
+}
+
+/* Caller must hold contexts_lock */
+static void arm_smmu_detach_task(struct arm_smmu_context *smmu_context)
+{
+ struct arm_smmu_task *smmu_task = smmu_context->task;
+
+ smmu_context->task = NULL;
+ list_del(&smmu_context->task_head);
+ _arm_smmu_put_task(smmu_task);
+
+ arm_smmu_write_ctx_desc(smmu_context->master, smmu_context->ssid, NULL);
+}
+
+__maybe_unused
+static void arm_smmu_put_task(struct arm_smmu_device *smmu,
+ struct arm_smmu_task *smmu_task)
+{
+ spin_lock(&smmu->contexts_lock);
+ _arm_smmu_put_task(smmu_task);
+ spin_unlock(&smmu->contexts_lock);
+}
+
+static bool arm_smmu_master_supports_svm(struct arm_smmu_master_data *master)
+{
+ return false;
+}
+
static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
{
struct arm_smmu_domain *smmu_domain;
@@ -2173,12 +2398,31 @@ static struct arm_smmu_group *arm_smmu_group_alloc(struct iommu_group *group)
static void arm_smmu_detach_dev(struct device *dev)
{
struct arm_smmu_master_data *master = dev->iommu_fwspec->iommu_priv;
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_context *smmu_context;
+ struct rb_node *node, *next;
master->ste.bypass = true;
if (arm_smmu_install_ste_for_dev(dev->iommu_fwspec) < 0)
dev_warn(dev, "failed to install bypass STE\n");
arm_smmu_write_ctx_desc(master, 0, NULL);
+
+ if (!master->ste.valid)
+ return;
+
+ spin_lock(&smmu->contexts_lock);
+ for (node = rb_first(&master->contexts); node; node = next) {
+ smmu_context = rb_entry(node, struct arm_smmu_context,
+ master_node);
+ next = rb_next(node);
+
+ if (smmu_context->task)
+ arm_smmu_detach_task(smmu_context);
+
+ _arm_smmu_put_context(smmu_context);
+ }
+ spin_unlock(&smmu->contexts_lock);
}
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
@@ -2418,6 +2662,54 @@ static void arm_smmu_disable_ats(struct arm_smmu_master_data *master)
pci_disable_ats(pdev);
}
+static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
+ struct arm_smmu_master_data *master)
+{
+ int i;
+ int ret = 0;
+ struct arm_smmu_stream *new_stream, *cur_stream;
+ struct rb_node **new_node, *parent_node = NULL;
+ struct iommu_fwspec *fwspec = master->dev->iommu_fwspec;
+
+ master->streams = kcalloc(fwspec->num_ids,
+ sizeof(struct arm_smmu_stream), GFP_KERNEL);
+ if (!master->streams)
+ return -ENOMEM;
+
+ spin_lock(&smmu->contexts_lock);
+ for (i = 0; i < fwspec->num_ids && !ret; i++) {
+ new_stream = &master->streams[i];
+ new_stream->id = fwspec->ids[i];
+ new_stream->master = master;
+
+ new_node = &(smmu->streams.rb_node);
+ while (*new_node) {
+ cur_stream = rb_entry(*new_node, struct arm_smmu_stream,
+ node);
+ parent_node = *new_node;
+ if (cur_stream->id > new_stream->id) {
+ new_node = &((*new_node)->rb_left);
+ } else if (cur_stream->id < new_stream->id) {
+ new_node = &((*new_node)->rb_right);
+ } else {
+ dev_warn(master->dev,
+ "stream %u already in tree\n",
+ cur_stream->id);
+ ret = -EINVAL;
+ break;
+ }
+ }
+
+ if (!ret) {
+ rb_link_node(&new_stream->node, parent_node, new_node);
+ rb_insert_color(&new_stream->node, &smmu->streams);
+ }
+ }
+ spin_unlock(&smmu->contexts_lock);
+
+ return ret;
+}
+
static struct iommu_ops arm_smmu_ops;
static int arm_smmu_add_device(struct device *dev)
@@ -2452,6 +2744,8 @@ static int arm_smmu_add_device(struct device *dev)
master->smmu = smmu;
master->dev = dev;
fwspec->iommu_priv = master;
+
+ master->contexts = RB_ROOT;
}
/* Check the SIDs are in range of the SMMU and our stream table */
@@ -2475,6 +2769,9 @@ static int arm_smmu_add_device(struct device *dev)
ats_enabled = !arm_smmu_enable_ats(master);
+ if (arm_smmu_master_supports_svm(master))
+ arm_smmu_insert_master(smmu, master);
+
group = iommu_group_get_for_dev(dev);
if (IS_ERR(group)) {
ret = PTR_ERR(group);
@@ -2510,6 +2807,7 @@ static void arm_smmu_remove_device(struct device *dev)
struct arm_smmu_device *smmu;
struct iommu_group *group;
unsigned long flags;
+ int i;
if (!fwspec || fwspec->ops != &arm_smmu_ops)
return;
@@ -2520,6 +2818,16 @@ static void arm_smmu_remove_device(struct device *dev)
arm_smmu_detach_dev(dev);
if (master) {
+ if (master->streams) {
+ spin_lock(&smmu->contexts_lock);
+ for (i = 0; i < fwspec->num_ids; i++)
+ rb_erase(&master->streams[i].node,
+ &smmu->streams);
+ spin_unlock(&smmu->contexts_lock);
+
+ kfree(master->streams);
+ }
+
group = iommu_group_get(dev);
smmu_group = to_smmu_group(group);
@@ -2820,6 +3128,10 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
{
int ret;
+ spin_lock_init(&smmu->contexts_lock);
+ smmu->streams = RB_ROOT;
+ INIT_LIST_HEAD(&smmu->tasks);
+
ret = arm_smmu_init_queues(smmu);
if (ret)
return ret;
--
2.11.0
* [RFC PATCH 11/30] arm64: mm: Pin down ASIDs for sharing contexts with devices
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (9 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 10/30] iommu/arm-smmu-v3: Add task contexts Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 12/30] iommu/arm-smmu-v3: Keep track of process address spaces Jean-Philippe Brucker
` (19 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
In order to enable address space sharing with the IOMMU, we introduce
functions mm_context_get and mm_context_put, which pin down a context and
ensure that its ASID won't be modified willy-nilly after a rollover.
Pinning is necessary because, once a device is using an ASID, it needs a
valid and unique one at all times, whether the associated task is running
or not.
Without pinning, we would need to notify the IOMMU when we're about to use
a new ASID for a task. Things would get messy when a new task is assigned
a shared ASID. Consider the following scenario:
1. Task t1 is running on CPUx with shared ASID (1, 1)
2. Task t2 is scheduled on CPUx, gets ASID (1, 2)
3. Task tn is scheduled on CPUy, a rollover occurs, tn gets ASID (2, 1)
We would now have to immediately generate a new ASID for t1, notify
the IOMMU, and finally enable task tn. We are holding the lock during
all that time, since we can't afford having another CPU trigger a
rollover.
It gets needlessly complicated, and all we wanted to do was schedule poor
task tn, which has no business with the IOMMU. By letting the IOMMU pin
tasks when needed, we avoid stalling the slow path, and let the pinning
fail when we're out of potential ASIDs.
After a rollover, we assume that there is at least one more ASID than the
number of CPUs. So we can use (NR_ASIDS - NR_CPUS - 1) as a hard limit for
the number of ASIDs we can afford to share with the IOMMU.
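For example, with 16-bit ASIDs and 64 possible CPUs, that still leaves
65536 - 64 - 1 = 65471 ASIDs available for pinning.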
Since multiple IOMMUs could pin the same context, we need to keep track of
the number of references. Add a refcount value in mm_context_t for this
purpose.
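A user of this interface, typically an IOMMU driver binding a device to a
process, would do something along these lines (illustration only, not part
of this patch):

	unsigned long asid;

	asid = mm_context_get(mm);
	if (!asid)
		return -ENOSPC;	/* pinned-ASID limit reached */

	/* ... install 'asid' in the IOMMU's context descriptor ... */

	/* and once the device stops using this address space: */
	mm_context_put(mm);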
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
arch/arm64/include/asm/mmu.h | 1 +
arch/arm64/include/asm/mmu_context.h | 11 ++++-
arch/arm64/mm/context.c | 80 +++++++++++++++++++++++++++++++++++-
3 files changed, 90 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 47619411f0ff..e18a6dfcf745 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -18,6 +18,7 @@
typedef struct {
atomic64_t id;
+ unsigned long refcount;
void *vdso;
unsigned long flags;
} mm_context_t;
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 1ef40d82cfd3..ce4216e09e13 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -152,7 +152,13 @@ static inline void cpu_replace_ttbr1(pgd_t *pgd)
#define destroy_context(mm) do { } while(0)
void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
-#define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; })
+static inline int
+init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ atomic64_set(&mm->context.id, 0);
+ mm->context.refcount = 0;
+ return 0;
+}
/*
* This is called when "tsk" is about to enter lazy TLB mode.
@@ -224,6 +230,9 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
void verify_cpu_asid_bits(void);
+unsigned long mm_context_get(struct mm_struct *mm);
+void mm_context_put(struct mm_struct *mm);
+
#endif /* !__ASSEMBLY__ */
#endif /* !__ASM_MMU_CONTEXT_H */
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 68634c630cdd..b8ddd46bedb6 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -37,6 +37,10 @@ static DEFINE_PER_CPU(atomic64_t, active_asids);
static DEFINE_PER_CPU(u64, reserved_asids);
static cpumask_t tlb_flush_pending;
+static unsigned long max_pinned_asids;
+static unsigned long nr_pinned_asids;
+static unsigned long *pinned_asid_map;
+
#define ASID_MASK (~GENMASK(asid_bits - 1, 0))
#define ASID_FIRST_VERSION (1UL << asid_bits)
#define NUM_USER_ASIDS ASID_FIRST_VERSION
@@ -92,7 +96,7 @@ static void flush_context(unsigned int cpu)
u64 asid;
/* Update the list of reserved ASIDs and the ASID bitmap. */
- bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
+ bitmap_copy(asid_map, pinned_asid_map, NUM_USER_ASIDS);
set_reserved_asid_bits();
@@ -157,6 +161,10 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu)
if (asid != 0) {
u64 newasid = generation | (asid & ~ASID_MASK);
+ /* That ASID is pinned for us, we're good to go. */
+ if (mm->context.refcount)
+ return newasid;
+
/*
* If our current ASID was active during a rollover, we
* can continue to use it and this was just a false alarm.
@@ -238,6 +246,63 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
cpu_switch_mm(mm->pgd, mm);
}
+unsigned long mm_context_get(struct mm_struct *mm)
+{
+ unsigned long flags;
+ u64 asid;
+
+ raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+
+ asid = atomic64_read(&mm->context.id);
+
+ if (mm->context.refcount) {
+ mm->context.refcount++;
+ asid &= ~ASID_MASK;
+ goto out_unlock;
+ }
+
+ if (nr_pinned_asids >= max_pinned_asids) {
+ asid = 0;
+ goto out_unlock;
+ }
+
+ if (((asid ^ atomic64_read(&asid_generation)) >> asid_bits)) {
+ /*
+ * We went through one or more rollover since that ASID was
+ * used. Ensure that it is still valid, or generate a new one.
+ * The cpu argument isn't used by new_context.
+ */
+ asid = new_context(mm, 0);
+ atomic64_set(&mm->context.id, asid);
+ }
+
+ asid &= ~ASID_MASK;
+
+ nr_pinned_asids++;
+ __set_bit(asid, pinned_asid_map);
+ mm->context.refcount++;
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+
+ return asid;
+}
+
+void mm_context_put(struct mm_struct *mm)
+{
+ unsigned long flags;
+ u64 asid = atomic64_read(&mm->context.id) & ~ASID_MASK;
+
+ raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+
+ if (--mm->context.refcount == 0) {
+ __clear_bit(asid, pinned_asid_map);
+ nr_pinned_asids--;
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+}
+
static int asids_init(void)
{
asid_bits = get_cpu_asid_bits();
@@ -255,6 +320,19 @@ static int asids_init(void)
set_reserved_asid_bits();
+ pinned_asid_map = kzalloc(BITS_TO_LONGS(NUM_USER_ASIDS)
+ * sizeof(*pinned_asid_map), GFP_KERNEL);
+ if (!pinned_asid_map)
+ panic("Failed to allocate pinned bitmap\n");
+
+ /*
+ * We assume that an ASID is always available after a rollover. This
+ * means that even if all CPUs have a reserved ASID, there still is at
+ * least one slot available in the asid_bitmap.
+ */
+ max_pinned_asids = NUM_USER_ASIDS - num_possible_cpus() - 2;
+ nr_pinned_asids = 0;
+
pr_info("ASID allocator initialised with %lu entries\n", NUM_USER_ASIDS);
return 0;
}
--
2.11.0
* [RFC PATCH 12/30] iommu/arm-smmu-v3: Keep track of process address spaces
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (10 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 11/30] arm64: mm: Pin down ASIDs for sharing contexts with devices Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 13/30] iommu/io-pgtable-arm: Factor out ARM LPAE register defines Jean-Philippe Brucker
` (18 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Binding to a task requires the following steps:
* Pin down the mm context, and use its ASID to tag SMMU mappings. If
the ASID is already in use by the SMMU for a domain using the DMA API,
abort the binding. We can steal the domain's ASID since it is allocated
by the SMMU, but this deserves a patch of its own.
* Register an MMU notifier to mirror invalidations of the system's TLBs
with a device's ATC.
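As an illustration of the resulting flow (paths simplified), unmapping
pages from a bound address space ends up invalidating the ATCs:

	munmap() on the bound mm
	  -> MMU notifier invalidate_range(mm, start, end)
	    -> arm_smmu_atc_invalidate_task(smmu_task, start, end - start)
	      -> one CMD_ATC_INV per bound context (tagged with its SSID),
	         followed by a CMD_SYNC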
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 171 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 169 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 5b4d1f265194..ab49164bf09b 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -29,6 +29,8 @@
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/iopoll.h>
+#include <linux/mmu_context.h>
+#include <linux/mmu_notifier.h>
#include <linux/module.h>
#include <linux/msi.h>
#include <linux/of.h>
@@ -761,6 +763,9 @@ struct arm_smmu_task {
struct arm_smmu_s1_cfg s1_cfg;
+ struct mmu_notifier mmu_notifier;
+ struct mm_struct *mm;
+
struct kref kref;
};
@@ -1984,6 +1989,33 @@ static size_t arm_smmu_atc_invalidate_domain(struct arm_smmu_domain *smmu_domain
return size;
}
+static size_t arm_smmu_atc_invalidate_task(struct arm_smmu_task *smmu_task,
+ unsigned long iova, size_t size)
+{
+ struct arm_smmu_cmdq_ent cmd;
+ struct arm_smmu_context *smmu_context;
+ struct arm_smmu_device *smmu = smmu_task->smmu;
+ struct arm_smmu_cmdq_ent sync_cmd = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ };
+
+ arm_smmu_atc_invalidate_to_cmd(smmu, iova, size, &cmd);
+ cmd.substream_valid = true;
+
+ spin_lock(&smmu->contexts_lock);
+
+ list_for_each_entry(smmu_context, &smmu_task->contexts, task_head) {
+ cmd.atc.ssid = smmu_context->ssid;
+ arm_smmu_atc_invalidate_master(smmu_context->master, &cmd);
+ }
+
+ spin_unlock(&smmu->contexts_lock);
+
+ arm_smmu_cmdq_issue_cmd(smmu, &sync_cmd);
+
+ return size;
+}
+
/* IOMMU API */
static bool arm_smmu_capable(enum iommu_cap cap)
{
@@ -2105,26 +2137,148 @@ static void arm_smmu_put_context(struct arm_smmu_device *smmu,
spin_unlock(&smmu->contexts_lock);
}
+static struct arm_smmu_task *mn_to_task(struct mmu_notifier *mn)
+{
+ return container_of(mn, struct arm_smmu_task, mmu_notifier);
+}
+
+static void arm_smmu_notifier_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct arm_smmu_task *smmu_task = mn_to_task(mn);
+
+ arm_smmu_atc_invalidate_task(smmu_task, start, end - start);
+}
+
+static void arm_smmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ arm_smmu_notifier_invalidate_range(mn, mm, address, address + PAGE_SIZE);
+}
+
+static int arm_smmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ arm_smmu_notifier_invalidate_range(mn, mm, start, end);
+
+ return 0;
+}
+
+static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
+ .invalidate_page = arm_smmu_notifier_invalidate_page,
+ .invalidate_range = arm_smmu_notifier_invalidate_range,
+ .clear_flush_young = arm_smmu_notifier_clear_flush_young,
+};
+
+static int arm_smmu_context_share(struct arm_smmu_task *smmu_task, int asid)
+{
+ int ret = 0;
+ struct arm_smmu_device *smmu = smmu_task->smmu;
+
+ if (test_and_set_bit(asid, smmu->asid_map))
+ /* ASID is already used for a domain */
+ return -EEXIST;
+
+ return ret;
+}
+
+static int arm_smmu_init_task_pgtable(struct arm_smmu_task *smmu_task)
+{
+ int ret;
+ int asid;
+
+ /* Pin ASID on the CPU side */
+ asid = mm_context_get(smmu_task->mm);
+ if (!asid)
+ return -ENOSPC;
+
+ ret = arm_smmu_context_share(smmu_task, asid);
+ if (ret) {
+ mm_context_put(smmu_task->mm);
+ return ret;
+ }
+
+ /* TODO: Initialize the rest of s1_cfg */
+ smmu_task->s1_cfg.asid = asid;
+
+ return 0;
+}
+
+static void arm_smmu_free_task_pgtable(struct arm_smmu_task *smmu_task)
+{
+ struct arm_smmu_device *smmu = smmu_task->smmu;
+
+ mm_context_put(smmu_task->mm);
+
+ arm_smmu_bitmap_free(smmu->asid_map, smmu_task->s1_cfg.asid);
+}
+
__maybe_unused
static struct arm_smmu_task *arm_smmu_alloc_task(struct arm_smmu_device *smmu,
struct task_struct *task)
{
+ int ret;
+ struct mm_struct *mm;
struct arm_smmu_task *smmu_task;
+ mm = get_task_mm(task);
+ if (!mm)
+ return ERR_PTR(-EINVAL);
+
smmu_task = kzalloc(sizeof(*smmu_task), GFP_KERNEL);
- if (!smmu_task)
- return ERR_PTR(-ENOMEM);
+ if (!smmu_task) {
+ ret = -ENOMEM;
+ goto err_put_mm;
+ }
smmu_task->smmu = smmu;
smmu_task->pid = get_task_pid(task, PIDTYPE_PID);
+ smmu_task->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops;
+ smmu_task->mm = mm;
INIT_LIST_HEAD(&smmu_task->contexts);
kref_init(&smmu_task->kref);
+ ret = arm_smmu_init_task_pgtable(smmu_task);
+ if (ret)
+ goto err_free_task;
+
+ /*
+ * TODO: check conflicts between task mappings and reserved HW
+ * mappings. It is unclear which reserved mappings might be affected
+ * because, for instance, devices are unlikely to send MSIs tagged with
+ * PASIDs so we (probably) don't need to carve out MSI regions from the
+ * task address space. Clarify this.
+ */
+
+ ret = mmu_notifier_register(&smmu_task->mmu_notifier, mm);
+ if (ret)
+ goto err_free_pgtable;
+
spin_lock(&smmu->contexts_lock);
list_add(&smmu_task->smmu_head, &smmu->tasks);
spin_unlock(&smmu->contexts_lock);
+ /* A reference to mm is kept by the notifier */
+ mmput(mm);
+
return smmu_task;
+
+err_free_pgtable:
+ arm_smmu_free_task_pgtable(smmu_task);
+
+err_free_task:
+ put_pid(smmu_task->pid);
+ kfree(smmu_task);
+
+err_put_mm:
+ mmput(mm);
+
+ return ERR_PTR(ret);
}
/* Caller must hold contexts_lock */
@@ -2151,8 +2305,21 @@ static void arm_smmu_free_task(struct kref *kref)
list_del(&smmu_task->smmu_head);
+ /*
+ * Release the lock temporarily to unregister the notifier. This is safe
+ * because the task is not accessible anymore.
+ */
+ spin_unlock(&smmu->contexts_lock);
+
+ /* Unpin ASID */
+ arm_smmu_free_task_pgtable(smmu_task);
+
+ mmu_notifier_unregister(&smmu_task->mmu_notifier, smmu_task->mm);
+
put_pid(smmu_task->pid);
kfree(smmu_task);
+
+ spin_lock(&smmu->contexts_lock);
}
static void _arm_smmu_put_task(struct arm_smmu_task *smmu_task)
--
2.11.0
* [RFC PATCH 13/30] iommu/io-pgtable-arm: Factor out ARM LPAE register defines
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (11 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 12/30] iommu/arm-smmu-v3: Keep track of process address spaces Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 14/30] iommu/arm-smmu-v3: Share process page tables Jean-Philippe Brucker
` (17 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
For a unified address space, we'll need to extract CPU page table
information and mirror it in the substream setup. Move the relevant defines
to a common header.
Fix TCR_SZ_MASK while we're at it: T0SZ is a 6-bit field, so the mask
should be 0x3f rather than 0xf.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
MAINTAINERS | 1 +
drivers/iommu/io-pgtable-arm.c | 48 +-----------------------------
drivers/iommu/io-pgtable-arm.h | 67 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 69 insertions(+), 47 deletions(-)
create mode 100644 drivers/iommu/io-pgtable-arm.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 8d97b1d83c3e..bb089396cdd1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2025,6 +2025,7 @@ S: Maintained
F: drivers/iommu/arm-smmu.c
F: drivers/iommu/arm-smmu-v3.c
F: drivers/iommu/io-pgtable-arm.c
+F: drivers/iommu/io-pgtable-arm.h
F: drivers/iommu/io-pgtable-arm-v7s.c
ARM64 PORT (AARCH64 ARCHITECTURE)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index feacc54bec68..973c335afd04 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -30,6 +30,7 @@
#include <asm/barrier.h>
#include "io-pgtable.h"
+#include "io-pgtable-arm.h"
#define ARM_LPAE_MAX_ADDR_BITS 48
#define ARM_LPAE_S2_MAX_CONCAT_PAGES 16
@@ -115,53 +116,6 @@
#define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2)
/* Register bits */
-#define ARM_32_LPAE_TCR_EAE (1 << 31)
-#define ARM_64_LPAE_S2_TCR_RES1 (1 << 31)
-
-#define ARM_LPAE_TCR_EPD1 (1 << 23)
-
-#define ARM_LPAE_TCR_TG0_4K (0 << 14)
-#define ARM_LPAE_TCR_TG0_64K (1 << 14)
-#define ARM_LPAE_TCR_TG0_16K (2 << 14)
-
-#define ARM_LPAE_TCR_SH0_SHIFT 12
-#define ARM_LPAE_TCR_SH0_MASK 0x3
-#define ARM_LPAE_TCR_SH_NS 0
-#define ARM_LPAE_TCR_SH_OS 2
-#define ARM_LPAE_TCR_SH_IS 3
-
-#define ARM_LPAE_TCR_ORGN0_SHIFT 10
-#define ARM_LPAE_TCR_IRGN0_SHIFT 8
-#define ARM_LPAE_TCR_RGN_MASK 0x3
-#define ARM_LPAE_TCR_RGN_NC 0
-#define ARM_LPAE_TCR_RGN_WBWA 1
-#define ARM_LPAE_TCR_RGN_WT 2
-#define ARM_LPAE_TCR_RGN_WB 3
-
-#define ARM_LPAE_TCR_SL0_SHIFT 6
-#define ARM_LPAE_TCR_SL0_MASK 0x3
-
-#define ARM_LPAE_TCR_T0SZ_SHIFT 0
-#define ARM_LPAE_TCR_SZ_MASK 0xf
-
-#define ARM_LPAE_TCR_PS_SHIFT 16
-#define ARM_LPAE_TCR_PS_MASK 0x7
-
-#define ARM_LPAE_TCR_IPS_SHIFT 32
-#define ARM_LPAE_TCR_IPS_MASK 0x7
-
-#define ARM_LPAE_TCR_PS_32_BIT 0x0ULL
-#define ARM_LPAE_TCR_PS_36_BIT 0x1ULL
-#define ARM_LPAE_TCR_PS_40_BIT 0x2ULL
-#define ARM_LPAE_TCR_PS_42_BIT 0x3ULL
-#define ARM_LPAE_TCR_PS_44_BIT 0x4ULL
-#define ARM_LPAE_TCR_PS_48_BIT 0x5ULL
-
-#define ARM_LPAE_MAIR_ATTR_SHIFT(n) ((n) << 3)
-#define ARM_LPAE_MAIR_ATTR_MASK 0xff
-#define ARM_LPAE_MAIR_ATTR_DEVICE 0x04
-#define ARM_LPAE_MAIR_ATTR_NC 0x44
-#define ARM_LPAE_MAIR_ATTR_WBRWA 0xff
#define ARM_LPAE_MAIR_ATTR_IDX_NC 0
#define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1
#define ARM_LPAE_MAIR_ATTR_IDX_DEV 2
diff --git a/drivers/iommu/io-pgtable-arm.h b/drivers/iommu/io-pgtable-arm.h
new file mode 100644
index 000000000000..cb31314971ac
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm.h
@@ -0,0 +1,67 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2017 ARM Limited
+ */
+#ifndef __IO_PGTABLE_ARM_H
+#define __IO_PGTABLE_ARM_H
+
+#define ARM_32_LPAE_TCR_EAE (1 << 31)
+#define ARM_64_LPAE_S2_TCR_RES1 (1 << 31)
+
+#define ARM_LPAE_TCR_EPD1 (1 << 23)
+
+#define ARM_LPAE_TCR_TG0_4K (0 << 14)
+#define ARM_LPAE_TCR_TG0_64K (1 << 14)
+#define ARM_LPAE_TCR_TG0_16K (2 << 14)
+
+#define ARM_LPAE_TCR_SH0_SHIFT 12
+#define ARM_LPAE_TCR_SH0_MASK 0x3
+#define ARM_LPAE_TCR_SH_NS 0
+#define ARM_LPAE_TCR_SH_OS 2
+#define ARM_LPAE_TCR_SH_IS 3
+
+#define ARM_LPAE_TCR_ORGN0_SHIFT 10
+#define ARM_LPAE_TCR_IRGN0_SHIFT 8
+#define ARM_LPAE_TCR_RGN_MASK 0x3
+#define ARM_LPAE_TCR_RGN_NC 0
+#define ARM_LPAE_TCR_RGN_WBWA 1
+#define ARM_LPAE_TCR_RGN_WT 2
+#define ARM_LPAE_TCR_RGN_WB 3
+
+#define ARM_LPAE_TCR_SL0_SHIFT 6
+#define ARM_LPAE_TCR_SL0_MASK 0x3
+
+#define ARM_LPAE_TCR_T0SZ_SHIFT 0
+#define ARM_LPAE_TCR_SZ_MASK 0x3f
+
+#define ARM_LPAE_TCR_PS_SHIFT 16
+#define ARM_LPAE_TCR_PS_MASK 0x7
+
+#define ARM_LPAE_TCR_IPS_SHIFT 32
+#define ARM_LPAE_TCR_IPS_MASK 0x7
+
+#define ARM_LPAE_TCR_PS_32_BIT 0x0ULL
+#define ARM_LPAE_TCR_PS_36_BIT 0x1ULL
+#define ARM_LPAE_TCR_PS_40_BIT 0x2ULL
+#define ARM_LPAE_TCR_PS_42_BIT 0x3ULL
+#define ARM_LPAE_TCR_PS_44_BIT 0x4ULL
+#define ARM_LPAE_TCR_PS_48_BIT 0x5ULL
+
+#define ARM_LPAE_MAIR_ATTR_SHIFT(n) ((n) << 3)
+#define ARM_LPAE_MAIR_ATTR_MASK 0xff
+#define ARM_LPAE_MAIR_ATTR_DEVICE 0x04
+#define ARM_LPAE_MAIR_ATTR_NC 0x44
+#define ARM_LPAE_MAIR_ATTR_WBRWA 0xff
+
+#endif /* __IO_PGTABLE_ARM_H */
--
2.11.0
* [RFC PATCH 14/30] iommu/arm-smmu-v3: Share process page tables
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (12 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 13/30] iommu/io-pgtable-arm: Factor out ARM LPAE register defines Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 15/30] iommu/arm-smmu-v3: Steal private ASID from a domain Jean-Philippe Brucker
` (16 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Copy the content of TCR, MAIR and TTBR of a given task into a context
descriptor.
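For example, on a kernel using 4K pages and 48-bit VAs, the resulting
context descriptor ends up with T0SZ=16, TG0=4K, IRGN0=ORGN0=WBWA,
SH0=Inner Shareable, EPD1=1 (no TTBR1 walks), TBI0=1, IPS copied from
ID_AA64MMFR0_EL1.PARange, TTBR0 pointing at the task's pgd and MAIR taken
from MAIR_EL1.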
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 39 +++++++++++++++++++++++++++++++++++++--
1 file changed, 37 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index ab49164bf09b..c3fa4616bd58 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -44,6 +44,7 @@
#include <linux/amba/bus.h>
#include "io-pgtable.h"
+#include "io-pgtable-arm.h"
/* MMIO registers */
#define ARM_SMMU_IDR0 0x0
@@ -2191,6 +2192,9 @@ static int arm_smmu_init_task_pgtable(struct arm_smmu_task *smmu_task)
{
int ret;
int asid;
+ unsigned long tcr;
+ unsigned long reg, par;
+ struct arm_smmu_s1_cfg *cfg = &smmu_task->s1_cfg;
/* Pin ASID on the CPU side */
asid = mm_context_get(smmu_task->mm);
@@ -2203,8 +2207,39 @@ static int arm_smmu_init_task_pgtable(struct arm_smmu_task *smmu_task)
return ret;
}
- /* TODO: Initialize the rest of s1_cfg */
- smmu_task->s1_cfg.asid = asid;
+ tcr = TCR_T0SZ(VA_BITS) | TCR_IRGN0_WBWA | TCR_ORGN0_WBWA |
+ TCR_SH0_INNER | ARM_LPAE_TCR_EPD1;
+
+ switch (PAGE_SIZE) {
+ case SZ_4K:
+ tcr |= TCR_TG0_4K;
+ break;
+ case SZ_16K:
+ tcr |= TCR_TG0_16K;
+ break;
+ case SZ_64K:
+ tcr |= TCR_TG0_64K;
+ break;
+ default:
+ WARN_ON(1);
+ return -EFAULT;
+ }
+
+ reg = read_system_reg(SYS_ID_AA64MMFR0_EL1);
+ par = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT);
+ tcr |= par << ARM_LPAE_TCR_IPS_SHIFT;
+
+ /* Enable this by default, it will be filtered when writing the CD */
+ tcr |= TCR_TBI0;
+
+ cfg->asid = asid;
+ cfg->ttbr = virt_to_phys(smmu_task->mm->pgd);
+ /*
+ * MAIR value is pretty much constant and global, so we can just get it
+ * from the current CPU register
+ */
+ cfg->mair = read_sysreg(mair_el1);
+ cfg->tcr = tcr;
return 0;
}
--
2.11.0
* [RFC PATCH 15/30] iommu/arm-smmu-v3: Steal private ASID from a domain
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (13 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 14/30] iommu/arm-smmu-v3: Share process page tables Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 16/30] iommu/arm-smmu-v3: Use shared ASID set Jean-Philippe Brucker
` (15 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
The SMMU only has one ASID space, so the task allocator competes with the
domain allocator for ASIDs. Task ASIDs are shared with CPUs, whereas
domain ASIDs are private to the SMMU, and not affected by broadcast TLB
invalidations. When the task allocator pins an mm_context and gets an ASID
that is already used by the SMMU, that ASID belongs to a domain. Attempt to
assign a new ASID to the domain, and steal the old one for our shared
context.
Replacing an ASID requires some pretty invasive introspection. We could
try to do fine-grained locking on asid_map, domains list, context
descriptors and domains themselves, but it gets terribly complicated and
my brain melted twice before I could find solutions to all lock
dependencies.
Instead, introduce a big fat mutex around domain and (default) context
modifications. It ensures that arm_smmu_context_share finds the domain
that owns the ASID we want, and can then update all associated context
descriptors without racing with attach/detach_dev. Note that domain_free is
called after devices have been removed from the group, so
arm_smmu_context_share might do the whole ASID replacement dance for
nothing, but it is harmless.
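In short, when the pinned ASID turns out to be taken by a domain,
arm_smmu_context_share now does the following (see the diff below):

1. Allocate a fresh private ASID for the domain.
2. Find the stage-1 domain owning the old ASID in the smmu->domains list.
3. Update the domain's s1_cfg.asid and rewrite context descriptor 0 for
   every master attached to the domain.
4. Invalidate TLB entries tagged with the old ASID (TLBI_NH_ASID or
   TLBI_EL2_ASID, followed by CMD_SYNC).
5. Let the shared context use the old ASID.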
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 98 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 94 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index c3fa4616bd58..3af47b1427a6 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -708,6 +708,9 @@ struct arm_smmu_device {
spinlock_t contexts_lock;
struct rb_root streams;
struct list_head tasks;
+
+ struct list_head domains;
+ struct mutex domains_mutex;
};
struct arm_smmu_stream {
@@ -752,6 +755,8 @@ struct arm_smmu_domain {
struct list_head groups;
spinlock_t groups_lock;
+
+ struct list_head list; /* For domain search by ASID */
};
struct arm_smmu_task {
@@ -2179,11 +2184,79 @@ static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
static int arm_smmu_context_share(struct arm_smmu_task *smmu_task, int asid)
{
int ret = 0;
+ int new_asid;
+ unsigned long flags;
+ struct arm_smmu_group *smmu_group;
+ struct arm_smmu_master_data *master;
struct arm_smmu_device *smmu = smmu_task->smmu;
+ struct arm_smmu_domain *tmp_domain, *smmu_domain = NULL;
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = smmu->features & ARM_SMMU_FEAT_E2H ?
+ CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID,
+ };
+
+ mutex_lock(&smmu->domains_mutex);
+
+ if (!test_and_set_bit(asid, smmu->asid_map))
+ goto out_unlock;
+
+ /* ASID is used by a domain. Try to replace it with a new one. */
+ new_asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits);
+ if (new_asid < 0) {
+ ret = new_asid;
+ goto out_unlock;
+ }
+
+ list_for_each_entry(tmp_domain, &smmu->domains, list) {
+ if (tmp_domain->stage != ARM_SMMU_DOMAIN_S1 ||
+ tmp_domain->s1_cfg.asid != asid)
+ continue;
+
+ smmu_domain = tmp_domain;
+ break;
+ }
+
+ /*
+ * We didn't find the domain that owns this ASID. It is a bug, since we
+ * hold domains_mutex
+ */
+ if (WARN_ON(!smmu_domain)) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+
+ /*
+ * Race with smmu_unmap; TLB invalidations will start targeting the
+ * new ASID, which isn't assigned yet. We'll do an invalidate-all on
+ * the old ASID later, so it doesn't matter.
+ */
+ smmu_domain->s1_cfg.asid = new_asid;
- if (test_and_set_bit(asid, smmu->asid_map))
- /* ASID is already used for a domain */
- return -EEXIST;
+ /*
+ * Update ASID and invalidate CD in all associated masters. There will
+ * be some overlapping between use of both ASIDs, until we invalidate
+ * the TLB.
+ */
+ spin_lock_irqsave(&smmu_domain->groups_lock, flags);
+
+ list_for_each_entry(smmu_group, &smmu_domain->groups, domain_head) {
+ spin_lock(&smmu_group->devices_lock);
+ list_for_each_entry(master, &smmu_group->devices, group_head) {
+ arm_smmu_write_ctx_desc(master, 0, &smmu_domain->s1_cfg);
+ }
+ spin_unlock(&smmu_group->devices_lock);
+ }
+
+ spin_unlock_irqrestore(&smmu_domain->groups_lock, flags);
+
+ /* Invalidate TLB entries previously associated with that domain */
+ cmd.tlbi.asid = asid;
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ cmd.opcode = CMDQ_OP_CMD_SYNC;
+ arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+
+out_unlock:
+ mutex_unlock(&smmu->domains_mutex);
return ret;
}
@@ -2426,16 +2499,23 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
iommu_put_dma_cookie(domain);
free_io_pgtable_ops(smmu_domain->pgtbl_ops);
+ mutex_lock(&smmu->domains_mutex);
+
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
- if (cfg->asid)
+ if (cfg->asid) {
arm_smmu_bitmap_free(smmu->asid_map, cfg->asid);
+
+ list_del(&smmu_domain->list);
+ }
} else {
struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
if (cfg->vmid)
arm_smmu_bitmap_free(smmu->vmid_map, cfg->vmid);
}
+ mutex_unlock(&smmu->domains_mutex);
+
kfree(smmu_domain);
}
@@ -2455,6 +2535,8 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
cfg->tcr = pgtbl_cfg->arm_lpae_s1_cfg.tcr;
cfg->mair = pgtbl_cfg->arm_lpae_s1_cfg.mair[0];
+ list_add(&smmu_domain->list, &smmu->domains);
+
return 0;
}
@@ -2604,12 +2686,16 @@ static void arm_smmu_detach_dev(struct device *dev)
struct arm_smmu_context *smmu_context;
struct rb_node *node, *next;
+ mutex_lock(&smmu->domains_mutex);
+
master->ste.bypass = true;
if (arm_smmu_install_ste_for_dev(dev->iommu_fwspec) < 0)
dev_warn(dev, "failed to install bypass STE\n");
arm_smmu_write_ctx_desc(master, 0, NULL);
+ mutex_unlock(&smmu->domains_mutex);
+
if (!master->ste.valid)
return;
@@ -2682,6 +2768,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
arm_smmu_detach_dev(dev);
}
+ mutex_lock(&smmu->domains_mutex);
mutex_lock(&smmu_domain->init_mutex);
if (!smmu_domain->smmu) {
@@ -2726,6 +2813,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
out_unlock:
mutex_unlock(&smmu_domain->init_mutex);
+ mutex_unlock(&smmu->domains_mutex);
iommu_group_put(group);
@@ -3330,9 +3418,11 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
{
int ret;
+ mutex_init(&smmu->domains_mutex);
spin_lock_init(&smmu->contexts_lock);
smmu->streams = RB_ROOT;
INIT_LIST_HEAD(&smmu->tasks);
+ INIT_LIST_HEAD(&smmu->domains);
ret = arm_smmu_init_queues(smmu);
if (ret)
--
2.11.0
* [RFC PATCH 16/30] iommu/arm-smmu-v3: Use shared ASID set
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (14 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 15/30] iommu/arm-smmu-v3: Steal private ASID from a domain Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 17/30] iommu/arm-smmu-v3: Add SVM feature checking Jean-Philippe Brucker
` (14 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
We now have two exclusive sets of ASIDs: private and shared. SMMUv3 allows
for contexts to take part in distributed TLB maintenance via the ASET bit.
When this bit is 0 for a given context, TLB entries tagged with its ASID
are invalidated by broadcast TLB maintenance. Set ASET=0 for task contexts.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 3af47b1427a6..86d5430bd68d 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1263,7 +1263,8 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_master_data *master,
CTXDESC_CD_0_ENDI |
#endif
CTXDESC_CD_0_R | CTXDESC_CD_0_A |
- CTXDESC_CD_0_ASET_PRIVATE |
+ (ssid ? CTXDESC_CD_0_ASET_SHARED :
+ CTXDESC_CD_0_ASET_PRIVATE) |
CTXDESC_CD_0_AA64 |
(u64)cfg->asid << CTXDESC_CD_0_ASID_SHIFT |
CTXDESC_CD_0_V;
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 17/30] iommu/arm-smmu-v3: Add SVM feature checking
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (15 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 16/30] iommu/arm-smmu-v3: Use shared ASID set Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 18/30] PCI: Make "PRG Response PASID Required" handling common Jean-Philippe Brucker
` (13 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Aggregate all sanity checks for SVM under a single ARM_SMMU_FEAT_SVM bit.
For PCIe SVM, users also need to check FEAT_ATS and FEAT_PRI. For platform
SVM, they will most likely have to check ARM_SMMU_FEAT_STALLS.
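As an illustration of how these bits are meant to be combined, a PCIe consumer of SVM could gate itself on the aggregate bit plus ATS and PRI, roughly as below. This is only a sketch and not part of the patch; the helper name is made up:

static bool arm_smmu_can_do_pcie_svm(struct arm_smmu_device *smmu)
{
        /* SVM itself, plus ATS and PRI for recoverable PCIe page faults */
        u32 required = ARM_SMMU_FEAT_SVM | ARM_SMMU_FEAT_ATS | ARM_SMMU_FEAT_PRI;

        return (smmu->features & required) == required;
}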
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 63 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 63 insertions(+)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 86d5430bd68d..499dc1cd07eb 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -674,6 +674,7 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_HYP (1 << 12)
#define ARM_SMMU_FEAT_E2H (1 << 13)
#define ARM_SMMU_FEAT_BTM (1 << 14)
+#define ARM_SMMU_FEAT_SVM (1 << 15)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -3744,6 +3745,65 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
return 0;
}
+static bool arm_smmu_supports_svm(struct arm_smmu_device *smmu)
+{
+ unsigned long reg, fld;
+ unsigned long oas;
+ unsigned long asid_bits;
+
+ u32 feat_mask = ARM_SMMU_FEAT_BTM | ARM_SMMU_FEAT_COHERENCY;
+
+ if ((smmu->features & feat_mask) != feat_mask)
+ return false;
+
+ if (!smmu->ssid_bits)
+ return false;
+
+ if (!(smmu->pgsize_bitmap & PAGE_SIZE))
+ return false;
+
+ /*
+ * Get the smallest PA size of all CPUs (sanitized by cpufeature). We're
+ * not even pretending to support AArch32 here.
+ */
+ reg = read_system_reg(SYS_ID_AA64MMFR0_EL1);
+ fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_PARANGE_SHIFT);
+ switch (fld) {
+ case 0x0:
+ oas = 32;
+ break;
+ case 0x1:
+ oas = 36;
+ break;
+ case 0x2:
+ oas = 40;
+ break;
+ case 0x3:
+ oas = 42;
+ break;
+ case 0x4:
+ oas = 44;
+ break;
+ case 0x5:
+ oas = 48;
+ break;
+ default:
+ return false;
+ }
+
+ /* Abort if the CPU's page tables can output addresses wider than the SMMU supports. */
+ if (smmu->oas < oas)
+ return false;
+
+ /* We can support bigger ASIDs than the CPU, but not smaller */
+ fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_ASID_SHIFT);
+ asid_bits = fld ? 16 : 8;
+ if (smmu->asid_bits < asid_bits)
+ return false;
+
+ return true;
+}
+
static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
{
u32 reg;
@@ -3937,6 +3997,9 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
smmu->ias = max(smmu->ias, smmu->oas);
+ if (arm_smmu_supports_svm(smmu))
+ smmu->features |= ARM_SMMU_FEAT_SVM;
+
dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
smmu->ias, smmu->oas, smmu->features);
return 0;
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 18/30] PCI: Make "PRG Response PASID Required" handling common
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (16 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 17/30] iommu/arm-smmu-v3: Add SVM feature checking Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
[not found] ` <20170227195441.5170-19-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-27 19:54 ` [RFC PATCH 19/30] PCI: Cache PRI and PASID bits in pci_dev Jean-Philippe Brucker
` (12 subsequent siblings)
30 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
The PASID ECN to the PCIe spec added a bit in the PRI status register that
allows a Function to declare whether a PRG Response should contain the
PASID prefix or not.
Move the helper that accesses it from amd_iommu into the PCI subsystem,
renaming it to something more consistent with the spec, and introducing
another obscure acronym to make it all fit.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/amd_iommu.c | 19 +------------------
drivers/pci/ats.c | 17 +++++++++++++++++
include/linux/pci-ats.h | 8 ++++++++
include/uapi/linux/pci_regs.h | 1 +
4 files changed, 27 insertions(+), 18 deletions(-)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 98940d1392cb..c5c598bf4ba3 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2024,23 +2024,6 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev)
return ret;
}
-/* FIXME: Move this to PCI code */
-#define PCI_PRI_TLP_OFF (1 << 15)
-
-static bool pci_pri_tlp_required(struct pci_dev *pdev)
-{
- u16 status;
- int pos;
-
- pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
- if (!pos)
- return false;
-
- pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
-
- return (status & PCI_PRI_TLP_OFF) ? true : false;
-}
-
/*
* If a device is not yet associated with a domain, this function
* assigns it visible for the hardware
@@ -2069,7 +2052,7 @@ static int attach_device(struct device *dev,
dev_data->ats.enabled = true;
dev_data->ats.qdep = pci_ats_queue_depth(pdev);
- dev_data->pri_tlp = pci_pri_tlp_required(pdev);
+ dev_data->pri_tlp = pci_prg_resp_requires_prefix(pdev);
}
} else if (amd_iommu_iotlb_sup &&
pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index eeb9fb2b47aa..331376e9bb8b 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -334,3 +334,20 @@ int pci_max_pasids(struct pci_dev *pdev)
}
EXPORT_SYMBOL_GPL(pci_max_pasids);
#endif /* CONFIG_PCI_PASID */
+
+#if defined(CONFIG_PCI_PASID) && defined(CONFIG_PCI_PRI)
+bool pci_prg_resp_requires_prefix(struct pci_dev *pdev)
+{
+ u16 status;
+ int pos;
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+ if (!pos)
+ return false;
+
+ pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
+
+ return !!(status & PCI_PRI_STATUS_PRPR);
+}
+EXPORT_SYMBOL_GPL(pci_prg_resp_requires_prefix);
+#endif /* CONFIG_PCI_PASID && CONFIG_PCI_PRI */
diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index 57e0b8250947..e21bcacbe80c 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -57,5 +57,13 @@ static inline int pci_max_pasids(struct pci_dev *pdev)
#endif /* CONFIG_PCI_PASID */
+#if defined(CONFIG_PCI_PASID) && defined(CONFIG_PCI_PRI)
+bool pci_prg_resp_requires_prefix(struct pci_dev *pdev);
+#else
+static inline bool pci_prg_resp_requires_prefix(struct pci_dev *pdev)
+{
+ return false;
+}
+#endif /* CONFIG_PCI_PASID && CONFIG_PCI_PRI */
#endif /* LINUX_PCI_ATS_H*/
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 634c9c44ed6c..bae815876be6 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -864,6 +864,7 @@
#define PCI_PRI_STATUS_RF 0x001 /* Response Failure */
#define PCI_PRI_STATUS_UPRGI 0x002 /* Unexpected PRG index */
#define PCI_PRI_STATUS_STOPPED 0x100 /* PRI Stopped */
+#define PCI_PRI_STATUS_PRPR 0x8000 /* PRG Response requires PASID prefix */
#define PCI_PRI_MAX_REQ 0x08 /* PRI max reqs supported */
#define PCI_PRI_ALLOC_REQ 0x0c /* PRI max reqs allowed */
#define PCI_EXT_CAP_PRI_SIZEOF 16
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 19/30] PCI: Cache PRI and PASID bits in pci_dev
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (17 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 18/30] PCI: Make "PRG Response PASID Required" handling common Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
[not found] ` <20170227195441.5170-20-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-27 19:54 ` [RFC PATCH 20/30] iommu/arm-smmu-v3: Enable PCI PASID in masters Jean-Philippe Brucker
` (11 subsequent siblings)
30 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Device drivers need to check whether the IOMMU enabled ATS, PRI and PASID in
order to know when they can use the SVM API. Cache the PRI and PASID enable
bits in the pci_dev structure, as is currently done for ATS.
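For illustration only (not part of this patch), a device driver could then gate its use of the SVM API on the cached bits like this; the helper name is hypothetical:

static bool foo_device_can_do_svm(struct pci_dev *pdev)
{
        /* The IOMMU driver sets these bits when it enables each feature */
        return pdev->ats_enabled && pdev->pri_enabled && pdev->pasid_enabled;
}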
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/pci/ats.c | 23 +++++++++++++++++++++++
include/linux/pci.h | 2 ++
2 files changed, 25 insertions(+)
diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 331376e9bb8b..486dc2208119 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -153,6 +153,9 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
u32 max_requests;
int pos;
+ if (WARN_ON(pdev->pri_enabled))
+ return -EBUSY;
+
pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
if (!pos)
return -EINVAL;
@@ -170,6 +173,8 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
control |= PCI_PRI_CTRL_ENABLE;
pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
+ pdev->pri_enabled = 1;
+
return 0;
}
EXPORT_SYMBOL_GPL(pci_enable_pri);
@@ -185,6 +190,9 @@ void pci_disable_pri(struct pci_dev *pdev)
u16 control;
int pos;
+ if (WARN_ON(!pdev->pri_enabled))
+ return;
+
pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
if (!pos)
return;
@@ -192,6 +200,8 @@ void pci_disable_pri(struct pci_dev *pdev)
pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
control &= ~PCI_PRI_CTRL_ENABLE;
pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
+
+ pdev->pri_enabled = 0;
}
EXPORT_SYMBOL_GPL(pci_disable_pri);
@@ -207,6 +217,9 @@ int pci_reset_pri(struct pci_dev *pdev)
u16 control;
int pos;
+ if (WARN_ON(pdev->pri_enabled))
+ return -EBUSY;
+
pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
if (!pos)
return -EINVAL;
@@ -239,6 +252,9 @@ int pci_enable_pasid(struct pci_dev *pdev, int features)
u16 control, supported;
int pos;
+ if (WARN_ON(pdev->pasid_enabled))
+ return -EBUSY;
+
pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
if (!pos)
return -EINVAL;
@@ -259,6 +275,8 @@ int pci_enable_pasid(struct pci_dev *pdev, int features)
pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control);
+ pdev->pasid_enabled = 1;
+
return 0;
}
EXPORT_SYMBOL_GPL(pci_enable_pasid);
@@ -273,11 +291,16 @@ void pci_disable_pasid(struct pci_dev *pdev)
u16 control = 0;
int pos;
+ if (WARN_ON(!pdev->pasid_enabled))
+ return;
+
pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
if (!pos)
return;
pci_write_config_word(pdev, pos + PCI_PASID_CTRL, control);
+
+ pdev->pasid_enabled = 0;
}
EXPORT_SYMBOL_GPL(pci_disable_pasid);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e606f289bf5f..47c353ca9957 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -351,6 +351,8 @@ struct pci_dev {
unsigned int msix_enabled:1;
unsigned int ari_enabled:1; /* ARI forwarding */
unsigned int ats_enabled:1; /* Address Translation Service */
+ unsigned int pasid_enabled:1; /* Process Address Space ID */
+ unsigned int pri_enabled:1; /* Page Request Interface */
unsigned int is_managed:1;
unsigned int needs_freset:1; /* Dev requires fundamental reset */
unsigned int state_saved:1;
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 20/30] iommu/arm-smmu-v3: Enable PCI PASID in masters
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (18 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 19/30] PCI: Cache PRI and PASID bits in pci_dev Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
[not found] ` <20170227195441.5170-21-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-27 19:54 ` [RFC PATCH 21/30] iommu/arm-smmu-v3: Handle device faults from PRI Jean-Philippe Brucker
` (10 subsequent siblings)
30 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Enable PASID for PCI devices that support it.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 66 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 63 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 499dc1cd07eb..37fd061405e9 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -730,6 +730,8 @@ struct arm_smmu_master_data {
struct arm_smmu_stream *streams;
struct rb_root contexts;
+
+ u32 avail_contexts;
};
/* SMMU private data for an IOMMU domain */
@@ -2954,6 +2956,47 @@ static void arm_smmu_disable_ats(struct arm_smmu_master_data *master)
pci_disable_ats(pdev);
}
+static int arm_smmu_enable_ssid(struct arm_smmu_master_data *master)
+{
+ int ret;
+ int features;
+ int nr_ssids;
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(master->dev))
+ return -ENOSYS;
+
+ pdev = to_pci_dev(master->dev);
+
+ features = pci_pasid_features(pdev);
+ if (features < 0)
+ return -ENOSYS;
+
+ nr_ssids = pci_max_pasids(pdev);
+
+ dev_dbg(&pdev->dev, "device supports %#x SSIDs [%s%s]\n", nr_ssids,
+ (features & PCI_PASID_CAP_EXEC) ? "x" : "",
+ (features & PCI_PASID_CAP_PRIV) ? "p" : "");
+
+ ret = pci_enable_pasid(pdev, features);
+ return ret ? ret : nr_ssids;
+}
+
+static void arm_smmu_disable_ssid(struct arm_smmu_master_data *master)
+{
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(master->dev))
+ return;
+
+ pdev = to_pci_dev(master->dev);
+
+ if (!pdev->pasid_enabled)
+ return;
+
+ pci_disable_pasid(pdev);
+}
+
static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
struct arm_smmu_master_data *master)
{
@@ -3007,6 +3050,7 @@ static struct iommu_ops arm_smmu_ops;
static int arm_smmu_add_device(struct device *dev)
{
int i, ret;
+ int nr_ssids;
bool ats_enabled;
unsigned long flags;
struct arm_smmu_device *smmu;
@@ -3055,9 +3099,19 @@ static int arm_smmu_add_device(struct device *dev)
}
}
- ret = arm_smmu_alloc_cd_tables(master, 1);
- if (ret < 0)
- return ret;
+ /* PCIe PASID must be enabled before ATS */
+ nr_ssids = arm_smmu_enable_ssid(master);
+ if (nr_ssids <= 0)
+ nr_ssids = 1;
+
+ nr_ssids = arm_smmu_alloc_cd_tables(master, nr_ssids);
+ if (nr_ssids < 0) {
+ ret = nr_ssids;
+ goto err_disable_ssid;
+ }
+
+ /* SSID0 is reserved */
+ master->avail_contexts = nr_ssids - 1;
ats_enabled = !arm_smmu_enable_ats(master);
@@ -3088,6 +3142,9 @@ static int arm_smmu_add_device(struct device *dev)
arm_smmu_free_cd_tables(master);
+err_disable_ssid:
+ arm_smmu_disable_ssid(master);
+
return ret;
}
@@ -3129,7 +3186,10 @@ static void arm_smmu_remove_device(struct device *dev)
iommu_group_put(group);
+ /* PCIe PASID must be disabled after ATS */
arm_smmu_disable_ats(master);
+ arm_smmu_disable_ssid(master);
+
arm_smmu_free_cd_tables(master);
}
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 21/30] iommu/arm-smmu-v3: Handle device faults from PRI
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (19 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 20/30] iommu/arm-smmu-v3: Enable PCI PASID in masters Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
[not found] ` <8520D5D51A55D047800579B0941471982640F43C@XAP-PVEXMBX02.xlnx.xilinx.com>
2017-02-27 19:54 ` [RFC PATCH 22/30] iommu: Bind/unbind tasks to/from devices Jean-Philippe Brucker
` (9 subsequent siblings)
30 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
When we receive a PRI Page Request (PPR) from the SMMU, it contains a
context identifier SID:SSID, an IOVA and the requested access flags.
Look up the task context corresponding to SID:SSID, and call handle_mm_fault
on its mm. If memory management is able to fix the fault, we ask the device
to retry the access with a PRI_SUCCESS message. Otherwise send PRI_FAIL.
PPRs can be sent in batches identified by a Page Request Group (PRG). The
last request of a group is always marked with a flag, which tells the
fault handler to send a reply for the group. If a page request in the
group failed, reply with PRI_FAIL for the whole group.
Each device gets a number of credits, describing the number of PPRs it can
have in flight. The SMMU architecture says that the kernel should
carefully assign those credits such that the sum of all credits isn't
greater than the PRI queue size. Otherwise it is a programming error, says
the spec. This is impossible for us since we have no idea how many devices
will use PRI when we start assigning credits. In addition, new PRI-capable
devices might get hotplugged at any time, which would require us to stop
all existing devices and shrink their credits if we wanted to be fair.
This is not viable. Overcommit the PRI queue size and hand a fixed number
of credits to each device.
Our priority is therefore to drain the PRI queue as fast as possible, by
moving all PPRs to a workqueue that we'll call the "fault queue". When
adding support for handling page faults from platform devices, we'll
receive these events on the event queue, and inject them in the same fault
queue. Note that stall support is just around the corner, so this patch
attempts to abstract PCI notions where necessary.
The PCI specification defines a special PPR called "Stop Marker Message",
characterized by flags Read=Write=Last=0. This tells software that all
previous PPRs containing this PASID are invalid, and the next PPRs with
this PASID belong to a different address space. Subsequent patches handle
Stop Markers and overflow of the queue.
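As a sketch of what those later patches have to detect (the helper below is hypothetical and not part of this patch), a Stop Marker can be recognized from the decoded PRIQ entry:

static bool arm_smmu_fault_is_stop_marker(struct arm_smmu_fault *fault)
{
        /* PCIe Stop Marker Message: Read = Write = Last = 0 */
        return !fault->read && !fault->write && !fault->last;
}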
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 434 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 392 insertions(+), 42 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 37fd061405e9..5e0008ac68cb 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -270,6 +270,8 @@
#define STRTAB_STE_1_S1COR_SHIFT 4
#define STRTAB_STE_1_S1CSH_SHIFT 6
+#define STRTAB_STE_1_PPAR (1UL << 18)
+
#define STRTAB_STE_1_S1STALLD (1UL << 27)
#define STRTAB_STE_1_EATS_ABT 0UL
@@ -465,10 +467,13 @@ module_param_named(disable_ats_check, disable_ats_check, bool, S_IRUGO);
MODULE_PARM_DESC(disable_ats_check,
"By default, the SMMU checks whether each incoming transaction marked as translated is allowed by the stream configuration. This option disables the check.");
-enum pri_resp {
- PRI_RESP_DENY,
- PRI_RESP_FAIL,
- PRI_RESP_SUCC,
+enum fault_status {
+ /* Non-paging error. SMMU will not handle any fault from this device */
+ ARM_SMMU_FAULT_DENY,
+ /* Page fault is permanent, device shouldn't retry this access */
+ ARM_SMMU_FAULT_FAIL,
+ /* Fault has been handled, the access should be retried */
+ ARM_SMMU_FAULT_SUCC,
};
enum arm_smmu_msi_index {
@@ -553,7 +558,7 @@ struct arm_smmu_cmdq_ent {
u32 sid;
u32 ssid;
u16 grpid;
- enum pri_resp resp;
+ enum fault_status resp;
} pri;
#define CMDQ_OP_CMD_SYNC 0x46
@@ -642,6 +647,8 @@ struct arm_smmu_strtab_ent {
struct arm_smmu_cd_cfg cd_cfg;
struct arm_smmu_s1_cfg *s1_cfg;
struct arm_smmu_s2_cfg *s2_cfg;
+
+ bool prg_response_needs_ssid;
};
struct arm_smmu_strtab_cfg {
@@ -710,6 +717,8 @@ struct arm_smmu_device {
struct rb_root streams;
struct list_head tasks;
+ struct workqueue_struct *fault_queue;
+
struct list_head domains;
struct mutex domains_mutex;
};
@@ -731,6 +740,7 @@ struct arm_smmu_master_data {
struct arm_smmu_stream *streams;
struct rb_root contexts;
+ bool can_fault;
u32 avail_contexts;
};
@@ -762,6 +772,31 @@ struct arm_smmu_domain {
struct list_head list; /* For domain search by ASID */
};
+struct arm_smmu_fault {
+ struct arm_smmu_device *smmu;
+ u32 sid;
+ u32 ssid;
+ bool ssv;
+ u16 grpid;
+
+ u64 iova;
+ bool read;
+ bool write;
+ bool exec;
+ bool priv;
+
+ bool last;
+
+ struct work_struct work;
+};
+
+struct arm_smmu_pri_group {
+ u16 index;
+ enum fault_status resp;
+
+ struct list_head list;
+};
+
struct arm_smmu_task {
struct pid *pid;
@@ -775,6 +810,8 @@ struct arm_smmu_task {
struct mmu_notifier mmu_notifier;
struct mm_struct *mm;
+ struct list_head prgs;
+
struct kref kref;
};
@@ -815,6 +852,8 @@ static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
return container_of(dom, struct arm_smmu_domain, domain);
}
+static struct kmem_cache *arm_smmu_fault_cache;
+
#define to_smmu_group iommu_group_get_iommudata
static void parse_driver_options(struct arm_smmu_device *smmu)
@@ -1019,13 +1058,13 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
cmd[0] |= (u64)ent->pri.sid << CMDQ_PRI_0_SID_SHIFT;
cmd[1] |= ent->pri.grpid << CMDQ_PRI_1_GRPID_SHIFT;
switch (ent->pri.resp) {
- case PRI_RESP_DENY:
+ case ARM_SMMU_FAULT_DENY:
cmd[1] |= CMDQ_PRI_1_RESP_DENY;
break;
- case PRI_RESP_FAIL:
+ case ARM_SMMU_FAULT_FAIL:
cmd[1] |= CMDQ_PRI_1_RESP_FAIL;
break;
- case PRI_RESP_SUCC:
+ case ARM_SMMU_FAULT_SUCC:
cmd[1] |= CMDQ_PRI_1_RESP_SUCC;
break;
default:
@@ -1124,6 +1163,28 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
}
+static void arm_smmu_fault_reply(struct arm_smmu_fault *fault,
+ enum fault_status resp)
+{
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = CMDQ_OP_PRI_RESP,
+ .substream_valid = fault->ssv,
+ .pri = {
+ .sid = fault->sid,
+ .ssid = fault->ssid,
+ .grpid = fault->grpid,
+ .resp = resp,
+ },
+ };
+
+ if (!fault->last)
+ return;
+
+ arm_smmu_cmdq_issue_cmd(fault->smmu, &cmd);
+ cmd.opcode = CMDQ_OP_CMD_SYNC;
+ arm_smmu_cmdq_issue_cmd(fault->smmu, &cmd);
+}
+
/* Context descriptor manipulation functions */
static void arm_smmu_sync_cd(struct arm_smmu_master_data *master, u32 ssid,
bool leaf)
@@ -1587,6 +1648,9 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1) <<
STRTAB_STE_1_STRW_SHIFT);
+ if (ste->prg_response_needs_ssid)
+ dst[1] |= STRTAB_STE_1_PPAR;
+
if (smmu->features & ARM_SMMU_FEAT_STALLS)
dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
@@ -1704,42 +1768,37 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
return IRQ_HANDLED;
}
+static void arm_smmu_handle_fault(struct work_struct *work);
+
static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt)
{
- u32 sid, ssid;
- u16 grpid;
- bool ssv, last;
-
- sid = evt[0] >> PRIQ_0_SID_SHIFT & PRIQ_0_SID_MASK;
- ssv = evt[0] & PRIQ_0_SSID_V;
- ssid = ssv ? evt[0] >> PRIQ_0_SSID_SHIFT & PRIQ_0_SSID_MASK : 0;
- last = evt[0] & PRIQ_0_PRG_LAST;
- grpid = evt[1] >> PRIQ_1_PRG_IDX_SHIFT & PRIQ_1_PRG_IDX_MASK;
-
- dev_info(smmu->dev, "unexpected PRI request received:\n");
- dev_info(smmu->dev,
- "\tsid 0x%08x.0x%05x: [%u%s] %sprivileged %s%s%s access at iova 0x%016llx\n",
- sid, ssid, grpid, last ? "L" : "",
- evt[0] & PRIQ_0_PERM_PRIV ? "" : "un",
- evt[0] & PRIQ_0_PERM_READ ? "R" : "",
- evt[0] & PRIQ_0_PERM_WRITE ? "W" : "",
- evt[0] & PRIQ_0_PERM_EXEC ? "X" : "",
- evt[1] & PRIQ_1_ADDR_MASK << PRIQ_1_ADDR_SHIFT);
-
- if (last) {
- struct arm_smmu_cmdq_ent cmd = {
- .opcode = CMDQ_OP_PRI_RESP,
- .substream_valid = ssv,
- .pri = {
- .sid = sid,
- .ssid = ssid,
- .grpid = grpid,
- .resp = PRI_RESP_DENY,
- },
- };
+ struct arm_smmu_fault *fault;
+ struct arm_smmu_fault params = {
+ .smmu = smmu,
+
+ .sid = evt[0] >> PRIQ_0_SID_SHIFT & PRIQ_0_SID_MASK,
+ .ssv = evt[0] & PRIQ_0_SSID_V,
+ .ssid = evt[0] >> PRIQ_0_SSID_SHIFT & PRIQ_0_SSID_MASK,
+ .last = evt[0] & PRIQ_0_PRG_LAST,
+ .grpid = evt[1] >> PRIQ_1_PRG_IDX_SHIFT & PRIQ_1_PRG_IDX_MASK,
+
+ .iova = evt[1] & PRIQ_1_ADDR_MASK << PRIQ_1_ADDR_SHIFT,
+ .read = evt[0] & PRIQ_0_PERM_READ,
+ .write = evt[0] & PRIQ_0_PERM_WRITE,
+ .exec = evt[0] & PRIQ_0_PERM_EXEC,
+ .priv = evt[0] & PRIQ_0_PERM_PRIV,
+ };
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ fault = kmem_cache_alloc(arm_smmu_fault_cache, GFP_KERNEL);
+ if (!fault) {
+ /* Out of memory, tell the device to retry later */
+ arm_smmu_fault_reply(&params, ARM_SMMU_FAULT_SUCC);
+ return;
}
+
+ *fault = params;
+ INIT_WORK(&fault->work, arm_smmu_handle_fault);
+ queue_work(smmu->fault_queue, &fault->work);
}
static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
@@ -2138,7 +2197,6 @@ static void _arm_smmu_put_context(struct arm_smmu_context *smmu_context)
kref_put(&smmu_context->kref, arm_smmu_free_context);
}
-__maybe_unused
static void arm_smmu_put_context(struct arm_smmu_device *smmu,
struct arm_smmu_context *smmu_context)
{
@@ -2147,6 +2205,62 @@ static void arm_smmu_put_context(struct arm_smmu_device *smmu,
spin_unlock(&smmu->contexts_lock);
}
+/*
+ * Find context associated to a (@sid, @ssid) pair. If found, take a reference
+ * to the context and return it. Otherwise, return NULL. If a non-NULL master
+ * is provided, search context by @ssid, ignoring argument @sid.
+ */
+static struct arm_smmu_context *
+arm_smmu_get_context_by_id(struct arm_smmu_device *smmu,
+ struct arm_smmu_master_data *master,
+ u32 sid, u32 ssid)
+{
+ struct rb_node *node;
+ struct arm_smmu_stream *stream;
+ struct arm_smmu_context *cur_context, *smmu_context = NULL;
+
+ spin_lock(&smmu->contexts_lock);
+
+ if (!master) {
+ node = smmu->streams.rb_node;
+ while (node) {
+ stream = rb_entry(node, struct arm_smmu_stream, node);
+ if (stream->id < sid) {
+ node = node->rb_right;
+ } else if (stream->id > sid) {
+ node = node->rb_left;
+ } else {
+ master = stream->master;
+ break;
+ }
+ }
+ }
+
+ if (!master)
+ goto out_unlock;
+
+ node = master->contexts.rb_node;
+ while (node) {
+ cur_context = rb_entry(node, struct arm_smmu_context,
+ master_node);
+
+ if (cur_context->ssid < ssid) {
+ node = node->rb_right;
+ } else if (cur_context->ssid > ssid) {
+ node = node->rb_left;
+ } else {
+ smmu_context = cur_context;
+ kref_get(&smmu_context->kref);
+ break;
+ }
+ }
+
+out_unlock:
+ spin_unlock(&smmu->contexts_lock);
+
+ return smmu_context;
+}
+
static struct arm_smmu_task *mn_to_task(struct mmu_notifier *mn)
{
return container_of(mn, struct arm_smmu_task, mmu_notifier);
@@ -2353,6 +2467,7 @@ static struct arm_smmu_task *arm_smmu_alloc_task(struct arm_smmu_device *smmu,
smmu_task->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops;
smmu_task->mm = mm;
INIT_LIST_HEAD(&smmu_task->contexts);
+ INIT_LIST_HEAD(&smmu_task->prgs);
kref_init(&smmu_task->kref);
ret = arm_smmu_init_task_pgtable(smmu_task);
@@ -2399,6 +2514,7 @@ static void arm_smmu_free_task(struct kref *kref)
struct arm_smmu_device *smmu;
struct arm_smmu_task *smmu_task;
struct arm_smmu_master_data *master;
+ struct arm_smmu_pri_group *prg, *next_prg;
struct arm_smmu_context *smmu_context, *next;
smmu_task = container_of(kref, struct arm_smmu_task, kref);
@@ -2428,6 +2544,9 @@ static void arm_smmu_free_task(struct kref *kref)
mmu_notifier_unregister(&smmu_task->mmu_notifier, smmu_task->mm);
+ list_for_each_entry_safe(prg, next_prg, &smmu_task->prgs, list)
+ list_del(&prg->list);
+
put_pid(smmu_task->pid);
kfree(smmu_task);
@@ -2451,7 +2570,6 @@ static void arm_smmu_detach_task(struct arm_smmu_context *smmu_context)
arm_smmu_write_ctx_desc(smmu_context->master, smmu_context->ssid, NULL);
}
-__maybe_unused
static void arm_smmu_put_task(struct arm_smmu_device *smmu,
struct arm_smmu_task *smmu_task)
{
@@ -2460,6 +2578,167 @@ static void arm_smmu_put_task(struct arm_smmu_device *smmu,
spin_unlock(&smmu->contexts_lock);
}
+static int arm_smmu_handle_mm_fault(struct arm_smmu_device *smmu,
+ struct mm_struct *mm,
+ struct arm_smmu_fault *fault)
+{
+ int ret;
+ struct vm_area_struct *vma;
+ unsigned long access_flags = 0;
+ unsigned long fault_flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
+
+ /*
+ * We're holding smmu_task, which holds the mmu notifier, so mm is
+ * guaranteed to be here, but mm_users might still drop to zero when
+ * the task exits.
+ */
+ if (!mmget_not_zero(mm)) {
+ dev_dbg(smmu->dev, "mm dead\n");
+ return -EINVAL;
+ }
+
+ down_read(&mm->mmap_sem);
+
+ vma = find_extend_vma(mm, fault->iova);
+ if (!vma) {
+ ret = -ESRCH;
+ dev_dbg(smmu->dev, "VMA not found\n");
+ goto out_release;
+ }
+
+ if (fault->read)
+ access_flags |= VM_READ;
+
+ if (fault->write) {
+ access_flags |= VM_WRITE;
+ fault_flags |= FAULT_FLAG_WRITE;
+ }
+
+ if (fault->exec) {
+ access_flags |= VM_EXEC;
+ fault_flags |= FAULT_FLAG_INSTRUCTION;
+ }
+
+ if (access_flags & ~vma->vm_flags) {
+ ret = -EFAULT;
+ dev_dbg(smmu->dev, "access flags mismatch\n");
+ goto out_release;
+ }
+
+ ret = handle_mm_fault(vma, fault->iova, fault_flags);
+ dev_dbg(smmu->dev, "handle_mm_fault(%#x:%#x:%#llx, %#lx) -> %#x\n",
+ fault->sid, fault->ssid, fault->iova, fault_flags, ret);
+
+ ret = ret & VM_FAULT_ERROR ? -EFAULT : 0;
+
+out_release:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+ return ret;
+}
+
+static enum fault_status _arm_smmu_handle_fault(struct arm_smmu_fault *fault)
+{
+ struct arm_smmu_task *smmu_task = NULL;
+ struct arm_smmu_device *smmu = fault->smmu;
+ struct arm_smmu_context *smmu_context = NULL;
+ enum fault_status resp = ARM_SMMU_FAULT_FAIL;
+ struct arm_smmu_pri_group *prg = NULL, *tmp_prg;
+
+ if (!fault->ssv)
+ return ARM_SMMU_FAULT_DENY;
+
+ if (fault->priv)
+ return resp;
+
+ smmu_context = arm_smmu_get_context_by_id(smmu, NULL, fault->sid,
+ fault->ssid);
+ if (!smmu_context) {
+ dev_dbg(smmu->dev, "unable to find context %#x:%#x\n",
+ fault->sid, fault->ssid);
+ /*
+ * Note that we don't have prg_response_needs_ssid yet. Reply
+ * might be inconsistent with what the device expects.
+ */
+ return resp;
+ }
+
+ fault->ssv = smmu_context->master->ste.prg_response_needs_ssid;
+
+ spin_lock(&smmu->contexts_lock);
+ smmu_task = smmu_context->task;
+ if (smmu_task)
+ kref_get(&smmu_task->kref);
+ spin_unlock(&smmu->contexts_lock);
+
+ if (!smmu_task)
+ goto out_put_context;
+
+ list_for_each_entry(tmp_prg, &smmu_task->prgs, list) {
+ if (tmp_prg->index == fault->grpid) {
+ prg = tmp_prg;
+ break;
+ }
+ }
+
+ if (!prg && !fault->last) {
+ prg = kzalloc(sizeof(*prg), GFP_KERNEL);
+ if (!prg) {
+ resp = ARM_SMMU_FAULT_SUCC;
+ goto out_put_task;
+ }
+
+ prg->index = fault->grpid;
+ list_add(&prg->list, &smmu_task->prgs);
+ } else if (prg && prg->resp != ARM_SMMU_FAULT_SUCC) {
+ resp = prg->resp;
+ goto out_put_task;
+ }
+
+ if (!arm_smmu_handle_mm_fault(smmu, smmu_task->mm, fault))
+ resp = ARM_SMMU_FAULT_SUCC;
+
+ if (prg) {
+ if (fault->last) {
+ list_del(&prg->list);
+ kfree(prg);
+ } else {
+ prg->resp = resp;
+ }
+ }
+
+out_put_task:
+ arm_smmu_put_task(smmu, smmu_task);
+
+out_put_context:
+ arm_smmu_put_context(smmu, smmu_context);
+
+ return resp;
+}
+
+static void arm_smmu_handle_fault(struct work_struct *work)
+{
+ enum fault_status resp;
+ struct arm_smmu_fault *fault = container_of(work, struct arm_smmu_fault,
+ work);
+
+ resp = _arm_smmu_handle_fault(fault);
+ if (resp != ARM_SMMU_FAULT_SUCC)
+ dev_info_ratelimited(fault->smmu->dev, "%s fault:\n"
+ "\t0x%08x.0x%05x: [%u%s] %sprivileged %s%s%s access at iova "
+ "0x%016llx\n",
+ resp == ARM_SMMU_FAULT_DENY ? "unexpected" : "unhandled",
+ fault->sid, fault->ssid, fault->grpid,
+ fault->last ? "L" : "", fault->priv ? "" : "un",
+ fault->read ? "R" : "", fault->write ? "W" : "",
+ fault->exec ? "X" : "", fault->iova);
+
+ arm_smmu_fault_reply(fault, resp);
+
+ kfree(fault);
+}
+
static bool arm_smmu_master_supports_svm(struct arm_smmu_master_data *master)
{
return false;
@@ -2997,6 +3276,57 @@ static void arm_smmu_disable_ssid(struct arm_smmu_master_data *master)
pci_disable_pasid(pdev);
}
+static int arm_smmu_enable_pri(struct arm_smmu_master_data *master)
+{
+ int ret, pos;
+ struct pci_dev *pdev;
+ size_t max_requests = 64;
+ struct arm_smmu_device *smmu = master->smmu;
+
+ /* Do not enable PRI if SVM isn't supported */
+ unsigned long feat_mask = ARM_SMMU_FEAT_PRI | ARM_SMMU_FEAT_SVM;
+
+ if ((smmu->features & feat_mask) != feat_mask || !dev_is_pci(master->dev))
+ return -ENOSYS;
+
+ pdev = to_pci_dev(master->dev);
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
+ if (!pos)
+ return -ENOSYS;
+
+ ret = pci_reset_pri(pdev);
+ if (ret)
+ return ret;
+
+ ret = pci_enable_pri(pdev, max_requests);
+ if (ret) {
+ dev_err(master->dev, "cannot enable PRI: %d\n", ret);
+ return ret;
+ }
+
+ master->can_fault = true;
+ master->ste.prg_response_needs_ssid = pci_prg_resp_requires_prefix(pdev);
+
+ dev_dbg(master->dev, "enabled PRI");
+
+ return 0;
+}
+
+static void arm_smmu_disable_pri(struct arm_smmu_master_data *master)
+{
+ struct pci_dev *pdev;
+
+ if (!master->can_fault || !dev_is_pci(master->dev))
+ return;
+
+ pdev = to_pci_dev(master->dev);
+
+ pci_disable_pri(pdev);
+
+ master->can_fault = false;
+}
+
static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
struct arm_smmu_master_data *master)
{
@@ -3114,6 +3444,8 @@ static int arm_smmu_add_device(struct device *dev)
master->avail_contexts = nr_ssids - 1;
ats_enabled = !arm_smmu_enable_ats(master);
+ if (ats_enabled)
+ arm_smmu_enable_pri(master);
if (arm_smmu_master_supports_svm(master))
arm_smmu_insert_master(smmu, master);
@@ -3138,6 +3470,7 @@ static int arm_smmu_add_device(struct device *dev)
return 0;
err_disable_ats:
+ arm_smmu_disable_pri(master);
arm_smmu_disable_ats(master);
arm_smmu_free_cd_tables(master);
@@ -3186,6 +3519,7 @@ static void arm_smmu_remove_device(struct device *dev)
iommu_group_put(group);
+ arm_smmu_disable_pri(master);
/* PCIe PASID must be disabled after ATS */
arm_smmu_disable_ats(master);
arm_smmu_disable_ssid(master);
@@ -3490,6 +3824,18 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
if (ret)
return ret;
+ if (smmu->features & ARM_SMMU_FEAT_SVM &&
+ smmu->features & ARM_SMMU_FEAT_PRI) {
+ /*
+ * Ensure strict ordering of the queue. We can't go reordering
+ * page faults willy nilly since they work in groups, with a
+ * flag "last" denoting when we should send a PRI response.
+ */
+ smmu->fault_queue = alloc_ordered_workqueue("smmu_fault_queue", 0);
+ if (!smmu->fault_queue)
+ return -ENOMEM;
+ }
+
return arm_smmu_init_strtab(smmu);
}
@@ -4250,6 +4596,10 @@ static int __init arm_smmu_init(void)
int ret = 0;
if (!registered) {
+ arm_smmu_fault_cache = KMEM_CACHE(arm_smmu_fault, 0);
+ if (!arm_smmu_fault_cache)
+ return -ENOMEM;
+
ret = platform_driver_register(&arm_smmu_driver);
registered = !ret;
}
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 22/30] iommu: Bind/unbind tasks to/from devices
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (20 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 21/30] iommu/arm-smmu-v3: Handle device faults from PRI Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-03-03 9:40 ` David Woodhouse
[not found] ` <20170227195441.5170-23-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-27 19:54 ` [RFC PATCH 23/30] iommu/arm-smmu-v3: Bind/unbind device and task Jean-Philippe Brucker
` (8 subsequent siblings)
30 siblings, 2 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Add three functions to the IOMMU API. iommu_bind_task takes a device and a
task as argument. If the IOMMU, the device and the bus support it, attach
task to device and create a Process Address Space ID (PASID) unique to the
device. DMA from the device can then use the PASID to read or write into
the address space. iommu_unbind_task removes a bond created with
iommu_bind_task. iommu_set_svm_ops allows a device driver to set some
callbacks for specific SVM-related operations.
Try to accommodate the current implementations (AMD, Intel and ARM) by
letting the IOMMU driver do all the work, while also attempting to find
common ground between the implementations.
* amd_iommu_v2 expects the device to allocate a PASID and pass it to the
IOMMU. The driver also provides separate functions to register callbacks
that handle failed PRI requests and invalidate PASIDs.
int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
struct task_struct *task)
void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
amd_iommu_invalid_ppr_cb cb)
int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
amd_iommu_invalidate_ctx cb)
* intel-svm allocates a PASID, and requires the driver to pass
"svm_dev_ops", which currently contains a fault callback. It also
doesn't take a task as argument, but uses 'current'.
int intel_svm_bind_mm(struct device *dev, int *pasid, int flags,
struct svm_dev_ops *ops)
int intel_svm_unbind_mm(struct device *dev, int pasid)
* For arm-smmu-v3, PASID must be allocated by the SMMU driver since it
indexes contexts in an array handled by the SMMU device.
Bind and unbind
===============
The following could suit existing implementations:
int iommu_bind_task(struct device *dev, struct task_struct *task,
int *pasid, int flags, void *priv);
int iommu_unbind_task(struct device *dev, int pasid, int flags);
This is similar to existing functions.
* @dev is an SVM-capable device. If it is not, bind fails.
* @task is a userspace task. It doesn't have to be current, but
implementations can reject the call if they only support current.
* @pasid is a handle for the bond. It would be nice to have the IOMMU
driver handle PASID allocation, for consistency. Otherwise, the
requirement for drivers to allocate PASIDs might be advertised in a
capability.
* @flags represents parameters of bind/unbind. We might want to reserve a
few bits, maybe the bottom half, for the API, and give the rest to the
driver.
* @priv will be passed to SVM callbacks targeting this bond
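To make the intended calling sequence concrete, here is a rough sketch of how a device driver might use it. Everything named foo_* is hypothetical; only the iommu_bind_task/iommu_unbind_task prototypes come from this proposal:

/* Bind the current task and remember the PASID for this job */
static int foo_start_svm_job(struct foo_device *foo, struct foo_job *job)
{
        int ret, pasid;

        ret = iommu_bind_task(foo->dev, current, &pasid, 0, job);
        if (ret)
                return ret;

        job->pasid = pasid;
        /* Program the device so that DMA for this job carries @pasid */
        return foo_submit(foo, job);
}

static void foo_finish_svm_job(struct foo_device *foo, struct foo_job *job)
{
        iommu_unbind_task(foo->dev, job->pasid, 0);
}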
SVM device callbacks
=====================
Making svm_dev_ops (here iommu_svm_ops) a first-class citizen of struct
device would be a useful next step. Device drivers could set this
structure when they want to participate in SVM. For the moment,
iommu_set_svm_ops must be called. I'm not sure what to do when assigning a
device via VFIO. Should we remove the SVM ops when detaching from a
domain, or have the device driver remove them when detaching itself from a
device?
Fault handling
--------------
The first callback allows a driver to be notified when the IOMMU driver
cannot handle a fault.
amd_iommu_v2 has:
int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev, int pasid,
unsigned long address, u16 prot)
intel-svm has (called for all faults):
void (*fault_cb)(struct device *dev, int pasid, u64 address, u32 private,
int rwxp, int response)
We put the following in iommu_svm_ops:
int (*handle_fault)(struct device *dev, int pasid, u64 address, int prot,
int status, void *priv);
The IOMMU driver calls handle_mm_fault and sends the result back to the
device. If the fault cannot be handled, it gives a chance to the device
driver to record the fault and maybe even fix it up. @pasid, @address and
@prot are copied from the page request. @status is the return value of
handle_mm_fault. @prot could use the format defined in iommu.h
(IOMMU_READ, IOMMU_WRITE, etc.) @status could be a combination of
VM_FAULT_* as returned by handle_mm_fault, but this leaves out the case
where we don't even reach the fault handling part. We could instead define
new status flags: one for failure to locate the context associated to the
PASID, one for failure of mm to handle the fault. We cannot piggy-back on
existing IOMMU_FAULT_READ and WRITE in their current state, because
devices might request both read and write permissions at the same time.
They would need to be redefined as flags.
All callbacks have a @priv field. This is an opaque pointer set by the
device driver when binding. This way the device driver gets both a PASID
and its metadata in the callback, and we avoid duplicating pasid state
lookups in both IOMMU driver and device driver.
Another question is the location of the callback. The IOMMU driver could
notify the device driver at one of these points:
* before handle_mm_fault, to do some custom fault handling and perhaps
bypass the IOMMU handler entirely,
* after handle_mm_fault, to notify the driver of an error (AMD),
* after handle_mm_fault, to notify the driver of any page request (Intel),
We might want to let the driver decide when binding a PASID, or offer two
callbacks: handle_fault and report_fault. I don't have a proposal for this
yet.
handle_fault returns the response that the IOMMU driver should send to the
device. Either success, meaning that the page has been mapped (or it is
likely to succeed later), or failure, meaning that the device shouldn't
bother retrying.
It would be nice to reconcile with the iommu_fault_handler API, that isn't
widely used yet but is being considered for handling domain faults from
platform devices on the SMMUv2, using the stall model instead of ATS/PRI.
Yet another concern for ARM is that platform devices may issue traffic
over multiple stream IDs, for instance one stream ID per channel in a DMA
engine. handle_fault doesn't provide a way to pass those stream IDs back
to the driver.
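For illustration, a driver-side handle_fault could look like the sketch below. The foo_* names and the return convention are assumptions on top of this proposal, not something the patch defines:

static int foo_handle_fault(struct device *dev, int pasid, u64 address,
                            int prot, int status, void *priv)
{
        struct foo_job *job = priv;     /* @priv was passed at bind time */

        dev_warn(dev, "job %u: unhandled fault at %#llx, PASID %d, status %#x\n",
                 job->id, address, pasid, status);

        /* Ask the IOMMU driver to send a failure response to the device */
        return -EFAULT;
}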
PASID invalidation
------------------
Next, we need to let the IOMMU driver notify the device driver before it
attempts to unbind a PASID. Subsequent patches discuss PASID invalidation
in more details, so we'll simply propose the following interface for now.
AMD has:
void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid);
We put the following in iommu_svm_ops:
int (*invalidate_pasid)(struct device *dev, int pasid, void *priv);
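A driver-side implementation might be as simple as the following sketch; the foo_* names are hypothetical and the return value semantics are an assumption, since later patches discuss invalidation in more detail:

static int foo_invalidate_pasid(struct device *dev, int pasid, void *priv)
{
        struct foo_job *job = priv;

        /* Stop issuing DMA tagged with @pasid before the IOMMU unbinds it */
        foo_stop_job(job);

        return 0;
}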
Capability detection
====================
I didn't add any public function for detecting SVM capability yet. In my
opinion, a nice way to do it is to have user query the state of the device
to know if they can call bind/unbind. If the IOMMU supports SVM, and the
IOMMU driver was able to enable it successfully in the device, then user
can call bind/unbind on the device.
In the VFIO patch later in this series, I implemented the PCIe detection
like this: if ATS, PRI and PASID are enabled (by the IOMMU driver), then
the device can do SVM. If for some reason the IOMMU is incompatible with
the device's SVM properties or is incompatible with the MMU page tables,
then it shouldn't enable PRI or PASID. For platform devices, the
requirements are very blurry at the moment. We'll probably add a device-
tree property saying that a device and its bus are SVM-capable. The
following interface could be added to the API:
int iommu_svm_capable(struct device *dev, int flags);
This tells the device driver whether the IOMMU driver is capable of
binding a task to the device. @flags may contain specific SVM capabilities
(paging/pinned, executable, etc) and the function could return a subset of
these flags. For PCI devices, everything is enabled when this call is
successful. For platform devices the device driver would have to enable
SVM itself.
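If such a function existed, the device driver side would reduce to something like the sketch below; the foo_* names, the flags value and the fallback path are all assumptions, and iommu_svm_capable itself is only a proposal at this point:

static int foo_probe_svm(struct foo_device *fdev)
{
        if (!iommu_svm_capable(fdev->dev, 0))
                /* No SVM: fall back to regular DMA API mappings */
                return foo_setup_dma_mode(fdev);

        return foo_setup_svm_mode(fdev);
}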
API naming
==========
I realize that "SVM" as a name isn't great because the svm namespace is
already taken by AMD-V (Secure Virtual Machine) in arch/x86. Also, the
name itself doesn't say much.
I personally prefer "Unified Virtual Addressing" (UVA), adopted by CUDA,
or rather Unified Virtual Address Space (UVAS). Another possibility is
Unified Virtual Memory (UVM). Acronym UAS for Unified Address Space is
already used by USB. Same for Shared Address Space (SAS), already in use
in the kernel, but SVAS would work (although it doesn't look good).
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/iommu.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/iommu.h | 41 +++++++++++++++++++
2 files changed, 149 insertions(+)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 8ea14f41a979..26c5f6528c69 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1438,6 +1438,114 @@ void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group)
}
EXPORT_SYMBOL_GPL(iommu_detach_group);
+int iommu_set_svm_ops(struct device *dev, const struct iommu_svm_ops *svm_ops)
+{
+ const struct iommu_ops *ops;
+ struct iommu_group *group;
+ int ret;
+
+ group = iommu_group_get_for_dev(dev);
+ if (IS_ERR(group))
+ return PTR_ERR(group);
+
+ ops = dev->bus->iommu_ops;
+ if (!ops->set_svm_ops) {
+ iommu_group_put(group);
+ return -ENODEV;
+ }
+
+ mutex_lock(&group->mutex);
+ ret = ops->set_svm_ops(dev, svm_ops);
+ mutex_unlock(&group->mutex);
+
+ iommu_group_put(group);
+ return ret;
+
+}
+EXPORT_SYMBOL_GPL(iommu_set_svm_ops);
+
+/*
+ * iommu_bind_task - Share task address space with device
+ *
+ * @dev: device to bind
+ * @task: task to bind
+ * @pasid: valid address where the PASID is stored
+ * @flags: driver-specific flags
+ * @priv: private data to associate with the bond
+ *
+ * Create a bond between device and task, allowing the device to access the task
+ * address space using @pasid. Intel and ARM SMMU drivers allocate and return
+ * the PASID, while AMD requires the caller to allocate a PASID beforehand.
+ *
+ * iommu_unbind_task must be called with this PASID before the task exits.
+ */
+int iommu_bind_task(struct device *dev, struct task_struct *task, int *pasid,
+ int flags, void *priv)
+{
+ const struct iommu_ops *ops;
+ struct iommu_group *group;
+ int ret;
+
+ if (!pasid)
+ return -EINVAL;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return -ENODEV;
+
+ ops = dev->bus->iommu_ops;
+ if (!ops->bind_task) {
+ iommu_group_put(group);
+ return -ENODEV;
+ }
+
+ mutex_lock(&group->mutex);
+ if (!group->domain)
+ ret = -EINVAL;
+ else
+ ret = ops->bind_task(dev, task, pasid, flags, priv);
+ mutex_unlock(&group->mutex);
+
+ iommu_group_put(group);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_bind_task);
+
+/*
+ * iommu_unbind_task - Remove a bond created with iommu_bind_task
+ *
+ * @dev: device bound to the task
+ * @pasid: identifier of the bond
+ * @flags: state of the PASID and driver-specific flags
+ */
+int iommu_unbind_task(struct device *dev, int pasid, int flags)
+{
+ const struct iommu_ops *ops;
+ struct iommu_group *group;
+ int ret;
+
+ group = iommu_group_get(dev);
+ if (!group)
+ return -ENODEV;
+
+ ops = dev->bus->iommu_ops;
+ if (!ops->unbind_task) {
+ iommu_group_put(group);
+ return -ENODEV;
+ }
+
+ mutex_lock(&group->mutex);
+ if (!group->domain)
+ ret = -EINVAL;
+ else
+ ret = ops->unbind_task(dev, pasid, flags);
+ mutex_unlock(&group->mutex);
+
+ iommu_group_put(group);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_unbind_task);
+
phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
{
if (unlikely(domain->ops->iova_to_phys == NULL))
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6a6de187ddc0..9554f45d4305 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -145,6 +145,16 @@ struct iommu_resv_region {
int type;
};
+/*
+ * @handle_fault: report or handle a fault from the device (FIXME: imprecise)
+ * @invalidate_pasid: stop using a PASID.
+ */
+struct iommu_svm_ops {
+ int (*handle_fault)(struct device *dev, int pasid, u64 address,
+ int prot, int status, void *priv);
+ int (*invalidate_pasid)(struct device *dev, int pasid, void *priv);
+};
+
#ifdef CONFIG_IOMMU_API
/**
@@ -154,6 +164,9 @@ struct iommu_resv_region {
* @domain_free: free iommu domain
* @attach_dev: attach device to an iommu domain
* @detach_dev: detach device from an iommu domain
+ * @set_svm_ops: set SVM callbacks for device
+ * @bind_task: attach a task address space to a device
+ * @unbind_task: detach a task address space from a device
* @map: map a physically contiguous memory region to an iommu domain
* @unmap: unmap a physically contiguous memory region from an iommu domain
* @map_sg: map a scatter-gather list of physically contiguous memory chunks
@@ -183,6 +196,10 @@ struct iommu_ops {
int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
+ int (*set_svm_ops)(struct device *dev, const struct iommu_svm_ops *ops);
+ int (*bind_task)(struct device *dev, struct task_struct *task,
+ int *pasid, int flags, void *priv);
+ int (*unbind_task)(struct device *dev, int pasid, int flags);
int (*map)(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot);
size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
@@ -403,6 +420,13 @@ void iommu_fwspec_free(struct device *dev);
int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids);
const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
+extern int iommu_set_svm_ops(struct device *dev,
+ const struct iommu_svm_ops *svm_ops);
+extern int iommu_bind_task(struct device *dev, struct task_struct *task,
+ int *pasid, int flags, void *priv);
+
+extern int iommu_unbind_task(struct device *dev, int pasid, int flags);
+
#else /* CONFIG_IOMMU_API */
struct iommu_ops {};
@@ -663,6 +687,23 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
return NULL;
}
+static inline int iommu_set_svm_ops(struct device *dev,
+ const struct iommu_svm_ops *svm_ops)
+{
+ return -ENODEV;
+}
+
+static inline int iommu_bind_task(struct device *dev, struct task_struct *task,
+ int *pasid, int flags, void *priv)
+{
+ return -ENODEV;
+}
+
+static inline int iommu_unbind_task(struct device *dev, int pasid, int flags)
+{
+ return -ENODEV;
+}
+
#endif /* CONFIG_IOMMU_API */
#endif /* __LINUX_IOMMU_H */
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 22/30] iommu: Bind/unbind tasks to/from devices
2017-02-27 19:54 ` [RFC PATCH 22/30] iommu: Bind/unbind tasks to/from devices Jean-Philippe Brucker
@ 2017-03-03 9:40 ` David Woodhouse
[not found] ` <1488534044.6234.14.camel-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
[not found] ` <20170227195441.5170-23-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
1 sibling, 1 reply; 103+ messages in thread
From: David Woodhouse @ 2017-03-03 9:40 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Lorenzo Pieralisi, Shanker Donthineni, kvm, Catalin Marinas,
Joerg Roedel, Sinan Kaya, Will Deacon, iommu, Harv Abdulhamid,
Alex Williamson, linux-pci, Bjorn Helgaas, Robin Murphy,
linux-arm-kernel, Nate Watterson
On Mon, 2017-02-27 at 19:54 +0000, Jean-Philippe Brucker wrote:
> Add three functions to the IOMMU API. iommu_bind_task takes a device and a
> task as argument. If the IOMMU, the device and the bus support it, attach
> task to device and create a Process Address Space ID (PASID) unique to the
> device. DMA from the device can then use the PASID to read or write into
> the address space. iommu_unbind_task removes a bond created with
> iommu_bind_task. iommu_set_svm_ops allows a device driver to set some
> callbacks for specific SVM-related operations.
>
> Try to accommodate current implementations (AMD, Intel and ARM), by
> letting the IOMMU driver do all the work, but attempt by the same occasion
> to find intersections between implementations.
>
> * amd_iommu_v2 expects the device to allocate a PASID and pass it to the
> IOMMU. The driver also provides separate functions to register callbacks
> that handles failed PRI requests and invalidate PASIDs.
>
> int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
> struct task_struct *task)
> void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
> int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
> amd_iommu_invalid_ppr_cb cb)
> int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
> amd_iommu_invalidate_ctx cb)
>
> * intel-svm allocates a PASID, and requires the driver to pass
> "svm_dev_ops", which currently contains a fault callback. It also
> doesn't take a task as argument, but uses 'current'.
>
> int intel_svm_bind_mm(struct device *dev, int *pasid, int flags,
> struct svm_dev_ops *ops)
> int intel_svm_unbind_mm(struct device *dev, int pasid)
>
> * For arm-smmu-v3, PASID must be allocated by the SMMU driver since it
> indexes contexts in an array handled by the SMMU device.
Right. The Intel version was designed with all of the above three in
mind. It was discussed at the Kernel Summit and LPC on more than one
occasion as it took shape, and what I implemented for Intel basically
represents the consensus of what we thought it should look like.
I meant to convert the AMD driver to the same API, but don't have
access to test hardware. Note that the amdkfd code will need careful
attention here.
Intel slightly deviates from the "one PASID per process" vision too,
because it currently has a PASID allocator idr per IOMMU. That wants
making system-wide. And probably not Intel-specific.
Some other comments...
The callbacks and fault handlers could perhaps be deprecated. In an
ideal world nobody would ever use them — the device itself is supposed
to be able to communicate with its driver about the request that
failed; we don't need a dirty hook into the IOMMU code from when *it*
handles the fault.
In the Intel IOMMU fault reports, there are some additional bits in the
descriptor which are 'context private' bits. For built-in devices like
the graphics engine, this contains further information about precisely
which context was performing the failing access. But again I don't
think we should need it in an ideal world. It's a horrid thing to have
to feed through a generic IOMMU API.
One thing which might help us *avoid* needing it is the
SVM_FLAG_PRIVATE_PASID option, which asks for a *new* PASID. So a
single process can have more than one PASID. That's still OK on ARM,
isn't it? As long as they're all allocated from the same pool and we
never use a given PASID for more than one address space simultaneously
on different devices.
We also have SVM_FLAG_SUPERVISOR_MODE, which gives access to kernel
address space. Yes, people use it.
> PASID invalidation
> ------------------
>
> Next, we need to let the IOMMU driver notify the device driver before it
> attempts to unbind a PASID. Subsequent patches discuss PASID invalidation
> in more details, so we'll simply propose the following interface for now.
>
> AMD has:
>
> void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid);
>
> We put the following in iommu_svm_ops:
>
> int (*invalidate_pasid)(struct device *dev, int pasid, void *priv);
These can basically take for ever, right? You're asking the *device* to
tell you when it's finished using that PASID.
> Capability detection
> ====================
> ...
>
> int iommu_svm_capable(struct device *dev, int flags);
We already had this for Intel. It basically goes through *all* the
enabling checks that it needs to for really setting up SVM, and that's
why it's actually the *same* call, but with a NULL pasid argument:
#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL))
^ permalink raw reply [flat|nested] 103+ messages in thread
[parent not found: <20170227195441.5170-23-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>]
* RE: [RFC PATCH 22/30] iommu: Bind/unbind tasks to/from devices
[not found] ` <20170227195441.5170-23-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
@ 2017-03-02 7:29 ` Tian, Kevin
2017-03-22 15:38 ` Joerg Roedel
1 sibling, 0 replies; 103+ messages in thread
From: Tian, Kevin @ 2017-03-02 7:29 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Catalin Marinas, Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
Harv Abdulhamid,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
Nate Watterson
> From: Jean-Philippe Brucker
> Sent: Tuesday, February 28, 2017 3:55 AM
>
[...]
>
> API naming
> ==========
>
> I realize that "SVM" as a name isn't great because the svm namespace is
> already taken by AMD-V (Secure Virtual Machine) in arch/x86. Also, the
> name itself doesn't say much.
>
> I personally prefer "Unified Virtual Addressing" (UVA), adopted by CUDA,
> or rather Unified Virtual Address Space (UVAS). Another possibility is
> Unified Virtual Memory (UVM). Acronym UAS for Unified Address Space is
> already used by USB. Same for Shared Address Space (SAS), already in use
> in the kernel, but SVAS would work (although it doesn't look good).
>
'unified' is not exactly the same as 'shared'. In some contexts it means
unifying device-local memory and system memory in one virtual address
space, while SVM is more about sharing a CPU virtual address space with
the device.
What about Shared Virtual Addressing (SVA)?
Thanks
Kevin
^ permalink raw reply [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 22/30] iommu: Bind/unbind tasks to/from devices
[not found] ` <20170227195441.5170-23-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-03-02 7:29 ` Tian, Kevin
@ 2017-03-22 15:38 ` Joerg Roedel
1 sibling, 0 replies; 103+ messages in thread
From: Joerg Roedel @ 2017-03-22 15:38 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Hi Jean-Philippe,
On Mon, Feb 27, 2017 at 07:54:33PM +0000, Jean-Philippe Brucker wrote:
> +extern int iommu_set_svm_ops(struct device *dev,
> + const struct iommu_svm_ops *svm_ops);
> +extern int iommu_bind_task(struct device *dev, struct task_struct *task,
> + int *pasid, int flags, void *priv);
> +
> +extern int iommu_unbind_task(struct device *dev, int pasid, int flags);
I really like that API; it is simpler than what the AMD driver
currently implements, but should work for it too (once we adapt the
AMD-KFD driver to it).
One issue I'd like to have discussed is whether we can make a global PASID
allocation (with a one-PASID-per-task model) workable with SMMU too.
Joerg
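(A system-wide allocator along those lines could be as simple as the following
sketch, independent of any IOMMU driver; all names below are hypothetical:)

    #include <linux/idr.h>
    #include <linux/spinlock.h>

    static DEFINE_IDR(iommu_pasid_idr);
    static DEFINE_SPINLOCK(iommu_pasid_lock);

    /* Allocate a PASID in [min, max) and associate it with @mm */
    static int iommu_pasid_alloc(struct mm_struct *mm, int min, int max)
    {
            int pasid;

            idr_preload(GFP_KERNEL);
            spin_lock(&iommu_pasid_lock);
            pasid = idr_alloc(&iommu_pasid_idr, mm, min, max, GFP_ATOMIC);
            spin_unlock(&iommu_pasid_lock);
            idr_preload_end();

            return pasid;
    }

    static void iommu_pasid_free(int pasid)
    {
            spin_lock(&iommu_pasid_lock);
            idr_remove(&iommu_pasid_idr, pasid);
            spin_unlock(&iommu_pasid_lock);
    }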
^ permalink raw reply [flat|nested] 103+ messages in thread
* [RFC PATCH 23/30] iommu/arm-smmu-v3: Bind/unbind device and task
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (21 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 22/30] iommu: Bind/unbind tasks to/from devices Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 24/30] iommu: Specify PASID state when unbinding a task Jean-Philippe Brucker
` (7 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Now that everything is in place, implement bind and unbind operations.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 178 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 175 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 5e0008ac68cb..3ba7f65020f9 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -742,6 +742,7 @@ struct arm_smmu_master_data {
bool can_fault;
u32 avail_contexts;
+ const struct iommu_svm_ops *svm_ops;
};
/* SMMU private data for an IOMMU domain */
@@ -820,6 +821,7 @@ struct arm_smmu_context {
struct arm_smmu_task *task;
struct arm_smmu_master_data *master;
+ void *priv;
struct list_head task_head;
struct rb_node master_node;
@@ -2085,6 +2087,26 @@ static size_t arm_smmu_atc_invalidate_task(struct arm_smmu_task *smmu_task,
return size;
}
+static size_t arm_smmu_atc_invalidate_context(struct arm_smmu_context *smmu_context,
+ unsigned long iova, size_t size)
+{
+ struct arm_smmu_cmdq_ent cmd;
+ struct arm_smmu_device *smmu = smmu_context->master->smmu;
+ struct arm_smmu_cmdq_ent sync_cmd = {
+ .opcode = CMDQ_OP_CMD_SYNC,
+ };
+
+ arm_smmu_atc_invalidate_to_cmd(smmu, iova, size, &cmd);
+
+ cmd.substream_valid = true;
+ cmd.atc.ssid = smmu_context->ssid;
+
+ arm_smmu_atc_invalidate_master(smmu_context->master, &cmd);
+ arm_smmu_cmdq_issue_cmd(smmu, &sync_cmd);
+
+ return size;
+}
+
/* IOMMU API */
static bool arm_smmu_capable(enum iommu_cap cap)
{
@@ -2098,7 +2120,6 @@ static bool arm_smmu_capable(enum iommu_cap cap)
}
}
-__maybe_unused
static struct arm_smmu_context *
arm_smmu_attach_task(struct arm_smmu_task *smmu_task,
struct arm_smmu_master_data *master)
@@ -2444,7 +2465,6 @@ static void arm_smmu_free_task_pgtable(struct arm_smmu_task *smmu_task)
arm_smmu_bitmap_free(smmu->asid_map, smmu_task->s1_cfg.asid);
}
-__maybe_unused
static struct arm_smmu_task *arm_smmu_alloc_task(struct arm_smmu_device *smmu,
struct task_struct *task)
{
@@ -2741,7 +2761,156 @@ static void arm_smmu_handle_fault(struct work_struct *work)
static bool arm_smmu_master_supports_svm(struct arm_smmu_master_data *master)
{
- return false;
+ return dev_is_pci(master->dev) && master->can_fault &&
+ master->avail_contexts;
+}
+
+static int arm_smmu_set_svm_ops(struct device *dev,
+ const struct iommu_svm_ops *svm_ops)
+{
+ struct arm_smmu_master_data *master;
+
+ if (!dev->iommu_fwspec)
+ return -EINVAL;
+
+ master = dev->iommu_fwspec->iommu_priv;
+ if (!master)
+ return -EINVAL;
+
+ master->svm_ops = svm_ops;
+
+ return 0;
+}
+
+static int arm_smmu_bind_task(struct device *dev, struct task_struct *task,
+ int *pasid, int flags, void *priv)
+{
+ int ret = 0;
+ struct pid *pid;
+ struct iommu_group *group;
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_group *smmu_group;
+ struct arm_smmu_domain *smmu_domain;
+ struct arm_smmu_master_data *master;
+ struct arm_smmu_task *smmu_task = NULL, *cur_task;
+ struct arm_smmu_context *smmu_context = NULL, *cur_context;
+
+ if (!dev->iommu_fwspec)
+ return -EINVAL;
+
+ master = dev->iommu_fwspec->iommu_priv;
+ if (!master)
+ return -EINVAL;
+
+ if (!arm_smmu_master_supports_svm(master))
+ return -EINVAL;
+
+ smmu = master->smmu;
+
+ group = iommu_group_get(dev);
+ smmu_group = to_smmu_group(group);
+
+ smmu_domain = smmu_group->domain;
+ if (!smmu_domain) {
+ iommu_group_put(group);
+ return -EINVAL;
+ }
+
+ if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) {
+ /* We do not support stage-2 SVM yet... */
+ iommu_group_put(group);
+ return -ENOSYS;
+ }
+
+ iommu_group_put(group);
+
+ pid = get_task_pid(task, PIDTYPE_PID);
+
+ spin_lock(&smmu->contexts_lock);
+
+ list_for_each_entry(cur_task, &smmu->tasks, smmu_head) {
+ if (cur_task->pid == pid) {
+ kref_get(&cur_task->kref);
+ smmu_task = cur_task;
+ break;
+ }
+ }
+
+ if (smmu_task) {
+ list_for_each_entry(cur_context, &smmu_task->contexts,
+ task_head) {
+ if (cur_context->master->dev == dev) {
+ smmu_context = cur_context;
+ _arm_smmu_put_task(cur_task);
+ break;
+ }
+ }
+ }
+ spin_unlock(&smmu->contexts_lock);
+
+ put_pid(pid);
+
+ if (smmu_context)
+ /* We don't support nested bind/unbind calls */
+ return -EEXIST;
+
+ if (!smmu_task) {
+ smmu_task = arm_smmu_alloc_task(smmu, task);
+ if (IS_ERR(smmu_task))
+ return PTR_ERR(smmu_task);
+ }
+
+ smmu_context = arm_smmu_attach_task(smmu_task, master);
+ if (IS_ERR(smmu_context)) {
+ arm_smmu_put_task(smmu, smmu_task);
+ return PTR_ERR(smmu_context);
+ }
+
+ smmu_context->priv = priv;
+
+ *pasid = smmu_context->ssid;
+ dev_dbg(dev, "bound to task %d with PASID %d\n", pid_vnr(pid), *pasid);
+
+ return ret;
+}
+
+static int arm_smmu_unbind_task(struct device *dev, int pasid, int flags)
+{
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_master_data *master;
+ struct arm_smmu_context *smmu_context = NULL;
+
+ if (!dev->iommu_fwspec)
+ return -EINVAL;
+
+ master = dev->iommu_fwspec->iommu_priv;
+ if (!master)
+ return -EINVAL;
+
+ smmu = master->smmu;
+
+ smmu_context = arm_smmu_get_context_by_id(smmu, master, 0, pasid);
+ if (!smmu_context)
+ return -ESRCH;
+
+ dev_dbg(dev, "unbind PASID %d\n", pasid);
+
+ /*
+ * There isn't any "ATC invalidate all by PASID" command. If this isn't
+ * good enough, we'll need fine-grained invalidation for each vma.
+ */
+ arm_smmu_atc_invalidate_context(smmu_context, 0, -1);
+
+ spin_lock(&smmu->contexts_lock);
+ if (smmu_context->task)
+ arm_smmu_detach_task(smmu_context);
+
+ /* Release the ref we got earlier in this function */
+ _arm_smmu_put_context(smmu_context);
+ _arm_smmu_put_context(smmu_context);
+ spin_unlock(&smmu->contexts_lock);
+
+ return 0;
}
static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
@@ -3626,6 +3795,9 @@ static struct iommu_ops arm_smmu_ops = {
.capable = arm_smmu_capable,
.domain_alloc = arm_smmu_domain_alloc,
.domain_free = arm_smmu_domain_free,
+ .set_svm_ops = arm_smmu_set_svm_ops,
+ .bind_task = arm_smmu_bind_task,
+ .unbind_task = arm_smmu_unbind_task,
.attach_dev = arm_smmu_attach_dev,
.map = arm_smmu_map,
.unmap = arm_smmu_unmap,
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 24/30] iommu: Specify PASID state when unbinding a task
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (22 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 23/30] iommu/arm-smmu-v3: Bind/unbind device and task Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
[not found] ` <20170227195441.5170-25-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-27 19:54 ` [RFC PATCH 25/30] iommu/arm-smmu-v3: Safe invalidation and recycling of PASIDs Jean-Philippe Brucker
` (6 subsequent siblings)
30 siblings, 1 reply; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Provide a way for device drivers to tell the IOMMU driver about the state
of the PASID they are trying to decommission. When unbinding a task from a
device, the IOMMU driver needs to know whether it can immediately reuse
the PASID for another task, or if there is additional work to be done
before the PASID is safe to re-use.
One hard requirement when calling unbind is that the associated PASID is
not present in any transaction downstream of the IOMMU anymore. In other
words, all read, write and page requests referring to this PASID have
finished.
For PCIe, this means that the driver has successfully executed the
device-specific stop request mechanism described in 6.20.1 (Managing PASID
TLP Prefix Usage). In particular:
* the device doesn't issue any new requests for this PASID,
* all non-posted requests for this PASID have been completed,
* all posted requests for this PASID (addressing host memory) have been
flushed to the host.
Address Translation Requests are non-posted, and PRI Page Requests (PPR)
are posted. In addition, with PRI the device must implement one of the
following mechanisms (ATS spec 4.1.2. - Managing PASID TLP Prefix Usage):
A. Finish transmitting any PPR affecting this PASID and wait for their
response. In this case, the IOMMU driver can safely reuse the PASID and
must not wait for a Stop Marker.
B. Finish transmitting any PPR affecting this PASID and send a Stop
Marker. The driver must wait to receive a Stop Marker for this PASID
before reusing it.
This patch lets the driver communicate the current state of the PASID with
either IOMMU_PASID_CLEAN for case A, or IOMMU_PASID_FLUSHED for case B.
It is an important distinction because, if the IOMMU driver reassigns a
PASID while the IOMMU still holds pending PPR targeting that PASID
internally, the PPR will trigger a fault in the wrong address space.
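As an illustration of the two cases from the device driver's side (a sketch
only, not part of this patch; the mydev_* helpers are hypothetical and stand
for the device-specific stop mechanism described above):

    static void example_stop_and_unbind(struct device *dev, struct mydev *mydev,
                                        int pasid)
    {
            if (mydev_uses_stop_marker(mydev)) {
                    /* Case B: a Stop Marker was sent, the IOMMU must wait
                     * for it before reassigning the PASID.
                     */
                    mydev_stop_pasid_with_marker(mydev, pasid);
                    iommu_unbind_task(dev, pasid, IOMMU_PASID_FLUSHED);
            } else {
                    /* Case A: all PPRs for this PASID got their response,
                     * the PASID can be reused immediately.
                     */
                    mydev_stop_pasid_and_drain(mydev, pasid);
                    iommu_unbind_task(dev, pasid, IOMMU_PASID_CLEAN);
            }
    }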
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/iommu.c | 8 ++++++++
include/linux/iommu.h | 18 +++++++++++++++++-
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 26c5f6528c69..eed52500d469 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1517,6 +1517,14 @@ EXPORT_SYMBOL_GPL(iommu_bind_task);
* @dev: device bound to the task
* @pasid: identifier of the bond
* @flags: state of the PASID and driver-specific flags
+ *
+ * The caller must inform the IOMMU driver whether the PASID is safe to reuse
+ * immediately or if it needs more invalidation steps, by setting flags to
+ * either IOMMU_PASID_FLUSHED, or IOMMU_PASID_CLEAN.
+ *
+ * Without one of these flags, the device driver must have provided an
+ * invalidate_pasid callback in iommu_svm_ops. Otherwise, iommu_unbind_task
+ * returns an error.
*/
int iommu_unbind_task(struct device *dev, int pasid, int flags)
{
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9554f45d4305..204943ef38b2 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -50,6 +50,21 @@ struct notifier_block;
#define IOMMU_FAULT_READ 0x0
#define IOMMU_FAULT_WRITE 0x1
+/*
+ * State of a PASID in the system
+ *
+ * IOMMU_PASID_FLUSHED: the device does not generate any traffic for this PASID
+ * anymore, and all references to the PASID have been flushed; in other words,
+ * the IOMMU will not receive any transaction referring to this instance of
+ * the PASID anymore.
+ *
+ * IOMMU_PASID_CLEAN: in addition to IOMMU_PASID_FLUSHED, the PASID isn't
+ * present in the IOMMU either. For instance when using PRI, the device waited
+ * for all of its page requests to come back with a response.
+ */
+#define IOMMU_PASID_FLUSHED 0x1
+#define IOMMU_PASID_CLEAN 0x2
+
typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
struct device *, unsigned long, int, void *);
@@ -147,7 +162,8 @@ struct iommu_resv_region {
/*
* @handle_fault: report or handle a fault from the device (FIXME: imprecise)
- * @invalidate_pasid: stop using a PASID.
+ * @invalidate_pasid: stop using a PASID. Returns one of IOMMU_PASID_FLUSHED or
+ * IOMMU_PASID_CLEAN when stopped successfully. 0 otherwise.
*/
struct iommu_svm_ops {
int (*handle_fault)(struct device *dev, int pasid, u64 address,
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 25/30] iommu/arm-smmu-v3: Safe invalidation and recycling of PASIDs
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (23 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 24/30] iommu: Specify PASID state when unbinding a task Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 26/30] iommu/arm-smmu-v3: Fix PRI queue overflow acknowledgement Jean-Philippe Brucker
` (5 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
This patch proposes a solution for safely reusing a context after it is
released with iommu_unbind_task. Let's first describe the lifetime of a
context.
A context is a bond between device and task, identified by a PASID. (I
will be using "PASID" and "context" interchangeably.) We identify four
states for a PASID: USED, STALE, INVALID, FREE.
(2) .----- INVALID <-----. (3a)
| |
v (1) |
(init)----> FREE ---------------> USED
^ |
| |
(3b) '------ STALE <------' (2)
Initially, all PASIDs are free for use. A call to bind_task (1) allocates
a PASID. A call to unbind_task (2) puts the context into STALE state. At
this point we mandate that the device doesn't generate any new traffic for
the PASID. If the device isn't using PRI (3a), we can free the PASID.
Otherwise, we cannot re-allocate the PASID until we are certain that there
are no pending page requests for that PASID. This is done by a bus- and
device-specific PASID invalidation operation (3b). Once that operation
completes, the PASID can be reallocated for a new context. The PASID
invalidation could also be observed prior to receiving an unbind_task call
(3a). In that case, the PASID can be reused immediately.
The PCIe ATS specification defines two mechanisms for invalidating PASIDs
(4.1.2. Managing PASID TLP Prefix Usage):
* When ceasing to use a PASID, the device finishes transmitting any related
requests and waits for them to come back with a response.
* When ceasing to use a PASID, the device marks all related outstanding
requests as stale and sends a Stop Marker. Any page request with that
PASID received after the Stop Marker is related to a different context.
In the first case, the device driver might know that the PASID has been
invalidated before calling unbind_task, in which case it should pass
IOMMU_PASID_CLEAN to iommu_unbind_task. This indicates that the PASID can
be safely reused immediately. In any other implementation, it is
impossible to know which happens first, (2) or (3).
When unbind_task is called, there could still be transactions with the
affected PASID in the system buffers:
(A) making their way towards the SMMU,
(B) waiting in the PRI queue to be processed by the handler,
(C) waiting in the fault work queue.
We consider (A) to be a bug. The PCIe specification requires all "Posted
Requests addressing host memory" to be flushed to the host before
completing the device-specific stop request mechanism (6.20.1 Managing
PASID TLP Prefix Usage). We mandate the device driver to perform this stop
request before calling iommu_unbind, and ensure that no transaction
referring to this PASID is pending in the PCIe system. We'll have to put
the same requirement on non-PCIe buses.
(B) is the SMMU driver's responsibility, and is quite a drag, because we
can't inspect the PRI queue without adding locks around producer and
consumer registers, or else we would race with the PRI handling thread.
(C) is easier, we have a direct way to drain a work queue.
A major complication of the second point is that even when a device
properly implements Stop Markers, we might lose them if the SMMU's PRI
queue overflows. Indeed, in case of an overflow the SMMU is able to
auto-respond to page faults, but Stop Markers are discarded. So a safe
implementation that takes overflow into account cannot solely rely on Stop
Markers for freeing contexts. Stop Markers only allow us to speed up the
freeing process.
* * *
This patch adds context state tracking and delayed invalidation, in order
to safely recycle contexts.
arm_smmu_unbind_task atomically sets the context's state to STALE. If the
state was already INVALIDATED, either by Stop Marker or by a flag passed
to unbind, then we can immediately release the context. Otherwise release
only the address space. Transitions between states are done atomically, so
for example when a transition from STALE to FREE is successful, the thread
doing the transition can safely release the context.
A stale context that wasn't released during unbind may be released later
when the fault handler receives a Stop Marker. The fault handler, when it
receives such marker, sets the context's state to INVALIDATED. If the
state was already STALE, the context can be released. Unlike any other
PPR, no reply is expected for a stop marker.
Someone then needs to sweep stale contexts that never received a Stop
Marker. Introduce a "sweep_contexts" work for each master that cleans the
context list by inspecting the state of each context and releasing it when
its time has come. The work is scheduled whenever the number of stale
contexts reaches a watermark. For the moment we arbitrarily define this
limit as a fourth of the total number of contexts supported by a master.
Knowing when a stale context can be invalidated is a bit tricky as
explained above, because it requires knowing the state of the PRI queue.
The sweeper waits for the queue to be empty or the PRIQ thread to read the
whole queue (do a cycle), whichever comes first. After that, we can
consider that any reference to the PASID present in the PRIQ when we
marked the context stale has now been removed and pushed out to the fault
work queue. Flush the work queue and remove the context.
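The release rule itself boils down to a small atomic pattern (shown here as a
sketch distilled from the patch below; release_context() stands for
_arm_smmu_put_context() under the contexts lock, and ctx/old are shorthand):

    /* In unbind (or detach): mark the context stale. If the Stop Marker
     * was already seen, we set the second bit and may free the context.
     */
    old = atomic64_fetch_or(ARM_SMMU_CONTEXT_STALE, &ctx->state);
    if (old == ARM_SMMU_CONTEXT_INVALIDATED)
            release_context(ctx);

    /* In the fault handler, on receiving a Stop Marker: symmetric. */
    old = atomic64_fetch_or(ARM_SMMU_CONTEXT_INVALIDATED, &ctx->state);
    if (old == ARM_SMMU_CONTEXT_STALE)
            release_context(ctx);

    /* The sweeper instead volunteers with a cmpxchg, so only one thread
     * queues a given stale context for delayed invalidation.
     */
    if (atomic64_cmpxchg(&ctx->state, ARM_SMMU_CONTEXT_STALE,
                         ARM_SMMU_CONTEXT_FREE) == ARM_SMMU_CONTEXT_STALE)
            list_add(&ctx->flush_head, &flush_list);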
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 269 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 261 insertions(+), 8 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 3ba7f65020f9..2f1ec09aeaec 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -474,6 +474,8 @@ enum fault_status {
ARM_SMMU_FAULT_FAIL,
/* Fault has been handled, the access should be retried */
ARM_SMMU_FAULT_SUCC,
+ /* Do not send any reply to the device */
+ ARM_SMMU_FAULT_IGNORE,
};
enum arm_smmu_msi_index {
@@ -593,6 +595,9 @@ struct arm_smmu_evtq {
struct arm_smmu_priq {
struct arm_smmu_queue q;
+
+ u64 batch;
+ wait_queue_head_t wq;
};
/* High-level stream table and context descriptor structures */
@@ -742,6 +747,10 @@ struct arm_smmu_master_data {
bool can_fault;
u32 avail_contexts;
+ struct work_struct sweep_contexts;
+#define STALE_CONTEXTS_LIMIT(master) ((master)->avail_contexts / 4)
+ u32 stale_contexts;
+
const struct iommu_svm_ops *svm_ops;
};
@@ -825,8 +834,15 @@ struct arm_smmu_context {
struct list_head task_head;
struct rb_node master_node;
+ struct list_head flush_head;
struct kref kref;
+
+#define ARM_SMMU_CONTEXT_STALE (1 << 0)
+#define ARM_SMMU_CONTEXT_INVALIDATED (1 << 1)
+#define ARM_SMMU_CONTEXT_FREE (ARM_SMMU_CONTEXT_STALE |\
+ ARM_SMMU_CONTEXT_INVALIDATED)
+ atomic64_t state;
};
struct arm_smmu_group {
@@ -1179,7 +1195,7 @@ static void arm_smmu_fault_reply(struct arm_smmu_fault *fault,
},
};
- if (!fault->last)
+ if (!fault->last || resp == ARM_SMMU_FAULT_IGNORE)
return;
arm_smmu_cmdq_issue_cmd(fault->smmu, &cmd);
@@ -1807,11 +1823,23 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
{
struct arm_smmu_device *smmu = dev;
struct arm_smmu_queue *q = &smmu->priq.q;
+ size_t queue_size = 1 << q->max_n_shift;
u64 evt[PRIQ_ENT_DWORDS];
+ size_t i = 0;
+
+ spin_lock(&smmu->priq.wq.lock);
do {
- while (!queue_remove_raw(q, evt))
+ while (!queue_remove_raw(q, evt)) {
+ spin_unlock(&smmu->priq.wq.lock);
arm_smmu_handle_ppr(smmu, evt);
+ spin_lock(&smmu->priq.wq.lock);
+ if (++i == queue_size) {
+ smmu->priq.batch++;
+ wake_up_locked(&smmu->priq.wq);
+ i = 0;
+ }
+ }
if (queue_sync_prod(q) == -EOVERFLOW)
dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
@@ -1819,6 +1847,12 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
/* Sync our overflow flag, as we believe we're up to speed */
q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+
+ smmu->priq.batch++;
+ wake_up_locked(&smmu->priq.wq);
+
+ spin_unlock(&smmu->priq.wq.lock);
+
return IRQ_HANDLED;
}
@@ -2684,6 +2718,22 @@ static enum fault_status _arm_smmu_handle_fault(struct arm_smmu_fault *fault)
return resp;
}
+ if (fault->last && !fault->read && !fault->write) {
+ /* Special case: stop marker invalidates the PASID */
+ u64 val = atomic64_fetch_or(ARM_SMMU_CONTEXT_INVALIDATED,
+ &smmu_context->state);
+ if (val == ARM_SMMU_CONTEXT_STALE) {
+ spin_lock(&smmu->contexts_lock);
+ _arm_smmu_put_context(smmu_context);
+ smmu_context->master->stale_contexts--;
+ spin_unlock(&smmu->contexts_lock);
+ }
+
+ /* No reply expected */
+ resp = ARM_SMMU_FAULT_IGNORE;
+ goto out_put_context;
+ }
+
fault->ssv = smmu_context->master->ste.prg_response_needs_ssid;
spin_lock(&smmu->contexts_lock);
@@ -2693,6 +2743,7 @@ static enum fault_status _arm_smmu_handle_fault(struct arm_smmu_fault *fault)
spin_unlock(&smmu->contexts_lock);
if (!smmu_task)
+ /* Stale context */
goto out_put_context;
list_for_each_entry(tmp_prg, &smmu_task->prgs, list) {
@@ -2744,7 +2795,7 @@ static void arm_smmu_handle_fault(struct work_struct *work)
work);
resp = _arm_smmu_handle_fault(fault);
- if (resp != ARM_SMMU_FAULT_SUCC)
+ if (resp != ARM_SMMU_FAULT_SUCC && resp != ARM_SMMU_FAULT_IGNORE)
dev_info_ratelimited(fault->smmu->dev, "%s fault:\n"
"\t0x%08x.0x%05x: [%u%s] %sprivileged %s%s%s access at iova "
"0x%016llx\n",
@@ -2759,6 +2810,81 @@ static void arm_smmu_handle_fault(struct work_struct *work)
kfree(fault);
}
+static void arm_smmu_sweep_contexts(struct work_struct *work)
+{
+ u64 batch;
+ int ret, i = 0;
+ struct arm_smmu_priq *priq;
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_master_data *master;
+ struct arm_smmu_context *smmu_context, *tmp;
+ struct list_head flush_list = LIST_HEAD_INIT(flush_list);
+
+ master = container_of(work, struct arm_smmu_master_data, sweep_contexts);
+ smmu = master->smmu;
+ priq = &smmu->priq;
+
+ spin_lock(&smmu->contexts_lock);
+ dev_dbg(smmu->dev, "Sweeping contexts %u/%u\n",
+ master->stale_contexts, master->avail_contexts);
+
+ rbtree_postorder_for_each_entry_safe(smmu_context, tmp,
+ &master->contexts, master_node) {
+ u64 val = atomic64_cmpxchg(&smmu_context->state,
+ ARM_SMMU_CONTEXT_STALE,
+ ARM_SMMU_CONTEXT_FREE);
+ if (val != ARM_SMMU_CONTEXT_STALE)
+ continue;
+
+ /*
+ * We volunteered for deleting this context by setting the state
+ * atomically. This guarantees that no one else writes to its
+ * flush_head field.
+ */
+ list_add(&smmu_context->flush_head, &flush_list);
+ }
+ spin_unlock(&smmu->contexts_lock);
+
+ if (list_empty(&flush_list))
+ return;
+
+ /*
+ * Now wait until the priq thread finishes a batch, or until the queue
+ * is empty. After that, we are certain that the last references to this
+ * context have been flushed to the fault work queue. Note that we don't
+ * handle overflows on priq->batch. If it occurs, just wait for the
+ * queue to be empty.
+ */
+ spin_lock(&priq->wq.lock);
+ if (queue_sync_prod(&priq->q) == -EOVERFLOW)
+ dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
+ batch = priq->batch;
+ ret = wait_event_interruptible_locked(priq->wq, queue_empty(&priq->q) ||
+ priq->batch >= batch + 2);
+ spin_unlock(&priq->wq.lock);
+
+ if (ret) {
+ /* Woops, rollback. */
+ spin_lock(&smmu->contexts_lock);
+ list_for_each_entry(smmu_context, &flush_list, flush_head)
+ atomic64_xchg(&smmu_context->state,
+ ARM_SMMU_CONTEXT_STALE);
+ spin_unlock(&smmu->contexts_lock);
+ return;
+ }
+
+ flush_workqueue(smmu->fault_queue);
+
+ spin_lock(&smmu->contexts_lock);
+ list_for_each_entry_safe(smmu_context, tmp, &flush_list, flush_head) {
+ _arm_smmu_put_context(smmu_context);
+ i++;
+ }
+
+ master->stale_contexts -= i;
+ spin_unlock(&smmu->contexts_lock);
+}
+
static bool arm_smmu_master_supports_svm(struct arm_smmu_master_data *master)
{
return dev_is_pci(master->dev) && master->can_fault &&
@@ -2782,6 +2908,18 @@ static int arm_smmu_set_svm_ops(struct device *dev,
return 0;
}
+static int arm_smmu_invalidate_context(struct arm_smmu_context *smmu_context)
+{
+ struct arm_smmu_master_data *master = smmu_context->master;
+
+ if (!master->svm_ops || !master->svm_ops->invalidate_pasid)
+ return 0;
+
+ return master->svm_ops->invalidate_pasid(master->dev,
+ smmu_context->ssid,
+ smmu_context->priv);
+}
+
static int arm_smmu_bind_task(struct device *dev, struct task_struct *task,
int *pasid, int flags, void *priv)
{
@@ -2876,6 +3014,10 @@ static int arm_smmu_bind_task(struct device *dev, struct task_struct *task,
static int arm_smmu_unbind_task(struct device *dev, int pasid, int flags)
{
+ int ret;
+ unsigned long val;
+ unsigned int pasid_state;
+ bool put_context = false;
struct arm_smmu_device *smmu;
struct arm_smmu_master_data *master;
struct arm_smmu_context *smmu_context = NULL;
@@ -2895,22 +3037,53 @@ static int arm_smmu_unbind_task(struct device *dev, int pasid, int flags)
dev_dbg(dev, "unbind PASID %d\n", pasid);
+ pasid_state = flags & (IOMMU_PASID_FLUSHED | IOMMU_PASID_CLEAN);
+ if (!pasid_state)
+ pasid_state = arm_smmu_invalidate_context(smmu_context);
+
+ if (!pasid_state) {
+ /* PASID is in use, we can't do anything. */
+ ret = -EBUSY;
+ goto err_put_context;
+ }
+
/*
* There isn't any "ATC invalidate all by PASID" command. If this isn't
* good enough, we'll need fine-grained invalidation for each vma.
*/
arm_smmu_atc_invalidate_context(smmu_context, 0, -1);
+ val = atomic64_fetch_or(ARM_SMMU_CONTEXT_STALE, &smmu_context->state);
+ if (val == ARM_SMMU_CONTEXT_INVALIDATED || !master->can_fault) {
+ /* We already received a stop marker for this context. */
+ put_context = true;
+ } else if (pasid_state & IOMMU_PASID_CLEAN) {
+ /* We are allowed to free the PASID now! */
+ val = atomic64_fetch_or(ARM_SMMU_CONTEXT_INVALIDATED,
+ &smmu_context->state);
+ if (val == ARM_SMMU_CONTEXT_STALE)
+ put_context = true;
+ }
+
spin_lock(&smmu->contexts_lock);
if (smmu_context->task)
arm_smmu_detach_task(smmu_context);
/* Release the ref we got earlier in this function */
_arm_smmu_put_context(smmu_context);
- _arm_smmu_put_context(smmu_context);
+
+ if (put_context)
+ _arm_smmu_put_context(smmu_context);
+ else if (++master->stale_contexts >= STALE_CONTEXTS_LIMIT(master))
+ queue_work(system_long_wq, &master->sweep_contexts);
spin_unlock(&smmu->contexts_lock);
return 0;
+
+err_put_context:
+ arm_smmu_put_context(smmu, smmu_context);
+
+ return ret;
}
static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
@@ -3137,6 +3310,7 @@ static void arm_smmu_detach_dev(struct device *dev)
struct arm_smmu_device *smmu = master->smmu;
struct arm_smmu_context *smmu_context;
struct rb_node *node, *next;
+ int new_stale_contexts = 0;
mutex_lock(&smmu->domains_mutex);
@@ -3151,17 +3325,64 @@ static void arm_smmu_detach_dev(struct device *dev)
if (!master->ste.valid)
return;
+ /* Try to clean the contexts. */
spin_lock(&smmu->contexts_lock);
for (node = rb_first(&master->contexts); node; node = next) {
+ u64 val;
+ int pasid_state = 0;
+
smmu_context = rb_entry(node, struct arm_smmu_context,
master_node);
next = rb_next(node);
- if (smmu_context->task)
- arm_smmu_detach_task(smmu_context);
+ val = atomic64_fetch_or(ARM_SMMU_CONTEXT_STALE,
+ &smmu_context->state);
+ if (val == ARM_SMMU_CONTEXT_FREE)
+ /* Someone else is waiting to free this context */
+ continue;
+
+ if (!(val & ARM_SMMU_CONTEXT_STALE)) {
+ pasid_state = arm_smmu_invalidate_context(smmu_context);
+ if (!pasid_state) {
+ /*
+ * This deserves a slap, since there still
+ * might be references to that PASID hanging
+ * around downstream of the SMMU and we can't
+ * do anything about it.
+ */
+ dev_warn(dev, "PASID %u was still bound!\n",
+ smmu_context->ssid);
+ }
+
+ if (smmu_context->task)
+ arm_smmu_detach_task(smmu_context);
+ else
+ dev_warn(dev, "bound without a task?!");
+
+ new_stale_contexts++;
+ }
+
+ if (!(val & ARM_SMMU_CONTEXT_INVALIDATED) && master->can_fault &&
+ !(pasid_state & IOMMU_PASID_CLEAN)) {
+ /*
+ * We can't free the context yet, its PASID might still
+ * be waiting in the pipe.
+ */
+ continue;
+ }
+
+ val = atomic64_fetch_or(ARM_SMMU_CONTEXT_INVALIDATED,
+ &smmu_context->state);
+ if (val == ARM_SMMU_CONTEXT_FREE)
+ continue;
_arm_smmu_put_context(smmu_context);
+ new_stale_contexts--;
}
+
+ master->stale_contexts += new_stale_contexts;
+ if (master->stale_contexts)
+ queue_work(system_long_wq, &master->sweep_contexts);
spin_unlock(&smmu->contexts_lock);
}
@@ -3581,6 +3802,8 @@ static int arm_smmu_add_device(struct device *dev)
fwspec->iommu_priv = master;
master->contexts = RB_ROOT;
+
+ INIT_WORK(&master->sweep_contexts, arm_smmu_sweep_contexts);
}
/* Check the SIDs are in range of the SMMU and our stream table */
@@ -3653,11 +3876,14 @@ static int arm_smmu_add_device(struct device *dev)
static void arm_smmu_remove_device(struct device *dev)
{
struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+ struct arm_smmu_context *smmu_context;
struct arm_smmu_master_data *master;
struct arm_smmu_group *smmu_group;
struct arm_smmu_device *smmu;
+ struct rb_node *node, *next;
struct iommu_group *group;
unsigned long flags;
+ u64 val;
int i;
if (!fwspec || fwspec->ops != &arm_smmu_ops)
@@ -3669,16 +3895,40 @@ static void arm_smmu_remove_device(struct device *dev)
arm_smmu_detach_dev(dev);
if (master) {
+ cancel_work_sync(&master->sweep_contexts);
+
+ spin_lock(&smmu->contexts_lock);
+
+ for (node = rb_first(&master->contexts); node; node = next) {
+ smmu_context = rb_entry(node, struct arm_smmu_context,
+ master_node);
+ next = rb_next(node);
+
+ /*
+ * Force removal of remaining contexts. They were marked
+ * stale by detach_dev, but haven't been invalidated
+ * since. Page requests might be pending but we can't
+ * afford to wait for them anymore. Bad things will
+ * happen.
+ */
+ dev_warn(dev, "PASID %u wasn't invalidated\n",
+ smmu_context->ssid);
+ val = atomic64_xchg(&smmu_context->state,
+ ARM_SMMU_CONTEXT_FREE);
+ if (val != ARM_SMMU_CONTEXT_FREE)
+ _arm_smmu_put_context(smmu_context);
+ }
+
if (master->streams) {
- spin_lock(&smmu->contexts_lock);
for (i = 0; i < fwspec->num_ids; i++)
rb_erase(&master->streams[i].node,
&smmu->streams);
- spin_unlock(&smmu->contexts_lock);
kfree(master->streams);
}
+ spin_unlock(&smmu->contexts_lock);
+
group = iommu_group_get(dev);
smmu_group = to_smmu_group(group);
@@ -3864,6 +4114,9 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
if (!(smmu->features & ARM_SMMU_FEAT_PRI))
return 0;
+ init_waitqueue_head(&smmu->priq.wq);
+ smmu->priq.batch = 0;
+
return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS);
}
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 26/30] iommu/arm-smmu-v3: Fix PRI queue overflow acknowledgement
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (24 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 25/30] iommu/arm-smmu-v3: Safe invalidation and recycling of PASIDs Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 27/30] iommu/arm-smmu-v3: Handle PRI queue overflow Jean-Philippe Brucker
` (4 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
When an overflow occurs in the PRI queue, the SMMU toggles the overflow
flag in the PROD register. To exit the overflow condition, the PRI thread
is supposed to acknowledge it by toggling this flag in the CONS register.
Currently with an overflow condition, the flag is toggled in q->cons after
clearing the PRI queue, but is never published to the hardware. It would
be done next time we execute the thread. However, we never get a chance
because the SMMU doesn't append anything to the queue while in overflow
condition, and the thread is not scheduled unless the queue transitions
from empty to non-empty. To fix it, synchronize the hardware CONS register
before leaving the PRIQ thread.
This bug doesn't affect the event queue, since the SMMU still adds
elements to that queue when the overflow condition is active. Even missing
an overflow condition because one is already active doesn't matter. We
won't miss fault records for stalled transactions. But it feels nicer to
keep the SMMU in sync when possible, so do it there as well.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 2f1ec09aeaec..b5d45c1e14d1 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -932,6 +932,16 @@ static void queue_inc_cons(struct arm_smmu_queue *q)
writel(q->cons, q->cons_reg);
}
+static void queue_sync_cons_ovf(struct arm_smmu_queue *q)
+{
+ /* Acknowledge overflow condition if any */
+ if (Q_OVF(q, q->prod) == Q_OVF(q, q->cons))
+ return;
+
+ q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+ writel(q->cons, q->cons_reg);
+}
+
static int queue_sync_prod(struct arm_smmu_queue *q)
{
int ret = 0;
@@ -1782,7 +1792,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
} while (!queue_empty(q));
/* Sync our overflow flag, as we believe we're up to speed */
- q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+ queue_sync_cons_ovf(q);
return IRQ_HANDLED;
}
@@ -1846,7 +1856,7 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
} while (!queue_empty(q));
/* Sync our overflow flag, as we believe we're up to speed */
- q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons);
+ queue_sync_cons_ovf(q);
smmu->priq.batch++;
wake_up_locked(&smmu->priq.wq);
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 27/30] iommu/arm-smmu-v3: Handle PRI queue overflow
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (25 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 26/30] iommu/arm-smmu-v3: Fix PRI queue overflow acknowledgement Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 28/30] iommu/arm-smmu-v3: Add support for Hardware Translation Table Update at stage 1 Jean-Philippe Brucker
` (3 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
When the PRI queue is full, it enters overflow condition, which is sticky
and exited by the PRI thread once it has had time to free up some slots.
During that time, no new entry is added to the queue. The SMMU
automatically replies to PRI Page Requests (PPR) that have "last=1" with
"success", to let the device retry later. PPRs that have "last=0" and
PASID Stop Markers are silently ignored. Two related issues need to be
fixed:
* Any PPR in the PRI queue prior to the overflow condition might be in a
Page Request Group (PRG) that has its last entry auto-responded while in
overflow. Until we fix up the overflow, ignore any non-last PPR received
by the PRI thread.
* In addition, any PRG of PPRs already committed to the fault queue is now
potentially invalid, since their last PPR might have been lost. Wait
until the overflow condition is fixed, and destroy *all* remaining PRG
structures :( We do that by appending a PRG sweeper work to the fault
queue, that will do some inefficient sweeping and lock up the fault
queue for a while. Awful, but necessary.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 37 ++++++++++++++++++++++++++++++++++---
1 file changed, 34 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index b5d45c1e14d1..1a5e72752e6d 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -723,6 +723,7 @@ struct arm_smmu_device {
struct list_head tasks;
struct workqueue_struct *fault_queue;
+ struct work_struct flush_prgs;
struct list_head domains;
struct mutex domains_mutex;
@@ -1798,7 +1799,8 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
static void arm_smmu_handle_fault(struct work_struct *work);
-static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt)
+static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt,
+ bool overflowing)
{
struct arm_smmu_fault *fault;
struct arm_smmu_fault params = {
@@ -1817,6 +1819,9 @@ static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt)
.priv = evt[0] & PRIQ_0_PERM_PRIV,
};
+ if (overflowing && !params.last)
+ return;
+
fault = kmem_cache_alloc(arm_smmu_fault_cache, GFP_KERNEL);
if (!fault) {
/* Out of memory, tell the device to retry later */
@@ -1834,6 +1839,7 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
struct arm_smmu_device *smmu = dev;
struct arm_smmu_queue *q = &smmu->priq.q;
size_t queue_size = 1 << q->max_n_shift;
+ bool overflowing = false;
u64 evt[PRIQ_ENT_DWORDS];
size_t i = 0;
@@ -1842,7 +1848,7 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
do {
while (!queue_remove_raw(q, evt)) {
spin_unlock(&smmu->priq.wq.lock);
- arm_smmu_handle_ppr(smmu, evt);
+ arm_smmu_handle_ppr(smmu, evt, overflowing);
spin_lock(&smmu->priq.wq.lock);
if (++i == queue_size) {
smmu->priq.batch++;
@@ -1851,8 +1857,10 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
}
}
- if (queue_sync_prod(q) == -EOVERFLOW)
+ if (queue_sync_prod(q) == -EOVERFLOW) {
dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n");
+ overflowing = true;
+ }
} while (!queue_empty(q));
/* Sync our overflow flag, as we believe we're up to speed */
@@ -1863,6 +1871,9 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev)
spin_unlock(&smmu->priq.wq.lock);
+ if (overflowing)
+ queue_work(smmu->fault_queue, &smmu->flush_prgs);
+
return IRQ_HANDLED;
}
@@ -2820,6 +2831,24 @@ static void arm_smmu_handle_fault(struct work_struct *work)
kfree(fault);
}
+static void arm_smmu_flush_prgs(struct work_struct *work)
+{
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_task *smmu_task;
+ struct arm_smmu_pri_group *prg, *next_prg;
+
+ smmu = container_of(work, struct arm_smmu_device, flush_prgs);
+
+ spin_lock(&smmu->contexts_lock);
+ list_for_each_entry(smmu_task, &smmu->tasks, smmu_head) {
+ list_for_each_entry_safe(prg, next_prg, &smmu_task->prgs, list) {
+ list_del(&prg->list);
+ kfree(prg);
+ }
+ }
+ spin_unlock(&smmu->contexts_lock);
+}
+
static void arm_smmu_sweep_contexts(struct work_struct *work)
{
u64 batch;
@@ -4269,6 +4298,8 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
smmu->fault_queue = alloc_ordered_workqueue("smmu_fault_queue", 0);
if (!smmu->fault_queue)
return -ENOMEM;
+
+ INIT_WORK(&smmu->flush_prgs, arm_smmu_flush_prgs);
}
return arm_smmu_init_strtab(smmu);
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 28/30] iommu/arm-smmu-v3: Add support for Hardware Translation Table Update at stage 1
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (26 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 27/30] iommu/arm-smmu-v3: Handle PRI queue overflow Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-02-27 19:54 ` [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory Jean-Philippe Brucker
` (2 subsequent siblings)
30 siblings, 0 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
If the SMMU supports it and the kernel was built with HTTU support, enable
hardware update of access and dirty flags.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/iommu/arm-smmu-v3.c | 27 ++++++++++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 1a5e72752e6d..5d202f164b95 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -68,6 +68,8 @@
#define IDR0_ASID16 (1 << 12)
#define IDR0_ATS (1 << 10)
#define IDR0_HYP (1 << 9)
+#define IDR0_HD (1 << 7)
+#define IDR0_HA (1 << 6)
#define IDR0_BTM (1 << 5)
#define IDR0_COHACC (1 << 4)
#define IDR0_TTF_SHIFT 2
@@ -346,7 +348,16 @@
#define ARM64_TCR_TBI0_SHIFT 37
#define ARM64_TCR_TBI0_MASK 0x1UL
+#define ARM64_TCR_HA_SHIFT 39
+#define ARM64_TCR_HA_MASK 0x1UL
+#define ARM64_TCR_HD_SHIFT 40
+#define ARM64_TCR_HD_MASK 0x1UL
+
#define CTXDESC_CD_0_AA64 (1UL << 41)
+#define CTXDESC_CD_0_TCR_HD_SHIFT 42
+#define CTXDESC_CD_0_TCR_HA_SHIFT 43
+#define CTXDESC_CD_0_HD (1UL << CTXDESC_CD_0_TCR_HD_SHIFT)
+#define CTXDESC_CD_0_HA (1UL << CTXDESC_CD_0_TCR_HA_SHIFT)
#define CTXDESC_CD_0_R (1UL << 45)
#define CTXDESC_CD_0_A (1UL << 46)
#define CTXDESC_CD_0_ASET_SHIFT 47
@@ -687,6 +698,8 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_E2H (1 << 13)
#define ARM_SMMU_FEAT_BTM (1 << 14)
#define ARM_SMMU_FEAT_SVM (1 << 15)
+#define ARM_SMMU_FEAT_HA (1 << 16)
+#define ARM_SMMU_FEAT_HD (1 << 17)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -1275,6 +1288,12 @@ static u64 arm_smmu_cpu_tcr_to_cd(struct arm_smmu_device *smmu, u64 tcr)
if (!(smmu->features & ARM_SMMU_FEAT_ATS))
val |= ARM_SMMU_TCR2CD(tcr, TBI0);
+ if (smmu->features & ARM_SMMU_FEAT_HA)
+ val |= ARM_SMMU_TCR2CD(tcr, HA);
+
+ if (smmu->features & ARM_SMMU_FEAT_HD)
+ val |= ARM_SMMU_TCR2CD(tcr, HD);
+
return val;
}
@@ -2497,7 +2516,7 @@ static int arm_smmu_init_task_pgtable(struct arm_smmu_task *smmu_task)
tcr |= par << ARM_LPAE_TCR_IPS_SHIFT;
/* Enable this by default, it will be filtered when writing the CD */
- tcr |= TCR_TBI0;
+ tcr |= TCR_TBI0 | TCR_HA | TCR_HD;
cfg->asid = asid;
cfg->ttbr = virt_to_phys(smmu_task->mm->pgd);
@@ -4734,6 +4753,12 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
smmu->features |= ARM_SMMU_FEAT_E2H;
}
+ if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && (reg & (IDR0_HA | IDR0_HD))) {
+ smmu->features |= ARM_SMMU_FEAT_HA;
+ if (reg & IDR0_HD)
+ smmu->features |= ARM_SMMU_FEAT_HD;
+ }
+
/*
* If the CPU is using VHE, but the SMMU doesn't support it, the SMMU
* will create TLB entries for NH-EL1 world and will miss the
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
* [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (27 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 28/30] iommu/arm-smmu-v3: Add support for Hardware Translation Table Update at stage 1 Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
2017-03-21 7:04 ` Liu, Yi L
[not found] ` <20170227195441.5170-30-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-27 19:54 ` [RFC PATCH 30/30] vfio: Allow to bind foreign task Jean-Philippe Brucker
2017-03-06 8:20 ` [RFC PATCH 00/30] Add PCIe SVM support to ARM SMMUv3 Liu, Yi L
30 siblings, 2 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Add two new ioctls for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
between a device and a process address space, identified by a
device-specific ID named PASID. This allows the device to target DMA
transactions at the process virtual addresses without a need for mapping
and unmapping buffers explicitly in the IOMMU. The process page tables are
shared with the IOMMU, and mechanisms such as PCI ATS/PRI may be used to
handle faults. VFIO_DEVICE_UNBIND_TASK removes a bond identified by a
PASID.
Also add a capability flag in device info to detect whether the system and
the device support SVM.
Users need to specify the state of a PASID when unbinding, with flags
VFIO_SVM_PASID_RELEASE_FLUSHED and VFIO_SVM_PASID_RELEASE_CLEAN. Even for PCI,
PASID invalidation is specific to each device and only partially covered
by the specification:
* Device must have an implementation-defined mechanism for stopping the
use of a PASID. When this mechanism finishes, the device has stopped
issuing transactions for this PASID and all transactions for this PASID
have been flushed to the IOMMU.
* Device may either wait for all outstanding PRI requests for this PASID
to finish, or issue a Stop Marker message, a barrier that separates PRI
requests affecting this instance of the PASID from PRI requests
affecting the next instance. In the first case, we say that the PASID is
"clean", in the second case it is "flushed" (and the IOMMU has to wait
for the Stop Marker before reassigning the PASID.)
We expect similar distinctions for platform devices. Ideally there should
be a callback for each PCI device, allowing the IOMMU to ask the device to
stop using a PASID. When the callback returns, the PASID is either flushed
or clean and the return value tells which.
For the moment I don't know how to implement this callback for PCI, so if
the user forgets to call unbind with either "clean" or "flushed", the
PASID is never reused. For platform devices, it might be simpler to
implement since we could associate an invalidate_pasid callback to a DT
compatible string, as is currently done for reset.
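From userspace, the intended usage of the two ioctls looks roughly like the
following sketch (device_fd is a VFIO device file descriptor obtained the
usual way; the function name is made up and error handling is minimal):

    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int example_use_svm(int device_fd)
    {
            struct vfio_device_info info = { .argsz = sizeof(info) };
            struct vfio_device_svm svm = { .argsz = sizeof(svm) };

            if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info) ||
                !(info.flags & VFIO_DEVICE_FLAGS_SVM))
                    return -1;

            /* Bind: the kernel writes the allocated PASID into svm.pasid */
            if (ioctl(device_fd, VFIO_DEVICE_BIND_TASK, &svm))
                    return -1;

            /* ... program svm.pasid into the device and DMA directly to
             * malloc'd buffers ...
             */

            /* Device-specific stop mechanism, then release the PASID */
            svm.flags = VFIO_SVM_PASID_RELEASE_CLEAN;
            return ioctl(device_fd, VFIO_DEVICE_UNBIND_TASK, &svm);
    }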
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/vfio/pci/vfio_pci.c | 24 ++++++++++
drivers/vfio/vfio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/vfio.h | 55 +++++++++++++++++++++++
3 files changed, 183 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 324c52e3a1a4..3d7733f94891 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -22,6 +22,7 @@
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
+#include <linux/pci-ats.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -623,6 +624,26 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
return 0;
}
+static bool vfio_pci_supports_svm(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+
+ if (!pdev->ats_enabled)
+ return false;
+
+ if (!pdev->pasid_enabled || pci_max_pasids(pdev) <= 1)
+ return false;
+
+ if (!pdev->pri_enabled)
+ return false;
+
+ /*
+ * If the IOMMU driver enabled all of these, then it supports PCI SVM
+ * for this device.
+ */
+ return true;
+}
+
static long vfio_pci_ioctl(void *device_data,
unsigned int cmd, unsigned long arg)
{
@@ -642,6 +663,9 @@ static long vfio_pci_ioctl(void *device_data,
info.flags = VFIO_DEVICE_FLAGS_PCI;
+ if (vfio_pci_supports_svm(vdev))
+ info.flags |= VFIO_DEVICE_FLAGS_SVM;
+
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 609f4f982c74..c4505d8f4c61 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -97,6 +97,14 @@ struct vfio_device {
struct vfio_group *group;
struct list_head group_next;
void *device_data;
+
+ struct mutex tasks_lock;
+ struct list_head tasks;
+};
+
+struct vfio_task {
+ int pasid;
+ struct list_head list;
};
#ifdef CONFIG_VFIO_NOIOMMU
@@ -520,6 +528,9 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group,
device->device_data = device_data;
dev_set_drvdata(dev, device);
+ mutex_init(&device->tasks_lock);
+ INIT_LIST_HEAD(&device->tasks);
+
/* No need to get group_lock, caller has group reference */
vfio_group_get(group);
@@ -532,6 +543,8 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group,
static void vfio_device_release(struct kref *kref)
{
+ int ret;
+ struct vfio_task *tmp, *task;
struct vfio_device *device = container_of(kref,
struct vfio_device, kref);
struct vfio_group *group = device->group;
@@ -539,6 +552,22 @@ static void vfio_device_release(struct kref *kref)
list_del(&device->group_next);
mutex_unlock(&group->device_lock);
+ mutex_lock(&device->tasks_lock);
+ list_for_each_entry_safe(task, tmp, &device->tasks, list) {
+ /*
+ * This might leak the PASID, since the IOMMU won't know
+ * if it is safe to reuse.
+ */
+ ret = iommu_unbind_task(device->dev, task->pasid, 0);
+ if (ret)
+ dev_warn(device->dev, "failed to unbind PASID %u\n",
+ task->pasid);
+
+ list_del(&task->list);
+ kfree(task);
+ }
+ mutex_unlock(&device->tasks_lock);
+
dev_set_drvdata(device->dev, NULL);
kfree(device);
@@ -1622,6 +1651,75 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
return 0;
}
+static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret;
+ unsigned long minsz;
+
+ struct vfio_device_svm svm;
+ struct vfio_task *vfio_task;
+
+ minsz = offsetofend(struct vfio_device_svm, pasid);
+
+ if (copy_from_user(&svm, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (svm.argsz < minsz)
+ return -EINVAL;
+
+ if (cmd == VFIO_DEVICE_BIND_TASK) {
+ struct task_struct *task = current;
+
+ ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
+ if (ret)
+ return ret;
+
+ vfio_task = kzalloc(sizeof(*vfio_task), GFP_KERNEL);
+ if (!vfio_task) {
+ iommu_unbind_task(device->dev, svm.pasid,
+ IOMMU_PASID_CLEAN);
+ return -ENOMEM;
+ }
+
+ vfio_task->pasid = svm.pasid;
+
+ mutex_lock(&device->tasks_lock);
+ list_add(&vfio_task->list, &device->tasks);
+ mutex_unlock(&device->tasks_lock);
+
+ } else {
+ int flags = 0;
+
+ if (svm.flags & ~(VFIO_SVM_PASID_RELEASE_FLUSHED |
+ VFIO_SVM_PASID_RELEASE_CLEAN))
+ return -EINVAL;
+
+ if (svm.flags & VFIO_SVM_PASID_RELEASE_FLUSHED)
+ flags = IOMMU_PASID_FLUSHED;
+ else if (svm.flags & VFIO_SVM_PASID_RELEASE_CLEAN)
+ flags = IOMMU_PASID_CLEAN;
+
+ mutex_lock(&device->tasks_lock);
+ list_for_each_entry(vfio_task, &device->tasks, list) {
+ if (vfio_task->pasid != svm.pasid)
+ continue;
+
+ ret = iommu_unbind_task(device->dev, svm.pasid, flags);
+ if (ret)
+ dev_warn(device->dev, "failed to unbind PASID %u\n",
+ vfio_task->pasid);
+
+ list_del(&vfio_task->list);
+ kfree(vfio_task);
+ break;
+ }
+ mutex_unlock(&device->tasks_lock);
+ }
+
+ return copy_to_user((void __user *)arg, &svm, minsz) ? -EFAULT : 0;
+}
+
static long vfio_device_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
@@ -1630,6 +1728,12 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
if (unlikely(!device->ops->ioctl))
return -EINVAL;
+ switch (cmd) {
+ case VFIO_DEVICE_BIND_TASK:
+ case VFIO_DEVICE_UNBIND_TASK:
+ return vfio_svm_ioctl(device, cmd, arg);
+ }
+
return device->ops->ioctl(device->device_data, cmd, arg);
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 519eff362c1c..3fe4197a5ea0 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,7 @@ struct vfio_device_info {
#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
#define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_SVM (1 << 4) /* Device supports bind/unbind */
__u32 num_regions; /* Max region index + 1 */
__u32 num_irqs; /* Max IRQ index + 1 */
};
@@ -409,6 +410,60 @@ struct vfio_irq_set {
*/
#define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
+struct vfio_device_svm {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_SVM_PASID_RELEASE_FLUSHED (1 << 0)
+#define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 1)
+ __u32 pasid;
+};
+/*
+ * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
+ * struct vfio_device_svm)
+ *
+ * Share a process' virtual address space with the device.
+ *
+ * This feature creates a new address space for the device, which is not
+ * affected by VFIO_IOMMU_MAP/UNMAP_DMA. Instead, the device can tag its DMA
+ * traffic with the given @pasid to perform transactions on the associated
+ * virtual address space. Mapping and unmapping of buffers is performed by
+ * standard functions such as mmap and malloc.
+ *
+ * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
+ * ID is unique to a device.
+ *
+ * The bond between device and process must be removed with
+ * VFIO_DEVICE_UNBIND_TASK before exiting.
+ *
+ * On fork, the child inherits the device fd and can use the bonds setup by its
+ * parent. Consequently, the child has R/W access on the address spaces bound by
+ * its parent. After an execv, the device fd is closed and the child doesn't
+ * have access to the address space anymore.
+ *
+ * Availability of this feature depends on the device, its bus, the underlying
+ * IOMMU and the CPU architecture. All of these are guaranteed when the device
+ * has VFIO_DEVICE_FLAGS_SVM flag set.
+ *
+ * returns: 0 on success, -errno on failure.
+ */
+#define VFIO_DEVICE_BIND_TASK _IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * VFIO_DEVICE_UNBIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 23,
+ * struct vfio_device_svm)
+ *
+ * Unbind address space identified by @pasid from device. Device must have
+ * stopped issuing any DMA transaction for the PASID and flushed any reference
+ * to this PASID upstream. Some IOMMUs need to know when a PASID is safe to
+ * reuse, in which case one of the following must be present in @flags
+ *
+ * VFIO_SVM_PASID_RELEASE_FLUSHED: the PASID is safe to reassign after the IOMMU
+ * receives an invalidation message from the device.
+ *
+ * VFIO_SVM_PASID_RELEASE_CLEAN: the PASID is safe to reassign immediately.
+ */
+#define VFIO_DEVICE_UNBIND_TASK _IO(VFIO_TYPE, VFIO_BASE + 23)
+
/*
* The VFIO-PCI bus driver makes use of the following fixed region and
* IRQ index mapping. Unimplemented regions return a size of zero.
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
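As an illustration of the interface proposed above, a minimal userspace sketch could look as follows, assuming a <linux/vfio.h> built with this series applied (fd setup, the DMA itself and most error handling are omitted):

#include <linux/vfio.h>
#include <sys/ioctl.h>

/* Check the SVM capability, bind the current process, release the PASID later. */
static int svm_bind_self(int device_fd, __u32 *pasid)
{
	struct vfio_device_info info = { .argsz = sizeof(info) };
	struct vfio_device_svm svm = { .argsz = sizeof(svm) };

	if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info))
		return -1;
	if (!(info.flags & VFIO_DEVICE_FLAGS_SVM))
		return -1;	/* device, bus, IOMMU or CPU lacks SVM support */

	if (ioctl(device_fd, VFIO_DEVICE_BIND_TASK, &svm))
		return -1;

	*pasid = svm.pasid;	/* tag DMA transactions with this PASID */
	return 0;
}

static int svm_unbind(int device_fd, __u32 pasid)
{
	struct vfio_device_svm svm = { .argsz = sizeof(svm), .pasid = pasid };

	/* The device has stopped and drained all traffic for this PASID. */
	svm.flags = VFIO_SVM_PASID_RELEASE_CLEAN;

	return ioctl(device_fd, VFIO_DEVICE_UNBIND_TASK, &svm);
}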
* RE: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
2017-02-27 19:54 ` [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory Jean-Philippe Brucker
@ 2017-03-21 7:04 ` Liu, Yi L
[not found] ` <A2975661238FB949B60364EF0F2C2574390206F0-E2R4CRU6q/6iAffOGbnezLfspsVTdybXVpNB7YpNyf8@public.gmane.org>
[not found] ` <20170227195441.5170-30-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
1 sibling, 1 reply; 103+ messages in thread
From: Liu, Yi L @ 2017-03-21 7:04 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm@vger.kernel.org, Catalin Marinas,
Sinan Kaya, Will Deacon, iommu@lists.linux-foundation.org,
Harv Abdulhamid, linux-pci@vger.kernel.org, Bjorn Helgaas,
David Woodhouse, linux-arm-kernel@lists.infradead.org,
Nate Watterson, Tian, Kevin, Lan, Tianyu, Raj, Ashok,
Pan, Jacob jun
Hi Jean,
I'm working on virtual SVM, and have some comments on the VFIO channel
definition.
> -----Original Message-----
> From: iommu-bounces@lists.linux-foundation.org [mailto:iommu-
> bounces@lists.linux-foundation.org] On Behalf Of Jean-Philippe Brucker
> Sent: Tuesday, February 28, 2017 3:55 AM
> Cc: Shanker Donthineni <shankerd@qti.qualcomm.com>; kvm@vger.kernel.org;
> Catalin Marinas <catalin.marinas@arm.com>; Sinan Kaya
> <okaya@qti.qualcomm.com>; Will Deacon <will.deacon@arm.com>;
> iommu@lists.linux-foundation.org; Harv Abdulhamid <harba@qti.qualcomm.com>;
> linux-pci@vger.kernel.org; Bjorn Helgaas <bhelgaas@google.com>; David
> Woodhouse <dwmw2@infradead.org>; linux-arm-kernel@lists.infradead.org; Nate
> Watterson <nwatters@qti.qualcomm.com>
> Subject: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
>
> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
> between a device and a process address space, identified by a device-specific ID
> named PASID. This allows the device to target DMA transactions at the process
> virtual addresses without a need for mapping and unmapping buffers explicitly in the
> IOMMU. The process page tables are shared with the IOMMU, and mechanisms such
> as PCI ATS/PRI may be used to handle faults. VFIO_DEVICE_UNBIND_TASK removed
> a bond identified by a PASID.
>
> Also add a capability flag in device info to detect whether the system and the device
> support SVM.
>
> Users need to specify the state of a PASID when unbinding, with flags
> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
> PASID invalidation is specific to each device and only partially covered by the
> specification:
>
> * Device must have an implementation-defined mechanism for stopping the
> use of a PASID. When this mechanism finishes, the device has stopped
> issuing transactions for this PASID and all transactions for this PASID
> have been flushed to the IOMMU.
>
> * Device may either wait for all outstanding PRI requests for this PASID
> to finish, or issue a Stop Marker message, a barrier that separates PRI
> requests affecting this instance of the PASID from PRI requests
> affecting the next instance. In the first case, we say that the PASID is
> "clean", in the second case it is "flushed" (and the IOMMU has to wait
> for the Stop Marker before reassigning the PASID.)
>
> We expect similar distinctions for platform devices. Ideally there should be a callback
> for each PCI device, allowing the IOMMU to ask the device to stop using a PASID.
> When the callback returns, the PASID is either flushed or clean and the return value
> tells which.
>
> For the moment I don't know how to implement this callback for PCI, so if the user
> forgets to call unbind with either "clean" or "flushed", the PASID is never reused. For
> platform devices, it might be simpler to implement since we could associate an
> invalidate_pasid callback to a DT compatible string, as is currently done for reset.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
[...]
> drivers/vfio/pci/vfio_pci.c | 24 ++++++++++
> drivers/vfio/vfio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 55 +++++++++++++++++++++++
> 3 files changed, 183 insertions(+)
>
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index
> 519eff362c1c..3fe4197a5ea0 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -198,6 +198,7 @@ struct vfio_device_info {
> #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
> #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
> #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
> +#define VFIO_DEVICE_FLAGS_SVM (1 << 4) /* Device supports bind/unbind */
> __u32 num_regions; /* Max region index + 1 */
> __u32 num_irqs; /* Max IRQ index + 1 */
> };
> @@ -409,6 +410,60 @@ struct vfio_irq_set {
> */
> #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
>
> +struct vfio_device_svm {
> + __u32 argsz;
> + __u32 flags;
> +#define VFIO_SVM_PASID_RELEASE_FLUSHED (1 << 0)
> +#define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 1)
> + __u32 pasid;
> +};
For virtual SVM work, the VFIO channel would be used to pass down the guest
PASID table pointer and invalidation information, and may have further uses
beyond these.
Here is the virtual SVM design doc which illustrates the VFIO usage.
https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
For the guest PASID table pointer passdown, I have the following message in pseudo code.
struct pasid_table_info {
__u64 ptr;
__u32 size;
};
For invalidation, I have the following info in pseudo code.
struct iommu_svm_tlb_invalidate_info
{
__u32 inv_type;
#define IOTLB_INV (1 << 0)
#define EXTENDED_IOTLB_INV (1 << 1)
#define DEVICE_IOTLB_INV (1 << 2)
#define EXTENDED_DEVICE_IOTLB_INV (1 << 3)
#define PASID_CACHE_INV (1 << 4)
__u32 pasid;
__u64 addr;
__u64 size;
__u8 granularity;
#define DEFAULT_INV_GRN 0
#define PAGE_SELECTIVE_INV (1 << 0)
#define PASID_SELECTIVE_INV (1 << 1)
__u64 flags;
#define INVALIDATE_HINT_BIT (1 << 0)
#define GLOBAL_HINT_BIT (1 << 1)
#define DRAIN_READ_BIT (1 << 2)
#define DRAIN_WRITE_BIT (1 << 3)
#define DEVICE_TLB_GLOBAL_BIT (1 << 4)
__u8 mip;
__u16 pfsid;
};
Although your proposal is for userspace driver SVM usage while mine is
for SVM usage in a virtual machine, there should be a chance to make the
channel meet both requirements, and I think that would be more acceptable.
So I'd like to hear your comments on defining the channel as follows.
If you have a better solution, please feel free to let me know.
struct vfio_device_svm {
__u32 argsz;
#define VFIO_SVM_BIND_PASIDTP (1 << 0)
#define VFIO_SVM_PASSDOWN_INVALIDATE (1 << 1)
#define VFIO_SVM_PASID_RELEASE_FLUSHED (1 << 2)
#define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 3)
__u32 flags;
__u32 length;
__u8 data[];
};
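Purely for illustration, one way userspace could fill this variable-length structure for the BIND_PASIDTP case is sketched below. All names come from the pseudo code above or from the RFC's <linux/vfio.h>; none of this is merged UAPI:

#include <linux/types.h>
#include <linux/vfio.h>		/* assumed to carry the proposal above */
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Payload layout taken from the pasid_table_info pseudo code above. */
struct pasid_table_info {
	__u64 ptr;		/* guest PASID table pointer */
	__u32 size;
};

static int svm_bind_pasid_table(int device_fd, __u64 table_ptr, __u32 table_size)
{
	struct pasid_table_info info = {
		.ptr  = table_ptr,
		.size = table_size,
	};
	struct vfio_device_svm *svm;
	size_t argsz = sizeof(*svm) + sizeof(info);
	int ret;

	svm = calloc(1, argsz);
	if (!svm)
		return -1;

	svm->argsz  = argsz;
	svm->flags  = VFIO_SVM_BIND_PASIDTP;
	svm->length = sizeof(info);
	memcpy(svm->data, &info, sizeof(info));

	ret = ioctl(device_fd, VFIO_DEVICE_BIND_TASK, svm);
	free(svm);
	return ret;
}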
Thanks,
Yi L
> + * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> + * struct vfio_device_svm)
> + *
> + * Share a process' virtual address space with the device.
> + *
> + * This feature creates a new address space for the device, which is
> +not
> + * affected by VFIO_IOMMU_MAP/UNMAP_DMA. Instead, the device can tag
> +its DMA
> + * traffic with the given @pasid to perform transactions on the
> +associated
> + * virtual address space. Mapping and unmapping of buffers is performed
> +by
> + * standard functions such as mmap and malloc.
> + *
> + * On success, VFIO writes a Process Address Space ID (PASID) into
> +@pasid. This
> + * ID is unique to a device.
> + *
> + * The bond between device and process must be removed with
> + * VFIO_DEVICE_UNBIND_TASK before exiting.
> + *
> + * On fork, the child inherits the device fd and can use the bonds
> +setup by its
> + * parent. Consequently, the child has R/W access on the address spaces
> +bound by
> + * its parent. After an execv, the device fd is closed and the child
> +doesn't
> + * have access to the address space anymore.
> + *
> + * Availability of this feature depends on the device, its bus, the
> +underlying
> + * IOMMU and the CPU architecture. All of these are guaranteed when the
> +device
> + * has VFIO_DEVICE_FLAGS_SVM flag set.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +#define VFIO_DEVICE_BIND_TASK _IO(VFIO_TYPE, VFIO_BASE + 22)
> +
> +/*
> + * VFIO_DEVICE_UNBIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 23,
> + * struct vfio_device_svm)
> + *
> + * Unbind address space identified by @pasid from device. Device must
> +have
> + * stopped issuing any DMA transaction for the PASID and flushed any
> +reference
> + * to this PASID upstream. Some IOMMUs need to know when a PASID is
> +safe to
> + * reuse, in which case one of the following must be present in @flags
> + *
> + * VFIO_PASID_RELEASE_FLUSHED: the PASID is safe to reassign after the IOMMU
> + * receives an invalidation message from the device.
> + *
> + * VFIO_PASID_RELEASE_CLEAN: the PASID is safe to reassign immediately.
> + */
> +#define VFIO_DEVICE_UNBIND_TASK _IO(VFIO_TYPE, VFIO_BASE + 23)
> +
> /*
> * The VFIO-PCI bus driver makes use of the following fixed region and
> * IRQ index mapping. Unimplemented regions return a size of zero.
> --
> 2.11.0
>
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
^ permalink raw reply [flat|nested] 103+ messages in thread
[parent not found: <20170227195441.5170-30-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>]
* Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
[not found] ` <20170227195441.5170-30-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
@ 2017-02-28 3:54 ` Alex Williamson
[not found] ` <20170227205409.14f0e2c7-1yVPhWWZRC1BDLzU/O5InQ@public.gmane.org>
2017-04-26 6:53 ` Tomasz Nowicki
1 sibling, 1 reply; 103+ messages in thread
From: Alex Williamson @ 2017-02-28 3:54 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon, Harv Abdulhamid,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas, David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
On Mon, 27 Feb 2017 19:54:40 +0000
Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org> wrote:
> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
> between a device and a process address space, identified by a
> device-specific ID named PASID. This allows the device to target DMA
> transactions at the process virtual addresses without a need for mapping
> and unmapping buffers explicitly in the IOMMU. The process page tables are
> shared with the IOMMU, and mechanisms such as PCI ATS/PRI may be used to
> handle faults. VFIO_DEVICE_UNBIND_TASK removed a bond identified by a
> PASID.
>
> Also add a capability flag in device info to detect whether the system and
> the device support SVM.
>
> Users need to specify the state of a PASID when unbinding, with flags
> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
> PASID invalidation is specific to each device and only partially covered
> by the specification:
>
> * Device must have an implementation-defined mechanism for stopping the
> use of a PASID. When this mechanism finishes, the device has stopped
> issuing transactions for this PASID and all transactions for this PASID
> have been flushed to the IOMMU.
>
> * Device may either wait for all outstanding PRI requests for this PASID
> to finish, or issue a Stop Marker message, a barrier that separates PRI
> requests affecting this instance of the PASID from PRI requests
> affecting the next instance. In the first case, we say that the PASID is
> "clean", in the second case it is "flushed" (and the IOMMU has to wait
> for the Stop Marker before reassigning the PASID.)
>
> We expect similar distinctions for platform devices. Ideally there should
> be a callback for each PCI device, allowing the IOMMU to ask the device to
> stop using a PASID. When the callback returns, the PASID is either flushed
> or clean and the return value tells which.
>
> For the moment I don't know how to implement this callback for PCI, so if
> the user forgets to call unbind with either "clean" or "flushed", the
> PASID is never reused. For platform devices, it might be simpler to
> implement since we could associate an invalidate_pasid callback to a DT
> compatible string, as is currently done for reset.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
> ---
> drivers/vfio/pci/vfio_pci.c | 24 ++++++++++
> drivers/vfio/vfio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 55 +++++++++++++++++++++++
> 3 files changed, 183 insertions(+)
>
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 324c52e3a1a4..3d7733f94891 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -22,6 +22,7 @@
> #include <linux/mutex.h>
> #include <linux/notifier.h>
> #include <linux/pci.h>
> +#include <linux/pci-ats.h>
> #include <linux/pm_runtime.h>
> #include <linux/slab.h>
> #include <linux/types.h>
> @@ -623,6 +624,26 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
> return 0;
> }
>
> +static bool vfio_pci_supports_svm(struct vfio_pci_device *vdev)
> +{
> + struct pci_dev *pdev = vdev->pdev;
> +
> + if (!pdev->ats_enabled)
> + return false;
> +
> + if (!pdev->pasid_enabled || pci_max_pasids(pdev) <= 1)
> + return false;
> +
> + if (!pdev->pri_enabled)
> + return false;
> +
> + /*
> + * If the IOMMU driver enabled all of these, then it supports PCI SVM
> + * for this device.
> + */
> + return true;
> +}
> +
> static long vfio_pci_ioctl(void *device_data,
> unsigned int cmd, unsigned long arg)
> {
> @@ -642,6 +663,9 @@ static long vfio_pci_ioctl(void *device_data,
>
> info.flags = VFIO_DEVICE_FLAGS_PCI;
>
> + if (vfio_pci_supports_svm(vdev))
> + info.flags |= VFIO_DEVICE_FLAGS_SVM;
> +
> if (vdev->reset_works)
> info.flags |= VFIO_DEVICE_FLAGS_RESET;
>
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 609f4f982c74..c4505d8f4c61 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -97,6 +97,14 @@ struct vfio_device {
> struct vfio_group *group;
> struct list_head group_next;
> void *device_data;
> +
> + struct mutex tasks_lock;
> + struct list_head tasks;
> +};
> +
> +struct vfio_task {
> + int pasid;
> + struct list_head list;
> };
>
> #ifdef CONFIG_VFIO_NOIOMMU
> @@ -520,6 +528,9 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group,
> device->device_data = device_data;
> dev_set_drvdata(dev, device);
>
> + mutex_init(&device->tasks_lock);
> + INIT_LIST_HEAD(&device->tasks);
> +
> /* No need to get group_lock, caller has group reference */
> vfio_group_get(group);
>
> @@ -532,6 +543,8 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group,
>
> static void vfio_device_release(struct kref *kref)
> {
> + int ret;
> + struct vfio_task *tmp, *task;
> struct vfio_device *device = container_of(kref,
> struct vfio_device, kref);
> struct vfio_group *group = device->group;
> @@ -539,6 +552,22 @@ static void vfio_device_release(struct kref *kref)
> list_del(&device->group_next);
> mutex_unlock(&group->device_lock);
>
> + mutex_lock(&device->tasks_lock);
> + list_for_each_entry_safe(task, tmp, &device->tasks, list) {
> + /*
> + * This might leak the PASID, since the IOMMU won't know
> + * if it is safe to reuse.
> + */
> + ret = iommu_unbind_task(device->dev, task->pasid, 0);
> + if (ret)
> + dev_warn(device->dev, "failed to unbind PASID %u\n",
> + task->pasid);
> +
> + list_del(&task->list);
> + kfree(task);
> + }
> + mutex_unlock(&device->tasks_lock);
> +
> dev_set_drvdata(device->dev, NULL);
>
> kfree(device);
> @@ -1622,6 +1651,75 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> return 0;
> }
>
> +static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
> + unsigned long arg)
> +{
> + int ret;
> + unsigned long minsz;
> +
> + struct vfio_device_svm svm;
> + struct vfio_task *vfio_task;
> +
> + minsz = offsetofend(struct vfio_device_svm, pasid);
> +
> + if (copy_from_user(&svm, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (svm.argsz < minsz)
> + return -EINVAL;
> +
> + if (cmd == VFIO_DEVICE_BIND_TASK) {
> + struct task_struct *task = current;
Seems like SVM should be in the name of these ioctls.
svm.flags needs to be validated here or else we lose the field for
future use... you add this in the next patch, but see compatibility
comment there.
> +
> + ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
> + if (ret)
> + return ret;
vfio-pci advertises the device feature, but vfio intercepts the ioctl
and attempts to handle it regardless of device support.
We also need to be careful of using, or even referencing iommu_ops
without regard to the device or IOMMU backend. SPAPR doesn't fully
implement IOMMU API, vfio-noiommu devices don't have iommu_ops, mdev
devices don't either. I agree with your comments in the cover letter,
it's not entirely clear that the device fd is the right place to host
this.
> +
> + vfio_task = kzalloc(sizeof(*vfio_task), GFP_KERNEL);
> + if (!vfio_task) {
> + iommu_unbind_task(device->dev, svm.pasid,
> + IOMMU_PASID_CLEAN);
> + return -ENOMEM;
> + }
> +
> + vfio_task->pasid = svm.pasid;
> +
> + mutex_lock(&device->tasks_lock);
> + list_add(&vfio_task->list, &device->tasks);
> + mutex_unlock(&device->tasks_lock);
> +
> + } else {
> + int flags = 0;
> +
> + if (svm.flags & ~(VFIO_SVM_PASID_RELEASE_FLUSHED |
> + VFIO_SVM_PASID_RELEASE_CLEAN))
> + return -EINVAL;
> +
> + if (svm.flags & VFIO_SVM_PASID_RELEASE_FLUSHED)
> + flags = IOMMU_PASID_FLUSHED;
> + else if (svm.flags & VFIO_SVM_PASID_RELEASE_CLEAN)
> + flags = IOMMU_PASID_CLEAN;
> +
> + mutex_lock(&device->tasks_lock);
> + list_for_each_entry(vfio_task, &device->tasks, list) {
> + if (vfio_task->pasid != svm.pasid)
> + continue;
> +
> + ret = iommu_unbind_task(device->dev, svm.pasid, flags);
> + if (ret)
> + dev_warn(device->dev, "failed to unbind PASID %u\n",
> + vfio_task->pasid);
> +
> + list_del(&vfio_task->list);
> + kfree(vfio_task);
> + break;
> + }
> + mutex_unlock(&device->tasks_lock);
> + }
> +
> + return copy_to_user((void __user *)arg, &svm, minsz) ? -EFAULT : 0;
> +}
> +
> static long vfio_device_fops_unl_ioctl(struct file *filep,
> unsigned int cmd, unsigned long arg)
> {
> @@ -1630,6 +1728,12 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
> if (unlikely(!device->ops->ioctl))
> return -EINVAL;
>
> + switch (cmd) {
> + case VFIO_DEVICE_BIND_TASK:
> + case VFIO_DEVICE_UNBIND_TASK:
> + return vfio_svm_ioctl(device, cmd, arg);
> + }
> +
> return device->ops->ioctl(device->device_data, cmd, arg);
> }
>
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 519eff362c1c..3fe4197a5ea0 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -198,6 +198,7 @@ struct vfio_device_info {
> #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
> #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
> #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
> +#define VFIO_DEVICE_FLAGS_SVM (1 << 4) /* Device supports bind/unbind */
We could also define one of the bits in vfio_device_svm.flags to be
"probe" (ie. no-op, return success). Using an SVM flag follows the
model we used for RESET support, but I'm not convinced that's a great
model to follow.
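To make the probe idea concrete, such a call could look roughly like the sketch below; VFIO_SVM_PROBE is invented here purely for illustration and is not part of the posted series:

#include <linux/vfio.h>
#include <stdbool.h>
#include <sys/ioctl.h>

/* Hypothetical capability probe: a no-op bind that only reports support. */
static bool svm_supported(int device_fd)
{
	struct vfio_device_svm svm = {
		.argsz = sizeof(svm),
		.flags = VFIO_SVM_PROBE,	/* hypothetical flag */
	};

	/* Success would mean SVM bind/unbind is available for this device. */
	return ioctl(device_fd, VFIO_DEVICE_BIND_TASK, &svm) == 0;
}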
> __u32 num_regions; /* Max region index + 1 */
> __u32 num_irqs; /* Max IRQ index + 1 */
> };
> @@ -409,6 +410,60 @@ struct vfio_irq_set {
> */
> #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
>
> +struct vfio_device_svm {
> + __u32 argsz;
> + __u32 flags;
> +#define VFIO_SVM_PASID_RELEASE_FLUSHED (1 << 0)
> +#define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 1)
> + __u32 pasid;
> +};
> +/*
> + * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> + * struct vfio_device_svm)
> + *
> + * Share a process' virtual address space with the device.
> + *
> + * This feature creates a new address space for the device, which is not
> + * affected by VFIO_IOMMU_MAP/UNMAP_DMA. Instead, the device can tag its DMA
> + * traffic with the given @pasid to perform transactions on the associated
> + * virtual address space. Mapping and unmapping of buffers is performed by
> + * standard functions such as mmap and malloc.
> + *
> + * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> + * ID is unique to a device.
> + *
> + * The bond between device and process must be removed with
> + * VFIO_DEVICE_UNBIND_TASK before exiting.
I'm not sure I understand this since we do a pass of unbinds on
release. Certainly we can't rely on the user for cleanup.
> + *
> + * On fork, the child inherits the device fd and can use the bonds setup by its
> + * parent. Consequently, the child has R/W access on the address spaces bound by
> + * its parent. After an execv, the device fd is closed and the child doesn't
> + * have access to the address space anymore.
> + *
> + * Availability of this feature depends on the device, its bus, the underlying
> + * IOMMU and the CPU architecture. All of these are guaranteed when the device
> + * has VFIO_DEVICE_FLAGS_SVM flag set.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +#define VFIO_DEVICE_BIND_TASK _IO(VFIO_TYPE, VFIO_BASE + 22)
> +
> +/*
> + * VFIO_DEVICE_UNBIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 23,
> + * struct vfio_device_svm)
> + *
> + * Unbind address space identified by @pasid from device. Device must have
> + * stopped issuing any DMA transaction for the PASID and flushed any reference
> + * to this PASID upstream. Some IOMMUs need to know when a PASID is safe to
> + * reuse, in which case one of the following must be present in @flags
> + *
> + * VFIO_PASID_RELEASE_FLUSHED: the PASID is safe to reassign after the IOMMU
> + * receives an invalidation message from the device.
> + *
> + * VFIO_PASID_RELEASE_CLEAN: the PASID is safe to reassign immediately.
> + */
> +#define VFIO_DEVICE_UNBIND_TASK _IO(VFIO_TYPE, VFIO_BASE + 23)
> +
> /*
> * The VFIO-PCI bus driver makes use of the following fixed region and
> * IRQ index mapping. Unimplemented regions return a size of zero.
^ permalink raw reply [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
[not found] ` <20170227195441.5170-30-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-02-28 3:54 ` Alex Williamson
@ 2017-04-26 6:53 ` Tomasz Nowicki
[not found] ` <f5745241-83b0-0945-7616-4b59d7ebcd48-nYOzD4b6Jr9Wk0Htik3J/w@public.gmane.org>
1 sibling, 1 reply; 103+ messages in thread
From: Tomasz Nowicki @ 2017-04-26 6:53 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon, Harv Abdulhamid,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas, David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Hi Jean,
On 27.02.2017 20:54, Jean-Philippe Brucker wrote:
> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
> between a device and a process address space, identified by a
> device-specific ID named PASID. This allows the device to target DMA
> transactions at the process virtual addresses without a need for mapping
> and unmapping buffers explicitly in the IOMMU. The process page tables are
> shared with the IOMMU, and mechanisms such as PCI ATS/PRI may be used to
> handle faults. VFIO_DEVICE_UNBIND_TASK removed a bond identified by a
> PASID.
>
> Also add a capability flag in device info to detect whether the system and
> the device support SVM.
>
> Users need to specify the state of a PASID when unbinding, with flags
> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
> PASID invalidation is specific to each device and only partially covered
> by the specification:
>
> * Device must have an implementation-defined mechanism for stopping the
> use of a PASID. When this mechanism finishes, the device has stopped
> issuing transactions for this PASID and all transactions for this PASID
> have been flushed to the IOMMU.
>
> * Device may either wait for all outstanding PRI requests for this PASID
> to finish, or issue a Stop Marker message, a barrier that separates PRI
> requests affecting this instance of the PASID from PRI requests
> affecting the next instance. In the first case, we say that the PASID is
> "clean", in the second case it is "flushed" (and the IOMMU has to wait
> for the Stop Marker before reassigning the PASID.)
>
> We expect similar distinctions for platform devices. Ideally there should
> be a callback for each PCI device, allowing the IOMMU to ask the device to
> stop using a PASID. When the callback returns, the PASID is either flushed
> or clean and the return value tells which.
>
> For the moment I don't know how to implement this callback for PCI, so if
> the user forgets to call unbind with either "clean" or "flushed", the
> PASID is never reused. For platform devices, it might be simpler to
> implement since we could associate an invalidate_pasid callback to a DT
> compatible string, as is currently done for reset.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
> ---
> drivers/vfio/pci/vfio_pci.c | 24 ++++++++++
> drivers/vfio/vfio.c | 104 ++++++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 55 +++++++++++++++++++++++
> 3 files changed, 183 insertions(+)
>
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 324c52e3a1a4..3d7733f94891 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -22,6 +22,7 @@
> #include <linux/mutex.h>
> #include <linux/notifier.h>
> #include <linux/pci.h>
> +#include <linux/pci-ats.h>
> #include <linux/pm_runtime.h>
> #include <linux/slab.h>
> #include <linux/types.h>
> @@ -623,6 +624,26 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
> return 0;
> }
>
[...]
>
> kfree(device);
> @@ -1622,6 +1651,75 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> return 0;
> }
>
> +static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
> + unsigned long arg)
> +{
> + int ret;
> + unsigned long minsz;
> +
> + struct vfio_device_svm svm;
> + struct vfio_task *vfio_task;
> +
> + minsz = offsetofend(struct vfio_device_svm, pasid);
> +
> + if (copy_from_user(&svm, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (svm.argsz < minsz)
> + return -EINVAL;
> +
> + if (cmd == VFIO_DEVICE_BIND_TASK) {
> + struct task_struct *task = current;
> +
> + ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
> + if (ret)
> + return ret;
> +
> + vfio_task = kzalloc(sizeof(*vfio_task), GFP_KERNEL);
> + if (!vfio_task) {
> + iommu_unbind_task(device->dev, svm.pasid,
> + IOMMU_PASID_CLEAN);
> + return -ENOMEM;
> + }
> +
> + vfio_task->pasid = svm.pasid;
> +
> + mutex_lock(&device->tasks_lock);
> + list_add(&vfio_task->list, &device->tasks);
> + mutex_unlock(&device->tasks_lock);
> +
> + } else {
> + int flags = 0;
> +
> + if (svm.flags & ~(VFIO_SVM_PASID_RELEASE_FLUSHED |
> + VFIO_SVM_PASID_RELEASE_CLEAN))
> + return -EINVAL;
> +
> + if (svm.flags & VFIO_SVM_PASID_RELEASE_FLUSHED)
> + flags = IOMMU_PASID_FLUSHED;
> + else if (svm.flags & VFIO_SVM_PASID_RELEASE_CLEAN)
> + flags = IOMMU_PASID_CLEAN;
> +
> + mutex_lock(&device->tasks_lock);
> + list_for_each_entry(vfio_task, &device->tasks, list) {
> + if (vfio_task->pasid != svm.pasid)
> + continue;
> +
> + ret = iommu_unbind_task(device->dev, svm.pasid, flags);
> + if (ret)
> + dev_warn(device->dev, "failed to unbind PASID %u\n",
> + vfio_task->pasid);
> +
> + list_del(&vfio_task->list);
> + kfree(vfio_task);
Please use list_for_each_entry_safe.
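For reference, the unbind lookup rewritten with the safe iterator would look roughly like this (a sketch based on the code quoted above, with an extra local struct vfio_task *tmp):

	mutex_lock(&device->tasks_lock);
	list_for_each_entry_safe(vfio_task, tmp, &device->tasks, list) {
		if (vfio_task->pasid != svm.pasid)
			continue;

		ret = iommu_unbind_task(device->dev, svm.pasid, flags);
		if (ret)
			dev_warn(device->dev, "failed to unbind PASID %u\n",
				 vfio_task->pasid);

		list_del(&vfio_task->list);
		kfree(vfio_task);
		break;
	}
	mutex_unlock(&device->tasks_lock);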
Thanks,
Tomasz
^ permalink raw reply [flat|nested] 103+ messages in thread
* [RFC PATCH 30/30] vfio: Allow to bind foreign task
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (28 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory Jean-Philippe Brucker
@ 2017-02-27 19:54 ` Jean-Philippe Brucker
[not found] ` <20170227195441.5170-31-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
2017-04-26 7:25 ` Tomasz Nowicki
2017-03-06 8:20 ` [RFC PATCH 00/30] Add PCIe SVM support to ARM SMMUv3 Liu, Yi L
30 siblings, 2 replies; 103+ messages in thread
From: Jean-Philippe Brucker @ 2017-02-27 19:54 UTC (permalink / raw)
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Harv Abdulhamid, linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
Let the process that owns the device create an address space bond on
behalf of another process. We add a pid argument to the BIND_TASK ioctl,
allowing the caller to bind a foreign task. The expected program flow in
this case is:
* Process A creates the VFIO context and initializes the device.
* Process B asks A to bind its address space.
* Process A issues an ioctl to the VFIO device fd with BIND_TASK(pid).
It may communicate the given PASID back to process B or keep track of it
internally.
* Process B asks A to perform transactions on its virtual addresses.
* Process A launches transactions tagged with the given PASID.
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
---
drivers/vfio/vfio.c | 35 +++++++++++++++++++++++++++++++++--
include/uapi/linux/vfio.h | 15 +++++++++++++++
2 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c4505d8f4c61..ecc5d07e3dbb 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
+#include <linux/ptrace.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -1660,7 +1661,7 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
struct vfio_device_svm svm;
struct vfio_task *vfio_task;
- minsz = offsetofend(struct vfio_device_svm, pasid);
+ minsz = offsetofend(struct vfio_device_svm, pid);
if (copy_from_user(&svm, (void __user *)arg, minsz))
return -EFAULT;
@@ -1669,9 +1670,39 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
return -EINVAL;
if (cmd == VFIO_DEVICE_BIND_TASK) {
- struct task_struct *task = current;
+ struct mm_struct *mm;
+ struct task_struct *task;
+
+ if (svm.flags & ~VFIO_SVM_PID)
+ return -EINVAL;
+
+ if (svm.flags & VFIO_SVM_PID) {
+ rcu_read_lock();
+ task = find_task_by_vpid(svm.pid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+ if (!task)
+ return -ESRCH;
+
+ /*
+ * Ensure process has RW access on the task's mm
+ * FIXME:
+ * - I think this ought to be in the IOMMU API
+ * - I'm assuming permission is never revoked during the
+ * task's lifetime. Might be mistaken.
+ */
+ mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ if (!mm || IS_ERR(mm))
+ return IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ mmput(mm);
+ } else {
+ get_task_struct(current);
+ task = current;
+ }
ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
+ put_task_struct(task);
if (ret)
return ret;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 3fe4197a5ea0..41ae8a231d42 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -415,7 +415,9 @@ struct vfio_device_svm {
__u32 flags;
#define VFIO_SVM_PASID_RELEASE_FLUSHED (1 << 0)
#define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 1)
+#define VFIO_SVM_PID (1 << 2)
__u32 pasid;
+ __u32 pid;
};
/*
* VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
@@ -432,6 +434,19 @@ struct vfio_device_svm {
* On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
* ID is unique to a device.
*
+ * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
+ * space identified by @pasid is that of task identified by @pid.
+ *
+ * Given that the caller owns the device, setting this flag grants the
+ * caller read and write permissions on the entire address space of
+ * foreign task described by @pid. Therefore, permission to perform the
+ * bind operation on a foreign process is governed by the ptrace access
+ * mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) for more
+ * information.
+ *
+ * If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
+ * current task that is bound to the device.
+ *
* The bond between device and process must be removed with
* VFIO_DEVICE_UNBIND_TASK before exiting.
*
--
2.11.0
^ permalink raw reply related [flat|nested] 103+ messages in thread
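To make the flow above concrete, process A's bind call could look like the sketch below, assuming a <linux/vfio.h> built with this series applied and that B's pid was communicated to A out of band:

#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/types.h>

/* Process A binds the address space of process B (pid_of_b) to the device it owns. */
static int svm_bind_foreign(int device_fd, pid_t pid_of_b, __u32 *pasid)
{
	struct vfio_device_svm svm = {
		.argsz = sizeof(svm),
		.flags = VFIO_SVM_PID,
		.pid   = pid_of_b,
	};

	/* Fails if the pid has exited or the ptrace access check is refused. */
	if (ioctl(device_fd, VFIO_DEVICE_BIND_TASK, &svm))
		return -1;

	*pasid = svm.pasid;	/* A tags transactions on B's behalf with this PASID */
	return 0;
}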
[parent not found: <20170227195441.5170-31-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>]
* Re: [RFC PATCH 30/30] vfio: Allow to bind foreign task
[not found] ` <20170227195441.5170-31-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
@ 2017-02-28 3:54 ` Alex Williamson
[not found] ` <20170227205411.1abca59a-1yVPhWWZRC1BDLzU/O5InQ@public.gmane.org>
0 siblings, 1 reply; 103+ messages in thread
From: Alex Williamson @ 2017-02-28 3:54 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA, Catalin Marinas,
Sinan Kaya, Will Deacon, Harv Abdulhamid,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-pci-u79uwXL29TY76Z2rM5mHXA, Bjorn Helgaas, David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Nate Watterson
On Mon, 27 Feb 2017 19:54:41 +0000
Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org> wrote:
> Let the process that owns the device create an address space bond on
> behalf of another process. We add a pid argument to the BIND_TASK ioctl,
> allowing the caller to bind a foreign task. The expected program flow in
> this case is:
>
> * Process A creates the VFIO context and initializes the device.
> * Process B asks A to bind its address space.
> * Process A issues an ioctl to the VFIO device fd with BIND_TASK(pid).
> It may communicate the given PASID back to process B or keep track of it
> internally.
> * Process B asks A to perform transactions on its virtual address.
> * Process A launches transaction tagged with the given PASID.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
> ---
> drivers/vfio/vfio.c | 35 +++++++++++++++++++++++++++++++++--
> include/uapi/linux/vfio.h | 15 +++++++++++++++
> 2 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c4505d8f4c61..ecc5d07e3dbb 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -26,6 +26,7 @@
> #include <linux/module.h>
> #include <linux/mutex.h>
> #include <linux/pci.h>
> +#include <linux/ptrace.h>
> #include <linux/rwsem.h>
> #include <linux/sched.h>
> #include <linux/slab.h>
> @@ -1660,7 +1661,7 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
> struct vfio_device_svm svm;
> struct vfio_task *vfio_task;
>
> - minsz = offsetofend(struct vfio_device_svm, pasid);
> + minsz = offsetofend(struct vfio_device_svm, pid);
This is only the minsz if flags includes VFIO_SVM_PID, right?
Otherwise this isn't a backward-compatible change (granted, you're
proposing both in the same series); userspace built against 29/30
won't work against 30/30.
>
> if (copy_from_user(&svm, (void __user *)arg, minsz))
> return -EFAULT;
> @@ -1669,9 +1670,39 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
> return -EINVAL;
>
> if (cmd == VFIO_DEVICE_BIND_TASK) {
> - struct task_struct *task = current;
> + struct mm_struct *mm;
> + struct task_struct *task;
> +
> + if (svm.flags & ~VFIO_SVM_PID)
> + return -EINVAL;
29/30 never validated flags, so theoretically userspace compiled
against 29/30 could have put anything in flags and it would have
worked; that's no longer the case here.
> +
> + if (svm.flags & VFIO_SVM_PID) {
> + rcu_read_lock();
> + task = find_task_by_vpid(svm.pid);
> + if (task)
> + get_task_struct(task);
> + rcu_read_unlock();
> + if (!task)
> + return -ESRCH;
> +
> + /*
> + * Ensure process has RW access on the task's mm
> + * FIXME:
> + * - I think this ought to be in the IOMMU API
> + * - I'm assuming permission is never revoked during the
> + * task's lifetime. Might be mistaken.
> + */
> + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> + if (!mm || IS_ERR(mm))
> + return IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> + mmput(mm);
> + } else {
> + get_task_struct(current);
> + task = current;
> + }
>
> ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
> + put_task_struct(task);
> if (ret)
> return ret;
>
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 3fe4197a5ea0..41ae8a231d42 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -415,7 +415,9 @@ struct vfio_device_svm {
> __u32 flags;
> #define VFIO_SVM_PASID_RELEASE_FLUSHED (1 << 0)
> #define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 1)
> +#define VFIO_SVM_PID (1 << 2)
> __u32 pasid;
> + __u32 pid;
> };
> /*
> * VFIO_DEVICE_BIND_TASK - _IOWR(VFIO_TYPE, VFIO_BASE + 22,
> @@ -432,6 +434,19 @@ struct vfio_device_svm {
> * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
> * ID is unique to a device.
> *
> + * VFIO_SVM_PID: bind task @pid instead of current task. The shared address
> + * space identified by @pasid is that of task identified by @pid.
> + *
> + * Given that the caller owns the device, setting this flag grants the
> + * caller read and write permissions on the entire address space of
> + * foreign task described by @pid. Therefore, permission to perform the
> + * bind operation on a foreign process is governed by the ptrace access
> + * mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) for more
> + * information.
> + *
> + * If the VFIO_SVM_PID flag is not set, @pid is unused and it is the
> + * current task that is bound to the device.
> + *
> * The bond between device and process must be removed with
> * VFIO_DEVICE_UNBIND_TASK before exiting.
> *
BTW, nice commit logs throughout this series, I probably need to read
through them a few more times to really digest it all. AIUI, the VFIO
support here is really only useful for basic userspace drivers, I don't
see how we could take advantage of it for a VM use case where the guest
manages the PASID space for a domain. Perhaps it hasn't spent enough
cycles bouncing around in my head yet. Thanks,
Alex
^ permalink raw reply [flat|nested] 103+ messages in thread
* Re: [RFC PATCH 30/30] vfio: Allow to bind foreign task
2017-02-27 19:54 ` [RFC PATCH 30/30] vfio: Allow to bind foreign task Jean-Philippe Brucker
[not found] ` <20170227195441.5170-31-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
@ 2017-04-26 7:25 ` Tomasz Nowicki
[not found] ` <b937914a-d215-8223-0846-65271a568170-nYOzD4b6Jr9Wk0Htik3J/w@public.gmane.org>
1 sibling, 1 reply; 103+ messages in thread
From: Tomasz Nowicki @ 2017-04-26 7:25 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Lorenzo Pieralisi, Shanker Donthineni, kvm, Catalin Marinas,
Joerg Roedel, Sinan Kaya, Will Deacon, iommu, Harv Abdulhamid,
Alex Williamson, linux-pci, Bjorn Helgaas, Robin Murphy,
David Woodhouse, linux-arm-kernel, Nate Watterson
On 27.02.2017 20:54, Jean-Philippe Brucker wrote:
> Let the process that owns the device create an address space bond on
> behalf of another process. We add a pid argument to the BIND_TASK ioctl,
> allowing the caller to bind a foreign task. The expected program flow in
> this case is:
>
> * Process A creates the VFIO context and initializes the device.
> * Process B asks A to bind its address space.
> * Process A issues an ioctl to the VFIO device fd with BIND_TASK(pid).
> It may communicate the given PASID back to process B or keep track of it
> internally.
> * Process B asks A to perform transactions on its virtual address.
> * Process A launches transaction tagged with the given PASID.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
> ---
> drivers/vfio/vfio.c | 35 +++++++++++++++++++++++++++++++++--
> include/uapi/linux/vfio.h | 15 +++++++++++++++
> 2 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c4505d8f4c61..ecc5d07e3dbb 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -26,6 +26,7 @@
> #include <linux/module.h>
> #include <linux/mutex.h>
> #include <linux/pci.h>
> +#include <linux/ptrace.h>
> #include <linux/rwsem.h>
> #include <linux/sched.h>
> #include <linux/slab.h>
> @@ -1660,7 +1661,7 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
> struct vfio_device_svm svm;
> struct vfio_task *vfio_task;
>
> - minsz = offsetofend(struct vfio_device_svm, pasid);
> + minsz = offsetofend(struct vfio_device_svm, pid);
>
> if (copy_from_user(&svm, (void __user *)arg, minsz))
> return -EFAULT;
> @@ -1669,9 +1670,39 @@ static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
> return -EINVAL;
>
> if (cmd == VFIO_DEVICE_BIND_TASK) {
> - struct task_struct *task = current;
> + struct mm_struct *mm;
> + struct task_struct *task;
> +
> + if (svm.flags & ~VFIO_SVM_PID)
> + return -EINVAL;
> +
> + if (svm.flags & VFIO_SVM_PID) {
> + rcu_read_lock();
> + task = find_task_by_vpid(svm.pid);
> + if (task)
> + get_task_struct(task);
> + rcu_read_unlock();
> + if (!task)
> + return -ESRCH;
> +
> + /*
> + * Ensure process has RW access on the task's mm
> + * FIXME:
> + * - I think this ought to be in the IOMMU API
> + * - I'm assuming permission is never revoked during the
> + * task's lifetime. Might be mistaken.
> + */
> + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> + if (!mm || IS_ERR(mm))
I know this is an RFC patch, but if we keep this as is, we
need here:
+put_task_struct(task);
> + return IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> + mmput(mm);
> + } else {
> + get_task_struct(current);
> + task = current;
> + }
>
> ret = iommu_bind_task(device->dev, task, &svm.pasid, 0, NULL);
> + put_task_struct(task);
> if (ret)
> return ret;
>
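For clarity, the error path with the missing reference drop would look roughly like this (a sketch based on the code quoted above):

	mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
	if (!mm || IS_ERR(mm)) {
		/* Drop the reference taken by find_task_by_vpid() above. */
		put_task_struct(task);
		return IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
	}
	mmput(mm);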
Thanks,
Tomasz
^ permalink raw reply [flat|nested] 103+ messages in thread
* RE: [RFC PATCH 00/30] Add PCIe SVM support to ARM SMMUv3
[not found] ` <20170227195441.5170-1-jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org>
` (29 preceding siblings ...)
2017-02-27 19:54 ` [RFC PATCH 30/30] vfio: Allow to bind foreign task Jean-Philippe Brucker
@ 2017-03-06 8:20 ` Liu, Yi L
[not found] ` <A2975661238FB949B60364EF0F2C2574390186B8-E2R4CRU6q/6iAffOGbnezLfspsVTdybXVpNB7YpNyf8@public.gmane.org>
30 siblings, 1 reply; 103+ messages in thread
From: Liu, Yi L @ 2017-03-06 8:20 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Shanker Donthineni, kvm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Catalin Marinas, Sinan Kaya, Will Deacon, Harv Abdulhamid,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Bjorn Helgaas,
David Woodhouse,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
Nate Watterson
> -----Original Message-----
> From: iommu-bounces-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org [mailto:iommu-
> bounces-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org] On Behalf Of Jean-Philippe Brucker
> Sent: Tuesday, February 28, 2017 3:54 AM
> Cc: Shanker Donthineni <shankerd-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>; kvm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org;
> Catalin Marinas <catalin.marinas-5wv7dgnIgG8@public.gmane.org>; Sinan Kaya
> <okaya-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>; Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org>;
> iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org; Harv Abdulhamid <harba-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>;
> linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Bjorn Helgaas <bhelgaas-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>; David
> Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>; linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org; Nate
> Watterson <nwatters-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>
> Subject: [RFC PATCH 00/30] Add PCIe SVM support to ARM SMMUv3
>
> Hi,
>
> This series adds support for PCI ATS, PRI and PASID extensions to the
> SMMUv3 driver. In systems that support it, it is now possible for some high-end
> devices to perform DMA into process address spaces. Page tables are shared
> between MMU and SMMU; page faults from devices are recoverable and handled by
> the mm subsystem.
>
> We propose an extension to the IOMMU API that unifies existing SVM
> implementations (AMD, Intel and ARM) in patches 22 and 24. Nothing is set in stone,
> the goal is to start discussions and find an intersection between implementations.
>
> We also propose a VFIO interface in patches 29 and 30, that allows userspace device
> drivers to make use of SVM. It would also serve as example implementation for
> other device drivers.
>
> Overview of the patches:
>
> * 1 and 2 prepare the SMMUv3 structures for ATS,
> * 3 to 5 enable ATS for devices that support it.
> * 6 to 10 prepare the SMMUv3 structures for PASID and PRI. Patch 9,
> in particular, provides details on the structure requirements.
> * 11 introduces an interface for sharing ASIDs on ARM64,
> * 12 to 17 add more infrastructure for sharing page tables,
> * 18 and 19 add minor helpers to PCI,
> * 20 enables PASID in devices that support it,
Jean, I suppose you will introduce a PASID management mechanism in the
SMMU v3 driver. Here I have a question about PASID management on ARM:
will there be a system-wide PASID table, or is there an equivalent implementation?
Thanks,
Yi L
> * 21 enables PRI and adds device fault handler,
> * 22 and 24 draft a possible interface for SVM in the IOMMU API
> * 23 and 25-28 finalize support for SVM in SMMUv3
> * 29 and 30 draft a possible interface for SVM in VFIO.
>
> The series is available on git://linux-arm.org/linux-jpb.git svm/rfc1 Enable
> CONFIG_PCI_PASID, CONFIG_PCI_PRI and you should be good to go.
>
> So far, this has only been tested with a software model of an SMMUv3 and a PCIe
> DMA engine. We don't intend to get this merged until it has been tested on silicon,
> but at least the driver implementation should be mature enough. I might split next
> versions depending on what is ready and what needs more work so we can merge it
> progressively.
>
> A lot of open questions remain:
>
> 1. Can we declare that PASID 0 is always invalid?
>
> 2. For this prototype, I kept the interface simple from an implementation
> perspective. At the moment is is "bind this device to that address
> space". For consistency with the rest of VFIO and IOMMU, I think "bind
> this container to that address space" would be more in line with VFIO,
> and "bind that group to that address space" more in line with IOMMU.
> VFIO would tell the IOMMU "for all groups in this container, bind to
> that address space".
> This raises the question of inconsistency between device capabilities.
> When adding a device that supports less PASID bits to a group, what do
> we do? What if we already allocated a PASID that is out of range for
> the new device?
>
> 3. How do we reconcile the IOMMU fault reporting infrastructure with the
> SVM interface?
>
> 4. SVM is the product of two features: handling device faults, and devices
> having multiple address spaces. What about one feature without the
> other?
> a. If we cannot afford to have a device fault, can we at least share a
> pinned address space? Pinning all current memory would be done by
> vfio, but there also need to be pinning of all future mappings.
> (mlock isn't sufficient, still allows for minor faults.)
> b. If the device has a single address space, can we still bind it to a
> process? The main issue with unifying DMA and process page tables is
> reserved regions on the device side. What do we do if, for instance,
> and MSI frame address clashes with a process mapping? Or if a
> process mapping exists outside of the device's DMA window?
>
> Please find more details in the IOMMU API and VFIO patches.
>
> Thanks,
> Jean-Philippe
>
> Cc: Harv Abdulhamid <harba-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>
> Cc: Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org>
> Cc: Shanker Donthineni <shankerd-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>
> Cc: Bjorn Helgaas <bhelgaas-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> Cc: Sinan Kaya <okaya-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>
> Cc: Lorenzo Pieralisi <lorenzo.pieralisi-5wv7dgnIgG8@public.gmane.org>
> Cc: Catalin Marinas <catalin.marinas-5wv7dgnIgG8@public.gmane.org>
> Cc: Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org>
> Cc: Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
> Cc: Nate Watterson <nwatters-Rm6X0d1/PG5y9aJCnZT0Uw@public.gmane.org>
> Cc: Alex Williamson <alex.williamson-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> Cc: David Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
>
> Cc: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org
> Cc: linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> Cc: kvm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>
> Jean-Philippe Brucker (30):
> iommu/arm-smmu-v3: Link groups and devices
> iommu/arm-smmu-v3: Link groups and domains
> PCI: Move ATS declarations outside of CONFIG_PCI
> iommu/arm-smmu-v3: Add support for PCI ATS
> iommu/arm-smmu-v3: Disable tagged pointers when ATS is in use
> iommu/arm-smmu-v3: Add support for Substream IDs
> iommu/arm-smmu-v3: Add second level of context descriptor table
> iommu/arm-smmu-v3: Add support for VHE
> iommu/arm-smmu-v3: Support broadcast TLB maintenance
> iommu/arm-smmu-v3: Add task contexts
> arm64: mm: Pin down ASIDs for sharing contexts with devices
> iommu/arm-smmu-v3: Keep track of process address spaces
> iommu/io-pgtable-arm: Factor out ARM LPAE register defines
> iommu/arm-smmu-v3: Share process page tables
> iommu/arm-smmu-v3: Steal private ASID from a domain
> iommu/arm-smmu-v3: Use shared ASID set
> iommu/arm-smmu-v3: Add SVM feature checking
> PCI: Make "PRG Response PASID Required" handling common
> PCI: Cache PRI and PASID bits in pci_dev
> iommu/arm-smmu-v3: Enable PCI PASID in masters
> iommu/arm-smmu-v3: Handle device faults from PRI
> iommu: Bind/unbind tasks to/from devices
> iommu/arm-smmu-v3: Bind/unbind device and task
> iommu: Specify PASID state when unbinding a task
> iommu/arm-smmu-v3: Safe invalidation and recycling of PASIDs
> iommu/arm-smmu-v3: Fix PRI queue overflow acknowledgement
> iommu/arm-smmu-v3: Handle PRI queue overflow
> iommu/arm-smmu-v3: Add support for Hardware Translation Table Update
> at stage 1
> vfio: Add support for Shared Virtual Memory
> vfio: Allow to bind foreign task
>
> MAINTAINERS | 1 +
> arch/arm64/include/asm/mmu.h | 1 +
> arch/arm64/include/asm/mmu_context.h | 11 +-
> arch/arm64/mm/context.c | 80 +-
> drivers/iommu/amd_iommu.c | 19 +-
> drivers/iommu/arm-smmu-v3.c | 2593 ++++++++++++++++++++++++++++++++-
> -
> drivers/iommu/io-pgtable-arm.c | 48 +-
> drivers/iommu/io-pgtable-arm.h | 67 +
> drivers/iommu/iommu.c | 116 ++
> drivers/pci/ats.c | 40 +
> drivers/vfio/pci/vfio_pci.c | 24 +
> drivers/vfio/vfio.c | 135 ++
> include/linux/iommu.h | 57 +
> include/linux/pci-ats.h | 8 +
> include/linux/pci.h | 28 +-
> include/uapi/linux/pci_regs.h | 1 +
> include/uapi/linux/vfio.h | 70 +
> 17 files changed, 3084 insertions(+), 215 deletions(-) create mode 100644
> drivers/iommu/io-pgtable-arm.h
>
> --
> 2.11.0
>
> _______________________________________________
> iommu mailing list
> iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
^ permalink raw reply [flat|nested] 103+ messages in thread