* [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu, Yi L, Liu-i9wRM+HIrmnmtl4Z8vJ8Kg761KYD1DLY
Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
use in the guest:
https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
As part of the proposed architecture, when an SVM capable PCI
device is assigned to a guest, nested mode is turned on. The guest owns the
first level page tables (requests with PASID), which perform GVA->GPA
translation. Second level page tables are owned by the host for GPA->HPA
translation for requests both with and without PASID.
A new IOMMU driver interface is therefore needed to perform tasks as
follows:
* Enable nested translation and appropriate translation type
* Assign guest PASID table pointer (in GPA) and size to host IOMMU
This patch introduces new API functions to perform bind/unbind guest PASID
tables. Based on common data, model specific IOMMU drivers can be extended
to perform the specific steps for binding pasid table of assigned devices.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Liu, Yi L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
drivers/iommu/iommu.c | 19 ++++++++++++++++
include/linux/iommu.h | 25 +++++++++++++++++++++
include/uapi/linux/iommu.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 99 insertions(+)
create mode 100644 include/uapi/linux/iommu.h
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3de5c0b..761cf50 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1322,6 +1322,25 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
}
EXPORT_SYMBOL_GPL(iommu_attach_device);
+int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
+ struct pasid_table_config *pasidt_binfo)
+{
+ if (unlikely(!domain->ops->bind_pasid_table))
+ return -ENODEV;
+
+ return domain->ops->bind_pasid_table(domain, dev, pasidt_binfo);
+}
+EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
+
+int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
+{
+ if (unlikely(!domain->ops->unbind_pasid_table))
+ return -EINVAL;
+
+ return domain->ops->unbind_pasid_table(domain, dev);
+}
+EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
+
static void __iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 41b8c57..672cc06 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,6 +25,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/of.h>
+#include <uapi/linux/iommu.h>
#define IOMMU_READ (1 << 0)
#define IOMMU_WRITE (1 << 1)
@@ -187,6 +188,8 @@ struct iommu_resv_region {
* @domain_get_windows: Return the number of windows for a domain
* @of_xlate: add OF master IDs to iommu grouping
* @pgsize_bitmap: bitmap of all possible supported page sizes
+ * @bind_pasid_table: bind pasid table pointer for guest SVM
+ * @unbind_pasid_table: unbind pasid table pointer and restore defaults
*/
struct iommu_ops {
bool (*capable)(enum iommu_cap);
@@ -233,8 +236,14 @@ struct iommu_ops {
u32 (*domain_get_windows)(struct iommu_domain *domain);
int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
+
bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);
+ int (*bind_pasid_table)(struct iommu_domain *domain, struct device *dev,
+ struct pasid_table_config *pasidt_binfo);
+ int (*unbind_pasid_table)(struct iommu_domain *domain,
+ struct device *dev);
+
unsigned long pgsize_bitmap;
};
@@ -296,6 +305,10 @@ extern int iommu_attach_device(struct iommu_domain *domain,
struct device *dev);
extern void iommu_detach_device(struct iommu_domain *domain,
struct device *dev);
+extern int iommu_bind_pasid_table(struct iommu_domain *domain,
+ struct device *dev, struct pasid_table_config *pasidt_binfo);
+extern int iommu_unbind_pasid_table(struct iommu_domain *domain,
+ struct device *dev);
extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot);
@@ -696,6 +709,18 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
return NULL;
}
+static inline
+int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
+ struct pasid_table_config *pasidt_binfo)
+{
+ return -EINVAL;
+}
+static inline
+int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_IOMMU_API */
#endif /* __LINUX_IOMMU_H */
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
new file mode 100644
index 0000000..aeeaf0e
--- /dev/null
+++ b/include/uapi/linux/iommu.h
@@ -0,0 +1,55 @@
+/*
+ * IOMMU user API definitions
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _UAPI_IOMMU_H
+#define _UAPI_IOMMU_H
+
+#include <linux/types.h>
+
+enum pasid_table_model {
+ PASID_TABLE_FORMAT_HOST,
+ PASID_TABLE_FORMAT_ARM_1LVL,
+ PASID_TABLE_FORMAT_ARM_2LVL,
+ PASID_TABLE_FORMAT_AMD,
+ PASID_TABLE_FORMAT_INTEL,
+};
+
+/**
+ * PASID table data used to bind guest PASID table to the host IOMMU. This will
+ * enable guest managed first level page tables.
+ * @version: for future extensions and identification of the data format
+ * @bytes: size of this structure
+ * @base_ptr: PASID table pointer
+ * @pasid_bits: number of bits supported in the guest PASID table, must be less
+ * than or equal to the host table size.
+ * @model: PASID table format for different IOMMU models
+ */
+struct pasid_table_config {
+ __u32 version;
+ __u32 bytes;
+ __u64 base_ptr;
+ __u8 pasid_bits;
+ enum pasid_table_model model;
+ union {
+ struct {
+ /* Intel specific fields */
+ } intel;
+
+ struct {
+ /* ARM specific fields */
+ bool pasid0_dma_no_pasid;
+ } arm;
+
+ struct {
+ /* AMD specific fields */
+ } amd;
+ };
+};
+
+#endif /* _UAPI_IOMMU_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan, Liu, Yi L
Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
use in the guest:
https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
As part of the proposed architecture, when an SVM capable PCI
device is assigned to a guest, nested mode is turned on. The guest owns the
first level page tables (requests with PASID), which perform GVA->GPA
translation. Second level page tables are owned by the host for GPA->HPA
translation for requests both with and without PASID.
A new IOMMU driver interface is therefore needed to perform tasks as
follows:
* Enable nested translation and appropriate translation type
* Assign guest PASID table pointer (in GPA) and size to host IOMMU
This patch introduces new API functions to perform bind/unbind guest PASID
tables. Based on common data, model specific IOMMU drivers can be extended
to perform the specific steps for binding pasid table of assigned devices.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
---
drivers/iommu/iommu.c | 19 ++++++++++++++++
include/linux/iommu.h | 25 +++++++++++++++++++++
include/uapi/linux/iommu.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 99 insertions(+)
create mode 100644 include/uapi/linux/iommu.h
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3de5c0b..761cf50 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1322,6 +1322,25 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
}
EXPORT_SYMBOL_GPL(iommu_attach_device);
+int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
+ struct pasid_table_config *pasidt_binfo)
+{
+ if (unlikely(!domain->ops->bind_pasid_table))
+ return -ENODEV;
+
+ return domain->ops->bind_pasid_table(domain, dev, pasidt_binfo);
+}
+EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
+
+int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
+{
+ if (unlikely(!domain->ops->unbind_pasid_table))
+ return -EINVAL;
+
+ return domain->ops->unbind_pasid_table(domain, dev);
+}
+EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
+
static void __iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 41b8c57..672cc06 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,6 +25,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/of.h>
+#include <uapi/linux/iommu.h>
#define IOMMU_READ (1 << 0)
#define IOMMU_WRITE (1 << 1)
@@ -187,6 +188,8 @@ struct iommu_resv_region {
* @domain_get_windows: Return the number of windows for a domain
* @of_xlate: add OF master IDs to iommu grouping
* @pgsize_bitmap: bitmap of all possible supported page sizes
+ * @bind_pasid_table: bind pasid table pointer for guest SVM
+ * @unbind_pasid_table: unbind pasid table pointer and restore defaults
*/
struct iommu_ops {
bool (*capable)(enum iommu_cap);
@@ -233,8 +236,14 @@ struct iommu_ops {
u32 (*domain_get_windows)(struct iommu_domain *domain);
int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
+
bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);
+ int (*bind_pasid_table)(struct iommu_domain *domain, struct device *dev,
+ struct pasid_table_config *pasidt_binfo);
+ int (*unbind_pasid_table)(struct iommu_domain *domain,
+ struct device *dev);
+
unsigned long pgsize_bitmap;
};
@@ -296,6 +305,10 @@ extern int iommu_attach_device(struct iommu_domain *domain,
struct device *dev);
extern void iommu_detach_device(struct iommu_domain *domain,
struct device *dev);
+extern int iommu_bind_pasid_table(struct iommu_domain *domain,
+ struct device *dev, struct pasid_table_config *pasidt_binfo);
+extern int iommu_unbind_pasid_table(struct iommu_domain *domain,
+ struct device *dev);
extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot);
@@ -696,6 +709,18 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
return NULL;
}
+static inline
+int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
+ struct pasid_table_config *pasidt_binfo)
+{
+ return -EINVAL;
+}
+static inline
+int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_IOMMU_API */
#endif /* __LINUX_IOMMU_H */
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
new file mode 100644
index 0000000..aeeaf0e
--- /dev/null
+++ b/include/uapi/linux/iommu.h
@@ -0,0 +1,55 @@
+/*
+ * IOMMU user API definitions
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _UAPI_IOMMU_H
+#define _UAPI_IOMMU_H
+
+#include <linux/types.h>
+
+enum pasid_table_model {
+ PASID_TABLE_FORMAT_HOST,
+ PASID_TABLE_FORMAT_ARM_1LVL,
+ PASID_TABLE_FORMAT_ARM_2LVL,
+ PASID_TABLE_FORMAT_AMD,
+ PASID_TABLE_FORMAT_INTEL,
+};
+
+/**
+ * PASID table data used to bind guest PASID table to the host IOMMU. This will
+ * enable guest managed first level page tables.
+ * @version: for future extensions and identification of the data format
+ * @bytes: size of this structure
+ * @base_ptr: PASID table pointer
+ * @pasid_bits: number of bits supported in the guest PASID table, must be less
+ * than or equal to the host table size.
+ * @model: PASID table format for different IOMMU models
+ */
+struct pasid_table_config {
+ __u32 version;
+ __u32 bytes;
+ __u64 base_ptr;
+ __u8 pasid_bits;
+ enum pasid_table_model model;
+ union {
+ struct {
+ /* Intel specific fields */
+ } intel;
+
+ struct {
+ /* ARM specific fields */
+ bool pasid0_dma_no_pasid;
+ } arm;
+
+ struct {
+ /* AMD specific fields */
+ } amd;
+ };
+};
+
+#endif /* _UAPI_IOMMU_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* Re: [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
2017-10-05 23:03 ` Jacob Pan
(?)
@ 2017-10-10 13:14 ` Joerg Roedel
[not found] ` <20171010131433.fgo5tnwidzywfnx4-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
-1 siblings, 1 reply; 109+ messages in thread
From: Joerg Roedel @ 2017-10-10 13:14 UTC (permalink / raw)
To: Jacob Pan
Cc: iommu, LKML, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker, Liu, Yi L, Lan Tianyu, Tian, Kevin,
Raj Ashok, Alex Williamson, Liu, Yi L
Hi Jacob,
On Thu, Oct 05, 2017 at 04:03:29PM -0700, Jacob Pan wrote:
> +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
> +{
> + if (unlikely(!domain->ops->unbind_pasid_table))
> + return -EINVAL;
> +
> + return domain->ops->unbind_pasid_table(domain, dev);
> +}
> +EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
Are there other reasons to let the unbind fail? If not, I'd suggest
just making this a void function. Also, I'm not sure what the user of this
function should do when the unbind really fails.
> +enum pasid_table_model {
> + PASID_TABLE_FORMAT_HOST,
What is this FORMAT_HOST for?
> + PASID_TABLE_FORMAT_ARM_1LVL,
> + PASID_TABLE_FORMAT_ARM_2LVL,
> + PASID_TABLE_FORMAT_AMD,
> + PASID_TABLE_FORMAT_INTEL,
> +};
> +
> +/**
> + * PASID table data used to bind guest PASID table to the host IOMMU. This will
> + * enable guest managed first level page tables.
> + * @version: for future extensions and identification of the data format
> + * @bytes: size of this structure
> + * @base_ptr: PASID table pointer
> + * @pasid_bits: number of bits supported in the guest PASID table, must be less
> + * or equal than the host table size.
> + * @model: PASID table format for different IOMMU models
> + */
> +struct pasid_table_config {
> + __u32 version;
Can you also add a define for the version number? Userspace needs it to
initialize the struct and the kernel to check against it.
> + __u32 bytes;
> + __u64 base_ptr;
> + __u8 pasid_bits;
> + enum pasid_table_model model;
> + union {
> + struct {
> + /* Intel specific fields */
> + } intel;
> +
> + struct {
> + /* ARM specific fields */
> + bool pasid0_dma_no_pasid;
> + } arm;
> +
> + struct {
> + /* AMD specific fields */
> + } amd;
Thinking more about this, we can omit the sub-structs for models that
don't need them. For the amd-model for example the base_ptr and
pasid_bits fields are sufficient.
Regards,
Joerg
^ permalink raw reply [flat|nested] 109+ messages in thread
[parent not found: <1507244624-39189-2-git-send-email-jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>]
* Re: [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-10 16:45 ` Jean-Philippe Brucker
-1 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-10-10 16:45 UTC (permalink / raw)
To: Jacob Pan,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki
Cc: Lan Tianyu, Yi L
On 06/10/17 00:03, Jacob Pan wrote:
> Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
> use in the guest:
> https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
>
> As part of the proposed architecture, when an SVM capable PCI
> device is assigned to a guest, nested mode is turned on. Guest owns the
> first level page tables (request with PASID) which performs GVA->GPA
> translation. Second level page tables are owned by the host for GPA->HPA
> translation for both request with and without PASID.
>
> A new IOMMU driver interface is therefore needed to perform tasks as
> follows:
> * Enable nested translation and appropriate translation type
> * Assign guest PASID table pointer (in GPA) and size to host IOMMU
>
> This patch introduces new API functions to perform bind/unbind guest PASID
> tables. Based on common data, model specific IOMMU drivers can be extended
> to perform the specific steps for binding pasid table of assigned devices.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Signed-off-by: Liu, Yi L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> ---
> drivers/iommu/iommu.c | 19 ++++++++++++++++
> include/linux/iommu.h | 25 +++++++++++++++++++++
> include/uapi/linux/iommu.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 99 insertions(+)
> create mode 100644 include/uapi/linux/iommu.h
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 3de5c0b..761cf50 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1322,6 +1322,25 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
> }
> EXPORT_SYMBOL_GPL(iommu_attach_device);
>
> +int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
> + struct pasid_table_config *pasidt_binfo)
> +{
> + if (unlikely(!domain->ops->bind_pasid_table))
> + return -ENODEV;
> +
> + return domain->ops->bind_pasid_table(domain, dev, pasidt_binfo);
> +}
> +EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
> +
> +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
> +{
> + if (unlikely(!domain->ops->unbind_pasid_table))
> + return -EINVAL;
> +
> + return domain->ops->unbind_pasid_table(domain, dev);
> +}
> +EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
> +
> static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
> {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 41b8c57..672cc06 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -25,6 +25,7 @@
> #include <linux/errno.h>
> #include <linux/err.h>
> #include <linux/of.h>
> +#include <uapi/linux/iommu.h>
>
> #define IOMMU_READ (1 << 0)
> #define IOMMU_WRITE (1 << 1)
> @@ -187,6 +188,8 @@ struct iommu_resv_region {
> * @domain_get_windows: Return the number of windows for a domain
> * @of_xlate: add OF master IDs to iommu grouping
> * @pgsize_bitmap: bitmap of all possible supported page sizes
> + * @bind_pasid_table: bind pasid table pointer for guest SVM
> + * @unbind_pasid_table: unbind pasid table pointer and restore defaults
> */
> struct iommu_ops {
> bool (*capable)(enum iommu_cap);
> @@ -233,8 +236,14 @@ struct iommu_ops {
> u32 (*domain_get_windows)(struct iommu_domain *domain);
>
> int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
> +
(whitespace change)
> bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);
>
> + int (*bind_pasid_table)(struct iommu_domain *domain, struct device *dev,
> + struct pasid_table_config *pasidt_binfo);
> + int (*unbind_pasid_table)(struct iommu_domain *domain,
> + struct device *dev);
> +
> unsigned long pgsize_bitmap;
> };
>
> @@ -296,6 +305,10 @@ extern int iommu_attach_device(struct iommu_domain *domain,
> struct device *dev);
> extern void iommu_detach_device(struct iommu_domain *domain,
> struct device *dev);
> +extern int iommu_bind_pasid_table(struct iommu_domain *domain,
> + struct device *dev, struct pasid_table_config *pasidt_binfo);
> +extern int iommu_unbind_pasid_table(struct iommu_domain *domain,
> + struct device *dev);
> extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
> extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> phys_addr_t paddr, size_t size, int prot);
> @@ -696,6 +709,18 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
> return NULL;
> }
>
> +static inline
> +int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
> + struct pasid_table_config *pasidt_binfo)
> +{
> + return -EINVAL;
> +}
> +static inline
> +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
> +{
> + return -EINVAL;
> +}
> +
> #endif /* CONFIG_IOMMU_API */
>
> #endif /* __LINUX_IOMMU_H */
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> new file mode 100644
> index 0000000..aeeaf0e
> --- /dev/null
> +++ b/include/uapi/linux/iommu.h
> @@ -0,0 +1,55 @@
> +/*
> + * IOMMU user API definitions
> + *
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef _UAPI_IOMMU_H
> +#define _UAPI_IOMMU_H
> +
> +#include <linux/types.h>
> +
> +enum pasid_table_model {
> + PASID_TABLE_FORMAT_HOST,
> + PASID_TABLE_FORMAT_ARM_1LVL,
> + PASID_TABLE_FORMAT_ARM_2LVL,
Maybe remove the ARM values and struct for the moment, I'm still not sure
how to implement it. I think this should be a single ARM_SMMUV3 model
(2LVL might correspond to two different formats in SMMUv3, and a future
SMMU version could still have 1- or 2-level PASID table but incompatible
format).
> + PASID_TABLE_FORMAT_AMD,
> + PASID_TABLE_FORMAT_INTEL,
> +};
> +
> +/**
> + * PASID table data used to bind guest PASID table to the host IOMMU. This will
> + * enable guest managed first level page tables.
> + * @version: for future extensions and identification of the data format
> + * @bytes: size of this structure
> + * @base_ptr: PASID table pointer
> + * @pasid_bits: number of bits supported in the guest PASID table, must be less
> + * or equal than the host table size.
"host table size" is a bit confusing in this context, especially if using
multi-level tables. Perhaps it's clear enough that @pasid_bits must be
smaller than or equal to the PASID size supported by the IOMMU, and we can
remove that second part?
Thanks,
Jean
> + * @model: PASID table format for different IOMMU models
> + */
> +struct pasid_table_config {
> + __u32 version;
> + __u32 bytes;
> + __u64 base_ptr;
> + __u8 pasid_bits;
> + enum pasid_table_model model;
> + union {
> + struct {
> + /* Intel specific fields */
> + } intel;
> +
> + struct {
> + /* ARM specific fields */
> + bool pasid0_dma_no_pasid;
> + } arm;
> +
> + struct {
> + /* AMD specific fields */
> + } amd;
> + };
> +};
> +
> +#endif /* _UAPI_IOMMU_H */
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
@ 2017-10-10 16:45 ` Jean-Philippe Brucker
0 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-10-10 16:45 UTC (permalink / raw)
To: Jacob Pan, iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Yi L
On 06/10/17 00:03, Jacob Pan wrote:
> Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
> use in the guest:
> https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
>
> As part of the proposed architecture, when an SVM capable PCI
> device is assigned to a guest, nested mode is turned on. Guest owns the
> first level page tables (request with PASID) which performs GVA->GPA
> translation. Second level page tables are owned by the host for GPA->HPA
> translation for both request with and without PASID.
>
> A new IOMMU driver interface is therefore needed to perform tasks as
> follows:
> * Enable nested translation and appropriate translation type
> * Assign guest PASID table pointer (in GPA) and size to host IOMMU
>
> This patch introduces new API functions to perform bind/unbind guest PASID
> tables. Based on common data, model specific IOMMU drivers can be extended
> to perform the specific steps for binding pasid table of assigned devices.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> ---
> drivers/iommu/iommu.c | 19 ++++++++++++++++
> include/linux/iommu.h | 25 +++++++++++++++++++++
> include/uapi/linux/iommu.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 99 insertions(+)
> create mode 100644 include/uapi/linux/iommu.h
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 3de5c0b..761cf50 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1322,6 +1322,25 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
> }
> EXPORT_SYMBOL_GPL(iommu_attach_device);
>
> +int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
> + struct pasid_table_config *pasidt_binfo)
> +{
> + if (unlikely(!domain->ops->bind_pasid_table))
> + return -ENODEV;
> +
> + return domain->ops->bind_pasid_table(domain, dev, pasidt_binfo);
> +}
> +EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
> +
> +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
> +{
> + if (unlikely(!domain->ops->unbind_pasid_table))
> + return -EINVAL;
> +
> + return domain->ops->unbind_pasid_table(domain, dev);
> +}
> +EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
> +
> static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
> {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 41b8c57..672cc06 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -25,6 +25,7 @@
> #include <linux/errno.h>
> #include <linux/err.h>
> #include <linux/of.h>
> +#include <uapi/linux/iommu.h>
>
> #define IOMMU_READ (1 << 0)
> #define IOMMU_WRITE (1 << 1)
> @@ -187,6 +188,8 @@ struct iommu_resv_region {
> * @domain_get_windows: Return the number of windows for a domain
> * @of_xlate: add OF master IDs to iommu grouping
> * @pgsize_bitmap: bitmap of all possible supported page sizes
> + * @bind_pasid_table: bind pasid table pointer for guest SVM
> + * @unbind_pasid_table: unbind pasid table pointer and restore defaults
> */
> struct iommu_ops {
> bool (*capable)(enum iommu_cap);
> @@ -233,8 +236,14 @@ struct iommu_ops {
> u32 (*domain_get_windows)(struct iommu_domain *domain);
>
> int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
> +
(whitespace change)
> bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);
>
> + int (*bind_pasid_table)(struct iommu_domain *domain, struct device *dev,
> + struct pasid_table_config *pasidt_binfo);
> + int (*unbind_pasid_table)(struct iommu_domain *domain,
> + struct device *dev);
> +
> unsigned long pgsize_bitmap;
> };
>
> @@ -296,6 +305,10 @@ extern int iommu_attach_device(struct iommu_domain *domain,
> struct device *dev);
> extern void iommu_detach_device(struct iommu_domain *domain,
> struct device *dev);
> +extern int iommu_bind_pasid_table(struct iommu_domain *domain,
> + struct device *dev, struct pasid_table_config *pasidt_binfo);
> +extern int iommu_unbind_pasid_table(struct iommu_domain *domain,
> + struct device *dev);
> extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
> extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> phys_addr_t paddr, size_t size, int prot);
> @@ -696,6 +709,18 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
> return NULL;
> }
>
> +static inline
> +int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
> + struct pasid_table_config *pasidt_binfo)
> +{
> + return -EINVAL;
> +}
> +static inline
> +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
> +{
> + return -EINVAL;
> +}
> +
> #endif /* CONFIG_IOMMU_API */
>
> #endif /* __LINUX_IOMMU_H */
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> new file mode 100644
> index 0000000..aeeaf0e
> --- /dev/null
> +++ b/include/uapi/linux/iommu.h
> @@ -0,0 +1,55 @@
> +/*
> + * IOMMU user API definitions
> + *
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef _UAPI_IOMMU_H
> +#define _UAPI_IOMMU_H
> +
> +#include <linux/types.h>
> +
> +enum pasid_table_model {
> + PASID_TABLE_FORMAT_HOST,
> + PASID_TABLE_FORMAT_ARM_1LVL,
> + PASID_TABLE_FORMAT_ARM_2LVL,
Maybe remove the ARM values and struct for the moment, I'm still not sure
how to implement it. I think this should be a single ARM_SMMUV3 model
(2LVL might correspond to two different formats in SMMUv3, and a future
SMMU version could still have 1- or 2-level PASID table but incompatible
format).
> + PASID_TABLE_FORMAT_AMD,
> + PASID_TABLE_FORMAT_INTEL,
> +};
> +
> +/**
> + * PASID table data used to bind guest PASID table to the host IOMMU. This will
> + * enable guest managed first level page tables.
> + * @version: for future extensions and identification of the data format
> + * @bytes: size of this structure
> + * @base_ptr: PASID table pointer
> + * @pasid_bits: number of bits supported in the guest PASID table, must be less
> + * or equal than the host table size.
"host table size" is a bit confusing in this context, especially if using
multi-level tables. Perhaps it's clear enough that @pasid_bits must be
smaller than or equal to the PASID size supported by the IOMMU, and we can
remove that second part?
Thanks,
Jean
> + * @model: PASID table format for different IOMMU models
> + */
> +struct pasid_table_config {
> + __u32 version;
> + __u32 bytes;
> + __u64 base_ptr;
> + __u8 pasid_bits;
> + enum pasid_table_model model;
> + union {
> + struct {
> + /* Intel specific fields */
> + } intel;
> +
> + struct {
> + /* ARM specific fields */
> + bool pasid0_dma_no_pasid;
> + } arm;
> +
> + struct {
> + /* AMD specific fields */
> + } amd;
> + };
> +};
> +
> +#endif /* _UAPI_IOMMU_H */
>
^ permalink raw reply [flat|nested] 109+ messages in thread
[parent not found: <59945b24-ace9-f0c1-d68d-ccd929e1fe28-5wv7dgnIgG8@public.gmane.org>]
* Re: [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
2017-10-10 16:45 ` Jean-Philippe Brucker
@ 2017-10-10 21:42 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-10 21:42 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Lan Tianyu, Yi L, Greg Kroah-Hartman, Rafael Wysocki, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
David Woodhouse
On Tue, 10 Oct 2017 17:45:53 +0100
Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org> wrote:
> On 06/10/17 00:03, Jacob Pan wrote:
> > Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
> > use in the guest:
> > https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
> >
> > As part of the proposed architecture, when an SVM capable PCI
> > device is assigned to a guest, nested mode is turned on. Guest owns
> > the first level page tables (request with PASID) which performs
> > GVA->GPA translation. Second level page tables are owned by the
> > host for GPA->HPA translation for both request with and without
> > PASID.
> >
> > A new IOMMU driver interface is therefore needed to perform tasks as
> > follows:
> > * Enable nested translation and appropriate translation type
> > * Assign guest PASID table pointer (in GPA) and size to host IOMMU
> >
> > This patch introduces new API functions to perform bind/unbind
> > guest PASID tables. Based on common data, model specific IOMMU
> > drivers can be extended to perform the specific steps for binding
> > pasid table of assigned devices.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > Signed-off-by: Liu, Yi L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > ---
> > drivers/iommu/iommu.c | 19 ++++++++++++++++
> > include/linux/iommu.h | 25 +++++++++++++++++++++
> > include/uapi/linux/iommu.h | 55
> > ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99
> > insertions(+) create mode 100644 include/uapi/linux/iommu.h
> >
> > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > index 3de5c0b..761cf50 100644
> > --- a/drivers/iommu/iommu.c
> > +++ b/drivers/iommu/iommu.c
> > @@ -1322,6 +1322,25 @@ int iommu_attach_device(struct iommu_domain
> > *domain, struct device *dev) }
> > EXPORT_SYMBOL_GPL(iommu_attach_device);
> >
> > +int iommu_bind_pasid_table(struct iommu_domain *domain, struct
> > device *dev,
> > + struct pasid_table_config *pasidt_binfo)
> > +{
> > + if (unlikely(!domain->ops->bind_pasid_table))
> > + return -ENODEV;
> > +
> > + return domain->ops->bind_pasid_table(domain, dev,
> > pasidt_binfo); +}
> > +EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
> > +
> > +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct
> > device *dev) +{
> > + if (unlikely(!domain->ops->unbind_pasid_table))
> > + return -EINVAL;
> > +
> > + return domain->ops->unbind_pasid_table(domain, dev);
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
> > +
> > static void __iommu_detach_device(struct iommu_domain *domain,
> > struct device *dev)
> > {
> > diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> > index 41b8c57..672cc06 100644
> > --- a/include/linux/iommu.h
> > +++ b/include/linux/iommu.h
> > @@ -25,6 +25,7 @@
> > #include <linux/errno.h>
> > #include <linux/err.h>
> > #include <linux/of.h>
> > +#include <uapi/linux/iommu.h>
> >
> > #define IOMMU_READ (1 << 0)
> > #define IOMMU_WRITE (1 << 1)
> > @@ -187,6 +188,8 @@ struct iommu_resv_region {
> > * @domain_get_windows: Return the number of windows for a domain
> > * @of_xlate: add OF master IDs to iommu grouping
> > * @pgsize_bitmap: bitmap of all possible supported page sizes
> > + * @bind_pasid_table: bind pasid table pointer for guest SVM
> > + * @unbind_pasid_table: unbind pasid table pointer and restore
> > defaults */
> > struct iommu_ops {
> > bool (*capable)(enum iommu_cap);
> > @@ -233,8 +236,14 @@ struct iommu_ops {
> > u32 (*domain_get_windows)(struct iommu_domain *domain);
> >
> > int (*of_xlate)(struct device *dev, struct of_phandle_args
> > *args);
> > +
>
> (whitespace change)
will fix, thanks
>
> > bool (*is_attach_deferred)(struct iommu_domain *domain,
> > struct device *dev);
> > + int (*bind_pasid_table)(struct iommu_domain *domain,
> > struct device *dev,
> > + struct pasid_table_config
> > *pasidt_binfo);
> > + int (*unbind_pasid_table)(struct iommu_domain *domain,
> > + struct device *dev);
> > +
> > unsigned long pgsize_bitmap;
> > };
> >
> > @@ -296,6 +305,10 @@ extern int iommu_attach_device(struct
> > iommu_domain *domain, struct device *dev);
> > extern void iommu_detach_device(struct iommu_domain *domain,
> > struct device *dev);
> > +extern int iommu_bind_pasid_table(struct iommu_domain *domain,
> > + struct device *dev, struct pasid_table_config
> > *pasidt_binfo); +extern int iommu_unbind_pasid_table(struct
> > iommu_domain *domain,
> > + struct device *dev);
> > extern struct iommu_domain *iommu_get_domain_for_dev(struct device
> > *dev); extern int iommu_map(struct iommu_domain *domain, unsigned
> > long iova, phys_addr_t paddr, size_t size, int prot);
> > @@ -696,6 +709,18 @@ const struct iommu_ops
> > *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) return NULL;
> > }
> >
> > +static inline
> > +int iommu_bind_pasid_table(struct iommu_domain *domain, struct
> > device *dev,
> > + struct pasid_table_config *pasidt_binfo)
> > +{
> > + return -EINVAL;
> > +}
> > +static inline
> > +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct
> > device *dev) +{
> > + return -EINVAL;
> > +}
> > +
> > #endif /* CONFIG_IOMMU_API */
> >
> > #endif /* __LINUX_IOMMU_H */
> > diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > new file mode 100644
> > index 0000000..aeeaf0e
> > --- /dev/null
> > +++ b/include/uapi/linux/iommu.h
> > @@ -0,0 +1,55 @@
> > +/*
> > + * IOMMU user API definitions
> > + *
> > + *
> > + * This program is free software; you can redistribute it and/or
> > modify
> > + * it under the terms of the GNU General Public License version 2
> > as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#ifndef _UAPI_IOMMU_H
> > +#define _UAPI_IOMMU_H
> > +
> > +#include <linux/types.h>
> > +
> > +enum pasid_table_model {
> > + PASID_TABLE_FORMAT_HOST,
> > + PASID_TABLE_FORMAT_ARM_1LVL,
> > + PASID_TABLE_FORMAT_ARM_2LVL,
>
> Maybe remove the ARM values and struct for the moment, I'm still not
> sure how to implement it. I think this should be a single ARM_SMMUV3
> model (2LVL might correspond to two different formats in SMMUv3, and
> a future SMMU version could still have 1- or 2-level PASID table but
> incompatible format).
>
Will do. Removing the model info is fine for Intel; I guess for AMD also?
> > + PASID_TABLE_FORMAT_AMD,
> > + PASID_TABLE_FORMAT_INTEL,
> > +};
> > +
> > +/**
> > + * PASID table data used to bind guest PASID table to the host
> > IOMMU. This will
> > + * enable guest managed first level page tables.
> > + * @version: for future extensions and identification of the data
> > format
> > + * @bytes: size of this structure
> > + * @base_ptr: PASID table pointer
> > + * @pasid_bits: number of bits supported in the guest PASID
> > table, must be less
> > + * or equal than the host table size.
>
> "host table size" is a bit confusing in this context, especially if
> using multi-level tables. Perhaps it's clear enough that @pasid_bits
> must be smaller or equal than the PASID size supported by the IOMMU,
> and we can remove that second part?
>
Not sure what the second part is?
> Thanks,
> Jean
>
> > + * @model: PASID table format for different IOMMU models
> > + */
> > +struct pasid_table_config {
> > + __u32 version;
> > + __u32 bytes;
> > + __u64 base_ptr;
> > + __u8 pasid_bits;
> > + enum pasid_table_model model;
> > + union {
> > + struct {
> > + /* Intel specific fields */
> > + } intel;
> > +
> > + struct {
> > + /* ARM specific fields */
> > + bool pasid0_dma_no_pasid;
> > + } arm;
> > +
> > + struct {
> > + /* AMD specific fields */
> > + } amd;
> > + };
> > +};
> > +
> > +#endif /* _UAPI_IOMMU_H */
> >
>
[Jacob Pan]
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
@ 2017-10-10 21:42 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-10 21:42 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki, Liu, Yi L,
Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson, Yi L,
jacob.jun.pan
On Tue, 10 Oct 2017 17:45:53 +0100
Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:
> On 06/10/17 00:03, Jacob Pan wrote:
> > Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
> > use in the guest:
> > https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
> >
> > As part of the proposed architecture, when an SVM capable PCI
> > device is assigned to a guest, nested mode is turned on. Guest owns
> > the first level page tables (request with PASID) which performs
> > GVA->GPA translation. Second level page tables are owned by the
> > host for GPA->HPA translation for both request with and without
> > PASID.
> >
> > A new IOMMU driver interface is therefore needed to perform tasks as
> > follows:
> > * Enable nested translation and appropriate translation type
> > * Assign guest PASID table pointer (in GPA) and size to host IOMMU
> >
> > This patch introduces new API functions to perform bind/unbind
> > guest PASID tables. Based on common data, model specific IOMMU
> > drivers can be extended to perform the specific steps for binding
> > pasid table of assigned devices.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > ---
> > drivers/iommu/iommu.c | 19 ++++++++++++++++
> > include/linux/iommu.h | 25 +++++++++++++++++++++
> > include/uapi/linux/iommu.h | 55
> > ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99
> > insertions(+) create mode 100644 include/uapi/linux/iommu.h
> >
> > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > index 3de5c0b..761cf50 100644
> > --- a/drivers/iommu/iommu.c
> > +++ b/drivers/iommu/iommu.c
> > @@ -1322,6 +1322,25 @@ int iommu_attach_device(struct iommu_domain
> > *domain, struct device *dev) }
> > EXPORT_SYMBOL_GPL(iommu_attach_device);
> >
> > +int iommu_bind_pasid_table(struct iommu_domain *domain, struct
> > device *dev,
> > + struct pasid_table_config *pasidt_binfo)
> > +{
> > + if (unlikely(!domain->ops->bind_pasid_table))
> > + return -ENODEV;
> > +
> > + return domain->ops->bind_pasid_table(domain, dev,
> > pasidt_binfo); +}
> > +EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
> > +
> > +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct
> > device *dev) +{
> > + if (unlikely(!domain->ops->unbind_pasid_table))
> > + return -EINVAL;
> > +
> > + return domain->ops->unbind_pasid_table(domain, dev);
> > +}
> > +EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
> > +
> > static void __iommu_detach_device(struct iommu_domain *domain,
> > struct device *dev)
> > {
> > diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> > index 41b8c57..672cc06 100644
> > --- a/include/linux/iommu.h
> > +++ b/include/linux/iommu.h
> > @@ -25,6 +25,7 @@
> > #include <linux/errno.h>
> > #include <linux/err.h>
> > #include <linux/of.h>
> > +#include <uapi/linux/iommu.h>
> >
> > #define IOMMU_READ (1 << 0)
> > #define IOMMU_WRITE (1 << 1)
> > @@ -187,6 +188,8 @@ struct iommu_resv_region {
> > * @domain_get_windows: Return the number of windows for a domain
> > * @of_xlate: add OF master IDs to iommu grouping
> > * @pgsize_bitmap: bitmap of all possible supported page sizes
> > + * @bind_pasid_table: bind pasid table pointer for guest SVM
> > + * @unbind_pasid_table: unbind pasid table pointer and restore
> > defaults */
> > struct iommu_ops {
> > bool (*capable)(enum iommu_cap);
> > @@ -233,8 +236,14 @@ struct iommu_ops {
> > u32 (*domain_get_windows)(struct iommu_domain *domain);
> >
> > int (*of_xlate)(struct device *dev, struct of_phandle_args
> > *args);
> > +
>
> (whitespace change)
will fix, thanks
>
> > bool (*is_attach_deferred)(struct iommu_domain *domain,
> > struct device *dev);
> > + int (*bind_pasid_table)(struct iommu_domain *domain,
> > struct device *dev,
> > + struct pasid_table_config
> > *pasidt_binfo);
> > + int (*unbind_pasid_table)(struct iommu_domain *domain,
> > + struct device *dev);
> > +
> > unsigned long pgsize_bitmap;
> > };
> >
> > @@ -296,6 +305,10 @@ extern int iommu_attach_device(struct
> > iommu_domain *domain, struct device *dev);
> > extern void iommu_detach_device(struct iommu_domain *domain,
> > struct device *dev);
> > +extern int iommu_bind_pasid_table(struct iommu_domain *domain,
> > + struct device *dev, struct pasid_table_config
> > *pasidt_binfo); +extern int iommu_unbind_pasid_table(struct
> > iommu_domain *domain,
> > + struct device *dev);
> > extern struct iommu_domain *iommu_get_domain_for_dev(struct device
> > *dev); extern int iommu_map(struct iommu_domain *domain, unsigned
> > long iova, phys_addr_t paddr, size_t size, int prot);
> > @@ -696,6 +709,18 @@ const struct iommu_ops
> > *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) return NULL;
> > }
> >
> > +static inline
> > +int iommu_bind_pasid_table(struct iommu_domain *domain, struct
> > device *dev,
> > + struct pasid_table_config *pasidt_binfo)
> > +{
> > + return -EINVAL;
> > +}
> > +static inline
> > +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct
> > device *dev) +{
> > + return -EINVAL;
> > +}
> > +
> > #endif /* CONFIG_IOMMU_API */
> >
> > #endif /* __LINUX_IOMMU_H */
> > diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > new file mode 100644
> > index 0000000..aeeaf0e
> > --- /dev/null
> > +++ b/include/uapi/linux/iommu.h
> > @@ -0,0 +1,55 @@
> > +/*
> > + * IOMMU user API definitions
> > + *
> > + *
> > + * This program is free software; you can redistribute it and/or
> > modify
> > + * it under the terms of the GNU General Public License version 2
> > as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#ifndef _UAPI_IOMMU_H
> > +#define _UAPI_IOMMU_H
> > +
> > +#include <linux/types.h>
> > +
> > +enum pasid_table_model {
> > + PASID_TABLE_FORMAT_HOST,
> > + PASID_TABLE_FORMAT_ARM_1LVL,
> > + PASID_TABLE_FORMAT_ARM_2LVL,
>
> Maybe remove the ARM values and struct for the moment, I'm still not
> sure how to implement it. I think this should be a single ARM_SMMUV3
> model (2LVL might correspond to two different formats in SMMUv3, and
> a future SMMU version could still have 1- or 2-level PASID table but
> incompatible format).
>
Will do. Removing the model info is fine for Intel; I guess for AMD also?
> > + PASID_TABLE_FORMAT_AMD,
> > + PASID_TABLE_FORMAT_INTEL,
> > +};
> > +
> > +/**
> > + * PASID table data used to bind guest PASID table to the host
> > IOMMU. This will
> > + * enable guest managed first level page tables.
> > + * @version: for future extensions and identification of the data
> > format
> > + * @bytes: size of this structure
> > + * @base_ptr: PASID table pointer
> > + * @pasid_bits: number of bits supported in the guest PASID
> > table, must be less
> > + * or equal than the host table size.
>
> "host table size" is a bit confusing in this context, especially if
> using multi-level tables. Perhaps it's clear enough that @pasid_bits
> must be smaller or equal than the PASID size supported by the IOMMU,
> and we can remove that second part?
>
Not sure what the second part is?
> Thanks,
> Jean
>
> > + * @model: PASID table format for different IOMMU models
> > + */
> > +struct pasid_table_config {
> > + __u32 version;
> > + __u32 bytes;
> > + __u64 base_ptr;
> > + __u8 pasid_bits;
> > + enum pasid_table_model model;
> > + union {
> > + struct {
> > + /* Intel specific fields */
> > + } intel;
> > +
> > + struct {
> > + /* ARM specific fields */
> > + bool pasid0_dma_no_pasid;
> > + } arm;
> > +
> > + struct {
> > + /* AMD specific fields */
> > + } amd;
> > + };
> > +};
> > +
> > +#endif /* _UAPI_IOMMU_H */
> >
>
[Jacob Pan]
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 01/16] iommu: introduce bind_pasid_table API function
2017-10-10 21:42 ` Jacob Pan
(?)
@ 2017-10-11 9:17 ` Jean-Philippe Brucker
-1 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-10-11 9:17 UTC (permalink / raw)
To: Jacob Pan
Cc: iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki, Liu, Yi L,
Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson, Yi L
On 10/10/17 22:42, Jacob Pan wrote:
[...]
>>> +/**
>>> + * PASID table data used to bind guest PASID table to the host
>>> IOMMU. This will
>>> + * enable guest managed first level page tables.
>>> + * @version: for future extensions and identification of the data
>>> format
>>> + * @bytes: size of this structure
>>> + * @base_ptr: PASID table pointer
>>> + * @pasid_bits: number of bits supported in the guest PASID
>>> table, must be less
>>> + * or equal than the host table size.
>>
>> "host table size" is a bit confusing in this context, especially if
>> using multi-level tables. Perhaps it's clear enough that @pasid_bits
>> must be smaller or equal than the PASID size supported by the IOMMU,
>> and we can remove that second part?
>>
> Not sure what is the second part?
The phrase "must be less or equal than the host table size" is confusing to
me when you're talking about max PASID bits.
Thanks,
Jean
^ permalink raw reply [flat|nested] 109+ messages in thread
* [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu, Yi L, Liu-i9wRM+HIrmnmtl4Z8vJ8Kg761KYD1DLY
Add Intel VT-d ops to the generic iommu_bind_pasid_table API
functions.
The primary use case is direct assignment of an SVM-capable device.
Originating from the emulated IOMMU in the guest, the request goes
through many layers (e.g. VFIO). When calling the host IOMMU driver, the
caller passes the guest PASID table pointer (GPA) and size.
The device context table entry is modified by the Intel IOMMU specific
bind_pasid_table function. This turns on nesting mode and the matching
translation type.
The unbind operation restores default context mapping.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Liu, Yi L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
drivers/iommu/intel-iommu.c | 117 ++++++++++++++++++++++++++++++++++++++++++
include/linux/dma_remapping.h | 1 +
2 files changed, 118 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 209d99a..7ae569c 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5200,6 +5200,7 @@ static void intel_iommu_put_resv_regions(struct device *dev,
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
+#define MIN_NR_PASID_BITS (5)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
/*
@@ -5326,6 +5327,118 @@ struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
return iommu;
}
+
+static int intel_iommu_bind_pasid_table(struct iommu_domain *domain,
+ struct device *dev, struct pasid_table_config *pasidt_binfo)
+{
+ struct intel_iommu *iommu;
+ struct context_entry *context;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ struct pci_dev *pdev;
+ u8 bus, devfn, host_table_pasid_bits;
+ u16 did, sid;
+ int ret = 0;
+ unsigned long flags;
+ u64 ctx_lo;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+ /* VT-d spec 9.4 says pasid table size is encoded as 2^(x+5) */
+ host_table_pasid_bits = intel_iommu_get_pts(iommu) + MIN_NR_PASID_BITS;
+ if (!pasidt_binfo || pasidt_binfo->pasid_bits > host_table_pasid_bits ||
+ pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) {
+ pr_err("Invalid gPASID bits %d, host range %d - %d\n",
+ pasidt_binfo->pasid_bits,
+ MIN_NR_PASID_BITS, host_table_pasid_bits);
+ return -ERANGE;
+ }
+
+ pdev = to_pci_dev(dev);
+ if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
+ return -EINVAL;
+ sid = PCI_DEVID(bus, devfn);
+
+ info = dev->archdata.iommu;
+ if (!info || !info->pasid_supported) {
+ dev_err(dev, "No PASID support\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!info->pasid_enabled) {
+ ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
+ if (ret)
+ goto out;
+ }
+ if (!device_context_mapped(iommu, bus, devfn)) {
+ pr_warn("ctx not mapped for bus devfn %x:%x\n", bus, devfn);
+ ret = -EINVAL;
+ goto out;
+ }
+ spin_lock_irqsave(&iommu->lock, flags);
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (!context) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Anticipate guest to use SVM and owns the first level, so we turn
+ * nested mode on
+ */
+ ctx_lo = context[0].lo;
+ ctx_lo |= CONTEXT_NESTE | CONTEXT_PRS | CONTEXT_PASIDE;
+ ctx_lo &= ~CONTEXT_TT_MASK;
+ ctx_lo |= CONTEXT_TT_DEV_IOTLB << 2;
+ context[0].lo = ctx_lo;
+
+ /* Assign guest PASID table pointer and size order */
+ ctx_lo = (pasidt_binfo->base_ptr & VTD_PAGE_MASK) |
+ (pasidt_binfo->pasid_bits - MIN_NR_PASID_BITS);
+ context[1].lo = ctx_lo;
+ /* make sure context entry is updated before flushing */
+ wmb();
+ did = dmar_domain->iommu_did[iommu->seq_id];
+ iommu->flush.flush_context(iommu, did,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+out_unlock:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+out:
+ return ret;
+}
+
+static int intel_iommu_unbind_pasid_table(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct intel_iommu *iommu;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+ /*
+ * REVISIT: we might want to clear the PASID table pointer
+ * as part of context clear operation. Currently, it leaves
+ * stale data but should be ignored by hardware since PASIDE
+ * is clear.
+ */
+ /* ATS will be reenabled when remapping is restored */
+ pci_disable_ats(to_pci_dev(dev));
+ domain_context_clear(iommu, dev);
+ info = dev->archdata.iommu;
+ if (!info) {
+ dev_err(dev, "PASID table not bound!\n");
+ return -EINVAL;
+ } else
+
+ return domain_context_mapping_one(dmar_domain, iommu, bus, devfn);
+}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
@@ -5334,6 +5447,10 @@ const struct iommu_ops intel_iommu_ops = {
.domain_free = intel_iommu_domain_free,
.attach_dev = intel_iommu_attach_device,
.detach_dev = intel_iommu_detach_device,
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ .bind_pasid_table = intel_iommu_bind_pasid_table,
+ .unbind_pasid_table = intel_iommu_unbind_pasid_table,
+#endif
.map = intel_iommu_map,
.unmap = intel_iommu_unmap,
.map_sg = default_iommu_map_sg,
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 9088407..85367b7 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -27,6 +27,7 @@
#define CONTEXT_DINVE (1ULL << 8)
#define CONTEXT_PRS (1ULL << 9)
+#define CONTEXT_NESTE (1ULL << 10)
#define CONTEXT_PASIDE (1ULL << 11)
struct intel_iommu;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan, Liu, Yi L
Add Intel VT-d ops to the generic iommu_bind_pasid_table API
functions.
The primary use case is direct assignment of an SVM-capable device.
Originating from the emulated IOMMU in the guest, the request goes
through many layers (e.g. VFIO). When calling the host IOMMU driver, the
caller passes the guest PASID table pointer (GPA) and size.
The device context table entry is modified by the Intel IOMMU specific
bind_pasid_table function. This turns on nesting mode and the matching
translation type.
The unbind operation restores default context mapping.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
---
drivers/iommu/intel-iommu.c | 117 ++++++++++++++++++++++++++++++++++++++++++
include/linux/dma_remapping.h | 1 +
2 files changed, 118 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 209d99a..7ae569c 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5200,6 +5200,7 @@ static void intel_iommu_put_resv_regions(struct device *dev,
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
+#define MIN_NR_PASID_BITS (5)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
/*
@@ -5326,6 +5327,118 @@ struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
return iommu;
}
+
+static int intel_iommu_bind_pasid_table(struct iommu_domain *domain,
+ struct device *dev, struct pasid_table_config *pasidt_binfo)
+{
+ struct intel_iommu *iommu;
+ struct context_entry *context;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ struct pci_dev *pdev;
+ u8 bus, devfn, host_table_pasid_bits;
+ u16 did, sid;
+ int ret = 0;
+ unsigned long flags;
+ u64 ctx_lo;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+ /* VT-d spec 9.4 says pasid table size is encoded as 2^(x+5) */
+ host_table_pasid_bits = intel_iommu_get_pts(iommu) + MIN_NR_PASID_BITS;
+ if (!pasidt_binfo || pasidt_binfo->pasid_bits > host_table_pasid_bits ||
+ pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) {
+ pr_err("Invalid gPASID bits %d, host range %d - %d\n",
+ pasidt_binfo->pasid_bits,
+ MIN_NR_PASID_BITS, host_table_pasid_bits);
+ return -ERANGE;
+ }
+
+ pdev = to_pci_dev(dev);
+ if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
+ return -EINVAL;
+ sid = PCI_DEVID(bus, devfn);
+
+ info = dev->archdata.iommu;
+ if (!info || !info->pasid_supported) {
+ dev_err(dev, "No PASID support\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!info->pasid_enabled) {
+ ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
+ if (ret)
+ goto out;
+ }
+ if (!device_context_mapped(iommu, bus, devfn)) {
+ pr_warn("ctx not mapped for bus devfn %x:%x\n", bus, devfn);
+ ret = -EINVAL;
+ goto out;
+ }
+ spin_lock_irqsave(&iommu->lock, flags);
+ context = iommu_context_addr(iommu, bus, devfn, 0);
+ if (!context) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Anticipate guest to use SVM and owns the first level, so we turn
+ * nested mode on
+ */
+ ctx_lo = context[0].lo;
+ ctx_lo |= CONTEXT_NESTE | CONTEXT_PRS | CONTEXT_PASIDE;
+ ctx_lo &= ~CONTEXT_TT_MASK;
+ ctx_lo |= CONTEXT_TT_DEV_IOTLB << 2;
+ context[0].lo = ctx_lo;
+
+ /* Assign guest PASID table pointer and size order */
+ ctx_lo = (pasidt_binfo->base_ptr & VTD_PAGE_MASK) |
+ (pasidt_binfo->pasid_bits - MIN_NR_PASID_BITS);
+ context[1].lo = ctx_lo;
+ /* make sure context entry is updated before flushing */
+ wmb();
+ did = dmar_domain->iommu_did[iommu->seq_id];
+ iommu->flush.flush_context(iommu, did,
+ (((u16)bus) << 8) | devfn,
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+out_unlock:
+ spin_unlock_irqrestore(&iommu->lock, flags);
+out:
+ return ret;
+}
+
+static int intel_iommu_unbind_pasid_table(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct intel_iommu *iommu;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ u8 bus, devfn;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+ /*
+ * REVISIT: we might want to clear the PASID table pointer
+ * as part of context clear operation. Currently, it leaves
+ * stale data but should be ignored by hardware since PASIDE
+ * is clear.
+ */
+ /* ATS will be reenabled when remapping is restored */
+ pci_disable_ats(to_pci_dev(dev));
+ domain_context_clear(iommu, dev);
+ info = dev->archdata.iommu;
+ if (!info) {
+ dev_err(dev, "PASID table not bound!\n");
+ return -EINVAL;
+ } else
+
+ return domain_context_mapping_one(dmar_domain, iommu, bus, devfn);
+}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
@@ -5334,6 +5447,10 @@ const struct iommu_ops intel_iommu_ops = {
.domain_free = intel_iommu_domain_free,
.attach_dev = intel_iommu_attach_device,
.detach_dev = intel_iommu_detach_device,
+#ifdef CONFIG_INTEL_IOMMU_SVM
+ .bind_pasid_table = intel_iommu_bind_pasid_table,
+ .unbind_pasid_table = intel_iommu_unbind_pasid_table,
+#endif
.map = intel_iommu_map,
.unmap = intel_iommu_unmap,
.map_sg = default_iommu_map_sg,
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 9088407..85367b7 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -27,6 +27,7 @@
#define CONTEXT_DINVE (1ULL << 8)
#define CONTEXT_PRS (1ULL << 9)
+#define CONTEXT_NESTE (1ULL << 10)
#define CONTEXT_PASIDE (1ULL << 11)
struct intel_iommu;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
[parent not found: <1507244624-39189-3-git-send-email-jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>]
* Re: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-10 13:21 ` Joerg Roedel
-1 siblings, 0 replies; 109+ messages in thread
From: Joerg Roedel @ 2017-10-10 13:21 UTC (permalink / raw)
To: Jacob Pan
Cc: Lan Tianyu, Liu-zLv9SwRftAIdnm+yROfE0A, Yi L, Greg Kroah-Hartman,
Rafael Wysocki, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
David Woodhouse
On Thu, Oct 05, 2017 at 04:03:30PM -0700, Jacob Pan wrote:
> + pdev = to_pci_dev(dev);
> + if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
> + return -EINVAL;
> + sid = PCI_DEVID(bus, devfn);
Okay, you search for the PRI capability ...
> +
> + info = dev->archdata.iommu;
> + if (!info || !info->pasid_supported) {
> + dev_err(dev, "No PASID support\n");
> + ret = -EINVAL;
> + goto out;
> + }
> + if (!info->pasid_enabled) {
> + ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
> + if (ret)
> + goto out;
... and you enable PASID in bind_pasid_table() ...
> + /*
> + * REVISIT: we might want to clear the PASID table pointer
> + * as part of context clear operation. Currently, it leaves
> + * stale data but should be ignored by hardware since PASIDE
> + * is clear.
> + */
> + /* ATS will be reenabled when remapping is restored */
> + pci_disable_ats(to_pci_dev(dev));
.. while you disable ATS in unbind_pasid_table(). Where does this
asymmetry come from?
> #define CONTEXT_DINVE (1ULL << 8)
> #define CONTEXT_PRS (1ULL << 9)
> +#define CONTEXT_NESTE (1ULL << 10)
Missing 'D' at the end?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
@ 2017-10-10 13:21 ` Joerg Roedel
0 siblings, 0 replies; 109+ messages in thread
From: Joerg Roedel @ 2017-10-10 13:21 UTC (permalink / raw)
To: Jacob Pan
Cc: iommu, LKML, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker, Liu, Yi L, Lan Tianyu, Tian, Kevin,
Raj Ashok, Alex Williamson, Liu, Yi L
On Thu, Oct 05, 2017 at 04:03:30PM -0700, Jacob Pan wrote:
> + pdev = to_pci_dev(dev);
> + if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
> + return -EINVAL;
> + sid = PCI_DEVID(bus, devfn);
Okay, you search for the PRI capability ...
> +
> + info = dev->archdata.iommu;
> + if (!info || !info->pasid_supported) {
> + dev_err(dev, "No PASID support\n");
> + ret = -EINVAL;
> + goto out;
> + }
> + if (!info->pasid_enabled) {
> + ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
> + if (ret)
> + goto out;
... and you enable PASID in bind_pasid_table() ...
> + /*
> + * REVISIT: we might want to clear the PASID table pointer
> + * as part of context clear operation. Currently, it leaves
> + * stale data but should be ignored by hardware since PASIDE
> + * is clear.
> + */
> + /* ATS will be reenabled when remapping is restored */
> + pci_disable_ats(to_pci_dev(dev));
... while you disable ATS in unbind_pasid_table(). Where does this
asymmetry come from?
> #define CONTEXT_DINVE (1ULL << 8)
> #define CONTEXT_PRS (1ULL << 9)
> +#define CONTEXT_NESTE (1ULL << 10)
Missing 'D' at the end?
^ permalink raw reply [flat|nested] 109+ messages in thread
* RE: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-12 11:12 ` Liu, Yi L
-1 siblings, 0 replies; 109+ messages in thread
From: Liu, Yi L @ 2017-10-12 11:12 UTC (permalink / raw)
To: Jacob Pan,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Wysocki, Rafael J, Jean-Philippe Brucker
Cc: Lan, Tianyu, Yi L
> From: Jacob Pan [mailto:jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org]
> Sent: Friday, October 6, 2017 7:04 AM
> To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org; LKML <linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>; Joerg
> Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>; David Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>; Greg
> Kroah-Hartman <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>; Wysocki, Rafael J
> <rafael.j.wysocki-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Jean-Philippe Brucker <jean-
> philippe.brucker-5wv7dgnIgG8@public.gmane.org>
> Cc: Liu, Yi L <yi.l.liu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Lan, Tianyu <tianyu.lan-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Tian, Kevin
> <kevin.tian-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Raj, Ashok <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Alex Williamson
> <alex.williamson-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>; Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>; Liu; Yi
> L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Subject: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
>
> Add Intel VT-d ops to the generic iommu_bind_pasid_table API functions.
>
> The primary use case is for direct assignment of SVM capable device. Originated
> from emulated IOMMU in the guest, the request goes through many layers (e.g.
> VFIO). Upon calling host IOMMU driver, caller passes guest PASID table pointer (GPA)
> and size.
>
> Device context table entry is modified by Intel IOMMU specific bind_pasid_table
> function. This will turn on nesting mode and matching translation type.
>
> The unbind operation restores default context mapping.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Signed-off-by: Liu, Yi L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> ---
> drivers/iommu/intel-iommu.c | 117
> ++++++++++++++++++++++++++++++++++++++++++
> include/linux/dma_remapping.h | 1 +
> 2 files changed, 118 insertions(+)
>
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index
> 209d99a..7ae569c 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -5200,6 +5200,7 @@ static void intel_iommu_put_resv_regions(struct device
> *dev,
>
> #ifdef CONFIG_INTEL_IOMMU_SVM
> #define MAX_NR_PASID_BITS (20)
> +#define MIN_NR_PASID_BITS (5)
> static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu) {
> /*
> @@ -5326,6 +5327,118 @@ struct intel_iommu
> *intel_svm_device_to_iommu(struct device *dev)
>
> return iommu;
> }
> +
> +static int intel_iommu_bind_pasid_table(struct iommu_domain *domain,
> + struct device *dev, struct pasid_table_config *pasidt_binfo) {
> + struct intel_iommu *iommu;
> + struct context_entry *context;
> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> + struct device_domain_info *info;
> + struct pci_dev *pdev;
> + u8 bus, devfn, host_table_pasid_bits;
> + u16 did, sid;
> + int ret = 0;
> + unsigned long flags;
> + u64 ctx_lo;
> +
> + iommu = device_to_iommu(dev, &bus, &devfn);
> + if (!iommu)
> + return -ENODEV;
> + /* VT-d spec 9.4 says pasid table size is encoded as 2^(x+5) */
> + host_table_pasid_bits = intel_iommu_get_pts(iommu) +
> MIN_NR_PASID_BITS;
> + if (!pasidt_binfo || pasidt_binfo->pasid_bits > host_table_pasid_bits ||
> + pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) {
> + pr_err("Invalid gPASID bits %d, host range %d - %d\n",
> + pasidt_binfo->pasid_bits,
> + MIN_NR_PASID_BITS, host_table_pasid_bits);
> + return -ERANGE;
> + }
> +
> + pdev = to_pci_dev(dev);
> + if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
> + return -EINVAL;
> + sid = PCI_DEVID(bus, devfn);
> +
> + info = dev->archdata.iommu;
> + if (!info || !info->pasid_supported) {
> + dev_err(dev, "No PASID support\n");
> + ret = -EINVAL;
> + goto out;
> + }
> + if (!info->pasid_enabled) {
> + ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
> + if (ret)
> + goto out;
> + }
> + if (!device_context_mapped(iommu, bus, devfn)) {
> + pr_warn("ctx not mapped for bus devfn %x:%x\n", bus, devfn);
> + ret = -EINVAL;
> + goto out;
> + }
[Liu, Yi L] This checks whether the context entry is present. If it is, the lookup in the
following 6 lines should always succeed. Perhaps this check could be merged with the following 6 lines.
> + spin_lock_irqsave(&iommu->lock, flags);
> + context = iommu_context_addr(iommu, bus, devfn, 0);
> + if (!context) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
Regards,
Yi L
^ permalink raw reply [flat|nested] 109+ messages in thread
* RE: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
@ 2017-10-12 11:12 ` Liu, Yi L
0 siblings, 0 replies; 109+ messages in thread
From: Liu, Yi L @ 2017-10-12 11:12 UTC (permalink / raw)
To: Jacob Pan, iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Wysocki, Rafael J,
Jean-Philippe Brucker
Cc: Lan, Tianyu, Tian, Kevin, Raj, Ashok, Alex Williamson, Yi L
> From: Jacob Pan [mailto:jacob.jun.pan@linux.intel.com]
> Sent: Friday, October 6, 2017 7:04 AM
> To: iommu@lists.linux-foundation.org; LKML <linux-kernel@vger.kernel.org>; Joerg
> Roedel <joro@8bytes.org>; David Woodhouse <dwmw2@infradead.org>; Greg
> Kroah-Hartman <gregkh@linuxfoundation.org>; Wysocki, Rafael J
> <rafael.j.wysocki@intel.com>; Jean-Philippe Brucker <jean-
> philippe.brucker@arm.com>
> Cc: Liu, Yi L <yi.l.liu@intel.com>; Lan, Tianyu <tianyu.lan@intel.com>; Tian, Kevin
> <kevin.tian@intel.com>; Raj, Ashok <ashok.raj@intel.com>; Alex Williamson
> <alex.williamson@redhat.com>; Jacob Pan <jacob.jun.pan@linux.intel.com>; Liu; Yi
> L <yi.l.liu@linux.intel.com>
> Subject: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
>
> Add Intel VT-d ops to the generic iommu_bind_pasid_table API functions.
>
> The primary use case is for direct assignment of SVM capable device. Originated
> from emulated IOMMU in the guest, the request goes through many layers (e.g.
> VFIO). Upon calling host IOMMU driver, caller passes guest PASID table pointer (GPA)
> and size.
>
> Device context table entry is modified by Intel IOMMU specific bind_pasid_table
> function. This will turn on nesting mode and matching translation type.
>
> The unbind operation restores default context mapping.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> ---
> drivers/iommu/intel-iommu.c | 117
> ++++++++++++++++++++++++++++++++++++++++++
> include/linux/dma_remapping.h | 1 +
> 2 files changed, 118 insertions(+)
>
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index
> 209d99a..7ae569c 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -5200,6 +5200,7 @@ static void intel_iommu_put_resv_regions(struct device
> *dev,
>
> #ifdef CONFIG_INTEL_IOMMU_SVM
> #define MAX_NR_PASID_BITS (20)
> +#define MIN_NR_PASID_BITS (5)
> static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu) {
> /*
> @@ -5326,6 +5327,118 @@ struct intel_iommu
> *intel_svm_device_to_iommu(struct device *dev)
>
> return iommu;
> }
> +
> +static int intel_iommu_bind_pasid_table(struct iommu_domain *domain,
> + struct device *dev, struct pasid_table_config *pasidt_binfo) {
> + struct intel_iommu *iommu;
> + struct context_entry *context;
> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> + struct device_domain_info *info;
> + struct pci_dev *pdev;
> + u8 bus, devfn, host_table_pasid_bits;
> + u16 did, sid;
> + int ret = 0;
> + unsigned long flags;
> + u64 ctx_lo;
> +
> + iommu = device_to_iommu(dev, &bus, &devfn);
> + if (!iommu)
> + return -ENODEV;
> + /* VT-d spec 9.4 says pasid table size is encoded as 2^(x+5) */
> + host_table_pasid_bits = intel_iommu_get_pts(iommu) +
> MIN_NR_PASID_BITS;
> + if (!pasidt_binfo || pasidt_binfo->pasid_bits > host_table_pasid_bits ||
> + pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) {
> + pr_err("Invalid gPASID bits %d, host range %d - %d\n",
> + pasidt_binfo->pasid_bits,
> + MIN_NR_PASID_BITS, host_table_pasid_bits);
> + return -ERANGE;
> + }
> +
> + pdev = to_pci_dev(dev);
> + if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
> + return -EINVAL;
> + sid = PCI_DEVID(bus, devfn);
> +
> + info = dev->archdata.iommu;
> + if (!info || !info->pasid_supported) {
> + dev_err(dev, "No PASID support\n");
> + ret = -EINVAL;
> + goto out;
> + }
> + if (!info->pasid_enabled) {
> + ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
> + if (ret)
> + goto out;
> + }
> + if (!device_context_mapped(iommu, bus, devfn)) {
> + pr_warn("ctx not mapped for bus devfn %x:%x\n", bus, devfn);
> + ret = -EINVAL;
> + goto out;
> + }
[Liu, Yi L] This checks whether the context entry is present. If it is, the lookup in the
following 6 lines should always succeed. Perhaps this check could be merged with the following 6 lines.
> + spin_lock_irqsave(&iommu->lock, flags);
> + context = iommu_context_addr(iommu, bus, devfn, 0);
> + if (!context) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
Regards,
Yi L
^ permalink raw reply [flat|nested] 109+ messages in thread
[parent not found: <A2975661238FB949B60364EF0F2C257439AF6CDD-0J0gbvR4kTg/UvCtAeCM4rfspsVTdybXVpNB7YpNyf8@public.gmane.org>]
* Re: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
2017-10-12 11:12 ` Liu, Yi L
@ 2017-10-12 17:38 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-12 17:38 UTC (permalink / raw)
To: Liu, Yi L
Cc: Lan, Tianyu, Yi L, Greg Kroah-Hartman, Wysocki, Rafael J, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
David Woodhouse
On Thu, 12 Oct 2017 11:12:46 +0000
"Liu, Yi L" <yi.l.liu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> > From: Jacob Pan [mailto:jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org]
> > Sent: Friday, October 6, 2017 7:04 AM
> > To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org; LKML
> > <linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>; Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>;
> > David Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>; Greg Kroah-Hartman
> > <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>; Wysocki, Rafael J
> > <rafael.j.wysocki-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Jean-Philippe Brucker <jean-
> > philippe.brucker-5wv7dgnIgG8@public.gmane.org> Cc: Liu, Yi L <yi.l.liu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Lan,
> > Tianyu <tianyu.lan-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Tian, Kevin <kevin.tian-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>;
> > Raj, Ashok <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Alex Williamson
> > <alex.williamson-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>; Jacob Pan
> > <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>; Liu; Yi L
> > <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org> Subject: [PATCH v2 02/16] iommu/vt-d:
> > add bind_pasid_table function
> >
> > Add Intel VT-d ops to the generic iommu_bind_pasid_table API
> > functions.
> >
> > The primary use case is for direct assignment of SVM capable
> > device. Originated from emulated IOMMU in the guest, the request
> > goes through many layers (e.g. VFIO). Upon calling host IOMMU
> > driver, caller passes guest PASID table pointer (GPA) and size.
> >
> > Device context table entry is modified by Intel IOMMU specific
> > bind_pasid_table function. This will turn on nesting mode and
> > matching translation type.
> >
> > The unbind operation restores default context mapping.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > Signed-off-by: Liu, Yi L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > ---
> > drivers/iommu/intel-iommu.c | 117
> > ++++++++++++++++++++++++++++++++++++++++++
> > include/linux/dma_remapping.h | 1 +
> > 2 files changed, 118 insertions(+)
> >
> > diff --git a/drivers/iommu/intel-iommu.c
> > b/drivers/iommu/intel-iommu.c index 209d99a..7ae569c 100644
> > --- a/drivers/iommu/intel-iommu.c
> > +++ b/drivers/iommu/intel-iommu.c
> > @@ -5200,6 +5200,7 @@ static void
> > intel_iommu_put_resv_regions(struct device *dev,
> >
> > #ifdef CONFIG_INTEL_IOMMU_SVM
> > #define MAX_NR_PASID_BITS (20)
> > +#define MIN_NR_PASID_BITS (5)
> > static inline unsigned long intel_iommu_get_pts(struct intel_iommu
> > *iommu) { /*
> > @@ -5326,6 +5327,118 @@ struct intel_iommu
> > *intel_svm_device_to_iommu(struct device *dev)
> >
> > return iommu;
> > }
> > +
> > +static int intel_iommu_bind_pasid_table(struct iommu_domain
> > *domain,
> > + struct device *dev, struct pasid_table_config
> > *pasidt_binfo) {
> > + struct intel_iommu *iommu;
> > + struct context_entry *context;
> > + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> > + struct device_domain_info *info;
> > + struct pci_dev *pdev;
> > + u8 bus, devfn, host_table_pasid_bits;
> > + u16 did, sid;
> > + int ret = 0;
> > + unsigned long flags;
> > + u64 ctx_lo;
> > +
> > + iommu = device_to_iommu(dev, &bus, &devfn);
> > + if (!iommu)
> > + return -ENODEV;
> > + /* VT-d spec 9.4 says pasid table size is encoded as
> > 2^(x+5) */
> > + host_table_pasid_bits = intel_iommu_get_pts(iommu) +
> > MIN_NR_PASID_BITS;
> > + if (!pasidt_binfo || pasidt_binfo->pasid_bits >
> > host_table_pasid_bits ||
> > + pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) {
> > + pr_err("Invalid gPASID bits %d, host range %d -
> > %d\n",
> > + pasidt_binfo->pasid_bits,
> > + MIN_NR_PASID_BITS, host_table_pasid_bits);
> > + return -ERANGE;
> > + }
> > +
> > + pdev = to_pci_dev(dev);
> > + if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
> > + return -EINVAL;
> > + sid = PCI_DEVID(bus, devfn);
> > +
> > + info = dev->archdata.iommu;
> > + if (!info || !info->pasid_supported) {
> > + dev_err(dev, "No PASID support\n");
> > + ret = -EINVAL;
> > + goto out;
> > + }
> > + if (!info->pasid_enabled) {
> > + ret = pci_enable_pasid(pdev, info->pasid_supported
> > & ~1);
> > + if (ret)
> > + goto out;
> > + }
> > + if (!device_context_mapped(iommu, bus, devfn)) {
> > + pr_warn("ctx not mapped for bus devfn %x:%x\n",
> > bus, devfn);
> > + ret = -EINVAL;
> > + goto out;
> > + }
>
> [Liu, Yi L] This is checking whether ctx is present. So if it is
> true, then the following 6 line should be always true. Perhaps, a
> merge could be done here with the following 6 lines.
>
Good point. I can do the present check below, so there is no need to take
the lock twice.
> > + spin_lock_irqsave(&iommu->lock, flags);
> > + context = iommu_context_addr(iommu, bus, devfn, 0);
> > + if (!context) {
> > + ret = -EINVAL;
> > + goto out_unlock;
> > + }
> > +
>
> Regards,
> Yi L
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function
@ 2017-10-12 17:38 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-12 17:38 UTC (permalink / raw)
To: Liu, Yi L
Cc: iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Wysocki, Rafael J,
Jean-Philippe Brucker, Lan, Tianyu, Tian, Kevin, Raj, Ashok,
Alex Williamson, Yi L, jacob.jun.pan
On Thu, 12 Oct 2017 11:12:46 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:
> > From: Jacob Pan [mailto:jacob.jun.pan@linux.intel.com]
> > Sent: Friday, October 6, 2017 7:04 AM
> > To: iommu@lists.linux-foundation.org; LKML
> > <linux-kernel@vger.kernel.org>; Joerg Roedel <joro@8bytes.org>;
> > David Woodhouse <dwmw2@infradead.org>; Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org>; Wysocki, Rafael J
> > <rafael.j.wysocki@intel.com>; Jean-Philippe Brucker <jean-
> > philippe.brucker@arm.com> Cc: Liu, Yi L <yi.l.liu@intel.com>; Lan,
> > Tianyu <tianyu.lan@intel.com>; Tian, Kevin <kevin.tian@intel.com>;
> > Raj, Ashok <ashok.raj@intel.com>; Alex Williamson
> > <alex.williamson@redhat.com>; Jacob Pan
> > <jacob.jun.pan@linux.intel.com>; Liu; Yi L
> > <yi.l.liu@linux.intel.com> Subject: [PATCH v2 02/16] iommu/vt-d:
> > add bind_pasid_table function
> >
> > Add Intel VT-d ops to the generic iommu_bind_pasid_table API
> > functions.
> >
> > The primary use case is for direct assignment of SVM capable
> > device. Originated from emulated IOMMU in the guest, the request
> > goes through many layers (e.g. VFIO). Upon calling host IOMMU
> > driver, caller passes guest PASID table pointer (GPA) and size.
> >
> > Device context table entry is modified by Intel IOMMU specific
> > bind_pasid_table function. This will turn on nesting mode and
> > matching translation type.
> >
> > The unbind operation restores default context mapping.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
> > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > ---
> > drivers/iommu/intel-iommu.c | 117
> > ++++++++++++++++++++++++++++++++++++++++++
> > include/linux/dma_remapping.h | 1 +
> > 2 files changed, 118 insertions(+)
> >
> > diff --git a/drivers/iommu/intel-iommu.c
> > b/drivers/iommu/intel-iommu.c index 209d99a..7ae569c 100644
> > --- a/drivers/iommu/intel-iommu.c
> > +++ b/drivers/iommu/intel-iommu.c
> > @@ -5200,6 +5200,7 @@ static void
> > intel_iommu_put_resv_regions(struct device *dev,
> >
> > #ifdef CONFIG_INTEL_IOMMU_SVM
> > #define MAX_NR_PASID_BITS (20)
> > +#define MIN_NR_PASID_BITS (5)
> > static inline unsigned long intel_iommu_get_pts(struct intel_iommu
> > *iommu) { /*
> > @@ -5326,6 +5327,118 @@ struct intel_iommu
> > *intel_svm_device_to_iommu(struct device *dev)
> >
> > return iommu;
> > }
> > +
> > +static int intel_iommu_bind_pasid_table(struct iommu_domain
> > *domain,
> > + struct device *dev, struct pasid_table_config
> > *pasidt_binfo) {
> > + struct intel_iommu *iommu;
> > + struct context_entry *context;
> > + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> > + struct device_domain_info *info;
> > + struct pci_dev *pdev;
> > + u8 bus, devfn, host_table_pasid_bits;
> > + u16 did, sid;
> > + int ret = 0;
> > + unsigned long flags;
> > + u64 ctx_lo;
> > +
> > + iommu = device_to_iommu(dev, &bus, &devfn);
> > + if (!iommu)
> > + return -ENODEV;
> > + /* VT-d spec 9.4 says pasid table size is encoded as
> > 2^(x+5) */
> > + host_table_pasid_bits = intel_iommu_get_pts(iommu) +
> > MIN_NR_PASID_BITS;
> > + if (!pasidt_binfo || pasidt_binfo->pasid_bits >
> > host_table_pasid_bits ||
> > + pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) {
> > + pr_err("Invalid gPASID bits %d, host range %d -
> > %d\n",
> > + pasidt_binfo->pasid_bits,
> > + MIN_NR_PASID_BITS, host_table_pasid_bits);
> > + return -ERANGE;
> > + }
> > +
> > + pdev = to_pci_dev(dev);
> > + if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
> > + return -EINVAL;
> > + sid = PCI_DEVID(bus, devfn);
> > +
> > + info = dev->archdata.iommu;
> > + if (!info || !info->pasid_supported) {
> > + dev_err(dev, "No PASID support\n");
> > + ret = -EINVAL;
> > + goto out;
> > + }
> > + if (!info->pasid_enabled) {
> > + ret = pci_enable_pasid(pdev, info->pasid_supported
> > & ~1);
> > + if (ret)
> > + goto out;
> > + }
> > + if (!device_context_mapped(iommu, bus, devfn)) {
> > + pr_warn("ctx not mapped for bus devfn %x:%x\n",
> > bus, devfn);
> > + ret = -EINVAL;
> > + goto out;
> > + }
>
> [Liu, Yi L] This is checking whether ctx is present. So if it is
> true, then the following 6 line should be always true. Perhaps, a
> merge could be done here with the following 6 lines.
>
Good point. I can do the present check below, so there is no need to take
the lock twice.
> > + spin_lock_irqsave(&iommu->lock, flags);
> > + context = iommu_context_addr(iommu, bus, devfn, 0);
> > + if (!context) {
> > + ret = -EINVAL;
> > + goto out_unlock;
> > + }
> > +
>
> Regards,
> Yi L
^ permalink raw reply [flat|nested] 109+ messages in thread
* [PATCH v2 04/16] iommu/vt-d: support flushing more TLB types
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
---
drivers/iommu/dmar.c | 53 ++++++++++++++++++++++++++++++++++++++++++---
drivers/iommu/intel-iommu.c | 3 ++-
include/linux/intel-iommu.h | 10 +++++++--
3 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 57c920c..2fbff8b 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1336,11 +1336,25 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
qi_submit_sync(&desc, iommu);
}
-void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
- u64 addr, unsigned mask)
+void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr, u32 pasid,
+ unsigned int size_order, u64 granu, bool global)
{
struct qi_desc desc;
+ desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
+ desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) |
+ QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order);
+ qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u16 qdep, u64 addr, unsigned mask)
+{
+ struct qi_desc desc;
+
+ pr_debug_ratelimited("%s: sid %d, pfsid %d, qdep %d, addr %llx, mask %d\n",
+ __func__, sid, pfsid, qdep, addr, mask);
if (mask) {
BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
@@ -1352,7 +1366,40 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
qdep = 0;
desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
- QI_DIOTLB_TYPE;
+ QI_DIOTLB_TYPE | QI_DEV_IOTLB_SID(pfsid);
+
+ qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u32 pasid, u16 qdep, u64 addr, unsigned size)
+{
+ struct qi_desc desc;
+
+ desc.low = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
+ QI_DEV_EIOTLB_PFSID(pfsid);
+
+ /* If S bit is 0, we only flush a single page. If S bit is set,
+ * The least significant zero bit indicates the size. VT-d spec
+ * 6.5.2.6
+ */
+ if (!size)
+ desc.high = QI_DEV_EIOTLB_ADDR(addr) & ~QI_DEV_EIOTLB_SIZE;
+ else {
+ unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size);
+
+ desc.high = QI_DEV_EIOTLB_ADDR(addr & ~mask) | QI_DEV_EIOTLB_SIZE;
+ }
+ qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid)
+{
+ struct qi_desc desc;
+
+ desc.high = 0;
+ desc.low = QI_PC_TYPE | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_PASID(pasid);
qi_submit_sync(&desc, iommu);
}
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 7ae569c..6832f73 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1567,7 +1567,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
sid = info->bus << 8 | info->devfn;
qdep = info->ats_qdep;
- qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+ qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+ qdep, addr, mask);
}
spin_unlock_irqrestore(&device_domain_lock, flags);
}
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 485a5b4..e42d317 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -305,6 +305,7 @@ enum {
#define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32)
#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16)
#define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4)
+#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
#define QI_DEV_EIOTLB_MAX_INVS 32
#define QI_PGRP_IDX(idx) (((u64)(idx)) << 55)
@@ -450,8 +451,13 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
u8 fm, u64 type);
extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
unsigned int size_order, u64 type);
-extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
- u64 addr, unsigned mask);
+extern void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ u32 pasid, unsigned int size_order, u64 type, bool global);
+extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u16 qdep, u64 addr, unsigned mask);
+extern void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u32 pasid, u16 qdep, u64 addr, unsigned size);
+extern void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid);
extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 04/16] iommu/vt-d: support flushing more TLB types
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
drivers/iommu/dmar.c | 53 ++++++++++++++++++++++++++++++++++++++++++---
drivers/iommu/intel-iommu.c | 3 ++-
include/linux/intel-iommu.h | 10 +++++++--
3 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 57c920c..2fbff8b 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1336,11 +1336,25 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
qi_submit_sync(&desc, iommu);
}
-void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
- u64 addr, unsigned mask)
+void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr, u32 pasid,
+ unsigned int size_order, u64 granu, bool global)
{
struct qi_desc desc;
+ desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
+ desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) |
+ QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order);
+ qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u16 qdep, u64 addr, unsigned mask)
+{
+ struct qi_desc desc;
+
+ pr_debug_ratelimited("%s: sid %d, pfsid %d, qdep %d, addr %llx, mask %d\n",
+ __func__, sid, pfsid, qdep, addr, mask);
if (mask) {
BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
@@ -1352,7 +1366,40 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
qdep = 0;
desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
- QI_DIOTLB_TYPE;
+ QI_DIOTLB_TYPE | QI_DEV_IOTLB_SID(pfsid);
+
+ qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u32 pasid, u16 qdep, u64 addr, unsigned size)
+{
+ struct qi_desc desc;
+
+ desc.low = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
+ QI_DEV_EIOTLB_PFSID(pfsid);
+
+ /* If S bit is 0, we only flush a single page. If S bit is set,
+ * The least significant zero bit indicates the size. VT-d spec
+ * 6.5.2.6
+ */
+ if (!size)
+ desc.high = QI_DEV_EIOTLB_ADDR(addr) & ~QI_DEV_EIOTLB_SIZE;
+ else {
+ unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size);
+
+ desc.high = QI_DEV_EIOTLB_ADDR(addr & ~mask) | QI_DEV_EIOTLB_SIZE;
+ }
+ qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid)
+{
+ struct qi_desc desc;
+
+ desc.high = 0;
+ desc.low = QI_PC_TYPE | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_PASID(pasid);
qi_submit_sync(&desc, iommu);
}
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 7ae569c..6832f73 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1567,7 +1567,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
sid = info->bus << 8 | info->devfn;
qdep = info->ats_qdep;
- qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+ qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+ qdep, addr, mask);
}
spin_unlock_irqrestore(&device_domain_lock, flags);
}
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 485a5b4..e42d317 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -305,6 +305,7 @@ enum {
#define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32)
#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16)
#define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4)
+#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
#define QI_DEV_EIOTLB_MAX_INVS 32
#define QI_PGRP_IDX(idx) (((u64)(idx)) << 55)
@@ -450,8 +451,13 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
u8 fm, u64 type);
extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
unsigned int size_order, u64 type);
-extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
- u64 addr, unsigned mask);
+extern void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ u32 pasid, unsigned int size_order, u64 type, bool global);
+extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u16 qdep, u64 addr, unsigned mask);
+extern void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u32 pasid, u16 qdep, u64 addr, unsigned size);
+extern void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid);
extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* Re: [v2,04/16] iommu/vt-d: support flushing more TLB types
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-26 13:02 ` Lukoshkov, Maksim
-1 siblings, 0 replies; 109+ messages in thread
From: Lukoshkov, Maksim @ 2017-10-26 13:02 UTC (permalink / raw)
To: Jacob Pan, iommu, LKML, Joerg Roedel, David Woodhouse,
Greg Kroah-Hartman, Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson
On 10/6/2017 00:03, Jacob Pan wrote:
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
> drivers/iommu/dmar.c | 53 ++++++++++++++++++++++++++++++++++++++++++---
> drivers/iommu/intel-iommu.c | 3 ++-
> include/linux/intel-iommu.h | 10 +++++++--
> 3 files changed, 60 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
> index 57c920c..2fbff8b 100644
> --- a/drivers/iommu/dmar.c
> +++ b/drivers/iommu/dmar.c
> @@ -1336,11 +1336,25 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> qi_submit_sync(&desc, iommu);
> }
>
> -void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> - u64 addr, unsigned mask)
> +void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr, u32 pasid,
> + unsigned int size_order, u64 granu, bool global)
> {
> struct qi_desc desc;
>
> + desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
> + QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
> + desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) |
> + QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order);
> + qi_submit_sync(&desc, iommu);
> +}
> +
> +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u16 qdep, u64 addr, unsigned mask)
> +{
> + struct qi_desc desc;
> +
> + pr_debug_ratelimited("%s: sid %d, pfsid %d, qdep %d, addr %llx, mask %d\n",
> + __func__, sid, pfsid, qdep, addr, mask);
> if (mask) {
> BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
> addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
> @@ -1352,7 +1366,40 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> qdep = 0;
>
> desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
> - QI_DIOTLB_TYPE;
> + QI_DIOTLB_TYPE | QI_DEV_IOTLB_SID(pfsid);
So this change just combining sid and pfsid together, i.e. (sid |
pfsid)? What if both of them are not zero?
> +
> + qi_submit_sync(&desc, iommu);
> +}
> +
> +void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u32 pasid, u16 qdep, u64 addr, unsigned size)
> +{
> + struct qi_desc desc;
> +
> + desc.low = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
> + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
> + QI_DEV_EIOTLB_PFSID(pfsid);
> +
> + /* If S bit is 0, we only flush a single page. If S bit is set,
> + * The least significant zero bit indicates the size. VT-d spec
> + * 6.5.2.6
> + */
> + if (!size)
> + desc.high = QI_DEV_EIOTLB_ADDR(addr) & ~QI_DEV_EIOTLB_SIZE;
> + else {
> + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size);
> +
> + desc.high = QI_DEV_EIOTLB_ADDR(addr & ~mask) | QI_DEV_EIOTLB_SIZE;
> + }
> + qi_submit_sync(&desc, iommu);
> +}
> +
> +void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid)
> +{
> + struct qi_desc desc;
> +
> + desc.high = 0;
> + desc.low = QI_PC_TYPE | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_PASID(pasid);
>
> qi_submit_sync(&desc, iommu);
> }
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index 7ae569c..6832f73 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -1567,7 +1567,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
>
> sid = info->bus << 8 | info->devfn;
> qdep = info->ats_qdep;
> - qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
> + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
> + qdep, addr, mask);
> }
> spin_unlock_irqrestore(&device_domain_lock, flags);
> }
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 485a5b4..e42d317 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -305,6 +305,7 @@ enum {
> #define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32)
> #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16)
> #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4)
> +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
> #define QI_DEV_EIOTLB_MAX_INVS 32
>
> #define QI_PGRP_IDX(idx) (((u64)(idx)) << 55)
> @@ -450,8 +451,13 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
> u8 fm, u64 type);
> extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> unsigned int size_order, u64 type);
> -extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> - u64 addr, unsigned mask);
> +extern void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> + u32 pasid, unsigned int size_order, u64 type, bool global);
> +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u16 qdep, u64 addr, unsigned mask);
> +extern void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u32 pasid, u16 qdep, u64 addr, unsigned size);
> +extern void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid);
>
> extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
>
>
--------------------------------------------------------------
Intel Research and Development Ireland Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
This e-mail and any attachments may contain confidential material for the sole
use of the intended recipient(s). Any review or distribution by others is
strictly prohibited. If you are not the intended recipient, please contact the
sender and delete all copies.
^ permalink raw reply [flat|nested] 109+ messages in thread* Re: [v2,04/16] iommu/vt-d: support flushing more TLB types
@ 2017-10-26 13:02 ` Lukoshkov, Maksim
0 siblings, 0 replies; 109+ messages in thread
From: Lukoshkov, Maksim @ 2017-10-26 13:02 UTC (permalink / raw)
To: Jacob Pan, iommu, LKML, Joerg Roedel, David Woodhouse,
Greg Kroah-Hartman, Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson
On 10/6/2017 00:03, Jacob Pan wrote:
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
> drivers/iommu/dmar.c | 53 ++++++++++++++++++++++++++++++++++++++++++---
> drivers/iommu/intel-iommu.c | 3 ++-
> include/linux/intel-iommu.h | 10 +++++++--
> 3 files changed, 60 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
> index 57c920c..2fbff8b 100644
> --- a/drivers/iommu/dmar.c
> +++ b/drivers/iommu/dmar.c
> @@ -1336,11 +1336,25 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> qi_submit_sync(&desc, iommu);
> }
>
> -void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> - u64 addr, unsigned mask)
> +void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr, u32 pasid,
> + unsigned int size_order, u64 granu, bool global)
> {
> struct qi_desc desc;
>
> + desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
> + QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
> + desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) |
> + QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order);
> + qi_submit_sync(&desc, iommu);
> +}
> +
> +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u16 qdep, u64 addr, unsigned mask)
> +{
> + struct qi_desc desc;
> +
> + pr_debug_ratelimited("%s: sid %d, pfsid %d, qdep %d, addr %llx, mask %d\n",
> + __func__, sid, pfsid, qdep, addr, mask);
> if (mask) {
> BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
> addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
> @@ -1352,7 +1366,40 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> qdep = 0;
>
> desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
> - QI_DIOTLB_TYPE;
> + QI_DIOTLB_TYPE | QI_DEV_IOTLB_SID(pfsid);
So this change just combining sid and pfsid together, i.e. (sid |
pfsid)? What if both of them are not zero?
> +
> + qi_submit_sync(&desc, iommu);
> +}
> +
> +void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u32 pasid, u16 qdep, u64 addr, unsigned size)
> +{
> + struct qi_desc desc;
> +
> + desc.low = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
> + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
> + QI_DEV_EIOTLB_PFSID(pfsid);
> +
> + /* If S bit is 0, we only flush a single page. If S bit is set,
> + * The least significant zero bit indicates the size. VT-d spec
> + * 6.5.2.6
> + */
> + if (!size)
> + desc.high = QI_DEV_EIOTLB_ADDR(addr) & ~QI_DEV_EIOTLB_SIZE;
> + else {
> + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size);
> +
> + desc.high = QI_DEV_EIOTLB_ADDR(addr & ~mask) | QI_DEV_EIOTLB_SIZE;
> + }
> + qi_submit_sync(&desc, iommu);
> +}
> +
> +void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid)
> +{
> + struct qi_desc desc;
> +
> + desc.high = 0;
> + desc.low = QI_PC_TYPE | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_PASID(pasid);
>
> qi_submit_sync(&desc, iommu);
> }
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index 7ae569c..6832f73 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -1567,7 +1567,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
>
> sid = info->bus << 8 | info->devfn;
> qdep = info->ats_qdep;
> - qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
> + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
> + qdep, addr, mask);
> }
> spin_unlock_irqrestore(&device_domain_lock, flags);
> }
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 485a5b4..e42d317 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -305,6 +305,7 @@ enum {
> #define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32)
> #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16)
> #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4)
> +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
> #define QI_DEV_EIOTLB_MAX_INVS 32
>
> #define QI_PGRP_IDX(idx) (((u64)(idx)) << 55)
> @@ -450,8 +451,13 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
> u8 fm, u64 type);
> extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> unsigned int size_order, u64 type);
> -extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> - u64 addr, unsigned mask);
> +extern void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> + u32 pasid, unsigned int size_order, u64 type, bool global);
> +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u16 qdep, u64 addr, unsigned mask);
> +extern void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u32 pasid, u16 qdep, u64 addr, unsigned size);
> +extern void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid);
>
> extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
>
>
--------------------------------------------------------------
Intel Research and Development Ireland Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
This e-mail and any attachments may contain confidential material for the sole
use of the intended recipient(s). Any review or distribution by others is
strictly prohibited. If you are not the intended recipient, please contact the
sender and delete all copies.
^ permalink raw reply [flat|nested] 109+ messages in thread[parent not found: <c7d32ea1-fc82-fdef-c275-d4e29d428094-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>]
* Re: [v2,04/16] iommu/vt-d: support flushing more TLB types
2017-10-26 13:02 ` Lukoshkov, Maksim
@ 2017-10-31 20:39 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-31 20:39 UTC (permalink / raw)
To: Lukoshkov, Maksim
Cc: Lan Tianyu, Greg Kroah-Hartman, Rafael Wysocki, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
David Woodhouse
On Thu, 26 Oct 2017 14:02:13 +0100
"Lukoshkov, Maksim" <maksim.lukoshkov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> On 10/6/2017 00:03, Jacob Pan wrote:
> > Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > ---
> > drivers/iommu/dmar.c | 53
> > ++++++++++++++++++++++++++++++++++++++++++---
> > drivers/iommu/intel-iommu.c | 3 ++- include/linux/intel-iommu.h |
> > 10 +++++++-- 3 files changed, 60 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
> > index 57c920c..2fbff8b 100644
> > --- a/drivers/iommu/dmar.c
> > +++ b/drivers/iommu/dmar.c
> > @@ -1336,11 +1336,25 @@ void qi_flush_iotlb(struct intel_iommu
> > *iommu, u16 did, u64 addr, qi_submit_sync(&desc, iommu);
> > }
> >
> > -void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16
> > qdep,
> > - u64 addr, unsigned mask)
> > +void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> > u32 pasid,
> > + unsigned int size_order, u64 granu, bool global)
> > {
> > struct qi_desc desc;
> >
> > + desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
> > + QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
> > + desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) |
> > + QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order);
> > + qi_submit_sync(&desc, iommu);
> > +}
> > +
> > +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16
> > pfsid,
> > + u16 qdep, u64 addr, unsigned mask)
> > +{
> > + struct qi_desc desc;
> > +
> > + pr_debug_ratelimited("%s: sid %d, pfsid %d, qdep %d, addr
> > %llx, mask %d\n",
> > + __func__, sid, pfsid, qdep, addr, mask);
> > if (mask) {
> > BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) -
> > 1)); addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
> > @@ -1352,7 +1366,40 @@ void qi_flush_dev_iotlb(struct intel_iommu
> > *iommu, u16 sid, u16 qdep, qdep = 0;
> >
> > desc.low = QI_DEV_IOTLB_SID(sid) |
> > QI_DEV_IOTLB_QDEP(qdep) |
> > - QI_DIOTLB_TYPE;
> > + QI_DIOTLB_TYPE | QI_DEV_IOTLB_SID(pfsid);
> So this change just combining sid and pfsid together, i.e. (sid |
> pfsid)? What if both of them are not zero?
I made a mistake here, it should be:
QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
Thanks for the catch.
Jacob
> [...]
>
[Jacob Pan]
^ permalink raw reply [flat|nested] 109+ messages in thread* Re: [v2,04/16] iommu/vt-d: support flushing more TLB types
@ 2017-10-31 20:39 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-31 20:39 UTC (permalink / raw)
To: Lukoshkov, Maksim
Cc: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker, Liu, Yi L, Lan Tianyu,
Tian, Kevin, Raj Ashok, Alex Williamson, jacob.jun.pan
On Thu, 26 Oct 2017 14:02:13 +0100
"Lukoshkov, Maksim" <maksim.lukoshkov@intel.com> wrote:
> On 10/6/2017 00:03, Jacob Pan wrote:
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> > drivers/iommu/dmar.c | 53
> > ++++++++++++++++++++++++++++++++++++++++++---
> > drivers/iommu/intel-iommu.c | 3 ++- include/linux/intel-iommu.h |
> > 10 +++++++-- 3 files changed, 60 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
> > index 57c920c..2fbff8b 100644
> > --- a/drivers/iommu/dmar.c
> > +++ b/drivers/iommu/dmar.c
> > @@ -1336,11 +1336,25 @@ void qi_flush_iotlb(struct intel_iommu
> > *iommu, u16 did, u64 addr, qi_submit_sync(&desc, iommu);
> > }
> >
> > -void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16
> > qdep,
> > - u64 addr, unsigned mask)
> > +void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> > u32 pasid,
> > + unsigned int size_order, u64 granu, bool global)
> > {
> > struct qi_desc desc;
> >
> > + desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
> > + QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
> > + desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) |
> > + QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order);
> > + qi_submit_sync(&desc, iommu);
> > +}
> > +
> > +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16
> > pfsid,
> > + u16 qdep, u64 addr, unsigned mask)
> > +{
> > + struct qi_desc desc;
> > +
> > + pr_debug_ratelimited("%s: sid %d, pfsid %d, qdep %d, addr
> > %llx, mask %d\n",
> > + __func__, sid, pfsid, qdep, addr, mask);
> > if (mask) {
> > BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) -
> > 1)); addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
> > @@ -1352,7 +1366,40 @@ void qi_flush_dev_iotlb(struct intel_iommu
> > *iommu, u16 sid, u16 qdep, qdep = 0;
> >
> > desc.low = QI_DEV_IOTLB_SID(sid) |
> > QI_DEV_IOTLB_QDEP(qdep) |
> > - QI_DIOTLB_TYPE;
> > + QI_DIOTLB_TYPE | QI_DEV_IOTLB_SID(pfsid);
> So this change just combining sid and pfsid together, i.e. (sid |
> pfsid)? What if both of them are not zero?
I made a mistake here, it should be:
QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
Thanks for the catch.
Jacob
> [...]
>
[Jacob Pan]
^ permalink raw reply [flat|nested] 109+ messages in thread
* [PATCH v2 05/16] iommu/vt-d: add iommu invalidate function
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu, Yi L, Liu-i9wRM+HIrmnmtl4Z8vJ8Kg761KYD1DLY
This patch adds an Intel VT-d specific function to implement
the IOMMU passdown invalidate API.
The use case is for supporting caching structure invalidation
of assigned SVM capable devices. Emulated IOMMU exposes queue
invalidation capability and passes down all descriptors from the guest
to the physical IOMMU.
The assumption is that guest to host device ID mapping should be
resolved prior to calling the IOMMU driver. Based on the device handle,
the host IOMMU driver can replace certain fields before submitting to the
invalidation queue.
Signed-off-by: Liu, Yi L <yi.l.liu-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
drivers/iommu/intel-iommu.c | 148 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/intel-iommu.h | 10 +++
2 files changed, 158 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 6832f73..81e27eb 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5023,6 +5023,153 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
+/*
+ * 3D array for converting IOMMU generic type-granularity to VT-d granularity
+ * X indexed by enum iommu_inv_type
+ * Y indicates request without and with PASID
+ * Z indexed by enum iommu_inv_granularity
+ *
+ * For example, to find the VT-d granularity encoding for IOTLB type, DMA
+ * request with PASID, and page-selective granularity, the lookup indices are:
+ * [1][1][8], where
+ * 1: IOMMU_INV_TYPE_TLB
+ * 1: with PASID
+ * 8: IOMMU_INV_GRANU_PAGE_PASID
+ *
+ */
+const static u64 inv_type_granu_table[IOMMU_INV_NR_TYPE][2][IOMMU_INV_NR_GRANU] = {
+ /* extended dev IOTLBs, only global is valid */
+ {
+ {1}
+ },
+ /* IOTLB and EIOTLB */
+ {
+ {DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH, 0, DMA_TLB_PSI_FLUSH},
+ {0, 0, 0, 0, QI_GRAN_ALL_ALL, 0, QI_GRAN_NONG_ALL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}
+ },
+ /* PASID cache */
+ {
+ {0, 0, 0, 0, 1}
+ },
+ /* context cache */
+ {
+ {DMA_CCMD_GLOBAL_INVL, DMA_CCMD_DOMAIN_INVL, DMA_CCMD_DEVICE_INVL}
+ }
+};
+
+static inline int to_vtd_granularity(int type, int granu, int with_pasid, u64 *vtd_granu)
+{
+ if (type >= IOMMU_INV_NR_TYPE || granu >= IOMMU_INV_NR_GRANU || with_pasid > 1)
+ return -EINVAL;
+ *vtd_granu = inv_type_granu_table[type][with_pasid][granu];
+
+ return 0;
+}
+
+static int intel_iommu_invalidate(struct iommu_domain *domain,
+ struct device *dev, struct tlb_invalidate_info *inv_info)
+{
+ struct intel_iommu *iommu;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ struct pci_dev *pdev;
+ u16 did, sid, pfsid;
+ u8 bus, devfn;
+ int ret = 0;
+ u64 granu;
+ unsigned long flags;
+
+ if (!inv_info || !dmar_domain)
+ return -EINVAL;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (!dev || !dev_is_pci(dev))
+ return -ENODEV;
+
+ did = dmar_domain->iommu_did[iommu->seq_id];
+ sid = PCI_DEVID(bus, devfn);
+ ret = to_vtd_granularity(inv_info->hdr.type, inv_info->granularity,
+ !!(inv_info->flags & IOMMU_INVALIDATE_DMA_PASID), &granu);
+ if (ret) {
+ pr_err("Invalid range type %d, granu %d\n", inv_info->hdr.type,
+ inv_info->granularity);
+ return ret;
+ }
+
+ spin_lock(&iommu->lock);
+ spin_lock_irqsave(&device_domain_lock, flags);
+
+ switch (inv_info->hdr.type) {
+ case IOMMU_INV_TYPE_CONTEXT:
+ iommu->flush.flush_context(iommu, did, sid,
+ DMA_CCMD_MASK_NOBIT, granu);
+ break;
+ case IOMMU_INV_TYPE_TLB:
+ /* We need to deal with two scenarios:
+ * - IOTLB for request w/o PASID
+ * - extended IOTLB for request with PASID.
+ */
+ if (inv_info->size &&
+ (inv_info->addr & ((1 << (VTD_PAGE_SHIFT + inv_info->size)) - 1))) {
+ pr_err("Addr out of range, addr 0x%llx, size order %d\n",
+ inv_info->addr, inv_info->size);
+ ret = -ERANGE;
+ goto out_unlock;
+ }
+
+ if (inv_info->flags & IOMMU_INVALIDATE_DMA_PASID)
+ qi_flush_eiotlb(iommu, did, mm_to_dma_pfn(inv_info->addr),
+ inv_info->pasid,
+ inv_info->size, granu,
+ inv_info->flags & IOMMU_INVALIDATE_GLOBAL_PAGE);
+ else
+ qi_flush_iotlb(iommu, did, mm_to_dma_pfn(inv_info->addr),
+ inv_info->size, granu);
+ /* For SRIOV VF, invalidation of device IOTLB requires PFSID */
+ pdev = to_pci_dev(dev);
+ if (pdev && pdev->is_virtfn)
+ pfsid = PCI_DEVID(pdev->physfn->bus->number, pdev->physfn->devfn);
+ else
+ pfsid = sid;
+
+ /**
+ * Always flush device IOTLB if ATS is enabled since guest
+ * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
+ * down.
+ * TODO: check if device is VF, use PF ATS data if spec does not require
+ * VF to include all PF capabilities, VF qdep and VF ats_enabled.
+ */
+ info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
+ if (info && info->ats_enabled) {
+ if (inv_info->flags & IOMMU_INVALIDATE_NO_PASID)
+ qi_flush_dev_iotlb(iommu, sid, info->pfsid,
+ info->ats_qdep, inv_info->addr,
+ inv_info->size);
+ else
+ qi_flush_dev_eiotlb(iommu, sid, info->pfsid,
+ inv_info->pasid, info->ats_qdep,
+ inv_info->addr, inv_info->size);
+ }
+ break;
+ case IOMMU_INV_TYPE_PASID:
+ qi_flush_pasid(iommu, did, granu, inv_info->pasid);
+
+ break;
+ default:
+ dev_err(dev, "Unknown IOMMU invalidation type %d\n",
+ inv_info->hdr.type);
+ ret = -EINVAL;
+ }
+out_unlock:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+
static int intel_iommu_map(struct iommu_domain *domain,
unsigned long iova, phys_addr_t hpa,
size_t size, int iommu_prot)
@@ -5451,6 +5598,7 @@ const struct iommu_ops intel_iommu_ops = {
#ifdef CONFIG_INTEL_IOMMU_SVM
.bind_pasid_table = intel_iommu_bind_pasid_table,
.unbind_pasid_table = intel_iommu_unbind_pasid_table,
+ .invalidate = intel_iommu_invalidate,
#endif
.map = intel_iommu_map,
.unmap = intel_iommu_unmap,
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index e42d317..5c734bd 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -258,6 +258,10 @@ enum {
#define QI_PGRP_RESP_TYPE 0x9
#define QI_PSTRM_RESP_TYPE 0xa
+#define QI_DID(did) (((u64)did & 0xffff) << 16)
+#define QI_DID_MASK GENMASK(31, 16)
+#define QI_TYPE_MASK GENMASK(3, 0)
+
#define QI_IEC_SELECTIVE (((u64)1) << 4)
#define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32))
#define QI_IEC_IM(m) (((u64)(m & 0x1f) << 27))
@@ -495,6 +499,12 @@ extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_
extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev);
#endif
+struct intel_invalidate_data {
+ u16 sid;
+ u32 pasid;
+ struct qi_desc inv_desc;
+};
+
extern const struct attribute_group *intel_iommu_groups[];
#endif
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread* [PATCH v2 05/16] iommu/vt-d: add iommu invalidate function
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan, Liu, Yi L
This patch adds an Intel VT-d specific function to implement
the IOMMU passdown invalidate API.
The use case is for supporting caching structure invalidation
of assigned SVM capable devices. Emulated IOMMU exposes queue
invalidation capability and passes down all descriptors from the guest
to the physical IOMMU.
The assumption is that guest to host device ID mapping should be
resolved prior to calling the IOMMU driver. Based on the device handle,
the host IOMMU driver can replace certain fields before submitting to the
invalidation queue.
Signed-off-by: Liu, Yi L <yi.l.liu@linux.intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
---
drivers/iommu/intel-iommu.c | 148 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/intel-iommu.h | 10 +++
2 files changed, 158 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 6832f73..81e27eb 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5023,6 +5023,153 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
+/*
+ * 3D array for converting IOMMU generic type-granularity to VT-d granularity
+ * X indexed by enum iommu_inv_type
+ * Y indicates request without and with PASID
+ * Z indexed by enum iommu_inv_granularity
+ *
+ * For example, to find the VT-d granularity encoding for IOTLB type, DMA
+ * request with PASID, and page-selective granularity, the lookup indices are:
+ * [1][1][8], where
+ * 1: IOMMU_INV_TYPE_TLB
+ * 1: with PASID
+ * 8: IOMMU_INV_GRANU_PAGE_PASID
+ *
+ */
+const static u64 inv_type_granu_table[IOMMU_INV_NR_TYPE][2][IOMMU_INV_NR_GRANU] = {
+ /* extended dev IOTLBs, only global is valid */
+ {
+ {1}
+ },
+ /* IOTLB and EIOTLB */
+ {
+ {DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH, 0, DMA_TLB_PSI_FLUSH},
+ {0, 0, 0, 0, QI_GRAN_ALL_ALL, 0, QI_GRAN_NONG_ALL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}
+ },
+ /* PASID cache */
+ {
+ {0, 0, 0, 0, 1}
+ },
+ /* context cache */
+ {
+ {DMA_CCMD_GLOBAL_INVL, DMA_CCMD_DOMAIN_INVL, DMA_CCMD_DEVICE_INVL}
+ }
+};
+
+static inline int to_vtd_granularity(int type, int granu, int with_pasid, u64 *vtd_granu)
+{
+ if (type >= IOMMU_INV_NR_TYPE || granu >= IOMMU_INV_NR_GRANU || with_pasid > 1)
+ return -EINVAL;
+ *vtd_granu = inv_type_granu_table[type][with_pasid][granu];
+
+ return 0;
+}
+
+static int intel_iommu_invalidate(struct iommu_domain *domain,
+ struct device *dev, struct tlb_invalidate_info *inv_info)
+{
+ struct intel_iommu *iommu;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ struct pci_dev *pdev;
+ u16 did, sid, pfsid;
+ u8 bus, devfn;
+ int ret = 0;
+ u64 granu;
+ unsigned long flags;
+
+ if (!inv_info || !dmar_domain)
+ return -EINVAL;
+
+ iommu = device_to_iommu(dev, &bus, &devfn);
+ if (!iommu)
+ return -ENODEV;
+
+ if (!dev || !dev_is_pci(dev))
+ return -ENODEV;
+
+ did = dmar_domain->iommu_did[iommu->seq_id];
+ sid = PCI_DEVID(bus, devfn);
+ ret = to_vtd_granularity(inv_info->hdr.type, inv_info->granularity,
+ !!(inv_info->flags & IOMMU_INVALIDATE_DMA_PASID), &granu);
+ if (ret) {
+ pr_err("Invalid range type %d, granu %d\n", inv_info->hdr.type,
+ inv_info->granularity);
+ return ret;
+ }
+
+ spin_lock(&iommu->lock);
+ spin_lock_irqsave(&device_domain_lock, flags);
+
+ switch (inv_info->hdr.type) {
+ case IOMMU_INV_TYPE_CONTEXT:
+ iommu->flush.flush_context(iommu, did, sid,
+ DMA_CCMD_MASK_NOBIT, granu);
+ break;
+ case IOMMU_INV_TYPE_TLB:
+ /* We need to deal with two scenarios:
+ * - IOTLB for request w/o PASID
+ * - extended IOTLB for request with PASID.
+ */
+ if (inv_info->size &&
+ (inv_info->addr & ((1 << (VTD_PAGE_SHIFT + inv_info->size)) - 1))) {
+ pr_err("Addr out of range, addr 0x%llx, size order %d\n",
+ inv_info->addr, inv_info->size);
+ ret = -ERANGE;
+ goto out_unlock;
+ }
+
+ if (inv_info->flags & IOMMU_INVALIDATE_DMA_PASID)
+ qi_flush_eiotlb(iommu, did, mm_to_dma_pfn(inv_info->addr),
+ inv_info->pasid,
+ inv_info->size, granu,
+ inv_info->flags & IOMMU_INVALIDATE_GLOBAL_PAGE);
+ else
+ qi_flush_iotlb(iommu, did, mm_to_dma_pfn(inv_info->addr),
+ inv_info->size, granu);
+ /* For SRIOV VF, invalidation of device IOTLB requires PFSID */
+ pdev = to_pci_dev(dev);
+ if (pdev && pdev->is_virtfn)
+ pfsid = PCI_DEVID(pdev->physfn->bus->number, pdev->physfn->devfn);
+ else
+ pfsid = sid;
+
+ /**
+ * Always flush device IOTLB if ATS is enabled since guest
+ * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
+ * down.
+ * TODO: check if device is VF, use PF ATS data if spec does not require
+ * VF to include all PF capabilities, VF qdep and VF ats_enabled.
+ */
+ info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
+ if (info && info->ats_enabled) {
+ if (inv_info->flags & IOMMU_INVALIDATE_NO_PASID)
+ qi_flush_dev_iotlb(iommu, sid, info->pfsid,
+ info->ats_qdep, inv_info->addr,
+ inv_info->size);
+ else
+ qi_flush_dev_eiotlb(iommu, sid, info->pfsid,
+ inv_info->pasid, info->ats_qdep,
+ inv_info->addr, inv_info->size);
+ }
+ break;
+ case IOMMU_INV_TYPE_PASID:
+ qi_flush_pasid(iommu, did, granu, inv_info->pasid);
+
+ break;
+ default:
+ dev_err(dev, "Unknown IOMMU invalidation type %d\n",
+ inv_info->hdr.type);
+ ret = -EINVAL;
+ }
+out_unlock:
+ spin_unlock(&iommu->lock);
+ spin_unlock_irqrestore(&device_domain_lock, flags);
+
+ return ret;
+}
+
static int intel_iommu_map(struct iommu_domain *domain,
unsigned long iova, phys_addr_t hpa,
size_t size, int iommu_prot)
@@ -5451,6 +5598,7 @@ const struct iommu_ops intel_iommu_ops = {
#ifdef CONFIG_INTEL_IOMMU_SVM
.bind_pasid_table = intel_iommu_bind_pasid_table,
.unbind_pasid_table = intel_iommu_unbind_pasid_table,
+ .invalidate = intel_iommu_invalidate,
#endif
.map = intel_iommu_map,
.unmap = intel_iommu_unmap,
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index e42d317..5c734bd 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -258,6 +258,10 @@ enum {
#define QI_PGRP_RESP_TYPE 0x9
#define QI_PSTRM_RESP_TYPE 0xa
+#define QI_DID(did) (((u64)did & 0xffff) << 16)
+#define QI_DID_MASK GENMASK(31, 16)
+#define QI_TYPE_MASK GENMASK(3, 0)
+
#define QI_IEC_SELECTIVE (((u64)1) << 4)
#define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32))
#define QI_IEC_IM(m) (((u64)(m & 0x1f) << 27))
@@ -495,6 +499,12 @@ extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_
extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev);
#endif
+struct intel_invalidate_data {
+ u16 sid;
+ u32 pasid;
+ struct qi_desc inv_desc;
+};
+
extern const struct attribute_group *intel_iommu_groups[];
#endif
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 06/16] iommu/vt-d: move device_domain_info to header
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
Allow both intel-iommu.c and dmar.c to access device_domain_info.
Prepare for additional per-device arch data used in the TLB flush function.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
---
drivers/iommu/intel-iommu.c | 18 ------------------
include/linux/intel-iommu.h | 19 +++++++++++++++++++
2 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 81e27eb..e5a5209 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -416,24 +416,6 @@ struct dmar_domain {
iommu core */
};
-/* PCI domain-device relationship */
-struct device_domain_info {
- struct list_head link; /* link to domain siblings */
- struct list_head global; /* link to global list */
- u8 bus; /* PCI bus number */
- u8 devfn; /* PCI devfn number */
- u8 pasid_supported:3;
- u8 pasid_enabled:1;
- u8 pri_supported:1;
- u8 pri_enabled:1;
- u8 ats_supported:1;
- u8 ats_enabled:1;
- u8 ats_qdep;
- struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
- struct intel_iommu *iommu; /* IOMMU used by this device */
- struct dmar_domain *domain; /* pointer to domain */
-};
-
struct dmar_rmrr_unit {
struct list_head list; /* list of rmrr units */
struct acpi_dmar_header *hdr; /* ACPI header */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 5c734bd..f42b46c 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -436,6 +436,25 @@ struct intel_iommu {
u32 flags; /* Software defined flags */
};
+/* PCI domain-device relationship */
+struct device_domain_info {
+ struct list_head link; /* link to domain siblings */
+ struct list_head global; /* link to global list */
+ u8 bus; /* PCI bus number */
+ u8 devfn; /* PCI devfn number */
+ u8 pasid_supported:3;
+ u8 pasid_enabled:1;
+ u8 pri_supported:1;
+ u8 pri_enabled:1;
+ u8 ats_supported:1;
+ u8 ats_enabled:1;
+ u8 ats_qdep;
+ u64 fault_mask; /* selected IOMMU faults to be reported */
+ struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
+ struct intel_iommu *iommu; /* IOMMU used by this device */
+ struct dmar_domain *domain; /* pointer to domain */
+};
+
static inline void __iommu_flush_cache(
struct intel_iommu *iommu, void *addr, int size)
{
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 06/16] iommu/vt-d: move device_domain_info to header
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
Allow both intel-iommu.c and dmar.c to access device_domain_info.
Prepare for additional per-device arch data used in the TLB flush function.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
drivers/iommu/intel-iommu.c | 18 ------------------
include/linux/intel-iommu.h | 19 +++++++++++++++++++
2 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 81e27eb..e5a5209 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -416,24 +416,6 @@ struct dmar_domain {
iommu core */
};
-/* PCI domain-device relationship */
-struct device_domain_info {
- struct list_head link; /* link to domain siblings */
- struct list_head global; /* link to global list */
- u8 bus; /* PCI bus number */
- u8 devfn; /* PCI devfn number */
- u8 pasid_supported:3;
- u8 pasid_enabled:1;
- u8 pri_supported:1;
- u8 pri_enabled:1;
- u8 ats_supported:1;
- u8 ats_enabled:1;
- u8 ats_qdep;
- struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
- struct intel_iommu *iommu; /* IOMMU used by this device */
- struct dmar_domain *domain; /* pointer to domain */
-};
-
struct dmar_rmrr_unit {
struct list_head list; /* list of rmrr units */
struct acpi_dmar_header *hdr; /* ACPI header */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 5c734bd..f42b46c 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -436,6 +436,25 @@ struct intel_iommu {
u32 flags; /* Software defined flags */
};
+/* PCI domain-device relationship */
+struct device_domain_info {
+ struct list_head link; /* link to domain siblings */
+ struct list_head global; /* link to global list */
+ u8 bus; /* PCI bus number */
+ u8 devfn; /* PCI devfn number */
+ u8 pasid_supported:3;
+ u8 pasid_enabled:1;
+ u8 pri_supported:1;
+ u8 pri_enabled:1;
+ u8 ats_supported:1;
+ u8 ats_enabled:1;
+ u8 ats_qdep;
+ u64 fault_mask; /* selected IOMMU faults to be reported */
+ struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
+ struct intel_iommu *iommu; /* IOMMU used by this device */
+ struct dmar_domain *domain; /* pointer to domain */
+};
+
static inline void __iommu_flush_cache(
struct intel_iommu *iommu, void *addr, int size)
{
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 07/16] iommu/vt-d: assign PFSID in device TLB invalidation
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
When the device IOTLB of an SRIOV VF is invalidated, we need to provide
the PF source ID (PFSID) so that the IOMMU hardware can gauge the depth
of the invalidation queue, which is shared among VFs. This is needed
when the device invalidation throttle (DIT) capability is supported.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
---
drivers/iommu/intel-iommu.c | 13 +++++++++++++
include/linux/intel-iommu.h | 3 +++
2 files changed, 16 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e5a5209..ede0f2e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1484,6 +1484,19 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
return;
pdev = to_pci_dev(info->dev);
+ /* For IOMMU that supports device IOTLB throttling (DIT), we assign
+ * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
+ * queue depth at PF level. If DIT is not set, PFSID will be treated as
+ * reserved, which should be set to 0.
+ */
+ if (!ecap_dit(info->iommu->ecap))
+ info->pfsid = 0;
+ else if (pdev && pdev->is_virtfn) {
+ if (ecap_dit(info->iommu->ecap))
+ dev_warn(&pdev->dev, "SRIOV VF device IOTLB enabled without flow control\n");
+ info->pfsid = PCI_DEVID(pdev->physfn->bus->number, pdev->physfn->devfn);
+ } else
+ info->pfsid = PCI_DEVID(info->bus, info->devfn);
#ifdef CONFIG_INTEL_IOMMU_SVM
/* The PCIe spec, in its wisdom, declares that the behaviour of
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index f42b46c..c8ac5c6 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -112,6 +112,7 @@
* Extended Capability Register
*/
+#define ecap_dit(e) ((e >> 41) & 0x1)
#define ecap_pasid(e) ((e >> 40) & 0x1)
#define ecap_pss(e) ((e >> 35) & 0x1f)
#define ecap_eafs(e) ((e >> 34) & 0x1)
@@ -285,6 +286,7 @@ enum {
#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32)
#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16)
#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
#define QI_DEV_IOTLB_SIZE 1
#define QI_DEV_IOTLB_MAX_INVS 32
@@ -442,6 +444,7 @@ struct device_domain_info {
struct list_head global; /* link to global list */
u8 bus; /* PCI bus number */
u8 devfn; /* PCI devfn number */
+ u16 pfsid; /* SRIOV physical function source ID */
u8 pasid_supported:3;
u8 pasid_enabled:1;
u8 pri_supported:1;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 07/16] iommu/vt-d: assign PFSID in device TLB invalidation
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
When the device IOTLB of an SRIOV VF is invalidated, we need to provide
the PF source ID (PFSID) so that the IOMMU hardware can gauge the depth
of the invalidation queue, which is shared among VFs. This is needed
when the device invalidation throttle (DIT) capability is supported.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
drivers/iommu/intel-iommu.c | 13 +++++++++++++
include/linux/intel-iommu.h | 3 +++
2 files changed, 16 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e5a5209..ede0f2e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1484,6 +1484,19 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
return;
pdev = to_pci_dev(info->dev);
+ /* For IOMMU that supports device IOTLB throttling (DIT), we assign
+ * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
+ * queue depth at PF level. If DIT is not set, PFSID will be treated as
+ * reserved, which should be set to 0.
+ */
+ if (!ecap_dit(info->iommu->ecap))
+ info->pfsid = 0;
+ else if (pdev && pdev->is_virtfn) {
+ if (ecap_dit(info->iommu->ecap))
+ dev_warn(&pdev->dev, "SRIOV VF device IOTLB enabled without flow control\n");
+ info->pfsid = PCI_DEVID(pdev->physfn->bus->number, pdev->physfn->devfn);
+ } else
+ info->pfsid = PCI_DEVID(info->bus, info->devfn);
#ifdef CONFIG_INTEL_IOMMU_SVM
/* The PCIe spec, in its wisdom, declares that the behaviour of
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index f42b46c..c8ac5c6 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -112,6 +112,7 @@
* Extended Capability Register
*/
+#define ecap_dit(e) ((e >> 41) & 0x1)
#define ecap_pasid(e) ((e >> 40) & 0x1)
#define ecap_pss(e) ((e >> 35) & 0x1f)
#define ecap_eafs(e) ((e >> 34) & 0x1)
@@ -285,6 +286,7 @@ enum {
#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32)
#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16)
#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
#define QI_DEV_IOTLB_SIZE 1
#define QI_DEV_IOTLB_MAX_INVS 32
@@ -442,6 +444,7 @@ struct device_domain_info {
struct list_head global; /* link to global list */
u8 bus; /* PCI bus number */
u8 devfn; /* PCI devfn number */
+ u16 pfsid; /* SRIOV physical function source ID */
u8 pasid_supported:3;
u8 pasid_enabled:1;
u8 pri_supported:1;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 08/16] iommu: introduce device fault data
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
Device faults detected by the IOMMU can be reported outside the IOMMU
subsystem. This patch intends to provide generic device
fault data such that device drivers can communicate IOMMU faults
without model-specific knowledge.
The assumption is that the model-specific IOMMU driver can filter and
handle most of the IOMMU faults if the cause is within the IOMMU driver's
control. Therefore, the fault reasons that can be reported are grouped
and generalized based on common specifications such as PCI ATS.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
---
include/linux/iommu.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 69 insertions(+)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4af1820..3f9b367 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -49,6 +49,7 @@ struct bus_type;
struct device;
struct iommu_domain;
struct notifier_block;
+struct iommu_fault_event;
/* iommu fault flags */
#define IOMMU_FAULT_READ 0x0
@@ -56,6 +57,7 @@ struct notifier_block;
typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
struct device *, unsigned long, int, void *);
+typedef int (*iommu_dev_fault_handler_t)(struct device *, struct iommu_fault_event *);
struct iommu_domain_geometry {
dma_addr_t aperture_start; /* First address that can be mapped */
@@ -264,6 +266,60 @@ struct iommu_device {
struct device *dev;
};
+enum iommu_model {
+ IOMMU_MODEL_INTEL = 1,
+ IOMMU_MODEL_AMD,
+ IOMMU_MODEL_SMMU3,
+};
+
+/* Generic fault types, can be expanded IRQ remapping fault */
+enum iommu_fault_type {
+ IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
+ IOMMU_FAULT_PAGE_REQ, /* page request fault */
+};
+
+enum iommu_fault_reason {
+ IOMMU_FAULT_REASON_CTX = 1,
+ IOMMU_FAULT_REASON_ACCESS,
+ IOMMU_FAULT_REASON_INVALIDATE,
+ IOMMU_FAULT_REASON_UNKNOWN,
+};
+
+/**
+ * struct iommu_fault_event - Generic per device fault data
+ *
+ * - PCI and non-PCI devices
+ * - Recoverable faults (e.g. page request), information based on PCI ATS
+ * and PASID spec.
+ * - Un-recoverable faults of device interest
+ * - DMA remapping and IRQ remapping faults
+
+ * @type contains fault type.
+ * @reason fault reasons if relevant outside IOMMU driver, IOMMU driver internal
+ * faults are not reported
+ * @paddr: tells the offending page address
+ * @pasid: contains process address space ID, used in shared virtual memory(SVM)
+ * @rid: requestor ID
+ * @page_req_group_id: page request group index
+ * @last_req: last request in a page request group
+ * @pasid_valid: indicates if the PRQ has a valid PASID
+ * @prot: page access protection flag, e.g. IOMMU_READ, IOMMU_WRITE
+ * @private_data: uniquely identify device-specific private data for an
+ * individual page request
+ */
+struct iommu_fault_event {
+ enum iommu_fault_type type;
+ enum iommu_fault_reason reason;
+ u64 paddr;
+ u32 pasid;
+ u32 rid:16;
+ u32 page_req_group_id : 9;
+ u32 last_req : 1;
+ u32 pasid_valid : 1;
+ u32 prot;
+ u32 private_data;
+};
+
int iommu_device_register(struct iommu_device *iommu);
void iommu_device_unregister(struct iommu_device *iommu);
int iommu_device_sysfs_add(struct iommu_device *iommu,
@@ -425,6 +481,18 @@ struct iommu_fwspec {
u32 ids[1];
};
+/**
+ * struct iommu_fault_param - per-device IOMMU runtime data
+ * @dev_fault_handler: Callback function to handle IOMMU faults at device level
+ * @pasid_tbl_bound: Device PASID table is bound to a guest
+ *
+ */
+struct iommu_fault_param {
+ iommu_dev_fault_handler_t dev_fault_handler;
+ bool pasid_tbl_bound:1;
+ bool pasid_tbl_shadowed:1;
+};
+
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
const struct iommu_ops *ops);
void iommu_fwspec_free(struct device *dev);
@@ -437,6 +505,7 @@ struct iommu_ops {};
struct iommu_group {};
struct iommu_fwspec {};
struct iommu_device {};
+struct iommu_fault_param {};
static inline bool iommu_present(struct bus_type *bus)
{
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 08/16] iommu: introduce device fault data
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
Device faults detected by the IOMMU can be reported outside the IOMMU
subsystem. This patch intends to provide generic device
fault data such that device drivers can communicate IOMMU faults
without model-specific knowledge.
The assumption is that the model-specific IOMMU driver can filter and
handle most of the IOMMU faults if the cause is within the IOMMU driver's
control. Therefore, the fault reasons that can be reported are grouped
and generalized based on common specifications such as PCI ATS.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
include/linux/iommu.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 69 insertions(+)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4af1820..3f9b367 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -49,6 +49,7 @@ struct bus_type;
struct device;
struct iommu_domain;
struct notifier_block;
+struct iommu_fault_event;
/* iommu fault flags */
#define IOMMU_FAULT_READ 0x0
@@ -56,6 +57,7 @@ struct notifier_block;
typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
struct device *, unsigned long, int, void *);
+typedef int (*iommu_dev_fault_handler_t)(struct device *, struct iommu_fault_event *);
struct iommu_domain_geometry {
dma_addr_t aperture_start; /* First address that can be mapped */
@@ -264,6 +266,60 @@ struct iommu_device {
struct device *dev;
};
+enum iommu_model {
+ IOMMU_MODEL_INTEL = 1,
+ IOMMU_MODEL_AMD,
+ IOMMU_MODEL_SMMU3,
+};
+
+/* Generic fault types, can be expanded IRQ remapping fault */
+enum iommu_fault_type {
+ IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
+ IOMMU_FAULT_PAGE_REQ, /* page request fault */
+};
+
+enum iommu_fault_reason {
+ IOMMU_FAULT_REASON_CTX = 1,
+ IOMMU_FAULT_REASON_ACCESS,
+ IOMMU_FAULT_REASON_INVALIDATE,
+ IOMMU_FAULT_REASON_UNKNOWN,
+};
+
+/**
+ * struct iommu_fault_event - Generic per device fault data
+ *
+ * - PCI and non-PCI devices
+ * - Recoverable faults (e.g. page request), information based on PCI ATS
+ * and PASID spec.
+ * - Un-recoverable faults of device interest
+ * - DMA remapping and IRQ remapping faults
+
+ * @type contains fault type.
+ * @reason fault reasons if relevant outside IOMMU driver, IOMMU driver internal
+ * faults are not reported
+ * @paddr: tells the offending page address
+ * @pasid: contains process address space ID, used in shared virtual memory(SVM)
+ * @rid: requestor ID
+ * @page_req_group_id: page request group index
+ * @last_req: last request in a page request group
+ * @pasid_valid: indicates if the PRQ has a valid PASID
+ * @prot: page access protection flag, e.g. IOMMU_READ, IOMMU_WRITE
+ * @private_data: uniquely identify device-specific private data for an
+ * individual page request
+ */
+struct iommu_fault_event {
+ enum iommu_fault_type type;
+ enum iommu_fault_reason reason;
+ u64 paddr;
+ u32 pasid;
+ u32 rid:16;
+ u32 page_req_group_id : 9;
+ u32 last_req : 1;
+ u32 pasid_valid : 1;
+ u32 prot;
+ u32 private_data;
+};
+
int iommu_device_register(struct iommu_device *iommu);
void iommu_device_unregister(struct iommu_device *iommu);
int iommu_device_sysfs_add(struct iommu_device *iommu,
@@ -425,6 +481,18 @@ struct iommu_fwspec {
u32 ids[1];
};
+/**
+ * struct iommu_fault_param - per-device IOMMU runtime data
+ * @dev_fault_handler: Callback function to handle IOMMU faults at device level
+ * @pasid_tbl_bound: Device PASID table is bound to a guest
+ *
+ */
+struct iommu_fault_param {
+ iommu_dev_fault_handler_t dev_fault_handler;
+ bool pasid_tbl_bound:1;
+ bool pasid_tbl_shadowed:1;
+};
+
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
const struct iommu_ops *ops);
void iommu_fwspec_free(struct device *dev);
@@ -437,6 +505,7 @@ struct iommu_ops {};
struct iommu_group {};
struct iommu_fwspec {};
struct iommu_device {};
+struct iommu_fault_param {};
static inline bool iommu_present(struct bus_type *bus)
{
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
[parent not found: <1507244624-39189-9-git-send-email-jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>]
* Re: [PATCH v2 08/16] iommu: introduce device fault data
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-10 19:29 ` Jean-Philippe Brucker
-1 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-10-10 19:29 UTC (permalink / raw)
To: Jacob Pan,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki
Cc: Lan Tianyu
On 06/10/17 00:03, Jacob Pan wrote:
> Device faults detected by IOMMU can be reported outside IOMMU
> subsystem. This patch intends to provide a generic device
> fault data such that device drivers can communicate IOMMU faults
> without model specific knowledge.
>
> The assumption is that model specific IOMMU driver can filter and
> handle most of the IOMMU faults if the cause is within IOMMU driver
> control. Therefore, the fault reasons can be reported are grouped
> and generalized based common specifications such as PCI ATS.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> ---
> include/linux/iommu.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 69 insertions(+)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 4af1820..3f9b367 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -49,6 +49,7 @@ struct bus_type;
> struct device;
> struct iommu_domain;
> struct notifier_block;
> +struct iommu_fault_event;
>
> /* iommu fault flags */
> #define IOMMU_FAULT_READ 0x0
> @@ -56,6 +57,7 @@ struct notifier_block;
>
> typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
> struct device *, unsigned long, int, void *);
> +typedef int (*iommu_dev_fault_handler_t)(struct device *, struct iommu_fault_event *);
>
> struct iommu_domain_geometry {
> dma_addr_t aperture_start; /* First address that can be mapped */
> @@ -264,6 +266,60 @@ struct iommu_device {
> struct device *dev;
> };
>
> +enum iommu_model {
> + IOMMU_MODEL_INTEL = 1,
> + IOMMU_MODEL_AMD,
> + IOMMU_MODEL_SMMU3,
> +};
Now unused, I guess?
> +
> +/* Generic fault types, can be expanded IRQ remapping fault */
> +enum iommu_fault_type {
> + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
> + IOMMU_FAULT_PAGE_REQ, /* page request fault */
> +};
> +
> +enum iommu_fault_reason {
> + IOMMU_FAULT_REASON_CTX = 1,
If I read the VT-d spec right, this is a fault encountered while fetching
the PASID table pointer?
> + IOMMU_FAULT_REASON_ACCESS,
And this a pgd or pte access fault?
> + IOMMU_FAULT_REASON_INVALIDATE,
What would this be?
> + IOMMU_FAULT_REASON_UNKNOWN,
> +};
I'm currently doing the same exploratory work for virtio-iommu, and I'd be
tempted to report reasons as detailed as possible to guest or device
driver, but it's not clear what they need, how they would use this
information. I'd like to discuss this some more.
For unrecoverable faults I guess CTX means "the host IOMMU driver is
broken", since the device tables are invalid. In which case there is no
use continuing; trying to shut down the device cleanly is really all the
guest/device driver can do.
For ACCESS the error is the device driver's or guest's fault, since the
device driver triggered DMA on unmapped buffers, or the guest didn't
install the right page tables. This can be repaired without shutting down,
it may even just be one execution stream that failed in the device while
the others continued normally. It's not as recoverable as a PRI Page
Request, but the device driver may still be able to isolate the problem
(e.g. by killing the process responsible) and the device to recover from it.
So maybe ACCESS would benefit from more details, for example
differentiating faults encountered while fetching the pgd from those
encountered while fetching a second-level table or pte. The former is a
lot less recoverable than the latter (bug in the guest IOMMU driver vs.
bug in the device driver).
Generalizing this maybe we should differentiate each step of the
translation in fault_reason:
* Device entry (context) fetch -> host IOMMU driver's fault
* PASID table fetch -> guest IOMMU driver or host userspace's fault
* pgd fetch -> guest IOMMU driver's fault
* pte fetch, including validity and access check -> device driver's fault
It's probably not worth mentioning intermediate table levels (pmd, etc).
Thoughts?
> +/**
> + * struct iommu_fault_event - Generic per device fault data
> + *
> + * - PCI and non-PCI devices
> + * - Recoverable faults (e.g. page request), information based on PCI ATS
> + * and PASID spec.
> + * - Un-recoverable faults of device interest
> + * - DMA remapping and IRQ remapping faults
> +
> + * @type contains fault type.
> + * @reason fault reasons if relevant outside IOMMU driver, IOMMU driver internal
> + * faults are not reported
> + * @paddr: tells the offending page address
> + * @pasid: contains process address space ID, used in shared virtual memory(SVM)
> + * @rid: requestor ID
> + * @page_req_group_id: page request group index
> + * @last_req: last request in a page request group
> + * @pasid_valid: indicates if the PRQ has a valid PASID
> + * @prot: page access protection flag, e.g. IOMMU_READ, IOMMU_WRITE
> + * @private_data: uniquely identify device-specific private data for an
> + * individual page request
I understand this is for the streaming extension on VT-d, is it
IOMMU-specific or specific to the faulting endpoint? Could the device
driver receiving the fault attempt to decode or modify this field before
sending the page response?
> + */
> +struct iommu_fault_event {
> + enum iommu_fault_type type;
> + enum iommu_fault_reason reason;
> + u64 paddr;
> + u32 pasid;
> + u32 rid:16;
I think this is redundant, since you already pass the struct device to the
fault handler. Otherwise it should probably be extended to 32 bits, for
non-PCI or multiple PCI domains.
> + u32 page_req_group_id : 9;
> + u32 last_req : 1;
> + u32 pasid_valid : 1;
> + u32 prot;
> + u32 private_data;
> +};
> +
> int iommu_device_register(struct iommu_device *iommu);
> void iommu_device_unregister(struct iommu_device *iommu);
> int iommu_device_sysfs_add(struct iommu_device *iommu,
> @@ -425,6 +481,18 @@ struct iommu_fwspec {
> u32 ids[1];
> };
>
> +/**
> + * struct iommu_fault_param - per-device IOMMU runtime data
> + * @dev_fault_handler: Callback function to handle IOMMU faults at device level
> + * @pasid_tbl_bound: Device PASID table is bound to a guest
> + *
> + */
> +struct iommu_fault_param {
> + iommu_dev_fault_handler_t dev_fault_handler;
> + bool pasid_tbl_bound:1;
> + bool pasid_tbl_shadowed:1;
I guess you can remove this?
Thanks,
Jean
> +};
> +
> int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
> const struct iommu_ops *ops);
> void iommu_fwspec_free(struct device *dev);
> @@ -437,6 +505,7 @@ struct iommu_ops {};
> struct iommu_group {};
> struct iommu_fwspec {};
> struct iommu_device {};
> +struct iommu_fault_param {};
>
> static inline bool iommu_present(struct bus_type *bus)
> {
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 08/16] iommu: introduce device fault data
@ 2017-10-10 19:29 ` Jean-Philippe Brucker
0 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-10-10 19:29 UTC (permalink / raw)
To: Jacob Pan, iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson
On 06/10/17 00:03, Jacob Pan wrote:
> Device faults detected by IOMMU can be reported outside IOMMU
> subsystem. This patch intends to provide a generic device
> fault data such that device drivers can communicate IOMMU faults
> without model specific knowledge.
>
> The assumption is that model specific IOMMU driver can filter and
> handle most of the IOMMU faults if the cause is within IOMMU driver
> control. Therefore, the fault reasons can be reported are grouped
> and generalized based common specifications such as PCI ATS.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
> include/linux/iommu.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 69 insertions(+)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 4af1820..3f9b367 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -49,6 +49,7 @@ struct bus_type;
> struct device;
> struct iommu_domain;
> struct notifier_block;
> +struct iommu_fault_event;
>
> /* iommu fault flags */
> #define IOMMU_FAULT_READ 0x0
> @@ -56,6 +57,7 @@ struct notifier_block;
>
> typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
> struct device *, unsigned long, int, void *);
> +typedef int (*iommu_dev_fault_handler_t)(struct device *, struct iommu_fault_event *);
>
> struct iommu_domain_geometry {
> dma_addr_t aperture_start; /* First address that can be mapped */
> @@ -264,6 +266,60 @@ struct iommu_device {
> struct device *dev;
> };
>
> +enum iommu_model {
> + IOMMU_MODEL_INTEL = 1,
> + IOMMU_MODEL_AMD,
> + IOMMU_MODEL_SMMU3,
> +};
Now unused, I guess?
> +
> +/* Generic fault types, can be expanded IRQ remapping fault */
> +enum iommu_fault_type {
> + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
> + IOMMU_FAULT_PAGE_REQ, /* page request fault */
> +};
> +
> +enum iommu_fault_reason {
> + IOMMU_FAULT_REASON_CTX = 1,
If I read the VT-d spec right, this is a fault encountered while fetching
the PASID table pointer?
> + IOMMU_FAULT_REASON_ACCESS,
And this a pgd or pte access fault?
> + IOMMU_FAULT_REASON_INVALIDATE,
What would this be?
> + IOMMU_FAULT_REASON_UNKNOWN,
> +};
I'm currently doing the same exploratory work for virtio-iommu, and I'd be
tempted to report reasons as detailed as possible to guest or device
driver, but it's not clear what they need, how they would use this
information. I'd like to discuss this some more.
For unrecoverable faults I guess CTX means "the host IOMMU driver is
broken", since the device tables are invalid. In which case there is no
use continuing, trying to shutdown the device cleanly is really all the
guest/device driver can do.
For ACCESS the error is the device driver's or guest's fault, since the
device driver triggered DMA on unmapped buffers, or the guest didn't
install the right page tables. This can be repaired without shutting down,
it may even just be one execution stream that failed in the device while
the others continued normally. It's not as recoverable as a PRI Page
Request, but the device driver may still be able to isolate the problem
(e.g. by killing the process responsible) and the device to recover from it.
So maybe ACCESS would benefit from more details, for example
differentiating faults encountered while fetching the pgd from those
encountered while fetching a second-level table or pte. The former is a
lot less recoverable than the latter (bug in the guest IOMMU driver vs.
bug in the device driver).
Generalizing this maybe we should differentiate each step of the
translation in fault_reason:
* Device entry (context) fetch -> host IOMMU driver's fault
* PASID table fetch -> guest IOMMU driver or host userspace's fault
* pgd fetch -> guest IOMMU driver's fault
* pte fetch, including validity and access check -> device driver's fault
It's probably not worth mentioning intermediate table levels (pmd, etc).
Thoughts?
> +/**
> + * struct iommu_fault_event - Generic per device fault data
> + *
> + * - PCI and non-PCI devices
> + * - Recoverable faults (e.g. page request), information based on PCI ATS
> + * and PASID spec.
> + * - Un-recoverable faults of device interest
> + * - DMA remapping and IRQ remapping faults
> +
> + * @type contains fault type.
> + * @reason fault reasons if relevant outside IOMMU driver, IOMMU driver internal
> + * faults are not reported
> + * @paddr: tells the offending page address
> + * @pasid: contains process address space ID, used in shared virtual memory(SVM)
> + * @rid: requestor ID
> + * @page_req_group_id: page request group index
> + * @last_req: last request in a page request group
> + * @pasid_valid: indicates if the PRQ has a valid PASID
> + * @prot: page access protection flag, e.g. IOMMU_READ, IOMMU_WRITE
> + * @private_data: uniquely identify device-specific private data for an
> + * individual page request
I understand this is for the streaming extension on VT-d, is it
IOMMU-specific or specific to the faulting endpoint? Could the device
driver receiving the fault attempt to decode or modify this field before
sending the page response?
> + */
> +struct iommu_fault_event {
> + enum iommu_fault_type type;
> + enum iommu_fault_reason reason;
> + u64 paddr;
> + u32 pasid;
> + u32 rid:16;
I think this is redundant, since you already pass the struct device to the
fault handler. Otherwise it should probably be extended to 32 bits, for
non-PCI or multiple PCI domains.
> + u32 page_req_group_id : 9;
> + u32 last_req : 1;
> + u32 pasid_valid : 1;
> + u32 prot;
> + u32 private_data;
> +};
> +
> int iommu_device_register(struct iommu_device *iommu);
> void iommu_device_unregister(struct iommu_device *iommu);
> int iommu_device_sysfs_add(struct iommu_device *iommu,
> @@ -425,6 +481,18 @@ struct iommu_fwspec {
> u32 ids[1];
> };
>
> +/**
> + * struct iommu_fault_param - per-device IOMMU runtime data
> + * @dev_fault_handler: Callback function to handle IOMMU faults at device level
> + * @pasid_tbl_bound: Device PASID table is bound to a guest
> + *
> + */
> +struct iommu_fault_param {
> + iommu_dev_fault_handler_t dev_fault_handler;
> + bool pasid_tbl_bound:1;
> + bool pasid_tbl_shadowed:1;
I guess you can remove this?
Thanks,
Jean
> +};
> +
> int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
> const struct iommu_ops *ops);
> void iommu_fwspec_free(struct device *dev);
> @@ -437,6 +505,7 @@ struct iommu_ops {};
> struct iommu_group {};
> struct iommu_fwspec {};
> struct iommu_device {};
> +struct iommu_fault_param {};
>
> static inline bool iommu_present(struct bus_type *bus)
> {
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 08/16] iommu: introduce device fault data
2017-10-10 19:29 ` Jean-Philippe Brucker
(?)
@ 2017-10-10 21:43 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-10 21:43 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki, Liu, Yi L,
Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
jacob.jun.pan
On Tue, 10 Oct 2017 20:29:29 +0100
Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:
> > +enum iommu_model {
> > + IOMMU_MODEL_INTEL = 1,
> > + IOMMU_MODEL_AMD,
> > + IOMMU_MODEL_SMMU3,
> > +};
>
> Now unused, I guess?
right, missed it. thanks
^ permalink raw reply [flat|nested] 109+ messages in thread
[parent not found: <439401c0-a9ff-a69a-dc10-12d72f7abbab-5wv7dgnIgG8@public.gmane.org>]
* RE: [PATCH v2 08/16] iommu: introduce device fault data
2017-10-10 19:29 ` Jean-Philippe Brucker
@ 2017-10-20 10:07 ` Liu, Yi L
-1 siblings, 0 replies; 109+ messages in thread
From: Liu, Yi L @ 2017-10-20 10:07 UTC (permalink / raw)
To: Jean-Philippe Brucker, Jacob Pan,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Wysocki, Rafael J
Cc: Lan, Tianyu
> -----Original Message-----
> From: Jean-Philippe Brucker [mailto:jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org]
> Sent: Wednesday, October 11, 2017 3:29 AM
> To: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>; iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org;
> LKML <linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>; Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>; David
> Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>; Greg Kroah-Hartman
> <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>; Wysocki, Rafael J <rafael.j.wysocki-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> Cc: Liu, Yi L <yi.l.liu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Lan, Tianyu <tianyu.lan-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Tian, Kevin
> <kevin.tian-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Raj, Ashok <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>; Alex Williamson
> <alex.williamson-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> Subject: Re: [PATCH v2 08/16] iommu: introduce device fault data
>
> On 06/10/17 00:03, Jacob Pan wrote:
> > Device faults detected by IOMMU can be reported outside IOMMU
> > subsystem. This patch intends to provide a generic device fault data
> > such that device drivers can communicate IOMMU faults without model
> > specific knowledge.
> >
> > The assumption is that model specific IOMMU driver can filter and
> > handle most of the IOMMU faults if the cause is within IOMMU driver
> > control. Therefore, the fault reasons can be reported are grouped and
> > generalized based common specifications such as PCI ATS.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > ---
> > include/linux/iommu.h | 69
> > +++++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 69 insertions(+)
> >
> > diff --git a/include/linux/iommu.h b/include/linux/iommu.h index
> > 4af1820..3f9b367 100644
> > --- a/include/linux/iommu.h
> > +++ b/include/linux/iommu.h
> > @@ -49,6 +49,7 @@ struct bus_type;
> > struct device;
> > struct iommu_domain;
> > struct notifier_block;
> > +struct iommu_fault_event;
> >
> > /* iommu fault flags */
> > #define IOMMU_FAULT_READ 0x0
> > @@ -56,6 +57,7 @@ struct notifier_block;
> >
> > typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
> > struct device *, unsigned long, int, void *);
> > +typedef int (*iommu_dev_fault_handler_t)(struct device *, struct
> > +iommu_fault_event *);
> >
> > struct iommu_domain_geometry {
> > dma_addr_t aperture_start; /* First address that can be mapped */
> > @@ -264,6 +266,60 @@ struct iommu_device {
> > struct device *dev;
> > };
> >
> > +enum iommu_model {
> > + IOMMU_MODEL_INTEL = 1,
> > + IOMMU_MODEL_AMD,
> > + IOMMU_MODEL_SMMU3,
> > +};
>
> Now unused, I guess?
>
> > +
> > +/* Generic fault types, can be expanded IRQ remapping fault */ enum
> > +iommu_fault_type {
> > + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
> > + IOMMU_FAULT_PAGE_REQ, /* page request fault */
> > +};
> > +
> > +enum iommu_fault_reason {
> > + IOMMU_FAULT_REASON_CTX = 1,
>
> If I read the VT-d spec right, this is a fault encountered while fetching the PASID table
> pointer?
>
> > + IOMMU_FAULT_REASON_ACCESS,
>
> And this a pgd or pte access fault?
>
> > + IOMMU_FAULT_REASON_INVALIDATE,
>
> What would this be?
>
> > + IOMMU_FAULT_REASON_UNKNOWN,
> > +};
>
> I'm currently doing the same exploratory work for virtio-iommu, and I'd be tempted
> to report reasons as detailed as possible to guest or device driver, but it's not clear
> what they need, how they would use this information. I'd like to discuss this some
> more.
[Liu, Yi L] In fact, it's not necessary to pass the detailed unrecoverable fault to guest in
virtualization case. Unrecoverable fault happened on native indicates fault during native
IOMMU address translation. If the fault is not due to guest IOMMU page table setting,
then it is not necessary to inject the fault to guest. And hypervisor should be able to
deduce it by walking the guest IOMMU page table with the fault address. So I think for
virtualization case, pass the fault address is enough. If hypervisor doesn't see any issue
after checking the guest IOMMU translation hierarchy, no use to let guest know it. Hypervisor
can either throw error log or stop the guest. If hypervisor see any error in the guest
iommu translation hierarchy, then inject the error to guest with a proper fault type.
But for device driver or other user-space driver, I'm not sure if they need detailed fault
info. In fact, it is enough to pass the possible info which would help them to deduce whether
the unrecoverable fault is due to them. This needs more input from device driver reviewers.
> For unrecoverable faults I guess CTX means "the host IOMMU driver is broken", since
> the device tables are invalid. In which case there is no use continuing, trying to
> shutdown the device cleanly is really all the guest/device driver can do.
[Liu, Yi L] Not sure what "device table" means here. But I agree that if host IOMMU
driver has no valid CTX for the device, then this kind of error should result in a shutdown to
the device.
> For ACCESS the error is the device driver's or guest's fault, since the device driver
> triggered DMA on unmapped buffers, or the guest didn't install the right page tables.
> This can be repaired without shutting down, it may even just be one execution
> stream that failed in the device while the others continued normally. It's not as
> recoverable as a PRI Page Request, but the device driver may still be able to isolate
> the problem (e.g. by killing the process responsible) and the device to recover from it.
>
> So maybe ACCESS would benefit from more details, for example differentiating
> faults encountered while fetching the pgd from those encountered while fetching a
> second-level table or pte. The former is a lot less recoverable than the latter (bug in
> the guest IOMMU driver vs.
> bug in the device driver).
>
> Generalizing this maybe we should differentiate each step of the translation in
> fault_reason:
>
> * Device entry (context) fetch -> host IOMMU driver's fault
> * PASID table fetch -> guest IOMMU driver or host userspace's fault
> * pgd fetch -> guest IOMMU driver's fault
> * pte fetch, including validity and access check -> device driver's fault
[Liu, Yi L] It's a good summary here. BTW. why pte fetch is due to device driver's fault?
> It's probably not worth mentioning intermediate table levels (pmd, etc).
> Thoughts?
[Liu, Yi L] As my comments above, the info passed to guest/userspace driver/driver should
be able to deduce if the fault is due to it.
> > +/**
> > + * struct iommu_fault_event - Generic per device fault data
> > + *
> > + * - PCI and non-PCI devices
> > + * - Recoverable faults (e.g. page request), information based on PCI
> > +ATS
> > + * and PASID spec.
> > + * - Un-recoverable faults of device interest
> > + * - DMA remapping and IRQ remapping faults
> > +
> > + * @type contains fault type.
> > + * @reason fault reasons if relevant outside IOMMU driver, IOMMU driver
> internal
> > + * faults are not reported
> > + * @paddr: tells the offending page address
> > + * @pasid: contains process address space ID, used in shared virtual
> > + memory(SVM)
> > + * @rid: requestor ID
> > + * @page_req_group_id: page request group index
> > + * @last_req: last request in a page request group
> > + * @pasid_valid: indicates if the PRQ has a valid PASID
> > + * @prot: page access protection flag, e.g. IOMMU_READ, IOMMU_WRITE
> > + * @private_data: uniquely identify device-specific private data for an
> > + * individual page request
>
> I understand this is for the streaming extension on VT-d, is it IOMMU-specific or
[Liu, Yi L] yes, it's the streaming extension on VT-d.
> specific to the faulting endpoint? Could the device driver receiving the fault attempt
> to decode or modify this field before sending the page response?
[Liu, Yi L] The IOMMU driver needs to include it when sending the page response.
Regards,
Yi L
>
> > + */
> > +struct iommu_fault_event {
> > + enum iommu_fault_type type;
> > + enum iommu_fault_reason reason;
> > + u64 paddr;
> > + u32 pasid;
> > + u32 rid:16;
>
> I think this is redundant, since you already pass the struct device to the fault handler.
> Otherwise it should probably be extended to 32 bits, for non-PCI or multiple PCI
> domains.
>
> > + u32 page_req_group_id : 9;
> > + u32 last_req : 1;
> > + u32 pasid_valid : 1;
> > + u32 prot;
> > + u32 private_data;
> > +};
> > +
> > int iommu_device_register(struct iommu_device *iommu); void
> > iommu_device_unregister(struct iommu_device *iommu); int
> > iommu_device_sysfs_add(struct iommu_device *iommu, @@ -425,6 +481,18
> > @@ struct iommu_fwspec {
> > u32 ids[1];
> > };
> >
> > +/**
> > + * struct iommu_fault_param - per-device IOMMU runtime data
> > + * @dev_fault_handler: Callback function to handle IOMMU faults at
> > +device level
> > + * @pasid_tbl_bound: Device PASID table is bound to a guest
> > + *
> > + */
> > +struct iommu_fault_param {
> > + iommu_dev_fault_handler_t dev_fault_handler;
> > + bool pasid_tbl_bound:1;
> > + bool pasid_tbl_shadowed:1;
>
> I guess you can remove this?
>
> Thanks,
> Jean
>
> > +};
> > +
> > int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
> > const struct iommu_ops *ops); void iommu_fwspec_free(struct
> > device *dev); @@ -437,6 +505,7 @@ struct iommu_ops {}; struct
> > iommu_group {}; struct iommu_fwspec {}; struct iommu_device {};
> > +struct iommu_fault_param {};
> >
> > static inline bool iommu_present(struct bus_type *bus)
> > {
> >
^ permalink raw reply [flat|nested] 109+ messages in thread
* RE: [PATCH v2 08/16] iommu: introduce device fault data
@ 2017-10-20 10:07 ` Liu, Yi L
0 siblings, 0 replies; 109+ messages in thread
From: Liu, Yi L @ 2017-10-20 10:07 UTC (permalink / raw)
To: Jean-Philippe Brucker, Jacob Pan,
iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Wysocki, Rafael J
Cc: Lan, Tianyu, Tian, Kevin, Raj, Ashok, Alex Williamson
> -----Original Message-----
> From: Jean-Philippe Brucker [mailto:jean-philippe.brucker@arm.com]
> Sent: Wednesday, October 11, 2017 3:29 AM
> To: Jacob Pan <jacob.jun.pan@linux.intel.com>; iommu@lists.linux-foundation.org;
> LKML <linux-kernel@vger.kernel.org>; Joerg Roedel <joro@8bytes.org>; David
> Woodhouse <dwmw2@infradead.org>; Greg Kroah-Hartman
> <gregkh@linuxfoundation.org>; Wysocki, Rafael J <rafael.j.wysocki@intel.com>
> Cc: Liu, Yi L <yi.l.liu@intel.com>; Lan, Tianyu <tianyu.lan@intel.com>; Tian, Kevin
> <kevin.tian@intel.com>; Raj, Ashok <ashok.raj@intel.com>; Alex Williamson
> <alex.williamson@redhat.com>
> Subject: Re: [PATCH v2 08/16] iommu: introduce device fault data
>
> On 06/10/17 00:03, Jacob Pan wrote:
> > Device faults detected by IOMMU can be reported outside IOMMU
> > subsystem. This patch intends to provide a generic device fault data
> > such that device drivers can communicate IOMMU faults without model
> > specific knowledge.
> >
> > The assumption is that model specific IOMMU driver can filter and
> > handle most of the IOMMU faults if the cause is within IOMMU driver
> > control. Therefore, the fault reasons can be reported are grouped and
> > generalized based common specifications such as PCI ATS.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> > include/linux/iommu.h | 69
> > +++++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 69 insertions(+)
> >
> > diff --git a/include/linux/iommu.h b/include/linux/iommu.h index
> > 4af1820..3f9b367 100644
> > --- a/include/linux/iommu.h
> > +++ b/include/linux/iommu.h
> > @@ -49,6 +49,7 @@ struct bus_type;
> > struct device;
> > struct iommu_domain;
> > struct notifier_block;
> > +struct iommu_fault_event;
> >
> > /* iommu fault flags */
> > #define IOMMU_FAULT_READ 0x0
> > @@ -56,6 +57,7 @@ struct notifier_block;
> >
> > typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
> > struct device *, unsigned long, int, void *);
> > +typedef int (*iommu_dev_fault_handler_t)(struct device *, struct
> > +iommu_fault_event *);
> >
> > struct iommu_domain_geometry {
> > dma_addr_t aperture_start; /* First address that can be mapped */
> > @@ -264,6 +266,60 @@ struct iommu_device {
> > struct device *dev;
> > };
> >
> > +enum iommu_model {
> > + IOMMU_MODEL_INTEL = 1,
> > + IOMMU_MODEL_AMD,
> > + IOMMU_MODEL_SMMU3,
> > +};
>
> Now unused, I guess?
>
> > +
> > +/* Generic fault types, can be expanded IRQ remapping fault */ enum
> > +iommu_fault_type {
> > + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
> > + IOMMU_FAULT_PAGE_REQ, /* page request fault */
> > +};
> > +
> > +enum iommu_fault_reason {
> > + IOMMU_FAULT_REASON_CTX = 1,
>
> If I read the VT-d spec right, this is a fault encountered while fetching the PASID table
> pointer?
>
> > + IOMMU_FAULT_REASON_ACCESS,
>
> And this a pgd or pte access fault?
>
> > + IOMMU_FAULT_REASON_INVALIDATE,
>
> What would this be?
>
> > + IOMMU_FAULT_REASON_UNKNOWN,
> > +};
>
> I'm currently doing the same exploratory work for virtio-iommu, and I'd be tempted
> to report reasons as detailed as possible to guest or device driver, but it's not clear
> what they need, how they would use this information. I'd like to discuss this some
> more.
[Liu, Yi L] In fact, it's not necessary to pass the detailed unrecoverable fault to guest in
virtualization case. Unrecoverable fault happened on native indicates fault during native
IOMMU address translation. If the fault is not due to guest IOMMU page table setting,
then it is not necessary to inject the fault to guest. And hypervisor should be able to
deduce it by walking the guest IOMMU page table with the fault address. So I think for
virtualization case, pass the fault address is enough. If hypervisor doesn't see any issue
after checking the guest IOMMU translation hierarchy, no use to let guest know it. Hypervisor
can either throw error log or stop the guest. If hypervisor see any error in the guest
iommu translation hierarchy, then inject the error to guest with a proper fault type.
But for device driver or other user-space driver, I'm not sure if they need detailed fault
info. In fact, it is enough to pass the possible info which would help them to deduce whether
the unrecoverable fault is due to them. This needs more input from device driver reviewers.
> For unrecoverable faults I guess CTX means "the host IOMMU driver is broken", since
> the device tables are invalid. In which case there is no use continuing, trying to
> shutdown the device cleanly is really all the guest/device driver can do.
[Liu, Yi L] Not sure what "device table" means here. But I agree that if host IOMMU
driver has no valid CTX for the device, then this kind of error should result in a shutdown to
the device.
> For ACCESS the error is the device driver's or guest's fault, since the device driver
> triggered DMA on unmapped buffers, or the guest didn't install the right page tables.
> This can be repaired without shutting down, it may even just be one execution
> stream that failed in the device while the others continued normally. It's not as
> recoverable as a PRI Page Request, but the device driver may still be able to isolate
> the problem (e.g. by killing the process responsible) and the device to recover from it.
>
> So maybe ACCESS would benefit from more details, for example differentiating
> faults encountered while fetching the pgd from those encountered while fetching a
> second-level table or pte. The former is a lot less recoverable than the latter (bug in
> the guest IOMMU driver vs.
> bug in the device driver).
>
> Generalizing this maybe we should differentiate each step of the translation in
> fault_reason:
>
> * Device entry (context) fetch -> host IOMMU driver's fault
> * PASID table fetch -> guest IOMMU driver or host userspace's fault
> * pgd fetch -> guest IOMMU driver's fault
> * pte fetch, including validity and access check -> device driver's fault
[Liu, Yi L] It's a good summary here. BTW. why pte fetch is due to device driver's fault?
> It's probably not worth mentioning intermediate table levels (pmd, etc).
> Thoughts?
[Liu, Yi L] As my comments above, the info passed to guest/userspace driver/driver should
be able to deduce if the fault is due to it.
> > +/**
> > + * struct iommu_fault_event - Generic per device fault data
> > + *
> > + * - PCI and non-PCI devices
> > + * - Recoverable faults (e.g. page request), information based on PCI
> > +ATS
> > + * and PASID spec.
> > + * - Un-recoverable faults of device interest
> > + * - DMA remapping and IRQ remapping faults
> > +
> > + * @type contains fault type.
> > + * @reason fault reasons if relevant outside IOMMU driver, IOMMU driver
> internal
> > + * faults are not reported
> > + * @paddr: tells the offending page address
> > + * @pasid: contains process address space ID, used in shared virtual
> > + memory(SVM)
> > + * @rid: requestor ID
> > + * @page_req_group_id: page request group index
> > + * @last_req: last request in a page request group
> > + * @pasid_valid: indicates if the PRQ has a valid PASID
> > + * @prot: page access protection flag, e.g. IOMMU_READ, IOMMU_WRITE
> > + * @private_data: uniquely identify device-specific private data for an
> > + * individual page request
>
> I understand this is for the streaming extension on VT-d, is it IOMMU-specific or
[Liu, Yi L] yes, it's the streaming extension on VT-d.
> specific to the faulting endpoint? Could the device driver receiving the fault attempt
> to decode or modify this field before sending the page response?
[Liu, Yi L] The IOMMU driver needs to include it when sending the page response.
Regards,
Yi L
>
> > + */
> > +struct iommu_fault_event {
> > + enum iommu_fault_type type;
> > + enum iommu_fault_reason reason;
> > + u64 paddr;
> > + u32 pasid;
> > + u32 rid:16;
>
> I think this is redundant, since you already pass the struct device to the fault handler.
> Otherwise it should probably be extended to 32 bits, for non-PCI or multiple PCI
> domains.
>
> > + u32 page_req_group_id : 9;
> > + u32 last_req : 1;
> > + u32 pasid_valid : 1;
> > + u32 prot;
> > + u32 private_data;
> > +};
> > +
> > int iommu_device_register(struct iommu_device *iommu); void
> > iommu_device_unregister(struct iommu_device *iommu); int
> > iommu_device_sysfs_add(struct iommu_device *iommu, @@ -425,6 +481,18
> > @@ struct iommu_fwspec {
> > u32 ids[1];
> > };
> >
> > +/**
> > + * struct iommu_fault_param - per-device IOMMU runtime data
> > + * @dev_fault_handler: Callback function to handle IOMMU faults at
> > +device level
> > + * @pasid_tbl_bound: Device PASID table is bound to a guest
> > + *
> > + */
> > +struct iommu_fault_param {
> > + iommu_dev_fault_handler_t dev_fault_handler;
> > + bool pasid_tbl_bound:1;
> > + bool pasid_tbl_shadowed:1;
>
> I guess you can remove this?
>
> Thanks,
> Jean
>
> > +};
> > +
> > int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
> > const struct iommu_ops *ops); void iommu_fwspec_free(struct
> > device *dev); @@ -437,6 +505,7 @@ struct iommu_ops {}; struct
> > iommu_group {}; struct iommu_fwspec {}; struct iommu_device {};
> > +struct iommu_fault_param {};
> >
> > static inline bool iommu_present(struct bus_type *bus)
> > {
> >
^ permalink raw reply [flat|nested] 109+ messages in thread
[parent not found: <A2975661238FB949B60364EF0F2C257439AFC86D-0J0gbvR4kTg/UvCtAeCM4rfspsVTdybXVpNB7YpNyf8@public.gmane.org>]
* Re: [PATCH v2 08/16] iommu: introduce device fault data
2017-10-20 10:07 ` Liu, Yi L
@ 2017-11-06 19:01 ` Jean-Philippe Brucker
-1 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-11-06 19:01 UTC (permalink / raw)
To: Liu, Yi L, Jacob Pan,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Wysocki, Rafael J
Cc: Lan, Tianyu
Hi Yi,
Sorry for the late reply, I seem to have missed this.
On 20/10/17 11:07, Liu, Yi L wrote:
[...]
>>> +
>>> +/* Generic fault types, can be expanded IRQ remapping fault */ enum
>>> +iommu_fault_type {
>>> + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
>>> + IOMMU_FAULT_PAGE_REQ, /* page request fault */
>>> +};
>>> +
>>> +enum iommu_fault_reason {
>>> + IOMMU_FAULT_REASON_CTX = 1,
>>
>> If I read the VT-d spec right, this is a fault encountered while fetching the PASID table
>> pointer?
>>
>>> + IOMMU_FAULT_REASON_ACCESS,
>>
>> And this a pgd or pte access fault?
>>
>>> + IOMMU_FAULT_REASON_INVALIDATE,
>>
>> What would this be?
>>
>>> + IOMMU_FAULT_REASON_UNKNOWN,
>>> +};
>>
>> I'm currently doing the same exploratory work for virtio-iommu, and I'd be tempted
>> to report reasons as detailed as possible to guest or device driver, but it's not clear
>> what they need, how they would use this information. I'd like to discuss this some
>> more.
>
> [Liu, Yi L] In fact, it's not necessary to pass the detailed unrecoverable fault to guest in
> virtualization case. Unrecoverable fault happened on native indicates fault during native
> IOMMU address translation. If the fault is not due to guest IOMMU page table setting,
> then it is not necessary to inject the fault to guest. And hypervisor should be able to
> deduce it by walking the guest IOMMU page table with the fault address.
I'm not sure the hypervisor should go and inspect the guest's page tables.
The pIOMMU already did the walk and reported the fault, so the hypervisor
knows that they are invalid. I thought VT-d and other pIOMMUs provide
enough information in the fault report to tell if the error was due to
invalid page tables?
> So I think for
> virtualization case, pass the fault address is enough. If hypervisor doesn't see any issue
> after checking the guest IOMMU translation hierarchy, no use to let guest know it. Hypervisor
> can either throw error log or stop the guest. If hypervisor see any error in the guest
> iommu translation hierarchy, then inject the error to guest with a proper fault type.
>
> But for device driver or other user-space driver, I'm not sure if they need detailed fault
> info. In fact, it is enough to pass the possible info which would help them to deduce whether
> the unrecoverable fault is due to them. This need more inputs from device driver reviewers.
Agreed, though I'm not sure how to reach them.
At the moment, the only users of report_iommu_fault, the existing fault
reporting mechanism, are ARM-based IOMMU drivers and there are only four
device drivers that register a handler with iommu_set_fault_handler. Two
of them simply print the fault, one resets the offending device, and the
last one (msm GPU) wants to provide more detailed debugging information
about the device state.
>> For unrecoverable faults I guess CTX means "the host IOMMU driver is broken", since
>> the device tables are invalid. In which case there is no use continuing, trying to
>> shutdown the device cleanly is really all the guest/device driver can do.
>
> [Liu, Yi L] Not sure about what device table mean here. But I agree that if host IOMMU
> driver has no valid CTX for the device, then this kind of error should result in a shutdown to
> the device.
Yes by device table I meant VT-d's root table and context tables.
>> For ACCESS the error is the device driver's or guest's fault, since the device driver
>> triggered DMA on unmapped buffers, or the guest didn't install the right page tables.
>> This can be repaired without shutting down, it may even just be one execution
>> stream that failed in the device while the others continued normally. It's not as
>> recoverable as a PRI Page Request, but the device driver may still be able to isolate
>> the problem (e.g. by killing the process responsible) and the device to recover from it.
>>
>> So maybe ACCESS would benefit from more details, for example differentiating
>> faults encountered while fetching the pgd from those encountered while fetching a
>> second-level table or pte. The former is a lot less recoverable than the latter (bug in
>> the guest IOMMU driver vs.
>> bug in the device driver).
>>
>> Generalizing this maybe we should differentiate each step of the translation in
>> fault_reason:
>>
>> * Device entry (context) fetch -> host IOMMU driver's fault
>> * PASID table fetch -> guest IOMMU driver or host userspace's fault
>> * pgd fetch -> guest IOMMU driver's fault
>> * pte fetch, including validity and access check -> device driver's fault
>
> [Liu, Yi L] It's a good summary here. BTW. why pte fetch is due to device driver's fault?
Mmh, not necessarily the device driver's fault, but the most likely cause
is that the device driver didn't call map() before triggering the DMA.
Another less likely cause is a programming error in the vIOMMU driver,
where it failed to perform the map() or populate the page tables properly.
Thanks,
Jean
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 08/16] iommu: introduce device fault data
@ 2017-11-06 19:01 ` Jean-Philippe Brucker
0 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-11-06 19:01 UTC (permalink / raw)
To: Liu, Yi L, Jacob Pan, iommu@lists.linux-foundation.org, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Wysocki, Rafael J
Cc: Lan, Tianyu, Tian, Kevin, Raj, Ashok, Alex Williamson
Hi Yi,
Sorry for the late reply, I seem to have missed this.
On 20/10/17 11:07, Liu, Yi L wrote:
[...]
>>> +
>>> +/* Generic fault types, can be expanded IRQ remapping fault */
>>> +enum iommu_fault_type {
>>> + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
>>> + IOMMU_FAULT_PAGE_REQ, /* page request fault */
>>> +};
>>> +
>>> +enum iommu_fault_reason {
>>> + IOMMU_FAULT_REASON_CTX = 1,
>>
>> If I read the VT-d spec right, this is a fault encountered while fetching the PASID table
>> pointer?
>>
>>> + IOMMU_FAULT_REASON_ACCESS,
>>
>> And this a pgd or pte access fault?
>>
>>> + IOMMU_FAULT_REASON_INVALIDATE,
>>
>> What would this be?
>>
>>> + IOMMU_FAULT_REASON_UNKNOWN,
>>> +};
>>
>> I'm currently doing the same exploratory work for virtio-iommu, and I'd be tempted
>> to report reasons as detailed as possible to guest or device driver, but it's not clear
>> what they need, how they would use this information. I'd like to discuss this some
>> more.
>
> [Liu, Yi L] In fact, it's not necessary to pass the detailed unrecoverable fault to guest in
> virtualization case. Unrecoverable fault happened on native indicates fault during native
> IOMMU address translation. If the fault is not due to guest IOMMU page table setting,
> then it is not necessary to inject the fault to guest. And hypervisor should be able to
> deduce it by walking the guest IOMMU page table with the fault address.
I'm not sure the hypervisor should go and inspect the guest's page tables.
The pIOMMU already did the walk and reported the fault, so the hypervisor
knows that they are invalid. I thought VT-d and other pIOMMUs provide
enough information in the fault report to tell if the error was due to
invalid page tables?
> So I think for
> virtualization case, pass the fault address is enough. If hypervisor doesn't see any issue
> after checking the guest IOMMU translation hierarchy, no use to let guest know it. Hypervisor
> can either throw error log or stop the guest. If hypervisor see any error in the guest
> iommu translation hierarchy, then inject the error to guest with a proper fault type.
>
> But for device driver or other user-space driver, I'm not sure if they need detailed fault
> info. In fact, it is enough to pass the possible info which would help them to deduce whether
> the unrecoverable fault is due to them. This need more inputs from device driver reviewers.
Agreed, though I'm not sure how to reach them.
At the moment, the only users of report_iommu_fault, the existing fault
reporting mechanism, are ARM-based IOMMU drivers and there are only four
device drivers that register a handler with iommu_set_fault_handler. Two
of them simply print the fault, one resets the offending device, and the
last one (msm GPU) wants to provide more detailed debugging information
about the device state.
>> For unrecoverable faults I guess CTX means "the host IOMMU driver is broken", since
>> the device tables are invalid. In which case there is no use continuing, trying to
>> shutdown the device cleanly is really all the guest/device driver can do.
>
> [Liu, Yi L] Not sure about what device table mean here. But I agree that if host IOMMU
> driver has no valid CTX for the device, then this kind of error should result in a shutdown to
> the device.
Yes by device table I meant VT-d's root table and context tables.
>> For ACCESS the error is the device driver's or guest's fault, since the device driver
>> triggered DMA on unmapped buffers, or the guest didn't install the right page tables.
>> This can be repaired without shutting down, it may even just be one execution
>> stream that failed in the device while the others continued normally. It's not as
>> recoverable as a PRI Page Request, but the device driver may still be able to isolate
>> the problem (e.g. by killing the process responsible) and the device to recover from it.
>>
>> So maybe ACCESS would benefit from more details, for example differentiating
>> faults encountered while fetching the pgd from those encountered while fetching a
>> second-level table or pte. The former is a lot less recoverable than the latter (bug in
>> the guest IOMMU driver vs.
>> bug in the device driver).
>>
>> Generalizing this maybe we should differentiate each step of the translation in
>> fault_reason:
>>
>> * Device entry (context) fetch -> host IOMMU driver's fault
>> * PASID table fetch -> guest IOMMU driver or host userspace's fault
>> * pgd fetch -> guest IOMMU driver's fault
>> * pte fetch, including validity and access check -> device driver's fault
>
> [Liu, Yi L] It's a good summary here. BTW. why pte fetch is due to device driver's fault?
Mmh, not necessarily the device driver's fault, but the most likely cause
is that the device driver didn't call map() before triggering the DMA.
Another less likely cause is a programming error in the vIOMMU driver,
where it failed to perform the map() or populate the page tables properly.
Thanks,
Jean
^ permalink raw reply [flat|nested] 109+ messages in thread
* RE: [PATCH v2 08/16] iommu: introduce device fault data
2017-11-06 19:01 ` Jean-Philippe Brucker
(?)
@ 2017-11-07 8:40 ` Liu, Yi L
[not found] ` <A2975661238FB949B60364EF0F2C257439B06809-0J0gbvR4kTg/UvCtAeCM4rfspsVTdybXVpNB7YpNyf8@public.gmane.org>
-1 siblings, 1 reply; 109+ messages in thread
From: Liu, Yi L @ 2017-11-07 8:40 UTC (permalink / raw)
To: Jean-Philippe Brucker, Jacob Pan,
iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Wysocki, Rafael J
Cc: Lan, Tianyu, Tian, Kevin, Raj, Ashok, Alex Williamson
Hi Jean,
Nice to have you "online". This open is really blocking the progress. Pls check inline.
> -----Original Message-----
> From: Jean-Philippe Brucker [mailto:jean-philippe.brucker@arm.com]
> Sent: Tuesday, November 7, 2017 3:02 AM
> To: Liu, Yi L <yi.l.liu@intel.com>; Jacob Pan <jacob.jun.pan@linux.intel.com>;
> iommu@lists.linux-foundation.org; LKML <linux-kernel@vger.kernel.org>; Joerg
> Roedel <joro@8bytes.org>; David Woodhouse <dwmw2@infradead.org>; Greg
> Kroah-Hartman <gregkh@linuxfoundation.org>; Wysocki, Rafael J
> <rafael.j.wysocki@intel.com>
> Cc: Lan, Tianyu <tianyu.lan@intel.com>; Tian, Kevin <kevin.tian@intel.com>; Raj,
> Ashok <ashok.raj@intel.com>; Alex Williamson <alex.williamson@redhat.com>
> Subject: Re: [PATCH v2 08/16] iommu: introduce device fault data
>
> Hi Yi,
>
> Sorry for the late reply, I seem to have missed this.
>
> On 20/10/17 11:07, Liu, Yi L wrote:
> [...]
> >>> +
> >>> +/* Generic fault types, can be expanded IRQ remapping fault */
> >>> +enum iommu_fault_type {
> >>> + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
> >>> + IOMMU_FAULT_PAGE_REQ, /* page request fault */
> >>> +};
> >>> +
> >>> +enum iommu_fault_reason {
> >>> + IOMMU_FAULT_REASON_CTX = 1,
> >>
> >> If I read the VT-d spec right, this is a fault encountered while
> >> fetching the PASID table pointer?
> >>
> >>> + IOMMU_FAULT_REASON_ACCESS,
> >>
> >> And this a pgd or pte access fault?
> >>
> >>> + IOMMU_FAULT_REASON_INVALIDATE,
> >>
> >> What would this be?
> >>
> >>> + IOMMU_FAULT_REASON_UNKNOWN,
> >>> +};
> >>
> >> I'm currently doing the same exploratory work for virtio-iommu, and
> >> I'd be tempted to report reasons as detailed as possible to guest or
> >> device driver, but it's not clear what they need, how they would use
> >> this information. I'd like to discuss this some more.
> >
> > [Liu, Yi L] In fact, it's not necessary to pass the detailed
> > unrecoverable fault to guest in virtualization case. Unrecoverable
> > fault happened on native indicates fault during native IOMMU address
> > translation. If the fault is not due to guest IOMMU page table
> > setting, then it is not necessary to inject the fault to guest. And hypervisor should
> be able to deduce it by walking the guest IOMMU page table with the fault address.
>
> I'm not sure the hypervisor should go and inspect the guest's page tables.
[Liu, Yi L] I think the hypervisor needs to do it to make sure faults are reported
to the guest correctly. If not, the hypervisor may report some fault to the guest and
confuse it. E.g. the pIOMMU walks the page table and fails while walking the root
table (VT-d) or device table (SMMU). Such a fault is due to invalid programming in the
host; the guest is not responsible for it and has no knowledge of how to fix it. It
would make the guest believe that it has programmed the root table or device table in
the wrong way, which is not the case.
> The pIOMMU already did the walk and reported the fault, so the hypervisor knows
> that they are invalid. I thought VT-d and other pIOMMUs provide enough
> information in the fault report to tell if the error was due to invalid page tables?
[Liu, Yi L] yes, pIOMMU did walk and get the fault info, but it's not sure who is
responsible to the fault. With inspecting the guest table, hypervisor may know who
should be responsible to the fault.
>
> > So I think for
> > virtualization case, pass the fault address is enough. If hypervisor
> > doesn't see any issue after checking the guest IOMMU translation
> > hierarchy, no use to let guest know it. Hypervisor can either throw
> > error log or stop the guest. If hypervisor see any error in the guest
> > iommu translation hierarchy, then inject the error to guest with a
> > proper fault type.> But for device driver or other user-space driver,
> > I'm not sure if they need detailed fault info. In fact, it is enough to pass the
> possible info which would help them to deduce whether the unrecoverable fault is
> due to them. This need more inputs from device driver reviewers.
>
> Agreed, though I'm not sure how to reach them.
[Liu, Yi L] I'd like to supplement my words here. Except the fault address, we may also
need to provide the BDF and PASID if it is there.
>
> At the moment, the only users of report_iommu_fault, the existing fault reporting
> mechanism, are ARM-based IOMMU drivers and there are only four device drivers
> that register a handler with iommu_set_fault_handler. Two of them simply print the
> fault, one resets the offending device, and the last one (msm GPU) wants to provide
> more detailed debugging information about the device state.
[Liu, Yi L] Well, it looks like device driver may not try to fix the fault, instead, it would
more likely do kind of clean up after un-recoverable fault. If so, it may be enough to
have a notification to device driver.
> >> For unrecoverable faults I guess CTX means "the host IOMMU driver is
> >> broken", since the device tables are invalid. In which case there is
> >> no use continuing, trying to shutdown the device cleanly is really all the
> guest/device driver can do.
> >
> > [Liu, Yi L] Not sure about what device table mean here. But I agree
> > that if host IOMMU driver has no valid CTX for the device, then this
> > kind of error should result in a shutdown to the device.
>
> Yes by device table I meant VT-d's root table and context tables.
[Liu, Yi L] I see.
> >> For ACCESS the error is the device driver's or guest's fault, since
> >> the device driver triggered DMA on unmapped buffers, or the guest didn't install
> the right page tables.
> >> This can be repaired without shutting down, it may even just be one
> >> execution stream that failed in the device while the others continued
> >> normally. It's not as recoverable as a PRI Page Request, but the
> >> device driver may still be able to isolate the problem (e.g. by killing the process
> responsible) and the device to recover from it.
> >>
> >> So maybe ACCESS would benefit from more details, for example
> >> differentiating faults encountered while fetching the pgd from those
> >> encountered while fetching a second-level table or pte. The former is
> >> a lot less recoverable than the latter (bug in the guest IOMMU driver vs.
> >> bug in the device driver).
> >>
> >> Generalizing this maybe we should differentiate each step of the
> >> translation in
> >> fault_reason:
> >>
> >> * Device entry (context) fetch -> host IOMMU driver's fault
> >> * PASID table fetch -> guest IOMMU driver or host userspace's fault
> >> * pgd fetch -> guest IOMMU driver's fault
> >> * pte fetch, including validity and access check -> device driver's
> >> fault
> >
> > [Liu, Yi L] It's a good summary here. BTW. why pte fetch is due to device driver's
> fault?
>
> Mmh, not necessarily the device driver's fault, but the most likely cause is that the
> device driver didn't call map() before triggering the DMA.
> Another less likely cause is a programming error in the vIOMMU driver, where it
> failed to perform the map() or populate the page tables properly.
[Liu, Yi L] Yes, I think for fault during iova(host iova or GPA) translation, the most likely
reason would be no calling of map() since we are using synchronized map API.
Besides the four reasons you listed above, I think there is still other reasons like
no present bit, invalid programming or so. And also, we have several tables which may
be referenced in an address translation. e.g. VT-d, we have root table, CTX table, pasid
table, translation page table(1st level, 2nd level). I think AMD-iommu and SMMU should
have similar stuffs?
Regards,
Yi L
^ permalink raw reply [flat|nested] 109+ messages in thread
* [PATCH v2 09/16] driver core: add iommu device fault reporting data
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
DMA faults can be detected by IOMMU at device level. Adding a pointer
to struct device allows IOMMU subsystem to report relevant faults
back to the device driver for further handling.
For direct assigned device (or user space drivers), guest OS holds
responsibility to handle and respond per device IOMMU fault.
Therefore we need fault reporting mechanism to propagate faults beyond
IOMMU subsystem.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
---
include/linux/device.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/include/linux/device.h b/include/linux/device.h
index 1d26079..4e3d543 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,6 +42,7 @@ struct fwnode_handle;
struct iommu_ops;
struct iommu_group;
struct iommu_fwspec;
+struct iommu_fault_param;
struct bus_attribute {
struct attribute attr;
@@ -873,6 +874,7 @@ struct dev_links_info {
* device (i.e. the bus driver that discovered the device).
* @iommu_group: IOMMU group the device belongs to.
* @iommu_fwspec: IOMMU-specific properties supplied by firmware.
+ * @iommu_fault_param: Per device generic IOMMU runtime parameters
*
* @offline_disabled: If set, the device is permanently online.
* @offline: Set after successful invocation of bus type's .offline().
@@ -962,6 +964,7 @@ struct device {
void (*release)(struct device *dev);
struct iommu_group *iommu_group;
struct iommu_fwspec *iommu_fwspec;
+ struct iommu_fault_param *iommu_fault_param;
bool offline_disabled:1;
bool offline:1;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 09/16] driver core: add iommu device fault reporting data
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
DMA faults can be detected by IOMMU at device level. Adding a pointer
to struct device allows IOMMU subsystem to report relevant faults
back to the device driver for further handling.
For direct assigned device (or user space drivers), guest OS holds
responsibility to handle and respond per device IOMMU fault.
Therefore we need fault reporting mechanism to propagate faults beyond
IOMMU subsystem.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
include/linux/device.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/include/linux/device.h b/include/linux/device.h
index 1d26079..4e3d543 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,6 +42,7 @@ struct fwnode_handle;
struct iommu_ops;
struct iommu_group;
struct iommu_fwspec;
+struct iommu_fault_param;
struct bus_attribute {
struct attribute attr;
@@ -873,6 +874,7 @@ struct dev_links_info {
* device (i.e. the bus driver that discovered the device).
* @iommu_group: IOMMU group the device belongs to.
* @iommu_fwspec: IOMMU-specific properties supplied by firmware.
+ * @iommu_fault_param: Per device generic IOMMU runtime parameters
*
* @offline_disabled: If set, the device is permanently online.
* @offline: Set after successful invocation of bus type's .offline().
@@ -962,6 +964,7 @@ struct device {
void (*release)(struct device *dev);
struct iommu_group *iommu_group;
struct iommu_fwspec *iommu_fwspec;
+ struct iommu_fault_param *iommu_fault_param;
bool offline_disabled:1;
bool offline:1;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
[parent not found: <1507244624-39189-10-git-send-email-jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>]
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-06 5:43 ` Greg Kroah-Hartman
-1 siblings, 0 replies; 109+ messages in thread
From: Greg Kroah-Hartman @ 2017-10-06 5:43 UTC (permalink / raw)
To: Jacob Pan
Cc: Lan Tianyu, Rafael Wysocki, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
David Woodhouse
On Thu, Oct 05, 2017 at 04:03:37PM -0700, Jacob Pan wrote:
> DMA faults can be detected by IOMMU at device level. Adding a pointer
> to struct device allows IOMMU subsystem to report relevant faults
> back to the device driver for further handling.
> For direct assigned device (or user space drivers), guest OS holds
> responsibility to handle and respond per device IOMMU fault.
> Therefore we need fault reporting mechanism to propagate faults beyond
> IOMMU subsystem.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Acked-by: Greg Kroah-Hartman <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
@ 2017-10-06 5:43 ` Greg Kroah-Hartman
0 siblings, 0 replies; 109+ messages in thread
From: Greg Kroah-Hartman @ 2017-10-06 5:43 UTC (permalink / raw)
To: Jacob Pan
Cc: iommu, LKML, Joerg Roedel, David Woodhouse, Rafael Wysocki,
Jean-Philippe Brucker, Liu, Yi L, Lan Tianyu, Tian, Kevin,
Raj Ashok, Alex Williamson
On Thu, Oct 05, 2017 at 04:03:37PM -0700, Jacob Pan wrote:
> DMA faults can be detected by IOMMU at device level. Adding a pointer
> to struct device allows IOMMU subsystem to report relevant faults
> back to the device driver for further handling.
> For direct assigned device (or user space drivers), guest OS holds
> responsibility to handle and respond per device IOMMU fault.
> Therefore we need fault reporting mechanism to propagate faults beyond
> IOMMU subsystem.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
2017-10-05 23:03 ` Jacob Pan
(?)
(?)
@ 2017-10-06 7:11 ` Christoph Hellwig
2017-10-06 8:26 ` Greg Kroah-Hartman
[not found] ` <20171006071145.GA24354-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
-1 siblings, 2 replies; 109+ messages in thread
From: Christoph Hellwig @ 2017-10-06 7:11 UTC (permalink / raw)
To: Jacob Pan
Cc: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker, Lan Tianyu
On Thu, Oct 05, 2017 at 04:03:37PM -0700, Jacob Pan wrote:
> DMA faults can be detected by IOMMU at device level. Adding a pointer
> to struct device allows IOMMU subsystem to report relevant faults
> back to the device driver for further handling.
> For direct assigned device (or user space drivers), guest OS holds
> responsibility to handle and respond per device IOMMU fault.
> Therefore we need fault reporting mechanism to propagate faults beyond
> IOMMU subsystem.
We use struct device all over the system, and I don't think we should
bloat it for fringe case IOMMU bits.
Someone really needs to take a step back and figure out how to move
this into a structure that's only allocated for device that actually
can do physical DMA (and/or have an iommu attached)
This is the 3rd iommu field, in addition to 8 dma-specific fields
that we carry around for each struct device.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
2017-10-06 7:11 ` Christoph Hellwig
@ 2017-10-06 8:26 ` Greg Kroah-Hartman
[not found] ` <20171006071145.GA24354-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
1 sibling, 0 replies; 109+ messages in thread
From: Greg Kroah-Hartman @ 2017-10-06 8:26 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jacob Pan, iommu, LKML, Joerg Roedel, David Woodhouse,
Rafael Wysocki, Jean-Philippe Brucker, Lan Tianyu
On Fri, Oct 06, 2017 at 12:11:45AM -0700, Christoph Hellwig wrote:
> On Thu, Oct 05, 2017 at 04:03:37PM -0700, Jacob Pan wrote:
> > DMA faults can be detected by IOMMU at device level. Adding a pointer
> > to struct device allows IOMMU subsystem to report relevant faults
> > back to the device driver for further handling.
> > For direct assigned device (or user space drivers), guest OS holds
> > responsibility to handle and respond per device IOMMU fault.
> > Therefore we need fault reporting mechanism to propagate faults beyond
> > IOMMU subsystem.
>
> We use struct device all over the system, and I don't think we should
> bloat it for fringe case IOMMU bits.
>
> Someone really needs to take a step back and figure out how to move
> this into a structure that's only allocated for device that actually
> can do physical DMA (and/or have an iommu attached)
>
> This is the 3rd iommu field, in addition to 8 dma-specific fields
> that we carry around for each struct device.
Ick, 8? Yeah, it's getting big... How about just a single pointer for
iommu and dma-specific stuff that you all can hang crap like this off
of if needed?
thanks,
greg k-h
^ permalink raw reply [flat|nested] 109+ messages in thread
[parent not found: <20171006071145.GA24354-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>]
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
2017-10-06 7:11 ` Christoph Hellwig
@ 2017-10-06 8:39 ` Joerg Roedel
[not found] ` <20171006071145.GA24354-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
1 sibling, 0 replies; 109+ messages in thread
From: Joerg Roedel @ 2017-10-06 8:39 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Lan Tianyu, Greg Kroah-Hartman, Rafael Wysocki, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
David Woodhouse
On Fri, Oct 06, 2017 at 12:11:45AM -0700, Christoph Hellwig wrote:
> This is the 3rd iommu field, in addition to 8 dma-specific fields
> that we carry around for each struct device.
Agreed, consolidating the iommu-fields in 'struct device' into a single
'struct iommu_data' is on my todo-list.
Jacob, can you add that 'struct iommu_data' to 'struct device' and put
your fault-data into it? We can then move on and migrate the other
fields into that struct too.
Regards,
Joerg
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
@ 2017-10-06 8:39 ` Joerg Roedel
0 siblings, 0 replies; 109+ messages in thread
From: Joerg Roedel @ 2017-10-06 8:39 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jacob Pan, iommu, LKML, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker, Lan Tianyu
On Fri, Oct 06, 2017 at 12:11:45AM -0700, Christoph Hellwig wrote:
> This is the 3rd iommu field, in addition to 8 dma-specific fields
> that we carry around for each struct device.
Agreed, consolidating the iommu-fields in 'struct device' into a single
'struct iommu_data' is on my todo-list.
Jacob, can you add that 'struct iommu_data' to 'struct device' and put
your fault-data into it? We can then move on and migrate the other
fields into that struct too.
Regards,
Joerg
^ permalink raw reply [flat|nested] 109+ messages in thread
[parent not found: <20171006083931.GY8398-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>]
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
2017-10-06 8:39 ` Joerg Roedel
@ 2017-10-06 16:22 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-06 16:22 UTC (permalink / raw)
To: Joerg Roedel
Cc: Lan Tianyu, Greg Kroah-Hartman, Rafael Wysocki, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
David Woodhouse
On Fri, 6 Oct 2017 10:39:31 +0200
Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org> wrote:
> On Fri, Oct 06, 2017 at 12:11:45AM -0700, Christoph Hellwig wrote:
> > This is the 3rd iommu field, in addition to 8 dma-specific fields
> > that we carry around for each struct device.
>
> Agreed, consolidating the iommu-fields in 'struct device' into a
> single 'struct iommu_data' is on my todo-list.
>
> Jacob, can you add that 'struct iommu_data' to 'struct device' and put
> your fault-data into it? We can then move on and migrate the other
> fields into that struct too.
>
sounds good.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data
@ 2017-10-06 16:22 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-06 16:22 UTC (permalink / raw)
To: Joerg Roedel
Cc: Christoph Hellwig, iommu, LKML, David Woodhouse,
Greg Kroah-Hartman, Rafael Wysocki, Jean-Philippe Brucker,
Lan Tianyu, jacob.jun.pan
On Fri, 6 Oct 2017 10:39:31 +0200
Joerg Roedel <joro@8bytes.org> wrote:
> On Fri, Oct 06, 2017 at 12:11:45AM -0700, Christoph Hellwig wrote:
> > This is the 3rd iommu field, in addition to 8 dma-specific fields
> > that we carry around for each struct device.
>
> Agreed, consolidating the iommu-fields in 'struct device' into a
> single 'struct iommu_data' is on my todo-list.
>
> Jacob, can you add that 'struct iommu_data' to 'struct device' and put
> your fault-data into it? We can then move on and migrate the other
> fields into that struct too.
>
sounds good.
^ permalink raw reply [flat|nested] 109+ messages in thread
* [PATCH v2 10/16] iommu: introduce device fault report API
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
Traditionally, device specific faults are detected and handled within
their own device drivers. When IOMMU is enabled, faults such as DMA
related transactions are detected by IOMMU. There is no generic
reporting mechanism to report faults back to the in-kernel device
driver or the guest OS in case of assigned devices.
Faults detected by IOMMU are based on the transaction's source ID which
can be reported on a per device basis, regardless of whether the device
is a PCI device or not.
The fault types include recoverable (e.g. page request) and
unrecoverable faults(e.g. access error). In most cases, faults can be
handled by IOMMU drivers internally. The primary use cases are as
follows:
1. page request fault originated from an SVM capable device that is
assigned to guest via vIOMMU. In this case, the first level page tables
are owned by the guest. Page request must be propagated to the guest to
let guest OS fault in the pages then send page response. In this
mechanism, the direct receiver of IOMMU fault notification is VFIO,
which can relay notification events to QEMU or other user space
software.
2. faults that need more subtle handling by device drivers. Rather than
simply invoking a reset function, there is a need to let the device driver
handle the fault with a smaller impact.
This patchset is intended to create a generic fault report API such
that it can scale as follows:
- all IOMMU types
- PCI and non-PCI devices
- recoverable and unrecoverable faults
- VFIO and other in kernel users
- DMA & IRQ remapping (TBD)
The original idea was brought up by David Woodhouse and discussions
summarized at https://lwn.net/Articles/608914/.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
drivers/iommu/iommu.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/iommu.h | 23 +++++++++++++++++++++
2 files changed, 78 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5a14154..0b058e2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -554,9 +554,15 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
device->dev = dev;
+ dev->iommu_fault_param = kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+ if (!dev->iommu_fault_param) {
+ ret = -ENOMEM;
+ goto err_free_device;
+ }
+
ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
if (ret)
- goto err_free_device;
+ goto err_free_device_iommu_fault_param;
device->name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj));
rename:
@@ -615,6 +621,8 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
kfree(device->name);
err_remove_link:
sysfs_remove_link(&dev->kobj, "iommu_group");
+err_free_device_iommu_fault_param:
+ kfree(dev->iommu_fault_param);
err_free_device:
kfree(device);
pr_err("Failed to add device %s to group %d: %d\n", dev_name(dev), group->id, ret);
@@ -791,6 +799,52 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
}
EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
+int iommu_register_device_fault_handler(struct device *dev,
+ iommu_dev_fault_handler_t handler)
+{
+ if (dev->iommu_fault_param)
+ return -EBUSY;
+ get_device(dev);
+ dev->iommu_fault_param =
+ kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+ if (!dev->iommu_fault_param)
+ return -ENOMEM;
+ dev->iommu_fault_param->dev_fault_handler = handler;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
+
+int iommu_unregister_device_fault_handler(struct device *dev)
+{
+ if (!dev->iommu_fault_param)
+ return -EINVAL;
+
+ kfree(dev->iommu_fault_param);
+ dev->iommu_fault_param = NULL;
+ put_device(dev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
+
+
+int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+ /* we only report device fault if there is a handler registered */
+ if (!dev->iommu_fault_param ||
+ !dev->iommu_fault_param->dev_fault_handler)
+ return -ENOSYS;
+ if (evt->type == IOMMU_FAULT_PAGE_REQ &&
+ !dev->iommu_fault_param->pasid_tbl_bound) {
+ dev_warn(dev, "PRQ not propaged, PASID table not bound\n");
+ return -EPERM;
+ }
+
+ return dev->iommu_fault_param->dev_fault_handler(dev, evt);
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_fault);
+
/**
* iommu_group_id - Return ID for a group
* @group: the group to ID
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3f9b367..44d2ada 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -416,6 +416,13 @@ extern int iommu_group_register_notifier(struct iommu_group *group,
struct notifier_block *nb);
extern int iommu_group_unregister_notifier(struct iommu_group *group,
struct notifier_block *nb);
+extern int iommu_register_device_fault_handler(struct device *dev,
+ iommu_dev_fault_handler_t handler);
+
+extern int iommu_unregister_device_fault_handler(struct device *dev);
+
+extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt);
+
extern int iommu_group_id(struct iommu_group *group);
extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
@@ -699,6 +706,22 @@ static inline int iommu_group_unregister_notifier(struct iommu_group *group,
return 0;
}
+static inline int iommu_register_device_fault_handler(struct device *dev,
+ iommu_dev_fault_handler_t handler)
+{
+ return 0;
+}
+
+static inline int iommu_unregister_device_fault_handler(struct device *dev)
+{
+ return 0;
+}
+
+static inline int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+ return 0;
+}
+
static inline int iommu_group_id(struct iommu_group *group)
{
return -ENODEV;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread* [PATCH v2 10/16] iommu: introduce device fault report API
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
Traditionally, device specific faults are detected and handled within
their own device drivers. When IOMMU is enabled, faults such as DMA
related transactions are detected by IOMMU. There is no generic
reporting mechanism to report faults back to the in-kernel device
driver or the guest OS in case of assigned devices.
Faults detected by the IOMMU are based on the transaction's source ID,
which can be reported on a per-device basis, regardless of whether the
device is a PCI device or not.
The fault types include recoverable (e.g. page request) and
unrecoverable faults(e.g. access error). In most cases, faults can be
handled by IOMMU drivers internally. The primary use cases are as
follows:
1. page request fault originated from an SVM capable device that is
assigned to guest via vIOMMU. In this case, the first level page tables
are owned by the guest. Page request must be propagated to the guest to
let guest OS fault in the pages then send page response. In this
mechanism, the direct receiver of IOMMU fault notification is VFIO,
which can relay notification events to QEMU or other user space
software.
2. faults that need more subtle handling by device drivers. Rather than
simply invoking a reset function, there is a need to let the device driver
handle the fault with a smaller impact.
This patchset is intended to create a generic fault report API such
that it can scale as follows:
- all IOMMU types
- PCI and non-PCI devices
- recoverable and unrecoverable faults
- VFIO and other in-kernel users
- DMA & IRQ remapping (TBD)
The original idea was brought up by David Woodhouse and discussions
summarized at https://lwn.net/Articles/608914/.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
---
drivers/iommu/iommu.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/iommu.h | 23 +++++++++++++++++++++
2 files changed, 78 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5a14154..0b058e2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -554,9 +554,15 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
device->dev = dev;
+ dev->iommu_fault_param = kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+ if (!dev->iommu_fault_param) {
+ ret = -ENOMEM;
+ goto err_free_device;
+ }
+
ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
if (ret)
- goto err_free_device;
+ goto err_free_device_iommu_fault_param;
device->name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj));
rename:
@@ -615,6 +621,8 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
kfree(device->name);
err_remove_link:
sysfs_remove_link(&dev->kobj, "iommu_group");
+err_free_device_iommu_fault_param:
+ kfree(dev->iommu_fault_param);
err_free_device:
kfree(device);
pr_err("Failed to add device %s to group %d: %d\n", dev_name(dev), group->id, ret);
@@ -791,6 +799,52 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
}
EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
+int iommu_register_device_fault_handler(struct device *dev,
+ iommu_dev_fault_handler_t handler)
+{
+ if (dev->iommu_fault_param)
+ return -EBUSY;
+ get_device(dev);
+ dev->iommu_fault_param =
+ kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+ if (!dev->iommu_fault_param)
+ return -ENOMEM;
+ dev->iommu_fault_param->dev_fault_handler = handler;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
+
+int iommu_unregister_device_fault_handler(struct device *dev)
+{
+ if (!dev->iommu_fault_param)
+ return -EINVAL;
+
+ kfree(dev->iommu_fault_param);
+ dev->iommu_fault_param = NULL;
+ put_device(dev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
+
+
+int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+ /* we only report device fault if there is a handler registered */
+ if (!dev->iommu_fault_param ||
+ !dev->iommu_fault_param->dev_fault_handler)
+ return -ENOSYS;
+ if (evt->type == IOMMU_FAULT_PAGE_REQ &&
+ !dev->iommu_fault_param->pasid_tbl_bound) {
+ dev_warn(dev, "PRQ not propaged, PASID table not bound\n");
+ return -EPERM;
+ }
+
+ return dev->iommu_fault_param->dev_fault_handler(dev, evt);
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_fault);
+
/**
* iommu_group_id - Return ID for a group
* @group: the group to ID
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3f9b367..44d2ada 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -416,6 +416,13 @@ extern int iommu_group_register_notifier(struct iommu_group *group,
struct notifier_block *nb);
extern int iommu_group_unregister_notifier(struct iommu_group *group,
struct notifier_block *nb);
+extern int iommu_register_device_fault_handler(struct device *dev,
+ iommu_dev_fault_handler_t handler);
+
+extern int iommu_unregister_device_fault_handler(struct device *dev);
+
+extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt);
+
extern int iommu_group_id(struct iommu_group *group);
extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
@@ -699,6 +706,22 @@ static inline int iommu_group_unregister_notifier(struct iommu_group *group,
return 0;
}
+static inline int iommu_register_device_fault_handler(struct device *dev,
+ iommu_dev_fault_handler_t handler)
+{
+ return 0;
+}
+
+static inline int iommu_unregister_device_fault_handler(struct device *dev)
+{
+ return 0;
+}
+
+static inline int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+ return 0;
+}
+
static inline int iommu_group_id(struct iommu_group *group)
{
return -ENODEV;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread[parent not found: <1507244624-39189-11-git-send-email-jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>]
* Re: [PATCH v2 10/16] iommu: introduce device fault report API
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-06 9:36 ` Jean-Philippe Brucker
-1 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-10-06 9:36 UTC (permalink / raw)
To: Jacob Pan,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki
Cc: Lan Tianyu
Hi Jacob,
On 06/10/17 00:03, Jacob Pan wrote:
> Traditionally, device specific faults are detected and handled within
> their own device drivers. When IOMMU is enabled, faults such as DMA
> related transactions are detected by IOMMU. There is no generic
> reporting mechanism to report faults back to the in-kernel device
> driver or the guest OS in case of assigned devices.
>
> Faults detected by IOMMU is based on the transaction's source ID which
> can be reported at per device basis, regardless of the device type is a
> PCI device or not.
>
> The fault types include recoverable (e.g. page request) and
> unrecoverable faults(e.g. access error). In most cases, faults can be
> handled by IOMMU drivers internally. The primary use cases are as
> follows:
> 1. page request fault originated from an SVM capable device that is
> assigned to guest via vIOMMU. In this case, the first level page tables
> are owned by the guest. Page request must be propagated to the guest to
> let guest OS fault in the pages then send page response. In this
> mechanism, the direct receiver of IOMMU fault notification is VFIO,
> which can relay notification events to QEMU or other user space
> software.
>
> 2. faults need more subtle handling by device drivers. Other than
> simply invoke reset function, there are needs to let device driver
> handle the fault with a smaller impact.
>
> This patchset is intended to create a generic fault report API such
> that it can scale as follows:
> - all IOMMU types
> - PCI and non-PCI devices
> - recoverable and unrecoverable faults
> - VFIO and other other in kernel users
> - DMA & IRQ remapping (TBD)
> The original idea was brought up by David Woodhouse and discussions
> summarized at https://lwn.net/Articles/608914/.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> ---
[...]
> +int iommu_register_device_fault_handler(struct device *dev,
> + iommu_dev_fault_handler_t handler)
> +{
> + if (dev->iommu_fault_param)
> + return -EBUSY;
> + get_device(dev);
> + dev->iommu_fault_param =
> + kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
> + if (!dev->iommu_fault_param)
> + return -ENOMEM;
> + dev->iommu_fault_param->dev_fault_handler = handler;
Since the handler is owned by a device driver, you also need to clean it
up when switching the driver (native->VFIO and VFIO->native), in
iommu_attach_device I suppose.
Thanks,
Jean
^ permalink raw reply [flat|nested] 109+ messages in thread* Re: [PATCH v2 10/16] iommu: introduce device fault report API
@ 2017-10-06 9:36 ` Jean-Philippe Brucker
0 siblings, 0 replies; 109+ messages in thread
From: Jean-Philippe Brucker @ 2017-10-06 9:36 UTC (permalink / raw)
To: Jacob Pan, iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson
Hi Jacob,
On 06/10/17 00:03, Jacob Pan wrote:
> Traditionally, device specific faults are detected and handled within
> their own device drivers. When IOMMU is enabled, faults such as DMA
> related transactions are detected by IOMMU. There is no generic
> reporting mechanism to report faults back to the in-kernel device
> driver or the guest OS in case of assigned devices.
>
> Faults detected by IOMMU is based on the transaction's source ID which
> can be reported at per device basis, regardless of the device type is a
> PCI device or not.
>
> The fault types include recoverable (e.g. page request) and
> unrecoverable faults(e.g. access error). In most cases, faults can be
> handled by IOMMU drivers internally. The primary use cases are as
> follows:
> 1. page request fault originated from an SVM capable device that is
> assigned to guest via vIOMMU. In this case, the first level page tables
> are owned by the guest. Page request must be propagated to the guest to
> let guest OS fault in the pages then send page response. In this
> mechanism, the direct receiver of IOMMU fault notification is VFIO,
> which can relay notification events to QEMU or other user space
> software.
>
> 2. faults need more subtle handling by device drivers. Other than
> simply invoke reset function, there are needs to let device driver
> handle the fault with a smaller impact.
>
> This patchset is intended to create a generic fault report API such
> that it can scale as follows:
> - all IOMMU types
> - PCI and non-PCI devices
> - recoverable and unrecoverable faults
> - VFIO and other other in kernel users
> - DMA & IRQ remapping (TBD)
> The original idea was brought up by David Woodhouse and discussions
> summarized at https://lwn.net/Articles/608914/.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> ---
[...]
> +int iommu_register_device_fault_handler(struct device *dev,
> + iommu_dev_fault_handler_t handler)
> +{
> + if (dev->iommu_fault_param)
> + return -EBUSY;
> + get_device(dev);
> + dev->iommu_fault_param =
> + kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
> + if (!dev->iommu_fault_param)
> + return -ENOMEM;
> + dev->iommu_fault_param->dev_fault_handler = handler;
Since the handler is owned by a device driver, you also need to clean it
up when switching the driver (native->VFIO and VFIO->native), in
iommu_attach_device I suppose.
Thanks,
Jean
^ permalink raw reply [flat|nested] 109+ messages in thread[parent not found: <5103e49c-d74c-c697-b5f7-e5c54edce595-5wv7dgnIgG8@public.gmane.org>]
* Re: [PATCH v2 10/16] iommu: introduce device fault report API
2017-10-06 9:36 ` Jean-Philippe Brucker
@ 2017-10-09 18:50 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-09 18:50 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: Lan Tianyu, Greg Kroah-Hartman, Rafael Wysocki, LKML,
iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
David Woodhouse
On Fri, 6 Oct 2017 10:36:02 +0100
Jean-Philippe Brucker <jean-philippe.brucker-5wv7dgnIgG8@public.gmane.org> wrote:
> Hi Jacob,
>
> On 06/10/17 00:03, Jacob Pan wrote:
> > Traditionally, device specific faults are detected and handled
> > within their own device drivers. When IOMMU is enabled, faults such
> > as DMA related transactions are detected by IOMMU. There is no
> > generic reporting mechanism to report faults back to the in-kernel
> > device driver or the guest OS in case of assigned devices.
> >
> > Faults detected by IOMMU is based on the transaction's source ID
> > which can be reported at per device basis, regardless of the device
> > type is a PCI device or not.
> >
> > The fault types include recoverable (e.g. page request) and
> > unrecoverable faults(e.g. access error). In most cases, faults can
> > be handled by IOMMU drivers internally. The primary use cases are as
> > follows:
> > 1. page request fault originated from an SVM capable device that is
> > assigned to guest via vIOMMU. In this case, the first level page
> > tables are owned by the guest. Page request must be propagated to
> > the guest to let guest OS fault in the pages then send page
> > response. In this mechanism, the direct receiver of IOMMU fault
> > notification is VFIO, which can relay notification events to QEMU
> > or other user space software.
> >
> > 2. faults need more subtle handling by device drivers. Other than
> > simply invoke reset function, there are needs to let device driver
> > handle the fault with a smaller impact.
> >
> > This patchset is intended to create a generic fault report API such
> > that it can scale as follows:
> > - all IOMMU types
> > - PCI and non-PCI devices
> > - recoverable and unrecoverable faults
> > - VFIO and other other in kernel users
> > - DMA & IRQ remapping (TBD)
> > The original idea was brought up by David Woodhouse and discussions
> > summarized at https://lwn.net/Articles/608914/.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> > Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > ---
> [...]
> > +int iommu_register_device_fault_handler(struct device *dev,
> > + iommu_dev_fault_handler_t
> > handler) +{
> > + if (dev->iommu_fault_param)
> > + return -EBUSY;
> > + get_device(dev);
> > + dev->iommu_fault_param =
> > + kzalloc(sizeof(struct iommu_fault_param),
> > GFP_KERNEL);
> > + if (!dev->iommu_fault_param)
> > + return -ENOMEM;
> > + dev->iommu_fault_param->dev_fault_handler = handler;
>
> Since the handler is owned by a device driver, you also need to clean
> it up when switching the driver (native->VFIO and VFIO->native), in
> iommu_attach_device I suppose.
>
I was thinking the driver who registered fault handler shall be held
accountable to unregister. e.g. User must unbind driver (unregister
fault handler included) before assigning device to vfio-pci. Otherwise,
VFIO call to register handler would fail.
I am assuming VFIO needs to have a separate device fault handler of its
own.
Jacob
^ permalink raw reply [flat|nested] 109+ messages in thread* Re: [PATCH v2 10/16] iommu: introduce device fault report API
@ 2017-10-09 18:50 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-09 18:50 UTC (permalink / raw)
To: Jean-Philippe Brucker
Cc: iommu@lists.linux-foundation.org, LKML, Joerg Roedel,
David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki, Liu, Yi L,
Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
jacob.jun.pan
On Fri, 6 Oct 2017 10:36:02 +0100
Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote:
> Hi Jacob,
>
> On 06/10/17 00:03, Jacob Pan wrote:
> > Traditionally, device specific faults are detected and handled
> > within their own device drivers. When IOMMU is enabled, faults such
> > as DMA related transactions are detected by IOMMU. There is no
> > generic reporting mechanism to report faults back to the in-kernel
> > device driver or the guest OS in case of assigned devices.
> >
> > Faults detected by IOMMU is based on the transaction's source ID
> > which can be reported at per device basis, regardless of the device
> > type is a PCI device or not.
> >
> > The fault types include recoverable (e.g. page request) and
> > unrecoverable faults(e.g. access error). In most cases, faults can
> > be handled by IOMMU drivers internally. The primary use cases are as
> > follows:
> > 1. page request fault originated from an SVM capable device that is
> > assigned to guest via vIOMMU. In this case, the first level page
> > tables are owned by the guest. Page request must be propagated to
> > the guest to let guest OS fault in the pages then send page
> > response. In this mechanism, the direct receiver of IOMMU fault
> > notification is VFIO, which can relay notification events to QEMU
> > or other user space software.
> >
> > 2. faults need more subtle handling by device drivers. Other than
> > simply invoke reset function, there are needs to let device driver
> > handle the fault with a smaller impact.
> >
> > This patchset is intended to create a generic fault report API such
> > that it can scale as follows:
> > - all IOMMU types
> > - PCI and non-PCI devices
> > - recoverable and unrecoverable faults
> > - VFIO and other other in kernel users
> > - DMA & IRQ remapping (TBD)
> > The original idea was brought up by David Woodhouse and discussions
> > summarized at https://lwn.net/Articles/608914/.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > ---
> [...]
> > +int iommu_register_device_fault_handler(struct device *dev,
> > + iommu_dev_fault_handler_t
> > handler) +{
> > + if (dev->iommu_fault_param)
> > + return -EBUSY;
> > + get_device(dev);
> > + dev->iommu_fault_param =
> > + kzalloc(sizeof(struct iommu_fault_param),
> > GFP_KERNEL);
> > + if (!dev->iommu_fault_param)
> > + return -ENOMEM;
> > + dev->iommu_fault_param->dev_fault_handler = handler;
>
> Since the handler is owned by a device driver, you also need to clean
> it up when switching the driver (native->VFIO and VFIO->native), in
> iommu_attach_device I suppose.
>
I was thinking the driver who registered fault handler shall be held
accountable to unregister. e.g. User must unbind driver (unregister
fault handler included) before assigning device to vfio-pci. Otherwise,
VFIO call to register handler would fail.
I am assuming VFIO needs to have a separate device fault handler of its
own.
Jacob
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH v2 10/16] iommu: introduce device fault report API
2017-10-05 23:03 ` Jacob Pan
(?)
(?)
@ 2017-10-10 13:40 ` Joerg Roedel
2017-10-11 17:21 ` Jacob Pan
-1 siblings, 1 reply; 109+ messages in thread
From: Joerg Roedel @ 2017-10-10 13:40 UTC (permalink / raw)
To: Jacob Pan
Cc: iommu, LKML, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker, Liu, Yi L, Lan Tianyu, Tian, Kevin,
Raj Ashok, Alex Williamson
On Thu, Oct 05, 2017 at 04:03:38PM -0700, Jacob Pan wrote:
> Traditionally, device specific faults are detected and handled within
> their own device drivers. When IOMMU is enabled, faults such as DMA
> related transactions are detected by IOMMU. There is no generic
> reporting mechanism to report faults back to the in-kernel device
> driver or the guest OS in case of assigned devices.
>
> Faults detected by IOMMU is based on the transaction's source ID which
> can be reported at per device basis, regardless of the device type is a
> PCI device or not.
>
> The fault types include recoverable (e.g. page request) and
> unrecoverable faults(e.g. access error). In most cases, faults can be
> handled by IOMMU drivers internally. The primary use cases are as
> follows:
> 1. page request fault originated from an SVM capable device that is
> assigned to guest via vIOMMU. In this case, the first level page tables
> are owned by the guest. Page request must be propagated to the guest to
> let guest OS fault in the pages then send page response. In this
> mechanism, the direct receiver of IOMMU fault notification is VFIO,
> which can relay notification events to QEMU or other user space
> software.
>
> 2. faults need more subtle handling by device drivers. Other than
> simply invoke reset function, there are needs to let device driver
> handle the fault with a smaller impact.
>
> This patchset is intended to create a generic fault report API such
> that it can scale as follows:
> - all IOMMU types
> - PCI and non-PCI devices
> - recoverable and unrecoverable faults
> - VFIO and other other in kernel users
> - DMA & IRQ remapping (TBD)
> The original idea was brought up by David Woodhouse and discussions
> summarized at https://lwn.net/Articles/608914/.
>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> ---
> drivers/iommu/iommu.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/iommu.h | 23 +++++++++++++++++++++
> 2 files changed, 78 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 5a14154..0b058e2 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -554,9 +554,15 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
>
> device->dev = dev;
>
> + dev->iommu_fault_param = kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
> + if (!dev->iommu_fault_param) {
> + ret = -ENOMEM;
> + goto err_free_device;
> + }
> +
This looks like some left-over from a previous version, because
allocation of that structure is done in iommu_register_device_fault_handler()
^ permalink raw reply [flat|nested] 109+ messages in thread* Re: [PATCH v2 10/16] iommu: introduce device fault report API
2017-10-10 13:40 ` Joerg Roedel
@ 2017-10-11 17:21 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-11 17:21 UTC (permalink / raw)
To: Joerg Roedel
Cc: iommu, LKML, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker, Liu, Yi L, Lan Tianyu, Tian, Kevin,
Raj Ashok, Alex Williamson, jacob.jun.pan
On Tue, 10 Oct 2017 15:40:54 +0200
Joerg Roedel <joro@8bytes.org> wrote:
> On Thu, Oct 05, 2017 at 04:03:38PM -0700, Jacob Pan wrote:
> > Traditionally, device specific faults are detected and handled
> > within their own device drivers. When IOMMU is enabled, faults such
> > as DMA related transactions are detected by IOMMU. There is no
> > generic reporting mechanism to report faults back to the in-kernel
> > device driver or the guest OS in case of assigned devices.
> >
> > Faults detected by IOMMU is based on the transaction's source ID
> > which can be reported at per device basis, regardless of the device
> > type is a PCI device or not.
> >
> > The fault types include recoverable (e.g. page request) and
> > unrecoverable faults(e.g. access error). In most cases, faults can
> > be handled by IOMMU drivers internally. The primary use cases are as
> > follows:
> > 1. page request fault originated from an SVM capable device that is
> > assigned to guest via vIOMMU. In this case, the first level page
> > tables are owned by the guest. Page request must be propagated to
> > the guest to let guest OS fault in the pages then send page
> > response. In this mechanism, the direct receiver of IOMMU fault
> > notification is VFIO, which can relay notification events to QEMU
> > or other user space software.
> >
> > 2. faults need more subtle handling by device drivers. Other than
> > simply invoke reset function, there are needs to let device driver
> > handle the fault with a smaller impact.
> >
> > This patchset is intended to create a generic fault report API such
> > that it can scale as follows:
> > - all IOMMU types
> > - PCI and non-PCI devices
> > - recoverable and unrecoverable faults
> > - VFIO and other other in kernel users
> > - DMA & IRQ remapping (TBD)
> > The original idea was brought up by David Woodhouse and discussions
> > summarized at https://lwn.net/Articles/608914/.
> >
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Signed-off-by: Ashok Raj <ashok.raj@intel.com>
> > ---
> > drivers/iommu/iommu.c | 56
> > ++++++++++++++++++++++++++++++++++++++++++++++++++-
> > include/linux/iommu.h | 23 +++++++++++++++++++++ 2 files changed,
> > 78 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > index 5a14154..0b058e2 100644
> > --- a/drivers/iommu/iommu.c
> > +++ b/drivers/iommu/iommu.c
> > @@ -554,9 +554,15 @@ int iommu_group_add_device(struct iommu_group
> > *group, struct device *dev)
> > device->dev = dev;
> >
> > + dev->iommu_fault_param = kzalloc(sizeof(struct
> > iommu_fault_param), GFP_KERNEL);
> > + if (!dev->iommu_fault_param) {
> > + ret = -ENOMEM;
> > + goto err_free_device;
> > + }
> > +
>
> This looks like some left-over from a previous version, because
> allocation of that structure is done in
> iommu_register_device_fault_handler()
>
you are right! I later changed it to do allocation at the
handler registration time.
^ permalink raw reply [flat|nested] 109+ messages in thread
* [PATCH v2 11/16] iommu/vt-d: use threaded irq for dmar_fault
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
Currently, dmar fault IRQ handler does nothing more than rate
limited printk, no critical hardware handling need to be done
in IRQ context.
Converting it to a threaded IRQ allows fault processing that
requires process context, e.g. finding the offending device based
on the source ID in the fault reasons.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
---
drivers/iommu/dmar.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 2fbff8b..ae33d61 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1748,7 +1748,8 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
return -EINVAL;
}
- ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
+ ret = request_threaded_irq(irq, NULL, dmar_fault,
+ IRQF_ONESHOT, iommu->name, iommu);
if (ret)
pr_err("Can't request irq\n");
return ret;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread* [PATCH v2 11/16] iommu/vt-d: use threaded irq for dmar_fault
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
Currently, dmar fault IRQ handler does nothing more than rate
limited printk, no critical hardware handling need to be done
in IRQ context.
Converting it to a threaded IRQ allows fault processing that
requires process context, e.g. finding the offending device based
on the source ID in the fault reasons.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
drivers/iommu/dmar.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 2fbff8b..ae33d61 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1748,7 +1748,8 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
return -EINVAL;
}
- ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
+ ret = request_threaded_irq(irq, NULL, dmar_fault,
+ IRQF_ONESHOT, iommu->name, iommu);
if (ret)
pr_err("Can't request irq\n");
return ret;
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread
* [PATCH v2 12/16] iommu/vt-d: report unrecoverable device faults
2017-10-05 23:03 ` Jacob Pan
@ 2017-10-05 23:03 ` Jacob Pan
-1 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
Joerg Roedel, David Woodhouse, Greg Kroah-Hartman, Rafael Wysocki,
Jean-Philippe Brucker
Cc: Lan Tianyu
Currently, when device DMA faults are detected by the IOMMU, the fault
reasons are printed but the driver of the offending device is not
involved in fault handling.
This patch uses per device fault reporting API to send fault event
data for further processing.
Offending device is identified by the source ID in VT-d fault reason
report registers.
Signed-off-by: Jacob Pan <jacob.jun.pan-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Ashok Raj <ashok.raj-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
drivers/iommu/dmar.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 94 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index ae33d61..43ea7ab 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1554,6 +1554,31 @@ static const char *irq_remap_fault_reasons[] =
"Blocked an interrupt request due to source-id verification failure",
};
+/* fault data and status */
+enum intel_iommu_fault_reason {
+ INTEL_IOMMU_FAULT_REASON_SW,
+ INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT,
+ INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT,
+ INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID,
+ INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH,
+ INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS,
+ INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS,
+ INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID,
+ INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID,
+ INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID,
+ INTEL_IOMMU_FAULT_REASON_NONE_ZERO_RTP,
+ INTEL_IOMMU_FAULT_REASON_NONE_ZERO_CTP,
+ INTEL_IOMMU_FAULT_REASON_NONE_ZERO_PTE,
+ NR_INTEL_IOMMU_FAULT_REASON,
+};
+
+/* fault reasons that are allowed to be reported outside IOMMU subsystem */
+#define INTEL_IOMMU_FAULT_REASON_ALLOWED \
+ ((1ULL << INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH) | \
+ (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS) | \
+ (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS))
+
+
static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
{
if (fault_reason >= 0x20 && (fault_reason - 0x20 <
@@ -1634,6 +1659,70 @@ void dmar_msi_read(int irq, struct msi_msg *msg)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
+static enum iommu_fault_reason to_iommu_fault_reason(u8 reason)
+{
+ if (reason >= NR_INTEL_IOMMU_FAULT_REASON) {
+ pr_warn("unknown DMAR fault reason %d\n", reason);
+ return IOMMU_FAULT_REASON_UNKNOWN;
+ }
+ switch (reason) {
+ case INTEL_IOMMU_FAULT_REASON_SW:
+ case INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT:
+ case INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT:
+ case INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID:
+ case INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH:
+ case INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID:
+ case INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID:
+ return IOMMU_FAULT_REASON_CTX;
+ case INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID:
+ case INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS:
+ case INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS:
+ return IOMMU_FAULT_REASON_ACCESS;
+ default:
+ return IOMMU_FAULT_REASON_UNKNOWN;
+ }
+}
+
+static void report_fault_to_device(struct intel_iommu *iommu, u64 addr, int type,
+ int fault_type, enum intel_iommu_fault_reason reason, u16 sid)
+{
+ struct iommu_fault_event event;
+ struct pci_dev *pdev;
+ u8 bus, devfn;
+
+ /* check if fault reason is worth reporting outside IOMMU */
+ if (!((1 << reason) & INTEL_IOMMU_FAULT_REASON_ALLOWED)) {
+ pr_debug("Fault reason %d not allowed to report to device\n",
+ reason);
+ return;
+ }
+
+ bus = PCI_BUS_NUM(sid);
+ devfn = PCI_DEVFN(PCI_SLOT(sid), PCI_FUNC(sid));
+ /*
+ * we need to check if the fault reporting is requested for the
+ * offending device.
+ */
+ pdev = pci_get_bus_and_slot(bus, devfn);
+ if (!pdev) {
+ pr_warn("No PCI device found for source ID %x\n", sid);
+ return;
+ }
+ /*
+ * unrecoverable fault is reported per IOMMU, notifier handler can
+ * resolve PCI device based on source ID.
+ */
+ event.reason = to_iommu_fault_reason(reason);
+ event.paddr = addr;
+ event.rid = sid;
+ event.type = IOMMU_FAULT_DMA_UNRECOV;
+ event.prot = type ? IOMMU_READ : IOMMU_WRITE;
+ dev_warn(&pdev->dev, "report device unrecoverable fault: %d, %x, %d\n",
+ event.reason, event.rid, event.type);
+ iommu_report_device_fault(&pdev->dev, &event);
+ pci_dev_put(pdev);
+}
+
static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
u8 fault_reason, u16 source_id, unsigned long long addr)
{
@@ -1647,11 +1736,15 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
source_id >> 8, PCI_SLOT(source_id & 0xFF),
PCI_FUNC(source_id & 0xFF), addr >> 48,
fault_reason, reason);
- else
+ else {
pr_err("[%s] Request device [%02x:%02x.%d] fault addr %llx [fault reason %02d] %s\n",
type ? "DMA Read" : "DMA Write",
source_id >> 8, PCI_SLOT(source_id & 0xFF),
PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
+ }
+ report_fault_to_device(iommu, addr, type, fault_type,
+ fault_reason, source_id);
+
return 0;
}
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread* [PATCH v2 12/16] iommu/vt-d: report unrecoverable device faults
@ 2017-10-05 23:03 ` Jacob Pan
0 siblings, 0 replies; 109+ messages in thread
From: Jacob Pan @ 2017-10-05 23:03 UTC (permalink / raw)
To: iommu, LKML, Joerg Roedel, David Woodhouse, Greg Kroah-Hartman,
Rafael Wysocki, Jean-Philippe Brucker
Cc: Liu, Yi L, Lan Tianyu, Tian, Kevin, Raj Ashok, Alex Williamson,
Jacob Pan
Currently, when device DMA faults are detected by the IOMMU, the fault
reasons are printed but the driver of the offending device is not
involved in fault handling.
This patch uses the per-device fault reporting API to send fault event
data for further processing.
The offending device is identified by the source ID in the VT-d fault
reason report registers.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
---
drivers/iommu/dmar.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 94 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index ae33d61..43ea7ab 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1554,6 +1554,31 @@ static const char *irq_remap_fault_reasons[] =
"Blocked an interrupt request due to source-id verification failure",
};
+/* fault data and status */
+enum intel_iommu_fault_reason {
+ INTEL_IOMMU_FAULT_REASON_SW,
+ INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT,
+ INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT,
+ INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID,
+ INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH,
+ INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS,
+ INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS,
+ INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID,
+ INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID,
+ INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID,
+ INTEL_IOMMU_FAULT_REASON_NONE_ZERO_RTP,
+ INTEL_IOMMU_FAULT_REASON_NONE_ZERO_CTP,
+ INTEL_IOMMU_FAULT_REASON_NONE_ZERO_PTE,
+ NR_INTEL_IOMMU_FAULT_REASON,
+};
+
+/* fault reasons that are allowed to be reported outside IOMMU subsystem */
+#define INTEL_IOMMU_FAULT_REASON_ALLOWED \
+ ((1ULL << INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH) | \
+ (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS) | \
+ (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS))
+
+
static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
{
if (fault_reason >= 0x20 && (fault_reason - 0x20 <
@@ -1634,6 +1659,70 @@ void dmar_msi_read(int irq, struct msi_msg *msg)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
+static enum iommu_fault_reason to_iommu_fault_reason(u8 reason)
+{
+ if (reason >= NR_INTEL_IOMMU_FAULT_REASON) {
+ pr_warn("unknown DMAR fault reason %d\n", reason);
+ return IOMMU_FAULT_REASON_UNKNOWN;
+ }
+ switch (reason) {
+ case INTEL_IOMMU_FAULT_REASON_SW:
+ case INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT:
+ case INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT:
+ case INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID:
+ case INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH:
+ case INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID:
+ case INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID:
+ return IOMMU_FAULT_REASON_CTX;
+ case INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID:
+ case INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS:
+ case INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS:
+ return IOMMU_FAULT_REASON_ACCESS;
+ default:
+ return IOMMU_FAULT_REASON_UNKNOWN;
+ }
+}
+
+static void report_fault_to_device(struct intel_iommu *iommu, u64 addr, int type,
+ int fault_type, enum intel_iommu_fault_reason reason, u16 sid)
+{
+ struct iommu_fault_event event;
+ struct pci_dev *pdev;
+ u8 bus, devfn;
+
+ /* check if fault reason is worth reporting outside IOMMU */
+ if (!((1 << reason) & INTEL_IOMMU_FAULT_REASON_ALLOWED)) {
+ pr_debug("Fault reason %d not allowed to report to device\n",
+ reason);
+ return;
+ }
+
+ bus = PCI_BUS_NUM(sid);
+ devfn = PCI_DEVFN(PCI_SLOT(sid), PCI_FUNC(sid));
+ /*
+ * we need to check if the fault reporting is requested for the
+ * offending device.
+ */
+ pdev = pci_get_bus_and_slot(bus, devfn);
+ if (!pdev) {
+ pr_warn("No PCI device found for source ID %x\n", sid);
+ return;
+ }
+ /*
+ * unrecoverable fault is reported per IOMMU, notifier handler can
+ * resolve PCI device based on source ID.
+ */
+ event.reason = to_iommu_fault_reason(reason);
+ event.paddr = addr;
+ event.rid = sid;
+ event.type = IOMMU_FAULT_DMA_UNRECOV;
+ event.prot = type ? IOMMU_READ : IOMMU_WRITE;
+ dev_warn(&pdev->dev, "report device unrecoverable fault: %d, %x, %d\n",
+ event.reason, event.rid, event.type);
+ iommu_report_device_fault(&pdev->dev, &event);
+ pci_dev_put(pdev);
+}
+
static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
u8 fault_reason, u16 source_id, unsigned long long addr)
{
@@ -1647,11 +1736,15 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
source_id >> 8, PCI_SLOT(source_id & 0xFF),
PCI_FUNC(source_id & 0xFF), addr >> 48,
fault_reason, reason);
- else
+ else {
pr_err("[%s] Request device [%02x:%02x.%d] fault addr %llx [fault reason %02d] %s\n",
type ? "DMA Read" : "DMA Write",
source_id >> 8, PCI_SLOT(source_id & 0xFF),
PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
+ }
+ report_fault_to_device(iommu, addr, type, fault_type,
+ fault_reason, source_id);
+
return 0;
}
--
2.7.4
^ permalink raw reply related [flat|nested] 109+ messages in thread