All of lore.kernel.org
 help / color / mirror / Atom feed
From: Lu Baolu <baolu.lu@linux.intel.com>
To: Joerg Roedel <joro@8bytes.org>,
	David Woodhouse <dwmw2@infradead.org>,
	Alex Williamson <alex.williamson@redhat.com>
Cc: kevin.tian@intel.com, Yi Sun <yi.y.sun@linux.intel.com>,
	ashok.raj@intel.com, kvm@vger.kernel.org,
	sanjay.k.kumar@intel.com, iommu@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, yi.y.sun@intel.com
Subject: [RFC PATCH 2/4] iommu/vt-d: Add first level page table interfaces
Date: Mon, 23 Sep 2019 20:24:52 +0800	[thread overview]
Message-ID: <20190923122454.9888-3-baolu.lu@linux.intel.com> (raw)
In-Reply-To: <20190923122454.9888-1-baolu.lu@linux.intel.com>

This adds functions to manipulate first level page tables
which could be used by a scalale mode capable IOMMU unit.

intel_mmmap_range(domain, addr, end, phys_addr, prot)
 - Map an iova range of [addr, end) to the physical memory
   started at @phys_addr with the @prot permissions.

intel_mmunmap_range(domain, addr, end)
 - Tear down the map of an iova range [addr, end). A page
   list will be returned which will be freed after iotlb
   flushing.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Liu Yi L <yi.l.liu@intel.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/Makefile             |   2 +-
 drivers/iommu/intel-pgtable.c      | 342 +++++++++++++++++++++++++++++
 include/linux/intel-iommu.h        |  24 +-
 include/trace/events/intel_iommu.h |  60 +++++
 4 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 drivers/iommu/intel-pgtable.c

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 4f405f926e73..dc550e14cc58 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o
 obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o
+obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o intel-pgtable.o
 obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o
 obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
 obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/intel-pgtable.c b/drivers/iommu/intel-pgtable.c
new file mode 100644
index 000000000000..8e95978cd381
--- /dev/null
+++ b/drivers/iommu/intel-pgtable.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pgtable.c - Intel IOMMU page table manipulation library
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#define pr_fmt(fmt)     "DMAR: " fmt
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/intel-iommu.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <trace/events/intel_iommu.h>
+
+#ifdef CONFIG_X86
+/*
+ * mmmap: Map a range of IO virtual address to physical addresses.
+ */
+#define pgtable_populate(domain, nm)					\
+do {									\
+	void *__new = alloc_pgtable_page(domain->nid);			\
+	if (!__new)							\
+		return -ENOMEM;						\
+	smp_wmb();							\
+	spin_lock(&(domain)->page_table_lock);				\
+	if (nm ## _present(*nm)) {					\
+		free_pgtable_page(__new);				\
+	} else {							\
+		set_##nm(nm, __##nm(__pa(__new) | _PAGE_TABLE));	\
+		domain_flush_cache(domain, nm, sizeof(nm##_t));		\
+	}								\
+	spin_unlock(&(domain)->page_table_lock);			\
+} while(0);
+
+static int
+mmmap_pte_range(struct dmar_domain *domain, pmd_t *pmd, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	pte_t *pte, *first_pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	if (unlikely(pmd_none(*pmd)))
+		pgtable_populate(domain, pmd);
+
+	first_pte = pte = pte_offset_kernel(pmd, addr);
+
+	do {
+		set_pte(pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	return 0;
+}
+
+static int
+mmmap_pmd_range(struct dmar_domain *domain, pud_t *pud, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pmd_t *pmd;
+
+	if (unlikely(pud_none(*pud)))
+		pgtable_populate(domain, pud);
+	pmd = pmd_offset(pud, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (mmmap_pte_range(domain, pmd, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+mmmap_pud_range(struct dmar_domain *domain, p4d_t *p4d, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pud_t *pud;
+
+	if (unlikely(p4d_none(*p4d)))
+		pgtable_populate(domain, p4d);
+
+	pud = pud_offset(p4d, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pud_addr_end(addr, end);
+		if (mmmap_pmd_range(domain, pud, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+mmmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	p4d_t *p4d;
+
+	if (cpu_feature_enabled(X86_FEATURE_LA57) && unlikely(pgd_none(*pgd)))
+		pgtable_populate(domain, pgd);
+
+	p4d = p4d_offset(pgd, addr);
+
+	phys_addr -= addr;
+	do {
+		next = p4d_addr_end(addr, end);
+		if (mmmap_pud_range(domain, p4d, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (p4d++, addr = next, addr != end);
+
+	return 0;
+}
+
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		      unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	unsigned long next;
+	pgprot_t prot;
+	pgd_t *pgd;
+
+	trace_domain_mm_map(domain, addr, end, phys_addr);
+
+	/*
+	 * There is no PAGE_KERNEL_WO for a pte entry, so let's use RW
+	 * for a pte that requires write operation.
+	 */
+	prot = dma_prot & DMA_PTE_WRITE ? PAGE_KERNEL : PAGE_KERNEL_RO;
+	BUG_ON(addr >= end);
+
+	phys_addr -= addr;
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (mmmap_p4d_range(domain, pgd, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pgd++, addr = next, addr != end);
+
+	return 0;
+}
+
+/*
+ * mmunmap: Unmap an existing mapping between a range of IO vitual address
+ *          and physical addresses.
+ */
+static struct page *
+mmunmap_pte_range(struct dmar_domain *domain, pmd_t *pmd,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	unsigned long start;
+	pte_t *pte, *first_pte;
+
+	start = addr;
+	pte = pte_offset_kernel(pmd, addr);
+	first_pte = pte;
+	do {
+		set_pte(pte, __pte(0));
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pte_page;
+
+		pte = (pte_t *)pmd_page_vaddr(*pmd);
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			if (!pte || !pte_none(pte[i]))
+				goto pte_out;
+
+		pte_page = pmd_page(*pmd);
+		pte_page->freelist = freelist;
+		freelist = pte_page;
+		pmd_clear(pmd);
+		domain_flush_cache(domain, pmd, sizeof(pmd_t));
+	}
+
+pte_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_pmd_range(struct dmar_domain *domain, pud_t *pud,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	pmd_t *pmd;
+	unsigned long start, next;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		freelist = mmunmap_pte_range(domain, pmd, addr, next,
+					     freelist, reclaim);
+	} while (pmd++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pmd_page;
+
+		pmd = (pmd_t *)pud_page_vaddr(*pud);
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			if (!pmd || !pmd_none(pmd[i]))
+				goto pmd_out;
+
+		pmd_page = pud_page(*pud);
+		pmd_page->freelist = freelist;
+		freelist = pmd_page;
+		pud_clear(pud);
+		domain_flush_cache(domain, pud, sizeof(pud_t));
+	}
+
+pmd_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_pud_range(struct dmar_domain *domain, p4d_t *p4d,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	pud_t *pud;
+	unsigned long start, next;
+
+	start = addr;
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		freelist = mmunmap_pmd_range(domain, pud, addr, next,
+					     freelist, reclaim);
+	} while (pud++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pud_page;
+
+		pud = (pud_t *)p4d_page_vaddr(*p4d);
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			if (!pud || !pud_none(pud[i]))
+				goto pud_out;
+
+		pud_page = p4d_page(*p4d);
+		pud_page->freelist = freelist;
+		freelist = pud_page;
+		p4d_clear(p4d);
+		domain_flush_cache(domain, p4d, sizeof(p4d_t));
+	}
+
+pud_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	p4d_t *p4d;
+	unsigned long start, next;
+
+	start = addr;
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		freelist = mmunmap_pud_range(domain, p4d, addr, next,
+					     freelist, reclaim);
+	} while (p4d++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (cpu_feature_enabled(X86_FEATURE_LA57) && reclaim) {
+		struct page *p4d_page;
+		int i;
+
+		p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+		for (i = 0; i < PTRS_PER_P4D; i++)
+			if (!p4d || !p4d_none(p4d[i]))
+				goto p4d_out;
+
+		p4d_page = pgd_page(*pgd);
+		p4d_page->freelist = freelist;
+		freelist = p4d_page;
+		pgd_clear(pgd);
+		domain_flush_cache(domain, pgd, sizeof(pgd_t));
+	}
+
+p4d_out:
+	return freelist;
+}
+
+struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+		    unsigned long addr, unsigned long end)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	struct page *freelist = NULL;
+
+	trace_domain_mm_unmap(domain, addr, end);
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		freelist = mmunmap_p4d_range(domain, pgd, addr, next,
+					     freelist, !addr);
+	} while (pgd++, addr = next, addr != end);
+
+	return freelist;
+}
+#endif /* CONFIG_X86 */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 3ee694d4f361..044a91fa5431 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -489,7 +489,8 @@ struct dmar_domain {
 	struct list_head auxd;		/* link to device's auxiliary list */
 	struct iova_domain iovad;	/* iova's that belong to this domain */
 
-	struct dma_pte	*pgd;		/* virtual address */
+	void		*pgd;		/* virtual address */
+	spinlock_t page_table_lock;	/* Protects page tables */
 	int		gaw;		/* max guest address width */
 
 	/* adjusted guest address width, 0 is level 2 30-bit */
@@ -662,6 +663,27 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
 
+#ifdef CONFIG_X86
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		      unsigned long end, phys_addr_t phys_addr, int dma_prot);
+struct page *intel_mmunmap_range(struct dmar_domain *domain,
+				 unsigned long addr, unsigned long end);
+#else
+static inline int
+intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		  unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	return -ENODEV;
+}
+
+static inline struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+		    unsigned long addr, unsigned long end)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_INTEL_IOMMU_SVM
 int intel_svm_init(struct intel_iommu *iommu);
 extern int intel_svm_enable_prq(struct intel_iommu *iommu);
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
index 54e61d456cdf..e8c95290fd13 100644
--- a/include/trace/events/intel_iommu.h
+++ b/include/trace/events/intel_iommu.h
@@ -99,6 +99,66 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single,
 	TP_ARGS(dev, dev_addr, size)
 );
 
+DECLARE_EVENT_CLASS(domain_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+		__field(phys_addr_t, phys_addr)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+		__entry->phys_addr = phys_addr;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx phys_addr=0x%llx",
+		  __entry->domain, __entry->addr, __entry->end,
+		  (unsigned long long)__entry->phys_addr)
+);
+
+DEFINE_EVENT(domain_map, domain_mm_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr)
+);
+
+DECLARE_EVENT_CLASS(domain_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx",
+		  __entry->domain, __entry->addr, __entry->end)
+);
+
+DEFINE_EVENT(domain_unmap, domain_mm_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end)
+);
 #endif /* _TRACE_INTEL_IOMMU_H */
 
 /* This part must be outside protection */
-- 
2.17.1

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

WARNING: multiple messages have this Message-ID (diff)
From: Lu Baolu <baolu.lu@linux.intel.com>
To: Joerg Roedel <joro@8bytes.org>,
	David Woodhouse <dwmw2@infradead.org>,
	Alex Williamson <alex.williamson@redhat.com>
Cc: ashok.raj@intel.com, sanjay.k.kumar@intel.com,
	jacob.jun.pan@linux.intel.com, kevin.tian@intel.com,
	yi.l.liu@intel.com, yi.y.sun@intel.com,
	iommu@lists.linux-foundation.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, Lu Baolu <baolu.lu@linux.intel.com>,
	Yi Sun <yi.y.sun@linux.intel.com>
Subject: [RFC PATCH 2/4] iommu/vt-d: Add first level page table interfaces
Date: Mon, 23 Sep 2019 20:24:52 +0800	[thread overview]
Message-ID: <20190923122454.9888-3-baolu.lu@linux.intel.com> (raw)
In-Reply-To: <20190923122454.9888-1-baolu.lu@linux.intel.com>

This adds functions to manipulate first level page tables
which could be used by a scalale mode capable IOMMU unit.

intel_mmmap_range(domain, addr, end, phys_addr, prot)
 - Map an iova range of [addr, end) to the physical memory
   started at @phys_addr with the @prot permissions.

intel_mmunmap_range(domain, addr, end)
 - Tear down the map of an iova range [addr, end). A page
   list will be returned which will be freed after iotlb
   flushing.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Liu Yi L <yi.l.liu@intel.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/Makefile             |   2 +-
 drivers/iommu/intel-pgtable.c      | 342 +++++++++++++++++++++++++++++
 include/linux/intel-iommu.h        |  24 +-
 include/trace/events/intel_iommu.h |  60 +++++
 4 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 drivers/iommu/intel-pgtable.c

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 4f405f926e73..dc550e14cc58 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o
 obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o
+obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o intel-pgtable.o
 obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o
 obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
 obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/intel-pgtable.c b/drivers/iommu/intel-pgtable.c
new file mode 100644
index 000000000000..8e95978cd381
--- /dev/null
+++ b/drivers/iommu/intel-pgtable.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pgtable.c - Intel IOMMU page table manipulation library
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#define pr_fmt(fmt)     "DMAR: " fmt
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/intel-iommu.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <trace/events/intel_iommu.h>
+
+#ifdef CONFIG_X86
+/*
+ * mmmap: Map a range of IO virtual address to physical addresses.
+ */
+#define pgtable_populate(domain, nm)					\
+do {									\
+	void *__new = alloc_pgtable_page(domain->nid);			\
+	if (!__new)							\
+		return -ENOMEM;						\
+	smp_wmb();							\
+	spin_lock(&(domain)->page_table_lock);				\
+	if (nm ## _present(*nm)) {					\
+		free_pgtable_page(__new);				\
+	} else {							\
+		set_##nm(nm, __##nm(__pa(__new) | _PAGE_TABLE));	\
+		domain_flush_cache(domain, nm, sizeof(nm##_t));		\
+	}								\
+	spin_unlock(&(domain)->page_table_lock);			\
+} while(0);
+
+static int
+mmmap_pte_range(struct dmar_domain *domain, pmd_t *pmd, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	pte_t *pte, *first_pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	if (unlikely(pmd_none(*pmd)))
+		pgtable_populate(domain, pmd);
+
+	first_pte = pte = pte_offset_kernel(pmd, addr);
+
+	do {
+		set_pte(pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	return 0;
+}
+
+static int
+mmmap_pmd_range(struct dmar_domain *domain, pud_t *pud, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pmd_t *pmd;
+
+	if (unlikely(pud_none(*pud)))
+		pgtable_populate(domain, pud);
+	pmd = pmd_offset(pud, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (mmmap_pte_range(domain, pmd, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+mmmap_pud_range(struct dmar_domain *domain, p4d_t *p4d, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pud_t *pud;
+
+	if (unlikely(p4d_none(*p4d)))
+		pgtable_populate(domain, p4d);
+
+	pud = pud_offset(p4d, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pud_addr_end(addr, end);
+		if (mmmap_pmd_range(domain, pud, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+mmmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	p4d_t *p4d;
+
+	if (cpu_feature_enabled(X86_FEATURE_LA57) && unlikely(pgd_none(*pgd)))
+		pgtable_populate(domain, pgd);
+
+	p4d = p4d_offset(pgd, addr);
+
+	phys_addr -= addr;
+	do {
+		next = p4d_addr_end(addr, end);
+		if (mmmap_pud_range(domain, p4d, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (p4d++, addr = next, addr != end);
+
+	return 0;
+}
+
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		      unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	unsigned long next;
+	pgprot_t prot;
+	pgd_t *pgd;
+
+	trace_domain_mm_map(domain, addr, end, phys_addr);
+
+	/*
+	 * There is no PAGE_KERNEL_WO for a pte entry, so let's use RW
+	 * for a pte that requires write operation.
+	 */
+	prot = dma_prot & DMA_PTE_WRITE ? PAGE_KERNEL : PAGE_KERNEL_RO;
+	BUG_ON(addr >= end);
+
+	phys_addr -= addr;
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (mmmap_p4d_range(domain, pgd, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pgd++, addr = next, addr != end);
+
+	return 0;
+}
+
+/*
+ * mmunmap: Unmap an existing mapping between a range of IO vitual address
+ *          and physical addresses.
+ */
+static struct page *
+mmunmap_pte_range(struct dmar_domain *domain, pmd_t *pmd,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	unsigned long start;
+	pte_t *pte, *first_pte;
+
+	start = addr;
+	pte = pte_offset_kernel(pmd, addr);
+	first_pte = pte;
+	do {
+		set_pte(pte, __pte(0));
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pte_page;
+
+		pte = (pte_t *)pmd_page_vaddr(*pmd);
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			if (!pte || !pte_none(pte[i]))
+				goto pte_out;
+
+		pte_page = pmd_page(*pmd);
+		pte_page->freelist = freelist;
+		freelist = pte_page;
+		pmd_clear(pmd);
+		domain_flush_cache(domain, pmd, sizeof(pmd_t));
+	}
+
+pte_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_pmd_range(struct dmar_domain *domain, pud_t *pud,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	pmd_t *pmd;
+	unsigned long start, next;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		freelist = mmunmap_pte_range(domain, pmd, addr, next,
+					     freelist, reclaim);
+	} while (pmd++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pmd_page;
+
+		pmd = (pmd_t *)pud_page_vaddr(*pud);
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			if (!pmd || !pmd_none(pmd[i]))
+				goto pmd_out;
+
+		pmd_page = pud_page(*pud);
+		pmd_page->freelist = freelist;
+		freelist = pmd_page;
+		pud_clear(pud);
+		domain_flush_cache(domain, pud, sizeof(pud_t));
+	}
+
+pmd_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_pud_range(struct dmar_domain *domain, p4d_t *p4d,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	pud_t *pud;
+	unsigned long start, next;
+
+	start = addr;
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		freelist = mmunmap_pmd_range(domain, pud, addr, next,
+					     freelist, reclaim);
+	} while (pud++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pud_page;
+
+		pud = (pud_t *)p4d_page_vaddr(*p4d);
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			if (!pud || !pud_none(pud[i]))
+				goto pud_out;
+
+		pud_page = p4d_page(*p4d);
+		pud_page->freelist = freelist;
+		freelist = pud_page;
+		p4d_clear(p4d);
+		domain_flush_cache(domain, p4d, sizeof(p4d_t));
+	}
+
+pud_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	p4d_t *p4d;
+	unsigned long start, next;
+
+	start = addr;
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		freelist = mmunmap_pud_range(domain, p4d, addr, next,
+					     freelist, reclaim);
+	} while (p4d++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (cpu_feature_enabled(X86_FEATURE_LA57) && reclaim) {
+		struct page *p4d_page;
+		int i;
+
+		p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+		for (i = 0; i < PTRS_PER_P4D; i++)
+			if (!p4d || !p4d_none(p4d[i]))
+				goto p4d_out;
+
+		p4d_page = pgd_page(*pgd);
+		p4d_page->freelist = freelist;
+		freelist = p4d_page;
+		pgd_clear(pgd);
+		domain_flush_cache(domain, pgd, sizeof(pgd_t));
+	}
+
+p4d_out:
+	return freelist;
+}
+
+struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+		    unsigned long addr, unsigned long end)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	struct page *freelist = NULL;
+
+	trace_domain_mm_unmap(domain, addr, end);
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		freelist = mmunmap_p4d_range(domain, pgd, addr, next,
+					     freelist, !addr);
+	} while (pgd++, addr = next, addr != end);
+
+	return freelist;
+}
+#endif /* CONFIG_X86 */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 3ee694d4f361..044a91fa5431 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -489,7 +489,8 @@ struct dmar_domain {
 	struct list_head auxd;		/* link to device's auxiliary list */
 	struct iova_domain iovad;	/* iova's that belong to this domain */
 
-	struct dma_pte	*pgd;		/* virtual address */
+	void		*pgd;		/* virtual address */
+	spinlock_t page_table_lock;	/* Protects page tables */
 	int		gaw;		/* max guest address width */
 
 	/* adjusted guest address width, 0 is level 2 30-bit */
@@ -662,6 +663,27 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
 
+#ifdef CONFIG_X86
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		      unsigned long end, phys_addr_t phys_addr, int dma_prot);
+struct page *intel_mmunmap_range(struct dmar_domain *domain,
+				 unsigned long addr, unsigned long end);
+#else
+static inline int
+intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		  unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	return -ENODEV;
+}
+
+static inline struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+		    unsigned long addr, unsigned long end)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_INTEL_IOMMU_SVM
 int intel_svm_init(struct intel_iommu *iommu);
 extern int intel_svm_enable_prq(struct intel_iommu *iommu);
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
index 54e61d456cdf..e8c95290fd13 100644
--- a/include/trace/events/intel_iommu.h
+++ b/include/trace/events/intel_iommu.h
@@ -99,6 +99,66 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single,
 	TP_ARGS(dev, dev_addr, size)
 );
 
+DECLARE_EVENT_CLASS(domain_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+		__field(phys_addr_t, phys_addr)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+		__entry->phys_addr = phys_addr;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx phys_addr=0x%llx",
+		  __entry->domain, __entry->addr, __entry->end,
+		  (unsigned long long)__entry->phys_addr)
+);
+
+DEFINE_EVENT(domain_map, domain_mm_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr)
+);
+
+DECLARE_EVENT_CLASS(domain_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx",
+		  __entry->domain, __entry->addr, __entry->end)
+);
+
+DEFINE_EVENT(domain_unmap, domain_mm_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end)
+);
 #endif /* _TRACE_INTEL_IOMMU_H */
 
 /* This part must be outside protection */
-- 
2.17.1


  parent reply	other threads:[~2019-09-23 12:27 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-23 12:24 [RFC PATCH 0/4] Use 1st-level for DMA remapping in guest Lu Baolu
2019-09-23 12:24 ` Lu Baolu
2019-09-23 12:24 ` [RFC PATCH 1/4] iommu/vt-d: Move domain_flush_cache helper into header Lu Baolu
2019-09-23 12:24   ` Lu Baolu
2019-09-23 12:24 ` Lu Baolu [this message]
2019-09-23 12:24   ` [RFC PATCH 2/4] iommu/vt-d: Add first level page table interfaces Lu Baolu
2019-09-23 20:31   ` Raj, Ashok
2019-09-23 20:31     ` Raj, Ashok
2019-09-24  1:38     ` Lu Baolu
2019-09-24  1:38       ` Lu Baolu
2019-09-25  4:30       ` Peter Xu
2019-09-25  4:30         ` Peter Xu
2019-09-25  4:38         ` Tian, Kevin
2019-09-25  4:38           ` Tian, Kevin
2019-09-25  5:24           ` Peter Xu
2019-09-25  5:24             ` Peter Xu
2019-09-25  6:52             ` Lu Baolu
2019-09-25  6:52               ` Lu Baolu
2019-09-25  7:32               ` Tian, Kevin
2019-09-25  7:32                 ` Tian, Kevin
2019-09-25  8:35                 ` Peter Xu
2019-09-25  8:35                   ` Peter Xu
2019-09-26  1:42                 ` Lu Baolu
2019-09-26  1:42                   ` Lu Baolu
2019-09-25  5:21   ` Peter Xu
2019-09-25  5:21     ` Peter Xu
2019-09-26  2:35     ` Lu Baolu
2019-09-26  2:35       ` Lu Baolu
2019-09-26  3:49       ` Peter Xu
2019-09-26  3:49         ` Peter Xu
2019-09-27  2:27         ` Lu Baolu
2019-09-27  2:27           ` Lu Baolu
2019-09-27  5:34           ` Peter Xu
2019-09-27  5:34             ` Peter Xu
2019-09-28  8:23             ` Lu Baolu
2019-09-28  8:23               ` Lu Baolu
2019-09-29  5:25               ` Peter Xu
2019-09-29  5:25                 ` Peter Xu
2019-10-08  2:20                 ` Lu Baolu
2019-10-08  2:20                   ` Lu Baolu
2019-09-23 12:24 ` [RFC PATCH 3/4] iommu/vt-d: Map/unmap domain with mmmap/mmunmap Lu Baolu
2019-09-23 12:24   ` Lu Baolu
2019-09-25  5:00   ` Tian, Kevin
2019-09-25  5:00     ` Tian, Kevin
2019-09-25  7:06     ` Lu Baolu
2019-09-25  7:06       ` Lu Baolu
2019-09-23 12:24 ` [RFC PATCH 4/4] iommu/vt-d: Identify domains using first level page table Lu Baolu
2019-09-23 12:24   ` Lu Baolu
2019-09-25  6:50   ` Peter Xu
2019-09-25  6:50     ` Peter Xu
2019-09-25  7:35     ` Tian, Kevin
2019-09-25  7:35       ` Tian, Kevin
2019-09-23 19:27 ` [RFC PATCH 0/4] Use 1st-level for DMA remapping in guest Jacob Pan
2019-09-23 19:27   ` Jacob Pan
2019-09-23 20:25   ` Raj, Ashok
2019-09-23 20:25     ` Raj, Ashok
2019-09-24  4:40     ` Lu Baolu
2019-09-24  4:40       ` Lu Baolu
2019-09-24  7:00     ` Tian, Kevin
2019-09-24  7:00       ` Tian, Kevin
2019-09-25  2:48       ` Lu Baolu
2019-09-25  2:48         ` Lu Baolu
2019-09-25  6:56         ` Peter Xu
2019-09-25  6:56           ` Peter Xu
2019-09-25  7:21           ` Tian, Kevin
2019-09-25  7:21             ` Tian, Kevin
2019-09-25  7:45             ` Peter Xu
2019-09-25  7:45               ` Peter Xu
2019-09-25  8:02               ` Tian, Kevin
2019-09-25  8:02                 ` Tian, Kevin
2019-09-25  8:52                 ` Peter Xu
2019-09-25  8:52                   ` Peter Xu
2019-09-26  1:37                   ` Lu Baolu
2019-09-26  1:37                     ` Lu Baolu
2019-09-24  4:27   ` Lu Baolu
2019-09-24  4:27     ` Lu Baolu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190923122454.9888-3-baolu.lu@linux.intel.com \
    --to=baolu.lu@linux.intel.com \
    --cc=alex.williamson@redhat.com \
    --cc=ashok.raj@intel.com \
    --cc=dwmw2@infradead.org \
    --cc=iommu@lists.linux-foundation.org \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sanjay.k.kumar@intel.com \
    --cc=yi.y.sun@intel.com \
    --cc=yi.y.sun@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.