* [PATCH v2 01/10] iommu/pages: Add support for an incoherent IOMMU page walker
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 02/10] iommupt: Add basic support for SW bits in the page table Jason Gunthorpe
` (8 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
Some IOMMU HW cannot snoop the CPU cache when it walks the IO page tables.
The CPU is required to flush the cache to make changes visible to the HW.
Provide some helpers from iommu-pages to manage this. The helpers combine
both the ARM and x86 (used in Intel VT-D) versions of the cache flushing
under a single API.
The ARM version uses the DMA API to perform the cache flushing, on the
assumption that the iommu is using a direct mapping and is already marked
incoherent. The helpers will do the DMA API calls to set things up and
keep track of DMA-mapped folios using a bit in the ioptdesc so that
unmapping on error paths is cleaner.
The Intel version just calls the arch cache flush directly and has no
cleanup to do prior to destruction.
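As a usage illustration (not part of this patch), a page table implementation
would pair the helpers roughly as in the sketch below. It assumes the existing
iommu-pages list API (IOMMU_PAGES_LIST_INIT(), iommu_pages_list_add(),
iommu_put_pages_list()) and a 4k table size; error handling is trimmed to the
essentials:

static int example_incoherent_table(struct device *iommu_dev, int nid)
{
        struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
        u64 *table;
        int ret;

        table = iommu_alloc_pages_node_sz(nid, GFP_KERNEL, SZ_4K);
        if (!table)
                return -ENOMEM;

        /* ARM style maps via the DMA API, Intel style is a NOP until a flush */
        ret = iommu_pages_start_incoherent(table, iommu_dev);
        if (ret) {
                iommu_free_pages(table);
                return ret;
        }

        /* ... write PTEs into table ... */

        /* Push the CPU writes out so a non-snooping walker can see them */
        iommu_pages_flush_incoherent(iommu_dev, table, 0, SZ_4K);

        /* Teardown: undo any DMA mapping before freeing the memory */
        iommu_pages_list_add(&free_list, table);
        iommu_pages_stop_incoherent_list(&free_list, iommu_dev);
        iommu_put_pages_list(&free_list);
        return 0;
}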
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/iommu-pages.c | 117 ++++++++++++++++++++++++++++++++++++
drivers/iommu/iommu-pages.h | 45 +++++++++++++-
2 files changed, 160 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c
index 238c09e5166b4d..5dc8cdf71e2ade 100644
--- a/drivers/iommu/iommu-pages.c
+++ b/drivers/iommu/iommu-pages.c
@@ -4,6 +4,7 @@
* Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#include "iommu-pages.h"
+#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>
@@ -22,6 +23,11 @@ IOPTDESC_MATCH(memcg_data, memcg_data);
#undef IOPTDESC_MATCH
static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
+static inline size_t ioptdesc_mem_size(struct ioptdesc *desc)
+{
+ return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT);
+}
+
/**
* iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from
* specific NUMA node
@@ -36,6 +42,7 @@ static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
*/
void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
{
+ struct ioptdesc *iopt;
unsigned long pgcnt;
struct folio *folio;
unsigned int order;
@@ -60,6 +67,9 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
if (unlikely(!folio))
return NULL;
+ iopt = folio_ioptdesc(folio);
+ iopt->incoherent = false;
+
/*
* All page allocations that should be reported to as "iommu-pagetables"
* to userspace must use one of the functions below. This includes
@@ -82,6 +92,9 @@ static void __iommu_free_desc(struct ioptdesc *iopt)
struct folio *folio = ioptdesc_folio(iopt);
const unsigned long pgcnt = 1UL << folio_order(folio);
+ if (IOMMU_PAGES_USE_DMA_API)
+ WARN_ON_ONCE(iopt->incoherent);
+
mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
folio_put(folio);
@@ -117,3 +130,107 @@ void iommu_put_pages_list(struct iommu_pages_list *list)
__iommu_free_desc(iopt);
}
EXPORT_SYMBOL_GPL(iommu_put_pages_list);
+
+/**
+ * iommu_pages_start_incoherent - Set up the page for cache-incoherent operation
+ * @virt: The page to setup
+ * @dma_dev: The iommu device
+ *
+ * For incoherent memory this will use the DMA API to manage the cache flushing
+ * on some arches. This is a lot of complexity compared to just calling
+ * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers
+ * have been doing. The DMA API requires keeping track of the DMA map and
+ * freeing it when required. This keeps track of the dma map inside the ioptdesc
+ * so that error paths are simple for the caller.
+ */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+ dma_addr_t dma;
+
+ if (WARN_ON(iopt->incoherent))
+ return -EINVAL;
+
+ if (!IOMMU_PAGES_USE_DMA_API) {
+ iommu_pages_flush_incoherent(dma_dev, virt, 0,
+ ioptdesc_mem_size(iopt));
+ } else {
+ dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, dma))
+ return -EINVAL;
+
+ /*
+ * The DMA API is not allowed to do anything other than DMA
+ * direct. It would be nice to also check
+ * dev_is_dma_coherent(dma_dev).
+ */
+ if (WARN_ON(dma != virt_to_phys(virt))) {
+ dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ iopt->incoherent = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent);
+
+/**
+ * iommu_pages_start_incoherent_list - Set up a list of pages for incoherent operation
+ * @list: The list of pages to setup
+ * @dma_dev: The iommu device
+ *
+ * Perform iommu_pages_start_incoherent() across all of list.
+ *
+ * If this fails the caller must call iommu_pages_stop_incoherent_list().
+ */
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+ int ret;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ if (WARN_ON(cur->incoherent))
+ continue;
+
+ ret = iommu_pages_start_incoherent(
+ folio_address(ioptdesc_folio(cur)), dma_dev);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list);
+
+/**
+ * iommu_pages_stop_incoherent_list - Undo iommu_pages_start_incoherent() for a list
+ * @list: The list of pages to release
+ * @dma_dev: The iommu device
+ *
+ * Revert iommu_pages_start_incoherent() across all of the list. Pages for
+ * which iommu_pages_start_incoherent() was not called, or for which it did
+ * not succeed, are ignored.
+ */
+#if IOMMU_PAGES_USE_DMA_API
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+
+ if (IS_ENABLED(CONFIG_X86))
+ return;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ struct folio *folio = ioptdesc_folio(cur);
+
+ if (!cur->incoherent)
+ continue;
+ dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)),
+ ioptdesc_mem_size(cur), DMA_TO_DEVICE);
+ cur->incoherent = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list);
+#endif
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index b3af2813ed0ced..1c0904a90ef252 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -21,7 +21,10 @@ struct ioptdesc {
struct list_head iopt_freelist_elm;
unsigned long __page_mapping;
- pgoff_t __index;
+ union {
+ u8 incoherent;
+ pgoff_t __index;
+ };
void *_private;
unsigned int __page_type;
@@ -98,4 +101,42 @@ static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size)
return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size);
}
-#endif /* __IOMMU_PAGES_H */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev);
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+
+#ifdef CONFIG_X86
+#define IOMMU_PAGES_USE_DMA_API 0
+#include <linux/cacheflush.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ clflush_cache_range(virt + offset, len);
+}
+static inline void
+iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ /*
+ * For performance, leave the incoherent flag alone, which turns this
+ * into a NOP. On x86 the rest of the stop/free flow ignores the flag.
+ */
+}
+#else
+#define IOMMU_PAGES_USE_DMA_API 1
+#include <linux/dma-mapping.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len,
+ DMA_TO_DEVICE);
+}
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+#endif
+
+#endif /* __IOMMU_PAGES_H */
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 02/10] iommupt: Add basic support for SW bits in the page table
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 01/10] iommu/pages: Add support for an incoherent IOMMU page walker Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 03/10] iommupt: Use the incoherent start/stop functions for PT_FEAT_DMA_INCOHERENT Jason Gunthorpe
` (7 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
SW bits can be placed on items, including table entries, single OA's and
individual items within a contiguous OA. They are guaranteed to be ignored
by the HW. The API is very basic since the only use case so far is a
single bit.
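For illustration (not part of the patch), the intended acquire/release pairing
looks like the sketch below; SW_BIT_EXAMPLE_DONE is a made-up bit number, the
real user of the API appears later in the series:

enum { SW_BIT_EXAMPLE_DONE = 0 };

/* Writer: finish the work, then publish it through the SW bit */
static void example_publish(struct pt_state *pts)
{
        /* e.g. flush the newly written entry to memory */
        pt_set_sw_bit_release(pts, SW_BIT_EXAMPLE_DONE);
}

/* Reader: seeing the bit guarantees the writer's prior work is visible */
static bool example_is_published(struct pt_state *pts)
{
        return pt_test_sw_bit_acquire(pts, SW_BIT_EXAMPLE_DONE);
}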
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/pt_common.h | 29 ++++++++++
drivers/iommu/generic_pt/pt_fmt_defaults.h | 62 ++++++++++++++++++++++
2 files changed, 91 insertions(+)
diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h
index 5ed06104d38b45..ac139ae74e670a 100644
--- a/drivers/iommu/generic_pt/pt_common.h
+++ b/drivers/iommu/generic_pt/pt_common.h
@@ -338,6 +338,35 @@ static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
return __va(pt_table_pa(pts));
}
+/**
+ * pt_max_sw_bit() - Return the maximum software bit usable for any level and
+ * entry
+ * @common: Page table
+ *
+ * The swbit can be passed as bitnr to the other sw_bit functions.
+ */
+static inline unsigned int pt_max_sw_bit(struct pt_common *common);
+
+/**
+ * pt_test_sw_bit_acquire() - Read a software bit in an item
+ * @pts: Entry to test
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a test bit and acquire operation.
+ */
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_set_sw_bit_release() - Set a software bit in an item
+ * @pts: Entry to set
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a set bit and release operation.
+ */
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr);
+
/**
* pt_load_entry() - Read from the location pts points at into the pts
* @pts: Table index to load
diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
index 8738008d024b0b..a837ee9abdb7d4 100644
--- a/drivers/iommu/generic_pt/pt_fmt_defaults.h
+++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
@@ -190,4 +190,66 @@ static inline void pt_clear_entry(struct pt_state *pts,
#define pt_clear_entry pt_clear_entry
#endif
+/* If not supplied then SW bits are not supported */
+#ifdef pt_sw_bit
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ /* Acquire, pairs with pt_set_sw_bit_release() */
+ smp_mb();
+ /* For a contiguous entry the sw bit is only stored in the first item. */
+ return pts->entry & pt_sw_bit(bitnr);
+}
+#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+ if (PT_ITEM_WORD_SIZE == sizeof(u64)) {
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ u64 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ return;
+ }
+#endif
+ if (PT_ITEM_WORD_SIZE == sizeof(u32)) {
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ u32 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ } else {
+ BUILD_BUG();
+ }
+}
+#define pt_set_sw_bit_release pt_set_sw_bit_release
+#else
+static inline unsigned int pt_max_sw_bit(struct pt_common *common)
+{
+ return 0;
+}
+
+extern void __pt_no_sw_bit(void);
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+ return false;
+}
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+}
+#endif
+
#endif
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 03/10] iommupt: Use the incoherent start/stop functions for PT_FEAT_DMA_INCOHERENT
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 01/10] iommu/pages: Add support for an incoherent IOMMU page walker Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 02/10] iommupt: Add basic support for SW bits in the page table Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 04/10] iommupt: Flush the CPU cache after any writes to the page table Jason Gunthorpe
` (6 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
This is the first step toward supporting an incoherent walker: start and stop
the incoherence around the allocation and freeing of the page table memory.
The iommu_pages API maps this to dma_map/unmap_single(), or arch cache
flushing calls.
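As a sketch of what a user of the feature provides (mirroring the kunit hunk
below; a real driver calls the per-format aliases that the template generates
for pt_iommu_table/pt_iommu_init, so take the names as illustrative):

static int example_setup(struct pt_iommu_table *fmt_table,
                         const struct pt_iommu_cfg *cfg,
                         struct device *iommu_dev)
{
        /*
         * PT_FEAT_DMA_INCOHERENT needs a device whose DMA ops (or, on x86,
         * the arch clflush path) can maintain the walker's view of the
         * table memory. pt_iommu_init() now refuses to run without it.
         */
        fmt_table->iommu.iommu_device = iommu_dev;
        fmt_table->iommu.nid = dev_to_node(iommu_dev);

        return pt_iommu_init(fmt_table, cfg, GFP_KERNEL);
}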
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/iommu_pt.h | 81 ++++++++++++++++++++------
drivers/iommu/generic_pt/kunit_iommu.h | 1 +
drivers/iommu/generic_pt/pt_defs.h | 3 +
include/linux/generic_pt/common.h | 6 ++
include/linux/generic_pt/iommu.h | 7 +++
5 files changed, 81 insertions(+), 17 deletions(-)
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 2a6c1bc2bc9be7..4789fe5361cb3a 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -24,6 +24,10 @@ static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
{
struct pt_common *common = common_from_iommu(iommu_table);
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(free_list,
+ iommu_table->iommu_device);
+
if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
@@ -327,35 +331,55 @@ static int __collect_tables(struct pt_range *range, void *arg,
return 0;
}
-static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
- uintptr_t top_of_table,
- gfp_t gfp)
+enum alloc_mode { ALLOC_NORMAL, ALLOC_KEEP_INCOHERENT };
+
+/* Allocate a table, the empty table will be ready to be installed. */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+ size_t lg2sz, gfp_t gfp,
+ enum alloc_mode mode)
{
struct pt_iommu *iommu_table = iommu_from_common(common);
+ struct pt_table_p *table_mem;
+ table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
+ log2_to_int(lg2sz));
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ mode == ALLOC_NORMAL) {
+ int ret = iommu_pages_start_incoherent(
+ table_mem, iommu_table->iommu_device);
+ if (ret) {
+ iommu_free_pages(table_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ return table_mem;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+ uintptr_t top_of_table,
+ gfp_t gfp,
+ enum alloc_mode mode)
+{
/*
* Top doesn't need the free list or otherwise, so it technically
* doesn't need to use iommu pages. Use the API anyhow as the top is
* usually not smaller than PAGE_SIZE to keep things simple.
*/
- return iommu_alloc_pages_node_sz(
- iommu_table->nid, gfp,
- log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
+ return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
+ gfp, mode);
}
/* Allocate an interior table */
static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
- gfp_t gfp)
+ gfp_t gfp, enum alloc_mode mode)
{
- struct pt_iommu *iommu_table =
- iommu_from_common(parent_pts->range->common);
struct pt_state child_pts =
pt_init(parent_pts->range, parent_pts->level - 1, NULL);
- return iommu_alloc_pages_node_sz(
- iommu_table->nid, gfp,
- log2_to_int(pt_num_items_lg2(&child_pts) +
- ilog2(PT_ITEM_WORD_SIZE)));
+ return _table_alloc(parent_pts->range->common,
+ pt_num_items_lg2(&child_pts) +
+ ilog2(PT_ITEM_WORD_SIZE),
+ gfp, mode);
}
static inline int pt_iommu_new_table(struct pt_state *pts,
@@ -368,7 +392,7 @@ static inline int pt_iommu_new_table(struct pt_state *pts,
if (unlikely(!pt_can_have_table(pts)))
return -ENXIO;
- table_mem = table_alloc(pts, attrs->gfp);
+ table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
if (IS_ERR(table_mem))
return PTR_ERR(table_mem);
@@ -606,8 +630,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
}
new_level = pts.level;
- table_mem = table_alloc_top(
- common, _pt_top_set(NULL, pts.level), map->attrs.gfp);
+ table_mem =
+ table_alloc_top(common, _pt_top_set(NULL, pts.level),
+ map->attrs.gfp, ALLOC_KEEP_INCOHERENT);
if (IS_ERR(table_mem))
return PTR_ERR(table_mem);
iommu_pages_list_add(&free_list, table_mem);
@@ -624,6 +649,16 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
new_top_of_table = _pt_top_set(pts.table, pts.level);
}
+ /*
+ * Avoid double flushing; flush once after all the pt_install_table() calls
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ ret = iommu_pages_start_incoherent_list(
+ &free_list, iommu_table->iommu_device);
+ if (ret)
+ goto err_free;
+ }
+
/*
* top_of_table is write locked by the spinlock, but readers can use
* READ_ONCE() to get the value. Since we encode both the level and the
@@ -656,6 +691,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
return 0;
err_free:
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&free_list,
+ iommu_table->iommu_device);
iommu_put_pages_list(&free_list);
return ret;
}
@@ -971,6 +1009,9 @@ static void NS(deinit)(struct pt_iommu *iommu_table)
* The driver has to already have fenced the HW access to the page table
* and invalidated any caching referring to this memory.
*/
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&collect.free_list,
+ iommu_table->iommu_device);
iommu_put_pages_list(&collect.free_list);
}
@@ -1063,6 +1104,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
memset_after(fmt_table, 0, iommu.domain);
/* The caller can initialize some of these values */
+ iommu_table->iommu_device = cfg.iommu_device;
iommu_table->hw_flush_ops = cfg.hw_flush_ops;
iommu_table->nid = cfg.nid;
}
@@ -1107,11 +1149,16 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table,
pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
return -EINVAL;
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ WARN_ON(!iommu_table->iommu_device))
+ return -EINVAL;
+
ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
if (ret)
return ret;
- table_mem = table_alloc_top(common, common->top_of_table, gfp);
+ table_mem = table_alloc_top(common, common->top_of_table, gfp,
+ ALLOC_NORMAL);
if (IS_ERR(table_mem))
return PTR_ERR(table_mem);
pt_top_set(common, table_mem, pt_top_get_level(common));
diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h
index cca4e72efcaa04..45ecfa8ca5fa6c 100644
--- a/drivers/iommu/generic_pt/kunit_iommu.h
+++ b/drivers/iommu/generic_pt/kunit_iommu.h
@@ -139,6 +139,7 @@ static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
priv->fmt_table.iommu.nid = NUMA_NO_NODE;
priv->fmt_table.iommu.hw_flush_ops = &pt_kunit_flush_ops;
+ priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
priv->domain.ops = &kunit_pt_ops;
ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
if (ret) {
diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h
index 3673566708495d..869965883e6e51 100644
--- a/drivers/iommu/generic_pt/pt_defs.h
+++ b/drivers/iommu/generic_pt/pt_defs.h
@@ -55,6 +55,9 @@ enum {
PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
PT_DEBUG_SUPPORTED_FEATURES =
UINT_MAX &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ?
+ 0 :
+ BIT(PT_FEAT_DMA_INCOHERENT))) &
~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
BIT(PT_FEAT_SIGN_EXTEND)),
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index 7729008f17c799..1b97bbfaa4f90a 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -84,6 +84,12 @@ enum {
* position.
*/
enum pt_features {
+ /**
+ * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before
+ * assuming the HW can read it. Otherwise an SMP release is sufficient
+ * for HW to read it.
+ */
+ PT_FEAT_DMA_INCOHERENT,
/**
* @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
* PT_VADDR_MAX.
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 960281046e62b3..5dc3a960a8989e 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -57,6 +57,13 @@ struct pt_iommu {
* table walkers.
*/
int nid;
+
+ /**
+ * @iommu_device: Device pointer used for any DMA cache flushing when
+ * PT_FEAT_DMA_INCOHERENT is set. This is the iommu device that created
+ * the page table, which must have DMA ops that perform cache flushing.
+ */
+ struct device *iommu_device;
};
/**
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 04/10] iommupt: Flush the CPU cache after any writes to the page table
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
` (2 preceding siblings ...)
2025-08-26 17:26 ` [PATCH v2 03/10] iommupt: Use the incoherent start/stop functions for PT_FEAT_DMA_INCOHERENT Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 05/10] iommupt: Add the Intel VT-D second stage page table format Jason Gunthorpe
` (5 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
Flush the CPU cache for the page table memory after each set of writes to
the page table. The iommu should have visibility of the updated entries as
soon as the map/unmap/etc operations return, like normal coherent hardware
does.
The caches also have to be flushed before any gather can be submitted to
the driver.
Implement the same solution to the race as io-pgtable-arm by using a
software PTE bit to track whether a table entry has been flushed. If
another thread is still flushing, a concurrent map operation could otherwise
return without IOMMU visibility of a required table entry. The SW bit tells
the second thread to also flush the cache.
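Condensed from the hunks below (a sketch, not additional code), the two sides
of the race look like this:

/* Thread installing a new table level (pt_iommu_new_table() path) */
static void example_install_side(struct pt_state *pts)
{
        flush_writes_item(pts);         /* push the new entry to memory */
        pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
}

/* Concurrent mapper descending through the same entry (__map_range() path) */
static void example_descend_side(struct pt_state *pts)
{
        /*
         * If the installer has not published the flag yet it may still be
         * flushing, so flush again before relying on the entry.
         */
        if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT) &&
            !pt_test_sw_bit_acquire(pts, SW_BIT_CACHE_FLUSH_DONE))
                flush_writes_item(pts);
}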
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/iommu_pt.h | 56 ++++++++++++++++++++++++++++-
1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 4789fe5361cb3a..c04c6750d0e250 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -17,6 +17,29 @@
#include <linux/cleanup.h>
#include <linux/dma-mapping.h>
+enum {
+ SW_BIT_CACHE_FLUSH_DONE = 0,
+};
+
+static void flush_writes_range(const struct pt_state *pts,
+ unsigned int start_index, unsigned int end_index)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, start_index * PT_ITEM_WORD_SIZE,
+ (end_index - start_index) * PT_ITEM_WORD_SIZE);
+}
+
+static void flush_writes_item(const struct pt_state *pts)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, pts->index * PT_ITEM_WORD_SIZE,
+ PT_ITEM_WORD_SIZE);
+}
+
static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
struct pt_iommu *iommu_table, pt_vaddr_t iova,
pt_vaddr_t len,
@@ -195,6 +218,10 @@ static void record_dirty(struct pt_state *pts,
dirty_len);
if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
+ /*
+ * No write log required because DMA incoherence and atomic
+ * dirty tracking bits can't work together
+ */
pt_entry_set_write_clean(pts);
iommu_iotlb_gather_add_range(dirty->dirty->gather,
pts->range->va, dirty_len);
@@ -402,6 +429,11 @@ static inline int pt_iommu_new_table(struct pt_state *pts,
return -EAGAIN;
}
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
+ flush_writes_item(pts);
+ pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
+ }
+
if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
/*
* The underlying table can't store the physical table address.
@@ -461,6 +493,7 @@ static int clear_contig(const struct pt_state *start_pts,
* the gather
*/
pt_clear_entry(&pts, ilog2(1));
+ flush_writes_item(&pts);
iommu_pages_list_add(&collect.free_list,
pt_table_ptr(&pts));
@@ -515,6 +548,8 @@ static int __map_range_leaf(struct pt_range *range, void *arg,
pts.index += step;
} while (pts.index < pts.end_index);
+ flush_writes_range(&pts, start_index, pts.index);
+
map->oa = oa;
return ret;
}
@@ -549,6 +584,21 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level,
}
} else {
pts.table_lower = pt_table_ptr(&pts);
+ /*
+ * Racing with a shared pt_iommu_new_table()? The other
+ * thread is still flushing the cache, so we have to
+ * also flush it to ensure that when our thread's map
+ * completes all the table items leading to our mapping
+ * are visible.
+ *
+ * This requires the pt_set_sw_bit_release() to be a
+ * release after the cache flush so that this acquire
+ * observes that the flush has completed.
+ */
+ if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
+ !pt_test_sw_bit_acquire(&pts,
+ SW_BIT_CACHE_FLUSH_DONE))
+ flush_writes_item(&pts);
}
/*
@@ -585,6 +635,7 @@ static __always_inline int __do_map_single_page(struct pt_range *range,
return -EADDRINUSE;
pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
&map->attrs);
+ /* No flush, not used when incoherent */
map->oa += PAGE_SIZE;
return 0;
}
@@ -811,7 +862,8 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
PT_WARN_ON(map.leaf_level > range.top_level);
do {
- if (single_page) {
+ if (single_page &&
+ !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
ret = pt_walk_range(&range, __map_single_page, &map);
if (ret != -EAGAIN)
break;
@@ -922,6 +974,8 @@ static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
} while (true);
unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
+ flush_writes_range(&pts, start_index, pts.index);
+
return ret;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 05/10] iommupt: Add the Intel VT-D second stage page table format
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
` (3 preceding siblings ...)
2025-08-26 17:26 ` [PATCH v2 04/10] iommupt: Flush the CPU cache after any writes to the page table Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 06/10] iommupt/x86: Set the dirty bit only for writable PTEs Jason Gunthorpe
` (4 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
The VT-D second stage format is almost the same as the x86 PAE format,
except the bit encodings in the PTE are different and a few new PTE
features, like force coherency, are present.
Among all the formats it is unique in not having a designated present bit.
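A small sketch (not part of the patch) of what that means in practice: the
generic code treats an all-zero entry as empty, and since R/W double as the
present indicator the prot encoding refuses entries that would set neither
bit. The helpers below are illustrative only and use the VTDSS_FMT_* values
from the format header in this patch:

/* With no dedicated present bit, any access right marks the entry present */
static bool example_vtdss_entry_present(u64 entry)
{
        return entry & (VTDSS_FMT_R | VTDSS_FMT_W);
}

/* Output address of a 4k leaf: VTDSS_FMT_OA covers bits 51:12 */
static u64 example_vtdss_leaf_pa(u64 entry)
{
        return FIELD_GET(VTDSS_FMT_OA, entry) << 12;
}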
Comparing the performance of several operations to the existing version:
iommu_map()
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 53,66 , 50,64 , 21.21
2^21, 59,70 , 56,67 , 16.16
2^30, 54,66 , 52,63 , 17.17
256*2^12, 384,524 , 337,516 , 34.34
256*2^21, 387,632 , 336,626 , 46.46
256*2^30, 376,629 , 323,623 , 48.48
iommu_unmap()
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 67,86 , 63,84 , 25.25
2^21, 64,84 , 59,80 , 26.26
2^30, 59,78 , 56,74 , 24.24
256*2^12, 216,335 , 198,317 , 37.37
256*2^21, 245,350 , 232,344 , 32.32
256*2^30, 248,345 , 226,339 , 33.33
Cc: Tina Zhang <tina.zhang@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/.kunitconfig | 1 +
drivers/iommu/generic_pt/Kconfig | 11 +
drivers/iommu/generic_pt/fmt/Makefile | 2 +
drivers/iommu/generic_pt/fmt/defs_vtdss.h | 21 ++
drivers/iommu/generic_pt/fmt/iommu_vtdss.c | 10 +
drivers/iommu/generic_pt/fmt/vtdss.h | 289 +++++++++++++++++++++
include/linux/generic_pt/common.h | 18 ++
include/linux/generic_pt/iommu.h | 11 +
8 files changed, 363 insertions(+)
create mode 100644 drivers/iommu/generic_pt/fmt/defs_vtdss.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_vtdss.c
create mode 100644 drivers/iommu/generic_pt/fmt/vtdss.h
diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
index 5265d884e79cea..2f9b6060e3b983 100644
--- a/drivers/iommu/generic_pt/.kunitconfig
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -4,6 +4,7 @@ CONFIG_DEBUG_GENERIC_PT=y
CONFIG_IOMMU_PT=y
CONFIG_IOMMU_PT_AMDV1=y
CONFIG_IOMMU_PT_RISCV64=y
+CONFIG_IOMMU_PT_VTDSS=y
CONFIG_IOMMU_PT_X86_64=y
CONFIG_IOMMU_PT_KUNIT_TEST=y
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index 59007b794d3b54..5e4f44e25e38e5 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -51,6 +51,16 @@ config IOMMU_PT_RISCV64
Selected automatically by an IOMMU driver that uses this format.
+config IOMMU_PT_VTDSS
+ tristate "IOMMU page table for Intel VT-D IOMMU Second Stage"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the Intel VT-D IOMMU's 64 bit 3/4/5
+ level Second Stage page table. It is similar to the X86_64 format with
+ 4K/2M/1G page sizes.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
config IOMMU_PT_X86_64
tristate "IOMMU page table for x86 64 bit, 4/5 levels"
depends on !GENERIC_ATOMIC64 # for cmpxchg64
@@ -66,6 +76,7 @@ config IOMMU_PT_KUNIT_TEST
depends on KUNIT
depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
depends on IOMMU_PT_RISCV64 || !IOMMU_PT_RISCV64
+ depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
default KUNIT_ALL_TESTS
help
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
index 9c0edc4d5396b3..6fe95fc8466523 100644
--- a/drivers/iommu/generic_pt/fmt/Makefile
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -5,6 +5,8 @@ iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
iommu_pt_fmt-$(CONFIG_IOMMU_PT_RISCV64) += riscv64
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
+
iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
IOMMU_PT_KUNIT_TEST :=
diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
new file mode 100644
index 00000000000000..4a239bcaae2a90
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
+#define __GENERIC_PT_FMT_DEFS_VTDSS_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct vtdss_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs vtdss_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
new file mode 100644
index 00000000000000..f551711e2a336d
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT vtdss
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
+ BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
new file mode 100644
index 00000000000000..da2f11d2e348c0
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Intel VT-D Second Stage 5/4 level page table
+ *
+ * This is described in
+ * Section "3.7 Second-Stage Translation"
+ * Section "9.8 Second-Stage Paging Entries"
+ *
+ * Of the "Intel Virtualization Technology for Directed I/O Architecture
+ * Specification".
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/SS-PTE - 0
+ * Directory/SS-PDE - 1
+ * Directory Ptr/SS-PDPTE - 2
+ * PML4/SS-PML4E - 3
+ * PML5/SS-PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_VTDSS_H
+#define __GENERIC_PT_FMT_VTDSS_H
+
+#include "defs_vtdss.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* SSPTPTR is 4k aligned and limited by HAW */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ VTDSS_FMT_R = BIT(0),
+ VTDSS_FMT_W = BIT(1),
+ VTDSS_FMT_A = BIT(8),
+ VTDSS_FMT_D = BIT(9),
+ VTDSS_FMT_SNP = BIT(11),
+ VTDSS_FMT_OA = GENMASK_ULL(51, 12),
+};
+
+/* PDPTE/PDE */
+enum {
+ VTDSS_FMT_PS = BIT(7),
+};
+
+#define common_to_vtdss_pt(common_ptr) \
+ container_of_const(common_ptr, struct pt_vtdss, common)
+#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
+
+static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa vtdss_pt_table_pa
+
+static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa vtdss_pt_entry_oa
+
+static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf vtdss_pt_can_have_leaf
+
+static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 vtdss_pt_num_items_lg2
+
+static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!entry)
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw vtdss_pt_load_entry_raw
+
+static inline void
+vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= VTDSS_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
+
+static inline bool vtdss_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = VTDSS_FMT_R | VTDSS_FMT_W |
+ FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table vtdss_pt_install_table
+
+static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
+}
+#define pt_attr_from_entry vtdss_pt_attr_from_entry
+
+static inline bool vtdss_pt_entry_write_is_dirty(const struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ return READ_ONCE(*tablep) & VTDSS_FMT_D;
+}
+#define pt_entry_write_is_dirty vtdss_pt_entry_write_is_dirty
+
+static inline void vtdss_pt_entry_set_write_clean(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
+}
+#define pt_entry_set_write_clean vtdss_pt_entry_set_write_clean
+
+static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | VTDSS_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
+
+static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
+{
+ return 10;
+}
+#define pt_max_sw_bit vtdss_pt_max_sw_bit
+
+static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
+{
+ /* Bits marked Ignored in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(10);
+ case 1 ... 9:
+ return BIT_ULL((bitnr - 1) + 52);
+ case 10:
+ return BIT_ULL(63);
+ /* Some bits in 9-3 are available in some entries */
+ default:
+ if (__builtin_constant_p(bitnr))
+ BUILD_BUG();
+ else
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit vtdss_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_vtdss
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->vtdss_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
+ ->iommu;
+}
+
+static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ /*
+ * VTDSS does not have a present bit, so we tell if any entry is present
+ * by checking for R or W.
+ */
+ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+ return -EINVAL;
+
+ if (iommu_prot & IOMMU_READ)
+ pte |= VTDSS_FMT_R;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= VTDSS_FMT_W;
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+ pte |= VTDSS_FMT_SNP;
+
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+ !(iommu_prot & IOMMU_WRITE)) {
+ pr_err_ratelimited(
+ "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+ const struct pt_iommu_vtdss_cfg *cfg)
+{
+ struct pt_vtdss *table = &iommu_table->vtdss_pt;
+ unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+
+ if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2)
+ return -EOPNOTSUPP;
+ else if (vasz_lg2 > 48)
+ pt_top_set_level(&table->common, 4);
+ else if (vasz_lg2 > 39)
+ pt_top_set_level(&table->common, 3);
+ else if (vasz_lg2 > 30)
+ pt_top_set_level(&table->common, 2);
+ else
+ return -EOPNOTSUPP;
+ return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_vtdss_hw_info *info)
+{
+ info->ssptptr = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+ /*
+ * top_level = 2 = 3 level table aw=1
+ * top_level = 3 = 4 level table aw=2
+ * top_level = 4 = 5 level table aw=3
+ */
+ info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+ [0] = { .common.hw_max_vasz_lg2 = 39 },
+ [1] = { .common.hw_max_vasz_lg2 = 48 },
+ [2] = { .common.hw_max_vasz_lg2 = 57 },
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
index 1b97bbfaa4f90a..fa6e36e0b9efa3 100644
--- a/include/linux/generic_pt/common.h
+++ b/include/linux/generic_pt/common.h
@@ -171,6 +171,24 @@ enum {
PT_FEAT_RSICV_SVNAPOT_64K = PT_FEAT_FMT_START,
};
+struct pt_vtdss {
+ struct pt_common common;
+};
+
+enum {
+ /*
+ * The PTEs are set to prevent cache incoherent traffic, such as PCI no
+ * snoop. This is set either at creation time or before the first map
+ * operation.
+ */
+ PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START,
+ /*
+ * Prevent creating read-only PTEs. Used to work around HW errata
+ * ERRATA_772415_SPR17.
+ */
+ PT_FEAT_VTDSS_FORCE_WRITEABLE,
+};
+
struct pt_x86_64 {
struct pt_common common;
};
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 5dc3a960a8989e..9557e78c110fde 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -272,6 +272,17 @@ struct pt_iommu_riscv_64_hw_info {
IOMMU_FORMAT(riscv_64, riscv_64pt);
+struct pt_iommu_vtdss_cfg {
+ struct pt_iommu_cfg common;
+};
+
+struct pt_iommu_vtdss_hw_info {
+ u64 ssptptr;
+ u8 aw;
+};
+
+IOMMU_FORMAT(vtdss, vtdss_pt);
+
struct pt_iommu_x86_64_cfg {
struct pt_iommu_cfg common;
};
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 06/10] iommupt/x86: Set the dirty bit only for writable PTEs
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
` (4 preceding siblings ...)
2025-08-26 17:26 ` [PATCH v2 05/10] iommupt: Add the Intel VT-D second stage page table format Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 07/10] iommupt/x86: Support SW bits and permit PT_FEAT_DMA_INCOHERENT Jason Gunthorpe
` (3 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
AMD and VTD are historically different here; adopt the VTD version of
setting the D bit only on writable PTEs, as it makes more sense.
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/fmt/x86_64.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
index c01815b6229cce..60f8bd6b164b54 100644
--- a/drivers/iommu/generic_pt/fmt/x86_64.h
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -185,9 +185,9 @@ static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
{
u64 pte;
- pte = X86_64_FMT_U | X86_64_FMT_A | X86_64_FMT_D;
+ pte = X86_64_FMT_U | X86_64_FMT_A;
if (iommu_prot & IOMMU_WRITE)
- pte |= X86_64_FMT_RW;
+ pte |= X86_64_FMT_RW | X86_64_FMT_D;
/*
* Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 07/10] iommupt/x86: Support SW bits and permit PT_FEAT_DMA_INCOHERENT
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
` (5 preceding siblings ...)
2025-08-26 17:26 ` [PATCH v2 06/10] iommupt/x86: Set the dirty bit only for writable PTEs Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 08/10] iommu/vt-d: Use the generic iommu page table Jason Gunthorpe
` (2 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
VT-D requires PT_FEAT_DMA_INCOHERENT for the x86 page table as well;
implement the required SW bits and enable the feature.
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 2 +-
drivers/iommu/generic_pt/fmt/x86_64.h | 27 +++++++++++++++++++++
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
index 5c5960d871a32f..5472660c2d7156 100644
--- a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
+++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
@@ -6,6 +6,6 @@
#define PT_SUPPORTED_FEATURES \
(BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \
BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
- BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT))
#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
index 60f8bd6b164b54..0ebe77c6fda451 100644
--- a/drivers/iommu/generic_pt/fmt/x86_64.h
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -160,6 +160,33 @@ static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts,
}
#define pt_attr_from_entry x86_64_pt_attr_from_entry
+static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common)
+{
+ return 12;
+}
+#define pt_max_sw_bit x86_64_pt_max_sw_bit
+
+static inline u64 x86_64_pt_sw_bit(unsigned int bitnr)
+{
+ /* Bits marked Ignored/AVL in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(9);
+ case 1:
+ return BIT(11);
+ case 2 ... 12:
+ return BIT_ULL((bitnr - 2) + 52);
+ /* Some bits in 8,6,4,3 are available in some entries */
+ default:
+ if (__builtin_constant_p(bitnr))
+ BUILD_BUG();
+ else
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit x86_64_pt_sw_bit
+
/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 08/10] iommu/vt-d: Use the generic iommu page table
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
` (6 preceding siblings ...)
2025-08-26 17:26 ` [PATCH v2 07/10] iommupt/x86: Support SW bits and permit PT_FEAT_DMA_INCOHERENT Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 09/10] iommu/vt-d: Follow PT_FEAT_DMA_INCOHERENT into the PASID entry Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 10/10] iommupt: Add a kunit test for the SW bits Jason Gunthorpe
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
Replace the VT-D iommu_domain implementation of the second stage and
first stage page tables with the iommupt VTDSS and x86_64 page tables.
The x86_64 format is shared with the AMD driver.
There are a couple of notable things in VT-D:
- Like AMD, the second stage format is not sign extended; unlike AMD, it
cannot decode a full 64 bits. The first stage format is a normal
sign-extended x86 page table
- The HW caps can indicate how many levels, how many address bits and what
leaf page sizes are supported in HW. As before the highest number of
levels that can translate the entire supported address width is used.
The supported page sizes are adjusted directly from the dedicated
first/second stage cap bits.
- VTD requires flushing 'write buffers'. This logic is left unchanged;
the write buffer is flushed on any gather flush or through iotlb_sync_map.
- Like ARM, VTD has an optional non-coherent page table walker that
requires cache flushing. This is supported through PT_FEAT_DMA_INCOHERENT,
the same as ARM; however, x86 can't use the DMA API for the flush, it must
call the arch function clflush_cache_range()
- The PT_FEAT_DYNAMIC_TOP can probably be supported on VTD someday for the
second stage when it uses 128 bit atomic stores for the HW context
structures.
- PT_FEAT_VTDSS_FORCE_WRITEABLE is used to work around ERRATA_772415_SPR17
- A kernel command line parameter "sp_off" disables all page sizes except
4k
Remove all the unused iommu_domain page table code. The debugfs paths have
their own independent page table walker that is left alone for now.
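As a rough sketch of how the notable points above become a table
configuration (the helper and the nested_parent flag are illustrative, not
the exact code in this patch; the real driver derives the same decisions from
the same caps):

static unsigned int example_vtdss_features(struct intel_iommu *iommu,
                                           bool nested_parent)
{
        unsigned int features = 0;

        /* Non-coherent walker: the CPU must clflush page table memory */
        if (!iommu_paging_structure_coherency(iommu))
                features |= BIT(PT_FEAT_DMA_INCOHERENT);

        /*
         * Force-snoop PTEs; the driver ties this to enforce_cache_coherency,
         * keyed off the SC capability here purely for illustration.
         */
        if (ecap_sc_support(iommu->ecap))
                features |= BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);

        /* ERRATA_772415_SPR17: no read-only mappings on a nesting parent */
        if (nested_parent)
                features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);

        return features;
}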
This corrects a race with the non-coherent walker that the ARM
implementations have fixed:
  CPU 0                                CPU 1
 pfn_to_dma_pte()                      pfn_to_dma_pte()
  pte = &parent[offset];
  if (!dma_pte_present(pte)) {
   try_cmpxchg64(&pte->val)
                                        pte = &parent[offset];
                                        .. dma_pte_present(pte) ..
                                        [...]
                                        // iommu_map() completes
                                        // Device does DMA
   domain_flush_cache(pte)
The CPU 1 mapping operation shares a page table level with the CPU 0
mapping operation. CPU 0 installed a new page table level but has not
flushed it yet. CPU1 returns from iommu_map() and the device does DMA. The
non coherent walker fails to see the new table level installed by CPU 0
and fails the DMA with non-present.
The iommupt PT_FEAT_DMA_INCOHERENT implementation uses the ARM design of
storing a flag when CPU 0 completes the flush. If the flag is not set CPU
1 will also flush to ensure the HW can fully walk to the PTE being
installed.
Cc: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/intel/Kconfig | 4 +
drivers/iommu/intel/iommu.c | 896 ++++++-----------------------------
drivers/iommu/intel/iommu.h | 99 +---
drivers/iommu/intel/nested.c | 5 -
drivers/iommu/intel/pasid.c | 29 +-
5 files changed, 175 insertions(+), 858 deletions(-)
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index f2f538c7065032..b847266b19514f 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -13,6 +13,10 @@ config INTEL_IOMMU
bool "Support for Intel IOMMU using DMA Remapping Devices"
depends on PCI_MSI && ACPI && X86
select IOMMU_API
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_X86_64
+ select IOMMU_PT_VTDSS
select IOMMU_IOVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 9c3ab9d9f69a3e..6a269d201a614b 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -45,16 +45,9 @@
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
-#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
- to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
- __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
static void __init check_tylersburg_isoch(void);
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
static int rwbf_quirk;
#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
@@ -217,7 +210,6 @@ static int disable_igfx_iommu;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
-static const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -285,13 +277,6 @@ static int __init intel_iommu_setup(char *str)
}
__setup("intel_iommu=", intel_iommu_setup);
-static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
-{
- int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-
- return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
-}
-
/*
* Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
@@ -353,23 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}
-/* Return the super pagesize bitmap if supported. */
-static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
-{
- unsigned long bitmap = 0;
-
- /*
- * 1-level super page supports page size of 2MiB, 2-level super page
- * supports page size of both 2MiB and 1GiB.
- */
- if (domain->iommu_superpage == 1)
- bitmap |= SZ_2M;
- else if (domain->iommu_superpage == 2)
- bitmap |= SZ_2M | SZ_1G;
-
- return bitmap;
-}
-
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
u8 devfn, int alloc)
{
@@ -556,13 +524,6 @@ static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *
return iommu;
}
-static void domain_flush_cache(struct dmar_domain *domain,
- void *addr, int size)
-{
- if (!domain->iommu_coherency)
- clflush_cache_range(addr, size);
-}
-
static void free_context_table(struct intel_iommu *iommu)
{
struct context_entry *context;
@@ -707,280 +668,6 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
}
#endif
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn, int *target_level,
- gfp_t gfp)
-{
- struct dma_pte *parent, *pte;
- int level = agaw_to_level(domain->agaw);
- int offset;
-
- if (!domain_pfn_supported(domain, pfn))
- /* Address beyond IOMMU's addressing capabilities. */
- return NULL;
-
- parent = domain->pgd;
-
- while (1) {
- void *tmp_page;
-
- offset = pfn_level_offset(pfn, level);
- pte = &parent[offset];
- if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
- break;
- if (level == *target_level)
- break;
-
- if (!dma_pte_present(pte)) {
- uint64_t pteval, tmp;
-
- tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
- SZ_4K);
-
- if (!tmp_page)
- return NULL;
-
- domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
- pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
- DMA_PTE_WRITE;
- if (domain->use_first_level)
- pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
-
- tmp = 0ULL;
- if (!try_cmpxchg64(&pte->val, &tmp, pteval))
- /* Someone else set it while we were thinking; use theirs. */
- iommu_free_pages(tmp_page);
- else
- domain_flush_cache(domain, pte, sizeof(*pte));
- }
- if (level == 1)
- break;
-
- parent = phys_to_virt(dma_pte_addr(pte));
- level--;
- }
-
- if (!*target_level)
- *target_level = level;
-
- return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
- unsigned long pfn,
- int level, int *large_page)
-{
- struct dma_pte *parent, *pte;
- int total = agaw_to_level(domain->agaw);
- int offset;
-
- parent = domain->pgd;
- while (level <= total) {
- offset = pfn_level_offset(pfn, total);
- pte = &parent[offset];
- if (level == total)
- return pte;
-
- if (!dma_pte_present(pte)) {
- *large_page = total;
- break;
- }
-
- if (dma_pte_superpage(pte)) {
- *large_page = total;
- return pte;
- }
-
- parent = phys_to_virt(dma_pte_addr(pte));
- total--;
- }
- return NULL;
-}
-
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned int large_page;
- struct dma_pte *first_pte, *pte;
-
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- do {
- large_page = 1;
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
- if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, large_page + 1);
- continue;
- }
- do {
- dma_clear_pte(pte);
- start_pfn += lvl_to_nr_pages(large_page);
- pte++;
- } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
-
- } while (start_pfn && start_pfn <= last_pfn);
-}
-
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
- int retain_level, struct dma_pte *pte,
- unsigned long pfn, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
- struct dma_pte *level_pte;
-
- if (!dma_pte_present(pte) || dma_pte_superpage(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
- level_pte = phys_to_virt(dma_pte_addr(pte));
-
- if (level > 2) {
- dma_pte_free_level(domain, level - 1, retain_level,
- level_pte, level_pfn, start_pfn,
- last_pfn);
- }
-
- /*
- * Free the page table if we're below the level we want to
- * retain and the range covers the entire table.
- */
- if (level < retain_level && !(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
- domain_flush_cache(domain, pte, sizeof(*pte));
- iommu_free_pages(level_pte);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- int retain_level)
-{
- dma_pte_clear_range(domain, start_pfn, last_pfn);
-
- /* We don't need lock here; nobody else touches the iova range */
- dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
- domain->pgd, 0, start_pfn, last_pfn);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_free_pages(domain->pgd);
- domain->pgd = NULL;
- }
-}
-
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
-static void dma_pte_list_pagetables(struct dmar_domain *domain,
- int level, struct dma_pte *parent_pte,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
-
- iommu_pages_list_add(freelist, pte);
-
- if (level == 1)
- return;
-
- do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
- pte++;
- } while (!first_pte_in_page(pte));
-}
-
-static void dma_pte_clear_level(struct dmar_domain *domain, int level,
- struct dma_pte *pte, unsigned long pfn,
- unsigned long start_pfn, unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn = pfn & level_mask(level);
-
- if (!dma_pte_present(pte))
- goto next;
-
- /* If range covers entire pagetable, free it */
- if (start_pfn <= level_pfn &&
- last_pfn >= level_pfn + level_size(level) - 1) {
- /* These suborbinate page tables are going away entirely. Don't
- bother to clear them; we're just going to *free* them. */
- if (level > 1 && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
- dma_clear_pte(pte);
- if (!first_pte)
- first_pte = pte;
- last_pte = pte;
- } else if (level > 1) {
- /* Recurse down into a level that isn't *entirely* obsolete */
- dma_pte_clear_level(domain, level - 1,
- phys_to_virt(dma_pte_addr(pte)),
- level_pfn, start_pfn, last_pfn,
- freelist);
- }
-next:
- pfn = level_pfn + level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
- if (first_pte)
- domain_flush_cache(domain, first_pte,
- (void *)++last_pte - (void *)first_pte);
-}
-
-/* We can't just free the pages because the IOMMU may still be walking
- the page tables, and may have cached the intermediate levels. The
- pages can only be freed after the IOTLB flush has been done. */
-static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
- unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
- domain->pgd, 0, start_pfn, last_pfn, freelist);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_pages_list_add(freelist, domain->pgd);
- domain->pgd = NULL;
- }
-}
-
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
@@ -1460,13 +1147,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
domain_lookup_dev_info(domain, iommu, bus, devfn);
u16 did = domain_id_iommu(domain, iommu);
int translation = CONTEXT_TT_MULTI_LEVEL;
- struct dma_pte *pgd = domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
struct context_entry *context;
int ret;
if (WARN_ON(!intel_domain_is_ss_paging(domain)))
return -EINVAL;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1489,8 +1178,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
else
translation = CONTEXT_TT_MULTI_LEVEL;
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_address_width(context, domain->agaw);
+ context_set_address_root(context, pt_info.ssptptr);
+ context_set_address_width(context, pt_info.aw);
context_set_translation_type(context, translation);
context_set_fault_enable(context);
context_set_present(context);
@@ -1537,172 +1226,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev)
return 0;
}
-/* Return largest possible superpage level for a given mapping */
-static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phy_pfn, unsigned long pages)
-{
- int support, level = 1;
- unsigned long pfnmerge;
-
- support = domain->iommu_superpage;
-
- /* To use a large page, the virtual *and* physical addresses
- must be aligned to 2MiB/1GiB/etc. Lower bits set in either
- of them will mean we have to use smaller pages. So just
- merge them and check both at once. */
- pfnmerge = iov_pfn | phy_pfn;
-
- while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
- pages >>= VTD_STRIDE_SHIFT;
- if (!pages)
- break;
- pfnmerge >>= VTD_STRIDE_SHIFT;
- level++;
- support--;
- }
- return level;
-}
-
-/*
- * Ensure that old small page tables are removed to make room for superpage(s).
- * We're going to add new large pages, so make sure we don't remove their parent
- * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
- */
-static void switch_to_super_page(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long end_pfn, int level)
-{
- unsigned long lvl_pages = lvl_to_nr_pages(level);
- struct dma_pte *pte = NULL;
-
- while (start_pfn <= end_pfn) {
- if (!pte)
- pte = pfn_to_dma_pte(domain, start_pfn, &level,
- GFP_ATOMIC);
-
- if (dma_pte_present(pte)) {
- dma_pte_free_pagetable(domain, start_pfn,
- start_pfn + lvl_pages - 1,
- level + 1);
-
- cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
- end_pfn << VTD_PAGE_SHIFT, 0);
- }
-
- pte++;
- start_pfn += lvl_pages;
- if (first_pte_in_page(pte))
- pte = NULL;
- }
-}
-
-static int
-__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phys_pfn, unsigned long nr_pages, int prot,
- gfp_t gfp)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- unsigned int largepage_lvl = 0;
- unsigned long lvl_pages = 0;
- phys_addr_t pteval;
- u64 attr;
-
- if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
- return -EINVAL;
-
- if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
- return -EINVAL;
-
- if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
- pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
- return -EINVAL;
- }
-
- attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
- if (domain->use_first_level) {
- attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
- if (prot & DMA_PTE_WRITE)
- attr |= DMA_FL_PTE_DIRTY;
- }
-
- domain->has_mappings = true;
-
- pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
-
- while (nr_pages > 0) {
- uint64_t tmp;
-
- if (!pte) {
- largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
- phys_pfn, nr_pages);
-
- pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
- gfp);
- if (!pte)
- return -ENOMEM;
- first_pte = pte;
-
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- /* It is large page*/
- if (largepage_lvl > 1) {
- unsigned long end_pfn;
- unsigned long pages_to_remove;
-
- pteval |= DMA_PTE_LARGE_PAGE;
- pages_to_remove = min_t(unsigned long, nr_pages,
- nr_pte_to_next_page(pte) * lvl_pages);
- end_pfn = iov_pfn + pages_to_remove - 1;
- switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
- } else {
- pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
- }
-
- }
- /* We don't need lock here, nobody else
- * touches the iova range
- */
- tmp = 0ULL;
- if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
- static int dumps = 5;
- pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
- iov_pfn, tmp, (unsigned long long)pteval);
- if (dumps) {
- dumps--;
- debug_dma_dump_mappings(NULL);
- }
- WARN_ON(1);
- }
-
- nr_pages -= lvl_pages;
- iov_pfn += lvl_pages;
- phys_pfn += lvl_pages;
- pteval += lvl_pages * VTD_PAGE_SIZE;
-
- /* If the next PTE would be the first in a new page, then we
- * need to flush the cache on the entries we've just written.
- * And then we'll need to recalculate 'pte', so clear it and
- * let it get set again in the if (!pte) block above.
- *
- * If we're done (!nr_pages) we need to flush the cache too.
- *
- * Also if we've been setting superpages, we may need to
- * recalculate 'pte' and switch back to smaller pages for the
- * end of the mapping, if the trailing size is not enough to
- * use another superpage (i.e. nr_pages < lvl_pages).
- */
- pte++;
- if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && nr_pages < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
- }
-
- return 0;
-}
-
static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
struct intel_iommu *iommu = info->iommu;
@@ -1764,14 +1287,14 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
struct device *dev,
u32 pasid, struct iommu_domain *old)
{
- struct dma_pte *pgd = domain->pgd;
- int level, flags = 0;
+ struct pt_iommu_x86_64_hw_info pt_info;
+ unsigned int flags = 0;
- level = agaw_to_level(domain->agaw);
- if (level != 4 && level != 5)
+ pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
+ if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
return -EINVAL;
- if (level == 5)
+ if (pt_info.levels == 5)
flags |= PASID_FLAG_FL5LP;
if (domain->force_snooping)
@@ -1779,7 +1302,7 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
return __domain_setup_first_level(iommu, dev, pasid,
domain_id_iommu(domain, iommu),
- __pa(pgd), flags, old);
+ pt_info.gcr3_pt, flags, old);
}
static int dmar_domain_attach_device(struct dmar_domain *domain,
@@ -3246,23 +2769,9 @@ static struct iommu_domain blocking_domain = {
}
};
-static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
+static struct dmar_domain *paging_domain_alloc(void)
{
- if (!intel_iommu_superpage)
- return 0;
-
- if (first_stage)
- return cap_fl1gp_support(iommu->cap) ? 2 : 1;
-
- return fls(cap_super_page_val(iommu->cap));
-}
-
-static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
-{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct intel_iommu *iommu = info->iommu;
struct dmar_domain *domain;
- int addr_width;
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
@@ -3277,48 +2786,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st
INIT_LIST_HEAD(&domain->s1_domains);
spin_lock_init(&domain->s1_lock);
- domain->nid = dev_to_node(dev);
- domain->use_first_level = first_stage;
-
- domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
-
- /* calculate the address width */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
- domain->gaw = addr_width;
- domain->agaw = iommu->agaw;
- domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
-
- /* iommu memory access coherency */
- domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
-
- /* pagesize bitmap */
- domain->domain.pgsize_bitmap = SZ_4K;
- domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
- domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
-
- /*
- * IOVA aperture: First-level translation restricts the input-address
- * to a canonical address (i.e., address bits 63:N have the same value
- * as address bit [N-1], where N is 48-bits with 4-level paging and
- * 57-bits with 5-level paging). Hence, skip bit [N-1].
- */
- domain->domain.geometry.force_aperture = true;
- domain->domain.geometry.aperture_start = 0;
- if (first_stage)
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
- else
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
-
- /* always allocate the top pgd */
- domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
- if (!domain->pgd) {
- kfree(domain);
- return ERR_PTR(-ENOMEM);
- }
- domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
-
return domain;
}
@@ -3326,7 +2793,9 @@ static struct iommu_domain *
intel_iommu_domain_alloc_first_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_x86_64_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ int ret;
if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
@@ -3335,10 +2804,22 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, true);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ if (cap_fl5lp_support(iommu->cap))
+ cfg.common.hw_max_vasz_lg2 = 57;
+ else
+ cfg.common.hw_max_vasz_lg2 = 48;
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
+ BIT(PT_FEAT_FLUSH_RANGE);
+ /* First stage always uses scalable mode */
+ if (!ecap_smpwc(iommu->ecap))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
/*
* iotlb sync for map is only needed for legacy implementations that
@@ -3348,14 +2829,52 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (rwbf_required(iommu))
dmar_domain->iotlb_sync_map = true;
+ ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ if (!cap_fl1gp_support(iommu->cap))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
+
return &dmar_domain->domain;
}
+static int compute_vasz_lg2_ss(struct intel_iommu *iommu)
+{
+ unsigned int sagaw = cap_sagaw(iommu->cap);
+ unsigned int mgaw = cap_mgaw(iommu->cap);
+
+ /*
+ * Find the largest table size that both the mgaw and sagaw support.
+ * This sets both the number of table levels and the valid range of
+ * IOVA.
+ */
+ if (mgaw >= 48 && (sagaw & BIT(3)))
+ return min(57, mgaw);
+ else if (mgaw >= 39 && (sagaw & BIT(2)))
+ return min(48, mgaw);
+ else if (mgaw >= 30 && (sagaw & BIT(1)))
+ return min(39, mgaw);
+ return 0;
+}
+
+static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(vtdss),
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+};
+
static struct iommu_domain *
intel_iommu_domain_alloc_second_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_vtdss_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ unsigned int sslps;
+ int ret;
if (flags &
(~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
@@ -3372,15 +2891,46 @@ intel_iommu_domain_alloc_second_stage(struct device *dev,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, false);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
+
+ /*
+ * Read-only mapping is disallowed on the domain which serves as the
+ * parent in a nested configuration, due to HW errata
+ * (ERRATA_772415_SPR17)
+ */
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
+ cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
+
+ if (!iommu_paging_structure_coherency(iommu))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
- dmar_domain->domain.dirty_ops = &intel_dirty_ops;
+ dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
+
+ ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ /* Adjust the supported page sizes to HW capability */
+ sslps = cap_super_page_val(iommu->cap);
+ if (!(sslps & BIT(0)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
+ if (!(sslps & BIT(1)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
/*
* Besides the internal write buffer flush, the caching mode used for
@@ -3422,14 +2972,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain)
if (WARN_ON(!list_empty(&dmar_domain->devices)))
return;
- if (dmar_domain->pgd) {
- struct iommu_pages_list freelist =
- IOMMU_PAGES_LIST_INIT(freelist);
-
- domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
- &freelist);
- iommu_put_pages_list(&freelist);
- }
+ pt_iommu_deinit(&dmar_domain->iommu);
kfree(dmar_domain->qi_batch);
kfree(dmar_domain);
@@ -3446,6 +2989,16 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return -EINVAL;
+ if (!!ecap_smpwc(iommu->ecap) !=
+ !(dmar_domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+ /* Supports the number of table levels */
+ if (!cap_fl5lp_support(iommu->cap) &&
+ dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
+ return -EINVAL;
+
/* Same page size support */
if (!cap_fl1gp_support(iommu->cap) &&
(dmar_domain->domain.pgsize_bitmap & SZ_1G))
@@ -3462,7 +3015,11 @@ static int
paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
struct intel_iommu *iommu)
{
+ unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
unsigned int sslps = cap_super_page_val(iommu->cap);
+ struct pt_iommu_vtdss_hw_info pt_info;
+
+ pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
return -EINVAL;
@@ -3473,6 +3030,19 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return -EINVAL;
+ if (iommu_paging_structure_coherency(iommu) !=
+ !(dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+ /* Address width falls within the capability */
+ if (cap_mgaw(iommu->cap) < vasz_lg2)
+ return -EINVAL;
+
+ /* Page table level is supported. */
+ if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
+ return -EINVAL;
+
/* Same page size support */
if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
return -EINVAL;
@@ -3484,6 +3054,14 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
!dmar_domain->iotlb_sync_map)
return -EINVAL;
+ /*
+ * FIXME this is locked wrong, it needs to be under the
+ * dmar_domain->lock
+ */
+ if ((dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
+ !ecap_sc_support(iommu->ecap))
+ return -EINVAL;
return 0;
}
@@ -3493,7 +3071,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu = info->iommu;
int ret = -EINVAL;
- int addr_width;
if (intel_domain_is_fs_paging(dmar_domain))
ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
@@ -3504,26 +3081,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
if (ret)
return ret;
- /*
- * FIXME this is locked wrong, it needs to be under the
- * dmar_domain->lock
- */
- if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
- return -EINVAL;
-
- if (dmar_domain->iommu_coherency !=
- iommu_paging_structure_coherency(iommu))
- return -EINVAL;
-
-
- /* check if this iommu agaw is sufficient for max mapped address */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
-
- if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
- return -EINVAL;
-
if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
context_copied(iommu, info->bus, info->devfn))
return intel_pasid_setup_sm_context(dev);
@@ -3553,110 +3110,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
return ret;
}
-static int intel_iommu_map(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot, gfp_t gfp)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- u64 max_addr;
- int prot = 0;
-
- if (iommu_prot & IOMMU_READ)
- prot |= DMA_PTE_READ;
- if (iommu_prot & IOMMU_WRITE)
- prot |= DMA_PTE_WRITE;
- if (dmar_domain->set_pte_snp)
- prot |= DMA_PTE_SNP;
-
- max_addr = iova + size;
- if (dmar_domain->max_addr < max_addr) {
- u64 end;
-
- /* check if minimum agaw is sufficient for mapped address */
- end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
- if (end < max_addr) {
- pr_err("%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, dmar_domain->gaw, max_addr);
- return -EFAULT;
- }
- dmar_domain->max_addr = max_addr;
- }
- /* Round up size to next multiple of PAGE_SIZE, if it and
- the low bits of hpa would take us onto the next page */
- size = aligned_nrpages(hpa, size);
- return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
- hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
-}
-
-static int intel_iommu_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
- int ret;
-
- if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
- return -EINVAL;
-
- if (!IS_ALIGNED(iova | paddr, pgsize))
- return -EINVAL;
-
- ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
- if (!ret && mapped)
- *mapped = size;
-
- return ret;
-}
-
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- struct iommu_iotlb_gather *gather)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long start_pfn, last_pfn;
- int level = 0;
-
- /* Cope with horrid API which requires us to unmap more than the
- size argument if it happens to be a large-page mapping. */
- if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
- &level, GFP_ATOMIC)))
- return 0;
-
- if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
- size = VTD_PAGE_SIZE << level_to_offset_bits(level);
-
- start_pfn = iova >> VTD_PAGE_SHIFT;
- last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
-
- domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
-
- if (dmar_domain->max_addr == iova + size)
- dmar_domain->max_addr = iova;
-
- /*
- * We do not use page-selective IOTLB invalidation in flush queue,
- * so there is no need to track page and sync iotlb.
- */
- if (!iommu_iotlb_gather_queued(gather))
- iommu_iotlb_gather_add_page(domain, gather, iova, size);
-
- return size;
-}
-
-static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
-
- return intel_iommu_unmap(domain, iova, size, gather);
-}
-
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
@@ -3666,24 +3119,6 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain,
iommu_put_pages_list(&gather->freelist);
}
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct dma_pte *pte;
- int level = 0;
- u64 phys = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
- GFP_ATOMIC);
- if (pte && dma_pte_present(pte))
- phys = dma_pte_addr(pte) +
- (iova & (BIT_MASK(level_to_offset_bits(level) +
- VTD_PAGE_SHIFT) - 1));
-
- return phys;
-}
-
static bool domain_support_force_snooping(struct dmar_domain *domain)
{
struct device_domain_info *info;
@@ -3725,15 +3160,15 @@ static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
guard(spinlock_irqsave)(&dmar_domain->lock);
- if (!domain_support_force_snooping(dmar_domain) ||
- dmar_domain->has_mappings)
+ if (!domain_support_force_snooping(dmar_domain))
return false;
/*
* Second level page table supports per-PTE snoop control. The
* iommu_map() interface will handle this by setting SNP bit.
*/
- dmar_domain->set_pte_snp = true;
+ dmar_domain->sspt.vtdss_pt.common.features |=
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
dmar_domain->force_snooping = true;
return true;
}
@@ -4297,49 +3732,6 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
return ret;
}
-static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long end = iova + size - 1;
- unsigned long pgsize;
-
- /*
- * IOMMUFD core calls into a dirty tracking disabled domain without an
- * IOVA bitmap set in order to clean dirty bits in all PTEs that might
- * have occurred when we stopped dirty tracking. This ensures that we
- * never inherit dirtied bits from a previous cycle.
- */
- if (!dmar_domain->dirty_tracking && dirty->bitmap)
- return -EINVAL;
-
- do {
- struct dma_pte *pte;
- int lvl = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
- GFP_ATOMIC);
- pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
- if (!pte || !dma_pte_present(pte)) {
- iova += pgsize;
- continue;
- }
-
- if (dma_sl_pte_test_and_clear_dirty(pte, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-static const struct iommu_dirty_ops intel_dirty_ops = {
- .set_dirty_tracking = intel_iommu_set_dirty_tracking,
- .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
-};
-
static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
@@ -4457,27 +3849,23 @@ static struct iommu_domain identity_domain = {
};
const struct iommu_domain_ops intel_fs_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
};
const struct iommu_domain_ops intel_ss_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(vtdss),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
};
@@ -4792,3 +4180,5 @@ int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
return ret;
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index d09b9287165927..df069e4074c92a 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -23,8 +23,8 @@
#include <linux/xarray.h>
#include <linux/perf_event.h>
#include <linux/pci.h>
+#include <linux/generic_pt/iommu.h>
-#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <uapi/linux/iommufd.h>
@@ -173,8 +173,6 @@
#define cap_pgsel_inv(c) (((c) >> 39) & 1)
#define cap_super_page_val(c) (((c) >> 34) & 0xf)
-#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \
- * OFFSET_STRIDE) + 21)
#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16)
#define cap_max_fault_reg_offset(c) \
@@ -598,22 +596,20 @@ struct qi_batch {
};
struct dmar_domain {
- int nid; /* node id */
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ /* First stage page table */
+ struct pt_iommu_x86_64 fspt;
+ /* Second stage page table */
+ struct pt_iommu_vtdss sspt;
+ };
+
struct xarray iommu_array; /* Attached IOMMU array */
- u8 iommu_coherency: 1; /* indicate coherency of iommu access */
- u8 force_snooping : 1; /* Create IOPTEs with snoop control */
- u8 set_pte_snp:1;
- u8 use_first_level:1; /* DMA translation for the domain goes
- * through the first level page table,
- * otherwise, goes through the second
- * level.
- */
+ u8 force_snooping:1; /* Create PASID entry with snoop control */
u8 dirty_tracking:1; /* Dirty tracking is enabled */
u8 nested_parent:1; /* Has other domains nested on it */
- u8 has_mappings:1; /* Has mappings configured through
- * iommu_map() interface.
- */
u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write
* buffer when creating mappings.
*/
@@ -626,26 +622,9 @@ struct dmar_domain {
struct list_head cache_tags; /* Cache tag list */
struct qi_batch *qi_batch; /* Batched QI descriptors */
- int iommu_superpage;/* Level of superpages supported:
- 0 == 4KiB (no superpages), 1 == 2MiB,
- 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
union {
/* DMA remapping domain */
struct {
- /* virtual address */
- struct dma_pte *pgd;
- /* max guest address width */
- int gaw;
- /*
- * adjusted guest address width:
- * 0: level 2 30-bit
- * 1: level 3 39-bit
- * 2: level 4 48-bit
- * 3: level 5 57-bit
- */
- int agaw;
- /* maximum mapped address */
- u64 max_addr;
/* Protect the s1_domains list */
spinlock_t s1_lock;
/* Track s1_domains nested on this domain */
@@ -667,10 +646,10 @@ struct dmar_domain {
struct mmu_notifier notifier;
};
};
-
- struct iommu_domain domain; /* generic domain data structure for
- iommu core */
};
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain);
/*
* In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
@@ -869,11 +848,6 @@ struct dma_pte {
u64 val;
};
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
- pte->val = 0;
-}
-
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
@@ -889,32 +863,11 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
-static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
- unsigned long flags)
-{
- if (flags & IOMMU_DIRTY_NO_CLEAR)
- return (pte->val & DMA_SL_PTE_DIRTY) != 0;
-
- return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
- (unsigned long *)&pte->val);
-}
-
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
}
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
- return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
- return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
- (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
static inline bool context_present(struct context_entry *context)
{
return (context->lo & 1);
@@ -930,11 +883,6 @@ static inline int agaw_to_level(int agaw)
return agaw + 2;
}
-static inline int agaw_to_width(int agaw)
-{
- return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
static inline int width_to_agaw(int width)
{
return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
@@ -950,25 +898,6 @@ static inline int pfn_level_offset(u64 pfn, int level)
return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}
-static inline u64 level_mask(int level)
-{
- return -1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 level_size(int level)
-{
- return 1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 align_to_level(u64 pfn, int level)
-{
- return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
- return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
static inline void context_set_present(struct context_entry *context)
{
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 1b6ad9c900a5ad..f85f9110de0d89 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -29,11 +29,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
device_block_translation(dev);
- if (iommu->agaw < dmar_domain->s2_domain->agaw) {
- dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
- return -ENODEV;
- }
-
/*
* Stage-1 domain cannot work alone, it is nested on a s2_domain.
* The s2_domain will be used in nested translation, hence needs
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 52f678975da745..b03da83583ac3a 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -483,11 +483,12 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid)
{
+ struct pt_iommu_vtdss_hw_info pt_info;
struct pasid_entry *pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
/*
* If hardware advertises no support for second level
* translation, return directly.
@@ -498,8 +499,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
spin_lock(&iommu->lock);
@@ -514,7 +513,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EBUSY;
}
- pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw,
+ pasid_pte_config_second_level(iommu, pte, pt_info.ssptptr, pt_info.aw,
did, domain->dirty_tracking);
spin_unlock(&iommu->lock);
@@ -528,11 +527,12 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
struct device *dev, u16 old_did,
u32 pasid)
{
+ struct pt_iommu_vtdss_hw_info pt_info;
struct pasid_entry *pte, new_pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
/*
* If hardware advertises no support for second level
* translation, return directly.
@@ -543,13 +543,10 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
- pasid_pte_config_second_level(iommu, &new_pte, pgd_val,
- domain->agaw, did,
- domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, &new_pte, pt_info.ssptptr,
+ pt_info.aw, did, domain->dirty_tracking);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
@@ -747,10 +744,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
struct dmar_domain *s2_domain,
u16 did)
{
- struct dma_pte *pgd = s2_domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info);
+
pasid_clear_entry(pte);
if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
@@ -770,10 +769,10 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
if (s2_domain->force_snooping)
pasid_set_pgsnp(pte);
- pasid_set_slptr(pte, virt_to_phys(pgd));
+ pasid_set_slptr(pte, pt_info.ssptptr);
pasid_set_fault_enable(pte);
pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, s2_domain->agaw);
+ pasid_set_address_width(pte, pt_info.aw);
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
if (s2_domain->dirty_tracking)
pasid_set_ssade(pte);
--
2.43.0
* [PATCH v2 09/10] iommu/vt-d: Follow PT_FEAT_DMA_INCOHERENT into the PASID entry
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
` (7 preceding siblings ...)
2025-08-26 17:26 ` [PATCH v2 08/10] iommu/vt-d: Use the generic iommu page table Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
2025-08-26 17:26 ` [PATCH v2 10/10] iommupt: Add a kunit test for the SW bits Jason Gunthorpe
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
Currently an incoherent-walk domain cannot be attached to a coherent-capable
iommu. Kevin says HW with such a mixture probably doesn't exist, but make
the driver logically consistent anyhow.
When building the PASID entry the PWSNP (Page Walk Snoop) bit tells the HW
whether it should issue snoops when walking the page table. If the page
table is cache flushed because of PT_FEAT_DMA_INCOHERENT then it is fine to
set this bit to 0 even if the HW supports setting it to 1.
Weaken the compatibility check to permit a coherent instance to accept an
incoherent table and fix the PASID entry construction to set PWSNP from
PT_FEAT_DMA_INCOHERENT.
SVA always sets PWSNP.
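A minimal sketch of the resulting behaviour (not part of the patch; the two
helper names below are made up for illustration, only the feature bit and
capability referenced in the comments come from this series):

	#include <stdbool.h>

	/*
	 * Attach is only rejected when the IOMMU cannot snoop its page
	 * walks (ecap SMPWC == 0) but the table was built assuming
	 * coherent walks (PT_FEAT_DMA_INCOHERENT not set).
	 */
	bool fs_walk_compatible(bool hw_smpwc, bool table_incoherent)
	{
		return hw_smpwc || table_incoherent;
	}

	/*
	 * PWSNP in the PASID entry follows the table, not the HW: an
	 * incoherent table is already flushed by SW, so HW snooping of
	 * the walk is not required.
	 */
	bool pasid_pwsnp(bool table_incoherent)
	{
		return !table_incoherent;
	}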
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/intel/iommu.c | 8 ++++++--
drivers/iommu/intel/pasid.c | 31 ++++++++++++++-----------------
drivers/iommu/intel/pasid.h | 1 +
drivers/iommu/intel/svm.c | 1 +
4 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6a269d201a614b..b9c69c43ca8dcf 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1300,6 +1300,10 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
if (domain->force_snooping)
flags |= PASID_FLAG_PAGE_SNOOP;
+ if (!(domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ flags |= PASID_FLAG_PWSNP;
+
return __domain_setup_first_level(iommu, dev, pasid,
domain_id_iommu(domain, iommu),
pt_info.gcr3_pt, flags, old);
@@ -2989,7 +2993,7 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return -EINVAL;
- if (!!ecap_smpwc(iommu->ecap) !=
+ if (!ecap_smpwc(iommu->ecap) &&
!(dmar_domain->fspt.x86_64_pt.common.features &
BIT(PT_FEAT_DMA_INCOHERENT)))
return -EINVAL;
@@ -3030,7 +3034,7 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return -EINVAL;
- if (iommu_paging_structure_coherency(iommu) !=
+ if (!iommu_paging_structure_coherency(iommu) &&
!(dmar_domain->sspt.vtdss_pt.common.features &
BIT(PT_FEAT_DMA_INCOHERENT)))
return -EINVAL;
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index b03da83583ac3a..3e2255057079c5 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -366,7 +366,7 @@ static void pasid_pte_config_first_level(struct intel_iommu *iommu,
pasid_set_domain_id(pte, did);
pasid_set_address_width(pte, iommu->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP);
/* Setup Present and PASID Granular Transfer Type: */
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
@@ -461,19 +461,22 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu,
*/
static void pasid_pte_config_second_level(struct intel_iommu *iommu,
struct pasid_entry *pte,
- u64 pgd_val, int agaw, u16 did,
- bool dirty_tracking)
+ struct dmar_domain *domain, u16 did)
{
+ struct pt_iommu_vtdss_hw_info pt_info;
+
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
pasid_clear_entry(pte);
pasid_set_domain_id(pte, did);
- pasid_set_slptr(pte, pgd_val);
- pasid_set_address_width(pte, agaw);
+ pasid_set_slptr(pte, pt_info.ssptptr);
+ pasid_set_address_width(pte, pt_info.aw);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- if (dirty_tracking)
+ pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
+ if (domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_present(pte);
@@ -483,11 +486,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid)
{
- struct pt_iommu_vtdss_hw_info pt_info;
struct pasid_entry *pte;
u16 did;
- pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
/*
* If hardware advertises no support for second level
@@ -513,8 +514,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EBUSY;
}
- pasid_pte_config_second_level(iommu, pte, pt_info.ssptptr, pt_info.aw,
- did, domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, pte, domain, did);
spin_unlock(&iommu->lock);
pasid_flush_caches(iommu, pte, pasid, did);
@@ -527,12 +527,9 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
struct device *dev, u16 old_did,
u32 pasid)
{
- struct pt_iommu_vtdss_hw_info pt_info;
struct pasid_entry *pte, new_pte;
u16 did;
- pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
-
/*
* If hardware advertises no support for second level
* translation, return directly.
@@ -545,8 +542,7 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
did = domain_id_iommu(domain, iommu);
- pasid_pte_config_second_level(iommu, &new_pte, pt_info.ssptptr,
- pt_info.aw, did, domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, &new_pte, domain, did);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
@@ -773,7 +769,8 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
pasid_set_fault_enable(pte);
pasid_set_domain_id(pte, did);
pasid_set_address_width(pte, pt_info.aw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
if (s2_domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index a771a77d4239c4..b4c85242dc7962 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -24,6 +24,7 @@
#define PASID_FLAG_NESTED BIT(1)
#define PASID_FLAG_PAGE_SNOOP BIT(2)
+#define PASID_FLAG_PWSNP BIT(2)
/*
* The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index e147f71f91b722..71de7947971f82 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -170,6 +170,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
/* Setup the pasid table: */
sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ sflags |= PASID_FLAG_PWSNP;
ret = __domain_setup_first_level(iommu, dev, pasid,
FLPT_DEFAULT_DID, __pa(mm->pgd),
sflags, old);
--
2.43.0
* [PATCH v2 10/10] iommupt: Add a kunit test for the SW bits
2025-08-26 17:26 [PATCH v2 00/10] Convert Intel VT-D to use the generic iommu page table Jason Gunthorpe
` (8 preceding siblings ...)
2025-08-26 17:26 ` [PATCH v2 09/10] iommu/vt-d: Follow PT_FEAT_DMA_INCOHERENT into the PASID entry Jason Gunthorpe
@ 2025-08-26 17:26 ` Jason Gunthorpe
9 siblings, 0 replies; 11+ messages in thread
From: Jason Gunthorpe @ 2025-08-26 17:26 UTC (permalink / raw)
To: Lu Baolu, David Woodhouse, iommu, Joerg Roedel, Robin Murphy,
Will Deacon
Cc: Kevin Tian, patches, Tina Zhang, Wei Wang
Add some basic checks that the sw_bit APIs work as expected.
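For reference, a kernel-context sketch of the pattern the tests below
exercise (names and ordering semantics are those asserted by the diff, not a
separate API description):

	unsigned int bitnr;

	/*
	 * Every SW bit starts clear, can be set with release ordering and
	 * is then observed set with acquire ordering, without disturbing
	 * the translation attributes of the installed entry.
	 */
	for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) {
		WARN_ON(pt_test_sw_bit_acquire(pts, bitnr));
		pt_set_sw_bit_release(pts, bitnr);
		WARN_ON(!pt_test_sw_bit_acquire(pts, bitnr));
	}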
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/kunit_generic_pt.h | 110 ++++++++++++++++++++
1 file changed, 110 insertions(+)
diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h
index 3f3eab35e5cb1b..9395e4aa295c19 100644
--- a/drivers/iommu/generic_pt/kunit_generic_pt.h
+++ b/drivers/iommu/generic_pt/kunit_generic_pt.h
@@ -664,6 +664,112 @@ static __maybe_unused void test_dirty(struct kunit *test)
check_all_levels(test, test_lvl_dirty, NULL);
}
+static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ struct pt_write_attrs new_attrs = {};
+ unsigned int bitnr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ pt_install_leaf_entry(pts, paddr, len_lg2, &attrs);
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++) {
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr);
+
+ /* SW bits didn't leak into the attrs */
+ pt_attr_from_entry(pts, &new_attrs);
+ KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs));
+
+ pt_clear_entry(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+static __maybe_unused void test_sw_bit_leaf(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_leaf, NULL);
+}
+
+static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ unsigned int bitnr;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) {
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entry(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static __maybe_unused void test_sw_bit_table(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_table, NULL);
+}
+
static struct kunit_case generic_pt_test_cases[] = {
KUNIT_CASE_FMT(test_init),
KUNIT_CASE_FMT(test_bitops),
@@ -676,6 +782,10 @@ static struct kunit_case generic_pt_test_cases[] = {
KUNIT_CASE_FMT(test_attr_from_entry),
#ifdef pt_entry_write_is_dirty
KUNIT_CASE_FMT(test_dirty),
+#endif
+#ifdef pt_sw_bit
+ KUNIT_CASE_FMT(test_sw_bit_leaf),
+ KUNIT_CASE_FMT(test_sw_bit_table),
#endif
{},
};
--
2.43.0