linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>,
	Lu Baolu <baolu.lu@linux.intel.com>,
	David Hildenbrand <david@redhat.com>,
	Christoph Hellwig <hch@lst.de>,
	iommu@lists.linux.dev, Joao Martins <joao.m.martins@oracle.com>,
	Kevin Tian <kevin.tian@intel.com>,
	kvm@vger.kernel.org, linux-mm@kvack.org,
	Pasha Tatashin <pasha.tatashin@soleen.com>,
	Peter Xu <peterx@redhat.com>, Ryan Roberts <ryan.roberts@arm.com>,
	Sean Christopherson <seanjc@google.com>,
	Tina Zhang <tina.zhang@intel.com>
Subject: [PATCH 05/16] iommupt: Add unmap_pages op
Date: Thu, 15 Aug 2024 12:11:21 -0300	[thread overview]
Message-ID: <5-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com> (raw)
In-Reply-To: <0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com>

unmap_pages removes mappings and any fully contained interior tables from
the given range. This follows the strict iommu_domain API definition where
it does not split up larger page sizes into smaller. The caller must
perform unmap only on ranges created by map or it must have somehow
otherwise determined safe cut points (e.g. iommufd/vfio use iova_to_phys to
scan for them).

A following patch will provide 'cut' which explicitly does the page size
split if the HW can support it.

unmap is implemented with a recursive descent of the tree. It has an
additional cost of checking that the entire VA range is mapped. If the
caller provides a VA range that spans an entire table item then the table
can be freed as well.

Cache incoherent HW is handled by keeping track of which table memory
ranges need CPU cache invalidation at each level and performing that
invalidation once when ascending from that level.

Currently, the only user I know of for partial unmap is VFIO type 1 v1.0.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/generic_pt/iommu_pt.h | 143 ++++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h    |  24 +++++
 2 files changed, 167 insertions(+)

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 835c84ea716093..6d1c59b33d02f3 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -14,6 +14,63 @@
 
 #include <linux/iommu.h>
 #include <linux/export.h>
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+/*
+ * Keep track of what table items are being written to during mutation
+ * operations. When the HW is DMA Incoherent these have to be cache flushed
+ * before they are visible. The write_log batches flushes together and uses a C
+ * cleanup to make sure the table memory is flushed before walking concludes
+ * with that table.
+ *
+ * There are two notable cases that need special flushing:
+ *  1) Installing a table entry requires the new table memory (and all of its
+ *     children) are flushed.
+ *  2) Installing a shared table requires that other threads using the shared
+ *     table ensure it is flushed before they attempt to use it.
+ */
+struct iommu_write_log {
+	struct pt_range *range;
+	struct pt_table_p *table;
+	unsigned int start_idx;
+	unsigned int last_idx;
+};
+
+static void record_write(struct iommu_write_log *wlog,
+			 const struct pt_state *pts,
+			 unsigned int index_count_lg2)
+{
+	if (!(PT_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT)))
+		return;
+	/* last_idx is inclusive: done_writes() syncs last - start + 1 items */
+	if (!wlog->table) {
+		wlog->table = pts->table;
+		wlog->start_idx = pts->index;
+	}
+	wlog->last_idx =
+		max(wlog->last_idx,
+		    log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
+				 index_count_lg2) - 1);
+}
+
+static void done_writes(struct iommu_write_log *wlog)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(wlog->range->common);
+	dma_addr_t dma;
+
+	if (!pt_feature(wlog->range->common, PT_FEAT_DMA_INCOHERENT) ||
+	    !wlog->table)
+		return;
+
+	dma = virt_to_phys(wlog->table);
+	dma_sync_single_for_device(iommu_table->iommu_device,
+				   dma + wlog->start_idx * PT_ENTRY_WORD_SIZE,
+				   (wlog->last_idx - wlog->start_idx + 1) *
+					   PT_ENTRY_WORD_SIZE,
+				   DMA_TO_DEVICE);
+	wlog->table = NULL;
+}
 
 static int make_range(struct pt_common *common, struct pt_range *range,
 		      dma_addr_t iova, dma_addr_t len)
@@ -102,6 +159,91 @@ static int __collect_tables(struct pt_range *range, void *arg,
 	return 0;
 }
 
+struct pt_unmap_args {
+	struct pt_radix_list_head free_list;
+	pt_vaddr_t unmapped;
+};
+
+static int __unmap_pages(struct pt_range *range, void *arg, unsigned int level,
+			 struct pt_table_p *table)
+{
+	struct iommu_write_log wlog __cleanup(done_writes) = { .range = range };
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_unmap_args *unmap = arg;
+	int ret;
+
+	for_each_pt_level_item(&pts) {
+		switch (pts.type) {
+		case PT_ENTRY_TABLE: {
+			/* descend will change va */
+			bool fully_covered = pt_entry_fully_covered(
+				&pts, pt_table_item_lg2sz(&pts));
+
+			ret = pt_descend(&pts, arg, __unmap_pages);
+			if (ret)
+				return ret;
+
+			/*
+			 * If the unmapping range fully covers the table then we
+			 * can free it as well. The clear is delayed until we
+			 * succeed in clearing the lower table levels.
+			 */
+			if (fully_covered) {
+				pt_radix_add_list(&unmap->free_list,
+						  pts.table_lower);
+				record_write(&wlog, &pts, ilog2(1));
+				pt_clear_entry(&pts, ilog2(1));
+			}
+			break;
+		}
+		case PT_ENTRY_EMPTY:
+			return -EFAULT;
+		case PT_ENTRY_OA:
+			/*
+			 * The IOMMU API does not require drivers to support
+			 * unmapping parts of pages. Only legacy VFIO type 1 v1
+			 * will attempt it after probing for "fine-grained
+			 * superpages" support. There it allows the v1 version
+			 * of VFIO (that nobody uses) to pass more than
+			 * PAGE_SIZE to map.
+			 */
+			if (!pt_entry_fully_covered(&pts,
+						    pt_entry_oa_lg2sz(&pts)))
+				return -EADDRINUSE;
+			unmap->unmapped += log2_to_int(pt_entry_oa_lg2sz(&pts));
+			record_write(&wlog, &pts,
+				     pt_entry_num_contig_lg2(&pts));
+			pt_clear_entry(&pts, pt_entry_num_contig_lg2(&pts));
+			break;
+		}
+	}
+	return 0;
+}
+
+static size_t NS(unmap_pages)(struct pt_iommu *iommu_table, dma_addr_t iova,
+			      dma_addr_t len,
+			      struct iommu_iotlb_gather *iotlb_gather)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_unmap_args unmap = {};
+	struct pt_range range;
+	int ret;
+
+	ret = make_range(common, &range, iova, len);
+	if (ret)
+		return 0; /* nothing unmapped; -errno would wrap in size_t */
+	/* A walk error is not returned; unmap.unmapped reports the progress */
+	pt_walk_range(&range, __unmap_pages, &unmap);
+
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+		pt_radix_stop_incoherent_list(&unmap.free_list,
+					      iommu_table->iommu_device);
+
+	/* FIXME into gather */
+	pt_radix_free_list_rcu(&unmap.free_list);
+	return unmap.unmapped;
+}
+
 static void NS(get_info)(struct pt_iommu *iommu_table,
 			 struct pt_iommu_info *info)
 {
@@ -143,6 +285,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table)
 }
 
 static const struct pt_iommu_ops NS(ops) = {
+	.unmap_pages = NS(unmap_pages),
 	.iova_to_phys = NS(iova_to_phys),
 	.get_info = NS(get_info),
 	.deinit = NS(deinit),
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index 5cd56eac14b41d..bdb6bf2c2ebe85 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -8,6 +8,7 @@
 #include <linux/generic_pt/common.h>
 #include <linux/mm_types.h>
 
+struct iommu_iotlb_gather;
 struct pt_iommu_ops;
 
 /**
@@ -60,6 +61,29 @@ struct pt_iommu_info {
 
 /* See the function comments in iommu_pt.c for kdocs */
 struct pt_iommu_ops {
+	/**
+	 * unmap_pages() - Make a range of IOVA empty/not present
+	 * @iommu_table: Table to manipulate
+	 * @iova: IO virtual address to start
+	 * @len: Length of the range starting from @iova
+	 * @iotlb_gather: Gather struct that must be flushed on return
+	 *
+	 * unmap_pages() will remove translation created by map_pages().
+	 * It cannot subdivide a mapping created by map_pages(),
+	 * so it should be called with IOVA ranges that match those passed
+	 * to map_pages. The IOVA range can aggregate contiguous map_pages() calls
+	 * so long as no individual range is split.
+	 *
+	 * Context: The caller must hold a write range lock that includes
+	 * the whole range.
+	 *
+	 * Returns: Number of bytes of VA unmapped. iova + res will be the
+	 * point unmapping stopped.
+	 */
+	size_t (*unmap_pages)(struct pt_iommu *iommu_table, dma_addr_t iova,
+			      dma_addr_t len,
+			      struct iommu_iotlb_gather *iotlb_gather);
+
 	/**
 	 * iova_to_phys() - Return the output address for the given IOVA
 	 * @iommu_table: Table to query
-- 
2.46.0



  parent reply	other threads:[~2024-08-15 15:15 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-08-15 15:11 [PATCH 00/16] Consolidate iommu page table implementations Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 01/16] genpt: Generic Page Table base API Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 02/16] genpt: Add a specialized allocator for page table levels Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 03/16] iommupt: Add the basic structure of the iommu implementation Jason Gunthorpe
2024-08-16 17:58   ` Jeff Johnson
2024-08-15 15:11 ` [PATCH 04/16] iommupt: Add iova_to_phys op Jason Gunthorpe
2024-08-15 15:11 ` Jason Gunthorpe [this message]
2024-08-15 15:11 ` [PATCH 06/16] iommupt: Add map_pages op Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 07/16] iommupt: Add cut_mapping op Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 08/16] iommupt: Add read_and_clear_dirty op Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 09/16] iommupt: Add a kunit test for Generic Page Table and the IOMMU implementation Jason Gunthorpe
2024-08-16 17:55   ` Jeff Johnson
2024-08-19 14:16     ` Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 10/16] iommupt: Add a kunit test to compare against iopt Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 11/16] iommupt: Add the 64 bit ARMv8 page table format Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 12/16] iommupt: Add the AMD IOMMU v1 " Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 13/16] iommupt: Add the x86 PAE " Jason Gunthorpe
2024-08-16 19:21   ` Sean Christopherson
2024-08-17  0:36     ` Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 14/16] iommupt: Add the DART v1/v2 " Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 15/16] iommupt: Add the 32 bit ARMv7s " Jason Gunthorpe
2024-08-15 15:11 ` [PATCH 16/16] iommupt: Add the Intel VT-D second stage " Jason Gunthorpe
2024-08-19  2:51   ` Zhang, Tina
2024-08-19 15:53     ` Jason Gunthorpe
2024-08-20  8:22       ` Yi Liu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com \
    --to=jgg@nvidia.com \
    --cc=alejandro.j.jimenez@oracle.com \
    --cc=baolu.lu@linux.intel.com \
    --cc=david@redhat.com \
    --cc=hch@lst.de \
    --cc=iommu@lists.linux.dev \
    --cc=joao.m.martins@oracle.com \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=pasha.tatashin@soleen.com \
    --cc=peterx@redhat.com \
    --cc=ryan.roberts@arm.com \
    --cc=seanjc@google.com \
    --cc=tina.zhang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).