From: Zhenzhong Duan <zhenzhong.duan@intel.com>
To: qemu-devel@nongnu.org
Cc: alex.williamson@redhat.com, clg@redhat.com,
eric.auger@redhat.com, peterx@redhat.com, jasowang@redhat.com,
mst@redhat.com, jgg@nvidia.com, nicolinc@nvidia.com,
joao.m.martins@oracle.com, kevin.tian@intel.com,
yi.l.liu@intel.com, yi.y.sun@intel.com, chao.p.peng@intel.com,
Yi Sun <yi.y.sun@linux.intel.com>,
Zhenzhong Duan <zhenzhong.duan@intel.com>,
Marcel Apfelbaum <marcel.apfelbaum@gmail.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Richard Henderson <richard.henderson@linaro.org>,
Eduardo Habkost <eduardo@habkost.net>
Subject: [PATCH rfcv1 17/23] intel_iommu: implement first level translation
Date: Mon, 15 Jan 2024 18:37:29 +0800 [thread overview]
Message-ID: <20240115103735.132209-18-zhenzhong.duan@intel.com> (raw)
In-Reply-To: <20240115103735.132209-1-zhenzhong.duan@intel.com>
From: Yi Liu <yi.l.liu@intel.com>
This adds stage-1 page table walking to support stage-1 only
translation in scalable mode.
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
hw/i386/intel_iommu_internal.h | 16 +++
hw/i386/intel_iommu.c | 242 ++++++++++++++++++++++++++++++++-
hw/i386/trace-events | 2 +
3 files changed, 258 insertions(+), 2 deletions(-)
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index dcf1410fcf..41b958cd5d 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -598,6 +598,22 @@ typedef struct VTDPIOTLBInvInfo VTDPIOTLBInvInfo;
#define VTD_SM_PASID_ENTRY_WPE_BIT(val) (!!(((val) >> 4) & 1ULL))
#define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
+#define VTD_PASID_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */
+
+/* Paging Structure common */
+#define VTD_FL_PT_PAGE_SIZE_MASK (1ULL << 7) /* Set: entry ends the walk */
+/* Bits to decide the offset for each level */
+#define VTD_FL_LEVEL_BITS 9
+
+/* First Level Paging Structure */
+#define VTD_FL_PT_LEVEL 1 /* Level at which the walk always ends */
+#define VTD_FL_PT_ENTRY_NR 512 /* Upper bound for an index into a table */
+
+/* Masks for First Level Paging Entry */
+#define VTD_FL_RW_MASK (1ULL << 1) /* Set: entry permits writes */
+#define VTD_FL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
+#define VTD_PASID_ENTRY_FPD (1ULL << 1) /* PASID entry val[0]: Fault Processing Disable */
+
/* Second Level Page Translation Pointer*/
#define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7c24f8f677..1c21f40ccd 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -78,6 +78,10 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
VTDPASIDCacheInfo *pc_info);
static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
PCIBus *bus, uint16_t devfn);
+static VTDPASIDAddressSpace *vtd_add_find_pasid_as(IntelIOMMUState *s,
+ PCIBus *bus,
+ int devfn,
+ uint32_t pasid);
static void vtd_panic_require_caching_mode(void)
{
@@ -1888,6 +1892,114 @@ out:
trace_vtd_pt_enable_fast_path(source_id, success);
}
+/*
+ * Bit position at which the index field for paging-structure @level starts
+ * within an address: VTD_PAGE_SHIFT_4K for level 1, and VTD_FL_LEVEL_BITS
+ * more for every level above it. @level must not be zero.
+ */
+static inline uint32_t vtd_flpt_level_shift(uint32_t level)
+{
+    uint32_t levels_below;
+
+    assert(level != 0);
+    levels_below = level - 1;
+    return VTD_PAGE_SHIFT_4K + levels_below * VTD_FL_LEVEL_BITS;
+}
+
+/*
+ * Mask that keeps the address bits at and above the shift of @level and
+ * clears everything below it.
+ */
+static inline uint64_t vtd_flpt_level_page_mask(uint32_t level)
+{
+    uint64_t span = 1ULL << vtd_flpt_level_shift(level);
+
+    return ~(span - 1);
+}
+
+/*
+ * Number of paging levels to walk for the first-level table described by
+ * @pe: 4 plus the field selected by VTD_SM_PASID_ENTRY_FLPM out of val[2]
+ * (shifted right by 2).
+ *
+ * The result is a level count, not an address, so it is returned as
+ * uint32_t like the level arguments of the other first-level helpers.
+ */
+static inline uint32_t vtd_pe_get_flpt_level(VTDPASIDEntry *pe)
+{
+    return 4 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM);
+}
+
+/*
+ * Pick out of @iova the index into the paging structure at @level, i.e.
+ * the VTD_FL_LEVEL_BITS wide field that starts at that level's shift.
+ */
+static inline uint32_t vtd_iova_fl_level_offset(uint64_t iova, uint32_t level)
+{
+    const uint64_t index_mask = (1ULL << VTD_FL_LEVEL_BITS) - 1;
+    uint64_t shifted = iova >> vtd_flpt_level_shift(level);
+
+    return shifted & index_mask;
+}
+
+/*
+ * Read the flpte stored at @base_addr[@index] and return it converted
+ * from little endian. @index must be below VTD_FL_PT_ENTRY_NR.
+ * Returns (uint64_t)-1 when the memory read fails.
+ */
+static uint64_t vtd_get_flpte(dma_addr_t base_addr, uint32_t index)
+{
+    uint64_t raw;
+    dma_addr_t entry_addr;
+
+    assert(index < VTD_FL_PT_ENTRY_NR);
+
+    entry_addr = base_addr + index * sizeof(raw);
+    if (dma_memory_read(&address_space_memory, entry_addr, &raw,
+                        sizeof(raw), MEMTXATTRS_UNSPECIFIED)) {
+        return (uint64_t)-1;
+    }
+
+    return le64_to_cpu(raw);
+}
+
+/* True when bit 0 of @flpte is set. */
+static inline bool vtd_flpte_present(uint64_t flpte)
+{
+    return (flpte & 0x1) != 0;
+}
+
+/*
+ * Whether @flpte holds the address of the page frame rather than of a
+ * further table: always so at VTD_FL_PT_LEVEL, otherwise only when the
+ * page-size bit is set in the entry.
+ */
+static inline bool vtd_is_last_flpte(uint64_t flpte, uint32_t level)
+{
+    if (level == VTD_FL_PT_LEVEL) {
+        return true;
+    }
+    return (flpte & VTD_FL_PT_PAGE_SIZE_MASK) != 0;
+}
+
+/*
+ * Strip @flpte down to the bits selected by VTD_FL_PT_BASE_ADDR_MASK for
+ * an address width of @aw bits.
+ */
+static inline uint64_t vtd_get_flpte_addr(uint64_t flpte, uint8_t aw)
+{
+    const uint64_t addr_mask = VTD_FL_PT_BASE_ADDR_MASK(aw);
+
+    return flpte & addr_mask;
+}
+
+/*
+ * Given the @iova, get relevant @flptep. @flpte_level will be the last level
+ * of the translation, can be used for deciding the size of large page.
+ *
+ * @pe: PASID entry supplying the table base and the number of levels
+ * @iova: address to look up
+ * @is_write: the access is a write
+ * @flptep: on success, the entry that ended the walk
+ * @flpte_level: on success, the level of that entry
+ * @reads: set to true once a present entry is seen, false when a
+ *         non-present entry ends the walk
+ * @writes: in/out; stays true only while every entry walked has the
+ *          R/W bit set, forced to false on a non-present entry
+ * @aw_bits: address width used to mask next-table addresses
+ *
+ * Returns 0 on success or a negated VTD_FR_* fault reason.
+ */
+static int vtd_iova_to_flpte(VTDPASIDEntry *pe, uint64_t iova, bool is_write,
+                             uint64_t *flptep, uint32_t *flpte_level,
+                             bool *reads, bool *writes, uint8_t aw_bits)
+{
+    dma_addr_t addr = vtd_pe_get_flpt_base(pe);
+    /* Level the walk starts at; needed to tell a bad table base apart */
+    const uint32_t top_level = vtd_pe_get_flpt_level(pe);
+    uint32_t level = top_level;
+    uint32_t offset;
+    uint64_t flpte;
+
+    while (true) {
+        offset = vtd_iova_fl_level_offset(iova, level);
+        flpte = vtd_get_flpte(addr, offset);
+        if (flpte == (uint64_t)-1) {
+            /*
+             * Compare against the first-level starting level. The walk
+             * never starts from VTD_PE_GET_LEVEL(), so testing against it
+             * would misclassify a failure to read the top-level table.
+             */
+            if (level == top_level) {
+                /* The table base taken from the PASID entry is unreadable */
+                return -VTD_FR_CONTEXT_ENTRY_INV;
+            } else {
+                return -VTD_FR_PAGING_ENTRY_INV;
+            }
+        }
+
+        if (!vtd_flpte_present(flpte)) {
+            *reads = false;
+            *writes = false;
+            return -VTD_FR_PAGING_ENTRY_INV;
+        }
+
+        *reads = true;
+        *writes = (*writes) && (flpte & VTD_FL_RW_MASK);
+        if (is_write && !(flpte & VTD_FL_RW_MASK)) {
+            return -VTD_FR_WRITE;
+        }
+
+        if (vtd_is_last_flpte(flpte, level)) {
+            *flptep = flpte;
+            *flpte_level = level;
+            return 0;
+        }
+
+        /* Descend: the entry points at the next lower table */
+        addr = vtd_get_flpte_addr(flpte, aw_bits);
+        level--;
+    }
+}
+
static void vtd_report_fault(IntelIOMMUState *s,
int err, bool is_fpd_set,
uint16_t source_id,
@@ -1904,6 +2016,105 @@ static void vtd_report_fault(IntelIOMMUState *s,
}
}
+/*
+ * Map dev to pasid-entry then do a paging-structures walk to do a iommu
+ * translation.
+ *
+ * Called from RCU critical section.
+ *
+ * @vtd_as: The untranslated address space
+ * @bus: The PCI bus of the device
+ * @devfn: The devfn, which is the combined of device and function number
+ * @addr: The address to be translated
+ * @is_write: The access is a write operation
+ * @entry: IOMMUTLBEntry that receives the result of the translation
+ *
+ * Returns true if translation is successful, otherwise false. When the
+ * context-entry lookup fails @entry is left untouched; on every later
+ * failure it is cleared and its permission set to IOMMU_NONE.
+ */
+static bool vtd_do_iommu_fl_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
+                                      uint8_t devfn, hwaddr addr, bool is_write,
+                                      IOMMUTLBEntry *entry)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDContextEntry ce;
+    /*
+     * Zeroed so that the FPD bit sampled right after the lookup below is
+     * well defined even if the lookup fails before filling the entry in.
+     */
+    VTDPASIDEntry pe = { 0 };
+    uint8_t bus_num = pci_bus_num(bus);
+    uint64_t flpte, page_mask;
+    uint32_t level;
+    uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn);
+    int ret;
+    bool is_fpd_set = false;
+    bool reads = true;
+    bool writes = true;
+    uint8_t access_flags;
+
+    /*
+     * We have standalone memory region for interrupt addresses, we
+     * should never receive translation requests in this region.
+     */
+    assert(!vtd_is_interrupt_addr(addr));
+
+    ret = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+    if (ret) {
+        error_report_once("%s: detected translation failure 1 "
+                          "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
+                          __func__, bus_num,
+                          VTD_PCI_SLOT(devfn),
+                          VTD_PCI_FUNC(devfn),
+                          addr);
+        return false;
+    }
+
+    vtd_iommu_lock(s);
+
+    ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, PCI_NO_PASID);
+    is_fpd_set = pe.val[0] & VTD_PASID_ENTRY_FPD;
+    if (ret) {
+        vtd_report_fault(s, -ret, is_fpd_set, source_id, addr, is_write,
+                         false, PCI_NO_PASID);
+        goto error;
+    }
+
+    /*
+     * We don't need to translate for pass-through context entries.
+     * Also, let's ignore IOTLB caching as well for PT devices.
+     */
+    if (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT) {
+        entry->iova = addr & VTD_PAGE_MASK_4K;
+        entry->translated_addr = entry->iova;
+        entry->addr_mask = ~VTD_PAGE_MASK_4K;
+        entry->perm = IOMMU_RW;
+        vtd_iommu_unlock(s);
+        return true;
+    }
+
+    ret = vtd_iova_to_flpte(&pe, addr, is_write, &flpte, &level,
+                            &reads, &writes, s->aw_bits);
+    if (ret) {
+        vtd_report_fault(s, -ret, is_fpd_set, source_id, addr, is_write,
+                         false, PCI_NO_PASID);
+        goto error;
+    }
+
+    /* The level the walk ended at decides how large the mapping is */
+    page_mask = vtd_flpt_level_page_mask(level);
+    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
+
+    vtd_iommu_unlock(s);
+
+    entry->iova = addr & page_mask;
+    entry->translated_addr = vtd_get_flpte_addr(flpte, s->aw_bits) & page_mask;
+    entry->addr_mask = ~page_mask;
+    entry->perm = access_flags;
+    return true;
+
+error:
+    vtd_iommu_unlock(s);
+    entry->iova = 0;
+    entry->translated_addr = 0;
+    entry->addr_mask = 0;
+    entry->perm = IOMMU_NONE;
+    return false;
+}
+
/* Map dev to context-entry then do a paging-structures walk to do a iommu
* translation.
*
@@ -4516,10 +4727,37 @@ static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
.target_as = &address_space_memory,
};
bool success;
+ VTDContextEntry ce;
+ VTDPASIDEntry pe;
+ int ret = 0;
if (likely(s->dmar_enabled)) {
- success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
- addr, flag & IOMMU_WO, &iotlb);
+ if (s->root_scalable) {
+ ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce);
+ ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, PCI_NO_PASID);
+ if (ret) {
+ error_report_once("%s: detected translation failure 1 "
+ "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
+ __func__, pci_bus_num(vtd_as->bus),
+ VTD_PCI_SLOT(vtd_as->devfn),
+ VTD_PCI_FUNC(vtd_as->devfn),
+ addr);
+ return iotlb;
+ }
+ if (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_FLT) {
+ success = vtd_do_iommu_fl_translate(vtd_as, vtd_as->bus,
+ vtd_as->devfn, addr,
+ flag & IOMMU_WO, &iotlb);
+ } else {
+ success = vtd_do_iommu_translate(vtd_as, vtd_as->bus,
+ vtd_as->devfn, addr,
+ flag & IOMMU_WO, &iotlb);
+ }
+ } else {
+ success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
+ addr, flag & IOMMU_WO, &iotlb);
+ }
} else {
/* DMAR disabled, passthrough, use 4k-page*/
iotlb.iova = addr & VTD_PAGE_MASK_4K;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 66f7c1ba59..00b27bc5b1 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -33,6 +33,8 @@ vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present"
vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
+vtd_iotlb_pe_hit(uint32_t pasid, uint64_t val0, uint32_t gen) "IOTLB pasid hit pasid %"PRIu32" val[0] 0x%"PRIx64" gen %"PRIu32
+vtd_iotlb_pe_update(uint32_t pasid, uint64_t val0, uint32_t gen1, uint32_t gen2) "IOTLB pasid update pasid %"PRIu32" val[0] 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32
vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
--
2.34.1
next prev parent reply other threads:[~2024-01-15 10:42 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-15 10:37 [PATCH rfcv1 00/23] intel_iommu: Enable stage-1 translation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 01/23] Update linux header to support nested hwpt alloc Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 02/23] backends/iommufd: add helpers for allocating user-managed HWPT Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 03/23] backends/iommufd_device: introduce IOMMUFDDevice targeted interface Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 04/23] vfio: implement IOMMUFDDevice interface callbacks Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 05/23] intel_iommu: add a placeholder variable for scalable modern mode Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 06/23] intel_iommu: check and sync host IOMMU cap/ecap in " Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 07/23] intel_iommu: process PASID cache invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 08/23] intel_iommu: add PASID cache management infrastructure Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 09/23] vfio/iommufd_device: Add ioas_id in IOMMUFDDevice and pass to vIOMMU Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 10/23] intel_iommu: bind/unbind guest page table to host Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 11/23] intel_iommu: ERRATA_772415 workaround Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 12/23] intel_iommu: replay pasid binds after context cache invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 13/23] intel_iommu: process PASID-based iotlb invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 14/23] intel_iommu: propagate PASID-based iotlb invalidation to host Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 15/23] intel_iommu: process PASID-based Device-TLB invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 16/23] intel_iommu: rename slpte in iotlb_entry to pte Zhenzhong Duan
2024-01-15 10:37 ` Zhenzhong Duan [this message]
2024-01-15 10:37 ` [PATCH rfcv1 18/23] intel_iommu: fix the fault reason report Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 19/23] intel_iommu: introduce pasid iotlb cache Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 20/23] intel_iommu: piotlb invalidation should notify unmap Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 21/23] intel_iommu: invalidate piotlb when flush pasid Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 22/23] intel_iommu: refresh pasid bind after pasid cache force reset Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 23/23] intel_iommu: modify x-scalable-mode to be string option Zhenzhong Duan
[not found] ` <CGME20240131144013eucas1p22d46339ae42f54dd59c23e8b95502dda@eucas1p2.samsung.com>
2024-01-31 14:40 ` Joel Granados
2024-01-31 15:24 ` Yi Liu
2024-02-04 21:05 ` Joel Granados
2024-01-22 4:29 ` [PATCH rfcv1 00/23] intel_iommu: Enable stage-1 translation Jason Wang
2024-01-22 5:59 ` Duan, Zhenzhong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240115103735.132209-18-zhenzhong.duan@intel.com \
--to=zhenzhong.duan@intel.com \
--cc=alex.williamson@redhat.com \
--cc=chao.p.peng@intel.com \
--cc=clg@redhat.com \
--cc=eduardo@habkost.net \
--cc=eric.auger@redhat.com \
--cc=jasowang@redhat.com \
--cc=jgg@nvidia.com \
--cc=joao.m.martins@oracle.com \
--cc=kevin.tian@intel.com \
--cc=marcel.apfelbaum@gmail.com \
--cc=mst@redhat.com \
--cc=nicolinc@nvidia.com \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=richard.henderson@linaro.org \
--cc=yi.l.liu@intel.com \
--cc=yi.y.sun@intel.com \
--cc=yi.y.sun@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).