qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
To: qemu-devel@nongnu.org
Cc: alex.williamson@redhat.com, clg@redhat.com,
	eric.auger@redhat.com, peterx@redhat.com, jasowang@redhat.com,
	mst@redhat.com, jgg@nvidia.com, nicolinc@nvidia.com,
	joao.m.martins@oracle.com, kevin.tian@intel.com,
	yi.l.liu@intel.com, yi.y.sun@intel.com, chao.p.peng@intel.com,
	Yi Sun <yi.y.sun@linux.intel.com>,
	Zhenzhong Duan <zhenzhong.duan@intel.com>,
	Marcel Apfelbaum <marcel.apfelbaum@gmail.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Richard Henderson <richard.henderson@linaro.org>,
	Eduardo Habkost <eduardo@habkost.net>
Subject: [PATCH rfcv1 17/23] intel_iommu: implement first level translation
Date: Mon, 15 Jan 2024 18:37:29 +0800	[thread overview]
Message-ID: <20240115103735.132209-18-zhenzhong.duan@intel.com> (raw)
In-Reply-To: <20240115103735.132209-1-zhenzhong.duan@intel.com>

From: Yi Liu <yi.l.liu@intel.com>

This adds stage-1 page table walking to support stage-1 only
translation in scalable mode.

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu_internal.h |  16 +++
 hw/i386/intel_iommu.c          | 242 ++++++++++++++++++++++++++++++++-
 hw/i386/trace-events           |   2 +
 3 files changed, 258 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index dcf1410fcf..41b958cd5d 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -598,6 +598,22 @@ typedef struct VTDPIOTLBInvInfo VTDPIOTLBInvInfo;
 #define VTD_SM_PASID_ENTRY_WPE_BIT(val)  (!!(((val) >> 4) & 1ULL))
 #define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
 
+#define VTD_PASID_IOTLB_MAX_SIZE       1024 /* Max size of the hash table */
+
+/* Paging Structure common */
+#define VTD_FL_PT_PAGE_SIZE_MASK    (1ULL << 7)
+/* Bits to decide the offset for each level */
+#define VTD_FL_LEVEL_BITS           9
+
+/* First Level Paging Structure */
+#define VTD_FL_PT_LEVEL             1
+#define VTD_FL_PT_ENTRY_NR          512
+
+/* Masks for First Level Paging Entry */
+#define VTD_FL_RW_MASK              (1ULL << 1)
+#define VTD_FL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
+#define VTD_PASID_ENTRY_FPD         (1ULL << 1) /* Fault Processing Disable */
+
 /* Second Level Page Translation Pointer*/
 #define VTD_SM_PASID_ENTRY_SLPTPTR     (~0xfffULL)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7c24f8f677..1c21f40ccd 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -78,6 +78,10 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
                                  VTDPASIDCacheInfo *pc_info);
 static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
                                   PCIBus *bus, uint16_t devfn);
+static VTDPASIDAddressSpace *vtd_add_find_pasid_as(IntelIOMMUState *s,
+                                                   PCIBus *bus,
+                                                   int devfn,
+                                                   uint32_t pasid);
 
 static void vtd_panic_require_caching_mode(void)
 {
@@ -1888,6 +1892,114 @@ out:
     trace_vtd_pt_enable_fast_path(source_id, success);
 }
 
+/* The shift of an addr for a certain level of paging structure */
+static inline uint32_t vtd_flpt_level_shift(uint32_t level)
+{
+    assert(level != 0);
+    return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_FL_LEVEL_BITS;
+}
+
+static inline uint64_t vtd_flpt_level_page_mask(uint32_t level)
+{
+    return ~((1ULL << vtd_flpt_level_shift(level)) - 1);
+}
+
+static inline dma_addr_t vtd_pe_get_flpt_level(VTDPASIDEntry *pe)
+{
+    return 4 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM);
+}
+
+/*
+ * Given an iova and the level of paging structure, return the offset
+ * of current level.
+ */
+static inline uint32_t vtd_iova_fl_level_offset(uint64_t iova, uint32_t level)
+{
+    return (iova >> vtd_flpt_level_shift(level)) &
+            ((1ULL << VTD_FL_LEVEL_BITS) - 1);
+}
+
+/* Get the content of a flpte located in @base_addr[@index] */
+static uint64_t vtd_get_flpte(dma_addr_t base_addr, uint32_t index)
+{
+    uint64_t flpte;
+
+    assert(index < VTD_FL_PT_ENTRY_NR);
+
+    if (dma_memory_read(&address_space_memory,
+                        base_addr + index * sizeof(flpte), &flpte,
+                        sizeof(flpte), MEMTXATTRS_UNSPECIFIED)) {
+        flpte = (uint64_t)-1;
+        return flpte;
+    }
+    flpte = le64_to_cpu(flpte);
+    return flpte;
+}
+
+static inline bool vtd_flpte_present(uint64_t flpte)
+{
+    return !!(flpte & 0x1);
+}
+
+/* Whether the pte indicates the address of the page frame */
+static inline bool vtd_is_last_flpte(uint64_t flpte, uint32_t level)
+{
+    return level == VTD_FL_PT_LEVEL || (flpte & VTD_FL_PT_PAGE_SIZE_MASK);
+}
+
+static inline uint64_t vtd_get_flpte_addr(uint64_t flpte, uint8_t aw)
+{
+    return flpte & VTD_FL_PT_BASE_ADDR_MASK(aw);
+}
+
+/*
+ * Given the @iova, get relevant @flptep. @flpte_level will be the last level
+ * of the translation, can be used for deciding the size of large page.
+ */
+static int vtd_iova_to_flpte(VTDPASIDEntry *pe, uint64_t iova, bool is_write,
+                             uint64_t *flptep, uint32_t *flpte_level,
+                             bool *reads, bool *writes, uint8_t aw_bits)
+{
+    dma_addr_t addr = vtd_pe_get_flpt_base(pe);
+    uint32_t level = vtd_pe_get_flpt_level(pe);
+    uint32_t offset;
+    uint64_t flpte;
+
+    while (true) {
+        offset = vtd_iova_fl_level_offset(iova, level);
+        flpte = vtd_get_flpte(addr, offset);
+        if (flpte == (uint64_t)-1) {
+            if (level == VTD_PE_GET_LEVEL(pe)) {
+                /* Invalid programming of context-entry */
+                return -VTD_FR_CONTEXT_ENTRY_INV;
+            } else {
+                return -VTD_FR_PAGING_ENTRY_INV;
+            }
+        }
+
+        if (!vtd_flpte_present(flpte)) {
+            *reads = false;
+            *writes = false;
+            return -VTD_FR_PAGING_ENTRY_INV;
+        }
+
+        *reads = true;
+        *writes = (*writes) && (flpte & VTD_FL_RW_MASK);
+        if (is_write && !(flpte & VTD_FL_RW_MASK)) {
+            return -VTD_FR_WRITE;
+        }
+
+        if (vtd_is_last_flpte(flpte, level)) {
+            *flptep = flpte;
+            *flpte_level = level;
+            return 0;
+        }
+
+        addr = vtd_get_flpte_addr(flpte, aw_bits);
+        level--;
+    }
+}
+
 static void vtd_report_fault(IntelIOMMUState *s,
                              int err, bool is_fpd_set,
                              uint16_t source_id,
@@ -1904,6 +2016,105 @@ static void vtd_report_fault(IntelIOMMUState *s,
     }
 }
 
+/*
+ * Map dev to pasid-entry, then do a paging-structures walk to translate @addr.
+ *
+ * Called from RCU critical section.
+ *
+ * @vtd_as: The untranslated address space
+ * @bus: The PCI bus of the device issuing the request
+ * @devfn: The devfn, which is the combination of device and function number
+ * @addr: The IOVA to be translated
+ * @is_write: The access is a write operation
+ * @entry: IOMMUTLBEntry that is filled with the translation result
+ *
+ * Returns true if translation is successful, otherwise false.
+ */
+static bool vtd_do_iommu_fl_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
+                                      uint8_t devfn, hwaddr addr, bool is_write,
+                                      IOMMUTLBEntry *entry)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDContextEntry ce;
+    VTDPASIDEntry pe;
+    uint8_t bus_num = pci_bus_num(bus);
+    uint64_t flpte, page_mask;
+    uint32_t level;
+    uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn);
+    int ret;
+    bool is_fpd_set = false;
+    bool reads = true;
+    bool writes = true;
+    uint8_t access_flags;
+
+    /*
+     * We have standalone memory region for interrupt addresses, we
+     * should never receive translation requests in this region.
+     */
+    assert(!vtd_is_interrupt_addr(addr));
+
+    ret = vtd_dev_to_context_entry(s, pci_bus_num(bus), devfn, &ce);
+    if (ret) {
+        error_report_once("%s: detected translation failure 1 "
+                          "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
+                          __func__, pci_bus_num(bus),
+                          VTD_PCI_SLOT(devfn),
+                          VTD_PCI_FUNC(devfn),
+                          addr);
+        return false;
+    }
+
+    vtd_iommu_lock(s);
+
+    ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, PCI_NO_PASID);
+    is_fpd_set = pe.val[0] & VTD_PASID_ENTRY_FPD;
+    if (ret) {
+        vtd_report_fault(s, -ret, is_fpd_set, source_id, addr, is_write,
+                         false, PCI_NO_PASID);
+        goto error;
+    }
+
+    /*
+     * We don't need to translate for pass-through context entries.
+     * Also, let's ignore IOTLB caching as well for PT devices.
+     */
+    if (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT) {
+        entry->iova = addr & VTD_PAGE_MASK_4K;
+        entry->translated_addr = entry->iova;
+        entry->addr_mask = ~VTD_PAGE_MASK_4K;
+        entry->perm = IOMMU_RW;
+        vtd_iommu_unlock(s);
+        return true;
+    }
+
+    ret = vtd_iova_to_flpte(&pe, addr, is_write, &flpte, &level,
+                            &reads, &writes, s->aw_bits);
+    if (ret) {
+        vtd_report_fault(s, -ret, is_fpd_set, source_id, addr, is_write,
+                         false, PCI_NO_PASID);
+        goto error;
+    }
+
+    page_mask = vtd_flpt_level_page_mask(level);
+    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
+
+    vtd_iommu_unlock(s);
+
+    entry->iova = addr & page_mask;
+    entry->translated_addr = vtd_get_flpte_addr(flpte, s->aw_bits) & page_mask;
+    entry->addr_mask = ~page_mask;
+    entry->perm = access_flags;
+    return true;
+
+error:
+    vtd_iommu_unlock(s);
+    entry->iova = 0;
+    entry->translated_addr = 0;
+    entry->addr_mask = 0;
+    entry->perm = IOMMU_NONE;
+    return false;
+}
+
 /* Map dev to context-entry then do a paging-structures walk to do a iommu
  * translation.
  *
@@ -4516,10 +4727,37 @@ static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
         .target_as = &address_space_memory,
     };
     bool success;
+    VTDContextEntry ce;
+    VTDPASIDEntry pe;
+    int ret = 0;
 
     if (likely(s->dmar_enabled)) {
-        success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
-                                         addr, flag & IOMMU_WO, &iotlb);
+        if (s->root_scalable) {
+            ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+                                           vtd_as->devfn, &ce);
+            ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, PCI_NO_PASID);
+            if (ret) {
+                error_report_once("%s: detected translation failure 1 "
+                                  "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
+                                  __func__, pci_bus_num(vtd_as->bus),
+                                  VTD_PCI_SLOT(vtd_as->devfn),
+                                  VTD_PCI_FUNC(vtd_as->devfn),
+                                  addr);
+                return iotlb;
+            }
+            if (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_FLT) {
+                success = vtd_do_iommu_fl_translate(vtd_as, vtd_as->bus,
+                                                    vtd_as->devfn, addr,
+                                                    flag & IOMMU_WO, &iotlb);
+            } else {
+                success = vtd_do_iommu_translate(vtd_as, vtd_as->bus,
+                                                 vtd_as->devfn, addr,
+                                                 flag & IOMMU_WO, &iotlb);
+            }
+        } else {
+            success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
+                                             addr, flag & IOMMU_WO, &iotlb);
+        }
     } else {
         /* DMAR disabled, passthrough, use 4k-page*/
         iotlb.iova = addr & VTD_PAGE_MASK_4K;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 66f7c1ba59..00b27bc5b1 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -33,6 +33,8 @@ vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
 vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present"
 vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
 vtd_iotlb_page_update(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page update sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16
+vtd_iotlb_pe_hit(uint32_t pasid, uint64_t val0, uint32_t gen) "IOTLB pasid hit pasid %"PRIu32" val[0] 0x%"PRIx64" gen %"PRIu32
+vtd_iotlb_pe_update(uint32_t pasid, uint64_t val0, uint32_t gen1, uint32_t gen2) "IOTLB pasid update pasid %"PRIu32" val[0] 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
 vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen) "IOTLB context hit bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32
 vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32
 vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)"
-- 
2.34.1



  parent reply	other threads:[~2024-01-15 10:42 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-15 10:37 [PATCH rfcv1 00/23] intel_iommu: Enable stage-1 translation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 01/23] Update linux header to support nested hwpt alloc Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 02/23] backends/iommufd: add helpers for allocating user-managed HWPT Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 03/23] backends/iommufd_device: introduce IOMMUFDDevice targeted interface Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 04/23] vfio: implement IOMMUFDDevice interface callbacks Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 05/23] intel_iommu: add a placeholder variable for scalable modern mode Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 06/23] intel_iommu: check and sync host IOMMU cap/ecap in " Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 07/23] intel_iommu: process PASID cache invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 08/23] intel_iommu: add PASID cache management infrastructure Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 09/23] vfio/iommufd_device: Add ioas_id in IOMMUFDDevice and pass to vIOMMU Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 10/23] intel_iommu: bind/unbind guest page table to host Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 11/23] intel_iommu: ERRATA_772415 workaround Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 12/23] intel_iommu: replay pasid binds after context cache invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 13/23] intel_iommu: process PASID-based iotlb invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 14/23] intel_iommu: propagate PASID-based iotlb invalidation to host Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 15/23] intel_iommu: process PASID-based Device-TLB invalidation Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 16/23] intel_iommu: rename slpte in iotlb_entry to pte Zhenzhong Duan
2024-01-15 10:37 ` Zhenzhong Duan [this message]
2024-01-15 10:37 ` [PATCH rfcv1 18/23] intel_iommu: fix the fault reason report Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 19/23] intel_iommu: introduce pasid iotlb cache Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 20/23] intel_iommu: piotlb invalidation should notify unmap Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 21/23] intel_iommu: invalidate piotlb when flush pasid Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 22/23] intel_iommu: refresh pasid bind after pasid cache force reset Zhenzhong Duan
2024-01-15 10:37 ` [PATCH rfcv1 23/23] intel_iommu: modify x-scalable-mode to be string option Zhenzhong Duan
     [not found]   ` <CGME20240131144013eucas1p22d46339ae42f54dd59c23e8b95502dda@eucas1p2.samsung.com>
2024-01-31 14:40     ` Joel Granados
2024-01-31 15:24       ` Yi Liu
2024-02-04 21:05         ` Joel Granados
2024-01-22  4:29 ` [PATCH rfcv1 00/23] intel_iommu: Enable stage-1 translation Jason Wang
2024-01-22  5:59   ` Duan, Zhenzhong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240115103735.132209-18-zhenzhong.duan@intel.com \
    --to=zhenzhong.duan@intel.com \
    --cc=alex.williamson@redhat.com \
    --cc=chao.p.peng@intel.com \
    --cc=clg@redhat.com \
    --cc=eduardo@habkost.net \
    --cc=eric.auger@redhat.com \
    --cc=jasowang@redhat.com \
    --cc=jgg@nvidia.com \
    --cc=joao.m.martins@oracle.com \
    --cc=kevin.tian@intel.com \
    --cc=marcel.apfelbaum@gmail.com \
    --cc=mst@redhat.com \
    --cc=nicolinc@nvidia.com \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    --cc=yi.l.liu@intel.com \
    --cc=yi.y.sun@intel.com \
    --cc=yi.y.sun@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).