From: "Michael S. Tsirkin" <mst@redhat.com>
To: Jason Wang <jasowang@redhat.com>
Cc: qemu-devel@nongnu.org, pbonzini@redhat.com, ehabkost@redhat.com,
rth@twiddle.net
Subject: Re: [Qemu-devel] [PATCH] intel_iommu: large page support
Date: Thu, 14 Jan 2016 11:28:10 +0200
Message-ID: <20160114112703-mutt-send-email-mst@redhat.com>
In-Reply-To: <1452750444-17750-1-git-send-email-jasowang@redhat.com>
On Thu, Jan 14, 2016 at 12:47:24AM -0500, Jason Wang wrote:
> The current intel_iommu only supports 4K pages, which may not be
> sufficient to cover the guest working set. This patch enables 2M and
> 1G mappings in intel_iommu. This is also useful for a future device
> IOTLB implementation, which would get a better hit rate.
>
> The major work is adding a page mask field to the IOTLB entry so that
> it can represent large pages, and using the slpte level as part of the
> IOTLB lookup key. MAMV is increased to 18 to support direct
> invalidation of 1G mappings.
>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Richard Henderson <rth@twiddle.net>
> Cc: Eduardo Habkost <ehabkost@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Looks good, thanks!
I was going to comment that guest-visible changes such as MAMV would
have to be versioned, but then I noticed that this device is
unmigratable ATM. So no issue for now, but we do need to fix migration
for it.
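For the record, "fix migration" would mean giving the device a
VMStateDescription; a guest-visible capability change like MAMV would
then be handled with the usual field-versioning pattern. A rough sketch
only (intel-iommu has no vmstate today, so everything below is
illustrative, not the actual device state layout):

    static const VMStateDescription vmstate_vtd = {
        .name = "intel-iommu",
        .version_id = 2,
        .minimum_version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_UINT64(cap, IntelIOMMUState),
            /* only sent/expected from version 2 on */
            VMSTATE_UINT64_V(ecap, IntelIOMMUState, 2),
            VMSTATE_END_OF_LIST()
        }
    };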
> ---
> Tested with virtio-net-pmd/vfio using 2M and 1G mappings in the guest.
> ---
> hw/i386/intel_iommu.c | 76 ++++++++++++++++++++++++++++++------------
> hw/i386/intel_iommu_internal.h | 6 ++--
> include/hw/i386/intel_iommu.h | 1 +
> 3 files changed, 59 insertions(+), 24 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 3fe27fa..68940a0 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -152,14 +152,27 @@ static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
> return entry->domain_id == domain_id;
> }
>
> +/* The shift of an addr for a certain level of paging structure */
> +static inline uint32_t vtd_slpt_level_shift(uint32_t level)
> +{
> + return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
> +}
> +
> +static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
> +{
> + return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
> +}
> +
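
To spell out the arithmetic for readers (assuming VTD_PAGE_SHIFT_4K is
12 and VTD_SL_LEVEL_BITS is 9, as defined in intel_iommu_internal.h):

    level 1 (4K):  shift = 12, mask = ~0xfffULL
    level 2 (2M):  shift = 21, mask = ~0x1fffffULL
    level 3 (1G):  shift = 30, mask = ~0x3fffffffULL
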
> static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
> gpointer user_data)
> {
> VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
> VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
> - uint64_t gfn = info->gfn & info->mask;
> + uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
> + uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
> return (entry->domain_id == info->domain_id) &&
> - ((entry->gfn & info->mask) == gfn);
> + (((entry->gfn & info->mask) == gfn) ||
> + (entry->gfn == gfn_tlb));
> }
>
> /* Reset all the gen of VTDAddressSpace to zero and set the gen of
> @@ -193,24 +206,46 @@ static void vtd_reset_iotlb(IntelIOMMUState *s)
> g_hash_table_remove_all(s->iotlb);
> }
>
> +static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint8_t source_id,
> + uint32_t level)
> +{
> + return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
> + ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
> +}
> +
> +static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
> +{
> + return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
> +}
> +
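
So the cache key now packs three values into one 64-bit word,
schematically:

    key = gfn
        | (uint64_t)source_id << VTD_IOTLB_SID_SHIFT  /* 36 */
        | (uint64_t)level << VTD_IOTLB_LVL_SHIFT;     /* 44 */

which means a 4K translation and a 2M/1G one for the same (sid, addr)
live under different keys -- hence the per-level probing in the lookup
below.
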
> static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
> hwaddr addr)
> {
> + VTDIOTLBEntry *entry;
> uint64_t key;
> + int level;
> +
> + for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
> + key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
> + source_id, level);
> + entry = g_hash_table_lookup(s->iotlb, &key);
> + if (entry) {
> + goto out;
> + }
> + }
>
> - key = (addr >> VTD_PAGE_SHIFT_4K) |
> - ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT);
> - return g_hash_table_lookup(s->iotlb, &key);
> -
> +out:
> + return entry;
> }
>
> static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
> uint16_t domain_id, hwaddr addr, uint64_t slpte,
> - bool read_flags, bool write_flags)
> + bool read_flags, bool write_flags,
> + uint32_t level)
> {
> VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
> uint64_t *key = g_malloc(sizeof(*key));
> - uint64_t gfn = addr >> VTD_PAGE_SHIFT_4K;
> + uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
>
> VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64
> " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte,
> @@ -225,7 +260,8 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
> entry->slpte = slpte;
> entry->read_flags = read_flags;
> entry->write_flags = write_flags;
> - *key = gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT);
> + entry->mask = vtd_slpt_level_page_mask(level);
> + *key = vtd_get_iotlb_key(gfn, source_id, level);
> g_hash_table_replace(s->iotlb, key, entry);
> }
>
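
Worth noting: because vtd_get_iotlb_gfn() masks before shifting, every
4K address inside one large page hashes to the same entry; e.g. for a
2M mapping (hypothetical addresses):

    addr 0x40123000, level 2 -> gfn = (0x40123000 & ~0x1fffffULL) >> 12 = 0x40000
    addr 0x401ff000, level 2 -> same gfn, so it hits the same cache entry
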
> @@ -500,12 +536,6 @@ static inline dma_addr_t vtd_get_slpt_base_from_context(VTDContextEntry *ce)
> return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
> }
>
> -/* The shift of an addr for a certain level of paging structure */
> -static inline uint32_t vtd_slpt_level_shift(uint32_t level)
> -{
> - return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
> -}
> -
> static inline uint64_t vtd_get_slpte_addr(uint64_t slpte)
> {
> return slpte & VTD_SL_PT_BASE_ADDR_MASK;
> @@ -761,7 +791,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> VTDContextEntry ce;
> uint8_t bus_num = pci_bus_num(bus);
> VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry;
> - uint64_t slpte;
> + uint64_t slpte, page_mask;
> uint32_t level;
> uint16_t source_id = vtd_make_source_id(bus_num, devfn);
> int ret_fr;
> @@ -801,6 +831,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> slpte = iotlb_entry->slpte;
> reads = iotlb_entry->read_flags;
> writes = iotlb_entry->write_flags;
> + page_mask = iotlb_entry->mask;
> goto out;
> }
> /* Try to fetch context-entry from cache first */
> @@ -847,12 +878,13 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> return;
> }
>
> + page_mask = vtd_slpt_level_page_mask(level);
> vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
> - reads, writes);
> + reads, writes, level);
> out:
> - entry->iova = addr & VTD_PAGE_MASK_4K;
> - entry->translated_addr = vtd_get_slpte_addr(slpte) & VTD_PAGE_MASK_4K;
> - entry->addr_mask = ~VTD_PAGE_MASK_4K;
> + entry->iova = addr & page_mask;
> + entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask;
> + entry->addr_mask = ~page_mask;
> entry->perm = (writes ? 2 : 0) + (reads ? 1 : 0);
> }
>
> @@ -990,7 +1022,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>
> assert(am <= VTD_MAMV);
> info.domain_id = domain_id;
> - info.gfn = addr >> VTD_PAGE_SHIFT_4K;
> + info.addr = addr;
> info.mask = ~((1 << am) - 1);
> g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> }
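
And this is where the MAMV bump matters: am is the number of low gfn
bits masked off, so one page-selective invalidation covers 2^am 4K
pages. Checking the limits:

    am = 9  -> 2^9  * 4K = 2M  (the old MAMV of 9 was enough for 2M)
    am = 18 -> 2^18 * 4K = 1G  (hence MAMV = 18)
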
> @@ -1916,7 +1948,7 @@ static void vtd_init(IntelIOMMUState *s)
> s->iq_last_desc_type = VTD_INV_DESC_NONE;
> s->next_frcd_reg = 0;
> s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
> - VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI;
> + VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
> s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
>
> vtd_reset_context_cache(s);
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index ba288ab..e5f514c 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -113,6 +113,7 @@
>
> /* The shift of source_id in the key of IOTLB hash table */
> #define VTD_IOTLB_SID_SHIFT 36
> +#define VTD_IOTLB_LVL_SHIFT 44
> #define VTD_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */
>
> /* IOTLB_REG */
> @@ -185,9 +186,10 @@
> #define VTD_CAP_ND (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
> #define VTD_MGAW 39 /* Maximum Guest Address Width */
> #define VTD_CAP_MGAW (((VTD_MGAW - 1) & 0x3fULL) << 16)
> -#define VTD_MAMV 9ULL
> +#define VTD_MAMV 18ULL
> #define VTD_CAP_MAMV (VTD_MAMV << 48)
> #define VTD_CAP_PSI (1ULL << 39)
> +#define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35))
>
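(If I read the VT-d spec right, SLLPS is CAP[37:34], with bit 34
advertising 2M second-level pages and bit 35 advertising 1G -- exactly
the two sizes this patch implements.)
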
> /* Supported Adjusted Guest Address Widths */
> #define VTD_CAP_SAGAW_SHIFT 8
> @@ -320,7 +322,7 @@ typedef struct VTDInvDesc VTDInvDesc;
> /* Information about page-selective IOTLB invalidate */
> struct VTDIOTLBPageInvInfo {
> uint16_t domain_id;
> - uint64_t gfn;
> + uint64_t addr;
> uint8_t mask;
> };
> typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 5dbadb7..b024ffa 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -83,6 +83,7 @@ struct VTDIOTLBEntry {
> uint64_t gfn;
> uint16_t domain_id;
> uint64_t slpte;
> + uint64_t mask;
> bool read_flags;
> bool write_flags;
> };
> --
> 1.8.3.1