From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:46413) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1afmby-000303-Sp for qemu-devel@nongnu.org; Tue, 15 Mar 2016 06:54:06 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1afmbx-0006t2-4U for qemu-devel@nongnu.org; Tue, 15 Mar 2016 06:54:02 -0400 Received: from mx1.redhat.com ([209.132.183.28]:45628) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1afmbw-0006sy-TA for qemu-devel@nongnu.org; Tue, 15 Mar 2016 06:54:01 -0400 Date: Tue, 15 Mar 2016 12:53:57 +0200 From: "Michael S. Tsirkin" Message-ID: <20160315124301-mutt-send-email-mst@redhat.com> References: MIME-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Disposition: inline In-Reply-To: Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH][RFC] IOMMU: Add Support to VFIO devices with vIOMMU present List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: "Aviv B.D." Cc: qemu-devel@nongnu.org On Sat, Mar 12, 2016 at 06:13:17PM +0200, Aviv B.D. wrote: > From: "Aviv B.D." >=20 > =A0* Fix bug that prevent qemu from starting up when vIOMMU and VFIO > =A0 =A0 device are present. > =A0* Advertise Cache Mode capability in iommu cap register. > =A0* Register every VFIO device with IOMMU state. > =A0* On page cache invalidation in vIOMMU, check if the domain belong t= o > =A0 =A0VFIO device and mirror the guest requests to host. >=20 > Not working (Yet!): > =A0* Tested only with network interface card (ixgbevf) and > =A0 =A0 intel_iommu=3Dstrict in guest's kernel command line. > =A0* Lock up under high load. > =A0* Errors on guest poweroff. > =A0* High relative latency compare to VFIO without IOMMU. >=20 > Signed-off-by: Aviv B.D. Thanks, this is very interesting. So this needs some cleanup, and some issues that will have to be addresse= d. See below. Thanks! > --- > =A0hw/i386/intel_iommu.c =A0 =A0 =A0 =A0 =A0| 76 ++++++++++++++++++++++= ++++++++++++++++---- > =A0hw/i386/intel_iommu_internal.h | =A01 + > =A0hw/vfio/common.c =A0 =A0 =A0 =A0 =A0 =A0 =A0 | 12 +++++-- > =A0include/hw/i386/intel_iommu.h =A0| =A04 +++ > =A0include/hw/vfio/vfio-common.h =A0| =A01 + > =A05 files changed, 85 insertions(+), 9 deletions(-) >=20 > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 347718f..046688f 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -43,6 +44,9 @@ static int vtd_dbgflags =3D VTD_DBGBIT(GENERAL) | VTD= _DBGBIT > (CSR); > =A0#define VTD_DPRINTF(what, fmt, ...) do {} while (0) > =A0#endif > =A0 > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_nu= m, > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= uint8_t devfn, VTDContextEntry *ce); > + > =A0static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_= t val, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0uint64_t wma= sk, uint64_t w1cmask) > =A0{ > @@ -126,6 +130,19 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMU= State > *s, hwaddr addr, > =A0 =A0 =A0return new_val; > =A0} > =A0 > +static uint16_t vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, u= int8_t > devfn) > +{ > + =A0 =A0VTDContextEntry ce; > + =A0 =A0int ret_fr; > + > + =A0 =A0ret_fr =3D vtd_dev_to_context_entry(s, bus_num, devfn, &ce); > + =A0 =A0if (ret_fr){ > + =A0 =A0 =A0 =A0return -1; > + =A0 =A0} > + > + =A0 =A0return VTD_CONTEXT_ENTRY_DID(ce.hi); > +} > + > =A0static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr a= ddr, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= =A0 =A0 =A0uint64_t clear, uint64_t mask) > =A0{ > @@ -711,9 +728,9 @@ static int vtd_dev_to_context_entry(IntelIOMMUState= *s, > uint8_t bus_num, > =A0 =A0 =A0} > =A0 > =A0 =A0 =A0if (!vtd_context_entry_present(ce)) { > - =A0 =A0 =A0 =A0VTD_DPRINTF(GENERAL, > + =A0 =A0 =A0 =A0/*VTD_DPRINTF(GENERAL, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0"error: context-entry #%"PRI= u8 "(bus #%"PRIu8 ") " > - =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0"is not present", devfn, bus_n= um); > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0"is not present", devfn, bus_n= um);*/ > =A0 =A0 =A0 =A0 =A0return -VTD_FR_CONTEXT_ENTRY_P; > =A0 =A0 =A0} else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { > @@ -1020,14 +1037,53 @@ static void vtd_iotlb_page_invalidate(IntelIOMM= UState > *s, uint16_t domain_id, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= =A0 =A0hwaddr addr, uint8_t am) > =A0{ > =A0 =A0 =A0VTDIOTLBPageInvInfo info; > + =A0 =A0VFIOGuestIOMMU * giommu; > + =A0 =A0bool flag =3D false; > =A0 > =A0 =A0 =A0assert(am <=3D VTD_MAMV); > =A0 =A0 =A0info.domain_id =3D domain_id; > =A0 =A0 =A0info.addr =3D addr; > =A0 =A0 =A0info.mask =3D ~((1 << am) - 1); > + > + =A0 =A0QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ > + =A0 =A0 =A0 =A0VTDAddressSpace *vtd_as =3D container_of(giommu->iommu= , VTDAddressSpace, > iommu); > + =A0 =A0 =A0 =A0uint16_t vfio_source_id =3D vtd_make_source_id(pci_bus= _num(vtd_as->bus), > vtd_as->devfn); > + =A0 =A0 =A0 =A0uint16_t vfio_domain_id =3D vtd_get_did_dev(s, pci_bus= _num(vtd_as->bus), > vtd_as->devfn); > + =A0 =A0 =A0 =A0if (vfio_domain_id !=3D (uint16_t)-1 &&=A0 > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0domain_id =3D=3D vfio_domain_id){ > + =A0 =A0 =A0 =A0 =A0 =A0VTDIOTLBEntry *iotlb_entry =3D vtd_lookup_iotl= b(s, vfio_source_id, > addr); > + =A0 =A0 =A0 =A0 =A0 =A0if (iotlb_entry !=3D NULL){ > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0IOMMUTLBEntry entry;=A0 > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0VTD_DPRINTF(GENERAL, "Remove addr 0x%"= PRIx64 " mask %d", addr, > am); > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0entry.iova =3D addr & VTD_PAGE_MASK_4K= ; > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0entry.translated_addr =3D vtd_get_slpt= e_addr(iotlb_entry->slpte) > & VTD_PAGE_MASK_4K; > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0entry.addr_mask =3D ~VTD_PAGE_MASK_4K; > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0entry.perm =3D IOMMU_NONE; > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0memory_region_notify_iommu(giommu->iom= mu, entry); > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0flag =3D true; > + > + =A0 =A0 =A0 =A0 =A0 =A0} > + =A0 =A0 =A0 =A0} > + > + =A0 =A0} > + > =A0 =A0 =A0g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pag= e, &info); > -} > =A0 > + =A0 =A0QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ > + =A0 =A0 =A0 =A0VTDAddressSpace *vtd_as =3D container_of(giommu->iommu= , VTDAddressSpace, > iommu); > + =A0 =A0 =A0 =A0uint16_t vfio_domain_id =3D vtd_get_did_dev(s, pci_bus= _num(vtd_as->bus), > vtd_as->devfn); > + =A0 =A0 =A0 =A0if (vfio_domain_id !=3D (uint16_t)-1 &&=A0 > + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0domain_id =3D=3D vfio_domain_id && !fl= ag){ > + =A0 =A0 =A0 =A0 =A0 =A0/* do vfio map */ > + =A0 =A0 =A0 =A0 =A0 =A0VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " ma= sk %d", addr, am); > + =A0 =A0 =A0 =A0 =A0 =A0/* call to vtd_iommu_translate */ > + =A0 =A0 =A0 =A0 =A0 =A0IOMMUTLBEntry entry =3D s->iommu_ops.translate= (giommu->iommu, addr, > 0);=A0 > + =A0 =A0 =A0 =A0 =A0 =A0entry.perm =3D IOMMU_RW; > + =A0 =A0 =A0 =A0 =A0 =A0memory_region_notify_iommu(giommu->iommu, entr= y); So this just triggers the iommu notifiers in the regular way. But what if two devices have conflicting entries? We need to setup host IOMMU differently for them. But see below > + =A0 =A0 =A0 =A0 =A0 =A0//g_vfio_iommu->n.notify(&g_vfio_iommu->n, &en= try); > + =A0 =A0 =A0 =A0} > + =A0 =A0} > +} > =A0/* Flush IOTLB > =A0 * Returns the IOTLB Actual Invalidation Granularity. > =A0 * @val: the content of the IOTLB_REG > @@ -1895,6 +1951,13 @@ static Property vtd_properties[] =3D { > =A0 =A0 =A0DEFINE_PROP_END_OF_LIST(), > =A0}; > =A0 > +void vtd_register_giommu(VFIOGuestIOMMU * giommu) > +{ > + =A0 =A0VTDAddressSpace *vtd_as =3D container_of(giommu->iommu, VTDAdd= ressSpace, > iommu); > + =A0 =A0IntelIOMMUState *s =3D vtd_as->iommu_state; > + =A0 =A0 > + =A0 =A0QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next); > +} > =A0 > =A0VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, in= t devfn) > =A0{ > @@ -1949,7 +2012,8 @@ static void vtd_init(IntelIOMMUState *s) > =A0 =A0 =A0s->iq_last_desc_type =3D VTD_INV_DESC_NONE; > =A0 =A0 =A0s->next_frcd_reg =3D 0; > =A0 =A0 =A0s->cap =3D VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_= MGAW | > - =A0 =A0 =A0 =A0 =A0 =A0 VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | = VTD_CAP_SLLPS; > + =A0 =A0 =A0 =A0 =A0 =A0 VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | = VTD_CAP_SLLPS | > + =A0 =A0 =A0 =A0 =A0 =A0 VTD_CAP_CM; > =A0 =A0 =A0s->ecap =3D VTD_ECAP_QI | VTD_ECAP_IRO; > =A0 > =A0 =A0 =A0vtd_reset_context_cache(s); > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_inter= nal.h > index e5f514c..ae40f73 100644 > --- a/hw/i386/intel_iommu_internal.h > +++ b/hw/i386/intel_iommu_internal.h > @@ -190,6 +190,7 @@ > =A0#define VTD_CAP_MAMV =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(VTD_MAMV << 48) > =A0#define VTD_CAP_PSI =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 (1ULL << 39) > =A0#define VTD_CAP_SLLPS =A0 =A0 =A0 =A0 =A0 =A0 =A0 ((1ULL << 34) | (1= ULL << 35)) > +#define VTD_CAP_CM =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(1ULL << 7) > =A0 > =A0/* Supported Adjusted Guest Address Widths */ > =A0#define VTD_CAP_SAGAW_SHIFT =A0 =A0 =A0 =A0 8 > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 607ec70..98c8d67 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -32,6 +32,9 @@ > =A0#include "sysemu/kvm.h" > =A0#include "trace.h" > =A0 > +#include "hw/sysbus.h" > +#include "hw/i386/intel_iommu.h" > + > =A0struct vfio_group_head vfio_group_list =3D > =A0 =A0 =A0QLIST_HEAD_INITIALIZER(vfio_group_list); > =A0struct vfio_as_head vfio_address_spaces =3D > @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, vo= id > *data) > =A0out: > =A0 =A0 =A0rcu_read_unlock(); > =A0} > - > +#if 0 > =A0static hwaddr vfio_container_granularity(VFIOContainer *container) > =A0{ > =A0 =A0 =A0return (hwaddr)1 << ctz64(container->iova_pgsizes); > =A0} > - > +#endif > =A0static void vfio_listener_region_add(MemoryListener *listener, > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= =A0 MemoryRegionSection *section) > =A0{ > @@ -344,6 +347,7 @@ static void vfio_listener_region_add(MemoryListener > *listener, > =A0 =A0 =A0iova =3D TARGET_PAGE_ALIGN(section->offset_within_address_sp= ace); > =A0 =A0 =A0llend =3D int128_make64(section->offset_within_address_space= ); > =A0 =A0 =A0llend =3D int128_add(llend, section->size); > + =A0 =A0llend =3D int128_add(llend, int128_exts64(-1)); > =A0 =A0 =A0llend =3D int128_and(llend, int128_exts64(TARGET_PAGE_MASK))= ; > =A0 > =A0 =A0 =A0if (int128_ge(int128_make64(iova), llend)) { > @@ -381,11 +385,13 @@ static void vfio_listener_region_add(MemoryListen= er > *listener, > =A0 =A0 =A0 =A0 =A0giommu->n.notify =3D vfio_iommu_map_notify; > =A0 =A0 =A0 =A0 =A0QLIST_INSERT_HEAD(&container->giommu_list, giommu, g= iommu_next); > =A0 > + =A0 =A0 =A0 =A0vtd_register_giommu(giommu); > =A0 =A0 =A0 =A0 =A0memory_region_register_iommu_notifier(giommu->iommu,= &giommu->n); > +#if 0 > =A0 =A0 =A0 =A0 =A0memory_region_iommu_replay(giommu->iommu, &giommu->n= , > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= vfio_container_granularity(container), > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0= false); > - > +#endif > =A0 =A0 =A0 =A0 =A0return; > =A0 =A0 =A0} >=20 Looks like there is still a single global domain on the host. Works up to a point as long as we have a single VFIO device. =A0 > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iomm= u.h > index b024ffa..22f3f83 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -23,6 +23,7 @@ > =A0#define INTEL_IOMMU_H > =A0#include "hw/qdev.h" > =A0#include "sysemu/dma.h" > +#include "hw/vfio/vfio-common.h" > =A0 > =A0#define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" > =A0#define INTEL_IOMMU_DEVICE(obj) \ > @@ -123,6 +124,8 @@ struct IntelIOMMUState { > =A0 =A0 =A0MemoryRegionIOMMUOps iommu_ops; > =A0 =A0 =A0GHashTable *vtd_as_by_busptr; =A0 /* VTDBus objects indexed = by PCIBus* > reference */ > =A0 =A0 =A0VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus object= s indexed by > bus number */ > + > + =A0 =A0QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; > =A0}; > =A0 > =A0/* Find the VTD Address space associated with the given bus pointer, > @@ -130,4 +133,5 @@ struct IntelIOMMUState { > =A0 */ > =A0VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, in= t devfn); > =A0 > +void vtd_register_giommu(VFIOGuestIOMMU * giommu); > =A0#endif > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-commo= n.h > index f037f3c..9225ba3 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU { > =A0 =A0 =A0MemoryRegion *iommu; > =A0 =A0 =A0Notifier n; > =A0 =A0 =A0QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; > + =A0 =A0QLIST_ENTRY(VFIOGuestIOMMU) iommu_next; > =A0} VFIOGuestIOMMU; > =A0 > =A0typedef struct VFIODeviceOps VFIODeviceOps; > --=A0 > 1.9.1 > =A0