[Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
@ 2012-08-01  5:18 Alex Williamson
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header Alex Williamson
                   ` (3 more replies)
  0 siblings, 4 replies; 42+ messages in thread
From: Alex Williamson @ 2012-08-01  5:18 UTC (permalink / raw)
  To: aliguori; +Cc: aik, qemu-devel, kvm

VFIO kernel support was just merged into Linux, so I'd like to
formally propose inclusion of the QEMU vfio-pci driver for
QEMU 1.2.  Included here is support for x86 PCI device assignment.
PCI INTx is not yet enabled, but devices making use of either MSI
or MSI-X work.  The level irqfd and eoifd support I've proposed
for KVM enable an accelerated path for this through KVM.  I'd
like to get this base driver in first and enable the remaining
support in-tree.

I've split this version up a little from the RFC to make it a bit
easier to review.  Review comments from Blue Swirl and Avi are
already incorporated, including Avi's requests to simplify both
the PCI BAR mapping and unmapping paths.

This series is also available at:

git://github.com/awilliam/qemu-vfio.git tags/vfio-pci-for-qemu-1.2

Thanks,

Alex

---

Alex Williamson (3):
      vfio: Enable vfio-pci and mark supported
      vfio: vfio-pci device assignment driver
      vfio: Import vfio kernel header

 MAINTAINERS                |    5 
 configure                  |   12 
 hw/i386/Makefile.objs      |    1 
 hw/vfio_pci.c              | 1853 ++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio_pci.h              |  101 ++
 linux-headers/linux/vfio.h |  368 +++++++++
 6 files changed, 2340 insertions(+)
 create mode 100644 hw/vfio_pci.c
 create mode 100644 hw/vfio_pci.h
 create mode 100644 linux-headers/linux/vfio.h

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header
  2012-08-01  5:18 [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Alex Williamson
@ 2012-08-01  5:18 ` Alex Williamson
  2012-08-01  7:13   ` Jan Kiszka
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver Alex Williamson
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-01  5:18 UTC (permalink / raw)
  To: aliguori; +Cc: aik, qemu-devel, kvm

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 linux-headers/linux/vfio.h |  368 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 368 insertions(+)
 create mode 100644 linux-headers/linux/vfio.h

diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
new file mode 100644
index 0000000..f787b72
--- /dev/null
+++ b/linux-headers/linux/vfio.h
@@ -0,0 +1,368 @@
+/*
+ * VFIO API definition
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef VFIO_H
+#define VFIO_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VFIO_API_VERSION	0
+
+
+/* Kernel & User level defines for VFIO IOCTLs. */
+
+/* Extensions */
+
+#define VFIO_TYPE1_IOMMU		1
+
+/*
+ * The IOCTL interface is designed for extensibility by embedding the
+ * structure length (argsz) and flags into structures passed between
+ * kernel and userspace.  We therefore use the _IO() macro for these
+ * defines to avoid implicitly embedding a size into the ioctl request.
+ * As structure fields are added, argsz will increase to match and flag
+ * bits will be defined to indicate additional fields with valid data.
+ * It's *always* the caller's responsibility to indicate the size of
+ * the structure passed by setting argsz appropriately.
+ */
+
+#define VFIO_TYPE	(';')
+#define VFIO_BASE	100
+
+/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
+
+/**
+ * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
+ *
+ * Report the version of the VFIO API.  This allows us to bump the entire
+ * API version should we later need to add or change features in incompatible
+ * ways.
+ * Return: VFIO_API_VERSION
+ * Availability: Always
+ */
+#define VFIO_GET_API_VERSION		_IO(VFIO_TYPE, VFIO_BASE + 0)
+
+/**
+ * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
+ *
+ * Check whether an extension is supported.
+ * Return: 0 if not supported, 1 (or some other positive integer) if supported.
+ * Availability: Always
+ */
+#define VFIO_CHECK_EXTENSION		_IO(VFIO_TYPE, VFIO_BASE + 1)
+
+/**
+ * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
+ *
+ * Set the iommu to the given type.  The type must be supported by an
+ * iommu driver as verified by calling CHECK_EXTENSION using the same
+ * type.  A group must be set to this file descriptor before this
+ * ioctl is available.  The IOMMU interfaces enabled by this call are
+ * specific to the value set.
+ * Return: 0 on success, -errno on failure
+ * Availability: When VFIO group attached
+ */
+#define VFIO_SET_IOMMU			_IO(VFIO_TYPE, VFIO_BASE + 2)
+
+/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
+
+/**
+ * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
+ *						struct vfio_group_status)
+ *
+ * Retrieve information about the group.  Fills in provided
+ * struct vfio_group_status.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always
+ */
+struct vfio_group_status {
+	__u32	argsz;	/* Size of this structure, set by the caller */
+	__u32	flags;	/* VFIO_GROUP_FLAGS_* status bits, see below */
+#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
+#define VFIO_GROUP_FLAGS_CONTAINER_SET	(1 << 1)
+};
+#define VFIO_GROUP_GET_STATUS		_IO(VFIO_TYPE, VFIO_BASE + 3)
+
+/**
+ * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
+ *
+ * Set the container for the VFIO group to the open VFIO file
+ * descriptor provided.  Groups may only belong to a single
+ * container.  Containers may, at their discretion, support multiple
+ * groups.  Only when a container is set are all of the interfaces
+ * of the VFIO file descriptor and the VFIO group file descriptor
+ * available to the user.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always
+ */
+#define VFIO_GROUP_SET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 4)
+
+/**
+ * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
+ *
+ * Remove the group from the attached container.  This is the
+ * opposite of the SET_CONTAINER call and returns the group to
+ * an initial state.  All device file descriptors must be released
+ * prior to calling this interface.  When removing the last group
+ * from a container, the IOMMU will be disabled and all state lost,
+ * effectively also returning the VFIO file descriptor to an initial
+ * state.
+ * Return: 0 on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_UNSET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 5)
+
+/**
+ * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
+ *
+ * Return a new file descriptor for the device object described by
+ * the provided string.  The string should match a device listed in
+ * the devices subdirectory of the IOMMU group sysfs entry.  The
+ * group containing the device must already be added to this context.
+ * Return: new file descriptor on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)
+
+/* --------------- IOCTLs for DEVICE file descriptors --------------- */
+
+/**
+ * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
+ *						struct vfio_device_info)
+ *
+ * Retrieve information about the device.  Fills in provided
+ * struct vfio_device_info.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_info {
+	__u32	argsz;		/* Size of this structure, set by the caller */
+	__u32	flags;		/* VFIO_DEVICE_FLAGS_* bits, see below */
+#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
+#define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
+	__u32	num_regions;	/* Max region index + 1 */
+	__u32	num_irqs;	/* Max IRQ index + 1 */
+};
+#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)
+
+/**
+ * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
+ *				       struct vfio_region_info)
+ *
+ * Retrieve information about a device region.  Caller provides
+ * struct vfio_region_info with index value set.  Caller sets argsz.
+ * Implementation of region mapping is bus driver specific.  This is
+ * intended to describe MMIO, I/O port, as well as bus specific
+ * regions (ex. PCI config space).  Zero sized regions may be used
+ * to describe unimplemented regions (ex. unimplemented PCI BARs).
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_region_info {
+	__u32	argsz;		/* Size of this structure, set by the caller */
+	__u32	flags;		/* VFIO_REGION_INFO_FLAG_* bits, see below */
+#define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
+#define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
+#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
+	__u32	index;		/* Region index, provided by the caller */
+	__u32	resv;		/* Reserved for alignment */
+	__u64	size;		/* Region size (bytes); zero if unimplemented */
+	__u64	offset;		/* Region offset from start of device fd */
+};
+#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)
+
+/**
+ * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
+ *				    struct vfio_irq_info)
+ *
+ * Retrieve information about a device IRQ.  Caller provides
+ * struct vfio_irq_info with index value set.  Caller sets argsz.
+ * Implementation of IRQ mapping is bus driver specific.  Indexes
+ * using multiple IRQs are primarily intended to support MSI-like
+ * interrupt blocks.  Zero count irq blocks may be used to describe
+ * unimplemented interrupt types.
+ *
+ * The EVENTFD flag indicates the interrupt index supports eventfd based
+ * signaling.
+ *
+ * The MASKABLE flag indicates the index supports MASK and UNMASK
+ * actions described below.
+ *
+ * AUTOMASKED indicates that after signaling, the interrupt line is
+ * automatically masked by VFIO and the user needs to unmask the line
+ * to receive new interrupts.  This is primarily intended to distinguish
+ * level triggered interrupts.
+ *
+ * The NORESIZE flag indicates that the interrupt lines within the index
+ * are setup as a set and new subindexes cannot be enabled without first
+ * disabling the entire index.  This is used for interrupts like PCI MSI
+ * and MSI-X where the driver may only use a subset of the available
+ * indexes, but VFIO needs to enable a specific number of vectors
+ * upfront.  In the case of MSI-X, where the user can enable MSI-X and
+ * then add and unmask vectors, it's up to userspace to make the decision
+ * whether to allocate the maximum supported number of vectors or tear
+ * down setup and incrementally increase the vectors as each is enabled.
+ */
+struct vfio_irq_info {
+	__u32	argsz;		/* Size of this structure, set by the caller */
+	__u32	flags;		/* VFIO_IRQ_INFO_* bits, see below */
+#define VFIO_IRQ_INFO_EVENTFD		(1 << 0) /* eventfd signaling supported */
+#define VFIO_IRQ_INFO_MASKABLE		(1 << 1) /* MASK/UNMASK actions supported */
+#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2) /* masked by VFIO after signaling */
+#define VFIO_IRQ_INFO_NORESIZE		(1 << 3) /* disable index to add subindexes */
+	__u32	index;		/* IRQ index, provided by the caller */
+	__u32	count;		/* Number of IRQs within this index */
+};
+#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)
+
+/**
+ * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
+ *
+ * Set signaling, masking, and unmasking of interrupts.  Caller provides
+ * struct vfio_irq_set with all fields set.  'start' and 'count' indicate
+ * the range of subindexes being specified.
+ *
+ * The DATA flags specify the type of data provided.  If DATA_NONE, the
+ * operation performs the specified action immediately on the specified
+ * interrupt(s).  For example, to unmask AUTOMASKED interrupt [0,0]:
+ * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
+ *
+ * DATA_BOOL allows sparse support for the same on arrays of interrupts.
+ * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
+ * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
+ * data = {1,0,1}
+ *
+ * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
+ * A value of -1 can be used to either de-assign interrupts if already
+ * assigned or skip un-assigned interrupts.  For example, to set an eventfd
+ * to be triggered for interrupts [0,0] and [0,2]:
+ * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
+ * data = {fd1, -1, fd2}
+ * If index [0,1] is previously set, two count = 1 ioctl calls would be
+ * required to set [0,0] and [0,2] without changing [0,1].
+ *
+ * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
+ * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
+ * from userspace (ie. simulate hardware triggering).
+ *
+ * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
+ * enables the interrupt index for the device.  Individual subindex interrupts
+ * can be disabled using the -1 value for DATA_EVENTFD or the index can be
+ * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
+ *
+ * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
+ * ACTION_TRIGGER specifies kernel->user signaling.
+ */
+struct vfio_irq_set {
+	__u32	argsz;		/* Size of structure plus data[], set by the caller */
+	__u32	flags;		/* One DATA_* type combined with one ACTION_* type */
+#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
+#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
+#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
+#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
+#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
+#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
+	__u32	index;		/* IRQ index being operated on */
+	__u32	start;		/* First subindex of the range */
+	__u32	count;		/* Number of subindexes in the range */
+	__u8	data[];		/* Per-subindex payload, typed by the DATA_* flag */
+};
+#define VFIO_DEVICE_SET_IRQS		_IO(VFIO_TYPE, VFIO_BASE + 10)
+
+#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
+					 VFIO_IRQ_SET_DATA_BOOL | \
+					 VFIO_IRQ_SET_DATA_EVENTFD)
+#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
+					 VFIO_IRQ_SET_ACTION_UNMASK | \
+					 VFIO_IRQ_SET_ACTION_TRIGGER)
+/**
+ * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
+ *
+ * Reset a device.
+ */
+#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 11)
+
+/*
+ * The VFIO-PCI bus driver makes use of the following fixed region and
+ * IRQ index mapping.  Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+	VFIO_PCI_BAR0_REGION_INDEX,	/* First entry: region index 0 */
+	VFIO_PCI_BAR1_REGION_INDEX,
+	VFIO_PCI_BAR2_REGION_INDEX,
+	VFIO_PCI_BAR3_REGION_INDEX,
+	VFIO_PCI_BAR4_REGION_INDEX,
+	VFIO_PCI_BAR5_REGION_INDEX,
+	VFIO_PCI_ROM_REGION_INDEX,
+	VFIO_PCI_CONFIG_REGION_INDEX,
+	VFIO_PCI_NUM_REGIONS		/* Count of the indexes above, not an index */
+};
+
+enum {
+	VFIO_PCI_INTX_IRQ_INDEX,	/* First entry: IRQ index 0 */
+	VFIO_PCI_MSI_IRQ_INDEX,
+	VFIO_PCI_MSIX_IRQ_INDEX,
+	VFIO_PCI_NUM_IRQS		/* Count of the indexes above, not an index */
+};
+
+/* -------- API for Type1 VFIO IOMMU -------- */
+
+/**
+ * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12,
+ *			      struct vfio_iommu_type1_info)
+ *
+ * Retrieve information about the IOMMU object. Fills in provided
+ * struct vfio_iommu_type1_info. Caller sets argsz.
+ *
+ * XXX Should we do these by CHECK_EXTENSION too?
+ */
+struct vfio_iommu_type1_info {
+	__u32	argsz;	/* Size of this structure, set by the caller */
+	__u32	flags;	/* VFIO_IOMMU_INFO_* bits, see below */
+#define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
+	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
+};
+
+#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13,
+ *			     struct vfio_iommu_type1_dma_map)
+ *
+ * Map process virtual addresses to IO virtual addresses using the
+ * provided struct vfio_iommu_type1_dma_map. Caller sets argsz.
+ * READ and/or WRITE flag required.
+ */
+struct vfio_iommu_type1_dma_map {
+	__u32	argsz;	/* Size of this structure, set by the caller */
+	__u32	flags;	/* VFIO_DMA_MAP_FLAG_* bits, see below */
+#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
+#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+	__u64	vaddr;				/* Process virtual address */
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/**
+ * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14,
+ *			       struct vfio_iommu_type1_dma_unmap)
+ *
+ * Unmap IO virtual addresses using the provided
+ * struct vfio_iommu_type1_dma_unmap.  Caller sets argsz.
+ */
+struct vfio_iommu_type1_dma_unmap {
+	__u32	argsz;	/* Size of this structure, set by the caller */
+	__u32	flags;	/* No flag bits are defined for this structure here */
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+#endif /* VFIO_H */

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-01  5:18 [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Alex Williamson
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header Alex Williamson
@ 2012-08-01  5:18 ` Alex Williamson
  2012-08-13 22:18   ` Anthony Liguori
                     ` (2 more replies)
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported Alex Williamson
  2012-08-13 13:27 ` [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Anthony Liguori
  3 siblings, 3 replies; 42+ messages in thread
From: Alex Williamson @ 2012-08-01  5:18 UTC (permalink / raw)
  To: aliguori; +Cc: aik, qemu-devel, kvm

This adds the core of the QEMU VFIO-based PCI device assignment driver.
To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
module.  To assign device 0000:05:00.0 to a guest, do the following:

for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
    vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
    device=$(cat /sys/bus/pci/devices/$dev/device)
    if [ -e /sys/bus/pci/devices/$dev/driver ]; then
        echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
    fi
    echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
done

See Documentation/vfio.txt in the Linux kernel tree for further
description of IOMMU groups and VFIO.

Then launch qemu including the option:

-device vfio-pci,host=0000:05:00.0

Support for legacy PCI interrupts (INTx) is not yet included and will
be added in a future update.  Both MSI and MSI-X are supported here.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/vfio_pci.c | 1853 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio_pci.h |  101 +++
 2 files changed, 1954 insertions(+)
 create mode 100644 hw/vfio_pci.c
 create mode 100644 hw/vfio_pci.h

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
new file mode 100644
index 0000000..71bb1bd
--- /dev/null
+++ b/hw/vfio_pci.c
@@ -0,0 +1,1853 @@
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <linux/vfio.h>
+
+#include "config.h"
+#include "event_notifier.h"
+#include "exec-memory.h"
+#include "kvm.h"
+#include "memory.h"
+#include "msi.h"
+#include "msix.h"
+#include "qemu-error.h"
+#include "range.h"
+#include "vfio_pci.h"
+
+/* #define DEBUG_VFIO */
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+#define MSIX_CAP_LENGTH 12
+
+static QLIST_HEAD(, VFIOContainer)
+    container_list = QLIST_HEAD_INITIALIZER(container_list);
+
+static QLIST_HEAD(, VFIOGroup)
+    group_list = QLIST_HEAD_INITIALIZER(group_list);
+
+static void vfio_disable_interrupts(VFIODevice *vdev);
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+
+/*
+ * Disable an entire VFIO IRQ index.
+ *
+ * Sends VFIO_DEVICE_SET_IRQS with DATA_NONE | ACTION_TRIGGER and a count
+ * of zero for @index on the device fd, then marks the device as having no
+ * interrupt mode active.  The ioctl result is not checked.
+ */
+static void vfio_disable_irqindex(VFIODevice *vdev, int index)
+{
+    struct vfio_irq_set disable = { .argsz = sizeof(disable) };
+
+    disable.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+    disable.index = index;
+    disable.start = 0;
+    disable.count = 0;
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &disable);
+
+    vdev->interrupt = INT_NONE;
+}
+
+/*
+ * INTx
+ */
+/*
+ * Ask VFIO to unmask the device's single INTx line (subindex 0 of
+ * VFIO_PCI_INTX_IRQ_INDEX).  The ioctl result is not checked.
+ */
+static void vfio_unmask_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set unmask = { .argsz = sizeof(unmask) };
+
+    unmask.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+    unmask.index = VFIO_PCI_INTX_IRQ_INDEX;
+    unmask.start = 0;
+    unmask.count = 1;
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
+}
+
+/*
+ * fd handler for the INTx eventfd; @opaque is the VFIODevice.
+ *
+ * If the notifier had actually fired, remember that an interrupt is
+ * pending and raise the emulated PCI interrupt pin.
+ */
+static void vfio_intx_interrupt(void *opaque)
+{
+    VFIODevice *vdev = opaque;
+
+    if (event_notifier_test_and_clear(&vdev->intx.interrupt)) {
+        DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__,
+                vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                vdev->host.function, 'A' + vdev->intx.pin);
+
+        vdev->intx.pending = true;
+        qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+    }
+}
+
+/*
+ * Complete a pending INTx interrupt: clear the pending flag, lower the
+ * emulated pin and unmask the line in VFIO.  Does nothing when no INTx
+ * interrupt is pending.
+ */
+static void vfio_eoi(VFIODevice *vdev)
+{
+    if (vdev->intx.pending) {
+        DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__,
+                vdev->host.domain, vdev->host.bus, vdev->host.slot,
+                vdev->host.function);
+
+        vdev->intx.pending = false;
+        qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+        vfio_unmask_intx(vdev);
+    }
+}
+
+struct vfio_irq_set_fd { /* SET_IRQS request carrying exactly one eventfd */
+    struct vfio_irq_set irq_set; /* header; users set argsz to sizeof wrapper */
+    int32_t fd; /* eventfd, or -1; presumably overlays irq_set.data - confirm */
+} QEMU_PACKED;
+
+static void vfio_enable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    /*
+     * VFIO supports an eventfd for INTx notification and an irqfd-like
+     * mechanism for unmasking INTx.  If we could get a level irqfd in
+     * KVM and an eventfd triggered on EOI from guest, we could interlock
+     * these and avoid userspace for INTx.  Work in progress.
+     *
+     * Until then this function is a placeholder: it performs no work,
+     * whether or not CONFIG_KVM is defined.
+     */
+#endif
+}
+
+static void vfio_disable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    /*
+     * Same as vfio_enable_intx_kvm(): placeholder for the KVM INTx
+     * bypass, currently performing no work.
+     */
+#endif
+}
+
+/*
+ * Refresh the cached INTx route of the device.
+ *
+ * Only acts while the device is in INTx mode.  The current route of the
+ * device's pin is compared with the cached one; when it differs, the KVM
+ * INTx path is torn down, the new route is stored and, unless the new
+ * route is disabled, the KVM path is set up again and a possibly missed
+ * EOI is replayed.
+ */
+static void vfio_update_irq(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIINTxRoute new_route;
+
+    if (vdev->interrupt != INT_INTx) {
+        return;
+    }
+
+    new_route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
+    if (memcmp(&new_route, &vdev->intx.route, sizeof(new_route)) == 0) {
+        return; /* Nothing changed */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->intx.route.irq, new_route.irq);
+
+    vfio_disable_intx_kvm(vdev);
+    /* TBD - Disable QEMU eoi notifier */
+
+    vdev->intx.route = new_route;
+
+    if (new_route.mode != PCI_INTX_DISABLED) {
+        /* TBD - Enable QEMU eoi notifier */
+        vfio_enable_intx_kvm(vdev);
+
+        /* Re-enable the interrupt in case we missed an EOI */
+        vfio_eoi(vdev);
+    }
+}
+
+/*
+ * Enable legacy INTx signaling for the device.
+ *
+ * Reads the interrupt pin from config space; a device without a pin needs
+ * no INTx setup and is treated as success.  Otherwise any active interrupt
+ * mode is disabled, the pin and its route are cached, an eventfd is
+ * created, hooked to vfio_intx_interrupt() and handed to VFIO as the
+ * trigger for VFIO_PCI_INTX_IRQ_INDEX.
+ *
+ * Returns 0 on success, -1 on failure.  On failure the eventfd and its fd
+ * handler are released again so nothing is leaked.
+ */
+static int vfio_enable_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set_fd irq_set_fd = {
+        .irq_set = {
+            .argsz = sizeof(irq_set_fd),
+            .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+            .index = VFIO_PCI_INTX_IRQ_INDEX,
+            .start = 0,
+            .count = 1,
+        },
+    };
+    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
+
+    if (!pin) {
+        return 0;
+    }
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+    vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
+                                                    vdev->intx.pin);
+    /* TBD - Enable QEMU eoi notifier */
+
+    /* error_report() terminates the line itself; no "\n" in the message */
+    if (event_notifier_init(&vdev->intx.interrupt, 0)) {
+        error_report("vfio: Error: event_notifier_init failed");
+        return -1;
+    }
+
+    irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt);
+    qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev);
+
+    if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) {
+        /* Report first: the cleanup calls below may clobber errno */
+        error_report("vfio: Error: Failed to setup INTx fd: %s",
+                     strerror(errno));
+        qemu_set_fd_handler(irq_set_fd.fd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->intx.interrupt);
+        return -1;
+    }
+
+    vfio_enable_intx_kvm(vdev);
+
+    vdev->interrupt = INT_INTx;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    return 0;
+}
+
+/*
+ * Tear down INTx signaling: drop the KVM path, disable the INTx index in
+ * VFIO, unregister the eventfd handler, release the eventfd and mark the
+ * device as having no interrupt mode active.
+ */
+static void vfio_disable_intx(VFIODevice *vdev)
+{
+    vfio_disable_intx_kvm(vdev);
+    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
+
+    /* TBD - Disable QEMU eoi notifier */
+
+    /* The handler must go before the notifier's fd is closed */
+    qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
+                        NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->intx.interrupt);
+
+    vdev->interrupt = INT_NONE;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+}
+
+/*
+ * MSI/X
+ */
+/*
+ * fd handler for an MSI or MSI-X vector delivered through userspace.
+ *
+ * @opaque is the MSIVector whose eventfd fired.  If the notifier had
+ * actually been signaled, the interrupt is forwarded to the guest with
+ * msix_notify() or msi_notify() depending on the device's current
+ * interrupt mode; in any other mode an error is reported.
+ */
+static void vfio_msi_interrupt(void *opaque)
+{
+    MSIVector *vec = opaque;
+    VFIODevice *vdev = vec->vdev;
+
+    if (!event_notifier_test_and_clear(&vec->interrupt)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vec->vector);
+
+    if (vdev->interrupt == INT_MSIX) {
+        msix_notify(&vdev->pdev, vec->vector);
+    } else if (vdev->interrupt == INT_MSI) {
+        msi_notify(&vdev->pdev, vec->vector);
+    } else {
+        /* error_report() terminates the line itself; no "\n" here */
+        error_report("vfio: MSI interrupt received, but not enabled?");
+    }
+}
+
+/*
+ * Hand the eventfds of vectors [0, nr_vectors) to VFIO in a single
+ * VFIO_DEVICE_SET_IRQS call.
+ *
+ * @msix selects the MSI-X index, otherwise the MSI index is used.  Vectors
+ * not marked in use are passed as -1.  On success the device's interrupt
+ * mode is updated to match.
+ *
+ * Returns the ioctl result (0 on success).
+ */
+static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
+{
+    int len = sizeof(struct vfio_irq_set) +
+              (vdev->nr_vectors * sizeof(int32_t));
+    struct vfio_irq_set *request = g_malloc0(len);
+    int32_t *eventfds = (int32_t *)&request->data;
+    int rc = 0, n;
+
+    request->argsz = len;
+    request->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+    request->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
+    request->start = 0;
+    request->count = vdev->nr_vectors;
+
+    for (n = 0; n < vdev->nr_vectors; n++) {
+        if (vdev->msi_vectors[n].use) {
+            eventfds[n] = event_notifier_get_fd(&vdev->msi_vectors[n].interrupt);
+        } else {
+            eventfds[n] = -1;
+        }
+    }
+
+    rc = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, request);
+
+    g_free(request);
+
+    if (rc == 0) {
+        vdev->interrupt = msix ? INT_MSIX : INT_MSI;
+    }
+
+    return rc;
+}
+
+/*
+ * Start using MSI-X @vector with message @msg.
+ *
+ * Leaves any non-MSI-X interrupt mode, allocates the per-vector state
+ * array on first use, creates an eventfd for the vector and routes it
+ * either through the KVM irqchip or, if that is unavailable, through
+ * vfio_msi_interrupt() in userspace.  The eventfd is then programmed into
+ * VFIO, growing the number of enabled host vectors when needed.
+ *
+ * Always returns 0; failures are reported with error_report().
+ */
+static int vfio_msix_vector_use(PCIDevice *pdev,
+                                unsigned int vector, MSIMessage msg)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    MSIVector *vec;
+    int ret, fd;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vector);
+
+    if (vdev->interrupt != INT_MSIX) {
+        vfio_disable_interrupts(vdev);
+    }
+
+    if (!vdev->msi_vectors) {
+        vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(MSIVector));
+    }
+
+    /* Only take the pointer once the array is known to exist */
+    vec = &vdev->msi_vectors[vector];
+    vec->vdev = vdev;
+    vec->vector = vector;
+    vec->use = true;
+
+    msix_vector_use(pdev, vector);
+
+    /* error_report() terminates the line itself; no "\n" in messages */
+    if (event_notifier_init(&vec->interrupt, 0)) {
+        error_report("vfio: Error: event_notifier_init failed");
+    }
+
+    fd = event_notifier_get_fd(&vec->interrupt);
+
+    /*
+     * Attempt to enable route through KVM irqchip,
+     * default to userspace handling if unavailable.
+     */
+    vec->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+    if (vec->virq < 0 ||
+        kvm_irqchip_add_irqfd(kvm_state, fd, vec->virq) < 0) {
+        /*
+         * If the route was added but the irqfd could not be attached,
+         * give the route back and mark the vector as userspace handled.
+         * Otherwise the virq leaks and vfio_msix_vector_release() would
+         * take the KVM path, leaving this fd handler registered.
+         */
+        if (vec->virq >= 0) {
+            kvm_irqchip_release_virq(kvm_state, vec->virq);
+            vec->virq = -1;
+        }
+        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vec);
+    }
+
+    /*
+     * We don't want to have the host allocate all possible MSI vectors
+     * for a device if they're not in use, so we shutdown and incrementally
+     * increase them as needed.
+     */
+    if (vdev->nr_vectors < vector + 1) {
+        int i;
+
+        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
+        vdev->nr_vectors = vector + 1;
+        ret = vfio_enable_vectors(vdev, true);
+        if (ret) {
+            error_report("vfio: failed to enable vectors, %d", ret);
+        }
+
+        /* We don't know if we've missed interrupts in the interim... */
+        for (i = 0; i < vdev->msix->entries; i++) {
+            if (vdev->msi_vectors[i].use) {
+                msix_notify(&vdev->pdev, i);
+            }
+        }
+    } else {
+        /* Vector is within the enabled range: only swap in its eventfd */
+        struct vfio_irq_set_fd irq_set_fd = {
+            .irq_set = {
+                .argsz = sizeof(irq_set_fd),
+                .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                         VFIO_IRQ_SET_ACTION_TRIGGER,
+                .index = VFIO_PCI_MSIX_IRQ_INDEX,
+                .start = vector,
+                .count = 1,
+            },
+            .fd = fd,
+        };
+        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
+        if (ret) {
+            error_report("vfio: failed to modify vector, %d", ret);
+        }
+        msix_notify(&vdev->pdev, vector);
+    }
+
+    return 0;
+}
+
+/*
+ * Stop using MSI-X @vector.
+ *
+ * Marks the vector unused in the MSI-X core, de-assigns its eventfd in
+ * VFIO (fd of -1), undoes whichever delivery path was set up for it
+ * (userspace fd handler when virq is negative, KVM irqfd and route
+ * otherwise), releases the eventfd and clears the in-use flag.
+ */
+static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int vector)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    struct vfio_irq_set_fd deassign = {
+        .irq_set = {
+            .argsz = sizeof(deassign),
+            .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER,
+            .index = VFIO_PCI_MSIX_IRQ_INDEX,
+            .start = vector,
+            .count = 1,
+        },
+        .fd = -1,
+    };
+    MSIVector *vec;
+    int fd;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vector);
+
+    /*
+     * XXX What's the right thing to do here?  This turns off the interrupt
+     * completely, but do we really just want to switch the interrupt to
+     * bouncing through userspace and let msix.c drop it?  Not sure.
+     */
+    msix_vector_unuse(pdev, vector);
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &deassign);
+
+    vec = &vdev->msi_vectors[vector];
+    fd = event_notifier_get_fd(&vec->interrupt);
+
+    if (vec->virq >= 0) {
+        kvm_irqchip_remove_irqfd(kvm_state, fd, vec->virq);
+        kvm_irqchip_release_virq(kvm_state, vec->virq);
+        vec->virq = -1;
+    } else {
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+    }
+
+    event_notifier_cleanup(&vec->interrupt);
+    vec->use = false;
+}
+
+/*
+ * XXX This should move to msi.c
+ *
+ * Build the MSI message for @vector from the device's emulated MSI
+ * capability: the address is read as 64 or 32 bits depending on the
+ * 64-bit flag, and the data word has @vector added to it.
+ */
+static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
+{
+    uint8_t *cap = pdev->config + pdev->msi_cap;
+    bool is_64bit = pci_get_word(cap + PCI_MSI_FLAGS) & PCI_MSI_FLAGS_64BIT;
+    MSIMessage msg;
+
+    msg.address = is_64bit ? pci_get_quad(cap + PCI_MSI_ADDRESS_LO)
+                           : pci_get_long(cap + PCI_MSI_ADDRESS_LO);
+
+    /* The data register sits at a different offset in the 64-bit layout */
+    msg.data = pci_get_word(cap + (is_64bit ? PCI_MSI_DATA_64
+                                            : PCI_MSI_DATA_32));
+    msg.data += vector;
+
+    return msg;
+}
+
+/* So should this */
+/*
+ * Store @size in the PCI_MSI_FLAGS_QSIZE field of the emulated MSI
+ * capability.  Only the low three bits of @size are used; the value is the
+ * raw field encoding (presumably log2 of the vector count, per the MSI
+ * capability layout -- callers must pass the encoded value).
+ *
+ * pci_get_word()/pci_set_word() already convert between config space
+ * (little endian) and host byte order, so no additional swapping may be
+ * done here or the value is corrupted on big endian hosts.
+ */
+static void msi_set_qsize(PCIDevice *pdev, uint8_t size)
+{
+    uint8_t *config = pdev->config + pdev->msi_cap;
+    uint16_t flags;
+
+    flags = pci_get_word(config + PCI_MSI_FLAGS);
+    flags &= ~PCI_MSI_FLAGS_QSIZE;
+    flags |= (size & 0x7) << 4;
+    pci_set_word(config + PCI_MSI_FLAGS, flags);
+}
+
+/*
+ * Enable MSI on the device.
+ *
+ * Any currently active interrupt mode is disabled first.  One event
+ * notifier is created per vector the guest enabled; each is routed through
+ * the KVM irqchip when possible and otherwise handled in userspace via
+ * vfio_msi_interrupt().  The eventfds are then handed to VFIO with
+ * vfio_enable_vectors().  If that call returns a positive count different
+ * from what was asked for, everything is torn down and the setup is retried
+ * with that count.  On any other failure MSI is left disabled with
+ * nr_vectors == 0 and msi_vectors == NULL.
+ */
+static void vfio_enable_msi(VFIODevice *vdev)
+{
+    int ret, i, qsize;
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+retry:
+    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(MSIVector));
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        MSIMessage msg;
+        int fd;
+
+        vdev->msi_vectors[i].vdev = vdev;
+        vdev->msi_vectors[i].vector = i;
+        vdev->msi_vectors[i].use = true;
+
+        if (event_notifier_init(&vdev->msi_vectors[i].interrupt, 0)) {
+            error_report("vfio: Error: event_notifier_init failed\n");
+        }
+
+        fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+
+        msg = msi_get_msg(&vdev->pdev, i);
+
+        /*
+         * Attempt to enable route through KVM irqchip,
+         * default to userspace handling if unavailable.
+         */
+        vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+        if (vdev->msi_vectors[i].virq < 0 ||
+            kvm_irqchip_add_irqfd(kvm_state, fd,
+                                  vdev->msi_vectors[i].virq) < 0) {
+            qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
+                                &vdev->msi_vectors[i]);
+        }
+    }
+
+    ret = vfio_enable_vectors(vdev, false);
+    if (ret) {
+        if (ret < 0) {
+            error_report("vfio: Error: Failed to setup MSI fds: %s\n",
+                         strerror(errno));
+        } else if (ret != vdev->nr_vectors) {
+            error_report("vfio: Error: Failed to enable %d "
+                         "MSI vectors, retry with %d\n", vdev->nr_vectors, ret);
+        }
+
+        /* Undo the per-vector setup done above */
+        for (i = 0; i < vdev->nr_vectors; i++) {
+            int fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+            if (vdev->msi_vectors[i].virq >= 0) {
+                kvm_irqchip_remove_irqfd(kvm_state, fd,
+                                         vdev->msi_vectors[i].virq);
+                kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[i].virq);
+                vdev->msi_vectors[i].virq = -1;
+            } else {
+                qemu_set_fd_handler(fd, NULL, NULL, NULL);
+            }
+            event_notifier_cleanup(&vdev->msi_vectors[i].interrupt);
+        }
+
+        /*
+         * Clear the pointer so a later vfio_disable_msi_x() does not free
+         * the same allocation a second time.
+         */
+        g_free(vdev->msi_vectors);
+        vdev->msi_vectors = NULL;
+
+        if (ret > 0 && ret != vdev->nr_vectors) {
+            vdev->nr_vectors = ret;
+            goto retry;
+        }
+        vdev->nr_vectors = 0;
+
+        return;
+    }
+
+    /*
+     * msi_set_qsize() takes the encoded field value, where 0 means one
+     * vector, so convert the vector count to floor(log2(count)).
+     */
+    for (qsize = 0; (1 << (qsize + 1)) <= vdev->nr_vectors; qsize++) {
+        /* nothing */
+    }
+    msi_set_qsize(&vdev->pdev, qsize);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->nr_vectors);
+}
+
+/*
+ * Common teardown for MSI (@msix false) and MSI-X (@msix true).
+ *
+ * Disables the matching VFIO IRQ index, releases the KVM route or the
+ * userspace fd handler of every vector still in use, frees the vector
+ * array and finally falls back to INTx.
+ */
+static void vfio_disable_msi_x(VFIODevice *vdev, bool msix)
+{
+    int n;
+
+    if (msix) {
+        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
+    } else {
+        vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
+    }
+
+    for (n = 0; n < vdev->nr_vectors; n++) {
+        MSIVector *vec = &vdev->msi_vectors[n];
+        int efd;
+
+        /* Vectors already released individually need no further work */
+        if (vec->use) {
+            efd = event_notifier_get_fd(&vec->interrupt);
+
+            if (vec->virq < 0) {
+                qemu_set_fd_handler(efd, NULL, NULL, NULL);
+            } else {
+                kvm_irqchip_remove_irqfd(kvm_state, efd, vec->virq);
+                kvm_irqchip_release_virq(kvm_state, vec->virq);
+                vec->virq = -1;
+            }
+
+            if (msix) {
+                msix_vector_unuse(&vdev->pdev, n);
+            }
+
+            event_notifier_cleanup(&vec->interrupt);
+        }
+    }
+
+    g_free(vdev->msi_vectors);
+    vdev->msi_vectors = NULL;
+    vdev->nr_vectors = 0;
+
+    if (!msix) {
+        msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, msi%s)\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, msix ? "x" : "");
+
+    vfio_enable_intx(vdev);
+}
+
+/*
+ * IO Port/MMIO - Beware of the endians, VFIO is always little endian
+ */
+/*
+ * Slow-path BAR write: forward a guest access of @size bytes at offset
+ * @addr within the BAR to the VFIO device file.  @opaque is the VFIOBAR.
+ *
+ * The value is converted to little endian before being written.  A union
+ * is used as the staging buffer so that the 16/32 bit stores are properly
+ * aligned and do not alias a byte array through an incompatible type.
+ */
+static void vfio_bar_write(void *opaque, target_phys_addr_t addr,
+                           uint64_t data, unsigned size)
+{
+    VFIOBAR *bar = opaque;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+
+    switch (size) {
+    case 1:
+        buf.byte = data & 0xff;
+        break;
+    case 2:
+        buf.word = cpu_to_le16(data);
+        break;
+    case 4:
+        buf.dword = cpu_to_le32(data);
+        break;
+    default:
+        hw_error("vfio: unsupported write size, %d bytes\n", size);
+        break;
+    }
+
+    if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
+        error_report("%s(,0x%"PRIx64", 0x%"PRIx64", %d) failed: %s\n",
+                     __func__, addr, data, size, strerror(errno));
+    }
+
+    DPRINTF("%s(BAR%d+0x%"PRIx64", 0x%"PRIx64", %d)\n",
+            __func__, bar->nr, addr, data, size);
+}
+
+/*
+ * Slow-path BAR read: fetch @size bytes at offset @addr within the BAR
+ * from the VFIO device file and return them in host byte order.  @opaque
+ * is the VFIOBAR.  Returns all ones if the read fails.
+ *
+ * A union is used as the staging buffer so that the 16/32 bit loads are
+ * properly aligned and do not alias a byte array through an incompatible
+ * type.
+ */
+static uint64_t vfio_bar_read(void *opaque,
+                              target_phys_addr_t addr, unsigned size)
+{
+    VFIOBAR *bar = opaque;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+    uint64_t data = 0;
+
+    if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
+        error_report("%s(,0x%"PRIx64", %d) failed: %s\n",
+                     __func__, addr, size, strerror(errno));
+        return (uint64_t)-1;
+    }
+
+    switch (size) {
+    case 1:
+        data = buf.byte;
+        break;
+    case 2:
+        data = le16_to_cpu(buf.word);
+        break;
+    case 4:
+        data = le32_to_cpu(buf.dword);
+        break;
+    default:
+        hw_error("vfio: unsupported read size, %d bytes\n", size);
+        break;
+    }
+
+    DPRINTF("%s(BAR%d+0x%"PRIx64", %d) = 0x%"PRIx64"\n",
+            __func__, bar->nr, addr, size, data);
+
+    return data;
+}
+
+/*
+ * Memory region callbacks for the "slow" read/write mapping that underlies
+ * every BAR; accesses are forwarded to the VFIO device file by
+ * vfio_bar_read() and vfio_bar_write().
+ */
+static const MemoryRegionOps vfio_bar_ops = {
+    .read = vfio_bar_read,
+    .write = vfio_bar_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+/*
+ * PCI config space
+ */
+/*
+ * Config space read handler.
+ *
+ * Reads touching the ROM BAR or the MSI/MSI-X capabilities are served from
+ * QEMU's emulated config space; everything else is read from the VFIO
+ * device file.  The multifunction bit of the header type is then replaced
+ * with QEMU's view.  Returns the value in host byte order, or -1 if the
+ * read from the device fails.
+ */
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t val = 0;
+
+    /*
+     * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX
+     * capabilities, and the multifunction bit below.  We let VFIO handle
+     * virtualizing everything else.  Performance is not a concern here.
+     */
+    if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
+        (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
+        (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size))) {
+
+        val = pci_default_read_config(pdev, addr, len);
+    } else {
+        if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) {
+            error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n",
+                         __func__, vdev->host.domain, vdev->host.bus,
+                         vdev->host.slot, vdev->host.function, addr, len,
+                         strerror(errno));
+            return -1;
+        }
+        val = le32_to_cpu(val);
+    }
+
+    /* Multifunction bit is virtualized in QEMU */
+    if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) {
+        uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION;
+
+        /*
+         * Move the mask to the byte lane the header type occupies within
+         * val.  The overlap check above guarantees that
+         * addr <= PCI_HEADER_TYPE < addr + len, so the shift is at most
+         * 24 bits.  This gives 16 for the usual dword read at 0x0c and 0
+         * for accesses starting at PCI_HEADER_TYPE itself.
+         */
+        mask <<= (PCI_HEADER_TYPE - addr) * 8;
+
+        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+            val |= mask;
+        } else {
+            val &= ~mask;
+        }
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, addr, len, val);
+
+    return val;
+}
+
+/*
+ * Config space write handler.
+ *
+ * Every write is first passed to the VFIO device file, which filters what
+ * may actually reach the hardware.  Writes to the standard header are then
+ * mirrored into QEMU's emulated config space.  Writes to the MSI or MSI-X
+ * capability are mirrored as well and checked for an enable-bit
+ * transition so the matching interrupt mode can be switched on or off.
+ */
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t le_val = cpu_to_le32(val);
+    bool before, after;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, addr, val, len);
+
+    /* Write everything to VFIO, let it filter out what we can't write */
+    if (pwrite(vdev->fd, &le_val, len, vdev->config_offset + addr) != len) {
+        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n",
+                     __func__, vdev->host.domain, vdev->host.bus,
+                     vdev->host.slot, vdev->host.function, addr, val, len,
+                     strerror(errno));
+    }
+
+    /* Write standard header bits to emulation */
+    if (addr < PCI_CONFIG_HEADER_SIZE) {
+        pci_default_write_config(pdev, addr, val, len);
+        return;
+    }
+
+    /* MSI Enabling/Disabling */
+    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
+        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
+        before = msi_enabled(pdev);
+        pci_default_write_config(pdev, addr, val, len);
+        after = msi_enabled(pdev);
+
+        if (after && !before) {
+            vfio_enable_msi(vdev);
+        } else if (before && !after) {
+            vfio_disable_msi_x(vdev, false);
+        }
+    }
+
+    /* MSI-X Enabling/Disabling */
+    if ((pdev->cap_present & QEMU_PCI_CAP_MSIX) &&
+        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
+        before = msix_enabled(pdev);
+        pci_default_write_config(pdev, addr, val, len);
+        after = msix_enabled(pdev);
+
+        /* Enabling needs no action: vfio_msix_vector_use handles it */
+        if (before && !after) {
+            vfio_disable_msi_x(vdev, true);
+        }
+    }
+}
+
+/*
+ * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
+ */
+/*
+ * Map @size bytes of process memory at @vaddr into the container's IOMMU
+ * at I/O virtual address @iova.  The mapping is always readable by the
+ * device and also writable unless @readonly is set.
+ *
+ * Returns 0 on success or -errno from the VFIO_IOMMU_MAP_DMA ioctl.
+ */
+static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova,
+                        ram_addr_t size, void *vaddr, bool readonly)
+{
+    struct vfio_iommu_type1_dma_map map = {
+        .argsz = sizeof(map),
+        .flags = VFIO_DMA_MAP_FLAG_READ,
+        /* Go through uintptr_t so 32bit hosts don't cast pointer to u64 */
+        .vaddr = (__u64)(uintptr_t)vaddr,
+        .iova = iova,
+        .size = size,
+    };
+
+    if (!readonly) {
+        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
+    }
+
+    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
+        /* Capture errno before the debug print has a chance to clobber it */
+        int err = errno;
+
+        DPRINTF("VFIO_MAP_DMA: %d\n", -err);
+        return -err;
+    }
+
+    return 0;
+}
+
+/*
+ * Remove the IOMMU mapping of @size bytes at I/O virtual address @iova
+ * from the container.
+ *
+ * Returns 0 on success or -errno from the VFIO_IOMMU_UNMAP_DMA ioctl.
+ */
+static int vfio_dma_unmap(VFIOContainer *container,
+                          target_phys_addr_t iova, ram_addr_t size)
+{
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = iova,
+        .size = size,
+    };
+
+    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        /* Capture errno before the debug print has a chance to clobber it */
+        int err = errno;
+
+        DPRINTF("VFIO_UNMAP_DMA: %d\n", -err);
+        return -err;
+    }
+
+    return 0;
+}
+
+/*
+ * No-op MemoryListener callback for the hooks that take only the listener
+ * (installed for begin, commit, log_global_start and log_global_stop).
+ */
+static void vfio_listener_dummy1(MemoryListener *listener)
+{
+    /* We don't do batching (begin/commit) or care about logging */
+}
+
+/*
+ * No-op MemoryListener callback for the per-section hooks
+ * (installed for region_nop, log_start, log_stop and log_sync).
+ */
+static void vfio_listener_dummy2(MemoryListener *listener,
+                                 MemoryRegionSection *section)
+{
+    /* We don't do logging or care about nops */
+}
+
+/*
+ * No-op MemoryListener callback for the eventfd hooks
+ * (installed for eventfd_add and eventfd_del).
+ */
+static void vfio_listener_dummy3(MemoryListener *listener,
+                                 MemoryRegionSection *section,
+                                 bool match_data, uint64_t data,
+                                 EventNotifier *e)
+{
+    /* We don't care about eventfds */
+}
+
+/* Only RAM-backed sections are mapped for DMA; report all others as skipped */
+static bool vfio_listener_skipped_section(MemoryRegionSection *section)
+{
+    if (memory_region_is_ram(section->mr)) {
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * MemoryListener region_add hook: DMA-map a newly added RAM section into
+ * the container's IOMMU.
+ *
+ * Non-RAM sections are skipped.  The section is trimmed inwards to target
+ * page boundaries, and nothing is mapped if no whole page remains.
+ * Failures are reported but not propagated.
+ */
+static void vfio_listener_region_add(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.listener);
+    target_phys_addr_t iova, end;
+    void *vaddr;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("vfio: SKIPPING region_add %016"PRIx64" - %016"PRIx64"\n",
+                (uint64_t)section->offset_within_address_space,
+                (uint64_t)(section->offset_within_address_space +
+                           section->size - 1));
+        return;
+    }
+
+    /* Guest address and region offset must share the same in-page offset */
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region\n", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + section->size) &
+          TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+
+    DPRINTF("vfio: region_add %016"PRIx64" - %016"PRIx64" [%p]\n",
+            (uint64_t)iova, (uint64_t)(end - 1), vaddr);
+
+    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
+    if (ret) {
+        /* ret is -errno; errno itself may have changed since the ioctl */
+        error_report("vfio_dma_map(%p, 0x%016"PRIx64", 0x%"PRIx64", %p) = "
+                     "%d (%s)\n", container, (uint64_t)iova,
+                     (uint64_t)(end - iova), vaddr, ret, strerror(-ret));
+    }
+}
+
+/*
+ * MemoryListener region_del hook: remove the DMA mapping of a RAM section
+ * from the container's IOMMU.
+ *
+ * The same skipping and page trimming rules as in
+ * vfio_listener_region_add() are applied so that the unmapped range
+ * matches what was mapped.  Failures are reported but not propagated.
+ */
+static void vfio_listener_region_del(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.listener);
+    target_phys_addr_t iova, end;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("vfio: SKIPPING region_del %016"PRIx64" - %016"PRIx64"\n",
+                (uint64_t)section->offset_within_address_space,
+                (uint64_t)(section->offset_within_address_space +
+                           section->size - 1));
+        return;
+    }
+
+    /* Guest address and region offset must share the same in-page offset */
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region\n", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + section->size) &
+          TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    DPRINTF("vfio: region_del %016"PRIx64" - %016"PRIx64"\n",
+            (uint64_t)iova, (uint64_t)(end - 1));
+
+    ret = vfio_dma_unmap(container, iova, end - iova);
+    if (ret) {
+        /* ret is -errno; errno itself may have changed since the ioctl */
+        error_report("vfio_dma_unmap(%p, 0x%016"PRIx64", 0x%"PRIx64") = "
+                     "%d (%s)\n", container, (uint64_t)iova,
+                     (uint64_t)(end - iova), ret, strerror(-ret));
+    }
+}
+
+/* Release hook for the container's iommu_data: stop listening for memory changes */
+static void vfio_listener_release(VFIOContainer *container)
+{
+    MemoryListener *listener = &container->iommu_data.listener;
+
+    memory_listener_unregister(listener);
+}
+
+/*
+ * Interrupt setup
+ */
+/*
+ * Disable whichever interrupt mode (INTx, MSI or MSI-X) is currently
+ * recorded in vdev->interrupt.  Any other value is left alone.
+ */
+static void vfio_disable_interrupts(VFIODevice *vdev)
+{
+    /* Sample once: the disable helpers may update vdev->interrupt */
+    int mode = vdev->interrupt;
+
+    if (mode == INT_INTx) {
+        vfio_disable_intx(vdev);
+    } else if (mode == INT_MSI) {
+        vfio_disable_msi_x(vdev, false);
+    } else if (mode == INT_MSIX) {
+        vfio_disable_msi_x(vdev, true);
+    }
+}
+
+/*
+ * Set up the emulated MSI capability at config offset @pos.
+ *
+ * The capability flags are read from the physical device to find out
+ * whether it supports 64bit addressing and per-vector masking and how many
+ * vectors it offers; msi_init() is called accordingly and the resulting
+ * capability size is recorded in vdev->msi_cap_size.
+ *
+ * Returns 0 on success or when MSI is not supported by the machine, -1 if
+ * the flags cannot be read, or the negative msi_init() result.
+ */
+static int vfio_setup_msi(VFIODevice *vdev, int pos)
+{
+    uint16_t flags;
+    bool has_64bit, has_maskbit;
+    int nr_entries, cap_size, rc;
+
+    if (!msi_supported) {
+        return 0;
+    }
+
+    if (pread(vdev->fd, &flags, sizeof(flags),
+              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(flags)) {
+        return -1;
+    }
+    flags = le16_to_cpu(flags);
+
+    has_64bit = (flags & PCI_MSI_FLAGS_64BIT) != 0;
+    has_maskbit = (flags & PCI_MSI_FLAGS_MASKBIT) != 0;
+    nr_entries = 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
+
+    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
+
+    rc = msi_init(&vdev->pdev, pos, nr_entries, has_64bit, has_maskbit);
+    if (rc < 0) {
+        error_report("vfio: msi_init failed\n");
+        return rc;
+    }
+
+    /* Base capability, plus the optional mask/pending and upper address */
+    cap_size = 0xa;
+    if (has_maskbit) {
+        cap_size += 0xa;
+    }
+    if (has_64bit) {
+        cap_size += 0x4;
+    }
+    vdev->msi_cap_size = cap_size;
+
+    return 0;
+}
+
+/*
+ * We don't have any control over how pci_add_capability() inserts
+ * capabilities into the chain.  In order to setup MSI-X we need a
+ * MemoryRegion for the BAR.  In order to setup the BAR and not
+ * attempt to mmap the MSI-X table area, which VFIO won't allow, we
+ * need to first look for where the MSI-X table lives.  So we
+ * unfortunately split MSI-X setup across two functions.
+ */
+/*
+ * First half of MSI-X setup: locate the MSI-X capability and record where
+ * the vector table and PBA live (BAR number and offset) and how many
+ * entries the table has, so that BAR mapping can avoid the table area.
+ *
+ * Returns 0 on success or when the device has no MSI-X capability, -1 if
+ * the capability cannot be read from the device.
+ */
+static int vfio_early_setup_msix(VFIODevice *vdev)
+{
+    uint16_t flags;
+    uint32_t table_reg, pba_reg;
+    uint8_t cap;
+
+    cap = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
+    if (cap == 0) {
+        return 0;
+    }
+
+    if (pread(vdev->fd, &flags, sizeof(flags),
+              vdev->config_offset + cap + PCI_CAP_FLAGS) != sizeof(flags)) {
+        return -1;
+    }
+    flags = le16_to_cpu(flags);
+
+    if (pread(vdev->fd, &table_reg, sizeof(table_reg),
+              vdev->config_offset + cap + PCI_MSIX_TABLE) !=
+        sizeof(table_reg)) {
+        return -1;
+    }
+    table_reg = le32_to_cpu(table_reg);
+
+    if (pread(vdev->fd, &pba_reg, sizeof(pba_reg),
+              vdev->config_offset + cap + PCI_MSIX_PBA) != sizeof(pba_reg)) {
+        return -1;
+    }
+    pba_reg = le32_to_cpu(pba_reg);
+
+    /* The low bits of each register select the BAR, the rest is the offset */
+    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
+    vdev->msix->table_bar = table_reg & PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->table_offset = table_reg & ~PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->pba_bar = pba_reg & PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->pba_offset = pba_reg & ~PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->entries = (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
+
+    DPRINTF("%04x:%02x:%02x.%x "
+            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, cap, vdev->msix->table_bar,
+            vdev->msix->table_offset, vdev->msix->entries);
+
+    return 0;
+}
+
+/*
+ * Second half of MSI-X setup: create the emulated MSI-X capability at
+ * config offset @pos using the table/PBA layout recorded by
+ * vfio_early_setup_msix(), then install the vector use/release notifiers.
+ *
+ * Returns 0 on success or when MSI is not supported by the machine,
+ * otherwise the error returned by msix_init() or
+ * msix_set_vector_notifiers().
+ */
+static int vfio_setup_msix(VFIODevice *vdev, int pos)
+{
+    MemoryRegion *table_mem, *pba_mem;
+    int rc;
+
+    if (!msi_supported) {
+        return 0;
+    }
+
+    table_mem = &vdev->bars[vdev->msix->table_bar].mem;
+    pba_mem = &vdev->bars[vdev->msix->pba_bar].mem;
+
+    rc = msix_init(&vdev->pdev, vdev->msix->entries,
+                   table_mem, vdev->msix->table_bar,
+                   vdev->msix->table_offset,
+                   pba_mem, vdev->msix->pba_bar,
+                   vdev->msix->pba_offset, pos);
+    if (rc < 0) {
+        error_report("vfio: msix_init failed\n");
+        return rc;
+    }
+
+    rc = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
+                                   vfio_msix_vector_release);
+    if (rc == 0) {
+        return 0;
+    }
+
+    /* Notifier registration failed: undo msix_init() */
+    error_report("vfio: msix_set_vector_notifiers failed %d\n", rc);
+    msix_uninit(&vdev->pdev, table_mem, pba_mem);
+    return rc;
+}
+
+/*
+ * Undo vfio_setup_msi() and, if MSI-X information was recorded for the
+ * device, vfio_setup_msix() as well.
+ */
+static void vfio_teardown_msi(VFIODevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+
+    msi_uninit(pdev);
+
+    if (!vdev->msix) {
+        return;
+    }
+
+    /* FIXME: Why can't unset just silently do nothing?? */
+    if (pdev->msix_vector_use_notifier &&
+        pdev->msix_vector_release_notifier) {
+        msix_unset_vector_notifiers(pdev);
+    }
+
+    msix_uninit(pdev, &vdev->bars[vdev->msix->table_bar].mem,
+                &vdev->bars[vdev->msix->pba_bar].mem);
+}
+
+/*
+ * Resource setup
+ */
+/*
+ * Tear down BAR @nr: remove and unmap the direct-mapped subregion(s)
+ * created by vfio_map_bar() and destroy the BAR's memory region.  BARs
+ * with a size of zero were never set up and are ignored.
+ */
+static void vfio_unmap_bar(VFIODevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    bool has_msix_table;
+
+    if (bar->size == 0) {
+        return;
+    }
+
+    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
+    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
+
+    /* The BAR holding the MSI-X table has a second mapping above the table */
+    has_msix_table = vdev->msix && vdev->msix->table_bar == nr;
+    if (has_msix_table) {
+        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
+        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
+    }
+
+    memory_region_destroy(&bar->mem);
+}
+
+/*
+ * Try to mmap @size bytes of @bar starting at @offset within the BAR and
+ * expose them as RAM subregion @submem of @mem, named @name.  The mapping
+ * address is stored in *@map (NULL if mmap() fails).
+ *
+ * When @size is zero, the region does not support mmap, or mmap() fails,
+ * a zero sized subregion is created instead so that cleanup can treat all
+ * cases alike.
+ *
+ * Returns 0, or -errno if mmap() was attempted and failed.
+ */
+static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
+                         void **map, size_t size, off_t offset,
+                         const char *name)
+{
+    bool mapped = false;
+    int ret = 0;
+
+    if (size && (bar->flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+        int prot = 0;
+
+        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
+            prot |= PROT_READ;
+        }
+
+        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
+            prot |= PROT_WRITE;
+        }
+
+        *map = mmap(NULL, size, prot, MAP_SHARED,
+                    bar->fd, bar->fd_offset + offset);
+        if (*map != MAP_FAILED) {
+            mapped = true;
+        } else {
+            *map = NULL;
+            ret = -errno;
+        }
+    }
+
+    if (mapped) {
+        memory_region_init_ram_ptr(submem, name, size, *map);
+    } else {
+        /* Create a zero sized sub-region to make cleanup easy. */
+        memory_region_init(submem, name, 0);
+    }
+
+    memory_region_add_subregion(mem, offset, submem);
+
+    return ret;
+}
+
+/*
+ * Set up BAR @nr of the device.
+ *
+ * An I/O memory region backed by vfio_bar_ops is registered as the PCI
+ * BAR, then direct-mapped subregions are layered on top where possible.
+ * For the BAR that contains the MSI-X vector table, one mapping covers the
+ * pages below the table and a second one the pages above it.  Failures to
+ * mmap are reported but not fatal.
+ */
+static void vfio_map_bar(VFIODevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    unsigned size = bar->size;
+    char name[64];
+    uint32_t pci_bar;
+    uint8_t type;
+    int ret;
+
+    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
+    if (!size) {
+        return;
+    }
+
+    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function, nr);
+
+    /* Determine what type of BAR this is for registration */
+    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
+                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
+    if (ret != sizeof(pci_bar)) {
+        error_report("vfio: Failed to read BAR %d (%s)\n", nr, strerror(errno));
+        return;
+    }
+
+    pci_bar = le32_to_cpu(pci_bar);
+    /* Keep only the flag bits; which bits those are depends on I/O vs. MMIO */
+    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
+           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
+
+    /* A "slow" read/write mapping underlies all BARs */
+    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
+    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
+
+    /*
+     * We can't mmap areas overlapping the MSIX vector table, so we
+     * potentially insert a direct-mapped subregion before and after it.
+     */
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        /* Lower mapping ends at the start of the page holding the table */
+        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
+    }
+
+    /* The name buffer is extended in place for each subregion */
+    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
+    if (vfio_mmap_bar(bar, &bar->mem,
+                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
+        error_report("%s unsupported. Performance may be slow\n", name);
+    }
+
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        unsigned start;
+
+        /* Upper mapping begins on the first page boundary past the table */
+        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
+                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
+
+        /* Nothing is left to map if the table reaches the end of the BAR */
+        size = start < bar->size ? bar->size - start : 0;
+        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
+        /* MSIXInfo contains another MemoryRegion for this mapping */
+        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
+                          &vdev->msix->mmap, size, start, name)) {
+            error_report("%s unsupported. Performance may be slow\n", name);
+        }
+    }
+
+    return;
+}
+
+/* Set up every BAR slot below the ROM slot */
+static void vfio_map_bars(VFIODevice *vdev)
+{
+    int bar_nr = 0;
+
+    while (bar_nr < PCI_ROM_SLOT) {
+        vfio_map_bar(vdev, bar_nr);
+        bar_nr++;
+    }
+}
+
+/* Tear down every BAR slot below the ROM slot */
+static void vfio_unmap_bars(VFIODevice *vdev)
+{
+    int bar_nr = 0;
+
+    while (bar_nr < PCI_ROM_SLOT) {
+        vfio_unmap_bar(vdev, bar_nr);
+        bar_nr++;
+    }
+}
+
+/*
+ * General setup
+ */
+/*
+ * Return the largest size the capability at config offset @pos could have:
+ * the distance to the closest capability located after it in config space,
+ * or to offset 0xff when no capability follows.
+ */
+static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
+{
+    uint8_t nearest = 0xff;
+    uint8_t cur = pdev->config[PCI_CAPABILITY_LIST];
+
+    /* Walk the capability chain; the next pointer is at offset 1 */
+    while (cur) {
+        if (cur > pos && cur < nearest) {
+            nearest = cur;
+        }
+        cur = pdev->config[cur + 1];
+    }
+
+    return nearest - pos;
+}
+
+/*
+ * Add the capability found at config offset @pos, and recursively every
+ * capability chained after it, to the emulated device.
+ *
+ * MSI and MSI-X get dedicated setup; all other capabilities are added with
+ * pci_add_capability() using the maximum size that fits before the next
+ * capability.  Returns 0 on success or the first error encountered.
+ */
+static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint8_t cap_id, next, size;
+    int ret;
+
+    /* Capability ID is the first byte, next pointer the second */
+    cap_id = pdev->config[pos];
+    next = pdev->config[pos + 1];
+
+    /*
+     * If it becomes important to configure capabilities to their actual
+     * size, use this as the default when it's something we don't recognize.
+     * Since QEMU doesn't actually handle many of the config accesses,
+     * exact size doesn't seem worthwhile.
+     */
+    size = vfio_std_cap_max_size(pdev, pos);
+
+    /*
+     * pci_add_capability always inserts the new capability at the head
+     * of the chain.  Therefore to end up with a chain that matches the
+     * physical device, we insert from the end by making this recursive.
+     * This is also why we pre-calculate size above as cached config space
+     * will be changed as we unwind the stack.
+     */
+    if (next) {
+        ret = vfio_add_std_cap(vdev, next);
+        if (ret) {
+            return ret;
+        }
+    } else {
+        pdev->config[PCI_CAPABILITY_LIST] = 0; /* Begin the rebuild */
+    }
+
+    switch (cap_id) {
+    case PCI_CAP_ID_MSI:
+        ret = vfio_setup_msi(vdev, pos);
+        break;
+    case PCI_CAP_ID_MSIX:
+        ret = vfio_setup_msix(vdev, pos);
+        break;
+    default:
+        ret = pci_add_capability(pdev, cap_id, pos, size);
+        break;
+    }
+
+    if (ret < 0) {
+        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
+                     "0x%x[0x%x]@0x%x: %d\n", vdev->host.domain,
+                     vdev->host.bus, vdev->host.slot, vdev->host.function,
+                     cap_id, size, pos, ret);
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Rebuild the device's capability chain in the emulated config space.
+ * Returns 0 if the device advertises no capabilities, otherwise the result
+ * of vfio_add_std_cap() for the head of the chain.
+ */
+static int vfio_add_capabilities(VFIODevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint8_t head = pdev->config[PCI_CAPABILITY_LIST];
+
+    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST)) {
+        return 0; /* Nothing to add */
+    }
+
+    if (!head) {
+        return 0; /* Nothing to add */
+    }
+
+    return vfio_add_std_cap(vdev, head);
+}
+
+/*
+ * Copy the device's option ROM into a RAM region and register it as the
+ * ROM BAR.
+ *
+ * Nothing is done when a ROM file was given, the ROM BAR is disabled or
+ * the device reports no ROM.  The buffer is pre-filled with 0xff so that a
+ * short read leaves the remainder looking like unprogrammed ROM.
+ *
+ * Returns 0 on success (including the nothing-to-do case), -1 if reading
+ * the ROM from the device fails.
+ */
+static int vfio_load_rom(VFIODevice *vdev)
+{
+    uint64_t remaining = vdev->rom_size;
+    const VMStateDescription *vmsd;
+    char name[32];
+    off_t done = 0, base = vdev->rom_offset;
+    ssize_t got;
+    void *rom;
+
+    /* If loading ROM from file, pci handles it */
+    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !remaining) {
+        return 0;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    /* Name the region after the vmstate if there is one, else the type */
+    vmsd = qdev_get_vmsd(DEVICE(&vdev->pdev));
+    snprintf(name, sizeof(name), "%s.rom",
+             vmsd ? vmsd->name : object_get_typename(OBJECT(&vdev->pdev)));
+
+    memory_region_init_ram(&vdev->pdev.rom, name, remaining);
+    rom = memory_region_get_ram_ptr(&vdev->pdev.rom);
+    memset(rom, 0xff, remaining);
+
+    while (remaining) {
+        got = pread(vdev->fd, rom + done, remaining, base + done);
+        if (got > 0) {
+            done += got;
+            remaining -= got;
+            continue;
+        }
+
+        if (got == 0) {
+            break; /* expect that we could get back less than the ROM BAR */
+        }
+
+        if (errno == EINTR || errno == EAGAIN) {
+            continue;
+        }
+
+        error_report("vfio: Error reading device ROM: %s\n",
+                     strerror(errno));
+        memory_region_destroy(&vdev->pdev.rom);
+        return -1;
+    }
+
+    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
+    vdev->pdev.has_rom = true;
+    return 0;
+}
+
+/*
+ * Attach @group to a VFIO container.
+ *
+ * An existing container is reused if the kernel accepts the group into it.
+ * Otherwise a new container is opened, checked for the expected API
+ * version and type1 IOMMU support, bound to the group, and a memory
+ * listener is registered on system memory so that guest RAM gets mapped
+ * for DMA.
+ *
+ * Returns 0 on success (or if the group already has a container), -1 on
+ * failure.
+ */
+static int vfio_connect_container(VFIOGroup *group)
+{
+    VFIOContainer *container;
+    int fd, version;
+
+    if (group->container) {
+        return 0;
+    }
+
+    /* First try to join one of the containers we already have */
+    QLIST_FOREACH(container, &container_list, next) {
+        if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
+            continue;
+        }
+        group->container = container;
+        QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+        return 0;
+    }
+
+    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    if (fd < 0) {
+        error_report("vfio: failed to open /dev/vfio/vfio: %s\n",
+                     strerror(errno));
+        return -1;
+    }
+
+    version = ioctl(fd, VFIO_GET_API_VERSION);
+    if (version != VFIO_API_VERSION) {
+        error_report("vfio: supported vfio version: %d, "
+                     "reported version: %d\n", VFIO_API_VERSION, version);
+        goto close_fd;
+    }
+
+    container = g_malloc0(sizeof(*container));
+    container->fd = fd;
+
+    /* Type1 is the only IOMMU model handled here */
+    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+        error_report("vfio: No available IOMMU models\n");
+        goto free_container;
+    }
+
+    if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd)) {
+        error_report("vfio: failed to set group container: %s\n",
+                     strerror(errno));
+        goto free_container;
+    }
+
+    if (ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+        error_report("vfio: failed to set iommu for container: %s\n",
+                     strerror(errno));
+        goto free_container;
+    }
+
+    container->iommu_data.listener = (MemoryListener) {
+        .begin = vfio_listener_dummy1,
+        .commit = vfio_listener_dummy1,
+        .region_add = vfio_listener_region_add,
+        .region_del = vfio_listener_region_del,
+        .region_nop = vfio_listener_dummy2,
+        .log_start = vfio_listener_dummy2,
+        .log_stop = vfio_listener_dummy2,
+        .log_sync = vfio_listener_dummy2,
+        .log_global_start = vfio_listener_dummy1,
+        .log_global_stop = vfio_listener_dummy1,
+        .eventfd_add = vfio_listener_dummy3,
+        .eventfd_del = vfio_listener_dummy3,
+    };
+    container->iommu_data.release = vfio_listener_release;
+
+    memory_listener_register(&container->iommu_data.listener,
+                             get_system_memory());
+
+    QLIST_INIT(&container->group_list);
+    QLIST_INSERT_HEAD(&container_list, container, next);
+
+    group->container = container;
+    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+
+    return 0;
+
+free_container:
+    g_free(container);
+close_fd:
+    close(fd);
+    return -1;
+}
+
+/*
+ * Detach @group from its container.  When the container is left without
+ * any group its release hook is run, it is removed from the container
+ * list, its fd is closed and it is freed.
+ */
+static void vfio_disconnect_container(VFIOGroup *group)
+{
+    VFIOContainer *container = group->container;
+    int rc;
+
+    rc = ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd);
+    if (rc) {
+        error_report("vfio: error disconnecting group %d from container\n",
+                     group->groupid);
+    }
+
+    QLIST_REMOVE(group, container_next);
+    group->container = NULL;
+
+    /* Other groups still rely on this container */
+    if (!QLIST_EMPTY(&container->group_list)) {
+        return;
+    }
+
+    if (container->iommu_data.release) {
+        container->iommu_data.release(container);
+    }
+    QLIST_REMOVE(container, next);
+    DPRINTF("vfio_disconnect_container: close container->fd\n");
+    close(container->fd);
+    g_free(container);
+}
+
+/*
+ * Look up the VFIOGroup for @groupid, creating it on first use.
+ *
+ * A group already present in group_list is returned as-is.  Otherwise
+ * /dev/vfio/<groupid> is opened, the group status is checked for
+ * VFIO_GROUP_FLAGS_VIABLE and the group is connected to a container
+ * before it is added to group_list.
+ *
+ * Returns the group, or NULL on failure.  On failure the error has been
+ * reported and both the group fd and the allocation have been released.
+ */
+static VFIOGroup *vfio_get_group(int groupid)
+{
+    struct vfio_group_status status = { .argsz = sizeof(status) };
+    char path[32];
+    VFIOGroup *grp;
+
+    QLIST_FOREACH(grp, &group_list, next) {
+        if (grp->groupid == groupid) {
+            return grp;
+        }
+    }
+
+    grp = g_malloc0(sizeof(*grp));
+
+    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
+    grp->fd = qemu_open(path, O_RDWR);
+    if (grp->fd < 0) {
+        error_report("vfio: error opening %s: %s", path, strerror(errno));
+        goto free_group;
+    }
+
+    if (ioctl(grp->fd, VFIO_GROUP_GET_STATUS, &status)) {
+        error_report("vfio: error getting group status: %s\n",
+                     strerror(errno));
+        goto close_fd;
+    }
+
+    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+        error_report("vfio: error, group %d is not viable, please ensure "
+                     "all devices within the iommu_group are bound to their "
+                     "vfio bus driver.\n", groupid);
+        goto close_fd;
+    }
+
+    grp->groupid = groupid;
+    QLIST_INIT(&grp->device_list);
+
+    if (vfio_connect_container(grp)) {
+        error_report("vfio: failed to setup container for group %d\n", groupid);
+        goto close_fd;
+    }
+
+    QLIST_INSERT_HEAD(&group_list, grp, next);
+
+    return grp;
+
+close_fd:
+    close(grp->fd);
+free_group:
+    g_free(grp);
+    return NULL;
+}
+
+/*
+ * Release @group once no device uses it any more.
+ *
+ * While the group's device_list is non-empty this is a no-op.  Otherwise
+ * the group is disconnected from its container, removed from group_list,
+ * its fd is closed and the structure is freed.
+ */
+static void vfio_put_group(VFIOGroup *group)
+{
+    if (QLIST_EMPTY(&group->device_list)) {
+        vfio_disconnect_container(group);
+        QLIST_REMOVE(group, next);
+        DPRINTF("vfio_put_group: close group->fd\n");
+        close(group->fd);
+        g_free(group);
+    }
+}
+
+/*
+ * Obtain the device fd for @name from @group and cache the device's VFIO
+ * layout in @vdev.
+ *
+ * On success @vdev is linked into @group's device_list and its fd, group,
+ * reset_works, bars[] and the ROM/config size and offset are filled in.
+ * The device must report VFIO_DEVICE_FLAGS_PCI and exactly
+ * VFIO_PCI_NUM_REGIONS regions and VFIO_PCI_NUM_IRQS irqs.
+ *
+ * Returns 0 on success and a negative value on failure.  Any failure after
+ * the device fd was obtained unlinks @vdev from the group again and closes
+ * the fd.
+ */
+static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
+{
+    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
+    int ret, i;
+
+    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+    if (ret < 0) {
+        error_report("vfio: error getting device %s from group %d: %s",
+                     name, group->groupid, strerror(errno));
+        error_report("Verify all devices in group %d "
+                     "are bound to vfio-pci or pci-stub and not already in use",
+                     group->groupid);
+        return ret;
+    }
+
+    vdev->fd = ret;
+    vdev->group = group;
+    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
+
+    /* Sanity check device */
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
+    if (ret) {
+        error_report("vfio: error getting device info: %s", strerror(errno));
+        goto error;
+    }
+
+    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
+            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
+
+    /*
+     * The checks below fail without a failing ioctl, so ret is still 0
+     * here.  Set it explicitly, otherwise the error path would skip the
+     * cleanup and report success to the caller.
+     */
+    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
+        error_report("vfio: Um, this isn't a PCI device");
+        ret = -1;
+        goto error;
+    }
+
+    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+    if (!vdev->reset_works) {
+        error_report("Warning, device %s does not support reset", name);
+    }
+
+    if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) {
+        error_report("vfio: unexpected number of io regions %u",
+                     dev_info.num_regions);
+        ret = -1;
+        goto error;
+    }
+
+    if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) {
+        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
+        ret = -1;
+        goto error;
+    }
+
+    /* Cache the region info of every BAR (indexes below the ROM region) */
+    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
+        reg_info.index = i;
+
+        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+        if (ret) {
+            error_report("vfio: Error getting region %d info: %s", i,
+                         strerror(errno));
+            goto error;
+        }
+
+        DPRINTF("Device %s region %d:\n", name, i);
+        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+                (unsigned long)reg_info.flags);
+
+        vdev->bars[i].flags = reg_info.flags;
+        vdev->bars[i].size = reg_info.size;
+        vdev->bars[i].fd_offset = reg_info.offset;
+        vdev->bars[i].fd = vdev->fd;
+        vdev->bars[i].nr = i;
+    }
+
+    /* Option ROM region: only size and offset are kept */
+    reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+    if (ret) {
+        error_report("vfio: Error getting ROM info: %s", strerror(errno));
+        goto error;
+    }
+
+    DPRINTF("Device %s ROM:\n", name);
+    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+            (unsigned long)reg_info.flags);
+
+    vdev->rom_size = reg_info.size;
+    vdev->rom_offset = reg_info.offset;
+
+    /* Config space region: only size and offset are kept */
+    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+    if (ret) {
+        error_report("vfio: Error getting config info: %s", strerror(errno));
+        goto error;
+    }
+
+    DPRINTF("Device %s config:\n", name);
+    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+            (unsigned long)reg_info.flags);
+
+    vdev->config_size = reg_info.size;
+    vdev->config_offset = reg_info.offset;
+
+error:
+    /* Also reached on success, in which case ret is 0 and nothing is undone */
+    if (ret) {
+        QLIST_REMOVE(vdev, next);
+        vdev->group = NULL;
+        close(vdev->fd);
+    }
+    return ret;
+}
+
+/*
+ * Release the per-device resources of @vdev: unlink it from its group's
+ * device list, close the device fd and free the MSI-X info.
+ *
+ * vdev->group is cleared here, so a caller that still needs the group
+ * (e.g. to pass it to vfio_put_group()) must save the pointer beforehand.
+ */
+static void vfio_put_device(VFIODevice *vdev)
+{
+    QLIST_REMOVE(vdev, next);
+    vdev->group = NULL;
+    DPRINTF("vfio_put_device: close vdev->fd\n");
+    close(vdev->fd);
+    /* g_free() accepts NULL, so no guard is needed */
+    g_free(vdev->msix);
+    vdev->msix = NULL;
+}
+
+/*
+ * PCIDeviceClass init hook for "vfio-pci".
+ *
+ * Resolves the host device given by the "host" property to its iommu group
+ * through sysfs, gets (or creates) the VFIOGroup, opens the device and then
+ * sets up the emulated device: a copy of config space with the BAR and ROM
+ * address registers cleared, ROM, MSI-X, BARs, capabilities and INTx.
+ *
+ * Returns 0 on success, -1 on failure.  On failure everything acquired so
+ * far has been released again.
+ */
+static int vfio_initfn(struct PCIDevice *pdev)
+{
+    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOGroup *group;
+    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
+    ssize_t len;
+    struct stat st;
+    int groupid;
+    int ret;
+
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function);
+    if (stat(path, &st) < 0) {
+        error_report("vfio: error: no such host device: %s", path);
+        return -1;
+    }
+
+    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
+
+    /*
+     * readlink() does not NUL-terminate and may fill the whole buffer.
+     * A result that leaves no room for the terminator is rejected so the
+     * store below can never land one byte past the array.
+     */
+    len = readlink(path, iommu_group_path, sizeof(iommu_group_path));
+    if (len <= 0 || (size_t)len >= sizeof(iommu_group_path)) {
+        error_report("vfio: error no iommu_group for device\n");
+        return -1;
+    }
+
+    iommu_group_path[len] = 0;
+    group_name = basename(iommu_group_path);
+
+    /*
+     * NOTE(review): sscanf() does not set errno on a matching failure, so
+     * the strerror() text in this message may be stale.
+     */
+    if (sscanf(group_name, "%d", &groupid) != 1) {
+        error_report("vfio: error reading %s: %s", path, strerror(errno));
+        return -1;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
+
+    group = vfio_get_group(groupid);
+    if (!group) {
+        error_report("vfio: failed to get group %d", groupid);
+        return -1;
+    }
+
+    /* From here on path holds the device name handed to VFIO */
+    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function);
+
+    /* Refuse to attach the same host device twice */
+    QLIST_FOREACH(pvdev, &group->device_list, next) {
+        if (pvdev->host.domain == vdev->host.domain &&
+            pvdev->host.bus == vdev->host.bus &&
+            pvdev->host.slot == vdev->host.slot &&
+            pvdev->host.function == vdev->host.function) {
+
+            error_report("vfio: error: device %s is already attached\n", path);
+            vfio_put_group(group);
+            return -1;
+        }
+    }
+
+    ret = vfio_get_device(group, path, vdev);
+    if (ret) {
+        error_report("vfio: failed to get device %s", path);
+        vfio_put_group(group);
+        return -1;
+    }
+
+    /* Get a copy of config space */
+    assert(pci_config_size(&vdev->pdev) <= vdev->config_size);
+    ret = pread(vdev->fd, vdev->pdev.config,
+                pci_config_size(&vdev->pdev), vdev->config_offset);
+    if (ret < (int)pci_config_size(&vdev->pdev)) {
+        error_report("vfio: Failed to read device config space\n");
+        goto out_put;
+    }
+
+    /*
+     * Clear host resource mapping info.  If we choose not to register a
+     * BAR, such as might be the case with the option ROM, we can get
+     * confusing, unwritable, residual addresses from the host here.
+     */
+    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
+    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
+
+    vfio_load_rom(vdev);
+
+    if (vfio_early_setup_msix(vdev)) {
+        goto out_put;
+    }
+
+    vfio_map_bars(vdev);
+
+    if (vfio_add_capabilities(vdev)) {
+        goto out_teardown;
+    }
+
+    /* Only install the routing notifier when the device reports an INTx pin */
+    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
+        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
+    }
+
+    if (vfio_enable_intx(vdev)) {
+        goto out_teardown;
+    }
+
+    return 0;
+
+out_teardown:
+    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+    vfio_teardown_msi(vdev);
+    vfio_unmap_bars(vdev);
+out_put:
+    vfio_put_device(vdev);
+    vfio_put_group(group);
+    return -1;
+}
+
+/*
+ * PCIDeviceClass exit hook: tear the device down again.  The INTx routing
+ * notifier is removed and interrupts are disabled first, then MSI state and
+ * BAR mappings are released, and finally the device and its group are put.
+ */
+static void vfio_exitfn(struct PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    /* Saved up front: vfio_put_device() clears vdev->group */
+    VFIOGroup *group = vdev->group;
+
+    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+    vfio_disable_interrupts(vdev);
+    vfio_teardown_msi(vdev);
+    vfio_unmap_bars(vdev);
+    vfio_put_device(vdev);
+    /* Only releases the group once its device list is empty */
+    vfio_put_group(group);
+}
+
+/*
+ * Device reset handler.  Issues VFIO_DEVICE_RESET on the device fd when the
+ * device advertised reset support (reset_works); otherwise does nothing.
+ * A failing ioctl is reported but not propagated.
+ */
+static void vfio_reset(DeviceState *dev)
+{
+    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+    /* The ioctl is only attempted when reset_works is set */
+    if (vdev->reset_works && ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
+        error_report("vfio: Error unable to reset physical device "
+                     "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain,
+                     vdev->host.bus, vdev->host.slot, vdev->host.function,
+                     strerror(errno));
+    }
+}
+
+/*
+ * User-settable properties of the vfio-pci device.  "host" stores the host
+ * PCI address of the device to assign in VFIODevice.host.
+ */
+static Property vfio_pci_dev_properties[] = {
+    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
+    /*
+     * TODO - support passed fds... is this necessary?
+     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
+     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
+     */
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+
+/*
+ * Class initializer for the vfio-pci type: installs the PCI init/exit hooks
+ * and config space accessors, plus the reset handler and property list on
+ * the parent class.
+ */
+static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
+{
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+    /* PCI device level callbacks */
+    k->init = vfio_initfn;
+    k->exit = vfio_exitfn;
+    k->config_read = vfio_pci_read_config;
+    k->config_write = vfio_pci_write_config;
+
+    /* Parent class: reset handler and user-settable properties */
+    k->parent_class.reset = vfio_reset;
+    k->parent_class.props = vfio_pci_dev_properties;
+}
+
+/*
+ * Type description of the "vfio-pci" device: a TYPE_PCI_DEVICE subtype whose
+ * instances are VFIODevice structures, set up by vfio_pci_dev_class_init().
+ */
+static TypeInfo vfio_pci_dev_info = {
+    .name          = "vfio-pci",
+    .parent        = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(VFIODevice),
+    .class_init    = vfio_pci_dev_class_init,
+};
+
+/* Register the type described by vfio_pci_dev_info. */
+static void register_vfio_pci_dev_type(void)
+{
+    type_register_static(&vfio_pci_dev_info);
+}
+
+/* Hand the registration function to type_init() */
+type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
new file mode 100644
index 0000000..0a71bce
--- /dev/null
+++ b/hw/vfio_pci.h
@@ -0,0 +1,101 @@
+#ifndef HW_VFIO_PCI_H
+#define HW_VFIO_PCI_H
+
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "pci.h"
+#include "event_notifier.h"
+
+/*
+ * Per-region state for one entry of VFIODevice.bars[].  flags, size and
+ * fd_offset are filled from VFIO_DEVICE_GET_REGION_INFO.
+ */
+typedef struct VFIOBAR {
+    off_t fd_offset; /* offset of BAR within device fd */
+    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
+    MemoryRegion mem; /* slow, read/write access */
+    MemoryRegion mmap_mem; /* direct mapped access */
+    void *mmap; /* presumably the mapping behind mmap_mem - TODO confirm */
+    size_t size; /* region size in bytes as reported by VFIO */
+    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
+    uint8_t nr; /* cache the BAR number for debug */
+} VFIOBAR;
+
+/* State of the PCI INTx interrupt, embedded in VFIODevice as 'intx' */
+typedef struct INTx {
+    bool pending; /* interrupt pending */
+    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
+    uint8_t pin; /* which pin to pull for qemu_set_irq */
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
+    PCIINTxRoute route; /* routing info for QEMU bypass */
+} INTx;
+
+struct VFIODevice;
+
+/* State of a single MSI/MSI-X vector; VFIODevice.msi_vectors points at these */
+typedef struct MSIVector {
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    struct VFIODevice *vdev; /* back pointer to device */
+    int vector; /* the vector number for this element */
+    int virq; /* KVM irqchip route for QEMU bypass */
+    bool use; /* presumably set while the vector is in use - TODO confirm */
+} MSIVector;
+
+/*
+ * Interrupt types.  NOTE(review): presumably the values stored in
+ * VFIODevice.interrupt ("Current interrupt type") - confirm against users.
+ */
+enum {
+    INT_NONE = 0,
+    INT_INTx = 1,
+    INT_MSI  = 2,
+    INT_MSIX = 3,
+};
+
+struct VFIOGroup;
+
+/*
+ * A VFIO container: one open container fd plus the groups attached to it.
+ * The container is freed when its last group is disconnected.
+ */
+typedef struct VFIOContainer {
+    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
+    struct {
+        /* enable abstraction to support various iommu backends */
+        union {
+            MemoryListener listener; /* Used by type1 iommu */
+        };
+        /* optional hook, called when the last group leaves the container */
+        void (*release)(struct VFIOContainer *);
+    } iommu_data;
+    QLIST_HEAD(, VFIOGroup) group_list; /* linked via container_next */
+    QLIST_ENTRY(VFIOContainer) next; /* entry in the global container list */
+} VFIOContainer;
+
+/*
+ * Cache of MSI-X setup plus extra mmap and memory region for split BAR map.
+ * Allocated per device as VFIODevice.msix and freed in vfio_put_device().
+ * NOTE(review): the field meanings below are inferred from their names
+ * (table/PBA BAR index and offset, number of table entries) - confirm
+ * against the code that fills them in.
+ */
+typedef struct MSIXInfo {
+    uint8_t table_bar;
+    uint8_t pba_bar;
+    uint16_t entries;
+    uint32_t table_offset;
+    uint32_t pba_offset;
+    MemoryRegion mmap_mem;
+    void *mmap;
+} MSIXInfo;
+
+/*
+ * Per-device state of an assigned vfio-pci device.  The PCIDevice is
+ * embedded as 'pdev'; DO_UPCAST() is used to get from it back to the
+ * VFIODevice.
+ */
+typedef struct VFIODevice {
+    PCIDevice pdev;
+    int fd; /* device fd returned by VFIO_GROUP_GET_DEVICE_FD */
+    INTx intx;
+    unsigned int config_size; /* Size of the config space region */
+    off_t config_offset; /* Offset of config space region within device fd */
+    unsigned int rom_size; /* Size of the ROM region */
+    off_t rom_offset; /* Offset of ROM region within device fd */
+    int msi_cap_size;
+    MSIVector *msi_vectors;
+    MSIXInfo *msix; /* freed and reset to NULL in vfio_put_device() */
+    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
+    int interrupt; /* Current interrupt type */
+    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
+    PCIHostDeviceAddress host; /* host PCI address, set by the "host" property */
+    QLIST_ENTRY(VFIODevice) next; /* entry in the group's device_list */
+    struct VFIOGroup *group; /* owning group, NULL when not attached */
+    bool reset_works; /* device reported VFIO_DEVICE_FLAGS_RESET */
+} VFIODevice;
+
+/*
+ * An opened VFIO group.  Created on first use by vfio_get_group() and
+ * released by vfio_put_group() once its device list is empty.
+ */
+typedef struct VFIOGroup {
+    int fd; /* fd of /dev/vfio/<groupid> */
+    int groupid; /* iommu group number */
+    VFIOContainer *container; /* container the group is attached to */
+    QLIST_HEAD(, VFIODevice) device_list; /* devices opened from this group */
+    QLIST_ENTRY(VFIOGroup) next; /* entry in the global group list */
+    QLIST_ENTRY(VFIOGroup) container_next; /* entry in container->group_list */
+} VFIOGroup;
+
+#endif /* HW_VFIO_PCI_H */

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-01  5:18 [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Alex Williamson
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header Alex Williamson
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver Alex Williamson
@ 2012-08-01  5:18 ` Alex Williamson
  2012-08-01  7:15   ` Jan Kiszka
  2012-08-13 13:27 ` [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Anthony Liguori
  3 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-01  5:18 UTC (permalink / raw)
  To: aliguori; +Cc: aik, qemu-devel, kvm

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 MAINTAINERS           |    5 +++++
 configure             |   12 ++++++++++++
 hw/i386/Makefile.objs |    1 +
 3 files changed, 18 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 2d219d2..9680d69 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -460,6 +460,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: hw/usb*
 
+VFIO
+M: Alex Williamson <alex.williamson@redhat.com>
+S: Supported
+F: hw/vfio*
+
 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
diff --git a/configure b/configure
index c65b5f6..81108dc 100755
--- a/configure
+++ b/configure
@@ -143,6 +143,7 @@ attr=""
 libattr=""
 xfs=""
 
+vfio_pci="no"
 vhost_net="no"
 kvm="no"
 gprof="no"
@@ -489,6 +490,7 @@ Haiku)
   usb="linux"
   kvm="yes"
   vhost_net="yes"
+  vfio_pci="yes"
   if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
     audio_possible_drivers="$audio_possible_drivers fmod"
   fi
@@ -824,6 +826,10 @@ for opt do
   ;;
   --disable-guest-agent) guest_agent="no"
   ;;
+  --disable-vfio-pci) vfio_pci="no"
+  ;;
+  --enable-vfio-pci) vfio_pci="yes"
+  ;;
   *) echo "ERROR: unknown option $opt"; show_help="yes"
   ;;
   esac
@@ -1112,6 +1118,8 @@ echo "  --disable-guest-agent    disable building of the QEMU Guest Agent"
 echo "  --enable-guest-agent     enable building of the QEMU Guest Agent"
 echo "  --with-coroutine=BACKEND coroutine backend. Supported options:"
 echo "                           gthread, ucontext, sigaltstack, windows"
+echo "  --disable-vfio-pci       disable vfio pci device assignment support"
+echo "  --enable-vfio-pci        enable vfio pci device assignment support"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
@@ -3072,6 +3080,7 @@ echo "OpenGL support    $opengl"
 echo "libiscsi support  $libiscsi"
 echo "build guest agent $guest_agent"
 echo "coroutine backend $coroutine_backend"
+echo "VFIO PCI support  $vfio_pci"
 
 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -3754,6 +3763,9 @@ case "$target_arch2" in
   *)
     echo "CONFIG_NO_XEN=y" >> $config_target_mak
 esac
+if test "$vfio_pci" = "yes" -a "$target_softmmu" = "yes" ; then
+  echo "CONFIG_VFIO_PCI=y" >> $config_target_mak
+fi
 case "$target_arch2" in
   i386|x86_64|ppcemb|ppc|ppc64|s390x)
     # Make sure the target and host cpus are compatible
diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index 8c764bb..a2783ef 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -11,5 +11,6 @@ obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen-host-pci-device.o
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o xen_pt_msi.o
 obj-y += kvm/
 obj-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
+obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
 
 obj-y := $(addprefix ../,$(obj-y))

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header Alex Williamson
@ 2012-08-01  7:13   ` Jan Kiszka
  2012-08-01 18:09     ` Alex Williamson
  0 siblings, 1 reply; 42+ messages in thread
From: Jan Kiszka @ 2012-08-01  7:13 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

[-- Attachment #1: Type: text/plain, Size: 15473 bytes --]

On 2012-08-01 07:18, Alex Williamson wrote:
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  linux-headers/linux/vfio.h |  368 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 368 insertions(+)
>  create mode 100644 linux-headers/linux/vfio.h
> 
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> new file mode 100644
> index 0000000..f787b72
> --- /dev/null
> +++ b/linux-headers/linux/vfio.h
> @@ -0,0 +1,368 @@
> +/*
> + * VFIO API definition
> + *
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#ifndef VFIO_H
> +#define VFIO_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define VFIO_API_VERSION	0
> +
> +
> +/* Kernel & User level defines for VFIO IOCTLs. */
> +
> +/* Extensions */
> +
> +#define VFIO_TYPE1_IOMMU		1
> +
> +/*
> + * The IOCTL interface is designed for extensibility by embedding the
> + * structure length (argsz) and flags into structures passed between
> + * kernel and userspace.  We therefore use the _IO() macro for these
> + * defines to avoid implicitly embedding a size into the ioctl request.
> + * As structure fields are added, argsz will increase to match and flag
> + * bits will be defined to indicate additional fields with valid data.
> + * It's *always* the caller's responsibility to indicate the size of
> + * the structure passed by setting argsz appropriately.
> + */
> +
> +#define VFIO_TYPE	(';')
> +#define VFIO_BASE	100
> +
> +/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
> +
> +/**
> + * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
> + *
> + * Report the version of the VFIO API.  This allows us to bump the entire
> + * API version should we later need to add or change features in incompatible
> + * ways.
> + * Return: VFIO_API_VERSION
> + * Availability: Always
> + */
> +#define VFIO_GET_API_VERSION		_IO(VFIO_TYPE, VFIO_BASE + 0)
> +
> +/**
> + * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
> + *
> + * Check whether an extension is supported.
> + * Return: 0 if not supported, 1 (or some other positive integer) if supported.
> + * Availability: Always
> + */
> +#define VFIO_CHECK_EXTENSION		_IO(VFIO_TYPE, VFIO_BASE + 1)
> +
> +/**
> + * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
> + *
> + * Set the iommu to the given type.  The type must be supported by an
> + * iommu driver as verified by calling CHECK_EXTENSION using the same
> + * type.  A group must be set to this file descriptor before this
> + * ioctl is available.  The IOMMU interfaces enabled by this call are
> + * specific to the value set.
> + * Return: 0 on success, -errno on failure
> + * Availability: When VFIO group attached
> + */
> +#define VFIO_SET_IOMMU			_IO(VFIO_TYPE, VFIO_BASE + 2)
> +
> +/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
> +
> +/**
> + * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
> + *						struct vfio_group_status)
> + *
> + * Retrieve information about the group.  Fills in provided
> + * struct vfio_group_info.  Caller sets argsz.
> + * Return: 0 on succes, -errno on failure.
> + * Availability: Always
> + */
> +struct vfio_group_status {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
> +#define VFIO_GROUP_FLAGS_CONTAINER_SET	(1 << 1)
> +};
> +#define VFIO_GROUP_GET_STATUS		_IO(VFIO_TYPE, VFIO_BASE + 3)
> +
> +/**
> + * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
> + *
> + * Set the container for the VFIO group to the open VFIO file
> + * descriptor provided.  Groups may only belong to a single
> + * container.  Containers may, at their discretion, support multiple
> + * groups.  Only when a container is set are all of the interfaces
> + * of the VFIO file descriptor and the VFIO group file descriptor
> + * available to the user.
> + * Return: 0 on success, -errno on failure.
> + * Availability: Always
> + */
> +#define VFIO_GROUP_SET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 4)
> +
> +/**
> + * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
> + *
> + * Remove the group from the attached container.  This is the
> + * opposite of the SET_CONTAINER call and returns the group to
> + * an initial state.  All device file descriptors must be released
> + * prior to calling this interface.  When removing the last group
> + * from a container, the IOMMU will be disabled and all state lost,
> + * effectively also returning the VFIO file descriptor to an initial
> + * state.
> + * Return: 0 on success, -errno on failure.
> + * Availability: When attached to container
> + */
> +#define VFIO_GROUP_UNSET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 5)
> +
> +/**
> + * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
> + *
> + * Return a new file descriptor for the device object described by
> + * the provided string.  The string should match a device listed in
> + * the devices subdirectory of the IOMMU group sysfs entry.  The
> + * group containing the device must already be added to this context.
> + * Return: new file descriptor on success, -errno on failure.
> + * Availability: When attached to container
> + */
> +#define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)
> +
> +/* --------------- IOCTLs for DEVICE file descriptors --------------- */
> +
> +/**
> + * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
> + *						struct vfio_device_info)
> + *
> + * Retrieve information about the device.  Fills in provided
> + * struct vfio_device_info.  Caller sets argsz.
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_info {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
> +#define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
> +	__u32	num_regions;	/* Max region index + 1 */
> +	__u32	num_irqs;	/* Max IRQ index + 1 */
> +};
> +#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)
> +
> +/**
> + * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
> + *				       struct vfio_region_info)
> + *
> + * Retrieve information about a device region.  Caller provides
> + * struct vfio_region_info with index value set.  Caller sets argsz.
> + * Implementation of region mapping is bus driver specific.  This is
> + * intended to describe MMIO, I/O port, as well as bus specific
> + * regions (ex. PCI config space).  Zero sized regions may be used
> + * to describe unimplemented regions (ex. unimplemented PCI BARs).
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_region_info {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
> +#define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
> +#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
> +	__u32	index;		/* Region index */
> +	__u32	resv;		/* Reserved for alignment */
> +	__u64	size;		/* Region size (bytes) */
> +	__u64	offset;		/* Region offset from start of device fd */
> +};
> +#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)
> +
> +/**
> + * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
> + *				    struct vfio_irq_info)
> + *
> + * Retrieve information about a device IRQ.  Caller provides
> + * struct vfio_irq_info with index value set.  Caller sets argsz.
> + * Implementation of IRQ mapping is bus driver specific.  Indexes
> + * using multiple IRQs are primarily intended to support MSI-like
> + * interrupt blocks.  Zero count irq blocks may be used to describe
> + * unimplemented interrupt types.
> + *
> + * The EVENTFD flag indicates the interrupt index supports eventfd based
> + * signaling.
> + *
> + * The MASKABLE flags indicates the index supports MASK and UNMASK
> + * actions described below.
> + *
> + * AUTOMASKED indicates that after signaling, the interrupt line is
> + * automatically masked by VFIO and the user needs to unmask the line
> + * to receive new interrupts.  This is primarily intended to distinguish
> + * level triggered interrupts.
> + *
> + * The NORESIZE flag indicates that the interrupt lines within the index
> + * are setup as a set and new subindexes cannot be enabled without first
> + * disabling the entire index.  This is used for interrupts like PCI MSI
> + * and MSI-X where the driver may only use a subset of the available
> + * indexes, but VFIO needs to enable a specific number of vectors
> + * upfront.  In the case of MSI-X, where the user can enable MSI-X and
> + * then add and unmask vectors, it's up to userspace to make the decision
> + * whether to allocate the maximum supported number of vectors or tear
> + * down setup and incrementally increase the vectors as each is enabled.
> + */
> +struct vfio_irq_info {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_IRQ_INFO_EVENTFD		(1 << 0)
> +#define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
> +#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
> +#define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
> +	__u32	index;		/* IRQ index */
> +	__u32	count;		/* Number of IRQs within this index */
> +};
> +#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)
> +
> +/**
> + * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
> + *
> + * Set signaling, masking, and unmasking of interrupts.  Caller provides
> + * struct vfio_irq_set with all fields set.  'start' and 'count' indicate
> + * the range of subindexes being specified.
> + *
> + * The DATA flags specify the type of data provided.  If DATA_NONE, the
> + * operation performs the specified action immediately on the specified
> + * interrupt(s).  For example, to unmask AUTOMASKED interrupt [0,0]:
> + * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
> + *
> + * DATA_BOOL allows sparse support for the same on arrays of interrupts.
> + * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
> + * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
> + * data = {1,0,1}
> + *
> + * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
> + * A value of -1 can be used to either de-assign interrupts if already
> + * assigned or skip un-assigned interrupts.  For example, to set an eventfd
> + * to be trigger for interrupts [0,0] and [0,2]:
> + * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
> + * data = {fd1, -1, fd2}
> + * If index [0,1] is previously set, two count = 1 ioctls calls would be
> + * required to set [0,0] and [0,2] without changing [0,1].
> + *
> + * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
> + * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
> + * from userspace (ie. simulate hardware triggering).
> + *
> + * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
> + * enables the interrupt index for the device.  Individual subindex interrupts
> + * can be disabled using the -1 value for DATA_EVENTFD or the index can be
> + * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
> + *
> + * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
> + * ACTION_TRIGGER specifies kernel->user signaling.
> + */
> +struct vfio_irq_set {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
> +#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
> +#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
> +#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
> +#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
> +#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
> +	__u32	index;
> +	__u32	start;
> +	__u32	count;
> +	__u8	data[];
> +};
> +#define VFIO_DEVICE_SET_IRQS		_IO(VFIO_TYPE, VFIO_BASE + 10)
> +
> +#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
> +					 VFIO_IRQ_SET_DATA_BOOL | \
> +					 VFIO_IRQ_SET_DATA_EVENTFD)
> +#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
> +					 VFIO_IRQ_SET_ACTION_UNMASK | \
> +					 VFIO_IRQ_SET_ACTION_TRIGGER)
> +/**
> + * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
> + *
> + * Reset a device.
> + */
> +#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 11)
> +
> +/*
> + * The VFIO-PCI bus driver makes use of the following fixed region and
> + * IRQ index mapping.  Unimplemented regions return a size of zero.
> + * Unimplemented IRQ types return a count of zero.
> + */
> +
> +enum {
> +	VFIO_PCI_BAR0_REGION_INDEX,
> +	VFIO_PCI_BAR1_REGION_INDEX,
> +	VFIO_PCI_BAR2_REGION_INDEX,
> +	VFIO_PCI_BAR3_REGION_INDEX,
> +	VFIO_PCI_BAR4_REGION_INDEX,
> +	VFIO_PCI_BAR5_REGION_INDEX,
> +	VFIO_PCI_ROM_REGION_INDEX,
> +	VFIO_PCI_CONFIG_REGION_INDEX,
> +	VFIO_PCI_NUM_REGIONS
> +};
> +
> +enum {
> +	VFIO_PCI_INTX_IRQ_INDEX,
> +	VFIO_PCI_MSI_IRQ_INDEX,
> +	VFIO_PCI_MSIX_IRQ_INDEX,
> +	VFIO_PCI_NUM_IRQS
> +};
> +
> +/* -------- API for Type1 VFIO IOMMU -------- */
> +
> +/**
> + * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_info)
> + *
> + * Retrieve information about the IOMMU object. Fills in provided
> + * struct vfio_iommu_info. Caller sets argsz.
> + *
> + * XXX Should we do these by CHECK_EXTENSION too?
> + */
> +struct vfio_iommu_type1_info {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> +	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> +};
> +
> +#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/**
> + * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_dma_map)
> + *
> + * Map process virtual addresses to IO virtual addresses using the
> + * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
> + */
> +struct vfio_iommu_type1_dma_map {
> +	__u32	argsz;
> +	__u32	flags;
> +#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
> +#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
> +	__u64	vaddr;				/* Process virtual address */
> +	__u64	iova;				/* IO virtual address */
> +	__u64	size;				/* Size of mapping (bytes) */
> +};
> +
> +#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
> +
> +/**
> + * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
> + *
> + * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
> + * Caller sets argsz.
> + */
> +struct vfio_iommu_type1_dma_unmap {
> +	__u32	argsz;
> +	__u32	flags;
> +	__u64	iova;				/* IO virtual address */
> +	__u64	size;				/* Size of mapping (bytes) */
> +};
> +
> +#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> +
> +#endif /* VFIO_H */
> 

Please patch update-linux-headers.sh and let it do its work instead
(separate patches).

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported Alex Williamson
@ 2012-08-01  7:15   ` Jan Kiszka
  2012-08-01 18:14     ` Alex Williamson
  2012-08-13 22:19     ` Anthony Liguori
  0 siblings, 2 replies; 42+ messages in thread
From: Jan Kiszka @ 2012-08-01  7:15 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

[-- Attachment #1: Type: text/plain, Size: 3484 bytes --]

On 2012-08-01 07:18, Alex Williamson wrote:
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  MAINTAINERS           |    5 +++++
>  configure             |   12 ++++++++++++
>  hw/i386/Makefile.objs |    1 +
>  3 files changed, 18 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 2d219d2..9680d69 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -460,6 +460,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
>  S: Maintained
>  F: hw/usb*
>  
> +VFIO
> +M: Alex Williamson <alex.williamson@redhat.com>
> +S: Supported
> +F: hw/vfio*
> +
>  vhost
>  M: Michael S. Tsirkin <mst@redhat.com>
>  S: Supported
> diff --git a/configure b/configure
> index c65b5f6..81108dc 100755
> --- a/configure
> +++ b/configure
> @@ -143,6 +143,7 @@ attr=""
>  libattr=""
>  xfs=""
>  
> +vfio_pci="no"
>  vhost_net="no"
>  kvm="no"
>  gprof="no"
> @@ -489,6 +490,7 @@ Haiku)
>    usb="linux"
>    kvm="yes"
>    vhost_net="yes"
> +  vfio_pci="yes"
>    if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
>      audio_possible_drivers="$audio_possible_drivers fmod"
>    fi
> @@ -824,6 +826,10 @@ for opt do
>    ;;
>    --disable-guest-agent) guest_agent="no"
>    ;;
> +  --disable-vfio-pci) vfio_pci="no"
> +  ;;
> +  --enable-vfio-pci) vfio_pci="yes"
> +  ;;

Do we need this level of control? It's an open question I find myself asking
every time a new feature gets added together with --disable/--enable switches.

>    *) echo "ERROR: unknown option $opt"; show_help="yes"
>    ;;
>    esac
> @@ -1112,6 +1118,8 @@ echo "  --disable-guest-agent    disable building of the QEMU Guest Agent"
>  echo "  --enable-guest-agent     enable building of the QEMU Guest Agent"
>  echo "  --with-coroutine=BACKEND coroutine backend. Supported options:"
>  echo "                           gthread, ucontext, sigaltstack, windows"
> +echo "  --disable-vfio-pci       disable vfio pci device assignement support"
> +echo "  --enable-vfio-pci        enable vfio pci device assignment support"
>  echo ""
>  echo "NOTE: The object files are built at the place where configure is launched"
>  exit 1
> @@ -3072,6 +3080,7 @@ echo "OpenGL support    $opengl"
>  echo "libiscsi support  $libiscsi"
>  echo "build guest agent $guest_agent"
>  echo "coroutine backend $coroutine_backend"
> +echo "VFIO PCI support  $vfio_pci"
>  
>  if test "$sdl_too_old" = "yes"; then
>  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> @@ -3754,6 +3763,9 @@ case "$target_arch2" in
>    *)
>      echo "CONFIG_NO_XEN=y" >> $config_target_mak
>  esac
> +if test "$vfio_pci" = "yes" -a "$target_softmmu" = "yes" ; then
> +  echo "CONFIG_VFIO_PCI=y" >> $config_target_mak
> +fi

Does this already somehow depend on host == Linux? If not, you may break
the others.

>  case "$target_arch2" in
>    i386|x86_64|ppcemb|ppc|ppc64|s390x)
>      # Make sure the target and host cpus are compatible
> diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
> index 8c764bb..a2783ef 100644
> --- a/hw/i386/Makefile.objs
> +++ b/hw/i386/Makefile.objs
> @@ -11,5 +11,6 @@ obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen-host-pci-device.o
>  obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o xen_pt_msi.o
>  obj-y += kvm/
>  obj-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>  
>  obj-y := $(addprefix ../,$(obj-y))
> 

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header
  2012-08-01  7:13   ` Jan Kiszka
@ 2012-08-01 18:09     ` Alex Williamson
  2012-08-02  9:02       ` Jan Kiszka
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-01 18:09 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: aik, aliguori, qemu-devel, kvm

On Wed, 2012-08-01 at 09:13 +0200, Jan Kiszka wrote:
> On 2012-08-01 07:18, Alex Williamson wrote:

> > +#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> > +
> > +#endif /* VFIO_H */
> > 
> 
> Please patch update-linux-headers.sh and let it do its work instead
> (separate patches).

Unfortunately I missed updating the Kbuild file in the kernel, so vfio.h
doesn't get installed yet.  I'll include that in my next pull request,
but maybe in the meantime I should temporarily put this header back in
hw/ unless you have a better idea.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-01  7:15   ` Jan Kiszka
@ 2012-08-01 18:14     ` Alex Williamson
  2012-08-01 19:40       ` Alex Williamson
  2012-08-13 22:19     ` Anthony Liguori
  1 sibling, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-01 18:14 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: aik, aliguori, qemu-devel, kvm

On Wed, 2012-08-01 at 09:15 +0200, Jan Kiszka wrote:
> On 2012-08-01 07:18, Alex Williamson wrote:
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> >  MAINTAINERS           |    5 +++++
> >  configure             |   12 ++++++++++++
> >  hw/i386/Makefile.objs |    1 +
> >  3 files changed, 18 insertions(+)
> > 
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 2d219d2..9680d69 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -460,6 +460,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
> >  S: Maintained
> >  F: hw/usb*
> >  
> > +VFIO
> > +M: Alex Williamson <alex.williamson@redhat.com>
> > +S: Supported
> > +F: hw/vfio*
> > +
> >  vhost
> >  M: Michael S. Tsirkin <mst@redhat.com>
> >  S: Supported
> > diff --git a/configure b/configure
> > index c65b5f6..81108dc 100755
> > --- a/configure
> > +++ b/configure
> > @@ -143,6 +143,7 @@ attr=""
> >  libattr=""
> >  xfs=""
> >  
> > +vfio_pci="no"
> >  vhost_net="no"
> >  kvm="no"
> >  gprof="no"
> > @@ -489,6 +490,7 @@ Haiku)
> >    usb="linux"
> >    kvm="yes"
> >    vhost_net="yes"
> > +  vfio_pci="yes"
> >    if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
> >      audio_possible_drivers="$audio_possible_drivers fmod"
> >    fi
> > @@ -824,6 +826,10 @@ for opt do
> >    ;;
> >    --disable-guest-agent) guest_agent="no"
> >    ;;
> > +  --disable-vfio-pci) vfio_pci="no"
> > +  ;;
> > +  --enable-vfio-pci) vfio_pci="yes"
> > +  ;;
> 
> Do we need this level of control? Open question I'm just wondering every
> time a new feature gets added together with --disable/--enable switches.

Well, I could certainly understand if some downstream wanted to ship a
qemu that didn't enable device assignment.  I'm sure they'd rather have
a config option to do that instead of needing to modify code.  I
generally find --enable useful to force an error and tell me what I'm
missing when I specifically want a feature rather than having it
silently disabled.  

> >    *) echo "ERROR: unknown option $opt"; show_help="yes"
> >    ;;
> >    esac
> > @@ -1112,6 +1118,8 @@ echo "  --disable-guest-agent    disable building of the QEMU Guest Agent"
> >  echo "  --enable-guest-agent     enable building of the QEMU Guest Agent"
> >  echo "  --with-coroutine=BACKEND coroutine backend. Supported options:"
> >  echo "                           gthread, ucontext, sigaltstack, windows"
> > +echo "  --disable-vfio-pci       disable vfio pci device assignement support"
> > +echo "  --enable-vfio-pci        enable vfio pci device assignment support"
> >  echo ""
> >  echo "NOTE: The object files are built at the place where configure is launched"
> >  exit 1
> > @@ -3072,6 +3080,7 @@ echo "OpenGL support    $opengl"
> >  echo "libiscsi support  $libiscsi"
> >  echo "build guest agent $guest_agent"
> >  echo "coroutine backend $coroutine_backend"
> > +echo "VFIO PCI support  $vfio_pci"
> >  
> >  if test "$sdl_too_old" = "yes"; then
> >  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> > @@ -3754,6 +3763,9 @@ case "$target_arch2" in
> >    *)
> >      echo "CONFIG_NO_XEN=y" >> $config_target_mak
> >  esac
> > +if test "$vfio_pci" = "yes" -a "$target_softmmu" = "yes" ; then
> > +  echo "CONFIG_VFIO_PCI=y" >> $config_target_mak
> > +fi
> 
> Does this already somehow depend on host == Linux? If not, you may break
> the others.

Hmm, probably missing that, I'll look where to add it.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-01 18:14     ` Alex Williamson
@ 2012-08-01 19:40       ` Alex Williamson
  2012-08-02  9:03         ` Jan Kiszka
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-01 19:40 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: aik, aliguori, qemu-devel, kvm

On Wed, 2012-08-01 at 12:14 -0600, Alex Williamson wrote:
> On Wed, 2012-08-01 at 09:15 +0200, Jan Kiszka wrote:
> > On 2012-08-01 07:18, Alex Williamson wrote:
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > >  MAINTAINERS           |    5 +++++
> > >  configure             |   12 ++++++++++++
> > >  hw/i386/Makefile.objs |    1 +
> > >  3 files changed, 18 insertions(+)
> > > 
> > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > index 2d219d2..9680d69 100644
> > > --- a/MAINTAINERS
> > > +++ b/MAINTAINERS
> > > @@ -460,6 +460,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
> > >  S: Maintained
> > >  F: hw/usb*
> > >  
> > > +VFIO
> > > +M: Alex Williamson <alex.williamson@redhat.com>
> > > +S: Supported
> > > +F: hw/vfio*
> > > +
> > >  vhost
> > >  M: Michael S. Tsirkin <mst@redhat.com>
> > >  S: Supported
> > > diff --git a/configure b/configure
> > > index c65b5f6..81108dc 100755
> > > --- a/configure
> > > +++ b/configure
> > > @@ -143,6 +143,7 @@ attr=""
> > >  libattr=""
> > >  xfs=""
> > >  
> > > +vfio_pci="no"
> > >  vhost_net="no"
> > >  kvm="no"
> > >  gprof="no"
> > > @@ -489,6 +490,7 @@ Haiku)
> > >    usb="linux"
> > >    kvm="yes"
> > >    vhost_net="yes"
> > > +  vfio_pci="yes"
> > >    if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
> > >      audio_possible_drivers="$audio_possible_drivers fmod"
> > >    fi
> > > @@ -824,6 +826,10 @@ for opt do
> > >    ;;
> > >    --disable-guest-agent) guest_agent="no"
> > >    ;;
> > > +  --disable-vfio-pci) vfio_pci="no"
> > > +  ;;
> > > +  --enable-vfio-pci) vfio_pci="yes"
> > > +  ;;
> > 
> > Do we need this level of control? Open question I'm just wondering every
> > time a new feature gets added together with --disable/--enable switches.
> 
> Well, I could certainly understand if some downstream wanted to ship a
> qemu that didn't enable device assignment.  I'm sure they'd rather have
> a config option to do that instead of needing to modify code.  I
> generally find --enable useful to force an error and tell me what I'm
> missing when I specifically want a feature rather than having it
> silently disabled.  
> 
> > >    *) echo "ERROR: unknown option $opt"; show_help="yes"
> > >    ;;
> > >    esac
> > > @@ -1112,6 +1118,8 @@ echo "  --disable-guest-agent    disable building of the QEMU Guest Agent"
> > >  echo "  --enable-guest-agent     enable building of the QEMU Guest Agent"
> > >  echo "  --with-coroutine=BACKEND coroutine backend. Supported options:"
> > >  echo "                           gthread, ucontext, sigaltstack, windows"
> > > +echo "  --disable-vfio-pci       disable vfio pci device assignement support"
> > > +echo "  --enable-vfio-pci        enable vfio pci device assignment support"
> > >  echo ""
> > >  echo "NOTE: The object files are built at the place where configure is launched"
> > >  exit 1
> > > @@ -3072,6 +3080,7 @@ echo "OpenGL support    $opengl"
> > >  echo "libiscsi support  $libiscsi"
> > >  echo "build guest agent $guest_agent"
> > >  echo "coroutine backend $coroutine_backend"
> > > +echo "VFIO PCI support  $vfio_pci"
> > >  
> > >  if test "$sdl_too_old" = "yes"; then
> > >  echo "-> Your SDL version is too old - please upgrade to have SDL support"
> > > @@ -3754,6 +3763,9 @@ case "$target_arch2" in
> > >    *)
> > >      echo "CONFIG_NO_XEN=y" >> $config_target_mak
> > >  esac
> > > +if test "$vfio_pci" = "yes" -a "$target_softmmu" = "yes" ; then
> > > +  echo "CONFIG_VFIO_PCI=y" >> $config_target_mak
> > > +fi
> > 
> > Does this already somehow depend on host == Linux? If not, you may break
> > the others.
> 
> Hmm, probably missing that, I'll look where to add it.  Thanks,

Or I'm just forgetful.  This is handled by the vfio_pci="yes" in the
second chunk of configure above.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header
  2012-08-01 18:09     ` Alex Williamson
@ 2012-08-02  9:02       ` Jan Kiszka
  2012-08-02 16:37         ` Alex Williamson
  0 siblings, 1 reply; 42+ messages in thread
From: Jan Kiszka @ 2012-08-02  9:02 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

[-- Attachment #1: Type: text/plain, Size: 891 bytes --]

On 2012-08-01 20:09, Alex Williamson wrote:
> On Wed, 2012-08-01 at 09:13 +0200, Jan Kiszka wrote:
>> On 2012-08-01 07:18, Alex Williamson wrote:
> 
>>> +#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>> +
>>> +#endif /* VFIO_H */
>>>
>>
>> Please patch update-linux-headers.sh and let it do its work instead
>> (separate patches).
> 
> Unfortunately I missed updating the Kbuild file in the kernel, so vfio.h
> doesn't get installed yet.  I'll include that in my next pull request,
> but maybe in the meantime I should temporarily put this header back in
> hw/ unless you have a better idea.  Thanks,

Well, if it's going to be fixed very soon, we can merge it like this.
But you should already be able to provide a patch for the update script so
that the next one running it against a fixed kernel will not
accidentally drop the vfio header again.

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-01 19:40       ` Alex Williamson
@ 2012-08-02  9:03         ` Jan Kiszka
  0 siblings, 0 replies; 42+ messages in thread
From: Jan Kiszka @ 2012-08-02  9:03 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

[-- Attachment #1: Type: text/plain, Size: 3903 bytes --]

On 2012-08-01 21:40, Alex Williamson wrote:
> On Wed, 2012-08-01 at 12:14 -0600, Alex Williamson wrote:
>> On Wed, 2012-08-01 at 09:15 +0200, Jan Kiszka wrote:
>>> On 2012-08-01 07:18, Alex Williamson wrote:
>>>> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>>>> ---
>>>>
>>>>  MAINTAINERS           |    5 +++++
>>>>  configure             |   12 ++++++++++++
>>>>  hw/i386/Makefile.objs |    1 +
>>>>  3 files changed, 18 insertions(+)
>>>>
>>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>>> index 2d219d2..9680d69 100644
>>>> --- a/MAINTAINERS
>>>> +++ b/MAINTAINERS
>>>> @@ -460,6 +460,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
>>>>  S: Maintained
>>>>  F: hw/usb*
>>>>  
>>>> +VFIO
>>>> +M: Alex Williamson <alex.williamson@redhat.com>
>>>> +S: Supported
>>>> +F: hw/vfio*
>>>> +
>>>>  vhost
>>>>  M: Michael S. Tsirkin <mst@redhat.com>
>>>>  S: Supported
>>>> diff --git a/configure b/configure
>>>> index c65b5f6..81108dc 100755
>>>> --- a/configure
>>>> +++ b/configure
>>>> @@ -143,6 +143,7 @@ attr=""
>>>>  libattr=""
>>>>  xfs=""
>>>>  
>>>> +vfio_pci="no"
>>>>  vhost_net="no"
>>>>  kvm="no"
>>>>  gprof="no"
>>>> @@ -489,6 +490,7 @@ Haiku)
>>>>    usb="linux"
>>>>    kvm="yes"
>>>>    vhost_net="yes"
>>>> +  vfio_pci="yes"
>>>>    if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
>>>>      audio_possible_drivers="$audio_possible_drivers fmod"
>>>>    fi
>>>> @@ -824,6 +826,10 @@ for opt do
>>>>    ;;
>>>>    --disable-guest-agent) guest_agent="no"
>>>>    ;;
>>>> +  --disable-vfio-pci) vfio_pci="no"
>>>> +  ;;
>>>> +  --enable-vfio-pci) vfio_pci="yes"
>>>> +  ;;
>>>
>>> Do we need this level of control? Open question I'm just wondering every
>>> time a new feature gets added together with --disable/--enable switches.
>>
>> Well, I could certainly understand if some downstream wanted to ship a
>> qemu that didn't enable device assignment.  I'm sure they'd rather have
>> a config option to do that instead of needing to modify code.  I
>> generally find --enable useful to force an error and tell me what I'm
>> missing when I specifically want a feature rather than having it
>> silently disabled.  
>>
>>>>    *) echo "ERROR: unknown option $opt"; show_help="yes"
>>>>    ;;
>>>>    esac
>>>> @@ -1112,6 +1118,8 @@ echo "  --disable-guest-agent    disable building of the QEMU Guest Agent"
>>>>  echo "  --enable-guest-agent     enable building of the QEMU Guest Agent"
>>>>  echo "  --with-coroutine=BACKEND coroutine backend. Supported options:"
>>>>  echo "                           gthread, ucontext, sigaltstack, windows"
>>>> +echo "  --disable-vfio-pci       disable vfio pci device assignement support"
>>>> +echo "  --enable-vfio-pci        enable vfio pci device assignment support"
>>>>  echo ""
>>>>  echo "NOTE: The object files are built at the place where configure is launched"
>>>>  exit 1
>>>> @@ -3072,6 +3080,7 @@ echo "OpenGL support    $opengl"
>>>>  echo "libiscsi support  $libiscsi"
>>>>  echo "build guest agent $guest_agent"
>>>>  echo "coroutine backend $coroutine_backend"
>>>> +echo "VFIO PCI support  $vfio_pci"
>>>>  
>>>>  if test "$sdl_too_old" = "yes"; then
>>>>  echo "-> Your SDL version is too old - please upgrade to have SDL support"
>>>> @@ -3754,6 +3763,9 @@ case "$target_arch2" in
>>>>    *)
>>>>      echo "CONFIG_NO_XEN=y" >> $config_target_mak
>>>>  esac
>>>> +if test "$vfio_pci" = "yes" -a "$target_softmmu" = "yes" ; then
>>>> +  echo "CONFIG_VFIO_PCI=y" >> $config_target_mak
>>>> +fi
>>>
>>> Does this already somehow depend on host == Linux? If not, you may break
>>> the others.
>>
>> Hmm, probably missing that, I'll look where to add it.  Thanks,
> 
> Or I'm just forgetful.  This is handled by the vfio_pci="yes" in the
> second chunk of configure above.  Thanks,

Ah, ok. Then it's fine.

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header
  2012-08-02  9:02       ` Jan Kiszka
@ 2012-08-02 16:37         ` Alex Williamson
  2012-08-02 16:45           ` Jan Kiszka
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-02 16:37 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: aik, aliguori, qemu-devel, kvm

On Thu, 2012-08-02 at 11:02 +0200, Jan Kiszka wrote:
> On 2012-08-01 20:09, Alex Williamson wrote:
> > On Wed, 2012-08-01 at 09:13 +0200, Jan Kiszka wrote:
> >> On 2012-08-01 07:18, Alex Williamson wrote:
> > 
> >>> +#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> >>> +
> >>> +#endif /* VFIO_H */
> >>>
> >>
> >> Please patch update-linux-headers.sh and let it do its work instead
> >> (separate patches).
> > 
> > Unfortunately I missed updating the Kbuild file in the kernel, so vfio.h
> > doesn't get installed yet.  I'll include that in my next pull request,
> > but maybe in the meantime I should temporarily put this header back in
> > hw/ unless you have a better idea.  Thanks,
> 
> Well, if it's going to be fixed very soon, we can merge it like this.
> But you should already be able provide a patch for the update script so
> that the next one running it against a fixed kernel will not
> accidentally drop the vfio header again.

What's the reason for the rm -fr; mkdir -p in the update script?  I
don't see how we'd be unintentionally generating cruft in these
directories if we were to just overwrite files instead of removing the
directory and repopulating it.  I'll get the kernel fixed soon, but I'm
tempted to add vfio and drop the expunge of the directory at the same
time.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header
  2012-08-02 16:37         ` Alex Williamson
@ 2012-08-02 16:45           ` Jan Kiszka
  0 siblings, 0 replies; 42+ messages in thread
From: Jan Kiszka @ 2012-08-02 16:45 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

[-- Attachment #1: Type: text/plain, Size: 1558 bytes --]

On 2012-08-02 18:37, Alex Williamson wrote:
> On Thu, 2012-08-02 at 11:02 +0200, Jan Kiszka wrote:
>> On 2012-08-01 20:09, Alex Williamson wrote:
>>> On Wed, 2012-08-01 at 09:13 +0200, Jan Kiszka wrote:
>>>> On 2012-08-01 07:18, Alex Williamson wrote:
>>>
>>>>> +#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>>>> +
>>>>> +#endif /* VFIO_H */
>>>>>
>>>>
>>>> Please patch update-linux-headers.sh and let it do its work instead
>>>> (separate patches).
>>>
>>> Unfortunately I missed updating the Kbuild file in the kernel, so vfio.h
>>> doesn't get installed yet.  I'll include that in my next pull request,
>>> but maybe in the meantime I should temporarily put this header back in
>>> hw/ unless you have a better idea.  Thanks,
>>
>> Well, if it's going to be fixed very soon, we can merge it like this.
>> But you should already be able provide a patch for the update script so
>> that the next one running it against a fixed kernel will not
>> accidentally drop the vfio header again.
> 
> What's the reason for the rm -fr; mkdir -p in the update script?  I
> don't see how we'd be unintentionally generating cruft in these
> directories if we were to just overwrite files instead of removing the
> directory and repopulating it.  I'll get the kernel fixed soon, but I'm
> tempted to add vfio and drop the expunge of the directory at the same
> time.  Thanks,

Well, the chances of piling up cruft are low, granted. On the other hand, this
policy enforces proper upstream header exporting, as we see...

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-01  5:18 [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Alex Williamson
                   ` (2 preceding siblings ...)
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported Alex Williamson
@ 2012-08-13 13:27 ` Anthony Liguori
  2012-08-13 13:58   ` Avi Kivity
  2012-08-13 14:23   ` Alex Williamson
  3 siblings, 2 replies; 42+ messages in thread
From: Anthony Liguori @ 2012-08-13 13:27 UTC (permalink / raw)
  To: Alex Williamson; +Cc: kvm, aik, Jan Kiszka, qemu-devel, Avi Kivity

Alex Williamson <alex.williamson@redhat.com> writes:

> VFIO kernel support was just merged into Linux, so I'd like to
> formally propose inclusion of the QEMU vfio-pci driver for
> QEMU 1.2.  Included here is support for x86 PCI device assignment.
> PCI INTx is not yet enabled, but devices making use of either MSI
> or MSI-X work.  The level irqfd and eoifd support I've proposed
> for KVM enable an accelerated patch for this through KVM.  I'd
> like to get this base driver in first and enable the remaining
> support in-tree.
>
> I've split this version up a little from the RFC to make it a bit
> easier to review.  Review comments from Blue Swirl and Avi are
> already incorporated, including Avi's requests to simplify both
> the PCI BAR mapping and unmapping paths.

Hi Alex,

Thanks for pushing this forward!  Hopefully this will finally kill off
qemu-kvm.git for good.

I think this series is going to have to wait for 1.3 to open up.  We
have a very short release window for this release and I'd feel a lot
more comfortable having such a significant feature spend some time in
the development cycle getting testing/review.

I'd like to see a few Reviewed-by's too for this series before it goes
in.  I expect they won't be hard to get but I also expect it will take a
few more revisions of this series to get there.

Regards,

Anthony Liguori

>
> This series is also available at:
>
> git://github.com/awilliam/qemu-vfio.git tags/vfio-pci-for-qemu-1.2
>
> Thanks,
>
> Alex
>
> ---
>
> Alex Williamson (3):
>       vfio: Enable vfio-pci and mark supported
>       vfio: vfio-pci device assignment driver
>       vfio: Import vfio kernel header
>
>
>  MAINTAINERS                |    5 
>  configure                  |   12 
>  hw/i386/Makefile.objs      |    1 
>  hw/vfio_pci.c              | 1853 ++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio_pci.h              |  101 ++
>  linux-headers/linux/vfio.h |  368 +++++++++
>  6 files changed, 2340 insertions(+)
>  create mode 100644 hw/vfio_pci.c
>  create mode 100644 hw/vfio_pci.h
>  create mode 100644 linux-headers/linux/vfio.h
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 13:27 ` [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Anthony Liguori
@ 2012-08-13 13:58   ` Avi Kivity
  2012-08-13 14:04     ` Jan Kiszka
  2012-08-13 14:23   ` Alex Williamson
  1 sibling, 1 reply; 42+ messages in thread
From: Avi Kivity @ 2012-08-13 13:58 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm, aik, Jan Kiszka, qemu-devel, Alex Williamson

On 08/13/2012 04:27 PM, Anthony Liguori wrote:

> Thanks for pushing this forward!  Hopefully this will finally kill off
> qemu-kvm.git for good.

No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
has.  We'll need the original device assignment code side-by-side.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 13:58   ` Avi Kivity
@ 2012-08-13 14:04     ` Jan Kiszka
  2012-08-13 19:31       ` Anthony Liguori
  0 siblings, 1 reply; 42+ messages in thread
From: Jan Kiszka @ 2012-08-13 14:04 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Anthony Liguori, kvm@vger.kernel.org, aik@ozlabs.ru,
	qemu-devel@nongnu.org, Alex Williamson

On 2012-08-13 15:58, Avi Kivity wrote:
> On 08/13/2012 04:27 PM, Anthony Liguori wrote:
> 
>> Thanks for pushing this forward!  Hopefully this will finally kill off
>> qemu-kvm.git for good.
> 
> No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
> has.  We'll need the original device assignment code side-by-side.

...which is on my to-do list for 1.3.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 13:27 ` [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Anthony Liguori
  2012-08-13 13:58   ` Avi Kivity
@ 2012-08-13 14:23   ` Alex Williamson
  2012-08-13 15:48     ` Andreas Hartmann
  2012-08-13 19:33     ` Anthony Liguori
  1 sibling, 2 replies; 42+ messages in thread
From: Alex Williamson @ 2012-08-13 14:23 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Andreas Hartmann, kvm, aik, Jan Kiszka, qemu-devel, Blue Swirl,
	Avi Kivity

On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
> Alex Williamson <alex.williamson@redhat.com> writes:
> 
> > VFIO kernel support was just merged into Linux, so I'd like to
> > formally propose inclusion of the QEMU vfio-pci driver for
> > QEMU 1.2.  Included here is support for x86 PCI device assignment.
> > PCI INTx is not yet enabled, but devices making use of either MSI
> > or MSI-X work.  The level irqfd and eoifd support I've proposed
> > for KVM enable an accelerated patch for this through KVM.  I'd
> > like to get this base driver in first and enable the remaining
> > support in-tree.
> >
> > I've split this version up a little from the RFC to make it a bit
> > easier to review.  Review comments from Blue Swirl and Avi are
> > already incorporated, including Avi's requests to simplify both
> > the PCI BAR mapping and unmapping paths.
> 
> Hi Alex,
> 
> Thanks for pushing this forward!  Hopefully this will finally kill off
> qemu-kvm.git for good.
> 
> I think this series is going to have to wait for 1.3 to open up.  We
> have a very short release window for this release and I'd feel a lot
> more comfortable having such a significant feature spend some time in
> the development cycle getting testing/review.
> 
> I'd like to see a few Reviewed-by's too for this series before it goes
> in.  I expect they won't be hard to get but I also expect it will take a
> few more revisions of this series to get there.

That's disappointing, but I can understand your reluctance.  Blue Swirl
reviewed the RFC and could perhaps add a Reviewed-by.  Alexey has been
working on the POWER port and I'm sure could provide a Reviewed-by.  We
also have a few early adopters that are already making use of this code.
In favor of accepting it: the driver is entirely self-contained, so there's
really no risk to the rest of qemu.  The only missing functionality is
legacy interrupt support.  Perhaps there's a compromise where this
driver could be considered a tech preview in 1.2 (x-vfio-pci?).  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 14:23   ` Alex Williamson
@ 2012-08-13 15:48     ` Andreas Hartmann
  2012-08-13 16:14       ` Alex Williamson
  2012-08-13 19:33     ` Anthony Liguori
  1 sibling, 1 reply; 42+ messages in thread
From: Andreas Hartmann @ 2012-08-13 15:48 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Anthony Liguori, kvm, aik, Jan Kiszka, qemu-devel, Blue Swirl,
	Avi Kivity

Alex Williamson wrote:
> On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
>> Alex Williamson <alex.williamson@redhat.com> writes:
>>
>>> VFIO kernel support was just merged into Linux, so I'd like to
>>> formally propose inclusion of the QEMU vfio-pci driver for
>>> QEMU 1.2.  Included here is support for x86 PCI device assignment.
>>> PCI INTx is not yet enabled, but devices making use of either MSI
>>> or MSI-X work.  The level irqfd and eoifd support I've proposed
>>> for KVM enable an accelerated patch for this through KVM.  I'd
>>> like to get this base driver in first and enable the remaining
>>> support in-tree.
>>>
>>> I've split this version up a little from the RFC to make it a bit
>>> easier to review.  Review comments from Blue Swirl and Avi are
>>> already incorporated, including Avi's requests to simplify both
>>> the PCI BAR mapping and unmapping paths.
>>
>> Hi Alex,
>>
>> Thanks for pushing this forward!  Hopefully this will finally kill
>> off qemu-kvm.git for good.
>>
>> I think this series is going to have to wait for 1.3 to open up.  We
>> have a very short release window for this release and I'd feel a lot
>> more comfortable having such a significant feature spend some time in
>> the development cycle getting testing/review.
>>
>> I'd like to see a few Reviewed-by's too for this series before it
>> goes in.  I expect they won't be hard to get but I also expect it
>> will take a few more revisions of this series to get there.
> 
> That's disappointing, but I can understand your reluctance.  Blue
> Swirl reviewed the RFC and could perhaps add a Reviewed-by.  Alexey
> has been working on the POWER port and I'm sure could provide a
> Reviewed-by.  We also have a few early adopters that are already
> making use of this code.

I've been running qemu with the vfio patch since Jun 05, 2012
(awilliam-qemu-vfio-v0.14.0-rc0-6402-g323cf9f.tar.gz). I didn't
encounter any problem so far.

If you like, I could compile a more recent version, too (if there have
been any changes).

To see more about my use case:
http://permalink.gmane.org/gmane.linux.drivers.rt2x00.user/1051

You may add a Tested-by Andreas Hartmann <andihartmann@01019freenet.de>
if you like.

Unfortunately, I'm only running the vfio VM (kvm) with this version of
qemu, but I'm running 4 other VMs in parallel with the unchanged version
of qemu (kvm-0.15.0-123.2.x86_64), too.
One of these 4 VMs uses PCIe passthrough.

I now tried to run all VMs with the new version of qemu. At this point,
I unfortunately ran into a problem with the VM which passes through a
PCIe device. The error message is (during start of VM):

virsh start VM
error: Failed to start domain VM
error: internal error process exited while connecting to monitor: qemu-system-x86_64: -device pci-assign,host=04:00.0,id=hostdev0,bus=pci.0,addr=0x6: Parameter 'driver' expects device type

The xml file for libvirt looks like this: 

<domain type='kvm'>
  <name>VM</name>
  <uuid>44444444-4444-2222-7777-111111111111</uuid>
  <memory>1048576</memory>
  <currentMemory>262144</currentMemory>
  <vcpu>1</vcpu>
  <os>
    <type arch='x86_64' machine='pc-0.14'>hvm</type>
    <boot dev='hd'/>
  </os>
  <features>
    <acpi/>
    <pae/>
  </features>
  <clock offset='utc'/>
  <on_poweroff>destroy</on_poweroff>
  <on_reboot>restart</on_reboot>
  <on_crash>destroy</on_crash>
  <devices>
    <emulator>/usr/local/bin/qemu-system-x86_64</emulator>

    <hostdev mode='subsystem' type='pci' managed='yes'>
      <source>
        <address domain='0x0000' bus='0x04' slot='0x00' function='0x0'/>
      </source>
    </hostdev>

    <disk type='file' device='disk'>
      <source file='/hds/vm.qed' type='aio'/>
      <target dev='vda' bus='virtio'/>
      <driver name='qemu' type='qed' cache='none'/>
    </disk>

    <input type='mouse' bus='ps2'/>
    <graphics type='vnc' autoport='yes'/>
    <video>
      <model type='cirrus' vram='9216' heads='1'/>
    </video>
  </devices>
</domain>


Maybe, this is fixed in a newer version of qemu for vfio?


Thanks,
kind regards,
Andreas Hartmann

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 15:48     ` Andreas Hartmann
@ 2012-08-13 16:14       ` Alex Williamson
  2012-08-13 16:36         ` Andreas Hartmann
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-13 16:14 UTC (permalink / raw)
  To: Andreas Hartmann
  Cc: Anthony Liguori, kvm, aik, Jan Kiszka, qemu-devel, Blue Swirl,
	Avi Kivity

On Mon, 2012-08-13 at 17:48 +0200, Andreas Hartmann wrote:
> Alex Williamson wrote:
> > On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
> >> Alex Williamson <alex.williamson@redhat.com> writes:
> >>
> >>> VFIO kernel support was just merged into Linux, so I'd like to
> >>> formally propose inclusion of the QEMU vfio-pci driver for
> >>> QEMU 1.2.  Included here is support for x86 PCI device assignment.
> >>> PCI INTx is not yet enabled, but devices making use of either MSI
> >>> or MSI-X work.  The level irqfd and eoifd support I've proposed
> >>> for KVM enable an accelerated patch for this through KVM.  I'd
> >>> like to get this base driver in first and enable the remaining
> >>> support in-tree.
> >>>
> >>> I've split this version up a little from the RFC to make it a bit
> >>> easier to review.  Review comments from Blue Swirl and Avi are
> >>> already incorporated, including Avi's requests to simplify both
> >>> the PCI BAR mapping and unmapping paths.
> >>
> >> Hi Alex,
> >>
> >> Thanks for pushing this forward!  Hopefully this will finally kill
> >> off qemu-kvm.git for good.
> >>
> >> I think this series is going to have to wait for 1.3 to open up.  We
> >> have a very short release window for this release and I'd feel a lot
> >> more comfortable having such a significant feature spend some time in
> >> the development cycle getting testing/review.
> >>
> >> I'd like to see a few Reviewed-by's too for this series before it
> >> goes in.  I expect they won't be hard to get but I also expect it
> >> will take a few more revisions of this series to get there.
> > 
> > That's disappointing, but I can understand your reluctance.  Blue
> > Swirl reviewed the RFC and could perhaps add a Reviewed-by.  Alexey
> > has been working on the POWER port and I'm sure could provide a
> > Reviewed-by.  We also have a few early adopters that are already
> > making use of this code.
> 
> I'm running qemu with vfio patch since Jun 05, 2012
> (awilliam-qemu-vfio-v0.14.0-rc0-6402-g323cf9f.tar.gz). I didn't
> encounter any problem so far.
> 
> If you like, I could compile a more actual version, too (if there have
> been any changes).

The only change in the version proposed for qemu is that legacy
interrupt support has been removed until we can agree on interfaces in
kvm and plumb an EOI path through qemu.  IIRC, the devices you're using
require legacy interrupt support.

> To see more about my use case:
> http://permalink.gmane.org/gmane.linux.drivers.rt2x00.user/1051
> 
> You may add a Tested-by Andreas Hartmann <andihartmann@01019freenet.de>
> if you like.
> 
> Unfortunately, I'm only running the vfio VM (kvm) with this version of
> qemu, but I'm running parallel 4 other VM's with the unchanged version
> of qemu (kvm-0.15.0-123.2.x86_64), too.
> One of these 4 VM's uses PCIe passthrough.
> 
> I now tried to run all VMs with the new version of qemu. At this point,
> I unfortunately run into a problem with the VM which passes through a
> PCIe device. The error message is (during start of VM):
> 
> virsh start VM
> error: Failed to start domain VM
> error: internal error process exited while connecting to monitor: qemu-system-x86_64: -device pci-assign,host=04:00.0,id=hostdev0,bus=pci.0,addr=0x6: Parameter 'driver' expects device type
> 
> The xml file for libvirt looks like this: 

libvirt doesn't yet support vfio-pci and current qemu.git doesn't yet
support pci-assign.  To use libvirt for device assignment, your only
option right now is to use a qemu-kvm.git based version of qemu.
Thanks,

Alex

> <domain type='kvm'>
>   <name>VM</name>
>   <uuid>44444444-4444-2222-7777-111111111111</uuid>
>   <memory>1048576</memory>
>   <currentMemory>262144</currentMemory>
>   <vcpu>1</vcpu>
>   <os>
>     <type arch='x86_64' machine='pc-0.14'>hvm</type>
>     <boot dev='hd'/>
>   </os>
>   <features>
>     <acpi/>
>     <pae/>
>   </features>
>   <clock offset='utc'/>
>   <on_poweroff>destroy</on_poweroff>
>   <on_reboot>restart</on_reboot>
>   <on_crash>destroy</on_crash>
>   <devices>
>     <emulator>/usr/local/bin/qemu-system-x86_64</emulator>
> 
>     <hostdev mode='subsystem' type='pci' managed='yes'>
>       <source>
>         <address domain='0x0000' bus='0x04' slot='0x00' function='0x0'/>
>       </source>
>     </hostdev>
> 
>     <disk type='file' device='disk'>
>       <source file='/hds/vm.qed' type='aio'/>
>       <target dev='vda' bus='virtio'/>
>       <driver name='qemu' type='qed' cache='none'/>
>     </disk>
> 
>     <input type='mouse' bus='ps2'/>
>     <graphics type='vnc' autoport='yes'/>
>     <video>
>       <model type='cirrus' vram='9216' heads='1'/>
>     </video>
>   </devices>
> </domain>
> 
> 
> Maybe, this is fixed in a newer version of qemu for vfio?
> 
> 
> Thanks,
> kind regards,
> Andreas Hartmann

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 16:14       ` Alex Williamson
@ 2012-08-13 16:36         ` Andreas Hartmann
  2012-08-13 16:57           ` Alex Williamson
  0 siblings, 1 reply; 42+ messages in thread
From: Andreas Hartmann @ 2012-08-13 16:36 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Andreas Hartmann, kvm, aik, Jan Kiszka, Anthony Liguori,
	qemu-devel, Blue Swirl, Avi Kivity

Alex Williamson schrieb:
> On Mon, 2012-08-13 at 17:48 +0200, Andreas Hartmann wrote:
>> Alex Williamson wrote:
>>> On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
>>>> Alex Williamson <alex.williamson@redhat.com> writes:
>>>>
>>>>> VFIO kernel support was just merged into Linux, so I'd like to
>>>>> formally propose inclusion of the QEMU vfio-pci driver for
>>>>> QEMU 1.2.  Included here is support for x86 PCI device assignment.
>>>>> PCI INTx is not yet enabled, but devices making use of either MSI
>>>>> or MSI-X work.  The level irqfd and eoifd support I've proposed
>>>>> for KVM enable an accelerated patch for this through KVM.  I'd
>>>>> like to get this base driver in first and enable the remaining
>>>>> support in-tree.
>>>>>
>>>>> I've split this version up a little from the RFC to make it a bit
>>>>> easier to review.  Review comments from Blue Swirl and Avi are
>>>>> already incorporated, including Avi's requests to simplify both
>>>>> the PCI BAR mapping and unmapping paths.
>>>>
>>>> Hi Alex,
>>>>
>>>> Thanks for pushing this forward!  Hopefully this will finally kill
>>>> off qemu-kvm.git for good.
>>>>
>>>> I think this series is going to have to wait for 1.3 to open up.  We
>>>> have a very short release window for this release and I'd feel a lot
>>>> more comfortable having such a significant feature spend some time in
>>>> the development cycle getting testing/review.
>>>>
>>>> I'd like to see a few Reviewed-by's too for this series before it
>>>> goes in.  I expect they won't be hard to get but I also expect it
>>>> will take a few more revisions of this series to get there.
>>>
>>> That's disappointing, but I can understand your reluctance.  Blue
>>> Swirl reviewed the RFC and could perhaps add a Reviewed-by.  Alexey
>>> has been working on the POWER port and I'm sure could provide a
>>> Reviewed-by.  We also have a few early adopters that are already
>>> making use of this code.
>>
>> I'm running qemu with vfio patch since Jun 05, 2012
>> (awilliam-qemu-vfio-v0.14.0-rc0-6402-g323cf9f.tar.gz). I didn't
>> encounter any problem so far.
>>
>> If you like, I could compile a more actual version, too (if there have
>> been any changes).
> 
> The only change in the version proposed for qemu is that legacy
> interrupt support has been removed until we can agree on interfaces in
> kvm and plumb an EOI path through qemu.  IIRC, the devices you're using
> require legacy interrupt support.
> 
>> To see more about my use case:
>> http://permalink.gmane.org/gmane.linux.drivers.rt2x00.user/1051
>>
>> You may add a Tested-by Andreas Hartmann <andihartmann@01019freenet.de>
>> if you like.
>>
>> Unfortunately, I'm only running the vfio VM (kvm) with this version of
>> qemu, but I'm running parallel 4 other VM's with the unchanged version
>> of qemu (kvm-0.15.0-123.2.x86_64), too.
>> One of these 4 VM's uses PCIe passthrough.
>>
>> I now tried to run all VMs with the new version of qemu. At this point,
>> I unfortunately run into a problem with the VM which passes through a
>> PCIe device. The error message is (during start of VM):
>>
>> virsh start VM
>> error: Failed to start domain VM
>> error: internal error process exited while connecting to monitor: qemu-system-x86_64: -device pci-assign,host=04:00.0,id=hostdev0,bus=pci.0,addr=0x6: Parameter 'driver' expects device type
>>
>> The xml file for libvirt looks like this: 
> 
> libvirt doesn't yet support vfio-pci and current qemu.git doesn't yet
> support pci-assign.  To use libvirt for device assignment, your only
> option right now is to use a qemu-kvm.git based version of qemu.

If I'm using your qemu instead of qemu from kvm-0.15 (opensuse package),
this error comes up when passing through a PCIe device, which works
absolutely fine with kvm 0.15. I would have expected that your qemu
works with the legacy way of handling pcie passthrough, too (with
the pci-stub module).

This would mean that all users get errors if they use the traditional
way. IOW: there are changes needed (which?) to move from kvm 0.15 to
your qemu version.

> 
>> <domain type='kvm'>
>>   <name>VM</name>
>>   <uuid>44444444-4444-2222-7777-111111111111</uuid>
>>   <memory>1048576</memory>
>>   <currentMemory>262144</currentMemory>
>>   <vcpu>1</vcpu>
>>   <os>
>>     <type arch='x86_64' machine='pc-0.14'>hvm</type>
>>     <boot dev='hd'/>
>>   </os>
>>   <features>
>>     <acpi/>
>>     <pae/>
>>   </features>
>>   <clock offset='utc'/>
>>   <on_poweroff>destroy</on_poweroff>
>>   <on_reboot>restart</on_reboot>
>>   <on_crash>destroy</on_crash>
>>   <devices>
>>     <emulator>/usr/local/bin/qemu-system-x86_64</emulator>
>>
>>     <hostdev mode='subsystem' type='pci' managed='yes'>
>>       <source>
>>         <address domain='0x0000' bus='0x04' slot='0x00' function='0x0'/>
>>       </source>
>>     </hostdev>
>>
>>     <disk type='file' device='disk'>
>>       <source file='/hds/vm.qed' type='aio'/>
>>       <target dev='vda' bus='virtio'/>
>>       <driver name='qemu' type='qed' cache='none'/>
>>     </disk>
>>
>>     <input type='mouse' bus='ps2'/>
>>     <graphics type='vnc' autoport='yes'/>
>>     <video>
>>       <model type='cirrus' vram='9216' heads='1'/>
>>     </video>
>>   </devices>
>> </domain>
>>
>>
>> Maybe, this is fixed in a newer version of qemu for vfio?

Kind regards,
Andreas

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 16:36         ` Andreas Hartmann
@ 2012-08-13 16:57           ` Alex Williamson
  2012-08-13 18:32             ` Andreas Hartmann
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-13 16:57 UTC (permalink / raw)
  To: Andreas Hartmann
  Cc: Anthony Liguori, kvm, aik, Jan Kiszka, qemu-devel, Blue Swirl,
	Avi Kivity

On Mon, 2012-08-13 at 18:36 +0200, Andreas Hartmann wrote:
> Alex Williamson schrieb:
> > On Mon, 2012-08-13 at 17:48 +0200, Andreas Hartmann wrote:
> >> Alex Williamson wrote:
> >>> On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
> >>>> Alex Williamson <alex.williamson@redhat.com> writes:
> >>>>
> >>>>> VFIO kernel support was just merged into Linux, so I'd like to
> >>>>> formally propose inclusion of the QEMU vfio-pci driver for
> >>>>> QEMU 1.2.  Included here is support for x86 PCI device assignment.
> >>>>> PCI INTx is not yet enabled, but devices making use of either MSI
> >>>>> or MSI-X work.  The level irqfd and eoifd support I've proposed
> >>>>> for KVM enable an accelerated patch for this through KVM.  I'd
> >>>>> like to get this base driver in first and enable the remaining
> >>>>> support in-tree.
> >>>>>
> >>>>> I've split this version up a little from the RFC to make it a bit
> >>>>> easier to review.  Review comments from Blue Swirl and Avi are
> >>>>> already incorporated, including Avi's requests to simplify both
> >>>>> the PCI BAR mapping and unmapping paths.
> >>>>
> >>>> Hi Alex,
> >>>>
> >>>> Thanks for pushing this forward!  Hopefully this will finally kill
> >>>> off qemu-kvm.git for good.
> >>>>
> >>>> I think this series is going to have to wait for 1.3 to open up.  We
> >>>> have a very short release window for this release and I'd feel a lot
> >>>> more comfortable having such a significant feature spend some time in
> >>>> the development cycle getting testing/review.
> >>>>
> >>>> I'd like to see a few Reviewed-by's too for this series before it
> >>>> goes in.  I expect they won't be hard to get but I also expect it
> >>>> will take a few more revisions of this series to get there.
> >>>
> >>> That's disappointing, but I can understand your reluctance.  Blue
> >>> Swirl reviewed the RFC and could perhaps add a Reviewed-by.  Alexey
> >>> has been working on the POWER port and I'm sure could provide a
> >>> Reviewed-by.  We also have a few early adopters that are already
> >>> making use of this code.
> >>
> >> I'm running qemu with vfio patch since Jun 05, 2012
> >> (awilliam-qemu-vfio-v0.14.0-rc0-6402-g323cf9f.tar.gz). I didn't
> >> encounter any problem so far.
> >>
> >> If you like, I could compile a more actual version, too (if there have
> >> been any changes).
> > 
> > The only change in the version proposed for qemu is that legacy
> > interrupt support has been removed until we can agree on interfaces in
> > kvm and plumb an EOI path through qemu.  IIRC, the devices you're using
> > require legacy interrupt support.
> > 
> >> To see more about my use case:
> >> http://permalink.gmane.org/gmane.linux.drivers.rt2x00.user/1051
> >>
> >> You may add a Tested-by Andreas Hartmann <andihartmann@01019freenet.de>
> >> if you like.
> >>
> >> Unfortunately, I'm only running the vfio VM (kvm) with this version of
> >> qemu, but I'm running parallel 4 other VM's with the unchanged version
> >> of qemu (kvm-0.15.0-123.2.x86_64), too.
> >> One of these 4 VM's uses PCIe passthrough.
> >>
> >> I now tried to run all VMs with the new version of qemu. At this point,
> >> I unfortunately run into a problem with the VM which passes through a
> >> PCIe device. The error message is (during start of VM):
> >>
> >> virsh start VM
> >> error: Failed to start domain VM
> >> error: internal error process exited while connecting to monitor: qemu-system-x86_64: -device pci-assign,host=04:00.0,id=hostdev0,bus=pci.0,addr=0x6: Parameter 'driver' expects device type
> >>
> >> The xml file for libvirt looks like this: 
> > 
> > libvirt doesn't yet support vfio-pci and current qemu.git doesn't yet
> > support pci-assign.  To use libvirt for device assignment, your only
> > option right now is to use a qemu-kvm.git based version of qemu.
> 
> If I'm using your qemu instead of qemu from kvm-0.15 (opensuse package),
> this error comes up when passing through a PCIe device, which works
> absolutely fine with kvm 0.15. I would have expected, that your qemu
> works with the legacy way of handling pcie passthrough, too (with
> pci-stub module).

VFIO cannot work with pci-stub; the backends are fundamentally
different.  KVM making use of pci-stub to hold onto a device is actually
one of the design problems that VFIO is meant to correct.  The other
significant interface change is use of IOMMU groups, which is actually
why VFIO works for some of your uses while pci-assign does not.

> This would mean, that all users get errors if they use the traditional
> way. IOW: there are changes needed (which?) to move from kvm 0.15 to
> your qemu version.

But the way we solve this is to make libvirt understand how to do both.
Then it can probe the qemu/kvm binary and host system to figure out
which is supported and use the correct device options based on what it
finds.  Trying to do both via the same qemu command line doesn't make
sense to me, especially when the device setup is so different.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 16:57           ` Alex Williamson
@ 2012-08-13 18:32             ` Andreas Hartmann
  0 siblings, 0 replies; 42+ messages in thread
From: Andreas Hartmann @ 2012-08-13 18:32 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Anthony Liguori, kvm, aik, Jan Kiszka, qemu-devel, Blue Swirl,
	Avi Kivity

Alex Williamson wrote:
> On Mon, 2012-08-13 at 18:36 +0200, Andreas Hartmann wrote:
[...]
>> If I'm using your qemu instead of qemu from kvm-0.15 (opensuse package),
>> this error comes up when passing through a PCIe device, which works
>> absolutely fine with kvm 0.15. I would have expected, that your qemu
>> works with the legacy way of handling pcie passthrough, too (with
>> pci-stub module).
> 
> VFIO cannot work with pci-stub, the backends are fundamentally
> different.  KVM making use pci-stub to hold onto a device is actually
> one of the design problems that VFIO is meant to correct.  The other
> significant interface change is use of IOMMU groups, which is actually
> why VFIO works for some of your uses while pci-assign does not.
> 
>> This would mean, that all users get errors if they use the traditional
>> way. IOW: there are changes needed (which?) to move from kvm 0.15 to
>> your qemu version.
> 
> But the way we solve this is to make libvirt understand how to do both.
> Then it can probe the qemu/kvm binary and host system to figure out
> which is supported and use the correct device options based on what it
> finds.  Trying to do both via the same qemu command line doesn't make
> sense to me, especially when the device setup is so different.

Ok! I thought your qemu version still would understand the old way via
pci-stub. But this obviously was a misunderstanding of mine :-(.


Thanks,
Andreas

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 14:04     ` Jan Kiszka
@ 2012-08-13 19:31       ` Anthony Liguori
  2012-08-14  7:19         ` Jan Kiszka
                           ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: Anthony Liguori @ 2012-08-13 19:31 UTC (permalink / raw)
  To: Jan Kiszka, Avi Kivity
  Cc: kvm@vger.kernel.org, aik@ozlabs.ru, qemu-devel@nongnu.org,
	Alex Graf, Alex Williamson, Cole Robinson

Jan Kiszka <jan.kiszka@siemens.com> writes:

> On 2012-08-13 15:58, Avi Kivity wrote:
>> On 08/13/2012 04:27 PM, Anthony Liguori wrote:
>> 
>>> Thanks for pushing this forward!  Hopefully this will finally kill off
>>> qemu-kvm.git for good.
>> 
>> No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
>> has.  We'll need the original device assignment code side-by-side.
>
> ...which is on my to-do list for 1.3.

Is there a deprecation plan for the old device assignment code?

I'm not really against the idea of requiring a new kernel for new
features.

From a Fedora/OpenSUSE point of view, would supporting old kernels be a
requirement to stop shipping qemu-kvm.git over qemu.git?

Since distros ship new kernels and new userspaces, I don't think distros
would care so I'm not sure who we're trying to support old kernels for.

Regards,

Anthony Liguori

>
> Jan
>
> -- 
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 14:23   ` Alex Williamson
  2012-08-13 15:48     ` Andreas Hartmann
@ 2012-08-13 19:33     ` Anthony Liguori
  2012-08-13 20:48       ` Blue Swirl
  2012-08-13 20:55       ` [Qemu-devel] VFIO: Call for reviewers (was Re: [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2) Alex Williamson
  1 sibling, 2 replies; 42+ messages in thread
From: Anthony Liguori @ 2012-08-13 19:33 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Andreas Hartmann, kvm, aik, Jan Kiszka, qemu-devel, Blue Swirl,
	Avi Kivity

Alex Williamson <alex.williamson@redhat.com> writes:

> On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
>> Alex Williamson <alex.williamson@redhat.com> writes:
>> 
>> > VFIO kernel support was just merged into Linux, so I'd like to
>> > formally propose inclusion of the QEMU vfio-pci driver for
>> > QEMU 1.2.  Included here is support for x86 PCI device assignment.
>> > PCI INTx is not yet enabled, but devices making use of either MSI
>> > or MSI-X work.  The level irqfd and eoifd support I've proposed
>> > for KVM enable an accelerated patch for this through KVM.  I'd
>> > like to get this base driver in first and enable the remaining
>> > support in-tree.
>> >
>> > I've split this version up a little from the RFC to make it a bit
>> > easier to review.  Review comments from Blue Swirl and Avi are
>> > already incorporated, including Avi's requests to simplify both
>> > the PCI BAR mapping and unmapping paths.
>> 
>> Hi Alex,
>> 
>> Thanks for pushing this forward!  Hopefully this will finally kill off
>> qemu-kvm.git for good.
>> 
>> I think this series is going to have to wait for 1.3 to open up.  We
>> have a very short release window for this release and I'd feel a lot
>> more comfortable having such a significant feature spend some time in
>> the development cycle getting testing/review.
>> 
>> I'd like to see a few Reviewed-by's too for this series before it goes
>> in.  I expect they won't be hard to get but I also expect it will take a
>> few more revisions of this series to get there.
>
> That's disappointing, but I can understand your reluctance.  Blue Swirl
> reviewed the RFC and could perhaps add a Reviewed-by.  Alexey has been
> working on the POWER port and I'm sure could provide a Reviewed-by.  We
> also have a few early adopters that are already making use of this code.
> Towards accepting it, the driver is entirely self contained, there's
> really no risk to the rest of qemu.  The only missing functionality is
> legacy interrupt support.  Perhaps there's a compromise where this
> driver could be considered a tech preview in 1.2 (x-vfio-pci?).
> Thanks,

Yeah, if a few people were willing to at least give an Acked-by by
Wednesday, I'd be okay taking this in a "preview" or something like
that.

I wouldn't bother renaming it or anything like that.  We can just
declare in the release notes that it's an experimental feature and may
eat your lunch while you're not looking.

Regards,

Anthony Liguori

>
> Alex
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 19:33     ` Anthony Liguori
@ 2012-08-13 20:48       ` Blue Swirl
  2012-08-13 20:56         ` Alex Williamson
  2012-08-13 20:55       ` [Qemu-devel] VFIO: Call for reviewers (was Re: [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2) Alex Williamson
  1 sibling, 1 reply; 42+ messages in thread
From: Blue Swirl @ 2012-08-13 20:48 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Andreas Hartmann, kvm, aik, Jan Kiszka, qemu-devel,
	Alex Williamson, Avi Kivity

On Mon, Aug 13, 2012 at 7:33 PM, Anthony Liguori <aliguori@us.ibm.com> wrote:
> Alex Williamson <alex.williamson@redhat.com> writes:
>
>> On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
>>> Alex Williamson <alex.williamson@redhat.com> writes:
>>>
>>> > VFIO kernel support was just merged into Linux, so I'd like to
>>> > formally propose inclusion of the QEMU vfio-pci driver for
>>> > QEMU 1.2.  Included here is support for x86 PCI device assignment.
>>> > PCI INTx is not yet enabled, but devices making use of either MSI
>>> > or MSI-X work.  The level irqfd and eoifd support I've proposed
>>> > for KVM enable an accelerated patch for this through KVM.  I'd
>>> > like to get this base driver in first and enable the remaining
>>> > support in-tree.
>>> >
>>> > I've split this version up a little from the RFC to make it a bit
>>> > easier to review.  Review comments from Blue Swirl and Avi are
>>> > already incorporated, including Avi's requests to simplify both
>>> > the PCI BAR mapping and unmapping paths.
>>>
>>> Hi Alex,
>>>
>>> Thanks for pushing this forward!  Hopefully this will finally kill off
>>> qemu-kvm.git for good.
>>>
>>> I think this series is going to have to wait for 1.3 to open up.  We
>>> have a very short release window for this release and I'd feel a lot
>>> more comfortable having such a significant feature spend some time in
>>> the development cycle getting testing/review.
>>>
>>> I'd like to see a few Reviewed-by's too for this series before it goes
>>> in.  I expect they won't be hard to get but I also expect it will take a
>>> few more revisions of this series to get there.
>>
>> That's disappointing, but I can understand your reluctance.  Blue Swirl
>> reviewed the RFC and could perhaps add a Reviewed-by.  Alexey has been
>> working on the POWER port and I'm sure could provide a Reviewed-by.  We
>> also have a few early adopters that are already making use of this code.
>> Towards accepting it, the driver is entirely self contained, there's
>> really no risk to the rest of qemu.  The only missing functionality is
>> legacy interrupt support.  Perhaps there's a compromise where this
>> driver could be considered a tech preview in 1.2 (x-vfio-pci?).
>> Thanks,
>
> Yeah, if a few people were willing to at least give an Acked-by by
> Wednesday, I'd be okay taking this in a "preview" or something like
> that.

Acked-by: Blue Swirl <blauwirbel@gmail.com>

>
> I wouldn't bother renaming it or anything like that.  We can just
> declare in the release notes that it's an experimental feature and may
> eat your lunch while you're not looking.
>
> Regards,
>
> Anthony Liguori
>
>>
>> Alex
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [Qemu-devel] VFIO: Call for reviewers (was Re: [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2)
  2012-08-13 19:33     ` Anthony Liguori
  2012-08-13 20:48       ` Blue Swirl
@ 2012-08-13 20:55       ` Alex Williamson
  1 sibling, 0 replies; 42+ messages in thread
From: Alex Williamson @ 2012-08-13 20:55 UTC (permalink / raw)
  To: Anthony Liguori, Alexey Kardashevskiy, Jan Kiszka, Blue Swirl
  Cc: Andreas Hartmann, kvm, qemu-devel, Avi Kivity

On Mon, 2012-08-13 at 14:33 -0500, Anthony Liguori wrote:
> Alex Williamson <alex.williamson@redhat.com> writes:
> 
> > On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
> >> Alex Williamson <alex.williamson@redhat.com> writes:
> >> 
> >> > VFIO kernel support was just merged into Linux, so I'd like to
> >> > formally propose inclusion of the QEMU vfio-pci driver for
> >> > QEMU 1.2.  Included here is support for x86 PCI device assignment.
> >> > PCI INTx is not yet enabled, but devices making use of either MSI
> >> > or MSI-X work.  The level irqfd and eoifd support I've proposed
> >> > for KVM enable an accelerated path for this through KVM.  I'd
> >> > like to get this base driver in first and enable the remaining
> >> > support in-tree.
> >> >
> >> > I've split this version up a little from the RFC to make it a bit
> >> > easier to review.  Review comments from Blue Swirl and Avi are
> >> > already incorporated, including Avi's requests to simplify both
> >> > the PCI BAR mapping and unmapping paths.
> >> 
> >> Hi Alex,
> >> 
> >> Thanks for pushing this forward!  Hopefully this will finally kill off
> >> qemu-kvm.git for good.
> >> 
> >> I think this series is going to have to wait for 1.3 to open up.  We
> >> have a very short release window for this release and I'd feel a lot
> >> more comfortable having such a significant feature spend some time in
> >> the development cycle getting testing/review.
> >> 
> >> I'd like to see a few Reviewed-by's too for this series before it goes
> >> in.  I expect they won't be hard to get but I also expect it will take a
> >> few more revisions of this series to get there.
> >
> > That's disappointing, but I can understand your reluctance.  Blue Swirl
> > reviewed the RFC and could perhaps add a Reviewed-by.  Alexey has been
> > working on the POWER port and I'm sure could provide a Reviewed-by.  We
> > also have a few early adopters that are already making use of this code.
> > Towards accepting it, the driver is entirely self contained, there's
> > really no risk to the rest of qemu.  The only missing functionality is
> > legacy interrupt support.  Perhaps there's a compromise where this
> > driver could be considered a tech preview in 1.2 (x-vfio-pci?).
> > Thanks,
> 
> Yeah, if a few people were willing to at least give an Acked-by by
> Wednesday, I'd be okay taking this in a "preview" or something like
> that.

Alexey, Blue, Jan, if any of you have time to review the current VFIO
series (linked below) to help it become a preview release in 1.2 I'd
very much appreciate it.  Reviews from anyone else also appreciated.
Thanks!

http://lists.gnu.org/archive/html/qemu-devel/2012-08/msg00545.html

> I wouldn't bother renaming it or anything like that.  We can just
> declare in the release notes that it's an experimental feature and may
> eat your lunch while you're not looking.

Ok, thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 20:48       ` Blue Swirl
@ 2012-08-13 20:56         ` Alex Williamson
  0 siblings, 0 replies; 42+ messages in thread
From: Alex Williamson @ 2012-08-13 20:56 UTC (permalink / raw)
  To: Blue Swirl
  Cc: Anthony Liguori, kvm, aik, Jan Kiszka, qemu-devel, Avi Kivity,
	Andreas Hartmann

On Mon, 2012-08-13 at 20:48 +0000, Blue Swirl wrote:
> On Mon, Aug 13, 2012 at 7:33 PM, Anthony Liguori <aliguori@us.ibm.com> wrote:
> > Alex Williamson <alex.williamson@redhat.com> writes:
> >
> >> On Mon, 2012-08-13 at 08:27 -0500, Anthony Liguori wrote:
> >>> Alex Williamson <alex.williamson@redhat.com> writes:
> >>>
> >>> > VFIO kernel support was just merged into Linux, so I'd like to
> >>> > formally propose inclusion of the QEMU vfio-pci driver for
> >>> > QEMU 1.2.  Included here is support for x86 PCI device assignment.
> >>> > PCI INTx is not yet enabled, but devices making use of either MSI
> >>> > or MSI-X work.  The level irqfd and eoifd support I've proposed
> >>> > for KVM enable an accelerated path for this through KVM.  I'd
> >>> > like to get this base driver in first and enable the remaining
> >>> > support in-tree.
> >>> >
> >>> > I've split this version up a little from the RFC to make it a bit
> >>> > easier to review.  Review comments from Blue Swirl and Avi are
> >>> > already incorporated, including Avi's requests to simplify both
> >>> > the PCI BAR mapping and unmapping paths.
> >>>
> >>> Hi Alex,
> >>>
> >>> Thanks for pushing this forward!  Hopefully this will finally kill off
> >>> qemu-kvm.git for good.
> >>>
> >>> I think this series is going to have to wait for 1.3 to open up.  We
> >>> have a very short release window for this release and I'd feel a lot
> >>> more comfortable having such a significant feature spend some time in
> >>> the development cycle getting testing/review.
> >>>
> >>> I'd like to see a few Reviewed-by's too for this series before it goes
> >>> in.  I expect they won't be hard to get but I also expect it will take a
> >>> few more revisions of this series to get there.
> >>
> >> That's disappointing, but I can understand your reluctance.  Blue Swirl
> >> reviewed the RFC and could perhaps add a Reviewed-by.  Alexey has been
> >> working on the POWER port and I'm sure could provide a Reviewed-by.  We
> >> also have a few early adopters that are already making use of this code.
> >> Towards accepting it, the driver is entirely self contained, there's
> >> really no risk to the rest of qemu.  The only missing functionality is
> >> legacy interrupt support.  Perhaps there's a compromise where this
> >> driver could be considered a tech preview in 1.2 (x-vfio-pci?).
> >> Thanks,
> >
> > Yeah, if a few people were willing to at least give an Acked-by by
> > Wednesday, I'd be okay taking this in a "preview" or something like
> > that.
> 
> Acked-by: Blue Swirl <blauwirbel@gmail.com>

Thank you!  :)

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver Alex Williamson
@ 2012-08-13 22:18   ` Anthony Liguori
  2012-08-14  5:25     ` Alex Williamson
  2012-08-14  7:12   ` Stefan Hajnoczi
  2012-08-14 15:53   ` Avi Kivity
  2 siblings, 1 reply; 42+ messages in thread
From: Anthony Liguori @ 2012-08-13 22:18 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, qemu-devel, kvm

Alex Williamson <alex.williamson@redhat.com> writes:

> This adds the core of the QEMU VFIO-based PCI device assignment driver.
> To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
> and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
> module.  To assign device 0000:05:00.0 to a guest, do the following:
>
> for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
>     vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
>     device=$(cat /sys/bus/pci/devices/$dev/device)
>     if [ -e /sys/bus/pci/devices/$dev/driver ]; then
>         echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>     fi
>     echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
> done
>
> See Documentation/vfio.txt in the Linux kernel tree for further
> description of IOMMU groups and VFIO.
>
> Then launch qemu including the option:
>
> -device vfio-pci,host=0000:05:00.0
>
> Support for legacy PCI interrupts (INTx) is not yet included and will
> be added in a future update.  Both MSI and MSI-X are supported here.
>
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>
>  hw/vfio_pci.c | 1853 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio_pci.h |  101 +++
>  2 files changed, 1954 insertions(+)
>  create mode 100644 hw/vfio_pci.c
>  create mode 100644 hw/vfio_pci.h
>
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> new file mode 100644
> index 0000000..71bb1bd
> --- /dev/null
> +++ b/hw/vfio_pci.c
> @@ -0,0 +1,1853 @@
> +/*
> + * vfio based device assignment support
> + *
> + * Copyright Red Hat, Inc. 2012
> + *
> + * Authors:
> + *  Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + *  Adapted for KVM by Qumranet.
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +
> +#include <dirent.h>
> +#include <unistd.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <linux/vfio.h>
> +
> +#include "config.h"
> +#include "event_notifier.h"
> +#include "exec-memory.h"
> +#include "kvm.h"
> +#include "memory.h"
> +#include "msi.h"
> +#include "msix.h"
> +#include "qemu-error.h"
> +#include "range.h"
> +#include "vfio_pci.h"
> +
> +/* #define DEBUG_VFIO */
> +#ifdef DEBUG_VFIO
> +#define DPRINTF(fmt, ...) \
> +    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif
> +
> +#define MSIX_CAP_LENGTH 12
> +
> +static QLIST_HEAD(, VFIOContainer)
> +    container_list = QLIST_HEAD_INITIALIZER(container_list);
> +
> +static QLIST_HEAD(, VFIOGroup)
> +    group_list = QLIST_HEAD_INITIALIZER(group_list);
> +
> +static void vfio_disable_interrupts(VFIODevice *vdev);
> +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> +
> +/*
> + * Common VFIO interrupt disable
> + */
> +static void vfio_disable_irqindex(VFIODevice *vdev, int index)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
> +        .index = index,
> +        .start = 0,
> +        .count = 0,
> +    };
> +
> +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +
> +    vdev->interrupt = INT_NONE;
> +}
> +
> +/*
> + * INTx
> + */
> +static void vfio_unmask_intx(VFIODevice *vdev)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
> +        .index = VFIO_PCI_INTX_IRQ_INDEX,
> +        .start = 0,
> +        .count = 1,
> +    };
> +
> +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +}
> +
> +static void vfio_intx_interrupt(void *opaque)
> +{
> +    VFIODevice *vdev = opaque;
> +
> +    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
> +        return;
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function,
> +            'A' + vdev->intx.pin);
> +
> +    vdev->intx.pending = true;
> +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
> +}
> +
> +static void vfio_eoi(VFIODevice *vdev)
> +{
> +    if (!vdev->intx.pending) {
> +        return;
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function);
> +
> +    vdev->intx.pending = false;
> +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> +    vfio_unmask_intx(vdev);
> +}
> +
> +struct vfio_irq_set_fd {
> +    struct vfio_irq_set irq_set;
> +    int32_t fd;
> +} QEMU_PACKED;
> +
> +static void vfio_enable_intx_kvm(VFIODevice *vdev)
> +{
> +#ifdef CONFIG_KVM
> +    /*
> +     * VFIO supports an eventfd for INTx notification and an irqfd-like
> +     * mechanism for unmasking INTx.  If we could get a level irqfd in
> +     * KVM and an eventfd triggered on EOI from guest, we could interlock
> +     * these and avoid userspace for INTx.  Work in progress.
> +     */
> +#endif
> +}
> +
> +static void vfio_disable_intx_kvm(VFIODevice *vdev)
> +{
> +#ifdef CONFIG_KVM
> +    /* Same. */
> +#endif
> +}
> +
> +static void vfio_update_irq(PCIDevice *pdev)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    PCIINTxRoute route;
> +
> +    if (vdev->interrupt != INT_INTx) {
> +        return;
> +    }
> +
> +    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> +    if (!memcmp(&route, &vdev->intx.route, sizeof(route))) {
> +        return; /* Nothing changed */
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, vdev->intx.route.irq, route.irq);
> +
> +    vfio_disable_intx_kvm(vdev);
> +    /* TBD - Disable QEMU eoi notifier */
> +
> +    vdev->intx.route = route;
> +
> +    if (route.mode == PCI_INTX_DISABLED) {
> +        return;
> +    }
> +
> +    /* TBD - Enable QEMU eoi notifier */
> +    vfio_enable_intx_kvm(vdev);
> +
> +    /* Re-enable the interrupt in case we missed an EOI */
> +    vfio_eoi(vdev);
> +}
> +
> +static int vfio_enable_intx(VFIODevice *vdev)
> +{
> +    struct vfio_irq_set_fd irq_set_fd = {
> +        .irq_set = {
> +            .argsz = sizeof(irq_set_fd),
> +            .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
> +            .index = VFIO_PCI_INTX_IRQ_INDEX,
> +            .start = 0,
> +            .count = 1,
> +        },
> +    };
> +    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> +
> +    if (!pin) {
> +        return 0;
> +    }
> +
> +    vfio_disable_interrupts(vdev);
> +
> +    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> +    vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
> +                                                    vdev->intx.pin);
> +    /* TBD - Enable QEMU eoi notifier */
> +
> +    if (event_notifier_init(&vdev->intx.interrupt, 0)) {
> +        error_report("vfio: Error: event_notifier_init failed\n");
> +        return -1;
> +    }
> +
> +    irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt);
> +    qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev);
> +
> +    if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) {
> +        error_report("vfio: Error: Failed to setup INTx fd: %s\n",
> +                     strerror(errno));
> +        return -1;
> +    }
> +
> +    vfio_enable_intx_kvm(vdev);
> +
> +    vdev->interrupt = INT_INTx;
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function);
> +
> +    return 0;
> +}
> +
> +static void vfio_disable_intx(VFIODevice *vdev)
> +{
> +    int fd;
> +
> +    vfio_disable_intx_kvm(vdev);
> +    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
> +
> +    /* TBD - Disable QEMU eoi notifier */
> +
> +    fd = event_notifier_get_fd(&vdev->intx.interrupt);
> +    qemu_set_fd_handler(fd, NULL, NULL, vdev);
> +    event_notifier_cleanup(&vdev->intx.interrupt);
> +
> +    vdev->interrupt = INT_NONE;
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function);
> +}
> +
> +/*
> + * MSI/X
> + */
> +static void vfio_msi_interrupt(void *opaque)
> +{
> +    MSIVector *vec = opaque;
> +    VFIODevice *vdev = vec->vdev;
> +
> +    if (!event_notifier_test_and_clear(&vec->interrupt)) {
> +        return;
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, vec->vector);
> +
> +    if (vdev->interrupt == INT_MSIX) {
> +        msix_notify(&vdev->pdev, vec->vector);
> +    } else if (vdev->interrupt == INT_MSI) {
> +        msi_notify(&vdev->pdev, vec->vector);
> +    } else {
> +        error_report("vfio: MSI interrupt receieved, but not enabled?\n");
> +    }
> +}
> +
> +static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
> +{
> +    struct vfio_irq_set *irq_set;
> +    int ret = 0, i, argsz;
> +    int32_t *fds;
> +
> +    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
> +    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = vdev->nr_vectors;
> +    fds = (int32_t *)&irq_set->data;
> +
> +    for (i = 0; i < vdev->nr_vectors; i++) {
> +        if (!vdev->msi_vectors[i].use) {
> +            fds[i] = -1;
> +            continue;
> +        }
> +
> +        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> +    }
> +
> +    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +
> +    g_free(irq_set);
> +
> +    if (!ret) {
> +        vdev->interrupt = msix ? INT_MSIX : INT_MSI;
> +    }
> +
> +    return ret;
> +}
> +
> +static int vfio_msix_vector_use(PCIDevice *pdev,
> +                                unsigned int vector, MSIMessage msg)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    int ret, fd;
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, vector);
> +
> +    if (vdev->interrupt != INT_MSIX) {
> +        vfio_disable_interrupts(vdev);
> +    }
> +
> +    if (!vdev->msi_vectors) {
> +        vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(MSIVector));
> +    }
> +
> +    vdev->msi_vectors[vector].vdev = vdev;
> +    vdev->msi_vectors[vector].vector = vector;
> +    vdev->msi_vectors[vector].use = true;
> +
> +    msix_vector_use(pdev, vector);
> +
> +    if (event_notifier_init(&vdev->msi_vectors[vector].interrupt, 0)) {
> +        error_report("vfio: Error: event_notifier_init failed\n");
> +    }
> +
> +    fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> +
> +    /*
> +     * Attempt to enable route through KVM irqchip,
> +     * default to userspace handling if unavailable.
> +     */
> +    vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +    if (vdev->msi_vectors[vector].virq < 0 ||
> +        kvm_irqchip_add_irqfd(kvm_state, fd,
> +                              vdev->msi_vectors[vector].virq) < 0) {
> +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> +                            &vdev->msi_vectors[vector]);
> +    }
> +
> +    /*
> +     * We don't want to have the host allocate all possible MSI vectors
> +     * for a device if they're not in use, so we shutdown and incrementally
> +     * increase them as needed.
> +     */
> +    if (vdev->nr_vectors < vector + 1) {
> +        int i;
> +
> +        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
> +        vdev->nr_vectors = vector + 1;
> +        ret = vfio_enable_vectors(vdev, true);
> +        if (ret) {
> +            error_report("vfio: failed to enable vectors, %d\n", ret);
> +        }
> +
> +        /* We don't know if we've missed interrupts in the interim... */
> +        for (i = 0; i < vdev->msix->entries; i++) {
> +            if (vdev->msi_vectors[i].use) {
> +                msix_notify(&vdev->pdev, i);
> +            }
> +        }
> +    } else {
> +        struct vfio_irq_set_fd irq_set_fd = {
> +            .irq_set = {
> +                .argsz = sizeof(irq_set_fd),
> +                .flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                         VFIO_IRQ_SET_ACTION_TRIGGER,
> +                .index = VFIO_PCI_MSIX_IRQ_INDEX,
> +                .start = vector,
> +                .count = 1,
> +            },
> +            .fd = fd,
> +        };
> +        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
> +        if (ret) {
> +            error_report("vfio: failed to modify vector, %d\n", ret);
> +        }
> +        msix_notify(&vdev->pdev, vector);
> +    }
> +
> +    return 0;
> +}
> +
> +static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int vector)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    struct vfio_irq_set_fd irq_set_fd = {
> +        .irq_set = {
> +            .argsz = sizeof(irq_set_fd),
> +            .flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +                     VFIO_IRQ_SET_ACTION_TRIGGER,
> +            .index = VFIO_PCI_MSIX_IRQ_INDEX,
> +            .start = vector,
> +            .count = 1,
> +        },
> +        .fd = -1,
> +    };
> +    int fd;
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, vector);
> +
> +    /*
> +     * XXX What's the right thing to do here?  This turns off the interrupt
> +     * completely, but do we really just want to switch the interrupt to
> +     * bouncing through userspace and let msix.c drop it?  Not sure.
> +     */
> +    msix_vector_unuse(pdev, vector);
> +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
> +
> +    fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> +
> +    if (vdev->msi_vectors[vector].virq < 0) {
> +        qemu_set_fd_handler(fd, NULL, NULL, NULL);
> +    } else {
> +        kvm_irqchip_remove_irqfd(kvm_state, fd, vdev->msi_vectors[vector].virq);
> +        kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[vector].virq);
> +        vdev->msi_vectors[vector].virq = -1;
> +    }
> +
> +    event_notifier_cleanup(&vdev->msi_vectors[vector].interrupt);
> +    vdev->msi_vectors[vector].use = false;
> +}
> +
> +/* XXX This should move to msi.c */
> +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
> +{
> +    uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS);
> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +    MSIMessage msg;
> +
> +    if (msi64bit) {
> +        msg.address = pci_get_quad(pdev->config +
> +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> +    } else {
> +        msg.address = pci_get_long(pdev->config +
> +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> +    }
> +
> +    msg.data = pci_get_word(pdev->config + pdev->msi_cap +
> +                            (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
> +    msg.data += vector;
> +
> +    return msg;
> +}
> +
> +/* So should this */
> +static void msi_set_qsize(PCIDevice *pdev, uint8_t size)
> +{
> +    uint8_t *config = pdev->config + pdev->msi_cap;
> +    uint16_t flags;
> +
> +    flags = pci_get_word(config + PCI_MSI_FLAGS);
> +    flags = le16_to_cpu(flags);
> +    flags &= ~PCI_MSI_FLAGS_QSIZE;
> +    flags |= (size & 0x7) << 4;
> +    flags = cpu_to_le16(flags);
> +    pci_set_word(config + PCI_MSI_FLAGS, flags);
> +}
> +
> +static void vfio_enable_msi(VFIODevice *vdev)
> +{
> +    int ret, i;
> +
> +    vfio_disable_interrupts(vdev);
> +
> +    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
> +retry:
> +    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(MSIVector));
> +
> +    for (i = 0; i < vdev->nr_vectors; i++) {
> +        MSIMessage msg;
> +        int fd;
> +
> +        vdev->msi_vectors[i].vdev = vdev;
> +        vdev->msi_vectors[i].vector = i;
> +        vdev->msi_vectors[i].use = true;
> +
> +        if (event_notifier_init(&vdev->msi_vectors[i].interrupt, 0)) {
> +            error_report("vfio: Error: event_notifier_init failed\n");
> +        }
> +
> +        fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> +
> +        msg = msi_get_msg(&vdev->pdev, i);
> +
> +        /*
> +         * Attempt to enable route through KVM irqchip,
> +         * default to userspace handling if unavailable.
> +         */
> +        vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        if (vdev->msi_vectors[i].virq < 0 ||
> +            kvm_irqchip_add_irqfd(kvm_state, fd,
> +                                  vdev->msi_vectors[i].virq) < 0) {
> +            qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> +                                &vdev->msi_vectors[i]);
> +        }
> +    }
> +
> +    ret = vfio_enable_vectors(vdev, false);
> +    if (ret) {
> +        if (ret < 0) {
> +            error_report("vfio: Error: Failed to setup MSI fds: %s\n",
> +                         strerror(errno));
> +        } else if (ret != vdev->nr_vectors) {
> +            error_report("vfio: Error: Failed to enable %d "
> +                         "MSI vectors, retry with %d\n", vdev->nr_vectors, ret);
> +        }
> +
> +        for (i = 0; i < vdev->nr_vectors; i++) {
> +            int fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> +            if (vdev->msi_vectors[i].virq >= 0) {
> +                kvm_irqchip_remove_irqfd(kvm_state, fd,
> +                                         vdev->msi_vectors[i].virq);
> +                kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[i].virq);
> +                vdev->msi_vectors[i].virq = -1;
> +            } else {
> +                qemu_set_fd_handler(fd, NULL, NULL, NULL);
> +            }
> +            event_notifier_cleanup(&vdev->msi_vectors[i].interrupt);
> +        }
> +
> +        g_free(vdev->msi_vectors);
> +
> +        if (ret > 0 && ret != vdev->nr_vectors) {
> +            vdev->nr_vectors = ret;
> +            goto retry;
> +        }
> +        vdev->nr_vectors = 0;
> +
> +        return;
> +    }
> +
> +    msi_set_qsize(&vdev->pdev, vdev->nr_vectors);
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, vdev->nr_vectors);
> +}
> +
> +static void vfio_disable_msi_x(VFIODevice *vdev, bool msix)
> +{
> +    int i;
> +
> +    vfio_disable_irqindex(vdev, msix ? VFIO_PCI_MSIX_IRQ_INDEX :
> +                                       VFIO_PCI_MSI_IRQ_INDEX);
> +
> +    for (i = 0; i < vdev->nr_vectors; i++) {
> +        int fd;
> +
> +        if (!vdev->msi_vectors[i].use) {
> +            continue;
> +        }
> +
> +        fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> +
> +        if (vdev->msi_vectors[i].virq >= 0) {
> +            kvm_irqchip_remove_irqfd(kvm_state, fd, vdev->msi_vectors[i].virq);
> +            kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[i].virq);
> +            vdev->msi_vectors[i].virq = -1;
> +        } else {
> +            qemu_set_fd_handler(fd, NULL, NULL, NULL);
> +        }
> +
> +        if (msix) {
> +            msix_vector_unuse(&vdev->pdev, i);
> +        }
> +
> +        event_notifier_cleanup(&vdev->msi_vectors[i].interrupt);
> +    }
> +
> +    g_free(vdev->msi_vectors);
> +    vdev->msi_vectors = NULL;
> +    vdev->nr_vectors = 0;
> +
> +    if (!msix) {
> +        msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x, msi%s)\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, msix ? "x" : "");
> +
> +    vfio_enable_intx(vdev);
> +}
> +
> +/*
> + * IO Port/MMIO - Beware of the endians, VFIO is always little endian
> + */
> +static void vfio_bar_write(void *opaque, target_phys_addr_t addr,
> +                           uint64_t data, unsigned size)
> +{
> +    VFIOBAR *bar = opaque;
> +    uint8_t buf[8];
> +
> +    switch (size) {
> +    case 1:
> +        *buf = data & 0xff;
> +        break;
> +    case 2:
> +        *(uint16_t *)buf = cpu_to_le16(data);
> +        break;
> +    case 4:
> +        *(uint32_t *)buf = cpu_to_le32(data);
> +        break;
> +    default:
> +        hw_error("vfio: unsupported write size, %d bytes\n", size);
> +        break;
> +    }
> +
> +    if (pwrite(bar->fd, buf, size, bar->fd_offset + addr) != size) {
> +        error_report("%s(,0x%"PRIx64", 0x%"PRIx64", %d) failed: %s\n",
> +                     __func__, addr, data, size, strerror(errno));
> +    }
> +
> +    DPRINTF("%s(BAR%d+0x%"PRIx64", 0x%"PRIx64", %d)\n",
> +            __func__, bar->nr, addr, data, size);
> +}
> +
> +static uint64_t vfio_bar_read(void *opaque,
> +                              target_phys_addr_t addr, unsigned size)
> +{
> +    VFIOBAR *bar = opaque;
> +    uint8_t buf[8];
> +    uint64_t data = 0;
> +
> +    if (pread(bar->fd, buf, size, bar->fd_offset + addr) != size) {
> +        error_report("%s(,0x%"PRIx64", %d) failed: %s\n",
> +                     __func__, addr, size, strerror(errno));
> +        return (uint64_t)-1;
> +    }
> +
> +    switch (size) {
> +    case 1:
> +        data = buf[0];
> +        break;
> +    case 2:
> +        data = le16_to_cpu(*(uint16_t *)buf);
> +        break;
> +    case 4:
> +        data = le32_to_cpu(*(uint32_t *)buf);
> +        break;
> +    default:
> +        hw_error("vfio: unsupported read size, %d bytes\n", size);
> +        break;
> +    }
> +
> +    DPRINTF("%s(BAR%d+0x%"PRIx64", %d) = 0x%"PRIx64"\n",
> +            __func__, bar->nr, addr, size, data);
> +
> +    return data;
> +}
> +
> +static const MemoryRegionOps vfio_bar_ops = {
> +    .read = vfio_bar_read,
> +    .write = vfio_bar_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +};
> +
> +/*
> + * PCI config space
> + */
> +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    uint32_t val = 0;
> +
> +    /*
> +     * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX
> +     * capabilities, and the multifunction bit below.  We let VFIO handle
> +     * virtualizing everything else.  Performance is not a concern here.
> +     */
> +    if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
> +        (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
> +         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
> +        (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> +         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size))) {
> +
> +        val = pci_default_read_config(pdev, addr, len);
> +    } else {
> +        if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) {
> +            error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n",
> +                         __func__, vdev->host.domain, vdev->host.bus,
> +                         vdev->host.slot, vdev->host.function, addr, len,
> +                         strerror(errno));
> +            return -1;
> +        }
> +        val = le32_to_cpu(val);
> +    }
> +
> +    /* Multifunction bit is virtualized in QEMU */
> +    if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) {
> +        uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION;
> +
> +        if (len == 4) {
> +            mask <<= 16;
> +        }
> +
> +        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +            val |= mask;
> +        } else {
> +            val &= ~mask;
> +        }
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, addr, len, val);
> +
> +    return val;
> +}
> +
> +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
> +                                  uint32_t val, int len)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    uint32_t val_le = cpu_to_le32(val);
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, addr, val, len);
> +
> +    /* Write everything to VFIO, let it filter out what we can't write */
> +    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
> +        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n",
> +                     __func__, vdev->host.domain, vdev->host.bus,
> +                     vdev->host.slot, vdev->host.function, addr, val, len,
> +                     strerror(errno));
> +    }
> +
> +    /* Write standard header bits to emulation */
> +    if (addr < PCI_CONFIG_HEADER_SIZE) {
> +        pci_default_write_config(pdev, addr, val, len);
> +        return;
> +    }
> +
> +    /* MSI/MSI-X Enabling/Disabling */
> +    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> +        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
> +        int is_enabled, was_enabled = msi_enabled(pdev);
> +
> +        pci_default_write_config(pdev, addr, val, len);
> +
> +        is_enabled = msi_enabled(pdev);
> +
> +        if (!was_enabled && is_enabled) {
> +            vfio_enable_msi(vdev);
> +        } else if (was_enabled && !is_enabled) {
> +            vfio_disable_msi_x(vdev, false);
> +        }
> +    }
> +
> +    if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
> +        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
> +        int is_enabled, was_enabled = msix_enabled(pdev);
> +
> +        pci_default_write_config(pdev, addr, val, len);
> +
> +        is_enabled = msix_enabled(pdev);
> +
> +        if (!was_enabled && is_enabled) {
> +            /* vfio_msix_vector_use handles this automatically */
> +        } else if (was_enabled && !is_enabled) {
> +            vfio_disable_msi_x(vdev, true);
> +        }
> +    }
> +}
> +
> +/*
> + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> + */
> +static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova,
> +                        ram_addr_t size, void *vaddr, bool readonly)
> +{
> +    struct vfio_iommu_type1_dma_map map = {
> +        .argsz = sizeof(map),
> +        .flags = VFIO_DMA_MAP_FLAG_READ,
> +        .vaddr = (__u64)vaddr,
> +        .iova = iova,
> +        .size = size,
> +    };
> +
> +    if (!readonly) {
> +        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
> +    }
> +
> +    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
> +        DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vfio_dma_unmap(VFIOContainer *container,
> +                          target_phys_addr_t iova, ram_addr_t size)
> +{
> +    struct vfio_iommu_type1_dma_unmap unmap = {
> +        .argsz = sizeof(unmap),
> +        .flags = 0,
> +        .iova = iova,
> +        .size = size,
> +    };
> +
> +    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> +        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
> +static void vfio_listener_dummy1(MemoryListener *listener)
> +{
> +    /* We don't do batching (begin/commit) or care about logging */
> +}
> +
> +static void vfio_listener_dummy2(MemoryListener *listener,
> +                                 MemoryRegionSection *section)
> +{
> +    /* We don't do logging or care about nops */
> +}
> +
> +static void vfio_listener_dummy3(MemoryListener *listener,
> +                                 MemoryRegionSection *section,
> +                                 bool match_data, uint64_t data,
> +                                 EventNotifier *e)
> +{
> +    /* We don't care about eventfds */
> +}
> +
> +static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> +{
> +    return !memory_region_is_ram(section->mr);
> +}
> +
> +static void vfio_listener_region_add(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.listener);
> +    target_phys_addr_t iova, end;
> +    void *vaddr;
> +    int ret;
> +
> +    if (vfio_listener_skipped_section(section)) {
> +        DPRINTF("vfio: SKIPPING region_add %016lx - %016lx\n",
> +                section->offset_within_address_space,
> +                section->offset_within_address_space + section->size - 1);
> +        return;
> +    }
> +
> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> +        error_report("%s received unaligned region\n", __func__);
> +        return;
> +    }
> +
> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +    end = (section->offset_within_address_space + section->size) &
> +          TARGET_PAGE_MASK;
> +
> +    if (iova >= end) {
> +        return;
> +    }
> +
> +    vaddr = memory_region_get_ram_ptr(section->mr) +
> +            section->offset_within_region +
> +            (iova - section->offset_within_address_space);
> +
> +    DPRINTF("vfio: region_add %016lx - %016lx [%p]\n",
> +            iova, end - 1, vaddr);
> +
> +    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
> +    if (ret) {
> +        error_report("vfio_dma_map(%p, 0x%016lx, 0x%lx, %p) = %d (%s)\n",
> +                     container, iova, end - iova, vaddr, ret, strerror(errno));
> +    }
> +}
> +
> +static void vfio_listener_region_del(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.listener);
> +    target_phys_addr_t iova, end;
> +    int ret;
> +
> +    if (vfio_listener_skipped_section(section)) {
> +        DPRINTF("vfio: SKIPPING region_del %016lx - %016lx\n",
> +                section->offset_within_address_space,
> +                section->offset_within_address_space + section->size - 1);
> +        return;
> +    }
> +
> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> +        error_report("%s received unaligned region\n", __func__);
> +        return;
> +    }
> +
> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +    end = (section->offset_within_address_space + section->size) &
> +          TARGET_PAGE_MASK;
> +
> +    if (iova >= end) {
> +        return;
> +    }
> +
> +    DPRINTF("vfio: region_del %016lx - %016lx\n", iova, end - 1);
> +
> +    ret = vfio_dma_unmap(container, iova, end - iova);
> +    if (ret) {
> +        error_report("vfio_dma_unmap(%p, 0x%016lx, 0x%lx) = %d (%s)\n",
> +                     container, iova, end - iova, ret, strerror(errno));
> +    }
> +}
> +
> +static void vfio_listener_release(VFIOContainer *container)
> +{
> +    memory_listener_unregister(&container->iommu_data.listener);
> +}
> +
> +/*
> + * Interrupt setup
> + */
> +static void vfio_disable_interrupts(VFIODevice *vdev)
> +{
> +    switch (vdev->interrupt) {
> +    case INT_INTx:
> +        vfio_disable_intx(vdev);
> +        break;
> +    case INT_MSI:
> +        vfio_disable_msi_x(vdev, false);
> +        break;
> +    case INT_MSIX:
> +        vfio_disable_msi_x(vdev, true);
> +        break;
> +    }
> +}
> +
> +static int vfio_setup_msi(VFIODevice *vdev, int pos)
> +{
> +    uint16_t ctrl;
> +    bool msi_64bit, msi_maskbit;
> +    int ret, entries;
> +
> +    if (!msi_supported) {
> +        return 0;
> +    }
> +
> +    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
> +              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> +        return -1;
> +    }
> +    ctrl = le16_to_cpu(ctrl);
> +
> +    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
> +    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
> +    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
> +
> +    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
> +
> +    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
> +    if (ret < 0) {
> +        error_report("vfio: msi_init failed\n");
> +        return ret;
> +    }
> +    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
> +
> +    return 0;
> +}
> +
> +/*
> + * We don't have any control over how pci_add_capability() inserts
> + * capabilities into the chain.  In order to setup MSI-X we need a
> + * MemoryRegion for the BAR.  In order to setup the BAR and not
> + * attempt to mmap the MSI-X table area, which VFIO won't allow, we
> + * need to first look for where the MSI-X table lives.  So we
> + * unfortunately split MSI-X setup across two functions.
> + */
> +static int vfio_early_setup_msix(VFIODevice *vdev)
> +{
> +    uint8_t pos;
> +    uint16_t ctrl;
> +    uint32_t table, pba;
> +
> +    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
> +    if (!pos) {
> +        return 0;
> +    }
> +
> +    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
> +              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> +        return -1;
> +    }
> +
> +    if (pread(vdev->fd, &table, sizeof(table),
> +              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
> +        return -1;
> +    }
> +
> +    if (pread(vdev->fd, &pba, sizeof(pba),
> +              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
> +        return -1;
> +    }
> +
> +    ctrl = le16_to_cpu(ctrl);
> +    table = le32_to_cpu(table);
> +    pba = le32_to_cpu(pba);
> +
> +    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
> +    vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
> +    vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> +    vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
> +    vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
> +    vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
> +
> +    DPRINTF("%04x:%02x:%02x.%x "
> +            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, pos, vdev->msix->table_bar,
> +            vdev->msix->table_offset, vdev->msix->entries);
> +
> +    return 0;
> +}
> +
> +static int vfio_setup_msix(VFIODevice *vdev, int pos)
> +{
> +    int ret;
> +
> +    if (!msi_supported) {
> +        return 0;
> +    }
> +
> +    ret = msix_init(&vdev->pdev, vdev->msix->entries,
> +                    &vdev->bars[vdev->msix->table_bar].mem,
> +                    vdev->msix->table_bar, vdev->msix->table_offset,
> +                    &vdev->bars[vdev->msix->pba_bar].mem,
> +                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
> +    if (ret < 0) {
> +        error_report("vfio: msix_init failed\n");
> +        return ret;
> +    }
> +
> +    ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
> +                                    vfio_msix_vector_release);
> +    if (ret) {
> +        error_report("vfio: msix_set_vector_notifiers failed %d\n", ret);
> +        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
> +                    &vdev->bars[vdev->msix->pba_bar].mem);
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static void vfio_teardown_msi(VFIODevice *vdev)
> +{
> +    msi_uninit(&vdev->pdev);
> +
> +    if (vdev->msix) {
> +        /* FIXME: Why can't unset just silently do nothing?? */
> +        if (vdev->pdev.msix_vector_use_notifier &&
> +            vdev->pdev.msix_vector_release_notifier) {
> +            msix_unset_vector_notifiers(&vdev->pdev);
> +        }
> +
> +        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
> +                    &vdev->bars[vdev->msix->pba_bar].mem);
> +    }
> +}
> +
> +/*
> + * Resource setup
> + */
> +static void vfio_unmap_bar(VFIODevice *vdev, int nr)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +
> +    if (!bar->size) {
> +        return;
> +    }
> +
> +    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
> +    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
> +
> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
> +        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
> +    }
> +
> +    memory_region_destroy(&bar->mem);
> +}
> +
> +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
> +                         void **map, size_t size, off_t offset,
> +                         const char *name)
> +{
> +    int ret = 0;
> +
> +    if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
> +        int prot = 0;
> +
> +        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
> +            prot |= PROT_READ;
> +        }
> +
> +        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
> +            prot |= PROT_WRITE;
> +        }
> +
> +        *map = mmap(NULL, size, prot, MAP_SHARED,
> +                    bar->fd, bar->fd_offset + offset);
> +        if (*map == MAP_FAILED) {
> +            *map = NULL;
> +            ret = -errno;
> +            goto empty_region;
> +        }
> +
> +        memory_region_init_ram_ptr(submem, name, size, *map);
> +    } else {
> +empty_region:
> +        /* Create a zero sized sub-region to make cleanup easy. */
> +        memory_region_init(submem, name, 0);
> +    }
> +
> +    memory_region_add_subregion(mem, offset, submem);
> +
> +    return ret;
> +}
> +
> +static void vfio_map_bar(VFIODevice *vdev, int nr)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    unsigned size = bar->size;
> +    char name[64];
> +    uint32_t pci_bar;
> +    uint8_t type;
> +    int ret;
> +
> +    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
> +    if (!size) {
> +        return;
> +    }
> +
> +    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
> +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +             vdev->host.function, nr);
> +
> +    /* Determine what type of BAR this is for registration */
> +    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
> +                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
> +    if (ret != sizeof(pci_bar)) {
> +        error_report("vfio: Failed to read BAR %d (%s)\n", nr, strerror(errno));
> +        return;
> +    }
> +
> +    pci_bar = le32_to_cpu(pci_bar);
> +    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
> +           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
> +
> +    /* A "slow" read/write mapping underlies all BARs */
> +    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
> +    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
> +
> +    /*
> +     * We can't mmap areas overlapping the MSIX vector table, so we
> +     * potentially insert a direct-mapped subregion before and after it.
> +     */
> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
> +    }
> +
> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
> +    if (vfio_mmap_bar(bar, &bar->mem,
> +                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
> +        error_report("%s unsupported. Performance may be slow\n", name);
> +    }
> +
> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        unsigned start;
> +
> +        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
> +                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
> +
> +        size = start < bar->size ? bar->size - start : 0;
> +        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
> +        /* MSIXInfo contains another MemoryRegion for this mapping */
> +        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
> +                          &vdev->msix->mmap, size, start, name)) {
> +            error_report("%s unsupported. Performance may be slow\n", name);
> +        }
> +    }
> +
> +    return;
> +}
> +
> +static void vfio_map_bars(VFIODevice *vdev)
> +{
> +    int i;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        vfio_map_bar(vdev, i);
> +    }
> +}
> +
> +static void vfio_unmap_bars(VFIODevice *vdev)
> +{
> +    int i;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        vfio_unmap_bar(vdev, i);
> +    }
> +}
> +
> +/*
> + * General setup
> + */
> +static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
> +{
> +    uint8_t tmp, next = 0xff;
> +
> +    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
> +         tmp = pdev->config[tmp + 1]) {
> +        if (tmp > pos && tmp < next) {
> +            next = tmp;
> +        }
> +    }
> +
> +    return next - pos;
> +}
> +
> +static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint8_t cap_id, next, size;
> +    int ret;
> +
> +    cap_id = pdev->config[pos];
> +    next = pdev->config[pos + 1];
> +
> +    /*
> +     * If it becomes important to configure capabilities to their actual
> +     * size, use this as the default when it's something we don't recognize.
> +     * Since QEMU doesn't actually handle many of the config accesses,
> +     * exact size doesn't seem worthwhile.
> +     */
> +    size = vfio_std_cap_max_size(pdev, pos);
> +
> +    /*
> +     * pci_add_capability always inserts the new capability at the head
> +     * of the chain.  Therefore to end up with a chain that matches the
> +     * physical device, we insert from the end by making this recursive.
> +     * This is also why we pre-calculate size above as cached config space
> +     * will be changed as we unwind the stack.
> +     */
> +    if (next) {
> +        ret = vfio_add_std_cap(vdev, next);
> +        if (ret) {
> +            return ret;
> +        }
> +    } else {
> +        pdev->config[PCI_CAPABILITY_LIST] = 0; /* Begin the rebuild */
> +    }
> +
> +    switch (cap_id) {
> +    case PCI_CAP_ID_MSI:
> +        ret = vfio_setup_msi(vdev, pos);
> +        break;
> +    case PCI_CAP_ID_MSIX:
> +        ret = vfio_setup_msix(vdev, pos);
> +        break;
> +    default:
> +        ret = pci_add_capability(pdev, cap_id, pos, size);
> +        break;
> +    }
> +
> +    if (ret < 0) {
> +        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
> +                     "0x%x[0x%x]@0x%x: %d\n", vdev->host.domain,
> +                     vdev->host.bus, vdev->host.slot, vdev->host.function,
> +                     cap_id, size, pos, ret);
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vfio_add_capabilities(VFIODevice *vdev)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +
> +    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
> +        !pdev->config[PCI_CAPABILITY_LIST]) {
> +        return 0; /* Nothing to add */
> +    }
> +
> +    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
> +}
> +
> +static int vfio_load_rom(VFIODevice *vdev)
> +{
> +    uint64_t size = vdev->rom_size;
> +    const VMStateDescription *vmsd;
> +    char name[32];
> +    off_t off = 0, voff = vdev->rom_offset;
> +    ssize_t bytes;
> +    void *ptr;
> +
> +    /* If loading ROM from file, pci handles it */
> +    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
> +        return 0;
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function);
> +
> +    vmsd = qdev_get_vmsd(DEVICE(&vdev->pdev));
> +
> +    if (vmsd) {
> +        snprintf(name, sizeof(name), "%s.rom", vmsd->name);
> +    } else {
> +        snprintf(name, sizeof(name), "%s.rom",
> +                 object_get_typename(OBJECT(&vdev->pdev)));
> +    }

Not sure where this came from.  You can just hard-code this to vfio.rom
or, better yet, use the pci-host address and do vfio[%s].rom.

> +    memory_region_init_ram(&vdev->pdev.rom, name, size);
> +    ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
> +    memset(ptr, 0xff, size);
> +
> +    while (size) {
> +        bytes = pread(vdev->fd, ptr + off, size, voff + off);
> +        if (bytes == 0) {
> +            break; /* expect that we could get back less than the ROM BAR */
> +        } else if (bytes > 0) {
> +            off += bytes;
> +            size -= bytes;
> +        } else {
> +            if (errno == EINTR || errno == EAGAIN) {
> +                continue;
> +            }
> +            error_report("vfio: Error reading device ROM: %s\n",
> +                         strerror(errno));
> +            memory_region_destroy(&vdev->pdev.rom);
> +            return -1;
> +        }
> +    }
> +
> +    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
> +    vdev->pdev.has_rom = true;
> +    return 0;
> +}
> +
> +static int vfio_connect_container(VFIOGroup *group)
> +{
> +    VFIOContainer *container;
> +    int ret, fd;
> +
> +    if (group->container) {
> +        return 0;
> +    }
> +
> +    QLIST_FOREACH(container, &container_list, next) {
> +        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
> +            group->container = container;
> +            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> +            return 0;
> +        }
> +    }
> +
> +    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
> +    if (fd < 0) {
> +        error_report("vfio: failed to open /dev/vfio/vfio: %s\n",
> +                     strerror(errno));
> +        return -1;
> +    }
> +
> +    ret = ioctl(fd, VFIO_GET_API_VERSION);
> +    if (ret != VFIO_API_VERSION) {
> +        error_report("vfio: supported vfio version: %d, "
> +                     "reported version: %d\n", VFIO_API_VERSION, ret);
> +        close(fd);
> +        return -1;
> +    }
> +
> +    container = g_malloc0(sizeof(*container));
> +    container->fd = fd;
> +
> +    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        container->iommu_data.listener = (MemoryListener) {
> +            .begin = vfio_listener_dummy1,
> +            .commit = vfio_listener_dummy1,
> +            .region_add = vfio_listener_region_add,
> +            .region_del = vfio_listener_region_del,
> +            .region_nop = vfio_listener_dummy2,
> +            .log_start = vfio_listener_dummy2,
> +            .log_stop = vfio_listener_dummy2,
> +            .log_sync = vfio_listener_dummy2,
> +            .log_global_start = vfio_listener_dummy1,
> +            .log_global_stop = vfio_listener_dummy1,
> +            .eventfd_add = vfio_listener_dummy3,
> +            .eventfd_del = vfio_listener_dummy3,
> +        };

It would be nicer to move this out to a static structure.

> +        container->iommu_data.release = vfio_listener_release;
> +
> +        memory_listener_register(&container->iommu_data.listener,
> +                                 get_system_memory());
> +    } else {
> +        error_report("vfio: No available IOMMU models\n");
> +        g_free(container);
> +        close(fd);
> +        return -1;
> +    }
> +
> +    QLIST_INIT(&container->group_list);
> +    QLIST_INSERT_HEAD(&container_list, container, next);
> +
> +    group->container = container;
> +    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> +
> +    return 0;
> +}
> +
> +static void vfio_disconnect_container(VFIOGroup *group)
> +{
> +    VFIOContainer *container = group->container;
> +
> +    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
> +        error_report("vfio: error disconnecting group %d from container\n",
> +                     group->groupid);
> +    }

error_report isn't terminal.

> +
> +    QLIST_REMOVE(group, container_next);
> +    group->container = NULL;
> +
> +    if (QLIST_EMPTY(&container->group_list)) {
> +        if (container->iommu_data.release) {
> +            container->iommu_data.release(container);
> +        }
> +        QLIST_REMOVE(container, next);
> +        DPRINTF("vfio_disconnect_container: close container->fd\n");
> +        close(container->fd);
> +        g_free(container);
> +    }
> +}
> +
> +static VFIOGroup *vfio_get_group(int groupid)
> +{
> +    VFIOGroup *group;
> +    char path[32];
> +    struct vfio_group_status status = { .argsz = sizeof(status) };
> +
> +    QLIST_FOREACH(group, &group_list, next) {
> +        if (group->groupid == groupid) {
> +            return group;
> +        }
> +    }
> +
> +    group = g_malloc0(sizeof(*group));
> +
> +    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
> +    group->fd = qemu_open(path, O_RDWR);
> +    if (group->fd < 0) {
> +        error_report("vfio: error opening %s: %s", path, strerror(errno));
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
> +        error_report("vfio: error getting group status: %s\n",
> +                     strerror(errno));
> +        close(group->fd);
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> +        error_report("vfio: error, group %d is not viable, please ensure "
> +                     "all devices within the iommu_group are bound to their "
> +                     "vfio bus driver.\n", groupid);
> +        close(group->fd);
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    group->groupid = groupid;
> +    QLIST_INIT(&group->device_list);
> +
> +    if (vfio_connect_container(group)) {
> +        error_report("vfio: failed to setup container for group %d\n", groupid);
> +        close(group->fd);
> +        g_free(group);
> +        return NULL;
> +    }
> +
> +    QLIST_INSERT_HEAD(&group_list, group, next);
> +
> +    return group;
> +}
> +
> +static void vfio_put_group(VFIOGroup *group)
> +{
> +    if (!QLIST_EMPTY(&group->device_list)) {
> +        return;
> +    }
> +
> +    vfio_disconnect_container(group);
> +    QLIST_REMOVE(group, next);
> +    DPRINTF("vfio_put_group: close group->fd\n");
> +    close(group->fd);
> +    g_free(group);
> +}
> +
> +static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
> +{
> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> +    int ret, i;
> +
> +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> +    if (ret < 0) {
> +        error_report("vfio: error getting device %s from group %d: %s",
> +                     name, group->groupid, strerror(errno));
> +        error_report("Verify all devices in group %d "
> +                     "are bound to vfio-pci or pci-stub and not already in use",
> +                     group->groupid);
> +        return ret;
> +    }
> +
> +    vdev->fd = ret;
> +    vdev->group = group;
> +    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
> +
> +    /* Sanity check device */
> +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
> +    if (ret) {
> +        error_report("vfio: error getting device info: %s", strerror(errno));
> +        goto error;
> +    }
> +
> +    DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
> +            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
> +
> +    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
> +        error_report("vfio: Um, this isn't a PCI device");
> +        goto error;
> +    }
> +
> +    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
> +    if (!vdev->reset_works) {
> +        error_report("Warning, device %s does not support reset\n", name);
> +    }
> +
> +    if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) {
> +        error_report("vfio: unexpected number of io regions %u",
> +                     dev_info.num_regions);
> +        goto error;
> +    }
> +
> +    if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) {
> +        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
> +        goto error;
> +    }
> +
> +    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
> +        reg_info.index = i;
> +
> +        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> +        if (ret) {
> +            error_report("vfio: Error getting region %d info: %s", i,
> +                         strerror(errno));
> +            goto error;
> +        }
> +
> +        DPRINTF("Device %s region %d:\n", name, i);
> +        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> +                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> +                (unsigned long)reg_info.flags);
> +
> +        vdev->bars[i].flags = reg_info.flags;
> +        vdev->bars[i].size = reg_info.size;
> +        vdev->bars[i].fd_offset = reg_info.offset;
> +        vdev->bars[i].fd = vdev->fd;
> +        vdev->bars[i].nr = i;
> +    }
> +
> +    reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
> +
> +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> +    if (ret) {
> +        error_report("vfio: Error getting ROM info: %s", strerror(errno));
> +        goto error;
> +    }
> +
> +    DPRINTF("Device %s ROM:\n", name);
> +    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> +            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> +            (unsigned long)reg_info.flags);
> +
> +    vdev->rom_size = reg_info.size;
> +    vdev->rom_offset = reg_info.offset;
> +
> +    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
> +
> +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> +    if (ret) {
> +        error_report("vfio: Error getting config info: %s", strerror(errno));
> +        goto error;
> +    }
> +
> +    DPRINTF("Device %s config:\n", name);
> +    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> +            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> +            (unsigned long)reg_info.flags);
> +
> +    vdev->config_size = reg_info.size;
> +    vdev->config_offset = reg_info.offset;
> +
> +error:
> +    if (ret) {
> +        QLIST_REMOVE(vdev, next);
> +        vdev->group = NULL;
> +        close(vdev->fd);
> +    }
> +    return ret;
> +}
> +
> +static void vfio_put_device(VFIODevice *vdev)
> +{
> +    QLIST_REMOVE(vdev, next);
> +    vdev->group = NULL;
> +    DPRINTF("vfio_put_device: close vdev->fd\n");
> +    close(vdev->fd);
> +    if (vdev->msix) {
> +        g_free(vdev->msix);
> +        vdev->msix = NULL;
> +    }
> +}
> +
> +static int vfio_initfn(struct PCIDevice *pdev)

Why struct PCIDevice?

> +{
> +    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);

PCI_DEVICE()

> +    VFIOGroup *group;
> +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> +    ssize_t len;
> +    struct stat st;
> +    int groupid;
> +    int ret;
> +
> +    /* Check that the host device exists */
> +    snprintf(path, sizeof(path),
> +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +             vdev->host.function);
> +    if (stat(path, &st) < 0) {
> +        error_report("vfio: error: no such host device: %s", path);
> +        return -1;
> +    }
> +
> +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> +
> +    len = readlink(path, iommu_group_path, PATH_MAX);
> +    if (len <= 0) {
> +        error_report("vfio: error no iommu_group for device\n");
> +        return -1;
> +    }
> +
> +    iommu_group_path[len] = 0;
> +    group_name = basename(iommu_group_path);
> +
> +    if (sscanf(group_name, "%d", &groupid) != 1) {
> +        error_report("vfio: error reading %s: %s", path, strerror(errno));
> +        return -1;
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
> +
> +    group = vfio_get_group(groupid);
> +    if (!group) {
> +        error_report("vfio: failed to get group %d", groupid);
> +        return -1;
> +    }
> +
> +    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function);
> +
> +    QLIST_FOREACH(pvdev, &group->device_list, next) {
> +        if (pvdev->host.domain == vdev->host.domain &&
> +            pvdev->host.bus == vdev->host.bus &&
> +            pvdev->host.slot == vdev->host.slot &&
> +            pvdev->host.function == vdev->host.function) {
> +
> +            error_report("vfio: error: device %s is already attached\n", path);
> +            vfio_put_group(group);
> +            return -1;
> +        }
> +    }
> +
> +    ret = vfio_get_device(group, path, vdev);
> +    if (ret) {
> +        error_report("vfio: failed to get device %s", path);
> +        vfio_put_group(group);
> +        return -1;
> +    }
> +
> +    /* Get a copy of config space */
> +    assert(pci_config_size(&vdev->pdev) <= vdev->config_size);
> +    ret = pread(vdev->fd, vdev->pdev.config,
> +                pci_config_size(&vdev->pdev), vdev->config_offset);
> +    if (ret < (int)pci_config_size(&vdev->pdev)) {
> +        error_report("vfio: Failed to read device config space\n");
> +        goto out_put;
> +    }
> +
> +    /*
> +     * Clear host resource mapping info.  If we choose not to register a
> +     * BAR, such as might be the case with the option ROM, we can get
> +     * confusing, unwritable, residual addresses from the host here.
> +     */
> +    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
> +    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
> +
> +    vfio_load_rom(vdev);
> +
> +    if (vfio_early_setup_msix(vdev)) {
> +        goto out_put;
> +    }
> +
> +    vfio_map_bars(vdev);
> +
> +    if (vfio_add_capabilities(vdev)) {
> +        goto out_teardown;
> +    }
> +
> +    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> +    }
> +
> +    if (vfio_enable_intx(vdev)) {
> +        goto out_teardown;
> +    }
> +
> +    return 0;
> +
> +out_teardown:
> +    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> +    vfio_teardown_msi(vdev);
> +    vfio_unmap_bars(vdev);
> +out_put:
> +    vfio_put_device(vdev);
> +    vfio_put_group(group);
> +    return -1;
> +}
> +
> +static void vfio_exitfn(struct PCIDevice *pdev)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    VFIOGroup *group = vdev->group;
> +
> +    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> +    vfio_disable_interrupts(vdev);
> +    vfio_teardown_msi(vdev);
> +    vfio_unmap_bars(vdev);
> +    vfio_put_device(vdev);
> +    vfio_put_group(group);

You should move this all to the destructor (instance_finalize).

> +}
> +
> +static void vfio_reset(DeviceState *dev)
> +{
> +    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +
> +    if (!vdev->reset_works) {
> +        return;
> +    }
> +
> +    if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
> +        error_report("vfio: Error unable to reset physical device "
> +                     "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain,
> +                     vdev->host.bus, vdev->host.slot, vdev->host.function,
> +                     strerror(errno));

%m is thread safe, strerror isn't.

> +    }
> +}
> +
> +static Property vfio_pci_dev_properties[] = {
> +    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
> +    /*
> +     * TODO - support passed fds... is this necessary?
> +     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
> +     * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
> +     */
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +
> +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> +{
> +    PCIDeviceClass *dc = PCI_DEVICE_CLASS(klass);
> +
> +    dc->parent_class.reset = vfio_reset;

This is definitely not right.  You want to do:

PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
DeviceClass *dc = DEVICE_CLASS(klass);

dc->reset = vfio_reset;

> +    dc->init = vfio_initfn;
> +    dc->exit = vfio_exitfn;
> +    dc->config_read = vfio_pci_read_config;
> +    dc->config_write = vfio_pci_write_config;
> +    dc->parent_class.props = vfio_pci_dev_properties;

dc->props = vfio_pci_dev_properties;
> +}
> +
> +static TypeInfo vfio_pci_dev_info = {
> +    .name          = "vfio-pci",
> +    .parent        = TYPE_PCI_DEVICE,
> +    .instance_size = sizeof(VFIODevice),
> +    .class_init    = vfio_pci_dev_class_init,
> +};
> +
> +static void register_vfio_pci_dev_type(void)
> +{
> +    type_register_static(&vfio_pci_dev_info);
> +}
> +
> +type_init(register_vfio_pci_dev_type)
> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> new file mode 100644
> index 0000000..0a71bce
> --- /dev/null
> +++ b/hw/vfio_pci.h
> @@ -0,0 +1,101 @@

copyright/license.

> +#ifndef HW_VFIO_PCI_H
> +#define HW_VFIO_PCI_H
> +
> +#include "qemu-common.h"
> +#include "qemu-queue.h"
> +#include "pci.h"
> +#include "event_notifier.h"

This is all private to vfio.c, right?  Perhaps call it vfio_pci_int.h

Regards,

Anthony Liguori

> +
> +typedef struct VFIOBAR {
> +    off_t fd_offset; /* offset of BAR within device fd */
> +    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
> +    MemoryRegion mem; /* slow, read/write access */
> +    MemoryRegion mmap_mem; /* direct mapped access */
> +    void *mmap;
> +    size_t size;
> +    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
> +    uint8_t nr; /* cache the BAR number for debug */
> +} VFIOBAR;
> +
> +typedef struct INTx {
> +    bool pending; /* interrupt pending */
> +    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
> +    uint8_t pin; /* which pin to pull for qemu_set_irq */
> +    EventNotifier interrupt; /* eventfd triggered on interrupt */
> +    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
> +    PCIINTxRoute route; /* routing info for QEMU bypass */
> +} INTx;
> +
> +struct VFIODevice;
> +
> +typedef struct MSIVector {
> +    EventNotifier interrupt; /* eventfd triggered on interrupt */
> +    struct VFIODevice *vdev; /* back pointer to device */
> +    int vector; /* the vector number for this element */
> +    int virq; /* KVM irqchip route for QEMU bypass */
> +    bool use;
> +} MSIVector;
> +
> +enum {
> +    INT_NONE = 0,
> +    INT_INTx = 1,
> +    INT_MSI  = 2,
> +    INT_MSIX = 3,
> +};
> +
> +struct VFIOGroup;
> +
> +typedef struct VFIOContainer {
> +    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> +    struct {
> +        /* enable abstraction to support various iommu backends */
> +        union {
> +            MemoryListener listener; /* Used by type1 iommu */
> +        };
> +        void (*release)(struct VFIOContainer *);
> +    } iommu_data;
> +    QLIST_HEAD(, VFIOGroup) group_list;
> +    QLIST_ENTRY(VFIOContainer) next;
> +} VFIOContainer;
> +
> +/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
> +typedef struct MSIXInfo {
> +    uint8_t table_bar;
> +    uint8_t pba_bar;
> +    uint16_t entries;
> +    uint32_t table_offset;
> +    uint32_t pba_offset;
> +    MemoryRegion mmap_mem;
> +    void *mmap;
> +} MSIXInfo;
> +
> +typedef struct VFIODevice {
> +    PCIDevice pdev;
> +    int fd;
> +    INTx intx;
> +    unsigned int config_size;
> +    off_t config_offset; /* Offset of config space region within device fd */
> +    unsigned int rom_size;
> +    off_t rom_offset; /* Offset of ROM region within device fd */
> +    int msi_cap_size;
> +    MSIVector *msi_vectors;
> +    MSIXInfo *msix;
> +    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
> +    int interrupt; /* Current interrupt type */
> +    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
> +    PCIHostDeviceAddress host;
> +    QLIST_ENTRY(VFIODevice) next;
> +    struct VFIOGroup *group;
> +    bool reset_works;
> +} VFIODevice;
> +
> +typedef struct VFIOGroup {
> +    int fd;
> +    int groupid;
> +    VFIOContainer *container;
> +    QLIST_HEAD(, VFIODevice) device_list;
> +    QLIST_ENTRY(VFIOGroup) next;
> +    QLIST_ENTRY(VFIOGroup) container_next;
> +} VFIOGroup;
> +
> +#endif /* HW_VFIO_PCI_H */
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-01  7:15   ` Jan Kiszka
  2012-08-01 18:14     ` Alex Williamson
@ 2012-08-13 22:19     ` Anthony Liguori
  2012-08-14  5:27       ` Alex Williamson
  1 sibling, 1 reply; 42+ messages in thread
From: Anthony Liguori @ 2012-08-13 22:19 UTC (permalink / raw)
  To: Jan Kiszka, Alex Williamson; +Cc: aik, qemu-devel, kvm

Jan Kiszka <jan.kiszka@web.de> writes:

> On 2012-08-01 07:18, Alex Williamson wrote:
>> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>> ---
>> 
>>  MAINTAINERS           |    5 +++++
>>  configure             |   12 ++++++++++++
>>  hw/i386/Makefile.objs |    1 +
>>  3 files changed, 18 insertions(+)
>> 
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 2d219d2..9680d69 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -460,6 +460,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
>>  S: Maintained
>>  F: hw/usb*
>>  
>> +VFIO
>> +M: Alex Williamson <alex.williamson@redhat.com>
>> +S: Supported
>> +F: hw/vfio*
>> +
>>  vhost
>>  M: Michael S. Tsirkin <mst@redhat.com>
>>  S: Supported
>> diff --git a/configure b/configure
>> index c65b5f6..81108dc 100755
>> --- a/configure
>> +++ b/configure
>> @@ -143,6 +143,7 @@ attr=""
>>  libattr=""
>>  xfs=""
>>  
>> +vfio_pci="no"
>>  vhost_net="no"
>>  kvm="no"
>>  gprof="no"
>> @@ -489,6 +490,7 @@ Haiku)
>>    usb="linux"
>>    kvm="yes"
>>    vhost_net="yes"
>> +  vfio_pci="yes"
>>    if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
>>      audio_possible_drivers="$audio_possible_drivers fmod"
>>    fi
>> @@ -824,6 +826,10 @@ for opt do
>>    ;;
>>    --disable-guest-agent) guest_agent="no"
>>    ;;
>> +  --disable-vfio-pci) vfio_pci="no"
>> +  ;;
>> +  --enable-vfio-pci) vfio_pci="yes"
>> +  ;;
>
> Do we need this level of control? Open question I'm just wondering every
> time a new feature gets added together with --disable/--enable
> switches.

I don't think so--it's easy enough for an administrator to disable vfio
for a user.

Regards,

Anthony Liguori

>
>>    *) echo "ERROR: unknown option $opt"; show_help="yes"
>>    ;;
>>    esac
>> @@ -1112,6 +1118,8 @@ echo "  --disable-guest-agent    disable building of the QEMU Guest Agent"
>>  echo "  --enable-guest-agent     enable building of the QEMU Guest Agent"
>>  echo "  --with-coroutine=BACKEND coroutine backend. Supported options:"
>>  echo "                           gthread, ucontext, sigaltstack, windows"
>> +echo "  --disable-vfio-pci       disable vfio pci device assignement support"
>> +echo "  --enable-vfio-pci        enable vfio pci device assignment support"
>>  echo ""
>>  echo "NOTE: The object files are built at the place where configure is launched"
>>  exit 1
>> @@ -3072,6 +3080,7 @@ echo "OpenGL support    $opengl"
>>  echo "libiscsi support  $libiscsi"
>>  echo "build guest agent $guest_agent"
>>  echo "coroutine backend $coroutine_backend"
>> +echo "VFIO PCI support  $vfio_pci"
>>  
>>  if test "$sdl_too_old" = "yes"; then
>>  echo "-> Your SDL version is too old - please upgrade to have SDL support"
>> @@ -3754,6 +3763,9 @@ case "$target_arch2" in
>>    *)
>>      echo "CONFIG_NO_XEN=y" >> $config_target_mak
>>  esac
>> +if test "$vfio_pci" = "yes" -a "$target_softmmu" = "yes" ; then
>> +  echo "CONFIG_VFIO_PCI=y" >> $config_target_mak
>> +fi
>
> Does this already somehow depend on host == Linux? If not, you may break
> the others.
>
>>  case "$target_arch2" in
>>    i386|x86_64|ppcemb|ppc|ppc64|s390x)
>>      # Make sure the target and host cpus are compatible
>> diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
>> index 8c764bb..a2783ef 100644
>> --- a/hw/i386/Makefile.objs
>> +++ b/hw/i386/Makefile.objs
>> @@ -11,5 +11,6 @@ obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen-host-pci-device.o
>>  obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o xen_pt_msi.o
>>  obj-y += kvm/
>>  obj-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>>  
>>  obj-y := $(addprefix ../,$(obj-y))
>> 
>
> Jan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-13 22:18   ` Anthony Liguori
@ 2012-08-14  5:25     ` Alex Williamson
  0 siblings, 0 replies; 42+ messages in thread
From: Alex Williamson @ 2012-08-14  5:25 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: aik, qemu-devel, kvm

On Mon, 2012-08-13 at 17:18 -0500, Anthony Liguori wrote:
> Alex Williamson <alex.williamson@redhat.com> writes:
> > +static int vfio_load_rom(VFIODevice *vdev)
> > +{
> > +    uint64_t size = vdev->rom_size;
> > +    const VMStateDescription *vmsd;
> > +    char name[32];
> > +    off_t off = 0, voff = vdev->rom_offset;
> > +    ssize_t bytes;
> > +    void *ptr;
> > +
> > +    /* If loading ROM from file, pci handles it */
> > +    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
> > +        return 0;
> > +    }
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> > +            vdev->host.bus, vdev->host.slot, vdev->host.function);
> > +
> > +    vmsd = qdev_get_vmsd(DEVICE(&vdev->pdev));
> > +
> > +    if (vmsd) {
> > +        snprintf(name, sizeof(name), "%s.rom", vmsd->name);
> > +    } else {
> > +        snprintf(name, sizeof(name), "%s.rom",
> > +                 object_get_typename(OBJECT(&vdev->pdev)));
> > +    }
> 
> Not sure where this came from.  You can just hard code this to vfio.rom
> or better yet, use the pci-host address and do vfio[%s].rom.

Ok, I assume you mean vfio[%04x:%02x:%02x.%x].rom, or should I be calling
object_property_print() to get a %s?

> > +    memory_region_init_ram(&vdev->pdev.rom, name, size);
> > +    ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
> > +    memset(ptr, 0xff, size);
> > +
> > +    while (size) {
> > +        bytes = pread(vdev->fd, ptr + off, size, voff + off);
> > +        if (bytes == 0) {
> > +            break; /* expect that we could get back less than the ROM BAR */
> > +        } else if (bytes > 0) {
> > +            off += bytes;
> > +            size -= bytes;
> > +        } else {
> > +            if (errno == EINTR || errno == EAGAIN) {
> > +                continue;
> > +            }
> > +            error_report("vfio: Error reading device ROM: %s\n",
> > +                         strerror(errno));
> > +            memory_region_destroy(&vdev->pdev.rom);
> > +            return -1;
> > +        }
> > +    }
> > +
> > +    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
> > +    vdev->pdev.has_rom = true;
> > +    return 0;
> > +}
> > +
> > +static int vfio_connect_container(VFIOGroup *group)
> > +{
> > +    VFIOContainer *container;
> > +    int ret, fd;
> > +
> > +    if (group->container) {
> > +        return 0;
> > +    }
> > +
> > +    QLIST_FOREACH(container, &container_list, next) {
> > +        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
> > +            group->container = container;
> > +            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> > +            return 0;
> > +        }
> > +    }
> > +
> > +    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
> > +    if (fd < 0) {
> > +        error_report("vfio: failed to open /dev/vfio/vfio: %s\n",
> > +                     strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +    ret = ioctl(fd, VFIO_GET_API_VERSION);
> > +    if (ret != VFIO_API_VERSION) {
> > +        error_report("vfio: supported vfio version: %d, "
> > +                     "reported version: %d\n", VFIO_API_VERSION, ret);
> > +        close(fd);
> > +        return -1;
> > +    }
> > +
> > +    container = g_malloc0(sizeof(*container));
> > +    container->fd = fd;
> > +
> > +    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
> > +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> > +        if (ret) {
> > +            error_report("vfio: failed to set group container: %s\n",
> > +                         strerror(errno));
> > +            g_free(container);
> > +            close(fd);
> > +            return -1;
> > +        }
> > +
> > +        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
> > +        if (ret) {
> > +            error_report("vfio: failed to set iommu for container: %s\n",
> > +                         strerror(errno));
> > +            g_free(container);
> > +            close(fd);
> > +            return -1;
> > +        }
> > +
> > +        container->iommu_data.listener = (MemoryListener) {
> > +            .begin = vfio_listener_dummy1,
> > +            .commit = vfio_listener_dummy1,
> > +            .region_add = vfio_listener_region_add,
> > +            .region_del = vfio_listener_region_del,
> > +            .region_nop = vfio_listener_dummy2,
> > +            .log_start = vfio_listener_dummy2,
> > +            .log_stop = vfio_listener_dummy2,
> > +            .log_sync = vfio_listener_dummy2,
> > +            .log_global_start = vfio_listener_dummy1,
> > +            .log_global_stop = vfio_listener_dummy1,
> > +            .eventfd_add = vfio_listener_dummy3,
> > +            .eventfd_del = vfio_listener_dummy3,
> > +        };
> 
> It would be nicer to move this out to a static structure.

Ok

> > +        container->iommu_data.release = vfio_listener_release;
> > +
> > +        memory_listener_register(&container->iommu_data.listener,
> > +                                 get_system_memory());
> > +    } else {
> > +        error_report("vfio: No available IOMMU models\n");
> > +        g_free(container);
> > +        close(fd);
> > +        return -1;
> > +    }
> > +
> > +    QLIST_INIT(&container->group_list);
> > +    QLIST_INSERT_HEAD(&container_list, container, next);
> > +
> > +    group->container = container;
> > +    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
> > +
> > +    return 0;
> > +}
> > +
> > +static void vfio_disconnect_container(VFIOGroup *group)
> > +{
> > +    VFIOContainer *container = group->container;
> > +
> > +    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
> > +        error_report("vfio: error disconnecting group %d from container\n",
> > +                     group->groupid);
> > +    }
> 
> error_report isn't terminal.

It's not meant to be terminal here; the kernel side will prevent the group
from being re-used.  I suppose there's the question of whether we want to
get further out of sync with the kernel by continuing with the disconnects
below, though.  Either way seems to leave us in a funky state, and I'm not
sure I have a strong preference.  It seems unnecessary to kill the guest,
though.

> > +
> > +    QLIST_REMOVE(group, container_next);
> > +    group->container = NULL;
> > +
> > +    if (QLIST_EMPTY(&container->group_list)) {
> > +        if (container->iommu_data.release) {
> > +            container->iommu_data.release(container);
> > +        }
> > +        QLIST_REMOVE(container, next);
> > +        DPRINTF("vfio_disconnect_container: close container->fd\n");
> > +        close(container->fd);
> > +        g_free(container);
> > +    }
> > +}
> > +
> > +static VFIOGroup *vfio_get_group(int groupid)
> > +{
> > +    VFIOGroup *group;
> > +    char path[32];
> > +    struct vfio_group_status status = { .argsz = sizeof(status) };
> > +
> > +    QLIST_FOREACH(group, &group_list, next) {
> > +        if (group->groupid == groupid) {
> > +            return group;
> > +        }
> > +    }
> > +
> > +    group = g_malloc0(sizeof(*group));
> > +
> > +    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
> > +    group->fd = qemu_open(path, O_RDWR);
> > +    if (group->fd < 0) {
> > +        error_report("vfio: error opening %s: %s", path, strerror(errno));
> > +        g_free(group);
> > +        return NULL;
> > +    }
> > +
> > +    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
> > +        error_report("vfio: error getting group status: %s\n",
> > +                     strerror(errno));
> > +        close(group->fd);
> > +        g_free(group);
> > +        return NULL;
> > +    }
> > +
> > +    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
> > +        error_report("vfio: error, group %d is not viable, please ensure "
> > +                     "all devices within the iommu_group are bound to their "
> > +                     "vfio bus driver.\n", groupid);
> > +        close(group->fd);
> > +        g_free(group);
> > +        return NULL;
> > +    }
> > +
> > +    group->groupid = groupid;
> > +    QLIST_INIT(&group->device_list);
> > +
> > +    if (vfio_connect_container(group)) {
> > +        error_report("vfio: failed to setup container for group %d\n", groupid);
> > +        close(group->fd);
> > +        g_free(group);
> > +        return NULL;
> > +    }
> > +
> > +    QLIST_INSERT_HEAD(&group_list, group, next);
> > +
> > +    return group;
> > +}
> > +
> > +static void vfio_put_group(VFIOGroup *group)
> > +{
> > +    if (!QLIST_EMPTY(&group->device_list)) {
> > +        return;
> > +    }
> > +
> > +    vfio_disconnect_container(group);
> > +    QLIST_REMOVE(group, next);
> > +    DPRINTF("vfio_put_group: close group->fd\n");
> > +    close(group->fd);
> > +    g_free(group);
> > +}
> > +
> > +static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
> > +{
> > +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> > +    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
> > +    int ret, i;
> > +
> > +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> > +    if (ret < 0) {
> > +        error_report("vfio: error getting device %s from group %d: %s",
> > +                     name, group->groupid, strerror(errno));
> > +        error_report("Verify all devices in group %d "
> > +                     "are bound to vfio-pci or pci-stub and not already in use",
> > +                     group->groupid);
> > +        return ret;
> > +    }
> > +
> > +    vdev->fd = ret;
> > +    vdev->group = group;
> > +    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
> > +
> > +    /* Sanity check device */
> > +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
> > +    if (ret) {
> > +        error_report("vfio: error getting device info: %s", strerror(errno));
> > +        goto error;
> > +    }
> > +
> > +    DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
> > +            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
> > +
> > +    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
> > +        error_report("vfio: Um, this isn't a PCI device");
> > +        goto error;
> > +    }
> > +
> > +    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
> > +    if (!vdev->reset_works) {
> > +        error_report("Warning, device %s does not support reset\n", name);
> > +    }
> > +
> > +    if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) {
> > +        error_report("vfio: unexpected number of io regions %u",
> > +                     dev_info.num_regions);
> > +        goto error;
> > +    }
> > +
> > +    if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) {
> > +        error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
> > +        goto error;
> > +    }
> > +
> > +    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
> > +        reg_info.index = i;
> > +
> > +        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> > +        if (ret) {
> > +            error_report("vfio: Error getting region %d info: %s", i,
> > +                         strerror(errno));
> > +            goto error;
> > +        }
> > +
> > +        DPRINTF("Device %s region %d:\n", name, i);
> > +        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> > +                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> > +                (unsigned long)reg_info.flags);
> > +
> > +        vdev->bars[i].flags = reg_info.flags;
> > +        vdev->bars[i].size = reg_info.size;
> > +        vdev->bars[i].fd_offset = reg_info.offset;
> > +        vdev->bars[i].fd = vdev->fd;
> > +        vdev->bars[i].nr = i;
> > +    }
> > +
> > +    reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
> > +
> > +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> > +    if (ret) {
> > +        error_report("vfio: Error getting ROM info: %s", strerror(errno));
> > +        goto error;
> > +    }
> > +
> > +    DPRINTF("Device %s ROM:\n", name);
> > +    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> > +            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> > +            (unsigned long)reg_info.flags);
> > +
> > +    vdev->rom_size = reg_info.size;
> > +    vdev->rom_offset = reg_info.offset;
> > +
> > +    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
> > +
> > +    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
> > +    if (ret) {
> > +        error_report("vfio: Error getting config info: %s", strerror(errno));
> > +        goto error;
> > +    }
> > +
> > +    DPRINTF("Device %s config:\n", name);
> > +    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
> > +            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
> > +            (unsigned long)reg_info.flags);
> > +
> > +    vdev->config_size = reg_info.size;
> > +    vdev->config_offset = reg_info.offset;
> > +
> > +error:
> > +    if (ret) {
> > +        QLIST_REMOVE(vdev, next);
> > +        vdev->group = NULL;
> > +        close(vdev->fd);
> > +    }
> > +    return ret;
> > +}
> > +
> > +static void vfio_put_device(VFIODevice *vdev)
> > +{
> > +    QLIST_REMOVE(vdev, next);
> > +    vdev->group = NULL;
> > +    DPRINTF("vfio_put_device: close vdev->fd\n");
> > +    close(vdev->fd);
> > +    if (vdev->msix) {
> > +        g_free(vdev->msix);
> > +        vdev->msix = NULL;
> > +    }
> > +}
> > +
> > +static int vfio_initfn(struct PCIDevice *pdev)
> 
> Why struct PCIDevice?

Must have pasted it from somewhere, fixed.

> > +{
> > +    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> 
> PCI_DEVICE()

I'm not sure what you're after here; are you perhaps mistaking pdev for
qdev?

> > +    VFIOGroup *group;
> > +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> > +    ssize_t len;
> > +    struct stat st;
> > +    int groupid;
> > +    int ret;
> > +
> > +    /* Check that the host device exists */
> > +    snprintf(path, sizeof(path),
> > +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> > +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +             vdev->host.function);
> > +    if (stat(path, &st) < 0) {
> > +        error_report("vfio: error: no such host device: %s", path);
> > +        return -1;
> > +    }
> > +
> > +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> > +
> > +    len = readlink(path, iommu_group_path, PATH_MAX);
> > +    if (len <= 0) {
> > +        error_report("vfio: error no iommu_group for device\n");
> > +        return -1;
> > +    }
> > +
> > +    iommu_group_path[len] = 0;
> > +    group_name = basename(iommu_group_path);
> > +
> > +    if (sscanf(group_name, "%d", &groupid) != 1) {
> > +        error_report("vfio: error reading %s: %s", path, strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
> > +            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
> > +
> > +    group = vfio_get_group(groupid);
> > +    if (!group) {
> > +        error_report("vfio: failed to get group %d", groupid);
> > +        return -1;
> > +    }
> > +
> > +    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
> > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +            vdev->host.function);
> > +
> > +    QLIST_FOREACH(pvdev, &group->device_list, next) {
> > +        if (pvdev->host.domain == vdev->host.domain &&
> > +            pvdev->host.bus == vdev->host.bus &&
> > +            pvdev->host.slot == vdev->host.slot &&
> > +            pvdev->host.function == vdev->host.function) {
> > +
> > +            error_report("vfio: error: device %s is already attached\n", path);
> > +            vfio_put_group(group);
> > +            return -1;
> > +        }
> > +    }
> > +
> > +    ret = vfio_get_device(group, path, vdev);
> > +    if (ret) {
> > +        error_report("vfio: failed to get device %s", path);
> > +        vfio_put_group(group);
> > +        return -1;
> > +    }
> > +
> > +    /* Get a copy of config space */
> > +    assert(pci_config_size(&vdev->pdev) <= vdev->config_size);
> > +    ret = pread(vdev->fd, vdev->pdev.config,
> > +                pci_config_size(&vdev->pdev), vdev->config_offset);
> > +    if (ret < (int)pci_config_size(&vdev->pdev)) {
> > +        error_report("vfio: Failed to read device config space\n");
> > +        goto out_put;
> > +    }
> > +
> > +    /*
> > +     * Clear host resource mapping info.  If we choose not to register a
> > +     * BAR, such as might be the case with the option ROM, we can get
> > +     * confusing, unwritable, residual addresses from the host here.
> > +     */
> > +    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
> > +    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
> > +
> > +    vfio_load_rom(vdev);
> > +
> > +    if (vfio_early_setup_msix(vdev)) {
> > +        goto out_put;
> > +    }
> > +
> > +    vfio_map_bars(vdev);
> > +
> > +    if (vfio_add_capabilities(vdev)) {
> > +        goto out_teardown;
> > +    }
> > +
> > +    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> > +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> > +    }
> > +
> > +    if (vfio_enable_intx(vdev)) {
> > +        goto out_teardown;
> > +    }
> > +
> > +    return 0;
> > +
> > +out_teardown:
> > +    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > +    vfio_teardown_msi(vdev);
> > +    vfio_unmap_bars(vdev);
> > +out_put:
> > +    vfio_put_device(vdev);
> > +    vfio_put_group(group);
> > +    return -1;
> > +}
> > +
> > +static void vfio_exitfn(struct PCIDevice *pdev)
> > +{
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +    VFIOGroup *group = vdev->group;
> > +
> > +    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > +    vfio_disable_interrupts(vdev);
> > +    vfio_teardown_msi(vdev);
> > +    vfio_unmap_bars(vdev);
> > +    vfio_put_device(vdev);
> > +    vfio_put_group(group);
> 
> You should move this all to the destructor (instance_finalize).

Hmm, it looks like if I do that, then pci will free my interrupt and
config space out from under me before I get any notice that we're killing
the device.  Being a pci device, I think I'm tied to the
PCIDeviceClass.exit function, right?

> > +}
> > +
> > +static void vfio_reset(DeviceState *dev)
> > +{
> > +    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +
> > +    if (!vdev->reset_works) {
> > +        return;
> > +    }
> > +
> > +    if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
> > +        error_report("vfio: Error unable to reset physical device "
> > +                     "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain,
> > +                     vdev->host.bus, vdev->host.slot, vdev->host.function,
> > +                     strerror(errno));
> 
> %m is thread safe, strerror isn't.

Neat.  Fixed throughout.

> > +    }
> > +}
> > +
> > +static Property vfio_pci_dev_properties[] = {
> > +    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
> > +    /*
> > +     * TODO - support passed fds... is this necessary?
> > +     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
> > +     * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
> > +     */
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +
> > +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> > +{
> > +    PCIDeviceClass *dc = PCI_DEVICE_CLASS(klass);
> > +
> > +    dc->parent_class.reset = vfio_reset;
> 
> This is definitely not right.  You want to do:
> 
> PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
> DeviceClass *dc = DEVICE_CLASS(klass);
> 
> dc->reset = vfio_reset;
> 
> > +    dc->init = vfio_initfn;
> > +    dc->exit = vfio_exitfn;
> > +    dc->config_read = vfio_pci_read_config;
> > +    dc->config_write = vfio_pci_write_config;
> > +    dc->parent_class.props = vfio_pci_dev_properties;
> 
> dc->props = vfio_pci_dev_properties;

Ok, fixed.

> > +}
> > +
> > +static TypeInfo vfio_pci_dev_info = {
> > +    .name          = "vfio-pci",
> > +    .parent        = TYPE_PCI_DEVICE,
> > +    .instance_size = sizeof(VFIODevice),
> > +    .class_init    = vfio_pci_dev_class_init,
> > +};
> > +
> > +static void register_vfio_pci_dev_type(void)
> > +{
> > +    type_register_static(&vfio_pci_dev_info);
> > +}
> > +
> > +type_init(register_vfio_pci_dev_type)
> > diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> > new file mode 100644
> > index 0000000..0a71bce
> > --- /dev/null
> > +++ b/hw/vfio_pci.h
> > @@ -0,0 +1,101 @@
> 
> copyright/license.
> 
> > +#ifndef HW_VFIO_PCI_H
> > +#define HW_VFIO_PCI_H
> > +
> > +#include "qemu-common.h"
> > +#include "qemu-queue.h"
> > +#include "pci.h"
> > +#include "event_notifier.h"
> 
> This is all private to vfio.c, right?  Perhaps call it vfio_pci_int.h

Yes, it is private.  Ok, renamed.  Thanks for the review,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-13 22:19     ` Anthony Liguori
@ 2012-08-14  5:27       ` Alex Williamson
  2012-08-14 14:35         ` Avi Kivity
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-14  5:27 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: aik, Jan Kiszka, qemu-devel, kvm

On Mon, 2012-08-13 at 17:19 -0500, Anthony Liguori wrote:
> Jan Kiszka <jan.kiszka@web.de> writes:
> 
> > On 2012-08-01 07:18, Alex Williamson wrote:
> >> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> >> ---
> >> 
> >>  MAINTAINERS           |    5 +++++
> >>  configure             |   12 ++++++++++++
> >>  hw/i386/Makefile.objs |    1 +
> >>  3 files changed, 18 insertions(+)
> >> 
> >> diff --git a/MAINTAINERS b/MAINTAINERS
> >> index 2d219d2..9680d69 100644
> >> --- a/MAINTAINERS
> >> +++ b/MAINTAINERS
> >> @@ -460,6 +460,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
> >>  S: Maintained
> >>  F: hw/usb*
> >>  
> >> +VFIO
> >> +M: Alex Williamson <alex.williamson@redhat.com>
> >> +S: Supported
> >> +F: hw/vfio*
> >> +
> >>  vhost
> >>  M: Michael S. Tsirkin <mst@redhat.com>
> >>  S: Supported
> >> diff --git a/configure b/configure
> >> index c65b5f6..81108dc 100755
> >> --- a/configure
> >> +++ b/configure
> >> @@ -143,6 +143,7 @@ attr=""
> >>  libattr=""
> >>  xfs=""
> >>  
> >> +vfio_pci="no"
> >>  vhost_net="no"
> >>  kvm="no"
> >>  gprof="no"
> >> @@ -489,6 +490,7 @@ Haiku)
> >>    usb="linux"
> >>    kvm="yes"
> >>    vhost_net="yes"
> >> +  vfio_pci="yes"
> >>    if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
> >>      audio_possible_drivers="$audio_possible_drivers fmod"
> >>    fi
> >> @@ -824,6 +826,10 @@ for opt do
> >>    ;;
> >>    --disable-guest-agent) guest_agent="no"
> >>    ;;
> >> +  --disable-vfio-pci) vfio_pci="no"
> >> +  ;;
> >> +  --enable-vfio-pci) vfio_pci="yes"
> >> +  ;;
> >
> > Do we need this level of control? Open question I'm just wondering every
> > time a new feature gets added together with --disable/--enable
> > switches.
> 
> I don't think so--it's easy enough for an administrator to disable vfio
> for a user.

Ok, outvoted.  I'll remove.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver Alex Williamson
  2012-08-13 22:18   ` Anthony Liguori
@ 2012-08-14  7:12   ` Stefan Hajnoczi
  2012-08-14 13:51     ` Alex Williamson
  2012-08-14 15:53   ` Avi Kivity
  2 siblings, 1 reply; 42+ messages in thread
From: Stefan Hajnoczi @ 2012-08-14  7:12 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

On Tue, Jul 31, 2012 at 11:18:15PM -0600, Alex Williamson wrote:
> This adds the core of the QEMU VFIO-based PCI device assignment driver.
> To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
> and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
> module.  To assign device 0000:05:00.0 to a guest, do the following:
> 
> for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
>     vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
>     device=$(cat /sys/bus/pci/devices/$dev/device)
>     if [ -e /sys/bus/pci/devices/$dev/driver ]; then
>         echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>     fi
>     echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
> done

Both vfio-pci and the old driver successfully match the $vendor:$device.
What happens when another $vendor:$device PCI adapter is hotplugged into
the host?

Is there a way to bind vfio-pci on a per-adapter basis instead of a
per-$vendor:$device?

Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 19:31       ` Anthony Liguori
@ 2012-08-14  7:19         ` Jan Kiszka
  2012-08-14 14:42         ` Avi Kivity
  2012-08-14 14:53         ` Cole Robinson
  2 siblings, 0 replies; 42+ messages in thread
From: Jan Kiszka @ 2012-08-14  7:19 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm@vger.kernel.org, aik@ozlabs.ru, qemu-devel@nongnu.org,
	Alex Graf, Alex Williamson, Avi Kivity, Cole Robinson

[-- Attachment #1: Type: text/plain, Size: 1673 bytes --]

On 2012-08-13 21:31, Anthony Liguori wrote:
> Jan Kiszka <jan.kiszka@siemens.com> writes:
> 
>> On 2012-08-13 15:58, Avi Kivity wrote:
>>> On 08/13/2012 04:27 PM, Anthony Liguori wrote:
>>>
>>>> Thanks for pushing this forward!  Hopefully this will finally kill off
>>>> qemu-kvm.git for good.
>>>
>>> No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
>>> has.  We'll need the original device assignment code side-by-side.
>>
>> ...which is on my to-do list for 1.3.
> 
> Is there a deprecation plan for the old device assignment code?
> 
> I'm not really against the idea of requiring a new kernel for new
> features.
> 
> From a Fedora/OpenSUSE point of view, would supporting old kernels be a
> requirement to stop shipping qemu-kvm.git over qemu.git?
> 
> Since distros ship new kernels and new userspaces, I don't think distros
> would care so I'm not sure who we're trying to support old kernels for.

We are supporting KVM down to 2.6.3x, if not 2.6.2x. Also, device
assignment is a new feature for upstream, but not for the masses of KVM
users of QEMU (due to qemu-kvm and corresponding libvirt support). I
think it will take some more kernel releases to have all the features there
that allow performance-wise equivalent device assignment via VFIO. And
it can even be helpful to cross-check issues of VFIO in the field.

Except for some self-contained helper functions in the KVM layer,
classic device assignment will be as isolated as VFIO. So I don't think
we would take any noteworthy burden to maintain it as long as the kernel
supports this interface.

Can't comment on the other questions.

Jan



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-14  7:12   ` Stefan Hajnoczi
@ 2012-08-14 13:51     ` Alex Williamson
  0 siblings, 0 replies; 42+ messages in thread
From: Alex Williamson @ 2012-08-14 13:51 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: aik, aliguori, qemu-devel, kvm

On Tue, 2012-08-14 at 08:12 +0100, Stefan Hajnoczi wrote:
> On Tue, Jul 31, 2012 at 11:18:15PM -0600, Alex Williamson wrote:
> > This adds the core of the QEMU VFIO-based PCI device assignment driver.
> > To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
> > and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
> > module.  To assign device 0000:05:00.0 to a guest, do the following:
> > 
> > for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
> >     vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
> >     device=$(cat /sys/bus/pci/devices/$dev/device)
> >     if [ -e /sys/bus/pci/devices/$dev/driver ]; then
> >         echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
> >     fi
> >     echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
> > done
> 
> Both vfio-pci and the old driver successfully match the $vendor:$device.
> What happens when another $vendor:$device PCI adapter is hotplugged into
> the host?
> 
> Is there a way to bind vfio-pci on a per-adapter basis instead of a
> per-$vendor:$device?

There's also a remove_id sysfs entry so we can be a little more
strategic, this was just an example.  We can also re-order new_id vs
unbind so that we need to manually unbind->bind the device.  There's
still an opportunity to race with a hotplug during this interaction
though.  I think this is the best Linux currently offers for dynamic
driver binding though.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported
  2012-08-14  5:27       ` Alex Williamson
@ 2012-08-14 14:35         ` Avi Kivity
  0 siblings, 0 replies; 42+ messages in thread
From: Avi Kivity @ 2012-08-14 14:35 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, Anthony Liguori, Jan Kiszka, qemu-devel, kvm

On 08/14/2012 08:27 AM, Alex Williamson wrote:
>> >
>> > Do we need this level of control? Open question I'm just wondering every
>> > time a new feature gets added together with --disable/--enable
>> > switches.
>> 
>> I don't think so--it's easy enough for an administrator to disable vfio
>> for a user.
> 
> Ok, out voted.  I'll remove.  Thanks,
> 

There is an advantage to --enable-blah in that it errors out if build
requirements are not satisfied, compared to silently disabling the
feature with a plain ./configure.  This is important for distro builds
which can start to silently break features when we add a new build
requirement.

But it can be done later, possibly with a new --enable=vfio,kvm,... list
instead of individual features.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 19:31       ` Anthony Liguori
  2012-08-14  7:19         ` Jan Kiszka
@ 2012-08-14 14:42         ` Avi Kivity
  2012-08-14 14:53         ` Cole Robinson
  2 siblings, 0 replies; 42+ messages in thread
From: Avi Kivity @ 2012-08-14 14:42 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm@vger.kernel.org, aik@ozlabs.ru, Jan Kiszka, Cole Robinson,
	qemu-devel@nongnu.org, Alex Graf, Alex Williamson

On 08/13/2012 10:31 PM, Anthony Liguori wrote:
> Jan Kiszka <jan.kiszka@siemens.com> writes:
> 
>> On 2012-08-13 15:58, Avi Kivity wrote:
>>> On 08/13/2012 04:27 PM, Anthony Liguori wrote:
>>> 
>>>> Thanks for pushing this forward!  Hopefully this will finally kill off
>>>> qemu-kvm.git for good.
>>> 
>>> No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
>>> has.  We'll need the original device assignment code side-by-side.
>>
>> ...which is on my to-do list for 1.3.
> 
> Is there a deprecation plan for the old device assignment code?

Not yet.  I would say 2-3 years from the release that has full support
(3.7, since we need more support for INTx in kvm).

This feature is user visible, all the way up to scripts and management
tools.  It's not a plug-in replacement (though I tried).  Even with a
new kernel, we can't just drop it and point users to vfio.

> 
> I'm not really against the idea of requiring a new kernel for new
> features.
> 
> From a Fedora/OpenSUSE point of view, would supporting old kernels be a
> requirement to stop shipping qemu-kvm.git over qemu.git?
> 
> Since distros ship new kernels and new userspaces, I don't think distros
> would care so I'm not sure who we're trying to support old kernels for.

People do all sorts of weird things.  qemu and the kernel are not just
for fedora/opensuse consumers who have someone to take care of the
entire stack for them; when we deprecate an ABI we need to give them
plenty of time to adjust.  I think this is even more relevant for device
assignment which is sometimes used with unusual use cases.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-13 19:31       ` Anthony Liguori
  2012-08-14  7:19         ` Jan Kiszka
  2012-08-14 14:42         ` Avi Kivity
@ 2012-08-14 14:53         ` Cole Robinson
  2012-08-14 15:04           ` Jan Kiszka
  2 siblings, 1 reply; 42+ messages in thread
From: Cole Robinson @ 2012-08-14 14:53 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm@vger.kernel.org, aik@ozlabs.ru, Jan Kiszka,
	qemu-devel@nongnu.org, Alex Graf, Alex Williamson, Avi Kivity

On 08/13/2012 03:31 PM, Anthony Liguori wrote:
> Jan Kiszka <jan.kiszka@siemens.com> writes:
> 
>> On 2012-08-13 15:58, Avi Kivity wrote:
>>> On 08/13/2012 04:27 PM, Anthony Liguori wrote:
>>>
>>>> Thanks for pushing this forward!  Hopefully this will finally kill off
>>>> qemu-kvm.git for good.
>>>
>>> No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
>>> has.  We'll need the original device assignment code side-by-side.
>>
>> ...which is on my to-do list for 1.3.
> 
> Is there a deprecation plan for the old device assignment code?
> 
> I'm not really against the idea of requiring a new kernel for new
> features.
> 
> From a Fedora/OpenSUSE point of view, would supporting old kernels be a
> requirement to stop shipping qemu-kvm.git over qemu.git?
> 

Speaking as a Fedora maintainer, compatibility with old kernels isn't that
important to us, provided the functionality of the new way is comparable to
the old way.

As far as switching over to qemu.git, I assume there will eventually be a day
when the fork would 'end' and qemu-kvm would stop getting its own releases,
which is when we'd switch. Maybe that assumption is wrong or over simplifying
the trade offs, but if merge work is ongoing I don't see a very compelling
reason to switch.

- Cole

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-14 14:53         ` Cole Robinson
@ 2012-08-14 15:04           ` Jan Kiszka
  2012-08-14 15:28             ` Cole Robinson
  0 siblings, 1 reply; 42+ messages in thread
From: Jan Kiszka @ 2012-08-14 15:04 UTC (permalink / raw)
  To: Cole Robinson
  Cc: Anthony Liguori, kvm@vger.kernel.org, aik@ozlabs.ru,
	qemu-devel@nongnu.org, Alex Graf, Alex Williamson, Avi Kivity

On 2012-08-14 16:53, Cole Robinson wrote:
> On 08/13/2012 03:31 PM, Anthony Liguori wrote:
>> Jan Kiszka <jan.kiszka@siemens.com> writes:
>>
>>> On 2012-08-13 15:58, Avi Kivity wrote:
>>>> On 08/13/2012 04:27 PM, Anthony Liguori wrote:
>>>>
>>>>> Thanks for pushing this forward!  Hopefully this will finally kill off
>>>>> qemu-kvm.git for good.
>>>>
>>>> No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
>>>> has.  We'll need the original device assignment code side-by-side.
>>>
>>> ...which is on my to-do list for 1.3.
>>
>> Is there a deprecation plan for the old device assignment code?
>>
>> I'm not really against the idea of requiring a new kernel for new
>> features.
>>
>> From a Fedora/OpenSUSE point of view, would supporting old kernels be a
>> requirement to stop shipping qemu-kvm.git over qemu.git?
>>
> 
> Speaking as a Fedora maintainer, compatibility with old kernels isn't that
> important to us, provided the functionality of the new way is comparable to
> the old way.
> 
> As far as switching over to qemu.git, I assume there will eventually be a day
> when the fork would 'end' and qemu-kvm would stop getting its own releases,
> which is when we'd switch. Maybe that assumption is wrong or over simplifying
> the trade offs, but if merge work is ongoing I don't see a very compelling
> reason to switch.

If you sit and wait, you may find out on that specific day that someone
forgot to port over feature X and Y, and now QEMU does not fit your
needs and qemu-kvm is dead.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2
  2012-08-14 15:04           ` Jan Kiszka
@ 2012-08-14 15:28             ` Cole Robinson
  0 siblings, 0 replies; 42+ messages in thread
From: Cole Robinson @ 2012-08-14 15:28 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Anthony Liguori, kvm@vger.kernel.org, aik@ozlabs.ru,
	qemu-devel@nongnu.org, Alex Graf, Alex Williamson, Avi Kivity

On 08/14/2012 11:04 AM, Jan Kiszka wrote:
> On 2012-08-14 16:53, Cole Robinson wrote:
>> On 08/13/2012 03:31 PM, Anthony Liguori wrote:
>>> Jan Kiszka <jan.kiszka@siemens.com> writes:
>>>
>>>> On 2012-08-13 15:58, Avi Kivity wrote:
>>>>> On 08/13/2012 04:27 PM, Anthony Liguori wrote:
>>>>>
>>>>>> Thanks for pushing this forward!  Hopefully this will finally kill off
>>>>>> qemu-kvm.git for good.
>>>>>
>>>>> No, it won't.  vfio requires a 3.6 kernel, which we cannot assume anyone
>>>>> has.  We'll need the original device assignment code side-by-side.
>>>>
>>>> ...which is on my to-do list for 1.3.
>>>
>>> Is there a deprecation plan for the old device assignment code?
>>>
>>> I'm not really against the idea of requiring a new kernel for new
>>> features.
>>>
>>> From a Fedora/OpenSUSE point of view, would supporting old kernels be a
>>> requirement to stop shipping qemu-kvm.git over qemu.git?
>>>
>>
>> Speaking as a Fedora maintainer, compatibility with old kernels isn't that
>> important to us, provided the functionality of the new way is comparable to
>> the old way.
>>
>> As far as switching over to qemu.git, I assume there will eventually be a day
>> when the fork would 'end' and qemu-kvm would stop getting its own releases,
>> which is when we'd switch. Maybe that assumption is wrong or over simplifying
>> the trade offs, but if merge work is ongoing I don't see a very compelling
>> reason to switch.
> 
> If you sit and wait, you may find out on that specific day that someone
> forget to port over feature X and Y, and now QEMU does not fit your
> needs and qemu-kvm is dead.
> 

My head isn't entirely in the sand here, I've watched the patches go by and
feel pretty confident that you and co. wouldn't drop qemu-kvm if there was
something missing that left qemu.git substantially lacking, at least not
without announcing it clearly. I know certain defaults will change and certain
cli options will go away but that just requires user education.

And qemu-kvm won't really 'die', the code isn't going to disappear. If we
switch to qemu.git and discover some vital piece is missing, we can
temporarily carry the relevant qemu-kvm bits and try to get the issue resolved
upstream. If upstream doesn't want to change, then we are back to user education.

- Cole

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-01  5:18 ` [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver Alex Williamson
  2012-08-13 22:18   ` Anthony Liguori
  2012-08-14  7:12   ` Stefan Hajnoczi
@ 2012-08-14 15:53   ` Avi Kivity
  2012-08-14 17:23     ` Alex Williamson
  2 siblings, 1 reply; 42+ messages in thread
From: Avi Kivity @ 2012-08-14 15:53 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

On 08/01/2012 08:18 AM, Alex Williamson wrote:
> This adds the core of the QEMU VFIO-based PCI device assignment driver.
> To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
> and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
> module.  To assign device 0000:05:00.0 to a guest, do the following:
> 
> for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
>     vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
>     device=$(cat /sys/bus/pci/devices/$dev/device)
>     if [ -e /sys/bus/pci/devices/$dev/driver ]; then
>         echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
>     fi
>     echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
> done
> 
> See Documentation/vfio.txt in the Linux kernel tree for further
> description of IOMMU groups and VFIO.
> 
> Then launch qemu including the option:
> 
> -device vfio-pci,host=0000:05:00.0
> 
> Support for legacy PCI interrupts (INTx) is not yet included and will
> be added in a future update.  Both MSI and MSI-X are supported here.


> +
> +static void vfio_update_irq(PCIDevice *pdev)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    PCIINTxRoute route;
> +
> +    if (vdev->interrupt != INT_INTx) {
> +        return;
> +    }
> +
> +    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> +    if (!memcmp(&route, &vdev->intx.route, sizeof(route))) {
> +        return; /* Nothing changed */
> +    }

You can't memcmp() structures, the compiler may add uninitialized holes
that will miscompare.  It's probably harmless here since it's an
optimization.

Unrelated nit: memcmp() doesn't return a boolean or a count, so
!memcmp() is really unintuitive, at least to me.

> +
> +static int vfio_enable_intx(VFIODevice *vdev)
> +{
> +    struct vfio_irq_set_fd irq_set_fd = {
> +        .irq_set = {
> +            .argsz = sizeof(irq_set_fd),
> +            .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
> +            .index = VFIO_PCI_INTX_IRQ_INDEX,
> +            .start = 0,
> +            .count = 1,
> +        },
> +    };
> +    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> +
> +    if (!pin) {
> +        return 0;
> +    }
> +
> +    vfio_disable_interrupts(vdev);
> +
> +    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> +    vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
> +                                                    vdev->intx.pin);
> +    /* TBD - Enable QEMU eoi notifier */
> +
> +    if (event_notifier_init(&vdev->intx.interrupt, 0)) {
> +        error_report("vfio: Error: event_notifier_init failed\n");
> +        return -1;

return -error is better.

> +    }
> +
> +    irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt);
> +    qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev);
> +
> +    if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) {
> +        error_report("vfio: Error: Failed to setup INTx fd: %s\n",
> +                     strerror(errno));
> +        return -1;
> +    }
> +
> +    vfio_enable_intx_kvm(vdev);
> +
> +    vdev->interrupt = INT_INTx;
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function);
> +
> +    return 0;
> +}
> +
> +
> +
> +/* XXX This should move to msi.c */

Well?

> +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
> +{
> +    uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS);
> +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> +    MSIMessage msg;
> +
> +    if (msi64bit) {
> +        msg.address = pci_get_quad(pdev->config +
> +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> +    } else {
> +        msg.address = pci_get_long(pdev->config +
> +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> +    }
> +
> +    msg.data = pci_get_word(pdev->config + pdev->msi_cap +
> +                            (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
> +    msg.data += vector;
> +
> +    return msg;
> +}
> +
> +
> +/*
> + * IO Port/MMIO - Beware of the endians, VFIO is always little endian
> + */
> +static void vfio_bar_write(void *opaque, target_phys_addr_t addr,
> +                           uint64_t data, unsigned size)
> +{
> +    VFIOBAR *bar = opaque;
> +    uint8_t buf[8];
> +
> +    switch (size) {
> +    case 1:
> +        *buf = data & 0xff;
> +        break;
> +    case 2:
> +        *(uint16_t *)buf = cpu_to_le16(data);
> +        break;
> +    case 4:
> +        *(uint32_t *)buf = cpu_to_le32(data);
> +        break;

This works accidentally on machines that require alignment, since
there's no requirement from the compiler to align buf.  You can use a
union to align it.

> +    default:
> +        hw_error("vfio: unsupported write size, %d bytes\n", size);
> +        break;
> +    }
> +
> +    if (pwrite(bar->fd, buf, size, bar->fd_offset + addr) != size) {
> +        error_report("%s(,0x%"PRIx64", 0x%"PRIx64", %d) failed: %s\n",
> +                     __func__, addr, data, size, strerror(errno));
> +    }
> +
> +    DPRINTF("%s(BAR%d+0x%"PRIx64", 0x%"PRIx64", %d)\n",
> +            __func__, bar->nr, addr, data, size);
> +}
> +
> +
> +static void vfio_listener_region_add(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.listener);
> +    target_phys_addr_t iova, end;
> +    void *vaddr;
> +    int ret;
> +
> +    if (vfio_listener_skipped_section(section)) {
> +        DPRINTF("vfio: SKIPPING region_add %016lx - %016lx\n",
> +                section->offset_within_address_space,
> +                section->offset_within_address_space + section->size - 1);
> +        return;
> +    }
> +
> +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> +        error_report("%s received unaligned region\n", __func__);

Is it really an error?  I think you can just add the condition to
skipped_section.

> +        return;
> +    }
> +
> +


-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-14 15:53   ` Avi Kivity
@ 2012-08-14 17:23     ` Alex Williamson
  2012-08-15  8:56       ` Avi Kivity
  0 siblings, 1 reply; 42+ messages in thread
From: Alex Williamson @ 2012-08-14 17:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: aik, aliguori, qemu-devel, kvm

On Tue, 2012-08-14 at 18:53 +0300, Avi Kivity wrote:
> On 08/01/2012 08:18 AM, Alex Williamson wrote:
> > This adds the core of the QEMU VFIO-based PCI device assignment driver.
> > To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
> > and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
> > module.  To assign device 0000:05:00.0 to a guest, do the following:
> > 
> > for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
> >     vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
> >     device=$(cat /sys/bus/pci/devices/$dev/device)
> >     if [ -e /sys/bus/pci/devices/$dev/driver ]; then
> >         echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
> >     fi
> >     echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
> > done
> > 
> > See Documentation/vfio.txt in the Linux kernel tree for further
> > description of IOMMU groups and VFIO.
> > 
> > Then launch qemu including the option:
> > 
> > -device vfio-pci,host=0000:05:00.0
> > 
> > Support for legacy PCI interrupts (INTx) is not yet included and will
> > be added in a future update.  Both MSI and MSI-X are supported here.
> 
> 
> > +
> > +static void vfio_update_irq(PCIDevice *pdev)
> > +{
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +    PCIINTxRoute route;
> > +
> > +    if (vdev->interrupt != INT_INTx) {
> > +        return;
> > +    }
> > +
> > +    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> > +    if (!memcmp(&route, &vdev->intx.route, sizeof(route))) {
> > +        return; /* Nothing changed */
> > +    }
> 
> You can't memcmp() structures, the compiler may add uninitialized holes
> that will miscompare.  It's probably harmless here since it's an
> optimization.

Added this helper function:

/* TODO: Move this helper out to generic PCI code */
static bool vfio_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
{
    return old->mode != new->mode || old->irq != new->irq;
}


> Unrelated nit: memcmp() doesn't return a boolean or a count, so
> !memcmp() is really unintuitive, at least to me.

I figure we're all pretty used to it growing up on !strcmp though.

> > +
> > +static int vfio_enable_intx(VFIODevice *vdev)
> > +{
> > +    struct vfio_irq_set_fd irq_set_fd = {
> > +        .irq_set = {
> > +            .argsz = sizeof(irq_set_fd),
> > +            .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
> > +            .index = VFIO_PCI_INTX_IRQ_INDEX,
> > +            .start = 0,
> > +            .count = 1,
> > +        },
> > +    };
> > +    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> > +
> > +    if (!pin) {
> > +        return 0;
> > +    }
> > +
> > +    vfio_disable_interrupts(vdev);
> > +
> > +    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> > +    vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
> > +                                                    vdev->intx.pin);
> > +    /* TBD - Enable QEMU eoi notifier */
> > +
> > +    if (event_notifier_init(&vdev->intx.interrupt, 0)) {
> > +        error_report("vfio: Error: event_notifier_init failed\n");
> > +        return -1;
> 
> return -error is better.

Here we probably want to return the return value of
event_notifier_init(), but there are lots of cases where we could return
-errno.  Fixed them.

> > +    }
> > +
> > +    irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt);
> > +    qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev);
> > +
> > +    if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) {
> > +        error_report("vfio: Error: Failed to setup INTx fd: %s\n",
> > +                     strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +    vfio_enable_intx_kvm(vdev);
> > +
> > +    vdev->interrupt = INT_INTx;
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> > +            vdev->host.bus, vdev->host.slot, vdev->host.function);
> > +
> > +    return 0;
> > +}
> > +
> > +
> > +
> > +/* XXX This should move to msi.c */
> 
> Well?

Just marking a todo item.  I'll change it formally to TODO.  I think
there are a few interfaces to msi.c that probably need some rethinking
for device assignment.  When they're small like this it seems easier to
have the user in tree first.
> 
> > +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
> > +{
> > +    uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS);
> > +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> > +    MSIMessage msg;
> > +
> > +    if (msi64bit) {
> > +        msg.address = pci_get_quad(pdev->config +
> > +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> > +    } else {
> > +        msg.address = pci_get_long(pdev->config +
> > +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> > +    }
> > +
> > +    msg.data = pci_get_word(pdev->config + pdev->msi_cap +
> > +                            (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
> > +    msg.data += vector;
> > +
> > +    return msg;
> > +}
> > +
> > +
> > +/*
> > + * IO Port/MMIO - Beware of the endians, VFIO is always little endian
> > + */
> > +static void vfio_bar_write(void *opaque, target_phys_addr_t addr,
> > +                           uint64_t data, unsigned size)
> > +{
> > +    VFIOBAR *bar = opaque;
> > +    uint8_t buf[8];
> > +
> > +    switch (size) {
> > +    case 1:
> > +        *buf = data & 0xff;
> > +        break;
> > +    case 2:
> > +        *(uint16_t *)buf = cpu_to_le16(data);
> > +        break;
> > +    case 4:
> > +        *(uint32_t *)buf = cpu_to_le32(data);
> > +        break;
> 
> This works accidentally on machines that require alignment, since
> there's no requirement from the compiler to align buf.  You can use a
> union to align it.

Good catch, fixed.

> > +    default:
> > +        hw_error("vfio: unsupported write size, %d bytes\n", size);
> > +        break;
> > +    }
> > +
> > +    if (pwrite(bar->fd, buf, size, bar->fd_offset + addr) != size) {
> > +        error_report("%s(,0x%"PRIx64", 0x%"PRIx64", %d) failed: %s\n",
> > +                     __func__, addr, data, size, strerror(errno));
> > +    }
> > +
> > +    DPRINTF("%s(BAR%d+0x%"PRIx64", 0x%"PRIx64", %d)\n",
> > +            __func__, bar->nr, addr, data, size);
> > +}
> > +
> > +
> > +static void vfio_listener_region_add(MemoryListener *listener,
> > +                                     MemoryRegionSection *section)
> > +{
> > +    VFIOContainer *container = container_of(listener, VFIOContainer,
> > +                                            iommu_data.listener);
> > +    target_phys_addr_t iova, end;
> > +    void *vaddr;
> > +    int ret;
> > +
> > +    if (vfio_listener_skipped_section(section)) {
> > +        DPRINTF("vfio: SKIPPING region_add %016lx - %016lx\n",
> > +                section->offset_within_address_space,
> > +                section->offset_within_address_space + section->size - 1);
> > +        return;
> > +    }
> > +
> > +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
> > +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
> > +        error_report("%s received unaligned region\n", __func__);
> 
> Is it really an error?  I think you can just add the condition to
> skipped_section.

I had left this in as paranoia for myself that I wanted to see if this
actually happens.  I want to assume that our TARGET_PAGE_ALIGNED
offset_within_address_space results in an aligned ram pointer.  If one
is aligned differently from the other we're kinda screwed trying to map it
into the iommu.  So far I haven't seen it.  Thanks for the feedback,

Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver
  2012-08-14 17:23     ` Alex Williamson
@ 2012-08-15  8:56       ` Avi Kivity
  0 siblings, 0 replies; 42+ messages in thread
From: Avi Kivity @ 2012-08-15  8:56 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, aliguori, qemu-devel, kvm

On 08/14/2012 08:23 PM, Alex Williamson wrote:
> 
>> Unrelated nit: memcmp() doesn't return a boolean or a count, so
>> !memcmp() is really unintuitive, at least to me.
> 
> I figure we're all pretty used to it growing up on !strcmp though.

I hate that one too.

>> > +
>> > +/* XXX This should move to msi.c */
>> 
>> Well?
> 
> Just marking a todo item.  I'll change it formally to TODO.  I think
> there are a few interfaces to msi.c that probably need some rethinking
> for device assignment.  When they're small like this it seems easier to
> have the user in tree first.

I prefer them in the right place but I don't insist.

>> > +
>> > +    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
>> > +                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> > +        error_report("%s received unaligned region\n", __func__);
>> 
>> Is it really an error?  I think you can just add the condition to
>> skipped_section.
> 
> I had left this in as paranoia for myself that I wanted to see if this
> actually happens.  I want to assume that our TARGET_PAGE_ALIGNED
> offset_within_address_space results in an aligned ram pointer.  If one
> is aligned differently from the other we're kinda screwed trying to map it
> into the iommu.  So far I haven't seen it.  Thanks for the feedback,

We could have a sub-page RAM region (perhaps inserted as a mapped BAR
from some emulated device, or from vfio if/when it grows that capability).

But you're right, it really is an error, we can't just ignore it.  So
the current code is right.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2012-08-15  8:56 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-01  5:18 [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Alex Williamson
2012-08-01  5:18 ` [Qemu-devel] [PATCH 1/3] vfio: Import vfio kernel header Alex Williamson
2012-08-01  7:13   ` Jan Kiszka
2012-08-01 18:09     ` Alex Williamson
2012-08-02  9:02       ` Jan Kiszka
2012-08-02 16:37         ` Alex Williamson
2012-08-02 16:45           ` Jan Kiszka
2012-08-01  5:18 ` [Qemu-devel] [PATCH 2/3] vfio: vfio-pci device assignment driver Alex Williamson
2012-08-13 22:18   ` Anthony Liguori
2012-08-14  5:25     ` Alex Williamson
2012-08-14  7:12   ` Stefan Hajnoczi
2012-08-14 13:51     ` Alex Williamson
2012-08-14 15:53   ` Avi Kivity
2012-08-14 17:23     ` Alex Williamson
2012-08-15  8:56       ` Avi Kivity
2012-08-01  5:18 ` [Qemu-devel] [PATCH 3/3] vfio: Enable vfio-pci and mark supported Alex Williamson
2012-08-01  7:15   ` Jan Kiszka
2012-08-01 18:14     ` Alex Williamson
2012-08-01 19:40       ` Alex Williamson
2012-08-02  9:03         ` Jan Kiszka
2012-08-13 22:19     ` Anthony Liguori
2012-08-14  5:27       ` Alex Williamson
2012-08-14 14:35         ` Avi Kivity
2012-08-13 13:27 ` [Qemu-devel] [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2 Anthony Liguori
2012-08-13 13:58   ` Avi Kivity
2012-08-13 14:04     ` Jan Kiszka
2012-08-13 19:31       ` Anthony Liguori
2012-08-14  7:19         ` Jan Kiszka
2012-08-14 14:42         ` Avi Kivity
2012-08-14 14:53         ` Cole Robinson
2012-08-14 15:04           ` Jan Kiszka
2012-08-14 15:28             ` Cole Robinson
2012-08-13 14:23   ` Alex Williamson
2012-08-13 15:48     ` Andreas Hartmann
2012-08-13 16:14       ` Alex Williamson
2012-08-13 16:36         ` Andreas Hartmann
2012-08-13 16:57           ` Alex Williamson
2012-08-13 18:32             ` Andreas Hartmann
2012-08-13 19:33     ` Anthony Liguori
2012-08-13 20:48       ` Blue Swirl
2012-08-13 20:56         ` Alex Williamson
2012-08-13 20:55       ` [Qemu-devel] VFIO: Call for reviewers (was Re: [PATCH 0/3] VFIO-based PCI device assignment for QEMU 1.2) Alex Williamson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).