From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([208.118.235.92]:35943) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1SuSZ0-0003cc-D4 for qemu-devel@nongnu.org; Thu, 26 Jul 2012 14:13:31 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1SuSYy-0005RK-Po for qemu-devel@nongnu.org; Thu, 26 Jul 2012 14:13:30 -0400 Received: from mx1.redhat.com ([209.132.183.28]:50415) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1SuSYy-0005RF-Eq for qemu-devel@nongnu.org; Thu, 26 Jul 2012 14:13:28 -0400 Message-ID: <5011717F.6090600@redhat.com> Date: Thu, 26 Jul 2012 19:34:07 +0300 From: Avi Kivity MIME-Version: 1.0 References: <20120725165948.17260.82862.stgit@bling.home> In-Reply-To: <20120725165948.17260.82862.stgit@bling.home> Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [RFC PATCH] vfio: VFIO PCI driver for Qemu List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Alex Williamson Cc: aik@ozlabs.ru, qemu-devel@nongnu.org, kvm@vger.kernel.org On 07/25/2012 08:03 PM, Alex Williamson wrote: > +/* > + * Resource setup > + */ > +static void vfio_unmap_bar(VFIODevice *vdev, int nr) > +{ > + VFIOBAR *bar = &vdev->bars[nr]; > + uint64_t size; > + > + if (!memory_region_size(&bar->mem)) { > + return; > + } > + > + size = memory_region_size(&bar->mmap_mem); > + if (size) { > + memory_region_del_subregion(&bar->mem, &bar->mmap_mem); > + munmap(bar->mmap, size); > + } > + > + if (vdev->msix && vdev->msix->table_bar == nr) { > + size = memory_region_size(&vdev->msix->mmap_mem); > + if (size) { > + memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem); > + munmap(vdev->msix->mmap, size); > + } > + } Are the three size checks needed? Everything should work without them from the memory core point of view. 
> + > + memory_region_destroy(&bar->mem); > +} > + > +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem, > + void **map, size_t size, off_t offset, > + const char *name) > +{ > + *map = mmap(NULL, size, PROT_READ | PROT_WRITE, > + MAP_SHARED, bar->fd, bar->fd_offset + offset); > + if (*map == MAP_FAILED) { > + *map = NULL; > + return -1; > + } > + > + memory_region_init_ram_ptr(submem, name, size, *map); > + memory_region_add_subregion(mem, offset, submem); > + > + return 0; > +} > + > +static void vfio_map_bar(VFIODevice *vdev, int nr, uint8_t type) > +{ > + VFIOBAR *bar = &vdev->bars[nr]; > + unsigned size = bar->size; > + char name[64]; > + > + sprintf(name, "VFIO %04x:%02x:%02x.%x BAR %d", vdev->host.domain, > + vdev->host.bus, vdev->host.slot, vdev->host.function, nr); > + > + /* A "slow" read/write mapping underlies all BARs */ > + memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size); > + pci_register_bar(&vdev->pdev, nr, type, &bar->mem); So far all container BARs have been pure containers, without RAM or I/O callbacks. It should all work, but this sets precedent and requires it to work. I guess there's no problem supporting it though. > + > + if (type & PCI_BASE_ADDRESS_SPACE_IO) { > + return; /* IO space is only slow, don't expect high perf here */ > + } What about non-x86 where IO is actually memory? I think you can drop this and let the address space filtering in the listener drop it if it turns out to be in IO space. > + > + if (size & ~TARGET_PAGE_MASK) { > + error_report("%s is too small to mmap, this may affect performance.\n", > + name); > + return; > + } We can work a little harder and align the host space offset with the guest space offset, and map it in. > + > + /* > + * We can't mmap areas overlapping the MSIX vector table, so we > + * potentially insert a direct-mapped subregion before and after it. > + */ This splitting is what the memory core really enjoys. 
You can just place the MSIX page over the RAM page and let it do the cut-n-paste. > + if (vdev->msix && vdev->msix->table_bar == nr) { > + size = vdev->msix->table_offset & TARGET_PAGE_MASK; > + } > + > + if (size) { > + strcat(name, " mmap"); > + if (vfio_mmap_bar(bar, &bar->mem, &bar->mmap_mem, &bar->mmap, > + size, 0, name)) { > + error_report("%s Failed. Performance may be slow\n", name); > + } > + } > + > + if (vdev->msix && vdev->msix->table_bar == nr) { > + unsigned start; > + > + start = TARGET_PAGE_ALIGN(vdev->msix->table_offset + > + (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); > + > + if (start < bar->size) { > + size = bar->size - start; > + strcat(name, " msix-hi"); > + /* MSIXInfo contains another MemoryRegion for this mapping */ > + if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem, > + &vdev->msix->mmap, size, start, name)) { > + error_report("%s Failed. Performance may be slow\n", name); > + } > + } > + } > + > + return; > +} > + > + > +static int __vfio_get_device(VFIOGroup *group, > + const char *name, VFIODevice *vdev) __foo is a reserved symbol. > +{ > + int ret; > + > + ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); > + if (ret < 0) { > + error_report("vfio: error getting device %s from group %d: %s", > + name, group->groupid, strerror(errno)); > + error_report("Verify all devices in group %d " > + "are bound to vfio-pci or pci-stub and not already in use", > + group->groupid); > + return -1; > + } > + > + vdev->group = group; > + QLIST_INSERT_HEAD(&group->device_list, vdev, next); > + > + vdev->fd = ret; > + > + return 0; > +} > + > + > +static Property vfio_pci_dev_properties[] = { > + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host), > + //TODO - support passed fds... is this necessary? Yes. 
> + //DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), > + //DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name), > + DEFINE_PROP_END_OF_LIST(), > +}; > + > + > + > +typedef struct MSIVector { > + EventNotifier interrupt; /* eventfd triggered on interrupt */ > + struct VFIODevice *vdev; /* back pointer to device */ > + int vector; /* the vector number for this element */ > + int virq; /* KVM irqchip route for Qemu bypass */ This calls for an abstraction (don't we have a cache where we look those up?) > + bool use; > +} MSIVector; > + > + > +typedef struct VFIOContainer { > + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ > + struct { > + /* enable abstraction to support various iommu backends */ > + union { > + MemoryListener listener; /* Used by type1 iommu */ > + }; The usual way is to have a Type1VFIOContainer deriving from VFIOContainer and adding a MemoryListener. > + void (*release)(struct VFIOContainer *); > + } iommu_data; > + QLIST_HEAD(, VFIOGroup) group_list; > + QLIST_ENTRY(VFIOContainer) next; > +} VFIOContainer; > + > +#endif /* __VFIO_H__ */ > diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h > index 5a9d4e3..bd1a76c 100644 > --- a/linux-headers/linux/kvm.h > +++ b/linux-headers/linux/kvm.h Separate patch when leaving RFC mode. > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h > new file mode 100644 > index 0000000..0a4f180 > --- /dev/null > +++ b/linux-headers/linux/vfio.h > @@ -0,0 +1,445 @@ > +/* > + * VFIO API definition > + * > + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. > + * Author: Alex Williamson > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. 
> + */ > +#ifndef VFIO_H > +#define VFIO_H > + > +#include > +#include > + > +#define VFIO_API_VERSION 0 > + > +#ifdef __KERNEL__ /* Internal VFIO-core/bus driver API */ Use the exported file; that gets rid of the __KERNEL__ bits. -- error compiling committee.c: too many arguments to function