Re: [RFC PATCH] vfio: VFIO PCI driver for Qemu

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Avi Kivity <avi@redhat.com>
To: Alex Williamson <alex.williamson@redhat.com>
Cc: aik@ozlabs.ru, qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: Re: [RFC PATCH] vfio: VFIO PCI driver for Qemu
Date: Thu, 26 Jul 2012 19:34:07 +0300	[thread overview]
Message-ID: <5011717F.6090600@redhat.com> (raw)
In-Reply-To: <20120725165948.17260.82862.stgit@bling.home>

On 07/25/2012 08:03 PM, Alex Williamson wrote:

> +/*
> + * Resource setup
> + */
> +static void vfio_unmap_bar(VFIODevice *vdev, int nr)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    uint64_t size;
> +
> +    if (!memory_region_size(&bar->mem)) {
> +        return;
> +    }
> +
> +    size = memory_region_size(&bar->mmap_mem);
> +    if (size) {
> +         memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
> +         munmap(bar->mmap, size);
> +    }
> +
> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        size = memory_region_size(&vdev->msix->mmap_mem);
> +        if (size) {
> +            memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
> +            munmap(vdev->msix->mmap, size);
> +        }
> +    }

Are the three size checks needed? Everything should work without them
from the memory core point of view.

> +
> +    memory_region_destroy(&bar->mem);
> +}
> +
> +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
> +                         void **map, size_t size, off_t offset,
> +                         const char *name)
> +{
> +    *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
> +                MAP_SHARED, bar->fd, bar->fd_offset + offset);
> +    if (*map == MAP_FAILED) {
> +        *map = NULL;
> +        return -1;
> +    }
> +
> +    memory_region_init_ram_ptr(submem, name, size, *map);
> +    memory_region_add_subregion(mem, offset, submem);



> +
> +    return 0;
> +}
> +
> +static void vfio_map_bar(VFIODevice *vdev, int nr, uint8_t type)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    unsigned size = bar->size;
> +    char name[64];
> +
> +    sprintf(name, "VFIO %04x:%02x:%02x.%x BAR %d", vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function, nr);
> +
> +    /* A "slow" read/write mapping underlies all BARs */
> +    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
> +    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);

So far all container BARs have been pure containers, without RAM or I/O
callbacks.  It should all work, but this sets precedent and requires it
to work.  I guess there's no problem supporting it though.

> +
> +    if (type & PCI_BASE_ADDRESS_SPACE_IO) {
> +        return; /* IO space is only slow, don't expect high perf here */
> +    }

What about non-x86 where IO is actually memory?  I think you can drop
this and let the address space filtering in the listener drop it if it
turns out to be in IO space.

> +
> +    if (size & ~TARGET_PAGE_MASK) {
> +        error_report("%s is too small to mmap, this may affect performance.\n",
> +                     name);
> +        return;
> +    }

We can work a little harder and align the host space offset with the
guest space offset, and map it in.

> +
> +    /*
> +     * We can't mmap areas overlapping the MSIX vector table, so we
> +     * potentially insert a direct-mapped subregion before and after it.
> +     */

This splitting is what the memory core really enjoys.  You can just
place the MSIX page over the RAM page and let it do the cut-n-paste.

> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
> +    }
> +         
> +    if (size) {
> +        strcat(name, " mmap");
> +        if (vfio_mmap_bar(bar, &bar->mem, &bar->mmap_mem, &bar->mmap,
> +                          size, 0, name)) {
> +            error_report("%s Failed. Performance may be slow\n", name);
> +        }
> +    }
> +
> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        unsigned start;
> +
> +        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
> +                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
> +
> +        if (start < bar->size) {
> +            size = bar->size - start;
> +            strcat(name, " msix-hi");
> +            /* MSIXInfo contains another MemoryRegion for this mapping */
> +            if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
> +                              &vdev->msix->mmap, size, start, name)) {
> +                error_report("%s Failed. Performance may be slow\n", name);
> +            }
> +        }
> +    }
> +
> +    return;
> +}
> +
> +
> +static int __vfio_get_device(VFIOGroup *group,
> +                             const char *name, VFIODevice *vdev)

__foo is a reserved symbol.

> +{
> +    int ret;
> +
> +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> +    if (ret < 0) {
> +        error_report("vfio: error getting device %s from group %d: %s",
> +                     name, group->groupid, strerror(errno));
> +        error_report("Verify all devices in group %d "
> +                     "are bound to vfio-pci or pci-stub and not already in use",
> +                     group->groupid);
> +        return -1;
> +    }
> +
> +    vdev->group = group;
> +    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
> +
> +    vdev->fd = ret;
> +
> +    return 0;
> +}
> +
> +
> +static Property vfio_pci_dev_properties[] = {
> +    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
> +    //TODO - support passed fds... is this necessary?

Yes.

> +    //DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
> +    //DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +
> +
> +typedef struct MSIVector {
> +    EventNotifier interrupt; /* eventfd triggered on interrupt */
> +    struct VFIODevice *vdev; /* back pointer to device */
> +    int vector; /* the vector number for this element */
> +    int virq; /* KVM irqchip route for Qemu bypass */

This calls for an abstraction (don't we have a cache where we look those
up?)

> +    bool use;
> +} MSIVector;
> +
> +
> +typedef struct VFIOContainer {
> +    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> +    struct {
> +        /* enable abstraction to support various iommu backends */
> +        union {
> +            MemoryListener listener; /* Used by type1 iommu */
> +        };

The usual was is to have a Type1VFIOContainer deriving from
VFIOContainer and adding a MemoryListener.

> +        void (*release)(struct VFIOContainer *);
> +    } iommu_data;
> +    QLIST_HEAD(, VFIOGroup) group_list;
> +    QLIST_ENTRY(VFIOContainer) next;
> +} VFIOContainer;
> +

> +#endif /* __VFIO_H__ */
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index 5a9d4e3..bd1a76c 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h

Separate patch when leaving RFC mode.

> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> new file mode 100644
> index 0000000..0a4f180
> --- /dev/null
> +++ b/linux-headers/linux/vfio.h
> @@ -0,0 +1,445 @@
> +/*
> + * VFIO API definition
> + *
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#ifndef VFIO_H
> +#define VFIO_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define VFIO_API_VERSION	0
> +
> +#ifdef __KERNEL__	/* Internal VFIO-core/bus driver API */

Use the exported file, that gets rid of the __KERNEL__ bits.

-- 
error compiling committee.c: too many arguments to function

WARNING: multiple messages have this Message-ID (diff)

From: Avi Kivity <avi@redhat.com>
To: Alex Williamson <alex.williamson@redhat.com>
Cc: aik@ozlabs.ru, qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: Re: [Qemu-devel] [RFC PATCH] vfio: VFIO PCI driver for Qemu
Date: Thu, 26 Jul 2012 19:34:07 +0300	[thread overview]
Message-ID: <5011717F.6090600@redhat.com> (raw)
In-Reply-To: <20120725165948.17260.82862.stgit@bling.home>

On 07/25/2012 08:03 PM, Alex Williamson wrote:

> +/*
> + * Resource setup
> + */
> +static void vfio_unmap_bar(VFIODevice *vdev, int nr)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    uint64_t size;
> +
> +    if (!memory_region_size(&bar->mem)) {
> +        return;
> +    }
> +
> +    size = memory_region_size(&bar->mmap_mem);
> +    if (size) {
> +         memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
> +         munmap(bar->mmap, size);
> +    }
> +
> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        size = memory_region_size(&vdev->msix->mmap_mem);
> +        if (size) {
> +            memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
> +            munmap(vdev->msix->mmap, size);
> +        }
> +    }

Are the three size checks needed? Everything should work without them
from the memory core point of view.

> +
> +    memory_region_destroy(&bar->mem);
> +}
> +
> +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
> +                         void **map, size_t size, off_t offset,
> +                         const char *name)
> +{
> +    *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
> +                MAP_SHARED, bar->fd, bar->fd_offset + offset);
> +    if (*map == MAP_FAILED) {
> +        *map = NULL;
> +        return -1;
> +    }
> +
> +    memory_region_init_ram_ptr(submem, name, size, *map);
> +    memory_region_add_subregion(mem, offset, submem);



> +
> +    return 0;
> +}
> +
> +static void vfio_map_bar(VFIODevice *vdev, int nr, uint8_t type)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    unsigned size = bar->size;
> +    char name[64];
> +
> +    sprintf(name, "VFIO %04x:%02x:%02x.%x BAR %d", vdev->host.domain,
> +            vdev->host.bus, vdev->host.slot, vdev->host.function, nr);
> +
> +    /* A "slow" read/write mapping underlies all BARs */
> +    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
> +    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);

So far all container BARs have been pure containers, without RAM or I/O
callbacks.  It should all work, but this sets precedent and requires it
to work.  I guess there's no problem supporting it though.

> +
> +    if (type & PCI_BASE_ADDRESS_SPACE_IO) {
> +        return; /* IO space is only slow, don't expect high perf here */
> +    }

What about non-x86 where IO is actually memory?  I think you can drop
this and let the address space filtering in the listener drop it if it
turns out to be in IO space.

> +
> +    if (size & ~TARGET_PAGE_MASK) {
> +        error_report("%s is too small to mmap, this may affect performance.\n",
> +                     name);
> +        return;
> +    }

We can work a little harder and align the host space offset with the
guest space offset, and map it in.

> +
> +    /*
> +     * We can't mmap areas overlapping the MSIX vector table, so we
> +     * potentially insert a direct-mapped subregion before and after it.
> +     */

This splitting is what the memory core really enjoys.  You can just
place the MSIX page over the RAM page and let it do the cut-n-paste.

> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
> +    }
> +         
> +    if (size) {
> +        strcat(name, " mmap");
> +        if (vfio_mmap_bar(bar, &bar->mem, &bar->mmap_mem, &bar->mmap,
> +                          size, 0, name)) {
> +            error_report("%s Failed. Performance may be slow\n", name);
> +        }
> +    }
> +
> +    if (vdev->msix && vdev->msix->table_bar == nr) {
> +        unsigned start;
> +
> +        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
> +                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
> +
> +        if (start < bar->size) {
> +            size = bar->size - start;
> +            strcat(name, " msix-hi");
> +            /* MSIXInfo contains another MemoryRegion for this mapping */
> +            if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
> +                              &vdev->msix->mmap, size, start, name)) {
> +                error_report("%s Failed. Performance may be slow\n", name);
> +            }
> +        }
> +    }
> +
> +    return;
> +}
> +
> +
> +static int __vfio_get_device(VFIOGroup *group,
> +                             const char *name, VFIODevice *vdev)

__foo is a reserved symbol.

> +{
> +    int ret;
> +
> +    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> +    if (ret < 0) {
> +        error_report("vfio: error getting device %s from group %d: %s",
> +                     name, group->groupid, strerror(errno));
> +        error_report("Verify all devices in group %d "
> +                     "are bound to vfio-pci or pci-stub and not already in use",
> +                     group->groupid);
> +        return -1;
> +    }
> +
> +    vdev->group = group;
> +    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
> +
> +    vdev->fd = ret;
> +
> +    return 0;
> +}
> +
> +
> +static Property vfio_pci_dev_properties[] = {
> +    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
> +    //TODO - support passed fds... is this necessary?

Yes.

> +    //DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
> +    //DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +
> +
> +typedef struct MSIVector {
> +    EventNotifier interrupt; /* eventfd triggered on interrupt */
> +    struct VFIODevice *vdev; /* back pointer to device */
> +    int vector; /* the vector number for this element */
> +    int virq; /* KVM irqchip route for Qemu bypass */

This calls for an abstraction (don't we have a cache where we look those
up?)

> +    bool use;
> +} MSIVector;
> +
> +
> +typedef struct VFIOContainer {
> +    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> +    struct {
> +        /* enable abstraction to support various iommu backends */
> +        union {
> +            MemoryListener listener; /* Used by type1 iommu */
> +        };

The usual was is to have a Type1VFIOContainer deriving from
VFIOContainer and adding a MemoryListener.

> +        void (*release)(struct VFIOContainer *);
> +    } iommu_data;
> +    QLIST_HEAD(, VFIOGroup) group_list;
> +    QLIST_ENTRY(VFIOContainer) next;
> +} VFIOContainer;
> +

> +#endif /* __VFIO_H__ */
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index 5a9d4e3..bd1a76c 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h

Separate patch when leaving RFC mode.

> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> new file mode 100644
> index 0000000..0a4f180
> --- /dev/null
> +++ b/linux-headers/linux/vfio.h
> @@ -0,0 +1,445 @@
> +/*
> + * VFIO API definition
> + *
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#ifndef VFIO_H
> +#define VFIO_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define VFIO_API_VERSION	0
> +
> +#ifdef __KERNEL__	/* Internal VFIO-core/bus driver API */

Use the exported file, that gets rid of the __KERNEL__ bits.

-- 
error compiling committee.c: too many arguments to function

next prev parent reply	other threads:[~2012-07-26 16:34 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-07-25 17:03 [RFC PATCH] vfio: VFIO PCI driver for Qemu Alex Williamson
2012-07-25 17:03 ` [Qemu-devel] " Alex Williamson
2012-07-25 19:30 ` Avi Kivity
2012-07-25 19:30   ` [Qemu-devel] " Avi Kivity
2012-07-25 19:53   ` Alex Williamson
2012-07-25 19:53     ` [Qemu-devel] " Alex Williamson
2012-07-26  8:35     ` Avi Kivity
2012-07-26  8:35       ` [Qemu-devel] " Avi Kivity
2012-07-26  9:28       ` Andreas Hartmann
2012-07-26  9:40         ` Avi Kivity
2012-07-26 13:45           ` Andreas Hartmann
2012-07-26 14:56       ` Alex Williamson
2012-07-26 14:56         ` [Qemu-devel] " Alex Williamson
2012-07-26 15:59         ` Avi Kivity
2012-07-26 15:59           ` [Qemu-devel] " Avi Kivity
2012-07-26 16:33           ` Alex Williamson
2012-07-26 16:33             ` [Qemu-devel] " Alex Williamson
2012-07-26 16:40             ` Avi Kivity
2012-07-26 16:40               ` [Qemu-devel] " Avi Kivity
2012-07-26 19:11               ` Alex Williamson
2012-07-26 21:23                 ` Andreas Hartmann
2012-07-26 21:37                   ` Alex Williamson
2012-07-26 16:06         ` Avi Kivity
2012-07-26 16:06           ` [Qemu-devel] " Avi Kivity
2012-07-26 16:40           ` Alex Williamson
2012-07-26 16:40             ` [Qemu-devel] " Alex Williamson
2012-07-26 16:47             ` Avi Kivity
2012-07-26 16:47               ` [Qemu-devel] " Avi Kivity
2012-07-26 15:09 ` Alex Williamson
2012-07-26 15:09   ` [Qemu-devel] " Alex Williamson
2012-07-26 16:34 ` Avi Kivity [this message]
2012-07-26 16:34   ` Avi Kivity
2012-07-26 17:40   ` Alex Williamson
2012-07-26 17:40     ` Alex Williamson
2012-07-29 13:47     ` Avi Kivity
2012-07-29 13:47       ` [Qemu-devel] " Avi Kivity
2012-07-30 22:29       ` Alex Williamson
2012-07-30 22:29         ` Alex Williamson
2012-07-31 12:34         ` Avi Kivity
2012-07-31 16:56           ` Alex Williamson
2012-07-31 16:56             ` [Qemu-devel] " Alex Williamson
2012-07-27 19:22 ` Blue Swirl
2012-07-27 19:22   ` Blue Swirl
2012-07-27 20:28   ` Alex Williamson
2012-07-27 20:28     ` Alex Williamson
2012-07-28  2:55   ` Alexey Kardashevskiy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5011717F.6090600@redhat.com \
    --to=avi@redhat.com \
    --cc=aik@ozlabs.ru \
    --cc=alex.williamson@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.