From: Anthony Liguori <anthony@codemonkey.ws>
To: Cam Macdonell <cam@cs.ualberta.ca>
Cc: kvm@vger.kernel.org, qemu-devel@nongnu.org
Subject: Re: [PATCH v5 4/5] Inter-VM shared memory PCI device
Date: Thu, 06 May 2010 12:32:53 -0500 [thread overview]
Message-ID: <4BE2FD45.8000601@codemonkey.ws> (raw)
In-Reply-To: <1271872408-22842-5-git-send-email-cam@cs.ualberta.ca>
On 04/21/2010 12:53 PM, Cam Macdonell wrote:
> Support an inter-VM shared memory device that maps a shared-memory object as a
> PCI device in the guest. This patch also supports interrupts between guests by
> communicating over a Unix domain socket. This patch applies to the qemu-kvm
> repository.
>
> -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>
> Interrupts between multiple VMs are supported by connecting to a shared memory
> server through a chardev socket.
>
> -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
> -chardev socket,path=<path>,id=<id>
>
> (shared memory server is qemu.git/contrib/ivshmem-server)
>
> Sample programs and init scripts are in a git repo here:
>
> www.gitorious.org/nahanni
> ---
> Makefile.target | 3 +
> hw/ivshmem.c | 727 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> qemu-char.c | 6 +
> qemu-char.h | 3 +
> qemu-doc.texi | 25 ++
> 5 files changed, 764 insertions(+), 0 deletions(-)
> create mode 100644 hw/ivshmem.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 1ffd802..bc9a681 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -199,6 +199,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
> obj-y += rtl8139.o
> obj-y += e1000.o
>
> +# Inter-VM PCI shared memory
> +obj-y += ivshmem.o
> +
> # Hardware support
> obj-i386-y = pckbd.o dma.o
> obj-i386-y += vga.o
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> new file mode 100644
> index 0000000..f8d8fdb
> --- /dev/null
> +++ b/hw/ivshmem.c
> @@ -0,0 +1,727 @@
> +/*
> + * Inter-VM Shared Memory PCI device.
> + *
> + * Author:
> + * Cam Macdonell<cam@cs.ualberta.ca>
> + *
> + * Based On: cirrus_vga.c and rtl8139.c
> + *
> + * This code is licensed under the GNU GPL v2.
> + */
> +#include<sys/mman.h>
> +#include<sys/types.h>
> +#include<sys/socket.h>
> +#include<sys/io.h>
> +#include<sys/ioctl.h>
> +#include<sys/eventfd.h>
>
These includes will break the build on Windows, as well as on any non-Linux
Unix and on any Linux old enough to lack eventfd support.
If it's based on cirrus_vga.c and rtl8139.c, then it ought to carry their
respective copyright notices, no?
Regards,
Anthony Liguori
> +#include "hw.h"
> +#include "console.h"
> +#include "pc.h"
> +#include "pci.h"
> +#include "sysemu.h"
> +
> +#include "msix.h"
> +#include "qemu-kvm.h"
> +#include "libkvm.h"
> +
> +#include<sys/eventfd.h>
> +#include<sys/mman.h>
> +#include<sys/socket.h>
> +#include<sys/ioctl.h>
> +
> +#define IVSHMEM_IRQFD 0
> +#define IVSHMEM_MSI 1
> +
> +#define DEBUG_IVSHMEM
> +#ifdef DEBUG_IVSHMEM
> +#define IVSHMEM_DPRINTF(fmt, args...) \
> + do {printf("IVSHMEM: " fmt, ##args); } while (0)
> +#else
> +#define IVSHMEM_DPRINTF(fmt, args...)
> +#endif
> +
> +typedef struct EventfdEntry {
> + PCIDevice *pdev;
> + int vector;
> +} EventfdEntry;
> +
> +typedef struct IVShmemState {
> + PCIDevice dev;
> + uint32_t intrmask;
> + uint32_t intrstatus;
> + uint32_t doorbell;
> +
> + CharDriverState * chr;
> + CharDriverState ** eventfd_chr;
> + int ivshmem_mmio_io_addr;
> +
> + pcibus_t mmio_addr;
> + unsigned long ivshmem_offset;
> + uint64_t ivshmem_size; /* size of shared memory region */
> + int shm_fd; /* shared memory file descriptor */
> +
> + int nr_allocated_vms;
> + /* array of eventfds for each guest */
> + int ** eventfds;
> + /* keep track of # of eventfds for each guest*/
> + int * eventfds_posn_count;
> +
> + int nr_alloc_guests;
> + int vm_id;
> + int num_eventfds;
> + uint32_t vectors;
> + uint32_t features;
> + EventfdEntry *eventfd_table;
> +
> + char * shmobj;
> + char * sizearg;
> +} IVShmemState;
> +
> +/* registers for the Inter-VM shared memory device */
> +enum ivshmem_registers {
> + IntrMask = 0,
> + IntrStatus = 4,
> + IVPosition = 8,
> + Doorbell = 12,
> +};
> +
> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
> + return (ivs->features& (1<< feature));
> +}
> +
> +static inline int is_power_of_two(int x) {
> + return (x& (x-1)) == 0;
> +}
> +
> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
> + pcibus_t addr, pcibus_t size, int type)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> + IVSHMEM_DPRINTF("addr = %u size = %u\n", (uint32_t)addr, (uint32_t)size);
> + cpu_register_physical_memory(addr, s->ivshmem_size, s->ivshmem_offset);
> +
> +}
> +
> +/* accessing registers - based on rtl8139 */
> +static void ivshmem_update_irq(IVShmemState *s, int val)
> +{
> + int isr;
> + isr = (s->intrstatus& s->intrmask)& 0xffffffff;
> +
> + /* don't print ISR resets */
> + if (isr) {
> + IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
> + isr ? 1 : 0, s->intrstatus, s->intrmask);
> + }
> +
> + qemu_set_irq(s->dev.irq[0], (isr != 0));
> +}
> +
> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
> +{
> + IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
> +
> + s->intrmask = val;
> +
> + ivshmem_update_irq(s, val);
> +}
> +
> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
> +{
> + uint32_t ret = s->intrmask;
> +
> + IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
> +
> + return ret;
> +}
> +
> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
> +{
> + IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
> +
> + s->intrstatus = val;
> +
> + ivshmem_update_irq(s, val);
> + return;
> +}
> +
> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
> +{
> + uint32_t ret = s->intrstatus;
> +
> + /* reading ISR clears all interrupts */
> + s->intrstatus = 0;
> +
> + ivshmem_update_irq(s, 0);
> +
> + return ret;
> +}
> +
> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
> +{
> +
> + IVSHMEM_DPRINTF("We shouldn't be writing words\n");
> +}
> +
> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
> +{
> + IVShmemState *s = opaque;
> +
> + u_int64_t write_one = 1;
> + u_int16_t dest = val>> 16;
> + u_int16_t vector = val& 0xff;
> +
> + addr&= 0xfe;
> +
> + switch (addr)
> + {
> + case IntrMask:
> + ivshmem_IntrMask_write(s, val);
> + break;
> +
> + case IntrStatus:
> + ivshmem_IntrStatus_write(s, val);
> + break;
> +
> + case Doorbell:
> + /* check doorbell range */
> + if ((vector>= 0)&& (vector< s->eventfds_posn_count[dest])) {
> + IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n", write_one, dest, vector);
> + if (write(s->eventfds[dest][vector],&(write_one), 8) != 8) {
> + IVSHMEM_DPRINTF("error writing to eventfd\n");
> + }
> + }
> + break;
> + default:
> + IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
> + }
> +}
> +
> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
> +{
> + IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
> +}
> +
> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
> +{
> +
> + IVSHMEM_DPRINTF("We shouldn't be reading words\n");
> + return 0;
> +}
> +
> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
> +{
> +
> + IVShmemState *s = opaque;
> + uint32_t ret;
> +
> + switch (addr)
> + {
> + case IntrMask:
> + ret = ivshmem_IntrMask_read(s);
> + break;
> +
> + case IntrStatus:
> + ret = ivshmem_IntrStatus_read(s);
> + break;
> +
> + case IVPosition:
> + /* return my id in the ivshmem list */
> + ret = s->vm_id;
> + break;
> +
> + default:
> + IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
> + ret = 0;
> + }
> +
> + return ret;
> +
> +}
> +
> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
> +{
> + IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
> +
> + return 0;
> +}
> +
> +static void ivshmem_mmio_writeb(void *opaque,
> + target_phys_addr_t addr, uint32_t val)
> +{
> + ivshmem_io_writeb(opaque, addr& 0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writew(void *opaque,
> + target_phys_addr_t addr, uint32_t val)
> +{
> + ivshmem_io_writew(opaque, addr& 0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writel(void *opaque,
> + target_phys_addr_t addr, uint32_t val)
> +{
> + ivshmem_io_writel(opaque, addr& 0xFF, val);
> +}
> +
> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
> +{
> + return ivshmem_io_readb(opaque, addr& 0xFF);
> +}
> +
> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
> +{
> + uint32_t val = ivshmem_io_readw(opaque, addr& 0xFF);
> + return val;
> +}
> +
> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
> +{
> + uint32_t val = ivshmem_io_readl(opaque, addr& 0xFF);
> + return val;
> +}
> +
> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
> + ivshmem_mmio_readb,
> + ivshmem_mmio_readw,
> + ivshmem_mmio_readl,
> +};
> +
> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
> + ivshmem_mmio_writeb,
> + ivshmem_mmio_writew,
> + ivshmem_mmio_writel,
> +};
> +
> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
> +{
> + IVShmemState *s = opaque;
> +
> + ivshmem_IntrStatus_write(s, *buf);
> +
> + IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
> +}
> +
> +static int ivshmem_can_receive(void * opaque)
> +{
> + return 8;
> +}
> +
> +static void ivshmem_event(void *opaque, int event)
> +{
> + IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
> +}
> +
> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
> +
> + EventfdEntry *entry = opaque;
> + PCIDevice *pdev = entry->pdev;
> +
> + IVSHMEM_DPRINTF("fake irqfd on vector %d\n", entry->vector);
> + msix_notify(pdev, entry->vector);
> +}
> +
> +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
> + int vector)
> +{
> + /* create a event character device based on the passed eventfd */
> + IVShmemState *s = opaque;
> + CharDriverState * chr;
> +
> + chr = qemu_chr_open_eventfd(eventfd);
> +
> + if (chr == NULL) {
> + IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
> + exit(-1);
> + }
> +
> + if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> + s->eventfd_table[vector].pdev =&s->dev;
> + s->eventfd_table[vector].vector = vector;
> +
> + qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
> + ivshmem_event,&s->eventfd_table[vector]);
> + } else {
> + qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
> + ivshmem_event, s);
> + }
> +
> + return chr;
> +
> +}
> +
> +static int check_shm_size(IVShmemState *s, int shmemfd) {
> + /* check that the guest isn't going to try and map more memory than the
> + * card server allocated return -1 to indicate error */
> +
> + struct stat buf;
> +
> + fstat(shmemfd,&buf);
> +
> + if (s->ivshmem_size> buf.st_size) {
> + fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
> + fprintf(stderr, " than shared object size (%ld> %ld)\n",
> + s->ivshmem_size, buf.st_size);
> + return -1;
> + } else {
> + return 0;
> + }
> +}
> +
> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
> +
> + s->shm_fd = fd;
> +
> + s->ivshmem_offset = qemu_ram_mmap(s->shm_fd, s->ivshmem_size,
> + MAP_SHARED, 0);
> +
> + /* region for shared memory */
> + pci_register_bar(&s->dev, 2, s->ivshmem_size,
> + PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
> +}
> +
> +static void close_guest_eventfds(IVShmemState *s, int posn)
> +{
> + int i, guest_curr_max;
> +
> + guest_curr_max = s->eventfds_posn_count[posn];
> +
> + for (i = 0; i< guest_curr_max; i++)
> + close(s->eventfds[posn][i]);
> +
> + free(s->eventfds[posn]);
> + s->eventfds_posn_count[posn] = 0;
> +}
> +
> +/* this function increases the dynamic storage needed to store data about other
> + * guests */
> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
> +
> + int j, old_nr_alloc;
> +
> + old_nr_alloc = s->nr_alloc_guests;
> +
> + while (s->nr_alloc_guests< new_min_size)
> + s->nr_alloc_guests = s->nr_alloc_guests * 2;
> +
> + IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nr_alloc_guests);
> + s->eventfds = qemu_realloc(s->eventfds, s->nr_alloc_guests *
> + sizeof(int *));
> + s->eventfds_posn_count = qemu_realloc(s->eventfds_posn_count,
> + s->nr_alloc_guests *
> + sizeof(int));
> + s->eventfd_table = qemu_realloc(s->eventfd_table, s->nr_alloc_guests *
> + sizeof(EventfdEntry));
> +
> + if ((s->eventfds == NULL) || (s->eventfds_posn_count == NULL) ||
> + (s->eventfd_table == NULL)) {
> + fprintf(stderr, "Allocation error - exiting\n");
> + exit(1);
> + }
> +
> + if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> + s->eventfd_chr = (CharDriverState **)qemu_realloc(s->eventfd_chr,
> + s->nr_alloc_guests * sizeof(void *));
> + if (s->eventfd_chr == NULL) {
> + fprintf(stderr, "Allocation error - exiting\n");
> + exit(1);
> + }
> + }
> +
> + /* zero out new pointers */
> + for (j = old_nr_alloc; j< s->nr_alloc_guests; j++) {
> + s->eventfds[j] = NULL;
> + }
> +}
> +
> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> +{
> + IVShmemState *s = opaque;
> + int incoming_fd, tmp_fd;
> + int guest_curr_max;
> + long incoming_posn;
> +
> + memcpy(&incoming_posn, buf, sizeof(long));
> + /* pick off s->chr->msgfd and store it, posn should accompany msg */
> + tmp_fd = qemu_chr_get_msgfd(s->chr);
> + IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
> +
> + /* make sure we have enough space for this guest */
> + if (incoming_posn>= s->nr_alloc_guests) {
> + increase_dynamic_storage(s, incoming_posn);
> + }
> +
> + if (tmp_fd == -1) {
> + /* if posn is positive and unseen before then this is our posn*/
> + if ((incoming_posn>= 0)&& (s->eventfds[incoming_posn] == NULL)) {
> + /* receive our posn */
> + s->vm_id = incoming_posn;
> + return;
> + } else {
> + /* otherwise an fd == -1 means an existing guest has gone away */
> + IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
> + close_guest_eventfds(s, incoming_posn);
> + return;
> + }
> + }
> +
> + /* because of the implementation of get_msgfd, we need a dup */
> + incoming_fd = dup(tmp_fd);
> +
> + /* if the position is -1, then it's shared memory region fd */
> + if (incoming_posn == -1) {
> +
> + s->num_eventfds = 0;
> +
> + if (check_shm_size(s, incoming_fd) == -1) {
> + exit(-1);
> + }
> +
> + /* creating a BAR in qemu_chr callback may be crazy */
> + create_shared_memory_BAR(s, incoming_fd);
> +
> + return;
> + }
> +
> + /* each guest has an array of eventfds, and we keep track of how many
> + * guests for each VM */
> + guest_curr_max = s->eventfds_posn_count[incoming_posn];
> + if (guest_curr_max == 0) {
> + /* one eventfd per MSI vector */
> + s->eventfds[incoming_posn] = (int *) qemu_malloc(s->vectors *
> + sizeof(int));
> + }
> +
> + /* this is an eventfd for a particular guest VM */
> + IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
> + incoming_fd);
> + s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
> +
> + /* increment count for particular guest */
> + s->eventfds_posn_count[incoming_posn]++;
> +
> + /* ioeventfd and irqfd are enabled together,
> + * so the flag IRQFD refers to both */
> + if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&& guest_curr_max>= 0) {
> + /* allocate ioeventfd for the new fd
> + * received for guest @ incoming_posn */
> + kvm_set_ioeventfd_mmio_long(incoming_fd, s->mmio_addr + Doorbell,
> + (incoming_posn<< 16) | guest_curr_max, 1);
> + }
> +
> + /* keep track of the maximum VM ID */
> + if (incoming_posn> s->num_eventfds) {
> + s->num_eventfds = incoming_posn;
> + }
> +
> + if (incoming_posn == s->vm_id) {
> + if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> + /* setup irqfd for this VM's eventfd */
> + int vector = guest_curr_max;
> + kvm_set_irqfd(s->eventfds[s->vm_id][guest_curr_max], vector,
> + s->dev.msix_irq_entries[vector].gsi);
> + } else {
> + /* initialize char device for callback
> + * if this is one of my eventfd */
> + s->eventfd_chr[guest_curr_max] = create_eventfd_chr_device(s,
> + s->eventfds[s->vm_id][guest_curr_max], guest_curr_max);
> + }
> + }
> +
> + return;
> +}
> +
> +static void ivshmem_reset(DeviceState *d)
> +{
> + return;
> +}
> +
> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
> + pcibus_t addr, pcibus_t size, int type)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> + s->mmio_addr = addr;
> + cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
> +
> + /* now that our mmio region has been allocated, we can receive
> + * the file descriptors */
> + if (s->chr != NULL) {
> + qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_read,
> + ivshmem_event, s);
> + }
> +
> +}
> +
> +static uint64_t ivshmem_get_size(IVShmemState * s) {
> +
> + uint64_t value;
> + char *ptr;
> +
> + value = strtoul(s->sizearg,&ptr, 10);
> + switch (*ptr) {
> + case 0: case 'M': case 'm':
> + value<<= 20;
> + break;
> + case 'G': case 'g':
> + value<<= 30;
> + break;
> + default:
> + fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
> + exit(1);
> + }
> +
> + /* BARs must be a power of 2 */
> + if (!is_power_of_two(value)) {
> + fprintf(stderr, "ivshmem: size must be power of 2\n");
> + exit(1);
> + }
> +
> + return value;
> +
> +}
> +
> +static int pci_ivshmem_init(PCIDevice *dev)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> + uint8_t *pci_conf;
> + int i;
> +
> + if (s->sizearg == NULL)
> + s->ivshmem_size = 4<< 20; /* 4 MB default */
> + else {
> + s->ivshmem_size = ivshmem_get_size(s);
> + }
> +
> + /* IRQFD requires MSI */
> + if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&&
> + !ivshmem_has_feature(s, IVSHMEM_MSI)) {
> + fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
> + exit(1);
> + }
> +
> + pci_conf = s->dev.config;
> + pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
> + pci_conf[0x01] = 0x1a;
> + pci_conf[0x02] = 0x10;
> + pci_conf[0x03] = 0x11;
> + pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
> + pci_conf[0x0a] = 0x00; /* RAM controller */
> + pci_conf[0x0b] = 0x05;
> + pci_conf[0x0e] = 0x00; /* header_type */
> +
> + s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
> + ivshmem_mmio_write, s);
> + /* region for registers*/
> + pci_register_bar(&s->dev, 0, 0x400,
> + PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
> +
> + /* allocate the MSI-X vectors */
> + if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +
> + if (!msix_init(&s->dev, s->vectors, 1, 0)) {
> + pci_register_bar(&s->dev, 1,
> + msix_bar_size(&s->dev),
> + PCI_BASE_ADDRESS_SPACE_MEMORY,
> + msix_mmio_map);
> + IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
> + } else {
> + IVSHMEM_DPRINTF("msix initialization failed\n");
> + }
> +
> + /* 'activate' the vectors */
> + for (i = 0; i< s->vectors; i++) {
> + msix_vector_use(&s->dev, i);
> + }
> + }
> +
> + if ((s->chr != NULL)&& (strncmp(s->chr->filename, "unix:", 5) == 0)) {
> + /* if we get a UNIX socket as the parameter we will talk
> + * to the ivshmem server later once the MMIO BAR is actually
> + * allocated (see ivshmem_mmio_map) */
> +
> + IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
> + s->chr->filename);
> +
> + /* we allocate enough space for 16 guests and grow as needed */
> + s->nr_alloc_guests = 16;
> + s->vm_id = -1;
> +
> + /* allocate/initialize space for interrupt handling */
> + s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
> + s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
> + s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests * sizeof(int));
> +
> + pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support interrupts */
> +
> + if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> + s->eventfd_chr = (CharDriverState **)qemu_malloc(s->nr_alloc_guests *
> + sizeof(void *));
> + }
> +
> + } else {
> + /* just map the file immediately, we're not using a server */
> + int fd;
> +
> + if (s->shmobj == NULL) {
> + fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
> + }
> +
> + IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
> +
> + /* try opening with O_EXCL and if it succeeds zero the memory
> + * by truncating to 0 */
> + if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
> + S_IRWXU|S_IRWXG|S_IRWXO))> 0) {
> + /* truncate file to length PCI device's memory */
> + if (ftruncate(fd, s->ivshmem_size) != 0) {
> + fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
> + }
> +
> + } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
> + S_IRWXU|S_IRWXG|S_IRWXO))< 0) {
> + fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
> + exit(-1);
> + }
> +
> + create_shared_memory_BAR(s, fd);
> +
> + }
> +
> +
> + return 0;
> +}
> +
> +static int pci_ivshmem_uninit(PCIDevice *dev)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +
> + cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
> +
> + return 0;
> +}
> +
> +static PCIDeviceInfo ivshmem_info = {
> + .qdev.name = "ivshmem",
> + .qdev.size = sizeof(IVShmemState),
> + .qdev.reset = ivshmem_reset,
> + .init = pci_ivshmem_init,
> + .exit = pci_ivshmem_uninit,
> + .qdev.props = (Property[]) {
> + DEFINE_PROP_CHR("chardev", IVShmemState, chr),
> + DEFINE_PROP_STRING("size", IVShmemState, sizearg),
> + DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
> + DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
> + DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
> + DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
> + DEFINE_PROP_END_OF_LIST(),
> + }
> +};
> +
> +static void ivshmem_register_devices(void)
> +{
> + pci_qdev_register(&ivshmem_info);
> +}
> +
> +device_init(ivshmem_register_devices)
> diff --git a/qemu-char.c b/qemu-char.c
> index 048da3f..41cb8c7 100644
> --- a/qemu-char.c
> +++ b/qemu-char.c
> @@ -2076,6 +2076,12 @@ static void tcp_chr_read(void *opaque)
> }
> }
>
> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
> +
> + return qemu_chr_open_fd(eventfd, eventfd);
> +
> +}
> +
> static void tcp_chr_connect(void *opaque)
> {
> CharDriverState *chr = opaque;
> diff --git a/qemu-char.h b/qemu-char.h
> index 3a9427b..1571091 100644
> --- a/qemu-char.h
> +++ b/qemu-char.h
> @@ -93,6 +93,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
> void qemu_chr_info(Monitor *mon, QObject **ret_data);
> CharDriverState *qemu_chr_find(const char *name);
>
> +/* add an eventfd to the qemu devices that are polled */
> +CharDriverState *qemu_chr_open_eventfd(int eventfd);
> +
> extern int term_escape_char;
>
> /* async I/O support */
> diff --git a/qemu-doc.texi b/qemu-doc.texi
> index 6647b7b..2df4687 100644
> --- a/qemu-doc.texi
> +++ b/qemu-doc.texi
> @@ -706,6 +706,31 @@ Using the @option{-net socket} option, it is possible to make VLANs
> that span several QEMU instances. See @ref{sec_invocation} to have a
> basic example.
>
> +@section Other Devices
> +
> +@subsection Inter-VM Shared Memory device
> +
> +With KVM enabled on a Linux host, a shared memory device is available. The
> +device maps a POSIX shared memory region into the guest as a PCI device,
> +enabling zero-copy communication at the application level between guests.
> +The basic syntax is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> +@end example
> +
> +If desired, interrupts can be sent between guest VMs accessing the same shared
> +memory region. Interrupt support requires a shared memory server and a
> +chardev socket to connect to it. The code for the shared memory server is in
> +qemu.git/contrib/ivshmem-server. Example syntax when using the shared memory
> +server is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> + [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
> +qemu -chardev socket,path=<path>,id=<id>
> +@end example
> +
> @node direct_linux_boot
> @section Direct Linux Boot
>
>
WARNING: multiple messages have this Message-ID (diff)
From: Anthony Liguori <anthony@codemonkey.ws>
To: Cam Macdonell <cam@cs.ualberta.ca>
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: [Qemu-devel] Re: [PATCH v5 4/5] Inter-VM shared memory PCI device
Date: Thu, 06 May 2010 12:32:53 -0500 [thread overview]
Message-ID: <4BE2FD45.8000601@codemonkey.ws> (raw)
In-Reply-To: <1271872408-22842-5-git-send-email-cam@cs.ualberta.ca>
On 04/21/2010 12:53 PM, Cam Macdonell wrote:
> Support an inter-VM shared memory device that maps a shared-memory object as a
> PCI device in the guest. This patch also supports interrupts between guests by
> communicating over a Unix domain socket. This patch applies to the qemu-kvm
> repository.
>
> -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
>
> Interrupts between multiple VMs are supported by connecting to a shared memory
> server through a chardev socket.
>
> -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
> -chardev socket,path=<path>,id=<id>
>
> (shared memory server is qemu.git/contrib/ivshmem-server)
>
> Sample programs and init scripts are in a git repo here:
>
> www.gitorious.org/nahanni
> ---
> Makefile.target | 3 +
> hw/ivshmem.c | 727 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> qemu-char.c | 6 +
> qemu-char.h | 3 +
> qemu-doc.texi | 25 ++
> 5 files changed, 764 insertions(+), 0 deletions(-)
> create mode 100644 hw/ivshmem.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 1ffd802..bc9a681 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -199,6 +199,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
> obj-y += rtl8139.o
> obj-y += e1000.o
>
> +# Inter-VM PCI shared memory
> +obj-y += ivshmem.o
> +
> # Hardware support
> obj-i386-y = pckbd.o dma.o
> obj-i386-y += vga.o
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> new file mode 100644
> index 0000000..f8d8fdb
> --- /dev/null
> +++ b/hw/ivshmem.c
> @@ -0,0 +1,727 @@
> +/*
> + * Inter-VM Shared Memory PCI device.
> + *
> + * Author:
> + * Cam Macdonell<cam@cs.ualberta.ca>
> + *
> + * Based On: cirrus_vga.c and rtl8139.c
> + *
> + * This code is licensed under the GNU GPL v2.
> + */
> +#include<sys/mman.h>
> +#include<sys/types.h>
> +#include<sys/socket.h>
> +#include<sys/io.h>
> +#include<sys/ioctl.h>
> +#include<sys/eventfd.h>
>
These includes will break the build on Windows, as well as on any non-Linux
Unix and on any Linux old enough to lack eventfd support.
If it's based on cirrus_vga.c and rtl8139.c, then it ought to carry their
respective copyright notices, no?
Regards,
Anthony Liguori
> +#include "hw.h"
> +#include "console.h"
> +#include "pc.h"
> +#include "pci.h"
> +#include "sysemu.h"
> +
> +#include "msix.h"
> +#include "qemu-kvm.h"
> +#include "libkvm.h"
> +
> +#include<sys/eventfd.h>
> +#include<sys/mman.h>
> +#include<sys/socket.h>
> +#include<sys/ioctl.h>
> +
> +#define IVSHMEM_IRQFD 0
> +#define IVSHMEM_MSI 1
> +
> +#define DEBUG_IVSHMEM
> +#ifdef DEBUG_IVSHMEM
> +#define IVSHMEM_DPRINTF(fmt, args...) \
> + do {printf("IVSHMEM: " fmt, ##args); } while (0)
> +#else
> +#define IVSHMEM_DPRINTF(fmt, args...)
> +#endif
> +
> +typedef struct EventfdEntry {
> + PCIDevice *pdev;
> + int vector;
> +} EventfdEntry;
> +
> +typedef struct IVShmemState {
> + PCIDevice dev;
> + uint32_t intrmask;
> + uint32_t intrstatus;
> + uint32_t doorbell;
> +
> + CharDriverState * chr;
> + CharDriverState ** eventfd_chr;
> + int ivshmem_mmio_io_addr;
> +
> + pcibus_t mmio_addr;
> + unsigned long ivshmem_offset;
> + uint64_t ivshmem_size; /* size of shared memory region */
> + int shm_fd; /* shared memory file descriptor */
> +
> + int nr_allocated_vms;
> + /* array of eventfds for each guest */
> + int ** eventfds;
> + /* keep track of # of eventfds for each guest*/
> + int * eventfds_posn_count;
> +
> + int nr_alloc_guests;
> + int vm_id;
> + int num_eventfds;
> + uint32_t vectors;
> + uint32_t features;
> + EventfdEntry *eventfd_table;
> +
> + char * shmobj;
> + char * sizearg;
> +} IVShmemState;
> +
> +/* registers for the Inter-VM shared memory device */
> +enum ivshmem_registers {
> + IntrMask = 0,
> + IntrStatus = 4,
> + IVPosition = 8,
> + Doorbell = 12,
> +};
> +
> +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
> + return (ivs->features& (1<< feature));
> +}
> +
> +static inline int is_power_of_two(int x) {
> + return (x& (x-1)) == 0;
> +}
> +
> +static void ivshmem_map(PCIDevice *pci_dev, int region_num,
> + pcibus_t addr, pcibus_t size, int type)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> + IVSHMEM_DPRINTF("addr = %u size = %u\n", (uint32_t)addr, (uint32_t)size);
> + cpu_register_physical_memory(addr, s->ivshmem_size, s->ivshmem_offset);
> +
> +}
> +
> +/* accessing registers - based on rtl8139 */
> +static void ivshmem_update_irq(IVShmemState *s, int val)
> +{
> + int isr;
> + isr = (s->intrstatus& s->intrmask)& 0xffffffff;
> +
> + /* don't print ISR resets */
> + if (isr) {
> + IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
> + isr ? 1 : 0, s->intrstatus, s->intrmask);
> + }
> +
> + qemu_set_irq(s->dev.irq[0], (isr != 0));
> +}
> +
> +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
> +{
> + IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
> +
> + s->intrmask = val;
> +
> + ivshmem_update_irq(s, val);
> +}
> +
> +static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
> +{
> + uint32_t ret = s->intrmask;
> +
> + IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
> +
> + return ret;
> +}
> +
> +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
> +{
> + IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
> +
> + s->intrstatus = val;
> +
> + ivshmem_update_irq(s, val);
> + return;
> +}
> +
> +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
> +{
> + uint32_t ret = s->intrstatus;
> +
> + /* reading ISR clears all interrupts */
> + s->intrstatus = 0;
> +
> + ivshmem_update_irq(s, 0);
> +
> + return ret;
> +}
> +
> +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
> +{
> +
> + IVSHMEM_DPRINTF("We shouldn't be writing words\n");
> +}
> +
> +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
> +{
> + IVShmemState *s = opaque;
> +
> + u_int64_t write_one = 1;
> + u_int16_t dest = val>> 16;
> + u_int16_t vector = val& 0xff;
> +
> + addr&= 0xfe;
> +
> + switch (addr)
> + {
> + case IntrMask:
> + ivshmem_IntrMask_write(s, val);
> + break;
> +
> + case IntrStatus:
> + ivshmem_IntrStatus_write(s, val);
> + break;
> +
> + case Doorbell:
> + /* check doorbell range */
> + if ((vector>= 0)&& (vector< s->eventfds_posn_count[dest])) {
> + IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n", write_one, dest, vector);
> + if (write(s->eventfds[dest][vector],&(write_one), 8) != 8) {
> + IVSHMEM_DPRINTF("error writing to eventfd\n");
> + }
> + }
> + break;
> + default:
> + IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
> + }
> +}
> +
> +static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
> +{
> + IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
> +}
> +
> +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
> +{
> +
> + IVSHMEM_DPRINTF("We shouldn't be reading words\n");
> + return 0;
> +}
> +
> +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
> +{
> +
> + IVShmemState *s = opaque;
> + uint32_t ret;
> +
> + switch (addr)
> + {
> + case IntrMask:
> + ret = ivshmem_IntrMask_read(s);
> + break;
> +
> + case IntrStatus:
> + ret = ivshmem_IntrStatus_read(s);
> + break;
> +
> + case IVPosition:
> + /* return my id in the ivshmem list */
> + ret = s->vm_id;
> + break;
> +
> + default:
> + IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
> + ret = 0;
> + }
> +
> + return ret;
> +
> +}
> +
> +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
> +{
> + IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
> +
> + return 0;
> +}
> +
> +static void ivshmem_mmio_writeb(void *opaque,
> + target_phys_addr_t addr, uint32_t val)
> +{
> + ivshmem_io_writeb(opaque, addr& 0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writew(void *opaque,
> + target_phys_addr_t addr, uint32_t val)
> +{
> + ivshmem_io_writew(opaque, addr& 0xFF, val);
> +}
> +
> +static void ivshmem_mmio_writel(void *opaque,
> + target_phys_addr_t addr, uint32_t val)
> +{
> + ivshmem_io_writel(opaque, addr& 0xFF, val);
> +}
> +
> +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
> +{
> + return ivshmem_io_readb(opaque, addr& 0xFF);
> +}
> +
> +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
> +{
> + uint32_t val = ivshmem_io_readw(opaque, addr& 0xFF);
> + return val;
> +}
> +
> +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
> +{
> + uint32_t val = ivshmem_io_readl(opaque, addr& 0xFF);
> + return val;
> +}
> +
> +static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
> + ivshmem_mmio_readb,
> + ivshmem_mmio_readw,
> + ivshmem_mmio_readl,
> +};
> +
> +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
> + ivshmem_mmio_writeb,
> + ivshmem_mmio_writew,
> + ivshmem_mmio_writel,
> +};
> +
> +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
> +{
> + IVShmemState *s = opaque;
> +
> + ivshmem_IntrStatus_write(s, *buf);
> +
> + IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
> +}
> +
> +static int ivshmem_can_receive(void * opaque)
> +{
> + return 8;
> +}
> +
> +static void ivshmem_event(void *opaque, int event)
> +{
> + IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
> +}
> +
> +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
> +
> + EventfdEntry *entry = opaque;
> + PCIDevice *pdev = entry->pdev;
> +
> + IVSHMEM_DPRINTF("fake irqfd on vector %d\n", entry->vector);
> + msix_notify(pdev, entry->vector);
> +}
> +
> +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
> + int vector)
> +{
> + /* create an event character device based on the passed eventfd */
> + IVShmemState *s = opaque;
> + CharDriverState * chr;
> +
> + chr = qemu_chr_open_eventfd(eventfd);
> +
> + if (chr == NULL) {
> + IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
> + exit(-1);
> + }
> +
> + if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> + s->eventfd_table[vector].pdev =&s->dev;
> + s->eventfd_table[vector].vector = vector;
> +
> + qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
> + ivshmem_event,&s->eventfd_table[vector]);
> + } else {
> + qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
> + ivshmem_event, s);
> + }
> +
> + return chr;
> +
> +}
> +
> +static int check_shm_size(IVShmemState *s, int shmemfd) {
> + /* check that the guest isn't going to try to map more memory than the
> + * card server allocated; return -1 to indicate an error */
> +
> + struct stat buf;
> +
> + fstat(shmemfd,&buf);
> +
> + if (s->ivshmem_size> buf.st_size) {
> + fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
> + fprintf(stderr, " than shared object size (%ld> %ld)\n",
> + s->ivshmem_size, buf.st_size);
> + return -1;
> + } else {
> + return 0;
> + }
> +}
> +
> +static void create_shared_memory_BAR(IVShmemState *s, int fd) {
> +
> + s->shm_fd = fd;
> +
> + s->ivshmem_offset = qemu_ram_mmap(s->shm_fd, s->ivshmem_size,
> + MAP_SHARED, 0);
> +
> + /* region for shared memory */
> + pci_register_bar(&s->dev, 2, s->ivshmem_size,
> + PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
> +}
> +
> +static void close_guest_eventfds(IVShmemState *s, int posn)
> +{
> + int i, guest_curr_max;
> +
> + guest_curr_max = s->eventfds_posn_count[posn];
> +
> + for (i = 0; i< guest_curr_max; i++)
> + close(s->eventfds[posn][i]);
> +
> + free(s->eventfds[posn]);
> + s->eventfds_posn_count[posn] = 0;
> +}
> +
> +/* this function increases the dynamic storage needed to store data about
> + * other guests */
> +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
> +
> + int j, old_nr_alloc;
> +
> + old_nr_alloc = s->nr_alloc_guests;
> +
> + while (s->nr_alloc_guests< new_min_size)
> + s->nr_alloc_guests = s->nr_alloc_guests * 2;
> +
> + IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nr_alloc_guests);
> + s->eventfds = qemu_realloc(s->eventfds, s->nr_alloc_guests *
> + sizeof(int *));
> + s->eventfds_posn_count = qemu_realloc(s->eventfds_posn_count,
> + s->nr_alloc_guests *
> + sizeof(int));
> + s->eventfd_table = qemu_realloc(s->eventfd_table, s->nr_alloc_guests *
> + sizeof(EventfdEntry));
> +
> + if ((s->eventfds == NULL) || (s->eventfds_posn_count == NULL) ||
> + (s->eventfd_table == NULL)) {
> + fprintf(stderr, "Allocation error - exiting\n");
> + exit(1);
> + }
> +
> + if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> + s->eventfd_chr = (CharDriverState **)qemu_realloc(s->eventfd_chr,
> + s->nr_alloc_guests * sizeof(void *));
> + if (s->eventfd_chr == NULL) {
> + fprintf(stderr, "Allocation error - exiting\n");
> + exit(1);
> + }
> + }
> +
> + /* zero out new pointers */
> + for (j = old_nr_alloc; j< s->nr_alloc_guests; j++) {
> + s->eventfds[j] = NULL;
> + }
> +}
> +
> +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> +{
> + IVShmemState *s = opaque;
> + int incoming_fd, tmp_fd;
> + int guest_curr_max;
> + long incoming_posn;
> +
> + memcpy(&incoming_posn, buf, sizeof(long));
> + /* pick off s->chr->msgfd and store it, posn should accompany msg */
> + tmp_fd = qemu_chr_get_msgfd(s->chr);
> + IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
> +
> + /* make sure we have enough space for this guest */
> + if (incoming_posn>= s->nr_alloc_guests) {
> + increase_dynamic_storage(s, incoming_posn);
> + }
> +
> + if (tmp_fd == -1) {
> + /* if posn is non-negative and unseen before, then this is our posn */
> + if ((incoming_posn>= 0)&& (s->eventfds[incoming_posn] == NULL)) {
> + /* receive our posn */
> + s->vm_id = incoming_posn;
> + return;
> + } else {
> + /* otherwise an fd == -1 means an existing guest has gone away */
> + IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
> + close_guest_eventfds(s, incoming_posn);
> + return;
> + }
> + }
> +
> + /* because of the implementation of get_msgfd, we need a dup */
> + incoming_fd = dup(tmp_fd);
> +
> + /* if the position is -1, then it's shared memory region fd */
> + if (incoming_posn == -1) {
> +
> + s->num_eventfds = 0;
> +
> + if (check_shm_size(s, incoming_fd) == -1) {
> + exit(-1);
> + }
> +
> + /* creating a BAR in qemu_chr callback may be crazy */
> + create_shared_memory_BAR(s, incoming_fd);
> +
> + return;
> + }
> +
> + /* each guest has an array of eventfds, and we keep track of how many
> + * eventfds have been received for each guest */
> + guest_curr_max = s->eventfds_posn_count[incoming_posn];
> + if (guest_curr_max == 0) {
> + /* one eventfd per MSI vector */
> + s->eventfds[incoming_posn] = (int *) qemu_malloc(s->vectors *
> + sizeof(int));
> + }
> +
> + /* this is an eventfd for a particular guest VM */
> + IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
> + incoming_fd);
> + s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
> +
> + /* increment count for particular guest */
> + s->eventfds_posn_count[incoming_posn]++;
> +
> + /* ioeventfd and irqfd are enabled together,
> + * so the flag IRQFD refers to both */
> + if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&& guest_curr_max>= 0) {
> + /* allocate ioeventfd for the new fd
> + * received for guest @ incoming_posn */
> + kvm_set_ioeventfd_mmio_long(incoming_fd, s->mmio_addr + Doorbell,
> + (incoming_posn<< 16) | guest_curr_max, 1);
> + }
> +
> + /* keep track of the maximum VM ID */
> + if (incoming_posn> s->num_eventfds) {
> + s->num_eventfds = incoming_posn;
> + }
> +
> + if (incoming_posn == s->vm_id) {
> + if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> + /* setup irqfd for this VM's eventfd */
> + int vector = guest_curr_max;
> + kvm_set_irqfd(s->eventfds[s->vm_id][guest_curr_max], vector,
> + s->dev.msix_irq_entries[vector].gsi);
> + } else {
> + /* initialize char device for callback
> + * if this is one of my eventfds */
> + s->eventfd_chr[guest_curr_max] = create_eventfd_chr_device(s,
> + s->eventfds[s->vm_id][guest_curr_max], guest_curr_max);
> + }
> + }
> +
> + return;
> +}
> +
> +static void ivshmem_reset(DeviceState *d)
> +{
> + return;
> +}
> +
> +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
> + pcibus_t addr, pcibus_t size, int type)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
> +
> + s->mmio_addr = addr;
> + cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
> +
> + /* now that our mmio region has been allocated, we can receive
> + * the file descriptors */
> + if (s->chr != NULL) {
> + qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_read,
> + ivshmem_event, s);
> + }
> +
> +}
> +
> +static uint64_t ivshmem_get_size(IVShmemState * s) {
> +
> + uint64_t value;
> + char *ptr;
> +
> + value = strtoul(s->sizearg,&ptr, 10);
> + switch (*ptr) {
> + case 0: case 'M': case 'm':
> + value<<= 20;
> + break;
> + case 'G': case 'g':
> + value<<= 30;
> + break;
> + default:
> + fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
> + exit(1);
> + }
> +
> + /* BARs must be a power of 2 */
> + if (!is_power_of_two(value)) {
> + fprintf(stderr, "ivshmem: size must be power of 2\n");
> + exit(1);
> + }
> +
> + return value;
> +
> +}
> +
> +static int pci_ivshmem_init(PCIDevice *dev)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> + uint8_t *pci_conf;
> + int i;
> +
> + if (s->sizearg == NULL)
> + s->ivshmem_size = 4<< 20; /* 4 MB default */
> + else {
> + s->ivshmem_size = ivshmem_get_size(s);
> + }
> +
> + /* IRQFD requires MSI */
> + if (ivshmem_has_feature(s, IVSHMEM_IRQFD)&&
> + !ivshmem_has_feature(s, IVSHMEM_MSI)) {
> + fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
> + exit(1);
> + }
> +
> + pci_conf = s->dev.config;
> + pci_conf[0x00] = 0xf4; /* Qumranet vendor ID 0x5002 */
> + pci_conf[0x01] = 0x1a;
> + pci_conf[0x02] = 0x10;
> + pci_conf[0x03] = 0x11;
> + pci_conf[0x04] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
> + pci_conf[0x0a] = 0x00; /* RAM controller */
> + pci_conf[0x0b] = 0x05;
> + pci_conf[0x0e] = 0x00; /* header_type */
> +
> + s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
> + ivshmem_mmio_write, s);
> + /* region for registers*/
> + pci_register_bar(&s->dev, 0, 0x400,
> + PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
> +
> + /* allocate the MSI-X vectors */
> + if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
> +
> + if (!msix_init(&s->dev, s->vectors, 1, 0)) {
> + pci_register_bar(&s->dev, 1,
> + msix_bar_size(&s->dev),
> + PCI_BASE_ADDRESS_SPACE_MEMORY,
> + msix_mmio_map);
> + IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
> + } else {
> + IVSHMEM_DPRINTF("msix initialization failed\n");
> + }
> +
> + /* 'activate' the vectors */
> + for (i = 0; i< s->vectors; i++) {
> + msix_vector_use(&s->dev, i);
> + }
> + }
> +
> + if ((s->chr != NULL)&& (strncmp(s->chr->filename, "unix:", 5) == 0)) {
> + /* if we get a UNIX socket as the parameter we will talk
> + * to the ivshmem server later once the MMIO BAR is actually
> + * allocated (see ivshmem_mmio_map) */
> +
> + IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
> + s->chr->filename);
> +
> + /* we allocate enough space for 16 guests and grow as needed */
> + s->nr_alloc_guests = 16;
> + s->vm_id = -1;
> +
> + /* allocate/initialize space for interrupt handling */
> + s->eventfds = qemu_mallocz(s->nr_alloc_guests * sizeof(int *));
> + s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
> + s->eventfds_posn_count = qemu_mallocz(s->nr_alloc_guests * sizeof(int));
> +
> + pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support interrupts */
> +
> + if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
> + s->eventfd_chr = (CharDriverState **)qemu_malloc(s->nr_alloc_guests *
> + sizeof(void *));
> + }
> +
> + } else {
> + /* just map the file immediately, we're not using a server */
> + int fd;
> +
> + if (s->shmobj == NULL) {
> + fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
> + }
> +
> + IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
> +
> + /* try opening with O_EXCL and if it succeeds zero the memory
> + * by truncating to 0 */
> + if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
> + S_IRWXU|S_IRWXG|S_IRWXO))> 0) {
> + /* truncate file to the length of the PCI device's memory */
> + if (ftruncate(fd, s->ivshmem_size) != 0) {
> + fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
> + }
> +
> + } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
> + S_IRWXU|S_IRWXG|S_IRWXO))< 0) {
> + fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
> + exit(-1);
> + }
> +
> + create_shared_memory_BAR(s, fd);
> +
> + }
> +
> +
> + return 0;
> +}
> +
> +static int pci_ivshmem_uninit(PCIDevice *dev)
> +{
> + IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +
> + cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
> +
> + return 0;
> +}
> +
> +static PCIDeviceInfo ivshmem_info = {
> + .qdev.name = "ivshmem",
> + .qdev.size = sizeof(IVShmemState),
> + .qdev.reset = ivshmem_reset,
> + .init = pci_ivshmem_init,
> + .exit = pci_ivshmem_uninit,
> + .qdev.props = (Property[]) {
> + DEFINE_PROP_CHR("chardev", IVShmemState, chr),
> + DEFINE_PROP_STRING("size", IVShmemState, sizearg),
> + DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
> + DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
> + DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
> + DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
> + DEFINE_PROP_END_OF_LIST(),
> + }
> +};
> +
> +static void ivshmem_register_devices(void)
> +{
> + pci_qdev_register(&ivshmem_info);
> +}
> +
> +device_init(ivshmem_register_devices)
> diff --git a/qemu-char.c b/qemu-char.c
> index 048da3f..41cb8c7 100644
> --- a/qemu-char.c
> +++ b/qemu-char.c
> @@ -2076,6 +2076,12 @@ static void tcp_chr_read(void *opaque)
> }
> }
>
> +CharDriverState *qemu_chr_open_eventfd(int eventfd){
> +
> + return qemu_chr_open_fd(eventfd, eventfd);
> +
> +}
> +
> static void tcp_chr_connect(void *opaque)
> {
> CharDriverState *chr = opaque;
> diff --git a/qemu-char.h b/qemu-char.h
> index 3a9427b..1571091 100644
> --- a/qemu-char.h
> +++ b/qemu-char.h
> @@ -93,6 +93,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
> void qemu_chr_info(Monitor *mon, QObject **ret_data);
> CharDriverState *qemu_chr_find(const char *name);
>
> +/* add an eventfd to the qemu devices that are polled */
> +CharDriverState *qemu_chr_open_eventfd(int eventfd);
> +
> extern int term_escape_char;
>
> /* async I/O support */
> diff --git a/qemu-doc.texi b/qemu-doc.texi
> index 6647b7b..2df4687 100644
> --- a/qemu-doc.texi
> +++ b/qemu-doc.texi
> @@ -706,6 +706,31 @@ Using the @option{-net socket} option, it is possible to make VLANs
> that span several QEMU instances. See @ref{sec_invocation} to have a
> basic example.
>
> +@section Other Devices
> +
> +@subsection Inter-VM Shared Memory device
> +
> +With KVM enabled on a Linux host, a shared memory device is available. The
> +device maps a POSIX shared memory region into the guest as a PCI device,
> +enabling zero-copy communication between applications running in the guests.
> +The basic syntax is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> +@end example
> +
> +If desired, interrupts can be sent between guest VMs accessing the same shared
> +memory region. Interrupt support requires a shared memory server and a
> +chardev socket to connect to it. The code for the shared memory server is in
> +qemu.git/contrib/ivshmem-server. Example syntax when using the shared
> +memory server is:
> +
> +@example
> +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
> + [,chardev=<id>][,msi=on][,irqfd=on][,vectors=n]
> +qemu -chardev socket,path=<path>,id=<id>
> +@end example
> +
> @node direct_linux_boot
> @section Direct Linux Boot
>
>
next prev parent reply other threads:[~2010-05-06 17:32 UTC|newest]
Thread overview: 102+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-04-21 17:53 [PATCH v5 0/5] PCI Shared Memory device Cam Macdonell
2010-04-21 17:53 ` [Qemu-devel] " Cam Macdonell
2010-04-21 17:53 ` [PATCH v5 1/5] Device specification for shared memory PCI device Cam Macdonell
2010-04-21 17:53 ` [Qemu-devel] " Cam Macdonell
2010-04-21 17:53 ` [PATCH v5 2/5] Support adding a file to qemu's ram allocation Cam Macdonell
2010-04-21 17:53 ` [Qemu-devel] " Cam Macdonell
2010-04-21 17:53 ` [PATCH v5 3/5] Add functions for assigning ioeventfd and irqfds Cam Macdonell
2010-04-21 17:53 ` [Qemu-devel] " Cam Macdonell
2010-04-21 17:53 ` [PATCH v5 4/5] Inter-VM shared memory PCI device Cam Macdonell
2010-04-21 17:53 ` [Qemu-devel] " Cam Macdonell
2010-04-21 18:00 ` [PATCH v5 5/5] shared memory server for inter-VM shared memory Cam Macdonell
2010-04-21 18:00 ` [Qemu-devel] " Cam Macdonell
2010-05-05 16:57 ` [PATCH v5 4/5] RESEND: Inter-VM shared memory PCI device Cam Macdonell
2010-05-05 16:57 ` [Qemu-devel] " Cam Macdonell
2010-05-06 17:32 ` Anthony Liguori [this message]
2010-05-06 17:32 ` [Qemu-devel] Re: [PATCH v5 4/5] " Anthony Liguori
2010-05-06 17:59 ` Cam Macdonell
2010-05-06 17:59 ` [Qemu-devel] " Cam Macdonell
2010-05-10 11:59 ` Avi Kivity
2010-05-10 11:59 ` [Qemu-devel] " Avi Kivity
2010-05-10 15:22 ` Cam Macdonell
2010-05-10 15:22 ` [Qemu-devel] " Cam Macdonell
2010-05-10 15:28 ` Avi Kivity
2010-05-10 15:28 ` [Qemu-devel] " Avi Kivity
2010-05-10 15:38 ` Anthony Liguori
2010-05-10 15:38 ` [Qemu-devel] " Anthony Liguori
2010-05-10 16:20 ` Cam Macdonell
2010-05-10 16:20 ` [Qemu-devel] " Cam Macdonell
2010-05-10 16:52 ` Anthony Liguori
2010-05-10 16:52 ` [Qemu-devel] " Anthony Liguori
2010-05-18 16:58 ` Cam Macdonell
2010-05-18 16:58 ` [Qemu-devel] " Cam Macdonell
2010-05-18 17:27 ` Avi Kivity
2010-05-18 17:27 ` [Qemu-devel] " Avi Kivity
2010-05-10 16:59 ` Avi Kivity
2010-05-10 16:59 ` [Qemu-devel] " Avi Kivity
2010-05-10 17:25 ` Anthony Liguori
2010-05-10 17:25 ` [Qemu-devel] " Anthony Liguori
2010-05-10 17:43 ` Cam Macdonell
2010-05-10 17:43 ` [Qemu-devel] " Cam Macdonell
2010-05-10 17:52 ` Anthony Liguori
2010-05-10 17:52 ` [Qemu-devel] " Anthony Liguori
2010-05-10 18:01 ` Cam Macdonell
2010-05-10 18:01 ` [Qemu-devel] " Cam Macdonell
2010-05-11 7:59 ` Avi Kivity
2010-05-11 7:59 ` [Qemu-devel] " Avi Kivity
2010-05-11 13:10 ` Anthony Liguori
2010-05-11 13:10 ` [Qemu-devel] " Anthony Liguori
2010-05-11 14:03 ` Avi Kivity
2010-05-11 14:03 ` [Qemu-devel] " Avi Kivity
2010-05-11 14:17 ` Cam Macdonell
2010-05-11 14:17 ` [Qemu-devel] " Cam Macdonell
2010-05-11 14:53 ` Avi Kivity
2010-05-11 14:53 ` [Qemu-devel] " Avi Kivity
2010-05-11 15:51 ` Anthony Liguori
2010-05-11 15:51 ` [Qemu-devel] " Anthony Liguori
2010-05-11 16:39 ` Cam Macdonell
2010-05-11 16:39 ` [Qemu-devel] " Cam Macdonell
2010-05-11 17:05 ` Anthony Liguori
2010-05-11 17:05 ` [Qemu-devel] " Anthony Liguori
2010-05-11 17:50 ` Cam Macdonell
2010-05-11 17:50 ` [Qemu-devel] " Cam Macdonell
2010-05-11 18:13 ` Avi Kivity
2010-05-11 18:13 ` [Qemu-devel] " Avi Kivity
2010-05-12 15:32 ` Cam Macdonell
2010-05-12 15:32 ` [Qemu-devel] " Cam Macdonell
2010-05-12 15:48 ` Avi Kivity
2010-05-12 15:48 ` [Qemu-devel] " Avi Kivity
2010-05-11 18:09 ` Avi Kivity
2010-05-11 18:09 ` [Qemu-devel] " Avi Kivity
2010-05-11 7:55 ` Avi Kivity
2010-05-11 7:55 ` [Qemu-devel] " Avi Kivity
2010-05-10 15:41 ` Cam Macdonell
2010-05-10 15:41 ` [Qemu-devel] " Cam Macdonell
2010-05-10 16:40 ` Avi Kivity
2010-05-10 16:40 ` [Qemu-devel] " Avi Kivity
2010-05-10 16:48 ` Cam Macdonell
2010-05-10 16:48 ` [Qemu-devel] " Cam Macdonell
2010-05-12 15:49 ` Avi Kivity
2010-05-12 15:49 ` [Qemu-devel] " Avi Kivity
2010-05-12 16:14 ` Cam Macdonell
2010-05-12 16:14 ` [Qemu-devel] " Cam Macdonell
2010-05-12 16:45 ` Avi Kivity
2010-05-12 16:45 ` [Qemu-devel] " Avi Kivity
2010-05-10 23:17 ` Cam Macdonell
2010-05-10 23:17 ` [Qemu-devel] " Cam Macdonell
2010-05-11 8:03 ` Avi Kivity
2010-05-11 8:03 ` [Qemu-devel] " Avi Kivity
2010-05-13 21:10 ` Cam Macdonell
2010-05-13 21:10 ` [Qemu-devel] " Cam Macdonell
2010-05-15 6:05 ` Avi Kivity
2010-05-15 6:05 ` [Qemu-devel] " Avi Kivity
2010-05-10 10:43 ` [PATCH v5 3/5] Add functions for assigning ioeventfd and irqfds Avi Kivity
2010-05-10 10:43 ` [Qemu-devel] " Avi Kivity
2010-05-10 15:13 ` Cam Macdonell
2010-05-10 15:13 ` [Qemu-devel] " Cam Macdonell
2010-05-10 15:17 ` Avi Kivity
2010-05-10 15:17 ` [Qemu-devel] " Avi Kivity
2010-05-10 10:39 ` [PATCH v5 2/5] Support adding a file to qemu's ram allocation Avi Kivity
2010-05-10 10:39 ` [Qemu-devel] " Avi Kivity
2010-05-10 15:32 ` Cam Macdonell
2010-05-10 15:32 ` [Qemu-devel] " Cam Macdonell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4BE2FD45.8000601@codemonkey.ws \
--to=anthony@codemonkey.ws \
--cc=cam@cs.ualberta.ca \
--cc=kvm@vger.kernel.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.