public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
From: Ben-Ami Yassour <benami@il.ibm.com>
To: avi@qumranet.com
Cc: amit.shah@qumranet.com, kvm@vger.kernel.org, muli@il.ibm.com,
	benami@il.ibm.com, weidong.han@intel.com, anthony@codemonkey.ws,
	Nir Peleg <nir@tutis.com>,
	Glauber de Oliveira Costa <gcosta@redhat.com>
Subject: [PATCH 1/1] KVM/userspace: Support for assigning PCI devices to guest
Date: Tue, 22 Jul 2008 15:18:25 +0300	[thread overview]
Message-ID: <1216729105-19863-2-git-send-email-benami@il.ibm.com> (raw)
In-Reply-To: <1216729105-19863-1-git-send-email-benami@il.ibm.com>

Based on a patch from: Amit Shah <amit.shah@qumranet.com>

We can assign a device from the host machine to a guest. The
original code comes from Neocleus.

A new command-line option, -pcidevice is added.
For example, to invoke it for an Ethernet device sitting at
PCI bus:dev.fn 04:08.0 with host IRQ 18, use this:

        -pcidevice Ethernet/04:08.0

The host Ethernet driver must be removed before assigning
the device to a guest. If it is not, the device assignment fails but the
guest continues without the assignment.

If kvm uses the in-kernel irqchip, interrupts are routed to the
guest via the kvm module (accompanying kernel changes are
necessary).

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Nir Peleg <nir@tutis.com>
Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
---
 libkvm/libkvm-x86.c         |    8 +
 libkvm/libkvm.h             |   15 ++
 qemu/Makefile.target        |    1 +
 qemu/hw/device-assignment.c |  575 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |   95 +++++++
 qemu/hw/isa.h               |    2 +
 qemu/hw/pc.c                |    9 +
 qemu/hw/pci.c               |   12 +
 qemu/hw/pci.h               |    1 +
 qemu/hw/piix_pci.c          |   19 ++
 qemu/vl.c                   |   17 ++
 11 files changed, 754 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/libkvm/libkvm-x86.c b/libkvm/libkvm-x86.c
index ea97bdd..ea4f0ef 100644
--- a/libkvm/libkvm-x86.c
+++ b/libkvm/libkvm-x86.c
@@ -126,6 +126,14 @@ static int kvm_init_tss(kvm_context_t kvm)
 	return 0;
 }
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+int kvm_update_assigned_device(kvm_context_t kvm,
+			       struct kvm_assigned_dev *assigned_dev)
+{
+	return ioctl(kvm->vm_fd, KVM_UPDATE_ASSIGNED_DEVICE, assigned_dev);
+}
+#endif
+
 int kvm_arch_create_default_phys_mem(kvm_context_t kvm,
 				       unsigned long phys_mem_bytes,
 				       void **vm_mem)
diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h
index 9f06fcc..276f6f0 100644
--- a/libkvm/libkvm.h
+++ b/libkvm/libkvm.h
@@ -647,6 +647,21 @@ int kvm_disable_tpr_access_reporting(kvm_context_t kvm, int vcpu);
 
 int kvm_enable_vapic(kvm_context_t kvm, int vcpu, uint64_t vapic);
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/*!
+ * \brief Notifies host kernel about changes to a PCI device assigned to guest
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the assigning of the physical PCI device and the guest
+ * PCI parameters or updates to the PCI config space from the guest
+ * (mainly the device irq)
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param assigned_dev Parameters like irq, PCI bus, devfn number, etc
+ */
+int kvm_update_assigned_device(kvm_context_t kvm,
+			       struct kvm_assigned_dev *assigned_dev);
+#endif
 #endif
 
 #if defined(__s390__)
diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 54480e4..94a6393 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -602,6 +602,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+OBJS+= device-assignment.o
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 0000000..ea98b18
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Pass a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ */
+#include <stdio.h>
+#include <pthread.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO		0x00000100	/* Resource type */
+#define IORESOURCE_MEM		0x00000200
+#define IORESOURCE_IRQ		0x00000400
+#define IORESOURCE_DMA		0x00000800
+#define IORESOURCE_PREFETCH	0x00001000	/* No side effects */
+
+#include "device-assignment.h"
+#include "irq.h"
+
+#include "qemu-kvm.h"
+#include <linux/kvm_para.h>
+extern FILE *logfile;
+
+/* #define DEVICE_ASSIGNMENT_DEBUG */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args)
+#else
+#define DEBUG(fmt, args...)
+#endif
+
+#define assigned_dev_ioport_write(suffix)				\
+static void assigned_dev_ioport_write##suffix(void *opaque,             \
+					      uint32_t addr,            \
+                                              uint32_t value)		\
+{									\
+	assigned_dev_region_t *r_access =                               \
+                  (assigned_dev_region_t *)opaque;			\
+	uint32_t r_pio = (unsigned long)r_access->r_virtbase		\
+		+ (addr - r_access->e_physbase);			\
+	if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {		\
+		fprintf(logfile, "assigned_dev_ioport_write" #suffix	\
+			": r_pio=%08x e_physbase=%08x"			\
+			" r_virtbase=%08lx value=%08x\n",		\
+			r_pio, (int)r_access->e_physbase,		\
+			(unsigned long)r_access->r_virtbase, value);	\
+	}								\
+	out##suffix(value, r_pio);					\
+}
+assigned_dev_ioport_write(b)
+assigned_dev_ioport_write(w)
+assigned_dev_ioport_write(l)
+
+#define assigned_dev_ioport_read(suffix)				\
+static uint32_t assigned_dev_ioport_read##suffix(void *opaque,          \
+                                                 uint32_t addr)  	\
+{									\
+	assigned_dev_region_t *r_access =                               \
+                    (assigned_dev_region_t *)opaque;     		\
+	uint32_t r_pio = (addr - r_access->e_physbase)			\
+		+ (unsigned long)r_access->r_virtbase;			\
+		uint32_t value = in##suffix(r_pio);			\
+		if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {	\
+			fprintf(logfile, "assigned_dev_ioport_read"     \
+                                #suffix	": r_pio=%08x "                 \
+                                "e_physbase=%08x r_virtbase=%08lx "	\
+				"value=%08x\n",				\
+				r_pio, (int)r_access->e_physbase,	\
+				(unsigned long)r_access->r_virtbase,    \
+                                value); 				\
+		}							\
+		return value;						\
+}
+
+assigned_dev_ioport_read(b)
+assigned_dev_ioport_read(w)
+assigned_dev_ioport_read(l)
+
+/* BAR map callback: replace the region's old guest mapping with a new one. */
+void assigned_dev_iomem_map(PCIDevice * pci_dev, int region_num,
+			    uint32_t e_phys, uint32_t e_size, int type)
+{
+	assigned_dev_t *r_dev = (assigned_dev_t *) pci_dev;
+	assigned_dev_region_t *region = &r_dev->v_addrs[region_num];
+	target_phys_addr_t old_physbase = region->e_physbase;
+	uint32_t old_size = region->e_size;
+	int ret = 0;
+
+	DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+	      e_phys, region->r_virtbase, type, e_size, region_num);
+
+	region->e_physbase = e_phys;
+	region->e_size = e_size;
+
+	/* e_size == 0 means never mapped; otherwise unmap the OLD range */
+	if (old_size != 0)
+		kvm_destroy_phys_mem(kvm_context, old_physbase, old_size);
+
+	if (e_size > 0)
+		ret = kvm_register_userspace_phys_mem(kvm_context, e_phys,
+						      region->r_virtbase,
+						      e_size, 0);
+	if (ret != 0)
+		fprintf(logfile, "Error: create new mapping failed\n");
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+				    uint32_t addr, uint32_t size, int type)
+{
+	assigned_dev_t *r_dev = (assigned_dev_t *) pci_dev;
+	int i;
+	uint32_t ((*rf[])(void *, uint32_t)) =  { assigned_dev_ioport_readb,
+						  assigned_dev_ioport_readw,
+						  assigned_dev_ioport_readl
+	};
+	void ((*wf[])(void *, uint32_t, uint32_t)) =
+		{ assigned_dev_ioport_writeb,
+		  assigned_dev_ioport_writew,
+		  assigned_dev_ioport_writel
+		};
+
+	r_dev->v_addrs[region_num].e_physbase = addr;
+	DEBUG("assigned_dev_ioport_map: address=0x%x type=0x%x len=%d"
+	      "region_num=%d \n", addr, type, size, region_num);
+
+	for (i = 0; i < 3; i++) {
+		register_ioport_write(addr, size, 1<<i, wf[i],
+				      (void *) (r_dev->v_addrs + region_num));
+		register_ioport_read(addr, size, 1<<i, rf[i],
+				     (void *) (r_dev->v_addrs + region_num));
+	}
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+					  uint32_t val, int len)
+{
+	int fd, r;
+
+	DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+	      ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+	      val, len);
+
+	if (address == 0x4)
+		pci_default_write_config(d, address, val, len);
+
+	if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+	    address == 0x3c || address == 0x3d) {
+		/* used for update-mappings (BAR emulation) */
+		pci_default_write_config(d, address, val, len);
+		return;
+	}
+
+	DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+	      ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+	      val, len);
+	fd = ((assigned_dev_t *)d)->real_device.config_fd;
+	lseek(fd, address, SEEK_SET);
+again:
+	r = write(fd, &val, len);
+	if (r < 0) {
+		if (errno == EINTR || errno == EAGAIN)
+			goto again;
+		fprintf(stderr, "%s: write failed, errno = %d\n", __func__,
+			errno);
+	}
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+					     int len)
+{
+	uint32_t val = 0;
+	int fd, r;
+
+	if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+	    address == 0x3c || address == 0x3d) {
+		val = pci_default_read_config(d, address, len);
+		DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+		      (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val,
+		      len);
+		return val;
+	}
+
+	/* vga specific, remove later */
+	if (address == 0xFC)
+		goto do_log;
+
+	fd = ((assigned_dev_t *)d)->real_device.config_fd;
+	lseek(fd, address, SEEK_SET);
+again:
+	r = read(fd, &val, len);
+	if (r < 0) {
+		if (errno == EINTR || errno == EAGAIN)
+			goto again;
+		fprintf(stderr, "%s: read failed, errno = %d\n", __func__,
+			errno);
+	}
+
+do_log:
+	DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+	      (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+	/* kill the special capabilities */
+	if (address == 4 && len == 4)
+		val &= ~0x100000;
+	else if (address == 6)
+		val &= ~0x10;
+
+	return val;
+}
+
+static int assigned_dev_register_regions(pci_region_t *io_regions,
+					 unsigned long regions_num,
+					 assigned_dev_t *pci_dev)
+{
+	uint32_t i;
+	pci_region_t *cur_region = io_regions;
+
+	for (i = 0; i < regions_num; i++, cur_region++) {
+		if (!cur_region->valid)
+			continue;
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+		pci_dev->v_addrs[i].debug |= DEVICE_ASSIGNMENT_DEBUG_MMIO |
+			DEVICE_ASSIGNMENT_DEBUG_PIO;
+#endif
+		pci_dev->v_addrs[i].num = i;
+
+		/* handle memory io regions */
+		if (cur_region->type & IORESOURCE_MEM) {
+			int t = cur_region->type & IORESOURCE_PREFETCH
+				? PCI_ADDRESS_SPACE_MEM_PREFETCH
+				: PCI_ADDRESS_SPACE_MEM;
+
+			/* map physical memory */
+			pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+			pci_dev->v_addrs[i].r_virtbase =
+				mmap(NULL,
+				     (cur_region->size + 0xFFF) & 0xFFFFF000,
+				     PROT_WRITE | PROT_READ, MAP_SHARED,
+				     cur_region->resource_fd, (off_t) 0);
+
+			if ((void *) -1 == pci_dev->v_addrs[i].r_virtbase) {
+				fprintf(stderr, "Error: Couldn't mmap 0x%x!\n",
+					(uint32_t) (cur_region->base_addr));
+				return -1;
+			}
+			pci_dev->v_addrs[i].r_size = cur_region->size;
+			pci_dev->v_addrs[i].e_size = 0;
+
+			/* add offset */
+			pci_dev->v_addrs[i].r_virtbase +=
+				(cur_region->base_addr & 0xFFF);
+
+			pci_register_io_region((PCIDevice *) pci_dev, i,
+					       cur_region->size, t,
+					       assigned_dev_iomem_map);
+
+			continue;
+		}
+		/* handle port io regions */
+
+		pci_register_io_region((PCIDevice *) pci_dev, i,
+				       cur_region->size, PCI_ADDRESS_SPACE_IO,
+				       assigned_dev_ioport_map);
+
+		pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+		pci_dev->v_addrs[i].r_virtbase =
+			(void *)(long)cur_region->base_addr;
+		/* not relevant for port io */
+		pci_dev->v_addrs[i].memory_index = 0;
+	}
+
+	/* success */
+	return 0;
+
+}
+
+/* Open the host device's sysfs "config" and "resource" files and record its
+ * BARs in pci_dev->real_device.  Returns 0 on success, 1 on failure. */
+static int get_real_device(assigned_dev_t *pci_dev, uint8_t r_bus,
+			   uint8_t r_dev, uint8_t r_func)
+{
+	char dir[128], name[128];
+	int fd, r = 0;
+	FILE *f;
+	unsigned long long start, end, size, flags;
+	pci_region_t *rp;
+	pci_dev_t *dev = &pci_dev->real_device;
+
+	dev->region_number = 0;
+	snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+		 r_bus, r_dev, r_func);
+	snprintf(name, sizeof(name), "%sconfig", dir);
+	fd = open(name, O_RDWR);
+	if (fd == -1) {
+		fprintf(stderr, "%s: %m\n", name);
+		return 1;
+	}
+	dev->config_fd = fd;
+again:
+	r = read(fd, pci_dev->dev.config, sizeof pci_dev->dev.config);
+	if (r < 0) {
+		if (errno == EINTR || errno == EAGAIN)
+			goto again;
+		fprintf(stderr, "%s: read failed, errno = %d\n", __func__,
+			errno);
+	}
+
+	snprintf(name, sizeof(name), "%sresource", dir);
+	f = fopen(name, "r");
+	if (f == NULL) {
+		fprintf(stderr, "%s: %m\n", name);
+		close(dev->config_fd);	/* don't leak the config fd */
+		return 1;
+	}
+
+	/* sysfs may list more lines than regions[] can hold; stop at the cap */
+	for (r = 0; r < MAX_IO_REGIONS; r++) {
+		if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
+			break;
+		rp = dev->regions + r;
+		rp->valid = 0;
+		size = end - start + 1;
+		flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+			continue;
+		if (flags & IORESOURCE_MEM) {
+			flags &= ~IORESOURCE_IO;
+			snprintf(name, sizeof(name), "%sresource%d", dir, r);
+			fd = open(name, O_RDWR);
+			if (fd == -1)
+				continue;		/* probably ROM */
+			rp->resource_fd = fd;
+		} else
+			flags &= ~IORESOURCE_PREFETCH;
+
+		rp->type = flags;
+		rp->valid = 1;
+		rp->base_addr = start;
+		rp->size = size;
+		DEBUG("region %d size %d start 0x%llx type %d "
+		      "resource_fd %d\n", r, rp->size, start, rp->type,
+		      rp->resource_fd);
+	}
+	fclose(f);
+
+	dev->region_number = r;
+	return 0;
+}
+
+static assigned_dev_t *register_real_device(PCIBus *e_bus,
+					    const char *e_dev_name,
+					    int e_devfn, uint8_t r_bus,
+					    uint8_t r_dev,
+					    uint8_t r_func)
+{
+	int rc;
+	assigned_dev_t *pci_dev;
+	uint8_t e_device, e_intx;
+
+	DEBUG("register_real_device: Registering real physical "
+	      "device %s (devfn=0x%x)\n", e_dev_name, e_devfn);
+	
+	pci_dev = (assigned_dev_t *)
+		pci_register_device(e_bus, e_dev_name,
+				    sizeof(assigned_dev_t), e_devfn,
+				    assigned_dev_pci_read_config,
+				    assigned_dev_pci_write_config);
+
+	if (NULL == pci_dev) {
+		fprintf(stderr, "register_real_device: Error: Couldn't "
+			"register real device %s\n", e_dev_name);
+		return NULL;
+	}
+	if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
+		fprintf(stderr, "register_real_device: Error: Couldn't get "
+			"real device (%s)!\n", e_dev_name);
+		return NULL;
+	}
+
+	/* handle real device's MMIO/PIO BARs */
+	if (assigned_dev_register_regions(pci_dev->real_device.regions,
+					  pci_dev->real_device.region_number,
+					  pci_dev))
+		return NULL;
+
+	/* handle interrupt routing */
+	e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
+	e_intx = pci_dev->dev.config[0x3d] - 1;
+	pci_dev->intpin = e_intx;
+	pci_dev->run = 0;
+	pci_dev->girq = 0;
+	pci_dev->h_busnr = r_bus;
+	pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+	if (kvm_enabled()) {
+		struct kvm_assigned_dev assigned_dev_data;
+
+		memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
+		assigned_dev_data.guest.busnr = pci_bus_num(e_bus);
+		assigned_dev_data.guest.devfn = PCI_DEVFN(e_device, r_func);
+		assigned_dev_data.guest.num_valid_irqs = 1;
+		assigned_dev_data.host.busnr  = pci_dev->h_busnr;
+		assigned_dev_data.host.devfn  = pci_dev->h_devfn;
+		assigned_dev_data.host.num_valid_irqs = 1;
+		/* We'll set the value of the guest irq as and when
+		 * the piix config gets updated. See assigned_dev_update_irq.
+		 * The host irq field never gets used anyway
+		 */
+		rc = kvm_update_assigned_device(kvm_context,
+						&assigned_dev_data);
+		if (rc < 0) {
+			fprintf(stderr, "Could not notify kernel about "
+				"assigned device\n");
+			perror("pt-ioctl");
+			return NULL;
+		}
+	}
+#endif
+
+	fprintf(logfile, "Registered host PCI device %02x:%02x.%1x "
+		"as guest device %02x:%02x.%1x\n",
+		r_bus, r_dev, r_func,
+		pci_bus_num(e_bus), e_device, r_func);
+
+	return pci_dev;
+}
+
+#define	MAX_ASSIGNED_DEVS 4
+struct {
+	char name[128];
+	int bus;
+	int dev;
+	int func;
+	assigned_dev_t *assigned_dev;
+} assigned_devices[MAX_ASSIGNED_DEVS];
+
+int num_assigned_devices;
+extern int piix_get_irq(int);
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/* The pci config space got updated. Check if irq numbers have changed
+ * for our devices
+ */
+void assigned_dev_update_irq(PCIDevice *d)
+{
+	int i, irq, r;
+	assigned_dev_t *assigned_dev;
+
+	for (i = 0; i < num_assigned_devices; i++) {
+		assigned_dev = assigned_devices[i].assigned_dev;
+		if (assigned_dev == NULL)
+			continue;
+
+		irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
+		irq = piix_get_irq(irq);
+		if (irq != assigned_dev->girq) {
+			struct kvm_assigned_dev assigned_dev_data;
+
+			memset(&assigned_dev_data, 0,
+			       sizeof(assigned_dev_data));
+			assigned_dev_data.guest.irq[0] = irq;
+			assigned_dev_data.guest.num_valid_irqs = 1;
+			assigned_dev_data.host.busnr = assigned_dev->h_busnr;
+			assigned_dev_data.host.devfn = assigned_dev->h_devfn;
+			assigned_dev_data.host.num_valid_irqs = 1;
+			r = kvm_update_assigned_device(kvm_context,
+						       &assigned_dev_data);
+			if (r < 0) {
+				perror("assigned_dev_update_irq");
+				continue;
+			}
+			assigned_dev->girq = irq;
+		}
+	}
+}
+#endif
+
+int init_device_assignment(void)
+{
+	/* Do we have any devices to be assigned? */
+	if (num_assigned_devices == 0)
+		return -1;
+	/* port I/O passthrough needs I/O privilege; report a failure */
+	if (iopl(3) < 0)
+		perror("iopl");
+	return 0;
+}
+
+int init_assigned_device(PCIBus *bus, int *index)
+{
+	assigned_dev_t *dev = NULL;
+	int i, ret = 0;
+
+	if (*index == -1) {
+		if (init_device_assignment() < 0)
+			return -1;
+
+		*index = num_assigned_devices - 1;
+	}
+	i = *index;
+
+	dev = register_real_device(bus, assigned_devices[i].name, -1,
+				   assigned_devices[i].bus,
+				   assigned_devices[i].dev,
+				   assigned_devices[i].func);
+	if (dev == NULL) {
+		fprintf(stderr, "Error: Couldn't register device %s\n",
+			assigned_devices[i].name);
+		ret = -1;
+	}
+	assigned_devices[i].assigned_dev = dev;
+
+	--*index;
+	return ret;
+}
+
+/* Parse a -pcidevice argument of the form name/bus:dev.func (hex numbers) */
+void add_assigned_device(const char *arg)
+{
+	char *cp, *cp1;
+
+	if (num_assigned_devices >= MAX_ASSIGNED_DEVS) {
+		fprintf(stderr, "Too many assigned devices (max %d)\n",
+			MAX_ASSIGNED_DEVS);
+		return;
+	}
+	/* reject rather than overflow the fixed-size name buffer */
+	if (strlen(arg) >= sizeof(assigned_devices[0].name))
+		goto bad;
+	strcpy(assigned_devices[num_assigned_devices].name, arg);
+	cp = strchr(assigned_devices[num_assigned_devices].name, '/');
+	if (cp == NULL)
+		goto bad;
+	*cp++ = 0;
+	assigned_devices[num_assigned_devices].bus = strtoul(cp, &cp1, 16);
+	if (*cp1 != ':')
+		goto bad;
+	cp = cp1 + 1;
+	assigned_devices[num_assigned_devices].dev = strtoul(cp, &cp1, 16);
+	if (*cp1 != '.')
+		goto bad;
+	cp = cp1 + 1;
+	assigned_devices[num_assigned_devices].func = strtoul(cp, &cp1, 16);
+	if (*cp1 != 0)
+		goto bad;
+
+	num_assigned_devices++;
+	return;
+bad:
+	fprintf(stderr, "assigned device arg (%s) not in the form of "
+		"name/bus:dev.func\n", arg);
+}
diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
new file mode 100644
index 0000000..f80a1d5
--- /dev/null
+++ b/qemu/hw/device-assignment.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *  Data structures for storing PCI state
+ *
+ *  Adapted to kvm by Qumranet
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ */
+
+#ifndef __DEVICE_ASSIGNMENT_H__
+#define __DEVICE_ASSIGNMENT_H__
+
+#include <sys/mman.h>
+#include "qemu-common.h"
+#include "pci.h"
+#include <linux/types.h>
+
+#define DEVICE_ASSIGNMENT_DEBUG_PIO	(0x01)
+#define DEVICE_ASSIGNMENT_DEBUG_MMIO	(0x02)
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot, func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+typedef uint32_t pciaddr_t;
+
+#define MAX_IO_REGIONS			(6)
+
+typedef struct pci_region_s {
+	int type;	/* Memory or port I/O */
+	int valid;
+	pciaddr_t base_addr;
+	pciaddr_t size;		/* size of the region */
+	int resource_fd;
+} pci_region_t;
+
+typedef struct pci_dev_s {
+	uint8_t bus, dev, func;	/* Bus inside domain, device and function */
+	int irq;		/* IRQ number */
+	uint16_t region_number;	/* number of active regions */
+
+	/* Port I/O or MMIO Regions */
+	pci_region_t regions[MAX_IO_REGIONS];
+	int config_fd;
+} pci_dev_t;
+
+typedef struct assigned_dev_region_s {
+	target_phys_addr_t e_physbase;
+	uint32_t memory_index;
+	void *r_virtbase;	/* mmapped access address */
+	int num;		/* our index within v_addrs[] */
+	uint32_t e_size;        /* emulated size of region in bytes */
+	uint32_t r_size;        /* real size of region in bytes */
+	uint32_t debug;
+} assigned_dev_region_t;
+
+typedef struct assigned_dev_s {
+	PCIDevice dev;
+	int intpin;
+	uint8_t debug_flags;
+	assigned_dev_region_t v_addrs[PCI_NUM_REGIONS];
+	pci_dev_t real_device;
+	int run;
+	int girq;
+	char sirq[4];
+	unsigned char h_busnr;
+	unsigned int h_devfn;
+	int bound;
+} assigned_dev_t;
+
+/* Initialization functions */
+int init_assigned_device(PCIBus *bus, int *index);
+void add_assigned_device(const char *arg);
+void assigned_dev_set_vector(int irq, int vector);
+void assigned_dev_ack_mirq(int vector);
+
+#define logfile stderr
+
+#endif				/* __DEVICE_ASSIGNMENT_H__ */
diff --git a/qemu/hw/isa.h b/qemu/hw/isa.h
index 89b3004..c720f5e 100644
--- a/qemu/hw/isa.h
+++ b/qemu/hw/isa.h
@@ -1,5 +1,7 @@
 /* ISA bus */
 
+#include "hw.h"
+
 extern target_phys_addr_t isa_mem_base;
 
 int register_ioport_read(int start, int length, int size,
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index da60199..71a491d 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -32,6 +32,7 @@
 #include "smbus.h"
 #include "boards.h"
 #include "console.h"
+#include "device-assignment.h"
 
 #include "qemu-kvm.h"
 
@@ -994,6 +995,14 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
         }
     }
 
+    /* Initialize device assignment */
+    if (pci_enabled) {
+	    int r = -1;
+	    do {
+		    init_assigned_device(pci_bus, &r);
+	    } while (r >= 0);
+    }
+
     rtc_state = rtc_init(0x70, i8259[8]);
 
     qemu_register_boot_set(pc_boot_set, rtc_state);
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 92683d1..d45d0ce 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -50,6 +50,7 @@ struct PCIBus {
 
 static void pci_update_mappings(PCIDevice *d);
 static void pci_set_irq(void *opaque, int irq_num, int level);
+void assigned_dev_update_irq(PCIDevice *d); /* in device-assignment.c */
 
 target_phys_addr_t pci_mem_base;
 static int pci_irq_index;
@@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d,
         val >>= 8;
     }
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
+	address >= 0x60 && address <= 0x63)
+	assigned_dev_update_irq(d);
+#endif
+
     end = address + len;
     if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
         /* if the command register is modified, we must modify the mappings */
@@ -555,6 +562,11 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
     bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
 }
 
+int pci_map_irq(PCIDevice *pci_dev, int pin)
+{
+	return pci_dev->bus->map_irq(pci_dev, pin);
+}
+
 /***********************************************************/
 /* monitor info on PCI */
 
diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h
index 60e4094..e11fbbf 100644
--- a/qemu/hw/pci.h
+++ b/qemu/hw/pci.h
@@ -81,6 +81,7 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num,
                             uint32_t size, int type,
                             PCIMapIORegionFunc *map_func);
 
+int pci_map_irq(PCIDevice *pci_dev, int pin);
 uint32_t pci_default_read_config(PCIDevice *d,
                                  uint32_t address, int len);
 void pci_default_write_config(PCIDevice *d,
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index 90cb3a6..9ba1d8e 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -237,6 +237,25 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int level)
     }
 }
 
+int piix3_get_pin(int pic_irq)
+{
+    int i;	/* index of the routing byte at config[0x60 + i] */
+    for (i = 0; piix3_dev && i < 4; i++)
+	    if (piix3_dev->config[0x60+i] == pic_irq)
+		    return i;
+    return -1;	/* no match, or no PIIX3 present */
+}
+
+int piix_get_irq(int pin)
+{
+    if (piix3_dev)
+	    return piix3_dev->config[0x60+pin];
+    if (piix4_dev)
+	    return piix4_dev->config[0x60+pin];
+
+    return 0;
+}
+
 static void piix3_reset(PCIDevice *d)
 {
     uint8_t *pci_conf = d->config;
diff --git a/qemu/vl.c b/qemu/vl.c
index d9b7db2..04ca724 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -37,6 +37,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "hw/device-assignment.h"
 #include "migration.h"
 #include "qemu-kvm.h"
 
@@ -7953,6 +7954,11 @@ static void help(int exitcode)
 #endif
 	   "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
 	   "-no-kvm-pit	    disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+	   "-pcidevice name/bus:dev.func\n"
+	   "                expose a PCI device to the guest OS.\n"
+	   "                'name' is just used for debug logs.\n"
+#endif
 #endif
 #ifdef TARGET_I386
            "-std-vga        simulate a standard VGA card with VESA Bochs Extensions\n"
@@ -8076,6 +8082,9 @@ enum {
     QEMU_OPTION_no_kvm,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+    QEMU_OPTION_pcidevice,
+#endif
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_no_shutdown,
     QEMU_OPTION_show_cursor,
@@ -8165,6 +8174,9 @@ const QEMUOption qemu_options[] = {
 #endif
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
     { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -9047,6 +9059,11 @@ int main(int argc, char **argv)
 		kvm_pit = 0;
 		break;
 	    }
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+	    case QEMU_OPTION_pcidevice:
+		add_assigned_device(optarg);
+		break;
+#endif
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
-- 
1.5.6


  reply	other threads:[~2008-07-22 12:18 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-07-22 12:13 device assignemnt: updated patches Ben-Ami Yassour
2008-07-22 12:13 ` [PATCH 1/4] KVM: Add irq ack notifier list Ben-Ami Yassour
2008-07-22 12:13   ` [PATCH 2/4] KVM: pci device assignment Ben-Ami Yassour
2008-07-22 12:13     ` [PATCH 3/4] VT-d: changes to support KVM Ben-Ami Yassour
2008-07-22 12:13       ` [PATCH 4/4] KVM: Device assignemnt with VT-d Ben-Ami Yassour
2008-07-22 12:18         ` device assignment - userspace part Ben-Ami Yassour
2008-07-22 12:18           ` Ben-Ami Yassour [this message]
2008-07-26  9:02         ` [PATCH 4/4] KVM: Device assignemnt with VT-d Avi Kivity
2008-07-28  6:49         ` Han, Weidong
2008-07-28 16:34           ` Ben-Ami Yassour
2008-07-26  8:46     ` [PATCH 2/4] KVM: pci device assignment Avi Kivity
2008-07-28  7:27     ` Yang, Sheng
2008-07-28 16:41       ` Ben-Ami Yassour
2008-07-28  8:33     ` Amit Shah
2008-07-26  8:19   ` [PATCH 1/4] KVM: Add irq ack notifier list Avi Kivity
2008-07-26  9:05 ` device assignemnt: updated patches Avi Kivity
2008-07-26  9:24   ` Han, Weidong
2008-07-26  9:32     ` Avi Kivity
2008-07-26  9:48       ` Han, Weidong
  -- strict thread matches above, loose matches on Subject: below --
2008-07-28 16:26 [PATCH 5/5] This patch extends the VT-d driver to support KVM Ben-Ami Yassour
2008-07-28 16:32 ` Device assignment - userspace part Ben-Ami Yassour
2008-07-28 16:32   ` [PATCH 1/1] KVM/userspace: Support for assigning PCI devices to guest Ben-Ami Yassour
2008-08-01  3:09     ` Han, Weidong
2008-08-05  9:41       ` Ben-Ami Yassour
2008-06-02  6:46 Amit Shah
2008-06-02  7:18 ` Han, Weidong
2008-06-02  8:11   ` Amit Shah
2008-06-04 14:19     ` Avi Kivity
2008-06-04 14:43       ` Amit Shah
2008-06-04 14:53         ` Avi Kivity
2008-06-04 15:53           ` Amit Shah
2008-06-04 16:11           ` Muli Ben-Yehuda
2008-06-02 13:57 ` Anthony Liguori
2008-06-02 17:07   ` Amit Shah
2008-06-02 13:58 ` Anthony Liguori

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1216729105-19863-2-git-send-email-benami@il.ibm.com \
    --to=benami@il.ibm.com \
    --cc=amit.shah@qumranet.com \
    --cc=anthony@codemonkey.ws \
    --cc=avi@qumranet.com \
    --cc=gcosta@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=muli@il.ibm.com \
    --cc=nir@tutis.com \
    --cc=weidong.han@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox