qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Amit Shah <amit.shah@qumranet.com>
To: kvm-devel@lists.sourceforge.net
Cc: Amit Shah <amit.shah@qumranet.com>, qemu-devel@nongnu.org
Subject: [Qemu-devel] [PATCH 1/2] KVM userspace: Add PCI device passthrough support
Date: Wed,  7 Nov 2007 21:45:12 +0200	[thread overview]
Message-ID: <11944647133010-git-send-email-amit.shah@qumranet.com> (raw)
Message-ID: <cc1a3d4ee5e648e13b3c75fc62d9c6c00405c322.1194464687.git.amit.shah@qumranet.com> (raw)

This patch introduces support for device passthrough
from the host to a paravirtualized guest.

A new command-line option, -passthrough, is added.
For example, to invoke it for an Ethernet device sitting at
PCI bus:dev.fn 04:08.0 with host IRQ 18, use this:

-passthrough Ethernet/04:08.0-18

The host driver for the device must be removed before the passthrough is set up.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
---
 qemu/Makefile                      |    6 +-
 qemu/Makefile.target               |    4 +-
 qemu/exec.c                        |    1 +
 qemu/hw/apic.c                     |    2 +
 qemu/hw/passthrough/neo_pci_tree.h |   44 +++
 qemu/hw/passthrough/passthrough.c  |  604 ++++++++++++++++++++++++++++++++++++
 qemu/hw/passthrough/passthrough.h  |   64 ++++
 qemu/hw/pc.c                       |    3 +
 qemu/hw/pci.c                      |    5 +
 qemu/hw/piix_pci.c                 |    6 +
 qemu/vl.c                          |    6 +
 tools/pci_barsize.c                |   53 ++++
 tools/pci_mmio.c                   |   82 +++++
 13 files changed, 876 insertions(+), 4 deletions(-)
 create mode 100644 qemu/hw/passthrough/neo_pci_tree.h
 create mode 100644 qemu/hw/passthrough/passthrough.c
 create mode 100644 qemu/hw/passthrough/passthrough.h
 create mode 100644 tools/pci_barsize.c
 create mode 100644 tools/pci_mmio.c

diff --git a/qemu/Makefile b/qemu/Makefile
index 053c88c..3e599f3 100644
--- a/qemu/Makefile
+++ b/qemu/Makefile
@@ -37,7 +37,7 @@ qemu-img$(EXESUF): qemu-img.c cutils.c block.c block-raw.c block-cow.c block-qco
 dyngen$(EXESUF): dyngen.c
 	$(HOST_CC) $(CFLAGS) $(CPPFLAGS) $(BASE_CFLAGS) -o $@ $^
 
-clean:
+clean: 
 # avoid old build problems by removing potentially incorrect old files
 	rm -f config.mak config.h op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h
 	rm -f *.o *.a $(TOOLS) dyngen$(EXESUF) TAGS cscope.* *.pod *~ */*~
@@ -88,8 +88,8 @@ endif
 test speed test2: all
 	$(MAKE) -C tests $@
 
-TAGS:
-	etags *.[ch] tests/*.[ch]
+TAGS: 
+	etags *.[ch] tests/*.[ch] hw/passthrough/*.[ch]
 
 cscope:
 	rm -f ./cscope.*
diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 65f449e..9a96011 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -24,7 +24,7 @@ ifeq ($(TARGET_ARCH), sparc64)
 TARGET_BASE_ARCH:=sparc
 endif
 TARGET_PATH=$(SRC_PATH)/target-$(TARGET_BASE_ARCH)
-VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/audio
+VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/hw/passthrough:$(SRC_PATH)/audio
 CPPFLAGS=-I. -I.. -I$(TARGET_PATH) -I$(SRC_PATH)
 ifdef CONFIG_DARWIN_USER
 VPATH+=:$(SRC_PATH)/darwin-user
@@ -454,6 +454,8 @@ VL_OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o $(AUDIODRV)
 VL_OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 VL_OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 VL_OBJS+= usb-uhci.o smbus_eeprom.o vmmouse.o vmport.o vmware_vga.o
+# passthrough support
+VL_OBJS+= passthrough.o
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 endif
 ifeq ($(TARGET_BASE_ARCH), ppc)
diff --git a/qemu/exec.c b/qemu/exec.c
index 3e588d5..7a21ca5 100644
--- a/qemu/exec.c
+++ b/qemu/exec.c
@@ -2484,6 +2484,7 @@ int cpu_register_io_memory(int io_index,
         if (io_mem_nb >= IO_MEM_NB_ENTRIES)
             return -1;
         io_index = io_mem_nb++;
+	fprintf(stderr, "iomem index %d out of %d\n", io_index, IO_MEM_NB_ENTRIES);
     } else {
         if (io_index >= IO_MEM_NB_ENTRIES)
             return -1;
diff --git a/qemu/hw/apic.c b/qemu/hw/apic.c
index 60d31fa..5b1bdf4 100644
--- a/qemu/hw/apic.c
+++ b/qemu/hw/apic.c
@@ -349,6 +349,7 @@ static void apic_eoi(APICState *s)
     /* XXX: send the EOI packet to the APIC bus to allow the I/O APIC to
             set the remote IRR bit for level triggered interrupts. */
     apic_update_irq(s);
+    pt_ack_mirq(isrv);
 }
 
 static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
@@ -1122,6 +1123,7 @@ static void ioapic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t va
                     } else {
                         s->ioredtbl[index] &= ~0xffffffffULL;
                         s->ioredtbl[index] |= val;
+                        pt_set_vector(index, (val << 24) >> 24);
                     }
                     ioapic_service(s);
                 }
diff --git a/qemu/hw/passthrough/neo_pci_tree.h b/qemu/hw/passthrough/neo_pci_tree.h
new file mode 100644
index 0000000..79adef9
--- /dev/null
+++ b/qemu/hw/passthrough/neo_pci_tree.h
@@ -0,0 +1,44 @@
+/*************************************************************************************************
+
+    Some data structures to save the result of the PCI probing.
+    
+    Copyright (c) 2007, Neocleus: Guy Zana, Alex Novik
+
+**************************************************************************************************/
+
+#ifndef __XC_NEO_PCI_TREE_H__
+#define __XC_NEO_PCI_TREE_H__
+
+#include <linux/types.h>
+
+typedef __u8 u8;
+typedef __u16 u16;
+typedef __u32 u32;
+typedef __u64 u64;
+
+
+/************************************ Data Types / Structures ************************************/
+
+typedef u32 pciaddr_t;
+
+#define MAX_IO_REGIONS			(6)
+
+typedef struct pci_region_s {
+	int type;	/* Memory or port I/O */
+	int valid;
+	pciaddr_t base_addr;
+	pciaddr_t size;		/* size of the region */
+	int resource_fd;
+} pci_region_t;
+
+typedef struct neo_pci_dev_s {
+	u8 bus, dev, func;	/* Bus inside domain, device and function */
+	int irq;		/* IRQ number */
+	u16 region_number;	/* number of active regions */
+
+	/* Port I/O or MMIO Regions */
+	pci_region_t regions[MAX_IO_REGIONS];
+	int config_fd;
+} neo_pci_dev_t;
+
+#endif				/* __XC_NEO_PCI_TREE_H__ */
diff --git a/qemu/hw/passthrough/passthrough.c b/qemu/hw/passthrough/passthrough.c
new file mode 100644
index 0000000..42540a7
--- /dev/null
+++ b/qemu/hw/passthrough/passthrough.c
@@ -0,0 +1,604 @@
+/******************************************************************************
+
+    PCI config handling, MMIO & PIO access through dom0 is done for
+    debugging needs.
+
+    Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+    Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+
+******************************************************************************/
+#include <stdio.h>
+#include <pthread.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+
+#include "neo_pci_tree.h"
+
+typedef u64 resource_size_t;
+#define __deprecated 
+
+#include <linux/ioport.h>
+#include "vl.h"
+#include "passthrough.h"
+
+#ifdef USE_KVM
+#include "qemu-kvm.h"
+#include <linux/kvm_para.h>
+extern kvm_context_t kvm_context;
+#endif
+extern FILE *logfile;
+
+CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
+	pt_mmio_readb,
+	pt_mmio_readw,
+	pt_mmio_readl
+};
+
+CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
+	pt_mmio_writeb,
+	pt_mmio_writew,
+	pt_mmio_writel
+};
+
+//#define PT_DEBUG
+
+#ifdef PT_DEBUG
+#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __FUNCTION__ , ## args)
+#else
+#define DEBUG(fmt, args...)
+#endif
+
+/* Emit pt_mmio_write{b,w,l}: one volatile store into the mmapped region. */
+#define pt_mmio_write(suffix, type)					\
+void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys,	\
+				uint32_t value)				\
+{									\
+	pt_region_t *r_access = (pt_region_t *)opaque;			\
+	void *r_virt = (u8 *)r_access->r_virtbase + 			\
+			(e_phys - r_access->e_physbase);		\
+	if (r_access->debug & PT_DEBUG_MMIO)				\
+		fprintf(logfile, "pt_mmio_write" #suffix		\
+			": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \
+			(void *)r_access->e_physbase, (void *)e_phys,	\
+			r_virt, value);					\
+	*(volatile type *)r_virt = (type)value;				\
+}
+
+pt_mmio_write(b, u8)
+pt_mmio_write(w, u16)
+pt_mmio_write(l, u32)
+
+/* Emit pt_mmio_read{b,w,l}: one volatile load from the mmapped region. */
+#define pt_mmio_read(suffix, type)					\
+uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys)	\
+{									\
+	pt_region_t *r_access = (pt_region_t *)opaque;			\
+	void *r_virt = (u8 *)r_access->r_virtbase + 			\
+			(e_phys - r_access->e_physbase);		\
+	uint32_t value = (u32) (*(volatile type *) r_virt);		\
+	if (r_access->debug & PT_DEBUG_MMIO)				\
+		fprintf(logfile,					\
+			"pt_mmio_read" #suffix ": e_physbase=%p "	\
+			"e_phys=%p r_virt=%p value=%08x\n",		\
+			(void *)r_access->e_physbase,			\
+			(void *)e_phys, r_virt, value);			\
+	return value;							\
+}
+
+pt_mmio_read(b, u8)
+pt_mmio_read(w, u16)
+pt_mmio_read(l, u32)
+
+#define pt_ioport_write(suffix)						\
+void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \
+{									\
+	pt_region_t *r_access = (pt_region_t *)opaque;			\
+	uint32_t r_pio = (unsigned long)r_access->r_virtbase		\
+			 + (addr - r_access->e_physbase);		\
+	if (r_access->debug & PT_DEBUG_PIO) {				\
+		fprintf(logfile, "pt_ioport_write" #suffix 		\
+			": r_pio=%08x e_physbase=%08x"			\
+			" r_virtbase=%08lx value=%08x\n", 		\
+			r_pio, (int)r_access->e_physbase,		\
+			(unsigned long)r_access->r_virtbase, value);	\
+	}								\
+	out##suffix(value, r_pio);					\
+}
+
+pt_ioport_write(b)
+pt_ioport_write(w)
+pt_ioport_write(l)
+
+#define pt_ioport_read(suffix)						\
+uint32_t pt_ioport_read##suffix(void *opaque, uint32_t addr)		\
+{									\
+	pt_region_t *r_access = (pt_region_t *)opaque;			\
+	uint32_t r_pio = (addr - r_access->e_physbase)			\
+			+ (unsigned long)r_access->r_virtbase;		\
+	uint32_t value = in##suffix(r_pio);				\
+	if (r_access->debug & PT_DEBUG_PIO) {				\
+		fprintf(logfile, "pt_ioport_read" #suffix 		\
+			": r_pio=%08x e_physbase=%08x r_virtbase=%08lx "\
+			"value=%08x\n", 				\
+			r_pio, (int)r_access->e_physbase, 		\
+			(unsigned long)r_access->r_virtbase, value);	\
+	}								\
+	return (value);							\
+}
+
+pt_ioport_read(b)
+pt_ioport_read(w)
+pt_ioport_read(l)
+
+static void pt_iomem_map(PCIDevice * d, int region_num,
+			 uint32_t e_phys, uint32_t e_size, int type)
+{
+	pt_dev_t *r_dev = (pt_dev_t *) d;
+
+	r_dev->v_addrs[region_num].e_physbase = e_phys;
+
+	DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+	      e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size,
+	      region_num);
+
+	cpu_register_physical_memory(e_phys,
+				     r_dev->dev.io_regions[region_num].size,
+				     r_dev->v_addrs[region_num].memory_index);
+}
+
+
+static void pt_ioport_map(PCIDevice * pci_dev, int region_num,
+			  uint32_t addr, uint32_t size, int type)
+{
+	pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
+	int i;
+	uint32_t ((*rf[])(void *, uint32_t)) =  { pt_ioport_readb,
+						  pt_ioport_readw,
+						  pt_ioport_readl
+						};
+	void ((*wf[])(void *, uint32_t, uint32_t)) = { pt_ioport_writeb,
+						       pt_ioport_writew,
+						       pt_ioport_writel
+						     };
+	/* Record the guest port base for this region, then hook its port range. */
+	r_dev->v_addrs[region_num].e_physbase = addr;
+	fprintf(logfile, "pt_ioport_map: address=0x%x type=0x%x len=%u "
+		"region_num=%d \n", addr, type, size, region_num);
+	/* one read/write handler pair per access width: 1, 2 and 4 bytes */
+	for (i = 0; i < 3; i++) {
+		register_ioport_write(addr, size, 1<<i, wf[i],
+				      (void *) (r_dev->v_addrs + region_num));
+		register_ioport_read(addr, size, 1<<i, rf[i],
+				     (void *) (r_dev->v_addrs + region_num));
+	}
+}
+
+static void pt_pci_write_config(PCIDevice * d, uint32_t address, uint32_t val,
+				int len)
+{
+	int fd;
+
+	DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+	      ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+	      val, len);
+
+	if (address == 0x4)
+		pci_default_write_config(d, address, val, len);
+
+	if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+	    address == 0x3c || address == 0x3d) {
+	  /* used for update-mappings (BAR emulation) */
+		pci_default_write_config(d, address, val, len);
+		return;
+	}
+
+	DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+	      ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+	      val, len);
+	fd = ((pt_dev_t *)d)->real_device.config_fd;
+	lseek(fd, address, SEEK_SET);
+	write(fd, &val, len);
+}
+
+static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
+{
+	uint32_t val = 0;
+	int fd;
+
+	if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+	    address == 0x3c || address == 0x3d) {
+		val = pci_default_read_config(d, address, len);
+		DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+		      (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val,
+		      len);
+		return (val);
+	}
+
+	/* vga specific, remove later */
+	if (address == 0xFC)
+		goto do_log;
+
+	fd = ((pt_dev_t *)d)->real_device.config_fd;
+	lseek(fd, address, SEEK_SET);
+	read(fd, &val, len);
+
+      do_log:
+	DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+	      (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+	/* kill the special capabilities */
+	if (address == 4 && len == 4)
+		val &= ~0x100000;
+	else if (address == 6)
+		val &= ~0x10;
+
+	return (val);
+}
+
+
+int pt_register_regions(pci_region_t * io_regions,
+			unsigned long regions_num, pt_dev_t * pci_dev)
+{
+	uint32_t i;
+	pci_region_t *cur_region = io_regions;
+
+	for (i = 0; i < regions_num; i++, cur_region++) {
+		if (!cur_region->valid)
+			continue;
+#ifdef PT_DEBUG
+		pci_dev->v_addrs[i].debug |= PT_DEBUG_MMIO | PT_DEBUG_PIO;
+#endif
+		pci_dev->v_addrs[i].num = i;
+
+		/* handle memory io regions */
+		if (cur_region->type & IORESOURCE_MEM) {
+			int t = cur_region->type & IORESOURCE_PREFETCH ? PCI_ADDRESS_SPACE_MEM_PREFETCH : PCI_ADDRESS_SPACE_MEM;
+
+			/* map physical memory */
+			pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+			pci_dev->v_addrs[i].r_virtbase =
+			    	mmap(NULL, (cur_region->size + 0xFFF) & 0xFFFFF000, PROT_WRITE | PROT_READ,
+					    MAP_SHARED, cur_region->resource_fd, (off_t) 0);
+
+			if ((void *) -1 ==
+			    pci_dev->v_addrs[i].r_virtbase) {
+				fprintf(logfile, "NEO: Error: Couldn't mmap 0x%x!\n",
+					(uint32_t) (cur_region->base_addr));
+				return (-1);
+			}
+
+			/* add offset */
+			pci_dev->v_addrs[i].r_virtbase += (cur_region->base_addr & 0xFFF);
+
+			pci_register_io_region((PCIDevice *) pci_dev, i, cur_region->size, t, pt_iomem_map);
+
+			pci_dev->v_addrs[i].memory_index =
+			    cpu_register_io_memory(0, pt_mmio_read_cb, pt_mmio_write_cb,
+						(void *) &(pci_dev->v_addrs[i]));
+
+			continue;
+		}
+		/* handle port io regions */
+
+		pci_register_io_region((PCIDevice *) pci_dev, i, cur_region->size, PCI_ADDRESS_SPACE_IO,
+				       pt_ioport_map);
+
+		pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+		pci_dev->v_addrs[i].r_virtbase = (void *)(long)cur_region->base_addr;
+		pci_dev->v_addrs[i].memory_index = 0;	// not relevant for port io
+	}
+
+	/* success */
+	return (0);
+
+}
+
+int
+pt_get_real_device(pt_dev_t *pci_dev, uint8_t r_bus, uint8_t r_dev,
+		   uint8_t r_func)
+{
+	char dir[128], name[128], comp[16];
+	int fd, r = 0;
+	FILE *f;
+	unsigned long long start, end, size, flags;
+	pci_region_t *rp;
+	neo_pci_dev_t *dev = &pci_dev->real_device;
+	/* Opens the device's sysfs "config" file and parses "resource" into dev->regions[]; returns 0 on success, 1 on error. */
+	dev->region_number = 0;
+
+	sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+		r_bus, r_dev, r_func);
+	strcpy(name, dir);
+	strcat(name, "config");
+	if ((fd = open(name, O_RDWR)) == -1) {
+		fprintf(logfile, "%s: %m\n", name);
+		return 1;
+	}
+	dev->config_fd = fd;	/* kept open for later config-space accesses */
+	read(fd, pci_dev->dev.config, sizeof pci_dev->dev.config);
+
+	strcpy(name, dir);
+	strcat(name, "resource");
+	if ((f = fopen(name, "r")) == NULL) {
+		fprintf(logfile, "%s: %m\n", name);
+		return 1;
+	}
+	/* stop at MAX_IO_REGIONS: the file may list more entries than regions[] can hold */
+	for (r = 0; r < MAX_IO_REGIONS && fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3; r++) {
+		rp = dev->regions + r;
+		rp->valid = 0;
+		size = end - start + 1;
+		flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+			continue;
+		if (flags & IORESOURCE_MEM) {
+			flags &= ~IORESOURCE_IO;
+			sprintf(comp, "resource%d", r);
+			strcpy(name, dir);
+			strcat(name, comp);
+			if ((fd = open(name, O_RDWR)) == -1)
+				continue;		// probably ROM
+			rp->resource_fd = fd;
+		} else
+			flags &= ~IORESOURCE_PREFETCH;
+
+		rp->type = flags;
+		rp->valid = 1;
+		rp->base_addr = start;
+		rp->size = size;
+		fprintf(logfile, "region %d size %u start 0x%llx type %d resource_fd %d\n", r, rp->size, start, rp->type, rp->resource_fd);
+	}
+	fclose(f);
+
+	dev->region_number = r;	/* number of resource lines consumed, valid or not */
+	return 0;
+}
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot,func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+pt_dev_t *register_real_device(PCIBus * e_bus, const char *e_dev_name,
+			       int e_devfn, uint8_t r_bus, uint8_t r_dev,
+			       uint8_t r_func, uint32_t machine_irq)
+{
+	int rc;
+	pt_dev_t *pci_dev;
+	uint8_t e_device, e_intx;
+	struct kvm_pv_passthrough_dev pv_pci_dev;
+	/* Registers an emulated PCI device backed by host device r_bus:r_dev.r_func; returns it, or NULL on failure. */
+	fprintf(logfile, "register_real_device: Registering real physical device %s (devfn=0x%x)\n", e_dev_name, e_devfn);
+
+	pci_dev = (pt_dev_t *) pci_register_device(e_bus, e_dev_name, sizeof(pt_dev_t), e_devfn,
+						   pt_pci_read_config, pt_pci_write_config);
+
+	if (NULL == pci_dev) {
+		fprintf(logfile, "register_real_device: Error: Couldn't register real device %s\n", e_dev_name);
+		return (NULL);
+	}
+	if (pt_get_real_device(pci_dev, r_bus, r_dev, r_func)) {
+		fprintf(logfile, "register_real_device: Error: Couldn't get real device (%s)!\n", e_dev_name);
+		return NULL;
+	}
+
+	/* handle real device's MMIO/PIO BARs */
+	if (pt_register_regions(pci_dev->real_device.regions, pci_dev->real_device.region_number, pci_dev))
+		return (NULL);
+
+	/* handle interrupt routing */
+	e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
+	e_intx = pci_dev->dev.config[0x3d] - 1;
+	pci_dev->intpin = e_intx;
+	pci_dev->run = 0;
+	pci_dev->mirq = machine_irq;
+
+	/* bind machine_irq to device */
+	if (machine_irq) {
+		fprintf(logfile, "Binding mirq %u to device=0x%x intpin=0x%x\n",
+				machine_irq, e_device, pci_dev->intpin);
+		rc = snprintf(pci_dev->sirq, sizeof pci_dev->sirq, "%d", pci_dev->mirq);
+		rc = rc < (int)sizeof pci_dev->sirq ? pt_bind_mirq(r_bus, r_dev, r_func) : -1;	/* -1: irq does not fit in sirq[] */
+		if (rc) {
+			fprintf(logfile, "pt_bind %d failed rc=%d\n", pci_dev->mirq, rc);
+			return NULL;
+		}
+	}
+
+#ifdef USE_KVM
+	/* Let the host kernel know we'll be dealing with this device hereafter */
+	pv_pci_dev.guest.busnr = pci_bus_num(e_bus);
+	pv_pci_dev.guest.devfn = PCI_DEVFN(e_device, r_func);
+	pv_pci_dev.mach.busnr  = r_bus;
+	pv_pci_dev.mach.devfn  = PCI_DEVFN(r_dev, r_func);
+
+	rc = ioctl(kvm_get_vm_fd(kvm_context), KVM_ASSIGN_PV_PCI_DEV,
+		   &pv_pci_dev);
+	if (rc == -1) {
+		fprintf(stderr, "Could not notify kernel about passthrough "
+			"device\n");
+		perror("pt-ioctl:");
+		return NULL;
+	}
+#endif
+	fprintf(logfile, "register_real_device: Real physical device (%02x:%02x.%x) \"%s\" registered successfully!\n", r_bus, r_dev, r_func, e_dev_name);
+
+	return (pci_dev);
+}
+
+#define	MAX_PTDEVS 4
+struct {
+	char name[128];
+	int bus;
+	int dev;
+	int func;
+	int irq;
+	pt_dev_t *ptdev;
+} ptdevs[MAX_PTDEVS];
+
+int nptdevs;
+
+static QEMUBH *ptbh;
+static int irqfd;
+static pt_dev_t **apicv[0xfe]; //0x10 - 0xfe according to intel IOAPIC spec
+#define IRQHOOK_DEV "/dev/irqhook"
+static pthread_t irqthread;
+
+void pt_irq(void *arg)
+{
+	char buf[20];
+	int irq;
+	int i, n;
+	pt_dev_t *dev;
+
+	if (!irqfd) {
+		fprintf(stderr, "pt_irq: irqfd %d, exiting\n", irqfd);
+		exit(-1);
+	}
+	/* Loop forever: read an IRQ number from irqfd, flag matching devices, kick the bottom half. */
+	for (;;) {
+		if ((n = read(irqfd, buf, sizeof buf - 1)) == -1) {
+			if (errno == EINTR) continue;
+			perror("irq read: ");
+		}
+		/* terminate what was read; after an error this parses as "" (irq 0) */
+		buf[n > 0 ? n : 0] = 0;
+		irq = atoi(buf);
+		DEBUG("read irq %d\n", irq);
+		if (!irq) continue;
+		for (i = 0; i < nptdevs; i++) if ((dev = ptdevs[i].ptdev) && dev->mirq == irq) dev->run = 1;
+		qemu_bh_schedule(ptbh);
+	}
+}
+
+static void pt_bh(void *p)
+{
+	int i;
+	pt_dev_t *dev;
+	for (i = 0; i < nptdevs; i++)
+		if ((dev = ptdevs[i].ptdev) && dev->run) {
+			qemu_set_irq(dev->dev.irq[dev->intpin], 1);
+			dev->run = 0;
+			if (cpu_single_env) cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT);
+		}
+}
+
+int pt_init(PCIBus * bus)
+{
+	pt_dev_t *dev;
+	int i, ret = 0;
+
+	iopl(3);	/* request port I/O privilege for the in/out based PIO handlers */
+
+	if (!(ptbh = qemu_bh_new(pt_bh, 0))) {
+		fprintf(logfile, "Couldn't register PT callback\n");
+		return -1;
+	}
+	/* open() reports failure as -1, not 0 */
+	if ((irqfd = open(IRQHOOK_DEV, O_RDWR)) < 0) {
+		fprintf(logfile, "Couldn't open PT irqhook dev\n");
+		return -1;
+	}
+
+	if (pthread_create(&irqthread, 0, pt_irq, 0)) {
+		fprintf(logfile, "Couldn't create IRQ thread\n");
+		return -1;
+	}
+	/* register every device queued by add_passthrough_device(); keep going after a failure */
+	for (i = 0; i < nptdevs; i++) {
+		dev = register_real_device(bus, ptdevs[i].name, -1, ptdevs[i].bus, ptdevs[i].dev, ptdevs[i].func, ptdevs[i].irq);
+
+		if (dev == NULL) {
+			fprintf(logfile, "NEO: Error: Couldn't register %s\n", ptdevs[i].name);
+			ret = -1;
+		}
+		ptdevs[i].ptdev = dev;
+	}
+
+	/* 0 when every device registered, -1 if any failed */
+	return (ret);
+}
+
+void
+add_passthrough_device(char *arg)
+{
+	/* Parse "name/bus:dev.func-intr" into the next ptdevs[] slot; bus, dev and func are hex. */
+	char *cp, *cp1;
+
+	if (nptdevs >= MAX_PTDEVS) {
+		fprintf(logfile, "Too many passthrough devices (max %d)\n", MAX_PTDEVS);
+		return;
+	}
+	if (strlen(arg) >= sizeof ptdevs[nptdevs].name) goto bad;	/* would overflow name[] */
+	strcpy(ptdevs[nptdevs].name, arg);
+	cp = strchr(ptdevs[nptdevs].name, '/');
+	if (cp == NULL)
+		goto bad;
+	*cp++ = 0;	/* cut the name off at the '/' */
+
+	ptdevs[nptdevs].bus = strtoul(cp, &cp1, 16);
+	if (*cp1 != ':')
+		goto bad;
+	cp = cp1 + 1;
+
+	ptdevs[nptdevs].dev = strtoul(cp, &cp1, 16);
+	if (*cp1 != '.')
+		goto bad;
+	cp = cp1 + 1;
+
+	ptdevs[nptdevs].func = strtoul(cp, &cp1, 16);
+	if (*cp1 != '-')
+		goto bad;
+	cp = cp1 + 1;
+
+	ptdevs[nptdevs].irq = strtoul(cp, &cp1, 0);
+	if (*cp1 != 0)
+		goto bad;
+	nptdevs++;	/* commit the slot only after a full parse */
+	return;
+    bad:
+	fprintf(logfile, "passthrough arg (%s) not in the form of name/bus:dev.func-intr\n", arg);
+}
+
+void pt_ack_mirq(int vector)
+{
+	pt_dev_t **p;	/* NULL-terminated list of devices bound to this vector */
+	if (vector < 0 || vector >= (int)(sizeof apicv / sizeof apicv[0]) || !(p = apicv[vector])) return;
+	/* write each device's IRQ string to irqfd (presumably the host-side ack) and lower its guest line */
+	for (; *p; p++) {
+		write(irqfd, (*p)->sirq, strlen((*p)->sirq));
+		qemu_set_irq((*p)->dev.irq[(*p)->intpin], 0);
+	}
+}
+
+int pt_bind_mirq(int bus, int dev, int fn)
+{
+	char s[64];
+	sprintf(s, "+%d:%d.%d", bus, dev, fn);
+	if (write(irqfd, s, strlen(s)) != strlen(s)) {
+		perror("pt_bind_mirq:");
+		exit(-1);
+	}
+	return 0;
+}
+
+void pt_set_vector(int irq, int vector)
+{
+	int i, j;
+	int pin = piix3_get_pin(irq);
+	pt_dev_t *pt, **p;
+	/* Append each still-unbound device routed through irq's pin to the NULL-terminated list apicv[vector]. */
+	DEBUG("irq %d vector %d\n", irq, vector);
+	if (vector < 0 || vector >= (int)(sizeof apicv / sizeof apicv[0])) return;
+	for (i = 0; i < nptdevs; i++) {
+		pt = ptdevs[i].ptdev;
+		if (!pt || pt->bound) continue;
+		if (pci_map_irq(&pt->dev, pt->intpin) == pin) {
+			for (j = 0, p = apicv[vector]; p && p[j]; j++);	/* j = current entry count */
+			if (!(p = realloc(apicv[vector], (j + 2) * sizeof pt))) return;	/* old list stays valid */
+			apicv[vector] = p;
+			p[j] = pt;
+			p[j + 1] = 0;	/* room for entry + terminator was reserved above */
+			pt->bound = 1;
+		}
+	}
+	DEBUG("done\n");
+}
diff --git a/qemu/hw/passthrough/passthrough.h b/qemu/hw/passthrough/passthrough.h
new file mode 100644
index 0000000..3d8542d
--- /dev/null
+++ b/qemu/hw/passthrough/passthrough.h
@@ -0,0 +1,64 @@
+/*************************************************************************************************
+
+    PCI config handling, MMIO & PIO access through dom0 is done for debugging needs.
+
+    Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+    Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+
+**************************************************************************************************/
+
+#include <sys/mman.h>
+#include "vl.h"
+
+#ifndef __PASSTHROUGH_H__
+#define __PASSTHROUGH_H__
+
+#define PT_DEBUG_PIO	(0x01)
+#define PT_DEBUG_MMIO	(0x02)
+
+typedef struct pt_region_s {
+	target_phys_addr_t e_physbase;
+	uint32_t memory_index;
+	void *r_virtbase;	/* mmapped access address */
+	int num;		/* our index within v_addrs[] */
+	uint32_t debug;
+} pt_region_t;
+
+typedef struct pt_dev_s {
+	PCIDevice dev;
+	int intpin;
+	uint8_t debug_flags;
+	pt_region_t v_addrs[PCI_NUM_REGIONS];
+	neo_pci_dev_t real_device;
+	int run;
+	int mirq;
+	char sirq[4];
+	int bound;
+} pt_dev_t;
+
+
+/* MMIO access functions */
+uint32_t pt_mmio_readb(void *opaque, target_phys_addr_t e_phys);
+uint32_t pt_mmio_readw(void *opaque, target_phys_addr_t e_phys);
+uint32_t pt_mmio_readl(void *opaque, target_phys_addr_t e_phys);
+void pt_mmio_writeb(void *opaque, target_phys_addr_t e_phys, uint32_t value);
+void pt_mmio_writew(void *opaque, target_phys_addr_t e_phys, uint32_t value);
+void pt_mmio_writel(void *opaque, target_phys_addr_t e_phys, uint32_t value);
+
+/* PIO access functions */
+uint32_t pt_ioport_readb(void *opaque, uint32_t addr);
+uint32_t pt_ioport_readw(void *opaque, uint32_t addr);
+uint32_t pt_ioport_readl(void *opaque, uint32_t addr);
+void pt_ioport_writeb(void *opaque, uint32_t addr, uint32_t value);
+void pt_ioport_writew(void *opaque, uint32_t addr, uint32_t value);
+void pt_ioport_writel(void *opaque, uint32_t addr, uint32_t value);
+
+/* Registration functions */
+int register_pt_pio_region(uint32_t pio_start, uint32_t length,
+			   uint8_t do_logging);
+int register_pt_mmio_region(uint32_t mmio_addr, uint32_t length,
+			    uint8_t do_logging);
+
+#define logfile stderr
+
+#endif				/* __PASSTHROUGH_H__ */
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 8aae814..d7892e0 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -888,6 +888,9 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, int boot_device,
         }
     }
 
+    /* Initialize pass-through */
+    pt_init(pci_bus);
+
     rtc_state = rtc_init(0x70, i8259[8]);
 
     register_ioport_read(0x92, 1, 1, ioport92_read, NULL);
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 7e8adc4..8be3645 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -457,6 +457,11 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
     bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
 }
 
+int pci_map_irq(PCIDevice *pci_dev, int pin)
+{
+	return pci_dev->bus->map_irq(pci_dev, pin);
+}
+
 /***********************************************************/
 /* monitor info on PCI */
 
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index 8c00f0d..a9d87bd 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -225,6 +225,12 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int level)
     }
 }
 
+int piix3_get_pin(int pic_irq)
+{
+	const uint8_t *r = &piix3_dev->config[0x60];	/* config bytes 0x60..0x63, one per pin 0..3 */
+	return r[0] == pic_irq ? 0 : r[1] == pic_irq ? 1 : r[2] == pic_irq ? 2 : r[3] == pic_irq ? 3 : -1;	/* first match, or -1 if none */
+}
+
 static void piix3_reset(PCIDevice *d)
 {
     uint8_t *pci_conf = d->config;
diff --git a/qemu/vl.c b/qemu/vl.c
index 634fb34..21b3d47 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -1182,6 +1182,7 @@ static void host_alarm_handler(int host_signum)
         SetEvent(data->host_alarm);
 #endif
         CPUState *env = cpu_single_env;
+
         if (env) {
             /* stop the currently executing cpu because a timer occured */
             cpu_interrupt(env, CPU_INTERRUPT_EXIT);
@@ -7532,6 +7533,7 @@ enum {
     QEMU_OPTION_vnc,
     QEMU_OPTION_no_acpi,
     QEMU_OPTION_no_kvm,
+    QEMU_OPTION_passthrough,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_show_cursor,
@@ -7611,6 +7613,7 @@ const QEMUOption qemu_options[] = {
 #endif
 #ifdef USE_KVM
     { "no-kvm", 0, QEMU_OPTION_no_kvm },
+    { "passthrough", HAS_ARG, QEMU_OPTION_passthrough },
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
@@ -8427,6 +8430,9 @@ int main(int argc, char **argv)
 	    case QEMU_OPTION_no_kvm:
 		kvm_allowed = 0;
 		break;
+	    case QEMU_OPTION_passthrough:
+		add_passthrough_device(optarg);
+		break;
 	    case QEMU_OPTION_no_kvm_irqchip:
 		kvm_irqchip = 0;
 		break;
diff --git a/tools/pci_barsize.c b/tools/pci_barsize.c
new file mode 100644
index 0000000..dd230c9
--- /dev/null
+++ b/tools/pci_barsize.c
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+int
+panic(char *msg)
+{
+	perror(msg);
+	exit(1);
+}
+
+int
+main(int argc, char **argv)
+{
+	unsigned l, b, sz;
+	int fd, ismem, bar = 0, offs;
+
+	if (argc < 2)
+		panic("usage: pci_barsize <file> [bar no]");
+	/* <file> is read and written at BAR offsets 0x10..0x24, i.e. treated as PCI config space */
+	if ((fd = open(argv[1], O_RDWR)) < 0)
+		panic("open");
+
+	if (argc > 2)
+		bar = strtoul(argv[2], 0, 0);
+	if (bar < 0 || bar > 5)
+		panic("bar range 0-5");
+
+	offs = 0x10 + bar * 4;
+	lseek(fd, offs, 0);
+	read(fd, &l, sizeof(l));
+	printf("bar %d (offs 0x%x) == %x\n", bar, offs, l);
+
+	ismem = !(l & 0x01);	/* bit 0 set means an I/O BAR */
+	/* BAR sizing: write all ones, read back the size mask, then restore the original value */
+	b = ~0;
+	lseek(fd, offs, 0);
+	write(fd, &b, sizeof(b));
+
+	lseek(fd, offs, 0);
+	read(fd, &b, sizeof(b));
+	sz = ~(b & (ismem ? ~0xfu : ~0x3u)) + 1;	/* drop flag bits: low 4 for memory, low 2 for I/O */
+	printf("bar %d %s size 0x%x == %uKB (%x)\n",
+		bar, ismem ? "memory" : "IO", sz, sz / 1024, b);
+
+	lseek(fd, offs, 0);
+	write(fd, &l, sizeof(l));
+
+	return 0;
+}
diff --git a/tools/pci_mmio.c b/tools/pci_mmio.c
new file mode 100644
index 0000000..6e91571
--- /dev/null
+++ b/tools/pci_mmio.c
@@ -0,0 +1,82 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+int
+panic(char *msg)
+{
+	perror(msg);
+	exit(1);
+}
+
+int
+main(int argc, char **argv)
+{
+	unsigned sz;
+	int fd, cnt, rsz, offs = 0;
+	void *map;
+	struct stat st;
+
+	if (argc < 2)
+		panic("usage: pci_mmio <resource-file> [offset [count]]");
+	/* Dump cnt bytes starting at offs from the mmapped file to stdout. */
+	if ((fd = open(argv[1], O_RDWR)) < 0)
+		panic("open");
+
+	if (fstat(fd, &st) < 0)
+		panic("fstat");
+	cnt = sz = st.st_size;
+
+	if (argc > 2)
+		offs = strtoul(argv[2], 0, 0);
+	if (argc > 3)
+		cnt = strtoul(argv[3], 0, 0);
+
+	if (cnt < 0 || cnt > sz)
+		panic("bad count");
+	if (offs < 0 || offs > sz)
+		panic("bad offset");
+	if (offs + cnt > sz) {
+		cnt = sz - offs;
+		fprintf(stderr, "count truncated to %d\n", cnt);
+	}
+	if (cnt >= 4 && offs % 4)
+		panic("reads of 4 bytes or more must be 4 bytes aligned");
+	if (cnt == 2 && offs % 2)
+		panic("2 bytes read must be 2 bytes aligned");
+	if (cnt != 1 && cnt != 2 && cnt != 4 && cnt % 4)
+		panic("counts must be 1, 2, 4 or 4*n");
+
+	fprintf(stderr, "reading %s [%d:%d]\n", argv[1], offs, offs + cnt);
+	map = mmap(NULL, sz, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+
+	if (map == MAP_FAILED)	/* mmap reports failure as MAP_FAILED, not NULL */
+		panic("mmap");
+
+	rsz = cnt > 4 ? 4 : cnt;
+	fprintf(stderr, "rsz: %d cnt %d\n", rsz, cnt);
+	while (cnt > 0) {
+		char buf[8];
+		switch (rsz) {	/* one volatile access of exactly rsz bytes at offset offs */
+		case 1:
+			*(char *)buf = ((volatile char *)map)[offs];
+			break;
+		case 2:
+			*(short *)buf = ((volatile short *)map)[offs/sizeof(short)];
+			break;
+		case 4:
+			*(int *)buf = ((volatile int *)map)[offs/4];
+			break;
+		}
+		write(1, buf, rsz);
+
+		offs += rsz;
+		cnt -= rsz;
+	}
+	fprintf(stderr, "done\n");
+	return 0;
+}
-- 
1.5.3

             reply	other threads:[~2007-11-07 19:44 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <cc1a3d4ee5e648e13b3c75fc62d9c6c00405c322.1194464687.git.amit.shah@qumranet.com>
2007-11-07 19:45 ` Amit Shah [this message]
2007-11-08  6:28   ` [Qemu-devel] Re: [kvm-devel] [PATCH 1/2] KVM userspace: Add PCI device passthrough support Avi Kivity
2007-11-08  9:19   ` [Qemu-devel] " Fabrice Bellard
2007-11-08 10:00     ` [kvm-devel] " Dor Laor
     [not found] ` <95597be41c7ffbb889a0e53cb8294203ac6b5519.1194464687.git.amit.shah@qumranet.com>
2007-11-07 19:45   ` [Qemu-devel] [PATCH 2/2] KVM Userspace: IRQ injection into guest Amit Shah
2007-11-07 20:01 ` [Qemu-devel] Re: [kvm-devel] [PATCH 1/2] KVM userspace: Add PCI device passthrough support Hollis Blanchard
2007-11-08  6:12   ` Amit Shah

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=11944647133010-git-send-email-amit.shah@qumranet.com \
    --to=amit.shah@qumranet.com \
    --cc=kvm-devel@lists.sourceforge.net \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).