public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] device-assignment: Use PCI I/O port sysfs resource file when available
@ 2010-07-20 22:11 Alex Williamson
  2010-07-20 23:13 ` Chris Wright
  2010-07-21  3:30 ` [PATCH v2] " Alex Williamson
  0 siblings, 2 replies; 8+ messages in thread
From: Alex Williamson @ 2010-07-20 22:11 UTC (permalink / raw)
  To: kvm; +Cc: ddutile, chrisw, alex.williamson

When supported by the host kernel, we can use read/write on the
PCI sysfs resource file for I/O port regions.  This allows us to
avoid raw in/out commands and works with deprivileged guests via
libvirt.  For uid 0 callers, we use in/out directly to avoid any
compatibility issues.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 Required kernel patch pending here:
 http://www.spinics.net/lists/linux-pci/msg09389.html

 hw/device-assignment.c |  131 ++++++++++++++++++++++++++++++++++++------------
 hw/device-assignment.h |    1 
 2 files changed, 99 insertions(+), 33 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 2bba22f..37c1278 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -67,6 +67,28 @@ static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
     return region->u.r_baseport + (addr - region->e_physbase);
 }
 
+static int assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
+                                  uint32_t addr, int len, uint32_t *val,
+                                  int write)
+{
+    if (dev_region->region->resource_fd == -1)
+        return -1;
+
+    if (write) {
+        if (pwrite(dev_region->region->resource_fd, val, len,
+                  (addr - dev_region->e_physbase)) != len) {
+            return -1;
+        }
+    } else {
+        if (pread(dev_region->region->resource_fd, val, len,
+                  (addr - dev_region->e_physbase)) != len) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
@@ -77,7 +99,9 @@ static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
 	  r_pio, (int)r_access->e_physbase,
 	  (unsigned long)r_access->u.r_baseport, value);
 
-    outb(value, r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 1, &value, 1) != 0) {
+        outb(value, r_pio);
+    }
 }
 
 static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
@@ -90,7 +114,9 @@ static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
           r_pio, (int)r_access->e_physbase,
 	  (unsigned long)r_access->u.r_baseport, value);
 
-    outw(value, r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 2, &value, 1) != 0) {
+        outw(value, r_pio);
+    }
 }
 
 static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
@@ -103,7 +129,9 @@ static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
 	  r_pio, (int)r_access->e_physbase,
           (unsigned long)r_access->u.r_baseport, value);
 
-    outl(value, r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 4, &value, 1) != 0) {
+        outl(value, r_pio);
+    }
 }
 
 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
@@ -112,7 +140,9 @@ static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
     uint32_t value;
 
-    value = inb(r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 1, &value, 0) != 0) {
+        value = inb(r_pio);
+    }
 
     DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
@@ -127,7 +157,9 @@ static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
     uint32_t value;
 
-    value = inw(r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 2, &value, 0) != 0) {
+        value = inw(r_pio);
+    }
 
     DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
@@ -142,7 +174,9 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
     uint32_t value;
 
-    value = inl(r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 4, &value, 0) != 0) {
+        value = inl(r_pio);
+    }
 
     DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
@@ -305,7 +339,7 @@ static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
     DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
           addr, region->u.r_baseport, type, size, region_num);
 
-    if (first_map) {
+    if (first_map && region->region->resource_fd < 0) {
 	struct ioperm_data *data;
 
 	data = qemu_mallocz(sizeof(struct ioperm_data));
@@ -586,19 +620,46 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                              slow_map ? assigned_dev_iomem_map_slow
                                       : assigned_dev_iomem_map);
             continue;
+        } else {
+            /* handle port io regions */
+            uint32_t val;
+            int ret;
+
+            /* Test kernel support for ioport resource read/write.  Old
+             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
+             * so should return EINVAL for a 3 byte read */
+            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
+            if (ret == 3) {
+                fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
+                abort();
+            } else if (errno == EIO) {
+                fprintf(stderr,
+                        "pcisysfs does not support rw ioport resource\n");
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                pci_dev->v_addrs[i].region->resource_fd = -1;
+            } else if (errno != EINVAL) {
+                fprintf(stderr, "Unexpected return from ioport pread (%d) %s\n",
+                        errno, strerror(errno));
+                abort();
+            }
+
+            /* Root user can use direct access for compatibility */
+            if (getuid() == 0) {
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                pci_dev->v_addrs[i].region->resource_fd = -1;
+            }
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            pci_register_bar((PCIDevice *) pci_dev, i,
+                             cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
+                             assigned_dev_ioport_map);
+
+            /* not relevant for port io */
+            pci_dev->v_addrs[i].memory_index = 0;
         }
-        /* handle port io regions */
-        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
-        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
-        pci_dev->v_addrs[i].r_size = cur_region->size;
-        pci_dev->v_addrs[i].e_size = 0;
-
-        pci_register_bar((PCIDevice *) pci_dev, i,
-                         cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
-                         assigned_dev_ioport_map);
-
-        /* not relevant for port io */
-        pci_dev->v_addrs[i].memory_index = 0;
     }
 
     /* success */
@@ -705,20 +766,22 @@ again:
             continue;
         if (flags & IORESOURCE_MEM) {
             flags &= ~IORESOURCE_IO;
-            if (r != PCI_ROM_SLOT) {
-                snprintf(name, sizeof(name), "%sresource%d", dir, r);
-                fd = open(name, O_RDWR);
-                if (fd == -1)
-                    continue;
-                rp->resource_fd = fd;
-            }
-        } else
+        } else {
             flags &= ~IORESOURCE_PREFETCH;
+        }
+        if (r != PCI_ROM_SLOT) {
+            snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;
+            rp->resource_fd = fd;
+        }
 
         rp->type = flags;
         rp->valid = 1;
         rp->base_addr = start;
         rp->size = size;
+        pci_dev->v_addrs[r].region = rp;
         DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
               r, rp->size, start, rp->type, rp->resource_fd);
     }
@@ -780,8 +843,10 @@ static void free_assigned_device(AssignedDevice *dev)
                 continue;
 
             if (pci_region->type & IORESOURCE_IO) {
-                kvm_remove_ioperm_data(region->u.r_baseport, region->r_size);
-                continue;
+                if (pci_region->resource_fd < 0) {
+                    kvm_remove_ioperm_data(region->u.r_baseport,
+                                           region->r_size);
+                }
             } else if (pci_region->type & IORESOURCE_MEM) {
                 if (region->u.r_virtbase) {
                     if (region->memory_index) {
@@ -795,11 +860,11 @@ static void free_assigned_device(AssignedDevice *dev)
                         fprintf(stderr,
 				"Failed to unmap assigned device region: %s\n",
 				strerror(errno));
-                    if (pci_region->resource_fd >= 0) {
-                        close(pci_region->resource_fd);
-                    }
                 }
-	    }
+            }
+            if (pci_region->resource_fd >= 0) {
+                close(pci_region->resource_fd);
+            }
         }
 
         if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
diff --git a/hw/device-assignment.h b/hw/device-assignment.h
index 4e7fe87..9a3ea12 100644
--- a/hw/device-assignment.h
+++ b/hw/device-assignment.h
@@ -71,6 +71,7 @@ typedef struct {
     int num;            /* our index within v_addrs[] */
     pcibus_t e_size;    /* emulated size of region in bytes */
     pcibus_t r_size;    /* real size of region in bytes */
+    PCIRegion *region;
 } AssignedDevRegion;
 
 typedef struct AssignedDevice {


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] device-assignment: Use PCI I/O port sysfs resource file when available
  2010-07-20 22:11 [PATCH] device-assignment: Use PCI I/O port sysfs resource file when available Alex Williamson
@ 2010-07-20 23:13 ` Chris Wright
  2010-07-21  8:17   ` Daniel P. Berrange
  2010-07-21  3:30 ` [PATCH v2] " Alex Williamson
  1 sibling, 1 reply; 8+ messages in thread
From: Chris Wright @ 2010-07-20 23:13 UTC (permalink / raw)
  To: Alex Williamson; +Cc: kvm, ddutile, chrisw

* Alex Williamson (alex.williamson@redhat.com) wrote:
> When supported by the host kernel, we can use read/write on the
> PCI sysfs resource file for I/O port regions.  This allows us to
> avoid raw in/out commands and works with deprivileged guests via
> libvirt.  For uid 0 callers, we use in/out directly to avoid any
> compatibility issues.

won't uid 0 test will fail if libvirt launches qemu with user set to
root (capabilities still get dropped)?

thanks,
-chris

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] device-assignment: Use PCI I/O port sysfs resource file when available
  2010-07-20 22:11 [PATCH] device-assignment: Use PCI I/O port sysfs resource file when available Alex Williamson
  2010-07-20 23:13 ` Chris Wright
@ 2010-07-21  3:30 ` Alex Williamson
  2010-07-21 14:24   ` [PATCH v3] " Alex Williamson
  1 sibling, 1 reply; 8+ messages in thread
From: Alex Williamson @ 2010-07-21  3:30 UTC (permalink / raw)
  To: kvm; +Cc: ddutile, chrisw, alex.williamson

When supported by the host kernel, we can use read/write on the
PCI sysfs resource file for I/O port regions.  This allows us to
avoid raw in/out commands and works with deprivileged guests via
libvirt.  For uid 0 callers, we use in/out directly to avoid any
compatibility issues.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 Required kernel patch pending here:
 http://www.spinics.net/lists/linux-pci/msg09389.html

 v2: Drop getuid() since it doesn't guarantee permissions
     Don't use in/out as a fallback since we don't have permissions
     Consolidate ioport read/write functions

 hw/device-assignment.c |  205 ++++++++++++++++++++++++++++--------------------
 hw/device-assignment.h |    1 
 2 files changed, 120 insertions(+), 86 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 2bba22f..2e141ac 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -62,93 +62,100 @@ static void assigned_dev_load_option_rom(AssignedDevice *dev);
 
 static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
 
-static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
+                                       uint32_t addr, int len, uint32_t *val)
 {
-    return region->u.r_baseport + (addr - region->e_physbase);
+    uint32_t ret = 0;
+    uint32_t offset = addr - dev_region->e_physbase;
+    int fd = dev_region->region->resource_fd;
+
+    if (fd >= 0) {
+        if (val) {
+            DEBUG("pwrite val=%x, len=%d, e_phys=%x, offset=%x\n",
+                  *val, len, addr, offset);
+            if (pwrite(fd, val, len, offset) != len) {
+                fprintf(stderr, "%s - pwrite failed %s\n",
+                        __func__, strerror(errno));
+            }
+        } else {
+            if (pread(fd, &ret, len, offset) != len) {
+                fprintf(stderr, "%s - pread failed %s\n",
+                        __func__, strerror(errno));
+                ret = (1UL << (len * 8)) - 1;
+            }
+            DEBUG("pread ret=%x, len=%d, e_phys=%x, offset=%x\n",
+                  ret, len, addr, offset);
+        }
+    } else {
+        uint32_t port = offset + dev_region->u.r_baseport;
+
+        if (val) {
+            DEBUG("out val=%x, len=%d, e_phys=%x, host=%x\n",
+                  *val, len, addr, port);
+            switch (len) {
+                case 1:
+                    outb(*val, port);
+                    break;
+                case 2:
+                    outw(*val, port);
+                    break;
+                case 4:
+                    outl(*val, port);
+                    break;
+            }
+        } else {
+            switch (len) {
+                case 1:
+                    ret = inb(port);
+                    break;
+                case 2:
+                    ret = inw(port);
+                    break;
+                case 4:
+                    ret = inl(port);
+                    break;
+            }
+            DEBUG("in val=%x, len=%d, e_phys=%x, host=%x\n",
+                  ret, len, addr, port);
+        }
+    }
+    return ret;
 }
 
 static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-	  r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    outb(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 1, &value);
+    return;
 }
 
 static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    outw(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 2, &value);
+    return;
 }
 
 static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-	  r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    outl(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 4, &value);
+    return;
 }
 
 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inb(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 1, NULL);
 }
 
 static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inw(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 2, NULL);
 }
 
 static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inl(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 4, NULL);
 }
 
 static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
@@ -305,7 +312,7 @@ static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
     DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
           addr, region->u.r_baseport, type, size, region_num);
 
-    if (first_map) {
+    if (first_map && region->region->resource_fd < 0) {
 	struct ioperm_data *data;
 
 	data = qemu_mallocz(sizeof(struct ioperm_data));
@@ -586,19 +593,41 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                              slow_map ? assigned_dev_iomem_map_slow
                                       : assigned_dev_iomem_map);
             continue;
+        } else {
+            /* handle port io regions */
+            uint32_t val;
+            int ret;
+
+            /* Test kernel support for ioport resource read/write.  Old
+             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
+             * so should return EINVAL for a 3 byte read */
+            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
+            if (ret == 3) {
+                fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
+                abort();
+            } else if (errno == EIO) {
+                fprintf(stderr,
+                        "pcisysfs does not support rw ioport resource\n");
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                pci_dev->v_addrs[i].region->resource_fd = -1;
+            } else if (errno != EINVAL) {
+                fprintf(stderr, "Unexpected return from ioport pread (%d) %s\n",
+                        errno, strerror(errno));
+                abort();
+            }
+
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            pci_register_bar((PCIDevice *) pci_dev, i,
+                             cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
+                             assigned_dev_ioport_map);
+
+            /* not relevant for port io */
+            pci_dev->v_addrs[i].memory_index = 0;
         }
-        /* handle port io regions */
-        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
-        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
-        pci_dev->v_addrs[i].r_size = cur_region->size;
-        pci_dev->v_addrs[i].e_size = 0;
-
-        pci_register_bar((PCIDevice *) pci_dev, i,
-                         cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
-                         assigned_dev_ioport_map);
-
-        /* not relevant for port io */
-        pci_dev->v_addrs[i].memory_index = 0;
     }
 
     /* success */
@@ -705,20 +734,22 @@ again:
             continue;
         if (flags & IORESOURCE_MEM) {
             flags &= ~IORESOURCE_IO;
-            if (r != PCI_ROM_SLOT) {
-                snprintf(name, sizeof(name), "%sresource%d", dir, r);
-                fd = open(name, O_RDWR);
-                if (fd == -1)
-                    continue;
-                rp->resource_fd = fd;
-            }
-        } else
+        } else {
             flags &= ~IORESOURCE_PREFETCH;
+        }
+        if (r != PCI_ROM_SLOT) {
+            snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;
+            rp->resource_fd = fd;
+        }
 
         rp->type = flags;
         rp->valid = 1;
         rp->base_addr = start;
         rp->size = size;
+        pci_dev->v_addrs[r].region = rp;
         DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
               r, rp->size, start, rp->type, rp->resource_fd);
     }
@@ -780,8 +811,10 @@ static void free_assigned_device(AssignedDevice *dev)
                 continue;
 
             if (pci_region->type & IORESOURCE_IO) {
-                kvm_remove_ioperm_data(region->u.r_baseport, region->r_size);
-                continue;
+                if (pci_region->resource_fd < 0) {
+                    kvm_remove_ioperm_data(region->u.r_baseport,
+                                           region->r_size);
+                }
             } else if (pci_region->type & IORESOURCE_MEM) {
                 if (region->u.r_virtbase) {
                     if (region->memory_index) {
@@ -795,11 +828,11 @@ static void free_assigned_device(AssignedDevice *dev)
                         fprintf(stderr,
 				"Failed to unmap assigned device region: %s\n",
 				strerror(errno));
-                    if (pci_region->resource_fd >= 0) {
-                        close(pci_region->resource_fd);
-                    }
                 }
-	    }
+            }
+            if (pci_region->resource_fd >= 0) {
+                close(pci_region->resource_fd);
+            }
         }
 
         if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
diff --git a/hw/device-assignment.h b/hw/device-assignment.h
index 4e7fe87..9a3ea12 100644
--- a/hw/device-assignment.h
+++ b/hw/device-assignment.h
@@ -71,6 +71,7 @@ typedef struct {
     int num;            /* our index within v_addrs[] */
     pcibus_t e_size;    /* emulated size of region in bytes */
     pcibus_t r_size;    /* real size of region in bytes */
+    PCIRegion *region;
 } AssignedDevRegion;
 
 typedef struct AssignedDevice {


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] device-assignment: Use PCI I/O port sysfs resource file when available
  2010-07-20 23:13 ` Chris Wright
@ 2010-07-21  8:17   ` Daniel P. Berrange
  0 siblings, 0 replies; 8+ messages in thread
From: Daniel P. Berrange @ 2010-07-21  8:17 UTC (permalink / raw)
  To: Chris Wright; +Cc: Alex Williamson, kvm, ddutile

On Tue, Jul 20, 2010 at 04:13:06PM -0700, Chris Wright wrote:
> * Alex Williamson (alex.williamson@redhat.com) wrote:
> > When supported by the host kernel, we can use read/write on the
> > PCI sysfs resource file for I/O port regions.  This allows us to
> > avoid raw in/out commands and works with deprivileged guests via
> > libvirt.  For uid 0 callers, we use in/out directly to avoid any
> > compatibility issues.
> 
> won't uid 0 test will fail if libvirt launches qemu with user set to
> root (capabilities still get dropped)?

Yes, if the kernel is doing a CAP_SYS_ADMIN check (or similar), then
testing uid==0 is definitely wrong. You'd need to test have(CAP_SYS_ADMIN)
instead. 

REgards,
Daniel
-- 
|: Red Hat, Engineering, London    -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :|
|: http://autobuild.org        -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-   F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v3] device-assignment: Use PCI I/O port sysfs resource file when available
  2010-07-21  3:30 ` [PATCH v2] " Alex Williamson
@ 2010-07-21 14:24   ` Alex Williamson
  2010-07-23 21:47     ` [PATCH v4] " Alex Williamson
  0 siblings, 1 reply; 8+ messages in thread
From: Alex Williamson @ 2010-07-21 14:24 UTC (permalink / raw)
  To: kvm; +Cc: ddutile, chrisw, alex.williamson

When supported by the host kernel, we can use read/write on the
PCI sysfs resource file for I/O port regions.  This allows us to
avoid raw in/out commands and works with deprivileged guests via
libvirt.  For uid 0 callers, we use in/out directly to avoid any
compatibility issues.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 Required kernel patch pending here:
 http://www.spinics.net/lists/linux-pci/msg09389.html

 v3: Fixup sysfs errno checking, anything except EINVAL should fall
     back to in/out.  This covers things like EPERM on the resource
     file too.

 v2: Drop getuid() since it doesn't guarantee permissions
     Don't use in/out as a fallback since we don't have permissions
     Consolidate ioport read/write functions

 hw/device-assignment.c |  201 +++++++++++++++++++++++++++---------------------
 hw/device-assignment.h |    1 
 2 files changed, 116 insertions(+), 86 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 2bba22f..c56870e 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -62,93 +62,100 @@ static void assigned_dev_load_option_rom(AssignedDevice *dev);
 
 static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
 
-static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
+                                       uint32_t addr, int len, uint32_t *val)
 {
-    return region->u.r_baseport + (addr - region->e_physbase);
+    uint32_t ret = 0;
+    uint32_t offset = addr - dev_region->e_physbase;
+    int fd = dev_region->region->resource_fd;
+
+    if (fd >= 0) {
+        if (val) {
+            DEBUG("pwrite val=%x, len=%d, e_phys=%x, offset=%x\n",
+                  *val, len, addr, offset);
+            if (pwrite(fd, val, len, offset) != len) {
+                fprintf(stderr, "%s - pwrite failed %s\n",
+                        __func__, strerror(errno));
+            }
+        } else {
+            if (pread(fd, &ret, len, offset) != len) {
+                fprintf(stderr, "%s - pread failed %s\n",
+                        __func__, strerror(errno));
+                ret = (1UL << (len * 8)) - 1;
+            }
+            DEBUG("pread ret=%x, len=%d, e_phys=%x, offset=%x\n",
+                  ret, len, addr, offset);
+        }
+    } else {
+        uint32_t port = offset + dev_region->u.r_baseport;
+
+        if (val) {
+            DEBUG("out val=%x, len=%d, e_phys=%x, host=%x\n",
+                  *val, len, addr, port);
+            switch (len) {
+                case 1:
+                    outb(*val, port);
+                    break;
+                case 2:
+                    outw(*val, port);
+                    break;
+                case 4:
+                    outl(*val, port);
+                    break;
+            }
+        } else {
+            switch (len) {
+                case 1:
+                    ret = inb(port);
+                    break;
+                case 2:
+                    ret = inw(port);
+                    break;
+                case 4:
+                    ret = inl(port);
+                    break;
+            }
+            DEBUG("in val=%x, len=%d, e_phys=%x, host=%x\n",
+                  ret, len, addr, port);
+        }
+    }
+    return ret;
 }
 
 static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-	  r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    outb(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 1, &value);
+    return;
 }
 
 static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    outw(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 2, &value);
+    return;
 }
 
 static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-	  r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    outl(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 4, &value);
+    return;
 }
 
 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inb(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 1, NULL);
 }
 
 static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inw(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 2, NULL);
 }
 
 static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inl(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 4, NULL);
 }
 
 static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
@@ -305,7 +312,7 @@ static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
     DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
           addr, region->u.r_baseport, type, size, region_num);
 
-    if (first_map) {
+    if (first_map && region->region->resource_fd < 0) {
 	struct ioperm_data *data;
 
 	data = qemu_mallocz(sizeof(struct ioperm_data));
@@ -586,19 +593,37 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                              slow_map ? assigned_dev_iomem_map_slow
                                       : assigned_dev_iomem_map);
             continue;
+        } else {
+            /* handle port io regions */
+            uint32_t val;
+            int ret;
+
+            /* Test kernel support for ioport resource read/write.  Old
+             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
+             * so should return EINVAL for a 3 byte read */
+            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
+            if (ret == 3) {
+                fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
+                abort();
+            } else if (errno != EINVAL) {
+                fprintf(stderr, "Using raw in/out ioport access (sysfs - %s)\n",
+                        strerror(errno));
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                pci_dev->v_addrs[i].region->resource_fd = -1;
+            }
+
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            pci_register_bar((PCIDevice *) pci_dev, i,
+                             cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
+                             assigned_dev_ioport_map);
+
+            /* not relevant for port io */
+            pci_dev->v_addrs[i].memory_index = 0;
         }
-        /* handle port io regions */
-        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
-        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
-        pci_dev->v_addrs[i].r_size = cur_region->size;
-        pci_dev->v_addrs[i].e_size = 0;
-
-        pci_register_bar((PCIDevice *) pci_dev, i,
-                         cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
-                         assigned_dev_ioport_map);
-
-        /* not relevant for port io */
-        pci_dev->v_addrs[i].memory_index = 0;
     }
 
     /* success */
@@ -705,20 +730,22 @@ again:
             continue;
         if (flags & IORESOURCE_MEM) {
             flags &= ~IORESOURCE_IO;
-            if (r != PCI_ROM_SLOT) {
-                snprintf(name, sizeof(name), "%sresource%d", dir, r);
-                fd = open(name, O_RDWR);
-                if (fd == -1)
-                    continue;
-                rp->resource_fd = fd;
-            }
-        } else
+        } else {
             flags &= ~IORESOURCE_PREFETCH;
+        }
+        if (r != PCI_ROM_SLOT) {
+            snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;
+            rp->resource_fd = fd;
+        }
 
         rp->type = flags;
         rp->valid = 1;
         rp->base_addr = start;
         rp->size = size;
+        pci_dev->v_addrs[r].region = rp;
         DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
               r, rp->size, start, rp->type, rp->resource_fd);
     }
@@ -780,8 +807,10 @@ static void free_assigned_device(AssignedDevice *dev)
                 continue;
 
             if (pci_region->type & IORESOURCE_IO) {
-                kvm_remove_ioperm_data(region->u.r_baseport, region->r_size);
-                continue;
+                if (pci_region->resource_fd < 0) {
+                    kvm_remove_ioperm_data(region->u.r_baseport,
+                                           region->r_size);
+                }
             } else if (pci_region->type & IORESOURCE_MEM) {
                 if (region->u.r_virtbase) {
                     if (region->memory_index) {
@@ -795,11 +824,11 @@ static void free_assigned_device(AssignedDevice *dev)
                         fprintf(stderr,
 				"Failed to unmap assigned device region: %s\n",
 				strerror(errno));
-                    if (pci_region->resource_fd >= 0) {
-                        close(pci_region->resource_fd);
-                    }
                 }
-	    }
+            }
+            if (pci_region->resource_fd >= 0) {
+                close(pci_region->resource_fd);
+            }
         }
 
         if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
diff --git a/hw/device-assignment.h b/hw/device-assignment.h
index 4e7fe87..9a3ea12 100644
--- a/hw/device-assignment.h
+++ b/hw/device-assignment.h
@@ -71,6 +71,7 @@ typedef struct {
     int num;            /* our index within v_addrs[] */
     pcibus_t e_size;    /* emulated size of region in bytes */
     pcibus_t r_size;    /* real size of region in bytes */
+    PCIRegion *region;
 } AssignedDevRegion;
 
 typedef struct AssignedDevice {


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v4] device-assignment: Use PCI I/O port sysfs resource file when available
  2010-07-21 14:24   ` [PATCH v3] " Alex Williamson
@ 2010-07-23 21:47     ` Alex Williamson
  2010-07-23 23:01       ` Chris Wright
  2010-07-27 20:37       ` Marcelo Tosatti
  0 siblings, 2 replies; 8+ messages in thread
From: Alex Williamson @ 2010-07-23 21:47 UTC (permalink / raw)
  To: kvm; +Cc: ddutile, chrisw, alex.williamson

When supported by the host kernel, we can use read/write on the
PCI sysfs resource file for I/O port regions.  This allows us to
avoid raw in/out commands and works with deprivileged guests via
libvirt.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 Required kernel patch pending here:
 http://www.spinics.net/lists/linux-pci/msg09389.html

 v4: Remove commit sentence about uid since we removed the code
     in v2.  Identical patch to v3.  Also note preliminary
     agreement to include kerenl patch:
     http://www.spinics.net/lists/linux-pci/msg09412.html

 v3: Fixup sysfs errno checking, anything except EINVAL should fall
     back to in/out.  This covers things like EPERM on the resource
     file too.

 v2: Drop getuid() since it doesn't guarantee permissions
     Don't use in/out as a fallback since we don't have permissions
     Consolidate ioport read/write functions

 hw/device-assignment.c |  201 +++++++++++++++++++++++++++---------------------
 hw/device-assignment.h |    1 
 2 files changed, 116 insertions(+), 86 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 2bba22f..c56870e 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -62,93 +62,100 @@ static void assigned_dev_load_option_rom(AssignedDevice *dev);
 
 static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
 
-static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
+static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
+                                       uint32_t addr, int len, uint32_t *val)
 {
-    return region->u.r_baseport + (addr - region->e_physbase);
+    uint32_t ret = 0;
+    uint32_t offset = addr - dev_region->e_physbase;
+    int fd = dev_region->region->resource_fd;
+
+    if (fd >= 0) {
+        if (val) {
+            DEBUG("pwrite val=%x, len=%d, e_phys=%x, offset=%x\n",
+                  *val, len, addr, offset);
+            if (pwrite(fd, val, len, offset) != len) {
+                fprintf(stderr, "%s - pwrite failed %s\n",
+                        __func__, strerror(errno));
+            }
+        } else {
+            if (pread(fd, &ret, len, offset) != len) {
+                fprintf(stderr, "%s - pread failed %s\n",
+                        __func__, strerror(errno));
+                ret = (1UL << (len * 8)) - 1;
+            }
+            DEBUG("pread ret=%x, len=%d, e_phys=%x, offset=%x\n",
+                  ret, len, addr, offset);
+        }
+    } else {
+        uint32_t port = offset + dev_region->u.r_baseport;
+
+        if (val) {
+            DEBUG("out val=%x, len=%d, e_phys=%x, host=%x\n",
+                  *val, len, addr, port);
+            switch (len) {
+                case 1:
+                    outb(*val, port);
+                    break;
+                case 2:
+                    outw(*val, port);
+                    break;
+                case 4:
+                    outl(*val, port);
+                    break;
+            }
+        } else {
+            switch (len) {
+                case 1:
+                    ret = inb(port);
+                    break;
+                case 2:
+                    ret = inw(port);
+                    break;
+                case 4:
+                    ret = inl(port);
+                    break;
+            }
+            DEBUG("in val=%x, len=%d, e_phys=%x, host=%x\n",
+                  ret, len, addr, port);
+        }
+    }
+    return ret;
 }
 
 static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-	  r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    outb(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 1, &value);
+    return;
 }
 
 static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    outw(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 2, &value);
+    return;
 }
 
 static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
                        uint32_t value)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-	  r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    outl(value, r_pio);
+    assigned_dev_ioport_rw(opaque, addr, 4, &value);
+    return;
 }
 
 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inb(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 1, NULL);
 }
 
 static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inw(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-	  (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 2, NULL);
 }
 
 static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
 {
-    AssignedDevRegion *r_access = opaque;
-    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
-    uint32_t value;
-
-    value = inl(r_pio);
-
-    DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
-          r_pio, (int)r_access->e_physbase,
-          (unsigned long)r_access->u.r_baseport, value);
-
-    return value;
+    return assigned_dev_ioport_rw(opaque, addr, 4, NULL);
 }
 
 static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
@@ -305,7 +312,7 @@ static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
     DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
           addr, region->u.r_baseport, type, size, region_num);
 
-    if (first_map) {
+    if (first_map && region->region->resource_fd < 0) {
 	struct ioperm_data *data;
 
 	data = qemu_mallocz(sizeof(struct ioperm_data));
@@ -586,19 +593,37 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                              slow_map ? assigned_dev_iomem_map_slow
                                       : assigned_dev_iomem_map);
             continue;
+        } else {
+            /* handle port io regions */
+            uint32_t val;
+            int ret;
+
+            /* Test kernel support for ioport resource read/write.  Old
+             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
+             * so should return EINVAL for a 3 byte read */
+            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
+            if (ret == 3) {
+                fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
+                abort();
+            } else if (errno != EINVAL) {
+                fprintf(stderr, "Using raw in/out ioport access (sysfs - %s)\n",
+                        strerror(errno));
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                pci_dev->v_addrs[i].region->resource_fd = -1;
+            }
+
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            pci_register_bar((PCIDevice *) pci_dev, i,
+                             cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
+                             assigned_dev_ioport_map);
+
+            /* not relevant for port io */
+            pci_dev->v_addrs[i].memory_index = 0;
         }
-        /* handle port io regions */
-        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
-        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
-        pci_dev->v_addrs[i].r_size = cur_region->size;
-        pci_dev->v_addrs[i].e_size = 0;
-
-        pci_register_bar((PCIDevice *) pci_dev, i,
-                         cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
-                         assigned_dev_ioport_map);
-
-        /* not relevant for port io */
-        pci_dev->v_addrs[i].memory_index = 0;
     }
 
     /* success */
@@ -705,20 +730,22 @@ again:
             continue;
         if (flags & IORESOURCE_MEM) {
             flags &= ~IORESOURCE_IO;
-            if (r != PCI_ROM_SLOT) {
-                snprintf(name, sizeof(name), "%sresource%d", dir, r);
-                fd = open(name, O_RDWR);
-                if (fd == -1)
-                    continue;
-                rp->resource_fd = fd;
-            }
-        } else
+        } else {
             flags &= ~IORESOURCE_PREFETCH;
+        }
+        if (r != PCI_ROM_SLOT) {
+            snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;
+            rp->resource_fd = fd;
+        }
 
         rp->type = flags;
         rp->valid = 1;
         rp->base_addr = start;
         rp->size = size;
+        pci_dev->v_addrs[r].region = rp;
         DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
               r, rp->size, start, rp->type, rp->resource_fd);
     }
@@ -780,8 +807,10 @@ static void free_assigned_device(AssignedDevice *dev)
                 continue;
 
             if (pci_region->type & IORESOURCE_IO) {
-                kvm_remove_ioperm_data(region->u.r_baseport, region->r_size);
-                continue;
+                if (pci_region->resource_fd < 0) {
+                    kvm_remove_ioperm_data(region->u.r_baseport,
+                                           region->r_size);
+                }
             } else if (pci_region->type & IORESOURCE_MEM) {
                 if (region->u.r_virtbase) {
                     if (region->memory_index) {
@@ -795,11 +824,11 @@ static void free_assigned_device(AssignedDevice *dev)
                         fprintf(stderr,
 				"Failed to unmap assigned device region: %s\n",
 				strerror(errno));
-                    if (pci_region->resource_fd >= 0) {
-                        close(pci_region->resource_fd);
-                    }
                 }
-	    }
+            }
+            if (pci_region->resource_fd >= 0) {
+                close(pci_region->resource_fd);
+            }
         }
 
         if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
diff --git a/hw/device-assignment.h b/hw/device-assignment.h
index 4e7fe87..9a3ea12 100644
--- a/hw/device-assignment.h
+++ b/hw/device-assignment.h
@@ -71,6 +71,7 @@ typedef struct {
     int num;            /* our index within v_addrs[] */
     pcibus_t e_size;    /* emulated size of region in bytes */
     pcibus_t r_size;    /* real size of region in bytes */
+    PCIRegion *region;
 } AssignedDevRegion;
 
 typedef struct AssignedDevice {


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] device-assignment: Use PCI I/O port sysfs resource file when available
  2010-07-23 21:47     ` [PATCH v4] " Alex Williamson
@ 2010-07-23 23:01       ` Chris Wright
  2010-07-27 20:37       ` Marcelo Tosatti
  1 sibling, 0 replies; 8+ messages in thread
From: Chris Wright @ 2010-07-23 23:01 UTC (permalink / raw)
  To: Alex Williamson; +Cc: kvm, ddutile, chrisw

* Alex Williamson (alex.williamson@redhat.com) wrote:
> When supported by the host kernel, we can use read/write on the
> PCI sysfs resource file for I/O port regions.  This allows us to
> avoid raw in/out commands and works with deprivileged guests via
> libvirt.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Acked-by: Chris Wright <chrisw@redhat.com>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] device-assignment: Use PCI I/O port sysfs resource file when available
  2010-07-23 21:47     ` [PATCH v4] " Alex Williamson
  2010-07-23 23:01       ` Chris Wright
@ 2010-07-27 20:37       ` Marcelo Tosatti
  1 sibling, 0 replies; 8+ messages in thread
From: Marcelo Tosatti @ 2010-07-27 20:37 UTC (permalink / raw)
  To: Alex Williamson; +Cc: kvm, ddutile, chrisw

On Fri, Jul 23, 2010 at 03:47:18PM -0600, Alex Williamson wrote:
> When supported by the host kernel, we can use read/write on the
> PCI sysfs resource file for I/O port regions.  This allows us to
> avoid raw in/out commands and works with deprivileged guests via
> libvirt.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  Required kernel patch pending here:
>  http://www.spinics.net/lists/linux-pci/msg09389.html
> 
>  v4: Remove commit sentence about uid since we removed the code
>      in v2.  Identical patch to v3.  Also note preliminary
>      agreement to include kerenl patch:
>      http://www.spinics.net/lists/linux-pci/msg09412.html
> 
>  v3: Fixup sysfs errno checking, anything except EINVAL should fall
>      back to in/out.  This covers things like EPERM on the resource
>      file too.
> 
>  v2: Drop getuid() since it doesn't guarantee permissions
>      Don't use in/out as a fallback since we don't have permissions
>      Consolidate ioport read/write functions
> 
>  hw/device-assignment.c |  201 +++++++++++++++++++++++++++---------------------
>  hw/device-assignment.h |    1 
>  2 files changed, 116 insertions(+), 86 deletions(-)

Applied, thanks.


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2010-07-27 21:22 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-07-20 22:11 [PATCH] device-assignment: Use PCI I/O port sysfs resource file when available Alex Williamson
2010-07-20 23:13 ` Chris Wright
2010-07-21  8:17   ` Daniel P. Berrange
2010-07-21  3:30 ` [PATCH v2] " Alex Williamson
2010-07-21 14:24   ` [PATCH v3] " Alex Williamson
2010-07-23 21:47     ` [PATCH v4] " Alex Williamson
2010-07-23 23:01       ` Chris Wright
2010-07-27 20:37       ` Marcelo Tosatti

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox