[PATCH] Enable non page boundary BAR device assignment

kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] Enable non page boundary BAR device assignment
@ 2009-12-09 17:38 Alexander Graf
  2009-12-09 20:49 ` Michael S. Tsirkin
  2009-12-10  5:16 ` Muli Ben-Yehuda
  0 siblings, 2 replies; 34+ messages in thread
From: Alexander Graf @ 2009-12-09 17:38 UTC (permalink / raw)
  To: kvm; +Cc: mst

While trying to get device passthrough working with an emulex hba, kvm
refused to pass it through because it has a BAR of 256 bytes:

        Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
        Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
        Region 4: I/O ports at b100 [size=256]

Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
physical to virtual addresses, we can still take the old MMIO callback route.

So let's add a second code path that allows for size & 0xFFF != 0 sized regions
by looping it through userspace.

I verified that it works by passing through an e1000 with this additional patch
applied and the card acted the same way it did without this patch:

             map_func = assigned_dev_iomem_map;
-            if (cur_region->size & 0xFFF) {
+            if (i != PCI_ROM_SLOT){
                 fprintf(stderr, "PCI region %d at address 0x%llx "

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/device-assignment.c |  129 +++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 123 insertions(+), 6 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 13a86bb..beb488c 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
     return value;
 }
 
+static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);
+    uint32_t r = -1;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
+    uint32_t r = -1;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
+    uint32_t r = -1;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);
+
+    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
+
+    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);
+
+    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
+    *out = val;
+}
+
+static CPUWriteMemoryFunc * const slow_bar_write[] = {
+    &slow_bar_writeb,
+    &slow_bar_writew,
+    &slow_bar_writel
+};
+
+static CPUReadMemoryFunc * const slow_bar_read[] = {
+    &slow_bar_readb,
+    &slow_bar_readw,
+    &slow_bar_readl
+};
+
+static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
+                                        pcibus_t e_phys, pcibus_t e_size,
+                                        int type)
+{
+    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
+    int m;
+
+    DEBUG("slow map\n");
+    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
+    cpu_register_physical_memory(e_phys, e_size, m);
+
+    /* MSI-X MMIO page */
+    if ((e_size > 0) &&
+        real_region->base_addr <= r_dev->msix_table_addr &&
+        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
+        int offset = r_dev->msix_table_addr - real_region->base_addr;
+
+        cpu_register_physical_memory(e_phys + offset,
+                TARGET_PAGE_SIZE, r_dev->mmio_index);
+    }
+}
+
 static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
                                    pcibus_t e_phys, pcibus_t e_size, int type)
 {
@@ -381,15 +480,25 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
 
         /* handle memory io regions */
         if (cur_region->type & IORESOURCE_MEM) {
+            void (*map_func)(PCIDevice *, int, pcibus_t, pcibus_t, int);
+            int slow_map = 0;
             int t = cur_region->type & IORESOURCE_PREFETCH
                 ? PCI_BASE_ADDRESS_MEM_PREFETCH
                 : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+            map_func = assigned_dev_iomem_map;
             if (cur_region->size & 0xFFF) {
-                fprintf(stderr, "Unable to assign device: PCI region %d "
-                        "at address 0x%llx has size 0x%x, "
-                        " which is not a multiple of 4K\n",
+                fprintf(stderr, "PCI region %d at address 0x%llx "
+                        "has size 0x%x, which is not a multiple of 4K. "
+                        "Using slow map\n",
                         i, (unsigned long long)cur_region->base_addr,
                         cur_region->size);
+                map_func = assigned_dev_iomem_map_slow;
+                slow_map = 1;
+            }
+
+            if (slow_map && (i == PCI_ROM_SLOT)) {
+                fprintf(stderr, "ROM not aligned - can't continue\n");
                 return -1;
             }
 
@@ -402,6 +511,12 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                          PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE,
                          0, (off_t) 0);
 
+            } else if (slow_map) {
+                pci_dev->v_addrs[i].u.r_virtbase =
+                    mmap(NULL,
+                         cur_region->size,
+                         PROT_WRITE | PROT_READ, MAP_SHARED,
+                         cur_region->resource_fd, (off_t) 0);
             } else {
                 pci_dev->v_addrs[i].u.r_virtbase =
                     mmap(NULL,
@@ -429,12 +544,14 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
             pci_dev->v_addrs[i].e_size = 0;
 
             /* add offset */
-            pci_dev->v_addrs[i].u.r_virtbase +=
-                (cur_region->base_addr & 0xFFF);
+            if (!slow_map) {
+                pci_dev->v_addrs[i].u.r_virtbase +=
+                    (cur_region->base_addr & 0xFFF);
+            }
 
             pci_register_bar((PCIDevice *) pci_dev, i,
                              cur_region->size, t,
-                             assigned_dev_iomem_map);
+                             map_func);
             continue;
         }
         /* handle port io regions */
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-09 17:38 Alexander Graf
@ 2009-12-09 20:49 ` Michael S. Tsirkin
  2009-12-09 21:06   ` Alexander Graf
  2009-12-10  5:16 ` Muli Ben-Yehuda
  1 sibling, 1 reply; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-09 20:49 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm

On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
> While trying to get device passthrough working with an emulex hba, kvm
> refused to pass it through because it has a BAR of 256 bytes:
> 
>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>         Region 4: I/O ports at b100 [size=256]
> 
> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> physical to virtual addresses, we can still take the old MMIO callback route.
> 
> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> by looping it through userspace.
> 
> I verified that it works by passing through an e1000 with this additional patch
> applied and the card acted the same way it did without this patch:
> 
>              map_func = assigned_dev_iomem_map;
> -            if (cur_region->size & 0xFFF) {
> +            if (i != PCI_ROM_SLOT){
>                  fprintf(stderr, "PCI region %d at address 0x%llx "
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>

Good idea.

One thing I am concerned with, is that some users might
see performance degradation and not see the error message.
Maybe we can have a flag to optionally fail unless
passthrough can be fast? I'm not really sure it's worth it
to add such a flag though - what do others think?

Minor nits below.

> ---
>  hw/device-assignment.c |  129 +++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 123 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 13a86bb..beb488c 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>      return value;
>  }
>  
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);
> +
> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
> +
> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);
> +
> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> +    *out = val;
> +}
> +
> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> +    &slow_bar_writeb,
> +    &slow_bar_writew,
> +    &slow_bar_writel
> +};
> +
> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> +    &slow_bar_readb,
> +    &slow_bar_readw,
> +    &slow_bar_readl
> +};
> +
> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> +                                        pcibus_t e_phys, pcibus_t e_size,
> +                                        int type)
> +{
> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> +    int m;
> +
> +    DEBUG("slow map\n");
> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> +    cpu_register_physical_memory(e_phys, e_size, m);
> +
> +    /* MSI-X MMIO page */
> +    if ((e_size > 0) &&
> +        real_region->base_addr <= r_dev->msix_table_addr &&
> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> +        cpu_register_physical_memory(e_phys + offset,
> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
> +    }
> +}
> +
>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>  {
> @@ -381,15 +480,25 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>  
>          /* handle memory io regions */
>          if (cur_region->type & IORESOURCE_MEM) {
> +            void (*map_func)(PCIDevice *, int, pcibus_t, pcibus_t, int);
> +            int slow_map = 0;

This is a bit too clever for my taste. Let's just do below
              pci_register_bar((PCIDevice *) pci_dev, i,
                               cur_region->size, t,
                              slow_map ? assigned_dev_iomem_map : assigned_dev_iomem_map_slow);


(by the way, this cast to (PCIDevice *) is also unnecessary,
isn't, it?

>              int t = cur_region->type & IORESOURCE_PREFETCH
>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
> +            map_func = assigned_dev_iomem_map;
>              if (cur_region->size & 0xFFF) {
> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> -                        "at address 0x%llx has size 0x%x, "
> -                        " which is not a multiple of 4K\n",
> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> +                        "has size 0x%x, which is not a multiple of 4K. "
> +                        "Using slow map\n",

Maybe replace "Using slow map" with "The device will be slow.".

>                          i, (unsigned long long)cur_region->base_addr,
>                          cur_region->size);
> +                map_func = assigned_dev_iomem_map_slow;
> +                slow_map = 1;

> +            }
> +
> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>                  return -1;
>              }
>  
> @@ -402,6 +511,12 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>                           PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE,
>                           0, (off_t) 0);
>  
> +            } else if (slow_map) {
> +                pci_dev->v_addrs[i].u.r_virtbase =
> +                    mmap(NULL,
> +                         cur_region->size,
> +                         PROT_WRITE | PROT_READ, MAP_SHARED,
> +                         cur_region->resource_fd, (off_t) 0);

Isn't this the same as the case below it?
I expect mmap to act identically for slow and fast ...
Just replace (cur_region->size + 0xFFF) & 0xFFFFF000, with
cur_region->size.

>              } else {
>                  pci_dev->v_addrs[i].u.r_virtbase =
>                      mmap(NULL,
> @@ -429,12 +544,14 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              pci_dev->v_addrs[i].e_size = 0;
>  
>              /* add offset */
> -            pci_dev->v_addrs[i].u.r_virtbase +=
> -                (cur_region->base_addr & 0xFFF);
> +            if (!slow_map) {
> +                pci_dev->v_addrs[i].u.r_virtbase +=
> +                    (cur_region->base_addr & 0xFFF);
> +            }
>  
>              pci_register_bar((PCIDevice *) pci_dev, i,
>                               cur_region->size, t,
> -                             assigned_dev_iomem_map);
> +                             map_func);
>              continue;
>          }
>          /* handle port io regions */
> -- 
> 1.6.0.2

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-09 20:49 ` Michael S. Tsirkin
@ 2009-12-09 21:06   ` Alexander Graf
  2009-12-10 10:35     ` Avi Kivity
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-09 21:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm@vger.kernel.org


Am 09.12.2009 um 21:49 schrieb "Michael S. Tsirkin" <mst@redhat.com>:

> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
>> While trying to get device passthrough working with an emulex hba,  
>> kvm
>> refused to pass it through because it has a BAR of 256 bytes:
>>
>>        Region 0: Memory at d2100000 (64-bit, non-prefetchable)  
>> [size=4K]
>>        Region 2: Memory at d2101000 (64-bit, non-prefetchable)  
>> [size=256]
>>        Region 4: I/O ports at b100 [size=256]
>>
>> Since the page boundary is an arbitrary optimization to allow 1:1  
>> mapping of
>> physical to virtual addresses, we can still take the old MMIO  
>> callback route.
>>
>> So let's add a second code path that allows for size & 0xFFF != 0  
>> sized regions
>> by looping it through userspace.
>>
>> I verified that it works by passing through an e1000 with this  
>> additional patch
>> applied and the card acted the same way it did without this patch:
>>
>>             map_func = assigned_dev_iomem_map;
>> -            if (cur_region->size & 0xFFF) {
>> +            if (i != PCI_ROM_SLOT){
>>                 fprintf(stderr, "PCI region %d at address 0x%llx "
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>
> Good idea.
>
> One thing I am concerned with, is that some users might
> see performance degradation and not see the error message.
> Maybe we can have a flag to optionally fail unless
> passthrough can be fast? I'm not really sure it's worth it
> to add such a flag though - what do others think?

It only kicks in on small BARs which are usually not _that_  
performance critical. We're mostly talking about ack and status bits.

Also not failing > failing IMHO :). Even if it's not as fast as native.

>
> Minor nits below.
>
>> ---
>> hw/device-assignment.c |  129 ++++++++++++++++++++++++++++++++++++++ 
>> +++++++--
>> 1 files changed, 123 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>> index 13a86bb..beb488c 100644
>> --- a/hw/device-assignment.c
>> +++ b/hw/device-assignment.c
>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl 
>> (void *opaque, uint32_t addr)
>>     return value;
>> }
>>
>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t  
>> addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);
>> +    uint32_t r = -1;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n",  
>> addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t  
>> addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
>> +    uint32_t r = -1;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n",  
>> addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t  
>> addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
>> +    uint32_t r = -1;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n",  
>> addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr,  
>> uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);
>> +
>> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x 
>> \n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr,  
>> uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
>> +
>> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x 
>> \n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr,  
>> uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);
>> +
>> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x 
>> \n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>> +    &slow_bar_writeb,
>> +    &slow_bar_writew,
>> +    &slow_bar_writel
>> +};
>> +
>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>> +    &slow_bar_readb,
>> +    &slow_bar_readw,
>> +    &slow_bar_readl
>> +};
>> +
>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int  
>> region_num,
>> +                                        pcibus_t e_phys, pcibus_t  
>> e_size,
>> +                                        int type)
>> +{
>> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice,  
>> dev);
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +    PCIRegion *real_region = &r_dev->real_device.regions 
>> [region_num];
>> +    int m;
>> +
>> +    DEBUG("slow map\n");
>> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write,  
>> region);
>> +    cpu_register_physical_memory(e_phys, e_size, m);
>> +
>> +    /* MSI-X MMIO page */
>> +    if ((e_size > 0) &&
>> +        real_region->base_addr <= r_dev->msix_table_addr &&
>> +        real_region->base_addr + real_region->size >= r_dev- 
>> >msix_table_addr) {
>> +        int offset = r_dev->msix_table_addr - real_region- 
>> >base_addr;
>> +
>> +        cpu_register_physical_memory(e_phys + offset,
>> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
>> +    }
>> +}
>> +
>> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int  
>> region_num,
>>                                    pcibus_t e_phys, pcibus_t  
>> e_size, int type)
>> {
>> @@ -381,15 +480,25 @@ static int assigned_dev_register_regions 
>> (PCIRegion *io_regions,
>>
>>         /* handle memory io regions */
>>         if (cur_region->type & IORESOURCE_MEM) {
>> +            void (*map_func)(PCIDevice *, int, pcibus_t, pcibus_t,  
>> int);
>> +            int slow_map = 0;
>
> This is a bit too clever for my taste. Let's just do below
>              pci_register_bar((PCIDevice *) pci_dev, i,
>                               cur_region->size, t,
>                              slow_map ? assigned_dev_iomem_map :  
> assigned_dev_iomem_map_slow);

We could do that too, yeah. It's a matter of taste I guess. I find  
function pointers pretty easily readable :).

> (by the way, this cast to (PCIDevice *) is also unnecessary,
> isn't, it?
>
>>             int t = cur_region->type & IORESOURCE_PREFETCH
>>                 ? PCI_BASE_ADDRESS_MEM_PREFETCH
>>                 : PCI_BASE_ADDRESS_SPACE_MEMORY;
>> +
>> +            map_func = assigned_dev_iomem_map;
>>             if (cur_region->size & 0xFFF) {
>> -                fprintf(stderr, "Unable to assign device: PCI  
>> region %d "
>> -                        "at address 0x%llx has size 0x%x, "
>> -                        " which is not a multiple of 4K\n",
>> +                fprintf(stderr, "PCI region %d at address 0x%llx "
>> +                        "has size 0x%x, which is not a multiple of  
>> 4K. "
>> +                        "Using slow map\n",
>
> Maybe replace "Using slow map" with "The device will be slow.".

I doubt it would. So far I only found the emulex and an ehci  
controller that have BARs that are no multiple of page_size. I didn't  
test the ehci controller and the emulex is still broken despite this  
patch, probably interrupt related.

>
>>                         i, (unsigned long long)cur_region->base_addr,
>>                         cur_region->size);
>> +                map_func = assigned_dev_iomem_map_slow;
>> +                slow_map = 1;
>
>> +            }
>> +
>> +            if (slow_map && (i == PCI_ROM_SLOT)) {
>> +                fprintf(stderr, "ROM not aligned - can't continue 
>> \n");
>>                 return -1;
>>             }
>>
>> @@ -402,6 +511,12 @@ static int assigned_dev_register_regions 
>> (PCIRegion *io_regions,
>>                          PROT_WRITE | PROT_READ, MAP_ANONYMOUS |  
>> MAP_PRIVATE,
>>                          0, (off_t) 0);
>>
>> +            } else if (slow_map) {
>> +                pci_dev->v_addrs[i].u.r_virtbase =
>> +                    mmap(NULL,
>> +                         cur_region->size,
>> +                         PROT_WRITE | PROT_READ, MAP_SHARED,
>> +                         cur_region->resource_fd, (off_t) 0);
>
> Isn't this the same as the case below it?
> I expect mmap to act identically for slow and fast ...
> Just replace (cur_region->size + 0xFFF) & 0xFFFFF000, with
> cur_region->size.

You mean remove the alignment for the fast case? Hum, we shouldn't hit  
unaligned bars with fast mapping anymore, so I guess you're right.

>
>>             } else {
>>                 pci_dev->v_addrs[i].u.r_virtbase =
>>                     mmap(NULL,
>> @@ -429,12 +544,14 @@ static int assigned_dev_register_regions 
>> (PCIRegion *io_regions,
>>             pci_dev->v_addrs[i].e_size = 0;
>>
>>             /* add offset */
>> -            pci_dev->v_addrs[i].u.r_virtbase +=
>> -                (cur_region->base_addr & 0xFFF);
>> +            if (!slow_map) {
>> +                pci_dev->v_addrs[i].u.r_virtbase +=
>> +                    (cur_region->base_addr & 0xFFF);
>> +            }
>>
>>             pci_register_bar((PCIDevice *) pci_dev, i,
>>                              cur_region->size, t,
>> -                             assigned_dev_iomem_map);
>> +                             map_func);
>>             continue;
>>         }
>>         /* handle port io regions */
>> -- 
>> 1.6.0.2
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-09 17:38 Alexander Graf
  2009-12-09 20:49 ` Michael S. Tsirkin
@ 2009-12-10  5:16 ` Muli Ben-Yehuda
  2009-12-10  9:35   ` Alexander Graf
  2009-12-10  9:43   ` Michael S. Tsirkin
  1 sibling, 2 replies; 34+ messages in thread
From: Muli Ben-Yehuda @ 2009-12-10  5:16 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm, mst

On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:

> While trying to get device passthrough working with an emulex hba,
> kvm refused to pass it through because it has a BAR of 256 bytes:
>
>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>         Region 4: I/O ports at b100 [size=256]
>
> Since the page boundary is an arbitrary optimization to allow 1:1
> mapping of physical to virtual addresses, we can still take the old
> MMIO callback route.
>
> So let's add a second code path that allows for size & 0xFFF != 0
> sized regions by looping it through userspace.

That makes sense in general *but* the 4K-aligned check isn't just an
optimization, it also has a security implication. Consider the
theoretical case where has a multi-function device has BARs for two
functions on the same page (within a 4K boundary), and each function
is assigned to a different guest. With your current patch both guests
will be able to write to each other's BARs. Another case is where a
device has a bug and you must not write beyond the BAR or Bad Things
Happen. With this patch an *unprivileged* guest could exploit that bug
and make bad things happen.

This can be fixed if the slow userspace mmio path checks that all MMIO
accesses by a guest fall within the portion of the page that is
assigned to it.

Cheers,
Muli
-- 
Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
Manager, Virtualization and Systems Architecture
Master Inventor, IBM Research -- Haifa
Second Workshop on I/O Virtualization (WIOV '10):
http://sysrun.haifa.il.ibm.com/hrl/wiov2010/

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10  5:16 ` Muli Ben-Yehuda
@ 2009-12-10  9:35   ` Alexander Graf
  2009-12-10 10:21     ` Muli Ben-Yehuda
  2009-12-10  9:43   ` Michael S. Tsirkin
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-10  9:35 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: kvm, mst


On 10.12.2009, at 06:16, Muli Ben-Yehuda wrote:

> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
> 
>> While trying to get device passthrough working with an emulex hba,
>> kvm refused to pass it through because it has a BAR of 256 bytes:
>> 
>>        Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>        Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>        Region 4: I/O ports at b100 [size=256]
>> 
>> Since the page boundary is an arbitrary optimization to allow 1:1
>> mapping of physical to virtual addresses, we can still take the old
>> MMIO callback route.
>> 
>> So let's add a second code path that allows for size & 0xFFF != 0
>> sized regions by looping it through userspace.
> 
> That makes sense in general *but* the 4K-aligned check isn't just an
> optimization, it also has a security implication. Consider the
> theoretical case where has a multi-function device has BARs for two
> functions on the same page (within a 4K boundary), and each function
> is assigned to a different guest. With your current patch both guests
> will be able to write to each other's BARs. Another case is where a
> device has a bug and you must not write beyond the BAR or Bad Things
> Happen. With this patch an *unprivileged* guest could exploit that bug
> and make bad things happen.
> 
> This can be fixed if the slow userspace mmio path checks that all MMIO
> accesses by a guest fall within the portion of the page that is
> assigned to it.

That's exactly what this patch does. It only allows access to the region defined in the BAR.

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10  5:16 ` Muli Ben-Yehuda
  2009-12-10  9:35   ` Alexander Graf
@ 2009-12-10  9:43   ` Michael S. Tsirkin
  2009-12-10  9:52     ` Alexander Graf
  1 sibling, 1 reply; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-10  9:43 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Alexander Graf, kvm

On Thu, Dec 10, 2009 at 07:16:04AM +0200, Muli Ben-Yehuda wrote:
> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
> 
> > While trying to get device passthrough working with an emulex hba,
> > kvm refused to pass it through because it has a BAR of 256 bytes:
> >
> >         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >         Region 4: I/O ports at b100 [size=256]
> >
> > Since the page boundary is an arbitrary optimization to allow 1:1
> > mapping of physical to virtual addresses, we can still take the old
> > MMIO callback route.
> >
> > So let's add a second code path that allows for size & 0xFFF != 0
> > sized regions by looping it through userspace.
> 
> That makes sense in general *but* the 4K-aligned check isn't just an
> optimization, it also has a security implication. Consider the
> theoretical case where has a multi-function device has BARs for two
> functions on the same page (within a 4K boundary), and each function
> is assigned to a different guest. With your current patch both guests
> will be able to write to each other's BARs. Another case is where a
> device has a bug and you must not write beyond the BAR or Bad Things
> Happen. With this patch an *unprivileged* guest could exploit that bug
> and make bad things happen.
> 
> This can be fixed if the slow userspace mmio path checks that all MMIO
> accesses by a guest fall within the portion of the page that is
> assigned to it.

This patch seems to implement range checks correctly,
let me know if I am missing something.

One also notes that we currently link qemu with libpci
which I think requires admin cap to work.
However, in the future we might extend this to
also support getting device fds over a unix socket
from a higher priviledged process.

If or when this is done, we will have to be
extra careful when passing
device file descriptor to an unpriveledged qemu process if
the BARs are less than full page in size: mapping
such BAR will allow qemu access outside this BAR.

A possible solution to this problem
if/when it arises would be adding yet another sysfs file
for each resource, which would allow read/write but not
mmap access, and perform range checks in the kernel.


> Cheers,
> Muli
> -- 
> Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
> Manager, Virtualization and Systems Architecture
> Master Inventor, IBM Research -- Haifa
> Second Workshop on I/O Virtualization (WIOV '10):
> http://sysrun.haifa.il.ibm.com/hrl/wiov2010/

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10  9:43   ` Michael S. Tsirkin
@ 2009-12-10  9:52     ` Alexander Graf
  2009-12-10 10:08       ` Alexander Graf
  2009-12-10 10:23       ` Muli Ben-Yehuda
  0 siblings, 2 replies; 34+ messages in thread
From: Alexander Graf @ 2009-12-10  9:52 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Muli Ben-Yehuda, kvm


On 10.12.2009, at 10:43, Michael S. Tsirkin wrote:

> On Thu, Dec 10, 2009 at 07:16:04AM +0200, Muli Ben-Yehuda wrote:
>> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
>> 
>>> While trying to get device passthrough working with an emulex hba,
>>> kvm refused to pass it through because it has a BAR of 256 bytes:
>>> 
>>>        Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>>        Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>>        Region 4: I/O ports at b100 [size=256]
>>> 
>>> Since the page boundary is an arbitrary optimization to allow 1:1
>>> mapping of physical to virtual addresses, we can still take the old
>>> MMIO callback route.
>>> 
>>> So let's add a second code path that allows for size & 0xFFF != 0
>>> sized regions by looping it through userspace.
>> 
>> That makes sense in general *but* the 4K-aligned check isn't just an
>> optimization, it also has a security implication. Consider the
>> theoretical case where has a multi-function device has BARs for two
>> functions on the same page (within a 4K boundary), and each function
>> is assigned to a different guest. With your current patch both guests
>> will be able to write to each other's BARs. Another case is where a
>> device has a bug and you must not write beyond the BAR or Bad Things
>> Happen. With this patch an *unprivileged* guest could exploit that bug
>> and make bad things happen.
>> 
>> This can be fixed if the slow userspace mmio path checks that all MMIO
>> accesses by a guest fall within the portion of the page that is
>> assigned to it.
> 
> This patch seems to implement range checks correctly,
> let me know if I am missing something.
> 
> One also notes that we currently link qemu with libpci
> which I think requires admin cap to work.
> However, in the future we might extend this to
> also support getting device fds over a unix socket
> from a higher priviledged process.
> 
> If or when this is done, we will have to be
> extra careful when passing
> device file descriptor to an unpriveledged qemu process if
> the BARs are less than full page in size: mapping
> such BAR will allow qemu access outside this BAR.
> 
> A possible solution to this problem
> if/when it arises would be adding yet another sysfs file
> for each resource, which would allow read/write but not
> mmap access, and perform range checks in the kernel.

Sounds like the best solution to this problem, yeah. Though we'd only need those for non-page-boundary BARs. So I guess the best would be to always export them from the kernel, but only use them when BAR & (PAGE_SIZE-1).

Either way, FWIW the device assignment stuff needs to be completely rewritten for qemu upstream anyways. So while it's good to collect ideas for now, let's not too put too much effort code-wise into the current code (unless it doesn't work).

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10  9:52     ` Alexander Graf
@ 2009-12-10 10:08       ` Alexander Graf
  2009-12-10 10:27         ` Michael S. Tsirkin
  2009-12-10 10:23       ` Muli Ben-Yehuda
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-10 10:08 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Muli Ben-Yehuda, kvm@vger.kernel.org list


On 10.12.2009, at 10:52, Alexander Graf wrote:

> 
> On 10.12.2009, at 10:43, Michael S. Tsirkin wrote:
> 
>> On Thu, Dec 10, 2009 at 07:16:04AM +0200, Muli Ben-Yehuda wrote:
>>> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
>>> 
>>>> While trying to get device passthrough working with an emulex hba,
>>>> kvm refused to pass it through because it has a BAR of 256 bytes:
>>>> 
>>>>       Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>>>       Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>>>       Region 4: I/O ports at b100 [size=256]
>>>> 
>>>> Since the page boundary is an arbitrary optimization to allow 1:1
>>>> mapping of physical to virtual addresses, we can still take the old
>>>> MMIO callback route.
>>>> 
>>>> So let's add a second code path that allows for size & 0xFFF != 0
>>>> sized regions by looping it through userspace.
>>> 
>>> That makes sense in general *but* the 4K-aligned check isn't just an
>>> optimization, it also has a security implication. Consider the
>>> theoretical case where has a multi-function device has BARs for two
>>> functions on the same page (within a 4K boundary), and each function
>>> is assigned to a different guest. With your current patch both guests
>>> will be able to write to each other's BARs. Another case is where a
>>> device has a bug and you must not write beyond the BAR or Bad Things
>>> Happen. With this patch an *unprivileged* guest could exploit that bug
>>> and make bad things happen.
>>> 
>>> This can be fixed if the slow userspace mmio path checks that all MMIO
>>> accesses by a guest fall within the portion of the page that is
>>> assigned to it.
>> 
>> This patch seems to implement range checks correctly,
>> let me know if I am missing something.
>> 
>> One also notes that we currently link qemu with libpci
>> which I think requires admin cap to work.
>> However, in the future we might extend this to
>> also support getting device fds over a unix socket
>> from a higher priviledged process.
>> 
>> If or when this is done, we will have to be
>> extra careful when passing
>> device file descriptor to an unpriveledged qemu process if
>> the BARs are less than full page in size: mapping
>> such BAR will allow qemu access outside this BAR.
>> 
>> A possible solution to this problem
>> if/when it arises would be adding yet another sysfs file
>> for each resource, which would allow read/write but not
>> mmap access, and perform range checks in the kernel.
> 
> Sounds like the best solution to this problem, yeah. Though we'd only need those for non-page-boundary BARs. So I guess the best would be to always export them from the kernel, but only use them when BAR & (PAGE_SIZE-1).

Hm, or add read/write fd functions that always do boundary checks to the existing interface and only allow mmap on size & PAGE_SIZE. Or only allow non-aligned mmap when the admin cap is present.

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10  9:35   ` Alexander Graf
@ 2009-12-10 10:21     ` Muli Ben-Yehuda
  0 siblings, 0 replies; 34+ messages in thread
From: Muli Ben-Yehuda @ 2009-12-10 10:21 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm, mst

On Thu, Dec 10, 2009 at 10:35:41AM +0100, Alexander Graf wrote:

> That's exactly what this patch does. It only allows access to the
> region defined in the BAR.

Sorry, missed it!

Cheers,
Muli

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10  9:52     ` Alexander Graf
  2009-12-10 10:08       ` Alexander Graf
@ 2009-12-10 10:23       ` Muli Ben-Yehuda
  2009-12-10 10:31         ` Alexander Graf
  1 sibling, 1 reply; 34+ messages in thread
From: Muli Ben-Yehuda @ 2009-12-10 10:23 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Michael S. Tsirkin, kvm

On Thu, Dec 10, 2009 at 10:52:46AM +0100, Alexander Graf wrote:

> Either way, FWIW the device assignment stuff needs to be completely
> rewritten for qemu upstream anyways. So while it's good to collect
> ideas for now, let's not too put too much effort code-wise into the
> current code (unless it doesn't work).

What do you have in mind for such a rewrite?

Cheers,
Muli

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:08       ` Alexander Graf
@ 2009-12-10 10:27         ` Michael S. Tsirkin
  2009-12-10 10:31           ` Alexander Graf
  0 siblings, 1 reply; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-10 10:27 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Muli Ben-Yehuda, kvm@vger.kernel.org list

On Thu, Dec 10, 2009 at 11:08:58AM +0100, Alexander Graf wrote:
> 
> On 10.12.2009, at 10:52, Alexander Graf wrote:
> 
> > 
> > On 10.12.2009, at 10:43, Michael S. Tsirkin wrote:
> > 
> >> On Thu, Dec 10, 2009 at 07:16:04AM +0200, Muli Ben-Yehuda wrote:
> >>> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
> >>> 
> >>>> While trying to get device passthrough working with an emulex hba,
> >>>> kvm refused to pass it through because it has a BAR of 256 bytes:
> >>>> 
> >>>>       Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >>>>       Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >>>>       Region 4: I/O ports at b100 [size=256]
> >>>> 
> >>>> Since the page boundary is an arbitrary optimization to allow 1:1
> >>>> mapping of physical to virtual addresses, we can still take the old
> >>>> MMIO callback route.
> >>>> 
> >>>> So let's add a second code path that allows for size & 0xFFF != 0
> >>>> sized regions by looping it through userspace.
> >>> 
> >>> That makes sense in general *but* the 4K-aligned check isn't just an
> >>> optimization, it also has a security implication. Consider the
> >>> theoretical case where has a multi-function device has BARs for two
> >>> functions on the same page (within a 4K boundary), and each function
> >>> is assigned to a different guest. With your current patch both guests
> >>> will be able to write to each other's BARs. Another case is where a
> >>> device has a bug and you must not write beyond the BAR or Bad Things
> >>> Happen. With this patch an *unprivileged* guest could exploit that bug
> >>> and make bad things happen.
> >>> 
> >>> This can be fixed if the slow userspace mmio path checks that all MMIO
> >>> accesses by a guest fall within the portion of the page that is
> >>> assigned to it.
> >> 
> >> This patch seems to implement range checks correctly,
> >> let me know if I am missing something.
> >> 
> >> One also notes that we currently link qemu with libpci
> >> which I think requires admin cap to work.
> >> However, in the future we might extend this to
> >> also support getting device fds over a unix socket
> >> from a higher priviledged process.
> >> 
> >> If or when this is done, we will have to be
> >> extra careful when passing
> >> device file descriptor to an unpriveledged qemu process if
> >> the BARs are less than full page in size: mapping
> >> such BAR will allow qemu access outside this BAR.
> >> 
> >> A possible solution to this problem
> >> if/when it arises would be adding yet another sysfs file
> >> for each resource, which would allow read/write but not
> >> mmap access, and perform range checks in the kernel.
> > 
> > Sounds like the best solution to this problem, yeah. Though we'd only need those for non-page-boundary BARs. So I guess the best would be to always export them from the kernel, but only use them when BAR & (PAGE_SIZE-1).
> 
> Hm, or add read/write fd functions that always do boundary checks to the existing interface and only allow mmap on size & PAGE_SIZE. Or only allow non-aligned mmap when the admin cap is present.
> 
> Alex

This might break existing applications.
We don't want that.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:23       ` Muli Ben-Yehuda
@ 2009-12-10 10:31         ` Alexander Graf
  2009-12-10 10:37           ` Muli Ben-Yehuda
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-10 10:31 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Michael S. Tsirkin, kvm


On 10.12.2009, at 11:23, Muli Ben-Yehuda wrote:

> On Thu, Dec 10, 2009 at 10:52:46AM +0100, Alexander Graf wrote:
> 
>> Either way, FWIW the device assignment stuff needs to be completely
>> rewritten for qemu upstream anyways. So while it's good to collect
>> ideas for now, let's not too put too much effort code-wise into the
>> current code (unless it doesn't work).
> 
> What do you have in mind for such a rewrite?

I'd like to see it more well-abstracted and versatile. I don't see an obvious reason why we shouldn't be able to use a physical device in a TCG target :-).

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:27         ` Michael S. Tsirkin
@ 2009-12-10 10:31           ` Alexander Graf
  2009-12-10 10:42             ` Michael S. Tsirkin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-10 10:31 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Muli Ben-Yehuda, kvm@vger.kernel.org list


On 10.12.2009, at 11:27, Michael S. Tsirkin wrote:

> On Thu, Dec 10, 2009 at 11:08:58AM +0100, Alexander Graf wrote:
>> 
>> On 10.12.2009, at 10:52, Alexander Graf wrote:
>> 
>>> 
>>> On 10.12.2009, at 10:43, Michael S. Tsirkin wrote:
>>> 
>>>> On Thu, Dec 10, 2009 at 07:16:04AM +0200, Muli Ben-Yehuda wrote:
>>>>> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
>>>>> 
>>>>>> While trying to get device passthrough working with an emulex hba,
>>>>>> kvm refused to pass it through because it has a BAR of 256 bytes:
>>>>>> 
>>>>>>      Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>>>>>      Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>>>>>      Region 4: I/O ports at b100 [size=256]
>>>>>> 
>>>>>> Since the page boundary is an arbitrary optimization to allow 1:1
>>>>>> mapping of physical to virtual addresses, we can still take the old
>>>>>> MMIO callback route.
>>>>>> 
>>>>>> So let's add a second code path that allows for size & 0xFFF != 0
>>>>>> sized regions by looping it through userspace.
>>>>> 
>>>>> That makes sense in general *but* the 4K-aligned check isn't just an
>>>>> optimization, it also has a security implication. Consider the
>>>>> theoretical case where has a multi-function device has BARs for two
>>>>> functions on the same page (within a 4K boundary), and each function
>>>>> is assigned to a different guest. With your current patch both guests
>>>>> will be able to write to each other's BARs. Another case is where a
>>>>> device has a bug and you must not write beyond the BAR or Bad Things
>>>>> Happen. With this patch an *unprivileged* guest could exploit that bug
>>>>> and make bad things happen.
>>>>> 
>>>>> This can be fixed if the slow userspace mmio path checks that all MMIO
>>>>> accesses by a guest fall within the portion of the page that is
>>>>> assigned to it.
>>>> 
>>>> This patch seems to implement range checks correctly,
>>>> let me know if I am missing something.
>>>> 
>>>> One also notes that we currently link qemu with libpci
>>>> which I think requires admin cap to work.
>>>> However, in the future we might extend this to
>>>> also support getting device fds over a unix socket
>>>> from a higher priviledged process.
>>>> 
>>>> If or when this is done, we will have to be
>>>> extra careful when passing
>>>> device file descriptor to an unpriveledged qemu process if
>>>> the BARs are less than full page in size: mapping
>>>> such BAR will allow qemu access outside this BAR.
>>>> 
>>>> A possible solution to this problem
>>>> if/when it arises would be adding yet another sysfs file
>>>> for each resource, which would allow read/write but not
>>>> mmap access, and perform range checks in the kernel.
>>> 
>>> Sounds like the best solution to this problem, yeah. Though we'd only need those for non-page-boundary BARs. So I guess the best would be to always export them from the kernel, but only use them when BAR & (PAGE_SIZE-1).
>> 
>> Hm, or add read/write fd functions that always do boundary checks to the existing interface and only allow mmap on size & PAGE_SIZE. Or only allow non-aligned mmap when the admin cap is present.
>> 
>> Alex
> 
> This might break existing applications.
> We don't want that.

Well currently you can't mmap the resource at all without at least r/w rights on the file, right?
But yeah, we'd probably change behavior that could break someone - sigh.

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-09 21:06   ` Alexander Graf
@ 2009-12-10 10:35     ` Avi Kivity
  0 siblings, 0 replies; 34+ messages in thread
From: Avi Kivity @ 2009-12-10 10:35 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Michael S. Tsirkin, kvm@vger.kernel.org

On 12/09/2009 11:06 PM, Alexander Graf wrote:
>
> Am 09.12.2009 um 21:49 schrieb "Michael S. Tsirkin" <mst@redhat.com>:
>
>> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
>>> While trying to get device passthrough working with an emulex hba, kvm
>>> refused to pass it through because it has a BAR of 256 bytes:
>>>
>>>        Region 0: Memory at d2100000 (64-bit, non-prefetchable) 
>>> [size=4K]
>>>        Region 2: Memory at d2101000 (64-bit, non-prefetchable) 
>>> [size=256]
>>>        Region 4: I/O ports at b100 [size=256]
>>>
>>> Since the page boundary is an arbitrary optimization to allow 1:1 
>>> mapping of
>>> physical to virtual addresses, we can still take the old MMIO 
>>> callback route.
>>>
>>> So let's add a second code path that allows for size & 0xFFF != 0 
>>> sized regions
>>> by looping it through userspace.
>>>
>>> I verified that it works by passing through an e1000 with this 
>>> additional patch
>>> applied and the card acted the same way it did without this patch:
>>>
>>>             map_func = assigned_dev_iomem_map;
>>> -            if (cur_region->size & 0xFFF) {
>>> +            if (i != PCI_ROM_SLOT){
>>>                 fprintf(stderr, "PCI region %d at address 0x%llx "
>>>
>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>
>> Good idea.
>>
>> One thing I am concerned with, is that some users might
>> see performance degradation and not see the error message.
>> Maybe we can have a flag to optionally fail unless
>> passthrough can be fast? I'm not really sure it's worth it
>> to add such a flag though - what do others think?
>
> It only kicks in on small BARs which are usually not _that_ 
> performance critical. We're mostly talking about ack and status bits.
>

Well, ack and status are pretty important if accessed every interrupt.

> Also not failing > failing IMHO :). Even if it's not as fast as native.

Yes.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:31         ` Alexander Graf
@ 2009-12-10 10:37           ` Muli Ben-Yehuda
  2009-12-10 10:56             ` Michael S. Tsirkin
  0 siblings, 1 reply; 34+ messages in thread
From: Muli Ben-Yehuda @ 2009-12-10 10:37 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Michael S. Tsirkin, kvm

On Thu, Dec 10, 2009 at 11:31:01AM +0100, Alexander Graf wrote:

> > What do you have in mind for such a rewrite?
> 
> I'd like to see it more well-abstracted and versatile. I don't see
> an obvious reason why we shouldn't be able to use a physical device
> in a TCG target :-).

mmio and pio are easy, DMA you'd need an IOMMU for security, or
whatever uio does just for translation, and interrupts you probably
get for free from uio. Seems eminently doable to me. Why you'd want to
is another matter :-)

Cheers,
Muli

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:31           ` Alexander Graf
@ 2009-12-10 10:42             ` Michael S. Tsirkin
  0 siblings, 0 replies; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-10 10:42 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Muli Ben-Yehuda, kvm@vger.kernel.org list

On Thu, Dec 10, 2009 at 11:31:54AM +0100, Alexander Graf wrote:
> 
> On 10.12.2009, at 11:27, Michael S. Tsirkin wrote:
> 
> > On Thu, Dec 10, 2009 at 11:08:58AM +0100, Alexander Graf wrote:
> >> 
> >> On 10.12.2009, at 10:52, Alexander Graf wrote:
> >> 
> >>> 
> >>> On 10.12.2009, at 10:43, Michael S. Tsirkin wrote:
> >>> 
> >>>> On Thu, Dec 10, 2009 at 07:16:04AM +0200, Muli Ben-Yehuda wrote:
> >>>>> On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
> >>>>> 
> >>>>>> While trying to get device passthrough working with an emulex hba,
> >>>>>> kvm refused to pass it through because it has a BAR of 256 bytes:
> >>>>>> 
> >>>>>>      Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >>>>>>      Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >>>>>>      Region 4: I/O ports at b100 [size=256]
> >>>>>> 
> >>>>>> Since the page boundary is an arbitrary optimization to allow 1:1
> >>>>>> mapping of physical to virtual addresses, we can still take the old
> >>>>>> MMIO callback route.
> >>>>>> 
> >>>>>> So let's add a second code path that allows for size & 0xFFF != 0
> >>>>>> sized regions by looping it through userspace.
> >>>>> 
> >>>>> That makes sense in general *but* the 4K-aligned check isn't just an
> >>>>> optimization, it also has a security implication. Consider the
> >>>>> theoretical case where has a multi-function device has BARs for two
> >>>>> functions on the same page (within a 4K boundary), and each function
> >>>>> is assigned to a different guest. With your current patch both guests
> >>>>> will be able to write to each other's BARs. Another case is where a
> >>>>> device has a bug and you must not write beyond the BAR or Bad Things
> >>>>> Happen. With this patch an *unprivileged* guest could exploit that bug
> >>>>> and make bad things happen.
> >>>>> 
> >>>>> This can be fixed if the slow userspace mmio path checks that all MMIO
> >>>>> accesses by a guest fall within the portion of the page that is
> >>>>> assigned to it.
> >>>> 
> >>>> This patch seems to implement range checks correctly,
> >>>> let me know if I am missing something.
> >>>> 
> >>>> One also notes that we currently link qemu with libpci
> >>>> which I think requires admin cap to work.
> >>>> However, in the future we might extend this to
> >>>> also support getting device fds over a unix socket
> >>>> from a higher priviledged process.
> >>>> 
> >>>> If or when this is done, we will have to be
> >>>> extra careful when passing
> >>>> device file descriptor to an unpriveledged qemu process if
> >>>> the BARs are less than full page in size: mapping
> >>>> such BAR will allow qemu access outside this BAR.
> >>>> 
> >>>> A possible solution to this problem
> >>>> if/when it arises would be adding yet another sysfs file
> >>>> for each resource, which would allow read/write but not
> >>>> mmap access, and perform range checks in the kernel.
> >>> 
> >>> Sounds like the best solution to this problem, yeah. Though we'd only need those for non-page-boundary BARs. So I guess the best would be to always export them from the kernel, but only use them when BAR & (PAGE_SIZE-1).
> >> 
> >> Hm, or add read/write fd functions that always do boundary checks to the existing interface and only allow mmap on size & PAGE_SIZE. Or only allow non-aligned mmap when the admin cap is present.
> >> 
> >> Alex
> > 
> > This might break existing applications.
> > We don't want that.
> 
> Well currently you can't mmap the resource at all without at least r/w rights on the file, right?

You could have dropped the cap or got the fd from another
process.

> But yeah, we'd probably change behavior that could break someone - sigh.
> 
> Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:37           ` Muli Ben-Yehuda
@ 2009-12-10 10:56             ` Michael S. Tsirkin
  2009-12-10 11:09               ` Alexander Graf
  2009-12-10 11:28               ` Muli Ben-Yehuda
  0 siblings, 2 replies; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-10 10:56 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Alexander Graf, kvm

On Thu, Dec 10, 2009 at 12:37:37PM +0200, Muli Ben-Yehuda wrote:
> On Thu, Dec 10, 2009 at 11:31:01AM +0100, Alexander Graf wrote:
> 
> > > What do you have in mind for such a rewrite?
> > 
> > I'd like to see it more well-abstracted and versatile. I don't see
> > an obvious reason why we shouldn't be able to use a physical device
> > in a TCG target :-).
> 
> mmio and pio are easy, DMA you'd need an IOMMU for security, or
> whatever uio does just for translation,

uio currently does not support DMA, but I plan to fix this

> and interrupts you probably
> get for free from uio. Seems eminently doable to me. Why you'd want to
> is another matter :-)
> 
> Cheers,
> Muli

The list above ignores the biggest issue: you would have to change TCG
code generation to make this work.

For example, I think a read memory barrier is currently ignored in
translation, and host CPU will reorder reads.  Some drivers might also
rely on ordering guarantees that depend on CPU cacheline sizes.  Atomics
is another bag of tricks but I expect atomics on a DMA memory are not
widely used.

I am not sure this problem is solvable unless host and guest
architectures are very similar.

-- 
MST

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:56             ` Michael S. Tsirkin
@ 2009-12-10 11:09               ` Alexander Graf
  2009-12-10 11:21                 ` Michael S. Tsirkin
  2009-12-10 11:28               ` Muli Ben-Yehuda
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-10 11:09 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Muli Ben-Yehuda, kvm


On 10.12.2009, at 11:56, Michael S. Tsirkin wrote:

> On Thu, Dec 10, 2009 at 12:37:37PM +0200, Muli Ben-Yehuda wrote:
>> On Thu, Dec 10, 2009 at 11:31:01AM +0100, Alexander Graf wrote:
>> 
>>>> What do you have in mind for such a rewrite?
>>> 
>>> I'd like to see it more well-abstracted and versatile. I don't see
>>> an obvious reason why we shouldn't be able to use a physical device
>>> in a TCG target :-).
>> 
>> mmio and pio are easy, DMA you'd need an IOMMU for security, or
>> whatever uio does just for translation,
> 
> uio currently does not support DMA, but I plan to fix this
> 
>> and interrupts you probably
>> get for free from uio. Seems eminently doable to me. Why you'd want to
>> is another matter :-)
>> 
>> Cheers,
>> Muli
> 
> The list above ignores the biggest issue: you would have to change TCG
> code generation to make this work.
> 
> For example, I think a read memory barrier is currently ignored in
> translation, and host CPU will reorder reads.  Some drivers might also
> rely on ordering guarantees that depend on CPU cacheline sizes.  Atomics
> is another bag of tricks but I expect atomics on a DMA memory are not
> widely used.

Since we'd use the mmio callbacks for MMIO we'd be strictly ordered, no?

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 11:09               ` Alexander Graf
@ 2009-12-10 11:21                 ` Michael S. Tsirkin
  2009-12-10 12:12                   ` Gleb Natapov
  0 siblings, 1 reply; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-10 11:21 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Muli Ben-Yehuda, kvm

On Thu, Dec 10, 2009 at 12:09:13PM +0100, Alexander Graf wrote:
> 
> On 10.12.2009, at 11:56, Michael S. Tsirkin wrote:
> 
> > On Thu, Dec 10, 2009 at 12:37:37PM +0200, Muli Ben-Yehuda wrote:
> >> On Thu, Dec 10, 2009 at 11:31:01AM +0100, Alexander Graf wrote:
> >> 
> >>>> What do you have in mind for such a rewrite?
> >>> 
> >>> I'd like to see it more well-abstracted and versatile. I don't see
> >>> an obvious reason why we shouldn't be able to use a physical device
> >>> in a TCG target :-).
> >> 
> >> mmio and pio are easy, DMA you'd need an IOMMU for security, or
> >> whatever uio does just for translation,
> > 
> > uio currently does not support DMA, but I plan to fix this
> > 
> >> and interrupts you probably
> >> get for free from uio. Seems eminently doable to me. Why you'd want to
> >> is another matter :-)
> >> 
> >> Cheers,
> >> Muli
> > 
> > The list above ignores the biggest issue: you would have to change TCG
> > code generation to make this work.
> > 
> > For example, I think a read memory barrier is currently ignored in
> > translation, and host CPU will reorder reads.  Some drivers might also
> > rely on ordering guarantees that depend on CPU cacheline sizes.  Atomics
> > is another bag of tricks but I expect atomics on a DMA memory are not
> > widely used.
> 
> Since we'd use the mmio callbacks for MMIO we'd be strictly ordered, no?
> 
> Alex

Not unless you issue appropriate host memory barriers on mmio callbacks (kvm currently
uses a lock for this, which has an implicit barrier, but I do not
think TCG does this).

But even then, it depends on the device, for some devices DMA memory
reads/writes might depend on each other. Look at virtio as an example, a
real device might have the same semantics.  As a simpler example, some
devices DMA the following into ring in host memory to signal data
available:
- valid tag
- data length
host will read tag, and when it's valid use data length,
but e.g. on intel this only works well if these share a cache line,
otherwise data read might bypass tag read and you get invalid data.









-- 
MST

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 10:56             ` Michael S. Tsirkin
  2009-12-10 11:09               ` Alexander Graf
@ 2009-12-10 11:28               ` Muli Ben-Yehuda
  2009-12-10 11:34                 ` Alexander Graf
  2009-12-10 11:37                 ` Michael S. Tsirkin
  1 sibling, 2 replies; 34+ messages in thread
From: Muli Ben-Yehuda @ 2009-12-10 11:28 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alexander Graf, kvm

On Thu, Dec 10, 2009 at 12:56:56PM +0200, Michael S. Tsirkin wrote:

> > mmio and pio are easy, DMA you'd need an IOMMU for security, or
> > whatever uio does just for translation,
> 
> uio currently does not support DMA, but I plan to fix this

With or without an IOMMU?

> > and interrupts you probably get for free from uio. Seems eminently
> > doable to me. Why you'd want to is another matter :-)
> 
> The list above ignores the biggest issue: you would have to change
> TCG code generation to make this work.

Yep, I know nothing about TCG, only looking at this from the device
interaction side.

> I am not sure this problem is solvable unless host and guest
> architectures are very similar.

Now you are ignoring the most interesting issue, namely, why would you
want to solve it? What is the value of device assignment for TCG
targets?

Cheers,
Muli
-- 
Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
Manager, Virtualization and Systems Architecture
Master Inventor, IBM Research -- Haifa
Second Workshop on I/O Virtualization (WIOV '10):
http://sysrun.haifa.il.ibm.com/hrl/wiov2010/

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 11:28               ` Muli Ben-Yehuda
@ 2009-12-10 11:34                 ` Alexander Graf
  2009-12-10 11:46                   ` Michael S. Tsirkin
  2009-12-10 11:37                 ` Michael S. Tsirkin
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-10 11:34 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Michael S. Tsirkin, kvm

On 10.12.2009, at 12:28, Muli Ben-Yehuda wrote:

> On Thu, Dec 10, 2009 at 12:56:56PM +0200, Michael S. Tsirkin wrote:
> 
>>> mmio and pio are easy, DMA you'd need an IOMMU for security, or
>>> whatever uio does just for translation,
>> 
>> uio currently does not support DMA, but I plan to fix this
> 
> With or without an IOMMU?
> 
>>> and interrupts you probably get for free from uio. Seems eminently
>>> doable to me. Why you'd want to is another matter :-)
>> 
>> The list above ignores the biggest issue: you would have to change
>> TCG code generation to make this work.
> 
> Yep, I know nothing about TCG, only looking at this from the device
> interaction side.
> 
>> I am not sure this problem is solvable unless host and guest
>> architectures are very similar.
> 
> Now you are ignoring the most interesting issue, namely, why would you
> want to solve it? What is the value of device assignment for TCG
> targets?

Why would you want to emulate a machine at all? ;-)

Imagine you were a hardware manufacturer who develops some self-made hardware. Now that manufacturer of course has x86 boxes. Chances are pretty low he has PPC ones and imagine we'd ever get MMIO/PCI on S390, he definitely does not have those.

Still, while developing his driver it'd be really valuable to know that it works on other architectures as well. Especially since he can claim support for architectures he doesn't have himself. At least until the first customer arrives :-).

While this is a scenario I just made up and don't really have myself, its goal is to illustrate why we shouldn't close doors that don't have to be closed. If TCG doesn't deal with it properly, that doesn't mean we shouldn't work on making the other end compatible.

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 11:28               ` Muli Ben-Yehuda
  2009-12-10 11:34                 ` Alexander Graf
@ 2009-12-10 11:37                 ` Michael S. Tsirkin
  1 sibling, 0 replies; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-10 11:37 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Alexander Graf, kvm

On Thu, Dec 10, 2009 at 01:28:49PM +0200, Muli Ben-Yehuda wrote:
> On Thu, Dec 10, 2009 at 12:56:56PM +0200, Michael S. Tsirkin wrote:
> 
> > > mmio and pio are easy, DMA you'd need an IOMMU for security, or
> > > whatever uio does just for translation,
> > 
> > uio currently does not support DMA, but I plan to fix this
> 
> With or without an IOMMU?

With an IOMMU.

> > > and interrupts you probably get for free from uio. Seems eminently
> > > doable to me. Why you'd want to is another matter :-)
> > 
> > The list above ignores the biggest issue: you would have to change
> > TCG code generation to make this work.
> 
> Yep, I know nothing about TCG, only looking at this from the device
> interaction side.
> 
> > I am not sure this problem is solvable unless host and guest
> > architectures are very similar.
> 
> Now you are ignoring the most interesting issue, namely, why would you
> want to solve it? What is the value of device assignment for TCG
> targets?

No idea.

> Cheers,
> Muli
> -- 
> Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
> Manager, Virtualization and Systems Architecture
> Master Inventor, IBM Research -- Haifa
> Second Workshop on I/O Virtualization (WIOV '10):
> http://sysrun.haifa.il.ibm.com/hrl/wiov2010/

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 11:34                 ` Alexander Graf
@ 2009-12-10 11:46                   ` Michael S. Tsirkin
  0 siblings, 0 replies; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-10 11:46 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Muli Ben-Yehuda, kvm

On Thu, Dec 10, 2009 at 12:34:38PM +0100, Alexander Graf wrote:
> 
> On 10.12.2009, at 12:28, Muli Ben-Yehuda wrote:
> 
> > On Thu, Dec 10, 2009 at 12:56:56PM +0200, Michael S. Tsirkin wrote:
> > 
> >>> mmio and pio are easy, DMA you'd need an IOMMU for security, or
> >>> whatever uio does just for translation,
> >> 
> >> uio currently does not support DMA, but I plan to fix this
> > 
> > With or without an IOMMU?
> > 
> >>> and interrupts you probably get for free from uio. Seems eminently
> >>> doable to me. Why you'd want to is another matter :-)
> >> 
> >> The list above ignores the biggest issue: you would have to change
> >> TCG code generation to make this work.
> > 
> > Yep, I know nothing about TCG, only looking at this from the device
> > interaction side.
> > 
> >> I am not sure this problem is solvable unless host and guest
> >> architectures are very similar.
> > 
> > Now you are ignoring the most interesting issue, namely, why would you
> > want to solve it? What is the value of device assignment for TCG
> > targets?
> 
> Why would you want to emulate a machine at all? ;-)
> 
> Imagine you were a hardware manufacturer who develops some self-made hardware. Now that manufacturer of course has x86 boxes. Chances are pretty low he has PPC ones and imagine we'd ever get MMIO/PCI on S390, he definitely does not have those.
> 
> Still, while developing his driver it'd be really valuable to know that it works on other architectures as well. Especially since he can claim support for architectures he doesn't have himself. At least until the first customer arrives :-).
> 
> While this is a scenario I just made up and don't really have myself, its goal is to illustrate why we shouldn't close doors that don't have to be closed. If TCG doesn't deal with it properly, that doesn't mean we shouldn't work on making the other end compatible.
> 

Well, it does IMO mean that such a project should not be a blocker for
passthrough in upstream qemu, after fixing endian-ness issues and
generally cleaning up the code.


> Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 11:21                 ` Michael S. Tsirkin
@ 2009-12-10 12:12                   ` Gleb Natapov
  0 siblings, 0 replies; 34+ messages in thread
From: Gleb Natapov @ 2009-12-10 12:12 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alexander Graf, Muli Ben-Yehuda, kvm

On Thu, Dec 10, 2009 at 01:21:40PM +0200, Michael S. Tsirkin wrote:
> On Thu, Dec 10, 2009 at 12:09:13PM +0100, Alexander Graf wrote:
> > 
> > On 10.12.2009, at 11:56, Michael S. Tsirkin wrote:
> > 
> > > On Thu, Dec 10, 2009 at 12:37:37PM +0200, Muli Ben-Yehuda wrote:
> > >> On Thu, Dec 10, 2009 at 11:31:01AM +0100, Alexander Graf wrote:
> > >> 
> > >>>> What do you have in mind for such a rewrite?
> > >>> 
> > >>> I'd like to see it more well-abstracted and versatile. I don't see
> > >>> an obvious reason why we shouldn't be able to use a physical device
> > >>> in a TCG target :-).
> > >> 
> > >> mmio and pio are easy, DMA you'd need an IOMMU for security, or
> > >> whatever uio does just for translation,
> > > 
> > > uio currently does not support DMA, but I plan to fix this
> > > 
> > >> and interrupts you probably
> > >> get for free from uio. Seems eminently doable to me. Why you'd want to
> > >> is another matter :-)
> > >> 
> > >> Cheers,
> > >> Muli
> > > 
> > > The list above ignores the biggest issue: you would have to change TCG
> > > code generation to make this work.
> > > 
> > > For example, I think a read memory barrier is currently ignored in
> > > translation, and host CPU will reorder reads.  Some drivers might also
> > > rely on ordering guarantees that depend on CPU cacheline sizes.  Atomics
> > > is another bag of tricks but I expect atomics on a DMA memory are not
> > > widely used.
> > 
> > Since we'd use the mmio callbacks for MMIO we'd be strictly ordered, no?
> > 
> > Alex
> 
> Not unless you issue appropriate host memory barriers on mmio callbacks (kvm currently
> uses a lock for this, which has an implicit barrier, but I do not
> think TCG does this).
> 
> But even then, it depends on the device, for some devices DMA memory
> reads/writes might depend on each other. Look at virtio as an example, a
> real device might have the same semantics.  As a simpler example, some
> devices DMA the following into ring in host memory to signal data
> available:
> - valid tag
> - data length
> host will read tag, and when it's valid use data length,
May be:
- data length
- valid tag
Otherwise how can you guaranty that at the time tag is valid data
length is already up-to-date and not in process to be written? An on
arch such as Altix DMA from device to memory can arrive out of order
if memory is not mapped in a special way, but then DMA is slow.

--
			Gleb.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH] Enable non page boundary BAR device assignment
@ 2009-12-10 23:06 Alexander Graf
  2009-12-10 23:06 ` [PATCH] Inform users about busy device assignment attempt Alexander Graf
  2009-12-11 11:05 ` [PATCH] Enable non page boundary BAR device assignment Michael S. Tsirkin
  0 siblings, 2 replies; 34+ messages in thread
From: Alexander Graf @ 2009-12-10 23:06 UTC (permalink / raw)
  To: kvm list; +Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda

While trying to get device passthrough working with an emulex hba, kvm
refused to pass it through because it has a BAR of 256 bytes:

        Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
        Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
        Region 4: I/O ports at b100 [size=256]

Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
physical to virtual addresses, we can still take the old MMIO callback route.

So let's add a second code path that allows for size & 0xFFF != 0 sized regions
by looping it through userspace.

I verified that it works by passing through an e1000 with this additional patch
applied and the card acted the same way it did without this patch:

             map_func = assigned_dev_iomem_map;
-            if (cur_region->size & 0xFFF) {
+            if (i != PCI_ROM_SLOT){
                 fprintf(stderr, "PCI region %d at address 0x%llx "

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - don't use map_func function pointer
  - use the same code for mmap on fast and slow path
---
 hw/device-assignment.c |  123 +++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 13a86bb..5cee929 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
     return value;
 }
 
+static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);
+    uint32_t r = -1;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
+    uint32_t r = -1;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
+    uint32_t r = -1;
+
+    r = *in;
+    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+    return r;
+}
+
+static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);
+
+    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
+
+    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
+    *out = val;
+}
+
+static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+    AssignedDevRegion *d = opaque;
+    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);
+
+    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
+    *out = val;
+}
+
+static CPUWriteMemoryFunc * const slow_bar_write[] = {
+    &slow_bar_writeb,
+    &slow_bar_writew,
+    &slow_bar_writel
+};
+
+static CPUReadMemoryFunc * const slow_bar_read[] = {
+    &slow_bar_readb,
+    &slow_bar_readw,
+    &slow_bar_readl
+};
+
+static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
+                                        pcibus_t e_phys, pcibus_t e_size,
+                                        int type)
+{
+    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
+    int m;
+
+    DEBUG("slow map\n");
+    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
+    cpu_register_physical_memory(e_phys, e_size, m);
+
+    /* MSI-X MMIO page */
+    if ((e_size > 0) &&
+        real_region->base_addr <= r_dev->msix_table_addr &&
+        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
+        int offset = r_dev->msix_table_addr - real_region->base_addr;
+
+        cpu_register_physical_memory(e_phys + offset,
+                TARGET_PAGE_SIZE, r_dev->mmio_index);
+    }
+}
+
 static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
                                    pcibus_t e_phys, pcibus_t e_size, int type)
 {
@@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
 
         /* handle memory io regions */
         if (cur_region->type & IORESOURCE_MEM) {
+            int slow_map = 0;
             int t = cur_region->type & IORESOURCE_PREFETCH
                 ? PCI_BASE_ADDRESS_MEM_PREFETCH
                 : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
             if (cur_region->size & 0xFFF) {
-                fprintf(stderr, "Unable to assign device: PCI region %d "
-                        "at address 0x%llx has size 0x%x, "
-                        " which is not a multiple of 4K\n",
+                fprintf(stderr, "PCI region %d at address 0x%llx "
+                        "has size 0x%x, which is not a multiple of 4K. "
+                        "Using slow map\n",
                         i, (unsigned long long)cur_region->base_addr,
                         cur_region->size);
+                slow_map = 1;
+            }
+
+            if (slow_map && (i == PCI_ROM_SLOT)) {
+                fprintf(stderr, "ROM not aligned - can't continue\n");
                 return -1;
             }
 
@@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
             } else {
                 pci_dev->v_addrs[i].u.r_virtbase =
                     mmap(NULL,
-                         (cur_region->size + 0xFFF) & 0xFFFFF000,
+                         cur_region->size,
                          PROT_WRITE | PROT_READ, MAP_SHARED,
                          cur_region->resource_fd, (off_t) 0);
             }
@@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
             pci_dev->v_addrs[i].e_size = 0;
 
             /* add offset */
-            pci_dev->v_addrs[i].u.r_virtbase +=
-                (cur_region->base_addr & 0xFFF);
+            if (!slow_map) {
+                pci_dev->v_addrs[i].u.r_virtbase +=
+                    (cur_region->base_addr & 0xFFF);
+            }
 
             pci_register_bar((PCIDevice *) pci_dev, i,
                              cur_region->size, t,
-                             assigned_dev_iomem_map);
+                             slow_map ? assigned_dev_iomem_map_slow
+                                      : assigned_dev_iomem_map);
             continue;
         }
         /* handle port io regions */
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH] Inform users about busy device assignment attempt
  2009-12-10 23:06 [PATCH] Enable non page boundary BAR device assignment Alexander Graf
@ 2009-12-10 23:06 ` Alexander Graf
  2009-12-11 11:38   ` Michael S. Tsirkin
  2009-12-11 11:05 ` [PATCH] Enable non page boundary BAR device assignment Michael S. Tsirkin
  1 sibling, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-10 23:06 UTC (permalink / raw)
  To: kvm list
  Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda,
	Daniel P. Berrange

When using -pcidevice on a device that is already in use by a kernel driver
all the user gets is the following (very useful) information:

  Failed to assign device "04:00.0" : Device or resource busy
  Failed to deassign device "04:00.0" : Invalid argument
  Error initializing device pci-assign

Since I usually prefer to have my computer do the thinking for me, I figured
it might be a good idea to check and see if a device is actually used by a
driver. If so, tell the user.

So with this patch applied you get the following output:

  Failed to assign device "04:00.0" : Device or resource busy
  *** The driver 'igb' is occupying your device 04:00.0.
  ***
  *** You can try the following commands to free it:
  ***
  *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
  *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
  *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
  *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
  ***
  Failed to deassign device "04:00.0" : Invalid argument
  Error initializing device pci-assign

That should keep people like me from doing the most obvious misuses :-).

CC: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - add more helpful guidance thanks to Daniel Berrange

v2 -> v3:

  - clear name variable before using it, thus 0-terminating the string
  - fix region numbers
  - use correct unbind/bind names
---
 hw/device-assignment.c |  109 +++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 85 insertions(+), 24 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 5cee929..98faa83 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -564,14 +564,44 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
     return 0;
 }
 
+static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
+{
+    FILE *f;
+    char name[128];
+    long id;
+
+    snprintf(name, sizeof(name), "%s%s", devpath, idname);
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return -1;
+    }
+    if (fscanf(f, "%li\n", &id) == 1) {
+        *val = id;
+    }
+    fclose(f);
+
+    return 0;
+}
+
+static int get_real_vendor_id(const char *devpath, uint16_t *val)
+{
+    return get_real_id(devpath, "vendor", val);
+}
+
+static int get_real_device_id(const char *devpath, uint16_t *val)
+{
+    return get_real_id(devpath, "device", val);
+}
+
 static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
                            uint8_t r_dev, uint8_t r_func)
 {
     char dir[128], name[128];
-    int fd, r = 0;
+    int fd, r = 0, v;
     FILE *f;
     unsigned long long start, end, size, flags;
-    unsigned long id;
+    uint16_t id;
     struct stat statbuf;
     PCIRegion *rp;
     PCIDevRegions *dev = &pci_dev->real_device;
@@ -637,31 +667,21 @@ again:
 
     fclose(f);
 
-    /* read and fill device ID */
-    snprintf(name, sizeof(name), "%svendor", dir);
-    f = fopen(name, "r");
-    if (f == NULL) {
-        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+    /* read and fill vendor ID */
+    v = get_real_vendor_id(dir, &id);
+    if (v) {
         return 1;
     }
-    if (fscanf(f, "%li\n", &id) == 1) {
-	pci_dev->dev.config[0] = id & 0xff;
-	pci_dev->dev.config[1] = (id & 0xff00) >> 8;
-    }
-    fclose(f);
+    pci_dev->dev.config[0] = id & 0xff;
+    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
 
-    /* read and fill vendor ID */
-    snprintf(name, sizeof(name), "%sdevice", dir);
-    f = fopen(name, "r");
-    if (f == NULL) {
-        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+    /* read and fill device ID */
+    v = get_real_device_id(dir, &id);
+    if (v) {
         return 1;
     }
-    if (fscanf(f, "%li\n", &id) == 1) {
-	pci_dev->dev.config[2] = id & 0xff;
-	pci_dev->dev.config[3] = (id & 0xff00) >> 8;
-    }
-    fclose(f);
+    pci_dev->dev.config[2] = id & 0xff;
+    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
 
     /* dealing with virtual function device */
     snprintf(name, sizeof(name), "%sphysfn/", dir);
@@ -739,7 +759,9 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
 static int assign_device(AssignedDevice *dev)
 {
     struct kvm_assigned_pci_dev assigned_dev_data;
-    int r;
+    char name[128], dir[128], driver[128], *ns;
+    uint16_t vendor_id, device_id;
+    int r, v;
 
     memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
     assigned_dev_data.assigned_dev_id  =
@@ -761,9 +783,48 @@ static int assign_device(AssignedDevice *dev)
 #endif
 
     r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
-    if (r < 0)
+    if (r < 0) {
 	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
                 dev->dev.qdev.id, strerror(-r));
+
+        snprintf(dir, sizeof(dir),
+                 "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+	         dev->host.bus, dev->host.dev, dev->host.func);
+
+        snprintf(name, sizeof(name), "%sdriver", dir);
+
+        memset(driver, 0, sizeof(driver));
+        v = readlink(name, driver, sizeof(driver));
+        if ((v <= 0) || !(ns = strrchr(driver, '/'))) {
+            return r;
+        }
+
+        ns++;
+
+        if (get_real_vendor_id(dir, &vendor_id) ||
+            get_real_device_id(dir, &device_id)) {
+            return r;
+        }
+
+        fprintf(stderr, "*** The driver '%s' is occupying your device "
+                        "%02x:%02x.%x.\n",
+                ns, dev->host.bus, dev->host.dev, dev->host.func);
+        fprintf(stderr, "***\n");
+        fprintf(stderr, "*** You can try the following commands to free "
+                        "it:\n");
+        fprintf(stderr, "***\n");
+        fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/"
+                        "pci-stub/new_id\n", vendor_id, device_id);
+        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
+                        "/drivers/%s /unbind\n",
+                dev->host.bus, dev->host.dev, dev->host.func, ns);
+        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
+                        "/drivers/ pci-stub/bind\n",
+                dev->host.bus, dev->host.dev, dev->host.func);
+        fprintf(stderr, "*** $ echo \"%x %x\" > /sys/bus/pci/drivers/pci-stub"
+                        "/remove_id\n", vendor_id, device_id);
+        fprintf(stderr, "***\n");
+    }
     return r;
 }
 
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-10 23:06 [PATCH] Enable non page boundary BAR device assignment Alexander Graf
  2009-12-10 23:06 ` [PATCH] Inform users about busy device assignment attempt Alexander Graf
@ 2009-12-11 11:05 ` Michael S. Tsirkin
  2009-12-15 18:16   ` Alexander Graf
  1 sibling, 1 reply; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-11 11:05 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda

On Fri, Dec 11, 2009 at 12:06:25AM +0100, Alexander Graf wrote:
> While trying to get device passthrough working with an emulex hba, kvm
> refused to pass it through because it has a BAR of 256 bytes:
> 
>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>         Region 4: I/O ports at b100 [size=256]
> 
> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> physical to virtual addresses, we can still take the old MMIO callback route.
> 
> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> by looping it through userspace.
> 
> I verified that it works by passing through an e1000 with this additional patch
> applied and the card acted the same way it did without this patch:
> 
>              map_func = assigned_dev_iomem_map;
> -            if (cur_region->size & 0xFFF) {
> +            if (i != PCI_ROM_SLOT){
>                  fprintf(stderr, "PCI region %d at address 0x%llx "

The patch is pretty clean. However, I think I see a bug below.
Did you also try a device with sub-page BAR?
If yes, did the bar get non page aligned value?

> Signed-off-by: Alexander Graf <agraf@suse.de>
> 
> ---
> 
> v1 -> v2:
> 
>   - don't use map_func function pointer
>   - use the same code for mmap on fast and slow path
> ---
>  hw/device-assignment.c |  123 +++++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 116 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 13a86bb..5cee929 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>      return value;
>  }
>  
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);

this is a void* pointer, no need to cast

> +    uint32_t r = -1;

don't initialize r here as you override 1 line below.
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;

same as above

> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
> +    uint32_t r = -1;
> +
> +    r = *in;
> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> +    return r;

same as above

> +}
> +
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);

no need for cast as r_virtbase is a void pointer.

> +
> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
> +
> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> +    *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    AssignedDevRegion *d = opaque;
> +    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);

same as above

> +
> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> +    *out = val;
> +}
> +
> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> +    &slow_bar_writeb,
> +    &slow_bar_writew,
> +    &slow_bar_writel
> +};
> +
> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> +    &slow_bar_readb,
> +    &slow_bar_readw,
> +    &slow_bar_readl
> +};
> +
> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> +                                        pcibus_t e_phys, pcibus_t e_size,
> +                                        int type)
> +{
> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> +    int m;
> +
> +    DEBUG("slow map\n");
> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> +    cpu_register_physical_memory(e_phys, e_size, m);
> +
> +    /* MSI-X MMIO page */

some code duplication  ... use a common function?

> +    if ((e_size > 0) &&
> +        real_region->base_addr <= r_dev->msix_table_addr &&
> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> +        cpu_register_physical_memory(e_phys + offset,
> +                TARGET_PAGE_SIZE, r_dev->mmio_index);

How does this work? Does the last registered callback win
or are both called for MSIX page?

> +    }
> +}
> +
>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>  {
> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>  
>          /* handle memory io regions */
>          if (cur_region->type & IORESOURCE_MEM) {
> +            int slow_map = 0;
>              int t = cur_region->type & IORESOURCE_PREFETCH
>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
>              if (cur_region->size & 0xFFF) {
> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> -                        "at address 0x%llx has size 0x%x, "
> -                        " which is not a multiple of 4K\n",
> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> +                        "has size 0x%x, which is not a multiple of 4K. "
> +                        "Using slow map\n",

I still think "Using slow map" tells the user nothing.
How about "Disabling direct guest access, device will be slow"?

>                          i, (unsigned long long)cur_region->base_addr,
>                          cur_region->size);
> +                slow_map = 1;
> +            }
> +
> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>                  return -1;
>              }
>  
> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              } else {
>                  pci_dev->v_addrs[i].u.r_virtbase =
>                      mmap(NULL,
> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> +                         cur_region->size,
>                           PROT_WRITE | PROT_READ, MAP_SHARED,
>                           cur_region->resource_fd, (off_t) 0);
>              }
> @@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>              pci_dev->v_addrs[i].e_size = 0;
>  
>              /* add offset */
> -            pci_dev->v_addrs[i].u.r_virtbase +=
> -                (cur_region->base_addr & 0xFFF);
> +            if (!slow_map) {
> +                pci_dev->v_addrs[i].u.r_virtbase +=
> +                    (cur_region->base_addr & 0xFFF);
> +            }

This looks wrong. I think mmap returns a page aligned address,
that's why we did this offset math. No?
And if not, there was no reason for this code in the first place.


>  
>              pci_register_bar((PCIDevice *) pci_dev, i,
>                               cur_region->size, t,
> -                             assigned_dev_iomem_map);
> +                             slow_map ? assigned_dev_iomem_map_slow
> +                                      : assigned_dev_iomem_map);
>              continue;
>          }
>          /* handle port io regions */
> -- 
> 1.6.0.2

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Inform users about busy device assignment attempt
  2009-12-10 23:06 ` [PATCH] Inform users about busy device assignment attempt Alexander Graf
@ 2009-12-11 11:38   ` Michael S. Tsirkin
  2009-12-15 15:18     ` Alexander Graf
  0 siblings, 1 reply; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-11 11:38 UTC (permalink / raw)
  To: Alexander Graf
  Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Daniel P. Berrange

On Fri, Dec 11, 2009 at 12:06:26AM +0100, Alexander Graf wrote:
> When using -pcidevice on a device that is already in use by a kernel driver
> all the user gets is the following (very useful) information:
> 
>   Failed to assign device "04:00.0" : Device or resource busy
>   Failed to deassign device "04:00.0" : Invalid argument
>   Error initializing device pci-assign
> 
> Since I usually prefer to have my computer do the thinking for me, I figured
> it might be a good idea to check and see if a device is actually used by a
> driver. If so, tell the user.
> 
> So with this patch applied you get the following output:
> 
>   Failed to assign device "04:00.0" : Device or resource busy
>   *** The driver 'igb' is occupying your device 04:00.0.
>   ***
>   *** You can try the following commands to free it:
>   ***
>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
>   ***
>   Failed to deassign device "04:00.0" : Invalid argument
>   Error initializing device pci-assign
> 
> That should keep people like me from doing the most obvious misuses :-).
> 
> CC: Daniel P. Berrange <berrange@redhat.com>
> Signed-off-by: Alexander Graf <agraf@suse.de>

Minor nits and a bug.

> ---
> 
> v1 -> v2:
> 
>   - add more helpful guidance thanks to Daniel Berrange
> 
> v2 -> v3:
> 
>   - clear name variable before using it, thus 0-terminating the string
>   - fix region numbers
>   - use correct unbind/bind names
> ---
>  hw/device-assignment.c |  109 +++++++++++++++++++++++++++++++++++++-----------
>  1 files changed, 85 insertions(+), 24 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 5cee929..98faa83 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -564,14 +564,44 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>      return 0;
>  }
>  
> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
> +{
> +    FILE *f;
> +    char name[128];

let's not introduce arbitraty file name length limitations.
strlen is not hard to use. I know all this module is
broken this way, but let's not add more.

> +    long id;
> +
> +    snprintf(name, sizeof(name), "%s%s", devpath, idname);
> +    f = fopen(name, "r");
> +    if (f == NULL) {
> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +        return -1;
> +    }
> +    if (fscanf(f, "%li\n", &id) == 1) {
> +        *val = id;
> +    }

handle fscanf error?

> +    fclose(f);
> +
> +    return 0;
> +}
> +
> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "vendor", val);
> +}
> +
> +static int get_real_device_id(const char *devpath, uint16_t *val)
> +{
> +    return get_real_id(devpath, "device", val);
> +}
> +
>  static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
>                             uint8_t r_dev, uint8_t r_func)
>  {
>      char dir[128], name[128];
> -    int fd, r = 0;
> +    int fd, r = 0, v;
>      FILE *f;
>      unsigned long long start, end, size, flags;
> -    unsigned long id;
> +    uint16_t id;
>      struct stat statbuf;
>      PCIRegion *rp;
>      PCIDevRegions *dev = &pci_dev->real_device;
> @@ -637,31 +667,21 @@ again:
>  
>      fclose(f);
>  
> -    /* read and fill device ID */
> -    snprintf(name, sizeof(name), "%svendor", dir);
> -    f = fopen(name, "r");
> -    if (f == NULL) {
> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +    /* read and fill vendor ID */
> +    v = get_real_vendor_id(dir, &id);
> +    if (v) {
>          return 1;
>      }
> -    if (fscanf(f, "%li\n", &id) == 1) {
> -	pci_dev->dev.config[0] = id & 0xff;
> -	pci_dev->dev.config[1] = (id & 0xff00) >> 8;
> -    }
> -    fclose(f);
> +    pci_dev->dev.config[0] = id & 0xff;
> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
>  

this seems an unrelated cleanup?
If so better as a separate patch?


> -    /* read and fill vendor ID */
> -    snprintf(name, sizeof(name), "%sdevice", dir);
> -    f = fopen(name, "r");
> -    if (f == NULL) {
> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> +    /* read and fill device ID */
> +    v = get_real_device_id(dir, &id);
> +    if (v) {
>          return 1;
>      }
> -    if (fscanf(f, "%li\n", &id) == 1) {
> -	pci_dev->dev.config[2] = id & 0xff;
> -	pci_dev->dev.config[3] = (id & 0xff00) >> 8;
> -    }
> -    fclose(f);
> +    pci_dev->dev.config[2] = id & 0xff;
> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
>  
>      /* dealing with virtual function device */
>      snprintf(name, sizeof(name), "%sphysfn/", dir);
> @@ -739,7 +759,9 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
>  static int assign_device(AssignedDevice *dev)
>  {
>      struct kvm_assigned_pci_dev assigned_dev_data;
> -    int r;
> +    char name[128], dir[128], driver[128], *ns;

Yes 128 will be enough for now. But it's pretty ugly.
In this case, something like
    char dir[] = "/sys/bus/pci/devices/0000:00:00.0/";
will allocate just enough memory.
Or use MAX PATH.

> +    uint16_t vendor_id, device_id;
> +    int r, v;
>  
>      memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
>      assigned_dev_data.assigned_dev_id  =
> @@ -761,9 +783,48 @@ static int assign_device(AssignedDevice *dev)
>  #endif
>  
>      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> -    if (r < 0)
> +    if (r < 0) {


Please put all of the below in a separate function.

>  	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>                  dev->dev.qdev.id, strerror(-r));
> +
> +        snprintf(dir, sizeof(dir),

snprintf? So you worry about overflowing dir?
But dir will not be 0 terminated on overflow,
so use of %s below would crash anyway.
As in fact we know this can not overflow, just use sprintf.

> +                 "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> +	         dev->host.bus, dev->host.dev, dev->host.func);

This assumes domain 0. I know multidomain is
broken with device assignment, but pls add
TOIDO here so we don't forget to fix it.

> +
> +        snprintf(name, sizeof(name), "%sdriver", dir);

So why do sprintf twice? Just put "driver" as part
of the template above.

> +
> +        memset(driver, 0, sizeof(driver));

just initialize driver to 0 by = {};

> +        v = readlink(name, driver, sizeof(driver));

So if readlink fills up all of driver, strrchr
below will cause coredump, right? Better check v against
sizeof driver.

> +        if ((v <= 0) || !(ns = strrchr(driver, '/'))) {
> +            return r;

Add some fprintf here. Maybe report errno as well.

> +        }
> +
> +        ns++;
> +
> +        if (get_real_vendor_id(dir, &vendor_id) ||
> +            get_real_device_id(dir, &device_id)) {
> +            return r;

And here.

> +        }
> +
> +        fprintf(stderr, "*** The driver '%s' is occupying your device "
> +                        "%02x:%02x.%x.\n",
> +                ns, dev->host.bus, dev->host.dev, dev->host.func);
> +        fprintf(stderr, "***\n");
> +        fprintf(stderr, "*** You can try the following commands to free "
> +                        "it:\n");
> +        fprintf(stderr, "***\n");
> +        fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/"
> +                        "pci-stub/new_id\n", vendor_id, device_id);
> +        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
> +                        "/drivers/%s /unbind\n",
> +                dev->host.bus, dev->host.dev, dev->host.func, ns);
> +        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
> +                        "/drivers/ pci-stub/bind\n",
> +                dev->host.bus, dev->host.dev, dev->host.func);
> +        fprintf(stderr, "*** $ echo \"%x %x\" > /sys/bus/pci/drivers/pci-stub"
> +                        "/remove_id\n", vendor_id, device_id);
> +        fprintf(stderr, "***\n");

above assumes domain zero. Please add a TODO to fix.

> +    }
>      return r;
>  }
>  
> -- 
> 1.6.0.2

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Inform users about busy device assignment attempt
  2009-12-11 11:38   ` Michael S. Tsirkin
@ 2009-12-15 15:18     ` Alexander Graf
  2009-12-15 15:18       ` Michael S. Tsirkin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-15 15:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Daniel P. Berrange

Michael S. Tsirkin wrote:
> On Fri, Dec 11, 2009 at 12:06:26AM +0100, Alexander Graf wrote:
>   
>> When using -pcidevice on a device that is already in use by a kernel driver
>> all the user gets is the following (very useful) information:
>>
>>   Failed to assign device "04:00.0" : Device or resource busy
>>   Failed to deassign device "04:00.0" : Invalid argument
>>   Error initializing device pci-assign
>>
>> Since I usually prefer to have my computer do the thinking for me, I figured
>> it might be a good idea to check and see if a device is actually used by a
>> driver. If so, tell the user.
>>
>> So with this patch applied you get the following output:
>>
>>   Failed to assign device "04:00.0" : Device or resource busy
>>   *** The driver 'igb' is occupying your device 04:00.0.
>>   ***
>>   *** You can try the following commands to free it:
>>   ***
>>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
>>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
>>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
>>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
>>   ***
>>   Failed to deassign device "04:00.0" : Invalid argument
>>   Error initializing device pci-assign
>>
>> That should keep people like me from doing the most obvious misuses :-).
>>
>> CC: Daniel P. Berrange <berrange@redhat.com>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>     
>
> Minor nits and a bug.
>
>   
>> ---
>>
>> v1 -> v2:
>>
>>   - add more helpful guidance thanks to Daniel Berrange
>>
>> v2 -> v3:
>>
>>   - clear name variable before using it, thus 0-terminating the string
>>   - fix region numbers
>>   - use correct unbind/bind names
>> ---
>>  hw/device-assignment.c |  109 +++++++++++++++++++++++++++++++++++++-----------
>>  1 files changed, 85 insertions(+), 24 deletions(-)
>>
>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>> index 5cee929..98faa83 100644
>> --- a/hw/device-assignment.c
>> +++ b/hw/device-assignment.c
>> @@ -564,14 +564,44 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>      return 0;
>>  }
>>  
>> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
>> +{
>> +    FILE *f;
>> +    char name[128];
>>     
>
> let's not introduce arbitraty file name length limitations.
> strlen is not hard to use. I know all this module is
> broken this way, but let's not add more.
>   

It's just a move of existing code. I tried to change it as little as
possible. Cleanups for that are welcome for later.

>   
>> +    long id;
>> +
>> +    snprintf(name, sizeof(name), "%s%s", devpath, idname);
>> +    f = fopen(name, "r");
>> +    if (f == NULL) {
>> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +        return -1;
>> +    }
>> +    if (fscanf(f, "%li\n", &id) == 1) {
>> +        *val = id;
>> +    }
>>     
>
> handle fscanf error?
>   

Interesting. I don't think it was done before, but I can put it in.

>   
>> +    fclose(f);
>> +
>> +    return 0;
>> +}
>> +
>> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
>> +{
>> +    return get_real_id(devpath, "vendor", val);
>> +}
>> +
>> +static int get_real_device_id(const char *devpath, uint16_t *val)
>> +{
>> +    return get_real_id(devpath, "device", val);
>> +}
>> +
>>  static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
>>                             uint8_t r_dev, uint8_t r_func)
>>  {
>>      char dir[128], name[128];
>> -    int fd, r = 0;
>> +    int fd, r = 0, v;
>>      FILE *f;
>>      unsigned long long start, end, size, flags;
>> -    unsigned long id;
>> +    uint16_t id;
>>      struct stat statbuf;
>>      PCIRegion *rp;
>>      PCIDevRegions *dev = &pci_dev->real_device;
>> @@ -637,31 +667,21 @@ again:
>>  
>>      fclose(f);
>>  
>> -    /* read and fill device ID */
>> -    snprintf(name, sizeof(name), "%svendor", dir);
>> -    f = fopen(name, "r");
>> -    if (f == NULL) {
>> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +    /* read and fill vendor ID */
>> +    v = get_real_vendor_id(dir, &id);
>> +    if (v) {
>>          return 1;
>>      }
>> -    if (fscanf(f, "%li\n", &id) == 1) {
>> -	pci_dev->dev.config[0] = id & 0xff;
>> -	pci_dev->dev.config[1] = (id & 0xff00) >> 8;
>> -    }
>> -    fclose(f);
>> +    pci_dev->dev.config[0] = id & 0xff;
>> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
>>  
>>     
>
> this seems an unrelated cleanup?
> If so better as a separate patch?
>   

It's the code move. I split it now.

>
>   
>> -    /* read and fill vendor ID */
>> -    snprintf(name, sizeof(name), "%sdevice", dir);
>> -    f = fopen(name, "r");
>> -    if (f == NULL) {
>> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +    /* read and fill device ID */
>> +    v = get_real_device_id(dir, &id);
>> +    if (v) {
>>          return 1;
>>      }
>> -    if (fscanf(f, "%li\n", &id) == 1) {
>> -	pci_dev->dev.config[2] = id & 0xff;
>> -	pci_dev->dev.config[3] = (id & 0xff00) >> 8;
>> -    }
>> -    fclose(f);
>> +    pci_dev->dev.config[2] = id & 0xff;
>> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
>>  
>>      /* dealing with virtual function device */
>>      snprintf(name, sizeof(name), "%sphysfn/", dir);
>> @@ -739,7 +759,9 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
>>  static int assign_device(AssignedDevice *dev)
>>  {
>>      struct kvm_assigned_pci_dev assigned_dev_data;
>> -    int r;
>> +    char name[128], dir[128], driver[128], *ns;
>>     
>
> Yes 128 will be enough for now. But it's pretty ugly.
> In this case, something like
>     char dir[] = "/sys/bus/pci/devices/0000:00:00.0/";
> will allocate just enough memory.
> Or use MAX PATH.
>   

Used MAX_PATH now.

>   
>> +    uint16_t vendor_id, device_id;
>> +    int r, v;
>>  
>>      memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
>>      assigned_dev_data.assigned_dev_id  =
>> @@ -761,9 +783,48 @@ static int assign_device(AssignedDevice *dev)
>>  #endif
>>  
>>      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
>> -    if (r < 0)
>> +    if (r < 0) {
>>     
>
>
> Please put all of the below in a separate function.
>   

Ok.

>   
>>  	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>>                  dev->dev.qdev.id, strerror(-r));
>> +
>> +        snprintf(dir, sizeof(dir),
>>     
>
> snprintf? So you worry about overflowing dir?
> But dir will not be 0 terminated on overflow,
> so use of %s below would crash anyway.
> As in fact we know this can not overflow, just use sprintf.
>   

Ok.

>   
>> +                 "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
>> +	         dev->host.bus, dev->host.dev, dev->host.func);
>>     
>
> This assumes domain 0. I know multidomain is
> broken with device assignment, but pls add
> TOIDO here so we don't forget to fix it.
>   

Ok.

>   
>> +
>> +        snprintf(name, sizeof(name), "%sdriver", dir);
>>     
>
> So why do sprintf twice? Just put "driver" as part
> of the template above.
>   

We're using dir later in the code.

>   
>> +
>> +        memset(driver, 0, sizeof(driver));
>>     
>
> just initialize driver to 0 by = {};
>
>   

That initializes it to 0? I mean, all elements?

>> +        v = readlink(name, driver, sizeof(driver));
>>     
>
> So if readlink fills up all of driver, strrchr
> below will cause coredump, right? Better check v against
> sizeof driver.
>   

Ok.

>   
>> +        if ((v <= 0) || !(ns = strrchr(driver, '/'))) {
>> +            return r;
>>     
>
> Add some fprintf here. Maybe report errno as well.
>   

Ok.

>   
>> +        }
>> +
>> +        ns++;
>> +
>> +        if (get_real_vendor_id(dir, &vendor_id) ||
>> +            get_real_device_id(dir, &device_id)) {
>> +            return r;
>>     
>
> And here.
>   

Yep.

>   
>> +        }
>> +
>> +        fprintf(stderr, "*** The driver '%s' is occupying your device "
>> +                        "%02x:%02x.%x.\n",
>> +                ns, dev->host.bus, dev->host.dev, dev->host.func);
>> +        fprintf(stderr, "***\n");
>> +        fprintf(stderr, "*** You can try the following commands to free "
>> +                        "it:\n");
>> +        fprintf(stderr, "***\n");
>> +        fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/"
>> +                        "pci-stub/new_id\n", vendor_id, device_id);
>> +        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
>> +                        "/drivers/%s /unbind\n",
>> +                dev->host.bus, dev->host.dev, dev->host.func, ns);
>> +        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
>> +                        "/drivers/ pci-stub/bind\n",
>> +                dev->host.bus, dev->host.dev, dev->host.func);
>> +        fprintf(stderr, "*** $ echo \"%x %x\" > /sys/bus/pci/drivers/pci-stub"
>> +                        "/remove_id\n", vendor_id, device_id);
>> +        fprintf(stderr, "***\n");
>>     
>
> above assumes domain zero. Please add a TODO to fix.
>   

Same as above, right? In fact, a lot of the code assumes that so it's
more of a generic TODO :-(.

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Inform users about busy device assignment attempt
  2009-12-15 15:18     ` Alexander Graf
@ 2009-12-15 15:18       ` Michael S. Tsirkin
  0 siblings, 0 replies; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 15:18 UTC (permalink / raw)
  To: Alexander Graf
  Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Daniel P. Berrange

On Tue, Dec 15, 2009 at 04:18:00PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Fri, Dec 11, 2009 at 12:06:26AM +0100, Alexander Graf wrote:
> >   
> >> When using -pcidevice on a device that is already in use by a kernel driver
> >> all the user gets is the following (very useful) information:
> >>
> >>   Failed to assign device "04:00.0" : Device or resource busy
> >>   Failed to deassign device "04:00.0" : Invalid argument
> >>   Error initializing device pci-assign
> >>
> >> Since I usually prefer to have my computer do the thinking for me, I figured
> >> it might be a good idea to check and see if a device is actually used by a
> >> driver. If so, tell the user.
> >>
> >> So with this patch applied you get the following output:
> >>
> >>   Failed to assign device "04:00.0" : Device or resource busy
> >>   *** The driver 'igb' is occupying your device 04:00.0.
> >>   ***
> >>   *** You can try the following commands to free it:
> >>   ***
> >>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
> >>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
> >>   *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
> >>   *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
> >>   ***
> >>   Failed to deassign device "04:00.0" : Invalid argument
> >>   Error initializing device pci-assign
> >>
> >> That should keep people like me from doing the most obvious misuses :-).
> >>
> >> CC: Daniel P. Berrange <berrange@redhat.com>
> >> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>     
> >
> > Minor nits and a bug.
> >
> >   
> >> ---
> >>
> >> v1 -> v2:
> >>
> >>   - add more helpful guidance thanks to Daniel Berrange
> >>
> >> v2 -> v3:
> >>
> >>   - clear name variable before using it, thus 0-terminating the string
> >>   - fix region numbers
> >>   - use correct unbind/bind names
> >> ---
> >>  hw/device-assignment.c |  109 +++++++++++++++++++++++++++++++++++++-----------
> >>  1 files changed, 85 insertions(+), 24 deletions(-)
> >>
> >> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> >> index 5cee929..98faa83 100644
> >> --- a/hw/device-assignment.c
> >> +++ b/hw/device-assignment.c
> >> @@ -564,14 +564,44 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>      return 0;
> >>  }
> >>  
> >> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
> >> +{
> >> +    FILE *f;
> >> +    char name[128];
> >>     
> >
> > let's not introduce arbitraty file name length limitations.
> > strlen is not hard to use. I know all this module is
> > broken this way, but let's not add more.
> >   
> 
> It's just a move of existing code. I tried to change it as little as
> possible. Cleanups for that are welcome for later.
> 
> >   
> >> +    long id;
> >> +
> >> +    snprintf(name, sizeof(name), "%s%s", devpath, idname);
> >> +    f = fopen(name, "r");
> >> +    if (f == NULL) {
> >> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> >> +        return -1;
> >> +    }
> >> +    if (fscanf(f, "%li\n", &id) == 1) {
> >> +        *val = id;
> >> +    }
> >>     
> >
> > handle fscanf error?
> >   
> 
> Interesting. I don't think it was done before, but I can put it in.
> 
> >   
> >> +    fclose(f);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
> >> +{
> >> +    return get_real_id(devpath, "vendor", val);
> >> +}
> >> +
> >> +static int get_real_device_id(const char *devpath, uint16_t *val)
> >> +{
> >> +    return get_real_id(devpath, "device", val);
> >> +}
> >> +
> >>  static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> >>                             uint8_t r_dev, uint8_t r_func)
> >>  {
> >>      char dir[128], name[128];
> >> -    int fd, r = 0;
> >> +    int fd, r = 0, v;
> >>      FILE *f;
> >>      unsigned long long start, end, size, flags;
> >> -    unsigned long id;
> >> +    uint16_t id;
> >>      struct stat statbuf;
> >>      PCIRegion *rp;
> >>      PCIDevRegions *dev = &pci_dev->real_device;
> >> @@ -637,31 +667,21 @@ again:
> >>  
> >>      fclose(f);
> >>  
> >> -    /* read and fill device ID */
> >> -    snprintf(name, sizeof(name), "%svendor", dir);
> >> -    f = fopen(name, "r");
> >> -    if (f == NULL) {
> >> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> >> +    /* read and fill vendor ID */
> >> +    v = get_real_vendor_id(dir, &id);
> >> +    if (v) {
> >>          return 1;
> >>      }
> >> -    if (fscanf(f, "%li\n", &id) == 1) {
> >> -	pci_dev->dev.config[0] = id & 0xff;
> >> -	pci_dev->dev.config[1] = (id & 0xff00) >> 8;
> >> -    }
> >> -    fclose(f);
> >> +    pci_dev->dev.config[0] = id & 0xff;
> >> +    pci_dev->dev.config[1] = (id & 0xff00) >> 8;
> >>  
> >>     
> >
> > this seems an unrelated cleanup?
> > If so better as a separate patch?
> >   
> 
> It's the code move. I split it now.
> 
> >
> >   
> >> -    /* read and fill vendor ID */
> >> -    snprintf(name, sizeof(name), "%sdevice", dir);
> >> -    f = fopen(name, "r");
> >> -    if (f == NULL) {
> >> -        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> >> +    /* read and fill device ID */
> >> +    v = get_real_device_id(dir, &id);
> >> +    if (v) {
> >>          return 1;
> >>      }
> >> -    if (fscanf(f, "%li\n", &id) == 1) {
> >> -	pci_dev->dev.config[2] = id & 0xff;
> >> -	pci_dev->dev.config[3] = (id & 0xff00) >> 8;
> >> -    }
> >> -    fclose(f);
> >> +    pci_dev->dev.config[2] = id & 0xff;
> >> +    pci_dev->dev.config[3] = (id & 0xff00) >> 8;
> >>  
> >>      /* dealing with virtual function device */
> >>      snprintf(name, sizeof(name), "%sphysfn/", dir);
> >> @@ -739,7 +759,9 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> >>  static int assign_device(AssignedDevice *dev)
> >>  {
> >>      struct kvm_assigned_pci_dev assigned_dev_data;
> >> -    int r;
> >> +    char name[128], dir[128], driver[128], *ns;
> >>     
> >
> > Yes 128 will be enough for now. But it's pretty ugly.
> > In this case, something like
> >     char dir[] = "/sys/bus/pci/devices/0000:00:00.0/";
> > will allocate just enough memory.
> > Or use MAX PATH.
> >   
> 
> Used MAX_PATH now.
> 
> >   
> >> +    uint16_t vendor_id, device_id;
> >> +    int r, v;
> >>  
> >>      memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> >>      assigned_dev_data.assigned_dev_id  =
> >> @@ -761,9 +783,48 @@ static int assign_device(AssignedDevice *dev)
> >>  #endif
> >>  
> >>      r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> >> -    if (r < 0)
> >> +    if (r < 0) {
> >>     
> >
> >
> > Please put all of the below in a separate function.
> >   
> 
> Ok.
> 
> >   
> >>  	fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> >>                  dev->dev.qdev.id, strerror(-r));
> >> +
> >> +        snprintf(dir, sizeof(dir),
> >>     
> >
> > snprintf? So you worry about overflowing dir?
> > But dir will not be 0 terminated on overflow,
> > so use of %s below would crash anyway.
> > As in fact we know this can not overflow, just use sprintf.
> >   
> 
> Ok.
> 
> >   
> >> +                 "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> >> +	         dev->host.bus, dev->host.dev, dev->host.func);
> >>     
> >
> > This assumes domain 0. I know multidomain is
> > broken with device assignment, but pls add
> > TOIDO here so we don't forget to fix it.
> >   
> 
> Ok.
> 
> >   
> >> +
> >> +        snprintf(name, sizeof(name), "%sdriver", dir);
> >>     
> >
> > So why do sprintf twice? Just put "driver" as part
> > of the template above.
> >   
> 
> We're using dir later in the code.
> 
> >   
> >> +
> >> +        memset(driver, 0, sizeof(driver));
> >>     
> >
> > just initialize driver to 0 by = {};
> >
> >   
> 
> That initializes it to 0? I mean, all elements?

Yes.

> >> +        v = readlink(name, driver, sizeof(driver));
> >>     
> >
> > So if readlink fills up all of driver, strrchr
> > below will cause coredump, right? Better check v against
> > sizeof driver.
> >   
> 
> Ok.
> 
> >   
> >> +        if ((v <= 0) || !(ns = strrchr(driver, '/'))) {
> >> +            return r;
> >>     
> >
> > Add some fprintf here. Maybe report errno as well.
> >   
> 
> Ok.
> 
> >   
> >> +        }
> >> +
> >> +        ns++;
> >> +
> >> +        if (get_real_vendor_id(dir, &vendor_id) ||
> >> +            get_real_device_id(dir, &device_id)) {
> >> +            return r;
> >>     
> >
> > And here.
> >   
> 
> Yep.
> 
> >   
> >> +        }
> >> +
> >> +        fprintf(stderr, "*** The driver '%s' is occupying your device "
> >> +                        "%02x:%02x.%x.\n",
> >> +                ns, dev->host.bus, dev->host.dev, dev->host.func);
> >> +        fprintf(stderr, "***\n");
> >> +        fprintf(stderr, "*** You can try the following commands to free "
> >> +                        "it:\n");
> >> +        fprintf(stderr, "***\n");
> >> +        fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/"
> >> +                        "pci-stub/new_id\n", vendor_id, device_id);
> >> +        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
> >> +                        "/drivers/%s /unbind\n",
> >> +                dev->host.bus, dev->host.dev, dev->host.func, ns);
> >> +        fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci"
> >> +                        "/drivers/ pci-stub/bind\n",
> >> +                dev->host.bus, dev->host.dev, dev->host.func);
> >> +        fprintf(stderr, "*** $ echo \"%x %x\" > /sys/bus/pci/drivers/pci-stub"
> >> +                        "/remove_id\n", vendor_id, device_id);
> >> +        fprintf(stderr, "***\n");
> >>     
> >
> > above assumes domain zero. Please add a TODO to fix.
> >   
> 
> Same as above, right? In fact, a lot of the code assumes that so it's
> more of a generic TODO :-(.
> 
> Alex


Yes it is, unfortunately :(


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-11 11:05 ` [PATCH] Enable non page boundary BAR device assignment Michael S. Tsirkin
@ 2009-12-15 18:16   ` Alexander Graf
  2009-12-15 18:20     ` Michael S. Tsirkin
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-15 18:16 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda

Michael S. Tsirkin wrote:
> On Fri, Dec 11, 2009 at 12:06:25AM +0100, Alexander Graf wrote:
>   
>> While trying to get device passthrough working with an emulex hba, kvm
>> refused to pass it through because it has a BAR of 256 bytes:
>>
>>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>         Region 4: I/O ports at b100 [size=256]
>>
>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
>> physical to virtual addresses, we can still take the old MMIO callback route.
>>
>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
>> by looping it through userspace.
>>
>> I verified that it works by passing through an e1000 with this additional patch
>> applied and the card acted the same way it did without this patch:
>>
>>              map_func = assigned_dev_iomem_map;
>> -            if (cur_region->size & 0xFFF) {
>> +            if (i != PCI_ROM_SLOT){
>>                  fprintf(stderr, "PCI region %d at address 0x%llx "
>>     
>
> The patch is pretty clean. However, I think I see a bug below.
> Did you also try a device with sub-page BAR?
> If yes, did the bar get non page aligned value?
>
>   
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>
>> ---
>>
>> v1 -> v2:
>>
>>   - don't use map_func function pointer
>>   - use the same code for mmap on fast and slow path
>> ---
>>  hw/device-assignment.c |  123 +++++++++++++++++++++++++++++++++++++++++++++---
>>  1 files changed, 116 insertions(+), 7 deletions(-)
>>
>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>> index 13a86bb..5cee929 100644
>> --- a/hw/device-assignment.c
>> +++ b/hw/device-assignment.c
>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>>      return value;
>>  }
>>  
>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);
>>     
>
> this is a void* pointer, no need to cast
>
>   
>> +    uint32_t r = -1;
>>     
>
> don't initialize r here as you override 1 line below.
>   
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>> +}
>> +
>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
>> +    uint32_t r = -1;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>>     
>
> same as above
>
>   
>> +}
>> +
>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
>> +    uint32_t r = -1;
>> +
>> +    r = *in;
>> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> +    return r;
>>     
>
> same as above
>
>   
>> +}
>> +
>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);
>>     
>
> no need for cast as r_virtbase is a void pointer.
>
>   
>> +
>> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
>> +
>> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> +    AssignedDevRegion *d = opaque;
>> +    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);
>>     
>
> same as above
>
>   
>> +
>> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>> +    *out = val;
>> +}
>> +
>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>> +    &slow_bar_writeb,
>> +    &slow_bar_writew,
>> +    &slow_bar_writel
>> +};
>> +
>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>> +    &slow_bar_readb,
>> +    &slow_bar_readw,
>> +    &slow_bar_readl
>> +};
>> +
>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
>> +                                        pcibus_t e_phys, pcibus_t e_size,
>> +                                        int type)
>> +{
>> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>> +    int m;
>> +
>> +    DEBUG("slow map\n");
>> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
>> +    cpu_register_physical_memory(e_phys, e_size, m);
>> +
>> +    /* MSI-X MMIO page */
>>     
>
> some code duplication  ... use a common function?
>   

I'm not sure. It's pretty little duplication. If you feel like this
should be merged, feel free to make a cleanup patch afterwards :-).

>   
>> +    if ((e_size > 0) &&
>> +        real_region->base_addr <= r_dev->msix_table_addr &&
>> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
>> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
>> +
>> +        cpu_register_physical_memory(e_phys + offset,
>> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
>>     
>
> How does this work? Does the last registered callback win
> or are both called for MSIX page?
>   

In qemu memory allocations the last one always wins.

>   
>> +    }
>> +}
>> +
>>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>>                                     pcibus_t e_phys, pcibus_t e_size, int type)
>>  {
>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>  
>>          /* handle memory io regions */
>>          if (cur_region->type & IORESOURCE_MEM) {
>> +            int slow_map = 0;
>>              int t = cur_region->type & IORESOURCE_PREFETCH
>>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
>>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
>> +
>>              if (cur_region->size & 0xFFF) {
>> -                fprintf(stderr, "Unable to assign device: PCI region %d "
>> -                        "at address 0x%llx has size 0x%x, "
>> -                        " which is not a multiple of 4K\n",
>> +                fprintf(stderr, "PCI region %d at address 0x%llx "
>> +                        "has size 0x%x, which is not a multiple of 4K. "
>> +                        "Using slow map\n",
>>     
>
> I still think "Using slow map" tells the user nothing.
> How about "Disabling direct guest access, device will be slow"?
>   

Changed.

>   
>>                          i, (unsigned long long)cur_region->base_addr,
>>                          cur_region->size);
>> +                slow_map = 1;
>> +            }
>> +
>> +            if (slow_map && (i == PCI_ROM_SLOT)) {
>> +                fprintf(stderr, "ROM not aligned - can't continue\n");
>>                  return -1;
>>              }
>>  
>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>              } else {
>>                  pci_dev->v_addrs[i].u.r_virtbase =
>>                      mmap(NULL,
>> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
>> +                         cur_region->size,
>>                           PROT_WRITE | PROT_READ, MAP_SHARED,
>>                           cur_region->resource_fd, (off_t) 0);
>>              }
>> @@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>              pci_dev->v_addrs[i].e_size = 0;
>>  
>>              /* add offset */
>> -            pci_dev->v_addrs[i].u.r_virtbase +=
>> -                (cur_region->base_addr & 0xFFF);
>> +            if (!slow_map) {
>> +                pci_dev->v_addrs[i].u.r_virtbase +=
>> +                    (cur_region->base_addr & 0xFFF);
>> +            }
>>     
>
> This looks wrong. I think mmap returns a page aligned address,
> that's why we did this offset math. No?
> And if not, there was no reason for this code in the first place.
>   

Yeah, I guess you're right.

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-15 18:16   ` Alexander Graf
@ 2009-12-15 18:20     ` Michael S. Tsirkin
  2009-12-15 18:24       ` Alexander Graf
  0 siblings, 1 reply; 34+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:20 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda

On Tue, Dec 15, 2009 at 07:16:13PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Fri, Dec 11, 2009 at 12:06:25AM +0100, Alexander Graf wrote:
> >   
> >> While trying to get device passthrough working with an emulex hba, kvm
> >> refused to pass it through because it has a BAR of 256 bytes:
> >>
> >>         Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >>         Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >>         Region 4: I/O ports at b100 [size=256]
> >>
> >> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> >> physical to virtual addresses, we can still take the old MMIO callback route.
> >>
> >> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> >> by looping it through userspace.
> >>
> >> I verified that it works by passing through an e1000 with this additional patch
> >> applied and the card acted the same way it did without this patch:
> >>
> >>              map_func = assigned_dev_iomem_map;
> >> -            if (cur_region->size & 0xFFF) {
> >> +            if (i != PCI_ROM_SLOT){
> >>                  fprintf(stderr, "PCI region %d at address 0x%llx "
> >>     
> >
> > The patch is pretty clean. However, I think I see a bug below.
> > Did you also try a device with sub-page BAR?
> > If yes, did the bar get non page aligned value?
> >
> >   
> >> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>
> >> ---
> >>
> >> v1 -> v2:
> >>
> >>   - don't use map_func function pointer
> >>   - use the same code for mmap on fast and slow path
> >> ---
> >>  hw/device-assignment.c |  123 +++++++++++++++++++++++++++++++++++++++++++++---
> >>  1 files changed, 116 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> >> index 13a86bb..5cee929 100644
> >> --- a/hw/device-assignment.c
> >> +++ b/hw/device-assignment.c
> >> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
> >>      return value;
> >>  }
> >>  
> >> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);
> >>     
> >
> > this is a void* pointer, no need to cast
> >
> >   
> >> +    uint32_t r = -1;
> >>     
> >
> > don't initialize r here as you override 1 line below.
> >   
> >> +
> >> +    r = *in;
> >> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> +    return r;
> >> +}
> >> +
> >> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
> >> +    uint32_t r = -1;
> >> +
> >> +    r = *in;
> >> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> +    return r;
> >>     
> >
> > same as above
> >
> >   
> >> +}
> >> +
> >> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
> >> +    uint32_t r = -1;
> >> +
> >> +    r = *in;
> >> +    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> +    return r;
> >>     
> >
> > same as above
> >
> >   
> >> +}
> >> +
> >> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);
> >>     
> >
> > no need for cast as r_virtbase is a void pointer.
> >
> >   
> >> +
> >> +    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> >> +    *out = val;
> >> +}
> >> +
> >> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
> >> +
> >> +    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> >> +    *out = val;
> >> +}
> >> +
> >> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> +    AssignedDevRegion *d = opaque;
> >> +    uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);
> >>     
> >
> > same as above
> >
> >   
> >> +
> >> +    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> >> +    *out = val;
> >> +}
> >> +
> >> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> >> +    &slow_bar_writeb,
> >> +    &slow_bar_writew,
> >> +    &slow_bar_writel
> >> +};
> >> +
> >> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> >> +    &slow_bar_readb,
> >> +    &slow_bar_readw,
> >> +    &slow_bar_readl
> >> +};
> >> +
> >> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> >> +                                        pcibus_t e_phys, pcibus_t e_size,
> >> +                                        int type)
> >> +{
> >> +    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> >> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> >> +    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> >> +    int m;
> >> +
> >> +    DEBUG("slow map\n");
> >> +    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> >> +    cpu_register_physical_memory(e_phys, e_size, m);
> >> +
> >> +    /* MSI-X MMIO page */
> >>     
> >
> > some code duplication  ... use a common function?
> >   
> 
> I'm not sure. It's pretty little duplication. If you feel like this
> should be merged, feel free to make a cleanup patch afterwards :-).
> 
> >   
> >> +    if ((e_size > 0) &&
> >> +        real_region->base_addr <= r_dev->msix_table_addr &&
> >> +        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> >> +        int offset = r_dev->msix_table_addr - real_region->base_addr;
> >> +
> >> +        cpu_register_physical_memory(e_phys + offset,
> >> +                TARGET_PAGE_SIZE, r_dev->mmio_index);
> >>     
> >
> > How does this work? Does the last registered callback win
> > or are both called for MSIX page?
> >   
> 
> In qemu memory allocations the last one always wins.
> 
> >   
> >> +    }
> >> +}
> >> +
> >>  static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> >>                                     pcibus_t e_phys, pcibus_t e_size, int type)
> >>  {
> >> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>  
> >>          /* handle memory io regions */
> >>          if (cur_region->type & IORESOURCE_MEM) {
> >> +            int slow_map = 0;
> >>              int t = cur_region->type & IORESOURCE_PREFETCH
> >>                  ? PCI_BASE_ADDRESS_MEM_PREFETCH
> >>                  : PCI_BASE_ADDRESS_SPACE_MEMORY;
> >> +
> >>              if (cur_region->size & 0xFFF) {
> >> -                fprintf(stderr, "Unable to assign device: PCI region %d "
> >> -                        "at address 0x%llx has size 0x%x, "
> >> -                        " which is not a multiple of 4K\n",
> >> +                fprintf(stderr, "PCI region %d at address 0x%llx "
> >> +                        "has size 0x%x, which is not a multiple of 4K. "
> >> +                        "Using slow map\n",
> >>     
> >
> > I still think "Using slow map" tells the user nothing.
> > How about "Disabling direct guest access, device will be slow"?
> >   
> 
> Changed.
> 
> >   
> >>                          i, (unsigned long long)cur_region->base_addr,
> >>                          cur_region->size);
> >> +                slow_map = 1;
> >> +            }
> >> +
> >> +            if (slow_map && (i == PCI_ROM_SLOT)) {
> >> +                fprintf(stderr, "ROM not aligned - can't continue\n");
> >>                  return -1;
> >>              }
> >>  
> >> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>              } else {
> >>                  pci_dev->v_addrs[i].u.r_virtbase =
> >>                      mmap(NULL,
> >> -                         (cur_region->size + 0xFFF) & 0xFFFFF000,
> >> +                         cur_region->size,
> >>                           PROT_WRITE | PROT_READ, MAP_SHARED,
> >>                           cur_region->resource_fd, (off_t) 0);
> >>              }
> >> @@ -429,12 +535,15 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>              pci_dev->v_addrs[i].e_size = 0;
> >>  
> >>              /* add offset */
> >> -            pci_dev->v_addrs[i].u.r_virtbase +=
> >> -                (cur_region->base_addr & 0xFFF);
> >> +            if (!slow_map) {
> >> +                pci_dev->v_addrs[i].u.r_virtbase +=
> >> +                    (cur_region->base_addr & 0xFFF);
> >> +            }
> >>     
> >
> > This looks wrong. I think mmap returns a page aligned address,
> > that's why we did this offset math. No?
> > And if not, there was no reason for this code in the first place.
> >   
> 
> Yeah, I guess you're right.
> 
> Alex

I guess this means you'll have to find a device with a sub-page BAR
to test this on, instead of hacking driver for a device with full
page BAR. Did anyone ever try doing passthrough on an emulated
device in nested virt?

-- 
MST

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-15 18:20     ` Michael S. Tsirkin
@ 2009-12-15 18:24       ` Alexander Graf
  2009-12-16 20:12         ` Muli Ben-Yehuda
  0 siblings, 1 reply; 34+ messages in thread
From: Alexander Graf @ 2009-12-15 18:24 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda

Michael S. Tsirkin wrote:
> I guess this means you'll have to find a device with a sub-page BAR
> to test this on, instead of hacking driver for a device with full
> page BAR. Did anyone ever try doing passthrough on an emulated
> device in nested virt?
>   


We don't emulate an IOMMU, so no.

Alex

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] Enable non page boundary BAR device assignment
  2009-12-15 18:24       ` Alexander Graf
@ 2009-12-16 20:12         ` Muli Ben-Yehuda
  0 siblings, 0 replies; 34+ messages in thread
From: Muli Ben-Yehuda @ 2009-12-16 20:12 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Michael S. Tsirkin, kvm list, Gleb Natapov

On Tue, Dec 15, 2009 at 07:24:47PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
>
> > I guess this means you'll have to find a device with a sub-page
> > BAR to test this on, instead of hacking driver for a device with
> > full page BAR. Did anyone ever try doing passthrough on an
> > emulated device in nested virt?
> 
> We don't emulate an IOMMU, so no.

We have in-house patches to emulate VT-d, which work sufficiently well
to run unmodified Linux drivers for QEMU's emulation of e1000 and LSI
adapter, using unmodified Linux VT-d DMA-mapping code. The patches
need some lovin', but they'll be coming eventually.

Cheers,
Muli
-- 
Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
Manager, Virtualization and Systems Architecture
Master Inventor, IBM Research -- Haifa
Second Workshop on I/O Virtualization (WIOV '10):
http://sysrun.haifa.il.ibm.com/hrl/wiov2010/

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2009-12-16 20:39 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-12-10 23:06 [PATCH] Enable non page boundary BAR device assignment Alexander Graf
2009-12-10 23:06 ` [PATCH] Inform users about busy device assignment attempt Alexander Graf
2009-12-11 11:38   ` Michael S. Tsirkin
2009-12-15 15:18     ` Alexander Graf
2009-12-15 15:18       ` Michael S. Tsirkin
2009-12-11 11:05 ` [PATCH] Enable non page boundary BAR device assignment Michael S. Tsirkin
2009-12-15 18:16   ` Alexander Graf
2009-12-15 18:20     ` Michael S. Tsirkin
2009-12-15 18:24       ` Alexander Graf
2009-12-16 20:12         ` Muli Ben-Yehuda
  -- strict thread matches above, loose matches on Subject: below --
2009-12-09 17:38 Alexander Graf
2009-12-09 20:49 ` Michael S. Tsirkin
2009-12-09 21:06   ` Alexander Graf
2009-12-10 10:35     ` Avi Kivity
2009-12-10  5:16 ` Muli Ben-Yehuda
2009-12-10  9:35   ` Alexander Graf
2009-12-10 10:21     ` Muli Ben-Yehuda
2009-12-10  9:43   ` Michael S. Tsirkin
2009-12-10  9:52     ` Alexander Graf
2009-12-10 10:08       ` Alexander Graf
2009-12-10 10:27         ` Michael S. Tsirkin
2009-12-10 10:31           ` Alexander Graf
2009-12-10 10:42             ` Michael S. Tsirkin
2009-12-10 10:23       ` Muli Ben-Yehuda
2009-12-10 10:31         ` Alexander Graf
2009-12-10 10:37           ` Muli Ben-Yehuda
2009-12-10 10:56             ` Michael S. Tsirkin
2009-12-10 11:09               ` Alexander Graf
2009-12-10 11:21                 ` Michael S. Tsirkin
2009-12-10 12:12                   ` Gleb Natapov
2009-12-10 11:28               ` Muli Ben-Yehuda
2009-12-10 11:34                 ` Alexander Graf
2009-12-10 11:46                   ` Michael S. Tsirkin
2009-12-10 11:37                 ` Michael S. Tsirkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).