* [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 18:30 [PATCH 0/3] Device Assignment fixes Alexander Graf
@ 2009-12-15 18:30 ` Alexander Graf
2009-12-15 18:38 ` Michael S. Tsirkin
2009-12-15 18:30 ` [PATCH 2/3] Split off sysfs id retrieval Alexander Graf
2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
2 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:30 UTC (permalink / raw)
To: kvm list; +Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright
While trying to get device passthrough working with an emulex hba, kvm
refused to pass it through because it has a BAR of 256 bytes:
Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
Region 4: I/O ports at b100 [size=256]
Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
physical to virtual addresses, we can still take the old MMIO callback route.
So let's add a second code path that allows for size & 0xFFF != 0 sized regions
by looping it through userspace.
I verified that it works by passing through an e1000 with this additional patch
applied and the card acted the same way it did without this patch:
map_func = assigned_dev_iomem_map;
- if (cur_region->size & 0xFFF) {
+ if (i != PCI_ROM_SLOT){
fprintf(stderr, "PCI region %d at address 0x%llx "
Signed-off-by: Alexander Graf <agraf@suse.de>
---
v1 -> v2:
- don't use map_func function pointer
- use the same code for mmap on fast and slow path
v2 -> v3:
- address mst's comments
---
hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 112 insertions(+), 5 deletions(-)
diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 13a86bb..000fa61 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
return value;
}
+static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
+{
+ AssignedDevRegion *d = opaque;
+ uint8_t *in = d->u.r_virtbase + addr;
+ uint32_t r;
+
+ r = *in;
+ DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+ return r;
+}
+
+static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
+{
+ AssignedDevRegion *d = opaque;
+ uint16_t *in = d->u.r_virtbase + addr;
+ uint32_t r;
+
+ r = *in;
+ DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+ return r;
+}
+
+static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
+{
+ AssignedDevRegion *d = opaque;
+ uint32_t *in = d->u.r_virtbase + addr;
+ uint32_t r;
+
+ r = *in;
+ DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
+
+ return r;
+}
+
+static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+ AssignedDevRegion *d = opaque;
+ uint8_t *out = d->u.r_virtbase + addr;
+
+ DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
+ *out = val;
+}
+
+static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+ AssignedDevRegion *d = opaque;
+ uint16_t *out = d->u.r_virtbase + addr;
+
+ DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
+ *out = val;
+}
+
+static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
+{
+ AssignedDevRegion *d = opaque;
+ uint32_t *out = d->u.r_virtbase + addr;
+
+ DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
+ *out = val;
+}
+
+static CPUWriteMemoryFunc * const slow_bar_write[] = {
+ &slow_bar_writeb,
+ &slow_bar_writew,
+ &slow_bar_writel
+};
+
+static CPUReadMemoryFunc * const slow_bar_read[] = {
+ &slow_bar_readb,
+ &slow_bar_readw,
+ &slow_bar_readl
+};
+
+static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
+ pcibus_t e_phys, pcibus_t e_size,
+ int type)
+{
+ AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
+ AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+ PCIRegion *real_region = &r_dev->real_device.regions[region_num];
+ int m;
+
+ DEBUG("slow map\n");
+ m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
+ cpu_register_physical_memory(e_phys, e_size, m);
+
+ /* MSI-X MMIO page */
+ if ((e_size > 0) &&
+ real_region->base_addr <= r_dev->msix_table_addr &&
+ real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
+ int offset = r_dev->msix_table_addr - real_region->base_addr;
+
+ cpu_register_physical_memory(e_phys + offset,
+ TARGET_PAGE_SIZE, r_dev->mmio_index);
+ }
+}
+
static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
pcibus_t e_phys, pcibus_t e_size, int type)
{
@@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
/* handle memory io regions */
if (cur_region->type & IORESOURCE_MEM) {
+ int slow_map = 0;
int t = cur_region->type & IORESOURCE_PREFETCH
? PCI_BASE_ADDRESS_MEM_PREFETCH
: PCI_BASE_ADDRESS_SPACE_MEMORY;
+
if (cur_region->size & 0xFFF) {
- fprintf(stderr, "Unable to assign device: PCI region %d "
- "at address 0x%llx has size 0x%x, "
- " which is not a multiple of 4K\n",
+ fprintf(stderr, "PCI region %d at address 0x%llx "
+ "has size 0x%x, which is not a multiple of 4K. "
+ "You might experience some performance hit due to that.\n",
i, (unsigned long long)cur_region->base_addr,
cur_region->size);
+ slow_map = 1;
+ }
+
+ if (slow_map && (i == PCI_ROM_SLOT)) {
+ fprintf(stderr, "ROM not aligned - can't continue\n");
return -1;
}
@@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
} else {
pci_dev->v_addrs[i].u.r_virtbase =
mmap(NULL,
- (cur_region->size + 0xFFF) & 0xFFFFF000,
+ cur_region->size,
PROT_WRITE | PROT_READ, MAP_SHARED,
cur_region->resource_fd, (off_t) 0);
}
@@ -434,7 +540,8 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
pci_register_bar((PCIDevice *) pci_dev, i,
cur_region->size, t,
- assigned_dev_iomem_map);
+ slow_map ? assigned_dev_iomem_map_slow
+ : assigned_dev_iomem_map);
continue;
}
/* handle port io regions */
--
1.6.0.2
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
@ 2009-12-15 18:38 ` Michael S. Tsirkin
2009-12-15 18:47 ` Alexander Graf
0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:38 UTC (permalink / raw)
To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
> While trying to get device passthrough working with an emulex hba, kvm
> refused to pass it through because it has a BAR of 256 bytes:
>
> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> Region 4: I/O ports at b100 [size=256]
>
> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> physical to virtual addresses, we can still take the old MMIO callback route.
>
> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> by looping it through userspace.
>
> I verified that it works by passing through an e1000 with this additional patch
> applied and the card acted the same way it did without this patch:
>
> map_func = assigned_dev_iomem_map;
> - if (cur_region->size & 0xFFF) {
> + if (i != PCI_ROM_SLOT){
> fprintf(stderr, "PCI region %d at address 0x%llx "
>
> Signed-off-by: Alexander Graf <agraf@suse.de>
Looks good.
1 question:
> ---
>
> v1 -> v2:
>
> - don't use map_func function pointer
> - use the same code for mmap on fast and slow path
>
> v2 -> v3:
>
> - address mst's comments
> ---
> hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++--
> 1 files changed, 112 insertions(+), 5 deletions(-)
>
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 13a86bb..000fa61 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
> return value;
> }
>
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> + AssignedDevRegion *d = opaque;
> + uint8_t *in = d->u.r_virtbase + addr;
> + uint32_t r;
> +
> + r = *in;
> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> + return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> + AssignedDevRegion *d = opaque;
> + uint16_t *in = d->u.r_virtbase + addr;
> + uint32_t r;
> +
> + r = *in;
> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> + return r;
> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> + AssignedDevRegion *d = opaque;
> + uint32_t *in = d->u.r_virtbase + addr;
> + uint32_t r;
> +
> + r = *in;
> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> + return r;
> +}
> +
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> + AssignedDevRegion *d = opaque;
> + uint8_t *out = d->u.r_virtbase + addr;
> +
> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> + *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> + AssignedDevRegion *d = opaque;
> + uint16_t *out = d->u.r_virtbase + addr;
> +
> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> + *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> + AssignedDevRegion *d = opaque;
> + uint32_t *out = d->u.r_virtbase + addr;
> +
> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> + *out = val;
> +}
> +
> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> + &slow_bar_writeb,
> + &slow_bar_writew,
> + &slow_bar_writel
> +};
> +
> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> + &slow_bar_readb,
> + &slow_bar_readw,
> + &slow_bar_readl
> +};
> +
> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> + pcibus_t e_phys, pcibus_t e_size,
> + int type)
> +{
> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> + PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> + int m;
> +
> + DEBUG("slow map\n");
> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> + cpu_register_physical_memory(e_phys, e_size, m);
> +
> + /* MSI-X MMIO page */
> + if ((e_size > 0) &&
> + real_region->base_addr <= r_dev->msix_table_addr &&
> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> + int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> + cpu_register_physical_memory(e_phys + offset,
> + TARGET_PAGE_SIZE, r_dev->mmio_index);
> + }
> +}
> +
> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> pcibus_t e_phys, pcibus_t e_size, int type)
> {
> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>
> /* handle memory io regions */
> if (cur_region->type & IORESOURCE_MEM) {
> + int slow_map = 0;
> int t = cur_region->type & IORESOURCE_PREFETCH
> ? PCI_BASE_ADDRESS_MEM_PREFETCH
> : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
> if (cur_region->size & 0xFFF) {
> - fprintf(stderr, "Unable to assign device: PCI region %d "
> - "at address 0x%llx has size 0x%x, "
> - " which is not a multiple of 4K\n",
> + fprintf(stderr, "PCI region %d at address 0x%llx "
> + "has size 0x%x, which is not a multiple of 4K. "
> + "You might experience some performance hit due to that.\n",
> i, (unsigned long long)cur_region->base_addr,
> cur_region->size);
> + slow_map = 1;
> + }
> +
> + if (slow_map && (i == PCI_ROM_SLOT)) {
> + fprintf(stderr, "ROM not aligned - can't continue\n");
> return -1;
Why is this? Can't ROM be handled in the same way?
I think it was previously ...
> }
>
> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> } else {
> pci_dev->v_addrs[i].u.r_virtbase =
> mmap(NULL,
> - (cur_region->size + 0xFFF) & 0xFFFFF000,
> + cur_region->size,
Hmm, one assumes this code did work at some point ...
do you know why was it done this way?
> PROT_WRITE | PROT_READ, MAP_SHARED,
> cur_region->resource_fd, (off_t) 0);
> }
> @@ -434,7 +540,8 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>
> pci_register_bar((PCIDevice *) pci_dev, i,
> cur_region->size, t,
> - assigned_dev_iomem_map);
> + slow_map ? assigned_dev_iomem_map_slow
> + : assigned_dev_iomem_map);
> continue;
> }
> /* handle port io regions */
> --
> 1.6.0.2
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 18:38 ` Michael S. Tsirkin
@ 2009-12-15 18:47 ` Alexander Graf
2009-12-15 18:50 ` Michael S. Tsirkin
0 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:47 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
>
>> While trying to get device passthrough working with an emulex hba, kvm
>> refused to pass it through because it has a BAR of 256 bytes:
>>
>> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>> Region 4: I/O ports at b100 [size=256]
>>
>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
>> physical to virtual addresses, we can still take the old MMIO callback route.
>>
>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
>> by looping it through userspace.
>>
>> I verified that it works by passing through an e1000 with this additional patch
>> applied and the card acted the same way it did without this patch:
>>
>> map_func = assigned_dev_iomem_map;
>> - if (cur_region->size & 0xFFF) {
>> + if (i != PCI_ROM_SLOT){
>> fprintf(stderr, "PCI region %d at address 0x%llx "
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>
>
>
> Looks good.
> 1 question:
>
>
>> ---
>>
>> v1 -> v2:
>>
>> - don't use map_func function pointer
>> - use the same code for mmap on fast and slow path
>>
>> v2 -> v3:
>>
>> - address mst's comments
>> ---
>> hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++--
>> 1 files changed, 112 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>> index 13a86bb..000fa61 100644
>> --- a/hw/device-assignment.c
>> +++ b/hw/device-assignment.c
>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>> return value;
>> }
>>
>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>> +{
>> + AssignedDevRegion *d = opaque;
>> + uint8_t *in = d->u.r_virtbase + addr;
>> + uint32_t r;
>> +
>> + r = *in;
>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> + return r;
>> +}
>> +
>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>> +{
>> + AssignedDevRegion *d = opaque;
>> + uint16_t *in = d->u.r_virtbase + addr;
>> + uint32_t r;
>> +
>> + r = *in;
>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> + return r;
>> +}
>> +
>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>> +{
>> + AssignedDevRegion *d = opaque;
>> + uint32_t *in = d->u.r_virtbase + addr;
>> + uint32_t r;
>> +
>> + r = *in;
>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>> +
>> + return r;
>> +}
>> +
>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> + AssignedDevRegion *d = opaque;
>> + uint8_t *out = d->u.r_virtbase + addr;
>> +
>> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>> + *out = val;
>> +}
>> +
>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> + AssignedDevRegion *d = opaque;
>> + uint16_t *out = d->u.r_virtbase + addr;
>> +
>> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>> + *out = val;
>> +}
>> +
>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>> +{
>> + AssignedDevRegion *d = opaque;
>> + uint32_t *out = d->u.r_virtbase + addr;
>> +
>> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>> + *out = val;
>> +}
>> +
>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>> + &slow_bar_writeb,
>> + &slow_bar_writew,
>> + &slow_bar_writel
>> +};
>> +
>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>> + &slow_bar_readb,
>> + &slow_bar_readw,
>> + &slow_bar_readl
>> +};
>> +
>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
>> + pcibus_t e_phys, pcibus_t e_size,
>> + int type)
>> +{
>> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
>> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> + PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>> + int m;
>> +
>> + DEBUG("slow map\n");
>> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
>> + cpu_register_physical_memory(e_phys, e_size, m);
>> +
>> + /* MSI-X MMIO page */
>> + if ((e_size > 0) &&
>> + real_region->base_addr <= r_dev->msix_table_addr &&
>> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
>> + int offset = r_dev->msix_table_addr - real_region->base_addr;
>> +
>> + cpu_register_physical_memory(e_phys + offset,
>> + TARGET_PAGE_SIZE, r_dev->mmio_index);
>> + }
>> +}
>> +
>> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>> pcibus_t e_phys, pcibus_t e_size, int type)
>> {
>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>
>> /* handle memory io regions */
>> if (cur_region->type & IORESOURCE_MEM) {
>> + int slow_map = 0;
>> int t = cur_region->type & IORESOURCE_PREFETCH
>> ? PCI_BASE_ADDRESS_MEM_PREFETCH
>> : PCI_BASE_ADDRESS_SPACE_MEMORY;
>> +
>> if (cur_region->size & 0xFFF) {
>> - fprintf(stderr, "Unable to assign device: PCI region %d "
>> - "at address 0x%llx has size 0x%x, "
>> - " which is not a multiple of 4K\n",
>> + fprintf(stderr, "PCI region %d at address 0x%llx "
>> + "has size 0x%x, which is not a multiple of 4K. "
>> + "You might experience some performance hit due to that.\n",
>> i, (unsigned long long)cur_region->base_addr,
>> cur_region->size);
>> + slow_map = 1;
>> + }
>> +
>> + if (slow_map && (i == PCI_ROM_SLOT)) {
>> + fprintf(stderr, "ROM not aligned - can't continue\n");
>> return -1;
>>
>
> Why is this? Can't ROM be handled in the same way?
> I think it was previously ...
>
You can't execute code from MMIO regions, so slow_map doesn't work there.
>
>> }
>>
>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>> } else {
>> pci_dev->v_addrs[i].u.r_virtbase =
>> mmap(NULL,
>> - (cur_region->size + 0xFFF) & 0xFFFFF000,
>> + cur_region->size,
>>
>
> Hmm, one assumes this code did work at some point ...
> do you know why was it done this way?
>
Nope :-).
Alex
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 18:47 ` Alexander Graf
@ 2009-12-15 18:50 ` Michael S. Tsirkin
2009-12-15 19:04 ` Alexander Graf
0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:50 UTC (permalink / raw)
To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
> >
> >> While trying to get device passthrough working with an emulex hba, kvm
> >> refused to pass it through because it has a BAR of 256 bytes:
> >>
> >> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >> Region 4: I/O ports at b100 [size=256]
> >>
> >> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> >> physical to virtual addresses, we can still take the old MMIO callback route.
> >>
> >> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> >> by looping it through userspace.
> >>
> >> I verified that it works by passing through an e1000 with this additional patch
> >> applied and the card acted the same way it did without this patch:
> >>
> >> map_func = assigned_dev_iomem_map;
> >> - if (cur_region->size & 0xFFF) {
> >> + if (i != PCI_ROM_SLOT){
> >> fprintf(stderr, "PCI region %d at address 0x%llx "
> >>
> >> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>
> >
> >
> > Looks good.
> > 1 question:
> >
> >
> >> ---
> >>
> >> v1 -> v2:
> >>
> >> - don't use map_func function pointer
> >> - use the same code for mmap on fast and slow path
> >>
> >> v2 -> v3:
> >>
> >> - address mst's comments
> >> ---
> >> hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++--
> >> 1 files changed, 112 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> >> index 13a86bb..000fa61 100644
> >> --- a/hw/device-assignment.c
> >> +++ b/hw/device-assignment.c
> >> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
> >> return value;
> >> }
> >>
> >> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> >> +{
> >> + AssignedDevRegion *d = opaque;
> >> + uint8_t *in = d->u.r_virtbase + addr;
> >> + uint32_t r;
> >> +
> >> + r = *in;
> >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> + return r;
> >> +}
> >> +
> >> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> >> +{
> >> + AssignedDevRegion *d = opaque;
> >> + uint16_t *in = d->u.r_virtbase + addr;
> >> + uint32_t r;
> >> +
> >> + r = *in;
> >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> + return r;
> >> +}
> >> +
> >> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> >> +{
> >> + AssignedDevRegion *d = opaque;
> >> + uint32_t *in = d->u.r_virtbase + addr;
> >> + uint32_t r;
> >> +
> >> + r = *in;
> >> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >> +
> >> + return r;
> >> +}
> >> +
> >> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> + AssignedDevRegion *d = opaque;
> >> + uint8_t *out = d->u.r_virtbase + addr;
> >> +
> >> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> >> + *out = val;
> >> +}
> >> +
> >> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> + AssignedDevRegion *d = opaque;
> >> + uint16_t *out = d->u.r_virtbase + addr;
> >> +
> >> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> >> + *out = val;
> >> +}
> >> +
> >> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> >> +{
> >> + AssignedDevRegion *d = opaque;
> >> + uint32_t *out = d->u.r_virtbase + addr;
> >> +
> >> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> >> + *out = val;
> >> +}
> >> +
> >> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> >> + &slow_bar_writeb,
> >> + &slow_bar_writew,
> >> + &slow_bar_writel
> >> +};
> >> +
> >> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> >> + &slow_bar_readb,
> >> + &slow_bar_readw,
> >> + &slow_bar_readl
> >> +};
> >> +
> >> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> >> + pcibus_t e_phys, pcibus_t e_size,
> >> + int type)
> >> +{
> >> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> >> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> >> + PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> >> + int m;
> >> +
> >> + DEBUG("slow map\n");
> >> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> >> + cpu_register_physical_memory(e_phys, e_size, m);
> >> +
> >> + /* MSI-X MMIO page */
> >> + if ((e_size > 0) &&
> >> + real_region->base_addr <= r_dev->msix_table_addr &&
> >> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> >> + int offset = r_dev->msix_table_addr - real_region->base_addr;
> >> +
> >> + cpu_register_physical_memory(e_phys + offset,
> >> + TARGET_PAGE_SIZE, r_dev->mmio_index);
> >> + }
> >> +}
> >> +
> >> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> >> pcibus_t e_phys, pcibus_t e_size, int type)
> >> {
> >> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>
> >> /* handle memory io regions */
> >> if (cur_region->type & IORESOURCE_MEM) {
> >> + int slow_map = 0;
> >> int t = cur_region->type & IORESOURCE_PREFETCH
> >> ? PCI_BASE_ADDRESS_MEM_PREFETCH
> >> : PCI_BASE_ADDRESS_SPACE_MEMORY;
> >> +
> >> if (cur_region->size & 0xFFF) {
> >> - fprintf(stderr, "Unable to assign device: PCI region %d "
> >> - "at address 0x%llx has size 0x%x, "
> >> - " which is not a multiple of 4K\n",
> >> + fprintf(stderr, "PCI region %d at address 0x%llx "
> >> + "has size 0x%x, which is not a multiple of 4K. "
> >> + "You might experience some performance hit due to that.\n",
> >> i, (unsigned long long)cur_region->base_addr,
> >> cur_region->size);
> >> + slow_map = 1;
> >> + }
> >> +
> >> + if (slow_map && (i == PCI_ROM_SLOT)) {
> >> + fprintf(stderr, "ROM not aligned - can't continue\n");
> >> return -1;
> >>
> >
> > Why is this? Can't ROM be handled in the same way?
> > I think it was previously ...
> >
>
> You can't execute code from MMIO regions, so slow_map doesn't work there.
Hmm, how does it work with passthrough?
Anyway, I don't think a lot of users run ROM
code directly, they just shadow it and run from there.
> >
> >> }
> >>
> >> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >> } else {
> >> pci_dev->v_addrs[i].u.r_virtbase =
> >> mmap(NULL,
> >> - (cur_region->size + 0xFFF) & 0xFFFFF000,
> >> + cur_region->size,
> >>
> >
> > Hmm, one assumes this code did work at some point ...
> > do you know why was it done this way?
> >
>
> Nope :-).
So maybe keep it that way or make the change in a separate patch.
> Alex
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 18:50 ` Michael S. Tsirkin
@ 2009-12-15 19:04 ` Alexander Graf
2009-12-15 19:50 ` Chris Wright
2009-12-15 21:04 ` Michael S. Tsirkin
0 siblings, 2 replies; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 19:04 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
>
>> Michael S. Tsirkin wrote:
>>
>>> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
>>>
>>>
>>>> While trying to get device passthrough working with an emulex hba, kvm
>>>> refused to pass it through because it has a BAR of 256 bytes:
>>>>
>>>> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>>> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>>> Region 4: I/O ports at b100 [size=256]
>>>>
>>>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
>>>> physical to virtual addresses, we can still take the old MMIO callback route.
>>>>
>>>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
>>>> by looping it through userspace.
>>>>
>>>> I verified that it works by passing through an e1000 with this additional patch
>>>> applied and the card acted the same way it did without this patch:
>>>>
>>>> map_func = assigned_dev_iomem_map;
>>>> - if (cur_region->size & 0xFFF) {
>>>> + if (i != PCI_ROM_SLOT){
>>>> fprintf(stderr, "PCI region %d at address 0x%llx "
>>>>
>>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>>>
>>>>
>>> Looks good.
>>> 1 question:
>>>
>>>
>>>
>>>> ---
>>>>
>>>> v1 -> v2:
>>>>
>>>> - don't use map_func function pointer
>>>> - use the same code for mmap on fast and slow path
>>>>
>>>> v2 -> v3:
>>>>
>>>> - address mst's comments
>>>> ---
>>>> hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++--
>>>> 1 files changed, 112 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>>>> index 13a86bb..000fa61 100644
>>>> --- a/hw/device-assignment.c
>>>> +++ b/hw/device-assignment.c
>>>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>>>> return value;
>>>> }
>>>>
>>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>> +{
>>>> + AssignedDevRegion *d = opaque;
>>>> + uint8_t *in = d->u.r_virtbase + addr;
>>>> + uint32_t r;
>>>> +
>>>> + r = *in;
>>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>> +
>>>> + return r;
>>>> +}
>>>> +
>>>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>>>> +{
>>>> + AssignedDevRegion *d = opaque;
>>>> + uint16_t *in = d->u.r_virtbase + addr;
>>>> + uint32_t r;
>>>> +
>>>> + r = *in;
>>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>> +
>>>> + return r;
>>>> +}
>>>> +
>>>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>>>> +{
>>>> + AssignedDevRegion *d = opaque;
>>>> + uint32_t *in = d->u.r_virtbase + addr;
>>>> + uint32_t r;
>>>> +
>>>> + r = *in;
>>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>> +
>>>> + return r;
>>>> +}
>>>> +
>>>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>> +{
>>>> + AssignedDevRegion *d = opaque;
>>>> + uint8_t *out = d->u.r_virtbase + addr;
>>>> +
>>>> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>>>> + *out = val;
>>>> +}
>>>> +
>>>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>> +{
>>>> + AssignedDevRegion *d = opaque;
>>>> + uint16_t *out = d->u.r_virtbase + addr;
>>>> +
>>>> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>>>> + *out = val;
>>>> +}
>>>> +
>>>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>> +{
>>>> + AssignedDevRegion *d = opaque;
>>>> + uint32_t *out = d->u.r_virtbase + addr;
>>>> +
>>>> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>>>> + *out = val;
>>>> +}
>>>> +
>>>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>>>> + &slow_bar_writeb,
>>>> + &slow_bar_writew,
>>>> + &slow_bar_writel
>>>> +};
>>>> +
>>>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>>>> + &slow_bar_readb,
>>>> + &slow_bar_readw,
>>>> + &slow_bar_readl
>>>> +};
>>>> +
>>>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
>>>> + pcibus_t e_phys, pcibus_t e_size,
>>>> + int type)
>>>> +{
>>>> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
>>>> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>>>> + PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>>>> + int m;
>>>> +
>>>> + DEBUG("slow map\n");
>>>> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
>>>> + cpu_register_physical_memory(e_phys, e_size, m);
>>>> +
>>>> + /* MSI-X MMIO page */
>>>> + if ((e_size > 0) &&
>>>> + real_region->base_addr <= r_dev->msix_table_addr &&
>>>> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
>>>> + int offset = r_dev->msix_table_addr - real_region->base_addr;
>>>> +
>>>> + cpu_register_physical_memory(e_phys + offset,
>>>> + TARGET_PAGE_SIZE, r_dev->mmio_index);
>>>> + }
>>>> +}
>>>> +
>>>> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>>>> pcibus_t e_phys, pcibus_t e_size, int type)
>>>> {
>>>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>>
>>>> /* handle memory io regions */
>>>> if (cur_region->type & IORESOURCE_MEM) {
>>>> + int slow_map = 0;
>>>> int t = cur_region->type & IORESOURCE_PREFETCH
>>>> ? PCI_BASE_ADDRESS_MEM_PREFETCH
>>>> : PCI_BASE_ADDRESS_SPACE_MEMORY;
>>>> +
>>>> if (cur_region->size & 0xFFF) {
>>>> - fprintf(stderr, "Unable to assign device: PCI region %d "
>>>> - "at address 0x%llx has size 0x%x, "
>>>> - " which is not a multiple of 4K\n",
>>>> + fprintf(stderr, "PCI region %d at address 0x%llx "
>>>> + "has size 0x%x, which is not a multiple of 4K. "
>>>> + "You might experience some performance hit due to that.\n",
>>>> i, (unsigned long long)cur_region->base_addr,
>>>> cur_region->size);
>>>> + slow_map = 1;
>>>> + }
>>>> +
>>>> + if (slow_map && (i == PCI_ROM_SLOT)) {
>>>> + fprintf(stderr, "ROM not aligned - can't continue\n");
>>>> return -1;
>>>>
>>>>
>>> Why is this? Can't ROM be handled in the same way?
>>> I think it was previously ...
>>>
>>>
>> You can't execute code from MMIO regions, so slow_map doesn't work there.
>>
>
> Hmm, how does it work with passthrough?
> Anyway, I don't think a lot of users run ROM
> code directly, they just shadow it and run from there.
>
>
>>>
>>>
>>>> }
>>>>
>>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>> } else {
>>>> pci_dev->v_addrs[i].u.r_virtbase =
>>>> mmap(NULL,
>>>> - (cur_region->size + 0xFFF) & 0xFFFFF000,
>>>> + cur_region->size,
>>>>
>>>>
>>> Hmm, one assumes this code did work at some point ...
>>> do you know why was it done this way?
>>>
>>>
>> Nope :-).
>>
>
> So maybe keep it that way or make the change in a separate patch.
>
Well we're sure the normal path is size aligned, no? In fact, you
recommended I should change the code to what this looks like ;-).
Alex
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 19:04 ` Alexander Graf
@ 2009-12-15 19:50 ` Chris Wright
2009-12-15 21:12 ` Michael S. Tsirkin
2009-12-15 21:04 ` Michael S. Tsirkin
1 sibling, 1 reply; 25+ messages in thread
From: Chris Wright @ 2009-12-15 19:50 UTC (permalink / raw)
To: Alexander Graf
Cc: Michael S. Tsirkin, kvm list, Gleb Natapov, Muli Ben-Yehuda,
Chris Wright
* Alexander Graf (agraf@suse.de) wrote:
> >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>>> } else {
> >>>> pci_dev->v_addrs[i].u.r_virtbase =
> >>>> mmap(NULL,
> >>>> - (cur_region->size + 0xFFF) & 0xFFFFF000,
> >>>> + cur_region->size,
> >>>>
> >>>>
> >>> Hmm, one assumes this code did work at some point ...
> >>> do you know why was it done this way?
> >>>
> >> Nope :-).
> >
> > So maybe keep it that way or make the change in a separate patch.
>
> Well we're sure the normal path is size aligned, no? In fact, you
> recommended I should change the code to what this looks like ;-).
Sort of moot. The typical case is page size aligned, and the kernel is
going to round up anyway.
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 19:50 ` Chris Wright
@ 2009-12-15 21:12 ` Michael S. Tsirkin
2009-12-16 5:44 ` Chris Wright
0 siblings, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:12 UTC (permalink / raw)
To: Chris Wright; +Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda
On Tue, Dec 15, 2009 at 11:50:23AM -0800, Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
> > >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> > >>>> } else {
> > >>>> pci_dev->v_addrs[i].u.r_virtbase =
> > >>>> mmap(NULL,
> > >>>> - (cur_region->size + 0xFFF) & 0xFFFFF000,
> > >>>> + cur_region->size,
> > >>>>
> > >>>>
> > >>> Hmm, one assumes this code did work at some point ...
> > >>> do you know why was it done this way?
> > >>>
> > >> Nope :-).
> > >
> > > So maybe keep it that way or make the change in a separate patch.
> >
> > Well we're sure the normal path is size aligned, no? In fact, you
> > recommended I should change the code to what this looks like ;-).
>
> Sort of moot. The typical case is page size aligned, and the kernel is
> going to round up anyway.
Why was this code here originally then? Any idea?
--
MST
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 21:12 ` Michael S. Tsirkin
@ 2009-12-16 5:44 ` Chris Wright
0 siblings, 0 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-16 5:44 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Chris Wright, Alexander Graf, kvm list, Gleb Natapov,
Muli Ben-Yehuda
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Tue, Dec 15, 2009 at 11:50:23AM -0800, Chris Wright wrote:
> > * Alexander Graf (agraf@suse.de) wrote:
> > > >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> > > >>>> } else {
> > > >>>> pci_dev->v_addrs[i].u.r_virtbase =
> > > >>>> mmap(NULL,
> > > >>>> - (cur_region->size + 0xFFF) & 0xFFFFF000,
> > > >>>> + cur_region->size,
> > > >>>>
> > > >>>>
> > > >>> Hmm, one assumes this code did work at some point ...
> > > >>> do you know why was it done this way?
> > > >>>
> > > >> Nope :-).
> > > >
> > > > So maybe keep it that way or make the change in a separate patch.
> > >
> > > Well we're sure the normal path is size aligned, no? In fact, you
> > > recommended I should change the code to what this looks like ;-).
> >
> > Sort of moot. The typical case is page size aligned, and the kernel is
> > going to round up anyway.
>
> Why was this code here originally then? Any idea?
I think it's redundant, being paranoid to ensure a multiple of page mappings
w/out realizing that's what mmap will do anyway.
thanks,
-chris
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 19:04 ` Alexander Graf
2009-12-15 19:50 ` Chris Wright
@ 2009-12-15 21:04 ` Michael S. Tsirkin
2009-12-15 21:11 ` Alexander Graf
1 sibling, 1 reply; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:04 UTC (permalink / raw)
To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
On Tue, Dec 15, 2009 at 08:04:10PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
> >
> >> Michael S. Tsirkin wrote:
> >>
> >>> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
> >>>
> >>>
> >>>> While trying to get device passthrough working with an emulex hba, kvm
> >>>> refused to pass it through because it has a BAR of 256 bytes:
> >>>>
> >>>> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> >>>> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> >>>> Region 4: I/O ports at b100 [size=256]
> >>>>
> >>>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> >>>> physical to virtual addresses, we can still take the old MMIO callback route.
> >>>>
> >>>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
> >>>> by looping it through userspace.
> >>>>
> >>>> I verified that it works by passing through an e1000 with this additional patch
> >>>> applied and the card acted the same way it did without this patch:
> >>>>
> >>>> map_func = assigned_dev_iomem_map;
> >>>> - if (cur_region->size & 0xFFF) {
> >>>> + if (i != PCI_ROM_SLOT){
> >>>> fprintf(stderr, "PCI region %d at address 0x%llx "
> >>>>
> >>>> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>>>
> >>>>
> >>> Looks good.
> >>> 1 question:
> >>>
> >>>
> >>>
> >>>> ---
> >>>>
> >>>> v1 -> v2:
> >>>>
> >>>> - don't use map_func function pointer
> >>>> - use the same code for mmap on fast and slow path
> >>>>
> >>>> v2 -> v3:
> >>>>
> >>>> - address mst's comments
> >>>> ---
> >>>> hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++--
> >>>> 1 files changed, 112 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> >>>> index 13a86bb..000fa61 100644
> >>>> --- a/hw/device-assignment.c
> >>>> +++ b/hw/device-assignment.c
> >>>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
> >>>> return value;
> >>>> }
> >>>>
> >>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> >>>> +{
> >>>> + AssignedDevRegion *d = opaque;
> >>>> + uint8_t *in = d->u.r_virtbase + addr;
> >>>> + uint32_t r;
> >>>> +
> >>>> + r = *in;
> >>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >>>> +
> >>>> + return r;
> >>>> +}
> >>>> +
> >>>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> >>>> +{
> >>>> + AssignedDevRegion *d = opaque;
> >>>> + uint16_t *in = d->u.r_virtbase + addr;
> >>>> + uint32_t r;
> >>>> +
> >>>> + r = *in;
> >>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >>>> +
> >>>> + return r;
> >>>> +}
> >>>> +
> >>>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> >>>> +{
> >>>> + AssignedDevRegion *d = opaque;
> >>>> + uint32_t *in = d->u.r_virtbase + addr;
> >>>> + uint32_t r;
> >>>> +
> >>>> + r = *in;
> >>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> >>>> +
> >>>> + return r;
> >>>> +}
> >>>> +
> >>>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> >>>> +{
> >>>> + AssignedDevRegion *d = opaque;
> >>>> + uint8_t *out = d->u.r_virtbase + addr;
> >>>> +
> >>>> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
> >>>> + *out = val;
> >>>> +}
> >>>> +
> >>>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> >>>> +{
> >>>> + AssignedDevRegion *d = opaque;
> >>>> + uint16_t *out = d->u.r_virtbase + addr;
> >>>> +
> >>>> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
> >>>> + *out = val;
> >>>> +}
> >>>> +
> >>>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> >>>> +{
> >>>> + AssignedDevRegion *d = opaque;
> >>>> + uint32_t *out = d->u.r_virtbase + addr;
> >>>> +
> >>>> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
> >>>> + *out = val;
> >>>> +}
> >>>> +
> >>>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> >>>> + &slow_bar_writeb,
> >>>> + &slow_bar_writew,
> >>>> + &slow_bar_writel
> >>>> +};
> >>>> +
> >>>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> >>>> + &slow_bar_readb,
> >>>> + &slow_bar_readw,
> >>>> + &slow_bar_readl
> >>>> +};
> >>>> +
> >>>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> >>>> + pcibus_t e_phys, pcibus_t e_size,
> >>>> + int type)
> >>>> +{
> >>>> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> >>>> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> >>>> + PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> >>>> + int m;
> >>>> +
> >>>> + DEBUG("slow map\n");
> >>>> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> >>>> + cpu_register_physical_memory(e_phys, e_size, m);
> >>>> +
> >>>> + /* MSI-X MMIO page */
> >>>> + if ((e_size > 0) &&
> >>>> + real_region->base_addr <= r_dev->msix_table_addr &&
> >>>> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
> >>>> + int offset = r_dev->msix_table_addr - real_region->base_addr;
> >>>> +
> >>>> + cpu_register_physical_memory(e_phys + offset,
> >>>> + TARGET_PAGE_SIZE, r_dev->mmio_index);
> >>>> + }
> >>>> +}
> >>>> +
> >>>> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> >>>> pcibus_t e_phys, pcibus_t e_size, int type)
> >>>> {
> >>>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>>>
> >>>> /* handle memory io regions */
> >>>> if (cur_region->type & IORESOURCE_MEM) {
> >>>> + int slow_map = 0;
> >>>> int t = cur_region->type & IORESOURCE_PREFETCH
> >>>> ? PCI_BASE_ADDRESS_MEM_PREFETCH
> >>>> : PCI_BASE_ADDRESS_SPACE_MEMORY;
> >>>> +
> >>>> if (cur_region->size & 0xFFF) {
> >>>> - fprintf(stderr, "Unable to assign device: PCI region %d "
> >>>> - "at address 0x%llx has size 0x%x, "
> >>>> - " which is not a multiple of 4K\n",
> >>>> + fprintf(stderr, "PCI region %d at address 0x%llx "
> >>>> + "has size 0x%x, which is not a multiple of 4K. "
> >>>> + "You might experience some performance hit due to that.\n",
> >>>> i, (unsigned long long)cur_region->base_addr,
> >>>> cur_region->size);
> >>>> + slow_map = 1;
> >>>> + }
> >>>> +
> >>>> + if (slow_map && (i == PCI_ROM_SLOT)) {
> >>>> + fprintf(stderr, "ROM not aligned - can't continue\n");
> >>>> return -1;
> >>>>
> >>>>
> >>> Why is this? Can't ROM be handled in the same way?
> >>> I think it was previously ...
> >>>
> >>>
> >> You can't execute code from MMIO regions, so slow_map doesn't work there.
> >>
> >
> > Hmm, how does it work with passthrough?
> > Anyway, I don't think a lot of users run ROM
> > code directly, they just shadow it and run from there.
> >
> >
> >>>
> >>>
> >>>> }
> >>>>
> >>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> >>>> } else {
> >>>> pci_dev->v_addrs[i].u.r_virtbase =
> >>>> mmap(NULL,
> >>>> - (cur_region->size + 0xFFF) & 0xFFFFF000,
> >>>> + cur_region->size,
> >>>>
> >>>>
> >>> Hmm, one assumes this code did work at some point ...
> >>> do you know why was it done this way?
> >>>
> >>>
> >> Nope :-).
> >>
> >
> > So maybe keep it that way or make the change in a separate patch.
> >
>
> Well we're sure the normal path is size aligned, no? In fact, you
> recommended I should change the code to what this looks like ;-).
>
> Alex
Existing code first page aligned address, passes this to mmap,
then adds page offset. I do not know why. Could be e.g. to handle
different libc/kernel versions. If we are not sure it is
not needed, let's keep it this way. If we are sure, let's
cleanup in a separate patch. Makes sense?
--
MST
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 21:04 ` Michael S. Tsirkin
@ 2009-12-15 21:11 ` Alexander Graf
2009-12-16 5:47 ` Chris Wright
0 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 21:11 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 08:04:10PM +0100, Alexander Graf wrote:
>
>> Michael S. Tsirkin wrote:
>>
>>> On Tue, Dec 15, 2009 at 07:47:35PM +0100, Alexander Graf wrote:
>>>
>>>
>>>> Michael S. Tsirkin wrote:
>>>>
>>>>
>>>>> On Tue, Dec 15, 2009 at 07:30:26PM +0100, Alexander Graf wrote:
>>>>>
>>>>>
>>>>>
>>>>>> While trying to get device passthrough working with an emulex hba, kvm
>>>>>> refused to pass it through because it has a BAR of 256 bytes:
>>>>>>
>>>>>> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
>>>>>> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
>>>>>> Region 4: I/O ports at b100 [size=256]
>>>>>>
>>>>>> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
>>>>>> physical to virtual addresses, we can still take the old MMIO callback route.
>>>>>>
>>>>>> So let's add a second code path that allows for size & 0xFFF != 0 sized regions
>>>>>> by looping it through userspace.
>>>>>>
>>>>>> I verified that it works by passing through an e1000 with this additional patch
>>>>>> applied and the card acted the same way it did without this patch:
>>>>>>
>>>>>> map_func = assigned_dev_iomem_map;
>>>>>> - if (cur_region->size & 0xFFF) {
>>>>>> + if (i != PCI_ROM_SLOT){
>>>>>> fprintf(stderr, "PCI region %d at address 0x%llx "
>>>>>>
>>>>>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>>>>>
>>>>>>
>>>>>>
>>>>> Looks good.
>>>>> 1 question:
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>> ---
>>>>>>
>>>>>> v1 -> v2:
>>>>>>
>>>>>> - don't use map_func function pointer
>>>>>> - use the same code for mmap on fast and slow path
>>>>>>
>>>>>> v2 -> v3:
>>>>>>
>>>>>> - address mst's comments
>>>>>> ---
>>>>>> hw/device-assignment.c | 117 +++++++++++++++++++++++++++++++++++++++++++++--
>>>>>> 1 files changed, 112 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
>>>>>> index 13a86bb..000fa61 100644
>>>>>> --- a/hw/device-assignment.c
>>>>>> +++ b/hw/device-assignment.c
>>>>>> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
>>>>>> return value;
>>>>>> }
>>>>>>
>>>>>> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
>>>>>> +{
>>>>>> + AssignedDevRegion *d = opaque;
>>>>>> + uint8_t *in = d->u.r_virtbase + addr;
>>>>>> + uint32_t r;
>>>>>> +
>>>>>> + r = *in;
>>>>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>>>> +
>>>>>> + return r;
>>>>>> +}
>>>>>> +
>>>>>> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
>>>>>> +{
>>>>>> + AssignedDevRegion *d = opaque;
>>>>>> + uint16_t *in = d->u.r_virtbase + addr;
>>>>>> + uint32_t r;
>>>>>> +
>>>>>> + r = *in;
>>>>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>>>> +
>>>>>> + return r;
>>>>>> +}
>>>>>> +
>>>>>> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
>>>>>> +{
>>>>>> + AssignedDevRegion *d = opaque;
>>>>>> + uint32_t *in = d->u.r_virtbase + addr;
>>>>>> + uint32_t r;
>>>>>> +
>>>>>> + r = *in;
>>>>>> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
>>>>>> +
>>>>>> + return r;
>>>>>> +}
>>>>>> +
>>>>>> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>>>> +{
>>>>>> + AssignedDevRegion *d = opaque;
>>>>>> + uint8_t *out = d->u.r_virtbase + addr;
>>>>>> +
>>>>>> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
>>>>>> + *out = val;
>>>>>> +}
>>>>>> +
>>>>>> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>>>> +{
>>>>>> + AssignedDevRegion *d = opaque;
>>>>>> + uint16_t *out = d->u.r_virtbase + addr;
>>>>>> +
>>>>>> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
>>>>>> + *out = val;
>>>>>> +}
>>>>>> +
>>>>>> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>>>> +{
>>>>>> + AssignedDevRegion *d = opaque;
>>>>>> + uint32_t *out = d->u.r_virtbase + addr;
>>>>>> +
>>>>>> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
>>>>>> + *out = val;
>>>>>> +}
>>>>>> +
>>>>>> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
>>>>>> + &slow_bar_writeb,
>>>>>> + &slow_bar_writew,
>>>>>> + &slow_bar_writel
>>>>>> +};
>>>>>> +
>>>>>> +static CPUReadMemoryFunc * const slow_bar_read[] = {
>>>>>> + &slow_bar_readb,
>>>>>> + &slow_bar_readw,
>>>>>> + &slow_bar_readl
>>>>>> +};
>>>>>> +
>>>>>> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
>>>>>> + pcibus_t e_phys, pcibus_t e_size,
>>>>>> + int type)
>>>>>> +{
>>>>>> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
>>>>>> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>>>>>> + PCIRegion *real_region = &r_dev->real_device.regions[region_num];
>>>>>> + int m;
>>>>>> +
>>>>>> + DEBUG("slow map\n");
>>>>>> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
>>>>>> + cpu_register_physical_memory(e_phys, e_size, m);
>>>>>> +
>>>>>> + /* MSI-X MMIO page */
>>>>>> + if ((e_size > 0) &&
>>>>>> + real_region->base_addr <= r_dev->msix_table_addr &&
>>>>>> + real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
>>>>>> + int offset = r_dev->msix_table_addr - real_region->base_addr;
>>>>>> +
>>>>>> + cpu_register_physical_memory(e_phys + offset,
>>>>>> + TARGET_PAGE_SIZE, r_dev->mmio_index);
>>>>>> + }
>>>>>> +}
>>>>>> +
>>>>>> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>>>>>> pcibus_t e_phys, pcibus_t e_size, int type)
>>>>>> {
>>>>>> @@ -381,15 +480,22 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>>>>
>>>>>> /* handle memory io regions */
>>>>>> if (cur_region->type & IORESOURCE_MEM) {
>>>>>> + int slow_map = 0;
>>>>>> int t = cur_region->type & IORESOURCE_PREFETCH
>>>>>> ? PCI_BASE_ADDRESS_MEM_PREFETCH
>>>>>> : PCI_BASE_ADDRESS_SPACE_MEMORY;
>>>>>> +
>>>>>> if (cur_region->size & 0xFFF) {
>>>>>> - fprintf(stderr, "Unable to assign device: PCI region %d "
>>>>>> - "at address 0x%llx has size 0x%x, "
>>>>>> - " which is not a multiple of 4K\n",
>>>>>> + fprintf(stderr, "PCI region %d at address 0x%llx "
>>>>>> + "has size 0x%x, which is not a multiple of 4K. "
>>>>>> + "You might experience some performance hit due to that.\n",
>>>>>> i, (unsigned long long)cur_region->base_addr,
>>>>>> cur_region->size);
>>>>>> + slow_map = 1;
>>>>>> + }
>>>>>> +
>>>>>> + if (slow_map && (i == PCI_ROM_SLOT)) {
>>>>>> + fprintf(stderr, "ROM not aligned - can't continue\n");
>>>>>> return -1;
>>>>>>
>>>>>>
>>>>>>
>>>>> Why is this? Can't ROM be handled in the same way?
>>>>> I think it was previously ...
>>>>>
>>>>>
>>>>>
>>>> You can't execute code from MMIO regions, so slow_map doesn't work there.
>>>>
>>>>
>>> Hmm, how does it work with passthrough?
>>> Anyway, I don't think a lot of users run ROM
>>> code directly, they just shadow it and run from there.
>>>
>>>
>>>
>>>>>
>>>>>
>>>>>
>>>>>> }
>>>>>>
>>>>>> @@ -405,7 +511,7 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
>>>>>> } else {
>>>>>> pci_dev->v_addrs[i].u.r_virtbase =
>>>>>> mmap(NULL,
>>>>>> - (cur_region->size + 0xFFF) & 0xFFFFF000,
>>>>>> + cur_region->size,
>>>>>>
>>>>>>
>>>>>>
>>>>> Hmm, one assumes this code did work at some point ...
>>>>> do you know why was it done this way?
>>>>>
>>>>>
>>>>>
>>>> Nope :-).
>>>>
>>>>
>>> So maybe keep it that way or make the change in a separate patch.
>>>
>>>
>> Well we're sure the normal path is size aligned, no? In fact, you
>> recommended I should change the code to what this looks like ;-).
>>
>> Alex
>>
>
>
> Existing code first page aligned address, passes this to mmap,
> then adds page offset. I do not know why. Could be e.g. to handle
> different libc/kernel versions. If we are not sure it is
> not needed, let's keep it this way. If we are sure, let's
> cleanup in a separate patch. Makes sense?
>
Chris, want to make a statement here? :-)
Alex
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-15 21:11 ` Alexander Graf
@ 2009-12-16 5:47 ` Chris Wright
2009-12-16 10:41 ` Michael S. Tsirkin
0 siblings, 1 reply; 25+ messages in thread
From: Chris Wright @ 2009-12-16 5:47 UTC (permalink / raw)
To: Alexander Graf
Cc: Michael S. Tsirkin, kvm list, Gleb Natapov, Muli Ben-Yehuda,
Chris Wright
* Alexander Graf (agraf@suse.de) wrote:
> Michael S. Tsirkin wrote:
> > Existing code first page aligned address, passes this to mmap,
Did you mean page aligned size?
> > then adds page offset. I do not know why. Could be e.g. to handle
> > different libc/kernel versions. If we are not sure it is
> > not needed, let's keep it this way. If we are sure, let's
> > cleanup in a separate patch. Makes sense?
>
> Chris, want to make a statement here? :-)
I have no issue w/ splitting that out. I don't think it is necessary.
The "adds page offset" bit is, however.
thanks,
-chris
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] Enable non page boundary BAR device assignment
2009-12-16 5:47 ` Chris Wright
@ 2009-12-16 10:41 ` Michael S. Tsirkin
0 siblings, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-16 10:41 UTC (permalink / raw)
To: Chris Wright; +Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda
On Tue, Dec 15, 2009 at 09:47:09PM -0800, Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
> > Michael S. Tsirkin wrote:
> > > Existing code first page aligned address, passes this to mmap,
>
> Did you mean page aligned size?
Yes, of course.
> > > then adds page offset. I do not know why. Could be e.g. to handle
> > > different libc/kernel versions. If we are not sure it is
> > > not needed, let's keep it this way. If we are sure, let's
> > > cleanup in a separate patch. Makes sense?
> >
> > Chris, want to make a statement here? :-)
>
> I have no issue w/ splitting that out. I don't think it is necessary.
> The "adds page offset" bit is, however.
>
> thanks,
> -chris
I looked at kernel source history, as far as I can tell
all mmap implementations round size up to full page size.
So yes, this seems safe.
--
MST
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 2/3] Split off sysfs id retrieval
2009-12-15 18:30 [PATCH 0/3] Device Assignment fixes Alexander Graf
2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
@ 2009-12-15 18:30 ` Alexander Graf
2009-12-15 18:39 ` Michael S. Tsirkin
2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
2 siblings, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:30 UTC (permalink / raw)
To: kvm list; +Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright
To retreive device and vendor ID from a PCI device, we need to read a
sysfs file. That code is currently hand written at least two times,
the later patch introducing two more calls.
So let's move that out to a function.
Signed-off-by: Alexander Graf <agraf@suse.de>
---
hw/device-assignment.c | 66 ++++++++++++++++++++++++++++++++----------------
1 files changed, 44 insertions(+), 22 deletions(-)
diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 000fa61..566671c 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -562,14 +562,46 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
return 0;
}
+static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
+{
+ FILE *f;
+ char name[128];
+ long id;
+
+ snprintf(name, sizeof(name), "%s%s", devpath, idname);
+ f = fopen(name, "r");
+ if (f == NULL) {
+ fprintf(stderr, "%s: %s: %m\n", __func__, name);
+ return -1;
+ }
+ if (fscanf(f, "%li\n", &id) == 1) {
+ *val = id;
+ } else {
+ return -1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+static int get_real_vendor_id(const char *devpath, uint16_t *val)
+{
+ return get_real_id(devpath, "vendor", val);
+}
+
+static int get_real_device_id(const char *devpath, uint16_t *val)
+{
+ return get_real_id(devpath, "device", val);
+}
+
static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
uint8_t r_dev, uint8_t r_func)
{
char dir[128], name[128];
- int fd, r = 0;
+ int fd, r = 0, v;
FILE *f;
unsigned long long start, end, size, flags;
- unsigned long id;
+ uint16_t id;
struct stat statbuf;
PCIRegion *rp;
PCIDevRegions *dev = &pci_dev->real_device;
@@ -635,31 +667,21 @@ again:
fclose(f);
- /* read and fill device ID */
- snprintf(name, sizeof(name), "%svendor", dir);
- f = fopen(name, "r");
- if (f == NULL) {
- fprintf(stderr, "%s: %s: %m\n", __func__, name);
+ /* read and fill vendor ID */
+ v = get_real_vendor_id(dir, &id);
+ if (v) {
return 1;
}
- if (fscanf(f, "%li\n", &id) == 1) {
- pci_dev->dev.config[0] = id & 0xff;
- pci_dev->dev.config[1] = (id & 0xff00) >> 8;
- }
- fclose(f);
+ pci_dev->dev.config[0] = id & 0xff;
+ pci_dev->dev.config[1] = (id & 0xff00) >> 8;
- /* read and fill vendor ID */
- snprintf(name, sizeof(name), "%sdevice", dir);
- f = fopen(name, "r");
- if (f == NULL) {
- fprintf(stderr, "%s: %s: %m\n", __func__, name);
+ /* read and fill device ID */
+ v = get_real_device_id(dir, &id);
+ if (v) {
return 1;
}
- if (fscanf(f, "%li\n", &id) == 1) {
- pci_dev->dev.config[2] = id & 0xff;
- pci_dev->dev.config[3] = (id & 0xff00) >> 8;
- }
- fclose(f);
+ pci_dev->dev.config[2] = id & 0xff;
+ pci_dev->dev.config[3] = (id & 0xff00) >> 8;
/* dealing with virtual function device */
snprintf(name, sizeof(name), "%sphysfn/", dir);
--
1.6.0.2
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH 2/3] Split off sysfs id retrieval
2009-12-15 18:30 ` [PATCH 2/3] Split off sysfs id retrieval Alexander Graf
@ 2009-12-15 18:39 ` Michael S. Tsirkin
2009-12-15 18:54 ` Alexander Graf
2009-12-15 19:57 ` Chris Wright
0 siblings, 2 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:39 UTC (permalink / raw)
To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
> To retreive device and vendor ID from a PCI device, we need to read a
> sysfs file. That code is currently hand written at least two times,
> the later patch introducing two more calls.
>
> So let's move that out to a function.
>
> Signed-off-by: Alexander Graf <agraf@suse.de>
Shouldn't we be using libpci, as we already do
in other places?
> ---
> hw/device-assignment.c | 66 ++++++++++++++++++++++++++++++++----------------
> 1 files changed, 44 insertions(+), 22 deletions(-)
>
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 000fa61..566671c 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -562,14 +562,46 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
> return 0;
> }
>
> +static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
> +{
> + FILE *f;
> + char name[128];
> + long id;
> +
> + snprintf(name, sizeof(name), "%s%s", devpath, idname);
> + f = fopen(name, "r");
> + if (f == NULL) {
> + fprintf(stderr, "%s: %s: %m\n", __func__, name);
> + return -1;
> + }
> + if (fscanf(f, "%li\n", &id) == 1) {
> + *val = id;
> + } else {
> + return -1;
> + }
> + fclose(f);
> +
> + return 0;
> +}
> +
> +static int get_real_vendor_id(const char *devpath, uint16_t *val)
> +{
> + return get_real_id(devpath, "vendor", val);
> +}
> +
> +static int get_real_device_id(const char *devpath, uint16_t *val)
> +{
> + return get_real_id(devpath, "device", val);
> +}
> +
> static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> uint8_t r_dev, uint8_t r_func)
> {
> char dir[128], name[128];
> - int fd, r = 0;
> + int fd, r = 0, v;
> FILE *f;
> unsigned long long start, end, size, flags;
> - unsigned long id;
> + uint16_t id;
> struct stat statbuf;
> PCIRegion *rp;
> PCIDevRegions *dev = &pci_dev->real_device;
> @@ -635,31 +667,21 @@ again:
>
> fclose(f);
>
> - /* read and fill device ID */
> - snprintf(name, sizeof(name), "%svendor", dir);
> - f = fopen(name, "r");
> - if (f == NULL) {
> - fprintf(stderr, "%s: %s: %m\n", __func__, name);
> + /* read and fill vendor ID */
> + v = get_real_vendor_id(dir, &id);
> + if (v) {
> return 1;
> }
> - if (fscanf(f, "%li\n", &id) == 1) {
> - pci_dev->dev.config[0] = id & 0xff;
> - pci_dev->dev.config[1] = (id & 0xff00) >> 8;
> - }
> - fclose(f);
> + pci_dev->dev.config[0] = id & 0xff;
> + pci_dev->dev.config[1] = (id & 0xff00) >> 8;
>
> - /* read and fill vendor ID */
> - snprintf(name, sizeof(name), "%sdevice", dir);
> - f = fopen(name, "r");
> - if (f == NULL) {
> - fprintf(stderr, "%s: %s: %m\n", __func__, name);
> + /* read and fill device ID */
> + v = get_real_device_id(dir, &id);
> + if (v) {
> return 1;
> }
> - if (fscanf(f, "%li\n", &id) == 1) {
> - pci_dev->dev.config[2] = id & 0xff;
> - pci_dev->dev.config[3] = (id & 0xff00) >> 8;
> - }
> - fclose(f);
> + pci_dev->dev.config[2] = id & 0xff;
> + pci_dev->dev.config[3] = (id & 0xff00) >> 8;
>
> /* dealing with virtual function device */
> snprintf(name, sizeof(name), "%sphysfn/", dir);
> --
> 1.6.0.2
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 2/3] Split off sysfs id retrieval
2009-12-15 18:39 ` Michael S. Tsirkin
@ 2009-12-15 18:54 ` Alexander Graf
2009-12-15 21:05 ` Michael S. Tsirkin
2009-12-15 19:57 ` Chris Wright
1 sibling, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:54 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
Michael S. Tsirkin wrote:
> On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
>
>> To retreive device and vendor ID from a PCI device, we need to read a
>> sysfs file. That code is currently hand written at least two times,
>> the later patch introducing two more calls.
>>
>> So let's move that out to a function.
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>>
>
> Shouldn't we be using libpci, as we already do
> in other places?
>
Yes, please! But converting it to libvirt is a different beast from just
splitting up existing code :-).
Tackling the device assignment code to make it readable and mergable is
definitely something we need to address - just not now :-).
Alex
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] Split off sysfs id retrieval
2009-12-15 18:54 ` Alexander Graf
@ 2009-12-15 21:05 ` Michael S. Tsirkin
0 siblings, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:05 UTC (permalink / raw)
To: Alexander Graf; +Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright
On Tue, Dec 15, 2009 at 07:54:41PM +0100, Alexander Graf wrote:
> Michael S. Tsirkin wrote:
> > On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
> >
> >> To retreive device and vendor ID from a PCI device, we need to read a
> >> sysfs file. That code is currently hand written at least two times,
> >> the later patch introducing two more calls.
> >>
> >> So let's move that out to a function.
> >>
> >> Signed-off-by: Alexander Graf <agraf@suse.de>
> >>
> >
> > Shouldn't we be using libpci, as we already do
> > in other places?
> >
>
> Yes, please! But converting it to libvirt is a different beast from just
> splitting up existing code :-).
>
> Tackling the device assignment code to make it readable and mergable is
> definitely something we need to address - just not now :-).
>
> Alex
Hmm, you are right.
Acked-by: Michael S. Tsirkin <mst@redhat.com>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] Split off sysfs id retrieval
2009-12-15 18:39 ` Michael S. Tsirkin
2009-12-15 18:54 ` Alexander Graf
@ 2009-12-15 19:57 ` Chris Wright
1 sibling, 0 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-15 19:57 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda,
Chris Wright
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Tue, Dec 15, 2009 at 07:30:27PM +0100, Alexander Graf wrote:
> > To retreive device and vendor ID from a PCI device, we need to read a
> > sysfs file. That code is currently hand written at least two times,
> > the later patch introducing two more calls.
> >
> > So let's move that out to a function.
> >
> > Signed-off-by: Alexander Graf <agraf@suse.de>
>
> Shouldn't we be using libpci, as we already do
> in other places?
The main issue is making sure we get the cooked value of vendor and
device id (they are invalid in config space for VFs). I looked at
this before (w/ the same intention) and concluded it wasn't a benefit
(of course, I can't recall why now...re-reading libpci, as long as we
force PCI_ACCESS_SYS_BUS_PCI it does the right read).
Anyway, this is a simple, nice cleanup.
Acked-by: Chris Wright <chrisw@redhat.com>
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 3/3] Inform users about busy device assignment attempt
2009-12-15 18:30 [PATCH 0/3] Device Assignment fixes Alexander Graf
2009-12-15 18:30 ` [PATCH 1/3] Enable non page boundary BAR device assignment Alexander Graf
2009-12-15 18:30 ` [PATCH 2/3] Split off sysfs id retrieval Alexander Graf
@ 2009-12-15 18:30 ` Alexander Graf
2009-12-15 18:41 ` Michael S. Tsirkin
2009-12-15 20:28 ` Chris Wright
2 siblings, 2 replies; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 18:30 UTC (permalink / raw)
To: kvm list
Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright,
Daniel P. Berrange
When using -pcidevice on a device that is already in use by a kernel driver
all the user gets is the following (very useful) information:
Failed to assign device "04:00.0" : Device or resource busy
Failed to deassign device "04:00.0" : Invalid argument
Error initializing device pci-assign
Since I usually prefer to have my computer do the thinking for me, I figured
it might be a good idea to check and see if a device is actually used by a
driver. If so, tell the user.
So with this patch applied you get the following output:
Failed to assign device "04:00.0" : Device or resource busy
*** The driver 'igb' is occupying your device 04:00.0.
***
*** You can try the following commands to free it:
***
*** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
*** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
*** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
*** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
***
Failed to deassign device "04:00.0" : Invalid argument
Error initializing device pci-assign
That should keep people like me from doing the most obvious misuses :-).
CC: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
v1 -> v2:
- add more helpful guidance thanks to Daniel Berrange
v2 -> v3:
- clear name variable before using it, thus 0-terminating the string
- fix region numbers
- use correct unbind/bind names
v3 -> v4:
- split id retrieval part
- mst comments
---
hw/device-assignment.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 566671c..d7a03c9 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -756,6 +756,54 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
return (uint32_t)bus << 8 | (uint32_t)devfn;
}
+static void assign_failed_examine(AssignedDevice *dev)
+{
+ char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
+ uint16_t vendor_id, device_id;
+ int r;
+
+ /* XXX implement multidomain */
+ sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
+ dev->host.bus, dev->host.dev, dev->host.func);
+
+ sprintf(name, "%sdriver", dir);
+
+ r = readlink(name, driver, sizeof(driver));
+ if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
+ goto fail;
+ }
+
+ ns++;
+
+ if (get_real_vendor_id(dir, &vendor_id) ||
+ get_real_device_id(dir, &device_id)) {
+ goto fail;
+ }
+
+ fprintf(stderr, "*** The driver '%s' is occupying your device "
+ "%02x:%02x.%x.\n",
+ ns, dev->host.bus, dev->host.dev, dev->host.func);
+ fprintf(stderr, "***\n");
+ fprintf(stderr, "*** You can try the following commands to free it:\n");
+ fprintf(stderr, "***\n");
+ fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
+ "new_id\n", vendor_id, device_id);
+ fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+ "%s/unbind\n",
+ dev->host.bus, dev->host.dev, dev->host.func, ns);
+ fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+ "pci-stub/bind\n",
+ dev->host.bus, dev->host.dev, dev->host.func);
+ fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
+ "/remove_id\n", vendor_id, device_id);
+ fprintf(stderr, "***\n");
+
+ return;
+
+fail:
+ fprintf(stderr, "Couldn't find out why.\n");
+}
+
static int assign_device(AssignedDevice *dev)
{
struct kvm_assigned_pci_dev assigned_dev_data;
@@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
#endif
r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
- if (r < 0)
- fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
+ if (r < 0) {
+ fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
dev->dev.qdev.id, strerror(-r));
+
+ assign_failed_examine(dev);
+ }
return r;
}
--
1.6.0.2
^ permalink raw reply related [flat|nested] 25+ messages in thread* Re: [PATCH 3/3] Inform users about busy device assignment attempt
2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
@ 2009-12-15 18:41 ` Michael S. Tsirkin
2009-12-15 20:28 ` Chris Wright
1 sibling, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 18:41 UTC (permalink / raw)
To: Alexander Graf
Cc: kvm list, Gleb Natapov, Muli Ben-Yehuda, Chris Wright,
Daniel P. Berrange
On Tue, Dec 15, 2009 at 07:30:28PM +0100, Alexander Graf wrote:
> When using -pcidevice on a device that is already in use by a kernel driver
> all the user gets is the following (very useful) information:
>
> Failed to assign device "04:00.0" : Device or resource busy
> Failed to deassign device "04:00.0" : Invalid argument
> Error initializing device pci-assign
>
> Since I usually prefer to have my computer do the thinking for me, I figured
> it might be a good idea to check and see if a device is actually used by a
> driver. If so, tell the user.
>
> So with this patch applied you get the following output:
>
> Failed to assign device "04:00.0" : Device or resource busy
> *** The driver 'igb' is occupying your device 04:00.0.
> ***
> *** You can try the following commands to free it:
> ***
> *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
> *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
> *** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
> *** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
> ***
> Failed to deassign device "04:00.0" : Invalid argument
> Error initializing device pci-assign
>
> That should keep people like me from doing the most obvious misuses :-).
>
> CC: Daniel P. Berrange <berrange@redhat.com>
> Signed-off-by: Alexander Graf <agraf@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> v1 -> v2:
>
> - add more helpful guidance thanks to Daniel Berrange
>
> v2 -> v3:
>
> - clear name variable before using it, thus 0-terminating the string
> - fix region numbers
> - use correct unbind/bind names
>
> v3 -> v4:
>
> - split id retrieval part
> - mst comments
> ---
> hw/device-assignment.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 53 insertions(+), 2 deletions(-)
>
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 566671c..d7a03c9 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -756,6 +756,54 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> return (uint32_t)bus << 8 | (uint32_t)devfn;
> }
>
> +static void assign_failed_examine(AssignedDevice *dev)
> +{
> + char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> + uint16_t vendor_id, device_id;
> + int r;
> +
> + /* XXX implement multidomain */
> + sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> + dev->host.bus, dev->host.dev, dev->host.func);
> +
> + sprintf(name, "%sdriver", dir);
> +
> + r = readlink(name, driver, sizeof(driver));
> + if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
> + goto fail;
> + }
> +
> + ns++;
> +
> + if (get_real_vendor_id(dir, &vendor_id) ||
> + get_real_device_id(dir, &device_id)) {
> + goto fail;
> + }
> +
> + fprintf(stderr, "*** The driver '%s' is occupying your device "
> + "%02x:%02x.%x.\n",
> + ns, dev->host.bus, dev->host.dev, dev->host.func);
> + fprintf(stderr, "***\n");
> + fprintf(stderr, "*** You can try the following commands to free it:\n");
> + fprintf(stderr, "***\n");
> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> + "new_id\n", vendor_id, device_id);
> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> + "%s/unbind\n",
> + dev->host.bus, dev->host.dev, dev->host.func, ns);
> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> + "pci-stub/bind\n",
> + dev->host.bus, dev->host.dev, dev->host.func);
> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> + "/remove_id\n", vendor_id, device_id);
> + fprintf(stderr, "***\n");
> +
> + return;
> +
> +fail:
> + fprintf(stderr, "Couldn't find out why.\n");
> +}
> +
> static int assign_device(AssignedDevice *dev)
> {
> struct kvm_assigned_pci_dev assigned_dev_data;
> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
> #endif
>
> r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> - if (r < 0)
> - fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> + if (r < 0) {
> + fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> dev->dev.qdev.id, strerror(-r));
> +
> + assign_failed_examine(dev);
> + }
> return r;
> }
>
> --
> 1.6.0.2
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 3/3] Inform users about busy device assignment attempt
2009-12-15 18:30 ` [PATCH 3/3] Inform users about busy device assignment attempt Alexander Graf
2009-12-15 18:41 ` Michael S. Tsirkin
@ 2009-12-15 20:28 ` Chris Wright
2009-12-15 21:09 ` Michael S. Tsirkin
2009-12-15 21:10 ` Alexander Graf
1 sibling, 2 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-15 20:28 UTC (permalink / raw)
To: Alexander Graf
Cc: kvm list, Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda,
Chris Wright, Daniel P. Berrange
* Alexander Graf (agraf@suse.de) wrote:
> +static void assign_failed_examine(AssignedDevice *dev)
> +{
> + char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> + uint16_t vendor_id, device_id;
> + int r;
> +
> + /* XXX implement multidomain */
> + sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> + dev->host.bus, dev->host.dev, dev->host.func);
> +
> + sprintf(name, "%sdriver", dir);
> +
> + r = readlink(name, driver, sizeof(driver));
> + if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
While the symlink should never be that long, I think you want to
check bytes stored in driver before strrchr, else you may walk off a
non-NULL-terminated buffer.
> + goto fail;
> + }
> +
> + ns++;
> +
> + if (get_real_vendor_id(dir, &vendor_id) ||
> + get_real_device_id(dir, &device_id)) {
> + goto fail;
> + }
> +
> + fprintf(stderr, "*** The driver '%s' is occupying your device "
> + "%02x:%02x.%x.\n",
> + ns, dev->host.bus, dev->host.dev, dev->host.func);
> + fprintf(stderr, "***\n");
> + fprintf(stderr, "*** You can try the following commands to free it:\n");
> + fprintf(stderr, "***\n");
> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> + "new_id\n", vendor_id, device_id);
> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> + "%s/unbind\n",
> + dev->host.bus, dev->host.dev, dev->host.func, ns);
> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> + "pci-stub/bind\n",
> + dev->host.bus, dev->host.dev, dev->host.func);
> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> + "/remove_id\n", vendor_id, device_id);
> + fprintf(stderr, "***\n");
> +
> + return;
> +
> +fail:
> + fprintf(stderr, "Couldn't find out why.\n");
> +}
> +
> static int assign_device(AssignedDevice *dev)
> {
> struct kvm_assigned_pci_dev assigned_dev_data;
> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
> #endif
>
> r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> - if (r < 0)
> - fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> + if (r < 0) {
> + fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> dev->dev.qdev.id, strerror(-r));
> +
> + assign_failed_examine(dev);
Only really catching the EBUSY case, maybe test for explicitly?
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 3/3] Inform users about busy device assignment attempt
2009-12-15 20:28 ` Chris Wright
@ 2009-12-15 21:09 ` Michael S. Tsirkin
2009-12-15 21:10 ` Alexander Graf
1 sibling, 0 replies; 25+ messages in thread
From: Michael S. Tsirkin @ 2009-12-15 21:09 UTC (permalink / raw)
To: Chris Wright
Cc: Alexander Graf, kvm list, Gleb Natapov, Muli Ben-Yehuda,
Daniel P. Berrange
On Tue, Dec 15, 2009 at 12:28:52PM -0800, Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
> > +static void assign_failed_examine(AssignedDevice *dev)
> > +{
> > + char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> > + uint16_t vendor_id, device_id;
> > + int r;
> > +
> > + /* XXX implement multidomain */
> > + sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> > + dev->host.bus, dev->host.dev, dev->host.func);
> > +
> > + sprintf(name, "%sdriver", dir);
> > +
> > + r = readlink(name, driver, sizeof(driver));
> > + if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
>
> While the symlink should never be that long, I think you want to
> check bytes stored in driver before strrchr, else you may walk off a
> non-NULL-terminated buffer.
On I missed this. Yes, sizeof check must be done before strrchr.
Good catch!
> > + goto fail;
> > + }
> > +
> > + ns++;
> > +
> > + if (get_real_vendor_id(dir, &vendor_id) ||
> > + get_real_device_id(dir, &device_id)) {
> > + goto fail;
> > + }
> > +
> > + fprintf(stderr, "*** The driver '%s' is occupying your device "
> > + "%02x:%02x.%x.\n",
> > + ns, dev->host.bus, dev->host.dev, dev->host.func);
> > + fprintf(stderr, "***\n");
> > + fprintf(stderr, "*** You can try the following commands to free it:\n");
> > + fprintf(stderr, "***\n");
> > + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> > + "new_id\n", vendor_id, device_id);
> > + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> > + "%s/unbind\n",
> > + dev->host.bus, dev->host.dev, dev->host.func, ns);
> > + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> > + "pci-stub/bind\n",
> > + dev->host.bus, dev->host.dev, dev->host.func);
> > + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> > + "/remove_id\n", vendor_id, device_id);
> > + fprintf(stderr, "***\n");
> > +
> > + return;
> > +
> > +fail:
> > + fprintf(stderr, "Couldn't find out why.\n");
> > +}
> > +
> > static int assign_device(AssignedDevice *dev)
> > {
> > struct kvm_assigned_pci_dev assigned_dev_data;
> > @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
> > #endif
> >
> > r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> > - if (r < 0)
> > - fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> > + if (r < 0) {
> > + fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> > dev->dev.qdev.id, strerror(-r));
> > +
> > + assign_failed_examine(dev);
>
> Only really catching the EBUSY case, maybe test for explicitly?
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 3/3] Inform users about busy device assignment attempt
2009-12-15 20:28 ` Chris Wright
2009-12-15 21:09 ` Michael S. Tsirkin
@ 2009-12-15 21:10 ` Alexander Graf
2009-12-15 23:13 ` Chris Wright
1 sibling, 1 reply; 25+ messages in thread
From: Alexander Graf @ 2009-12-15 21:10 UTC (permalink / raw)
To: Chris Wright
Cc: kvm list, Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda,
Daniel P. Berrange
Chris Wright wrote:
> * Alexander Graf (agraf@suse.de) wrote:
>
>> +static void assign_failed_examine(AssignedDevice *dev)
>> +{
>> + char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
>> + uint16_t vendor_id, device_id;
>> + int r;
>> +
>> + /* XXX implement multidomain */
>> + sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
>> + dev->host.bus, dev->host.dev, dev->host.func);
>> +
>> + sprintf(name, "%sdriver", dir);
>> +
>> + r = readlink(name, driver, sizeof(driver));
>> + if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
>>
>
> While the symlink should never be that long, I think you want to
> check bytes stored in driver before strrchr, else you may walk off a
> non-NULL-terminated buffer.
>
Hum, yeah. I'm starting to understand why the shell was invented.
>
>> + goto fail;
>> + }
>> +
>> + ns++;
>> +
>> + if (get_real_vendor_id(dir, &vendor_id) ||
>> + get_real_device_id(dir, &device_id)) {
>> + goto fail;
>> + }
>> +
>> + fprintf(stderr, "*** The driver '%s' is occupying your device "
>> + "%02x:%02x.%x.\n",
>> + ns, dev->host.bus, dev->host.dev, dev->host.func);
>> + fprintf(stderr, "***\n");
>> + fprintf(stderr, "*** You can try the following commands to free it:\n");
>> + fprintf(stderr, "***\n");
>> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
>> + "new_id\n", vendor_id, device_id);
>> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
>> + "%s/unbind\n",
>> + dev->host.bus, dev->host.dev, dev->host.func, ns);
>> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
>> + "pci-stub/bind\n",
>> + dev->host.bus, dev->host.dev, dev->host.func);
>> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
>> + "/remove_id\n", vendor_id, device_id);
>> + fprintf(stderr, "***\n");
>> +
>> + return;
>> +
>> +fail:
>> + fprintf(stderr, "Couldn't find out why.\n");
>> +}
>> +
>> static int assign_device(AssignedDevice *dev)
>> {
>> struct kvm_assigned_pci_dev assigned_dev_data;
>> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
>> #endif
>>
>> r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
>> - if (r < 0)
>> - fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>> + if (r < 0) {
>> + fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
>> dev->dev.qdev.id, strerror(-r));
>> +
>> + assign_failed_examine(dev);
>>
>
> Only really catching the EBUSY case, maybe test for explicitly?
>
What others do we have?
^ permalink raw reply [flat|nested] 25+ messages in thread* Re: [PATCH 3/3] Inform users about busy device assignment attempt
2009-12-15 21:10 ` Alexander Graf
@ 2009-12-15 23:13 ` Chris Wright
0 siblings, 0 replies; 25+ messages in thread
From: Chris Wright @ 2009-12-15 23:13 UTC (permalink / raw)
To: Alexander Graf
Cc: Chris Wright, kvm list, Gleb Natapov, Michael S. Tsirkin,
Muli Ben-Yehuda, Daniel P. Berrange
* Alexander Graf (agraf@suse.de) wrote:
> Chris Wright wrote:
> > * Alexander Graf (agraf@suse.de) wrote:
> >
> >> +static void assign_failed_examine(AssignedDevice *dev)
> >> +{
> >> + char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> >> + uint16_t vendor_id, device_id;
> >> + int r;
> >> +
> >> + /* XXX implement multidomain */
> >> + sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
> >> + dev->host.bus, dev->host.dev, dev->host.func);
> >> +
> >> + sprintf(name, "%sdriver", dir);
> >> +
> >> + r = readlink(name, driver, sizeof(driver));
> >> + if ((r <= 0) || !(ns = strrchr(driver, '/')) || r >= sizeof(driver)) {
> >>
> >
> > While the symlink should never be that long, I think you want to
> > check bytes stored in driver before strrchr, else you may walk off a
> > non-NULL-terminated buffer.
> >
>
> Hum, yeah. I'm starting to understand why the shell was invented.
Heh
> >> + goto fail;
> >> + }
> >> +
> >> + ns++;
> >> +
> >> + if (get_real_vendor_id(dir, &vendor_id) ||
> >> + get_real_device_id(dir, &device_id)) {
> >> + goto fail;
> >> + }
> >> +
> >> + fprintf(stderr, "*** The driver '%s' is occupying your device "
> >> + "%02x:%02x.%x.\n",
> >> + ns, dev->host.bus, dev->host.dev, dev->host.func);
> >> + fprintf(stderr, "***\n");
> >> + fprintf(stderr, "*** You can try the following commands to free it:\n");
> >> + fprintf(stderr, "***\n");
> >> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
> >> + "new_id\n", vendor_id, device_id);
> >> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> >> + "%s/unbind\n",
> >> + dev->host.bus, dev->host.dev, dev->host.func, ns);
> >> + fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
> >> + "pci-stub/bind\n",
> >> + dev->host.bus, dev->host.dev, dev->host.func);
> >> + fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
> >> + "/remove_id\n", vendor_id, device_id);
> >> + fprintf(stderr, "***\n");
> >> +
> >> + return;
> >> +
> >> +fail:
> >> + fprintf(stderr, "Couldn't find out why.\n");
> >> +}
> >> +
> >> static int assign_device(AssignedDevice *dev)
> >> {
> >> struct kvm_assigned_pci_dev assigned_dev_data;
> >> @@ -781,9 +829,12 @@ static int assign_device(AssignedDevice *dev)
> >> #endif
> >>
> >> r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> >> - if (r < 0)
> >> - fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> >> + if (r < 0) {
> >> + fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
> >> dev->dev.qdev.id, strerror(-r));
> >> +
> >> + assign_failed_examine(dev);
> >>
> >
> > Only really catching the EBUSY case, maybe test for explicitly?
>
> What others do we have?
The above would trigger when you've already assigned the same device,
and the error message would be confusing: "The driver pci_stub
is occupying..." (btw, perhaps s/occupying your/bound to/ would be
clearer). EEXIST is used for that case. I think that's it for errors the
user could do something with.
thanks,
-chris
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 3/3] Inform users about busy device assignment attempt
2009-12-17 15:04 [PATCH 0/3] Device Assignment fixes Alexander Graf
@ 2009-12-17 15:04 ` Alexander Graf
0 siblings, 0 replies; 25+ messages in thread
From: Alexander Graf @ 2009-12-17 15:04 UTC (permalink / raw)
To: kvm list
Cc: Gleb Natapov, Michael S. Tsirkin, Muli Ben-Yehuda, Chris Wright,
Daniel P. Berrange
When using -pcidevice on a device that is already in use by a kernel driver
all the user gets is the following (very useful) information:
Failed to assign device "04:00.0" : Device or resource busy
Failed to deassign device "04:00.0" : Invalid argument
Error initializing device pci-assign
Since I usually prefer to have my computer do the thinking for me, I figured
it might be a good idea to check and see if a device is actually used by a
driver. If so, tell the user.
So with this patch applied you get the following output:
Failed to assign device "04:00.0" : Device or resource busy
*** The driver 'igb' is occupying your device 04:00.0.
***
*** You can try the following commands to free it:
***
*** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/new_id
*** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/igb/unbind
*** $ echo "0000:04:00.0" > /sys/bus/pci/drivers/pci-stub/bind
*** $ echo "8086 150a" > /sys/bus/pci/drivers/pci-stub/remove_id
***
Failed to deassign device "04:00.0" : Invalid argument
Error initializing device pci-assign
That should keep people like me from doing the most obvious misuses :-).
CC: Daniel P. Berrange <berrange@redhat.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
v1 -> v2:
- add more helpful guidance thanks to Daniel Berrange
v2 -> v3:
- clear name variable before using it, thus 0-terminating the string
- fix region numbers
- use correct unbind/bind names
v3 -> v4:
- split id retrieval part
- mst comments
v4 -> v5:
- address chris' comments
---
hw/device-assignment.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 566671c..d6d44eb 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -756,6 +756,54 @@ static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
return (uint32_t)bus << 8 | (uint32_t)devfn;
}
+static void assign_failed_examine(AssignedDevice *dev)
+{
+ char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
+ uint16_t vendor_id, device_id;
+ int r;
+
+ /* XXX implement multidomain */
+ sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/",
+ dev->host.bus, dev->host.dev, dev->host.func);
+
+ sprintf(name, "%sdriver", dir);
+
+ r = readlink(name, driver, sizeof(driver));
+ if ((r <= 0) || r >= sizeof(driver) || !(ns = strrchr(driver, '/'))) {
+ goto fail;
+ }
+
+ ns++;
+
+ if (get_real_vendor_id(dir, &vendor_id) ||
+ get_real_device_id(dir, &device_id)) {
+ goto fail;
+ }
+
+ fprintf(stderr, "*** The driver '%s' is occupying your device "
+ "%02x:%02x.%x.\n",
+ ns, dev->host.bus, dev->host.dev, dev->host.func);
+ fprintf(stderr, "***\n");
+ fprintf(stderr, "*** You can try the following commands to free it:\n");
+ fprintf(stderr, "***\n");
+ fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
+ "new_id\n", vendor_id, device_id);
+ fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+ "%s/unbind\n",
+ dev->host.bus, dev->host.dev, dev->host.func, ns);
+ fprintf(stderr, "*** $ echo \"0000:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
+ "pci-stub/bind\n",
+ dev->host.bus, dev->host.dev, dev->host.func);
+ fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
+ "/remove_id\n", vendor_id, device_id);
+ fprintf(stderr, "***\n");
+
+ return;
+
+fail:
+ fprintf(stderr, "Couldn't find out why.\n");
+}
+
static int assign_device(AssignedDevice *dev)
{
struct kvm_assigned_pci_dev assigned_dev_data;
@@ -781,9 +829,18 @@ static int assign_device(AssignedDevice *dev)
#endif
r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
- if (r < 0)
- fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
+ if (r < 0) {
+ fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
dev->dev.qdev.id, strerror(-r));
+
+ switch (r) {
+ case -EBUSY:
+ assign_failed_examine(dev);
+ break;
+ default:
+ break;
+ }
+ }
return r;
}
--
1.6.0.2
^ permalink raw reply related [flat|nested] 25+ messages in thread