- * [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 13:48 [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions Eric Auger
@ 2020-09-25 13:48 ` Eric Auger
  2020-09-25 14:43   ` Fam Zheng
  2020-09-25 13:48 ` [RFC 2/3] util/vfio-helpers: Dynamically compute the min/max IOVA Eric Auger
  2020-09-25 13:48 ` [RFC 3/3] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions Eric Auger
  2 siblings, 1 reply; 8+ messages in thread
From: Eric Auger @ 2020-09-25 13:48 UTC (permalink / raw)
  To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
	philmd, alex.williamson
  Cc: lvivier, kwolf, cohuck, mreitz
The IOVA allocator currently ignores host reserved regions.
As a result some chosen IOVAs may collide with some of them,
resulting in VFIO MAP_DMA errors later on. This happens on ARM
where the MSI reserved window quickly is encountered:
[0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
IOVA regions. So let's enumerate them in the prospect to avoid
them, later on.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 2 deletions(-)
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 583bdfb36f..8e91beba95 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -40,6 +40,11 @@ typedef struct {
     uint64_t iova;
 } IOVAMapping;
 
+struct IOVARange {
+    uint64_t start;
+    uint64_t end;
+};
+
 struct QEMUVFIOState {
     QemuMutex lock;
 
@@ -49,6 +54,8 @@ struct QEMUVFIOState {
     int device;
     RAMBlockNotifier ram_notifier;
     struct vfio_region_info config_region_info, bar_region_info[6];
+    struct IOVARange *usable_iova_ranges;
+    uint8_t nb_iova_ranges;
 
     /* These fields are protected by @lock */
     /* VFIO's IO virtual address space is managed by splitting into a few
@@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
     return ret == size ? 0 : -errno;
 }
 
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
+{
+    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+    struct vfio_info_cap_header *cap = first_cap;
+    void *offset = first_cap;
+    int i;
+
+    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+        if (cap->next) {
+            return;
+        }
+        offset += cap->next;
+        cap = (struct vfio_info_cap_header *)cap;
+    }
+
+    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+    s->nb_iova_ranges = cap_iova_range->nr_iovas;
+    if (s->nb_iova_ranges > 1) {
+        s->usable_iova_ranges =
+            g_realloc(s->usable_iova_ranges,
+                      s->nb_iova_ranges * sizeof(struct IOVARange));
+    }
+
+    for (i = 0; i <  s->nb_iova_ranges; i++) {
+        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+    }
+}
+
 static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                               Error **errp)
 {
@@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
     int i;
     uint16_t pci_cmd;
     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
-    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+    struct vfio_iommu_type1_info *iommu_info = NULL;
+    size_t iommu_info_size = sizeof(*iommu_info);
     struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
     char *group_file = NULL;
 
+    s->usable_iova_ranges = NULL;
+
     /* Create a new container */
     s->container = open("/dev/vfio/vfio", O_RDWR);
 
@@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
         goto fail;
     }
 
+    iommu_info = g_malloc0(iommu_info_size);
+    iommu_info->argsz = iommu_info_size;
+
     /* Get additional IOMMU info */
-    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
         error_setg_errno(errp, errno, "Failed to get IOMMU info");
         ret = -errno;
         goto fail;
     }
 
+    /*
+     * if the kernel does not report usable IOVA regions, choose
+     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+     */
+    s->nb_iova_ranges = 1;
+    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+    if (iommu_info->argsz > iommu_info_size) {
+        void *first_cap;
+
+        iommu_info_size = iommu_info->argsz;
+        iommu_info = g_realloc(iommu_info, iommu_info_size);
+        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+            ret = -errno;
+            goto fail;
+        }
+        first_cap = (void *)iommu_info + iommu_info->cap_offset;
+        collect_usable_iova_ranges(s, first_cap);
+    }
+
     s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
 
     if (s->device < 0) {
@@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
     if (ret) {
         goto fail;
     }
+    g_free(iommu_info);
     return 0;
 fail:
+    g_free(s->usable_iova_ranges);
+    s->nb_iova_ranges = 0;
+    g_free(iommu_info);
     close(s->group);
 fail_container:
     close(s->container);
@@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
         qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
     }
     ram_block_notifier_remove(&s->ram_notifier);
+    g_free(s->usable_iova_ranges);
+    s->nb_iova_ranges = 0;
     qemu_vfio_reset(s);
     close(s->device);
     close(s->group);
-- 
2.21.3
^ permalink raw reply related	[flat|nested] 8+ messages in thread
- * Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
@ 2020-09-25 14:43   ` Fam Zheng
  2020-09-25 15:23     ` Auger Eric
  0 siblings, 1 reply; 8+ messages in thread
From: Fam Zheng @ 2020-09-25 14:43 UTC (permalink / raw)
  To: Eric Auger
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro
On 2020-09-25 15:48, Eric Auger wrote:
> The IOVA allocator currently ignores host reserved regions.
> As a result some chosen IOVAs may collide with some of them,
> resulting in VFIO MAP_DMA errors later on. This happens on ARM
> where the MSI reserved window quickly is encountered:
> [0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
> IOVA regions. So let's enumerate them in the prospect to avoid
> them, later on.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> ---
>  util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 73 insertions(+), 2 deletions(-)
> 
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index 583bdfb36f..8e91beba95 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -40,6 +40,11 @@ typedef struct {
>      uint64_t iova;
>  } IOVAMapping;
>  
> +struct IOVARange {
> +    uint64_t start;
> +    uint64_t end;
> +};
> +
>  struct QEMUVFIOState {
>      QemuMutex lock;
>  
> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
>      int device;
>      RAMBlockNotifier ram_notifier;
>      struct vfio_region_info config_region_info, bar_region_info[6];
> +    struct IOVARange *usable_iova_ranges;
> +    uint8_t nb_iova_ranges;
>  
>      /* These fields are protected by @lock */
>      /* VFIO's IO virtual address space is managed by splitting into a few
> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
>      return ret == size ? 0 : -errno;
>  }
>  
> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
> +{
> +    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
> +    struct vfio_info_cap_header *cap = first_cap;
> +    void *offset = first_cap;
> +    int i;
> +
> +    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
> +        if (cap->next) {
This check looks reversed.
> +            return;
> +        }
> +        offset += cap->next;
@offset is unused.
> +        cap = (struct vfio_info_cap_header *)cap;
This assignment is nop.
> +    }
> +
> +    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
> +
> +    s->nb_iova_ranges = cap_iova_range->nr_iovas;
> +    if (s->nb_iova_ranges > 1) {
> +        s->usable_iova_ranges =
> +            g_realloc(s->usable_iova_ranges,
> +                      s->nb_iova_ranges * sizeof(struct IOVARange));
> +    }
> +
> +    for (i = 0; i <  s->nb_iova_ranges; i++) {
s/  / /
> +        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
> +        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
> +    }
> +}
> +
>  static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>                                Error **errp)
>  {
> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>      int i;
>      uint16_t pci_cmd;
>      struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
> -    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
> +    struct vfio_iommu_type1_info *iommu_info = NULL;
> +    size_t iommu_info_size = sizeof(*iommu_info);
>      struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
>      char *group_file = NULL;
>  
> +    s->usable_iova_ranges = NULL;
> +
>      /* Create a new container */
>      s->container = open("/dev/vfio/vfio", O_RDWR);
>  
> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>          goto fail;
>      }
>  
> +    iommu_info = g_malloc0(iommu_info_size);
> +    iommu_info->argsz = iommu_info_size;
> +
>      /* Get additional IOMMU info */
> -    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
> +    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>          error_setg_errno(errp, errno, "Failed to get IOMMU info");
>          ret = -errno;
>          goto fail;
>      }
>  
> +    /*
> +     * if the kernel does not report usable IOVA regions, choose
> +     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
> +     */
> +    s->nb_iova_ranges = 1;
> +    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
> +    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
> +    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
> +
> +    if (iommu_info->argsz > iommu_info_size) {
> +        void *first_cap;
> +
> +        iommu_info_size = iommu_info->argsz;
> +        iommu_info = g_realloc(iommu_info, iommu_info_size);
> +        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
> +            ret = -errno;
> +            goto fail;
> +        }
> +        first_cap = (void *)iommu_info + iommu_info->cap_offset;
> +        collect_usable_iova_ranges(s, first_cap);
> +    }
> +
>      s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
>  
>      if (s->device < 0) {
> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>      if (ret) {
>          goto fail;
>      }
> +    g_free(iommu_info);
>      return 0;
>  fail:
> +    g_free(s->usable_iova_ranges);
Set s->usable_iova_ranges to NULL to avoid double free?
> +    s->nb_iova_ranges = 0;
> +    g_free(iommu_info);
>      close(s->group);
>  fail_container:
>      close(s->container);
> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>      }
>      ram_block_notifier_remove(&s->ram_notifier);
> +    g_free(s->usable_iova_ranges);
> +    s->nb_iova_ranges = 0;
>      qemu_vfio_reset(s);
>      close(s->device);
>      close(s->group);
> -- 
> 2.21.3
> 
> 
Fam
^ permalink raw reply	[flat|nested] 8+ messages in thread
- * Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 14:43   ` Fam Zheng
@ 2020-09-25 15:23     ` Auger Eric
  2020-09-25 15:44       ` Fam Zheng
  0 siblings, 1 reply; 8+ messages in thread
From: Auger Eric @ 2020-09-25 15:23 UTC (permalink / raw)
  To: Fam Zheng
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro
Hi Fam,
On 9/25/20 4:43 PM, Fam Zheng wrote:
> On 2020-09-25 15:48, Eric Auger wrote:
>> The IOVA allocator currently ignores host reserved regions.
>> As a result some chosen IOVAs may collide with some of them,
>> resulting in VFIO MAP_DMA errors later on. This happens on ARM
>> where the MSI reserved window quickly is encountered:
>> [0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
>> IOVA regions. So let's enumerate them in the prospect to avoid
>> them, later on.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> ---
>>  util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 73 insertions(+), 2 deletions(-)
>>
>> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
>> index 583bdfb36f..8e91beba95 100644
>> --- a/util/vfio-helpers.c
>> +++ b/util/vfio-helpers.c
>> @@ -40,6 +40,11 @@ typedef struct {
>>      uint64_t iova;
>>  } IOVAMapping;
>>  
>> +struct IOVARange {
>> +    uint64_t start;
>> +    uint64_t end;
>> +};
>> +
>>  struct QEMUVFIOState {
>>      QemuMutex lock;
>>  
>> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
>>      int device;
>>      RAMBlockNotifier ram_notifier;
>>      struct vfio_region_info config_region_info, bar_region_info[6];
>> +    struct IOVARange *usable_iova_ranges;
>> +    uint8_t nb_iova_ranges;
>>  
>>      /* These fields are protected by @lock */
>>      /* VFIO's IO virtual address space is managed by splitting into a few
>> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
>>      return ret == size ? 0 : -errno;
>>  }
>>  
>> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
>> +{
>> +    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
>> +    struct vfio_info_cap_header *cap = first_cap;
>> +    void *offset = first_cap;
>> +    int i;
>> +
>> +    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
>> +        if (cap->next) {
> 
> This check looks reversed.
> 
>> +            return;
>> +        }
>> +        offset += cap->next;
> 
> @offset is unused.
> 
>> +        cap = (struct vfio_info_cap_header *)cap;
> 
> This assignment is nop.
ugh indeed, that loop implementation is totally crap. I will test the
rewriting by adding an extra cap on kernel side.
> 
>> +    }
>> +
>> +    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
>> +
>> +    s->nb_iova_ranges = cap_iova_range->nr_iovas;
>> +    if (s->nb_iova_ranges > 1) {
>> +        s->usable_iova_ranges =
>> +            g_realloc(s->usable_iova_ranges,
>> +                      s->nb_iova_ranges * sizeof(struct IOVARange));
>> +    }
>> +
>> +    for (i = 0; i <  s->nb_iova_ranges; i++) {
> 
> s/  / /
ok
> 
>> +        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
>> +        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
>> +    }
>> +}
>> +
>>  static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>                                Error **errp)
>>  {
>> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>      int i;
>>      uint16_t pci_cmd;
>>      struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
>> -    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
>> +    struct vfio_iommu_type1_info *iommu_info = NULL;
>> +    size_t iommu_info_size = sizeof(*iommu_info);
>>      struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
>>      char *group_file = NULL;
>>  
>> +    s->usable_iova_ranges = NULL;
>> +
>>      /* Create a new container */
>>      s->container = open("/dev/vfio/vfio", O_RDWR);
>>  
>> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>          goto fail;
>>      }
>>  
>> +    iommu_info = g_malloc0(iommu_info_size);
>> +    iommu_info->argsz = iommu_info_size;
>> +
>>      /* Get additional IOMMU info */
>> -    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
>> +    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>>          error_setg_errno(errp, errno, "Failed to get IOMMU info");
>>          ret = -errno;
>>          goto fail;
>>      }
>>  
>> +    /*
>> +     * if the kernel does not report usable IOVA regions, choose
>> +     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
>> +     */
>> +    s->nb_iova_ranges = 1;
>> +    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
>> +    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
>> +    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
>> +
>> +    if (iommu_info->argsz > iommu_info_size) {
>> +        void *first_cap;
>> +
>> +        iommu_info_size = iommu_info->argsz;
>> +        iommu_info = g_realloc(iommu_info, iommu_info_size);
>> +        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>> +            ret = -errno;
>> +            goto fail;
>> +        }
>> +        first_cap = (void *)iommu_info + iommu_info->cap_offset;
>> +        collect_usable_iova_ranges(s, first_cap);
>> +    }
>> +
>>      s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
>>  
>>      if (s->device < 0) {
>> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>      if (ret) {
>>          goto fail;
>>      }
>> +    g_free(iommu_info);
>>      return 0;
>>  fail:
>> +    g_free(s->usable_iova_ranges);
> 
> Set s->usable_iova_ranges to NULL to avoid double free?
I think I did at the beginning of qemu_vfio_init_pci()
Thanks
Eric
> 
>> +    s->nb_iova_ranges = 0;
>> +    g_free(iommu_info);
>>      close(s->group);
>>  fail_container:
>>      close(s->container);
>> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>>          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>>      }
>>      ram_block_notifier_remove(&s->ram_notifier);
>> +    g_free(s->usable_iova_ranges);
>> +    s->nb_iova_ranges = 0;
>>      qemu_vfio_reset(s);
>>      close(s->device);
>>      close(s->group);
>> -- 
>> 2.21.3
>>
>>
> 
> Fam
> 
^ permalink raw reply	[flat|nested] 8+ messages in thread
- * Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 15:23     ` Auger Eric
@ 2020-09-25 15:44       ` Fam Zheng
  2020-09-25 15:53         ` Auger Eric
  0 siblings, 1 reply; 8+ messages in thread
From: Fam Zheng @ 2020-09-25 15:44 UTC (permalink / raw)
  To: Auger Eric
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro
On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
> > > @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState
> > > *s, const char *device,
> > >      if (ret) {
> > >          goto fail;
> > >      }
> > > +    g_free(iommu_info);
> > >      return 0;
> > >  fail:
> > > +    g_free(s->usable_iova_ranges);
> > 
> > Set s->usable_iova_ranges to NULL to avoid double free?
> 
> I think I did at the beginning of qemu_vfio_init_pci()
Yes, but I mean clearing the pointer will make calling
qemu_vfio_close() safe, there is also a g_free() on this one.
Fam
> 
> Thanks
> 
> Eric
> > 
> > > +    s->nb_iova_ranges = 0;
> > > +    g_free(iommu_info);
> > >      close(s->group);
> > >  fail_container:
> > >      close(s->container);
> > > @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
> > >          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
> > >      }
> > >      ram_block_notifier_remove(&s->ram_notifier);
> > > +    g_free(s->usable_iova_ranges);
> > > +    s->nb_iova_ranges = 0;
> > >      qemu_vfio_reset(s);
> > >      close(s->device);
> > >      close(s->group);
> > > -- 
> > > 2.21.3
> > > 
> > > 
> > 
> > Fam
> > 
> 
> 
^ permalink raw reply	[flat|nested] 8+ messages in thread
- * Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 15:44       ` Fam Zheng
@ 2020-09-25 15:53         ` Auger Eric
  0 siblings, 0 replies; 8+ messages in thread
From: Auger Eric @ 2020-09-25 15:53 UTC (permalink / raw)
  To: Fam Zheng
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro
Hi Fam,
On 9/25/20 5:44 PM, Fam Zheng wrote:
> On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
>>>> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState
>>>> *s, const char *device,
>>>>      if (ret) {
>>>>          goto fail;
>>>>      }
>>>> +    g_free(iommu_info);
>>>>      return 0;
>>>>  fail:
>>>> +    g_free(s->usable_iova_ranges);
>>>
>>> Set s->usable_iova_ranges to NULL to avoid double free?
>>
>> I think I did at the beginning of qemu_vfio_init_pci()
> 
> Yes, but I mean clearing the pointer will make calling
> qemu_vfio_close() safe, there is also a g_free() on this one.
Oh yes, got it.
Thank you for the review.
Best Regards
Eric
> 
> Fam
> 
>>
>> Thanks
>>
>> Eric
>>>
>>>> +    s->nb_iova_ranges = 0;
>>>> +    g_free(iommu_info);
>>>>      close(s->group);
>>>>  fail_container:
>>>>      close(s->container);
>>>> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>>>>          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>>>>      }
>>>>      ram_block_notifier_remove(&s->ram_notifier);
>>>> +    g_free(s->usable_iova_ranges);
>>>> +    s->nb_iova_ranges = 0;
>>>>      qemu_vfio_reset(s);
>>>>      close(s->device);
>>>>      close(s->group);
>>>> -- 
>>>> 2.21.3
>>>>
>>>>
>>>
>>> Fam
>>>
>>
>>
> 
^ permalink raw reply	[flat|nested] 8+ messages in thread
 
 
 
 
- * [RFC 2/3] util/vfio-helpers: Dynamically compute the min/max IOVA
  2020-09-25 13:48 [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions Eric Auger
  2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
@ 2020-09-25 13:48 ` Eric Auger
  2020-09-25 13:48 ` [RFC 3/3] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions Eric Auger
  2 siblings, 0 replies; 8+ messages in thread
From: Eric Auger @ 2020-09-25 13:48 UTC (permalink / raw)
  To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
	philmd, alex.williamson
  Cc: lvivier, kwolf, cohuck, mreitz
Currently the min/max IOVA are hardcoded to [0x10000, 1 << 39].
Now we dynamically fetch the info from VFIO, if the kernel supports
it, let's use the dynamically retrieved value.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 8e91beba95..567bcf1ded 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -26,11 +26,11 @@
 
 #define QEMU_VFIO_DEBUG 0
 
+/*
+ * Min/Max IOVA addresses, only used if VFIO does not report
+ * the usable IOVA ranges
+ */
 #define QEMU_VFIO_IOVA_MIN 0x10000ULL
-/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
- * we can use a runtime limit; alternatively it's also possible to do platform
- * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
- **/
 #define QEMU_VFIO_IOVA_MAX (1ULL << 39)
 
 typedef struct {
@@ -56,6 +56,8 @@ struct QEMUVFIOState {
     struct vfio_region_info config_region_info, bar_region_info[6];
     struct IOVARange *usable_iova_ranges;
     uint8_t nb_iova_ranges;
+    uint64_t max_iova;
+    uint64_t min_iova;
 
     /* These fields are protected by @lock */
     /* VFIO's IO virtual address space is managed by splitting into a few
@@ -63,7 +65,7 @@ struct QEMUVFIOState {
      *
      * ---------------       <= 0
      * |xxxxxxxxxxxxx|
-     * |-------------|       <= QEMU_VFIO_IOVA_MIN
+     * |-------------|       <= min_iova
      * |             |
      * |    Fixed    |
      * |             |
@@ -75,20 +77,20 @@ struct QEMUVFIOState {
      * |             |
      * |    Temp     |
      * |             |
-     * |-------------|       <= QEMU_VFIO_IOVA_MAX
+     * |-------------|       <= max_iova
      * |xxxxxxxxxxxxx|
      * |xxxxxxxxxxxxx|
      * ---------------
      *
-     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
+     * - Addresses lower than min_iova are reserved as invalid;
      *
      * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
-     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
+     *   [min_iova, low_water_mark).  Once allocated they will not be
      *   reclaimed - low_water_mark never shrinks;
      *
      * - IOVAs in range [low_water_mark, high_water_mark) are free;
      *
-     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
+     * - IOVAs in range [high_water_mark, max_iova) are volatile
      *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
      *   is recycled. The caller should make sure I/O's depending on these
      *   mappings are completed before calling.
@@ -271,6 +273,8 @@ static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
         s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
         s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
     }
+    s->min_iova = s->usable_iova_ranges[0].start;
+    s->max_iova = s->usable_iova_ranges[i - 1].end + 1;
 }
 
 static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
@@ -362,12 +366,14 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
 
     /*
      * if the kernel does not report usable IOVA regions, choose
-     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region
      */
     s->nb_iova_ranges = 1;
     s->usable_iova_ranges = g_new0(struct IOVARange, 1);
     s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
     s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+    s->min_iova = QEMU_VFIO_IOVA_MIN;
+    s->max_iova = QEMU_VFIO_IOVA_MAX;
 
     if (iommu_info->argsz > iommu_info_size) {
         void *first_cap;
@@ -484,8 +490,8 @@ static void qemu_vfio_open_common(QEMUVFIOState *s)
     s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
     s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
     ram_block_notifier_add(&s->ram_notifier);
-    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
-    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    s->low_water_mark = s->min_iova;
+    s->high_water_mark = s->max_iova;
     qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
 }
 
@@ -734,7 +740,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
         .argsz = sizeof(unmap),
         .flags = 0,
         .iova = s->high_water_mark,
-        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
+        .size = s->max_iova - s->high_water_mark,
     };
     trace_qemu_vfio_dma_reset_temporary(s);
     QEMU_LOCK_GUARD(&s->lock);
@@ -742,7 +748,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
         return -errno;
     }
-    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    s->high_water_mark = s->max_iova;
     return 0;
 }
 
-- 
2.21.3
^ permalink raw reply related	[flat|nested] 8+ messages in thread
- * [RFC 3/3] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions
  2020-09-25 13:48 [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions Eric Auger
  2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
  2020-09-25 13:48 ` [RFC 2/3] util/vfio-helpers: Dynamically compute the min/max IOVA Eric Auger
@ 2020-09-25 13:48 ` Eric Auger
  2 siblings, 0 replies; 8+ messages in thread
From: Eric Auger @ 2020-09-25 13:48 UTC (permalink / raw)
  To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
	philmd, alex.williamson
  Cc: lvivier, kwolf, cohuck, mreitz
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 55 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 4 deletions(-)
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 567bcf1ded..c98d0d882e 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -676,6 +676,48 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
     return true;
 }
 
+static int
+qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+    int i;
+
+    for (i = 0; i < s->nb_iova_ranges; i++) {
+        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+            continue;
+        }
+        s->low_water_mark =
+            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size) {
+            *iova = s->low_water_mark;
+            s->low_water_mark += size;
+            return 0;
+        }
+    }
+    return -ENOMEM;
+}
+
+static int
+qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+    int i;
+
+    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+            continue;
+        }
+        s->high_water_mark =
+            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size) {
+            *iova = s->high_water_mark - size;
+            s->high_water_mark = *iova;
+            return 0;
+        }
+    }
+    return -ENOMEM;
+}
+
 /* Map [host, host + size) area into a contiguous IOVA address space, and store
  * the result in @iova if not NULL. The caller need to make sure the area is
  * aligned to page size, and mustn't overlap with existing mapping areas (split
@@ -702,7 +744,11 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
             goto out;
         }
         if (!temporary) {
-            iova0 = s->low_water_mark;
+            if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
+                ret = -ENOMEM;
+                goto out;
+            }
+
             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
             if (!mapping) {
                 ret = -ENOMEM;
@@ -714,15 +760,16 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                 qemu_vfio_undo_mapping(s, mapping, NULL);
                 goto out;
             }
-            s->low_water_mark += size;
             qemu_vfio_dump_mappings(s);
         } else {
-            iova0 = s->high_water_mark - size;
+            if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
+                ret = -ENOMEM;
+                goto out;
+            }
             ret = qemu_vfio_do_mapping(s, host, size, iova0);
             if (ret) {
                 goto out;
             }
-            s->high_water_mark -= size;
         }
     }
     if (iova) {
-- 
2.21.3
^ permalink raw reply related	[flat|nested] 8+ messages in thread