[RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions
@ 2020-09-25 13:48 Eric Auger
  2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Eric Auger @ 2020-09-25 13:48 UTC (permalink / raw)
  To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
	philmd, alex.williamson
  Cc: lvivier, kwolf, cohuck, mreitz

The current IOVA allocator allocates within the [0x10000, 1ULL << 39]
window, without paying attention to the host IOVA reserved regions.
This prevents NVMe passthtrough from working on ARM as the fixed
IOVAs rapidly grow up to the MSI reserved region [0x8000000, 0x8100000]
causing some VFIO MAP DMA failures. This series collects the usable
IOVA regions using VFIO GET_INFO (this requires the host to support
VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) and rework the fixed and
temporary IOVA allocators to avoid those latter. Also the min/max
IOVAs now can be dynamically determined.

Unfortunately the usable host IOVA ranges reported by the kernel
currently do not take into account the dma_mask of devices within
the group. This needs to be fixed, otherwise this series might try
to allocate temporary IOVAs within the range supported by the IOMMU
but beyond the allowed dma_mask. I got the case where the SMMU
supports up to 48 bits but the dma_mask only is 42bits. This kernel
dependency plus the testing limited to one peculiar ARM platform
explains the RFC state.

Best Regards

Eric

This series can be found at:
https://github.com/eauger/qemu/tree/nvme_rfc

This was tested on ARM only.

Eric Auger (3):
  util/vfio-helpers: Collect IOVA reserved regions
  util/vfio-helpers: Dynamically compute the min/max IOVA
  util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved
    regions

 util/vfio-helpers.c | 162 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 143 insertions(+), 19 deletions(-)

-- 
2.21.3

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 13:48 [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions Eric Auger
@ 2020-09-25 13:48 ` Eric Auger
  2020-09-25 14:43   ` Fam Zheng
  2020-09-25 13:48 ` [RFC 2/3] util/vfio-helpers: Dynamically compute the min/max IOVA Eric Auger
  2020-09-25 13:48 ` [RFC 3/3] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions Eric Auger
  2 siblings, 1 reply; 8+ messages in thread
From: Eric Auger @ 2020-09-25 13:48 UTC (permalink / raw)
  To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
	philmd, alex.williamson
  Cc: lvivier, kwolf, cohuck, mreitz

The IOVA allocator currently ignores host reserved regions.
As a result some chosen IOVAs may collide with some of them,
resulting in VFIO MAP_DMA errors later on. This happens on ARM
where the MSI reserved window quickly is encountered:
[0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
IOVA regions. So let's enumerate them in the prospect to avoid
them, later on.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 583bdfb36f..8e91beba95 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -40,6 +40,11 @@ typedef struct {
     uint64_t iova;
 } IOVAMapping;
 
+struct IOVARange {
+    uint64_t start;
+    uint64_t end;
+};
+
 struct QEMUVFIOState {
     QemuMutex lock;
 
@@ -49,6 +54,8 @@ struct QEMUVFIOState {
     int device;
     RAMBlockNotifier ram_notifier;
     struct vfio_region_info config_region_info, bar_region_info[6];
+    struct IOVARange *usable_iova_ranges;
+    uint8_t nb_iova_ranges;
 
     /* These fields are protected by @lock */
     /* VFIO's IO virtual address space is managed by splitting into a few
@@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
     return ret == size ? 0 : -errno;
 }
 
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
+{
+    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+    struct vfio_info_cap_header *cap = first_cap;
+    void *offset = first_cap;
+    int i;
+
+    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+        if (cap->next) {
+            return;
+        }
+        offset += cap->next;
+        cap = (struct vfio_info_cap_header *)cap;
+    }
+
+    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+    s->nb_iova_ranges = cap_iova_range->nr_iovas;
+    if (s->nb_iova_ranges > 1) {
+        s->usable_iova_ranges =
+            g_realloc(s->usable_iova_ranges,
+                      s->nb_iova_ranges * sizeof(struct IOVARange));
+    }
+
+    for (i = 0; i <  s->nb_iova_ranges; i++) {
+        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+    }
+}
+
 static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                               Error **errp)
 {
@@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
     int i;
     uint16_t pci_cmd;
     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
-    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+    struct vfio_iommu_type1_info *iommu_info = NULL;
+    size_t iommu_info_size = sizeof(*iommu_info);
     struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
     char *group_file = NULL;
 
+    s->usable_iova_ranges = NULL;
+
     /* Create a new container */
     s->container = open("/dev/vfio/vfio", O_RDWR);
 
@@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
         goto fail;
     }
 
+    iommu_info = g_malloc0(iommu_info_size);
+    iommu_info->argsz = iommu_info_size;
+
     /* Get additional IOMMU info */
-    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
         error_setg_errno(errp, errno, "Failed to get IOMMU info");
         ret = -errno;
         goto fail;
     }
 
+    /*
+     * if the kernel does not report usable IOVA regions, choose
+     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+     */
+    s->nb_iova_ranges = 1;
+    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+    if (iommu_info->argsz > iommu_info_size) {
+        void *first_cap;
+
+        iommu_info_size = iommu_info->argsz;
+        iommu_info = g_realloc(iommu_info, iommu_info_size);
+        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+            ret = -errno;
+            goto fail;
+        }
+        first_cap = (void *)iommu_info + iommu_info->cap_offset;
+        collect_usable_iova_ranges(s, first_cap);
+    }
+
     s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
 
     if (s->device < 0) {
@@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
     if (ret) {
         goto fail;
     }
+    g_free(iommu_info);
     return 0;
 fail:
+    g_free(s->usable_iova_ranges);
+    s->nb_iova_ranges = 0;
+    g_free(iommu_info);
     close(s->group);
 fail_container:
     close(s->container);
@@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
         qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
     }
     ram_block_notifier_remove(&s->ram_notifier);
+    g_free(s->usable_iova_ranges);
+    s->nb_iova_ranges = 0;
     qemu_vfio_reset(s);
     close(s->device);
     close(s->group);
-- 
2.21.3



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
@ 2020-09-25 14:43   ` Fam Zheng
  2020-09-25 15:23     ` Auger Eric
  0 siblings, 1 reply; 8+ messages in thread
From: Fam Zheng @ 2020-09-25 14:43 UTC (permalink / raw)
  To: Eric Auger
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro

On 2020-09-25 15:48, Eric Auger wrote:
> The IOVA allocator currently ignores host reserved regions.
> As a result some chosen IOVAs may collide with some of them,
> resulting in VFIO MAP_DMA errors later on. This happens on ARM
> where the MSI reserved window quickly is encountered:
> [0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
> IOVA regions. So let's enumerate them in the prospect to avoid
> them, later on.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> ---
>  util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 73 insertions(+), 2 deletions(-)
> 
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index 583bdfb36f..8e91beba95 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -40,6 +40,11 @@ typedef struct {
>      uint64_t iova;
>  } IOVAMapping;
>  
> +struct IOVARange {
> +    uint64_t start;
> +    uint64_t end;
> +};
> +
>  struct QEMUVFIOState {
>      QemuMutex lock;
>  
> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
>      int device;
>      RAMBlockNotifier ram_notifier;
>      struct vfio_region_info config_region_info, bar_region_info[6];
> +    struct IOVARange *usable_iova_ranges;
> +    uint8_t nb_iova_ranges;
>  
>      /* These fields are protected by @lock */
>      /* VFIO's IO virtual address space is managed by splitting into a few
> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
>      return ret == size ? 0 : -errno;
>  }
>  
> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
> +{
> +    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
> +    struct vfio_info_cap_header *cap = first_cap;
> +    void *offset = first_cap;
> +    int i;
> +
> +    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
> +        if (cap->next) {

This check looks reversed.

> +            return;
> +        }
> +        offset += cap->next;

@offset is unused.

> +        cap = (struct vfio_info_cap_header *)cap;

This assignment is nop.

> +    }
> +
> +    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
> +
> +    s->nb_iova_ranges = cap_iova_range->nr_iovas;
> +    if (s->nb_iova_ranges > 1) {
> +        s->usable_iova_ranges =
> +            g_realloc(s->usable_iova_ranges,
> +                      s->nb_iova_ranges * sizeof(struct IOVARange));
> +    }
> +
> +    for (i = 0; i <  s->nb_iova_ranges; i++) {

s/  / /

> +        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
> +        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
> +    }
> +}
> +
>  static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>                                Error **errp)
>  {
> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>      int i;
>      uint16_t pci_cmd;
>      struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
> -    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
> +    struct vfio_iommu_type1_info *iommu_info = NULL;
> +    size_t iommu_info_size = sizeof(*iommu_info);
>      struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
>      char *group_file = NULL;
>  
> +    s->usable_iova_ranges = NULL;
> +
>      /* Create a new container */
>      s->container = open("/dev/vfio/vfio", O_RDWR);
>  
> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>          goto fail;
>      }
>  
> +    iommu_info = g_malloc0(iommu_info_size);
> +    iommu_info->argsz = iommu_info_size;
> +
>      /* Get additional IOMMU info */
> -    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
> +    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>          error_setg_errno(errp, errno, "Failed to get IOMMU info");
>          ret = -errno;
>          goto fail;
>      }
>  
> +    /*
> +     * if the kernel does not report usable IOVA regions, choose
> +     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
> +     */
> +    s->nb_iova_ranges = 1;
> +    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
> +    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
> +    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
> +
> +    if (iommu_info->argsz > iommu_info_size) {
> +        void *first_cap;
> +
> +        iommu_info_size = iommu_info->argsz;
> +        iommu_info = g_realloc(iommu_info, iommu_info_size);
> +        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
> +            ret = -errno;
> +            goto fail;
> +        }
> +        first_cap = (void *)iommu_info + iommu_info->cap_offset;
> +        collect_usable_iova_ranges(s, first_cap);
> +    }
> +
>      s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
>  
>      if (s->device < 0) {
> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>      if (ret) {
>          goto fail;
>      }
> +    g_free(iommu_info);
>      return 0;
>  fail:
> +    g_free(s->usable_iova_ranges);

Set s->usable_iova_ranges to NULL to avoid double free?

> +    s->nb_iova_ranges = 0;
> +    g_free(iommu_info);
>      close(s->group);
>  fail_container:
>      close(s->container);
> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>      }
>      ram_block_notifier_remove(&s->ram_notifier);
> +    g_free(s->usable_iova_ranges);
> +    s->nb_iova_ranges = 0;
>      qemu_vfio_reset(s);
>      close(s->device);
>      close(s->group);
> -- 
> 2.21.3
> 
> 

Fam


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 14:43   ` Fam Zheng
@ 2020-09-25 15:23     ` Auger Eric
  2020-09-25 15:44       ` Fam Zheng
  0 siblings, 1 reply; 8+ messages in thread
From: Auger Eric @ 2020-09-25 15:23 UTC (permalink / raw)
  To: Fam Zheng
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro

Hi Fam,

On 9/25/20 4:43 PM, Fam Zheng wrote:
> On 2020-09-25 15:48, Eric Auger wrote:
>> The IOVA allocator currently ignores host reserved regions.
>> As a result some chosen IOVAs may collide with some of them,
>> resulting in VFIO MAP_DMA errors later on. This happens on ARM
>> where the MSI reserved window quickly is encountered:
>> [0x8000000, 0x8100000]. since 5.4 kernel, VFIO returns the usable
>> IOVA regions. So let's enumerate them in the prospect to avoid
>> them, later on.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>> ---
>>  util/vfio-helpers.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 73 insertions(+), 2 deletions(-)
>>
>> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
>> index 583bdfb36f..8e91beba95 100644
>> --- a/util/vfio-helpers.c
>> +++ b/util/vfio-helpers.c
>> @@ -40,6 +40,11 @@ typedef struct {
>>      uint64_t iova;
>>  } IOVAMapping;
>>  
>> +struct IOVARange {
>> +    uint64_t start;
>> +    uint64_t end;
>> +};
>> +
>>  struct QEMUVFIOState {
>>      QemuMutex lock;
>>  
>> @@ -49,6 +54,8 @@ struct QEMUVFIOState {
>>      int device;
>>      RAMBlockNotifier ram_notifier;
>>      struct vfio_region_info config_region_info, bar_region_info[6];
>> +    struct IOVARange *usable_iova_ranges;
>> +    uint8_t nb_iova_ranges;
>>  
>>      /* These fields are protected by @lock */
>>      /* VFIO's IO virtual address space is managed by splitting into a few
>> @@ -236,6 +243,36 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
>>      return ret == size ? 0 : -errno;
>>  }
>>  
>> +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
>> +{
>> +    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
>> +    struct vfio_info_cap_header *cap = first_cap;
>> +    void *offset = first_cap;
>> +    int i;
>> +
>> +    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
>> +        if (cap->next) {
> 
> This check looks reversed.
> 
>> +            return;
>> +        }
>> +        offset += cap->next;
> 
> @offset is unused.
> 
>> +        cap = (struct vfio_info_cap_header *)cap;
> 
> This assignment is nop.
ugh indeed, that loop implementation is totally crap. I will test the
rewriting by adding an extra cap on kernel side.
> 
>> +    }
>> +
>> +    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
>> +
>> +    s->nb_iova_ranges = cap_iova_range->nr_iovas;
>> +    if (s->nb_iova_ranges > 1) {
>> +        s->usable_iova_ranges =
>> +            g_realloc(s->usable_iova_ranges,
>> +                      s->nb_iova_ranges * sizeof(struct IOVARange));
>> +    }
>> +
>> +    for (i = 0; i <  s->nb_iova_ranges; i++) {
> 
> s/  / /
ok
> 
>> +        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
>> +        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
>> +    }
>> +}
>> +
>>  static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>                                Error **errp)
>>  {
>> @@ -243,10 +280,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>      int i;
>>      uint16_t pci_cmd;
>>      struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
>> -    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
>> +    struct vfio_iommu_type1_info *iommu_info = NULL;
>> +    size_t iommu_info_size = sizeof(*iommu_info);
>>      struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
>>      char *group_file = NULL;
>>  
>> +    s->usable_iova_ranges = NULL;
>> +
>>      /* Create a new container */
>>      s->container = open("/dev/vfio/vfio", O_RDWR);
>>  
>> @@ -310,13 +350,38 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>          goto fail;
>>      }
>>  
>> +    iommu_info = g_malloc0(iommu_info_size);
>> +    iommu_info->argsz = iommu_info_size;
>> +
>>      /* Get additional IOMMU info */
>> -    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
>> +    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>>          error_setg_errno(errp, errno, "Failed to get IOMMU info");
>>          ret = -errno;
>>          goto fail;
>>      }
>>  
>> +    /*
>> +     * if the kernel does not report usable IOVA regions, choose
>> +     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
>> +     */
>> +    s->nb_iova_ranges = 1;
>> +    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
>> +    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
>> +    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
>> +
>> +    if (iommu_info->argsz > iommu_info_size) {
>> +        void *first_cap;
>> +
>> +        iommu_info_size = iommu_info->argsz;
>> +        iommu_info = g_realloc(iommu_info, iommu_info_size);
>> +        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
>> +            ret = -errno;
>> +            goto fail;
>> +        }
>> +        first_cap = (void *)iommu_info + iommu_info->cap_offset;
>> +        collect_usable_iova_ranges(s, first_cap);
>> +    }
>> +
>>      s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
>>  
>>      if (s->device < 0) {
>> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
>>      if (ret) {
>>          goto fail;
>>      }
>> +    g_free(iommu_info);
>>      return 0;
>>  fail:
>> +    g_free(s->usable_iova_ranges);
> 
> Set s->usable_iova_ranges to NULL to avoid double free?
I think I did at the beginning of qemu_vfio_init_pci()

Thanks

Eric
> 
>> +    s->nb_iova_ranges = 0;
>> +    g_free(iommu_info);
>>      close(s->group);
>>  fail_container:
>>      close(s->container);
>> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>>          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>>      }
>>      ram_block_notifier_remove(&s->ram_notifier);
>> +    g_free(s->usable_iova_ranges);
>> +    s->nb_iova_ranges = 0;
>>      qemu_vfio_reset(s);
>>      close(s->device);
>>      close(s->group);
>> -- 
>> 2.21.3
>>
>>
> 
> Fam
> 



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 15:23     ` Auger Eric
@ 2020-09-25 15:44       ` Fam Zheng
  2020-09-25 15:53         ` Auger Eric
  0 siblings, 1 reply; 8+ messages in thread
From: Fam Zheng @ 2020-09-25 15:44 UTC (permalink / raw)
  To: Auger Eric
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro

On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
> > > @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState
> > > *s, const char *device,
> > >      if (ret) {
> > >          goto fail;
> > >      }
> > > +    g_free(iommu_info);
> > >      return 0;
> > >  fail:
> > > +    g_free(s->usable_iova_ranges);
> > 
> > Set s->usable_iova_ranges to NULL to avoid double free?
> 
> I think I did at the beginning of qemu_vfio_init_pci()

Yes, but I mean clearing the pointer will make calling
qemu_vfio_close() safe, there is also a g_free() on this one.

Fam

> 
> Thanks
> 
> Eric
> > 
> > > +    s->nb_iova_ranges = 0;
> > > +    g_free(iommu_info);
> > >      close(s->group);
> > >  fail_container:
> > >      close(s->container);
> > > @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
> > >          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
> > >      }
> > >      ram_block_notifier_remove(&s->ram_notifier);
> > > +    g_free(s->usable_iova_ranges);
> > > +    s->nb_iova_ranges = 0;
> > >      qemu_vfio_reset(s);
> > >      close(s->device);
> > >      close(s->group);
> > > -- 
> > > 2.21.3
> > > 
> > > 
> > 
> > Fam
> > 
> 
> 



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 1/3] util/vfio-helpers: Collect IOVA reserved regions
  2020-09-25 15:44       ` Fam Zheng
@ 2020-09-25 15:53         ` Auger Eric
  0 siblings, 0 replies; 8+ messages in thread
From: Auger Eric @ 2020-09-25 15:53 UTC (permalink / raw)
  To: Fam Zheng
  Cc: lvivier, kwolf, cohuck, qemu-devel, mreitz, alex.williamson,
	qemu-arm, stefanha, philmd, eric.auger.pro

Hi Fam,

On 9/25/20 5:44 PM, Fam Zheng wrote:
> On Fri, 2020-09-25 at 17:23 +0200, Auger Eric wrote:
>>>> @@ -365,8 +430,12 @@ static int qemu_vfio_init_pci(QEMUVFIOState
>>>> *s, const char *device,
>>>>      if (ret) {
>>>>          goto fail;
>>>>      }
>>>> +    g_free(iommu_info);
>>>>      return 0;
>>>>  fail:
>>>> +    g_free(s->usable_iova_ranges);
>>>
>>> Set s->usable_iova_ranges to NULL to avoid double free?
>>
>> I think I did at the beginning of qemu_vfio_init_pci()
> 
> Yes, but I mean clearing the pointer will make calling
> qemu_vfio_close() safe, there is also a g_free() on this one.
Oh yes, got it.

Thank you for the review.

Best Regards

Eric
> 
> Fam
> 
>>
>> Thanks
>>
>> Eric
>>>
>>>> +    s->nb_iova_ranges = 0;
>>>> +    g_free(iommu_info);
>>>>      close(s->group);
>>>>  fail_container:
>>>>      close(s->container);
>>>> @@ -716,6 +785,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
>>>>          qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
>>>>      }
>>>>      ram_block_notifier_remove(&s->ram_notifier);
>>>> +    g_free(s->usable_iova_ranges);
>>>> +    s->nb_iova_ranges = 0;
>>>>      qemu_vfio_reset(s);
>>>>      close(s->device);
>>>>      close(s->group);
>>>> -- 
>>>> 2.21.3
>>>>
>>>>
>>>
>>> Fam
>>>
>>
>>
> 



^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC 2/3] util/vfio-helpers: Dynamically compute the min/max IOVA
  2020-09-25 13:48 [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions Eric Auger
  2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
@ 2020-09-25 13:48 ` Eric Auger
  2020-09-25 13:48 ` [RFC 3/3] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions Eric Auger
  2 siblings, 0 replies; 8+ messages in thread
From: Eric Auger @ 2020-09-25 13:48 UTC (permalink / raw)
  To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
	philmd, alex.williamson
  Cc: lvivier, kwolf, cohuck, mreitz

Currently the min/max IOVA are hardcoded to [0x10000, 1 << 39].
Now we dynamically fetch the info from VFIO, if the kernel supports
it, let's use the dynamically retrieved value.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 8e91beba95..567bcf1ded 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -26,11 +26,11 @@
 
 #define QEMU_VFIO_DEBUG 0
 
+/*
+ * Min/Max IOVA addresses, only used if VFIO does not report
+ * the usable IOVA ranges
+ */
 #define QEMU_VFIO_IOVA_MIN 0x10000ULL
-/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
- * we can use a runtime limit; alternatively it's also possible to do platform
- * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
- **/
 #define QEMU_VFIO_IOVA_MAX (1ULL << 39)
 
 typedef struct {
@@ -56,6 +56,8 @@ struct QEMUVFIOState {
     struct vfio_region_info config_region_info, bar_region_info[6];
     struct IOVARange *usable_iova_ranges;
     uint8_t nb_iova_ranges;
+    uint64_t max_iova;
+    uint64_t min_iova;
 
     /* These fields are protected by @lock */
     /* VFIO's IO virtual address space is managed by splitting into a few
@@ -63,7 +65,7 @@ struct QEMUVFIOState {
      *
      * ---------------       <= 0
      * |xxxxxxxxxxxxx|
-     * |-------------|       <= QEMU_VFIO_IOVA_MIN
+     * |-------------|       <= min_iova
      * |             |
      * |    Fixed    |
      * |             |
@@ -75,20 +77,20 @@ struct QEMUVFIOState {
      * |             |
      * |    Temp     |
      * |             |
-     * |-------------|       <= QEMU_VFIO_IOVA_MAX
+     * |-------------|       <= max_iova
      * |xxxxxxxxxxxxx|
      * |xxxxxxxxxxxxx|
      * ---------------
      *
-     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
+     * - Addresses lower than min_iova are reserved as invalid;
      *
      * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
-     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
+     *   [min_iova, low_water_mark).  Once allocated they will not be
      *   reclaimed - low_water_mark never shrinks;
      *
      * - IOVAs in range [low_water_mark, high_water_mark) are free;
      *
-     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
+     * - IOVAs in range [high_water_mark, max_iova) are volatile
      *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
      *   is recycled. The caller should make sure I/O's depending on these
      *   mappings are completed before calling.
@@ -271,6 +273,8 @@ static void collect_usable_iova_ranges(QEMUVFIOState *s, void *first_cap)
         s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
         s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
     }
+    s->min_iova = s->usable_iova_ranges[0].start;
+    s->max_iova = s->usable_iova_ranges[i - 1].end + 1;
 }
 
 static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
@@ -362,12 +366,14 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
 
     /*
      * if the kernel does not report usable IOVA regions, choose
-     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region
      */
     s->nb_iova_ranges = 1;
     s->usable_iova_ranges = g_new0(struct IOVARange, 1);
     s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
     s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+    s->min_iova = QEMU_VFIO_IOVA_MIN;
+    s->max_iova = QEMU_VFIO_IOVA_MAX;
 
     if (iommu_info->argsz > iommu_info_size) {
         void *first_cap;
@@ -484,8 +490,8 @@ static void qemu_vfio_open_common(QEMUVFIOState *s)
     s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
     s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
     ram_block_notifier_add(&s->ram_notifier);
-    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
-    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    s->low_water_mark = s->min_iova;
+    s->high_water_mark = s->max_iova;
     qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
 }
 
@@ -734,7 +740,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
         .argsz = sizeof(unmap),
         .flags = 0,
         .iova = s->high_water_mark,
-        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
+        .size = s->max_iova - s->high_water_mark,
     };
     trace_qemu_vfio_dma_reset_temporary(s);
     QEMU_LOCK_GUARD(&s->lock);
@@ -742,7 +748,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
         error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
         return -errno;
     }
-    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    s->high_water_mark = s->max_iova;
     return 0;
 }
 
-- 
2.21.3



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [RFC 3/3] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions
  2020-09-25 13:48 [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions Eric Auger
  2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
  2020-09-25 13:48 ` [RFC 2/3] util/vfio-helpers: Dynamically compute the min/max IOVA Eric Auger
@ 2020-09-25 13:48 ` Eric Auger
  2 siblings, 0 replies; 8+ messages in thread
From: Eric Auger @ 2020-09-25 13:48 UTC (permalink / raw)
  To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
	philmd, alex.williamson
  Cc: lvivier, kwolf, cohuck, mreitz

Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 util/vfio-helpers.c | 55 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 567bcf1ded..c98d0d882e 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -676,6 +676,48 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
     return true;
 }
 
+static int
+qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+    int i;
+
+    for (i = 0; i < s->nb_iova_ranges; i++) {
+        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+            continue;
+        }
+        s->low_water_mark =
+            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size) {
+            *iova = s->low_water_mark;
+            s->low_water_mark += size;
+            return 0;
+        }
+    }
+    return -ENOMEM;
+}
+
+static int
+qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+    int i;
+
+    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+            continue;
+        }
+        s->high_water_mark =
+            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size) {
+            *iova = s->high_water_mark - size;
+            s->high_water_mark = *iova;
+            return 0;
+        }
+    }
+    return -ENOMEM;
+}
+
 /* Map [host, host + size) area into a contiguous IOVA address space, and store
  * the result in @iova if not NULL. The caller need to make sure the area is
  * aligned to page size, and mustn't overlap with existing mapping areas (split
@@ -702,7 +744,11 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
             goto out;
         }
         if (!temporary) {
-            iova0 = s->low_water_mark;
+            if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
+                ret = -ENOMEM;
+                goto out;
+            }
+
             mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
             if (!mapping) {
                 ret = -ENOMEM;
@@ -714,15 +760,16 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                 qemu_vfio_undo_mapping(s, mapping, NULL);
                 goto out;
             }
-            s->low_water_mark += size;
             qemu_vfio_dump_mappings(s);
         } else {
-            iova0 = s->high_water_mark - size;
+            if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
+                ret = -ENOMEM;
+                goto out;
+            }
             ret = qemu_vfio_do_mapping(s, host, size, iova0);
             if (ret) {
                 goto out;
             }
-            s->high_water_mark -= size;
         }
     }
     if (iova) {
-- 
2.21.3



^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2020-09-25 16:12 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-09-25 13:48 [RFC 0/3] NVMe passthrough: Take into account host IOVA reserved regions Eric Auger
2020-09-25 13:48 ` [RFC 1/3] util/vfio-helpers: Collect " Eric Auger
2020-09-25 14:43   ` Fam Zheng
2020-09-25 15:23     ` Auger Eric
2020-09-25 15:44       ` Fam Zheng
2020-09-25 15:53         ` Auger Eric
2020-09-25 13:48 ` [RFC 2/3] util/vfio-helpers: Dynamically compute the min/max IOVA Eric Auger
2020-09-25 13:48 ` [RFC 3/3] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions Eric Auger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).