* [PATCH v2 1/2] util/vfio-helpers: Collect IOVA reserved regions
From: Eric Auger @ 2020-09-29 8:55 UTC
To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
philmd, alex.williamson
Cc: lvivier, kwolf, cohuck, mreitz
The IOVA allocator currently ignores host reserved regions.
As a result, some chosen IOVAs may collide with some of them,
resulting in VFIO MAP_DMA errors later on. This happens on ARM,
where the MSI reserved window [0x8000000, 0x8100000] is quickly
encountered. Since kernel 5.4, VFIO reports the usable IOVA
regions, so let's enumerate them with the prospect of avoiding
them later on.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
v1 -> v2:
- fix the loop on caps
- set s->usable_iova_ranges=NULL to avoid double free
---
util/vfio-helpers.c | 72 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 70 insertions(+), 2 deletions(-)
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 583bdfb36f..ba0ee6e21c 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -40,6 +40,11 @@ typedef struct {
uint64_t iova;
} IOVAMapping;
+struct IOVARange {
+ uint64_t start;
+ uint64_t end;
+};
+
struct QEMUVFIOState {
QemuMutex lock;
@@ -49,6 +54,8 @@ struct QEMUVFIOState {
int device;
RAMBlockNotifier ram_notifier;
struct vfio_region_info config_region_info, bar_region_info[6];
+ struct IOVARange *usable_iova_ranges;
+ uint8_t nb_iova_ranges;
/* These fields are protected by @lock */
/* VFIO's IO virtual address space is managed by splitting into a few
@@ -236,6 +243,35 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
return ret == size ? 0 : -errno;
}
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
+{
+ struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
+ struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+ int i;
+
+ while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+ if (!cap->next) {
+ return;
+ }
+ cap = (struct vfio_info_cap_header *)(buf + cap->next);
+ }
+
+ cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+ s->nb_iova_ranges = cap_iova_range->nr_iovas;
+ if (s->nb_iova_ranges > 1) {
+ s->usable_iova_ranges =
+ g_realloc(s->usable_iova_ranges,
+ s->nb_iova_ranges * sizeof(struct IOVARange));
+ }
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+ s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+ }
+}
+
static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
Error **errp)
{
@@ -243,10 +279,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
int i;
uint16_t pci_cmd;
struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
- struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+ struct vfio_iommu_type1_info *iommu_info = NULL;
+ size_t iommu_info_size = sizeof(*iommu_info);
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
char *group_file = NULL;
+ s->usable_iova_ranges = NULL;
+
/* Create a new container */
s->container = open("/dev/vfio/vfio", O_RDWR);
@@ -310,13 +349,35 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
goto fail;
}
+ iommu_info = g_malloc0(iommu_info_size);
+ iommu_info->argsz = iommu_info_size;
+
/* Get additional IOMMU info */
- if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
error_setg_errno(errp, errno, "Failed to get IOMMU info");
ret = -errno;
goto fail;
}
+ /*
+ * if the kernel does not report usable IOVA regions, choose
+ * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+ */
+ s->nb_iova_ranges = 1;
+ s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+ s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+ s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+ if (iommu_info->argsz > iommu_info_size) {
+ iommu_info_size = iommu_info->argsz;
+ iommu_info = g_realloc(iommu_info, iommu_info_size);
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ ret = -errno;
+ goto fail;
+ }
+ collect_usable_iova_ranges(s, iommu_info);
+ }
+
s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
if (s->device < 0) {
@@ -365,8 +426,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
if (ret) {
goto fail;
}
+ g_free(iommu_info);
return 0;
fail:
+ g_free(s->usable_iova_ranges);
+ s->usable_iova_ranges = NULL;
+ s->nb_iova_ranges = 0;
+ g_free(iommu_info);
close(s->group);
fail_container:
close(s->container);
@@ -716,6 +782,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
}
ram_block_notifier_remove(&s->ram_notifier);
+ g_free(s->usable_iova_ranges);
+ s->nb_iova_ranges = 0;
qemu_vfio_reset(s);
close(s->device);
close(s->group);
--
2.21.3
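For readers less familiar with the VFIO capability chain walked by
collect_usable_iova_ranges() above, here is a minimal self-contained sketch of
the same argsz-discovery plus capability-chain walk. It is not part of the
series: it assumes linux-headers >= 5.4 and a container fd that is already
open and attached to a TYPE1 IOMMU, and error/NULL handling is trimmed for
brevity.

/*
 * Minimal sketch (not part of the series): print the usable IOVA ranges of
 * a VFIO container using the same two-step VFIO_IOMMU_GET_INFO pattern as
 * collect_usable_iova_ranges() above.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int dump_usable_iova_ranges(int container)
{
    struct vfio_iommu_type1_info *info;
    struct vfio_info_cap_header *hdr;
    size_t argsz = sizeof(*info);
    int ret = -ENOENT;

    /* First call: the kernel reports how large the buffer needs to be. */
    info = calloc(1, argsz);
    info->argsz = argsz;
    if (ioctl(container, VFIO_IOMMU_GET_INFO, info)) {
        goto out_errno;
    }

    if (info->argsz > argsz) {
        /* Second call with room for the capability chain. */
        argsz = info->argsz;
        info = realloc(info, argsz);
        memset(info, 0, argsz);
        info->argsz = argsz;
        if (ioctl(container, VFIO_IOMMU_GET_INFO, info)) {
            goto out_errno;
        }
    }

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset) {
        goto out;   /* kernel too old: no capability chain reported */
    }

    /* Walk the chain until the IOVA range capability is found. */
    hdr = (struct vfio_info_cap_header *)((char *)info + info->cap_offset);
    for (;;) {
        if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
            struct vfio_iommu_type1_info_cap_iova_range *cap = (void *)hdr;
            for (unsigned i = 0; i < cap->nr_iovas; i++) {
                printf("usable IOVA range %u: 0x%llx..0x%llx\n", i,
                       (unsigned long long)cap->iova_ranges[i].start,
                       (unsigned long long)cap->iova_ranges[i].end);
            }
            ret = 0;
            goto out;
        }
        if (!hdr->next) {
            goto out;
        }
        hdr = (struct vfio_info_cap_header *)((char *)info + hdr->next);
    }

out_errno:
    ret = -errno;
out:
    free(info);
    return ret;
}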
* [PATCH v2 2/2] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions
From: Eric Auger @ 2020-09-29 8:55 UTC
To: eric.auger.pro, eric.auger, qemu-devel, qemu-arm, stefanha, fam,
philmd, alex.williamson
Cc: lvivier, kwolf, cohuck, mreitz
Introduce the qemu_vfio_find_fixed/temp_iova helpers which
respectively allocate IOVAs from the bottom/top parts of the
usable IOVA range, without picking within host IOVA reserved
windows. The allocation remains basic: if the size is too big
for the remainder of the current usable IOVA range, we jump
to the next one, leaving a hole in the address map.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
v1 -> v2:
- handle possible wrap
---
util/vfio-helpers.c | 57 +++++++++++++++++++++++++++++++++++++++++----
1 file changed, 53 insertions(+), 4 deletions(-)
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index ba0ee6e21c..71145970f3 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -667,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
return true;
}
+static int
+qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+ int i;
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+ continue;
+ }
+ s->low_water_mark =
+ MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+ if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
+ s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
+ *iova = s->low_water_mark;
+ s->low_water_mark += size;
+ return 0;
+ }
+ }
+ return -ENOMEM;
+}
+
+static int
+qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+ int i;
+
+ for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+ if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+ continue;
+ }
+ s->high_water_mark =
+ MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+ if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
+ s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
+ *iova = s->high_water_mark - size;
+ s->high_water_mark = *iova;
+ return 0;
+ }
+ }
+ return -ENOMEM;
+}
+
/* Map [host, host + size) area into a contiguous IOVA address space, and store
* the result in @iova if not NULL. The caller need to make sure the area is
* aligned to page size, and mustn't overlap with existing mapping areas (split
@@ -693,7 +737,11 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
goto out;
}
if (!temporary) {
- iova0 = s->low_water_mark;
+ if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
if (!mapping) {
ret = -ENOMEM;
@@ -705,15 +753,16 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
qemu_vfio_undo_mapping(s, mapping, NULL);
goto out;
}
- s->low_water_mark += size;
qemu_vfio_dump_mappings(s);
} else {
- iova0 = s->high_water_mark - size;
+ if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = qemu_vfio_do_mapping(s, host, size, iova0);
if (ret) {
goto out;
}
- s->high_water_mark -= size;
}
}
if (iova) {
--
2.21.3
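To see the allocation policy described above in isolation, here is a small
self-contained toy model. It is illustrative only (made-up range bounds and
request sizes, only the bottom-up "fixed" case, no VFIO or QEMU code): it
reproduces the skip-and-jump behaviour of qemu_vfio_find_fixed_iova() around
a reserved window.

/*
 * Toy model (not QEMU code): same bottom-up skip/jump logic as
 * qemu_vfio_find_fixed_iova(), with example ranges straddling the
 * ARM MSI reserved window at 0x8000000.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t start, end; } Range;

static const Range usable[] = {
    { 0x10000,   0x7ffffff },         /* below the MSI reserved window  */
    { 0x8100000, (1ULL << 39) - 1 },  /* above it, up to the legacy max */
};
#define NB_RANGES (sizeof(usable) / sizeof(usable[0]))

static uint64_t low_water_mark = 0x10000;

static int find_fixed_iova(uint64_t size, uint64_t *iova)
{
    for (unsigned i = 0; i < NB_RANGES; i++) {
        if (usable[i].end < low_water_mark) {
            continue;
        }
        if (low_water_mark < usable[i].start) {
            low_water_mark = usable[i].start;     /* jump, leaving a hole */
        }
        if (usable[i].end - low_water_mark + 1 >= size ||
            usable[i].end - low_water_mark + 1 == 0) {
            *iova = low_water_mark;
            low_water_mark += size;
            return 0;
        }
    }
    return -1;
}

int main(void)
{
    uint64_t iova;

    /* Consume most of the first usable range... */
    find_fixed_iova(0x7fe0000, &iova);
    printf("first:  0x%" PRIx64 "\n", iova);      /* 0x10000 */

    /* ...then a request too large for what is left of it lands past the
     * reserved window, in the second usable range. */
    find_fixed_iova(0x200000, &iova);
    printf("second: 0x%" PRIx64 "\n", iova);      /* 0x8100000 */
    return 0;
}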
* Re: [PATCH v2 2/2] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions
From: Stefan Hajnoczi @ 2020-09-29 15:59 UTC
To: Eric Auger
Cc: fam, lvivier, cohuck, qemu-devel, mreitz, alex.williamson,
qemu-arm, kwolf, philmd, eric.auger.pro
On Tue, Sep 29, 2020 at 10:55:50AM +0200, Eric Auger wrote:
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index ba0ee6e21c..71145970f3 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -667,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
> return true;
> }
>
> +static int
> +qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
> +{
> + int i;
> +
> + for (i = 0; i < s->nb_iova_ranges; i++) {
> + if (s->usable_iova_ranges[i].end < s->low_water_mark) {
> + continue;
> + }
> + s->low_water_mark =
> + MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
> +
> + if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
> + s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
I don't understand the == 0 case. It seems like we are allocating an
IOVA beyond usable_iova_ranges[i].end?
* Re: [PATCH v2 2/2] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions
From: Auger Eric @ 2020-09-29 19:44 UTC
To: Stefan Hajnoczi
Cc: fam, lvivier, cohuck, qemu-devel, mreitz, alex.williamson,
qemu-arm, kwolf, philmd, eric.auger.pro
Hi Stefan,
On 9/29/20 5:59 PM, Stefan Hajnoczi wrote:
> On Tue, Sep 29, 2020 at 10:55:50AM +0200, Eric Auger wrote:
>> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
>> index ba0ee6e21c..71145970f3 100644
>> --- a/util/vfio-helpers.c
>> +++ b/util/vfio-helpers.c
>> @@ -667,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
>> return true;
>> }
>>
>> +static int
>> +qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < s->nb_iova_ranges; i++) {
>> + if (s->usable_iova_ranges[i].end < s->low_water_mark) {
>> + continue;
>> + }
>> + s->low_water_mark =
>> + MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
>> +
>> + if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
>> + s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
>
> I don't understand the == 0 case. It seems like we are allocating an
> IOVA beyond usable_iova_ranges[i].end?>
It is meant to handle the case where low_water_mark = 0 and
s->usable_iova_ranges[0].end = ULLONG_MAX (I know it cannot exist at the
moment but may happen in the future), where the subtraction wraps around.
Given the
    if (s->usable_iova_ranges[i].end < s->low_water_mark) {
        continue;
    }
I think this prevents us from allocating beyond
usable_iova_ranges[i].end, or am I missing something?
Thanks
Eric
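The wrap Eric describes can be reproduced in a few standalone lines
(hypothetical snippet, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t low_water_mark = 0;
    uint64_t end = UINT64_MAX;  /* a usable range covering the whole space */

    /*
     * end - low_water_mark + 1 is the size of the range; for the full
     * 64-bit space it wraps to 0, so the "== 0" test re-admits that
     * (maximal) range instead of rejecting it as too small.
     */
    assert(end - low_water_mark + 1 == 0);
    return 0;
}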
* Re: [PATCH v2 2/2] util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved regions
From: Stefan Hajnoczi @ 2020-09-30 9:18 UTC
To: Auger Eric
Cc: fam, lvivier, cohuck, qemu-devel, mreitz, alex.williamson,
qemu-arm, kwolf, philmd, eric.auger.pro
On Tue, Sep 29, 2020 at 09:44:48PM +0200, Auger Eric wrote:
> Hi Stefan,
>
> On 9/29/20 5:59 PM, Stefan Hajnoczi wrote:
> > On Tue, Sep 29, 2020 at 10:55:50AM +0200, Eric Auger wrote:
> >> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> >> index ba0ee6e21c..71145970f3 100644
> >> --- a/util/vfio-helpers.c
> >> +++ b/util/vfio-helpers.c
> >> @@ -667,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
> >> return true;
> >> }
> >>
> >> +static int
> >> +qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
> >> +{
> >> + int i;
> >> +
> >> + for (i = 0; i < s->nb_iova_ranges; i++) {
> >> + if (s->usable_iova_ranges[i].end < s->low_water_mark) {
> >> + continue;
> >> + }
> >> + s->low_water_mark =
> >> + MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
> >> +
> >> + if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
> >> + s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
> >
> > I don't understand the == 0 case. It seems like we are allocating an
> > IOVA beyond usable_iova_ranges[i].end?>
> It is meant to handle the case were low_water_mark = 0 and
> s->usable_iova_ranges[0].end = ULLONG_MAX (I know it cannot exist at the
> moment but may happen in the future) where we get an overflow. Given the
> if (s->usable_iova_ranges[i].end < s->low_water_mark) {
> continue;
> }
> I think this prevents us from allocating beyond
> usable_iova_ranges[i].end or do I miss something?
Yes, you are right. Here are the constraints, writing l for low_water_mark,
st for usable_iova_ranges[i].start, e for usable_iova_ranges[i].end and
sz for size:

  e >= l
  j = max(l, st)
  e - j + 1 < sz
  e - j + 1 == 0

Assume l >= st so we can replace j with l:

  e >= l
  e - l + 1 < sz
  e - l + 1 == 0

The case I'm worried about is when the IOVA range cannot fit sz bytes.
The last condition is only true when e = l - 1, but this violates the
first condition e >= l.
So the problem scenario cannot occur.
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
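Stefan's case analysis can also be checked by brute force on a scaled-down
integer type; a hypothetical sketch (not part of the series) using uint8_t
as a stand-in for uint64_t:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /*
     * For every pair with e >= l, the wrapped span (uint8_t)(e - l + 1)
     * is 0 only when the range is the entire space (l == 0, e == 255),
     * which can never be too small for a request.
     */
    for (unsigned l = 0; l <= UINT8_MAX; l++) {
        for (unsigned e = l; e <= UINT8_MAX; e++) {
            if ((uint8_t)(e - l + 1) == 0) {
                assert(l == 0 && e == UINT8_MAX);
            }
        }
    }
    return 0;
}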
* Re: [PATCH v2 0/2] NVMe passthrough: Take into account host IOVA reserved regions
From: Stefan Hajnoczi @ 2020-09-30 9:23 UTC
To: Eric Auger
Cc: fam, lvivier, cohuck, qemu-devel, mreitz, alex.williamson,
qemu-arm, kwolf, philmd, eric.auger.pro
On Tue, Sep 29, 2020 at 10:55:48AM +0200, Eric Auger wrote:
> The current IOVA allocator allocates within the [0x10000, 1ULL << 39]
> window, without paying attention to the host IOVA reserved regions.
> This prevents NVMe passthrough from working on ARM, as the fixed
> IOVAs rapidly grow up to the MSI reserved region [0x8000000, 0x8100000],
> causing VFIO MAP_DMA failures. This series collects the usable
> IOVA regions using VFIO_IOMMU_GET_INFO (this requires the host to
> support VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) and reworks the fixed
> and temporary IOVA allocators to avoid the reserved regions.
>
> For the time being we do not change the arbitrary min/max IOVAs.
> In theory they could be dynamically determined but the kernel
> currently fails to expose some HW limitations described in the ACPI
> tables (such as PCI root complex Device Memory Address Size Limit).
> See kernel thread related to "[RFC 0/3] iommu: Reserved regions for
> IOVAs beyond dma_mask and iommu aperture" for more details:
> https://lkml.org/lkml/2020/9/28/1102
>
> Best Regards
>
> Eric
>
> This series can be found at:
> https://github.com/eauger/qemu/tree/nvme_resv_v2
>
> This was tested on ARM only.
>
> History:
> v1 -> v2:
> - remove "util/vfio-helpers: Dynamically compute the min/max IOVA" to
> relax the kernel dependency
> - Fix capability enumeration loop
> - set s->usable_iova_ranges=NULL to avoid double free
> - handle possible u64 wrap
>
> Eric Auger (2):
> util/vfio-helpers: Collect IOVA reserved regions
> util/vfio-helpers: Rework the IOVA allocator to avoid IOVA reserved
> regions
>
> util/vfio-helpers.c | 129 +++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 123 insertions(+), 6 deletions(-)
>
> --
> 2.21.3
>
Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block
Stefan