* [PATCH v4 1/3] vfio/type1: sanitize for overflow using check_*_overflow
2025-10-13 5:32 [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit Alex Mastro
@ 2025-10-13 5:32 ` Alex Mastro
2025-10-13 5:32 ` [PATCH v4 2/3] vfio/type1: move iova increment to unmap_unpin_* caller Alex Mastro
From: Alex Mastro @ 2025-10-13 5:32 UTC (permalink / raw)
To: Alex Williamson
Cc: Jason Gunthorpe, Alejandro Jimenez, kvm, linux-kernel,
Alex Mastro
Adopt check_*_overflow functions to clearly express overflow check
intent.
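As a rough userspace illustration of the idiom (not kernel code; the helper
name and values below are made up, and the kernel's check_mul_overflow() /
check_add_overflow() from <linux/overflow.h> provide the same semantics as the
compiler builtins used here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace sketch of the overflow-checked bounds math. */
static bool iova_range_end(uint64_t iova, uint64_t npage, uint64_t pgsize,
                           uint64_t *end)
{
        uint64_t size;

        /* npage * pgsize must not overflow ... */
        if (__builtin_mul_overflow(npage, pgsize, &size))
                return false;
        /* ... and neither may the inclusive end, iova + size - 1. */
        if (__builtin_add_overflow(iova, size - 1, end))
                return false;
        return true;
}

int main(void)
{
        uint64_t end;

        /* One page at the very top of the 64-bit space is representable... */
        if (iova_range_end(0xfffffffffffff000ULL, 1, 4096, &end))
                printf("end = 0x%llx\n", (unsigned long long)end);

        /* ...but two pages there would wrap, so the helper rejects it. */
        if (!iova_range_end(0xfffffffffffff000ULL, 2, 4096, &end))
                printf("range overflows\n");

        return 0;
}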
Signed-off-by: Alex Mastro <amastro@fb.com>
---
drivers/vfio/vfio_iommu_type1.c | 86 ++++++++++++++++++++++++++++++-----------
1 file changed, 63 insertions(+), 23 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index f8d68fe77b41..1ac056b27f27 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -37,6 +37,7 @@
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
+#include <linux/overflow.h>
#include "vfio.h"
#define DRIVER_VERSION "0.2"
@@ -180,7 +181,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
}
static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
- dma_addr_t start, u64 size)
+ dma_addr_t start, size_t size)
{
struct rb_node *res = NULL;
struct rb_node *node = iommu->dma_list.rb_node;
@@ -825,14 +826,20 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
unsigned long remote_vaddr;
struct vfio_dma *dma;
bool do_accounting;
+ dma_addr_t iova_end;
+ size_t iova_size;
- if (!iommu || !pages)
+ if (!iommu || !pages || npage <= 0)
return -EINVAL;
/* Supported for v2 version only */
if (!iommu->v2)
return -EACCES;
+ if (check_mul_overflow(npage, PAGE_SIZE, &iova_size) ||
+ check_add_overflow(user_iova, iova_size - 1, &iova_end))
+ return -EOVERFLOW;
+
mutex_lock(&iommu->lock);
if (WARN_ONCE(iommu->vaddr_invalid_count,
@@ -938,12 +945,21 @@ static void vfio_iommu_type1_unpin_pages(void *iommu_data,
{
struct vfio_iommu *iommu = iommu_data;
bool do_accounting;
+ dma_addr_t iova_end;
+ size_t iova_size;
int i;
/* Supported for v2 version only */
if (WARN_ON(!iommu->v2))
return;
+ if (WARN_ON(npage <= 0))
+ return;
+
+ if (WARN_ON(check_mul_overflow(npage, PAGE_SIZE, &iova_size) ||
+ check_add_overflow(user_iova, iova_size - 1, &iova_end)))
+ return;
+
mutex_lock(&iommu->lock);
do_accounting = list_empty(&iommu->domain_list);
@@ -1304,7 +1320,8 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
int ret = -EINVAL, retries = 0;
unsigned long pgshift;
dma_addr_t iova = unmap->iova;
- u64 size = unmap->size;
+ dma_addr_t iova_end;
+ size_t size = unmap->size;
bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
struct rb_node *n, *first_n;
@@ -1317,6 +1334,11 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
+ if (iova != unmap->iova || size != unmap->size) {
+ ret = -EOVERFLOW;
+ goto unlock;
+ }
+
pgshift = __ffs(iommu->pgsize_bitmap);
pgsize = (size_t)1 << pgshift;
@@ -1326,10 +1348,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (unmap_all) {
if (iova || size)
goto unlock;
- size = U64_MAX;
- } else if (!size || size & (pgsize - 1) ||
- iova + size - 1 < iova || size > SIZE_MAX) {
- goto unlock;
+ size = SIZE_MAX;
+ } else {
+ if (!size || size & (pgsize - 1))
+ goto unlock;
+
+ if (check_add_overflow(iova, size - 1, &iova_end)) {
+ ret = -EOVERFLOW;
+ goto unlock;
+ }
}
/* When dirty tracking is enabled, allow only min supported pgsize */
@@ -1376,7 +1403,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma && dma->iova != iova)
goto unlock;
- dma = vfio_find_dma(iommu, iova + size - 1, 0);
+ dma = vfio_find_dma(iommu, iova_end, 0);
if (dma && dma->iova + dma->size != iova + size)
goto unlock;
}
@@ -1578,7 +1605,9 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
{
bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
dma_addr_t iova = map->iova;
+ dma_addr_t iova_end;
unsigned long vaddr = map->vaddr;
+ unsigned long vaddr_end;
size_t size = map->size;
int ret = 0, prot = 0;
size_t pgsize;
@@ -1586,8 +1615,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
/* Verify that none of our __u64 fields overflow */
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
+ return -EOVERFLOW;
+
+ if (!size)
return -EINVAL;
+ if (check_add_overflow(iova, size - 1, &iova_end) ||
+ check_add_overflow(vaddr, size - 1, &vaddr_end))
+ return -EOVERFLOW;
+
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
@@ -1603,13 +1639,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
WARN_ON((pgsize - 1) & PAGE_MASK);
- if (!size || (size | iova | vaddr) & (pgsize - 1)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* Don't allow IOVA or virtual address wrap */
- if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
+ if ((size | iova | vaddr) & (pgsize - 1)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1640,7 +1670,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
goto out_unlock;
}
- if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+ if (!vfio_iommu_iova_dma_valid(iommu, iova, iova_end)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -2907,7 +2937,8 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dirty_bitmap_get range;
unsigned long pgshift;
size_t data_size = dirty.argsz - minsz;
- size_t iommu_pgsize;
+ size_t size, iommu_pgsize;
+ dma_addr_t iova, iova_end;
if (!data_size || data_size < sizeof(range))
return -EINVAL;
@@ -2916,14 +2947,24 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
sizeof(range)))
return -EFAULT;
- if (range.iova + range.size < range.iova)
+ iova = range.iova;
+ size = range.size;
+
+ if (iova != range.iova || size != range.size)
+ return -EOVERFLOW;
+
+ if (!size)
return -EINVAL;
+
+ if (check_add_overflow(iova, size - 1, &iova_end))
+ return -EOVERFLOW;
+
if (!access_ok((void __user *)range.bitmap.data,
range.bitmap.size))
return -EINVAL;
pgshift = __ffs(range.bitmap.pgsize);
- ret = verify_bitmap_size(range.size >> pgshift,
+ ret = verify_bitmap_size(size >> pgshift,
range.bitmap.size);
if (ret)
return ret;
@@ -2937,19 +2978,18 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
ret = -EINVAL;
goto out_unlock;
}
- if (range.iova & (iommu_pgsize - 1)) {
+ if (iova & (iommu_pgsize - 1)) {
ret = -EINVAL;
goto out_unlock;
}
- if (!range.size || range.size & (iommu_pgsize - 1)) {
+ if (size & (iommu_pgsize - 1)) {
ret = -EINVAL;
goto out_unlock;
}
if (iommu->dirty_page_tracking)
ret = vfio_iova_dirty_bitmap(range.bitmap.data,
- iommu, range.iova,
- range.size,
+ iommu, iova, size,
range.bitmap.pgsize);
else
ret = -EINVAL;
--
2.47.3
* [PATCH v4 2/3] vfio/type1: move iova increment to unmap_unpin_* caller
2025-10-13 5:32 [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit Alex Mastro
2025-10-13 5:32 ` [PATCH v4 1/3] vfio/type1: sanitize for overflow using check_*_overflow Alex Mastro
@ 2025-10-13 5:32 ` Alex Mastro
2025-10-13 5:32 ` [PATCH v4 3/3] vfio/type1: handle DMA map/unmap up to the addressable limit Alex Mastro
From: Alex Mastro @ 2025-10-13 5:32 UTC (permalink / raw)
To: Alex Williamson
Cc: Jason Gunthorpe, Alejandro Jimenez, kvm, linux-kernel,
Alex Mastro
Move incrementing iova to the caller of these functions as part of
preparing to handle end of address space map/unmap.
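The shape of the change, as a standalone sketch with illustrative names: the
helper reports how much it handled, and the caller becomes the single place
that advances the cursor.

#include <stddef.h>
#include <stdio.h>

/* The helper no longer touches the caller's cursor; it only reports how
 * much it consumed this round. */
static size_t do_chunk(size_t remaining)
{
        return remaining < 16 ? remaining : 16;
}

int main(void)
{
        size_t size = 40, pos = 0;

        while (pos < size) {
                size_t done = do_chunk(size - pos);

                if (!done)
                        break;
                pos += done;    /* single advance point, in the caller */
        }
        printf("processed %zu of %zu bytes\n", pos, size);
        return 0;
}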
Signed-off-by: Alex Mastro <amastro@fb.com>
---
drivers/vfio/vfio_iommu_type1.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 1ac056b27f27..48b84a7af2e1 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1013,7 +1013,7 @@ static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
#define VFIO_IOMMU_TLB_SYNC_MAX 512
static size_t unmap_unpin_fast(struct vfio_domain *domain,
- struct vfio_dma *dma, dma_addr_t *iova,
+ struct vfio_dma *dma, dma_addr_t iova,
size_t len, phys_addr_t phys, long *unlocked,
struct list_head *unmapped_list,
int *unmapped_cnt,
@@ -1023,18 +1023,17 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain,
struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (entry) {
- unmapped = iommu_unmap_fast(domain->domain, *iova, len,
+ unmapped = iommu_unmap_fast(domain->domain, iova, len,
iotlb_gather);
if (!unmapped) {
kfree(entry);
} else {
- entry->iova = *iova;
+ entry->iova = iova;
entry->phys = phys;
entry->len = unmapped;
list_add_tail(&entry->list, unmapped_list);
- *iova += unmapped;
(*unmapped_cnt)++;
}
}
@@ -1053,18 +1052,17 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain,
}
static size_t unmap_unpin_slow(struct vfio_domain *domain,
- struct vfio_dma *dma, dma_addr_t *iova,
+ struct vfio_dma *dma, dma_addr_t iova,
size_t len, phys_addr_t phys,
long *unlocked)
{
- size_t unmapped = iommu_unmap(domain->domain, *iova, len);
+ size_t unmapped = iommu_unmap(domain->domain, iova, len);
if (unmapped) {
- *unlocked += vfio_unpin_pages_remote(dma, *iova,
+ *unlocked += vfio_unpin_pages_remote(dma, iova,
phys >> PAGE_SHIFT,
unmapped >> PAGE_SHIFT,
false);
- *iova += unmapped;
cond_resched();
}
return unmapped;
@@ -1127,16 +1125,18 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
* First, try to use fast unmap/unpin. In case of failure,
* switch to slow unmap/unpin path.
*/
- unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
+ unmapped = unmap_unpin_fast(domain, dma, iova, len, phys,
&unlocked, &unmapped_region_list,
&unmapped_region_cnt,
&iotlb_gather);
if (!unmapped) {
- unmapped = unmap_unpin_slow(domain, dma, &iova, len,
+ unmapped = unmap_unpin_slow(domain, dma, iova, len,
phys, &unlocked);
if (WARN_ON(!unmapped))
break;
}
+
+ iova += unmapped;
}
dma->iommu_mapped = false;
--
2.47.3
* [PATCH v4 3/3] vfio/type1: handle DMA map/unmap up to the addressable limit
2025-10-13 5:32 [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit Alex Mastro
2025-10-13 5:32 ` [PATCH v4 1/3] vfio/type1: sanitize for overflow using check_*_overflow Alex Mastro
2025-10-13 5:32 ` [PATCH v4 2/3] vfio/type1: move iova increment to unmap_unpin_* caller Alex Mastro
@ 2025-10-13 5:32 ` Alex Mastro
2025-10-21 22:18 ` Alejandro Jimenez
2025-10-15 19:24 ` [PATCH v4 0/3] vfio: " Alex Williamson
From: Alex Mastro @ 2025-10-13 5:32 UTC (permalink / raw)
To: Alex Williamson
Cc: Jason Gunthorpe, Alejandro Jimenez, kvm, linux-kernel,
Alex Mastro
Handle DMA map/unmap operations up to the addressable limit by comparing
against inclusive end-of-range limits, and changing iteration to
perform relative traversals across range sizes, rather than absolute
traversals across addresses.
vfio_link_dma inserts a zero-sized vfio_dma into the rb-tree, and is
only used for that purpose, so discard the size from consideration for
the insertion point.
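A standalone sketch of the difference between the two iteration styles (the
variable names mirror the driver, but nothing below is kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t iova = 0xfffffffffffff000ULL; /* last page of the space */
        uint64_t size = 0x1000;
        uint64_t end = iova + size - 1;        /* 0xffffffffffffffff, no wrap */
        int absolute = 0, relative = 0;

        /* Absolute, exclusive-end iteration: iova + size wraps to 0, so the
         * body never runs for a mapping that ends at the addressable limit. */
        for (uint64_t cur = iova; cur < iova + size; cur += 0x1000)
                absolute++;

        /* Relative iteration over the size, with an inclusive end used for
         * any address comparisons: the mapping is walked as expected. */
        for (uint64_t pos = 0; pos < size; pos += 0x1000)
                relative++;

        printf("absolute: %d iterations, relative: %d, end: 0x%llx\n",
               absolute, relative, (unsigned long long)end);
        return 0;
}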
Signed-off-by: Alex Mastro <amastro@fb.com>
---
drivers/vfio/vfio_iommu_type1.c | 77 ++++++++++++++++++++++-------------------
1 file changed, 42 insertions(+), 35 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 48b84a7af2e1..a65625dcf708 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -166,12 +166,14 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
{
struct rb_node *node = iommu->dma_list.rb_node;
+ WARN_ON(!size);
+
while (node) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
- if (start + size <= dma->iova)
+ if (start + size - 1 < dma->iova)
node = node->rb_left;
- else if (start >= dma->iova + dma->size)
+ else if (start > dma->iova + dma->size - 1)
node = node->rb_right;
else
return dma;
@@ -181,16 +183,19 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
}
static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
- dma_addr_t start, size_t size)
+ dma_addr_t start,
+ dma_addr_t end)
{
struct rb_node *res = NULL;
struct rb_node *node = iommu->dma_list.rb_node;
struct vfio_dma *dma_res = NULL;
+ WARN_ON(end < start);
+
while (node) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
- if (start < dma->iova + dma->size) {
+ if (start <= dma->iova + dma->size - 1) {
res = node;
dma_res = dma;
if (start >= dma->iova)
@@ -200,7 +205,7 @@ static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
node = node->rb_right;
}
}
- if (res && size && dma_res->iova >= start + size)
+ if (res && dma_res->iova > end)
res = NULL;
return res;
}
@@ -210,11 +215,13 @@ static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
struct vfio_dma *dma;
+ WARN_ON(new->size != 0);
+
while (*link) {
parent = *link;
dma = rb_entry(parent, struct vfio_dma, node);
- if (new->iova + new->size <= dma->iova)
+ if (new->iova <= dma->iova)
link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
@@ -1071,12 +1078,12 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
bool do_accounting)
{
- dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
LIST_HEAD(unmapped_region_list);
struct iommu_iotlb_gather iotlb_gather;
int unmapped_region_cnt = 0;
long unlocked = 0;
+ size_t pos = 0;
if (!dma->size)
return 0;
@@ -1100,13 +1107,14 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
}
iommu_iotlb_gather_init(&iotlb_gather);
- while (iova < end) {
+ while (pos < dma->size) {
size_t unmapped, len;
phys_addr_t phys, next;
+ dma_addr_t iova = dma->iova + pos;
phys = iommu_iova_to_phys(domain->domain, iova);
if (WARN_ON(!phys)) {
- iova += PAGE_SIZE;
+ pos += PAGE_SIZE;
continue;
}
@@ -1115,7 +1123,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
* may require hardware cache flushing, try to find the
* largest contiguous physical memory chunk to unmap.
*/
- for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
+ for (len = PAGE_SIZE; pos + len < dma->size; len += PAGE_SIZE) {
next = iommu_iova_to_phys(domain->domain, iova + len);
if (next != phys + len)
break;
@@ -1136,7 +1144,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
break;
}
- iova += unmapped;
+ pos += unmapped;
}
dma->iommu_mapped = false;
@@ -1228,7 +1236,7 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
}
static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
- dma_addr_t iova, size_t size, size_t pgsize)
+ dma_addr_t iova, dma_addr_t iova_end, size_t pgsize)
{
struct vfio_dma *dma;
struct rb_node *n;
@@ -1245,8 +1253,8 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
if (dma && dma->iova != iova)
return -EINVAL;
- dma = vfio_find_dma(iommu, iova + size - 1, 0);
- if (dma && dma->iova + dma->size != iova + size)
+ dma = vfio_find_dma(iommu, iova_end, 1);
+ if (dma && dma->iova + dma->size - 1 != iova_end)
return -EINVAL;
for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
@@ -1255,7 +1263,7 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
if (dma->iova < iova)
continue;
- if (dma->iova > iova + size - 1)
+ if (dma->iova > iova_end)
break;
ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
@@ -1348,7 +1356,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (unmap_all) {
if (iova || size)
goto unlock;
- size = SIZE_MAX;
+ iova_end = ~(dma_addr_t)0;
} else {
if (!size || size & (pgsize - 1))
goto unlock;
@@ -1403,17 +1411,17 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma && dma->iova != iova)
goto unlock;
- dma = vfio_find_dma(iommu, iova_end, 0);
- if (dma && dma->iova + dma->size != iova + size)
+ dma = vfio_find_dma(iommu, iova_end, 1);
+ if (dma && dma->iova + dma->size - 1 != iova_end)
goto unlock;
}
ret = 0;
- n = first_n = vfio_find_dma_first_node(iommu, iova, size);
+ n = first_n = vfio_find_dma_first_node(iommu, iova, iova_end);
while (n) {
dma = rb_entry(n, struct vfio_dma, node);
- if (dma->iova >= iova + size)
+ if (dma->iova > iova_end)
break;
if (!iommu->v2 && iova > dma->iova)
@@ -1743,12 +1751,12 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
for (; n; n = rb_next(n)) {
struct vfio_dma *dma;
- dma_addr_t iova;
+ size_t pos = 0;
dma = rb_entry(n, struct vfio_dma, node);
- iova = dma->iova;
- while (iova < dma->iova + dma->size) {
+ while (pos < dma->size) {
+ dma_addr_t iova = dma->iova + pos;
phys_addr_t phys;
size_t size;
@@ -1764,14 +1772,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
phys = iommu_iova_to_phys(d->domain, iova);
if (WARN_ON(!phys)) {
- iova += PAGE_SIZE;
+ pos += PAGE_SIZE;
continue;
}
size = PAGE_SIZE;
p = phys + size;
i = iova + size;
- while (i < dma->iova + dma->size &&
+ while (pos + size < dma->size &&
p == iommu_iova_to_phys(d->domain, i)) {
size += PAGE_SIZE;
p += PAGE_SIZE;
@@ -1779,9 +1787,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
}
} else {
unsigned long pfn;
- unsigned long vaddr = dma->vaddr +
- (iova - dma->iova);
- size_t n = dma->iova + dma->size - iova;
+ unsigned long vaddr = dma->vaddr + pos;
+ size_t n = dma->size - pos;
long npage;
npage = vfio_pin_pages_remote(dma, vaddr,
@@ -1812,7 +1819,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
goto unwind;
}
- iova += size;
+ pos += size;
}
}
@@ -1829,29 +1836,29 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
unwind:
for (; n; n = rb_prev(n)) {
struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
- dma_addr_t iova;
+ size_t pos = 0;
if (dma->iommu_mapped) {
iommu_unmap(domain->domain, dma->iova, dma->size);
continue;
}
- iova = dma->iova;
- while (iova < dma->iova + dma->size) {
+ while (pos < dma->size) {
+ dma_addr_t iova = dma->iova + pos;
phys_addr_t phys, p;
size_t size;
dma_addr_t i;
phys = iommu_iova_to_phys(domain->domain, iova);
if (!phys) {
- iova += PAGE_SIZE;
+ pos += PAGE_SIZE;
continue;
}
size = PAGE_SIZE;
p = phys + size;
i = iova + size;
- while (i < dma->iova + dma->size &&
+ while (pos + size < dma->size &&
p == iommu_iova_to_phys(domain->domain, i)) {
size += PAGE_SIZE;
p += PAGE_SIZE;
@@ -2989,7 +2996,7 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
if (iommu->dirty_page_tracking)
ret = vfio_iova_dirty_bitmap(range.bitmap.data,
- iommu, iova, size,
+ iommu, iova, iova_end,
range.bitmap.pgsize);
else
ret = -EINVAL;
--
2.47.3
* Re: [PATCH v4 3/3] vfio/type1: handle DMA map/unmap up to the addressable limit
2025-10-13 5:32 ` [PATCH v4 3/3] vfio/type1: handle DMA map/unmap up to the addressable limit Alex Mastro
@ 2025-10-21 22:18 ` Alejandro Jimenez
2025-10-22 14:24 ` Alex Mastro
From: Alejandro Jimenez @ 2025-10-21 22:18 UTC (permalink / raw)
To: Alex Mastro, Alex Williamson; +Cc: Jason Gunthorpe, kvm, linux-kernel
Hi Alex,
On 10/13/25 1:32 AM, Alex Mastro wrote:
> Handle DMA map/unmap operations up to the addressable limit by comparing
> against inclusive end-of-range limits, and changing iteration to
> perform relative traversals across range sizes, rather than absolute
> traversals across addresses.
>
> vfio_link_dma inserts a zero-sized vfio_dma into the rb-tree, and is
> only used for that purpose, so discard the size from consideration for
> the insertion point.
I made a small comment about this on the corresponding code below.
>
> Signed-off-by: Alex Mastro <amastro@fb.com>
> ---
> drivers/vfio/vfio_iommu_type1.c | 77 ++++++++++++++++++++++-------------------
> 1 file changed, 42 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 48b84a7af2e1..a65625dcf708 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -166,12 +166,14 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
> {
> struct rb_node *node = iommu->dma_list.rb_node;
>
> + WARN_ON(!size);
> +
> while (node) {
> struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
>
> - if (start + size <= dma->iova)
> + if (start + size - 1 < dma->iova)
> node = node->rb_left;
> - else if (start >= dma->iova + dma->size)
> + else if (start > dma->iova + dma->size - 1)
> node = node->rb_right;
> else
> return dma;
> @@ -181,16 +183,19 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
> }
>
> static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
> - dma_addr_t start, size_t size)
> + dma_addr_t start,
> + dma_addr_t end)
> {
> struct rb_node *res = NULL;
> struct rb_node *node = iommu->dma_list.rb_node;
> struct vfio_dma *dma_res = NULL;
>
> + WARN_ON(end < start);
> +
> while (node) {
> struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
>
> - if (start < dma->iova + dma->size) {
> + if (start <= dma->iova + dma->size - 1) {
> res = node;
> dma_res = dma;
> if (start >= dma->iova)
> @@ -200,7 +205,7 @@ static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
> node = node->rb_right;
> }
> }
> - if (res && size && dma_res->iova >= start + size)
> + if (res && dma_res->iova > end)
> res = NULL;
> return res;
> }
> @@ -210,11 +215,13 @@ static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
> struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
> struct vfio_dma *dma;
>
> + WARN_ON(new->size != 0);
> +
> while (*link) {
> parent = *link;
> dma = rb_entry(parent, struct vfio_dma, node);
>
> - if (new->iova + new->size <= dma->iova)
> + if (new->iova <= dma->iova)
It is possible I missed a previous thread where this was already
discussed, but why are we adding this new restriction that
vfio_link_dma() will _always_ be called with dma->size = 0? I know it is
the case now, but is there a reason why future code could not try to
insert a non-zero sized node?
I thought it would be more fitting to add overflow protection here too,
as it is done for other code paths in the file? I know the WARN_ON()
above will make us aware if there is ever another caller that attempts
to use size != 0, so this is more of a nit about consistency than a
concern about correctness.
Thank you,
Alejandro
> link = &(*link)->rb_left;
> else
> link = &(*link)->rb_right;
> @@ -1071,12 +1078,12 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
> static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> bool do_accounting)
> {
> - dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> struct vfio_domain *domain, *d;
> LIST_HEAD(unmapped_region_list);
> struct iommu_iotlb_gather iotlb_gather;
> int unmapped_region_cnt = 0;
> long unlocked = 0;
> + size_t pos = 0;
>
> if (!dma->size)
> return 0;
> @@ -1100,13 +1107,14 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> }
>
> iommu_iotlb_gather_init(&iotlb_gather);
> - while (iova < end) {
> + while (pos < dma->size) {
> size_t unmapped, len;
> phys_addr_t phys, next;
> + dma_addr_t iova = dma->iova + pos;
>
> phys = iommu_iova_to_phys(domain->domain, iova);
> if (WARN_ON(!phys)) {
> - iova += PAGE_SIZE;
> + pos += PAGE_SIZE;
> continue;
> }
>
> @@ -1115,7 +1123,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> * may require hardware cache flushing, try to find the
> * largest contiguous physical memory chunk to unmap.
> */
> - for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
> + for (len = PAGE_SIZE; pos + len < dma->size; len += PAGE_SIZE) {
> next = iommu_iova_to_phys(domain->domain, iova + len);
> if (next != phys + len)
> break;
> @@ -1136,7 +1144,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> break;
> }
>
> - iova += unmapped;
> + pos += unmapped;
> }
>
> dma->iommu_mapped = false;
> @@ -1228,7 +1236,7 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> }
>
> static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> - dma_addr_t iova, size_t size, size_t pgsize)
> + dma_addr_t iova, dma_addr_t iova_end, size_t pgsize)
> {
> struct vfio_dma *dma;
> struct rb_node *n;
> @@ -1245,8 +1253,8 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> if (dma && dma->iova != iova)
> return -EINVAL;
>
> - dma = vfio_find_dma(iommu, iova + size - 1, 0);
> - if (dma && dma->iova + dma->size != iova + size)
> + dma = vfio_find_dma(iommu, iova_end, 1);
> + if (dma && dma->iova + dma->size - 1 != iova_end)
> return -EINVAL;
>
> for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
> @@ -1255,7 +1263,7 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> if (dma->iova < iova)
> continue;
>
> - if (dma->iova > iova + size - 1)
> + if (dma->iova > iova_end)
> break;
>
> ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
> @@ -1348,7 +1356,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> if (unmap_all) {
> if (iova || size)
> goto unlock;
> - size = SIZE_MAX;
> + iova_end = ~(dma_addr_t)0;
> } else {
> if (!size || size & (pgsize - 1))
> goto unlock;
> @@ -1403,17 +1411,17 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> if (dma && dma->iova != iova)
> goto unlock;
>
> - dma = vfio_find_dma(iommu, iova_end, 0);
> - if (dma && dma->iova + dma->size != iova + size)
> + dma = vfio_find_dma(iommu, iova_end, 1);
> + if (dma && dma->iova + dma->size - 1 != iova_end)
> goto unlock;
> }
>
> ret = 0;
> - n = first_n = vfio_find_dma_first_node(iommu, iova, size);
> + n = first_n = vfio_find_dma_first_node(iommu, iova, iova_end);
>
> while (n) {
> dma = rb_entry(n, struct vfio_dma, node);
> - if (dma->iova >= iova + size)
> + if (dma->iova > iova_end)
> break;
>
> if (!iommu->v2 && iova > dma->iova)
> @@ -1743,12 +1751,12 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>
> for (; n; n = rb_next(n)) {
> struct vfio_dma *dma;
> - dma_addr_t iova;
> + size_t pos = 0;
>
> dma = rb_entry(n, struct vfio_dma, node);
> - iova = dma->iova;
>
> - while (iova < dma->iova + dma->size) {
> + while (pos < dma->size) {
> + dma_addr_t iova = dma->iova + pos;
> phys_addr_t phys;
> size_t size;
>
> @@ -1764,14 +1772,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> phys = iommu_iova_to_phys(d->domain, iova);
>
> if (WARN_ON(!phys)) {
> - iova += PAGE_SIZE;
> + pos += PAGE_SIZE;
> continue;
> }
>
> size = PAGE_SIZE;
> p = phys + size;
> i = iova + size;
> - while (i < dma->iova + dma->size &&
> + while (pos + size < dma->size &&
> p == iommu_iova_to_phys(d->domain, i)) {
> size += PAGE_SIZE;
> p += PAGE_SIZE;
> @@ -1779,9 +1787,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> }
> } else {
> unsigned long pfn;
> - unsigned long vaddr = dma->vaddr +
> - (iova - dma->iova);
> - size_t n = dma->iova + dma->size - iova;
> + unsigned long vaddr = dma->vaddr + pos;
> + size_t n = dma->size - pos;
> long npage;
>
> npage = vfio_pin_pages_remote(dma, vaddr,
> @@ -1812,7 +1819,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> goto unwind;
> }
>
> - iova += size;
> + pos += size;
> }
> }
>
> @@ -1829,29 +1836,29 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> unwind:
> for (; n; n = rb_prev(n)) {
> struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> - dma_addr_t iova;
> + size_t pos = 0;
>
> if (dma->iommu_mapped) {
> iommu_unmap(domain->domain, dma->iova, dma->size);
> continue;
> }
>
> - iova = dma->iova;
> - while (iova < dma->iova + dma->size) {
> + while (pos < dma->size) {
> + dma_addr_t iova = dma->iova + pos;
> phys_addr_t phys, p;
> size_t size;
> dma_addr_t i;
>
> phys = iommu_iova_to_phys(domain->domain, iova);
> if (!phys) {
> - iova += PAGE_SIZE;
> + pos += PAGE_SIZE;
> continue;
> }
>
> size = PAGE_SIZE;
> p = phys + size;
> i = iova + size;
> - while (i < dma->iova + dma->size &&
> + while (pos + size < dma->size &&
> p == iommu_iova_to_phys(domain->domain, i)) {
> size += PAGE_SIZE;
> p += PAGE_SIZE;
> @@ -2989,7 +2996,7 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>
> if (iommu->dirty_page_tracking)
> ret = vfio_iova_dirty_bitmap(range.bitmap.data,
> - iommu, iova, size,
> + iommu, iova, iova_end,
> range.bitmap.pgsize);
> else
> ret = -EINVAL;
>
* Re: [PATCH v4 3/3] vfio/type1: handle DMA map/unmap up to the addressable limit
2025-10-21 22:18 ` Alejandro Jimenez
@ 2025-10-22 14:24 ` Alex Mastro
From: Alex Mastro @ 2025-10-22 14:24 UTC (permalink / raw)
To: Alejandro Jimenez; +Cc: Alex Williamson, Jason Gunthorpe, kvm, linux-kernel
On Tue, Oct 21, 2025 at 06:18:00PM -0400, Alejandro Jimenez wrote:
> @@ -210,11 +215,13 @@ static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
> > struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
> > struct vfio_dma *dma;
> > + WARN_ON(new->size != 0);
> > +
> > while (*link) {
> > parent = *link;
> > dma = rb_entry(parent, struct vfio_dma, node);
> > - if (new->iova + new->size <= dma->iova)
> > + if (new->iova <= dma->iova)
> It is possible I missed a previous thread where this was already discussed,
> but why are we adding this new restriction that vfio_link_dma() will
> _always_ be called with dma->size = 0? I know it is the case now, but is
> there a reason why future code could not try to insert a non-zero sized
> node?
Perhaps the WARN_ON is too coddlesome, but given that this helper is used for
exactly one purpose today, the intent is to strongly hint to a future user to
consider what they're doing by deviating from the current usage.
iommu->dma_list's invariant is that all elems should have non-overlapping iova
ranges, which is currently enforced pre-insertion in vfio_dma_do_map by the
vfio_find_dma check. After vfio_pin_map_dma returns, either the vfio_dma has
been grown to its full size, or has been removed from iommu->dma_list on error
via vfio_remove_dma.
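To illustrate the invariant described above, a toy userspace model (not driver
code; the fixed-size array stands in for the rb-tree and all names and values
are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Entries never overlap, a new entry is inserted with size 0, and it only
 * grows to its full size once pinning and mapping have succeeded. */
struct toy_dma {
        uint64_t iova;
        uint64_t size;
};

static bool overlaps(const struct toy_dma *d, uint64_t iova, uint64_t size)
{
        /* inclusive-end comparisons, so a range against the top of the
         * address space is handled without wrapping */
        return d->size && size &&
               iova <= d->iova + d->size - 1 &&
               d->iova <= iova + size - 1;
}

int main(void)
{
        struct toy_dma list[4] = { { 0x1000, 0x2000 } };
        int n = 1;
        uint64_t iova = 0xfffffffffffff000ULL, size = 0x1000;

        for (int i = 0; i < n; i++)
                if (overlaps(&list[i], iova, size))
                        return 1;               /* -EEXIST in the driver */

        list[n++] = (struct toy_dma){ .iova = iova, .size = 0 }; /* "link" */
        list[n - 1].size = size;        /* grown once pin+map succeeded */

        printf("entries: %d, last spans 0x%llx..0x%llx\n", n,
               (unsigned long long)list[n - 1].iova,
               (unsigned long long)(list[n - 1].iova + list[n - 1].size - 1));
        return 0;
}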
> I thought it would be more fitting to add overflow protection here too, as
> it is done for other code paths in the file? I know the WARN_ON() above will
> make us aware if there is ever another caller that attempts to use size !=0,
> so this is more of a nit about consistency than a concern about correctness.
The other code paths which check for overflow focus on sanitizing args at
the vfio_iommu_driver_ops boundary. Since this helper is downstream from those
existing checks, and given its specificity, I'm not sure additional checks here
would be helpful.
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-13 5:32 [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit Alex Mastro
2025-10-13 5:32 ` [PATCH v4 3/3] vfio/type1: handle DMA map/unmap up to the addressable limit Alex Mastro
@ 2025-10-15 19:24 ` Alex Williamson
2025-10-15 21:25 ` Alejandro Jimenez
2025-10-21 12:36 ` Jason Gunthorpe
From: Alex Williamson @ 2025-10-15 19:24 UTC (permalink / raw)
To: Alex Mastro; +Cc: Jason Gunthorpe, Alejandro Jimenez, kvm, linux-kernel
On Sun, 12 Oct 2025 22:32:23 -0700
Alex Mastro <amastro@fb.com> wrote:
> This patch series aims to fix vfio_iommu_type1.c to support
> VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA operations targeting IOVA
> ranges which lie against the addressable limit. i.e. ranges where
> iova_start + iova_size would overflow to exactly zero.
The series looks good to me and passes my testing. Any further reviews
from anyone? I think we should make this v6.18-rc material. Thanks,
Alex
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-15 19:24 ` [PATCH v4 0/3] vfio: " Alex Williamson
@ 2025-10-15 21:25 ` Alejandro Jimenez
2025-10-16 21:19 ` Alex Mastro
From: Alejandro Jimenez @ 2025-10-15 21:25 UTC (permalink / raw)
To: Alex Williamson, Alex Mastro; +Cc: Jason Gunthorpe, kvm, linux-kernel
On 10/15/25 3:24 PM, Alex Williamson wrote:
> On Sun, 12 Oct 2025 22:32:23 -0700
> Alex Mastro <amastro@fb.com> wrote:
>
>> This patch series aims to fix vfio_iommu_type1.c to support
>> VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA operations targeting IOVA
>> ranges which lie against the addressable limit. i.e. ranges where
>> iova_start + iova_size would overflow to exactly zero.
>
> The series looks good to me and passes my testing. Any further reviews
> from anyone? I think we should make this v6.18-rc material. Thanks,
>
I haven't had a chance yet to closely review the latest patchset
versions, but I did test this v4 and confirmed that it solves the issue
of not being able to unmap an IOVA range extending up to the address
space boundary. I verified both with the simplified test case at:
https://gist.github.com/aljimenezb/f3338c9c2eda9b0a7bf5f76b40354db8
plus using QEMU's amd-iommu and a guest with iommu.passthrough=0
iommu.forcedac=1 (which is how I first found the problem).
So Alex Mastro, please feel free to add:
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
for the series. I'll try to find time to review the patches in detail.
Thank you,
Alejandro
> Alex
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-15 21:25 ` Alejandro Jimenez
@ 2025-10-16 21:19 ` Alex Mastro
2025-10-16 22:01 ` Alex Williamson
From: Alex Mastro @ 2025-10-16 21:19 UTC (permalink / raw)
To: Alejandro Jimenez; +Cc: Alex Williamson, Jason Gunthorpe, kvm, linux-kernel
On Wed, Oct 15, 2025 at 05:25:14PM -0400, Alejandro Jimenez wrote:
>
>
> On 10/15/25 3:24 PM, Alex Williamson wrote:
> > On Sun, 12 Oct 2025 22:32:23 -0700
> > Alex Mastro <amastro@fb.com> wrote:
> >
> > > This patch series aims to fix vfio_iommu_type1.c to support
> > > VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA operations targeting IOVA
> > > ranges which lie against the addressable limit. i.e. ranges where
> > > iova_start + iova_size would overflow to exactly zero.
> >
> > The series looks good to me and passes my testing. Any further reviews
> > from anyone? I think we should make this v6.18-rc material. Thanks,
> >
>
> I haven't had a chance yet to closely review the latest patchset versions,
> but I did test this v4 and confirmed that it solves the issue of not being
> able to unmap an IOVA range extending up to the address space boundary. I
> verified both with the simplified test case at:
> https://gist.github.com/aljimenezb/f3338c9c2eda9b0a7bf5f76b40354db8
>
> plus using QEMU's amd-iommu and a guest with iommu.passthrough=0
> iommu.forcedac=1 (which is how I first found the problem).
>
> So Alex Mastro, please feel free to add:
>
> Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
>
> for the series. I'll try to find time to review the patches in detail.
>
> Thank you,
> Alejandro
>
> > Alex
>
Thanks all. I would like additional scrutiny around vfio_iommu_replay. It was
the one block of affected code I have not been able to test, since I don't
have, and am not sure how to simulate, a setup which can cause mappings to be
replayed on a newly added IOMMU domain. My confidence in that code is from
close review only.
I explicitly tested various combinations of the following with mappings up to
the addressable limit:
- VFIO_IOMMU_MAP_DMA
- VFIO_IOMMU_UNMAP_DMA range-based, and VFIO_DMA_UNMAP_FLAG_ALL
- VFIO_IOMMU_DIRTY_PAGES with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP
My understanding is that my changes to vfio_iommu_replay would be traversed
when binding a group to a container with existing mappings where the group's
IOMMU domain is not already one of the domains in the container.
Thanks,
Alex
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-16 21:19 ` Alex Mastro
@ 2025-10-16 22:01 ` Alex Williamson
2025-10-17 16:29 ` Alex Mastro
From: Alex Williamson @ 2025-10-16 22:01 UTC (permalink / raw)
To: Alex Mastro; +Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel
On Thu, 16 Oct 2025 14:19:53 -0700
Alex Mastro <amastro@fb.com> wrote:
> On Wed, Oct 15, 2025 at 05:25:14PM -0400, Alejandro Jimenez wrote:
> >
> >
> > On 10/15/25 3:24 PM, Alex Williamson wrote:
> > > On Sun, 12 Oct 2025 22:32:23 -0700
> > > Alex Mastro <amastro@fb.com> wrote:
> > >
> > > > This patch series aims to fix vfio_iommu_type1.c to support
> > > > VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA operations targeting IOVA
> > > > ranges which lie against the addressable limit. i.e. ranges where
> > > > iova_start + iova_size would overflow to exactly zero.
> > >
> > > The series looks good to me and passes my testing. Any further reviews
> > > from anyone? I think we should make this v6.18-rc material. Thanks,
> > >
> >
> > I haven't had a chance yet to closely review the latest patchset versions,
> > but I did test this v4 and confirmed that it solves the issue of not being
> > able to unmap an IOVA range extending up to the address space boundary. I
> > verified both with the simplified test case at:
> > https://gist.github.com/aljimenezb/f3338c9c2eda9b0a7bf5f76b40354db8
> >
> > plus using QEMU's amd-iommu and a guest with iommu.passthrough=0
> > iommu.forcedac=1 (which is how I first found the problem).
> >
> > So Alex Mastro, please feel free to add:
> >
> > Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
> >
> > for the series. I'll try to find time to review the patches in detail.
> >
> > Thank you,
> > Alejandro
> >
> > > Alex
> >
>
> Thanks all. I would like additional scrutiny around vfio_iommu_replay. It was
> the one block of affected code I have not been able to test, since I don't have
> / am not sure how to simulate a setup which can cause mappings to be replayed
> on a newly added IOMMU domain. My confidence in that code is from close review
> only.
>
> I explicitly tested various combinations of the following with mappings up to
> the addressable limit:
> - VFIO_IOMMU_MAP_DMA
> - VFIO_IOMMU_UNMAP_DMA range-based, and VFIO_DMA_UNMAP_FLAG_ALL
> - VFIO_IOMMU_DIRTY_PAGES with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP
>
> My understanding is that my changes to vfio_iommu_replay would be traversed
> when binding a group to a container with existing mappings where the group's
> IOMMU domain is not already one of the domains in the container.
I really appreciate you making note of it if you're uneasy about the
change. I'll take a closer look there and hopefully others can as
well.
The legacy vfio container represents a single IOMMU context, which is
typically managed by a single domain. The replay comes into play when
groups are under IOMMUs with different properties that prevent us from
re-using the domain. The case that most comes to mind for this is
Intel platforms with integrated graphics where there's a separate IOMMU
for the GPU, which iirc has different coherency settings.
That mechanism for triggering replay requires a specific hardware
configuration, but we can easily trigger it through code
instrumentation, ex:
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5167bec14e36..2cb19ddbb524 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2368,7 +2368,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
d->enforce_cache_coherency ==
domain->enforce_cache_coherency) {
iommu_detach_group(domain->domain, group->iommu_group);
- if (!iommu_attach_group(d->domain,
+ if (0 && !iommu_attach_group(d->domain,
group->iommu_group)) {
list_add(&group->next, &d->group_list);
iommu_domain_free(domain->domain);
We might consider whether it's useful for testing purposes to expose a
mechanism to toggle this. For a unit test, if we create a container,
add a group, and build up some suspect mappings, if we then add another
group to the container with the above bypass we should trigger the
replay.
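Roughly, such a unit test could be driven through the legacy container ioctls
as sketched below (a sketch only: no error handling, the group numbers are
placeholders for two vfio-pci devices in different IOMMU groups, and it
assumes an IOMMU whose aperture reaches the top of the IOVA space). With the
iommu_attach_group() bypass above, step 3 forces a replay of the existing
mappings:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/vfio.h>

int main(void)
{
        int container = open("/dev/vfio/vfio", O_RDWR);
        int group1 = open("/dev/vfio/42", O_RDWR);      /* placeholder */
        int group2 = open("/dev/vfio/43", O_RDWR);      /* placeholder */
        void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .vaddr = (uintptr_t)buf,
                .iova = 0xfffffffffffff000ULL,  /* last page of the IOVA space */
                .size = 4096,
        };

        /* 1. container + first group + IOMMU backend */
        ioctl(group1, VFIO_GROUP_SET_CONTAINER, &container);
        ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);

        /* 2. build up the suspect mapping(s) */
        ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

        /* 3. adding the second group replays the existing mappings onto the
         *    new domain (given the bypass) */
        ioctl(group2, VFIO_GROUP_SET_CONTAINER, &container);

        munmap(buf, 4096);
        close(group2);
        close(group1);
        close(container);
        return 0;
}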
In general though the replay shouldn't have a mechanism to trigger
overflows, we're simply iterating the current set of mappings that have
already been validated and applying them to a new domain.
In any case, we can all take a second look at the changes there.
Thanks,
Alex
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-16 22:01 ` Alex Williamson
@ 2025-10-17 16:29 ` Alex Mastro
2025-10-20 21:36 ` Alex Williamson
From: Alex Mastro @ 2025-10-17 16:29 UTC (permalink / raw)
To: Alex Williamson; +Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel
On Thu, Oct 16, 2025 at 04:01:38PM -0600, Alex Williamson wrote:
> The legacy vfio container represents a single IOMMU context, which is
> typically managed by a single domain. The replay comes into play when
> groups are under IOMMUs with different properties that prevent us from
> re-using the domain. The case that most comes to mind for this is
> Intel platforms with integrated graphics where there's a separate IOMMU
> for the GPU, which iirc has different coherency settings.
Thanks, this context is helpful and makes sense.
> That mechanism for triggering replay requires a specific hardware
> configuration, but we can easily trigger it through code
> instrumentation, ex:
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 5167bec14e36..2cb19ddbb524 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2368,7 +2368,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> d->enforce_cache_coherency ==
> domain->enforce_cache_coherency) {
> iommu_detach_group(domain->domain, group->iommu_group);
> - if (!iommu_attach_group(d->domain,
> + if (0 && !iommu_attach_group(d->domain,
> group->iommu_group)) {
> list_add(&group->next, &d->group_list);
> iommu_domain_free(domain->domain);
>
> We might consider whether it's useful for testing purposes to expose a
> mechanism to toggle this. For a unit test, if we create a container,
> add a group, and build up some suspect mappings, if we then add another
> group to the container with the above bypass we should trigger the
> replay.
Thanks for the tip. I did this, and validated via bpftrace-ing iommu_map that
the container's mappings (one of which lies at the end of address space) are
replayed correctly. Without the fix, the loop body
while (iova < dma->iova + dma->size) { ... iommu_map() ... }
would never be entered for the end of address space mapping due to
dma->iova + dma->size == 0
$ sudo bpftrace -e 'kprobe:iommu_map { printf("pid=%d comm=%s domain=%p iova=%p paddr=%p size=%p prot=%p gfp=%p\n", pid, comm, (void*)arg0, (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4, (void*)arg5); }'
Attached 1 probe
# original mappings
pid=616477 comm=test_dma_map_un domain=0xff11012805dac210 iova=0x10000000000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
pid=616477 comm=test_dma_map_un domain=0xff11012805dac210 iova=0x10000001000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
pid=616477 comm=test_dma_map_un domain=0xff11012805dac210 iova=0xfffffffffffff000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
# replayed mapping
pid=616477 comm=test_dma_map_un domain=0xff11012805dab610 iova=0x10000000000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
pid=616477 comm=test_dma_map_un domain=0xff11012805dab610 iova=0x10000001000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
pid=616477 comm=test_dma_map_un domain=0xff11012805dab610 iova=0xfffffffffffff000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
> In general though the replay shouldn't have a mechanism to trigger
> overflows, we're simply iterating the current set of mappings that have
> already been validated and applying them to a new domain.
Agree. Overflow would mean that some other invariant has broken, and
nonsensical vfio_dma entries have infiltrated iommu->dma_list. The combination
of iommu->lock serialization and the overflow checks elsewhere should have
prevented that.
> In any case, we can all take a second look at the changes there.
Thanks!
Alex
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-17 16:29 ` Alex Mastro
@ 2025-10-20 21:36 ` Alex Williamson
2025-10-21 16:25 ` Alex Mastro
From: Alex Williamson @ 2025-10-20 21:36 UTC (permalink / raw)
To: Alex Mastro
Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel,
David Matlack
On Fri, 17 Oct 2025 09:29:26 -0700
Alex Mastro <amastro@fb.com> wrote:
> On Thu, Oct 16, 2025 at 04:01:38PM -0600, Alex Williamson wrote:
> > That mechanism for triggering replay requires a specific hardware
> > configuration, but we can easily trigger it through code
> > instrumentation, ex:
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 5167bec14e36..2cb19ddbb524 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2368,7 +2368,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> > d->enforce_cache_coherency ==
> > domain->enforce_cache_coherency) {
> > iommu_detach_group(domain->domain, group->iommu_group);
> > - if (!iommu_attach_group(d->domain,
> > + if (0 && !iommu_attach_group(d->domain,
> > group->iommu_group)) {
> > list_add(&group->next, &d->group_list);
> > iommu_domain_free(domain->domain);
> >
> > We might consider whether it's useful for testing purposes to expose a
> > mechanism to toggle this. For a unit test, if we create a container,
> > add a group, and build up some suspect mappings, if we then add another
> > group to the container with the above bypass we should trigger the
> > replay.
>
> Thanks for the tip. I did this, and validated via bpftrace-ing iommu_map that
> the container's mappings (one of which lies at the end of address space) are
> replayed correctly. Without the fix, the loop body
>
> while (iova < dma->iova + dma->size) { ... iommu_map() ... }
>
> would never be entered for the end of address space mapping due to
>
> dma->iova + dma->size == 0
>
> $ sudo bpftrace -e 'kprobe:iommu_map { printf("pid=%d comm=%s domain=%p iova=%p paddr=%p size=%p prot=%p gfp=%p\n", pid, comm, (void*)arg0, (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4, (void*)arg5); }'
> Attached 1 probe
> # original mappings
> pid=616477 comm=test_dma_map_un domain=0xff11012805dac210 iova=0x10000000000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
> pid=616477 comm=test_dma_map_un domain=0xff11012805dac210 iova=0x10000001000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
> pid=616477 comm=test_dma_map_un domain=0xff11012805dac210 iova=0xfffffffffffff000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
> # replayed mapping
> pid=616477 comm=test_dma_map_un domain=0xff11012805dab610 iova=0x10000000000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
> pid=616477 comm=test_dma_map_un domain=0xff11012805dab610 iova=0x10000001000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
> pid=616477 comm=test_dma_map_un domain=0xff11012805dab610 iova=0xfffffffffffff000 paddr=0x12ecfdd0000 size=0x1000 prot=0x7 gfp=0x400cc0
>
> > In general though the replay shouldn't have a mechanism to trigger
> > overflows, we're simply iterating the current set of mappings that have
> > already been validated and applying them to a new domain.
>
> Agree. Overflow means that some other invariant has broken, and nonsensical
> vfio_dma have infiltrated iommu->dma_list. The combination of iommu->lock
> serialization + overflow checks elsewhere should have prevented that.
>
> > In any case, we can all take a second look at the changes there.
> Thanks!
Thanks for the further testing. Looking again at the changes, it still
looks good to me.
I do note that we're missing a Fixes: tag. I think we've had hints of
this issue all the way back to the original implementation, so perhaps
the last commit should include:
Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
Unless you've identified a more specific target.
Along with the tag, it would probably be useful in that same commit to
expand on the scope of the issue in the commit log. I believe we allow
mappings to be created at the top of the address space that cannot be
removed via ioctl, but such inconsistency should result in an
application error due to the failed ioctl and does not affect cleanup
on release. Should we also therefore expand the DMA mapping tests in
tools/testing/selftests/vfio to include an end of address space test?
Thanks,
Alex
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-20 21:36 ` Alex Williamson
@ 2025-10-21 16:25 ` Alex Mastro
2025-10-21 16:31 ` David Matlack
From: Alex Mastro @ 2025-10-21 16:25 UTC (permalink / raw)
To: Alex Williamson
Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel,
David Matlack
On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> I do note that we're missing a Fixes: tag. I think we've had hints of
> this issue all the way back to the original implementation, so perhaps
> the last commit should include:
>
> Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
SGTM
> Unless you've identified a more specific target.
I have not.
> Along with the tag, it would probably be useful in that same commit to
> expand on the scope of the issue in the commit log. I believe we allow
> mappings to be created at the top of the address space that cannot be
> removed via ioctl, but such inconsistency should result in an
> application error due to the failed ioctl and does not affect cleanup
> on release.
Makes sense. I will clarify the commit msg in v5 to be more specific about what
this series changes relative to existing functionality.
> Should we also therefore expand the DMA mapping tests in
> tools/testing/selftests/vfio to include an end of address space test?
Yes. I will append such a commit to the end of the series in v5. Our VFIO tests
are built on top of a hermetic rust wrapper library over VFIO ioctls, but they
aren't quite ready to be open sourced yet.
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-21 16:25 ` Alex Mastro
@ 2025-10-21 16:31 ` David Matlack
2025-10-21 19:13 ` Alex Mastro
2025-10-23 20:52 ` Alex Mastro
2025-10-27 16:02 ` Alex Mastro
From: David Matlack @ 2025-10-21 16:31 UTC (permalink / raw)
To: Alex Mastro
Cc: Alex Williamson, Alejandro Jimenez, Jason Gunthorpe, kvm,
linux-kernel
On Tue, Oct 21, 2025 at 9:26 AM Alex Mastro <amastro@fb.com> wrote:
> On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > Should we also therefore expand the DMA mapping tests in
> > tools/testing/selftests/vfio to include an end of address space test?
>
> Yes. I will append such a commit to the end of the series in v5. Our VFIO tests
> are built on top of a hermetic rust wrapper library over VFIO ioctls, but they
> aren't quite ready to be open sourced yet.
Feel free to reach out if you have any questions about writing or
running the VFIO selftests.
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-21 16:31 ` David Matlack
@ 2025-10-21 19:13 ` Alex Mastro
2025-10-22 0:38 ` David Matlack
From: Alex Mastro @ 2025-10-21 19:13 UTC (permalink / raw)
To: David Matlack
Cc: Alex Williamson, Alejandro Jimenez, Jason Gunthorpe, kvm,
linux-kernel
On Tue, Oct 21, 2025 at 09:31:59AM -0700, David Matlack wrote:
> On Tue, Oct 21, 2025 at 9:26 AM Alex Mastro <amastro@fb.com> wrote:
> > On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > > Should we also therefore expand the DMA mapping tests in
> > > tools/testing/selftests/vfio to include an end of address space test?
> >
> > Yes. I will append such a commit to the end of the series in v5. Our VFIO tests
> > are built on top of a hermetic rust wrapper library over VFIO ioctls, but they
> > aren't quite ready to be open sourced yet.
>
> Feel free to reach out if you have any questions about writing or
> running the VFIO selftests.
Thanks David. I built and ran using the commands below. I am not too familiar
with kselftests, so I am open to tips.
$ make LLVM=1 -j kselftest-install INSTALL_PATH=/tmp/kst TARGETS="vfio"
$ VFIO_SELFTESTS_BDF=0000:05:00.0 /tmp/kst/run_kselftest.sh
I added the following. Is this the right direction? Are multiple fixtures per
file OK? It seems related enough to vfio_dma_mapping_test.c to keep them
together.
I updated the *_unmap function signatures to return the count of bytes unmapped,
since that is part of the test pass criteria. Also added unmap_all flavors,
since those exercise different code paths than range-based unmap.
Relevant test output:
# # RUN vfio_dma_map_limit_test.vfio_type1_iommu.end_of_address_space ...
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# # OK vfio_dma_map_limit_test.vfio_type1_iommu.end_of_address_space
# ok 16 vfio_dma_map_limit_test.vfio_type1_iommu.end_of_address_space
# # RUN vfio_dma_map_limit_test.vfio_type1v2_iommu.end_of_address_space ...
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# # OK vfio_dma_map_limit_test.vfio_type1v2_iommu.end_of_address_space
# ok 17 vfio_dma_map_limit_test.vfio_type1v2_iommu.end_of_address_space
# # RUN vfio_dma_map_limit_test.iommufd_compat_type1.end_of_address_space ...
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# # OK vfio_dma_map_limit_test.iommufd_compat_type1.end_of_address_space
# ok 18 vfio_dma_map_limit_test.iommufd_compat_type1.end_of_address_space
# # RUN vfio_dma_map_limit_test.iommufd_compat_type1v2.end_of_address_space ...
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# # OK vfio_dma_map_limit_test.iommufd_compat_type1v2.end_of_address_space
# ok 19 vfio_dma_map_limit_test.iommufd_compat_type1v2.end_of_address_space
# # RUN vfio_dma_map_limit_test.iommufd.end_of_address_space ...
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
# Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h
index ed31606e01b7..8e9d40845ccc 100644
--- a/tools/testing/selftests/vfio/lib/include/vfio_util.h
+++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h
@@ -208,8 +208,9 @@ void vfio_pci_device_reset(struct vfio_pci_device *device);
void vfio_pci_dma_map(struct vfio_pci_device *device,
struct vfio_dma_region *region);
-void vfio_pci_dma_unmap(struct vfio_pci_device *device,
- struct vfio_dma_region *region);
+u64 vfio_pci_dma_unmap(struct vfio_pci_device *device,
+ struct vfio_dma_region *region);
+u64 vfio_pci_dma_unmap_all(struct vfio_pci_device *device);
void vfio_pci_config_access(struct vfio_pci_device *device, bool write,
size_t config, size_t size, void *data);
diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
index 0921b2451ba5..f5ae68a7df9c 100644
--- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c
+++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
@@ -183,7 +183,7 @@ void vfio_pci_dma_map(struct vfio_pci_device *device,
list_add(&region->link, &device->dma_regions);
}
-static void vfio_iommu_dma_unmap(struct vfio_pci_device *device,
+static u64 vfio_iommu_dma_unmap(struct vfio_pci_device *device,
struct vfio_dma_region *region)
{
struct vfio_iommu_type1_dma_unmap args = {
@@ -193,9 +193,25 @@ static void vfio_iommu_dma_unmap(struct vfio_pci_device *device,
};
ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &args);
+
+ return args.size;
+}
+
+static u64 vfio_iommu_dma_unmap_all(struct vfio_pci_device *device)
+{
+ struct vfio_iommu_type1_dma_unmap args = {
+ .argsz = sizeof(args),
+ .iova = 0,
+ .size = 0,
+ .flags = VFIO_DMA_UNMAP_FLAG_ALL,
+ };
+
+ ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &args);
+
+ return args.size;
}
-static void iommufd_dma_unmap(struct vfio_pci_device *device,
+static u64 iommufd_dma_unmap(struct vfio_pci_device *device,
struct vfio_dma_region *region)
{
struct iommu_ioas_unmap args = {
@@ -206,17 +222,54 @@ static void iommufd_dma_unmap(struct vfio_pci_device *device,
};
ioctl_assert(device->iommufd, IOMMU_IOAS_UNMAP, &args);
+
+ return args.length;
+}
+
+static u64 iommufd_dma_unmap_all(struct vfio_pci_device *device)
+{
+ struct iommu_ioas_unmap args = {
+ .size = sizeof(args),
+ .iova = 0,
+ .length = UINT64_MAX,
+ .ioas_id = device->ioas_id,
+ };
+
+ ioctl_assert(device->iommufd, IOMMU_IOAS_UNMAP, &args);
+
+ return args.length;
}
-void vfio_pci_dma_unmap(struct vfio_pci_device *device,
+u64 vfio_pci_dma_unmap(struct vfio_pci_device *device,
struct vfio_dma_region *region)
{
+ u64 unmapped;
+
if (device->iommufd)
- iommufd_dma_unmap(device, region);
+ unmapped = iommufd_dma_unmap(device, region);
else
- vfio_iommu_dma_unmap(device, region);
+ unmapped = vfio_iommu_dma_unmap(device, region);
list_del(&region->link);
+
+ return unmapped;
+}
+
+u64 vfio_pci_dma_unmap_all(struct vfio_pci_device *device)
+{
+ u64 unmapped;
+ struct vfio_dma_region *curr, *next;
+
+ if (device->iommufd)
+ unmapped = iommufd_dma_unmap_all(device);
+ else
+ unmapped = vfio_iommu_dma_unmap_all(device);
+
+ list_for_each_entry_safe(curr, next, &device->dma_regions, link) {
+ list_del(&curr->link);
+ }
+
+ return unmapped;
}
static void vfio_pci_region_get(struct vfio_pci_device *device, int index,
diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
index ab19c54a774d..e908c1fe7103 100644
--- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
+++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
@@ -122,6 +122,8 @@ FIXTURE_TEARDOWN(vfio_dma_mapping_test)
vfio_pci_device_cleanup(self->device);
}
+#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
+
TEST_F(vfio_dma_mapping_test, dma_map_unmap)
{
const u64 size = variant->size ?: getpagesize();
@@ -192,6 +194,61 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
ASSERT_TRUE(!munmap(region.vaddr, size));
}
+FIXTURE(vfio_dma_map_limit_test) {
+ struct vfio_pci_device *device;
+};
+
+FIXTURE_VARIANT(vfio_dma_map_limit_test) {
+ const char *iommu_mode;
+};
+
+#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \
+FIXTURE_VARIANT_ADD(vfio_dma_map_limit_test, _iommu_mode) { \
+ .iommu_mode = #_iommu_mode, \
+}
+
+FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES();
+
+FIXTURE_SETUP(vfio_dma_map_limit_test)
+{
+ self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
+}
+
+FIXTURE_TEARDOWN(vfio_dma_map_limit_test)
+{
+ vfio_pci_device_cleanup(self->device);
+}
+
+#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
+
+TEST_F(vfio_dma_map_limit_test, end_of_address_space)
+{
+ struct vfio_dma_region region = {};
+ u64 size = getpagesize();
+ u64 unmapped;
+
+ region.vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(region.vaddr, MAP_FAILED);
+
+ region.iova = ~(iova_t)0 & ~(size - 1);
+ region.size = size;
+
+ vfio_pci_dma_map(self->device, &region);
+ printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova);
+ ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr));
+
+ unmapped = vfio_pci_dma_unmap(self->device, &region);
+ ASSERT_EQ(unmapped, size);
+
+ vfio_pci_dma_map(self->device, &region);
+ printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova);
+ ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr));
+
+ unmapped = vfio_pci_dma_unmap_all(self->device);
+ ASSERT_EQ(unmapped, size);
+}
+
int main(int argc, char *argv[])
{
device_bdf = vfio_selftests_get_bdf(&argc, argv);
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-21 19:13 ` Alex Mastro
@ 2025-10-22 0:38 ` David Matlack
2025-10-22 14:55 ` Alex Mastro
0 siblings, 1 reply; 27+ messages in thread
From: David Matlack @ 2025-10-22 0:38 UTC (permalink / raw)
To: Alex Mastro
Cc: Alex Williamson, Alejandro Jimenez, Jason Gunthorpe, kvm,
linux-kernel
On Tue, Oct 21, 2025 at 12:13 PM Alex Mastro <amastro@fb.com> wrote:
>
> On Tue, Oct 21, 2025 at 09:31:59AM -0700, David Matlack wrote:
> > On Tue, Oct 21, 2025 at 9:26 AM Alex Mastro <amastro@fb.com> wrote:
> > > On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > > > Should we also therefore expand the DMA mapping tests in
> > > > tools/testing/selftests/vfio to include an end of address space test?
> > >
> > > Yes. I will append such a commit to the end of the series in v5. Our VFIO tests
> > > are built on top of a hermetic rust wrapper library over VFIO ioctls, but they
> > > aren't quite ready to be open sourced yet.
> >
> > Feel free to reach out if you have any questions about writing or
> > running the VFIO selftests.
>
> Thanks David. I built and ran using below. I am not too familiar with
> kselftests, so open to tips.
>
> $ make LLVM=1 -j kselftest-install INSTALL_PATH=/tmp/kst TARGETS="vfio"
> $ VFIO_SELFTESTS_BDF=0000:05:00.0 /tmp/kst/run_kselftest.sh
>
> I added the following. Is this the right direction? Are multiple fixtures per
> file OK? Seems related enough to vfio_dma_mapping_test.c to keep together.
Adding a fixture to vfio_dma_mapping_test.c is what I was imagining as well.
Overall looks good, we can hash out the specifics in the patches if
you prefer. But I added some thoughts below.
>
> I updated the *_unmap function signatures to return the count of bytes unmapped,
> since that is part of the test pass criteria. Also added unmap_all flavors,
> since those exercise different code paths than range-based unmap.
When you send, can you introduce these in a separate commit and update
the existing test function in vfio_dma_mapping_test.c to assert on it?
>
> Relevant test output:
>
> # # RUN vfio_dma_map_limit_test.vfio_type1_iommu.end_of_address_space ...
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # # OK vfio_dma_map_limit_test.vfio_type1_iommu.end_of_address_space
> # ok 16 vfio_dma_map_limit_test.vfio_type1_iommu.end_of_address_space
> # # RUN vfio_dma_map_limit_test.vfio_type1v2_iommu.end_of_address_space ...
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # # OK vfio_dma_map_limit_test.vfio_type1v2_iommu.end_of_address_space
> # ok 17 vfio_dma_map_limit_test.vfio_type1v2_iommu.end_of_address_space
> # # RUN vfio_dma_map_limit_test.iommufd_compat_type1.end_of_address_space ...
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # # OK vfio_dma_map_limit_test.iommufd_compat_type1.end_of_address_space
> # ok 18 vfio_dma_map_limit_test.iommufd_compat_type1.end_of_address_space
> # # RUN vfio_dma_map_limit_test.iommufd_compat_type1v2.end_of_address_space ...
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # # OK vfio_dma_map_limit_test.iommufd_compat_type1v2.end_of_address_space
> # ok 19 vfio_dma_map_limit_test.iommufd_compat_type1v2.end_of_address_space
> # # RUN vfio_dma_map_limit_test.iommufd.end_of_address_space ...
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
> # Mapped HVA 0x7f6638222000 (size 0x1000) at IOVA 0xfffffffffffff000
>
> diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h
> index ed31606e01b7..8e9d40845ccc 100644
> --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h
> +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h
> @@ -208,8 +208,9 @@ void vfio_pci_device_reset(struct vfio_pci_device *device);
>
> void vfio_pci_dma_map(struct vfio_pci_device *device,
> struct vfio_dma_region *region);
> -void vfio_pci_dma_unmap(struct vfio_pci_device *device,
> - struct vfio_dma_region *region);
> +u64 vfio_pci_dma_unmap(struct vfio_pci_device *device,
> + struct vfio_dma_region *region);
> +u64 vfio_pci_dma_unmap_all(struct vfio_pci_device *device);
>
> void vfio_pci_config_access(struct vfio_pci_device *device, bool write,
> size_t config, size_t size, void *data);
> diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
> index 0921b2451ba5..f5ae68a7df9c 100644
> --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c
> +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
> @@ -183,7 +183,7 @@ void vfio_pci_dma_map(struct vfio_pci_device *device,
> list_add(&region->link, &device->dma_regions);
> }
>
> -static void vfio_iommu_dma_unmap(struct vfio_pci_device *device,
> +static u64 vfio_iommu_dma_unmap(struct vfio_pci_device *device,
> struct vfio_dma_region *region)
> {
> struct vfio_iommu_type1_dma_unmap args = {
> @@ -193,9 +193,25 @@ static void vfio_iommu_dma_unmap(struct vfio_pci_device *device,
> };
>
> ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &args);
> +
> + return args.size;
> +}
> +
> +static u64 vfio_iommu_dma_unmap_all(struct vfio_pci_device *device)
> +{
> + struct vfio_iommu_type1_dma_unmap args = {
> + .argsz = sizeof(args),
> + .iova = 0,
> + .size = 0,
> + .flags = VFIO_DMA_UNMAP_FLAG_ALL,
> + };
> +
> + ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &args);
> +
> + return args.size;
> }
>
> -static void iommufd_dma_unmap(struct vfio_pci_device *device,
> +static u64 iommufd_dma_unmap(struct vfio_pci_device *device,
> struct vfio_dma_region *region)
> {
> struct iommu_ioas_unmap args = {
> @@ -206,17 +222,54 @@ static void iommufd_dma_unmap(struct vfio_pci_device *device,
> };
>
> ioctl_assert(device->iommufd, IOMMU_IOAS_UNMAP, &args);
> +
> + return args.length;
> +}
> +
> +static u64 iommufd_dma_unmap_all(struct vfio_pci_device *device)
> +{
> + struct iommu_ioas_unmap args = {
> + .size = sizeof(args),
> + .iova = 0,
> + .length = UINT64_MAX,
> + .ioas_id = device->ioas_id,
> + };
> +
> + ioctl_assert(device->iommufd, IOMMU_IOAS_UNMAP, &args);
> +
> + return args.length;
> }
>
> -void vfio_pci_dma_unmap(struct vfio_pci_device *device,
> +u64 vfio_pci_dma_unmap(struct vfio_pci_device *device,
> struct vfio_dma_region *region)
> {
> + u64 unmapped;
> +
> if (device->iommufd)
> - iommufd_dma_unmap(device, region);
> + unmapped = iommufd_dma_unmap(device, region);
> else
> - vfio_iommu_dma_unmap(device, region);
> + unmapped = vfio_iommu_dma_unmap(device, region);
>
> list_del(&region->link);
> +
> + return unmapped;
> +}
> +
> +u64 vfio_pci_dma_unmap_all(struct vfio_pci_device *device)
> +{
> + u64 unmapped;
> + struct vfio_dma_region *curr, *next;
> +
> + if (device->iommufd)
> + unmapped = iommufd_dma_unmap_all(device);
> + else
> + unmapped = vfio_iommu_dma_unmap_all(device);
> +
> + list_for_each_entry_safe(curr, next, &device->dma_regions, link) {
> + list_del(&curr->link);
> + }
> +
> + return unmapped;
> }
>
> static void vfio_pci_region_get(struct vfio_pci_device *device, int index,
> diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
> index ab19c54a774d..e908c1fe7103 100644
> --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
> +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
> @@ -122,6 +122,8 @@ FIXTURE_TEARDOWN(vfio_dma_mapping_test)
> vfio_pci_device_cleanup(self->device);
> }
>
> +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
I think this can/should go just after the
FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); statement. The same below.
> +
> TEST_F(vfio_dma_mapping_test, dma_map_unmap)
> {
> const u64 size = variant->size ?: getpagesize();
> @@ -192,6 +194,61 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
> ASSERT_TRUE(!munmap(region.vaddr, size));
> }
>
> +FIXTURE(vfio_dma_map_limit_test) {
> + struct vfio_pci_device *device;
> +};
> +
> +FIXTURE_VARIANT(vfio_dma_map_limit_test) {
> + const char *iommu_mode;
> +};
> +
> +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \
> +FIXTURE_VARIANT_ADD(vfio_dma_map_limit_test, _iommu_mode) { \
> + .iommu_mode = #_iommu_mode, \
> +}
> +
> +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES();
> +
> +FIXTURE_SETUP(vfio_dma_map_limit_test)
> +{
> + self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
> +}
> +
> +FIXTURE_TEARDOWN(vfio_dma_map_limit_test)
> +{
> + vfio_pci_device_cleanup(self->device);
> +}
> +
> +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
> +
> +TEST_F(vfio_dma_map_limit_test, end_of_address_space)
> +{
> + struct vfio_dma_region region = {};
> + u64 size = getpagesize();
> + u64 unmapped;
> +
> + region.vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE,
> + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> + ASSERT_NE(region.vaddr, MAP_FAILED);
> +
> + region.iova = ~(iova_t)0 & ~(size - 1);
> + region.size = size;
> +
> + vfio_pci_dma_map(self->device, &region);
> + printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova);
> + ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr));
> +
> + unmapped = vfio_pci_dma_unmap(self->device, &region);
> + ASSERT_EQ(unmapped, size);
> +
> + vfio_pci_dma_map(self->device, &region);
> + printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova);
> + ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr));
> +
> + unmapped = vfio_pci_dma_unmap_all(self->device);
> + ASSERT_EQ(unmapped, size);
The unmap_all test should probably be in a separate TEST_F. You can
put the struct vfio_dma_region in the FIXTURE and initialize it in the
FIXTURE_SETUP() to reduce code duplication.
> +}
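Something like this, as a rough sketch reusing the names from your diff
(the variant plumbing stays as you have it, and FIXTURE_TEARDOWN() would
munmap the buffer and call vfio_pci_device_cleanup() as before):

FIXTURE(vfio_dma_map_limit_test) {
	struct vfio_pci_device *device;
	struct vfio_dma_region region;
};

FIXTURE_SETUP(vfio_dma_map_limit_test)
{
	u64 size = getpagesize();

	self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);

	self->region.vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	ASSERT_NE(self->region.vaddr, MAP_FAILED);

	/* Last page-aligned IOVA in the address space. */
	self->region.iova = ~(iova_t)0 & ~(size - 1);
	self->region.size = size;
}

TEST_F(vfio_dma_map_limit_test, unmap_range)
{
	u64 unmapped;

	vfio_pci_dma_map(self->device, &self->region);
	ASSERT_EQ(self->region.iova, to_iova(self->device, self->region.vaddr));

	unmapped = vfio_pci_dma_unmap(self->device, &self->region);
	ASSERT_EQ(unmapped, self->region.size);
}

TEST_F(vfio_dma_map_limit_test, unmap_all)
{
	u64 unmapped;

	vfio_pci_dma_map(self->device, &self->region);

	unmapped = vfio_pci_dma_unmap_all(self->device);
	ASSERT_EQ(unmapped, self->region.size);
}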
Would it be useful to add negative map/unmap tests as well? If so we'd
need a way to plumb the return value of the ioctl up to the caller so
you can assert that it failed, which will conflict with returning the
amount of unmapped bytes.
Maybe we should make unmapped an output parameter like so?
int __vfio_pci_dma_map(struct vfio_pci_device *device,
struct vfio_dma_region *region);
void vfio_pci_dma_map(struct vfio_pci_device *device,
struct vfio_dma_region *region);
int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
struct vfio_dma_region *region, u64 *unmapped);
void vfio_pci_dma_unmap(struct vfio_pci_device *device,
struct vfio_dma_region *region, u64 *unmapped);
int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped);
void vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped);
unmapped can be optional and callers that don't care can pass in NULL.
It'll be a little gross to see NULL on all the unmap calls though...
Maybe unmapped can be restricted to __vfio_pci_dma_unmap().
So something like this:
int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
struct vfio_dma_region *region, u64 *unmapped);
void vfio_pci_dma_unmap(struct vfio_pci_device *device,
struct vfio_dma_region *region);
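As a rough sketch of how that pair might look in vfio_pci_device.c (the
__iommufd_dma_unmap()/__vfio_iommu_dma_unmap() helpers here are
hypothetical variants of the existing ones that return -errno and report
bytes through the pointer, and the assert macro is whatever the lib
prefers):

int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
			 struct vfio_dma_region *region, u64 *unmapped)
{
	int ret;

	if (device->iommufd)
		ret = __iommufd_dma_unmap(device, region, unmapped);
	else
		ret = __vfio_iommu_dma_unmap(device, region, unmapped);

	if (ret)
		return ret;

	/* Only drop the region from the list once the unmap succeeded. */
	list_del(&region->link);
	return 0;
}

void vfio_pci_dma_unmap(struct vfio_pci_device *device,
			struct vfio_dma_region *region)
{
	/* For callers that don't care about errors or byte counts. */
	VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0);
}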
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-22 0:38 ` David Matlack
@ 2025-10-22 14:55 ` Alex Mastro
0 siblings, 0 replies; 27+ messages in thread
From: Alex Mastro @ 2025-10-22 14:55 UTC (permalink / raw)
To: David Matlack
Cc: Alex Williamson, Alejandro Jimenez, Jason Gunthorpe, kvm,
linux-kernel
Thanks David -- this is good feedback. Will roll these suggestions into v5.
On Tue, Oct 21, 2025 at 05:38:31PM -0700, David Matlack wrote:
> On Tue, Oct 21, 2025 at 12:13 PM Alex Mastro <amastro@fb.com> wrote:
> > I updated the *_unmap function signatures to return the count of bytes unmapped,
> > since that is part of the test pass criteria. Also added unmap_all flavors,
> > since those exercise different code paths than range-based unmap.
>
> When you send, can you introduce these in a separate commit and update
> the existing test function in vfio_dma_mapping_test.c to assert on it?
SGTM
> > +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
>
> I think this can/should go just after the
> FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); statement. The same below.
Ack.
> > + unmapped = vfio_pci_dma_unmap_all(self->device);
> > + ASSERT_EQ(unmapped, size);
>
> The unmap_all test should probably be in a separate TEST_F. You can
> put the struct vfio_dma_region in the FIXTURE and initialize it in the
> FIXTURE_SETUP() to reduce code duplication.
> > +}
Makes sense.
> Would it be useful to add negative map/unmap tests as well? If so we'd
> need a way to plumb the return value of the ioctl up to the caller so
> you can assert that it failed, which will conflict with returning the
> amount of unmapped bytes.
Testing negative cases would be useful. Not sure about the mechanics yet.
>
> Maybe we should make unmapped an output parameter like so?
>
> int __vfio_pci_dma_map(struct vfio_pci_device *device,
> struct vfio_dma_region *region);
>
> void vfio_pci_dma_map(struct vfio_pci_device *device,
> struct vfio_dma_region *region);
>
> int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
> struct vfio_dma_region *region, u64 *unmapped);
>
> void vfio_pci_dma_unmap(struct vfio_pci_device *device,
> struct vfio_dma_region *region, u64 *unmapped);
>
> int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped);
> void vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped);
>
> unmapped can be optional and callers that don't care can pass in NULL.
> It'll be a little gross though to see NULL on all the unmap calls
> though... Maybe unmapped can be restricted to __vfio_pci_dma_unmap().
> So something like this:
>
> int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
> struct vfio_dma_region *region, u64 *unmapped);
>
> void vfio_pci_dma_unmap(struct vfio_pci_device *device,
> struct vfio_dma_region *region);
I'll put some thought into this and propose something in v5.
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-21 16:25 ` Alex Mastro
2025-10-21 16:31 ` David Matlack
@ 2025-10-23 20:52 ` Alex Mastro
2025-10-23 22:33 ` Alex Williamson
2025-10-27 16:02 ` Alex Mastro
2 siblings, 1 reply; 27+ messages in thread
From: Alex Mastro @ 2025-10-23 20:52 UTC (permalink / raw)
To: Alex Williamson
Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel,
David Matlack
One point to clarify
> On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > Along with the tag, it would probably be useful in that same commit to
> > expand on the scope of the issue in the commit log. I believe we allow
> > mappings to be created at the top of the address space that cannot be
> > removed via ioctl,
True
> > but such inconsistency should result in an application error due to the
> > failed ioctl
Actually, the ioctl does not fail in the sense that the caller gets an errno.
Attempting to unmap a range ending at the end of address space succeeds (returns
zero), but unmaps zero bytes. An application would only detect this if it
explicitly checked the out size field of vfio_iommu_type1_dma_unmap. Or
attempted to create another overlapping mapping on top of the ranges it expected
to be unmapped.
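To make that concrete, a minimal sketch of the check an application would
need (container_fd and the constants here are illustrative; assumes a
page-sized mapping was already created at the very top of the address
space):

	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova = 0xfffffffffffff000,	/* last page of the address space */
		.size = 0x1000,
	};

	/* Succeeds today, so checking only the return value misses the problem. */
	if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap))
		err(1, "VFIO_IOMMU_UNMAP_DMA");

	/* Only this catches it: the kernel reports zero bytes unmapped. */
	if (unmap.size != 0x1000)
		errx(1, "unmapped 0x%llx bytes, expected 0x1000",
		     (unsigned long long)unmap.size);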
Annotated below:
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 916cad80941c..039cba5a38ca 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -165,19 +165,27 @@ vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
dma_addr_t start, size_t size)
{
+ // start == ~(dma_addr_t)0
+ // size == 0
struct rb_node *node = iommu->dma_list.rb_node;
while (node) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+ // never true because all dma->iova < ~(dma_addr_t)0
if (start + size <= dma->iova)
node = node->rb_left;
+ // traverses right until we get to the end of address space
+ // dma, then we walk off the end because
+ // ~(dma_addr_t)0 >= 0 == true
+ // node = NULL
else if (start >= dma->iova + dma->size)
node = node->rb_right;
else
return dma;
}
+ // This happens
return NULL;
}
@@ -201,6 +209,8 @@ static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
node = node->rb_right;
}
}
+ // iova >= start + size == true, due to overflow to zero => no first
+ // node found
if (res && size && dma_res->iova >= start + size)
res = NULL;
return res;
@@ -1397,6 +1407,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (iova || size)
goto unlock;
size = U64_MAX;
+ // end of address space unmap passes these checks
} else if (!size || size & (pgsize - 1) ||
iova + size - 1 < iova || size > SIZE_MAX) {
goto unlock;
@@ -1442,18 +1453,23 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
* mappings within the range.
*/
if (iommu->v2 && !unmap_all) {
+ // passes this check as long as the unmap start doesn't split an
+ // existing dma
dma = vfio_find_dma(iommu, iova, 1);
if (dma && dma->iova != iova)
goto unlock;
+ // dma = NULL, we pass this check
dma = vfio_find_dma(iommu, iova + size - 1, 0);
if (dma && dma->iova + dma->size != iova + size)
goto unlock;
}
ret = 0;
+ // n = NULL
n = first_n = vfio_find_dma_first_node(iommu, iova, size);
+ // loop body never entered
while (n) {
dma = rb_entry(n, struct vfio_dma, node);
if (dma->iova >= iova + size)
@@ -1513,6 +1529,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
/* Report how much was unmapped */
unmap->size = unmapped;
+ // return 0
return ret;
}
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-23 20:52 ` Alex Mastro
@ 2025-10-23 22:33 ` Alex Williamson
0 siblings, 0 replies; 27+ messages in thread
From: Alex Williamson @ 2025-10-23 22:33 UTC (permalink / raw)
To: Alex Mastro
Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel,
David Matlack
On Thu, 23 Oct 2025 13:52:04 -0700
Alex Mastro <amastro@fb.com> wrote:
> One point to clarify
>
> > On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > > Along with the tag, it would probably be useful in that same commit to
> > > expand on the scope of the issue in the commit log. I believe we allow
> > > mappings to be created at the top of the address space that cannot be
> > > removed via ioctl,
>
> True
>
> > > but such inconsistency should result in an application error due to the
> > > failed ioctl
>
> Actually, the ioctl does not fail in the sense that the caller gets an errno.
> Attempting to unmap a range ending at the end of address space succeeds (returns
> zero), but unmaps zero bytes. An application would only detect this if it
> explicitly checked the out size field of vfio_iommu_type1_dma_unmap. Or
> attempted to create another overlapping mapping on top of the ranges it expected
> to be unmapped.
Yes, the ioctl doesn't fail, but the returned unmap size exposes the
inconsistency. This is because we don't require a 1:1 unmap; an unmap
can span multiple mappings, or none. Prior to TYPE1v2 we even allowed
splitting previous mappings. I agree that it makes the error harder
for the application to detect, but I'm not sure there's much we can do
about it given the uAPI. Let's document the scenario as best we
can. Thanks,
Alex
> Annotated below:
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 916cad80941c..039cba5a38ca 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -165,19 +165,27 @@ vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
> static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
> dma_addr_t start, size_t size)
> {
> + // start == ~(dma_addr_t)0
> + // size == 0
> struct rb_node *node = iommu->dma_list.rb_node;
>
> while (node) {
> struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
>
> + // never true because all dma->iova < ~(dma_addr_t)0
> if (start + size <= dma->iova)
> node = node->rb_left;
> + // traverses right until we get to the end of address space
> + // dma, then we walk off the end because
> + // ~(dma_addr_t)0 >= 0 == true
> + // node = NULL
> else if (start >= dma->iova + dma->size)
> node = node->rb_right;
> else
> return dma;
> }
>
> + // This happens
> return NULL;
> }
>
> @@ -201,6 +209,8 @@ static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
> node = node->rb_right;
> }
> }
> + // iova >= start + size == true, due to overflow to zero => no first
> + // node found
> if (res && size && dma_res->iova >= start + size)
> res = NULL;
> return res;
> @@ -1397,6 +1407,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> if (iova || size)
> goto unlock;
> size = U64_MAX;
> + // end of address space unmap passes these checks
> } else if (!size || size & (pgsize - 1) ||
> iova + size - 1 < iova || size > SIZE_MAX) {
> goto unlock;
> @@ -1442,18 +1453,23 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> * mappings within the range.
> */
> if (iommu->v2 && !unmap_all) {
> + // passes this check as long as the unmap start doesn't split an
> + // existing dma
> dma = vfio_find_dma(iommu, iova, 1);
> if (dma && dma->iova != iova)
> goto unlock;
>
> + // dma = NULL, we pass this check
> dma = vfio_find_dma(iommu, iova + size - 1, 0);
> if (dma && dma->iova + dma->size != iova + size)
> goto unlock;
> }
>
> ret = 0;
> + // n = NULL
> n = first_n = vfio_find_dma_first_node(iommu, iova, size);
>
> + // loop body never entered
> while (n) {
> dma = rb_entry(n, struct vfio_dma, node);
> if (dma->iova >= iova + size)
> @@ -1513,6 +1529,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> /* Report how much was unmapped */
> unmap->size = unmapped;
>
> + // return 0
> return ret;
> }
>
>
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-21 16:25 ` Alex Mastro
2025-10-21 16:31 ` David Matlack
2025-10-23 20:52 ` Alex Mastro
@ 2025-10-27 16:02 ` Alex Mastro
2025-10-28 1:57 ` Alex Williamson
2 siblings, 1 reply; 27+ messages in thread
From: Alex Mastro @ 2025-10-27 16:02 UTC (permalink / raw)
To: Alex Williamson
Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel,
David Matlack
On Tue, Oct 21, 2025 at 09:25:55AM -0700, Alex Mastro wrote:
> On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > I do note that we're missing a Fixes: tag. I think we've had hints of
> > this issue all the way back to the original implementation, so perhaps
> > the last commit should include:
> >
> > Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
>
> SGTM
>
> > Unless you've identified a more specific target.
>
> I have not.
>
> > Along with the tag, it would probably be useful in that same commit to
> > expand on the scope of the issue in the commit log. I believe we allow
> > mappings to be created at the top of the address space that cannot be
> > removed via ioctl, but such inconsistency should result in an
> > application error due to the failed ioctl and does not affect cleanup
> > on release.
I want to make sure I understand the cleanup on release path. Is my supposition
below correct?
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 916cad80941c..7f8d23b06680 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1127,6 +1127,7 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
bool do_accounting)
{
+ // end == 0 due to overflow
dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
LIST_HEAD(unmapped_region_list);
@@ -1156,6 +1157,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
}
iommu_iotlb_gather_init(&iotlb_gather);
+ // doesn't enter the loop, never calls iommu_unmap
while (iova < end) {
size_t unmapped, len;
phys_addr_t phys, next;
@@ -1210,6 +1212,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
+ // go here
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
@@ -2394,6 +2397,8 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
struct rb_node *node;
while ((node = rb_first(&iommu->dma_list)))
+ // eventually, we attempt to remove the end of address space
+ // mapping
vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}
@@ -2628,6 +2633,8 @@ static void vfio_release_domain(struct vfio_domain *domain)
kfree(group);
}
+ // Is this backstop what saves us? Even though we didn't do individual
+ // unmaps, the "leaked" end of address space mappings get freed here?
iommu_domain_free(domain->domain);
}
@@ -2643,10 +2650,12 @@ static void vfio_iommu_type1_release(void *iommu_data)
kfree(group);
}
+ // start here
vfio_iommu_unmap_unpin_all(iommu);
list_for_each_entry_safe(domain, domain_tmp,
&iommu->domain_list, next) {
+ // eventually...
vfio_release_domain(domain);
list_del(&domain->next);
kfree(domain);
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-27 16:02 ` Alex Mastro
@ 2025-10-28 1:57 ` Alex Williamson
2025-10-28 15:29 ` Alex Mastro
0 siblings, 1 reply; 27+ messages in thread
From: Alex Williamson @ 2025-10-28 1:57 UTC (permalink / raw)
To: Alex Mastro
Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel,
David Matlack
On Mon, 27 Oct 2025 09:02:55 -0700
Alex Mastro <amastro@fb.com> wrote:
> On Tue, Oct 21, 2025 at 09:25:55AM -0700, Alex Mastro wrote:
> > On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > > Along with the tag, it would probably be useful in that same commit to
> > > expand on the scope of the issue in the commit log. I believe we allow
> > > mappings to be created at the top of the address space that cannot be
> > > removed via ioctl, but such inconsistency should result in an
> > > application error due to the failed ioctl and does not affect cleanup
> > > on release.
>
> I want to make sure I understand the cleanup on release path. Is my supposition
> below correct?
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 916cad80941c..7f8d23b06680 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1127,6 +1127,7 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
> static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> bool do_accounting)
> {
> + // end == 0 due to overflow
> dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> struct vfio_domain *domain, *d;
> LIST_HEAD(unmapped_region_list);
> @@ -1156,6 +1157,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> }
>
> iommu_iotlb_gather_init(&iotlb_gather);
> + // doesn't enter the loop, never calls iommu_unmap
If it were only that, I think the iommu_domain_free() would be
sufficient, but it looks like we're also missing the unpin. Freeing
the IOMMU domain isn't going to resolve that. So it actually appears
we're leaking those pinned pages and this isn't as self-resolving as I
had thought. I imagine if you ran your new unit test to the point where
we'd pinned and failed to unpin the majority of memory you'd start to
see system-wide problems. Thanks,
Alex
> while (iova < end) {
> size_t unmapped, len;
> phys_addr_t phys, next;
> @@ -1210,6 +1212,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> {
> WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
> + // go here
> vfio_unmap_unpin(iommu, dma, true);
> vfio_unlink_dma(iommu, dma);
> put_task_struct(dma->task);
> @@ -2394,6 +2397,8 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> struct rb_node *node;
>
> while ((node = rb_first(&iommu->dma_list)))
> + // eventually, we attempt to remove the end of address space
> + // mapping
> vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> }
>
> @@ -2628,6 +2633,8 @@ static void vfio_release_domain(struct vfio_domain *domain)
> kfree(group);
> }
>
> + // Is this backstop what saves us? Even though we didn't do individual
> + // unmaps, the "leaked" end of address space mappings get freed here?
> iommu_domain_free(domain->domain);
> }
>
> @@ -2643,10 +2650,12 @@ static void vfio_iommu_type1_release(void *iommu_data)
> kfree(group);
> }
>
> + // start here
> vfio_iommu_unmap_unpin_all(iommu);
>
> list_for_each_entry_safe(domain, domain_tmp,
> &iommu->domain_list, next) {
> + // eventually...
> vfio_release_domain(domain);
> list_del(&domain->next);
> kfree(domain);
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-28 1:57 ` Alex Williamson
@ 2025-10-28 15:29 ` Alex Mastro
0 siblings, 0 replies; 27+ messages in thread
From: Alex Mastro @ 2025-10-28 15:29 UTC (permalink / raw)
To: Alex Williamson
Cc: Alejandro Jimenez, Jason Gunthorpe, kvm, linux-kernel,
David Matlack
On Mon, Oct 27, 2025 at 07:57:32PM -0600, Alex Williamson wrote:
> On Mon, 27 Oct 2025 09:02:55 -0700
> Alex Mastro <amastro@fb.com> wrote:
>
> > On Tue, Oct 21, 2025 at 09:25:55AM -0700, Alex Mastro wrote:
> > > On Mon, Oct 20, 2025 at 03:36:33PM -0600, Alex Williamson wrote:
> > > > Along with the tag, it would probably be useful in that same commit to
> > > > expand on the scope of the issue in the commit log. I believe we allow
> > > > mappings to be created at the top of the address space that cannot be
> > > > removed via ioctl, but such inconsistency should result in an
> > > > application error due to the failed ioctl and does not affect cleanup
> > > > on release.
> >
> > I want to make sure I understand the cleanup on release path. Is my supposition
> > below correct?
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 916cad80941c..7f8d23b06680 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -1127,6 +1127,7 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
> > static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > bool do_accounting)
> > {
> > + // end == 0 due to overflow
> > dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> > struct vfio_domain *domain, *d;
> > LIST_HEAD(unmapped_region_list);
> > @@ -1156,6 +1157,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > }
> >
> > iommu_iotlb_gather_init(&iotlb_gather);
> > + // doesn't enter the loop, never calls iommu_unmap
>
> If it were only that, I think the iommu_domain_free() would be
> sufficient, but it looks like we're also missing the unpin. Freeing
Oh, right.
> the IOMMU domain isn't going to resolve that. So it actually appears
> we're leaking those pinned pages and this isn't as self-resolving as I
> had thought. I imagine if you ran your new unit test to the point where
> we'd pinned and failed to unpin the majority of memory you'd start to
> see system-wide problems. Thanks,
Makes sense.
> Alex
>
> > while (iova < end) {
> > size_t unmapped, len;
> > phys_addr_t phys, next;
> > @@ -1210,6 +1212,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > {
> > WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
> > + // go here
> > vfio_unmap_unpin(iommu, dma, true);
> > vfio_unlink_dma(iommu, dma);
> > put_task_struct(dma->task);
> > @@ -2394,6 +2397,8 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> > struct rb_node *node;
> >
> > while ((node = rb_first(&iommu->dma_list)))
> > + // eventually, we attempt to remove the end of address space
> > + // mapping
> > vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> > }
> >
> > @@ -2628,6 +2633,8 @@ static void vfio_release_domain(struct vfio_domain *domain)
> > kfree(group);
> > }
> >
> > + // Is this backstop what saves us? Even though we didn't do individual
> > + // unmaps, the "leaked" end of address space mappings get freed here?
> > iommu_domain_free(domain->domain);
> > }
> >
> > @@ -2643,10 +2650,12 @@ static void vfio_iommu_type1_release(void *iommu_data)
> > kfree(group);
> > }
> >
> > + // start here
> > vfio_iommu_unmap_unpin_all(iommu);
> >
> > list_for_each_entry_safe(domain, domain_tmp,
> > &iommu->domain_list, next) {
> > + // eventually...
> > vfio_release_domain(domain);
> > list_del(&domain->next);
> > kfree(domain);
>
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-13 5:32 [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit Alex Mastro
` (3 preceding siblings ...)
2025-10-15 19:24 ` [PATCH v4 0/3] vfio: " Alex Williamson
@ 2025-10-21 12:36 ` Jason Gunthorpe
2025-10-21 22:21 ` Alejandro Jimenez
2025-10-25 18:11 ` Alex Mastro
6 siblings, 0 replies; 27+ messages in thread
From: Jason Gunthorpe @ 2025-10-21 12:36 UTC (permalink / raw)
To: Alex Mastro; +Cc: Alex Williamson, Alejandro Jimenez, kvm, linux-kernel
On Sun, Oct 12, 2025 at 10:32:23PM -0700, Alex Mastro wrote:
> This patch series aims to fix vfio_iommu_type.c to support
> VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA operations targeting IOVA
> ranges which lie against the addressable limit. i.e. ranges where
> iova_start + iova_size would overflow to exactly zero.
I didn't try to look through all the math changes but it looks like
the right thing to do to me
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Jason
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-13 5:32 [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit Alex Mastro
` (4 preceding siblings ...)
2025-10-21 12:36 ` Jason Gunthorpe
@ 2025-10-21 22:21 ` Alejandro Jimenez
2025-10-25 18:11 ` Alex Mastro
6 siblings, 0 replies; 27+ messages in thread
From: Alejandro Jimenez @ 2025-10-21 22:21 UTC (permalink / raw)
To: Alex Mastro, Alex Williamson; +Cc: Jason Gunthorpe, kvm, linux-kernel
On 10/13/25 1:32 AM, Alex Mastro wrote:
> This patch series aims to fix vfio_iommu_type.c to support
> VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA operations targeting IOVA
> ranges which lie against the addressable limit. i.e. ranges where
> iova_start + iova_size would overflow to exactly zero.
>
I sent a reply to Patch 3 with a small nit, but regardless of whether
you chose to make a change or not, the current changes look good to me.
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Thank you,
Alejandro
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-13 5:32 [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit Alex Mastro
` (5 preceding siblings ...)
2025-10-21 22:21 ` Alejandro Jimenez
@ 2025-10-25 18:11 ` Alex Mastro
2025-10-27 13:39 ` Jason Gunthorpe
6 siblings, 1 reply; 27+ messages in thread
From: Alex Mastro @ 2025-10-25 18:11 UTC (permalink / raw)
To: Alex Williamson
Cc: Jason Gunthorpe, Alejandro Jimenez, David Matlack, kvm,
linux-kernel
Alex and Jason, during my testing, I found that the behavior of range-based
(!VFIO_DMA_UNMAP_FLAG_ALL) VFIO_IOMMU_UNMAP_DMA differs slightly when using
/dev/iommu as the container.
iommufd treats range-based unmap where there are no hits in the range as an
error, and the ioctl fails with ENOENT.
vfio_iommu_type1.c treats this as a success and reports zero bytes unmapped in
vfio_iommu_type1_dma_unmap.size.
It implies to me that we may need some more shimming in
drivers/iommu/iommufd/vfio_compat.c to iron out observable differences in UAPI
usage.
Addressing it would be outside the scope of this patch series, so this mail is
just to make note of my findings.
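For reference, the difference as seen from userspace looks like this
(sketch; the two fds stand for containers of each flavor, and the range
contains no mappings):

	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova = 0x100000,	/* arbitrary range with nothing mapped */
		.size = 0x1000,
	};

	/* vfio_iommu_type1 container: returns 0 and reports unmap.size == 0 */
	ioctl(type1_container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);

	/* iommufd compat container (/dev/iommu): fails, errno == ENOENT */
	ioctl(iommufd_compat_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);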
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-25 18:11 ` Alex Mastro
@ 2025-10-27 13:39 ` Jason Gunthorpe
2025-10-28 18:42 ` Alex Mastro
0 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2025-10-27 13:39 UTC (permalink / raw)
To: Alex Mastro
Cc: Alex Williamson, Alejandro Jimenez, David Matlack, kvm,
linux-kernel
On Sat, Oct 25, 2025 at 11:11:49AM -0700, Alex Mastro wrote:
> Alex and Jason, during my testing, I found that the behavior of range-based
> (!VFIO_DMA_UNMAP_FLAG_ALL) VFIO_IOMMU_UNMAP_DMA differs slightly when using
> /dev/iommu as the container.
>
> iommufd treats range-based unmap where there are no hits in the range as an
> error, and the ioctl fails with ENOENT.
> vfio_iommu_type1.c treats this as a success and reports zero bytes unmapped in
> vfio_iommu_type1_dma_unmap.size.
Oh, weird...
What do you think about this:
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index c0360c450880b8..1124f68ec9020d 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -707,7 +707,8 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
struct iopt_area *area;
unsigned long unmapped_bytes = 0;
unsigned int tries = 0;
- int rc = -ENOENT;
+ /* If there are no mapped entries then success */
+ int rc = 0;
/*
* The domains_rwsem must be held in read mode any time any area->pages
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 1542c5fd10a85c..ef5e56672dea56 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -367,6 +367,8 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
&unmapped);
if (rc)
goto out_put;
+ if (!unmapped)
+ rc = -ENOENT;
}
cmd->length = unmapped;
Thanks,
Jason
* Re: [PATCH v4 0/3] vfio: handle DMA map/unmap up to the addressable limit
2025-10-27 13:39 ` Jason Gunthorpe
@ 2025-10-28 18:42 ` Alex Mastro
0 siblings, 0 replies; 27+ messages in thread
From: Alex Mastro @ 2025-10-28 18:42 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Alex Williamson, Alejandro Jimenez, David Matlack, kvm,
linux-kernel
On Mon, Oct 27, 2025 at 10:39:04AM -0300, Jason Gunthorpe wrote:
> On Sat, Oct 25, 2025 at 11:11:49AM -0700, Alex Mastro wrote:
> > Alex and Jason, during my testing, I found that the behavior of range-based
> > (!VFIO_DMA_UNMAP_FLAG_ALL) VFIO_IOMMU_UNMAP_DMA differs slightly when using
> > /dev/iommu as the container.
> >
> > iommufd treats range-based unmap where there are no hits in the range as an
> > error, and the ioctl fails with ENOENT.
>
> > vfio_iommu_type1.c treats this as a success and reports zero bytes unmapped in
> > vfio_iommu_type1_dma_unmap.size.
>
> Oh, weird...
>
> What do you think about this:
>
> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
> index c0360c450880b8..1124f68ec9020d 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -707,7 +707,8 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
> struct iopt_area *area;
> unsigned long unmapped_bytes = 0;
> unsigned int tries = 0;
> - int rc = -ENOENT;
> + /* If there are no mapped entries then success */
> + int rc = 0;
>
> /*
> * The domains_rwsem must be held in read mode any time any area->pages
> diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
> index 1542c5fd10a85c..ef5e56672dea56 100644
> --- a/drivers/iommu/iommufd/ioas.c
> +++ b/drivers/iommu/iommufd/ioas.c
> @@ -367,6 +367,8 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
> &unmapped);
> if (rc)
> goto out_put;
> + if (!unmapped)
> + rc = -ENOENT;
> }
>
> cmd->length = unmapped;
Seems reasonable to me. The only affected callers are
drivers/iommu/iommufd/ioas.c
366: rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
drivers/iommu/iommufd/vfio_compat.c
244: rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
So your proposal should get vfio_compat.c into good shape.
I think these locations need more scrutiny after your change:
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index c0360c450880..e271696f726f 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -777,6 +777,7 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
down_write(&iopt->iova_rwsem);
}
+ // redundant?
if (unmapped_bytes)
rc = 0;
@@ -818,6 +819,7 @@ int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
int rc;
rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
+ // intent still holds?
/* If the IOVAs are empty then unmap all succeeds */
if (rc == -ENOENT)
return 0;