* [RFC V1 1/4] iommufd: Export do_update_pinned
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
@ 2024-07-20 18:56 ` Steve Sistare
2024-07-20 18:56 ` [RFC V1] iommufd debug print Steve Sistare
` (6 subsequent siblings)
7 siblings, 0 replies; 27+ messages in thread
From: Steve Sistare @ 2024-07-20 18:56 UTC (permalink / raw)
To: iommu
Cc: Jason Gunthorpe, Kevin Tian, Alex Williamson, Cornelia Huck,
Steve Sistare
Export do_update_pinned. No functional change.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
drivers/iommu/iommufd/io_pagetable.h | 5 +++++
drivers/iommu/iommufd/pages.c | 10 +++++-----
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 0ec3509..c37b680 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -238,4 +238,9 @@ struct iopt_pages_access {
unsigned int users;
};
+struct pfn_reader_user;
+
+int iopt_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user);
+
#endif
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 528f356..2366c15 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -854,8 +854,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
return rc;
}
-static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
- bool inc, struct pfn_reader_user *user)
+int iopt_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
{
int rc = 0;
@@ -889,8 +889,8 @@ static void update_unpinned(struct iopt_pages *pages)
return;
if (pages->npinned == pages->last_npinned)
return;
- do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
- NULL);
+ iopt_update_pinned(pages, pages->last_npinned - pages->npinned, false,
+ NULL);
}
/*
@@ -920,7 +920,7 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
npages = pages->npinned - pages->last_npinned;
inc = true;
}
- return do_update_pinned(pages, npages, inc, user);
+ return iopt_update_pinned(pages, npages, inc, user);
}
/*
--
1.8.3.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC V1] iommufd debug print
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
2024-07-20 18:56 ` [RFC V1 1/4] iommufd: Export do_update_pinned Steve Sistare
@ 2024-07-20 18:56 ` Steve Sistare
2024-07-20 19:01 ` Steven Sistare
2024-07-20 18:56 ` [RFC V1 2/4] iommufd: Lock all objects Steve Sistare
` (5 subsequent siblings)
7 siblings, 1 reply; 27+ messages in thread
From: Steve Sistare @ 2024-07-20 18:56 UTC (permalink / raw)
To: iommu
Cc: Jason Gunthorpe, Kevin Tian, Alex Williamson, Cornelia Huck,
Steve Sistare
---
drivers/iommu/iommufd/ioas.c | 71 ++++++++++++++++++++++++++++++++++++++++---
drivers/iommu/iommufd/main.c | 6 +++-
drivers/iommu/iommufd/pages.c | 10 ++++--
drivers/vfio/group.c | 1 +
4 files changed, 81 insertions(+), 7 deletions(-)
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 5d23bdd..7b6a73b 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -329,9 +329,14 @@ static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx,
struct iommufd_ioas *ioas;
unsigned long index;
+ printk("iommufd: %s\n", __func__);
xa_for_each(ioas_list, index, ioas) {
+ printk("up_write for ioas %px index %ld rwsem %px\n",
+ ioas, index, &ioas->iopt.iova_rwsem);
up_write(&ioas->iopt.iova_rwsem);
refcount_dec(&ioas->obj.users);
+ printk("obj->users refcount after is %d\n",
+ refcount_read(&ioas->obj.users));
}
up_write(&ictx->ioas_creation_lock);
xa_destroy(ioas_list);
@@ -344,6 +349,7 @@ static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
unsigned long index;
int rc;
+ printk("iommufd: %s\n", __func__);
/*
* This is very ugly, it is done instead of adding a lock around
* pages->source_mm, which is a performance path for mdev, we just
@@ -362,19 +368,30 @@ static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
xa_for_each(&ictx->objects, index, obj) {
struct iommufd_ioas *ioas;
- if (!obj || obj->type != IOMMUFD_OBJ_IOAS)
+ printk("obj %px type %d index %ld\n",
+ obj, obj ? obj->type : -1, index);
+ if (!obj || obj->type != IOMMUFD_OBJ_IOAS) {
+ printk("skip wrong type\n");
continue;
+ }
- if (!refcount_inc_not_zero(&obj->users))
+ printk("obj->users refcount before is %d\n",
+ refcount_read(&obj->users));
+ if (!refcount_inc_not_zero(&obj->users)) {
+ printk("skip !refcount_inc_not_zero\n");
continue;
+ }
xa_unlock(&ictx->objects);
ioas = container_of(obj, struct iommufd_ioas, obj);
down_write(&ioas->iopt.iova_rwsem);
+ printk("down_write for ioas %px index %ld rwsem %px\n",
+ ioas, index, &ioas->iopt.iova_rwsem);
rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL));
if (rc) {
+ printk("xa_store failed!!\n");
lockdep_on();
iommufd_release_all_iova_rwsem(ictx, ioas_list);
return rc;
@@ -389,23 +406,38 @@ static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
static bool need_charge_update(struct iopt_pages *pages)
{
+ printk("iommufd: %s mode %u: %sgroup_leader, src pid %d cur pid %d,"
+ "src mm %px cur mm %px, src user %px cur user %px\n",
+ __func__, pages->account_mode,
+ pages->source_task == current->group_leader ? "is " : "not ",
+ pages->source_task->pid, current->group_leader->pid,
+ pages->source_mm, current->mm,
+ pages->source_user, current_user());
+
if (pages->source_task == current->group_leader &&
pages->source_mm == current->mm &&
pages->source_user == current_user()) {
+ printk("same mm and user, return false\n");
return false;
}
switch (pages->account_mode) {
case IOPT_PAGES_ACCOUNT_NONE:
+ printk("accounting disabled, return false\n");
return false;
case IOPT_PAGES_ACCOUNT_USER:
- if (pages->source_user == current_user())
+ if (pages->source_user == current_user()) {
+ printk("same user, return false\n");
return false;
+ }
break;
case IOPT_PAGES_ACCOUNT_MM:
- if (pages->source_mm == current->mm)
+ if (pages->source_mm == current->mm) {
+ printk("same mm, return false\n");
return false;
+ }
}
+ printk("iommufd: %s mode %u returns true\n", __func__, pages->account_mode);
return true;
}
@@ -425,6 +457,8 @@ static int charge_current(unsigned long *npinned)
continue;
tmp.account_mode = account_mode;
+ printk("iommufd: call update_pinned mode %u %lu inc=true\n",
+ account_mode, npinned[account_mode]);
rc = iopt_update_pinned(&tmp, npinned[account_mode], true,
NULL);
if (rc)
@@ -436,6 +470,8 @@ static int charge_current(unsigned long *npinned)
while (account_mode != 0) {
account_mode--;
tmp.account_mode = account_mode;
+ printk("iommufd: call do_update_pinned mode %u %lu inc=false\n",
+ account_mode, npinned[account_mode]);
iopt_update_pinned(&tmp, npinned[account_mode], false, NULL);
}
return rc;
@@ -447,6 +483,12 @@ static void change_mm(struct iopt_pages *pages)
struct user_struct *old_user = pages->source_user;
struct mm_struct *old_mm = pages->source_mm;
+ printk("iommufd: %s from pid %d user %px mm %px "
+ "to pid %d user %px mm %px\n",
+ __func__,
+ old_task->pid, old_user, old_mm,
+ current->group_leader->pid, current_user(), current->mm);
+
pages->source_mm = current->mm;
mmgrab(pages->source_mm);
mmdrop(old_mm);
@@ -468,6 +510,7 @@ static int iommufd_get_umap(struct iommu_ioas_change_process *cmd,
int len = cmd->n_umap * sizeof(**p_umap);
struct iommu_ioas_userspace_map *umap = kzalloc(len, GFP_KERNEL);
+ printk("iommufd: %s\n", __func__);
if (!umap)
return -ENOMEM;
@@ -486,6 +529,7 @@ static int iommufd_get_umap(struct iommu_ioas_change_process *cmd,
}
}
*p_umap = umap;
+ printk("iommufd: %s returns\n", __func__);
return 0;
}
@@ -509,9 +553,18 @@ static int pages_validate_userspace(struct iopt_pages *pages,
struct iommu_ioas_userspace_map *newaddr;
size_t pages_size = pages->npages * PAGE_SIZE;
+ printk("iommufd: %s: uptr = %px, size = %ld\n",
+ __func__, pages->uptr, pages_size);
+
if (!n_umap)
return 0;
newaddr = bsearch(pages->uptr, umap, n_umap, sizeof(*umap), cmp_addr);
+ if (!newaddr || pages_size > newaddr->size) {
+ if (newaddr)
+ printk("umap invalid: new size %llx\n", newaddr->size);
+ else
+ printk("umap invalid: newaddr not found\n");
+ }
if (!newaddr || pages_size > newaddr->size)
return -EINVAL;
return 0;
@@ -522,11 +575,15 @@ static void pages_update_userspace(struct iopt_pages *pages,
int n_umap)
{
struct iommu_ioas_userspace_map *newaddr;
+ size_t pages_size = pages->npages * PAGE_SIZE;
+ void *uptr = pages->uptr;
if (!n_umap)
return;
newaddr = bsearch(pages->uptr, umap, n_umap, sizeof(*umap), cmp_addr);
pages->uptr += newaddr->addr_new - newaddr->addr_old;
+ printk("iommufd: %s: uptr = %px, size = %ld -> new uptr %px\n",
+ __func__, uptr, pages_size, pages->uptr);
}
#define for_each_ioas_area(_xa, _index, _ioas, _area) \
@@ -548,6 +605,8 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
unsigned long index;
int rc;
+ printk("iommufd: %s\n", __func__);
+
if (cmd->flags || cmd->__reserved)
return -EOPNOTSUPP;
@@ -582,6 +641,9 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
}
}
+ printk("iommufd: all_npinned = %ld, %ld, %ld\n",
+ all_npinned[0], all_npinned[1], all_npinned[2]);
+
rc = charge_current(all_npinned);
if (rc) {
@@ -607,6 +669,7 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
out_no_release:
kfree(umap);
+ printk("iommufd: %s returns\n", __func__);
return rc;
}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index f8a2736..9a777b2 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -165,6 +165,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
* defer cleaning this object until close.
*/
refcount_dec(&to_destroy->users);
+ printk("iommufd_object_remove bug? ret %d\n", ret);
return ret;
}
zerod_shortterm = true;
@@ -204,8 +205,10 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
*/
if (!zerod_shortterm) {
ret = iommufd_object_dec_wait_shortterm(ictx, obj);
- if (WARN_ON(ret))
+ if (WARN_ON(ret)) {
+ printk("iommufd_object_remove bug 2? ret %d\n", ret);
return ret;
+ }
}
iommufd_object_ops[obj->type].destroy(obj);
@@ -213,6 +216,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
return 0;
err_xa:
+ printk("iommufd_object_remove err_xa ret %d\n", ret);
if (zerod_shortterm) {
/* Restore the xarray owned reference */
refcount_set(&obj->shortterm_users, 1);
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 2366c15..d17178e4 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -812,8 +812,11 @@ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
do {
cur_pages = atomic_long_read(&pages->source_user->locked_vm);
new_pages = cur_pages + npages;
- if (new_pages > lock_limit)
+ if (new_pages > lock_limit) {
+ pr_info("%s new_pages=%ld limit=%ld src pid=%d\n",
+ __func__, new_pages, lock_limit, pages->source_task->pid);
return -ENOMEM;
+ }
} while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
new_pages) != cur_pages);
return 0;
@@ -821,8 +824,11 @@ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
{
- if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages))
+ if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages)){
+ pr_info("%s locked_vm=%ld < npages=%ld\n",
+ __func__, atomic_long_read(&pages->source_user->locked_vm), npages);
return;
+ }
atomic_long_sub(npages, &pages->source_user->locked_vm);
}
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index 610a429..8197695 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -147,6 +147,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
group->iommufd = iommufd;
goto out_unlock;
}
+ pr_info("container %px has iommufd %px\n", container, iommufd);
/* The FD passed is not recognized. */
ret = -EBADFD;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [RFC V1] iommufd debug print
2024-07-20 18:56 ` [RFC V1] iommufd debug print Steve Sistare
@ 2024-07-20 19:01 ` Steven Sistare
0 siblings, 0 replies; 27+ messages in thread
From: Steven Sistare @ 2024-07-20 19:01 UTC (permalink / raw)
To: iommu; +Cc: Jason Gunthorpe, Kevin Tian, Alex Williamson, Cornelia Huck
Please ignore this cruft that slipped into my patch directory - steve
On 7/20/2024 2:56 PM, Steve Sistare wrote:
> ---
> drivers/iommu/iommufd/ioas.c | 71 ++++++++++++++++++++++++++++++++++++++++---
> drivers/iommu/iommufd/main.c | 6 +++-
> drivers/iommu/iommufd/pages.c | 10 ++++--
> drivers/vfio/group.c | 1 +
> 4 files changed, 81 insertions(+), 7 deletions(-)
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* [RFC V1 2/4] iommufd: Lock all objects
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
2024-07-20 18:56 ` [RFC V1 1/4] iommufd: Export do_update_pinned Steve Sistare
2024-07-20 18:56 ` [RFC V1] iommufd debug print Steve Sistare
@ 2024-07-20 18:56 ` Steve Sistare
2024-07-22 15:37 ` Jason Gunthorpe
2024-07-20 18:56 ` [RFC V1 3/4] iommufd: Add IOMMU_IOAS_CHANGE_PROCESS Steve Sistare
` (4 subsequent siblings)
7 siblings, 1 reply; 27+ messages in thread
From: Steve Sistare @ 2024-07-20 18:56 UTC (permalink / raw)
To: iommu
Cc: Jason Gunthorpe, Kevin Tian, Alex Williamson, Cornelia Huck,
Steve Sistare
Define helpers to lock and unlock all iommufd_ctx objects.
This will allow DMA mappings to be updated atomically during live update.
This code is substantially the same as an initial version provided by
Jason, plus fixes.
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
drivers/iommu/iommufd/ioas.c | 67 +++++++++++++++++++++++++++++++++
drivers/iommu/iommufd/iommufd_private.h | 1 +
drivers/iommu/iommufd/main.c | 1 +
3 files changed, 69 insertions(+)
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 7422482..ab31879 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -51,7 +51,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
if (rc)
goto out_table;
+
+ down_read(&ucmd->ictx->ioas_creation_lock);
iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+ up_read(&ucmd->ictx->ioas_creation_lock);
return 0;
out_table:
@@ -319,6 +322,70 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
return rc;
}
+static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_ioas *ioas;
+ unsigned long index;
+
+ xa_for_each(ioas_list, index, ioas) {
+ up_write(&ioas->iopt.iova_rwsem);
+ refcount_dec(&ioas->obj.users);
+ }
+ up_write(&ictx->ioas_creation_lock);
+ xa_destroy(ioas_list);
+}
+
+static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_object *obj;
+ unsigned long index;
+ int rc;
+
+ /*
+ * This is very ugly, it is done instead of adding a lock around
+ * pages->source_mm, which is a performance path for mdev, we just
+ * obtain the write side of all the iova_rwsems which also protects the
+ * pages->source_*. Due to copies we can't know which IOAS could read
+ * from the pages, so we just lock everything. This is the only place
+ * locks are nested and they are uniformly taken in ID order.
+ *
+ * ioas_creation_lock prevents new IOAS from being installed in the
+ * xarray while we do this, and also prevents more than one thread from
+ * holding nested locks.
+ */
+ down_write(&ictx->ioas_creation_lock);
+ xa_lock(&ictx->objects);
+ lockdep_off();
+ xa_for_each(&ictx->objects, index, obj) {
+ struct iommufd_ioas *ioas;
+
+ if (!obj || obj->type != IOMMUFD_OBJ_IOAS)
+ continue;
+
+ if (!refcount_inc_not_zero(&obj->users))
+ continue;
+
+ xa_unlock(&ictx->objects);
+
+ ioas = container_of(obj, struct iommufd_ioas, obj);
+ down_write(&ioas->iopt.iova_rwsem);
+
+ rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL));
+ if (rc) {
+ lockdep_on();
+ iommufd_release_all_iova_rwsem(ictx, ioas_list);
+ return rc;
+ }
+
+ xa_lock(&ictx->objects);
+ }
+ lockdep_on();
+ xa_unlock(&ictx->objects);
+ return 0;
+}
+
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx)
{
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864..1ad19a9 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -22,6 +22,7 @@ struct iommufd_ctx {
struct xarray objects;
struct xarray groups;
wait_queue_head_t destroy_wait;
+ struct rw_semaphore ioas_creation_lock;
u8 account_mode;
/* Compatibility with VFIO no iommu */
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 39b3293..4a15cf9 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -248,6 +248,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
}
+ init_rwsem(&ictx->ioas_creation_lock);
xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
xa_init(&ictx->groups);
ictx->file = filp;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [RFC V1 2/4] iommufd: Lock all objects
2024-07-20 18:56 ` [RFC V1 2/4] iommufd: Lock all objects Steve Sistare
@ 2024-07-22 15:37 ` Jason Gunthorpe
2024-08-05 19:01 ` Steven Sistare
0 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-07-22 15:37 UTC (permalink / raw)
To: Steve Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Sat, Jul 20, 2024 at 11:56:43AM -0700, Steve Sistare wrote:
> +static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
> + struct xarray *ioas_list)
> +{
> + struct iommufd_object *obj;
> + unsigned long index;
> + int rc;
> +
> + /*
> + * This is very ugly, it is done instead of adding a lock around
> + * pages->source_mm, which is a performance path for mdev, we just
> + * obtain the write side of all the iova_rwsems which also protects the
> + * pages->source_*. Due to copies we can't know which IOAS could read
> + * from the pages, so we just lock everything. This is the only place
> + * locks are nested and they are uniformly taken in ID order.
> + *
> + * ioas_creation_lock prevents new IOAS from being installed in the
> + * xarray while we do this, and also prevents more than one thread from
> + * holding nested locks.
> + */
> + down_write(&ictx->ioas_creation_lock);
> + xa_lock(&ictx->objects);
> + lockdep_off();
IIRC we used the nested annotation on the down_write instead of the
global lockdep_off? See how mm_take_all_locks() did a similar madness
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [RFC V1 2/4] iommufd: Lock all objects
2024-07-22 15:37 ` Jason Gunthorpe
@ 2024-08-05 19:01 ` Steven Sistare
2024-09-26 14:00 ` Steven Sistare
0 siblings, 1 reply; 27+ messages in thread
From: Steven Sistare @ 2024-08-05 19:01 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 7/22/2024 11:37 AM, Jason Gunthorpe wrote:
> On Sat, Jul 20, 2024 at 11:56:43AM -0700, Steve Sistare wrote:
>> +static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
>> + struct xarray *ioas_list)
>> +{
>> + struct iommufd_object *obj;
>> + unsigned long index;
>> + int rc;
>> +
>> + /*
>> + * This is very ugly, it is done instead of adding a lock around
>> + * pages->source_mm, which is a performance path for mdev, we just
>> + * obtain the write side of all the iova_rwsems which also protects the
>> + * pages->source_*. Due to copies we can't know which IOAS could read
>> + * from the pages, so we just lock everything. This is the only place
>> + * locks are nested and they are uniformly taken in ID order.
>> + *
>> + * ioas_creation_lock prevents new IOAS from being installed in the
>> + * xarray while we do this, and also prevents more than one thread from
>> + * holding nested locks.
>> + */
>> + down_write(&ictx->ioas_creation_lock);
>> + xa_lock(&ictx->objects);
>> + lockdep_off();
>
> IIRC we used the nested annotation on the down_write instead of the
> global lockdep_off? See how mm_take_all_locks() did a similar madness
The code you sent me used lockdep_off but had a comment wishing for a better
method. I'll try the mm_take_all_locks method.
(BTW, sorry for the lag time, I just returned from vacation).
- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [RFC V1 2/4] iommufd: Lock all objects
2024-08-05 19:01 ` Steven Sistare
@ 2024-09-26 14:00 ` Steven Sistare
0 siblings, 0 replies; 27+ messages in thread
From: Steven Sistare @ 2024-09-26 14:00 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 8/5/2024 3:01 PM, Steven Sistare wrote:
> On 7/22/2024 11:37 AM, Jason Gunthorpe wrote:
>> On Sat, Jul 20, 2024 at 11:56:43AM -0700, Steve Sistare wrote:
>>> +static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
>>> + struct xarray *ioas_list)
>>> +{
>>> + struct iommufd_object *obj;
>>> + unsigned long index;
>>> + int rc;
>>> +
>>> + /*
>>> + * This is very ugly, it is done instead of adding a lock around
>>> + * pages->source_mm, which is a performance path for mdev, we just
>>> + * obtain the write side of all the iova_rwsems which also protects the
>>> + * pages->source_*. Due to copies we can't know which IOAS could read
>>> + * from the pages, so we just lock everything. This is the only place
>>> + * locks are nested and they are uniformly taken in ID order.
>>> + *
>>> + * ioas_creation_lock prevents new IOAS from being installed in the
>>> + * xarray while we do this, and also prevents more than one thread from
>>> + * holding nested locks.
>>> + */
>>> + down_write(&ictx->ioas_creation_lock);
>>> + xa_lock(&ictx->objects);
>>> + lockdep_off();
>>
>> IIRC we used the nested annotation on the down_write instead of the
>> global lockdep_off? See how mm_take_all_locks() did a similar madness
>
> The code you sent me used lockdep_off but had a comment wishing for a better
> method. I'll try the mm_take_all_locks method.
Sorry, I forgot to address this in V2 which I just sent.
Will investigate and add to V3 if it works out.
- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* [RFC V1 3/4] iommufd: Add IOMMU_IOAS_CHANGE_PROCESS
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
` (2 preceding siblings ...)
2024-07-20 18:56 ` [RFC V1 2/4] iommufd: Lock all objects Steve Sistare
@ 2024-07-20 18:56 ` Steve Sistare
2024-07-20 18:56 ` [RFC V1 4/4] iommufd: update VA Steve Sistare
` (3 subsequent siblings)
7 siblings, 0 replies; 27+ messages in thread
From: Steve Sistare @ 2024-07-20 18:56 UTC (permalink / raw)
To: iommu
Cc: Jason Gunthorpe, Kevin Tian, Alex Williamson, Cornelia Huck,
Steve Sistare
Add an ioctl that updates all DMA mappings to reflect the current process,
Change the mm and transfer locked memory accounting from old to current mm.
This will be used for live update, allowing an old process to hand the
iommufd device descriptor to a new process. The new process calls the
ioctl.
This is only a partial implementation. A subsequent patch also changes
the userland VA of each mapping. The API is documented in that patch.
This code is substantially the same as an initial version provided by
Jason, plus fixes.
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
drivers/iommu/iommufd/io_pagetable.h | 1 +
drivers/iommu/iommufd/ioas.c | 135 ++++++++++++++++++++++++++++++++
drivers/iommu/iommufd/iommufd_private.h | 1 +
drivers/iommu/iommufd/main.c | 2 +
include/uapi/linux/iommufd.h | 9 +++
5 files changed, 148 insertions(+)
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index c37b680..8c29fbb 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -173,6 +173,7 @@ enum {
IOPT_PAGES_ACCOUNT_NONE = 0,
IOPT_PAGES_ACCOUNT_USER = 1,
IOPT_PAGES_ACCOUNT_MM = 2,
+ IOPT_PAGES_ACCOUNT_MODE_NUM = 3,
};
/*
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index ab31879..62af26e 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -386,6 +386,141 @@ static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
return 0;
}
+static bool need_charge_update(struct iopt_pages *pages)
+{
+ if (pages->source_task == current->group_leader &&
+ pages->source_mm == current->mm &&
+ pages->source_user == current_user()) {
+ return false;
+ }
+
+ switch (pages->account_mode) {
+ case IOPT_PAGES_ACCOUNT_NONE:
+ return false;
+ case IOPT_PAGES_ACCOUNT_USER:
+ if (pages->source_user == current_user())
+ return false;
+ break;
+ case IOPT_PAGES_ACCOUNT_MM:
+ if (pages->source_mm == current->mm)
+ return false;
+ }
+ return true;
+}
+
+static int charge_current(unsigned long *npinned)
+{
+ struct iopt_pages tmp = {
+ .source_mm = current->mm,
+ .source_task = current->group_leader,
+ .source_user = current_user(),
+ };
+ unsigned int account_mode;
+ int rc;
+
+ for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
+ account_mode++) {
+ if (!npinned[account_mode])
+ continue;
+
+ tmp.account_mode = account_mode;
+ rc = iopt_update_pinned(&tmp, npinned[account_mode], true,
+ NULL);
+ if (rc)
+ goto err_undo;
+ }
+ return 0;
+
+err_undo:
+ while (account_mode != 0) {
+ account_mode--;
+ tmp.account_mode = account_mode;
+ iopt_update_pinned(&tmp, npinned[account_mode], false, NULL);
+ }
+ return rc;
+}
+
+static void change_mm(struct iopt_pages *pages)
+{
+ struct task_struct *old_task = pages->source_task;
+ struct user_struct *old_user = pages->source_user;
+ struct mm_struct *old_mm = pages->source_mm;
+
+ pages->source_mm = current->mm;
+ mmgrab(pages->source_mm);
+ mmdrop(old_mm);
+
+ pages->source_task = current->group_leader;
+ get_task_struct(pages->source_task);
+ put_task_struct(old_task);
+
+ pages->source_user = get_uid(current_user());
+ free_uid(old_user);
+}
+
+#define for_each_ioas_area(_xa, _index, _ioas, _area) \
+ xa_for_each((_xa), (_index), (_ioas)) \
+ for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
+ _area; \
+ _area = iopt_area_iter_next(_area, 0, ULONG_MAX))
+
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
+{
+ struct iommufd_ctx *ictx = ucmd->ictx;
+ unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {0, 0, 0};
+ struct iommufd_ioas *ioas;
+ struct iopt_area *area;
+ struct iopt_pages *pages;
+ struct xarray ioas_list;
+ unsigned long index;
+ int rc;
+
+ xa_init(&ioas_list);
+ rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
+ if (rc)
+ goto out_no_release;
+
+ /*
+ * Count last_pinned pages, then clear it to avoid double counting
+ * if the same iopt_pages is visited multiple times in this loop.
+ * Since we are under all the locks, npinned == last_npinned, so we
+ * can easily restore last_npinned before we return.
+ */
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ if (need_charge_update(pages)) {
+ all_npinned[pages->account_mode] += pages->last_npinned;
+ pages->last_npinned = 0;
+ }
+ }
+
+ rc = charge_current(all_npinned);
+
+ if (rc) {
+ /* Charge failed. Fix last_npinned and bail. */
+ for_each_ioas_area(&ioas_list, index, ioas, area)
+ area->pages->last_npinned = area->pages->npinned;
+ goto out;
+ }
+
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ /* Uncharge the old one (which also restores last_npinned) */
+ if (need_charge_update(pages))
+ iopt_update_pinned(pages, pages->npinned, false, NULL);
+
+ change_mm(pages);
+ }
+
+out:
+ iommufd_release_all_iova_rwsem(ictx, &ioas_list);
+
+out_no_release:
+ return rc;
+}
+
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx)
{
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 1ad19a9..4f747ba 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -274,6 +274,7 @@ static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx,
int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 4a15cf9..f8a2736 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -370,6 +370,8 @@ struct iommufd_ioctl_op {
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
struct iommu_ioas_allow_iovas, allowed_iovas),
+ IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process,
+ struct iommu_ioas_change_process, size),
IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
src_iova),
IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 1dfeaa2..6f7c243 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -50,6 +50,7 @@ enum {
IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
IOMMUFD_CMD_HWPT_INVALIDATE,
+ IOMMUFD_CMD_IOAS_CHANGE_PROCESS,
};
/**
@@ -692,4 +693,12 @@ struct iommu_hwpt_invalidate {
__u32 __reserved;
};
#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE)
+
+struct iommu_ioas_change_process {
+ __u32 size;
+};
+
+#define IOMMU_IOAS_CHANGE_PROCESS \
+ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
+
#endif
--
1.8.3.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC V1 4/4] iommufd: update VA
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
` (3 preceding siblings ...)
2024-07-20 18:56 ` [RFC V1 3/4] iommufd: Add IOMMU_IOAS_CHANGE_PROCESS Steve Sistare
@ 2024-07-20 18:56 ` Steve Sistare
2024-07-22 15:51 ` Jason Gunthorpe
2024-07-20 19:21 ` [RFC V1 0/4] iommufd live update Steven Sistare
` (2 subsequent siblings)
7 siblings, 1 reply; 27+ messages in thread
From: Steve Sistare @ 2024-07-20 18:56 UTC (permalink / raw)
To: iommu
Cc: Jason Gunthorpe, Kevin Tian, Alex Williamson, Cornelia Huck,
Steve Sistare
Extend IOMMU_IOAS_CHANGE_PROCESS so that the userland VA of each mapping
is also updated.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
drivers/iommu/iommufd/ioas.c | 89 ++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/iommufd.h | 51 ++++++++++++++++++++++++-
2 files changed, 138 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 62af26e..5d23bdd 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -2,6 +2,7 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
*/
+#include <linux/bsearch.h>
#include <linux/interval_tree.h>
#include <linux/iommufd.h>
#include <linux/iommu.h>
@@ -458,6 +459,76 @@ static void change_mm(struct iopt_pages *pages)
free_uid(old_user);
}
+#define MAX_UMAP_ENTRIES 1024
+
+static int iommufd_get_umap(struct iommu_ioas_change_process *cmd,
+ struct iommu_ioas_userspace_map **p_umap)
+{
+ int i;
+ int len = cmd->n_umap * sizeof(**p_umap);
+ struct iommu_ioas_userspace_map *umap = kzalloc(len, GFP_KERNEL);
+
+ if (!umap)
+ return -ENOMEM;
+
+ if (len > MAX_UMAP_ENTRIES * sizeof(*umap))
+ return -E2BIG;
+
+ if (copy_from_user(umap, u64_to_user_ptr(cmd->umap), len)) {
+ kfree(umap);
+ return -EFAULT;
+ }
+
+ for (i = 0; i < cmd->n_umap - 1; i++) {
+ if (umap[i].addr_old > umap[i+1].addr_old) {
+ kfree(umap);
+ return -EINVAL;
+ }
+ }
+ *p_umap = umap;
+ return 0;
+}
+
+static int cmp_addr(const void *a, const void *b)
+{
+ u64 key = (u64)a;
+ const struct iommu_ioas_userspace_map *elem = b;
+
+ if (key < elem->addr_old)
+ return -1;
+ else if (key < elem->addr_old + elem->size)
+ return 0;
+ else
+ return 1;
+}
+
+static int pages_validate_userspace(struct iopt_pages *pages,
+ struct iommu_ioas_userspace_map *umap,
+ int n_umap)
+{
+ struct iommu_ioas_userspace_map *newaddr;
+ size_t pages_size = pages->npages * PAGE_SIZE;
+
+ if (!n_umap)
+ return 0;
+ newaddr = bsearch(pages->uptr, umap, n_umap, sizeof(*umap), cmp_addr);
+ if (!newaddr || pages_size > newaddr->size)
+ return -EINVAL;
+ return 0;
+}
+
+static void pages_update_userspace(struct iopt_pages *pages,
+ struct iommu_ioas_userspace_map *umap,
+ int n_umap)
+{
+ struct iommu_ioas_userspace_map *newaddr;
+
+ if (!n_umap)
+ return;
+ newaddr = bsearch(pages->uptr, umap, n_umap, sizeof(*umap), cmp_addr);
+ pages->uptr += newaddr->addr_new - newaddr->addr_old;
+}
+
#define for_each_ioas_area(_xa, _index, _ioas, _area) \
xa_for_each((_xa), (_index), (_ioas)) \
for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
@@ -467,6 +538,8 @@ static void change_mm(struct iopt_pages *pages)
int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
{
struct iommufd_ctx *ictx = ucmd->ictx;
+ struct iommu_ioas_change_process *cmd = ucmd->cmd;
+ struct iommu_ioas_userspace_map *umap = NULL;
unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {0, 0, 0};
struct iommufd_ioas *ioas;
struct iopt_area *area;
@@ -475,6 +548,17 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
unsigned long index;
int rc;
+ if (cmd->flags || cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ if (cmd->n_umap > 0) {
+ rc = iommufd_get_umap(cmd, &umap);
+ if (rc)
+ return rc;
+ } else if (cmd->n_umap < 0) {
+ return -EINVAL;
+ }
+
xa_init(&ioas_list);
rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
if (rc)
@@ -488,6 +572,9 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
*/
for_each_ioas_area(&ioas_list, index, ioas, area) {
pages = area->pages;
+ rc = pages_validate_userspace(pages, umap, cmd->n_umap);
+ if (rc)
+ goto out;
if (need_charge_update(pages)) {
all_npinned[pages->account_mode] += pages->last_npinned;
@@ -506,6 +593,7 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
for_each_ioas_area(&ioas_list, index, ioas, area) {
pages = area->pages;
+ pages_update_userspace(pages, umap, cmd->n_umap);
/* Uncharge the old one (which also restores last_npinned) */
if (need_charge_update(pages))
@@ -518,6 +606,7 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
iommufd_release_all_iova_rwsem(ictx, &ioas_list);
out_no_release:
+ kfree(umap);
return rc;
}
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 6f7c243..b097ef1 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -694,11 +694,58 @@ struct iommu_hwpt_invalidate {
};
#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE)
+/**
+ * struct iommu_ioas_change_process - ioctl(VFIO_IOAS_CHANGE_PROCESS)
+ * @size: sizeof(struct iommu_ioas_change_process)
+ * @flags: Must be 0
+ * @n_umap: Number of entries in @umap
+ * @umap: User pointer to an array of mappings to change.
+ * @addr_old and @size must match the virtual address and size of
+ * an existing mapping exactly.
+ * @__reserved: Must be 0
+ *
+ * This changes the process backing the memory maps for every memory map
+ * created in every IOAS in the context to the current process. Pinned
+ * memory counts are transferred to the current process, and the virtual
+ * address of each mapping in @umap is updated to @addr_new.
+ *
+ * If @n_umap == 0, then no mappings are updated. This is only suitable if
+ * the current process has the same virtual addresses as the previous process,
+ * such as after fork. If @n_map > 0, then @umap must contain entries for all
+ * existing mappings, or the operation fails. The @umap entries must be sorted
+ * by @addr_old.
+ *
+ * The range size@new_addr in the new process must refer to the same memory
+ * object as size@old_addr in the old process, but the implementation does
+ * not enforce this restriction. Violation of this rule is not detected and
+ * may result in the process corrupting its own memory (but no one else's).
+ *
+ * If the ioctl returns a failure status, then nothing is changed.
+ *
+ * This API is useful for transferring operation of a device from one process
+ * to another, such as during userland live update. Beware that transferring
+ * a context containing mappings accessed by kernel threads (such as a vfio
+ * mdev) requires special care, if the original process directly exec's its
+ * successor. The original process must fork an identical caretaker
+ * process which calls VFIO_IOAS_CHANGE_PROCESS to take temporary ownership
+ * of the mappings without changing VA's. The original process then exec's
+ * its successor, which calls VFIO_IOAS_CHANGE_PROCESS with new VA's to take
+ * ownership from the caretaker.
+ */
+
+struct iommu_ioas_userspace_map {
+ __u64 addr_old; /* old userspace virtual address */
+ __u64 addr_new; /* new userspace virtual address */
+ __u64 size; /* size of the mapping in bytes */
+};
+
struct iommu_ioas_change_process {
__u32 size;
+ __u32 flags;
+ __u32 n_umap;
+ __u32 __reserved;
+ __aligned_u64 umap;
};
-
#define IOMMU_IOAS_CHANGE_PROCESS \
_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
-
#endif
--
1.8.3.1
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [RFC V1 4/4] iommufd: update VA
2024-07-20 18:56 ` [RFC V1 4/4] iommufd: update VA Steve Sistare
@ 2024-07-22 15:51 ` Jason Gunthorpe
2024-08-05 19:02 ` Steven Sistare
0 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-07-22 15:51 UTC (permalink / raw)
To: Steve Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Sat, Jul 20, 2024 at 11:56:45AM -0700, Steve Sistare wrote:
> @@ -458,6 +459,76 @@ static void change_mm(struct iopt_pages *pages)
> free_uid(old_user);
> }
>
> +#define MAX_UMAP_ENTRIES 1024
Only 170 of these fit in a 4k page :\
Are you worried about the high order allocation this could trigger?
I guess the alternative is to load it into a maple tree or similar,
but that is alot of memory allocations.
> +static int iommufd_get_umap(struct iommu_ioas_change_process *cmd,
> + struct iommu_ioas_userspace_map **p_umap)
> +{
> + int i;
> + int len = cmd->n_umap * sizeof(**p_umap);
> + struct iommu_ioas_userspace_map *umap = kzalloc(len, GFP_KERNEL);
> +
> + if (!umap)
> + return -ENOMEM;
> +
> + if (len > MAX_UMAP_ENTRIES * sizeof(*umap))
> + return -E2BIG;
> +
> + if (copy_from_user(umap, u64_to_user_ptr(cmd->umap), len)) {
> + kfree(umap);
> + return -EFAULT;
> + }
This is memdup_user ?
> +struct iommu_ioas_userspace_map {
> + __u64 addr_old; /* old userspace virtual address */
> + __u64 addr_new; /* new userspace virtual address */
> + __u64 size; /* size of the mapping in bytes */
> +};
Use __aligned_u64 here as well
> struct iommu_ioas_change_process {
> __u32 size;
> + __u32 flags;
> + __u32 n_umap;
> + __u32 __reserved;
> + __aligned_u64 umap;
> };
> -
> #define IOMMU_IOAS_CHANGE_PROCESS \
> _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
> -
These - lines should go in the prior patch
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [RFC V1 4/4] iommufd: update VA
2024-07-22 15:51 ` Jason Gunthorpe
@ 2024-08-05 19:02 ` Steven Sistare
2024-08-06 12:54 ` Jason Gunthorpe
0 siblings, 1 reply; 27+ messages in thread
From: Steven Sistare @ 2024-08-05 19:02 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 7/22/2024 11:51 AM, Jason Gunthorpe wrote:
> On Sat, Jul 20, 2024 at 11:56:45AM -0700, Steve Sistare wrote:
>> @@ -458,6 +459,76 @@ static void change_mm(struct iopt_pages *pages)
>> free_uid(old_user);
>> }
>>
>> +#define MAX_UMAP_ENTRIES 1024
>
> Only 170 of these fit in a 4k page :\
>
> Are you worried about the high order allocation this could trigger?
Should I worry?
> I guess the alternative is to load it into a maple tree or simila
> but that is alot of memory allocations.
I define a max to guard against a DOS from userland requesting a huge number
of entries and causing the driver to allocate a huge amount of memory.
The use cases I have seen only have dozens of entries. Do you have a figure
in mind for the upper bound?
If not, I could just copy_from_user one entry at a time in the loops, and in the
bsearch comparison function, requiring O(N * lg(n)) copy operations.
>> +static int iommufd_get_umap(struct iommu_ioas_change_process *cmd,
>> + struct iommu_ioas_userspace_map **p_umap)
>> +{
>> + int i;
>> + int len = cmd->n_umap * sizeof(**p_umap);
>> + struct iommu_ioas_userspace_map *umap = kzalloc(len, GFP_KERNEL);
>> +
>> + if (!umap)
>> + return -ENOMEM;
>> +
>> + if (len > MAX_UMAP_ENTRIES * sizeof(*umap))
>> + return -E2BIG;
>> +
>> + if (copy_from_user(umap, u64_to_user_ptr(cmd->umap), len)) {
>> + kfree(umap);
>> + return -EFAULT;
>> + }
>
> This is memdup_user ?
Sure, will do if I continue to copy the entire array.
>> +struct iommu_ioas_userspace_map {
>> + __u64 addr_old; /* old userspace virtual address */
>> + __u64 addr_new; /* new userspace virtual address */
>> + __u64 size; /* size of the mapping in bytes */
>> +};
>
> Use __aligned_u64 here as well
OK.
>> struct iommu_ioas_change_process {
>> __u32 size;
>> + __u32 flags;
>> + __u32 n_umap;
>> + __u32 __reserved;
>> + __aligned_u64 umap;
>> };
>> -
>> #define IOMMU_IOAS_CHANGE_PROCESS \
>> _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
>> -
>
> These - lines should go in the prior patch
Yes, thanks.
- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [RFC V1 4/4] iommufd: update VA
2024-08-05 19:02 ` Steven Sistare
@ 2024-08-06 12:54 ` Jason Gunthorpe
0 siblings, 0 replies; 27+ messages in thread
From: Jason Gunthorpe @ 2024-08-06 12:54 UTC (permalink / raw)
To: Steven Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Mon, Aug 05, 2024 at 03:02:22PM -0400, Steven Sistare wrote:
> On 7/22/2024 11:51 AM, Jason Gunthorpe wrote:
> > On Sat, Jul 20, 2024 at 11:56:45AM -0700, Steve Sistare wrote:
> > > @@ -458,6 +459,76 @@ static void change_mm(struct iopt_pages *pages)
> > > free_uid(old_user);
> > > }
> > > +#define MAX_UMAP_ENTRIES 1024
> >
> > Only 170 of these fit in a 4k page :\
> >
> > Are you worried about the high order allocation this could trigger?
>
> Should I worry?
Probably? Experiance shows busy servers cannot do a high order
allocation, even order 4 is likely to fail due to fragmentation. If
you imagine you will need more than 1-4 pages of this then it would
probably be a good idea to avoid the higher order allocation.,
> I define a max to guard against a DOS from userland requesting a huge number
> of entries and causing the driver to allocate a huge amount of memory.
>
> The use cases I have seen only have dozens of entries. Do you have a figure
> in mind for the upper bound?
Then it is probably fine as is
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
` (4 preceding siblings ...)
2024-07-20 18:56 ` [RFC V1 4/4] iommufd: update VA Steve Sistare
@ 2024-07-20 19:21 ` Steven Sistare
2024-07-22 15:55 ` Jason Gunthorpe
2024-07-23 12:48 ` Jason Gunthorpe
7 siblings, 0 replies; 27+ messages in thread
From: Steven Sistare @ 2024-07-20 19:21 UTC (permalink / raw)
To: iommu; +Cc: Jason Gunthorpe, Kevin Tian, Alex Williamson, Cornelia Huck
On 7/20/2024 2:56 PM, Steve Sistare wrote:
> Live update is a technique wherein an application saves its state, launches
> an updated version of itself, and restores its state. Clients of the
> application experience a brief suspension of service, on the order of
> 100's of milliseconds, but are otherwise unaffected.
>
> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
> of an iommufd device to be transferred from one process to another. The
> application is responsible for transferring the device descriptor to the new
> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
>
> All memory objects that were mapped for iommufd DMA in the old process
> must also be mapped in the new process, but they may have different virtual
> addresses in the new. The application passes an array of new addresses to
> IOMMU_IOAS_CHANGE_PROCESS, and it atomically updates everything to the current
> process, grabbing the new mm, transferring pinned page counts, and recording
> new virtual addresses.
>
> If the application directly exec's the new version of itself, it must take
> special care if the mappings may be accessed by kernel threads, such as by
> vfio mdevs. The original process must fork an identical caretaker
> process which calls VFIO_IOAS_CHANGE_PROCESS to take temporary ownership
> of the mappings without changing VA's. The original process then exec's
> its successor, which calls VFIO_IOAS_CHANGE_PROCESS with new VA's to take
> ownership from the caretaker.
>
> Thanks to Jason Gunthorpe for code and ideas in this series.
>
> This is implemented in QEMU by the patch series "Live update: iommufd"
> https://lore.kernel.org/qemu-devel (to be posted shortly)
https://lore.kernel.org/qemu-devel/1721502937-87102-1-git-send-email-steven.sistare@oracle.com
- steve
> Steve Sistare (4):
> iommufd: Export do_update_pinned
> iommufd: Lock all objects
> iommufd: Add IOMMU_IOAS_CHANGE_PROCESS
> iommufd: update VA
>
> drivers/iommu/iommufd/io_pagetable.h | 6 +
> drivers/iommu/iommufd/ioas.c | 291 ++++++++++++++++++++++++++++++++
> drivers/iommu/iommufd/iommufd_private.h | 2 +
> drivers/iommu/iommufd/main.c | 3 +
> drivers/iommu/iommufd/pages.c | 10 +-
> include/uapi/linux/iommufd.h | 56 ++++++
> 6 files changed, 363 insertions(+), 5 deletions(-)
>
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [RFC V1 0/4] iommufd live update
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
` (5 preceding siblings ...)
2024-07-20 19:21 ` [RFC V1 0/4] iommufd live update Steven Sistare
@ 2024-07-22 15:55 ` Jason Gunthorpe
2024-08-05 19:03 ` Steven Sistare
2024-07-23 12:48 ` Jason Gunthorpe
7 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-07-22 15:55 UTC (permalink / raw)
To: Steve Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
> Live update is a technique wherein an application saves its state, launches
> an updated version of itself, and restores its state. Clients of the
> application experience a brief suspension of service, on the order of
> 100's of milliseconds, but are otherwise unaffected.
>
> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
> of an iommufd device to be transferred from one process to another. The
> application is responsible for transferring the device descriptor to the new
> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
It seems Ok to me, I'm glad it worked out for you
But have you considered using something like the new
memfd_pin_folios() system so that iommufd is bound to the FDs backing
the memory instead of VMAs?
https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
I've been expecting to add support for that, but does it help this scenario?
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [RFC V1 0/4] iommufd live update
2024-07-22 15:55 ` Jason Gunthorpe
@ 2024-08-05 19:03 ` Steven Sistare
2024-08-06 12:56 ` Jason Gunthorpe
0 siblings, 1 reply; 27+ messages in thread
From: Steven Sistare @ 2024-08-05 19:03 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 7/22/2024 11:55 AM, Jason Gunthorpe wrote:
> On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
>> Live update is a technique wherein an application saves its state, launches
>> an updated version of itself, and restores its state. Clients of the
>> application experience a brief suspension of service, on the order of
>> 100's of milliseconds, but are otherwise unaffected.
>>
>> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
>> of an iommufd device to be transferred from one process to another. The
>> application is responsible for transferring the device descriptor to the new
>> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
>
> It seems Ok to me, I'm glad it worked out for you
>
> But have you considered using something like the new
> memfd_pin_folios() system so that iommufd is bound to the FDs backing
> the memory instead of VMAs?
>
> https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
>
> I've been expecting to add support for that, but does it help this scenario?
Thanks for the pointer, I had not seen it.
AFAICT it does not affect live update. The memfd is passed to new qemu, and
the manner in which its pages were pinned does not matter, as long as the effect
on the mm fields that we manipulate is the same.
- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-05 19:03 ` Steven Sistare
@ 2024-08-06 12:56 ` Jason Gunthorpe
2024-08-08 19:15 ` Steven Sistare
0 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-08-06 12:56 UTC (permalink / raw)
To: Steven Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Mon, Aug 05, 2024 at 03:03:30PM -0400, Steven Sistare wrote:
> On 7/22/2024 11:55 AM, Jason Gunthorpe wrote:
> > On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
> > > Live update is a technique wherein an application saves its state, launches
> > > an updated version of itself, and restores its state. Clients of the
> > > application experience a brief suspension of service, on the order of
> > > 100's of milliseconds, but are otherwise unaffected.
> > >
> > > Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
> > > of an iommufd device to be transferred from one process to another. The
> > > application is responsible for transferring the device descriptor to the new
> > > process, eg either by preservation across fork and exec or via SCM_RIGHTS.
> >
> > It seems Ok to me, I'm glad it worked out for you
> >
> > But have you considered using something like the new
> > memfd_pin_folios() system so that iommufd is bound to the FDs backing
> > the memory instead of VMAs?
> >
> > https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
> >
> > I've been expecting to add support for that, but does it help this scenario?
>
> Thanks for the pointer, I had not seen it.
> AFAICT it does not affect live update. The memfd is passed to new qemu, and
> the manner in which its pages were pinned does not matter, as long as the effect
> on the mm fields that we manipulate is the same.
I mean instead of using mmap's() and telling iommfd to take the pages
from a VMA you'd use a memfd and tell iommufd to take the pages from
the memfd directly.
Since the memfd is not part of a process or mm_struct it is not
effected by live update's exec() and none of these gyrations are
necessary.
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-06 12:56 ` Jason Gunthorpe
@ 2024-08-08 19:15 ` Steven Sistare
2024-08-08 19:52 ` Jason Gunthorpe
0 siblings, 1 reply; 27+ messages in thread
From: Steven Sistare @ 2024-08-08 19:15 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 8/6/2024 8:56 AM, Jason Gunthorpe wrote:
> On Mon, Aug 05, 2024 at 03:03:30PM -0400, Steven Sistare wrote:
>> On 7/22/2024 11:55 AM, Jason Gunthorpe wrote:
>>> On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
>>>> Live update is a technique wherein an application saves its state, launches
>>>> an updated version of itself, and restores its state. Clients of the
>>>> application experience a brief suspension of service, on the order of
>>>> 100's of milliseconds, but are otherwise unaffected.
>>>>
>>>> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
>>>> of an iommufd device to be transferred from one process to another. The
>>>> application is responsible for transferring the device descriptor to the new
>>>> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
>>>
>>> It seems Ok to me, I'm glad it worked out for you
>>>
>>> But have you considered using something like the new
>>> memfd_pin_folios() system so that iommufd is bound to the FDs backing
>>> the memory instead of VMAs?
>>>
>>> https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
>>>
>>> I've been expecting to add support for that, but does it help this scenario?
>>
>> Thanks for the pointer, I had not seen it.
>> AFAICT it does not affect live update. The memfd is passed to new qemu, and
>> the manner in which its pages were pinned does not matter, as long as the effect
>> on the mm fields that we manipulate is the same.
>
> I mean instead of using mmap's() and telling iommfd to take the pages
> from a VMA you'd use a memfd and tell iommufd to take the pages from
> the memfd directly.
>
> Since the memfd is not part of a process or mm_struct it is not
> effected by live update's exec() and none of these gyrations are
> necessary.
The problem is that kernel clients (eg mdevs) use userland VA to identify
memory when calling iommufd, so we must update the VA's after exec.
vdpa does the same, if/when it converts to iommufd. I cannot see us
changing vaddr to (file, offset) everywhere in iommufd and its clients,
up through the mdev code stack, can you?
Perhaps I am still missing your point.
- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-08 19:15 ` Steven Sistare
@ 2024-08-08 19:52 ` Jason Gunthorpe
2024-08-12 17:41 ` Steven Sistare
0 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-08-08 19:52 UTC (permalink / raw)
To: Steven Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Thu, Aug 08, 2024 at 03:15:02PM -0400, Steven Sistare wrote:
>
>
> On 8/6/2024 8:56 AM, Jason Gunthorpe wrote:
> > On Mon, Aug 05, 2024 at 03:03:30PM -0400, Steven Sistare wrote:
> > > On 7/22/2024 11:55 AM, Jason Gunthorpe wrote:
> > > > On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
> > > > > Live update is a technique wherein an application saves its state, launches
> > > > > an updated version of itself, and restores its state. Clients of the
> > > > > application experience a brief suspension of service, on the order of
> > > > > 100's of milliseconds, but are otherwise unaffected.
> > > > >
> > > > > Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
> > > > > of an iommufd device to be transferred from one process to another. The
> > > > > application is responsible for transferring the device descriptor to the new
> > > > > process, eg either by preservation across fork and exec or via SCM_RIGHTS.
> > > >
> > > > It seems Ok to me, I'm glad it worked out for you
> > > >
> > > > But have you considered using something like the new
> > > > memfd_pin_folios() system so that iommufd is bound to the FDs backing
> > > > the memory instead of VMAs?
> > > >
> > > > https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
> > > >
> > > > I've been expecting to add support for that, but does it help this scenario?
> > >
> > > Thanks for the pointer, I had not seen it.
> > > AFAICT it does not affect live update. The memfd is passed to new qemu, and
> > > the manner in which its pages were pinned does not matter, as long as the effect
> > > on the mm fields that we manipulate is the same.
> >
> > I mean instead of using mmap's() and telling iommfd to take the pages
> > from a VMA you'd use a memfd and tell iommufd to take the pages from
> > the memfd directly.
> >
> > Since the memfd is not part of a process or mm_struct it is not
> > effected by live update's exec() and none of these gyrations are
> > necessary.
>
> The problem is that kernel clients (eg mdevs) use userland VA to identify
> memory when calling iommufd, so we must update the VA's after exec.
Technically no, they use IOVA too and iommufd translates IOVA into a
VMA and what not.
So if we teach iommufd how to do memfd it would also learn how to
adapt it to mdevs as well.
> vdpa does the same, if/when it converts to iommufd. I cannot see us
> changing vaddr to (file, offset) everywhere in iommufd and its clients,
> up through the mdev code stack, can you?
That is exactly what I imagine, because it isn't vaddr already, it is
IOVA and IOVA always already translates to an area which gets you the
vaddr.
It is why this series can remap the vaddrs on the fly without reaching
outside the area struct.
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-08 19:52 ` Jason Gunthorpe
@ 2024-08-12 17:41 ` Steven Sistare
2024-08-19 14:59 ` Jason Gunthorpe
0 siblings, 1 reply; 27+ messages in thread
From: Steven Sistare @ 2024-08-12 17:41 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 8/8/2024 3:52 PM, Jason Gunthorpe wrote:
> On Thu, Aug 08, 2024 at 03:15:02PM -0400, Steven Sistare wrote:
>> On 8/6/2024 8:56 AM, Jason Gunthorpe wrote:
>>> On Mon, Aug 05, 2024 at 03:03:30PM -0400, Steven Sistare wrote:
>>>> On 7/22/2024 11:55 AM, Jason Gunthorpe wrote:
>>>>> On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
>>>>>> Live update is a technique wherein an application saves its state, launches
>>>>>> an updated version of itself, and restores its state. Clients of the
>>>>>> application experience a brief suspension of service, on the order of
>>>>>> 100's of milliseconds, but are otherwise unaffected.
>>>>>>
>>>>>> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
>>>>>> of an iommufd device to be transferred from one process to another. The
>>>>>> application is responsible for transferring the device descriptor to the new
>>>>>> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
>>>>>
>>>>> It seems Ok to me, I'm glad it worked out for you
>>>>>
>>>>> But have you considered using something like the new
>>>>> memfd_pin_folios() system so that iommufd is bound to the FDs backing
>>>>> the memory instead of VMAs?
>>>>>
>>>>> https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
>>>>>
>>>>> I've been expecting to add support for that, but does it help this scenario?
>>>>
>>>> Thanks for the pointer, I had not seen it.
>>>> AFAICT it does not affect live update. The memfd is passed to new qemu, and
>>>> the manner in which its pages were pinned does not matter, as long as the effect
>>>> on the mm fields that we manipulate is the same.
>>>
>>> I mean instead of using mmap's() and telling iommfd to take the pages
>>> from a VMA you'd use a memfd and tell iommufd to take the pages from
>>> the memfd directly.
>>>
>>> Since the memfd is not part of a process or mm_struct it is not
>>> effected by live update's exec() and none of these gyrations are
>>> necessary.
>>
>> The problem is that kernel clients (eg mdevs) use userland VA to identify
>> memory when calling iommufd, so we must update the VA's after exec.
>
> Technically no, they use IOVA too and iommufd translates IOVA into a
> VMA and what not.
>
> So if we teach iommufd how to do memfd it would also learn how to
> adapt it to mdevs as well.
>
>> vdpa does the same, if/when it converts to iommufd. I cannot see us
>> changing vaddr to (file, offset) everywhere in iommufd and its clients,
>> up through the mdev code stack, can you?
>
> That is exactly what I imagine, because it isn't vaddr already, it is
> IOVA and IOVA always already translates to an area which gets you the
> vaddr.
>
> It is why this series can remap the vaddrs on the fly without reaching
> outside the area struct.
OK, that looks tractable. There are not too many instances of struct iopt_pages uptr
to fiddle with, adding support for file+offset. We must of course keep uptr to continue
to support anonymous memory for iommufd, but such memory will not be supported for
live update.
Do you envision a new userland interface variant of IOMMU_IOAS_MAP that takes fd and offset?
Or have userland pass user_va as usual, but have the kernel check if it maps to a file,
and save the file? The latter is more work in the kernel but requires no change in
applications.
Do you plan to work on this any time soon? Do you want me to?
We still need IOMMU_IOAS_CHANGE_PROCESS to handle IOMMU_OPTION_RLIMIT_MODE, and to handle
a changed uid.
- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-12 17:41 ` Steven Sistare
@ 2024-08-19 14:59 ` Jason Gunthorpe
2024-08-21 17:54 ` Steven Sistare
0 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-08-19 14:59 UTC (permalink / raw)
To: Steven Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Mon, Aug 12, 2024 at 01:41:46PM -0400, Steven Sistare wrote:
> On 8/8/2024 3:52 PM, Jason Gunthorpe wrote:
> > On Thu, Aug 08, 2024 at 03:15:02PM -0400, Steven Sistare wrote:
> > > On 8/6/2024 8:56 AM, Jason Gunthorpe wrote:
> > > > On Mon, Aug 05, 2024 at 03:03:30PM -0400, Steven Sistare wrote:
> > > > > On 7/22/2024 11:55 AM, Jason Gunthorpe wrote:
> > > > > > On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
> > > > > > > Live update is a technique wherein an application saves its state, launches
> > > > > > > an updated version of itself, and restores its state. Clients of the
> > > > > > > application experience a brief suspension of service, on the order of
> > > > > > > 100's of milliseconds, but are otherwise unaffected.
> > > > > > >
> > > > > > > Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
> > > > > > > of an iommufd device to be transferred from one process to another. The
> > > > > > > application is responsible for transferring the device descriptor to the new
> > > > > > > process, eg either by preservation across fork and exec or via SCM_RIGHTS.
> > > > > >
> > > > > > It seems Ok to me, I'm glad it worked out for you
> > > > > >
> > > > > > But have you considered using something like the new
> > > > > > memfd_pin_folios() system so that iommufd is bound to the FDs backing
> > > > > > the memory instead of VMAs?
> > > > > >
> > > > > > https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
> > > > > >
> > > > > > I've been expecting to add support for that, but does it help this scenario?
> > > > >
> > > > > Thanks for the pointer, I had not seen it.
> > > > > AFAICT it does not affect live update. The memfd is passed to new qemu, and
> > > > > the manner in which its pages were pinned does not matter, as long as the effect
> > > > > on the mm fields that we manipulate is the same.
> > > >
> > > > I mean instead of using mmap's() and telling iommfd to take the pages
> > > > from a VMA you'd use a memfd and tell iommufd to take the pages from
> > > > the memfd directly.
> > > >
> > > > Since the memfd is not part of a process or mm_struct it is not
> > > > effected by live update's exec() and none of these gyrations are
> > > > necessary.
> > >
> > > The problem is that kernel clients (eg mdevs) use userland VA to identify
> > > memory when calling iommufd, so we must update the VA's after exec.
> >
> > Technically no, they use IOVA too and iommufd translates IOVA into a
> > VMA and what not.
> >
> > So if we teach iommufd how to do memfd it would also learn how to
> > adapt it to mdevs as well.
> >
> > > vdpa does the same, if/when it converts to iommufd. I cannot see us
> > > changing vaddr to (file, offset) everywhere in iommufd and its clients,
> > > up through the mdev code stack, can you?
> >
> > That is exactly what I imagine, because it isn't vaddr already, it is
> > IOVA and IOVA always already translates to an area which gets you the
> > vaddr.
> >
> > It is why this series can remap the vaddrs on the fly without reaching
> > outside the area struct.
>
> OK, that looks tractable. There are not too many instances of
> struct iopt_pages uptr to fiddle with, adding support for
> file+offset. We must of course keep uptr to continue to support
> anonymous memory for iommufd, but such memory will not be supported
> for live update.
>
> Do you envision a new userland interface variant of IOMMU_IOAS_MAP
> that takes fd and offset?
Yes
> Or have userland pass user_va as usual, but have the kernel check if it maps to a file,
> and save the file? The latter is more work in the kernel but requires no change in
> applications.
Maybe this is possible too..
> Do you plan to work on this any time soon? Do you want me to?
I wasn't at the point of this yet, if you are interested I suggest
taking a stab. Now that the the infrastructure is in the mm it should
mostly just be changing pin_user_pages() to the other one. It might be
quite shore
> We still need IOMMU_IOAS_CHANGE_PROCESS to handle
> IOMMU_OPTION_RLIMIT_MODE, and to handle a changed uid.
Yes that makes sense
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-19 14:59 ` Jason Gunthorpe
@ 2024-08-21 17:54 ` Steven Sistare
2024-08-21 18:04 ` Jason Gunthorpe
0 siblings, 1 reply; 27+ messages in thread
From: Steven Sistare @ 2024-08-21 17:54 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 8/19/2024 10:59 AM, Jason Gunthorpe wrote:
> On Mon, Aug 12, 2024 at 01:41:46PM -0400, Steven Sistare wrote:
>> On 8/8/2024 3:52 PM, Jason Gunthorpe wrote:
>>> On Thu, Aug 08, 2024 at 03:15:02PM -0400, Steven Sistare wrote:
>>>> On 8/6/2024 8:56 AM, Jason Gunthorpe wrote:
>>>>> On Mon, Aug 05, 2024 at 03:03:30PM -0400, Steven Sistare wrote:
>>>>>> On 7/22/2024 11:55 AM, Jason Gunthorpe wrote:
>>>>>>> On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
>>>>>>>> Live update is a technique wherein an application saves its state, launches
>>>>>>>> an updated version of itself, and restores its state. Clients of the
>>>>>>>> application experience a brief suspension of service, on the order of
>>>>>>>> 100's of milliseconds, but are otherwise unaffected.
>>>>>>>>
>>>>>>>> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
>>>>>>>> of an iommufd device to be transferred from one process to another. The
>>>>>>>> application is responsible for transferring the device descriptor to the new
>>>>>>>> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
>>>>>>>
>>>>>>> It seems Ok to me, I'm glad it worked out for you
>>>>>>>
>>>>>>> But have you considered using something like the new
>>>>>>> memfd_pin_folios() system so that iommufd is bound to the FDs backing
>>>>>>> the memory instead of VMAs?
>>>>>>>
>>>>>>> https://lore.kernel.org/all/20240624063952.1572359-1-vivek.kasireddy@intel.com/
>>>>>>>
>>>>>>> I've been expecting to add support for that, but does it help this scenario?
>>>>>>
>>>>>> Thanks for the pointer, I had not seen it.
>>>>>> AFAICT it does not affect live update. The memfd is passed to new qemu, and
>>>>>> the manner in which its pages were pinned does not matter, as long as the effect
>>>>>> on the mm fields that we manipulate is the same.
>>>>>
>>>>> I mean instead of using mmap's() and telling iommfd to take the pages
>>>>> from a VMA you'd use a memfd and tell iommufd to take the pages from
>>>>> the memfd directly.
>>>>>
>>>>> Since the memfd is not part of a process or mm_struct it is not
>>>>> effected by live update's exec() and none of these gyrations are
>>>>> necessary.
>>>>
>>>> The problem is that kernel clients (eg mdevs) use userland VA to identify
>>>> memory when calling iommufd, so we must update the VA's after exec.
>>>
>>> Technically no, they use IOVA too and iommufd translates IOVA into a
>>> VMA and what not.
>>>
>>> So if we teach iommufd how to do memfd it would also learn how to
>>> adapt it to mdevs as well.
>>>
>>>> vdpa does the same, if/when it converts to iommufd. I cannot see us
>>>> changing vaddr to (file, offset) everywhere in iommufd and its clients,
>>>> up through the mdev code stack, can you?
>>>
>>> That is exactly what I imagine, because it isn't vaddr already, it is
>>> IOVA and IOVA always already translates to an area which gets you the
>>> vaddr.
>>>
>>> It is why this series can remap the vaddrs on the fly without reaching
>>> outside the area struct.
>>
>> OK, that looks tractable. There are not too many instances of
>> struct iopt_pages uptr to fiddle with, adding support for
>> file+offset. We must of course keep uptr to continue to support
>> anonymous memory for iommufd, but such memory will not be supported
>> for live update.
>>
>> Do you envision a new userland interface variant of IOMMU_IOAS_MAP
>> that takes fd and offset?
>
> Yes
>
>> Or have userland pass user_va as usual, but have the kernel check if it maps to a file,
>> and save the file? The latter is more work in the kernel but requires no change in
>> applications.
>
> Maybe this is possible too..
>
>> Do you plan to work on this any time soon? Do you want me to?
>
> I wasn't at the point of this yet, if you are interested I suggest
> taking a stab. Now that the the infrastructure is in the mm it should
> mostly just be changing pin_user_pages() to the other one. It might be
> quite shore
I'm on the fence about taking a stab. It will not be short!
memfd_pin_folios() returns an array of folios which must be saved and
later passed back to unpin_folios(). That's a slight bummer, since
currently you save memory by retrieving PFNs from the iommu as needed,
rather than saving all of them explicitly (with the exception of pinned_pfns).
Also, the current map() call stack uses pfn_reader, which pins in small batches.
This offers no advantage for memfd since we must save all folios regardless.
So, we could short-circuit the call stack above the level where pfn_reader
is used. Or, add folios to pfn_reader, and continue to pin in batches, and manage
the separate arrays of folios that result. Or, add folios to pfn_reader, and pin
using a single giant batch.
Also, I must truncate the unmap() call stacks above where PFNs are used.
This is mostly FYI, but if you prefer one approach, let me know.
- Steve'
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-21 17:54 ` Steven Sistare
@ 2024-08-21 18:04 ` Jason Gunthorpe
2024-08-22 21:05 ` Steven Sistare
0 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-08-21 18:04 UTC (permalink / raw)
To: Steven Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Wed, Aug 21, 2024 at 01:54:04PM -0400, Steven Sistare wrote:
> memfd_pin_folios() returns an array of folios which must be saved and
> later passed back to unpin_folios(). That's a slight bummer, since
> currently you save memory by retrieving PFNs from the iommu as needed,
> rather than saving all of them explicitly (with the exception of pinned_pfns).
You wouldn't save the folios directly. The existing PFN mechanism is
fine and already works with folios memory.
The mismatch betwen gup returning pages and memfd_pin_folios returning
folios is fixed up by adjusting the folio refcount back to page
granularity, then treating it the same as a bunch of pages and using
unpin_user_pages() as normal.
gup.c already has code that can do this adjustment it would probably
need to be exported. Something like:
folio_refs_to_pages(folio, len);
Then don't change very much in iommufd, just stuff the folio's PFNs
into the existing reader mechanism the same as the pin_user_pages()
flow after fixing the refcount.
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-21 18:04 ` Jason Gunthorpe
@ 2024-08-22 21:05 ` Steven Sistare
2024-08-22 21:10 ` Jason Gunthorpe
0 siblings, 1 reply; 27+ messages in thread
From: Steven Sistare @ 2024-08-22 21:05 UTC (permalink / raw)
To: Jason Gunthorpe, Alex Williamson; +Cc: iommu, Kevin Tian, Cornelia Huck
On 8/21/2024 2:04 PM, Jason Gunthorpe wrote:
> On Wed, Aug 21, 2024 at 01:54:04PM -0400, Steven Sistare wrote:
>
>> memfd_pin_folios() returns an array of folios which must be saved and
>> later passed back to unpin_folios(). That's a slight bummer, since
>> currently you save memory by retrieving PFNs from the iommu as needed,
>> rather than saving all of them explicitly (with the exception of pinned_pfns).
>
> You wouldn't save the folios directly. The existing PFN mechanism is
> fine and already works with folios memory.
>
> The mismatch betwen gup returning pages and memfd_pin_folios returning
> folios is fixed up by adjusting the folio refcount back to page
> granularity, then treating it the same as a bunch of pages and using
> unpin_user_pages() as normal.
>
> gup.c already has code that can do this adjustment it would probably
> need to be exported. Something like:
>
> folio_refs_to_pages(folio, len);
>
> Then don't change very much in iommufd, just stuff the folio's PFNs
> into the existing reader mechanism the same as the pin_user_pages()
> flow after fixing the refcount.
Thanks, I will keep working on it.
I found a screw case -- PCI BARs.
When iommufd supports mapping them (I assume someone is working on that?),
mdev access to those mappings must work after live update. We must be
able to translate iova to PFN without using va, by recognizing the mapping
is special and calling something special to generate the PFN, such as
the extracted guts of vfio_pci_mmap_fault.
- Steve
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-08-22 21:05 ` Steven Sistare
@ 2024-08-22 21:10 ` Jason Gunthorpe
0 siblings, 0 replies; 27+ messages in thread
From: Jason Gunthorpe @ 2024-08-22 21:10 UTC (permalink / raw)
To: Steven Sistare; +Cc: Alex Williamson, iommu, Kevin Tian, Cornelia Huck
On Thu, Aug 22, 2024 at 05:05:19PM -0400, Steven Sistare wrote:
> When iommufd supports mapping them (I assume someone is working on that?),
> mdev access to those mappings must work after live update. We must be
> able to translate iova to PFN without using va, by recognizing the mapping
> is special and calling something special to generate the PFN, such as
> the extracted guts of vfio_pci_mmap_fault.
I intend to support MMIO via dmabuf, the current VMA approach cannot
ever be correctly locked.
So it will work the same as this in that vision, there are patches in
progress for the parts, it is unexpectedly big :(
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC V1 0/4] iommufd live update
2024-07-20 18:56 [RFC V1 0/4] iommufd live update Steve Sistare
` (6 preceding siblings ...)
2024-07-22 15:55 ` Jason Gunthorpe
@ 2024-07-23 12:48 ` Jason Gunthorpe
2024-08-05 19:02 ` Steven Sistare
7 siblings, 1 reply; 27+ messages in thread
From: Jason Gunthorpe @ 2024-07-23 12:48 UTC (permalink / raw)
To: Steve Sistare; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
> Live update is a technique wherein an application saves its state, launches
> an updated version of itself, and restores its state. Clients of the
> application experience a brief suspension of service, on the order of
> 100's of milliseconds, but are otherwise unaffected.
>
> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
> of an iommufd device to be transferred from one process to another. The
> application is responsible for transferring the device descriptor to the new
> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
>
> All memory objects that were mapped for iommufd DMA in the old process
> must also be mapped in the new process, but they may have different virtual
> addresses in the new. The application passes an array of new addresses to
> IOMMU_IOAS_CHANGE_PROCESS, and it atomically updates everything to the current
> process, grabbing the new mm, transferring pinned page counts, and recording
> new virtual addresses.
>
> If the application directly exec's the new version of itself, it must take
> special care if the mappings may be accessed by kernel threads, such as by
> vfio mdevs. The original process must fork an identical caretaker
> process which calls VFIO_IOAS_CHANGE_PROCESS to take temporary ownership
> of the mappings without changing VA's. The original process then exec's
> its successor, which calls VFIO_IOAS_CHANGE_PROCESS with new VA's to take
> ownership from the caretaker.
>
> Thanks to Jason Gunthorpe for code and ideas in this series.
>
> This is implemented in QEMU by the patch series "Live update: iommufd"
> https://lore.kernel.org/qemu-devel (to be posted shortly)
>
> Steve Sistare (4):
> iommufd: Export do_update_pinned
> iommufd: Lock all objects
> iommufd: Add IOMMU_IOAS_CHANGE_PROCESS
> iommufd: update VA
Oh, this should also be covered in the iommufd selftests..
Jason
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [RFC V1 0/4] iommufd live update
2024-07-23 12:48 ` Jason Gunthorpe
@ 2024-08-05 19:02 ` Steven Sistare
0 siblings, 0 replies; 27+ messages in thread
From: Steven Sistare @ 2024-08-05 19:02 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: iommu, Kevin Tian, Alex Williamson, Cornelia Huck
On 7/23/2024 8:48 AM, Jason Gunthorpe wrote:
> On Sat, Jul 20, 2024 at 11:56:40AM -0700, Steve Sistare wrote:
>> Live update is a technique wherein an application saves its state, launches
>> an updated version of itself, and restores its state. Clients of the
>> application experience a brief suspension of service, on the order of
>> 100's of milliseconds, but are otherwise unaffected.
>>
>> Define the IOMMU_IOAS_CHANGE_PROCESS ioctl to allow management and use
>> of an iommufd device to be transferred from one process to another. The
>> application is responsible for transferring the device descriptor to the new
>> process, eg either by preservation across fork and exec or via SCM_RIGHTS.
>>
>> All memory objects that were mapped for iommufd DMA in the old process
>> must also be mapped in the new process, but they may have different virtual
>> addresses in the new. The application passes an array of new addresses to
>> IOMMU_IOAS_CHANGE_PROCESS, and it atomically updates everything to the current
>> process, grabbing the new mm, transferring pinned page counts, and recording
>> new virtual addresses.
>>
>> If the application directly exec's the new version of itself, it must take
>> special care if the mappings may be accessed by kernel threads, such as by
>> vfio mdevs. The original process must fork an identical caretaker
>> process which calls VFIO_IOAS_CHANGE_PROCESS to take temporary ownership
>> of the mappings without changing VA's. The original process then exec's
>> its successor, which calls VFIO_IOAS_CHANGE_PROCESS with new VA's to take
>> ownership from the caretaker.
>>
>> Thanks to Jason Gunthorpe for code and ideas in this series.
>>
>> This is implemented in QEMU by the patch series "Live update: iommufd"
>> https://lore.kernel.org/qemu-devel (to be posted shortly)
>>
>> Steve Sistare (4):
>> iommufd: Export do_update_pinned
>> iommufd: Lock all objects
>> iommufd: Add IOMMU_IOAS_CHANGE_PROCESS
>> iommufd: update VA
>
> Oh, this should also be covered in the iommufd selftests..
Will do - steve
^ permalink raw reply [flat|nested] 27+ messages in thread