From: Matt Evans <mattev@meta.com>
To: "Alex Williamson" <alex@shazbot.org>,
"Leon Romanovsky" <leon@kernel.org>,
"Jason Gunthorpe" <jgg@nvidia.com>,
"Alex Mastro" <amastro@fb.com>,
"Christian König" <christian.koenig@amd.com>
Cc: "Mahmoud Adam" <mngyadam@amazon.de>,
"David Matlack" <dmatlack@google.com>,
"Björn Töpel" <bjorn@kernel.org>,
"Sumit Semwal" <sumit.semwal@linaro.org>,
"Kevin Tian" <kevin.tian@intel.com>,
"Ankit Agrawal" <ankita@nvidia.com>,
"Pranjal Shrivastava" <praan@google.com>,
"Alistair Popple" <apopple@nvidia.com>,
"Vivek Kasireddy" <vivek.kasireddy@intel.com>,
linux-kernel@vger.kernel.org, linux-media@vger.kernel.org,
dri-devel@lists.freedesktop.org, linaro-mm-sig@lists.linaro.org,
kvm@vger.kernel.org
Subject: [PATCH 8/9] vfio/pci: Permanently revoke a DMABUF on request
Date: Thu, 16 Apr 2026 06:17:51 -0700 [thread overview]
Message-ID: <20260416131815.2729131-9-mattev@meta.com> (raw)
In-Reply-To: <20260416131815.2729131-1-mattev@meta.com>
Expand the VFIO DMABUF revocation state to three states:
Not revoked, temporarily revoked, and permanently revoked.
The first two are for existing transient revocation, e.g. across a
function reset, and the DMABUF is put into the last in response to a
new ioctl(VFIO_DEVICE_PCI_DMABUF_REVOKE) request.
This VFIO device fd ioctl passes a DMABUF by fd and requests that the
DMABUF is permanently revoked. On success, it's guaranteed that the
buffer can never be imported/attached/mmap()ed in the future, that
dynamic imports have been cleanly detached, and that all mappings have
been made inaccessible (PTEs zapped).
This is useful for lifecycle management, to reclaim VFIO PCI BAR
ranges previously delegated to a subordinate client process: The
driver process can ensure that the loaned resources are revoked when
the client is deemed "done", and exported ranges can be safely re-used
elsewhere.
Signed-off-by: Matt Evans <mattev@meta.com>
---
drivers/vfio/pci/vfio_pci_core.c | 21 +++-
drivers/vfio/pci/vfio_pci_dmabuf.c | 158 +++++++++++++++++++++--------
drivers/vfio/pci/vfio_pci_priv.h | 14 ++-
include/uapi/linux/vfio.h | 30 ++++++
4 files changed, 179 insertions(+), 44 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index cad126cf8737..59582fcfba97 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1461,6 +1461,21 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
ioeventfd.fd);
}
+static int vfio_pci_ioctl_dmabuf_revoke(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_dmabuf_revoke __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_pci_dmabuf_revoke, dmabuf_fd);
+ struct vfio_pci_dmabuf_revoke revoke;
+ /*
+ * Handler for VFIO_DEVICE_PCI_DMABUF_REVOKE. Only the bytes up to and
+ * including dmabuf_fd (minsz) are copied in from userspace; a failed
+ * copy is reported as -EFAULT.
+ */
+ if (copy_from_user(&revoke, arg, minsz))
+ return -EFAULT;
+ /* The caller-declared argsz must cover at least those same fields. */
+ if (revoke.argsz < minsz)
+ return -EINVAL;
+ /* The fd itself is validated by vfio_pci_dma_buf_revoke(). */
+ return vfio_pci_dma_buf_revoke(vdev, revoke.dmabuf_fd);
+}
+
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg)
{
@@ -1483,6 +1498,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
return vfio_pci_ioctl_reset(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
return vfio_pci_ioctl_set_irqs(vdev, uarg);
+ case VFIO_DEVICE_PCI_DMABUF_REVOKE:
+ return vfio_pci_ioctl_dmabuf_revoke(vdev, uarg);
default:
return -ENOTTY;
}
@@ -1752,7 +1769,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
dma_resv_lock(priv->dmabuf->resv, NULL);
vdev = READ_ONCE(priv->vdev);
- if (READ_ONCE(priv->revoked) || !vdev) {
+ if (READ_ONCE(priv->status) != VFIO_PCI_DMABUF_OK || !vdev) {
pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n",
__func__, vmf->address, vma->vm_pgoff);
dma_resv_unlock(priv->dmabuf->resv);
@@ -1774,7 +1791,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
dma_resv_unlock(priv->dmabuf->resv);
scoped_guard(rwsem_read, &vdev->memory_lock) {
- if (!READ_ONCE(priv->revoked)) {
+ if (READ_ONCE(priv->status) == VFIO_PCI_DMABUF_OK) {
int pres = vfio_pci_dma_buf_find_pfn(priv, vma,
vmf->address,
order, &pfn);
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index cc477f46a7d5..48ec4da2db8b 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -19,7 +19,7 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
if (!attachment->peer2peer)
return -EOPNOTSUPP;
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return -ENODEV;
if (!dma_buf_attach_revocable(attachment))
@@ -33,7 +33,7 @@ static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *
struct vfio_pci_dma_buf *priv = dmabuf->priv;
u64 req_len, req_start;
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return -ENODEV;
if ((vma->vm_flags & VM_SHARED) == 0)
return -EINVAL;
@@ -73,7 +73,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
dma_resv_assert_held(priv->dmabuf->resv);
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return ERR_PTR(-ENODEV);
ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
@@ -270,7 +270,8 @@ static int vfio_pci_dmabuf_export(struct vfio_pci_core_device *vdev,
INIT_LIST_HEAD(&priv->dmabufs_elm);
down_write(&vdev->memory_lock);
dma_resv_lock(priv->dmabuf->resv, NULL);
- priv->revoked = !__vfio_pci_memory_enabled(vdev);
+ priv->status = __vfio_pci_memory_enabled(vdev) ? VFIO_PCI_DMABUF_OK :
+ VFIO_PCI_DMABUF_TEMP_REVOKED;
list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
dma_resv_unlock(priv->dmabuf->resv);
up_write(&vdev->memory_lock);
@@ -301,7 +302,7 @@ int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
return -EOPNOTSUPP;
priv = attachment->dmabuf->priv;
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return -ENODEV;
/* More than one range to iommufd will require proper DMABUF support */
@@ -581,6 +582,64 @@ int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev,
return ret;
}
+static void __vfio_pci_dma_buf_revoke(struct vfio_pci_dma_buf *priv, bool revoked,
+ bool permanently)
+{
+ bool was_revoked;
+ /*
+ * Move one DMABUF between the OK, TEMP_REVOKED and PERM_REVOKED
+ * states. @revoked selects revoke (true) vs. restore (false);
+ * @permanently is only consulted when @revoked is true and selects
+ * PERM_REVOKED over TEMP_REVOKED. The caller must hold the owning
+ * device's memory_lock for write.
+ */
+ lockdep_assert_held_write(&priv->vdev->memory_lock);
+ /*
+ * Nothing to do when the buffer is already permanently revoked (that
+ * state is never left here), when a restore is requested on a buffer
+ * that is not revoked, or when a temporary revoke is requested on a
+ * buffer that is already temporarily revoked.
+ */
+ if ((priv->status == VFIO_PCI_DMABUF_PERM_REVOKED) ||
+ (priv->status == VFIO_PCI_DMABUF_OK && !revoked) ||
+ (priv->status == VFIO_PCI_DMABUF_TEMP_REVOKED && revoked && !permanently)) {
+ return;
+ }
+ /* Status is changed with the reservation lock held. */
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ was_revoked = priv->status != VFIO_PCI_DMABUF_OK;
+ /*
+ * A restore does not touch the status here; it is set back to OK
+ * only at the very end, once the kref and completion are re-armed.
+ */
+ if (revoked)
+ priv->status = permanently ?
+ VFIO_PCI_DMABUF_PERM_REVOKED : VFIO_PCI_DMABUF_TEMP_REVOKED;
+
+ /*
+ * If TEMP_REVOKED is being upgraded to PERM_REVOKED, the
+ * buffer is already gone. Don't wait on it again.
+ */
+ if (was_revoked && revoked) {
+ dma_resv_unlock(priv->dmabuf->resv);
+ return;
+ }
+ /* Only OK -> revoked and TEMP_REVOKED -> OK transitions get here. */
+ dma_buf_invalidate_mappings(priv->dmabuf);
+ dma_resv_wait_timeout(priv->dmabuf->resv,
+ DMA_RESV_USAGE_BOOKKEEP, false,
+ MAX_SCHEDULE_TIMEOUT);
+ dma_resv_unlock(priv->dmabuf->resv);
+ if (revoked) {
+ kref_put(&priv->kref, vfio_pci_dma_buf_done);
+ wait_for_completion(&priv->comp);
+ unmap_mapping_range(priv->dmabuf->file->f_mapping,
+ 0, priv->size, 1);
+ } else {
+ /*
+ * The kref is initialized again, because when the revoke
+ * was performed the reference counter was decreased
+ * to zero to trigger completion.
+ */
+ kref_init(&priv->kref);
+ /*
+ * There is no need to wait, as no mapping was
+ * performed while the previous status was
+ * priv->status == *REVOKED.
+ */
+ reinit_completion(&priv->comp);
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->status = VFIO_PCI_DMABUF_OK;
+ dma_resv_unlock(priv->dmabuf->resv);
+ }
+}
+
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
struct vfio_pci_dma_buf *priv;
@@ -589,45 +648,13 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
lockdep_assert_held_write(&vdev->memory_lock);
/*
* Holding memory_lock ensures a racing VMA fault observes
- * priv->revoked properly.
+ * priv->status properly.
*/
list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
if (!get_file_active(&priv->dmabuf->file))
continue;
-
- if (priv->revoked != revoked) {
- dma_resv_lock(priv->dmabuf->resv, NULL);
- if (revoked)
- priv->revoked = true;
- dma_buf_invalidate_mappings(priv->dmabuf);
- dma_resv_wait_timeout(priv->dmabuf->resv,
- DMA_RESV_USAGE_BOOKKEEP, false,
- MAX_SCHEDULE_TIMEOUT);
- dma_resv_unlock(priv->dmabuf->resv);
- if (revoked) {
- kref_put(&priv->kref, vfio_pci_dma_buf_done);
- wait_for_completion(&priv->comp);
- unmap_mapping_range(priv->dmabuf->file->f_mapping,
- 0, priv->size, 1);
- } else {
- /*
- * Kref is initialize again, because when revoke
- * was performed the reference counter was decreased
- * to zero to trigger completion.
- */
- kref_init(&priv->kref);
- /*
- * There is no need to wait as no mapping was
- * performed when the previous status was
- * priv->revoked == true.
- */
- reinit_completion(&priv->comp);
- dma_resv_lock(priv->dmabuf->resv, NULL);
- priv->revoked = false;
- dma_resv_unlock(priv->dmabuf->resv);
- }
- }
+ __vfio_pci_dma_buf_revoke(priv, revoked, false);
fput(priv->dmabuf->file);
}
}
@@ -647,8 +674,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
dma_resv_lock(priv->dmabuf->resv, NULL);
list_del_init(&priv->dmabufs_elm);
priv->vdev = NULL;
- was_revoked = priv->revoked;
- priv->revoked = true;
+ was_revoked = (priv->status != VFIO_PCI_DMABUF_OK);
+ priv->status = VFIO_PCI_DMABUF_PERM_REVOKED;
dma_buf_invalidate_mappings(priv->dmabuf);
dma_resv_wait_timeout(priv->dmabuf->resv,
DMA_RESV_USAGE_BOOKKEEP, false,
@@ -665,3 +692,52 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
}
up_write(&vdev->memory_lock);
}
+
+#ifdef CONFIG_VFIO_PCI_DMABUF
+int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev, int dmabuf_fd)
+{
+ struct dma_buf *dmabuf;
+ struct vfio_pci_dma_buf *priv;
+ int ret = 0;
+ /*
+ * Permanently revoke the DMABUF referred to by @dmabuf_fd, provided
+ * it belongs to @vdev. Returns 0 on success, the dma_buf_get() error
+ * if the fd cannot be resolved, -ENODEV if the DMABUF is not a VFIO
+ * PCI DMABUF or is not associated with @vdev, and -EBADFD if it has
+ * already been permanently revoked.
+ */
+ dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ /*
+ * Sanity-check the DMABUF is really a vfio_pci_dma_buf _and_
+ * (below) relates to the VFIO device it was provided with:
+ */
+ if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
+ ret = -ENODEV;
+ goto out_put_buf;
+ }
+ /* The ops check above is what makes this cast of ->priv valid. */
+ priv = dmabuf->priv;
+ /*
+ * The ownership/state checks and the revocation run with memory_lock
+ * taken for write, which __vfio_pci_dma_buf_revoke() asserts.
+ */
+ scoped_guard(rwsem_write, &vdev->memory_lock) {
+ struct vfio_pci_core_device *db_vdev = READ_ONCE(priv->vdev);
+
+ /*
+ * Reading priv->vdev inside the lock is conservative,
+ * because cleanup (changes vdev) is (today) prevented
+ * from running concurrently by the VFIO device fd
+ * being held open by the calling ioctl.
+ */
+ if (!db_vdev || db_vdev != vdev) {
+ ret = -ENODEV;
+ break;
+ }
+ /*
+ * A temporarily revoked buffer is upgraded to permanent; only
+ * an already-permanent revocation is reported as an error.
+ */
+ if (priv->status == VFIO_PCI_DMABUF_PERM_REVOKED)
+ ret = -EBADFD;
+ else
+ __vfio_pci_dma_buf_revoke(priv, true, true);
+ }
+ /* Every exit past dma_buf_get() balances it with dma_buf_put(). */
+ out_put_buf:
+ dma_buf_put(dmabuf);
+
+ return ret;
+}
+#endif /* CONFIG_VFIO_PCI_DMABUF */
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index f837d6c8bddc..eac5606ca161 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -23,6 +23,12 @@ struct vfio_pci_ioeventfd {
bool test_mem;
};
+enum vfio_pci_dma_buf_status { /* revocation state of a vfio_pci_dma_buf */
+ VFIO_PCI_DMABUF_OK = 0, /* not revoked */
+ VFIO_PCI_DMABUF_TEMP_REVOKED = 1, /* revoked; may be restored to OK */
+ VFIO_PCI_DMABUF_PERM_REVOKED = 2, /* revoked for good; never restored */
+};
+
struct vfio_pci_dma_buf {
struct dma_buf *dmabuf;
struct vfio_pci_core_device *vdev;
@@ -34,7 +40,7 @@ struct vfio_pci_dma_buf {
u32 nr_ranges;
struct kref kref;
struct completion comp;
- u8 revoked : 1;
+ enum vfio_pci_dma_buf_status status;
};
extern const struct vm_operations_struct vfio_pci_mmap_ops;
@@ -147,6 +153,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_feature_dma_buf __user *arg,
size_t argsz);
+int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev, int dmabuf_fd);
#else
static inline int
vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
@@ -155,6 +162,11 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
{
return -ENOTTY;
}
+/*
+ * Stub used when the VFIO PCI DMABUF code is compiled out. The revoke
+ * ioctl is then simply unsupported, so report -ENOTTY, matching the
+ * vfio_pci_core_feature_dma_buf() stub above and the ioctl switch default.
+ * -ENODEV is not used: for this ioctl it is documented to mean that the
+ * fd is not a DMABUF created for this device.
+ */
+static inline int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev,
+ int dmabuf_fd)
+{
+ return -ENOTTY;
+}
#endif
#endif
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5de618a3a5ee..77225ed8115f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1321,6 +1321,36 @@ struct vfio_precopy_info {
#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21)
+/**
+ * VFIO_DEVICE_PCI_DMABUF_REVOKE - _IO(VFIO_TYPE, VFIO_BASE + 22)
+ *
+ * This ioctl is used on the device FD, and requests that access to
+ * the buffer corresponding to the DMABUF FD parameter is immediately
+ * and permanently revoked. On successful return, the buffer is not
+ * accessible through any mmap() or dma-buf import. The request fails
+ * if the buffer is pinned; otherwise, the exporter marks the buffer
+ * as inaccessible and uses the move_notify callback to inform
+ * importers of the change. The buffer is permanently disabled, and
+ * VFIO refuses all map, mmap, attach, etc. requests.
+ *
+ * A buffer that is only temporarily revoked at the time of the call is
+ * upgraded to permanently revoked, and the call succeeds.
+ *
+ * Return: 0 on success, -1 and errno set on failure:
+ *
+ * ENODEV if the associated dmabuf FD no longer exists/is closed,
+ * or is not a DMABUF created for this device.
+ * EINVAL if the dmabuf_fd parameter isn't a DMABUF, or if argsz is
+ * smaller than the structure up to and including dmabuf_fd.
+ * EFAULT if the argument structure cannot be read from userspace.
+ * EBADF if the dmabuf_fd parameter isn't a valid file number.
+ * EBADFD if the buffer has already been permanently revoked.
+ *
+ */
+struct vfio_pci_dmabuf_revoke {
+ __u32 argsz; /* caller-supplied size; must cover dmabuf_fd */
+ __u32 dmabuf_fd; /* file descriptor of the DMABUF to revoke */
+};
+
+#define VFIO_DEVICE_PCI_DMABUF_REVOKE _IO(VFIO_TYPE, VFIO_BASE + 22)
+
/*
* Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power
* state with the platform-based power management. Device use of lower power
--
2.47.3
next prev parent reply other threads:[~2026-04-17 7:06 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-16 13:17 [PATCH 0/9] vfio/pci: Add mmap() for DMABUFs Matt Evans
2026-04-16 13:17 ` [PATCH 1/9] vfio/pci: Fix vfio_pci_dma_buf_cleanup() double-put Matt Evans
2026-04-24 18:05 ` Jason Gunthorpe
2026-05-01 19:12 ` Alex Williamson
2026-05-06 13:53 ` Matt Evans
2026-05-06 15:29 ` Leon Romanovsky
2026-05-06 15:55 ` Matt Evans
2026-05-06 16:14 ` Leon Romanovsky
2026-05-06 16:42 ` Matt Evans
2026-04-16 13:17 ` [PATCH 2/9] vfio/pci: Add a helper to look up PFNs for DMABUFs Matt Evans
2026-04-24 18:15 ` Jason Gunthorpe
2026-05-07 15:48 ` Matt Evans
2026-04-16 13:17 ` [PATCH 3/9] vfio/pci: Add a helper to create a DMABUF for a BAR-map VMA Matt Evans
2026-04-24 18:24 ` Jason Gunthorpe
2026-04-30 16:47 ` Matt Evans
2026-04-30 17:11 ` Jason Gunthorpe
2026-05-05 18:13 ` Matt Evans
2026-05-06 19:03 ` Matt Evans
2026-04-16 13:17 ` [PATCH 4/9] vfio/pci: Convert BAR mmap() to use a DMABUF Matt Evans
2026-05-01 22:19 ` Alex Williamson
2026-05-04 7:40 ` Jason Gunthorpe
2026-05-05 10:49 ` Leon Romanovsky
2026-05-05 14:50 ` Alex Williamson
2026-05-05 14:59 ` Jason Gunthorpe
2026-05-06 5:35 ` Leon Romanovsky
2026-05-14 17:52 ` Matt Evans
2026-04-16 13:17 ` [PATCH 5/9] vfio/pci: Provide a user-facing name for BAR mappings Matt Evans
2026-04-24 18:26 ` Jason Gunthorpe
2026-05-01 22:44 ` Alex Williamson
2026-05-07 16:56 ` Matt Evans
2026-05-07 17:17 ` Matt Evans
2026-04-16 13:17 ` [PATCH 6/9] vfio/pci: Clean up BAR zap and revocation Matt Evans
2026-05-01 23:19 ` Alex Williamson
2026-05-05 10:58 ` Leon Romanovsky
2026-04-16 13:17 ` [PATCH 7/9] vfio/pci: Support mmap() of a VFIO DMABUF Matt Evans
2026-04-24 18:30 ` Jason Gunthorpe
2026-05-07 16:09 ` Matt Evans
2026-04-16 13:17 ` Matt Evans [this message]
2026-04-16 13:17 ` [PATCH 9/9] vfio/pci: Add mmap() attributes to DMABUF feature Matt Evans
2026-04-24 18:31 ` Jason Gunthorpe
2026-04-26 10:52 ` Leon Romanovsky
2026-04-27 14:36 ` Alex Williamson
2026-05-11 15:30 ` Matt Evans
2026-05-11 17:51 ` Leon Romanovsky
2026-05-11 20:09 ` Alex Williamson
2026-05-12 17:51 ` Matt Evans
2026-05-13 18:27 ` Alex Williamson
2026-05-14 13:55 ` Matt Evans
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260416131815.2729131-9-mattev@meta.com \
--to=mattev@meta.com \
--cc=alex@shazbot.org \
--cc=amastro@fb.com \
--cc=ankita@nvidia.com \
--cc=apopple@nvidia.com \
--cc=bjorn@kernel.org \
--cc=christian.koenig@amd.com \
--cc=dmatlack@google.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=jgg@nvidia.com \
--cc=kevin.tian@intel.com \
--cc=kvm@vger.kernel.org \
--cc=leon@kernel.org \
--cc=linaro-mm-sig@lists.linaro.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-media@vger.kernel.org \
--cc=mngyadam@amazon.de \
--cc=praan@google.com \
--cc=sumit.semwal@linaro.org \
--cc=vivek.kasireddy@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.