From: Matt Evans <mattev@meta.com>
To: "Alex Williamson" <alex@shazbot.org>,
"Leon Romanovsky" <leon@kernel.org>,
"Jason Gunthorpe" <jgg@nvidia.com>,
"Alex Mastro" <amastro@fb.com>,
"Christian König" <christian.koenig@amd.com>
Cc: "Mahmoud Adam" <mngyadam@amazon.de>,
"David Matlack" <dmatlack@google.com>,
"Björn Töpel" <bjorn@kernel.org>,
"Sumit Semwal" <sumit.semwal@linaro.org>,
"Kevin Tian" <kevin.tian@intel.com>,
"Ankit Agrawal" <ankita@nvidia.com>,
"Pranjal Shrivastava" <praan@google.com>,
"Alistair Popple" <apopple@nvidia.com>,
"Vivek Kasireddy" <vivek.kasireddy@intel.com>,
linux-kernel@vger.kernel.org, linux-media@vger.kernel.org,
dri-devel@lists.freedesktop.org, linaro-mm-sig@lists.linaro.org,
kvm@vger.kernel.org
Subject: [PATCH 8/9] vfio/pci: Permanently revoke a DMABUF on request
Date: Thu, 16 Apr 2026 06:17:51 -0700 [thread overview]
Message-ID: <20260416131815.2729131-9-mattev@meta.com> (raw)
In-Reply-To: <20260416131815.2729131-1-mattev@meta.com>
Expand the VFIO DMABUF revocation state to three states:
Not revoked, temporarily revoked, and permanently revoked.
The first two cover the existing transient revocation, e.g. across a
function reset; a DMABUF enters the last state in response to a new
ioctl(VFIO_DEVICE_PCI_DMABUF_REVOKE) request.
This VFIO device fd ioctl passes a DMABUF by fd and requests that the
DMABUF is permanently revoked. On success, it's guaranteed that the
buffer can never be imported/attached/mmap()ed in future, that dynamic
imports have been cleanly detached, and all mappings made
inaccessible/PTEs zapped.
This is useful for lifecycle management, to reclaim VFIO PCI BAR
ranges previously delegated to a subordinate client process: The
driver process can ensure that the loaned resources are revoked when
the client is deemed "done", and exported ranges can be safely re-used
elsewhere.
Signed-off-by: Matt Evans <mattev@meta.com>
---
drivers/vfio/pci/vfio_pci_core.c | 21 +++-
drivers/vfio/pci/vfio_pci_dmabuf.c | 158 +++++++++++++++++++++--------
drivers/vfio/pci/vfio_pci_priv.h | 14 ++-
include/uapi/linux/vfio.h | 30 ++++++
4 files changed, 179 insertions(+), 44 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index cad126cf8737..59582fcfba97 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1461,6 +1461,21 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
ioeventfd.fd);
}
+/*
+ * VFIO_DEVICE_PCI_DMABUF_REVOKE handler: read the request header from
+ * userspace, validate argsz, then ask the DMABUF code to permanently
+ * revoke the buffer named by dmabuf_fd.
+ *
+ * Returns -EFAULT if the request cannot be copied in, -EINVAL if argsz
+ * does not cover the structure up to and including dmabuf_fd, otherwise
+ * the result of vfio_pci_dma_buf_revoke().
+ */
+static int vfio_pci_ioctl_dmabuf_revoke(struct vfio_pci_core_device *vdev,
+					struct vfio_pci_dmabuf_revoke __user *arg)
+{
+	struct vfio_pci_dmabuf_revoke req;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct vfio_pci_dmabuf_revoke, dmabuf_fd);
+
+	/* Copy first, validate second: a bad pointer reports -EFAULT. */
+	if (copy_from_user(&req, arg, minsz))
+		return -EFAULT;
+	if (req.argsz < minsz)
+		return -EINVAL;
+
+	return vfio_pci_dma_buf_revoke(vdev, req.dmabuf_fd);
+}
+
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg)
{
@@ -1483,6 +1498,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
return vfio_pci_ioctl_reset(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
return vfio_pci_ioctl_set_irqs(vdev, uarg);
+ case VFIO_DEVICE_PCI_DMABUF_REVOKE:
+ return vfio_pci_ioctl_dmabuf_revoke(vdev, uarg);
default:
return -ENOTTY;
}
@@ -1752,7 +1769,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
dma_resv_lock(priv->dmabuf->resv, NULL);
vdev = READ_ONCE(priv->vdev);
- if (READ_ONCE(priv->revoked) || !vdev) {
+ if (READ_ONCE(priv->status) != VFIO_PCI_DMABUF_OK || !vdev) {
pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n",
__func__, vmf->address, vma->vm_pgoff);
dma_resv_unlock(priv->dmabuf->resv);
@@ -1774,7 +1791,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
dma_resv_unlock(priv->dmabuf->resv);
scoped_guard(rwsem_read, &vdev->memory_lock) {
- if (!READ_ONCE(priv->revoked)) {
+ if (READ_ONCE(priv->status) == VFIO_PCI_DMABUF_OK) {
int pres = vfio_pci_dma_buf_find_pfn(priv, vma,
vmf->address,
order, &pfn);
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index cc477f46a7d5..48ec4da2db8b 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -19,7 +19,7 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
if (!attachment->peer2peer)
return -EOPNOTSUPP;
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return -ENODEV;
if (!dma_buf_attach_revocable(attachment))
@@ -33,7 +33,7 @@ static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *
struct vfio_pci_dma_buf *priv = dmabuf->priv;
u64 req_len, req_start;
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return -ENODEV;
if ((vma->vm_flags & VM_SHARED) == 0)
return -EINVAL;
@@ -73,7 +73,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
dma_resv_assert_held(priv->dmabuf->resv);
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return ERR_PTR(-ENODEV);
ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
@@ -270,7 +270,8 @@ static int vfio_pci_dmabuf_export(struct vfio_pci_core_device *vdev,
INIT_LIST_HEAD(&priv->dmabufs_elm);
down_write(&vdev->memory_lock);
dma_resv_lock(priv->dmabuf->resv, NULL);
- priv->revoked = !__vfio_pci_memory_enabled(vdev);
+ priv->status = __vfio_pci_memory_enabled(vdev) ? VFIO_PCI_DMABUF_OK :
+ VFIO_PCI_DMABUF_TEMP_REVOKED;
list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
dma_resv_unlock(priv->dmabuf->resv);
up_write(&vdev->memory_lock);
@@ -301,7 +302,7 @@ int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
return -EOPNOTSUPP;
priv = attachment->dmabuf->priv;
- if (priv->revoked)
+ if (priv->status != VFIO_PCI_DMABUF_OK)
return -ENODEV;
/* More than one range to iommufd will require proper DMABUF support */
@@ -581,6 +582,64 @@ int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev,
return ret;
}
+/*
+ * Move one DMABUF between the OK / TEMP_REVOKED / PERM_REVOKED states.
+ *
+ * @revoked: true to revoke access, false to restore it.
+ * @permanently: together with @revoked, make the revocation permanent.
+ *
+ * The caller must hold priv->vdev->memory_lock for write. Requests that
+ * would not change the state return early: a permanently revoked buffer
+ * is never changed, an OK buffer is not restored again, and a temporarily
+ * revoked buffer is not temporarily revoked a second time.
+ */
+static void __vfio_pci_dma_buf_revoke(struct vfio_pci_dma_buf *priv, bool revoked,
+ bool permanently)
+{
+ bool was_revoked;
+
+ lockdep_assert_held_write(&priv->vdev->memory_lock);
+
+ /* Nothing to do for no-op transitions; PERM_REVOKED is final. */
+ if ((priv->status == VFIO_PCI_DMABUF_PERM_REVOKED) ||
+ (priv->status == VFIO_PCI_DMABUF_OK && !revoked) ||
+ (priv->status == VFIO_PCI_DMABUF_TEMP_REVOKED && revoked && !permanently)) {
+ return;
+ }
+
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ /* Sample the old state before it is overwritten below. */
+ was_revoked = priv->status != VFIO_PCI_DMABUF_OK;
+
+ if (revoked)
+ priv->status = permanently ?
+ VFIO_PCI_DMABUF_PERM_REVOKED : VFIO_PCI_DMABUF_TEMP_REVOKED;
+
+ /*
+ * If TEMP_REVOKED is being upgraded to PERM_REVOKED, the
+ * buffer is already gone. Don't wait on it again.
+ */
+ if (was_revoked && revoked) {
+ dma_resv_unlock(priv->dmabuf->resv);
+ return;
+ }
+
+ dma_buf_invalidate_mappings(priv->dmabuf);
+ dma_resv_wait_timeout(priv->dmabuf->resv,
+ DMA_RESV_USAGE_BOOKKEEP, false,
+ MAX_SCHEDULE_TIMEOUT);
+ dma_resv_unlock(priv->dmabuf->resv);
+ if (revoked) {
+ /* Drop our reference and wait for the completion before zapping. */
+ kref_put(&priv->kref, vfio_pci_dma_buf_done);
+ wait_for_completion(&priv->comp);
+ unmap_mapping_range(priv->dmabuf->file->f_mapping,
+ 0, priv->size, 1);
+ } else {
+ /*
+ * Kref is initialized again, because when revoke
+ * was performed the reference counter was decreased
+ * to zero to trigger completion.
+ */
+ kref_init(&priv->kref);
+ /*
+ * There is no need to wait as no mapping was
+ * performed when the previous status was
+ * priv->status == *REVOKED.
+ */
+ reinit_completion(&priv->comp);
+ /* Only now, with kref and completion re-armed, mark the buffer OK. */
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->status = VFIO_PCI_DMABUF_OK;
+ dma_resv_unlock(priv->dmabuf->resv);
+ }
+}
+
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
struct vfio_pci_dma_buf *priv;
@@ -589,45 +648,13 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
lockdep_assert_held_write(&vdev->memory_lock);
/*
* Holding memory_lock ensures a racing VMA fault observes
- * priv->revoked properly.
+ * priv->status properly.
*/
list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
if (!get_file_active(&priv->dmabuf->file))
continue;
-
- if (priv->revoked != revoked) {
- dma_resv_lock(priv->dmabuf->resv, NULL);
- if (revoked)
- priv->revoked = true;
- dma_buf_invalidate_mappings(priv->dmabuf);
- dma_resv_wait_timeout(priv->dmabuf->resv,
- DMA_RESV_USAGE_BOOKKEEP, false,
- MAX_SCHEDULE_TIMEOUT);
- dma_resv_unlock(priv->dmabuf->resv);
- if (revoked) {
- kref_put(&priv->kref, vfio_pci_dma_buf_done);
- wait_for_completion(&priv->comp);
- unmap_mapping_range(priv->dmabuf->file->f_mapping,
- 0, priv->size, 1);
- } else {
- /*
- * Kref is initialize again, because when revoke
- * was performed the reference counter was decreased
- * to zero to trigger completion.
- */
- kref_init(&priv->kref);
- /*
- * There is no need to wait as no mapping was
- * performed when the previous status was
- * priv->revoked == true.
- */
- reinit_completion(&priv->comp);
- dma_resv_lock(priv->dmabuf->resv, NULL);
- priv->revoked = false;
- dma_resv_unlock(priv->dmabuf->resv);
- }
- }
+ __vfio_pci_dma_buf_revoke(priv, revoked, false);
fput(priv->dmabuf->file);
}
}
@@ -647,8 +674,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
dma_resv_lock(priv->dmabuf->resv, NULL);
list_del_init(&priv->dmabufs_elm);
priv->vdev = NULL;
- was_revoked = priv->revoked;
- priv->revoked = true;
+ was_revoked = (priv->status != VFIO_PCI_DMABUF_OK);
+ priv->status = VFIO_PCI_DMABUF_PERM_REVOKED;
dma_buf_invalidate_mappings(priv->dmabuf);
dma_resv_wait_timeout(priv->dmabuf->resv,
DMA_RESV_USAGE_BOOKKEEP, false,
@@ -665,3 +692,52 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
}
up_write(&vdev->memory_lock);
}
+
+#ifdef CONFIG_VFIO_PCI_DMABUF
+/*
+ * Permanently revoke the DMABUF referred to by @dmabuf_fd on behalf of @vdev.
+ *
+ * Holds a reference on the DMABUF for the duration of the call, checks
+ * that it uses vfio_pci_dmabuf_ops and that it belongs to @vdev, then
+ * revokes it with vdev->memory_lock held for write. A temporarily
+ * revoked buffer is upgraded to permanently revoked and reports success.
+ *
+ * Returns 0 on success, the dma_buf_get() error if the fd cannot be
+ * resolved, -ENODEV if the DMABUF is not a VFIO PCI DMABUF of @vdev, or
+ * -EBADFD if it is already permanently revoked.
+ */
+int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev, int dmabuf_fd)
+{
+ struct dma_buf *dmabuf;
+ struct vfio_pci_dma_buf *priv;
+ int ret = 0;
+
+ dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ /*
+ * Sanity-check that the DMABUF really is a vfio_pci_dma_buf
+ * _and_ (below) that it belongs to the VFIO device the ioctl
+ * was issued on:
+ */
+ if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
+ ret = -ENODEV;
+ goto out_put_buf;
+ }
+
+ priv = dmabuf->priv;
+
+ scoped_guard(rwsem_write, &vdev->memory_lock) {
+ struct vfio_pci_core_device *db_vdev = READ_ONCE(priv->vdev);
+
+ /*
+ * Reading priv->vdev inside the lock is conservative:
+ * cleanup (which clears priv->vdev) is (today) prevented
+ * from running concurrently because the ioctl caller
+ * holds the VFIO device fd open.
+ *
+ * NOTE(review): the break below relies on leaving the
+ * scoped_guard() scope (and so dropping memory_lock) -
+ * confirm against linux/cleanup.h.
+ */
+ if (!db_vdev || db_vdev != vdev) {
+ ret = -ENODEV;
+ break;
+ }
+
+ /* A second permanent revoke is reported rather than ignored. */
+ if (priv->status == VFIO_PCI_DMABUF_PERM_REVOKED)
+ ret = -EBADFD;
+ else
+ __vfio_pci_dma_buf_revoke(priv, true, true);
+ }
+
+ out_put_buf:
+ dma_buf_put(dmabuf);
+
+ return ret;
+}
+#endif /* CONFIG_VFIO_PCI_DMABUF */
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index f837d6c8bddc..eac5606ca161 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -23,6 +23,12 @@ struct vfio_pci_ioeventfd {
bool test_mem;
};
+/*
+ * Revocation state of a VFIO PCI DMABUF (struct vfio_pci_dma_buf.status).
+ * Attach, map and mmap requests are refused with -ENODEV unless the
+ * state is VFIO_PCI_DMABUF_OK.
+ */
+enum vfio_pci_dma_buf_status {
+ /* Buffer is usable. */
+ VFIO_PCI_DMABUF_OK = 0,
+ /* Transient revocation (e.g. across a function reset); can return to OK. */
+ VFIO_PCI_DMABUF_TEMP_REVOKED = 1,
+ /* Revoked for good, by the revoke ioctl or device cleanup; never restored. */
+ VFIO_PCI_DMABUF_PERM_REVOKED = 2,
+};
+
struct vfio_pci_dma_buf {
struct dma_buf *dmabuf;
struct vfio_pci_core_device *vdev;
@@ -34,7 +40,7 @@ struct vfio_pci_dma_buf {
u32 nr_ranges;
struct kref kref;
struct completion comp;
- u8 revoked : 1;
+ enum vfio_pci_dma_buf_status status;
};
extern const struct vm_operations_struct vfio_pci_mmap_ops;
@@ -147,6 +153,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_feature_dma_buf __user *arg,
size_t argsz);
+int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev, int dmabuf_fd);
#else
static inline int
vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
@@ -155,6 +162,11 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
{
return -ENOTTY;
}
+/*
+ * DMABUF support is compiled out: report the revoke ioctl as unsupported,
+ * matching the -ENOTTY returned by the vfio_pci_core_feature_dma_buf() stub.
+ * -ENODEV is avoided because the uAPI documents it as "not a DMABUF of this
+ * device", which userspace could not tell apart from a missing feature.
+ */
+static inline int vfio_pci_dma_buf_revoke(struct vfio_pci_core_device *vdev,
+					  int dmabuf_fd)
+{
+	return -ENOTTY;
+}
#endif
#endif
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5de618a3a5ee..77225ed8115f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1321,6 +1321,36 @@ struct vfio_precopy_info {
#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21)
+/**
+ * VFIO_DEVICE_PCI_DMABUF_REVOKE - _IO(VFIO_TYPE, VFIO_BASE + 22)
+ *
+ * This ioctl is used on the device FD, and requests that access to
+ * the buffer corresponding to the DMABUF FD parameter is immediately
+ * and permanently revoked. On successful return, the buffer is not
+ * accessible through any mmap() or dma-buf import. The request fails
+ * if the buffer is pinned; otherwise, the exporter marks the buffer
+ * as inaccessible and uses the move_notify callback to inform
+ * importers of the change. The buffer is permanently disabled, and
+ * VFIO refuses all map, mmap, attach, etc. requests.
+ *
+ * Return: 0 on success, -1 and errno set on failure.
+ *
+ * Possible errno values:
+ *
+ * ENODEV if the associated dmabuf FD no longer exists/is closed,
+ * or is not a DMABUF created for this device.
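+ * EINVAL if argsz is too small, or the dmabuf_fd parameter isn't a
+ * DMABUF.
+ * EBADF if the dmabuf_fd parameter isn't a valid file number.
+ * EBADFD if the buffer has already been permanently revoked.
+ * EFAULT if the argument structure cannot be read from userspace.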
+ *
+ */
+struct vfio_pci_dmabuf_revoke {
+ /* Caller-supplied size; must cover at least argsz and dmabuf_fd. */
+ __u32 argsz;
+ /*
+ * File descriptor of the DMABUF to revoke.
+ * NOTE(review): declared __u32 but handed to an int parameter of
+ * vfio_pci_dma_buf_revoke(); consider __s32 - confirm with uAPI owners.
+ */
+ __u32 dmabuf_fd;
+};
+
+#define VFIO_DEVICE_PCI_DMABUF_REVOKE _IO(VFIO_TYPE, VFIO_BASE + 22)
+
/*
* Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power
* state with the platform-based power management. Device use of lower power
--
2.47.3
next prev parent reply other threads:[~2026-04-16 13:19 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-16 13:17 [PATCH 0/9] vfio/pci: Add mmap() for DMABUFs Matt Evans
2026-04-16 13:17 ` [PATCH 1/9] vfio/pci: Fix vfio_pci_dma_buf_cleanup() double-put Matt Evans
2026-04-16 13:17 ` [PATCH 2/9] vfio/pci: Add a helper to look up PFNs for DMABUFs Matt Evans
2026-04-16 13:17 ` [PATCH 3/9] vfio/pci: Add a helper to create a DMABUF for a BAR-map VMA Matt Evans
2026-04-16 13:17 ` [PATCH 4/9] vfio/pci: Convert BAR mmap() to use a DMABUF Matt Evans
2026-04-16 13:17 ` [PATCH 5/9] vfio/pci: Provide a user-facing name for BAR mappings Matt Evans
2026-04-16 13:17 ` [PATCH 6/9] vfio/pci: Clean up BAR zap and revocation Matt Evans
2026-04-16 13:17 ` [PATCH 7/9] vfio/pci: Support mmap() of a VFIO DMABUF Matt Evans
2026-04-16 13:17 ` Matt Evans [this message]
2026-04-16 13:17 ` [PATCH 9/9] vfio/pci: Add mmap() attributes to DMABUF feature Matt Evans
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260416131815.2729131-9-mattev@meta.com \
--to=mattev@meta.com \
--cc=alex@shazbot.org \
--cc=amastro@fb.com \
--cc=ankita@nvidia.com \
--cc=apopple@nvidia.com \
--cc=bjorn@kernel.org \
--cc=christian.koenig@amd.com \
--cc=dmatlack@google.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=jgg@nvidia.com \
--cc=kevin.tian@intel.com \
--cc=kvm@vger.kernel.org \
--cc=leon@kernel.org \
--cc=linaro-mm-sig@lists.linaro.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-media@vger.kernel.org \
--cc=mngyadam@amazon.de \
--cc=praan@google.com \
--cc=sumit.semwal@linaro.org \
--cc=vivek.kasireddy@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox