From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: intel-xe@lists.freedesktop.org
Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>,
"Matthew Brost" <matthew.brost@intel.com>,
dri-devel@lists.freedesktop.org, himal.prasad.ghimiray@intel.com,
apopple@nvidia.com, airlied@gmail.com,
"Simona Vetter" <simona.vetter@ffwll.ch>,
felix.kuehling@amd.com,
"Christian König" <christian.koenig@amd.com>,
dakr@kernel.org, "Mrozek, Michal" <michal.mrozek@intel.com>,
"Joonas Lahtinen" <joonas.lahtinen@linux.intel.com>
Subject: [PATCH v5 06/24] drm/pagemap, drm/xe: Manage drm_pagemap provider lifetimes
Date: Thu, 18 Dec 2025 17:20:43 +0100 [thread overview]
Message-ID: <20251218162101.605379-7-thomas.hellstrom@linux.intel.com> (raw)
In-Reply-To: <20251218162101.605379-1-thomas.hellstrom@linux.intel.com>
If a device holds a reference on a foregin device's drm_pagemap,
and a device unbind is executed on the foreign device,
Typically that foreign device would evict its device-private
pages and then continue its device-managed cleanup eventually
releasing its drm device and possibly allow for module unload.
However, since we're still holding a reference on a drm_pagemap,
when that reference is released and the provider module is
unloaded we'd execute out of undefined memory.
Therefore keep a reference on the provider device and module until
the last drm_pagemap reference is gone.
Note that in theory, the drm_gpusvm_helper module may be unloaded
as soon as the final module_put() of the provider driver module is
executed, so we need to add a module_exit() function that waits
for the work item executing the module_put() has completed.
v2:
- Better commit message (Matt Brost)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/drm_pagemap.c | 101 ++++++++++++++++++++++++++++++++--
drivers/gpu/drm/xe/xe_svm.c | 15 ++++-
include/drm/drm_pagemap.h | 10 +++-
3 files changed, 117 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
index 4cc57ced4993..97a2b5561ab5 100644
--- a/drivers/gpu/drm/drm_pagemap.c
+++ b/drivers/gpu/drm/drm_pagemap.c
@@ -9,6 +9,7 @@
#include <linux/pagemap.h>
#include <drm/drm_drv.h>
#include <drm/drm_pagemap.h>
+#include <drm/drm_print.h>
/**
* DOC: Overview
@@ -549,16 +550,92 @@ static int drm_pagemap_migrate_populate_ram_pfn(struct vm_area_struct *vas,
return -ENOMEM;
}
+static void drm_pagemap_dev_unhold_work(struct work_struct *work);
+static LLIST_HEAD(drm_pagemap_unhold_list);
+static DECLARE_WORK(drm_pagemap_work, drm_pagemap_dev_unhold_work);
+
+/**
+ * struct drm_pagemap_dev_hold - Struct to aid in drm_device release.
+ * @link: Link into drm_pagemap_unhold_list for deferred reference releases.
+ * @drm: drm device to put.
+ *
+ * When a struct drm_pagemap is released, we also need to release the
+ * reference it holds on the drm device. However, typically that needs
+ * to be done separately from a system-wide workqueue.
+ * Each time a struct drm_pagemap is initialized
+ * (or re-initialized if cached) therefore allocate a separate
+ * drm_pagemap_dev_hold item, from which we put the drm device and
+ * associated module.
+ */
+struct drm_pagemap_dev_hold {
+ struct llist_node link;
+ struct drm_device *drm;
+};
+
static void drm_pagemap_release(struct kref *ref)
{
struct drm_pagemap *dpagemap = container_of(ref, typeof(*dpagemap), ref);
-
+ struct drm_pagemap_dev_hold *dev_hold = dpagemap->dev_hold;
+
+ /*
+ * We know the pagemap provider is alive at this point, since
+ * the struct drm_pagemap_dev_hold holds a reference to the
+ * pagemap provider drm_device and its module.
+ */
+ dpagemap->dev_hold = NULL;
kfree(dpagemap);
+ llist_add(&dev_hold->link, &drm_pagemap_unhold_list);
+ schedule_work(&drm_pagemap_work);
+ /*
+ * Here, either the provider device is still alive, since if called from
+ * page_free(), the caller is holding a reference on the dev_pagemap,
+ * or if called from drm_pagemap_put(), the direct caller is still alive.
+ * This ensures we can't race with THIS module unload.
+ */
+}
+
+static void drm_pagemap_dev_unhold_work(struct work_struct *work)
+{
+ struct llist_node *node = llist_del_all(&drm_pagemap_unhold_list);
+ struct drm_pagemap_dev_hold *dev_hold, *next;
+
+ /*
+ * Deferred release of drm_pagemap provider device and module.
+ * THIS module is kept alive during the release by the
+ * flush_work() in the drm_pagemap_exit() function.
+ */
+ llist_for_each_entry_safe(dev_hold, next, node, link) {
+ struct drm_device *drm = dev_hold->drm;
+ struct module *module = drm->driver->fops->owner;
+
+ drm_dbg(drm, "Releasing reference on provider device and module.\n");
+ drm_dev_put(drm);
+ module_put(module);
+ kfree(dev_hold);
+ }
+}
+
+static struct drm_pagemap_dev_hold *
+drm_pagemap_dev_hold(struct drm_pagemap *dpagemap)
+{
+ struct drm_pagemap_dev_hold *dev_hold;
+ struct drm_device *drm = dpagemap->drm;
+
+ dev_hold = kzalloc(sizeof(*dev_hold), GFP_KERNEL);
+ if (!dev_hold)
+ return ERR_PTR(-ENOMEM);
+
+ init_llist_node(&dev_hold->link);
+ dev_hold->drm = drm;
+ (void)try_module_get(drm->driver->fops->owner);
+ drm_dev_get(drm);
+
+ return dev_hold;
}
/**
* drm_pagemap_create() - Create a struct drm_pagemap.
- * @dev: Pointer to a struct device providing the device-private memory.
+ * @drm: Pointer to a struct drm_device providing the device-private memory.
* @pagemap: Pointer to a pre-setup struct dev_pagemap providing the struct pages.
* @ops: Pointer to the struct drm_pagemap_ops.
*
@@ -568,20 +645,28 @@ static void drm_pagemap_release(struct kref *ref)
* Error pointer on error.
*/
struct drm_pagemap *
-drm_pagemap_create(struct device *dev,
+drm_pagemap_create(struct drm_device *drm,
struct dev_pagemap *pagemap,
const struct drm_pagemap_ops *ops)
{
struct drm_pagemap *dpagemap = kzalloc(sizeof(*dpagemap), GFP_KERNEL);
+ struct drm_pagemap_dev_hold *dev_hold;
if (!dpagemap)
return ERR_PTR(-ENOMEM);
kref_init(&dpagemap->ref);
- dpagemap->dev = dev;
+ dpagemap->drm = drm;
dpagemap->ops = ops;
dpagemap->pagemap = pagemap;
+ dev_hold = drm_pagemap_dev_hold(dpagemap);
+ if (IS_ERR(dev_hold)) {
+ kfree(dpagemap);
+ return ERR_CAST(dev_hold);
+ }
+ dpagemap->dev_hold = dev_hold;
+
return dpagemap;
}
EXPORT_SYMBOL(drm_pagemap_create);
@@ -933,3 +1018,11 @@ int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap,
return err;
}
EXPORT_SYMBOL(drm_pagemap_populate_mm);
+
+static void drm_pagemap_exit(void)
+{
+ flush_work(&drm_pagemap_work);
+ if (WARN_ON(!llist_empty(&drm_pagemap_unhold_list)))
+ disable_work_sync(&drm_pagemap_work);
+}
+module_exit(drm_pagemap_exit);
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index b72d523a4416..b72bc989e7ee 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -1479,7 +1479,7 @@ xe_drm_pagemap_device_map(struct drm_pagemap *dpagemap,
unsigned int order,
enum dma_data_direction dir)
{
- struct device *pgmap_dev = dpagemap->dev;
+ struct device *pgmap_dev = dpagemap->drm->dev;
enum drm_interconnect_protocol prot;
dma_addr_t addr;
@@ -1499,6 +1499,14 @@ static const struct drm_pagemap_ops xe_drm_pagemap_ops = {
.populate_mm = xe_drm_pagemap_populate_mm,
};
+static void xe_devm_release(void *data)
+{
+ struct xe_vram_region *vr = data;
+
+ drm_pagemap_put(vr->dpagemap);
+ vr->dpagemap = NULL;
+}
+
/**
* xe_devm_add: Remap and provide memmap backing for device memory
* @tile: tile that the memory region belongs to
@@ -1524,7 +1532,7 @@ int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr)
return ret;
}
- vr->dpagemap = drm_pagemap_create(dev, &vr->pagemap,
+ vr->dpagemap = drm_pagemap_create(&xe->drm, &vr->pagemap,
&xe_drm_pagemap_ops);
if (IS_ERR(vr->dpagemap)) {
drm_err(&xe->drm, "Failed to create drm_pagemap tile %d memory: %pe\n",
@@ -1532,6 +1540,9 @@ int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr)
ret = PTR_ERR(vr->dpagemap);
goto out_no_dpagemap;
}
+ ret = devm_add_action_or_reset(dev, xe_devm_release, vr);
+ if (ret)
+ goto out_no_dpagemap;
vr->pagemap.type = MEMORY_DEVICE_PRIVATE;
vr->pagemap.range.start = res->start;
diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h
index 093e7199c44b..b3edcdde4454 100644
--- a/include/drm/drm_pagemap.h
+++ b/include/drm/drm_pagemap.h
@@ -10,6 +10,7 @@
struct dma_fence;
struct drm_pagemap;
+struct drm_pagemap_dev_hold;
struct drm_pagemap_zdd;
struct device;
@@ -131,14 +132,17 @@ struct drm_pagemap_ops {
* used for device p2p handshaking.
* @ops: The struct drm_pagemap_ops.
* @ref: Reference count.
- * @dev: The struct drevice owning the device-private memory.
+ * @drm: The struct drm device owning the device-private memory.
* @pagemap: Pointer to the underlying dev_pagemap.
+ * @dev_hold: Pointer to a struct drm_pagemap_dev_hold for
+ * device referencing.
*/
struct drm_pagemap {
const struct drm_pagemap_ops *ops;
struct kref ref;
- struct device *dev;
+ struct drm_device *drm;
struct dev_pagemap *pagemap;
+ struct drm_pagemap_dev_hold *dev_hold;
};
struct drm_pagemap_devmem;
@@ -213,7 +217,7 @@ struct drm_pagemap_devmem_ops {
struct dma_fence *pre_migrate_fence);
};
-struct drm_pagemap *drm_pagemap_create(struct device *dev,
+struct drm_pagemap *drm_pagemap_create(struct drm_device *drm,
struct dev_pagemap *pagemap,
const struct drm_pagemap_ops *ops);
--
2.51.1
next prev parent reply other threads:[~2025-12-18 16:21 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-12-18 16:20 [PATCH v5 00/24] Dynamic drm_pagemaps and Initial multi-device SVM Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 01/24] drm/xe/svm: Fix a debug printout Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 02/24] drm/pagemap: Remove some dead code Thomas Hellström
2025-12-18 18:16 ` Matthew Brost
2025-12-18 16:20 ` [PATCH v5 03/24] drm/pagemap, drm/xe: Ensure that the devmem allocation is idle before use Thomas Hellström
2025-12-18 18:33 ` Matthew Brost
2025-12-18 19:18 ` Thomas Hellström
2025-12-18 19:33 ` Matthew Brost
2025-12-18 16:20 ` [PATCH v5 04/24] drm/pagemap, drm/xe: Add refcounting to struct drm_pagemap Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 05/24] drm/pagemap: Add a refcounted drm_pagemap backpointer to struct drm_pagemap_zdd Thomas Hellström
2025-12-18 16:20 ` Thomas Hellström [this message]
2025-12-18 16:20 ` [PATCH v5 07/24] drm/pagemap: Add a drm_pagemap cache and shrinker Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 08/24] drm/xe: Use the " Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 09/24] drm/pagemap: Remove the drm_pagemap_create() interface Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 10/24] drm/pagemap_util: Add a utility to assign an owner to a set of interconnected gpus Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 11/24] drm/xe: Use the drm_pagemap_util helper to get a svm pagemap owner Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 12/24] drm/xe: Pass a drm_pagemap pointer around with the memory advise attributes Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 13/24] drm/xe: Use the vma attibute drm_pagemap to select where to migrate Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 14/24] drm/xe: Simplify madvise_preferred_mem_loc() Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 15/24] drm/xe/uapi: Extend the madvise functionality to support foreign pagemap placement for svm Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 16/24] drm/xe: Support pcie p2p dma as a fast interconnect Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 17/24] drm/xe/vm: Add a couple of VM debug printouts Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 18/24] drm/xe/svm: Document how xe keeps drm_pagemap references Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 19/24] drm/pagemap, drm/xe: Clean up the use of the device-private page owner Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 20/24] drm/gpusvm: Introduce a function to scan the current migration state Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 21/24] drm/xe: Use drm_gpusvm_scan_mm() Thomas Hellström
2025-12-18 16:20 ` [PATCH v5 22/24] drm/pagemap, drm/xe: Support destination migration over interconnect Thomas Hellström
2025-12-18 18:40 ` Matthew Brost
2025-12-18 16:21 ` [PATCH v5 23/24] drm/pagemap: Support source " Thomas Hellström
2025-12-18 20:36 ` Matthew Brost
2025-12-18 23:01 ` Matthew Brost
2025-12-18 16:21 ` [PATCH v5 24/24] drm/xe/svm: Serialize migration to device if racing Thomas Hellström
2025-12-18 19:03 ` Matthew Brost
2025-12-18 16:58 ` ✗ CI.checkpatch: warning for Dynamic drm_pagemaps and Initial multi-device SVM (rev6) Patchwork
2025-12-18 16:59 ` ✓ CI.KUnit: success " Patchwork
2025-12-18 17:38 ` ✓ Xe.CI.BAT: " Patchwork
2025-12-19 14:56 ` ✓ Xe.CI.Full: " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251218162101.605379-7-thomas.hellstrom@linux.intel.com \
--to=thomas.hellstrom@linux.intel.com \
--cc=airlied@gmail.com \
--cc=apopple@nvidia.com \
--cc=christian.koenig@amd.com \
--cc=dakr@kernel.org \
--cc=dri-devel@lists.freedesktop.org \
--cc=felix.kuehling@amd.com \
--cc=himal.prasad.ghimiray@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=joonas.lahtinen@linux.intel.com \
--cc=matthew.brost@intel.com \
--cc=michal.mrozek@intel.com \
--cc=simona.vetter@ffwll.ch \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.