Linux Documentation
 help / color / mirror / Atom feed
* [PATCH v4 3/3] drm/panthor: Reduce padding in gems debugfs for refcount
From: Nicolas Frattaroli @ 2026-05-20 13:04 UTC (permalink / raw)
  To: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, Boris Brezillon, Steven Price, Liviu Dudau,
	Jonathan Corbet, Shuah Khan, Tvrtko Ursulin
  Cc: dri-devel, linux-kernel, kernel, linux-doc, Nicolas Frattaroli
In-Reply-To: <20260520-panthor-bo-reclaim-observability-v4-0-a47ab61cb80d@collabora.com>

The "gems" debugfs file is getting a little too wide for comfort. While
a lot of this is unavoidable due to the theoretical upper limits of
numbers here (e.g. size needs to be 16 chars because 2**48-1 in decimal
is 15 digits, plus one space for separation), the refcount column has a
decent 5 characters to be saved, as it can only ever contain a 10-digit
decimal number.

Reduce the refcount column's width to 11, which fulfils this requirement
with an additional space for separation.

Reviewed-by: Steven Price <steven.price@arm.com>
Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_gem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index 068aa935c8fc..dfdcda3d0189 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -1644,7 +1644,7 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
 
 	snprintf(creator_info, sizeof(creator_info),
 		 "%s/%d", bo->debugfs.creator.process_name, bo->debugfs.creator.tgid);
-	seq_printf(m, "%-32s%-16d%-16d%-11d%-16zd%-16zd0x%-16lx",
+	seq_printf(m, "%-32s%-16d%-11d%-11d%-16zd%-16zd0x%-16lx",
 		   creator_info,
 		   bo->base.name,
 		   refcount,
@@ -1681,8 +1681,8 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
 
 	panthor_gem_debugfs_print_flag_names(m);
 
-	seq_puts(m, "created-by                      global-name     refcount        evictions  size            resident-size   file-offset       state      usage       label\n");
-	seq_puts(m, "---------------------------------------------------------------------------------------------------------------------------------------------------------\n");
+	seq_puts(m, "created-by                      global-name     refcount   evictions  size            resident-size   file-offset       state      usage       label\n");
+	seq_puts(m, "----------------------------------------------------------------------------------------------------------------------------------------------------\n");
 
 	scoped_guard(mutex, &ptdev->gems.lock) {
 		list_for_each_entry(bo, &ptdev->gems.node, debugfs.node) {
@@ -1690,7 +1690,7 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
 		}
 	}
 
-	seq_puts(m, "=========================================================================================================================================================\n");
+	seq_puts(m, "====================================================================================================================================================\n");
 	seq_printf(m, "Total size: %zd, Total resident: %zd, Total reclaimable: %zd\n",
 		   totals.size, totals.resident, totals.reclaimable);
 }

-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v4 2/3] drm/panthor: Implement evicted status for GEM objects
From: Boris Brezillon @ 2026-05-20 13:24 UTC (permalink / raw)
  To: Nicolas Frattaroli
  Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, Steven Price, Liviu Dudau, Jonathan Corbet,
	Shuah Khan, Tvrtko Ursulin, dri-devel, linux-kernel, kernel,
	linux-doc
In-Reply-To: <20260520-panthor-bo-reclaim-observability-v4-2-a47ab61cb80d@collabora.com>

On Wed, 20 May 2026 15:04:49 +0200
Nicolas Frattaroli <nicolas.frattaroli@collabora.com> wrote:

> For fdinfo to be able to fill its evicted counter with data, panthor
> needs to keep track of whether a GEM object has ever been reclaimed.
> Just checking whether the pages are resident isn't enough, as newly
> allocated objects also won't be resident.
> 
> Do this with a new atomic_t member on panthor_gem_object. It's increased
> when an object gets evicted by the shrinker, and saturates at INT_MAX.
> This means that once an object has been evicted at least once, its
> reclaim counter will never return to 0.
> 
> Due to this, it's possible to distinguish evicted non-resident pages
> from newly allocated non-resident pages by checking whether
> reclaimed_count is != 0
> 
> Use this new member to then set the appropriate DRM_GEM_OBJECT_EVICTED
> status flag for fdinfo.
> 
> Also add a new column and status flag to the panthor gems debugfs: the
> column is the number of times an object has been evicted, whereas the
> flag indicates whether it currently is evicted.
> 
> Reviewed-by: Steven Price <steven.price@arm.com>
> Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>

> ---
>  drivers/gpu/drm/panthor/panthor_gem.c | 18 ++++++++++++++----
>  drivers/gpu/drm/panthor/panthor_gem.h | 10 ++++++++++
>  2 files changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
> index 13295d7a593d..068aa935c8fc 100644
> --- a/drivers/gpu/drm/panthor/panthor_gem.c
> +++ b/drivers/gpu/drm/panthor/panthor_gem.c
> @@ -687,6 +687,8 @@ static void panthor_gem_evict_locked(struct panthor_gem_object *bo)
>  	if (drm_WARN_ON_ONCE(bo->base.dev, !bo->backing.pages))
>  		return;
>  
> +	atomic_add_unless(&bo->reclaimed_count, 1, INT_MAX);
> +
>  	panthor_gem_dev_map_cleanup_locked(bo);
>  	panthor_gem_backing_cleanup_locked(bo);
>  	panthor_gem_update_reclaim_state_locked(bo, NULL);
> @@ -788,6 +790,8 @@ static enum drm_gem_object_status panthor_gem_status(struct drm_gem_object *obj)
>  
>  	if (drm_gem_is_imported(&bo->base) || bo->backing.pages)
>  		res |= DRM_GEM_OBJECT_RESIDENT;
> +	else if (atomic_read(&bo->reclaimed_count))
> +		res |= DRM_GEM_OBJECT_EVICTED;
>  
>  	return res;
>  }
> @@ -1595,6 +1599,7 @@ static void panthor_gem_debugfs_print_flag_names(struct seq_file *m)
>  	static const char * const gem_state_flags_names[] = {
>  		[PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT] = "imported",
>  		[PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT] = "exported",
> +		[PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT] = "evicted",
>  	};
>  
>  	static const char * const gem_usage_flags_names[] = {
> @@ -1625,6 +1630,7 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
>  {
>  	enum panthor_gem_reclaim_state reclaim_state = bo->reclaim_state;
>  	unsigned int refcount = kref_read(&bo->base.refcount);
> +	int reclaimed_count = atomic_read(&bo->reclaimed_count);
>  	char creator_info[32] = {};
>  	size_t resident_size;
>  	u32 gem_usage_flags = bo->debugfs.flags;
> @@ -1638,16 +1644,20 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
>  
>  	snprintf(creator_info, sizeof(creator_info),
>  		 "%s/%d", bo->debugfs.creator.process_name, bo->debugfs.creator.tgid);
> -	seq_printf(m, "%-32s%-16d%-16d%-16zd%-16zd0x%-16lx",
> +	seq_printf(m, "%-32s%-16d%-16d%-11d%-16zd%-16zd0x%-16lx",
>  		   creator_info,
>  		   bo->base.name,
>  		   refcount,
> +		   reclaimed_count,
>  		   bo->base.size,
>  		   resident_size,
>  		   drm_vma_node_start(&bo->base.vma_node));
>  
>  	if (drm_gem_is_imported(&bo->base))
>  		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED;
> +	else if (!resident_size && reclaimed_count)
> +		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED;
> +
>  	if (bo->base.dma_buf)
>  		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED;
>  
> @@ -1671,8 +1681,8 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
>  
>  	panthor_gem_debugfs_print_flag_names(m);
>  
> -	seq_puts(m, "created-by                      global-name     refcount        size            resident-size   file-offset       state      usage       label\n");
> -	seq_puts(m, "----------------------------------------------------------------------------------------------------------------------------------------------\n");
> +	seq_puts(m, "created-by                      global-name     refcount        evictions  size            resident-size   file-offset       state      usage       label\n");
> +	seq_puts(m, "---------------------------------------------------------------------------------------------------------------------------------------------------------\n");
>  
>  	scoped_guard(mutex, &ptdev->gems.lock) {
>  		list_for_each_entry(bo, &ptdev->gems.node, debugfs.node) {
> @@ -1680,7 +1690,7 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
>  		}
>  	}
>  
> -	seq_puts(m, "==============================================================================================================================================\n");
> +	seq_puts(m, "=========================================================================================================================================================\n");
>  	seq_printf(m, "Total size: %zd, Total resident: %zd, Total reclaimable: %zd\n",
>  		   totals.size, totals.resident, totals.reclaimable);
>  }
> diff --git a/drivers/gpu/drm/panthor/panthor_gem.h b/drivers/gpu/drm/panthor/panthor_gem.h
> index ae0491d0b121..56d63137b4eb 100644
> --- a/drivers/gpu/drm/panthor/panthor_gem.h
> +++ b/drivers/gpu/drm/panthor/panthor_gem.h
> @@ -19,12 +19,16 @@ struct panthor_vm;
>  enum panthor_debugfs_gem_state_flags {
>  	PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT = 0,
>  	PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT = 1,
> +	PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT = 2,
>  
>  	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED: GEM BO is PRIME imported. */
>  	PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT),
>  
>  	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED: GEM BO is PRIME exported. */
>  	PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT),
> +
> +	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED: GEM BO is evicted to swap. */
> +	PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT),
>  };
>  
>  enum panthor_debugfs_gem_usage_flags {
> @@ -172,6 +176,12 @@ struct panthor_gem_object {
>  	/** @reclaim_state: Cached reclaim state */
>  	enum panthor_gem_reclaim_state reclaim_state;
>  
> +	/**
> +	 * @reclaimed_count: How many times object has been evicted to swap.
> +	 * The count saturates at %INT_MAX and will never wrap around to 0.
> +	 */
> +	atomic_t reclaimed_count;
> +
>  	/**
>  	 * @exclusive_vm_root_gem: Root GEM of the exclusive VM this GEM object
>  	 * is attached to.
> 


^ permalink raw reply

* Re: [PATCH v6 06/43] KVM: x86/mmu: Bug the VM if gmem attributes are queried to determine max mapping level
From: Fuad Tabba @ 2026-05-20 13:33 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-6-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> When the maximum mapping level is queried, KVM's MMU lock is held, and
> while the MMU lock is held, guest_memfd cannot take the
> filemap_invalidate_lock() to look up the current shared/private state of
> the gfn, for these reasons:
>
> + The MMU lock is a spinlock or rwlock and cannot be held while taking a
>   lock that can sleep.
> + In guest_memfd's code paths (such as truncate), the
>   filemap_invalidate_lock() is held while taking the MMU lock, and taking
>   the locks in reverse order would introduce a AB-BA deadlock.
>
> Currently, the maximum mapping level is only queried from guest_memfd in
> the process of recovering huge pages, if dirty logging is disabled on a
> memslot. Dirty logging is not currently supported for guest_memfd, and
> guest_memfd memslots also cannot be updated.
>
> For now, bug the VM if guest_memfd needs to be queried to determine the
> maximum mapping level. This guard can be removed if/when support is added.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index a80a876ab4ad6..153bcc5369985 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3357,6 +3357,15 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
>                 max_level = fault->max_level;
>                 is_private = fault->is_private;
>         } else {
> +               /*
> +                * Memory attributes cannot be obtained from guest_memfd while
> +                * the MMU lock is held.
> +                */
> +               if (KVM_BUG_ON(static_call_query(__kvm_get_memory_attributes) ==
> +                              kvm_gmem_get_memory_attributes, kvm)) {
> +                       return 0;
> +               }
> +

This directly takes the address of kvm_gmem_get_memory_attributes,
which is only compiled if CONFIG_KVM_GUEST_MEMFD=y. This breaks
ARCH=i386.

Cheers,
/fuad

>                 max_level = PG_LEVEL_NUM;
>                 is_private = kvm_mem_is_private(kvm, gfn);
>         }
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v3 1/3] soc: bcm2835: raspberrypi-firmware: Add voltage domain IDs
From: Guenter Roeck @ 2026-05-20 13:35 UTC (permalink / raw)
  To: Shubham Chakraborty
  Cc: Florian Fainelli, Jonathan Corbet, Shuah Khan,
	Broadcom internal kernel review list, Ray Jui, Scott Branden,
	linux-hwmon, linux-doc, linux-rpi-kernel, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260517080445.103962-2-chakrabortyshubham66@gmail.com>

On Sun, May 17, 2026 at 01:34:43PM +0530, Shubham Chakraborty wrote:
> Add Raspberry Pi firmware voltage domain identifiers for the mailbox
> property interface.
> 
> Also add the voltage request structure used with
> RPI_FIRMWARE_GET_VOLTAGE so firmware clients can share the common API
> definition from the firmware header.
> 
> Signed-off-by: Shubham Chakraborty <chakrabortyshubham66@gmail.com>
> Acked-by: Florian Fainelli <florian.fainelli@broadcom.com>

Applied.

Thanks,
Guenter

^ permalink raw reply

* Re: [PATCH v3 2/3] hwmon: raspberrypi: Add voltage input support
From: Guenter Roeck @ 2026-05-20 13:37 UTC (permalink / raw)
  To: Shubham Chakraborty
  Cc: Florian Fainelli, Jonathan Corbet, Shuah Khan,
	Broadcom internal kernel review list, Ray Jui, Scott Branden,
	linux-hwmon, linux-doc, linux-rpi-kernel, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260517080445.103962-3-chakrabortyshubham66@gmail.com>

On Sun, May 17, 2026 at 01:34:44PM +0530, Shubham Chakraborty wrote:
> Extend the raspberrypi-hwmon driver to expose firmware-provided
> voltage measurements through the hwmon subsystem.
> 
> The driver now exports the following voltage inputs:
> 
>   - in0_input (core)
>   - in1_input (sdram_c)
>   - in2_input (sdram_i)
>   - in3_input (sdram_p)
> 
> Voltage values returned by firmware are converted from microvolts
> to millivolts as expected by the hwmon subsystem.
> 
> Update the documentation related to it.
> 
> The existing undervoltage sticky alarm handling is preserved and
> associated with the first voltage channel.
> 
> Tested in -
> - Raspberry Pi 3b+ (Linux raspberrypi 6.12.75+rpt-rpi-v8 #1 SMP PREEMPT
>   Debian 1:6.12.75-1+rpt1 (2026-03-11) aarch64 GNU/Linux)
> 
> Signed-off-by: Shubham Chakraborty <chakrabortyshubham66@gmail.com>
> Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>

Applied, after fixing:

WARNING: Missing a blank line after declarations
#207: FILE: drivers/hwmon/raspberrypi-hwmon.c:74:
+	int ret;
+	ret = rpi_firmware_property(data->fw, RPI_FIRMWARE_GET_VOLTAGE,

Please run checkpatch on your patches.

Thanks,
Guenter

^ permalink raw reply

* Re: [PATCH v3 3/3] hwmon: raspberrypi: Fix delayed-work teardown race
From: Guenter Roeck @ 2026-05-20 13:39 UTC (permalink / raw)
  To: Shubham Chakraborty
  Cc: Florian Fainelli, Jonathan Corbet, Shuah Khan,
	Broadcom internal kernel review list, Ray Jui, Scott Branden,
	linux-hwmon, linux-doc, linux-rpi-kernel, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260517080445.103962-4-chakrabortyshubham66@gmail.com>

On Sun, May 17, 2026 at 01:34:45PM +0530, Shubham Chakraborty wrote:
> The delayed polling work rearms itself from the work function, so use
> explicit delayed-work setup and cleanup instead of
> devm_delayed_work_autocancel().
> 
> Initialize the delayed work with INIT_DELAYED_WORK() and register a
> devres cleanup action that calls disable_delayed_work_sync() during
> teardown.
> 
> This addresses the concern raised during review about the polling work
> being able to requeue itself while the driver is being removed.
> 
> Signed-off-by: Shubham Chakraborty <chakrabortyshubham66@gmail.com>

Applied.

Thanks,
Guenter

^ permalink raw reply

* [PATCH v10 00/25] Runtime TDX module update support
From: Chao Gao @ 2026-05-20 13:38 UTC (permalink / raw)
  To: kvm, linux-coco, x86, linux-kernel, linux-rt-devel, linux-doc
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Jonathan Corbet, Shuah Khan

Hi Dave & Rick,

Thanks for your thorough review of v9. This v10 addresses the issues you
pointed out. The main changes in this version are polishing changelogs
and variable renames to improve readability. Specifically:
 
   - Patches 1-2 (new): Split the original "Consolidate TDX global
     initialization states" into two steps — first move the statics to
     file scope, then clarify the result-caching logic in
     try_init_module_global().
   - Patch 6: Removed user-facing Kconfig help text for TDX_HOST_SERVICES
     (now a silent tristate auto-selected by INTEL_TDX_HOST).
   - Patch 13: Renamed "size" to "data_len" in seamldr_install_module()
     and init_seamldr_params(); renamed "HEADER_SIZE" to
     "TDX_IMAGE_HEADER_SIZE"; renamed "primary" to "is_lead_cpu" in the
     update state machine.
   - Patch 13: Added early data_len validation and explicit bounds checks
     on sigstruct_nr_pages/module_nr_pages against SEAMLDR_MAX_NR_*
     limits, removing the implicit clamping in populate_pa_list().
   - Patch 22: Fixed BIT(16) -> BIT_ULL(16) for
     TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE.
   - Patch 22: Removed unused TDX_FEATURES0_UPDATE_COMPAT definition.
   - Various patches: Shortened sysfs ABI descriptions, tightened
     comments across seamldr.h and seamldr.c, and minor style fixes
     (return 0 -> return false, unfolded conditionals)

Please take a look at this new version. I hope it can still be merged
for 7.2.
---

(For transparency, note that I used AI tools to help proofread this
cover-letter and commit messages)

This series adds support for runtime TDX module updates that preserve
running TDX guests. It is also available at:

  https://github.com/gaochaointel/linux-dev/commits/tdx-module-updates-v10/

== Background ==

Intel TDX isolates Trusted Domains (TDs), or confidential guests, from the
host. A key component of Intel TDX is the TDX module, which enforces
security policies to protect the memory and CPU states of TDs from the
host. However, the TDX module is software that requires updates.

== Problems ==

Currently, the TDX module is loaded by the BIOS at boot time, and the only
way to update it is through a reboot, which results in significant system
downtime. Users expect the TDX module to be updatable at runtime without
disrupting TDX guests.

== Solution ==

On TDX platforms, P-SEAMLDR[1] is a component within the protected SEAM
range. It is loaded by the BIOS and provides the host with functions to
install a TDX module at runtime.

This series implements runtime TDX module updates through the fw_upload
mechanism. That interface is a good fit because TDX module selection is not
a simple "load a known file from disk" problem. The update image to load
depends on module versioning, compatibility rules. fw_upload lets userspace
choose the module explicitly while the kernel provides the update
mechanism.

This design intentionally keeps most update validation/policy in userspace.
The kernel exposes the information userspace needs, such as TDX module
version and P-SEAMLDR information, but userspace is responsible for
understanding TDX module's versioning and compatibility rules and for
choosing an appropriate update image (see "TDX module versioning" below).

The kernel still enforces the pieces that must be handled in-kernel:

1. Validate the tdx_blob header fields that are not passed through tothe
TDX module. Just the standard overflow and reserved bits defensive ABI stuff.

2. Make sure no non-update SEAMCALLs are called during the update.

3. Make sure SEAMCALLs are on the right CPU, for any the user has made
available to the kernel.

4. Handle the race between updates and concurrent TD builds by
returning -EBUSY to userspace.

Everything else remains a userspace responsibility.

In the unlikely event the update fails, for example userspace picks an
incompatible update image, or the image is otherwise corrupted, all TDs
will experience SEAMCALL failures and be killed. The recovery of TD
operation from that event requires a reboot.

Given there is no mechanism to quiesce SEAMCALLs, the TDs themselves must
pause execution over an update. The most straightforward way to meet the
'pause TDs while update executes' constraint is to run the update in
stop_machine() context. All other evaluated solutions export more
complexity to KVM, or exports more fragility to userspace.

== How to test this series ==

NOTE: This v10 uses a new tdx_blob format. The scripts and module blobs in
https://github.com/intel/tdx-module-binaries have not yet been updated
to match this version. Those updates will be done separately later.

== Other information relevant to Runtime TDX module updates ==

=== TDX module versioning ===

Each TDX module is assigned a version number x.y.z, where x represents the
"major" version, y the "minor" version, and z the "update" version.

Runtime TDX module updates are restricted to Z-stream releases.

Note that Z-stream releases do not necessarily guarantee compatibility. A
new release may not be compatible with all previous versions. To address this,
Intel provides a separate file containing compatibility information, which
specifies the minimum module version required for a particular update. This
information is referenced by the tool to determine if two modules are
compatible.

=== TCB Stability ===

Updates change the TCB as viewed by attestation reports. In TDX there is
a distinction between "launch-time" version and "current" version where
runtime TDX module updates cause that "current" version number to change,
subject to Z-stream constraints.

The concern that a malicious host may attack confidential VMs by loading
insecure updates was addressed by Alex in [3]. Similarly, the scenario
where some "theoretical paranoid tenant" in the cloud wants to audit
updates and stop trusting the host after updates until audit completion
was also addressed in [4]. Users not in the cloud control the host machine
and can manage updates themselves, so they don't have these concerns.

See more about the implications of current TCB version changes in
attestation as summarized by Dave in [5].

=== TDX module Distribution Model ===

At a high level, Intel publishes all TDX modules on the github [2], along
with a mapping_file.json which documents the compatibility information
about each TDX module and a userspace tool to install the TDX module. OS
vendors can package these modules and distribute them. Administrators
install the package and use the tool to select the appropriate TDX module
and install it via the interfaces exposed by this series.

[1]: https://cdrdv2.intel.com/v1/dl/getContent/733584
[2]: https://github.com/intel/tdx-module-binaries
[3]: https://lore.kernel.org/all/665c5ae0-4b7c-4852-8995-255adf7b3a2f@amazon.com/
[4]: https://lore.kernel.org/all/5d1da767-491b-4077-b472-2cc3d73246d6@amazon.com/
[5]: https://lore.kernel.org/all/94d6047e-3b7c-4bc1-819c-85c16ff85abf@intel.com/



Chao Gao (24):
  x86/virt/tdx: Clarify try_init_module_global() result caching
  x86/virt/tdx: Move TDX global initialization states to file scope
  x86/virt/tdx: Consolidate TDX global initialization states
  x86/virt/tdx: Move TDX_FEATURES0 bits to asm/tdx.h
  coco/tdx-host: Introduce a "tdx_host" device
  coco/tdx-host: Expose TDX module version
  x86/virt/seamldr: Introduce a wrapper for P-SEAMLDR SEAMCALLs
  x86/virt/seamldr: Add a helper to retrieve P-SEAMLDR information
  coco/tdx-host: Expose P-SEAMLDR information via sysfs
  coco/tdx-host: Don't expose P-SEAMLDR information on CPUs with erratum
  coco/tdx-host: Implement firmware upload sysfs ABI for TDX module
    updates
  x86/virt/seamldr: Allocate and populate a module update request
  x86/virt/seamldr: Introduce skeleton for TDX module updates
  x86/virt/seamldr: Abort updates after a failed step
  x86/virt/seamldr: Shut down the current TDX module
  x86/virt/tdx: Reset software states during TDX module shutdown
  x86/virt/seamldr: Install a new TDX module
  x86/virt/seamldr: Do TDX global and per-CPU init after module
    installation
  x86/virt/tdx: Restore TDX module state
  x86/virt/tdx: Refresh TDX module version after update
  x86/virt/tdx: Reject updates during compatibility-sensitive operations
  x86/virt/tdx: Enable TDX module runtime updates
  coco/tdx-host: Document TDX module update compatibility criteria
  x86/virt/tdx: Document TDX module update

Kai Huang (1):
  x86/virt/tdx: Move low level SEAMCALL helpers out of <asm/tdx.h>

 .../ABI/testing/sysfs-devices-faux-tdx-host   |  66 ++++
 Documentation/arch/x86/tdx.rst                |  34 ++
 arch/x86/include/asm/cpufeatures.h            |   1 +
 arch/x86/include/asm/seamldr.h                |  36 ++
 arch/x86/include/asm/tdx.h                    |  66 +---
 arch/x86/include/asm/tdx_global_metadata.h    |   4 +
 arch/x86/include/asm/vmx.h                    |   1 +
 arch/x86/virt/vmx/tdx/Makefile                |   2 +-
 arch/x86/virt/vmx/tdx/seamcall_internal.h     | 109 ++++++
 arch/x86/virt/vmx/tdx/seamldr.c               | 324 ++++++++++++++++++
 arch/x86/virt/vmx/tdx/tdx.c                   | 169 +++++----
 arch/x86/virt/vmx/tdx/tdx.h                   |   8 +-
 arch/x86/virt/vmx/tdx/tdx_global_metadata.c   |  17 +-
 drivers/virt/coco/Kconfig                     |   2 +
 drivers/virt/coco/Makefile                    |   1 +
 drivers/virt/coco/tdx-host/Kconfig            |   6 +
 drivers/virt/coco/tdx-host/Makefile           |   1 +
 drivers/virt/coco/tdx-host/tdx-host.c         | 231 +++++++++++++
 18 files changed, 965 insertions(+), 113 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-faux-tdx-host
 create mode 100644 arch/x86/include/asm/seamldr.h
 create mode 100644 arch/x86/virt/vmx/tdx/seamcall_internal.h
 create mode 100644 arch/x86/virt/vmx/tdx/seamldr.c
 create mode 100644 drivers/virt/coco/tdx-host/Kconfig
 create mode 100644 drivers/virt/coco/tdx-host/Makefile
 create mode 100644 drivers/virt/coco/tdx-host/tdx-host.c


base-commit: 5209e5bfe5cab593476c3e7754e42c5e47ce36de
-- 
2.52.0


^ permalink raw reply

* [PATCH v10 25/25] x86/virt/tdx: Document TDX module update
From: Chao Gao @ 2026-05-20 13:38 UTC (permalink / raw)
  To: kvm, linux-coco, linux-kernel, linux-doc
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Jonathan Corbet, Shuah Khan
In-Reply-To: <20260520133909.409394-1-chao.gao@intel.com>

Document TDX module update as a subsection of "TDX Host Kernel Support" to
provide background information and cover key points that developers and
users may need to know, for example:

 - update is done in stop_machine() context
 - update instructions and results
 - update policy and tooling

Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
 Documentation/arch/x86/tdx.rst | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
index 1a3b5bac1021..9d2b7db166b5 100644
--- a/Documentation/arch/x86/tdx.rst
+++ b/Documentation/arch/x86/tdx.rst
@@ -73,6 +73,40 @@ initialize::
 
   [..] virt/tdx: TDX-Module initialization failed ...
 
+TDX module Runtime Update
+-------------------------
+
+The TDX architecture includes a persistent SEAM loader (P-SEAMLDR) that
+runs in SEAM mode separately from the TDX module. The kernel can
+communicate with P-SEAMLDR to perform runtime updates of the TDX module.
+
+During updates, the TDX module becomes unresponsive to other TDX
+operations. To prevent components using TDX (such as KVM) from
+experiencing unexpected errors during updates, updates are performed in
+stop_machine() context.
+
+TDX module updates have complex compatibility requirements; the new module
+must be compatible with the current CPU, P-SEAMLDR, and running TDX module.
+Rather than implementing complex module selection and policy enforcement
+logic in the kernel, userspace is responsible for auditing and selecting
+appropriate updates.
+
+Updates use the standard firmware upload interface. See
+Documentation/driver-api/firmware/fw_upload.rst for detailed instructions.
+
+If updates failed, running TDs may be killed and further TDX operations may
+not be possible until reboot. For detailed error information, see
+Documentation/ABI/testing/sysfs-devices-faux-tdx-host.
+
+Given the risk of losing existing TDs, userspace should verify that the
+update is compatible with the current system and properly validated before
+applying it.
+
+A reference userspace tool that implements necessary checks is available
+at:
+
+  https://github.com/intel/tdx-module-binaries
+
 TDX Interaction to Other Kernel Components
 ------------------------------------------
 
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v10 00/25] Runtime TDX module update support
From: Chao Gao @ 2026-05-20 13:46 UTC (permalink / raw)
  To: kvm, linux-coco, x86, linux-kernel, linux-rt-devel, linux-doc
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, H. Peter Anvin, Sebastian Andrzej Siewior,
	Clark Williams, Steven Rostedt, Jonathan Corbet, Shuah Khan
In-Reply-To: <20260520133909.409394-1-chao.gao@intel.com>

On Wed, May 20, 2026 at 06:38:03AM -0700, Chao Gao wrote:
>Hi Dave & Rick,
>
>Thanks for your thorough review of v9. This v10 addresses the issues you
>pointed out. The main changes in this version are polishing changelogs
>and variable renames to improve readability. Specifically:
> 
>   - Patches 1-2 (new): Split the original "Consolidate TDX global
>     initialization states" into two steps — first move the statics to
>     file scope, then clarify the result-caching logic in
>     try_init_module_global().
>   - Patch 6: Removed user-facing Kconfig help text for TDX_HOST_SERVICES
>     (now a silent tristate auto-selected by INTEL_TDX_HOST).
>   - Patch 13: Renamed "size" to "data_len" in seamldr_install_module()
>     and init_seamldr_params(); renamed "HEADER_SIZE" to
>     "TDX_IMAGE_HEADER_SIZE"; renamed "primary" to "is_lead_cpu" in the
>     update state machine.
>   - Patch 13: Added early data_len validation and explicit bounds checks
>     on sigstruct_nr_pages/module_nr_pages against SEAMLDR_MAX_NR_*
>     limits, removing the implicit clamping in populate_pa_list().
>   - Patch 22: Fixed BIT(16) -> BIT_ULL(16) for
>     TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE.
>   - Patch 22: Removed unused TDX_FEATURES0_UPDATE_COMPAT definition.
>   - Various patches: Shortened sysfs ABI descriptions, tightened
>     comments across seamldr.h and seamldr.c, and minor style fixes
>     (return 0 -> return false, unfolded conditionals)

FYI, below is the diff between v9 and v10:

diff --git a/Documentation/ABI/testing/sysfs-devices-faux-tdx-host b/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
index 9e08db231da1..5f18ac972468 100644
--- a/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
+++ b/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
@@ -1,16 +1,14 @@
 What:		/sys/devices/faux/tdx_host/version
 Contact:	linux-coco@lists.linux.dev
-Description:	(RO) Report the version of the loaded TDX module. The TDX module
-		version is formatted as x.y.z, where "x" is the major version,
-		"y" is the minor version and "z" is the update version. Versions
-		are used for bug reporting, TDX module updates etc.
+Description:	(RO) Report the version of the loaded TDX module.
+		Formatted as "major.minor.update". Used by TDX module
+		update tooling. Example: "1.2.03".
 
 What:		/sys/devices/faux/tdx_host/seamldr_version
 Contact:	linux-coco@lists.linux.dev
-Description:	(RO) Report the version of the loaded P-SEAMLDR. The P-SEAMLDR
-		version is formatted as x.y.z, where "x" is the major version,
-		"y" is the minor version and "z" is the update version. Versions
-		are used for bug reporting and compatibility checks.
+Description:	(RO) Report the version of the loaded P-SEAMLDR.
+		Formatted as a TDX module version. Used by TDX module
+		update tooling.
 
 What:		/sys/devices/faux/tdx_host/num_remaining_updates
 Contact:	linux-coco@lists.linux.dev
diff --git a/arch/x86/include/asm/seamldr.h b/arch/x86/include/asm/seamldr.h
index ac6f80f7208b..43084e2daa2d 100644
--- a/arch/x86/include/asm/seamldr.h
+++ b/arch/x86/include/asm/seamldr.h
@@ -5,11 +5,10 @@
 #include <linux/types.h>
 
 /*
- * This is called the "SEAMLDR_INFO" data structure and is defined
- * in "SEAM Loader (SEAMLDR) Interface Specification".
+ * This is the "SEAMLDR_INFO" data structure defined in the
+ * "SEAM Loader (SEAMLDR) Interface Specification".
  *
- * The SEAMLDR.INFO documentation requires this to be aligned to a
- * 256-byte boundary.
+ * Must be aligned to a 256-byte boundary.
  */
 struct seamldr_info {
	u32	version;
@@ -32,6 +31,6 @@ struct seamldr_info {
 static_assert(sizeof(struct seamldr_info) == 256);
 
 int seamldr_get_info(struct seamldr_info *seamldr_info);
-int seamldr_install_module(const u8 *data, u32 size);
+int seamldr_install_module(const u8 *data, u32 data_len);
 
 #endif /* _ASM_X86_SEAMLDR_H */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index ac042b369843..c848483d815f 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -36,7 +36,6 @@
 /* Bit definitions of TDX_FEATURES0 metadata field */
 #define TDX_FEATURES0_TD_PRESERVING	BIT_ULL(1)
 #define TDX_FEATURES0_NO_RBP_MOD	BIT_ULL(18)
-#define TDX_FEATURES0_UPDATE_COMPAT	BIT_ULL(47)
 
 #ifndef __ASSEMBLER__
 
diff --git a/arch/x86/virt/vmx/tdx/seamldr.c b/arch/x86/virt/vmx/tdx/seamldr.c
index 6a39c9e3ef7d..ff95d8dd1162 100644
--- a/arch/x86/virt/vmx/tdx/seamldr.c
+++ b/arch/x86/virt/vmx/tdx/seamldr.c
@@ -32,10 +32,12 @@
 #define SEAMLDR_SCENARIO_UPDATE		1
 
 /*
- * This is called the "SEAMLDR_PARAMS" data structure and is defined
- * in "SEAM Loader (SEAMLDR) Interface Specification".
+ * This is the "SEAMLDR_PARAMS" data structure defined in the
+ * "SEAM Loader (SEAMLDR) Interface Specification".
  *
- * It describes the TDX module that will be installed.
+ * It is the in-memory ABI that the kernel passes to the P-SEAMLDR
+ * to update the TDX module. It breaks the TDX module image up in
+ * page-size pieces.
  */
 struct seamldr_params {
	u32	version;
@@ -87,7 +89,7 @@ static int seamldr_install(const struct seamldr_params *params)
 #define TDX_IMAGE_VERSION_2		0x200
 
 struct tdx_image_header {
-	u16	version; // This ABI is always 0x200
+	u16	version;
	u16	checksum;
	u8	signature[8];
	u32	sigstruct_nr_pages;
@@ -95,23 +97,28 @@ struct tdx_image_header {
	u8	reserved[4076];
 } __packed;
 
-#define HEADER_SIZE sizeof(struct tdx_image_header)
-static_assert(HEADER_SIZE == 4096);
+#define TDX_IMAGE_HEADER_SIZE sizeof(struct tdx_image_header)
+static_assert(TDX_IMAGE_HEADER_SIZE == 4096);
 
-/* Intel TDX module update ABI structure. aka. "TDX module blob". */
+/*
+ * Intel TDX module update ABI structure. aka. "TDX module blob".
+ *
+ * @payload contains sigstruct pages followed by module pages.
+ */
 struct tdx_image {
	struct tdx_image_header header;
-	u8 payload[]; // Contains sigstruct pages followed by module pages
+	u8 payload[];
 };
 
-static void populate_pa_list(u64 *pa_list, u32 max_entries, const u8 *start, u32 nr_pages)
+static void populate_pa_list(u64 *pa_list, const u8 *vmalloc_addr, u32 vmalloc_len_pages)
 {
	int i;
 
-	nr_pages = MIN(nr_pages, max_entries);
-	for (i = 0; i < nr_pages; i++) {
-		pa_list[i] = vmalloc_to_pfn(start) << PAGE_SHIFT;
-		start += PAGE_SIZE;
+	for (i = 0; i < vmalloc_len_pages; i++) {
+		unsigned long offset = i * PAGE_SIZE;
+		unsigned long pfn = vmalloc_to_pfn(&vmalloc_addr[offset]);
+
+		pa_list[i] = pfn << PAGE_SHIFT;
	}
 }
 
@@ -123,39 +130,43 @@ static void populate_seamldr_params(struct seamldr_params *params,
	params->scenario		= SEAMLDR_SCENARIO_UPDATE;
	params->module_nr_pages		= mod_nr_pages;
 
-	populate_pa_list(params->sigstruct_pages_pa_list, SEAMLDR_MAX_NR_SIG_PAGES,
-			 sig, sig_nr_pages);
-	populate_pa_list(params->module_pages_pa_list, SEAMLDR_MAX_NR_MODULE_PAGES,
-			 mod, mod_nr_pages);
+	populate_pa_list(params->sigstruct_pages_pa_list, sig, sig_nr_pages);
+	populate_pa_list(params->module_pages_pa_list, mod, mod_nr_pages);
 }
 
-static int init_seamldr_params(struct seamldr_params *params, const u8 *data, u32 size)
+/*
+ * @image points to a vmalloc()'d 'struct tdx_image'. Transform
+ * it into @params which is the P-SEAMLDR ABI format.
+ */
+static int init_seamldr_params(struct seamldr_params *params,
+			       const struct tdx_image *image,
+			       u32 image_len)
 {
-	const struct tdx_image *image		= (const void *)data;
	const struct tdx_image_header *header	= &image->header;
 
	u32 sigstruct_len	= header->sigstruct_nr_pages * PAGE_SIZE;
	u32 module_len		= header->module_nr_pages * PAGE_SIZE;
 
	u8 *header_start	= (u8 *)header;
-	u8 *header_end		= header_start + HEADER_SIZE;
+	u8 *header_end		= header_start + TDX_IMAGE_HEADER_SIZE;
 
	u8 *sigstruct_start	= header_end;
	u8 *sigstruct_end	= sigstruct_start + sigstruct_len;
 
	u8 *module_start	= sigstruct_end;
 
-	/* Check the calculated payload size against the data size. */
-	if (HEADER_SIZE + sigstruct_len + module_len != size)
+	/* Check the calculated payload size against the image size. */
+	if (TDX_IMAGE_HEADER_SIZE + sigstruct_len + module_len != image_len)
		return -EINVAL;
 
-	/*
-	 * Don't care about user passing the wrong file, but protect
-	 * kernel ABI by preventing accepting garbage.
-	 */
+	/* Reject unsupported tdx_image ABI versions. */
	if (header->version != TDX_IMAGE_VERSION_2)
		return -EINVAL;
 
+	if (header->sigstruct_nr_pages > SEAMLDR_MAX_NR_SIG_PAGES ||
+	    header->module_nr_pages > SEAMLDR_MAX_NR_MODULE_PAGES)
+		return -EINVAL;
+
	if (memcmp(header->signature, "TDX-BLOB", sizeof(header->signature)))
		return -EINVAL;
 
@@ -163,7 +174,7 @@ static int init_seamldr_params(struct seamldr_params *params, const u8 *data, u3
		return -EINVAL;
 
	populate_seamldr_params(params, sigstruct_start, header->sigstruct_nr_pages,
-				module_start, header->module_nr_pages);
+					module_start,    header->module_nr_pages);
	return 0;
 }
 
@@ -230,14 +241,14 @@ static int do_seamldr_install_module(void *seamldr_params)
 {
	enum module_update_state newstate, curstate = MODULE_UPDATE_START;
	int cpu = smp_processor_id();
-	bool primary;
+	bool is_lead_cpu;
	int ret = 0;
 
	/*
-	 * Use CPU 0 to execute update steps that must run exactly once.
-	 * Note CPU 0 is always online.
+	 * Some steps must be run on exactly one CPU. Pick a "lead" CPU to
+	 * execute those steps. Use CPU 0 because it is always online.
	 */
-	primary = cpu == 0;
+	is_lead_cpu = cpu == 0;
 
	do {
		newstate = READ_ONCE(update_ctrl.state);
@@ -250,7 +261,7 @@ static int do_seamldr_install_module(void *seamldr_params)
		curstate = newstate;
		switch (curstate) {
		case MODULE_UPDATE_SHUTDOWN:
-			if (primary)
+			if (is_lead_cpu)
				ret = tdx_module_shutdown();
			break;
		case MODULE_UPDATE_CPU_INSTALL:
@@ -260,7 +271,7 @@ static int do_seamldr_install_module(void *seamldr_params)
			ret = tdx_cpu_enable();
			break;
		case MODULE_UPDATE_RUN_UPDATE:
-			if (primary)
+			if (is_lead_cpu)
				ret = tdx_module_run_update();
			break;
		default:
@@ -276,20 +287,27 @@ static int do_seamldr_install_module(void *seamldr_params)
 /**
  * seamldr_install_module - Install a new TDX module.
  * @data: Pointer to the TDX module image.
- * @size: Size of the TDX module image.
+ * @data_len: Size of the TDX module image.
  *
  * Returns 0 on success, negative error code on failure.
  */
-int seamldr_install_module(const u8 *data, u32 size)
+int seamldr_install_module(const u8 *data, u32 data_len)
 {
	struct seamldr_params *params;
+	const struct tdx_image *image;
	int ret;
 
+	if (data_len < TDX_IMAGE_HEADER_SIZE)
+		return -EINVAL;
+
+	image = (const struct tdx_image *)data;
+
	params = kzalloc_obj(*params);
	if (!params)
		return -ENOMEM;
 
-	ret = init_seamldr_params(params, data, size);
+	/* Populate 'params' from 'image'. */
+	ret = init_seamldr_params(params, image, data_len);
	if (ret)
		goto out;
 
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 2ab6f6efe6d1..0c5660c9ab45 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -69,6 +69,8 @@ static LIST_HEAD(tdx_memlist);
 
 static struct tdx_sys_info tdx_sysinfo;
 
+static DEFINE_RAW_SPINLOCK(sysinit_lock);
+
 /*
  * Do the module global initialization once and return its result.
  * It can be done on any cpu, and from task or IRQ context.
@@ -76,29 +78,34 @@ static struct tdx_sys_info tdx_sysinfo;
 static int try_init_module_global(void)
 {
	struct tdx_module_args args = {};
-	static DEFINE_RAW_SPINLOCK(sysinit_lock);
+	int ret;
 
	raw_spin_lock(&sysinit_lock);
 
-	if (tdx_module_state.sysinit_done)
+	/* Return the "cached" return code. */
+	if (tdx_module_state.sysinit_done) {
+		ret = tdx_module_state.sysinit_ret;
		goto out;
+	}
 
	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
-	tdx_module_state.sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
+	ret = seamcall_prerr(TDH_SYS_INIT, &args);
 
	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail due to the TDX module is not loaded.
	 * Dump message to let the user know.
	 */
-	if (tdx_module_state.sysinit_ret == -ENODEV)
+	if (ret == -ENODEV)
		pr_err("module not loaded\n");
 
+	/* Save the return code for later callers. */
	tdx_module_state.sysinit_done = true;
+	tdx_module_state.sysinit_ret = ret;
 out:
	raw_spin_unlock(&sysinit_lock);
-	return tdx_module_state.sysinit_ret;
+	return ret;
 }
 
 /**
@@ -1267,7 +1274,7 @@ static __init int tdx_enable(void)
 }
 subsys_initcall(tdx_enable);
 
-#define TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE BIT(16)
+#define TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE BIT_ULL(16)
 
 int tdx_module_shutdown(void)
 {
diff --git a/drivers/virt/coco/tdx-host/Kconfig b/drivers/virt/coco/tdx-host/Kconfig
index ca600a39d97b..57d0c01a4357 100644
--- a/drivers/virt/coco/tdx-host/Kconfig
+++ b/drivers/virt/coco/tdx-host/Kconfig
@@ -1,12 +1,6 @@
 config TDX_HOST_SERVICES
-	tristate "TDX Host Services Driver"
+	tristate
	depends on INTEL_TDX_HOST
	select FW_LOADER
	select FW_UPLOAD
	default m
-	help
-	  Enable access to TDX host services like module update and
-	  extensions (e.g. TDX Connect).
-
-	  Say y or m if enabling support for confidential virtual machine
-	  support (CONFIG_INTEL_TDX_HOST). The module is called tdx_host.ko.
diff --git a/drivers/virt/coco/tdx-host/tdx-host.c b/drivers/virt/coco/tdx-host/tdx-host.c
index ad116e56aa1a..291464490fe0 100644
--- a/drivers/virt/coco/tdx-host/tdx-host.c
+++ b/drivers/virt/coco/tdx-host/tdx-host.c
@@ -76,6 +76,10 @@ static ssize_t num_remaining_updates_show(struct device *dev,
	return sysfs_emit(buf, "%u\n", info.num_remaining_updates);
 }
 
+/*
+ * These attributes are intended for managing TDX module updates. Reading
+ * them issues a slow, serialized P-SEAMLDR query, so keep them admin-only.
+ */
 static DEVICE_ATTR_ADMIN_RO(seamldr_version);
 static DEVICE_ATTR_ADMIN_RO(num_remaining_updates);
 
@@ -90,7 +94,10 @@ static bool supports_runtime_update(void)
	const struct tdx_sys_info *sysinfo = tdx_get_sysinfo();
 
	if (!sysinfo)
-		return 0;
+		return false;
+
+	if (!tdx_supports_runtime_update(sysinfo))
+		return false;
 
	/*
	 * Calling P-SEAMLDR on CPUs with the seamret_invd_vmcs bug clears
@@ -98,14 +105,17 @@ static bool supports_runtime_update(void)
	 * present before exposing P-SEAMLDR features.
	 */
	if (boot_cpu_has_bug(X86_BUG_SEAMRET_INVD_VMCS))
-		return 0;
+		return false;
 
-	return tdx_supports_runtime_update(sysinfo);
+	return true;
 }
 
 static umode_t seamldr_group_visible(struct kobject *kobj, struct attribute *attr, int idx)
 {
-	return supports_runtime_update() ? attr->mode : 0;
+	if (!supports_runtime_update())
+		return 0;
+
+	return attr->mode;
 }
 
 static const struct attribute_group seamldr_group = {
@@ -120,20 +130,20 @@ static const struct attribute_group *tdx_host_groups[] = {
 };
 
 static enum fw_upload_err tdx_fw_prepare(struct fw_upload *fwl,
-					 const u8 *data, u32 size)
+					 const u8 *data, u32 data_len)
 {
	return FW_UPLOAD_ERR_NONE;
 }
 
 static enum fw_upload_err tdx_fw_write(struct fw_upload *fwl, const u8 *data,
-				       u32 offset, u32 size, u32 *written)
+				       u32 offset, u32 data_len, u32 *written)
 {
	int ret;
 
-	ret = seamldr_install_module(data, size);
+	ret = seamldr_install_module(data, data_len);
	switch (ret) {
	case 0:
-		*written = size;
+		*written = data_len;
		return FW_UPLOAD_ERR_NONE;
	case -EBUSY:
		return FW_UPLOAD_ERR_BUSY;


^ permalink raw reply related

* Re: [PATCH v6 07/43] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
From: Fuad Tabba @ 2026-05-20 13:47 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-7-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> Update the guest_memfd populate() flow to pull memory attributes from the
> gmem instance instead of the VM when KVM is not configured to track
> shared/private status in the VM.
>
> Rename the per-VM API to make it clear that it retrieves per-VM
> attributes, i.e. is not suitable for use outside of flows that are
> specific to generic per-VM attributes.
>
> Co-developed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>
/fuad


> ---
>  arch/x86/kvm/mmu/mmu.c   |  2 +-
>  include/linux/kvm_host.h | 14 +++++++++++++-
>  virt/kvm/guest_memfd.c   | 24 +++++++++++++++++++++---
>  virt/kvm/kvm_main.c      |  8 +++-----
>  4 files changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 153bcc5369985..bfcf9be25598e 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7997,7 +7997,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
>         const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
>
>         if (level == PG_LEVEL_2M)
> -               return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
> +               return kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attrs);
>
>         for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
>                 if (hugepage_test_mixed(slot, gfn, level - 1) ||
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 28a54298d27db..1deab76dc0a2c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2549,12 +2549,24 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  #endif
>
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> -bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
> +extern bool vm_memory_attributes;
> +bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>                                      unsigned long mask, unsigned long attrs);
>  bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
>                                         struct kvm_gfn_range *range);
>  bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
>                                          struct kvm_gfn_range *range);
> +#else
> +#define vm_memory_attributes false
> +static inline bool kvm_range_has_vm_memory_attributes(struct kvm *kvm,
> +                                                     gfn_t start, gfn_t end,
> +                                                     unsigned long mask,
> +                                                     unsigned long attrs)
> +{
> +       WARN_ONCE(1, "Unexpected call to kvm_range_has_vm_memory_attributes()");
> +
> +       return false;
> +}
>  #endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>
>  unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index f055e058a3f28..9d025f518c025 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -924,12 +924,31 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
>
>  #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
> +static bool kvm_gmem_range_is_private(struct gmem_inode *gi, pgoff_t index,
> +                                     size_t nr_pages, struct kvm *kvm, gfn_t gfn)
> +{
> +       pgoff_t end = index + nr_pages - 1;
> +       void *entry;
> +
> +       if (vm_memory_attributes)
> +               return kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + nr_pages,
> +                                                      KVM_MEMORY_ATTRIBUTE_PRIVATE,
> +                                                      KVM_MEMORY_ATTRIBUTE_PRIVATE);
> +
> +       mt_for_each(&gi->attributes, entry, index, end) {
> +               if (xa_to_value(entry) != KVM_MEMORY_ATTRIBUTE_PRIVATE)
> +                       return false;
> +       }
> +
> +       return true;
> +}
>
>  static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
>                                 struct file *file, gfn_t gfn, struct page *src_page,
>                                 kvm_gmem_populate_cb post_populate, void *opaque)
>  {
>         pgoff_t index = kvm_gmem_get_index(slot, gfn);
> +       struct gmem_inode *gi;
>         struct folio *folio;
>         kvm_pfn_t pfn;
>         int ret;
> @@ -944,9 +963,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
>
>         folio_unlock(folio);
>
> -       if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
> -                                            KVM_MEMORY_ATTRIBUTE_PRIVATE,
> -                                            KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
> +       gi = GMEM_I(file_inode(file));
> +       if (!kvm_gmem_range_is_private(gi, index, 1, kvm, gfn)) {
>                 ret = -EINVAL;
>                 goto out_put_folio;
>         }
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 4139e903f756a..0a4024948711a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -103,9 +103,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
>
>  #ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> -static bool vm_memory_attributes = true;
> -#else
> -#define vm_memory_attributes false
> +bool vm_memory_attributes = true;
>  #endif
>  DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
> @@ -2450,7 +2448,7 @@ static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
>   * Returns true if _all_ gfns in the range [@start, @end) have attributes
>   * such that the bits in @mask match @attrs.
>   */
> -bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
> +bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>                                      unsigned long mask, unsigned long attrs)
>  {
>         XA_STATE(xas, &kvm->mem_attr_array, start);
> @@ -2584,7 +2582,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>         mutex_lock(&kvm->slots_lock);
>
>         /* Nothing to do if the entire range has the desired attributes. */
> -       if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
> +       if (kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attributes))
>                 goto out_unlock;
>
>         /*
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v4 1/4] Introducing pw_lock() and per-cpu queue & flush work
From: Sebastian Andrzej Siewior @ 2026-05-20 13:48 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: Jonathan Corbet, Shuah Khan, Peter Zijlstra, Ingo Molnar,
	Will Deacon, Boqun Feng, Waiman Long, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, Liam R. Howlett,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Jann Horn, Pedro Falcato, Brendan Jackman, Johannes Weiner,
	Zi Yan, Harry Yoo, Hao Li, Christoph Lameter, David Rientjes,
	Roman Gushchin, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Youngjun Park, Qi Zheng, Shakeel Butt,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Borislav Petkov (AMD),
	Randy Dunlap, Feng Tang, Dapeng Mi, Kees Cook, Marco Elver,
	Jakub Kicinski, Li RongQing, Eric Biggers, Paul E. McKenney,
	Nathan Chancellor, Nicolas Schier, Miguel Ojeda,
	Thomas Weißschuh, Thomas Gleixner, Douglas Anderson,
	Gary Guo, Christian Brauner, Pasha Tatashin, Coiby Xu,
	Masahiro Yamada, Frederic Weisbecker, linux-doc, linux-kernel,
	linux-mm, linux-rt-devel, Marcelo Tosatti
In-Reply-To: <20260519012754.240804-2-leobras.c@gmail.com>

On 2026-05-18 22:27:47 [-0300], Leonardo Bras wrote:
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 4d0f545fb3ec..68c8a6f9d227 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2810,20 +2810,30 @@ Kernel parameters
>  			  If a queue's affinity mask contains only isolated
>  			  CPUs then this parameter has no effect on the
>  			  interrupt routing decision, though interrupts are
>  			  only delivered when tasks running on those
>  			  isolated CPUs submit IO. IO submitted on
>  			  housekeeping CPUs has no influence on those
>  			  queues.
>  
>  			The format of <cpu-list> is described above.
>  
> +	pwlocks=	[KNL,SMP] Select a behavior on per-CPU resource sharing
> +			and remote interference mechanism on a kernel built with
> +			CONFIG_PWLOCKS.
> +			Format: { "0" | "1" }
> +			0 - local_lock() + queue_work_on(remote_cpu)
> +			1 - spin_lock() for both local and remote operations
> +
> +			Selecting 1 may be interesting for systems that want
> +			to avoid interruption & context switches from IPIs.
> +

This documentation is supposed to be for an administrator/ user of the
system. Exposing him to underlying kernel technique shouldn't happen.
It does not explain the users/ outcome so it sounds like best hope.

>  	iucv=		[HW,NET]
>  
>  	ivrs_ioapic	[HW,X86-64]
>  			Provide an override to the IOAPIC-ID<->DEVICE-ID
>  			mapping provided in the IVRS ACPI table.
>  			By default, PCI segment is 0, and can be omitted.
>  
>  			For example, to map IOAPIC-ID decimal 10 to
>  			PCI segment 0x1 and PCI device 00:14.0,
>  			write the parameter as:
> diff --git a/Documentation/locking/pwlocks.rst b/Documentation/locking/pwlocks.rst
> new file mode 100644
> index 000000000000..09f4a5417bc1
> --- /dev/null
> +++ b/Documentation/locking/pwlocks.rst
> @@ -0,0 +1,76 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=========
> +PW (Per-CPU Work) locks
> +=========
> +
> +Some places in the kernel implement a parallel programming strategy
> +consisting on local_locks() for most of the work, and some rare remote
> +operations are scheduled on target cpu. This keeps cache bouncing low since
> +cacheline tends to be mostly local, and avoids the cost of locks in non-RT

PREEMPT_RT can be spelled out if you mean it so it is not confused with
other meanings of the two letters.

> +kernels, even though the very few remote operations will be expensive due
> +to scheduling overhead.
> +
> +On the other hand, for RT workloads this can represent a problem:
> +scheduling work on remote cpu that are executing low latency tasks
> +is undesired and can introduce unexpected deadline misses.
> +
> +PW locks help to convert sites that use local_locks (for cpu local operations)
> +and queue_work_on (for queueing work remotely, to be executed
> +locally on the owner cpu of the lock) to a spinlocks.

not spinlocks.

> +
> +The lock is declared pw_lock_t type.
> +The lock is initialized with pw_lock_init.
> +The lock is locked with pw_lock (takes a lock and cpu as a parameter).
> +The lock is unlocked with pw_unlock (takes a lock and cpu as a parameter).

If it is a function, it should end with ()

> +The pw_lock_irqsave function disables interrupts and saves current interrupt state,
> +cpu as a parameter.

CPU.

> +For trylock variant, there is the pw_trylock_t type, initialized with
> +pw_trylock_init. Then the corresponding pw_trylock and pw_trylock_irqsave.
> +
> +work_struct should be replaced by pw_struct, which contains a cpu parameter
> +(owner cpu of the lock), initialized by INIT_PW.
> +
> +The queue work related functions (analogous to queue_work_on and flush_work) are:
> +pw_queue_on and pw_flush.
> +
> +The behaviour of the PW lock functions is as follows:
> +
> +* !CONFIG_PWLOCKS (or CONFIG_PWLOCKS and pwlocks=off kernel boot parameter):
> +        - pw_lock:			local_lock
> +        - pw_lock_irqsave:		local_lock_irqsave
> +        - pw_trylock:			local_trylock
> +        - pw_trylock_irqsave:		local_trylock_irqsave
> +        - pw_unlock:			local_unlock
> +        - pw_lock_local:		local_lock
> +        - pw_trylock_local:		local_trylock
> +        - pw_unlock_local:		local_unlock
> +        - pw_queue_on:         		queue_work_on
> +        - pw_flush:	            	flush_work
> +
> +* CONFIG_PWLOCKS (and CONFIG_PWLOCKS_DEFAULT=y or pwlocks=on kernel boot parameter),
> +        - pw_lock:			spin_lock
> +        - pw_lock_irqsave:		spin_lock_irqsave
> +        - pw_trylock:			spin_trylock
> +        - pw_trylock_irqsave:		spin_trylock_irqsave
> +        - pw_unlock:			spin_unlock
> +        - pw_lock_local:		preempt_disable OR migrate_disable + spin_lock
> +        - pw_trylock_local:		preempt_disable OR migrate_disable + spin_trylock
> +        - pw_unlock_local:		preempt_enable OR migrate_enable + spin_unlock
> +        - pw_queue_on:         		executes work function on caller cpu
> +        - pw_flush:            		empty
> +
> +pw_get_cpu(work_struct), to be called from within per-cpu work function,
> +returns the target cpu.
> +
> +On the locking functions above, there are the local locking functions
> +(pw_lock_local, pw_trylock_local and pw_unlock_local) that must only
> +be used to access per-CPU data from the CPU that owns that data,
> +and never remotely. They disable preemption/migration and don't require
> +a cpu parameter, making them a replacement for local_lock functions that
> +does not introduce overhead.

Why do you need to either the one or the other? My only guess is that
migrate_disable() is sufficient but you prefer preempt_disable() on
!PREEMPT_RT because it is cheaper.

> +These should only be used when accessing per-CPU data of the local CPU.
> +
> diff --git a/init/Kconfig b/init/Kconfig
> index 2937c4d308ae..3fb751dc4530 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -764,20 +764,55 @@ config CPU_ISOLATION
>  	depends on SMP
>  	default y
>  	help
>  	  Make sure that CPUs running critical tasks are not disturbed by
>  	  any source of "noise" such as unbound workqueues, timers, kthreads...
>  	  Unbound jobs get offloaded to housekeeping CPUs. This is driven by
>  	  the "isolcpus=" boot parameter.
>  
>  	  Say Y if unsure.
>  
> +config PWLOCKS
> +	bool "Per-CPU Work locks"
> +	depends on SMP || COMPILE_TEST
> +	default n
> +	help
> +	  Allow changing the behavior on per-CPU resource sharing with cache,
> +	  from the regular local_locks() + queue_work_on(remote_cpu) to using
> +	  per-CPU spinlocks on both local and remote operations.
> +
> +	  This is useful to give user the option on reducing IPIs to CPUs, and
> +	  thus reduce interruptions and context switches. On the other hand, it
> +	  increases generated code and will use atomic operations if spinlocks
> +	  are selected.

I think the goal is to avoid scheduling a task on a remote CPU to get
something done.

> +
> +	  If set, will use the default behavior set in PWLOCKS_DEFAULT unless boot
> +	  parameter pwlocks is passed with a different behavior.
> +
> +	  If unset, will use the local_lock() + queue_work_on() strategy,
> +	  regardless of the boot parameter or PWLOCKS_DEFAULT.

This sounds like it affects the greater kernel.

> +	  Say N if unsure.
> +
> +config PWLOCKS_DEFAULT
> +	bool "Use per-CPU spinlocks by default on PWLOCKS"
> +	depends on PWLOCKS
> +	default n

n is default.

> +	help
> +	  If set, will use per-CPU spinlocks as default behavior for per-CPU
> +	  remote operations.
> +
> +	  If unset, will use local_lock() + queue_work_on(cpu) as default
> +	  behavior for remote operations.
> +
> +	  Say N if unsure
> +
>  source "kernel/rcu/Kconfig"
>  
>  config IKCONFIG
>  	tristate "Kernel .config support"
>  	help
>  	  This option enables the complete Linux kernel ".config" file
>  	  contents to be saved in the kernel. It provides documentation
>  	  of which kernel options are used in a running kernel or in an
>  	  on-disk kernel.  This information can be extracted from the kernel
>  	  image file with the script scripts/extract-ikconfig and used as
> diff --git a/include/linux/pwlocks.h b/include/linux/pwlocks.h
> new file mode 100644
> index 000000000000..3d79621655f9
> --- /dev/null
> +++ b/include/linux/pwlocks.h
> @@ -0,0 +1,265 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_PWLOCKS_H
> +#define _LINUX_PWLOCKS_H
> +
> +#include "linux/spinlock.h"
> +#include "linux/local_lock.h"
> +#include "linux/workqueue.h"
> +
> +#ifndef CONFIG_PWLOCKS
> +
> +typedef local_lock_t pw_lock_t;
> +typedef local_trylock_t pw_trylock_t;
> +
> +struct pw_struct {
> +	struct work_struct work;
> +};
> +
> +#define pw_lock_init(lock)				\
> +	local_lock_init(lock)
> +
> +#define pw_trylock_init(lock)				\
> +	local_trylock_init(lock)
> +
> +#define pw_lock(lock, cpu)				\
> +	local_lock(lock)
> +
> +#define pw_lock_local(lock)				\
> +	local_lock(lock)
> +
> +#define pw_lock_irqsave(lock, flags, cpu)		\
> +	local_lock_irqsave(lock, flags)

The part where you have a `cpu' argument which is not used is entirely
confusing.

> +
> +#define pw_lock_local_irqsave(lock, flags)		\
> +	local_lock_irqsave(lock, flags)
> +
> +#define pw_trylock(lock, cpu)				\
> +	local_trylock(lock)
> +
> +#define pw_trylock_local(lock)				\
> +	local_trylock(lock)
> +
> +#define pw_trylock_irqsave(lock, flags, cpu)		\
> +	local_trylock_irqsave(lock, flags)
> +
> +#define pw_unlock(lock, cpu)				\
> +	local_unlock(lock)
> +
> +#define pw_unlock_local(lock)				\
> +	local_unlock(lock)
> +
> +#define pw_unlock_irqrestore(lock, flags, cpu)		\
> +	local_unlock_irqrestore(lock, flags)
> +
> +#define pw_unlock_local_irqrestore(lock, flags)		\
> +	local_unlock_irqrestore(lock, flags)
> +
> +#define pw_lockdep_assert_held(lock)			\
> +	lockdep_assert_held(lock)
> +
> +#define pw_queue_on(c, wq, pw)				\
> +	queue_work_on(c, wq, &(pw)->work)
> +
> +#define pw_flush(pw)					\
> +	flush_work(&(pw)->work)
> +
> +#define pw_get_cpu(pw)	smp_processor_id()
> +
> +#define pw_is_cpu_remote(cpu)		(false)
> +
> +#define INIT_PW(pw, func, c)				\
> +	INIT_WORK(&(pw)->work, (func))
> +
> +#else /* CONFIG_PWLOCKS */
> +
> +DECLARE_STATIC_KEY_MAYBE(CONFIG_PWLOCKS_DEFAULT, pw_sl);
> +
> +typedef union {
> +	spinlock_t sl;
> +	local_lock_t ll;
> +} pw_lock_t;
> +
> +typedef union {
> +	spinlock_t sl;
> +	local_trylock_t ll;
> +} pw_trylock_t;

Why do you use local_trylock_t ? Its use case is different compared to
local_lock_t. _IF_ you are fine with local_trylock_t then you should be
able to deal with a per-CPU spinlock_t and none of this should be
needed.

> +struct pw_struct {
> +	struct work_struct work;
> +	int cpu;
> +};
> +
> +#ifdef CONFIG_PREEMPT_RT
> +#define preempt_or_migrate_disable migrate_disable
> +#define preempt_or_migrate_enable migrate_enable
> +#else
> +#define preempt_or_migrate_disable preempt_disable
> +#define preempt_or_migrate_enable preempt_enable
> +#endif

if then () but this looks terrible.

> +
> +#define pw_lock_init(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock_init(lock.sl);					\
> +	else									\
> +		local_lock_init(lock.ll);					\
> +} while (0)
> +
> +#define pw_trylock_init(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock_init(lock.sl);					\
> +	else									\
> +		local_trylock_init(lock.ll);					\
> +} while (0)
> +
> +#define pw_lock(lock, cpu)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock(per_cpu_ptr(lock.sl, cpu));				\
> +	else									\
> +		local_lock(lock.ll);						\
> +} while (0)
> +
> +#define pw_lock_local(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		preempt_or_migrate_disable();					\
> +		spin_lock(this_cpu_ptr(lock.sl));				\
> +	} else {								\
> +		local_lock(lock.ll);						\
> +	}									\
> +} while (0)
> +
> +#define pw_lock_irqsave(lock, flags, cpu)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock_irqsave(per_cpu_ptr(lock.sl, cpu), flags);	\
> +	else									\
> +		local_lock_irqsave(lock.ll, flags);				\
> +} while (0)
> +
> +#define pw_lock_local_irqsave(lock, flags)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		preempt_or_migrate_disable();					\
> +		spin_lock_irqsave(this_cpu_ptr(lock.sl), flags);		\
> +	} else {								\
> +		local_lock_irqsave(lock.ll, flags);				\
> +	}									\
> +} while (0)
> +
> +#define pw_trylock(lock, cpu)							\
> +({										\
> +	int t;									\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		t = spin_trylock(per_cpu_ptr(lock.sl, cpu));			\
> +	else									\
> +		t = local_trylock(lock.ll);					\
> +	t;									\
> +})
> +
> +#define pw_trylock_local(lock)							\
> +({										\
> +	int t;									\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		preempt_or_migrate_disable();					\
> +		t = spin_trylock(this_cpu_ptr(lock.sl));			\
> +		if (!t)								\
> +			preempt_or_migrate_enable();				\
> +	} else {								\
> +		t = local_trylock(lock.ll);					\
> +	}									\
> +	t;									\
> +})
> +
> +#define pw_trylock_irqsave(lock, flags, cpu)					\
> +({										\
> +	int t;									\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		t = spin_trylock_irqsave(per_cpu_ptr(lock.sl, cpu), flags);	\
> +	else									\
> +		t = local_trylock_irqsave(lock.ll, flags);			\
> +	t;									\
> +})
> +
> +#define pw_unlock(lock, cpu)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_unlock(per_cpu_ptr(lock.sl, cpu));			\
> +	else									\
> +		local_unlock(lock.ll);					\
> +} while (0)
> +
> +#define pw_unlock_local(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		spin_unlock(this_cpu_ptr(lock.sl));				\
> +		preempt_or_migrate_enable();					\
> +	} else {								\
> +		local_unlock(lock.ll);						\
> +	}									\
> +} while (0)
> +
> +#define pw_unlock_irqrestore(lock, flags, cpu)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_unlock_irqrestore(per_cpu_ptr(lock.sl, cpu), flags);	\
> +	else									\
> +		local_unlock_irqrestore(lock.ll, flags);			\
> +} while (0)
> +
> +#define pw_unlock_local_irqrestore(lock, flags)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		spin_unlock_irqrestore(this_cpu_ptr(lock.sl), flags);	\
> +		preempt_or_migrate_enable();					\
> +	} else {								\
> +		local_unlock_irqrestore(lock.ll, flags);			\
> +	}									\
> +} while (0)
> +
> +#define pw_lockdep_assert_held(lock)						\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		lockdep_assert_held(this_cpu_ptr(lock.sl));			\
> +	else									\
> +		lockdep_assert_held(this_cpu_ptr(lock.ll));			\
> +} while (0)
> +
> +#define pw_queue_on(c, wq, pw)							\
> +do {										\
> +	int __c = c;								\
> +	struct pw_struct *__pw = (pw);						\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		WARN_ON((__c) != __pw->cpu);					\
> +		__pw->work.func(&__pw->work);					\
> +	} else {								\
> +		queue_work_on(__c, wq, &(__pw)->work);				\
> +	}									\
> +} while (0)
> +
> +/*
> + * Does nothing if PWLOCKS is set to use spinlock, as the task is already done at the
> + * time pw_queue_on() returns.
> + */
> +#define pw_flush(pw)								\
> +do {										\
> +	struct pw_struct *__pw = (pw);						\
> +	if (!static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		flush_work(&__pw->work);					\
> +} while (0)

I don't think this should be a collection of macros. Either proper
functions or static inline _if_ this is performance critical for some
reason.

> +
> +#define pw_get_cpu(w)			container_of((w), struct pw_struct, work)->cpu
> +
> +#define pw_is_cpu_remote(cpu)		((cpu) != smp_processor_id())
> +
> +#define INIT_PW(pw, func, c)							\
> +do {										\
> +	struct pw_struct *__pw = (pw);						\
> +	INIT_WORK(&__pw->work, (func));						\
> +	__pw->cpu = (c);							\
> +} while (0)
> +
> +#endif /* CONFIG_PWLOCKS */
> +#endif /* LINUX_PWLOCKS_H */
> diff --git a/kernel/pwlocks.c b/kernel/pwlocks.c
> new file mode 100644
> index 000000000000..1ebf5cb979b9
> --- /dev/null
> +++ b/kernel/pwlocks.c
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include "linux/export.h"
> +#include <linux/sched.h>
> +#include <linux/pwlocks.h>
> +#include <linux/string.h>
> +#include <linux/sched/isolation.h>
> +
> +DEFINE_STATIC_KEY_MAYBE(CONFIG_PWLOCKS_DEFAULT, pw_sl);
> +EXPORT_SYMBOL(pw_sl);
> +
> +static bool pwlocks_param_specified;
> +
> +static int __init pwlocks_setup(char *str)
> +{
> +	int opt;
> +
> +	if (!get_option(&str, &opt)) {
> +		pr_warn("PWLOCKS: invalid pwlocks parameter: %s, ignoring.\n", str);
> +		return 0;
> +	}
> +
> +	if (opt)
> +		static_branch_enable(&pw_sl);
> +	else
> +		static_branch_disable(&pw_sl);
> +
> +	pwlocks_param_specified = true;
> +
> +	return 1;
> +}
> +__setup("pwlocks=", pwlocks_setup);
> +
> +/*
> + * Enable PWLOCKS if CPUs want to avoid kernel noise.
> + */
> +static int __init pwlocks_init(void)
> +{
> +	if (pwlocks_param_specified)
> +		return 0;
> +
> +	if (housekeeping_enabled(HK_TYPE_KERNEL_NOISE))
> +		static_branch_enable(&pw_sl);

How likely is it, that you you had users before late_initcall()? Also
can it happen that one of them uses one function to lock and the other
unlock in this brief window? There is no check if this was used before
static_branch usage.

> +
> +	return 0;
> +}
> +
> +late_initcall(pwlocks_init);

Sebastian

^ permalink raw reply

* Re: [PATCH v6 08/43] KVM: guest_memfd: Only prepare folios for private pages
From: Fuad Tabba @ 2026-05-20 13:51 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-8-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> All-shared guest_memfd used to be only supported for non-CoCo VMs where
> preparation doesn't apply. INIT_SHARED is about to be supported for
> non-CoCo VMs in a later patch in this series.
>
> In addition, KVM_SET_MEMORY_ATTRIBUTES2 is about to be supported in
> guest_memfd in a later patch in this series.
>
> This means that the kvm fault handler may now call kvm_gmem_get_pfn() on a
> shared folio for a CoCo VM where preparation applies.
>
> Add a check to make sure that preparation is only performed for private
> folios.
>
> Preparation will be undone on freeing (see kvm_gmem_free_folio()) and on
> conversion to shared.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad
> ---
>  virt/kvm/guest_memfd.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 9d025f518c025..4f7c4824c3a45 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -888,6 +888,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                      int *max_order)
>  {
>         pgoff_t index = kvm_gmem_get_index(slot, gfn);
> +       struct inode *inode;
>         struct folio *folio;
>         int r = 0;
>
> @@ -895,7 +896,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>         if (!file)
>                 return -EFAULT;
>
> -       filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +       inode = file_inode(file);
> +       filemap_invalidate_lock_shared(inode->i_mapping);
>
>         folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
>         if (IS_ERR(folio)) {
> @@ -908,7 +910,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                 folio_mark_uptodate(folio);
>         }
>
> -       r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> +       if (kvm_gmem_is_private_mem(inode, index))
> +               r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>
>         folio_unlock(folio);
>
> @@ -918,7 +921,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                 folio_put(folio);
>
>  out:
> -       filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> +       filemap_invalidate_unlock_shared(inode->i_mapping);
>         return r;
>  }
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 09/43] KVM: Move kvm_supported_mem_attributes() to kvm_host.h
From: Fuad Tabba @ 2026-05-20 13:53 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-9-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> Move kvm_supported_mem_attributes() from kvm_main.c to kvm_host.h and
> make it a static inline function. This allows the helper to be used in
> other parts of the KVM subsystem outside of kvm_main.c. This helper will be
> used later by guest_memfd.
>
> No functional change intended.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad
> ---
>  include/linux/kvm_host.h | 10 ++++++++++
>  virt/kvm/kvm_main.c      | 10 ----------
>  2 files changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1deab76dc0a2c..f9ea95e33d050 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2529,6 +2529,16 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
>  }
>
>  #ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
> +static inline u64 kvm_supported_mem_attributes(struct kvm *kvm)
> +{
> +#ifdef kvm_arch_has_private_mem
> +       if (!kvm || kvm_arch_has_private_mem(kvm))
> +               return KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +#endif
> +
> +       return 0;
> +}
> +
>  typedef unsigned long (kvm_get_memory_attributes_t)(struct kvm *kvm, gfn_t gfn);
>  DECLARE_STATIC_CALL(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0a4024948711a..ff20e63143642 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2428,16 +2428,6 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
>  #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
>
>  #ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
> -static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> -{
> -#ifdef kvm_arch_has_private_mem
> -       if (!kvm || kvm_arch_has_private_mem(kvm))
> -               return KVM_MEMORY_ATTRIBUTE_PRIVATE;
> -#endif
> -
> -       return 0;
> -}
> -
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
>  {
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH 12/15] accel/qda: Add FastRPC invocation support
From: Dmitry Baryshkov @ 2026-05-20 13:56 UTC (permalink / raw)
  To: ekansh.gupta
  Cc: Oded Gabbay, Jonathan Corbet, Shuah Khan, Joerg Roedel,
	Will Deacon, Robin Murphy, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, Sumit Semwal,
	Christian König, Bharath Kumar, Chenna Kesava Raju, srini,
	andersson, konradybcio, robin.clark, linux-kernel, dri-devel,
	linux-doc, linux-arm-msm, iommu, linux-media, linaro-mm-sig
In-Reply-To: <20260519-qda-series-v1-12-b2d984c297f8@oss.qualcomm.com>

On Tue, May 19, 2026 at 11:46:02AM +0530, Ekansh Gupta via B4 Relay wrote:
> From: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> 
> Implement the FastRPC remote procedure call path, allowing user-space
> to invoke methods on the DSP via DRM_IOCTL_QDA_REMOTE_INVOKE.
> 
> qda_fastrpc.c / qda_fastrpc.h
>   Implements the FastRPC protocol layer: argument marshalling
>   (qda_fastrpc_invoke_pack), response unmarshalling
>   (qda_fastrpc_invoke_unpack), and invocation context lifecycle
>   management. Each invocation allocates a fastrpc_invoke_context
>   which tracks buffer descriptors, GEM objects, and the completion
>   used to synchronise with the DSP response.
> 
>   Buffer arguments are handled in three ways:
>   - DMA-BUF fd: imported via PRIME, IOMMU-mapped dma_addr used
>   - Direct (inline): copied into the GEM-backed message buffer
>   - DMA handle: fd forwarded to DSP, physical page descriptor computed

No. This needs to go away. The QDA should support only one way to pass
data - via the GEM buffers. Everything else should be handled by the
shim layer, etc.

> 
> qda_rpmsg.c
>   Implements qda_rpmsg_send_msg() which sends the wire-format
>   fastrpc_msg (embedded as the first member of qda_msg) directly
>   via rpmsg_send(), and qda_rpmsg_wait_for_rsp() which blocks on
>   the context completion. The RPMsg callback dispatches responses
>   to waiting contexts via the ctx_xa XArray.
> 
> qda_ioctl.c
>   qda_ioctl_invoke() drives the full invocation lifecycle:
>   allocate context → assign XArray ID → prepare args → allocate
>   GEM message buffer → pack → send → wait → unpack → free.
> 
> qda_drv.h / qda_drv.c
>   qda_dev gains ctx_xa (XArray for in-flight context lookup) and
>   remote_session_id_counter (atomic counter for session IDs).
>   qda_file_priv gains remote_session_id for per-session tracking.
> 
> include/uapi/drm/qda_accel.h
>   Adds DRM_IOCTL_QDA_REMOTE_INVOKE (command 0x07; command numbers
>   0x03–0x06 are reserved) and the associated drm_qda_invoke_args
>   and drm_qda_fastrpc_invoke_args structures.
> 
> Assisted-by: Claude:claude-4-6-sonnet
> Signed-off-by: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> ---
>  drivers/accel/qda/Makefile      |   1 +
>  drivers/accel/qda/qda_drv.c     |  17 ++
>  drivers/accel/qda/qda_drv.h     |   8 +
>  drivers/accel/qda/qda_fastrpc.c | 597 ++++++++++++++++++++++++++++++++++++++++
>  drivers/accel/qda/qda_fastrpc.h | 271 ++++++++++++++++++
>  drivers/accel/qda/qda_ioctl.c   | 104 +++++++
>  drivers/accel/qda/qda_ioctl.h   |   1 +
>  drivers/accel/qda/qda_rpmsg.c   | 136 ++++++++-
>  drivers/accel/qda/qda_rpmsg.h   |  17 ++
>  include/uapi/drm/qda_accel.h    |  39 +++
>  10 files changed, 1189 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/accel/qda/Makefile b/drivers/accel/qda/Makefile
> index fb092e56d7f3..2d10420cd1ec 100644
> --- a/drivers/accel/qda/Makefile
> +++ b/drivers/accel/qda/Makefile
> @@ -8,6 +8,7 @@ obj-$(CONFIG_DRM_ACCEL_QDA)	:= qda.o
>  qda-y := \
>  	qda_cb.o \
>  	qda_drv.o \
> +	qda_fastrpc.o \
>  	qda_gem.o \
>  	qda_ioctl.o \
>  	qda_memory_dma.o \
> diff --git a/drivers/accel/qda/qda_drv.c b/drivers/accel/qda/qda_drv.c
> index ef8bd573b836..704c7d3127d2 100644
> --- a/drivers/accel/qda/qda_drv.c
> +++ b/drivers/accel/qda/qda_drv.c
> @@ -26,6 +26,8 @@ static int qda_open(struct drm_device *dev, struct drm_file *file)
>  
>  	qda_file_priv->pid = current->pid;
>  	qda_file_priv->qda_dev = qda_dev_from_drm(dev);
> +	qda_file_priv->remote_session_id =
> +		atomic_inc_return(&qda_file_priv->qda_dev->remote_session_id_counter);
>  	file->driver_priv = qda_file_priv;
>  
>  	return 0;
> @@ -57,6 +59,7 @@ static const struct drm_ioctl_desc qda_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(QDA_QUERY, qda_ioctl_query, 0),
>  	DRM_IOCTL_DEF_DRV(QDA_GEM_CREATE, qda_ioctl_gem_create, 0),
>  	DRM_IOCTL_DEF_DRV(QDA_GEM_MMAP_OFFSET, qda_ioctl_gem_mmap_offset, 0),
> +	DRM_IOCTL_DEF_DRV(QDA_REMOTE_INVOKE, qda_ioctl_invoke, 0),
>  };
>  
>  static const struct drm_driver qda_drm_driver = {
> @@ -93,6 +96,17 @@ static void cleanup_memory_manager(struct qda_dev *qdev)
>  	}
>  }
>  
> +static void cleanup_device_resources(struct qda_dev *qdev)
> +{
> +	xa_destroy(&qdev->ctx_xa);
> +}
> +
> +static void init_device_resources(struct qda_dev *qdev)
> +{
> +	atomic_set(&qdev->remote_session_id_counter, 0);
> +	xa_init_flags(&qdev->ctx_xa, XA_FLAGS_ALLOC1);
> +}
> +
>  static int init_memory_manager(struct qda_dev *qdev)
>  {
>  	qdev->iommu_mgr = kzalloc_obj(*qdev->iommu_mgr);
> @@ -106,6 +120,7 @@ void qda_deinit_device(struct qda_dev *qdev)
>  {
>  	mutex_destroy(&qdev->import_lock);
>  	cleanup_memory_manager(qdev);
> +	cleanup_device_resources(qdev);
>  }
>  
>  int qda_init_device(struct qda_dev *qdev)
> @@ -114,10 +129,12 @@ int qda_init_device(struct qda_dev *qdev)
>  
>  	mutex_init(&qdev->import_lock);
>  	qdev->current_import_file_priv = NULL;
> +	init_device_resources(qdev);
>  
>  	ret = init_memory_manager(qdev);
>  	if (ret) {
>  		drm_err(&qdev->drm_dev, "Failed to initialize memory manager: %d\n", ret);
> +		cleanup_device_resources(qdev);
>  		mutex_destroy(&qdev->import_lock);
>  	}
>  
> diff --git a/drivers/accel/qda/qda_drv.h b/drivers/accel/qda/qda_drv.h
> index 96ce4135e2d9..420cccff42bf 100644
> --- a/drivers/accel/qda/qda_drv.h
> +++ b/drivers/accel/qda/qda_drv.h
> @@ -6,10 +6,12 @@
>  #ifndef __QDA_DRV_H__
>  #define __QDA_DRV_H__
>  
> +#include <linux/atomic.h>
>  #include <linux/device.h>
>  #include <linux/list.h>
>  #include <linux/rpmsg.h>
>  #include <linux/types.h>
> +#include <linux/xarray.h>
>  #include <drm/drm_device.h>
>  #include <drm/drm_drv.h>
>  #include <drm/drm_file.h>
> @@ -28,6 +30,8 @@ struct qda_file_priv {
>  	struct qda_iommu_device *assigned_iommu_dev;
>  	/** @pid: Process ID for tracking */
>  	pid_t pid;
> +	/** @remote_session_id: Unique session identifier */
> +	u32 remote_session_id;
>  };
>  
>  /**
> @@ -51,8 +55,12 @@ struct qda_dev {
>  	struct mutex import_lock;
>  	/** @current_import_file_priv: Current file_priv during prime import */
>  	struct drm_file *current_import_file_priv;
> +	/** @ctx_xa: XArray for FastRPC context management */
> +	struct xarray ctx_xa;
>  	/** @dsp_name: Name of the DSP domain (e.g. "cdsp", "adsp") */
>  	const char *dsp_name;
> +	/** @remote_session_id_counter: Atomic counter for unique session IDs */
> +	atomic_t remote_session_id_counter;
>  };
>  
>  /**
> diff --git a/drivers/accel/qda/qda_fastrpc.c b/drivers/accel/qda/qda_fastrpc.c
> new file mode 100644
> index 000000000000..0ec37175a098
> --- /dev/null
> +++ b/drivers/accel/qda/qda_fastrpc.c
> @@ -0,0 +1,597 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/sort.h>
> +#include <linux/completion.h>
> +#include <linux/dma-buf.h>
> +#include <drm/drm_gem.h>
> +#include "qda_fastrpc.h"
> +#include "qda_drv.h"
> +#include "qda_gem.h"
> +#include "qda_memory_manager.h"
> +#include "qda_prime.h"
> +
> +/**
> + * get_gem_obj_from_dmabuf_fd() - Import a DMA-BUF fd and return the GEM object
> + * @ctx:       FastRPC invocation context
> + * @dmabuf_fd: DMA-BUF file descriptor supplied by user space
> + * @gem_obj:   Output GEM object (caller must call drm_gem_object_put() when done)
> + *
> + * Imports the DMA-BUF fd into the QDA device via qda_prime_fd_to_handle()
> + * (which performs IOMMU device assignment for newly imported buffers) and
> + * then looks up the resulting GEM object.  The caller is responsible for
> + * calling drm_gem_object_put() on the returned object.
> + *
> + * Return: 0 on success, negative error code on failure
> + */
> +static int get_gem_obj_from_dmabuf_fd(struct fastrpc_invoke_context *ctx,
> +				      int dmabuf_fd,
> +				      struct drm_gem_object **gem_obj)
> +{
> +	struct drm_device *dev = ctx->file_priv->minor->dev;
> +	u32 handle;
> +	int ret;
> +
> +	ret = qda_prime_fd_to_handle(dev, ctx->file_priv, dmabuf_fd, &handle);
> +	if (ret)
> +		return ret;
> +
> +	*gem_obj = drm_gem_object_lookup(ctx->file_priv, handle);
> +	if (!*gem_obj)
> +		return -ENOENT;
> +
> +	return 0;
> +}
> +
> +static void setup_pages_from_gem_obj(struct qda_gem_obj *qda_gem_obj,
> +				     struct fastrpc_phy_page *pages)
> +{
> +	pages->addr = qda_gem_obj->dma_addr;
> +	pages->size = qda_gem_obj->size;
> +}
> +
> +static u64 calculate_vma_offset(u64 user_ptr)
> +{
> +	struct vm_area_struct *vma;
> +	u64 user_ptr_page_mask = user_ptr & PAGE_MASK;
> +	u64 vma_offset = 0;
> +
> +	mmap_read_lock(current->mm);
> +	vma = find_vma(current->mm, user_ptr);
> +	if (vma)
> +		vma_offset = user_ptr_page_mask - vma->vm_start;
> +	mmap_read_unlock(current->mm);
> +
> +	return vma_offset;
> +}
> +
> +static u64 calculate_page_aligned_size(u64 ptr, u64 len)
> +{
> +	u64 pg_start = (ptr & PAGE_MASK) >> PAGE_SHIFT;
> +	u64 pg_end = ((ptr + len - 1) & PAGE_MASK) >> PAGE_SHIFT;
> +	u64 aligned_size = (pg_end - pg_start + 1) * PAGE_SIZE;
> +
> +	return aligned_size;
> +}
> +
> +static struct fastrpc_invoke_buf *fastrpc_invoke_buf_start(union fastrpc_remote_arg *pra, int len)
> +{
> +	return (struct fastrpc_invoke_buf *)(&pra[len]);
> +}
> +
> +static struct fastrpc_phy_page *fastrpc_phy_page_start(struct fastrpc_invoke_buf *buf, int len)
> +{
> +	return (struct fastrpc_phy_page *)(&buf[len]);
> +}
> +
> +static int fastrpc_get_meta_size(struct fastrpc_invoke_context *ctx)
> +{
> +	int size = 0;
> +
> +	size = (sizeof(struct fastrpc_remote_buf) +
> +		sizeof(struct fastrpc_invoke_buf) +
> +		sizeof(struct fastrpc_phy_page)) * ctx->nscalars +
> +		sizeof(u64) * FASTRPC_MAX_FDLIST +
> +		sizeof(u32) * FASTRPC_MAX_CRCLIST;
> +
> +	return size;
> +}
> +
> +static u64 fastrpc_get_payload_size(struct fastrpc_invoke_context *ctx, int metalen)
> +{
> +	u64 size = 0;
> +	int oix;
> +
> +	size = ALIGN(metalen, FASTRPC_ALIGN);
> +
> +	for (oix = 0; oix < ctx->nbufs; oix++) {
> +		int i = ctx->olaps[oix].raix;
> +
> +		if (ctx->args[i].fd == 0 || ctx->args[i].fd == -1) {
> +			if (ctx->olaps[oix].offset == 0)
> +				size = ALIGN(size, FASTRPC_ALIGN);
> +
> +			size += (ctx->olaps[oix].mend - ctx->olaps[oix].mstart);
> +		}
> +	}
> +
> +	return size;
> +}
> +
> +/**
> + * qda_fastrpc_context_free() - Free an invocation context
> + * @ref: Reference counter embedded in the context
> + *
> + * Called when the reference count reaches zero; releases all resources
> + * associated with the invocation context.
> + */
> +void qda_fastrpc_context_free(struct kref *ref)
> +{
> +	struct fastrpc_invoke_context *ctx;
> +	int i;
> +
> +	ctx = container_of(ref, struct fastrpc_invoke_context, refcount);
> +	if (ctx->gem_objs) {
> +		for (i = 0; i < ctx->nscalars; ++i) {
> +			if (ctx->gem_objs[i])
> +				drm_gem_object_put(ctx->gem_objs[i]);
> +		}
> +		kfree(ctx->gem_objs);
> +	}
> +
> +	if (ctx->msg_gem_obj)
> +		drm_gem_object_put(&ctx->msg_gem_obj->base);
> +
> +	kfree(ctx->olaps);
> +
> +	kfree(ctx->args);
> +	kfree(ctx->req);
> +	kfree(ctx->rsp);
> +	kfree(ctx->input_pages);
> +	kfree(ctx->inbuf);
> +
> +	kfree(ctx);
> +}
> +
> +#define CMP(aa, bb) ((aa) == (bb) ? 0 : (aa) < (bb) ? -1 : 1)
> +
> +static int olaps_cmp(const void *a, const void *b)
> +{
> +	struct fastrpc_buf_overlap *pa = (struct fastrpc_buf_overlap *)a;
> +	struct fastrpc_buf_overlap *pb = (struct fastrpc_buf_overlap *)b;
> +	/* sort with lowest starting buffer first */
> +	int st = CMP(pa->start, pb->start);
> +	/* sort with highest ending buffer first */
> +	int ed = CMP(pb->end, pa->end);
> +
> +	return st == 0 ? ed : st;
> +}
> +
> +static void fastrpc_get_buff_overlaps(struct fastrpc_invoke_context *ctx)
> +{
> +	u64 max_end = 0;
> +	int i;
> +
> +	for (i = 0; i < ctx->nbufs; ++i) {
> +		ctx->olaps[i].start = ctx->args[i].ptr;
> +		ctx->olaps[i].end = ctx->olaps[i].start + ctx->args[i].length;
> +		ctx->olaps[i].raix = i;
> +	}
> +
> +	sort(ctx->olaps, ctx->nbufs, sizeof(*ctx->olaps), olaps_cmp, NULL);
> +
> +	for (i = 0; i < ctx->nbufs; ++i) {
> +		if (ctx->olaps[i].start < max_end) {
> +			ctx->olaps[i].mstart = max_end;
> +			ctx->olaps[i].mend = ctx->olaps[i].end;
> +			ctx->olaps[i].offset = max_end - ctx->olaps[i].start;
> +
> +			if (ctx->olaps[i].end > max_end) {
> +				max_end = ctx->olaps[i].end;
> +			} else {
> +				ctx->olaps[i].mend = 0;
> +				ctx->olaps[i].mstart = 0;
> +			}
> +		} else {
> +			ctx->olaps[i].mend = ctx->olaps[i].end;
> +			ctx->olaps[i].mstart = ctx->olaps[i].start;
> +			ctx->olaps[i].offset = 0;
> +			max_end = ctx->olaps[i].end;
> +		}
> +	}
> +}
> +
> +/**
> + * qda_fastrpc_context_alloc() - Allocate a new FastRPC invocation context
> + *
> + * Return: Pointer to allocated context, or ERR_PTR on failure
> + */
> +struct fastrpc_invoke_context *qda_fastrpc_context_alloc(void)
> +{
> +	struct fastrpc_invoke_context *ctx = NULL;
> +
> +	ctx = kzalloc_obj(*ctx);
> +	if (!ctx)
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&ctx->node);
> +
> +	ctx->retval = -1;
> +	ctx->pid = current->pid;
> +	init_completion(&ctx->work);
> +	ctx->msg_gem_obj = NULL;
> +	kref_init(&ctx->refcount);
> +
> +	return ctx;
> +}
> +
> +/*
> + * process_fd_buffer() - Handle an in/out buffer argument backed by a DMA-BUF fd
> + *
> + * args[i].fd is a DMA-BUF fd.  We import it to obtain the GEM object and its
> + * IOMMU-mapped dma_addr for the physical page descriptor.  The DSP uses the
> + * physical address directly for this buffer type; the fd is not forwarded.
> + */
> +static int process_fd_buffer(struct fastrpc_invoke_context *ctx, int i,
> +			     union fastrpc_remote_arg *rpra, struct fastrpc_phy_page *pages)
> +{
> +	struct drm_gem_object *gem_obj;
> +	struct qda_gem_obj *qda_gem_obj;
> +	int err;
> +	u64 len = ctx->args[i].length;
> +	u64 vma_offset;
> +
> +	err = get_gem_obj_from_dmabuf_fd(ctx, ctx->args[i].fd, &gem_obj);
> +	if (err)
> +		return err;
> +
> +	ctx->gem_objs[i] = gem_obj;
> +	qda_gem_obj = to_qda_gem_obj(gem_obj);
> +
> +	rpra[i].buf.pv = (u64)ctx->args[i].ptr;
> +
> +	pages[i].addr = qda_gem_obj->dma_addr;
> +
> +	vma_offset = calculate_vma_offset(ctx->args[i].ptr);
> +	pages[i].addr += vma_offset;
> +	pages[i].size = calculate_page_aligned_size(ctx->args[i].ptr, len);
> +
> +	return 0;
> +}
> +
> +static int process_direct_buffer(struct fastrpc_invoke_context *ctx, int i, int oix,
> +				 union fastrpc_remote_arg *rpra, struct fastrpc_phy_page *pages,
> +				 uintptr_t *args, u64 *rlen, u64 pkt_size)
> +{
> +	int mlen;
> +	u64 len = ctx->args[i].length;
> +	int inbufs = ctx->inbufs;
> +
> +	if (ctx->olaps[oix].offset == 0) {
> +		*rlen -= ALIGN(*args, FASTRPC_ALIGN) - *args;
> +		*args = ALIGN(*args, FASTRPC_ALIGN);
> +	}
> +
> +	mlen = ctx->olaps[oix].mend - ctx->olaps[oix].mstart;
> +
> +	if (*rlen < mlen)
> +		return -ENOSPC;
> +
> +	rpra[i].buf.pv = *args - ctx->olaps[oix].offset;
> +
> +	pages[i].addr = ctx->msg->phys - ctx->olaps[oix].offset + (pkt_size - *rlen);
> +	pages[i].addr = pages[i].addr & PAGE_MASK;
> +	pages[i].size = calculate_page_aligned_size(rpra[i].buf.pv, len);
> +
> +	*args = *args + mlen;
> +	*rlen -= mlen;
> +
> +	if (i < inbufs) {
> +		void *dst = (void *)(uintptr_t)rpra[i].buf.pv;
> +		void *src = (void *)(uintptr_t)ctx->args[i].ptr;
> +
> +		/*
> +		 * For user-space invocations (INVOKE_DYNAMIC), ptr is a user
> +		 * virtual address and must be copied safely. For all other
> +		 * (kernel-internal) invocations, ptr is a kernel address set
> +		 * by the driver itself and can be copied directly.
> +		 */
> +		if (ctx->type == FASTRPC_RMID_INVOKE_DYNAMIC) {
> +			if (copy_from_user(dst, (void __user *)src, len))
> +				return -EFAULT;
> +		} else {
> +			memcpy(dst, src, len);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * process_dma_handle() - Handle a DMA-handle scalar argument
> + *
> + * args[i].fd is a DMA-BUF fd.  We import it to get the physical page
> + * descriptor for the kernel, but forward the original DMA-BUF fd to the
> + * DSP in rpra[i].dma.fd so the DSP can identify the buffer by its fd.
> + */
> +static int process_dma_handle(struct fastrpc_invoke_context *ctx, int i,
> +			      union fastrpc_remote_arg *rpra, struct fastrpc_phy_page *pages)
> +{
> +	if (ctx->args[i].fd > 0) {
> +		struct drm_gem_object *gem_obj;
> +		struct qda_gem_obj *qda_gem_obj;
> +		int err;
> +
> +		err = get_gem_obj_from_dmabuf_fd(ctx, ctx->args[i].fd, &gem_obj);
> +		if (err)
> +			return err;
> +
> +		ctx->gem_objs[i] = gem_obj;
> +		qda_gem_obj = to_qda_gem_obj(gem_obj);
> +
> +		setup_pages_from_gem_obj(qda_gem_obj, &pages[i]);
> +
> +		/* Forward the original DMA-BUF fd to the DSP */
> +		rpra[i].dma.fd     = ctx->args[i].fd;
> +		rpra[i].dma.len    = ctx->args[i].length;
> +		rpra[i].dma.offset = (u64)ctx->args[i].ptr;
> +	} else {
> +		rpra[i].buf.pv  = ctx->args[i].ptr;
> +		rpra[i].buf.len = ctx->args[i].length;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * qda_fastrpc_get_header_size() - Compute the FastRPC message header size
> + * @ctx: FastRPC invocation context
> + * @out_size: Pointer to store the aligned packet size in bytes
> + *
> + * Return: 0 on success, negative error code on failure
> + */
> +int qda_fastrpc_get_header_size(struct fastrpc_invoke_context *ctx, size_t *out_size)
> +{
> +	ctx->inbufs = REMOTE_SCALARS_INBUFS(ctx->sc);
> +	ctx->metalen = fastrpc_get_meta_size(ctx);
> +	ctx->pkt_size = fastrpc_get_payload_size(ctx, ctx->metalen);
> +
> +	ctx->aligned_pkt_size = PAGE_ALIGN(ctx->pkt_size);
> +	if (ctx->aligned_pkt_size == 0)
> +		return -EINVAL;
> +
> +	*out_size = ctx->aligned_pkt_size;
> +	return 0;
> +}
> +
> +static int fastrpc_get_args(struct fastrpc_invoke_context *ctx)
> +{
> +	union fastrpc_remote_arg *rpra;
> +	struct fastrpc_invoke_buf *list;
> +	struct fastrpc_phy_page *pages;
> +	int i, oix, err = 0;
> +	u64 rlen;
> +	uintptr_t args;
> +	size_t hdr_size;
> +
> +	ctx->inbufs = REMOTE_SCALARS_INBUFS(ctx->sc);
> +	err = qda_fastrpc_get_header_size(ctx, &hdr_size);
> +	if (err)
> +		return err;
> +
> +	ctx->msg->buf = ctx->msg_gem_obj->virt;
> +	ctx->msg->phys = ctx->msg_gem_obj->dma_addr;
> +
> +	memset(ctx->msg->buf, 0, ctx->aligned_pkt_size);
> +
> +	rpra = (union fastrpc_remote_arg *)ctx->msg->buf;
> +	ctx->list = fastrpc_invoke_buf_start(rpra, ctx->nscalars);
> +	ctx->pages = fastrpc_phy_page_start(ctx->list, ctx->nscalars);
> +	list = ctx->list;
> +	pages = ctx->pages;
> +	args = (uintptr_t)ctx->msg->buf + ctx->metalen;
> +	rlen = ctx->pkt_size - ctx->metalen;
> +	ctx->rpra = rpra;
> +
> +	for (oix = 0; oix < ctx->nbufs; ++oix) {
> +		i = ctx->olaps[oix].raix;
> +
> +		rpra[i].buf.pv = 0;
> +		rpra[i].buf.len = ctx->args[i].length;
> +		list[i].num = ctx->args[i].length ? 1 : 0;
> +		list[i].pgidx = i;
> +
> +		if (!ctx->args[i].length)
> +			continue;
> +
> +		if (ctx->args[i].fd > 0)
> +			err = process_fd_buffer(ctx, i, rpra, pages);
> +		else
> +			err = process_direct_buffer(ctx, i, oix, rpra, pages, &args, &rlen,
> +						    ctx->pkt_size);
> +
> +		if (err)
> +			goto bail_gem;
> +	}
> +
> +	for (i = ctx->nbufs; i < ctx->nscalars; ++i) {
> +		list[i].num = ctx->args[i].length ? 1 : 0;
> +		list[i].pgidx = i;
> +
> +		err = process_dma_handle(ctx, i, rpra, pages);
> +		if (err)
> +			goto bail_gem;
> +	}
> +
> +	return 0;
> +
> +bail_gem:
> +	if (ctx->msg_gem_obj) {
> +		drm_gem_object_put(&ctx->msg_gem_obj->base);
> +		ctx->msg_gem_obj = NULL;
> +	}
> +
> +	return err;
> +}
> +
> +static int fastrpc_put_args(struct fastrpc_invoke_context *ctx, struct qda_msg *msg)
> +{
> +	union fastrpc_remote_arg *rpra;
> +	int i, err = 0;
> +
> +	if (!ctx)
> +		return -EINVAL;
> +
> +	rpra = ctx->rpra;
> +	if (!rpra)
> +		return -EINVAL;
> +
> +	for (i = ctx->inbufs; i < ctx->nbufs; ++i) {
> +		if (ctx->args[i].fd <= 0) {
> +			void *src = (void *)(uintptr_t)rpra[i].buf.pv;
> +			void *dst = (void *)(uintptr_t)ctx->args[i].ptr;
> +			u64 len = rpra[i].buf.len;
> +
> +			if (ctx->type == FASTRPC_RMID_INVOKE_DYNAMIC)
> +				err = copy_to_user((void __user *)dst, src, len) ? -EFAULT : 0;
> +			else
> +				memcpy(dst, src, len);
> +			if (err)
> +				break;
> +		}
> +	}
> +
> +	return err;
> +}
> +
> +/**
> + * qda_fastrpc_invoke_pack() - Pack an invocation context into a QDA message
> + * @ctx: FastRPC invocation context
> + * @msg: QDA message structure to pack into
> + *
> + * Return: 0 on success, negative error code on failure
> + */
> +int qda_fastrpc_invoke_pack(struct fastrpc_invoke_context *ctx,
> +			    struct qda_msg *msg)
> +{
> +	int err = 0;
> +
> +	if (ctx->handle == FASTRPC_INIT_HANDLE)
> +		msg->fastrpc.remote_session_id = 0;
> +	else
> +		msg->fastrpc.remote_session_id = ctx->remote_session_id;
> +
> +	ctx->msg = msg;
> +
> +	err = fastrpc_get_args(ctx);
> +	if (err)
> +		return err;
> +
> +	dma_wmb();
> +
> +	msg->fastrpc.tid    = ctx->pid;
> +	msg->fastrpc.ctx    = ctx->ctxid | ctx->pd;
> +	msg->fastrpc.handle = ctx->handle;
> +	msg->fastrpc.sc     = ctx->sc;
> +	msg->fastrpc.addr   = ctx->msg->phys;
> +	msg->fastrpc.size   = roundup(ctx->pkt_size, PAGE_SIZE);
> +	msg->fastrpc_ctx    = ctx;
> +	msg->file_priv      = ctx->file_priv;
> +
> +	return 0;
> +}
> +
> +/**
> + * qda_fastrpc_invoke_unpack() - Unpack a response message into an invocation context
> + * @ctx: FastRPC invocation context
> + * @msg: QDA message structure to unpack from
> + *
> + * Return: 0 on success, negative error code on failure
> + */
> +int qda_fastrpc_invoke_unpack(struct fastrpc_invoke_context *ctx,
> +			      struct qda_msg *msg)
> +{
> +	int err;
> +
> +	dma_rmb();
> +
> +	err = fastrpc_put_args(ctx, msg);
> +	if (err)
> +		return err;
> +
> +	err = ctx->retval;
> +	return err;
> +}
> +
> +static int fastrpc_prepare_args_invoke(struct fastrpc_invoke_context *ctx, char __user *argp)
> +{
> +	struct drm_qda_invoke_args invoke_args;
> +	struct drm_qda_fastrpc_invoke_args *args = NULL;
> +	u32 nscalars;
> +
> +	/* argp is DRM ioctl data (kernel pointer); args pointer within it is user-space */
> +	memcpy(&invoke_args, argp, sizeof(invoke_args));
> +
> +	ctx->handle = invoke_args.handle;
> +	ctx->sc = invoke_args.sc;
> +
> +	nscalars = REMOTE_SCALARS_LENGTH(ctx->sc);
> +	if (!nscalars) {
> +		ctx->args = NULL;
> +		return 0;
> +	}
> +
> +	args = kcalloc(nscalars, sizeof(*args), GFP_KERNEL);
> +	if (!args)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(args, u64_to_user_ptr(invoke_args.args),
> +			   nscalars * sizeof(*args))) {
> +		kfree(args);
> +		return -EFAULT;
> +	}
> +
> +	ctx->args = args;
> +	return 0;
> +}
> +
> +/**
> + * qda_fastrpc_prepare_args() - Prepare arguments for a FastRPC invocation
> + * @ctx: FastRPC invocation context
> + * @argp: User-space pointer to invocation arguments
> + *
> + * Return: 0 on success, negative error code on failure
> + */
> +int qda_fastrpc_prepare_args(struct fastrpc_invoke_context *ctx, char __user *argp)
> +{
> +	int err;
> +
> +	switch (ctx->type) {
> +	case FASTRPC_RMID_INVOKE_DYNAMIC:
> +		err = fastrpc_prepare_args_invoke(ctx, argp);
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +	if (err)
> +		return err;
> +
> +	ctx->nscalars = REMOTE_SCALARS_LENGTH(ctx->sc);
> +	ctx->nbufs = REMOTE_SCALARS_INBUFS(ctx->sc) + REMOTE_SCALARS_OUTBUFS(ctx->sc);
> +
> +	if (ctx->nscalars) {
> +		ctx->gem_objs = kcalloc(ctx->nscalars, sizeof(*ctx->gem_objs), GFP_KERNEL);
> +		if (!ctx->gem_objs)
> +			return -ENOMEM;
> +		ctx->olaps = kcalloc(ctx->nscalars, sizeof(*ctx->olaps), GFP_KERNEL);
> +		if (!ctx->olaps) {
> +			kfree(ctx->gem_objs);
> +			ctx->gem_objs = NULL;
> +			return -ENOMEM;
> +		}
> +		fastrpc_get_buff_overlaps(ctx);
> +	}
> +
> +	return err;
> +}
> diff --git a/drivers/accel/qda/qda_fastrpc.h b/drivers/accel/qda/qda_fastrpc.h
> new file mode 100644
> index 000000000000..ce77baeccfba
> --- /dev/null
> +++ b/drivers/accel/qda/qda_fastrpc.h
> @@ -0,0 +1,271 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> + */
> +
> +#ifndef __QDA_FASTRPC_H__
> +#define __QDA_FASTRPC_H__
> +
> +#include <linux/completion.h>
> +#include <linux/kref.h>
> +#include <linux/list.h>
> +#include <linux/types.h>
> +#include <drm/drm_drv.h>
> +#include <drm/drm_file.h>
> +#include <drm/qda_accel.h>
> +
> +/* Forward declarations */
> +struct qda_gem_obj;
> +
> +/*
> + * FastRPC scalar extraction macros
> + *
> + * These macros extract different fields from the scalar value that describes
> + * the arguments passed in a FastRPC invocation.
> + */
> +#define REMOTE_SCALARS_INBUFS(sc)	(((sc) >> 16) & 0x0ff)
> +#define REMOTE_SCALARS_OUTBUFS(sc)	(((sc) >> 8) & 0x0ff)
> +#define REMOTE_SCALARS_INHANDLES(sc)	(((sc) >> 4) & 0x0f)
> +#define REMOTE_SCALARS_OUTHANDLES(sc)	((sc) & 0x0f)
> +#define REMOTE_SCALARS_LENGTH(sc)	(REMOTE_SCALARS_INBUFS(sc) +   \
> +					 REMOTE_SCALARS_OUTBUFS(sc) +  \
> +					 REMOTE_SCALARS_INHANDLES(sc) + \
> +					 REMOTE_SCALARS_OUTHANDLES(sc))
> +
> +/* FastRPC configuration constants */
> +#define FASTRPC_ALIGN		128		/* Alignment requirement */
> +#define FASTRPC_MAX_FDLIST	16		/* Maximum file descriptors */
> +#define FASTRPC_MAX_CRCLIST	64		/* Maximum CRC list entries */
> +
> +/*
> + * FastRPC scalar construction macros
> + *
> + * These macros build the scalar value that describes the arguments
> + * for a FastRPC invocation.
> + */
> +#define FASTRPC_BUILD_SCALARS(attr, method, in, out, oin, oout)		\
> +				(((attr & 0x07) << 29) |		\
> +				((method & 0x1f) << 24) |		\
> +				((in & 0xff) << 16) |			\
> +				((out & 0xff) <<  8) |			\
> +				((oin & 0x0f) <<  4) |			\
> +				(oout & 0x0f))
> +
> +#define FASTRPC_SCALARS(method, in, out) \
> +		FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0)
> +
> +/**
> + * struct fastrpc_buf_overlap - Buffer overlap tracking structure
> + *
> + * Tracks overlapping buffer regions to optimise memory mapping and avoid
> + * redundant mappings of the same physical memory.

WHat for? Even if this is a valid optimization, implement it as a
subsequent patch. The first goal should be very simple - get GEM buffers
from the app, pass them to the DSP, read the results.

> + */
> +struct fastrpc_buf_overlap {

Stop clashing the names with the existing fastrpc driver.

> +	/** @start: Start address of the buffer in user virtual address space */
> +	u64 start;
> +	/** @end: End address of the buffer in user virtual address space */
> +	u64 end;
> +	/** @raix: Remote argument index associated with this overlap */
> +	int raix;
> +	/** @mstart: Start address of the mapped region */
> +	u64 mstart;
> +	/** @mend: End address of the mapped region */
> +	u64 mend;
> +	/** @offset: Offset within the mapped region */
> +	u64 offset;
> +};
> +
> +/**
> + * struct fastrpc_remote_dmahandle - Remote DMA handle descriptor
> + */
> +struct fastrpc_remote_dmahandle {
> +	/** @fd: DMA-BUF file descriptor */
> +	s32 fd;
> +	/** @offset: Byte offset within the DMA-BUF */
> +	u32 offset;
> +	/** @len: Length of the region in bytes */
> +	u32 len;
> +};
> +
> +/**
> + * struct fastrpc_remote_buf - Remote buffer descriptor
> + */
> +struct fastrpc_remote_buf {
> +	/** @pv: Buffer pointer (user virtual address) */
> +	u64 pv;
> +	/** @len: Length of the buffer in bytes */
> +	u64 len;
> +};
> +
> +/**
> + * union fastrpc_remote_arg - Remote argument (buffer or DMA handle)
> + */
> +union fastrpc_remote_arg {
> +	/** @buf: Inline buffer descriptor */
> +	struct fastrpc_remote_buf buf;
> +	/** @dma: DMA-BUF handle descriptor */
> +	struct fastrpc_remote_dmahandle dma;
> +};
> +
> +/**
> + * struct fastrpc_phy_page - Physical page descriptor
> + */
> +struct fastrpc_phy_page {
> +	/** @addr: Physical (IOMMU) address of the page */
> +	u64 addr;
> +	/** @size: Size of the contiguous region in bytes */
> +	u64 size;
> +};
> +
> +/**
> + * struct fastrpc_invoke_buf - Invoke buffer descriptor
> + */
> +struct fastrpc_invoke_buf {
> +	/** @num: Number of contiguous physical regions */
> +	u32 num;
> +	/** @pgidx: Index into the physical page array */
> +	u32 pgidx;
> +};
> +
> +/**
> + * struct fastrpc_msg - FastRPC wire message for remote invocations
> + *
> + * Sent to the remote processor via RPMsg. This is the exact layout
> + * the DSP expects; do not reorder or add fields without DSP firmware
> + * coordination.
> + */
> +struct fastrpc_msg {
> +	/** @remote_session_id: Session identifier on the remote processor */
> +	int remote_session_id;
> +	/** @tid: Thread ID of the invoking thread */
> +	int tid;
> +	/** @ctx: Context identifier for matching request/response */
> +	u64 ctx;
> +	/** @handle: Handle of the remote method to invoke */
> +	u32 handle;
> +	/** @sc: Scalars value encoding in/out buffer counts */
> +	u32 sc;
> +	/** @addr: Physical address of the message payload buffer */
> +	u64 addr;
> +	/** @size: Size of the message payload in bytes */
> +	u64 size;
> +};
> +
> +/**
> + * struct qda_msg - FastRPC message with kernel-internal bookkeeping
> + *
> + * The wire-format portion is kept in the embedded @fastrpc member (must
> + * be first) so that &qda_msg->fastrpc can be passed directly to
> + * rpmsg_send() without a copy.
> + */
> +struct qda_msg {
> +	/**
> +	 * @fastrpc: Wire-format message sent to the DSP via RPMsg.
> +	 * Must be the first member.
> +	 */
> +	struct fastrpc_msg fastrpc;
> +	/** @buf: Kernel virtual address of the payload buffer */
> +	void *buf;
> +	/** @phys: Physical/DMA address of the payload buffer */
> +	u64 phys;
> +	/** @ret: Return value from the remote processor */
> +	int ret;
> +	/** @fastrpc_ctx: Back-pointer to the owning invocation context */
> +	struct fastrpc_invoke_context *fastrpc_ctx;
> +	/** @file_priv: DRM file private data for GEM object lookup */
> +	struct drm_file *file_priv;
> +};
> +
> +/**
> + * struct fastrpc_invoke_context - Remote procedure call invocation context
> + *
> + * Maintains all state for a single remote procedure call, including buffer
> + * management, synchronisation, and result handling.
> + */
> +struct fastrpc_invoke_context {
> +	/** @node: List node for linking contexts in a queue */
> +	struct list_head node;
> +	/** @ctxid: Unique context identifier (XArray key shifted left by 4) */
> +	u64 ctxid;
> +	/** @inbufs: Number of input buffers */
> +	int inbufs;
> +	/** @outbufs: Number of output buffers */
> +	int outbufs;
> +	/** @handles: Number of DMA-BUF handle arguments */
> +	int handles;
> +	/** @nscalars: Total number of scalar arguments */
> +	int nscalars;
> +	/** @nbufs: Total number of buffer arguments (inbufs + outbufs) */
> +	int nbufs;

If it is inbufs + outbufs, why do you need it here?

> +	/** @pid: Process ID of the calling process */
> +	int pid;
> +	/** @retval: Return value from the remote invocation */
> +	int retval;
> +	/** @metalen: Length of the FastRPC metadata header in bytes */
> +	int metalen;

size_t, also why do you need it?

> +	/** @remote_session_id: Session identifier on the remote processor */
> +	int remote_session_id;
> +	/** @pd: Protection domain identifier encoded into the context ID */
> +	int pd;
> +	/** @type: Invocation type (e.g. FASTRPC_RMID_INVOKE_DYNAMIC) */
> +	int type;
> +	/** @sc: Scalars value encoding in/out buffer counts */
> +	u32 sc;

How is this different from the counts above?

> +	/** @handle: Handle of the remote method being invoked */
> +	u32 handle;
> +	/** @crc: Pointer to CRC values for data integrity checking */
> +	u32 *crc;

Add it later. It's unused. Drop all unused fields.

> +	/** @fdlist: Pointer to array of DMA-BUF file descriptors */
> +	u64 *fdlist;

Why do you need DMA-BUFs in the invocation context? They all should be
GEM buffers.

> +	/** @pkt_size: Total payload size in bytes */
> +	u64 pkt_size;
> +	/** @aligned_pkt_size: Page-aligned payload size for GEM allocation */
> +	u64 aligned_pkt_size;
> +	/** @list: Array of invoke buffer descriptors */
> +	struct fastrpc_invoke_buf *list;
> +	/** @pages: Array of physical page descriptors for all arguments */
> +	struct fastrpc_phy_page *pages;
> +	/** @input_pages: Array of physical page descriptors for input buffers */
> +	struct fastrpc_phy_page *input_pages;

I think you are trying to bring all the complexity from the old driver
with no added benefit. Please don't. Use the existing memory manager.
Let it handle all the gory details. If someting is not there, we should
consider extending GEM instead.

> +	/** @work: Completion used to synchronise with the DSP response */
> +	struct completion work;
> +	/** @msg: Pointer to the QDA message structure for this invocation */
> +	struct qda_msg *msg;
> +	/** @rpra: Array of remote procedure arguments */
> +	union fastrpc_remote_arg *rpra;
> +	/** @gem_objs: Array of GEM objects imported for argument buffers */
> +	struct drm_gem_object **gem_objs;
> +	/** @args: User-space invoke argument descriptors */
> +	struct drm_qda_fastrpc_invoke_args *args;
> +	/** @olaps: Array of buffer overlap descriptors for deduplication */
> +	struct fastrpc_buf_overlap *olaps;
> +	/** @refcount: Reference counter for context lifetime management */
> +	struct kref refcount;
> +	/** @msg_gem_obj: GEM object backing the message payload buffer */
> +	struct qda_gem_obj *msg_gem_obj;
> +	/** @file_priv: DRM file private data */
> +	struct drm_file *file_priv;
> +	/** @init_mem_gem_obj: GEM object for protection domain init memory */
> +	struct qda_gem_obj *init_mem_gem_obj;
> +	/** @req: Pointer to kernel-internal request buffer */
> +	void *req;
> +	/** @rsp: Pointer to kernel-internal response buffer */
> +	void *rsp;
> +	/** @inbuf: Pointer to kernel-internal input buffer */
> +	void *inbuf;
> +};
> +
> +/* Remote Method ID table - identifies initialization and control operations */
> +#define FASTRPC_RMID_INVOKE_DYNAMIC	0xFFFFFFFF	/* Dynamic method invocation */
> +
> +/* Common handle for initialization operations */
> +#define FASTRPC_INIT_HANDLE		0x1
> +
> +void qda_fastrpc_context_free(struct kref *ref);
> +struct fastrpc_invoke_context *qda_fastrpc_context_alloc(void);
> +int qda_fastrpc_prepare_args(struct fastrpc_invoke_context *ctx, char __user *argp);
> +int qda_fastrpc_get_header_size(struct fastrpc_invoke_context *ctx, size_t *out_size);
> +int qda_fastrpc_invoke_pack(struct fastrpc_invoke_context *ctx, struct qda_msg *msg);
> +int qda_fastrpc_invoke_unpack(struct fastrpc_invoke_context *ctx, struct qda_msg *msg);
> +
> +#endif /* __QDA_FASTRPC_H__ */
> diff --git a/drivers/accel/qda/qda_ioctl.c b/drivers/accel/qda/qda_ioctl.c
> index 1769c85a3e98..c81268c20b04 100644
> --- a/drivers/accel/qda/qda_ioctl.c
> +++ b/drivers/accel/qda/qda_ioctl.c
> @@ -3,8 +3,10 @@
>  #include <drm/drm_ioctl.h>
>  #include <drm/qda_accel.h>
>  #include "qda_drv.h"
> +#include "qda_fastrpc.h"
>  #include "qda_gem.h"
>  #include "qda_ioctl.h"
> +#include "qda_rpmsg.h"
>  
>  /**
>   * qda_ioctl_query() - Query DSP device information
> @@ -74,3 +76,105 @@ int qda_ioctl_gem_mmap_offset(struct drm_device *dev, void *data, struct drm_fil
>  
>  	return drm_gem_dumb_map_offset(file_priv, dev, args->handle, &args->offset);
>  }
> +
> +static int fastrpc_context_get_id(struct fastrpc_invoke_context *ctx, struct qda_dev *qdev)
> +{
> +	int ret;
> +	u32 id;
> +
> +	if (!qdev)
> +		return -EINVAL;
> +
> +	ret = xa_alloc(&qdev->ctx_xa, &id, ctx, xa_limit_32b, GFP_KERNEL);
> +	if (ret)
> +		return ret;
> +
> +	ctx->ctxid = id << 4;

Why is it being shifted?

> +	return 0;
> +}
> +

-- 
With best wishes
Dmitry

^ permalink raw reply

* Re: [PATCH 13/15] accel/qda: Add DSP process creation and release
From: Dmitry Baryshkov @ 2026-05-20 14:00 UTC (permalink / raw)
  To: ekansh.gupta
  Cc: Oded Gabbay, Jonathan Corbet, Shuah Khan, Joerg Roedel,
	Will Deacon, Robin Murphy, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, Sumit Semwal,
	Christian König, Bharath Kumar, Chenna Kesava Raju, srini,
	andersson, konradybcio, robin.clark, linux-kernel, dri-devel,
	linux-doc, linux-arm-msm, iommu, linux-media, linaro-mm-sig
In-Reply-To: <20260519-qda-series-v1-13-b2d984c297f8@oss.qualcomm.com>

On Tue, May 19, 2026 at 11:46:03AM +0530, Ekansh Gupta via B4 Relay wrote:
> From: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> 
> Implement the REMOTE_SESSION_CREATE and INIT_RELEASE FastRPC
> operations, which establish and tear down a user process on the
> DSP.
> 
> DRM_IOCTL_QDA_REMOTE_SESSION_CREATE (drm_qda_init_create)
>   Creates a new process on the DSP by sending an INIT_CREATE message
>   via the FastRPC INIT_HANDLE. The caller provides an ELF file (via
>   DMA-BUF fd or direct pointer) and optional process attributes. A
>   4 MB GEM buffer is allocated per session to hold the DSP process
>   image; this buffer is stored in qda_file_priv and reused for the
>   lifetime of the session.
> 
>   If attrs is non-zero, INIT_CREATE_ATTR is used instead of
>   INIT_CREATE to pass the extended attribute and signature fields.

What is the difference?

> 
> INIT_RELEASE
>   Sends a release message to the DSP when the DRM file is closed
>   (qda_postclose via qda_release_dsp_process), freeing the remote
>   process and its resources. The release is skipped if the device
>   has already been unplugged.
> 
> qda_fastrpc.c
>   fastrpc_prepare_args_init_create() marshals the six-argument
>   create-process payload: the inbuf descriptor, process name,
>   ELF file, physical pages, attrs, and siglen.
>   fastrpc_prepare_args_release_process() marshals the single-
>   argument release payload (remote_session_id).
> 
> qda_drv.c
>   qda_postclose() is extended to call qda_release_dsp_process()
>   under drm_dev_enter() so the release message is only sent while
>   the device is still accessible.
> 
> Assisted-by: Claude:claude-4-6-sonnet
> Signed-off-by: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> ---
>  drivers/accel/qda/qda_drv.c     |   8 +++
>  drivers/accel/qda/qda_drv.h     |   5 ++
>  drivers/accel/qda/qda_fastrpc.c | 140 ++++++++++++++++++++++++++++++++++++++++
>  drivers/accel/qda/qda_fastrpc.h |  39 +++++++++--
>  drivers/accel/qda/qda_ioctl.c   |  52 +++++++++++++++
>  drivers/accel/qda/qda_ioctl.h   |   1 +
>  include/uapi/drm/qda_accel.h    |  32 ++++++++-
>  7 files changed, 270 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/accel/qda/qda_drv.c b/drivers/accel/qda/qda_drv.c
> index 704c7d3127d2..4eaba9b050c0 100644
> --- a/drivers/accel/qda/qda_drv.c
> +++ b/drivers/accel/qda/qda_drv.c
> @@ -36,6 +36,13 @@ static int qda_open(struct drm_device *dev, struct drm_file *file)
>  static void qda_postclose(struct drm_device *dev, struct drm_file *file)
>  {
>  	struct qda_file_priv *qda_file_priv = file->driver_priv;
> +	int idx;
> +
> +	/* Only send the DSP release message while the device is accessible */
> +	if (drm_dev_enter(dev, &idx)) {
> +		qda_release_dsp_process(qda_file_priv->qda_dev, file);
> +		drm_dev_exit(idx);
> +	}
>  
>  	if (qda_file_priv->assigned_iommu_dev) {
>  		struct qda_iommu_device *iommu_dev = qda_file_priv->assigned_iommu_dev;
> @@ -59,6 +66,7 @@ static const struct drm_ioctl_desc qda_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(QDA_QUERY, qda_ioctl_query, 0),
>  	DRM_IOCTL_DEF_DRV(QDA_GEM_CREATE, qda_ioctl_gem_create, 0),
>  	DRM_IOCTL_DEF_DRV(QDA_GEM_MMAP_OFFSET, qda_ioctl_gem_mmap_offset, 0),
> +	DRM_IOCTL_DEF_DRV(QDA_REMOTE_SESSION_CREATE, qda_ioctl_init_create, 0),

Why is it being added in the middle?

>  	DRM_IOCTL_DEF_DRV(QDA_REMOTE_INVOKE, qda_ioctl_invoke, 0),
>  };
>  
> diff --git a/drivers/accel/qda/qda_drv.h b/drivers/accel/qda/qda_drv.h
> index 420cccff42bf..4b4639961d95 100644
> --- a/drivers/accel/qda/qda_drv.h
> +++ b/drivers/accel/qda/qda_drv.h
> @@ -28,6 +28,8 @@ struct qda_file_priv {
>  	struct qda_dev *qda_dev;
>  	/** @assigned_iommu_dev: IOMMU device assigned to this process */
>  	struct qda_iommu_device *assigned_iommu_dev;
> +	/** @init_mem_gem_obj: GEM object for PD initialization memory */
> +	struct qda_gem_obj *init_mem_gem_obj;
>  	/** @pid: Process ID for tracking */
>  	pid_t pid;
>  	/** @remote_session_id: Unique session identifier */
> @@ -83,4 +85,7 @@ void qda_deinit_device(struct qda_dev *qdev);
>  int qda_register_device(struct qda_dev *qdev);
>  void qda_unregister_device(struct qda_dev *qdev);
>  
> +/* DSP process / protection domain management */
> +int qda_release_dsp_process(struct qda_dev *qdev, struct drm_file *file_priv);
> +
>  #endif /* __QDA_DRV_H__ */
> diff --git a/drivers/accel/qda/qda_fastrpc.c b/drivers/accel/qda/qda_fastrpc.c
> index 0ec37175a098..305915022b91 100644
> --- a/drivers/accel/qda/qda_fastrpc.c
> +++ b/drivers/accel/qda/qda_fastrpc.c
> @@ -524,6 +524,138 @@ int qda_fastrpc_invoke_unpack(struct fastrpc_invoke_context *ctx,
>  	return err;
>  }
>  
> +static void setup_create_process_args(struct drm_qda_fastrpc_invoke_args *args,
> +				      struct fastrpc_create_process_inbuf *inbuf,
> +				      struct drm_qda_init_create *init,
> +				      struct fastrpc_phy_page *pages)
> +{
> +	args[0].ptr = (u64)(uintptr_t)inbuf;
> +	args[0].length = sizeof(*inbuf);
> +	args[0].fd = -1;
> +
> +	args[1].ptr = (u64)(uintptr_t)current->comm;
> +	args[1].length = inbuf->namelen;
> +	args[1].fd = -1;
> +
> +	args[2].ptr = (u64)init->file;
> +	args[2].length = inbuf->filelen;
> +	args[2].fd = init->filefd;	/* DMA-BUF fd forwarded to DSP */
> +
> +	args[3].ptr = (u64)(uintptr_t)pages;
> +	args[3].length = 1 * sizeof(*pages);
> +	args[3].fd = -1;
> +
> +	args[4].ptr = (u64)(uintptr_t)&inbuf->attrs;
> +	args[4].length = sizeof(inbuf->attrs);
> +	args[4].fd = -1;
> +
> +	args[5].ptr = (u64)(uintptr_t)&inbuf->siglen;
> +	args[5].length = sizeof(inbuf->siglen);
> +	args[5].fd = -1;
> +}
> +
> +static void setup_single_arg(struct drm_qda_fastrpc_invoke_args *args, const void *ptr, size_t size)
> +{
> +	args[0].ptr = (u64)(uintptr_t)ptr;
> +	args[0].length = size;
> +	args[0].fd = -1;
> +}
> +
> +static int fastrpc_prepare_args_release_process(struct fastrpc_invoke_context *ctx)
> +{
> +	struct drm_qda_fastrpc_invoke_args *args;
> +
> +	args = kzalloc_obj(*args);
> +	if (!args)
> +		return -ENOMEM;
> +
> +	setup_single_arg(args, &ctx->remote_session_id, sizeof(ctx->remote_session_id));
> +	ctx->sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_RELEASE, 1, 0);
> +	ctx->args = args;
> +	ctx->handle = FASTRPC_INIT_HANDLE;
> +
> +	return 0;
> +}
> +
> +static int fastrpc_prepare_args_init_create(struct fastrpc_invoke_context *ctx,
> +					    char __user *argp)
> +{
> +	struct drm_qda_init_create init;
> +	struct drm_qda_fastrpc_invoke_args *args;
> +	struct fastrpc_create_process_inbuf *inbuf;
> +	int err;
> +	u32 sc;
> +
> +	args = kcalloc(FASTRPC_CREATE_PROCESS_NARGS, sizeof(*args), GFP_KERNEL);
> +	if (!args)
> +		return -ENOMEM;
> +
> +	ctx->input_pages = kcalloc(1, sizeof(*ctx->input_pages), GFP_KERNEL);
> +	if (!ctx->input_pages) {
> +		err = -ENOMEM;
> +		goto err_free_args;
> +	}
> +
> +	ctx->inbuf = kcalloc(1, sizeof(*inbuf), GFP_KERNEL);
> +	if (!ctx->inbuf) {
> +		err = -ENOMEM;
> +		goto err_free_input_pages;
> +	}
> +	inbuf = ctx->inbuf;
> +
> +	memcpy(&init, argp, sizeof(init));
> +
> +	if (init.filelen > FASTRPC_INIT_FILELEN_MAX) {
> +		err = -EINVAL;
> +		goto err_free_inbuf;
> +	}
> +
> +	/*
> +	 * Validate that the DMA-BUF fd is importable.  The fd itself is kept
> +	 * in init.filefd and forwarded to the DSP via setup_create_process_args().
> +	 */
> +	if (init.filelen && init.filefd > 0) {
> +		struct drm_gem_object *file_gem_obj;
> +
> +		err = get_gem_obj_from_dmabuf_fd(ctx, init.filefd, &file_gem_obj);
> +		if (err) {
> +			err = -EINVAL;
> +			goto err_free_inbuf;
> +		}
> +		drm_gem_object_put(file_gem_obj);
> +	}
> +
> +	inbuf->remote_session_id = ctx->remote_session_id;
> +	inbuf->namelen = strlen(current->comm) + 1;
> +	inbuf->filelen = init.filelen;
> +	inbuf->pageslen = 1;
> +	inbuf->attrs = init.attrs;
> +	inbuf->siglen = init.siglen;
> +
> +	setup_pages_from_gem_obj(ctx->init_mem_gem_obj, &ctx->input_pages[0]);
> +
> +	setup_create_process_args(args, inbuf, &init, ctx->input_pages);
> +
> +	sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE, 4, 0);
> +	if (init.attrs)
> +		sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE_ATTR, 4, 0);
> +	ctx->sc = sc;
> +	ctx->args = args;
> +	ctx->handle = FASTRPC_INIT_HANDLE;
> +
> +	return 0;
> +
> +err_free_inbuf:
> +	kfree(ctx->inbuf);
> +	ctx->inbuf = NULL;
> +err_free_input_pages:
> +	kfree(ctx->input_pages);
> +	ctx->input_pages = NULL;
> +err_free_args:
> +	kfree(args);
> +	return err;
> +}
> +
>  static int fastrpc_prepare_args_invoke(struct fastrpc_invoke_context *ctx, char __user *argp)
>  {
>  	struct drm_qda_invoke_args invoke_args;
> @@ -568,6 +700,14 @@ int qda_fastrpc_prepare_args(struct fastrpc_invoke_context *ctx, char __user *ar
>  	int err;
>  
>  	switch (ctx->type) {
> +	case FASTRPC_RMID_INIT_RELEASE:
> +		err = fastrpc_prepare_args_release_process(ctx);
> +		break;
> +	case FASTRPC_RMID_INIT_CREATE:
> +	case FASTRPC_RMID_INIT_CREATE_ATTR:
> +		ctx->pd = QDA_USER_PD;
> +		err = fastrpc_prepare_args_init_create(ctx, argp);
> +		break;
>  	case FASTRPC_RMID_INVOKE_DYNAMIC:
>  		err = fastrpc_prepare_args_invoke(ctx, argp);
>  		break;
> diff --git a/drivers/accel/qda/qda_fastrpc.h b/drivers/accel/qda/qda_fastrpc.h
> index ce77baeccfba..1c1236f9525e 100644
> --- a/drivers/accel/qda/qda_fastrpc.h
> +++ b/drivers/accel/qda/qda_fastrpc.h
> @@ -127,6 +127,27 @@ struct fastrpc_invoke_buf {
>  	u32 pgidx;
>  };
>  
> +/**
> + * struct fastrpc_create_process_inbuf - Input buffer for process creation
> + *
> + * This structure defines the input buffer format for creating a new
> + * process on the remote DSP.
> + */
> +struct fastrpc_create_process_inbuf {
> +	/** @remote_session_id: Client identifier for the session */
> +	int remote_session_id;
> +	/** @namelen: Length of the process name string including NUL terminator */
> +	u32 namelen;
> +	/** @filelen: Length of the ELF shell file in bytes */
> +	u32 filelen;
> +	/** @pageslen: Number of physical page descriptors */
> +	u32 pageslen;
> +	/** @attrs: Process attribute flags */
> +	u32 attrs;
> +	/** @siglen: Length of the signature data in bytes */
> +	u32 siglen;
> +};
> +
>  /**
>   * struct fastrpc_msg - FastRPC wire message for remote invocations
>   *
> @@ -153,10 +174,6 @@ struct fastrpc_msg {
>  
>  /**
>   * struct qda_msg - FastRPC message with kernel-internal bookkeeping
> - *
> - * The wire-format portion is kept in the embedded @fastrpc member (must
> - * be first) so that &qda_msg->fastrpc can be passed directly to
> - * rpmsg_send() without a copy.
>   */
>  struct qda_msg {
>  	/**
> @@ -245,7 +262,7 @@ struct fastrpc_invoke_context {
>  	struct qda_gem_obj *msg_gem_obj;
>  	/** @file_priv: DRM file private data */
>  	struct drm_file *file_priv;
> -	/** @init_mem_gem_obj: GEM object for protection domain init memory */
> +	/** @init_mem_gem_obj: GEM object for PD initialization memory */
>  	struct qda_gem_obj *init_mem_gem_obj;
>  	/** @req: Pointer to kernel-internal request buffer */
>  	void *req;
> @@ -256,11 +273,23 @@ struct fastrpc_invoke_context {
>  };
>  
>  /* Remote Method ID table - identifies initialization and control operations */
> +#define FASTRPC_RMID_INIT_RELEASE	1	/* Release DSP process */
> +#define FASTRPC_RMID_INIT_CREATE	6	/* Create DSP process */
> +#define FASTRPC_RMID_INIT_CREATE_ATTR	7	/* Create DSP process with attributes */
>  #define FASTRPC_RMID_INVOKE_DYNAMIC	0xFFFFFFFF	/* Dynamic method invocation */
>  
>  /* Common handle for initialization operations */
>  #define FASTRPC_INIT_HANDLE		0x1
>  
> +/* Protection Domain (PD) identifiers */
> +#define QDA_ROOT_PD		(0)
> +#define QDA_USER_PD		(1)
> +
> +/* Number of arguments for process creation */
> +#define FASTRPC_CREATE_PROCESS_NARGS	6
> +/* Maximum initialization file size (4 MB) */
> +#define FASTRPC_INIT_FILELEN_MAX	(4 * 1024 * 1024)
> +
>  void qda_fastrpc_context_free(struct kref *ref);
>  struct fastrpc_invoke_context *qda_fastrpc_context_alloc(void);
>  int qda_fastrpc_prepare_args(struct fastrpc_invoke_context *ctx, char __user *argp);
> diff --git a/drivers/accel/qda/qda_ioctl.c b/drivers/accel/qda/qda_ioctl.c
> index c81268c20b04..33f0a798ad13 100644
> --- a/drivers/accel/qda/qda_ioctl.c
> +++ b/drivers/accel/qda/qda_ioctl.c
> @@ -109,6 +109,7 @@ static int fastrpc_invoke(int type, struct drm_device *dev, void *data,
>  	struct drm_gem_object *gem_obj;
>  	int err;
>  	size_t hdr_size;
> +	size_t initmem_size = FASTRPC_INIT_FILELEN_MAX;
>  
>  	ctx = qda_fastrpc_context_alloc();
>  	if (IS_ERR(ctx))
> @@ -124,6 +125,27 @@ static int fastrpc_invoke(int type, struct drm_device *dev, void *data,
>  	ctx->file_priv = file_priv;
>  	ctx->remote_session_id = qda_file_priv->remote_session_id;
>  
> +	if (type == FASTRPC_RMID_INIT_CREATE) {
> +		struct drm_gem_object *initmem_gem_obj;
> +
> +		if (qda_file_priv->init_mem_gem_obj) {

Why is it non-NULL here?

> +			drm_gem_object_put(&qda_file_priv->init_mem_gem_obj->base);
> +			qda_file_priv->init_mem_gem_obj = NULL;
> +		}
> +
> +		initmem_gem_obj = qda_gem_create_object(dev, qdev->iommu_mgr,
> +							initmem_size, file_priv);
> +		if (IS_ERR(initmem_gem_obj)) {
> +			err = PTR_ERR(initmem_gem_obj);
> +			goto err_context_free;
> +		}
> +
> +		ctx->init_mem_gem_obj = to_qda_gem_obj(initmem_gem_obj);
> +		qda_file_priv->init_mem_gem_obj = ctx->init_mem_gem_obj;
> +	} else if (type == FASTRPC_RMID_INIT_RELEASE) {
> +		ctx->init_mem_gem_obj = qda_file_priv->init_mem_gem_obj;
> +	}
> +
>  	err = qda_fastrpc_prepare_args(ctx, (char __user *)data);
>  	if (err)
>  		goto err_context_free;
> @@ -161,11 +183,41 @@ static int fastrpc_invoke(int type, struct drm_device *dev, void *data,
>  	return 0;
>  
>  err_context_free:
> +	if (type == FASTRPC_RMID_INIT_RELEASE && !err && qda_file_priv->init_mem_gem_obj) {
> +		drm_gem_object_put(&qda_file_priv->init_mem_gem_obj->base);
> +		qda_file_priv->init_mem_gem_obj = NULL;
> +	}
> +
>  	fastrpc_context_put_id(ctx, qdev);
>  	kref_put(&ctx->refcount, qda_fastrpc_context_free);
>  	return err;
>  }
>  
> +/**
> + * qda_ioctl_init_create() - Create a DSP process
> + * @dev: DRM device structure
> + * @data: User-space data (struct drm_qda_init_create)
> + * @file_priv: DRM file private data
> + *
> + * Return: 0 on success, negative error code on failure
> + */
> +int qda_ioctl_init_create(struct drm_device *dev, void *data, struct drm_file *file_priv)
> +{
> +	return fastrpc_invoke(FASTRPC_RMID_INIT_CREATE, dev, data, file_priv);

Where is INIT_CREATE_ATTR, which you described earlier?

> +}
> +
> +/**
> + * qda_release_dsp_process() - Release DSP process resources for a file
> + * @qdev: QDA device structure
> + * @file_priv: DRM file private data
> + *
> + * Return: 0 on success, negative error code on failure
> + */
> +int qda_release_dsp_process(struct qda_dev *qdev, struct drm_file *file_priv)
> +{
> +	return fastrpc_invoke(FASTRPC_RMID_INIT_RELEASE, &qdev->drm_dev, NULL, file_priv);
> +}
> +
>  /**
>   * qda_ioctl_invoke() - Perform a dynamic FastRPC method invocation
>   * @dev: DRM device structure
> diff --git a/drivers/accel/qda/qda_ioctl.h b/drivers/accel/qda/qda_ioctl.h
> index 3bb9cfd98370..192565434363 100644
> --- a/drivers/accel/qda/qda_ioctl.h
> +++ b/drivers/accel/qda/qda_ioctl.h
> @@ -9,6 +9,7 @@
>  #include "qda_drv.h"
>  
>  int qda_ioctl_query(struct drm_device *dev, void *data, struct drm_file *file_priv);
> +int qda_ioctl_init_create(struct drm_device *dev, void *data, struct drm_file *file_priv);
>  int qda_ioctl_gem_create(struct drm_device *dev, void *data, struct drm_file *file_priv);
>  int qda_ioctl_gem_mmap_offset(struct drm_device *dev, void *data, struct drm_file *file_priv);
>  int qda_ioctl_invoke(struct drm_device *dev, void *data, struct drm_file *file_priv);
> diff --git a/include/uapi/drm/qda_accel.h b/include/uapi/drm/qda_accel.h
> index 72512213741f..711e2523a570 100644
> --- a/include/uapi/drm/qda_accel.h
> +++ b/include/uapi/drm/qda_accel.h
> @@ -21,8 +21,9 @@ extern "C" {
>  #define DRM_QDA_QUERY		0x00
>  #define DRM_QDA_GEM_CREATE		0x01
>  #define DRM_QDA_GEM_MMAP_OFFSET	0x02
> -/* Command numbers 0x03-0x06 reserved for INIT_ATTACH, INIT_CREATE, MAP, MUNMAP */
> -#define DRM_QDA_REMOTE_INVOKE		0x07
> +/* Command number 0x03 reserved for INIT_ATTACH; 0x05-0x06 reserved for MAP, MUNMAP */
> +#define DRM_QDA_REMOTE_SESSION_CREATE		0x04
> +#define DRM_QDA_REMOTE_INVOKE			0x07
>  
>  /*
>   * QDA IOCTL definitions
> @@ -37,6 +38,9 @@ extern "C" {
>  					  struct drm_qda_gem_create)
>  #define DRM_IOCTL_QDA_GEM_MMAP_OFFSET	DRM_IOWR(DRM_COMMAND_BASE + DRM_QDA_GEM_MMAP_OFFSET, \
>  					  struct drm_qda_gem_mmap_offset)
> +#define DRM_IOCTL_QDA_REMOTE_SESSION_CREATE					\
> +	DRM_IOWR(DRM_COMMAND_BASE + DRM_QDA_REMOTE_SESSION_CREATE,		\
> +		 struct drm_qda_init_create)
>  #define DRM_IOCTL_QDA_REMOTE_INVOKE	DRM_IOWR(DRM_COMMAND_BASE + DRM_QDA_REMOTE_INVOKE, \
>  					  struct drm_qda_invoke_args)
>  
> @@ -99,6 +103,30 @@ struct drm_qda_fastrpc_invoke_args {
>  	__u32 attr;
>  };
>  
> +/**
> + * struct drm_qda_init_create - Accelerator process initialization parameters
> + * @filelen: Length of the ELF file in bytes
> + * @filefd: DMA-BUF file descriptor containing the ELF file
> + * @attrs: Process attributes flags
> + * @siglen: Length of signature data in bytes
> + * @file: Pointer to ELF file data if not using filefd
> + *
> + * This structure is used with DRM_IOCTL_QDA_INIT_CREATE to initialize
> + * a new process on the accelerator. The process code is provided either
> + * via a file descriptor (filefd, typically a GEM object) or a direct
> + * pointer (file). Set file to 0 if using filefd.
> + *
> + * The attrs field contains bit flags for debug mode, privileged execution,
> + * and other process attributes.
> + */
> +struct drm_qda_init_create {
> +	__u32 filelen;
> +	__s32 filefd;
> +	__u32 attrs;
> +	__u32 siglen;
> +	__u64 file;
> +};
> +
>  /**
>   * struct drm_qda_invoke_args - Dynamic FastRPC invocation parameters
>   * @handle: Remote handle to invoke on the DSP
> 
> -- 
> 2.34.1
> 
> 

-- 
With best wishes
Dmitry

^ permalink raw reply

* Re: [PATCH v6 10/43] KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
From: Fuad Tabba @ 2026-05-20 14:00 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-10-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> Introduce base support for KVM_SET_MEMORY_ATTRIBUTES2 in guest_memfd, which
> just updates attributes tracked by guest_memfd.
>
> Validate input fields in general. Guard usage of KVM_SET_MEMORY_ATTRIBUTES2
> by making sure requested attributes are supported for this instance of kvm.
>
> A new KVM_SET_MEMORY_ATTRIBUTES2 is defined to support writes (unlike
> KVM_SET_MEMORY_ATTRIBUTES) in addition to reads so it can provide error
> details to userspace. This will be used in a later patch.
>
> The two ioctls use their corresponding structs with no overlap, but
> backward compatibility is baked in for future support of
> KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2 in the VM
> ioctl.
>
> The process of setting memory attributes is set up such that the later half
> will not fail due to allocation. Any necessary checks are performed before
> the point of no return.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Sean Christoperson <seanjc@google.com>
> Signed-off-by: Sean Christoperson <seanjc@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>
/fuad
> ---
>  include/uapi/linux/kvm.h |  13 ++++++
>  virt/kvm/Kconfig         |   1 +
>  virt/kvm/guest_memfd.c   | 114 +++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c      |  12 +++++
>  4 files changed, 140 insertions(+)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6c8afa2047bf3..e6bbf68a83813 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1648,6 +1648,19 @@ struct kvm_memory_attributes {
>         __u64 flags;
>  };
>
> +#define KVM_SET_MEMORY_ATTRIBUTES2              _IOWR(KVMIO,  0xd2, struct kvm_memory_attributes2)
> +
> +struct kvm_memory_attributes2 {
> +       union {
> +               __u64 address;
> +               __u64 offset;
> +       };
> +       __u64 size;
> +       __u64 attributes;
> +       __u64 flags;
> +       __u64 reserved[12];
> +};
> +
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>
>  #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 3fea89c45cfb4..e371e079e2c50 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -109,6 +109,7 @@ config KVM_VM_MEMORY_ATTRIBUTES
>
>  config KVM_GUEST_MEMFD
>         select XARRAY_MULTI
> +       select KVM_MEMORY_ATTRIBUTES
>         bool
>
>  config HAVE_KVM_ARCH_GMEM_PREPARE
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 4f7c4824c3a45..91e89b188f583 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -540,11 +540,125 @@ unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
>  }
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_memory_attributes);
>
> +/*
> + * Preallocate memory for attributes to be stored on a maple tree, pointed to
> + * by mas.  Adjacent ranges with attributes identical to the new attributes
> + * will be merged.  Also sets mas's bounds up for storing attributes.
> + *
> + * This maintains the invariant that ranges with the same attributes will
> + * always be merged.
> + */
> +static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
> +                                   pgoff_t start, size_t nr_pages)
> +{
> +       pgoff_t end = start + nr_pages;
> +       pgoff_t last = end - 1;
> +       void *entry;
> +
> +       /* Try extending range. entry is NULL on overflow/wrap-around. */
> +       mas_set(mas, end);
> +       entry = mas_find(mas, end);
> +       if (entry && xa_to_value(entry) == attributes)
> +               last = mas->last;
> +
> +       if (start > 0) {
> +               mas_set(mas, start - 1);
> +               entry = mas_find(mas, start - 1);
> +               if (entry && xa_to_value(entry) == attributes)
> +                       start = mas->index;
> +       }
> +
> +       mas_set_range(mas, start, last);
> +       return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
> +}
> +
> +static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> +                                    size_t nr_pages, uint64_t attrs)
> +{
> +       struct address_space *mapping = inode->i_mapping;
> +       struct gmem_inode *gi = GMEM_I(inode);
> +       pgoff_t end = start + nr_pages;
> +       struct maple_tree *mt;
> +       struct ma_state mas;
> +       int r;
> +
> +       mt = &gi->attributes;
> +
> +       filemap_invalidate_lock(mapping);
> +
> +       mas_init(&mas, mt, start);
> +       r = kvm_gmem_mas_preallocate(&mas, attrs, start, nr_pages);
> +       if (r)
> +               goto out;
> +
> +       /*
> +        * From this point on guest_memfd has performed necessary
> +        * checks and can proceed to do guest-breaking changes.
> +        */
> +
> +       kvm_gmem_invalidate_begin(inode, start, end);
> +       mas_store_prealloc(&mas, xa_mk_value(attrs));
> +       kvm_gmem_invalidate_end(inode, start, end);
> +out:
> +       filemap_invalidate_unlock(mapping);
> +       return r;
> +}
> +
> +static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
> +{
> +       struct gmem_file *f = file->private_data;
> +       struct inode *inode = file_inode(file);
> +       struct kvm_memory_attributes2 attrs;
> +       size_t nr_pages;
> +       pgoff_t index;
> +       int i;
> +
> +       if (copy_from_user(&attrs, argp, sizeof(attrs)))
> +               return -EFAULT;
> +
> +       if (attrs.flags)
> +               return -EINVAL;
> +       for (i = 0; i < ARRAY_SIZE(attrs.reserved); i++) {
> +               if (attrs.reserved[i])
> +                       return -EINVAL;
> +       }
> +       if (attrs.attributes & ~kvm_supported_mem_attributes(f->kvm))
> +               return -EINVAL;
> +       if (attrs.size == 0 || attrs.offset + attrs.size < attrs.offset)
> +               return -EINVAL;
> +       if (!PAGE_ALIGNED(attrs.offset) || !PAGE_ALIGNED(attrs.size))
> +               return -EINVAL;
> +
> +       if (attrs.offset >= i_size_read(inode) ||
> +           attrs.offset + attrs.size > i_size_read(inode))
> +               return -EINVAL;
> +
> +       nr_pages = attrs.size >> PAGE_SHIFT;
> +       index = attrs.offset >> PAGE_SHIFT;
> +       return __kvm_gmem_set_attributes(inode, index, nr_pages,
> +                                        attrs.attributes);
> +}
> +
> +static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
> +                          unsigned long arg)
> +{
> +       switch (ioctl) {
> +       case KVM_SET_MEMORY_ATTRIBUTES2:
> +               if (vm_memory_attributes)
> +                       return -ENOTTY;
> +
> +               return kvm_gmem_set_attributes(file, (void __user *)arg);
> +       default:
> +               return -ENOTTY;
> +       }
> +}
> +
>  static struct file_operations kvm_gmem_fops = {
>         .mmap           = kvm_gmem_mmap,
>         .open           = generic_file_open,
>         .release        = kvm_gmem_release,
>         .fallocate      = kvm_gmem_fallocate,
> +       .unlocked_ioctl = kvm_gmem_ioctl,
>  };
>
>  static int kvm_gmem_migrate_folio(struct address_space *mapping,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ff20e63143642..4d7bf52b7b717 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -110,6 +110,18 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_TRAMP(__kvm_get_memory_attributes));
>  #endif
>
> +#define MEMORY_ATTRIBUTES_MATCH(one, two)                              \
> +       static_assert(offsetof(struct kvm_memory_attributes, one) ==    \
> +                     offsetof(struct kvm_memory_attributes2, two));    \
> +       static_assert(sizeof_field(struct kvm_memory_attributes, one) ==\
> +                     sizeof_field(struct kvm_memory_attributes2, two))
> +
> +/* Ensure the common parts of the two structs are identical. */
> +MEMORY_ATTRIBUTES_MATCH(address, address);
> +MEMORY_ATTRIBUTES_MATCH(size, size);
> +MEMORY_ATTRIBUTES_MATCH(attributes, attributes);
> +MEMORY_ATTRIBUTES_MATCH(flags, flags);
> +
>  /*
>   * Ordering of locks:
>   *
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH 00/12] misc/syncobj: add /dev/syncobj device
From: Christian König @ 2026-05-20 14:06 UTC (permalink / raw)
  To: Xaver Hugl
  Cc: Julian Orth, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, Sumit Semwal, Jonathan Corbet,
	Shuah Khan, Arnd Bergmann, Greg Kroah-Hartman, dri-devel,
	linux-kernel, linux-media, linaro-mm-sig, linux-doc,
	wayland-devel, Michel Dänzer
In-Reply-To: <CAFZQkGxpPm081Fz8UtDuBA1PKD42+9YDA+cc6fbSpfawXwu9+g@mail.gmail.com>

On 5/20/26 14:33, Xaver Hugl wrote:
> Am Mi., 20. Mai 2026 um 10:08 Uhr schrieb Christian König
> <christian.koenig@amd.com>:
>> Well I would say the other way around is a pretty common use case.
>>
>> In other words the compositors uses the internal GPU for composing and displaying the picture. And the client uses the external GPU for fast rendering.
> Sure, but that's not what I'm talking about.

Yeah sorry for that, I wasn't sure if I misunderstood your use case because it's usually the other way around.

>>> - the buffers from the client stay valid
>>
>> Buffers from the hot plugged GPU don't stay valid. Accessing CPU mappings either result in a SIGBUS or are redirected to a dummy page.
> Again, not what I wrote about. The buffers are on the integrated GPU.

General rule of thumb is that as long as the exporter stays around the buffers stay around as well.

>>> - the syncobj stays valid on the client side
>>> - the syncobj becomes invalid on the compositor side
>>
>> Nope that's not correct. The syncobj itself stays valid even if you completely hot plug the device.
>>
>> It can just be that the fences inside the syncobj are terminated with an error.
> What about eventfd created for a point on the syncobj?

The eventfd unfortunately doesn't has error handling as far as I know, so when a fence signals with an error condition then the eventfd you only sees that it is signaled.

> Another (future) problem with hotplugs will be if the sync file hasn't
> materialized for the timeline point when the device is hotunplugged,
> since there can't be an error on the fence if there isn't one. Or
> could userspace somehow set an 'artificial' fence with an error in
> that case?

In general the answer is yes, userspace needs to take care of inserting fences when wait before signal is used and the work can not be submitted to the HW for some reason.

Currently we only have an IOCTL to insert the signaled dummy fence at some timeline sequence, but it should be trivial as well to insert a signaled fence with an error code.

But the compositor needs to be able to handle that case anyway, because it can be that a malicious or just buggy client just never inserts the fence.

So that a device is hot plugged is not different to just a client not inserting the fence in the first place.

>>> "invalid" there means either
>>> - the acquire point of the client is marked as signaled, before
>>> rendering on the client side is completed
>>> - the acquire point of the client is never signaled. Since the
>>> compositor waits for the acquire point, the Wayland surface is stuck
>>> forever
>>
>> Both of those would be a *massive* violation of documented kernel rules for hot-plugging which could lead to random data corruption and/or deadlocks.
>>
>> If you see any HW driver showing behavior like that please open up a bug report and ping the relevant maintainers immediately.
> If there are no error codes with syncobj yet, then to userspace, the
> latter behavior is exactly what we get, isn't it?

No, from userspace side you just see a signaled fence. It's just that you need to export the timeline point of the syncobj to a syncfile and then you can call the QUERY IOCTL on the syncfile to see the error code.

>> When a hotplug happens all operations of the device should return an -ENODEV error, even when exposed to other devices/application through syncobj or syncfile.
> Okay, that at least gives us a way to fail imports somewhat
> gracefully. Normally, failing to import a syncobj is a fatal error in
> the Wayland protocol.

So the task at hand would be to avoid importing the syncobj into a driver. That should be relatively trivial.

The only real problem I see is if you want to create a syncobj without having any device whatsoever.

>> One problem is that only syncfile allows for querying such error codes at the moment, we have patches pending to add that to syncobj as well but we lack a compositor with support for that as userspace client.
> As long as the error case can be detected with an eventfd,

Yeah that's the problem. The eventfd only tells you if the operation is completed (or at least has materialized).

To query the error you would need to ask the underlying syncobj or syncfile directly.

> implementing that in KWin shouldn't be a challenge.
> 
>> Well the question here is if the device the compositor is using or the client is using is gone?
>>
>> If the client device is hot removed the compositor should be perfectly capable to import the syncobj.
>>
>> If the compositor device is gone then you don't have a device to display anything any more, so generating the next frame doesn't seem to make sense either.
>>
>> What could be is that you want the compositor to be kept alive even when the display device is gone to switch over to vkms or whatever so that a VNC session or other remote desktop still works.
> There are two GPUs in the example I gave. The compositor can use both
> for rendering (in cosmic-comp's case) or switch between them (what I'm
> trying to do with KWin), or use one device for rendering, and another
> for importing the syncobj.

Ah! I think I got the problem now. You basically want to avoid importing the syncobj because when the wrong device goes away you are busted.

The reason we didn't considered having the IOCTLs on the FD is because if you don't import them and instead keep them around you can run out file descriptors quite quickly.

When you have an use case where you receive an FD from the client and do a one shot conversion to an eventfd that will probably work, but for keeping them in the long run you need some kind of container for the syncobjs, don't you?

>>>>>>> 3. It removes the need to translate between syncobjs fds and handles.
>>>>>>
>>>>>> That's a pretty big no-go as well. The differentiation between FDs and handles is completely intentional.
>>>>> Could you expand on why it's needed? For compositors, the handle is
>>>>> just an intermediary thing when translating between file descriptors.
>>>>
>>>> Well what we could do is to add an IOCTL to directly attach an syncobj file descriptor to an eventfd.
>>> That would be nice.
>>
>> Take a look at drm_syncobj_file_fops and how drm_syncobj_add_eventfd() is used. Adding that functionality shouldn't be more than a typing exercise.
> Yeah, this patchset already adds that functionality (on the new device).
> 
>> Do I see it right that this would already solve most problems in the compositor side?
> Skipping the syncobj handle step would only reduce the amounts of
> ioctls the compositor does, but afaict it wouldn't solve any
> compositor problems. At least not as long as it's still tied to a drm
> device.

Yeah, you need something like a syncobj container or dummy DRM device.

> For device hotplugs, the only new thing we need for correctly handling
> syncobj is a way to receive errors on the eventfd.

I need to look into the eventfd code, could be that this is somehow possible but it's clearly not something I used before.

> A device-independent way to create and use syncobj would still be
> useful to us though, both to simplify the compositor and to improve
> the software rendering use cases.

Yeah not sure how to cleanly do that. We could have a dummy /dev/dri/rendersync or something like that, but that would be quite a hack.

At least I understand the requirement now.

Thanks,
Christian.

> 
> - Xaver


^ permalink raw reply

* Re: [PATCH v4 04/30] KVM: x86: Add KVM_[GS]ET_CLOCK_GUEST for accurate KVM clock migration
From: David Woodhouse @ 2026-05-20 14:08 UTC (permalink / raw)
  To: Dongli Zhang, kvm
  Cc: Paolo Bonzini, Jonathan Corbet, Shuah Khan, Thomas Gleixner,
	Sean Christopherson, Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	Dave Hansen, Vitaly Kuznetsov, x86, Marc Zyngier, Juergen Gross,
	Boris Ostrovsky, Paul Durrant, Jonathan Cameron, Sascha Bischoff,
	Jack Allister, Joey Gouly, joe.jin, linux-doc, linux-kernel,
	xen-devel, linux-kselftest
In-Reply-To: <08a64760-a431-4d0a-9480-562f8f38c908@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 844 bytes --]

On Tue, 2026-05-19 at 16:34 -0700, Dongli Zhang wrote:
> 
> I would really appreciate it if this document could be revived. I don't see it
> in your most recent v4 PATCH 7. It is very helpful as a guideline for how
> userspace VMMs should take advantage of these APIs.

In the kvmclock5 branch I'm revising for the next round.

https://git.infradead.org/?p=users/dwmw2/linux.git;a=commitdiff;h=ae872d9b75

Now has a selftest which serves as an example of the process that was
documented. And an extra fix right before it in the tree, to update the
master clock on vCPU creation because otherwise otherwise KVM_GET_CLOCK
wasn't returning the host_tsc.

I think I need to change that to only do so if ka->use_master_clock
isn't *already* set though, or we risk introducing discrepancies on
hotplug again? Will tweak that...



[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* [PATCH v3 0/3] mm/hmm: Add mmap lock-drop support for userfaultfd-backed mappings
From: Stanislav Kinsburskii @ 2026-05-20 14:09 UTC (permalink / raw)
  To: Liam.Howlett, akpm, akpm, david, jgg, corbet, leon, ljs, mhocko,
	rppt, shuah, skhan, surenb, vbabka, skinsburskii
  Cc: linux-doc, linux-kernel, linux-kernel, linux-kselftest, linux-mm

This series extends the HMM framework to support userfaultfd-backed memory
by allowing the mmap read lock to be dropped during hmm_range_fault().

Some page fault handlers — most notably userfaultfd — require the mmap lock
to be released so that userspace can resolve the fault. The current HMM
interface never sets FAULT_FLAG_ALLOW_RETRY, making it impossible to fault
in pages from userfaultfd-registered regions.

This series follows the established int *locked pattern from
get_user_pages_remote() in mm/gup.c. A new entry point,
hmm_range_fault_unlockable(), accepts an int *locked parameter. When the
mmap lock is dropped during fault resolution (VM_FAULT_RETRY or
VM_FAULT_COMPLETED), the function returns 0 with *locked = 0, signalling
the caller to restart its walk. The existing hmm_range_fault() is
refactored into a thin wrapper that passes NULL, preserving current
behavior for all existing callers.

Faulting hugetlb pages on the unlockable path is not supported because
walk_hugetlb_range() unconditionally holds and releases
hugetlb_vma_lock_read across the callback; if the mmap lock is dropped
inside the callback, the VMA may be freed before the walk framework's
unlock. Hugetlb pages already present in page tables are handled normally.
Possible approaches to lift this limitation are documented in
Documentation/mm/hmm.rst.

Changes in v3:
 - Return -EFAULT from dmirror_fault_unlockable() when the mirrored mm can no longer be pinned.
 - Add an eventfd stop signal for the userfaultfd handler thread to avoid waiting for the poll timeout on successful test completion.


Changes in v2:

 - Split into a preparatory refactor (new patch 1) that moves
   handle_mm_fault() out of the walk callbacks, plus a smaller feature
   patch on top.  Suggested by David Hildenbrand.
 - Hugetlb regions are now supported on the unlockable path; the v1
   -EFAULT short-circuit and the hugetlb_vma_lock_read drop/retake
   dance are gone.
 - Distinct internal sentinels for "needs fault" (HMM_FAULT_PENDING)
   and "lock dropped" (HMM_FAULT_UNLOCKED).
 - Outer loop now re-walks after a successful internal fault so the
   faulted pfns end up in range->hmm_pfns.
 - Kernel-doc on hmm_range_fault_unlockable() and the
   Documentation/mm/hmm.rst example match the implementation.
 - Dropped the mshv driver conversion (v1 patch 2); will post
   separately.
 - Selftest converted to drive the path through test_hmm with a
   userfaultfd handler (new HMM_DMIRROR_READ_UNLOCKABLE ioctl).

---

Stanislav Kinsburskii (3):
      mm/hmm: move page fault handling out of walk callbacks
      mm/hmm: add hmm_range_fault_unlockable() for mmap lock-drop support
      selftests/mm: add userfaultfd test for HMM unlockable path


 Documentation/mm/hmm.rst               |   62 +++++++++++
 include/linux/hmm.h                    |    1 
 lib/test_hmm.c                         |  122 +++++++++++++++++++++
 lib/test_hmm_uapi.h                    |    1 
 mm/hmm.c                               |  187 ++++++++++++++++++++++++--------
 tools/testing/selftests/mm/hmm-tests.c |  149 +++++++++++++++++++++++++
 6 files changed, 478 insertions(+), 44 deletions(-)


^ permalink raw reply

* [PATCH v3 1/3] mm/hmm: move page fault handling out of walk callbacks
From: Stanislav Kinsburskii @ 2026-05-20 14:09 UTC (permalink / raw)
  To: Liam.Howlett, akpm, akpm, david, jgg, corbet, leon, ljs, mhocko,
	rppt, shuah, skhan, surenb, vbabka, skinsburskii
  Cc: linux-doc, linux-kernel, linux-kernel, linux-kselftest, linux-mm
In-Reply-To: <177928604779.589431.14703161356676674288.stgit@skinsburskii>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

hmm_range_fault() currently triggers page faults from inside the page-table
walk callbacks: hmm_vma_walk_pmd(), hmm_vma_walk_pud(),
hmm_vma_walk_hugetlb_entry() and the pte-level helper all call
hmm_vma_fault(), which in turn calls handle_mm_fault() while the walker
still holds nested locks.  The pte spinlock is dropped explicitly by each
caller, and the hugetlb path manually drops and retakes
hugetlb_vma_lock_read around the fault to dodge a deadlock against the walk
framework's unconditional unlock.

This layering does not extend cleanly to fault handlers that may release
mmap_lock (VM_FAULT_RETRY, VM_FAULT_COMPLETED). If the lock is dropped
while walk_page_range() is mid-traversal, the VMA can be freed before the
walk framework's matching hugetlb_vma_unlock_read(), turning that unlock
into a use-after-free.

Split the responsibilities the way get_user_pages() does. Walk callbacks
become inspect-only: when they detect a range that needs to be faulted in,
they record it in struct hmm_vma_walk and return a private sentinel
(HMM_FAULT_PENDING). The outer loop in hmm_range_fault() then drops out of
walk_page_range(), invokes a new helper hmm_do_fault() that calls
handle_mm_fault() with only mmap_lock held, and restarts the walk so the
now-present entries are collected into hmm_pfns.

No functional change for existing callers. As a side effect the hugetlb
callback no longer needs the hugetlb_vma_{un}lock_read dance, and every
fault-path exit from the callbacks now releases the pte spinlock on a
single, common path. This refactor is also a precursor for adding an
unlockable variant of hmm_range_fault() in a follow-up patch.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 mm/hmm.c |  118 +++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 75 insertions(+), 43 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index c72c9ddfdb2f..446dd0c39b3a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -33,8 +33,17 @@
 struct hmm_vma_walk {
 	struct hmm_range	*range;
 	unsigned long		last;
+	unsigned long		end;
+	unsigned int		required_fault;
 };
 
+/*
+ * Internal sentinel returned by walk callbacks when they need a page fault.
+ * The callback stores end/required_fault in hmm_vma_walk; the outer loop
+ * consumes the sentinel and never propagates it to the caller.
+ */
+#define HMM_FAULT_PENDING	-EAGAIN
+
 enum {
 	HMM_NEED_FAULT = 1 << 0,
 	HMM_NEED_WRITE_FAULT = 1 << 1,
@@ -60,37 +69,25 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end,
 }
 
 /*
- * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
- * @addr: range virtual start address (inclusive)
- * @end: range virtual end address (exclusive)
- * @required_fault: HMM_NEED_* flags
- * @walk: mm_walk structure
- * Return: -EBUSY after page fault, or page fault error
+ * hmm_record_fault() - record a range that needs to be faulted in
  *
- * This function will be called whenever pmd_none() or pte_none() returns true,
- * or whenever there is no page directory covering the virtual address range.
+ * Called by the walk callbacks when they discover that part of the range
+ * needs a page fault.  The callback records what to fault and returns
+ * HMM_FAULT_PENDING; the outer loop in hmm_range_fault() drops back out of
+ * walk_page_range() and invokes handle_mm_fault() from a context where no
+ * page-table or hugetlb_vma_lock is held.
  */
-static int hmm_vma_fault(unsigned long addr, unsigned long end,
-			 unsigned int required_fault, struct mm_walk *walk)
+static int hmm_record_fault(unsigned long addr, unsigned long end,
+			    unsigned int required_fault,
+			    struct mm_walk *walk)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
-	struct vm_area_struct *vma = walk->vma;
-	unsigned int fault_flags = FAULT_FLAG_REMOTE;
 
 	WARN_ON_ONCE(!required_fault);
 	hmm_vma_walk->last = addr;
-
-	if (required_fault & HMM_NEED_WRITE_FAULT) {
-		if (!(vma->vm_flags & VM_WRITE))
-			return -EPERM;
-		fault_flags |= FAULT_FLAG_WRITE;
-	}
-
-	for (; addr < end; addr += PAGE_SIZE)
-		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
-		    VM_FAULT_ERROR)
-			return -EFAULT;
-	return -EBUSY;
+	hmm_vma_walk->end = end;
+	hmm_vma_walk->required_fault = required_fault;
+	return HMM_FAULT_PENDING;
 }
 
 static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -174,7 +171,7 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
 		return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
 	}
 	if (required_fault)
-		return hmm_vma_fault(addr, end, required_fault, walk);
+		return hmm_record_fault(addr, end, required_fault, walk);
 	return hmm_pfns_fill(addr, end, range, 0);
 }
 
@@ -209,7 +206,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 	required_fault =
 		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
 	if (required_fault)
-		return hmm_vma_fault(addr, end, required_fault, walk);
+		return hmm_record_fault(addr, end, required_fault, walk);
 
 	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
@@ -328,7 +325,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 fault:
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
-	return hmm_vma_fault(addr, end, required_fault, walk);
+	return hmm_record_fault(addr, end, required_fault, walk);
 }
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -371,7 +368,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
 					      npages, 0);
 	if (required_fault) {
 		if (softleaf_is_device_private(entry))
-			return hmm_vma_fault(addr, end, required_fault, walk);
+			return hmm_record_fault(addr, end, required_fault, walk);
 		else
 			return -EFAULT;
 	}
@@ -517,7 +514,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 						      npages, cpu_flags);
 		if (required_fault) {
 			spin_unlock(ptl);
-			return hmm_vma_fault(addr, end, required_fault, walk);
+			return hmm_record_fault(addr, end, required_fault, walk);
 		}
 
 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -564,21 +561,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	required_fault =
 		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
 	if (required_fault) {
-		int ret;
-
 		spin_unlock(ptl);
-		hugetlb_vma_unlock_read(vma);
-		/*
-		 * Avoid deadlock: drop the vma lock before calling
-		 * hmm_vma_fault(), which will itself potentially take and
-		 * drop the vma lock. This is also correct from a
-		 * protection point of view, because there is no further
-		 * use here of either pte or ptl after dropping the vma
-		 * lock.
-		 */
-		ret = hmm_vma_fault(addr, end, required_fault, walk);
-		hugetlb_vma_lock_read(vma);
-		return ret;
+		return hmm_record_fault(addr, end, required_fault, walk);
 	}
 
 	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
@@ -637,6 +621,44 @@ static const struct mm_walk_ops hmm_walk_ops = {
 	.walk_lock	= PGWALK_RDLOCK,
 };
 
+/*
+ * hmm_do_fault - fault in a range recorded by a walk callback
+ *
+ * Called from the outer loop in hmm_range_fault() after a callback
+ * returned HMM_FAULT_PENDING.  At this point we hold only mmap_lock;
+ * the page-table spinlock and any hugetlb_vma_lock acquired by the walk
+ * framework have already been released by the unwind.
+ *
+ * Returns -EBUSY on success (all pages faulted, caller should re-walk).
+ * Returns a negative errno on failure.
+ */
+static int hmm_do_fault(struct mm_struct *mm,
+			struct hmm_vma_walk *hmm_vma_walk)
+{
+	unsigned long addr = hmm_vma_walk->last;
+	unsigned long end = hmm_vma_walk->end;
+	unsigned int required_fault = hmm_vma_walk->required_fault;
+	unsigned int fault_flags = FAULT_FLAG_REMOTE;
+	struct vm_area_struct *vma;
+
+	vma = vma_lookup(mm, addr);
+	if (!vma)
+		return -EFAULT;
+
+	if (required_fault & HMM_NEED_WRITE_FAULT) {
+		if (!(vma->vm_flags & VM_WRITE))
+			return -EPERM;
+		fault_flags |= FAULT_FLAG_WRITE;
+	}
+
+	for (; addr < end; addr += PAGE_SIZE)
+		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+		    VM_FAULT_ERROR)
+			return -EFAULT;
+
+	return -EBUSY;
+}
+
 /**
  * hmm_range_fault - try to fault some address in a virtual address range
  * @range:	argument structure
@@ -674,6 +696,16 @@ int hmm_range_fault(struct hmm_range *range)
 			return -EBUSY;
 		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
 				      &hmm_walk_ops, &hmm_vma_walk);
+		/*
+		 * When HMM_FAULT_PENDING is returned a walk callback
+		 * recorded a range that needs handle_mm_fault();
+		 * hmm_do_fault() runs the fault outside walk_page_range()
+		 * (so no page-table or hugetlb_vma_lock is held) and
+		 * returns -EBUSY so the loop re-walks and picks up the
+		 * now-present entries.
+		 */
+		if (ret == HMM_FAULT_PENDING)
+			ret = hmm_do_fault(mm, &hmm_vma_walk);
 		/*
 		 * When -EBUSY is returned the loop restarts with
 		 * hmm_vma_walk.last set to an address that has not been stored



^ permalink raw reply related

* [PATCH v3 2/3] mm/hmm: add hmm_range_fault_unlockable() for mmap lock-drop support
From: Stanislav Kinsburskii @ 2026-05-20 14:09 UTC (permalink / raw)
  To: Liam.Howlett, akpm, akpm, david, jgg, corbet, leon, ljs, mhocko,
	rppt, shuah, skhan, surenb, vbabka, skinsburskii
  Cc: linux-doc, linux-kernel, linux-kernel, linux-kselftest, linux-mm
In-Reply-To: <177928604779.589431.14703161356676674288.stgit@skinsburskii>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

hmm_range_fault() holds the mmap read lock for the duration of the call.
This is incompatible with mappings whose fault handler may release the mmap
lock - notably userfaultfd-managed regions, where handle_mm_fault() returns
VM_FAULT_RETRY or VM_FAULT_COMPLETED after dropping the lock. Drivers that
need to populate device page tables for such mappings have no way to do so
today.

Add hmm_range_fault_unlockable(), modelled on the int *locked pattern from
get_user_pages_remote() in mm/gup.c.  Callers set *locked = 1 and pass
&locked; the function may set *locked = 0 to report that handle_mm_fault()
dropped the mmap lock during a page fault, in which case the caller must
reacquire it and restart the walk with a fresh mmu_interval_read_begin()
sequence.

The implementation is local to hmm_do_fault() and the outer loop in
hmm_range_fault_unlockable(). hmm_do_fault() conditionally sets
FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE when locked is non-NULL and
translates VM_FAULT_RETRY / VM_FAULT_COMPLETED into *locked = 0 plus a
private return code consumed by the outer loop, which in turn returns 0 (or
-EINTR on fatal signal) to the caller.

The previous refactor that moved page fault handling out of the page-table
walk callbacks is what makes this change small. Faults now run after
walk_page_range() has unwound, with only the mmap lock held, so dropping it
does not interact with the walker's pte spinlock or hugetlb_vma_lock.
Hugetlb regions therefore participate in the unlockable path uniformly with
PTE- and PMD-level mappings; no special case is required.

hmm_range_fault() becomes a thin wrapper, preserving exact behaviour for
all existing callers. No EXPORT_SYMBOL behaviour change for
hmm_range_fault.

Documentation/mm/hmm.rst is updated with a description of the new API and
the recommended caller pattern.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 Documentation/mm/hmm.rst |   62 +++++++++++++++++++++++++++++++++++++
 include/linux/hmm.h      |    1 +
 mm/hmm.c                 |   77 +++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 135 insertions(+), 5 deletions(-)

diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst
index 7d61b7a8b65b..a9309023ec23 100644
--- a/Documentation/mm/hmm.rst
+++ b/Documentation/mm/hmm.rst
@@ -208,6 +208,68 @@ invalidate() callback. That lock must be held before calling
 mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
 update.
 
+Dropping the mmap lock during page faults
+=========================================
+
+Some VMAs have fault handlers that need to release the mmap lock while
+servicing a fault (for example, regions managed by ``userfaultfd``).
+``hmm_range_fault()`` cannot be used on such mappings because it must hold the
+mmap lock for the duration of the call. Drivers that need to support them
+should call::
+
+  int hmm_range_fault_unlockable(struct hmm_range *range, int *locked);
+
+The caller sets ``*locked = 1`` and holds ``mmap_read_lock`` before the call.
+If the mmap lock is dropped inside ``handle_mm_fault()``, the function sets
+``*locked = 0`` and returns ``0``; the caller is responsible for reacquiring
+the lock and restarting the walk from ``range->start`` with a fresh notifier
+sequence. When ``locked`` is ``NULL`` the function keeps the lock held for the
+duration of the call, identical to ``hmm_range_fault()``.
+
+A typical caller looks like this::
+
+ int driver_populate_range_unlockable(...)
+ {
+      struct hmm_range range;
+      int locked;
+      ...
+
+      range.notifier = &interval_sub;
+      range.start = ...;
+      range.end = ...;
+      range.hmm_pfns = ...;
+
+      if (!mmget_not_zero(interval_sub.mm))
+          return -EFAULT;
+
+ again:
+      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+      locked = 1;
+      mmap_read_lock(mm);
+      ret = hmm_range_fault_unlockable(&range, &locked);
+      if (locked)
+          mmap_read_unlock(mm);
+      if (ret) {
+          if (ret == -EBUSY)
+              goto again;
+          return ret;
+      }
+      if (!locked)
+          goto again;
+
+      take_lock(driver->update);
+      if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      /* Use pfns array content to update device page table,
+       * under the update lock */
+
+      release_lock(driver->update);
+      return 0;
+ }
+
 Leverage default_flags and pfn_flags_mask
 =========================================
 
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index db75ffc949a7..46e581865c48 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -123,6 +123,7 @@ struct hmm_range {
  * Please see Documentation/mm/hmm.rst for how to use the range API.
  */
 int hmm_range_fault(struct hmm_range *range);
+int hmm_range_fault_unlockable(struct hmm_range *range, int *locked);
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 446dd0c39b3a..b9ced5003e16 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -32,6 +32,7 @@
 
 struct hmm_vma_walk {
 	struct hmm_range	*range;
+	int			*locked;
 	unsigned long		last;
 	unsigned long		end;
 	unsigned int		required_fault;
@@ -44,6 +45,13 @@ struct hmm_vma_walk {
  */
 #define HMM_FAULT_PENDING	-EAGAIN
 
+/*
+ * Internal sentinel returned by hmm_do_fault() when handle_mm_fault() drops
+ * the mmap lock during a page fault. hmm_do_fault() sets *locked = 0; the
+ * outer loop consumes the sentinel and never propagates it to the caller.
+ */
+#define HMM_FAULT_UNLOCKED	-ENOLCK
+
 enum {
 	HMM_NEED_FAULT = 1 << 0,
 	HMM_NEED_WRITE_FAULT = 1 << 1,
@@ -639,6 +647,7 @@ static int hmm_do_fault(struct mm_struct *mm,
 	unsigned long end = hmm_vma_walk->end;
 	unsigned int required_fault = hmm_vma_walk->required_fault;
 	unsigned int fault_flags = FAULT_FLAG_REMOTE;
+	int *locked = hmm_vma_walk->locked;
 	struct vm_area_struct *vma;
 
 	vma = vma_lookup(mm, addr);
@@ -651,10 +660,20 @@ static int hmm_do_fault(struct mm_struct *mm,
 		fault_flags |= FAULT_FLAG_WRITE;
 	}
 
-	for (; addr < end; addr += PAGE_SIZE)
-		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
-		    VM_FAULT_ERROR)
+	if (locked)
+		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		vm_fault_t ret;
+
+		ret = handle_mm_fault(vma, addr, fault_flags, NULL);
+		if (ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) {
+			*locked = 0;
+			return HMM_FAULT_UNLOCKED;
+		}
+		if (ret & VM_FAULT_ERROR)
 			return -EFAULT;
+	}
 
 	return -EBUSY;
 }
@@ -677,11 +696,53 @@ static int hmm_do_fault(struct mm_struct *mm,
  *
  * This is similar to get_user_pages(), except that it can read the page tables
  * without mutating them (ie causing faults).
+ *
+ * The mmap lock must be held by the caller and will remain held on return.
+ * For a variant that allows the mmap lock to be dropped during faults (e.g.,
+ * for userfaultfd support), see hmm_range_fault_unlockable().
  */
 int hmm_range_fault(struct hmm_range *range)
+{
+	return hmm_range_fault_unlockable(range, NULL);
+}
+EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_range_fault_unlockable - fault in a range, possibly dropping the mmap lock
+ * @range:     argument structure
+ * @locked:    pointer to caller's lock state, or %NULL
+ *
+ * Behaves like hmm_range_fault(), but allows handle_mm_fault() to drop the
+ * mmap read lock during a fault.  This makes the function usable on mappings
+ * whose fault path may release the lock (for example, userfaultfd-managed
+ * regions).
+ *
+ * If @locked is %NULL the mmap lock is never released and the function
+ * behaves exactly like hmm_range_fault().
+ *
+ * If @locked is non-%NULL the caller must hold mmap_read_lock and set
+ * *@locked = 1 before the call.  On return:
+ *
+ *   *@locked == 1: the mmap lock is still held.  The return value has the
+ *                  same meaning as hmm_range_fault() (0 on success, or one
+ *                  of the error codes documented there).
+ *
+ *   *@locked == 0: the mmap lock was dropped during a page fault.  No PFNs
+ *                  collected so far are guaranteed to be valid because the
+ *                  address space may have changed under us.  The return
+ *                  value is either 0 (caller must reacquire the lock and
+ *                  restart with a fresh mmu_interval_read_begin()) or
+ *                  -EINTR (a fatal signal is pending; abort).
+ *
+ * The caller is responsible for reacquiring mmap_read_lock and restarting
+ * the operation from range->start.  See Documentation/mm/hmm.rst for the
+ * full usage pattern.
+ */
+int hmm_range_fault_unlockable(struct hmm_range *range, int *locked)
 {
 	struct hmm_vma_walk hmm_vma_walk = {
 		.range = range,
+		.locked = locked,
 		.last = range->start,
 	};
 	struct mm_struct *mm = range->notifier->mm;
@@ -704,8 +765,14 @@ int hmm_range_fault(struct hmm_range *range)
 		 * returns -EBUSY so the loop re-walks and picks up the
 		 * now-present entries.
 		 */
-		if (ret == HMM_FAULT_PENDING)
+		if (ret == HMM_FAULT_PENDING) {
 			ret = hmm_do_fault(mm, &hmm_vma_walk);
+			if (ret == HMM_FAULT_UNLOCKED) {
+				if (fatal_signal_pending(current))
+					return -EINTR;
+				return 0;     /* caller must restart */
+			}
+		}
 		/*
 		 * When -EBUSY is returned the loop restarts with
 		 * hmm_vma_walk.last set to an address that has not been stored
@@ -715,7 +782,7 @@ int hmm_range_fault(struct hmm_range *range)
 	} while (ret == -EBUSY);
 	return ret;
 }
-EXPORT_SYMBOL(hmm_range_fault);
+EXPORT_SYMBOL(hmm_range_fault_unlockable);
 
 /**
  * hmm_dma_map_alloc - Allocate HMM map structure



^ permalink raw reply related

* [PATCH v3 3/3] selftests/mm: add userfaultfd test for HMM unlockable path
From: Stanislav Kinsburskii @ 2026-05-20 14:09 UTC (permalink / raw)
  To: Liam.Howlett, akpm, akpm, david, jgg, corbet, leon, ljs, mhocko,
	rppt, shuah, skhan, surenb, vbabka, skinsburskii
  Cc: linux-doc, linux-kernel, linux-kernel, linux-kselftest, linux-mm
In-Reply-To: <177928604779.589431.14703161356676674288.stgit@skinsburskii>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

Add a selftest that exercises hmm_range_fault_unlockable() with a
userfaultfd-backed mapping. The test:

1. Creates an anonymous mmap region
2. Registers it with userfaultfd (UFFDIO_REGISTER_MODE_MISSING)
3. Spawns a handler thread that responds to page faults by filling
   pages with a known pattern (0xAB) via UFFDIO_COPY
4. Issues HMM_DMIRROR_READ_UNLOCKABLE to the test_hmm driver, which
   calls hmm_range_fault_unlockable() internally
5. Verifies the device read back the data provided by the userfaultfd
   handler

This requires changes to the test_hmm kernel module:
- New dmirror_range_fault_unlockable() that uses the new HMM API
- New dmirror_fault_unlockable() and dmirror_read_unlockable() wrappers
- New HMM_DMIRROR_READ_UNLOCKABLE ioctl (0x09)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/test_hmm.c                         |  122 ++++++++++++++++++++++++++
 lib/test_hmm_uapi.h                    |    1 
 tools/testing/selftests/mm/hmm-tests.c |  149 ++++++++++++++++++++++++++++++++
 3 files changed, 272 insertions(+)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 38996c4baa40..7cc7320c9494 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -389,6 +389,84 @@ static int dmirror_range_fault(struct dmirror *dmirror,
 	return ret;
 }
 
+static int dmirror_range_fault_unlockable(struct dmirror *dmirror,
+					  struct hmm_range *range)
+{
+	struct mm_struct *mm = dmirror->notifier.mm;
+	unsigned long timeout =
+		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+	int locked;
+	int ret;
+
+	while (true) {
+		if (time_after(jiffies, timeout)) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		range->notifier_seq = mmu_interval_read_begin(range->notifier);
+		locked = 1;
+		mmap_read_lock(mm);
+		ret = hmm_range_fault_unlockable(range, &locked);
+		if (locked)
+			mmap_read_unlock(mm);
+		if (ret) {
+			if (ret == -EBUSY)
+				continue;
+			goto out;
+		}
+		if (!locked)
+			continue;
+
+		mutex_lock(&dmirror->mutex);
+		if (mmu_interval_read_retry(range->notifier,
+					    range->notifier_seq)) {
+			mutex_unlock(&dmirror->mutex);
+			continue;
+		}
+		break;
+	}
+
+	ret = dmirror_do_fault(dmirror, range);
+
+	mutex_unlock(&dmirror->mutex);
+out:
+	return ret;
+}
+
+static int dmirror_fault_unlockable(struct dmirror *dmirror,
+				    unsigned long start,
+				    unsigned long end, bool write)
+{
+	struct mm_struct *mm = dmirror->notifier.mm;
+	unsigned long addr;
+	unsigned long pfns[32];
+	struct hmm_range range = {
+		.notifier = &dmirror->notifier,
+		.hmm_pfns = pfns,
+		.pfn_flags_mask = 0,
+		.default_flags =
+			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
+		.dev_private_owner = dmirror->mdevice,
+	};
+	int ret = 0;
+
+	if (!mmget_not_zero(mm))
+		return -EFAULT;
+
+	for (addr = start; addr < end; addr = range.end) {
+		range.start = addr;
+		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
+
+		ret = dmirror_range_fault_unlockable(dmirror, &range);
+		if (ret)
+			break;
+	}
+
+	mmput(mm);
+	return ret;
+}
+
 static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
 			 unsigned long end, bool write)
 {
@@ -488,6 +566,47 @@ static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
 	return ret;
 }
 
+static int dmirror_read_unlockable(struct dmirror *dmirror,
+				   struct hmm_dmirror_cmd *cmd)
+{
+	struct dmirror_bounce bounce;
+	unsigned long start, end;
+	unsigned long size = cmd->npages << PAGE_SHIFT;
+	int ret;
+
+	start = cmd->addr;
+	end = start + size;
+	if (end < start)
+		return -EINVAL;
+
+	ret = dmirror_bounce_init(&bounce, start, size);
+	if (ret)
+		return ret;
+
+	while (1) {
+		mutex_lock(&dmirror->mutex);
+		ret = dmirror_do_read(dmirror, start, end, &bounce);
+		mutex_unlock(&dmirror->mutex);
+		if (ret != -ENOENT)
+			break;
+
+		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
+		ret = dmirror_fault_unlockable(dmirror, start, end, false);
+		if (ret)
+			break;
+		cmd->faults++;
+	}
+
+	if (ret == 0) {
+		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
+				 bounce.size))
+			ret = -EFAULT;
+	}
+	cmd->cpages = bounce.cpages;
+	dmirror_bounce_fini(&bounce);
+	return ret;
+}
+
 static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
 			    unsigned long end, struct dmirror_bounce *bounce)
 {
@@ -1549,6 +1668,9 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
 		dmirror->flags = cmd.npages;
 		ret = 0;
 		break;
+	case HMM_DMIRROR_READ_UNLOCKABLE:
+		ret = dmirror_read_unlockable(dmirror, &cmd);
+		break;
 
 	default:
 		return -EINVAL;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f94c6d457338..076df6df9227 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -38,6 +38,7 @@ struct hmm_dmirror_cmd {
 #define HMM_DMIRROR_CHECK_EXCLUSIVE	_IOWR('H', 0x06, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_RELEASE		_IOWR('H', 0x07, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_FLAGS		_IOWR('H', 0x08, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_READ_UNLOCKABLE	_IOWR('H', 0x09, struct hmm_dmirror_cmd)
 
 #define HMM_DMIRROR_FLAG_FAIL_ALLOC	(1ULL << 0)
 
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
index e1c8a679a4cf..cc5525be74b0 100644
--- a/tools/testing/selftests/mm/hmm-tests.c
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -27,6 +27,10 @@
 #include <sys/mman.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <sys/syscall.h>
+#include <sys/eventfd.h>
+#include <linux/userfaultfd.h>
+#include <poll.h>
 
 /*
  * This is a private UAPI to the kernel test module so it isn't exported
@@ -2853,4 +2857,149 @@ TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120)
 					&thp_results, &regular_results);
 	}
 }
+/*
+ * Test that HMM can fault in pages backed by userfaultfd using the
+ * hmm_range_fault_unlockable() path. This exercises the lock-drop retry
+ * logic in the HMM framework.
+ */
+struct uffd_thread_args {
+	int uffd;
+	int stop_fd;
+	void *page_buffer;
+	unsigned long page_size;
+};
+
+static void *uffd_handler_thread(void *arg)
+{
+	struct uffd_thread_args *args = arg;
+	struct uffd_msg msg;
+	struct uffdio_copy copy;
+	struct pollfd pollfd[2];
+	int ret;
+
+	pollfd[0].fd = args->uffd;
+	pollfd[0].events = POLLIN;
+	pollfd[1].fd = args->stop_fd;
+	pollfd[1].events = POLLIN;
+
+	while (1) {
+		ret = poll(pollfd, 2, -1);
+		if (ret <= 0)
+			break;
+		if (pollfd[1].revents)
+			break;
+		if (!(pollfd[0].revents & POLLIN))
+			break;
+
+		ret = read(args->uffd, &msg, sizeof(msg));
+		if (ret != sizeof(msg))
+			break;
+
+		if (msg.event != UFFD_EVENT_PAGEFAULT)
+			break;
+
+		/* Fill the page with a known pattern */
+		memset(args->page_buffer, 0xAB, args->page_size);
+
+		copy.dst = msg.arg.pagefault.address & ~(args->page_size - 1);
+		copy.src = (unsigned long)args->page_buffer;
+		copy.len = args->page_size;
+		copy.mode = 0;
+		copy.copy = 0;
+
+		ret = ioctl(args->uffd, UFFDIO_COPY, &copy);
+		if (ret < 0)
+			break;
+	}
+
+	return NULL;
+}
+
+TEST_F(hmm, userfaultfd_read)
+{
+	struct hmm_buffer *buffer;
+	struct uffd_thread_args uffd_args;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	unsigned char *ptr;
+	pthread_t thread;
+	int uffd;
+	int stop_fd;
+	int ret;
+	struct uffdio_api api;
+	struct uffdio_register reg;
+	uint64_t stop = 1;
+	ssize_t nwrite;
+
+	npages = 4;
+	size = npages << self->page_shift;
+
+	/* Create userfaultfd */
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0)
+		SKIP(return, "userfaultfd not available");
+
+	api.api = UFFD_API;
+	api.features = 0;
+	ret = ioctl(uffd, UFFDIO_API, &api);
+	ASSERT_EQ(ret, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Create anonymous mapping */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   -1, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Register the region with userfaultfd */
+	reg.range.start = (unsigned long)buffer->ptr;
+	reg.range.len = size;
+	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
+	ret = ioctl(uffd, UFFDIO_REGISTER, &reg);
+	ASSERT_EQ(ret, 0);
+
+	/* Set up the handler thread */
+	uffd_args.uffd = uffd;
+	stop_fd = eventfd(0, EFD_CLOEXEC);
+	ASSERT_GE(stop_fd, 0);
+	uffd_args.stop_fd = stop_fd;
+	uffd_args.page_buffer = malloc(self->page_size);
+	ASSERT_NE(uffd_args.page_buffer, NULL);
+	uffd_args.page_size = self->page_size;
+
+	ret = pthread_create(&thread, NULL, uffd_handler_thread, &uffd_args);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Use the unlockable read path which allows the mmap lock to be
+	 * dropped during the fault, enabling userfaultfd resolution.
+	 */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ_UNLOCKABLE,
+			      buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Verify the device read the data filled by the uffd handler */
+	ptr = buffer->mirror;
+	for (i = 0; i < size; ++i)
+		ASSERT_EQ(ptr[i], (unsigned char)0xAB);
+
+	nwrite = write(stop_fd, &stop, sizeof(stop));
+	ASSERT_EQ(nwrite, sizeof(stop));
+	pthread_join(thread, NULL);
+	close(stop_fd);
+	free(uffd_args.page_buffer);
+	close(uffd);
+	hmm_buffer_free(buffer);
+}
+
 TEST_HARNESS_MAIN



^ permalink raw reply related

* Re: [PATCH v2 0/3] mm/hmm: Add mmap lock-drop support for userfaultfd-backed mappings
From: Stanislav Kinsburskii @ 2026-05-20 14:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Stanislav Kinsburskii, kys, Liam.Howlett, david, jgg, corbet,
	leon, ljs, mhocko, rppt, shuah, skhan, surenb, vbabka, linux-doc,
	linux-kernel, linux-kselftest, linux-mm
In-Reply-To: <20260518104808.ba773348f8564303c4330a33@linux-foundation.org>

On Mon, May 18, 2026 at 10:48:08AM -0700, Andrew Morton wrote:
> On Wed, 13 May 2026 02:40:11 +0000 Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> wrote:
> 
> > This series extends the HMM framework to support userfaultfd-backed memory
> > by allowing the mmap read lock to be dropped during hmm_range_fault().
> > 
> > Some page fault handlers — most notably userfaultfd — require the mmap lock
> > to be released so that userspace can resolve the fault. The current HMM
> > interface never sets FAULT_FLAG_ALLOW_RETRY, making it impossible to fault
> > in pages from userfaultfd-registered regions.
> > 
> > This series follows the established int *locked pattern from
> > get_user_pages_remote() in mm/gup.c. A new entry point,
> > hmm_range_fault_unlockable(), accepts an int *locked parameter. When the
> > mmap lock is dropped during fault resolution (VM_FAULT_RETRY or
> > VM_FAULT_COMPLETED), the function returns 0 with *locked = 0, signalling
> > the caller to restart its walk. The existing hmm_range_fault() is
> > refactored into a thin wrapper that passes NULL, preserving current
> > behavior for all existing callers.
> > 
> > Faulting hugetlb pages on the unlockable path is not supported because
> > walk_hugetlb_range() unconditionally holds and releases
> > hugetlb_vma_lock_read across the callback; if the mmap lock is dropped
> > inside the callback, the VMA may be freed before the walk framework's
> > unlock. Hugetlb pages already present in page tables are handled normally.
> > Possible approaches to lift this limitation are documented in
> > Documentation/mm/hmm.rst.
> 
> Thanks.  AI review asked some questions:
> 	https://sashiko.dev/#/patchset/177863991557.82528.15288076059759579141.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net
> 
> I'd ignore the fist one: don't write buggy fault handlers!
> 
> 

Thank you for the review.
I addressed the issues found in the new self test in v3 of this series.
Please, take a look.

Thanks,
Stanislav


^ permalink raw reply

* Re: [PATCH 02/15] accel/qda: Add QDA driver documentation
From: Dmitry Baryshkov @ 2026-05-20 14:12 UTC (permalink / raw)
  To: ekansh.gupta
  Cc: Oded Gabbay, Jonathan Corbet, Shuah Khan, Joerg Roedel,
	Will Deacon, Robin Murphy, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, Sumit Semwal,
	Christian König, Bharath Kumar, Chenna Kesava Raju, srini,
	andersson, konradybcio, robin.clark, linux-kernel, dri-devel,
	linux-doc, linux-arm-msm, iommu, linux-media, linaro-mm-sig
In-Reply-To: <20260519-qda-series-v1-2-b2d984c297f8@oss.qualcomm.com>

On Tue, May 19, 2026 at 11:45:52AM +0530, Ekansh Gupta via B4 Relay wrote:
> From: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> 
> Add documentation for the Qualcomm DSP Accelerator (QDA) driver under
> Documentation/accel/qda/. The documentation covers the driver
> architecture, GEM-based buffer management, IOMMU context bank
> isolation, and the RPMsg transport layer.
> 
> The user-space API section describes the DRM IOCTLs for session
> management, GEM buffer allocation, and remote procedure invocation via
> the FastRPC protocol, along with a typical application lifecycle
> example. Sections for dynamic debug and basic testing are also
> included.
> 
> Wire the new documentation into the Compute Accelerators index at
> Documentation/accel/index.rst.
> 
> Assisted-by: Claude:claude-4-6-sonnet
> Signed-off-by: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> ---
>  Documentation/accel/index.rst     |   1 +
>  Documentation/accel/qda/index.rst |  13 ++++
>  Documentation/accel/qda/qda.rst   | 146 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 160 insertions(+)
> 
> diff --git a/Documentation/accel/index.rst b/Documentation/accel/index.rst
> index cbc7d4c3876a..5901ea7f784c 100644
> --- a/Documentation/accel/index.rst
> +++ b/Documentation/accel/index.rst
> @@ -10,4 +10,5 @@ Compute Accelerators
>     introduction
>     amdxdna/index
>     qaic/index
> +   qda/index
>     rocket/index
> diff --git a/Documentation/accel/qda/index.rst b/Documentation/accel/qda/index.rst
> new file mode 100644
> index 000000000000..013400cf9c25
> --- /dev/null
> +++ b/Documentation/accel/qda/index.rst
> @@ -0,0 +1,13 @@
> +.. SPDX-License-Identifier: GPL-2.0-only
> +
> +==================================
> +accel/qda Qualcomm DSP Accelerator
> +==================================
> +
> +The QDA driver provides a DRM accel based interface for Qualcomm DSP offload.
> +It uses the FastRPC protocol and integrates with DRM and GEM infrastructure
> +for device and buffer management.
> +
> +.. toctree::
> +
> +   qda
> diff --git a/Documentation/accel/qda/qda.rst b/Documentation/accel/qda/qda.rst
> new file mode 100644
> index 000000000000..9f49af6e6acc
> --- /dev/null
> +++ b/Documentation/accel/qda/qda.rst
> @@ -0,0 +1,146 @@
> +.. SPDX-License-Identifier: GPL-2.0-only
> +
> +=====================================
> +Qualcomm DSP Accelerator (QDA) Driver
> +=====================================
> +
> +Introduction
> +============
> +
> +The QDA driver is a DRM accel driver for Qualcomm's DSPs. It provides a
> +DRM accel based interface for Qualcomm DSP offload, supporting workloads
> +such as AI inference, computer vision, audio processing, and sensor offload
> +on Qualcomm SoCs. It uses the FastRPC protocol and integrates with DRM and
> +GEM infrastructure for device and buffer management.
> +
> +Key Features
> +============
> +
> +*   **DRM accel Interface**: Exposes a standard character device node
> +    (e.g., ``/dev/accel/accel0``) via the DRM accel subsystem.
> +*   **FastRPC Protocol**: Implements the FastRPC protocol for communication
> +    between the application processor and the DSP.
> +*   **GEM Buffer Management**: Uses the DRM GEM interface for buffer
> +    allocation, lifecycle management, and DMA-BUF import/export.
> +*   **IOMMU Isolation**: Uses IOMMU context banks to enforce memory isolation
> +    between different DSP user sessions.
> +*   **Modular Design**: Clean separation between the core DRM logic, the
> +    memory manager, and the RPMsg-based transport layer.
> +
> +Architecture
> +============
> +
> +The QDA driver consists of several functional blocks:
> +
> +1.  **Core Driver (``qda_drv``)**: Manages device registration, file operations,
> +    and DRM accel integration.
> +2.  **Memory Manager (``qda_memory_manager``)**: A flexible memory management
> +    layer that handles IOMMU context banks. It supports pluggable backends
> +    (such as DMA-coherent) to adapt to different SoC memory architectures.
> +3.  **GEM Subsystem**: Implements the DRM GEM interface for buffer management:
> +
> +    * **``qda_gem``**: Core GEM object management, including allocation, mmap
> +      operations, and buffer lifecycle management.
> +    * **``qda_prime``**: PRIME import functionality for DMA-BUF interoperability
> +      with other kernel subsystems.
> +
> +4.  **Transport Layer (``qda_rpmsg``)**: Abstraction over the RPMsg framework
> +    to handle low-level message passing with the DSP firmware.
> +5.  **Compute Bus (``qda_compute_bus``)**: A custom virtual bus used to
> +    enumerate and manage the specific compute context banks defined in the
> +    device tree. The bus was introduced because IOMMU context banks (CBs) are
> +    synthetic constructs — not real platform devices — making a platform driver
> +    an incorrect abstraction for them. The earlier platform-driver approach also
> +    had a race condition: device nodes were created before the RPMsg channel
> +    resources were fully initialized, and because ``probe`` runs asynchronously,
> +    applications could open a CB device and attempt to start a session before
> +    the underlying transport was ready. The compute bus makes CB lifetime
> +    explicitly subordinate to the parent QDA device, closing that window.
> +6.  **FastRPC Core (``qda_fastrpc``)**: Implements the protocol logic for
> +    marshalling arguments and handling remote invocations.
> +
> +User-Space API
> +==============
> +
> +The driver exposes a set of DRM-compliant IOCTLs:
> +
> +*   ``DRM_IOCTL_QDA_QUERY``: Query DSP type (e.g., "cdsp", "adsp")
> +    and capabilities.
> +*   ``DRM_IOCTL_QDA_REMOTE_SESSION_CREATE``: Initialize a new process context
> +    on the DSP.
> +*   ``DRM_IOCTL_QDA_REMOTE_INVOKE``: Submit a remote method invocation (the
> +    primary execution unit).
> +*   ``DRM_IOCTL_QDA_GEM_CREATE``: Allocate a GEM buffer object for DSP usage.
> +*   ``DRM_IOCTL_QDA_GEM_MMAP_OFFSET``: Retrieve mmap offsets for memory mapping.
> +*   ``DRM_IOCTL_QDA_REMOTE_MAP`` / ``DRM_IOCTL_QDA_REMOTE_MUNMAP``: Map or unmap
> +    buffers into the DSP's virtual address space. Each accepts a ``request``
> +    field selecting between a legacy operation (``QDA_MAP_REQUEST_LEGACY`` /
> +    ``QDA_MUNMAP_REQUEST_LEGACY``) and an attribute-based operation
> +    (``QDA_MAP_REQUEST_ATTR`` / ``QDA_MUNMAP_REQUEST_ATTR``).

Explain, what happens in the users don't map the buffers into the DSP
space. Will DRM_IOCTL_QDA_REMOTE_INVOKE handle the mapping or not? What
is the difference between those two modes?

Would the driver benefit from using GPUVM?

> +
> +Usage Example
> +=============
> +
> +A typical lifecycle for a user-space application:
> +
> +1.  **Discovery**: Open ``/dev/accel/accel*`` and use
> +    ``DRM_IOCTL_QDA_QUERY`` to identify the DSP domain served by that
> +    device node.
> +2.  **Initialization**: Call ``DRM_IOCTL_QDA_REMOTE_SESSION_CREATE`` to
> +    establish a session and create a process context on the DSP.
> +3.  **Memory**: Allocate buffers via ``DRM_IOCTL_QDA_GEM_CREATE`` or import
> +    DMA-BUFs (PRIME fd) from other drivers using ``DRM_IOCTL_PRIME_FD_TO_HANDLE``.
> +4.  **Execution**: Use ``DRM_IOCTL_QDA_REMOTE_INVOKE`` to pass arguments and
> +    execute functions on the DSP.
> +5.  **Cleanup**: Close file descriptors to automatically release resources and
> +    detach the session.

I'd have expected the description of the actual example. I.e. clone the
app from https://the.addr, prepare clang >= NN.MM, QAIC (https://foo),
run make, run the app, check the results. I'd remind that DRM Accel has
a very specific requirement of having the working toolhain in the
open-source.

> +
> +Internal Implementation
> +=======================
> +
> +Memory Management
> +-----------------
> +The driver's memory manager creates virtual "IOMMU devices" that map to
> +hardware context banks. This allows the driver to manage multiple isolated
> +address spaces. The implementation uses a DMA-coherent backend to ensure data consistency
> +between the CPU and DSP without manual cache maintenance in most cases.

GEM usage?

> +
> +Debugging
> +=========
> +The driver includes extensive dynamic debug support. Enable it via the
> +kernel's dynamic debug control:
> +
> +.. code-block:: bash
> +
> +    echo "file drivers/accel/qda/* +p" > /sys/kernel/debug/dynamic_debug/control
> +
> +Testing
> +=======
> +The QDA driver can be exercised using the ``fastrpc_test`` utility from the
> +FastRPC userspace library. Run the test application:

pointer

> +
> +.. code-block:: bash
> +
> +    fastrpc_test -d 3 -U 1 -t linux -a v68
> +
> +**Options**
> +
> +``-d domain``
> +    Select the DSP domain to run on:
> +
> +    * ``0`` — ADSP
> +    * ``1`` — MDSP
> +    * ``2`` — SDSP
> +    * ``3`` — CDSP *(default on targets with CDSP)*
> +
> +``-U unsigned_PD``
> +    Select signed or unsigned protection domain:
> +
> +    * ``0`` — signed PD
> +    * ``1`` — unsigned PD *(default)*
> +
> +``-t target``
> +    Target platform: ``android`` or ``linux`` *(default: linux)*
> +
> +``-a arch_version``
> +    DSP architecture version, e.g. ``v68``, ``v75`` *(default: v68)*
> 
> -- 
> 2.34.1
> 
> 

-- 
With best wishes
Dmitry

^ permalink raw reply

* Re: [PATCH 03/15] accel/qda: Add initial QDA DRM accelerator driver
From: Dmitry Baryshkov @ 2026-05-20 14:18 UTC (permalink / raw)
  To: ekansh.gupta
  Cc: Oded Gabbay, Jonathan Corbet, Shuah Khan, Joerg Roedel,
	Will Deacon, Robin Murphy, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, Sumit Semwal,
	Christian König, Bharath Kumar, Chenna Kesava Raju, srini,
	andersson, konradybcio, robin.clark, linux-kernel, dri-devel,
	linux-doc, linux-arm-msm, iommu, linux-media, linaro-mm-sig
In-Reply-To: <20260519-qda-series-v1-3-b2d984c297f8@oss.qualcomm.com>

On Tue, May 19, 2026 at 11:45:53AM +0530, Ekansh Gupta via B4 Relay wrote:
> From: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> 
> Add the foundational driver files for the Qualcomm DSP Accelerator
> (QDA), a DRM accel driver for Qualcomm DSPs. The driver integrates
> with the DRM accel subsystem (drivers/accel/) and provides:
> 
>   - A standard /dev/accel/accel* character device node via DRM.
>   - GEM-based buffer management with DMA-BUF import/export (PRIME).
>   - IOMMU context bank management for per-session memory isolation.
>   - Standard DRM IOCTLs for device management and job submission.
> 
> qda_drv.c / qda_drv.h: Core DRM driver registration. Defines the
> drm_driver ops table, per-file private state (qda_file_priv), and the
> main device structure (qda_dev) which embeds drm_device.
> 
> qda_rpmsg.c / qda_rpmsg.h: RPMsg transport layer. Registers an
> rpmsg_driver matching the "qcom,fastrpc" compatible string. On probe
> it allocates a qda_dev, reads the DSP domain name from the "label" DT
> property, and registers the DRM device.
> 
> Assisted-by: Claude:claude-4-6-sonnet
> Signed-off-by: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
> ---
>  drivers/accel/Kconfig         |  1 +
>  drivers/accel/Makefile        |  1 +
>  drivers/accel/qda/Kconfig     | 30 +++++++++++++
>  drivers/accel/qda/Makefile    | 10 +++++
>  drivers/accel/qda/qda_drv.c   | 97 ++++++++++++++++++++++++++++++++++++++++++
>  drivers/accel/qda/qda_drv.h   | 62 +++++++++++++++++++++++++++
>  drivers/accel/qda/qda_rpmsg.c | 99 +++++++++++++++++++++++++++++++++++++++++++
>  drivers/accel/qda/qda_rpmsg.h | 13 ++++++
>  8 files changed, 313 insertions(+)
> 
> diff --git a/drivers/accel/Kconfig b/drivers/accel/Kconfig
> index bdf48ccafcf2..74ac0f71bc9d 100644
> --- a/drivers/accel/Kconfig
> +++ b/drivers/accel/Kconfig
> @@ -29,6 +29,7 @@ source "drivers/accel/ethosu/Kconfig"
>  source "drivers/accel/habanalabs/Kconfig"
>  source "drivers/accel/ivpu/Kconfig"
>  source "drivers/accel/qaic/Kconfig"
> +source "drivers/accel/qda/Kconfig"
>  source "drivers/accel/rocket/Kconfig"
>  
>  endif
> diff --git a/drivers/accel/Makefile b/drivers/accel/Makefile
> index 1d3a7251b950..58c08dd5f389 100644
> --- a/drivers/accel/Makefile
> +++ b/drivers/accel/Makefile
> @@ -5,4 +5,5 @@ obj-$(CONFIG_DRM_ACCEL_ARM_ETHOSU)	+= ethosu/
>  obj-$(CONFIG_DRM_ACCEL_HABANALABS)	+= habanalabs/
>  obj-$(CONFIG_DRM_ACCEL_IVPU)		+= ivpu/
>  obj-$(CONFIG_DRM_ACCEL_QAIC)		+= qaic/
> +obj-$(CONFIG_DRM_ACCEL_QDA)		+= qda/
>  obj-$(CONFIG_DRM_ACCEL_ROCKET)		+= rocket/
> \ No newline at end of file
> diff --git a/drivers/accel/qda/Kconfig b/drivers/accel/qda/Kconfig
> new file mode 100644
> index 000000000000..484d21ff1b55
> --- /dev/null
> +++ b/drivers/accel/qda/Kconfig
> @@ -0,0 +1,30 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +#
> +# Qualcomm DSP accelerator driver
> +#
> +
> +config DRM_ACCEL_QDA
> +	tristate "Qualcomm DSP accelerator"
> +	depends on DRM_ACCEL
> +	depends on ARCH_QCOM || COMPILE_TEST
> +	depends on RPMSG
> +	help
> +	  Enables the DRM-based accelerator driver for Qualcomm's Hexagon DSPs.
> +	  This driver provides a standardized interface for offloading computational
> +	  tasks to the DSP, including audio processing, sensor offload, computer
> +	  vision, and AI inference workloads.
> +
> +	  The driver supports all DSP domains (ADSP, CDSP, SDSP, GDSP) and
> +	  implements the FastRPC protocol for communication between the application
> +	  processor and DSP. It integrates with the Linux kernel's Compute
> +	  Accelerators subsystem (drivers/accel/) and provides a modern alternative
> +	  to the legacy FastRPC driver found in drivers/misc/.
> +
> +	  Key features include DMA-BUF interoperability for seamless buffer sharing

Key features of what? Consider distro maintainers reading your help text
in order to identify whether to enable it or not.

> +	  with other multimedia subsystems, IOMMU-based memory isolation, and
> +	  standard DRM IOCTLs for device management and job submission.
> +
> +	  If unsure, say N.
> +
> +	  To compile this driver as a module, choose M here: the
> +	  module will be called qda.
> diff --git a/drivers/accel/qda/Makefile b/drivers/accel/qda/Makefile
> new file mode 100644
> index 000000000000..dbe809067a8b
> --- /dev/null
> +++ b/drivers/accel/qda/Makefile
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +#
> +# Makefile for Qualcomm DSP accelerator driver
> +#
> +
> +obj-$(CONFIG_DRM_ACCEL_QDA)	:= qda.o
> +
> +qda-y := \
> +	qda_drv.o \
> +	qda_rpmsg.o
> diff --git a/drivers/accel/qda/qda_drv.c b/drivers/accel/qda/qda_drv.c
> new file mode 100644
> index 000000000000..1c1bab68d445
> --- /dev/null
> +++ b/drivers/accel/qda/qda_drv.c
> @@ -0,0 +1,97 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <drm/drm_accel.h>
> +#include <drm/drm_drv.h>
> +#include <drm/drm_file.h>
> +#include <drm/drm_gem.h>
> +#include <drm/drm_ioctl.h>
> +#include <drm/drm_print.h>
> +
> +#include "qda_drv.h"
> +#include "qda_rpmsg.h"
> +
> +static int qda_open(struct drm_device *dev, struct drm_file *file)
> +{
> +	struct qda_file_priv *qda_file_priv;
> +
> +	qda_file_priv = kzalloc_obj(*qda_file_priv);
> +	if (!qda_file_priv)
> +		return -ENOMEM;
> +
> +	qda_file_priv->qda_dev = qda_dev_from_drm(dev);
> +	file->driver_priv = qda_file_priv;
> +
> +	return 0;
> +}
> +
> +static void qda_postclose(struct drm_device *dev, struct drm_file *file)
> +{
> +	struct qda_file_priv *qda_file_priv = file->driver_priv;
> +
> +	kfree(qda_file_priv);
> +	file->driver_priv = NULL;
> +}
> +
> +DEFINE_DRM_ACCEL_FOPS(qda_accel_fops);
> +
> +static const struct drm_driver qda_drm_driver = {
> +	.driver_features = DRIVER_COMPUTE_ACCEL,
> +	.fops = &qda_accel_fops,
> +	.open = qda_open,
> +	.postclose = qda_postclose,
> +	.name = QDA_DRIVER_NAME,
> +	.desc = "Qualcomm DSP Accelerator Driver",
> +};
> +
> +struct qda_dev *qda_alloc_device(struct device *dev)
> +{
> +	struct qda_dev *qdev;
> +
> +	qdev = devm_drm_dev_alloc(dev, &qda_drm_driver, struct qda_dev, drm_dev);
> +	if (IS_ERR(qdev))
> +		return ERR_CAST(qdev);
> +
> +	return qdev;
> +}
> +
> +void qda_unregister_device(struct qda_dev *qdev)
> +{
> +	drm_dev_unregister(&qdev->drm_dev);
> +}
> +
> +int qda_register_device(struct qda_dev *qdev)
> +{
> +	int ret;
> +
> +	ret = drm_dev_register(&qdev->drm_dev, 0);
> +	if (ret)
> +		drm_err(&qdev->drm_dev, "Failed to register DRM device: %d\n", ret);
> +
> +	return ret;
> +}
> +
> +static int __init qda_core_init(void)
> +{
> +	int ret;
> +
> +	ret = qda_rpmsg_register();
> +	if (ret)
> +		return ret;
> +
> +	pr_info("qda: QDA driver initialization complete\n");
> +	return 0;
> +}
> +
> +static void __exit qda_core_exit(void)
> +{
> +	qda_rpmsg_unregister();
> +}
> +
> +module_init(qda_core_init);
> +module_exit(qda_core_exit);
> +
> +MODULE_AUTHOR("Qualcomm AI Infra Team");
> +MODULE_DESCRIPTION("Qualcomm DSP Accelerator Driver");
> +MODULE_LICENSE("GPL");
> diff --git a/drivers/accel/qda/qda_drv.h b/drivers/accel/qda/qda_drv.h
> new file mode 100644
> index 000000000000..7ba2ef19a411
> --- /dev/null
> +++ b/drivers/accel/qda/qda_drv.h
> @@ -0,0 +1,62 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> + */
> +
> +#ifndef __QDA_DRV_H__
> +#define __QDA_DRV_H__
> +
> +#include <linux/device.h>
> +#include <linux/rpmsg.h>
> +#include <linux/types.h>
> +#include <drm/drm_device.h>
> +#include <drm/drm_drv.h>
> +#include <drm/drm_file.h>
> +
> +/* Driver identification */
> +#define QDA_DRIVER_NAME "qda"
> +
> +/**
> + * struct qda_file_priv - Per-process private data for DRM file
> + */
> +struct qda_file_priv {
> +	/** @qda_dev: Back-pointer to device structure */
> +	struct qda_dev *qda_dev;
> +};
> +
> +/**
> + * struct qda_dev - Main device structure for QDA driver
> + *
> + * The DRM device is embedded as the first member so that container_of()
> + * can recover the qda_dev from any drm_device pointer.
> + */
> +struct qda_dev {
> +	/** @drm_dev: Embedded DRM device; recover via qda_dev_from_drm() */
> +	struct drm_device drm_dev;
> +	/** @rpdev: RPMsg device for communication with the remote processor */
> +	struct rpmsg_device *rpdev;
> +	/** @dev: Underlying Linux device */
> +	struct device *dev;
> +	/** @dsp_name: Name of the DSP domain (e.g. "cdsp", "adsp") */
> +	const char *dsp_name;
> +};
> +
> +/**
> + * qda_dev_from_drm - Recover qda_dev from an embedded drm_device pointer
> + * @dev: Pointer to the embedded drm_device
> + *
> + * Return: Pointer to the enclosing qda_dev.
> + */
> +static inline struct qda_dev *qda_dev_from_drm(struct drm_device *dev)
> +{
> +	return container_of(dev, struct qda_dev, drm_dev);
> +}
> +
> +/* Device allocation (uses devm_drm_dev_alloc internally) */
> +struct qda_dev *qda_alloc_device(struct device *dev);
> +
> +/* Core device lifecycle */
> +int qda_register_device(struct qda_dev *qdev);
> +void qda_unregister_device(struct qda_dev *qdev);
> +
> +#endif /* __QDA_DRV_H__ */
> diff --git a/drivers/accel/qda/qda_rpmsg.c b/drivers/accel/qda/qda_rpmsg.c
> new file mode 100644
> index 000000000000..6eaf1b145f8a
> --- /dev/null
> +++ b/drivers/accel/qda/qda_rpmsg.c
> @@ -0,0 +1,99 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/rpmsg.h>
> +#include <drm/drm_print.h>
> +
> +#include "qda_drv.h"
> +#include "qda_rpmsg.h"
> +
> +static struct qda_dev *alloc_and_init_qdev(struct rpmsg_device *rpdev)

Use the prefix uniformly.

> +{
> +	struct qda_dev *qdev;
> +
> +	qdev = qda_alloc_device(&rpdev->dev);
> +	if (IS_ERR(qdev))
> +		return qdev;
> +
> +	qdev->dev = &rpdev->dev;
> +	qdev->rpdev = rpdev;
> +	dev_set_drvdata(&rpdev->dev, qdev);
> +
> +	return qdev;
> +}
> +
> +static int qda_rpmsg_cb(struct rpmsg_device *rpdev, void *data, int len,
> +			void *priv, u32 src)
> +{
> +	/* Placeholder: responses will be dispatched here */
> +	return 0;
> +}
> +
> +static void qda_rpmsg_remove(struct rpmsg_device *rpdev)
> +{
> +	struct qda_dev *qdev = dev_get_drvdata(&rpdev->dev);
> +
> +	drm_dev_unplug(&qdev->drm_dev);
> +	qdev->rpdev = NULL;
> +	qda_unregister_device(qdev);
> +	dev_info(qdev->dev, "RPMsg device removed\n");

Drop the spamming. And useless (where it is useless) drm_dbg() / dev_dbg() spamming too.

> +}
> +
> +static int qda_rpmsg_probe(struct rpmsg_device *rpdev)
> +{
> +	struct qda_dev *qdev;
> +	const char *label;
> +	int ret;
> +
> +	dev_dbg(&rpdev->dev, "QDA RPMsg probe starting\n");
> +
> +	qdev = alloc_and_init_qdev(rpdev);
> +	if (IS_ERR(qdev))
> +		return PTR_ERR(qdev);
> +
> +	ret = of_property_read_string(rpdev->dev.of_node, "label", &label);
> +	if (ret) {
> +		dev_err(qdev->dev, "Missing 'label' property in DT node: %d\n", ret);
> +		return ret;
> +	}
> +	qdev->dsp_name = label;

Why not just of_property_read_string(...., &qdev->dsp_name)?

> +
> +	ret = qda_register_device(qdev);

return qda_register_device();

> +	if (ret)
> +		return ret;
> +
> +	drm_info(&qdev->drm_dev, "QDA RPMsg probe complete for %s\n", qdev->dsp_name);
> +	return 0;
> +}
> +
> +static const struct of_device_id qda_rpmsg_id_table[] = {
> +	{ .compatible = "qcom,fastrpc" },
> +	{},
> +};
> +MODULE_DEVICE_TABLE(of, qda_rpmsg_id_table);
> +
> +static struct rpmsg_driver qda_rpmsg_driver = {
> +	.probe = qda_rpmsg_probe,
> +	.remove = qda_rpmsg_remove,
> +	.callback = qda_rpmsg_cb,
> +	.drv = {
> +		.name = "qcom,fastrpc",
> +		.of_match_table = qda_rpmsg_id_table,
> +	},
> +};
> +
> +int qda_rpmsg_register(void)
> +{
> +	int ret = register_rpmsg_driver(&qda_rpmsg_driver);
> +
> +	if (ret)
> +		pr_err("qda: Failed to register RPMsg driver: %d\n", ret);
> +
> +	return ret;
> +}
> +
> +void qda_rpmsg_unregister(void)
> +{
> +	unregister_rpmsg_driver(&qda_rpmsg_driver);
> +}

Just use module_rpmsg_driver(), drop all the wrappers and module_init()
/ exit().

> diff --git a/drivers/accel/qda/qda_rpmsg.h b/drivers/accel/qda/qda_rpmsg.h
> new file mode 100644
> index 000000000000..5229d834b34b
> --- /dev/null
> +++ b/drivers/accel/qda/qda_rpmsg.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> + */
> +
> +#ifndef __QDA_RPMSG_H__
> +#define __QDA_RPMSG_H__
> +
> +/* RPMsg transport layer registration */
> +int qda_rpmsg_register(void);
> +void qda_rpmsg_unregister(void);
> +
> +#endif /* __QDA_RPMSG_H__ */
> 
> -- 
> 2.34.1
> 
> 

-- 
With best wishes
Dmitry

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox