All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] drm/v3d: bound CPU-job query writes to their destination BO
@ 2026-06-14 21:16 Michael Bommarito
  2026-06-14 21:31 ` sashiko-bot
  2026-06-15 12:40 ` Maíra Canal
  0 siblings, 2 replies; 3+ messages in thread
From: Michael Bommarito @ 2026-06-14 21:16 UTC (permalink / raw)
  To: Melissa Wen, Maira Canal
  Cc: Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	Iago Toral Quiroga, dri-devel, linux-kernel

The V3D_SUBMIT_CPU query CPU jobs store a user-supplied destination
offset (and, for the copy variants, a per-query stride and query count)
on the job and consume them at exec time without checking that the
writes stay inside the destination BO:

  - TIMESTAMP_QUERY and RESET_TIMESTAMP_QUERY write one u64 per query
    into bo[0] at a fully user-controlled per-query offset.
  - COPY_TIMESTAMP_QUERY copies one u64 per query into bo[0] at
    offset + i * stride, and reads each result from a user-controlled
    offset in the source bo[1].
  - COPY_PERFORMANCE_QUERY writes nperfmons * DRM_V3D_MAX_PERF_COUNTERS
    counter slots plus an availability slot into bo[0] at the same
    geometry.

None of these are bounded against the BO size, so a render-node user
(DRM_RENDER_ALLOW, no master, no capability) can make the handlers
write past the BO's vmap mapping.

Validate the full write extent against the destination BO size once the
BOs are looked up, before the job is queued, rejecting out-of-range or
overflowing geometry with -EINVAL. The copy extent
offset + (count - 1) * stride + write_size is accumulated with
check_*_overflow() so a u32 product cannot wrap the bound, and the
performance slot count is computed the same way since nperfmons and
ncounters are user values. The bare timestamp writes and the copy
source reads are a single fixed u64 slot per query and are bounded
directly.

Fixes: 9ba0ff3e083f ("drm/v3d: Create a CPU job extension for the timestamp query job")
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
---
v2:
  - Extend the bound to the bare TIMESTAMP_QUERY and
    RESET_TIMESTAMP_QUERY job types, which also write one u64 per query
    into bo[0] at a user-controlled offset (Maira Canal).
  - Simplify v3d_cpu_job_bounds_check(): the timestamp source/dest u64
    slots are bounded directly without the can-never-overflow
    check_mul_overflow(); only the genuinely user-sized copy extent and
    the performance slot count keep check_*_overflow().
  - Drop the redundant local in v3d_check_copy_extent(); rename the gate
    to v3d_cpu_job_bounds_check(); reword the helper comments.
  - Drop the KUnit reproducer patch; the suite is kept out of tree.
  - Re-pin Fixes: to the earliest introducing commit (the timestamp
    query job) now that the bare timestamp writes are covered.

Reproduced under KASAN with an out-of-tree KUnit driving the real
handlers over shmem-backed BOs: with a query offset at the BO size the
stock tree reports a vmalloc-out-of-bounds write in each of
v3d_copy_query_results() (4 bytes), v3d_timestamp_query() (8 bytes) and
v3d_reset_timestamp_queries() (8 bytes); with this patch the submit-time
gate rejects the geometry and all in-bounds controls still pass. The
write is plain CPU memory and not architecture specific (x86_64,
COMPILE_TEST).

 drivers/gpu/drm/v3d/v3d_submit.c | 102 +++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c
index ee4512db294b3..fe11fd7e6e14a 100644
--- a/drivers/gpu/drm/v3d/v3d_submit.c
+++ b/drivers/gpu/drm/v3d/v3d_submit.c
@@ -1246,6 +1246,104 @@ static const unsigned int cpu_job_bo_handle_count[] = {
 	[V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY] = 1,
 };
 
+/* Reject offset + (count - 1) * stride + write_size if it leaves the BO. */
+static int
+v3d_check_copy_extent(struct drm_device *dev, size_t bo_size,
+		      u32 offset, u32 stride, u32 count, u32 write_size)
+{
+	u32 last;
+
+	if (!count)
+		return 0;
+
+	if (check_mul_overflow(stride, count - 1, &last) ||
+	    check_add_overflow(last, write_size, &last) ||
+	    check_add_overflow(last, offset, &last) ||
+	    last > bo_size) {
+		drm_dbg(dev, "CPU job copy buffer exceeds the destination BO.\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Reject a query CPU job whose writes would land outside their BO. */
+static int
+v3d_cpu_job_bounds_check(struct v3d_cpu_job *job)
+{
+	struct drm_device *dev = &job->base.v3d->drm;
+	struct v3d_timestamp_query_info *tquery = &job->timestamp_query;
+	struct v3d_copy_query_results_info *copy = &job->copy;
+	u32 elem = copy->do_64bit ? sizeof(u64) : sizeof(u32);
+	struct v3d_bo *dst, *src;
+	u32 slots, write_size;
+	int i;
+
+	switch (job->job_type) {
+	case V3D_CPU_JOB_TYPE_TIMESTAMP_QUERY:
+	case V3D_CPU_JOB_TYPE_RESET_TIMESTAMP_QUERY:
+		/* Each query writes one u64 timestamp slot into bo[0]. */
+		dst = to_v3d_bo(job->base.bo[0]);
+
+		for (i = 0; i < tquery->count; i++) {
+			if ((u64)tquery->queries[i].offset + sizeof(u64) >
+			    dst->base.base.size)
+				goto err_range;
+		}
+		return 0;
+	case V3D_CPU_JOB_TYPE_COPY_TIMESTAMP_QUERY:
+		/* Copies one u64 per query from bo[1] into bo[0]. */
+		dst = to_v3d_bo(job->base.bo[0]);
+		src = to_v3d_bo(job->base.bo[1]);
+
+		for (i = 0; i < tquery->count; i++) {
+			if ((u64)tquery->queries[i].offset + sizeof(u64) >
+			    src->base.base.size)
+				goto err_range;
+		}
+
+		write_size = (copy->availability_bit ? 2 : 1) * elem;
+		return v3d_check_copy_extent(dev, dst->base.base.size,
+					     copy->offset, copy->stride,
+					     tquery->count, write_size);
+	case V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY:
+		/*
+		 * Each query writes nperfmons * DRM_V3D_MAX_PERF_COUNTERS
+		 * counter slots into bo[0], plus an availability slot at
+		 * index ncounters. nperfmons and ncounters are user values,
+		 * so the slot count is computed overflow-safe.
+		 */
+		dst = to_v3d_bo(job->base.bo[0]);
+
+		if (check_mul_overflow(job->performance_query.nperfmons,
+				       (u32)DRM_V3D_MAX_PERF_COUNTERS, &slots))
+			goto err_range;
+
+		if (copy->availability_bit) {
+			u32 avail_slots;
+
+			if (check_add_overflow(job->performance_query.ncounters,
+					       1u, &avail_slots))
+				goto err_range;
+			slots = max(slots, avail_slots);
+		}
+
+		if (check_mul_overflow(slots, elem, &write_size))
+			goto err_range;
+
+		return v3d_check_copy_extent(dev, dst->base.base.size,
+					     copy->offset, copy->stride,
+					     job->performance_query.count,
+					     write_size);
+	default:
+		return 0;
+	}
+
+err_range:
+	drm_dbg(dev, "CPU job query offset exceeds the BO.\n");
+	return -EINVAL;
+}
+
 /**
  * v3d_submit_cpu_ioctl() - Submits a CPU job to the V3D.
  * @dev: DRM device
@@ -1317,6 +1415,10 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data,
 		if (ret)
 			goto fail;
 
+		ret = v3d_cpu_job_bounds_check(cpu_job);
+		if (ret)
+			goto fail;
+
 		ret = v3d_lock_bo_reservations(&cpu_job->base, &acquire_ctx);
 		if (ret)
 			goto fail;
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] drm/v3d: bound CPU-job query writes to their destination BO
  2026-06-14 21:16 [PATCH v2] drm/v3d: bound CPU-job query writes to their destination BO Michael Bommarito
@ 2026-06-14 21:31 ` sashiko-bot
  2026-06-15 12:40 ` Maíra Canal
  1 sibling, 0 replies; 3+ messages in thread
From: sashiko-bot @ 2026-06-14 21:31 UTC (permalink / raw)
  To: Michael Bommarito; +Cc: dri-devel

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:

Pre-existing issues:
- [Critical] The V3D_CPU_JOB_TYPE_INDIRECT_CSD job type was omitted from the new CPU job bounds checks, leaving unmitigated arbitrary out-of-bounds kernel memory read and write vulnerabilities.
- [High] v3d_get_cpu_indirect_csd_params does not check if drm_gem_object_lookup returns NULL for the indirect handle, leading to a NULL pointer dereference during job execution.
- [High] Kernel crash due to calling cleanup functions on uninitialized memory if CPU job validation fails early.
--

commit 0ce9f49e205ba6d1094ea08101223cdd03baade5
Author: Michael Bommarito <michael.bommarito@gmail.com>
Subject: drm/v3d: bound CPU-job query writes to their destination BO

This commit adds bounds checking for CPU job queries to ensure they do not
write past their destination buffer object. It validates the full write
extent against the destination BO size before the job is queued.

> diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c
> index 1db43c6a078d5..ed45989b84b28 100644
> --- a/drivers/gpu/drm/v3d/v3d_submit.c
> +++ b/drivers/gpu/drm/v3d/v3d_submit.c
[ ... ]
> +/* Reject a query CPU job whose writes would land outside their BO. */
> +static int
> +v3d_cpu_job_bounds_check(struct v3d_cpu_job *job)
> +{
[ ... ]
> +	switch (job->job_type) {
> +	case V3D_CPU_JOB_TYPE_TIMESTAMP_QUERY:
> +	case V3D_CPU_JOB_TYPE_RESET_TIMESTAMP_QUERY:
[ ... ]
> +	default:
> +		return 0;

[Severity: Critical]
This isn't a bug introduced by this patch, but since we are adding bounds
checking here, should V3D_CPU_JOB_TYPE_INDIRECT_CSD be validated as well?

Because this switch falls through to the default case for INDIRECT_CSD
jobs, the user-controlled indirect_csd->offset is left unvalidated.
When this is later used in v3d_rewrite_csd_job_wg_counts_from_indirect():

v3d_rewrite_csd_job_wg_counts_from_indirect()
    wg_counts = (uint32_t *)(bo->vaddr + indirect_csd->offset);
    ...
    ((uint32_t *)indirect->vaddr)[uniform_idx] = wg_counts[i];

Could this allow out-of-bounds kernel memory reads and writes?

> +	}
> +
> +err_range:
> +	drm_dbg(dev, "CPU job query offset exceeds the BO.\n");
> +	return -EINVAL;
> +}
> +
>  /**
>   * v3d_submit_cpu_ioctl() - Submits a CPU job to the V3D.
>   * @dev: DRM device
> @@ -1346,6 +1444,10 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data,
>  		if (ret)
>  			goto fail;
>  
> +		ret = v3d_cpu_job_bounds_check(cpu_job);
> +		if (ret)
> +			goto fail;
> +

[Severity: High]
This is a pre-existing issue, but does calling goto fail here or earlier
in v3d_submit_cpu_ioctl() risk a NULL pointer dereference?

If validation fails before v3d_job_init() is called (for example, if
bo_handle_count is incorrect), cpu_job->base.s_fence remains NULL. The
error path then cleans up the uninitialized job:

v3d_submit_cpu_ioctl()
    ...
    if (args->bo_handle_count != ...)
        goto fail;
    ...
fail:
    v3d_job_cleanup((void *)cpu_job);

When drm_sched_job_cleanup() is reached:

drm_sched_job_cleanup()
    if (kref_read(&job->s_fence->finished.refcount)) {

Would this unconditionally dereference the NULL s_fence and crash the
kernel?

[Severity: High]
This is another pre-existing issue in the CPU job submission path, but
is it possible for the indirect_csd parameters to trigger a crash if
the user provides an invalid GEM handle?

In v3d_get_cpu_indirect_csd_params(), the return value of
drm_gem_object_lookup() is stored without a NULL check:

v3d_get_cpu_indirect_csd_params()
    info->indirect = drm_gem_object_lookup(file_priv, indirect_csd.indirect);

If this is NULL, when the job executes:

v3d_rewrite_csd_job_wg_counts_from_indirect()
    v3d_get_bo_vaddr(indirect);

v3d_get_bo_vaddr()
    bo->vaddr = vmap(obj->pages, obj->base.size >> PAGE_SHIFT, ...);

Will this lead to an immediate NULL pointer dereference when obj->pages or
obj->base.size is accessed?

>  		ret = v3d_lock_bo_reservations(&cpu_job->base, &acquire_ctx);
>  		if (ret)
>  			goto fail;

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260614211644.217116-1-michael.bommarito@gmail.com?part=1

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] drm/v3d: bound CPU-job query writes to their destination BO
  2026-06-14 21:16 [PATCH v2] drm/v3d: bound CPU-job query writes to their destination BO Michael Bommarito
  2026-06-14 21:31 ` sashiko-bot
@ 2026-06-15 12:40 ` Maíra Canal
  1 sibling, 0 replies; 3+ messages in thread
From: Maíra Canal @ 2026-06-15 12:40 UTC (permalink / raw)
  To: Michael Bommarito, Melissa Wen
  Cc: Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	Iago Toral Quiroga, dri-devel, linux-kernel

Hi Michael,

I just noticed that the same problem also apply to
V3D_CPU_JOB_TYPE_INDIRECT_CSD. Could you also address this issue for
such extension?

On 6/14/26 18:16, Michael Bommarito wrote:
> The V3D_SUBMIT_CPU query CPU jobs store a user-supplied destination
> offset (and, for the copy variants, a per-query stride and query count)
> on the job and consume them at exec time without checking that the
> writes stay inside the destination BO:
> 
>    - TIMESTAMP_QUERY and RESET_TIMESTAMP_QUERY write one u64 per query
>      into bo[0] at a fully user-controlled per-query offset.
>    - COPY_TIMESTAMP_QUERY copies one u64 per query into bo[0] at
>      offset + i * stride, and reads each result from a user-controlled
>      offset in the source bo[1].
>    - COPY_PERFORMANCE_QUERY writes nperfmons * DRM_V3D_MAX_PERF_COUNTERS
>      counter slots plus an availability slot into bo[0] at the same
>      geometry.
> 
> None of these are bounded against the BO size, so a render-node user
> (DRM_RENDER_ALLOW, no master, no capability) can make the handlers
> write past the BO's vmap mapping.
> 
> Validate the full write extent against the destination BO size once the
> BOs are looked up, before the job is queued, rejecting out-of-range or
> overflowing geometry with -EINVAL. The copy extent
> offset + (count - 1) * stride + write_size is accumulated with
> check_*_overflow() so a u32 product cannot wrap the bound, and the
> performance slot count is computed the same way since nperfmons and
> ncounters are user values. The bare timestamp writes and the copy
> source reads are a single fixed u64 slot per query and are bounded
> directly.
> 
> Fixes: 9ba0ff3e083f ("drm/v3d: Create a CPU job extension for the timestamp query job")
> Assisted-by: Claude:claude-opus-4-8
> Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
> ---
> v2:
>    - Extend the bound to the bare TIMESTAMP_QUERY and
>      RESET_TIMESTAMP_QUERY job types, which also write one u64 per query
>      into bo[0] at a user-controlled offset (Maira Canal).
>    - Simplify v3d_cpu_job_bounds_check(): the timestamp source/dest u64
>      slots are bounded directly without the can-never-overflow
>      check_mul_overflow(); only the genuinely user-sized copy extent and
>      the performance slot count keep check_*_overflow().
>    - Drop the redundant local in v3d_check_copy_extent(); rename the gate
>      to v3d_cpu_job_bounds_check(); reword the helper comments.
>    - Drop the KUnit reproducer patch; the suite is kept out of tree.
>    - Re-pin Fixes: to the earliest introducing commit (the timestamp
>      query job) now that the bare timestamp writes are covered.
> 
> Reproduced under KASAN with an out-of-tree KUnit driving the real
> handlers over shmem-backed BOs: with a query offset at the BO size the
> stock tree reports a vmalloc-out-of-bounds write in each of
> v3d_copy_query_results() (4 bytes), v3d_timestamp_query() (8 bytes) and
> v3d_reset_timestamp_queries() (8 bytes); with this patch the submit-time
> gate rejects the geometry and all in-bounds controls still pass. The
> write is plain CPU memory and not architecture specific (x86_64,
> COMPILE_TEST).
> 
>   drivers/gpu/drm/v3d/v3d_submit.c | 102 +++++++++++++++++++++++++++++++
>   1 file changed, 102 insertions(+)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c
> index ee4512db294b3..fe11fd7e6e14a 100644
> --- a/drivers/gpu/drm/v3d/v3d_submit.c
> +++ b/drivers/gpu/drm/v3d/v3d_submit.c
> @@ -1246,6 +1246,104 @@ static const unsigned int cpu_job_bo_handle_count[] = {
>   	[V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY] = 1,
>   };
>   
> +/* Reject offset + (count - 1) * stride + write_size if it leaves the BO. */
> +static int
> +v3d_check_copy_extent(struct drm_device *dev, size_t bo_size,
> +		      u32 offset, u32 stride, u32 count, u32 write_size)

Looking at the actual copy executors, I think the extent math in
v3d_cpu_job_bounds_check() should be done in u64 rather than u32 +
check_*_overflow().

The executors (v3d_copy_query_results/v3d_copy_performance_query)
compute the destination at pointer width:

       data = (u8 *)bo->vaddr + copy->offset;
       for (...) { write data[idx]; data += copy->stride; }

So the furthest byte is offset + (count - 1) * stride + write_size,
accumulated on a real u8 * (64-bit on arm64). Doing the extent in u64
mirrors the executor and would simplify the code.

I'd suggest having v3d_check_copy_extent() take u64 for write_size and
compute offset + (count - 1) * stride + write_size in u64. The (count -
1) * stride term is u32 x u32, so it's exact in u64 and needs no
overflow check, which removes the bulk of the check_*_overflow() dance.
The final additions can still be guarded with a single
check_add_overflow().

> +{
> +	u32 last;
> +
> +	if (!count)
> +		return 0;
> +
> +	if (check_mul_overflow(stride, count - 1, &last) ||
> +	    check_add_overflow(last, write_size, &last) ||
> +	    check_add_overflow(last, offset, &last) ||
> +	    last > bo_size) {
> +		drm_dbg(dev, "CPU job copy buffer exceeds the destination BO.\n");
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +/* Reject a query CPU job whose writes would land outside their BO. */
> +static int
> +v3d_cpu_job_bounds_check(struct v3d_cpu_job *job)
> +{
> +	struct drm_device *dev = &job->base.v3d->drm;
> +	struct v3d_timestamp_query_info *tquery = &job->timestamp_query;
> +	struct v3d_copy_query_results_info *copy = &job->copy;
> +	u32 elem = copy->do_64bit ? sizeof(u64) : sizeof(u32);
> +	struct v3d_bo *dst, *src;
> +	u32 slots, write_size;
> +	int i;
> +
> +	switch (job->job_type) {
> +	case V3D_CPU_JOB_TYPE_TIMESTAMP_QUERY:
> +	case V3D_CPU_JOB_TYPE_RESET_TIMESTAMP_QUERY:
> +		/* Each query writes one u64 timestamp slot into bo[0]. */
> +		dst = to_v3d_bo(job->base.bo[0]);
> +
> +		for (i = 0; i < tquery->count; i++) {

Maybe add a comment here mentioning that we need (u64) so a 0xffffffff
offset won't wrap.

> +			if ((u64)tquery->queries[i].offset + sizeof(u64) >
> +			    dst->base.base.size)
> +				goto err_range;
> +		}
> +		return 0;
> +	case V3D_CPU_JOB_TYPE_COPY_TIMESTAMP_QUERY:
> +		/* Copies one u64 per query from bo[1] into bo[0]. */
> +		dst = to_v3d_bo(job->base.bo[0]);
> +		src = to_v3d_bo(job->base.bo[1]);
> +
> +		for (i = 0; i < tquery->count; i++) {
> +			if ((u64)tquery->queries[i].offset + sizeof(u64) >
> +			    src->base.base.size)
> +				goto err_range;
> +		}
> +
> +		write_size = (copy->availability_bit ? 2 : 1) * elem;
> +		return v3d_check_copy_extent(dev, dst->base.base.size,
> +					     copy->offset, copy->stride,
> +					     tquery->count, write_size);
> +	case V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY:
> +		/*
> +		 * Each query writes nperfmons * DRM_V3D_MAX_PERF_COUNTERS
> +		 * counter slots into bo[0], plus an availability slot at
> +		 * index ncounters. nperfmons and ncounters are user values,
> +		 * so the slot count is computed overflow-safe.
> +		 */
> +		dst = to_v3d_bo(job->base.bo[0]);
> +
> +		if (check_mul_overflow(job->performance_query.nperfmons,
> +				       (u32)DRM_V3D_MAX_PERF_COUNTERS, &slots))
> +			goto err_range;

With the previous suggestion, you can drop this check.

Best regards,
- Maíra

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-15 12:40 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-14 21:16 [PATCH v2] drm/v3d: bound CPU-job query writes to their destination BO Michael Bommarito
2026-06-14 21:31 ` sashiko-bot
2026-06-15 12:40 ` Maíra Canal

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.