AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: fix strsep() corrupting lockup_timeout module parameter on multi-GPU
@ 2026-03-17 18:17 Ruijing Dong
  2026-03-18  7:30 ` Christian König
  0 siblings, 1 reply; 2+ messages in thread
From: Ruijing Dong @ 2026-03-17 18:17 UTC (permalink / raw)
  To: Christian.Koenig, Alexander.Deucher, amd-gfx; +Cc: ruijing.dong, leo.liu

amdgpu_device_get_job_timeout_settings() passes a pointer directly to
the global amdgpu_lockup_timeout[] buffer into strsep(). strsep()
destructively replaces delimiter characters with '\0' in-place.

On multi-GPU systems, this function is called once per device. When a
multi-value setting like "0,0,0,-1" is used, the first GPU's call
transforms the global buffer into "0\00\00\0-1". The second GPU then
sees only "0" (terminated at the first '\0'), parses a single value,
hits the single-value fallthrough (index == 1), and applies timeout=0
to all rings — causing immediate false job timeouts.

Fix this by using kstrdup() to make a local copy before calling strsep(),
so the global module parameter buffer remains intact across calls. A
separate pointer is kept to the allocation start since strsep() advances
the working pointer to NULL by the end of parsing.

Signed-off-by: Ruijing Dong <ruijing.dong@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dcae77b6c272..97ebcc5bb763 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3498,7 +3498,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 
 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 {
-	char *input = amdgpu_lockup_timeout;
+	char *input, *input_copy;
 	char *timeout_setting = NULL;
 	int index = 0;
 	long timeout;
@@ -3508,14 +3508,25 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 	adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
 		adev->video_timeout = msecs_to_jiffies(2000);
 
-	if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
+	if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
 		return 0;
 
+	/*
+	 * strsep() destructively modifies its input by replacing delimiters
+	 * with '\0'. Make a local copy so the global module parameter buffer
+	 * remains intact for multi-GPU systems where this function is called
+	 * once per device.
+	 */
+	input = kstrdup(amdgpu_lockup_timeout, GFP_KERNEL);
+	if (!input)
+		return -ENOMEM;
+	input_copy = input;
+
 	while ((timeout_setting = strsep(&input, ",")) &&
 	       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
 		ret = kstrtol(timeout_setting, 0, &timeout);
 		if (ret)
-			return ret;
+			goto out_free;
 
 		if (timeout == 0) {
 			index++;
@@ -3551,6 +3562,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
 			adev->video_timeout = timeout;
 
+out_free:
+	kfree(input_copy);
 	return ret;
 }
 
-- 
2.49.0.593.gd86a19f485


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] drm/amdgpu: fix strsep() corrupting lockup_timeout module parameter on multi-GPU
  2026-03-17 18:17 [PATCH] drm/amdgpu: fix strsep() corrupting lockup_timeout module parameter on multi-GPU Ruijing Dong
@ 2026-03-18  7:30 ` Christian König
  0 siblings, 0 replies; 2+ messages in thread
From: Christian König @ 2026-03-18  7:30 UTC (permalink / raw)
  To: Ruijing Dong, Alexander.Deucher, amd-gfx; +Cc: leo.liu

On 3/17/26 19:17, Ruijing Dong wrote:
> amdgpu_device_get_job_timeout_settings() passes a pointer directly to
> the global amdgpu_lockup_timeout[] buffer into strsep(). strsep()
> destructively replaces delimiter characters with '\0' in-place.
> 
> On multi-GPU systems, this function is called once per device. When a
> multi-value setting like "0,0,0,-1" is used, the first GPU's call
> transforms the global buffer into "0\00\00\0-1". The second GPU then
> sees only "0" (terminated at the first '\0'), parses a single value,
> hits the single-value fallthrough (index == 1), and applies timeout=0
> to all rings — causing immediate false job timeouts.
> 
> Fix this by using kstrdup() to make a local copy before calling strsep(),
> so the global module parameter buffer remains intact across calls. A
> separate pointer is kept to the allocation start since strsep() advances
> the working pointer to NULL by the end of parsing.
> 
> Signed-off-by: Ruijing Dong <ruijing.dong@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++++++---
>  1 file changed, 16 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index dcae77b6c272..97ebcc5bb763 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3498,7 +3498,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>  
>  static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>  {
> -	char *input = amdgpu_lockup_timeout;
> +	char *input, *input_copy;
>  	char *timeout_setting = NULL;
>  	int index = 0;
>  	long timeout;
> @@ -3508,14 +3508,25 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>  	adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
>  		adev->video_timeout = msecs_to_jiffies(2000);
>  
> -	if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
> +	if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
>  		return 0;
>  
> +	/*
> +	 * strsep() destructively modifies its input by replacing delimiters
> +	 * with '\0'. Make a local copy so the global module parameter buffer
> +	 * remains intact for multi-GPU systems where this function is called
> +	 * once per device.
> +	 */
> +	input = kstrdup(amdgpu_lockup_timeout, GFP_KERNEL);

I think it is safe to copy the parameter to the stack instead of using kmalloc() here.

Apart from that it's a pretty good catch.

Regards,
Christian.

> +	if (!input)
> +		return -ENOMEM;
> +	input_copy = input;
> +
>  	while ((timeout_setting = strsep(&input, ",")) &&
>  	       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
>  		ret = kstrtol(timeout_setting, 0, &timeout);
>  		if (ret)
> -			return ret;
> +			goto out_free;
>  
>  		if (timeout == 0) {
>  			index++;
> @@ -3551,6 +3562,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>  		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
>  			adev->video_timeout = timeout;
>  
> +out_free:
> +	kfree(input_copy);
>  	return ret;
>  }
>  


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-03-18  7:32 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-03-17 18:17 [PATCH] drm/amdgpu: fix strsep() corrupting lockup_timeout module parameter on multi-GPU Ruijing Dong
2026-03-18  7:30 ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox