Building the Linux kernel with Clang and LLVM
 help / color / mirror / Atom feed
* [PATCH] drm/msm/a6xx: Fix excessive stack usage
@ 2024-10-27 18:05 Akhil P Oommen
  2024-10-27 18:43 ` Arnd Bergmann
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Akhil P Oommen @ 2024-10-27 18:05 UTC (permalink / raw)
  To: Rob Clark, Sean Paul, Konrad Dybcio, Abhinav Kumar,
	Dmitry Baryshkov, Marijn Suijten, David Airlie, Simona Vetter,
	Nathan Chancellor, Nick Desaulniers, Bill Wendling, Justin Stitt
  Cc: linux-arm-msm, dri-devel, freedreno, linux-kernel, llvm,
	Arnd Bergmann, Akhil P Oommen

Clang-19 and above sometimes end up with multiple copies of the large
a6xx_hfi_msg_bw_table structure on the stack. The problem is that
a6xx_hfi_send_bw_table() calls a number of device specific functions to
fill the structure, but these create another copy of the structure on
the stack which gets copied to the first.

If the functions get inlined, that busts the warning limit:

drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]

Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
the stack. Also, use this opportunity to skip re-initializing this table
to optimize gpu wake up latency.

Cc: Arnd Bergmann <arnd@kernel.org>

Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
---
 drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
 drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
index 94b6c5cab6f4..b4a79f88ccf4 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
@@ -99,6 +99,7 @@ struct a6xx_gmu {
 	struct completion pd_gate;
 
 	struct qmp *qmp;
+	struct a6xx_hfi_msg_bw_table *bw_table;
 };
 
 static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
index cdb3f6e74d3e..55e51c81be1f 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
@@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
 
 static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
 {
-	struct a6xx_hfi_msg_bw_table msg = { 0 };
+	struct a6xx_hfi_msg_bw_table *msg;
 	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
 	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
 
+	if (gmu->bw_table)
+		goto send;
+
+	msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
 	if (adreno_is_a618(adreno_gpu))
-		a618_build_bw_table(&msg);
+		a618_build_bw_table(msg);
 	else if (adreno_is_a619(adreno_gpu))
-		a619_build_bw_table(&msg);
+		a619_build_bw_table(msg);
 	else if (adreno_is_a640_family(adreno_gpu))
-		a640_build_bw_table(&msg);
+		a640_build_bw_table(msg);
 	else if (adreno_is_a650(adreno_gpu))
-		a650_build_bw_table(&msg);
+		a650_build_bw_table(msg);
 	else if (adreno_is_7c3(adreno_gpu))
-		adreno_7c3_build_bw_table(&msg);
+		adreno_7c3_build_bw_table(msg);
 	else if (adreno_is_a660(adreno_gpu))
-		a660_build_bw_table(&msg);
+		a660_build_bw_table(msg);
 	else if (adreno_is_a690(adreno_gpu))
-		a690_build_bw_table(&msg);
+		a690_build_bw_table(msg);
 	else if (adreno_is_a730(adreno_gpu))
-		a730_build_bw_table(&msg);
+		a730_build_bw_table(msg);
 	else if (adreno_is_a740_family(adreno_gpu))
-		a740_build_bw_table(&msg);
+		a740_build_bw_table(msg);
 	else
-		a6xx_build_bw_table(&msg);
+		a6xx_build_bw_table(msg);
+
+	gmu->bw_table = msg;
 
-	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, &msg, sizeof(msg),
+send:
+	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, gmu->bw_table, sizeof(*(gmu->bw_table)),
 		NULL, 0);
 }
 

---
base-commit: 74c374648ed08efb2ef339656f2764c28c046956
change-id: 20241024-stack-size-fix-28af7abd3fab

Best regards,
-- 
Akhil P Oommen <quic_akhilpo@quicinc.com>


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-27 18:05 [PATCH] drm/msm/a6xx: Fix excessive stack usage Akhil P Oommen
@ 2024-10-27 18:43 ` Arnd Bergmann
  2024-10-28  9:52   ` Akhil P Oommen
  2024-10-28  8:26 ` Dmitry Baryshkov
  2024-10-28 12:24 ` Dmitry Baryshkov
  2 siblings, 1 reply; 13+ messages in thread
From: Arnd Bergmann @ 2024-10-27 18:43 UTC (permalink / raw)
  To: Akhil P Oommen, Rob Clark, Sean Paul, Konrad Dybcio,
	Abhinav Kumar, Dmitry Baryshkov, Marijn Suijten, Dave Airlie,
	Simona Vetter, Nathan Chancellor, Nick Desaulniers, Bill Wendling,
	Justin Stitt
  Cc: linux-arm-msm, dri-devel, freedreno, linux-kernel, llvm

On Sun, Oct 27, 2024, at 18:05, Akhil P Oommen wrote:
> Clang-19 and above sometimes end up with multiple copies of the large
> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> fill the structure, but these create another copy of the structure on
> the stack which gets copied to the first.
>
> If the functions get inlined, that busts the warning limit:
>
> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size 
> (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' 
> [-Werror,-Wframe-larger-than]
>
> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
> the stack. Also, use this opportunity to skip re-initializing this table
> to optimize gpu wake up latency.
>
> Cc: Arnd Bergmann <arnd@kernel.org>

Please change this to "Reported-by:"

The patch looks correct to me, just one idea for improvement.

> b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> index 94b6c5cab6f4..b4a79f88ccf4 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>  	struct completion pd_gate;
> 
>  	struct qmp *qmp;
> +	struct a6xx_hfi_msg_bw_table *bw_table;
>  };

I think the bw_table is better just embedded
in here rather than referenced as a pointer:

> +	if (gmu->bw_table)
> +		goto send;
> +
> +	msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
> +	if (!msg)
> +		return -ENOMEM;

It looked like it's always allocated here when the device
is up, so you can avoid the extra overhead for keeping
track of the allocation.

      Arnd

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-27 18:05 [PATCH] drm/msm/a6xx: Fix excessive stack usage Akhil P Oommen
  2024-10-27 18:43 ` Arnd Bergmann
@ 2024-10-28  8:26 ` Dmitry Baryshkov
  2024-10-28 10:07   ` Akhil P Oommen
  2024-10-28 12:24 ` Dmitry Baryshkov
  2 siblings, 1 reply; 13+ messages in thread
From: Dmitry Baryshkov @ 2024-10-28  8:26 UTC (permalink / raw)
  To: Akhil P Oommen
  Cc: Rob Clark, Sean Paul, Konrad Dybcio, Abhinav Kumar,
	Marijn Suijten, David Airlie, Simona Vetter, Nathan Chancellor,
	Nick Desaulniers, Bill Wendling, Justin Stitt, linux-arm-msm,
	dri-devel, freedreno, linux-kernel, llvm, Arnd Bergmann

On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
> Clang-19 and above sometimes end up with multiple copies of the large
> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> fill the structure, but these create another copy of the structure on
> the stack which gets copied to the first.
> 
> If the functions get inlined, that busts the warning limit:
> 
> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
> 
> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
> the stack. Also, use this opportunity to skip re-initializing this table
> to optimize gpu wake up latency.
> 
> Cc: Arnd Bergmann <arnd@kernel.org>
> 
> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
>  2 files changed, 23 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> index 94b6c5cab6f4..b4a79f88ccf4 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>  	struct completion pd_gate;
>  
>  	struct qmp *qmp;
> +	struct a6xx_hfi_msg_bw_table *bw_table;
>  };
>  
>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> index cdb3f6e74d3e..55e51c81be1f 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  
>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
>  {
> -	struct a6xx_hfi_msg_bw_table msg = { 0 };
> +	struct a6xx_hfi_msg_bw_table *msg;
>  	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>  	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>  
> +	if (gmu->bw_table)
> +		goto send;
> +
> +	msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);

Is it necessary after being sent? Isn't it better to just kzalloc() it
and then kfree() it at the end of the function?

> +	if (!msg)
> +		return -ENOMEM;
> +
>  	if (adreno_is_a618(adreno_gpu))
> -		a618_build_bw_table(&msg);
> +		a618_build_bw_table(msg);
>  	else if (adreno_is_a619(adreno_gpu))
> -		a619_build_bw_table(&msg);
> +		a619_build_bw_table(msg);
>  	else if (adreno_is_a640_family(adreno_gpu))
> -		a640_build_bw_table(&msg);
> +		a640_build_bw_table(msg);
>  	else if (adreno_is_a650(adreno_gpu))
> -		a650_build_bw_table(&msg);
> +		a650_build_bw_table(msg);
>  	else if (adreno_is_7c3(adreno_gpu))
> -		adreno_7c3_build_bw_table(&msg);
> +		adreno_7c3_build_bw_table(msg);
>  	else if (adreno_is_a660(adreno_gpu))
> -		a660_build_bw_table(&msg);
> +		a660_build_bw_table(msg);
>  	else if (adreno_is_a690(adreno_gpu))
> -		a690_build_bw_table(&msg);
> +		a690_build_bw_table(msg);
>  	else if (adreno_is_a730(adreno_gpu))
> -		a730_build_bw_table(&msg);
> +		a730_build_bw_table(msg);
>  	else if (adreno_is_a740_family(adreno_gpu))
> -		a740_build_bw_table(&msg);
> +		a740_build_bw_table(msg);
>  	else
> -		a6xx_build_bw_table(&msg);
> +		a6xx_build_bw_table(msg);
> +
> +	gmu->bw_table = msg;
>  
> -	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, &msg, sizeof(msg),
> +send:
> +	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, gmu->bw_table, sizeof(*(gmu->bw_table)),
>  		NULL, 0);
>  }
>  
> 
> ---
> base-commit: 74c374648ed08efb2ef339656f2764c28c046956
> change-id: 20241024-stack-size-fix-28af7abd3fab
> 
> Best regards,
> -- 
> Akhil P Oommen <quic_akhilpo@quicinc.com>
> 

-- 
With best wishes
Dmitry

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-27 18:43 ` Arnd Bergmann
@ 2024-10-28  9:52   ` Akhil P Oommen
  2024-10-28 10:39     ` Konrad Dybcio
  0 siblings, 1 reply; 13+ messages in thread
From: Akhil P Oommen @ 2024-10-28  9:52 UTC (permalink / raw)
  To: Arnd Bergmann, Rob Clark, Sean Paul, Konrad Dybcio, Abhinav Kumar,
	Dmitry Baryshkov, Marijn Suijten, Dave Airlie, Simona Vetter,
	Nathan Chancellor, Nick Desaulniers, Bill Wendling, Justin Stitt
  Cc: linux-arm-msm, dri-devel, freedreno, linux-kernel, llvm

On 10/28/2024 12:13 AM, Arnd Bergmann wrote:
> On Sun, Oct 27, 2024, at 18:05, Akhil P Oommen wrote:
>> Clang-19 and above sometimes end up with multiple copies of the large
>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>> fill the structure, but these create another copy of the structure on
>> the stack which gets copied to the first.
>>
>> If the functions get inlined, that busts the warning limit:
>>
>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size 
>> (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' 
>> [-Werror,-Wframe-larger-than]
>>
>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
>> the stack. Also, use this opportunity to skip re-initializing this table
>> to optimize gpu wake up latency.
>>
>> Cc: Arnd Bergmann <arnd@kernel.org>
> 
> Please change this to "Reported-by:"

Sure.

> 
> The patch looks correct to me, just one idea for improvement.
> 
>> b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>> index 94b6c5cab6f4..b4a79f88ccf4 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>>  	struct completion pd_gate;
>>
>>  	struct qmp *qmp;
>> +	struct a6xx_hfi_msg_bw_table *bw_table;
>>  };
> 
> I think the bw_table is better just embedded
> in here rather than referenced as a pointer:
> 
There are some low tier chipsets with relatively lower RAM size that
doesn't require this table. So, dynamically allocating this here helps
to save 640 bytes (minus the overhead of tracking).

-Akhil

>> +	if (gmu->bw_table)
>> +		goto send;
>> +
>> +	msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
>> +	if (!msg)
>> +		return -ENOMEM;
> 
> It looked like it's always allocated here when the device
> is up, so you can avoid the extra overhead for keeping
> track of the allocation.
> 
>       Arnd


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28  8:26 ` Dmitry Baryshkov
@ 2024-10-28 10:07   ` Akhil P Oommen
  2024-10-28 10:27     ` Dmitry Baryshkov
  0 siblings, 1 reply; 13+ messages in thread
From: Akhil P Oommen @ 2024-10-28 10:07 UTC (permalink / raw)
  To: Dmitry Baryshkov
  Cc: Rob Clark, Sean Paul, Konrad Dybcio, Abhinav Kumar,
	Marijn Suijten, David Airlie, Simona Vetter, Nathan Chancellor,
	Nick Desaulniers, Bill Wendling, Justin Stitt, linux-arm-msm,
	dri-devel, freedreno, linux-kernel, llvm, Arnd Bergmann

On 10/28/2024 1:56 PM, Dmitry Baryshkov wrote:
> On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
>> Clang-19 and above sometimes end up with multiple copies of the large
>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>> fill the structure, but these create another copy of the structure on
>> the stack which gets copied to the first.
>>
>> If the functions get inlined, that busts the warning limit:
>>
>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
>>
>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
>> the stack. Also, use this opportunity to skip re-initializing this table
>> to optimize gpu wake up latency.
>>
>> Cc: Arnd Bergmann <arnd@kernel.org>
>>
>> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
>> ---
>>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
>>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
>>  2 files changed, 23 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>> index 94b6c5cab6f4..b4a79f88ccf4 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>>  	struct completion pd_gate;
>>  
>>  	struct qmp *qmp;
>> +	struct a6xx_hfi_msg_bw_table *bw_table;
>>  };
>>  
>>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>> index cdb3f6e74d3e..55e51c81be1f 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>>  
>>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
>>  {
>> -	struct a6xx_hfi_msg_bw_table msg = { 0 };
>> +	struct a6xx_hfi_msg_bw_table *msg;
>>  	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>>  	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>>  
>> +	if (gmu->bw_table)
>> +		goto send;
>> +
>> +	msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
> 
> Is it necessary after being sent? Isn't it better to just kzalloc() it
> and then kfree() it at the end of the function?

Keeping it around will help to cut down unnecessary work during
subsequent gpu wake ups.

-Akhil.

> 
>> +	if (!msg)
>> +		return -ENOMEM;
>> +
>>  	if (adreno_is_a618(adreno_gpu))
>> -		a618_build_bw_table(&msg);
>> +		a618_build_bw_table(msg);
>>  	else if (adreno_is_a619(adreno_gpu))
>> -		a619_build_bw_table(&msg);
>> +		a619_build_bw_table(msg);
>>  	else if (adreno_is_a640_family(adreno_gpu))
>> -		a640_build_bw_table(&msg);
>> +		a640_build_bw_table(msg);
>>  	else if (adreno_is_a650(adreno_gpu))
>> -		a650_build_bw_table(&msg);
>> +		a650_build_bw_table(msg);
>>  	else if (adreno_is_7c3(adreno_gpu))
>> -		adreno_7c3_build_bw_table(&msg);
>> +		adreno_7c3_build_bw_table(msg);
>>  	else if (adreno_is_a660(adreno_gpu))
>> -		a660_build_bw_table(&msg);
>> +		a660_build_bw_table(msg);
>>  	else if (adreno_is_a690(adreno_gpu))
>> -		a690_build_bw_table(&msg);
>> +		a690_build_bw_table(msg);
>>  	else if (adreno_is_a730(adreno_gpu))
>> -		a730_build_bw_table(&msg);
>> +		a730_build_bw_table(msg);
>>  	else if (adreno_is_a740_family(adreno_gpu))
>> -		a740_build_bw_table(&msg);
>> +		a740_build_bw_table(msg);
>>  	else
>> -		a6xx_build_bw_table(&msg);
>> +		a6xx_build_bw_table(msg);
>> +
>> +	gmu->bw_table = msg;
>>  
>> -	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, &msg, sizeof(msg),
>> +send:
>> +	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, gmu->bw_table, sizeof(*(gmu->bw_table)),
>>  		NULL, 0);
>>  }
>>  
>>
>> ---
>> base-commit: 74c374648ed08efb2ef339656f2764c28c046956
>> change-id: 20241024-stack-size-fix-28af7abd3fab
>>
>> Best regards,
>> -- 
>> Akhil P Oommen <quic_akhilpo@quicinc.com>
>>
> 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28 10:07   ` Akhil P Oommen
@ 2024-10-28 10:27     ` Dmitry Baryshkov
  2024-10-28 10:36       ` Konrad Dybcio
  0 siblings, 1 reply; 13+ messages in thread
From: Dmitry Baryshkov @ 2024-10-28 10:27 UTC (permalink / raw)
  To: Akhil P Oommen
  Cc: Rob Clark, Sean Paul, Konrad Dybcio, Abhinav Kumar,
	Marijn Suijten, David Airlie, Simona Vetter, Nathan Chancellor,
	Nick Desaulniers, Bill Wendling, Justin Stitt, linux-arm-msm,
	dri-devel, freedreno, linux-kernel, llvm, Arnd Bergmann

On Mon, 28 Oct 2024 at 12:08, Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
>
> On 10/28/2024 1:56 PM, Dmitry Baryshkov wrote:
> > On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
> >> Clang-19 and above sometimes end up with multiple copies of the large
> >> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> >> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> >> fill the structure, but these create another copy of the structure on
> >> the stack which gets copied to the first.
> >>
> >> If the functions get inlined, that busts the warning limit:
> >>
> >> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
> >>
> >> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
> >> the stack. Also, use this opportunity to skip re-initializing this table
> >> to optimize gpu wake up latency.
> >>
> >> Cc: Arnd Bergmann <arnd@kernel.org>
> >>
> >> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> >> ---
> >>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
> >>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
> >>  2 files changed, 23 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >> index 94b6c5cab6f4..b4a79f88ccf4 100644
> >> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >> @@ -99,6 +99,7 @@ struct a6xx_gmu {
> >>      struct completion pd_gate;
> >>
> >>      struct qmp *qmp;
> >> +    struct a6xx_hfi_msg_bw_table *bw_table;
> >>  };
> >>
> >>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
> >> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >> index cdb3f6e74d3e..55e51c81be1f 100644
> >> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> >>
> >>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
> >>  {
> >> -    struct a6xx_hfi_msg_bw_table msg = { 0 };
> >> +    struct a6xx_hfi_msg_bw_table *msg;
> >>      struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
> >>      struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
> >>
> >> +    if (gmu->bw_table)
> >> +            goto send;
> >> +
> >> +    msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
> >
> > Is it necessary after being sent? Isn't it better to just kzalloc() it
> > and then kfree() it at the end of the function?
>
> Keeping it around will help to cut down unnecessary work during
> subsequent gpu wake ups.

Then, I'd say, it is better to make it a part of the a6xx_gpu struct.


-- 
With best wishes
Dmitry

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28 10:27     ` Dmitry Baryshkov
@ 2024-10-28 10:36       ` Konrad Dybcio
  2024-10-28 10:52         ` Dmitry Baryshkov
  0 siblings, 1 reply; 13+ messages in thread
From: Konrad Dybcio @ 2024-10-28 10:36 UTC (permalink / raw)
  To: Dmitry Baryshkov, Akhil P Oommen
  Cc: Rob Clark, Sean Paul, Konrad Dybcio, Abhinav Kumar,
	Marijn Suijten, David Airlie, Simona Vetter, Nathan Chancellor,
	Nick Desaulniers, Bill Wendling, Justin Stitt, linux-arm-msm,
	dri-devel, freedreno, linux-kernel, llvm, Arnd Bergmann

On 28.10.2024 11:27 AM, Dmitry Baryshkov wrote:
> On Mon, 28 Oct 2024 at 12:08, Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
>>
>> On 10/28/2024 1:56 PM, Dmitry Baryshkov wrote:
>>> On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
>>>> Clang-19 and above sometimes end up with multiple copies of the large
>>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>>>> fill the structure, but these create another copy of the structure on
>>>> the stack which gets copied to the first.
>>>>
>>>> If the functions get inlined, that busts the warning limit:
>>>>
>>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
>>>>
>>>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
>>>> the stack. Also, use this opportunity to skip re-initializing this table
>>>> to optimize gpu wake up latency.
>>>>
>>>> Cc: Arnd Bergmann <arnd@kernel.org>
>>>>
>>>> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
>>>> ---
>>>>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
>>>>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
>>>>  2 files changed, 23 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>>> index 94b6c5cab6f4..b4a79f88ccf4 100644
>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>>>>      struct completion pd_gate;
>>>>
>>>>      struct qmp *qmp;
>>>> +    struct a6xx_hfi_msg_bw_table *bw_table;
>>>>  };
>>>>
>>>>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>>>> index cdb3f6e74d3e..55e51c81be1f 100644
>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>>>> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>>>>
>>>>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
>>>>  {
>>>> -    struct a6xx_hfi_msg_bw_table msg = { 0 };
>>>> +    struct a6xx_hfi_msg_bw_table *msg;
>>>>      struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>>>>      struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>>>>
>>>> +    if (gmu->bw_table)
>>>> +            goto send;
>>>> +
>>>> +    msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
>>>
>>> Is it necessary after being sent? Isn't it better to just kzalloc() it
>>> and then kfree() it at the end of the function?
>>
>> Keeping it around will help to cut down unnecessary work during
>> subsequent gpu wake ups.
> 
> Then, I'd say, it is better to make it a part of the a6xx_gpu struct.

I think a6xx_gmu makes more logical sense here.

FWIW, the driver allocates both _gmu and _gpu for all GPUs regardless

Konrad

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28  9:52   ` Akhil P Oommen
@ 2024-10-28 10:39     ` Konrad Dybcio
  2024-10-28 10:50       ` Dmitry Baryshkov
  0 siblings, 1 reply; 13+ messages in thread
From: Konrad Dybcio @ 2024-10-28 10:39 UTC (permalink / raw)
  To: Akhil P Oommen, Arnd Bergmann, Rob Clark, Sean Paul,
	Konrad Dybcio, Abhinav Kumar, Dmitry Baryshkov, Marijn Suijten,
	Dave Airlie, Simona Vetter, Nathan Chancellor, Nick Desaulniers,
	Bill Wendling, Justin Stitt
  Cc: linux-arm-msm, dri-devel, freedreno, linux-kernel, llvm

On 28.10.2024 10:52 AM, Akhil P Oommen wrote:
> On 10/28/2024 12:13 AM, Arnd Bergmann wrote:
>> On Sun, Oct 27, 2024, at 18:05, Akhil P Oommen wrote:
>>> Clang-19 and above sometimes end up with multiple copies of the large
>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>>> fill the structure, but these create another copy of the structure on
>>> the stack which gets copied to the first.
>>>
>>> If the functions get inlined, that busts the warning limit:
>>>
>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size 
>>> (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' 
>>> [-Werror,-Wframe-larger-than]
>>>
>>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
>>> the stack. Also, use this opportunity to skip re-initializing this table
>>> to optimize gpu wake up latency.
>>>
>>> Cc: Arnd Bergmann <arnd@kernel.org>
>>
>> Please change this to "Reported-by:"
> 
> Sure.
> 
>>
>> The patch looks correct to me, just one idea for improvement.
>>
>>> b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>> index 94b6c5cab6f4..b4a79f88ccf4 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>>>  	struct completion pd_gate;
>>>
>>>  	struct qmp *qmp;
>>> +	struct a6xx_hfi_msg_bw_table *bw_table;
>>>  };
>>
>> I think the bw_table is better just embedded
>> in here rather than referenced as a pointer:
>>
> There are some low tier chipsets with relatively lower RAM size that
> doesn't require this table. So, dynamically allocating this here helps
> to save 640 bytes (minus the overhead of tracking).

I'd second this, said chipsets often ship with 1-2 GiB of RAM (which
is still a lot in comparison, but you know.. every little bit counts)

Konrad

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28 10:39     ` Konrad Dybcio
@ 2024-10-28 10:50       ` Dmitry Baryshkov
  0 siblings, 0 replies; 13+ messages in thread
From: Dmitry Baryshkov @ 2024-10-28 10:50 UTC (permalink / raw)
  To: Konrad Dybcio
  Cc: Akhil P Oommen, Arnd Bergmann, Rob Clark, Sean Paul,
	Konrad Dybcio, Abhinav Kumar, Marijn Suijten, Dave Airlie,
	Simona Vetter, Nathan Chancellor, Nick Desaulniers, Bill Wendling,
	Justin Stitt, linux-arm-msm, dri-devel, freedreno, linux-kernel,
	llvm

On Mon, Oct 28, 2024 at 11:39:16AM +0100, Konrad Dybcio wrote:
> On 28.10.2024 10:52 AM, Akhil P Oommen wrote:
> > On 10/28/2024 12:13 AM, Arnd Bergmann wrote:
> >> On Sun, Oct 27, 2024, at 18:05, Akhil P Oommen wrote:
> >>> Clang-19 and above sometimes end up with multiple copies of the large
> >>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> >>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> >>> fill the structure, but these create another copy of the structure on
> >>> the stack which gets copied to the first.
> >>>
> >>> If the functions get inlined, that busts the warning limit:
> >>>
> >>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size 
> >>> (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' 
> >>> [-Werror,-Wframe-larger-than]
> >>>
> >>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
> >>> the stack. Also, use this opportunity to skip re-initializing this table
> >>> to optimize gpu wake up latency.
> >>>
> >>> Cc: Arnd Bergmann <arnd@kernel.org>
> >>
> >> Please change this to "Reported-by:"
> > 
> > Sure.
> > 
> >>
> >> The patch looks correct to me, just one idea for improvement.
> >>
> >>> b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>> index 94b6c5cab6f4..b4a79f88ccf4 100644
> >>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
> >>>  	struct completion pd_gate;
> >>>
> >>>  	struct qmp *qmp;
> >>> +	struct a6xx_hfi_msg_bw_table *bw_table;
> >>>  };
> >>
> >> I think the bw_table is better just embedded
> >> in here rather than referenced as a pointer:
> >>
> > There are some low tier chipsets with relatively lower RAM size that
> > doesn't require this table. So, dynamically allocating this here helps
> > to save 640 bytes (minus the overhead of tracking).
> 
> I'd second this, said chipsets often ship with 1-2 GiB of RAM (which
> is still a lot in comparison, but you know.. every little bit counts)

Okay from my side. Yeah, poor Gnome runnning on top of 1 GiB device is
very sad.

-- 
With best wishes
Dmitry

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28 10:36       ` Konrad Dybcio
@ 2024-10-28 10:52         ` Dmitry Baryshkov
  2024-10-28 11:31           ` Konrad Dybcio
  0 siblings, 1 reply; 13+ messages in thread
From: Dmitry Baryshkov @ 2024-10-28 10:52 UTC (permalink / raw)
  To: Konrad Dybcio
  Cc: Akhil P Oommen, Rob Clark, Sean Paul, Konrad Dybcio,
	Abhinav Kumar, Marijn Suijten, David Airlie, Simona Vetter,
	Nathan Chancellor, Nick Desaulniers, Bill Wendling, Justin Stitt,
	linux-arm-msm, dri-devel, freedreno, linux-kernel, llvm,
	Arnd Bergmann

On Mon, Oct 28, 2024 at 11:36:15AM +0100, Konrad Dybcio wrote:
> On 28.10.2024 11:27 AM, Dmitry Baryshkov wrote:
> > On Mon, 28 Oct 2024 at 12:08, Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
> >>
> >> On 10/28/2024 1:56 PM, Dmitry Baryshkov wrote:
> >>> On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
> >>>> Clang-19 and above sometimes end up with multiple copies of the large
> >>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> >>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> >>>> fill the structure, but these create another copy of the structure on
> >>>> the stack which gets copied to the first.
> >>>>
> >>>> If the functions get inlined, that busts the warning limit:
> >>>>
> >>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
> >>>>
> >>>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
> >>>> the stack. Also, use this opportunity to skip re-initializing this table
> >>>> to optimize gpu wake up latency.
> >>>>
> >>>> Cc: Arnd Bergmann <arnd@kernel.org>
> >>>>
> >>>> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> >>>> ---
> >>>>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
> >>>>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
> >>>>  2 files changed, 23 insertions(+), 12 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>>> index 94b6c5cab6f4..b4a79f88ccf4 100644
> >>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
> >>>>      struct completion pd_gate;
> >>>>
> >>>>      struct qmp *qmp;
> >>>> +    struct a6xx_hfi_msg_bw_table *bw_table;
> >>>>  };
> >>>>
> >>>>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
> >>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >>>> index cdb3f6e74d3e..55e51c81be1f 100644
> >>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >>>> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> >>>>
> >>>>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
> >>>>  {
> >>>> -    struct a6xx_hfi_msg_bw_table msg = { 0 };
> >>>> +    struct a6xx_hfi_msg_bw_table *msg;
> >>>>      struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
> >>>>      struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
> >>>>
> >>>> +    if (gmu->bw_table)
> >>>> +            goto send;
> >>>> +
> >>>> +    msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
> >>>
> >>> Is it necessary after being sent? Isn't it better to just kzalloc() it
> >>> and then kfree() it at the end of the function?
> >>
> >> Keeping it around will help to cut down unnecessary work during
> >> subsequent gpu wake ups.
> > 
> > Then, I'd say, it is better to make it a part of the a6xx_gpu struct.
> 
> I think a6xx_gmu makes more logical sense here.
> 
> FWIW, the driver allocates both _gmu and _gpu for all GPUs regardless

Hmm, are we expected to handle / perform BW requests in case of GMU-less
devices?

-- 
With best wishes
Dmitry

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28 10:52         ` Dmitry Baryshkov
@ 2024-10-28 11:31           ` Konrad Dybcio
  2024-10-28 12:22             ` Dmitry Baryshkov
  0 siblings, 1 reply; 13+ messages in thread
From: Konrad Dybcio @ 2024-10-28 11:31 UTC (permalink / raw)
  To: Dmitry Baryshkov, Konrad Dybcio
  Cc: Akhil P Oommen, Rob Clark, Sean Paul, Konrad Dybcio,
	Abhinav Kumar, Marijn Suijten, David Airlie, Simona Vetter,
	Nathan Chancellor, Nick Desaulniers, Bill Wendling, Justin Stitt,
	linux-arm-msm, dri-devel, freedreno, linux-kernel, llvm,
	Arnd Bergmann

On 28.10.2024 11:52 AM, Dmitry Baryshkov wrote:
> On Mon, Oct 28, 2024 at 11:36:15AM +0100, Konrad Dybcio wrote:
>> On 28.10.2024 11:27 AM, Dmitry Baryshkov wrote:
>>> On Mon, 28 Oct 2024 at 12:08, Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
>>>>
>>>> On 10/28/2024 1:56 PM, Dmitry Baryshkov wrote:
>>>>> On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
>>>>>> Clang-19 and above sometimes end up with multiple copies of the large
>>>>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
>>>>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
>>>>>> fill the structure, but these create another copy of the structure on
>>>>>> the stack which gets copied to the first.
>>>>>>
>>>>>> If the functions get inlined, that busts the warning limit:
>>>>>>
>>>>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
>>>>>>
>>>>>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
>>>>>> the stack. Also, use this opportunity to skip re-initializing this table
>>>>>> to optimize gpu wake up latency.
>>>>>>
>>>>>> Cc: Arnd Bergmann <arnd@kernel.org>
>>>>>>
>>>>>> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
>>>>>> ---
>>>>>>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
>>>>>>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
>>>>>>  2 files changed, 23 insertions(+), 12 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>>>>> index 94b6c5cab6f4..b4a79f88ccf4 100644
>>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>>>>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>>>>>>      struct completion pd_gate;
>>>>>>
>>>>>>      struct qmp *qmp;
>>>>>> +    struct a6xx_hfi_msg_bw_table *bw_table;
>>>>>>  };
>>>>>>
>>>>>>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
>>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>>>>>> index cdb3f6e74d3e..55e51c81be1f 100644
>>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>>>>>> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>>>>>>
>>>>>>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
>>>>>>  {
>>>>>> -    struct a6xx_hfi_msg_bw_table msg = { 0 };
>>>>>> +    struct a6xx_hfi_msg_bw_table *msg;
>>>>>>      struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>>>>>>      struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>>>>>>
>>>>>> +    if (gmu->bw_table)
>>>>>> +            goto send;
>>>>>> +
>>>>>> +    msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
>>>>>
>>>>> Is it necessary after being sent? Isn't it better to just kzalloc() it
>>>>> and then kfree() it at the end of the function?
>>>>
>>>> Keeping it around will help to cut down unnecessary work during
>>>> subsequent gpu wake ups.
>>>
>>> Then, I'd say, it is better to make it a part of the a6xx_gpu struct.
>>
>> I think a6xx_gmu makes more logical sense here.
>>
>> FWIW, the driver allocates both _gmu and _gpu for all GPUs regardless
> 
> Hmm, are we expected to handle / perform BW requests in case of GMU-less
> devices?

opp-table does that for us

In case of no gmu ("gmu wrapper"), Linux is the only entity that controls
things

Konrad

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-28 11:31           ` Konrad Dybcio
@ 2024-10-28 12:22             ` Dmitry Baryshkov
  0 siblings, 0 replies; 13+ messages in thread
From: Dmitry Baryshkov @ 2024-10-28 12:22 UTC (permalink / raw)
  To: Konrad Dybcio
  Cc: Akhil P Oommen, Rob Clark, Sean Paul, Konrad Dybcio,
	Abhinav Kumar, Marijn Suijten, David Airlie, Simona Vetter,
	Nathan Chancellor, Nick Desaulniers, Bill Wendling, Justin Stitt,
	linux-arm-msm, dri-devel, freedreno, linux-kernel, llvm,
	Arnd Bergmann

On Mon, Oct 28, 2024 at 12:31:50PM +0100, Konrad Dybcio wrote:
> On 28.10.2024 11:52 AM, Dmitry Baryshkov wrote:
> > On Mon, Oct 28, 2024 at 11:36:15AM +0100, Konrad Dybcio wrote:
> >> On 28.10.2024 11:27 AM, Dmitry Baryshkov wrote:
> >>> On Mon, 28 Oct 2024 at 12:08, Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
> >>>>
> >>>> On 10/28/2024 1:56 PM, Dmitry Baryshkov wrote:
> >>>>> On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
> >>>>>> Clang-19 and above sometimes end up with multiple copies of the large
> >>>>>> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> >>>>>> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> >>>>>> fill the structure, but these create another copy of the structure on
> >>>>>> the stack which gets copied to the first.
> >>>>>>
> >>>>>> If the functions get inlined, that busts the warning limit:
> >>>>>>
> >>>>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
> >>>>>>
> >>>>>> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
> >>>>>> the stack. Also, use this opportunity to skip re-initializing this table
> >>>>>> to optimize gpu wake up latency.
> >>>>>>
> >>>>>> Cc: Arnd Bergmann <arnd@kernel.org>
> >>>>>>
> >>>>>> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> >>>>>> ---
> >>>>>>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
> >>>>>>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
> >>>>>>  2 files changed, 23 insertions(+), 12 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>>>>> index 94b6c5cab6f4..b4a79f88ccf4 100644
> >>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> >>>>>> @@ -99,6 +99,7 @@ struct a6xx_gmu {
> >>>>>>      struct completion pd_gate;
> >>>>>>
> >>>>>>      struct qmp *qmp;
> >>>>>> +    struct a6xx_hfi_msg_bw_table *bw_table;
> >>>>>>  };
> >>>>>>
> >>>>>>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
> >>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >>>>>> index cdb3f6e74d3e..55e51c81be1f 100644
> >>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> >>>>>> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
> >>>>>>
> >>>>>>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
> >>>>>>  {
> >>>>>> -    struct a6xx_hfi_msg_bw_table msg = { 0 };
> >>>>>> +    struct a6xx_hfi_msg_bw_table *msg;
> >>>>>>      struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
> >>>>>>      struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
> >>>>>>
> >>>>>> +    if (gmu->bw_table)
> >>>>>> +            goto send;
> >>>>>> +
> >>>>>> +    msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
> >>>>>
> >>>>> Is it necessary after being sent? Isn't it better to just kzalloc() it
> >>>>> and then kfree() it at the end of the function?
> >>>>
> >>>> Keeping it around will help to cut down unnecessary work during
> >>>> subsequent gpu wake ups.
> >>>
> >>> Then, I'd say, it is better to make it a part of the a6xx_gpu struct.
> >>
> >> I think a6xx_gmu makes more logical sense here.
> >>
> >> FWIW, the driver allocates both _gmu and _gpu for all GPUs regardless
> > 
> > Hmm, are we expected to handle / perform BW requests in case of GMU-less
> > devices?
> 
> opp-table does that for us
> 
> In case of no gmu ("gmu wrapper"), Linux is the only entity that controls
> things

Ack

-- 
With best wishes
Dmitry

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] drm/msm/a6xx: Fix excessive stack usage
  2024-10-27 18:05 [PATCH] drm/msm/a6xx: Fix excessive stack usage Akhil P Oommen
  2024-10-27 18:43 ` Arnd Bergmann
  2024-10-28  8:26 ` Dmitry Baryshkov
@ 2024-10-28 12:24 ` Dmitry Baryshkov
  2 siblings, 0 replies; 13+ messages in thread
From: Dmitry Baryshkov @ 2024-10-28 12:24 UTC (permalink / raw)
  To: Akhil P Oommen
  Cc: Rob Clark, Sean Paul, Konrad Dybcio, Abhinav Kumar,
	Marijn Suijten, David Airlie, Simona Vetter, Nathan Chancellor,
	Nick Desaulniers, Bill Wendling, Justin Stitt, linux-arm-msm,
	dri-devel, freedreno, linux-kernel, llvm, Arnd Bergmann

On Sun, Oct 27, 2024 at 11:35:47PM +0530, Akhil P Oommen wrote:
> Clang-19 and above sometimes end up with multiple copies of the large
> a6xx_hfi_msg_bw_table structure on the stack. The problem is that
> a6xx_hfi_send_bw_table() calls a number of device specific functions to
> fill the structure, but these create another copy of the structure on
> the stack which gets copied to the first.
> 
> If the functions get inlined, that busts the warning limit:
> 
> drivers/gpu/drm/msm/adreno/a6xx_hfi.c:631:12: error: stack frame size (1032) exceeds limit (1024) in 'a6xx_hfi_send_bw_table' [-Werror,-Wframe-larger-than]
> 
> Fix this by kmalloc-ating struct a6xx_hfi_msg_bw_table instead of using
> the stack. Also, use this opportunity to skip re-initializing this table
> to optimize gpu wake up latency.
> 
> Cc: Arnd Bergmann <arnd@kernel.org>
> 

Please no empty lines between tags.

> Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>

After all the discussions:

Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>

> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  1 +
>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 34 ++++++++++++++++++++++------------
>  2 files changed, 23 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> index 94b6c5cab6f4..b4a79f88ccf4 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> @@ -99,6 +99,7 @@ struct a6xx_gmu {
>  	struct completion pd_gate;
>  
>  	struct qmp *qmp;
> +	struct a6xx_hfi_msg_bw_table *bw_table;
>  };
>  
>  static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset)
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> index cdb3f6e74d3e..55e51c81be1f 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> @@ -630,32 +630,42 @@ static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg)
>  
>  static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu)
>  {
> -	struct a6xx_hfi_msg_bw_table msg = { 0 };
> +	struct a6xx_hfi_msg_bw_table *msg;
>  	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>  	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>  
> +	if (gmu->bw_table)
> +		goto send;
> +
> +	msg = devm_kzalloc(gmu->dev, sizeof(*msg), GFP_KERNEL);
> +	if (!msg)
> +		return -ENOMEM;
> +
>  	if (adreno_is_a618(adreno_gpu))
> -		a618_build_bw_table(&msg);
> +		a618_build_bw_table(msg);
>  	else if (adreno_is_a619(adreno_gpu))
> -		a619_build_bw_table(&msg);
> +		a619_build_bw_table(msg);
>  	else if (adreno_is_a640_family(adreno_gpu))
> -		a640_build_bw_table(&msg);
> +		a640_build_bw_table(msg);
>  	else if (adreno_is_a650(adreno_gpu))
> -		a650_build_bw_table(&msg);
> +		a650_build_bw_table(msg);
>  	else if (adreno_is_7c3(adreno_gpu))
> -		adreno_7c3_build_bw_table(&msg);
> +		adreno_7c3_build_bw_table(msg);
>  	else if (adreno_is_a660(adreno_gpu))
> -		a660_build_bw_table(&msg);
> +		a660_build_bw_table(msg);
>  	else if (adreno_is_a690(adreno_gpu))
> -		a690_build_bw_table(&msg);
> +		a690_build_bw_table(msg);
>  	else if (adreno_is_a730(adreno_gpu))
> -		a730_build_bw_table(&msg);
> +		a730_build_bw_table(msg);
>  	else if (adreno_is_a740_family(adreno_gpu))
> -		a740_build_bw_table(&msg);
> +		a740_build_bw_table(msg);
>  	else
> -		a6xx_build_bw_table(&msg);
> +		a6xx_build_bw_table(msg);

Note for the future improvement: this begs to be migrated to the catalog
data, adding device-specific callback instead of this if/else series.


> +
> +	gmu->bw_table = msg;
>  
> -	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, &msg, sizeof(msg),
> +send:
> +	return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, gmu->bw_table, sizeof(*(gmu->bw_table)),
>  		NULL, 0);
>  }
>  
> 
> ---
> base-commit: 74c374648ed08efb2ef339656f2764c28c046956
> change-id: 20241024-stack-size-fix-28af7abd3fab
> 
> Best regards,
> -- 
> Akhil P Oommen <quic_akhilpo@quicinc.com>
> 

-- 
With best wishes
Dmitry

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2024-10-28 12:24 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-27 18:05 [PATCH] drm/msm/a6xx: Fix excessive stack usage Akhil P Oommen
2024-10-27 18:43 ` Arnd Bergmann
2024-10-28  9:52   ` Akhil P Oommen
2024-10-28 10:39     ` Konrad Dybcio
2024-10-28 10:50       ` Dmitry Baryshkov
2024-10-28  8:26 ` Dmitry Baryshkov
2024-10-28 10:07   ` Akhil P Oommen
2024-10-28 10:27     ` Dmitry Baryshkov
2024-10-28 10:36       ` Konrad Dybcio
2024-10-28 10:52         ` Dmitry Baryshkov
2024-10-28 11:31           ` Konrad Dybcio
2024-10-28 12:22             ` Dmitry Baryshkov
2024-10-28 12:24 ` Dmitry Baryshkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox