Re: [PATCH] drm/msm: Optimize GPU crashstate capture read path

From: Sharat Masetty <smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
To: freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
	linux-arm-msm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: Re: [PATCH] drm/msm: Optimize GPU crashstate capture read path
Date: Fri, 2 Nov 2018 15:12:51 +0530	[thread overview]
Message-ID: <de4ad9fb-4a76-e1ea-1c67-49fa1b8d7924@codeaurora.org> (raw)
In-Reply-To: <20181101150449.GA20483-9PYrDHPZ2Orvke4nUoYGnHL1okKdlPRT@public.gmane.org>

Thanks for the comments Jordan -

On 11/1/2018 8:34 PM, Jordan Crouse wrote:
> On Thu, Nov 01, 2018 at 02:05:41PM +0530, Sharat Masetty wrote:
>> When the userspace tries to read the crashstate dump, the read side
>> implementation in the driver currently ascii85 encodes all the binary
>> buffers and it does this each time the read system call is called.
>> A userspace tool like cat typically does a page by page read and the
>> number of read calls depends on the size of the data captured by the
>> driver. This is certainly not desirable and does not scale well with
>> large captures.
>>
>> This patch starts off with moving the encoding part to the capture time,
>> that is when the GPU tries to recover after a hang. Now we only encode
>> once per buffer object and not per page read. With this there is an
>> immediate >10X speed improvement in crashstate save time.
>>
>> The flip side however is that the GPU recovery time increases and this
>> negatively impacts the user experience, so this is handled by moving the
>> encoding part to a worker thread and only register with the dev_coredump
>> once the encoding of the buffers is complete.
> 
> Does your tree have https://patchwork.freedesktop.org/patch/257181/ ? That will
> help with a lot of the problems you have described.
The issue is that even with small files of ~5MB or so, the
capture time is huge and sometimes does not survive the dev_coredump
timeout of 5 minutes. With this change, however, I am able to dump large
files of 60MB or so in a reasonable amount of time (~50 seconds).

As for the patch that you shared, I am thinking that we should drop it
and instead go with this or whatever we agree on. The patch loses data
such as IB2s, which might be important when debugging hang issues.
> 
>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>> ---
>>   drivers/gpu/drm/msm/adreno/adreno_gpu.c | 35 +++----------
>>   drivers/gpu/drm/msm/msm_gpu.c           | 93 +++++++++++++++++++++++++++++++--
>>   drivers/gpu/drm/msm/msm_gpu.h           |  1 +
>>   3 files changed, 98 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> index 141062f..7441571 100644
>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> @@ -406,7 +406,7 @@ int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state)
>>   				size = j + 1;
>>   
>>   		if (size) {
>> -			state->ring[i].data = kmalloc(size << 2, GFP_KERNEL);
>> +			state->ring[i].data = kvmalloc(size << 2, GFP_KERNEL);
>>   			if (state->ring[i].data) {
>>   				memcpy(state->ring[i].data, gpu->rb[i]->start, size << 2);
>>   				state->ring[i].data_size = size << 2;
> 
> This is a good fix, but unrelated to this change as detailed above. Should be
> separate.
> 
>> @@ -445,7 +445,7 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state)
>>   	int i;
>>   
>>   	for (i = 0; i < ARRAY_SIZE(state->ring); i++)
>> -		kfree(state->ring[i].data);
>> +		kvfree(state->ring[i].data);
> 
> As above.
> 
>>   
>>   	for (i = 0; state->bos && i < state->nr_bos; i++)
>>   		kvfree(state->bos[i].data);
>> @@ -475,34 +475,15 @@ int adreno_gpu_state_put(struct msm_gpu_state *state)
>>   
>>   #if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>>   
>> -static void adreno_show_object(struct drm_printer *p, u32 *ptr, int len)
>> +static void adreno_show_object(struct drm_printer *p, void *ptr)
>>   {
>> -	char out[ASCII85_BUFSZ];
>> -	long l, datalen, i;
>> -
>> -	if (!ptr || !len)
>> -		return;
>> -
>> -	/*
>> -	 * Only dump the non-zero part of the buffer - rarely will any data
>> -	 * completely fill the entire allocated size of the buffer
>> -	 */
>> -	for (datalen = 0, i = 0; i < len >> 2; i++) {
>> -		if (ptr[i])
>> -			datalen = (i << 2) + 1;
>> -	}
>> -
>> -	/* Skip printing the object if it is empty */
>> -	if (datalen == 0)
>> +	if (!ptr)
>>   		return;
>>   
>> -	l = ascii85_encode_len(datalen);
>> -
>>   	drm_puts(p, "    data: !!ascii85 |\n");
>>   	drm_puts(p, "     ");
>>   
>> -	for (i = 0; i < l; i++)
>> -		drm_puts(p, ascii85_encode(ptr[i], out));
>> +	drm_puts(p, ptr);
>>   
>>   	drm_puts(p, "\n");
>>   }
>> @@ -534,8 +515,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>>   		drm_printf(p, "    wptr: %d\n", state->ring[i].wptr);
>>   		drm_printf(p, "    size: %d\n", MSM_GPU_RINGBUFFER_SZ);
>>   
>> -		adreno_show_object(p, state->ring[i].data,
>> -			state->ring[i].data_size);
>> +		adreno_show_object(p, state->ring[i].data);
>>   	}
>>   
>>   	if (state->bos) {
>> @@ -546,8 +526,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>>   				state->bos[i].iova);
>>   			drm_printf(p, "    size: %zd\n", state->bos[i].size);
>>   
>> -			adreno_show_object(p, state->bos[i].data,
>> -				state->bos[i].size);
>> +			adreno_show_object(p, state->bos[i].data);
>>   		}
>>   	}
>>   
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index ff95848..d5533f1 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -21,6 +21,7 @@
>>   #include "msm_fence.h"
>>   
>>   #include <generated/utsrelease.h>
>> +#include <linux/ascii85.h>
>>   #include <linux/string_helpers.h>
>>   #include <linux/pm_opp.h>
>>   #include <linux/devfreq.h>
>> @@ -375,9 +376,7 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
>>   	/* Set the active crash state to be dumped on failure */
>>   	gpu->crashstate = state;
>>   
>> -	/* FIXME: Release the crashstate if this errors out? */
>> -	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
>> -		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
>> +	schedule_work(&gpu->crashstate_work);
> 
> I'm super hesitant to ever add anything having to do with scheduling because we
> already abuse it so badly but this seems like it might be okay since it isn't
> high priority and not related to timing.  I would be curious what others think.
> 
>>   }
>>   #else
>>   static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
>> @@ -420,6 +419,93 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>>   
>>   static void retire_submits(struct msm_gpu *gpu);
>>   
>> +static char *msm_gpu_ascii85_encode_buf(u32 *src, size_t len)
>> +{
>> +	void *buf;
>> +	size_t buf_itr = 0, buffer_size;
>> +	char out[ASCII85_BUFSZ];
>> +	long l;
>> +	int i;
>> +
>> +	if (!src || !len)
>> +		return NULL;
> 
> I don't think src will ever legitimately be NULL, so you can remove the check.
> 
>> +	l = ascii85_encode_len(len);
>> +
>> +	/*
>> +	 * Ascii85 outputs either a 5 byte string or a 1 byte string. So we
>> +	 * account for the worst case of 5 bytes per dword plus the 1 for '\0'
>> +	 */
>> +	buffer_size = (l * 5) + 1;
> 
> buffer_size isn't needed - you can do the math inline in the kvmalloc() call.
> 
>> +
>> +	buf = kvmalloc(buffer_size, GFP_KERNEL);
>> +	if (!buf)
>> +		return NULL;
>> +
>> +	for (i = 0; i < l; i++)
>> +		buf_itr += snprintf(buf + buf_itr, buffer_size - buf_itr, "%s",
> 
> This specific algorithm should use scnprintf() here, but on a broader note
> sprintf() and friends are a bad choice if performance is important because they
> jump through more hoops to eventually do what amounts to a memcpy (this is the
> reason why seq_printf() and seq_puts() are different things and checkpatch
> insists that you use seq_puts() for constant strings).
> 
> If you are planning on outputting into a buffer, you should just make a custom
> ascii85 function to output directly and avoid copying in the first place.
> 
> If you wanted to get fancy you could add it to include/linux/ascii85.h -
> ascii85_encode_to_buf() or some such.
> 
Ah! This is a good point - I will try out something along these lines.
>> +				ascii85_encode(src[i], out));
>> +
>> +	return buf;
>> +}
>> +
>> +/*
>> + * Convert the binary data in the gpu crash state capture(like bos and
>> + * ringbuffer data) to ascii85 encoded strings. Note that the encoded
>> + * string is NULL terminated.
> 
> This is technically not true because you are specifically accounting for a NULL
> terminator in the function above.
> 
>> + */
>> +static void crashstate_worker(struct work_struct *work)
>> +{
>> +	struct msm_gpu_state *state;
>> +	struct msm_gpu *gpu = container_of(work, struct msm_gpu,
>> +			crashstate_work);
>> +	int i;
>> +
>> +	state = msm_gpu_crashstate_get(gpu);
>> +	if (!state)
>> +		return;
>> +
>> +	for (i = 0; i < MSM_GPU_MAX_RINGS; i++) {
>> +		void *buf = NULL;
> 
> You don't need to initialize buf.
> 
>> +
>> +		if (!state->ring[i].data)
>> +			continue;
>> +
> 
> A comment here describing the variable switch-a-roo you are doing would be
> helpful.
> 
>> +		buf = state->ring[i].data;
>> +
>> +		state->ring[i].data = msm_gpu_ascii85_encode_buf(buf,
>> +				state->ring[i].data_size);
>> +		kvfree(buf);
>> +	}
>> +
>> +	for (i = 0; i < state->nr_bos; i++) {
>> +		u32 *buf = NULL;
> 
> buf does not need to be initialized.
> 
>> +		long datalen, j;
>> +
>> +		if (!state->bos[i].data)
>> +			continue;
>> +
>> +		buf = state->bos[i].data;
>> +
>> +		/*
>> +		 * Only dump the non-zero part of the buffer - rarely will
>> +		 * any data completely fill the entire allocated size of the
>> +		 * buffer
>> +		 */
>> +		for (datalen = 0, j = 0; j < state->bos[i].size >> 2; j++)
>> +			if (buf[j])
>> +				datalen = ((j + 1) << 2);
>> +
>> +		state->bos[i].data = msm_gpu_ascii85_encode_buf(buf, datalen);
>> +		kvfree(buf);
>> +	}
>> +
>> +	msm_gpu_crashstate_put(gpu);
>> +
>> +	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
>> +		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
>> +}
>> +
>>   static void recover_worker(struct work_struct *work)
>>   {
>>   	struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work);
>> @@ -856,6 +942,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>   	INIT_LIST_HEAD(&gpu->active_list);
>>   	INIT_WORK(&gpu->retire_work, retire_worker);
>>   	INIT_WORK(&gpu->recover_work, recover_worker);
>> +	INIT_WORK(&gpu->crashstate_work, crashstate_worker);
>>   
>>   
>>   	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>> index f82bac0..8825069 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>> @@ -137,6 +137,7 @@ struct msm_gpu {
>>   	} devfreq;
>>   
>>   	struct msm_gpu_state *crashstate;
>> +	struct work_struct crashstate_work;
>>   };
>>   
>>   /* It turns out that all targets use the same ringbuffer size */
>> -- 
>> 1.9.1
>>
> 

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project
_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno