* [PATCH v5 1/5] drm/amdgpu: Allocate coredump memory in a nonblocking way
2023-08-17 18:20 [PATCH v5 0/5] drm/amdgpu: Rework coredump memory allocation André Almeida
@ 2023-08-17 18:20 ` André Almeida
2023-08-17 18:20 ` [PATCH v5 2/5] drm/amdgpu: Rework coredump to use memory dynamically André Almeida
` (3 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: André Almeida @ 2023-08-17 18:20 UTC (permalink / raw)
To: dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf,
Shashank Sharma, André Almeida
During a GPU reset, a normal memory reclaim could block to reclaim
memory. Giving that coredump is a best effort mechanism, it shouldn't
disturb the reset path. Change its memory allocation flag to a
nonblocking one.
Signed-off-by: André Almeida <andrealmeid@igalia.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
v5: no change
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index aa171db68639..bf4781551f88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4847,7 +4847,7 @@ static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
struct drm_device *dev = adev_to_drm(adev);
ktime_get_ts64(&adev->reset_time);
- dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
+ dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
}
#endif
--
2.41.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH v5 2/5] drm/amdgpu: Rework coredump to use memory dynamically
2023-08-17 18:20 [PATCH v5 0/5] drm/amdgpu: Rework coredump memory allocation André Almeida
2023-08-17 18:20 ` [PATCH v5 1/5] drm/amdgpu: Allocate coredump memory in a nonblocking way André Almeida
@ 2023-08-17 18:20 ` André Almeida
2023-08-21 14:20 ` Shashank Sharma
2023-08-17 18:20 ` [PATCH v5 3/5] drm/amdgpu: Encapsulate all device reset info André Almeida
` (2 subsequent siblings)
4 siblings, 1 reply; 10+ messages in thread
From: André Almeida @ 2023-08-17 18:20 UTC (permalink / raw)
To: dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf,
Shashank Sharma, André Almeida
Instead of storing coredump information inside amdgpu_device struct,
move if to a proper separated struct and allocate it dynamically. This
will make it easier to further expand the logged information.
Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
v5: no change
v4: change kmalloc to kzalloc
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 14 +++--
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++--------
2 files changed, 49 insertions(+), 28 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9c6a332261ab..0d560b713948 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1088,11 +1088,6 @@ struct amdgpu_device {
uint32_t *reset_dump_reg_list;
uint32_t *reset_dump_reg_value;
int num_regs;
-#ifdef CONFIG_DEV_COREDUMP
- struct amdgpu_task_info reset_task_info;
- bool reset_vram_lost;
- struct timespec64 reset_time;
-#endif
bool scpm_enabled;
uint32_t scpm_status;
@@ -1105,6 +1100,15 @@ struct amdgpu_device {
uint32_t aid_mask;
};
+#ifdef CONFIG_DEV_COREDUMP
+struct amdgpu_coredump_info {
+ struct amdgpu_device *adev;
+ struct amdgpu_task_info reset_task_info;
+ struct timespec64 reset_time;
+ bool reset_vram_lost;
+};
+#endif
+
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
{
return container_of(ddev, struct amdgpu_device, ddev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bf4781551f88..b5b879bcc5c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4799,12 +4799,17 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
return 0;
}
-#ifdef CONFIG_DEV_COREDUMP
+#ifndef CONFIG_DEV_COREDUMP
+static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+ struct amdgpu_reset_context *reset_context)
+{
+}
+#else
static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
size_t count, void *data, size_t datalen)
{
struct drm_printer p;
- struct amdgpu_device *adev = data;
+ struct amdgpu_coredump_info *coredump = data;
struct drm_print_iterator iter;
int i;
@@ -4818,21 +4823,21 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
- drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
- if (adev->reset_task_info.pid)
+ drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
+ if (coredump->reset_task_info.pid)
drm_printf(&p, "process_name: %s PID: %d\n",
- adev->reset_task_info.process_name,
- adev->reset_task_info.pid);
+ coredump->reset_task_info.process_name,
+ coredump->reset_task_info.pid);
- if (adev->reset_vram_lost)
+ if (coredump->reset_vram_lost)
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
- if (adev->num_regs) {
+ if (coredump->adev->num_regs) {
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
- for (i = 0; i < adev->num_regs; i++)
+ for (i = 0; i < coredump->adev->num_regs; i++)
drm_printf(&p, "0x%08x: 0x%08x\n",
- adev->reset_dump_reg_list[i],
- adev->reset_dump_reg_value[i]);
+ coredump->adev->reset_dump_reg_list[i],
+ coredump->adev->reset_dump_reg_value[i]);
}
return count - iter.remain;
@@ -4840,14 +4845,32 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
static void amdgpu_devcoredump_free(void *data)
{
+ kfree(data);
}
-static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
+static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+ struct amdgpu_reset_context *reset_context)
{
+ struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);
- ktime_get_ts64(&adev->reset_time);
- dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
+ coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
+
+ if (!coredump) {
+ DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
+ return;
+ }
+
+ coredump->reset_vram_lost = vram_lost;
+
+ if (reset_context->job && reset_context->job->vm)
+ coredump->reset_task_info = reset_context->job->vm->task_info;
+
+ coredump->adev = adev;
+
+ ktime_get_ts64(&coredump->reset_time);
+
+ dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
}
#endif
@@ -4955,15 +4978,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
goto out;
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
-#ifdef CONFIG_DEV_COREDUMP
- tmp_adev->reset_vram_lost = vram_lost;
- memset(&tmp_adev->reset_task_info, 0,
- sizeof(tmp_adev->reset_task_info));
- if (reset_context->job && reset_context->job->vm)
- tmp_adev->reset_task_info =
- reset_context->job->vm->task_info;
- amdgpu_reset_capture_coredumpm(tmp_adev);
-#endif
+
+ amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
--
2.41.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v5 2/5] drm/amdgpu: Rework coredump to use memory dynamically
2023-08-17 18:20 ` [PATCH v5 2/5] drm/amdgpu: Rework coredump to use memory dynamically André Almeida
@ 2023-08-21 14:20 ` Shashank Sharma
0 siblings, 0 replies; 10+ messages in thread
From: Shashank Sharma @ 2023-08-21 14:20 UTC (permalink / raw)
To: André Almeida, dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf
On 17/08/2023 20:20, André Almeida wrote:
> Instead of storing coredump information inside amdgpu_device struct,
> move if to a proper separated struct and allocate it dynamically. This
> will make it easier to further expand the logged information.
>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
> v5: no change
> v4: change kmalloc to kzalloc
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 14 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++--------
> 2 files changed, 49 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 9c6a332261ab..0d560b713948 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
> uint32_t *reset_dump_reg_list;
> uint32_t *reset_dump_reg_value;
> int num_regs;
> -#ifdef CONFIG_DEV_COREDUMP
> - struct amdgpu_task_info reset_task_info;
> - bool reset_vram_lost;
> - struct timespec64 reset_time;
> -#endif
>
> bool scpm_enabled;
> uint32_t scpm_status;
> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
> uint32_t aid_mask;
> };
>
> +#ifdef CONFIG_DEV_COREDUMP
> +struct amdgpu_coredump_info {
> + struct amdgpu_device *adev;
> + struct amdgpu_task_info reset_task_info;
> + struct timespec64 reset_time;
> + bool reset_vram_lost;
> +};
> +#endif
> +
> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> {
> return container_of(ddev, struct amdgpu_device, ddev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bf4781551f88..b5b879bcc5c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4799,12 +4799,17 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> return 0;
> }
>
> -#ifdef CONFIG_DEV_COREDUMP
> +#ifndef CONFIG_DEV_COREDUMP
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> + struct amdgpu_reset_context *reset_context)
> +{
> +}
> +#else
> static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> size_t count, void *data, size_t datalen)
> {
> struct drm_printer p;
> - struct amdgpu_device *adev = data;
> + struct amdgpu_coredump_info *coredump = data;
> struct drm_print_iterator iter;
> int i;
>
> @@ -4818,21 +4823,21 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
> drm_printf(&p, "kernel: " UTS_RELEASE "\n");
> drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> - drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
> - if (adev->reset_task_info.pid)
> + drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
> + if (coredump->reset_task_info.pid)
> drm_printf(&p, "process_name: %s PID: %d\n",
> - adev->reset_task_info.process_name,
> - adev->reset_task_info.pid);
> + coredump->reset_task_info.process_name,
> + coredump->reset_task_info.pid);
>
> - if (adev->reset_vram_lost)
> + if (coredump->reset_vram_lost)
> drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> - if (adev->num_regs) {
> + if (coredump->adev->num_regs) {
> drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
>
> - for (i = 0; i < adev->num_regs; i++)
> + for (i = 0; i < coredump->adev->num_regs; i++)
> drm_printf(&p, "0x%08x: 0x%08x\n",
> - adev->reset_dump_reg_list[i],
> - adev->reset_dump_reg_value[i]);
> + coredump->adev->reset_dump_reg_list[i],
> + coredump->adev->reset_dump_reg_value[i]);
> }
>
> return count - iter.remain;
> @@ -4840,14 +4845,32 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>
> static void amdgpu_devcoredump_free(void *data)
> {
> + kfree(data);
> }
>
> -static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> + struct amdgpu_reset_context *reset_context)
> {
> + struct amdgpu_coredump_info *coredump;
> struct drm_device *dev = adev_to_drm(adev);
>
> - ktime_get_ts64(&adev->reset_time);
> - dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
> + coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
> +
> + if (!coredump) {
> + DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
> + return;
> + }
> +
> + coredump->reset_vram_lost = vram_lost;
> +
> + if (reset_context->job && reset_context->job->vm)
> + coredump->reset_task_info = reset_context->job->vm->task_info;
> +
> + coredump->adev = adev;
> +
> + ktime_get_ts64(&coredump->reset_time);
> +
> + dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
> amdgpu_devcoredump_read, amdgpu_devcoredump_free);
> }
> #endif
> @@ -4955,15 +4978,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> goto out;
>
> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
> -#ifdef CONFIG_DEV_COREDUMP
> - tmp_adev->reset_vram_lost = vram_lost;
> - memset(&tmp_adev->reset_task_info, 0,
> - sizeof(tmp_adev->reset_task_info));
> - if (reset_context->job && reset_context->job->vm)
> - tmp_adev->reset_task_info =
> - reset_context->job->vm->task_info;
> - amdgpu_reset_capture_coredumpm(tmp_adev);
> -#endif
> +
> + amdgpu_coredump(tmp_adev, vram_lost, reset_context);
> +
> if (vram_lost) {
> DRM_INFO("VRAM is lost due to GPU reset!\n");
> amdgpu_inc_vram_lost(tmp_adev);
Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
- Shashank
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v5 3/5] drm/amdgpu: Encapsulate all device reset info
2023-08-17 18:20 [PATCH v5 0/5] drm/amdgpu: Rework coredump memory allocation André Almeida
2023-08-17 18:20 ` [PATCH v5 1/5] drm/amdgpu: Allocate coredump memory in a nonblocking way André Almeida
2023-08-17 18:20 ` [PATCH v5 2/5] drm/amdgpu: Rework coredump to use memory dynamically André Almeida
@ 2023-08-17 18:20 ` André Almeida
2023-08-21 14:21 ` Shashank Sharma
2023-08-17 18:20 ` [PATCH v5 4/5] drm/amdgpu: Move coredump code to amdgpu_reset file André Almeida
2023-08-17 18:20 ` [PATCH v5 5/5] drm/amdgpu: Create version number for coredumps André Almeida
4 siblings, 1 reply; 10+ messages in thread
From: André Almeida @ 2023-08-17 18:20 UTC (permalink / raw)
To: dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf,
Shashank Sharma, André Almeida
To better organize struct amdgpu_device, keep all reset information
related fields together in a separated struct.
Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
v5: new patch, as requested by Shashank Sharma
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 34 +++++++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 +++---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++-----
3 files changed, 34 insertions(+), 26 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0d560b713948..56d78ca6e917 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -781,6 +781,26 @@ struct amdgpu_mqd {
#define AMDGPU_PRODUCT_NAME_LEN 64
struct amdgpu_reset_domain;
+#ifdef CONFIG_DEV_COREDUMP
+struct amdgpu_coredump_info {
+ struct amdgpu_device *adev;
+ struct amdgpu_task_info reset_task_info;
+ struct timespec64 reset_time;
+ bool reset_vram_lost;
+};
+#endif
+
+struct amdgpu_reset_info {
+ /* reset dump register */
+ u32 *reset_dump_reg_list;
+ u32 *reset_dump_reg_value;
+ int num_regs;
+
+#ifdef CONFIG_DEV_COREDUMP
+ struct amdgpu_coredump_info *coredump_info;
+#endif
+};
+
/*
* Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
*/
@@ -1084,10 +1104,7 @@ struct amdgpu_device {
struct mutex benchmark_mutex;
- /* reset dump register */
- uint32_t *reset_dump_reg_list;
- uint32_t *reset_dump_reg_value;
- int num_regs;
+ struct amdgpu_reset_info reset_info;
bool scpm_enabled;
uint32_t scpm_status;
@@ -1100,15 +1117,6 @@ struct amdgpu_device {
uint32_t aid_mask;
};
-#ifdef CONFIG_DEV_COREDUMP
-struct amdgpu_coredump_info {
- struct amdgpu_device *adev;
- struct amdgpu_task_info reset_task_info;
- struct timespec64 reset_time;
- bool reset_vram_lost;
-};
-#endif
-
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
{
return container_of(ddev, struct amdgpu_device, ddev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index a4faea4fa0b5..3136a0774dd9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -2016,8 +2016,8 @@ static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
if (ret)
return ret;
- for (i = 0; i < adev->num_regs; i++) {
- sprintf(reg_offset, "0x%x\n", adev->reset_dump_reg_list[i]);
+ for (i = 0; i < adev->reset_info.num_regs; i++) {
+ sprintf(reg_offset, "0x%x\n", adev->reset_info.reset_dump_reg_list[i]);
up_read(&adev->reset_domain->sem);
if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
return -EFAULT;
@@ -2074,9 +2074,9 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
if (ret)
goto error_free;
- swap(adev->reset_dump_reg_list, tmp);
- swap(adev->reset_dump_reg_value, new);
- adev->num_regs = i;
+ swap(adev->reset_info.reset_dump_reg_list, tmp);
+ swap(adev->reset_info.reset_dump_reg_value, new);
+ adev->reset_info.num_regs = i;
up_write(&adev->reset_domain->sem);
ret = size;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b5b879bcc5c9..96975591841d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4790,10 +4790,10 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
lockdep_assert_held(&adev->reset_domain->sem);
- for (i = 0; i < adev->num_regs; i++) {
- adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
- trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
- adev->reset_dump_reg_value[i]);
+ for (i = 0; i < adev->reset_info.num_regs; i++) {
+ adev->reset_info.reset_dump_reg_value[i] = RREG32(adev->reset_info.reset_dump_reg_list[i]);
+ trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
+ adev->reset_info.reset_dump_reg_value[i]);
}
return 0;
@@ -4831,13 +4831,13 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
if (coredump->reset_vram_lost)
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
- if (coredump->adev->num_regs) {
+ if (coredump->adev->reset_info.num_regs) {
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
- for (i = 0; i < coredump->adev->num_regs; i++)
+ for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
drm_printf(&p, "0x%08x: 0x%08x\n",
- coredump->adev->reset_dump_reg_list[i],
- coredump->adev->reset_dump_reg_value[i]);
+ coredump->adev->reset_info.reset_dump_reg_list[i],
+ coredump->adev->reset_info.reset_dump_reg_value[i]);
}
return count - iter.remain;
--
2.41.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v5 3/5] drm/amdgpu: Encapsulate all device reset info
2023-08-17 18:20 ` [PATCH v5 3/5] drm/amdgpu: Encapsulate all device reset info André Almeida
@ 2023-08-21 14:21 ` Shashank Sharma
0 siblings, 0 replies; 10+ messages in thread
From: Shashank Sharma @ 2023-08-21 14:21 UTC (permalink / raw)
To: André Almeida, dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf
On 17/08/2023 20:20, André Almeida wrote:
> To better organize struct amdgpu_device, keep all reset information
> related fields together in a separated struct.
>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
> v5: new patch, as requested by Shashank Sharma
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 34 +++++++++++++--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++-----
> 3 files changed, 34 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0d560b713948..56d78ca6e917 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -781,6 +781,26 @@ struct amdgpu_mqd {
> #define AMDGPU_PRODUCT_NAME_LEN 64
> struct amdgpu_reset_domain;
>
> +#ifdef CONFIG_DEV_COREDUMP
> +struct amdgpu_coredump_info {
> + struct amdgpu_device *adev;
> + struct amdgpu_task_info reset_task_info;
> + struct timespec64 reset_time;
> + bool reset_vram_lost;
> +};
> +#endif
> +
> +struct amdgpu_reset_info {
> + /* reset dump register */
> + u32 *reset_dump_reg_list;
> + u32 *reset_dump_reg_value;
> + int num_regs;
> +
> +#ifdef CONFIG_DEV_COREDUMP
> + struct amdgpu_coredump_info *coredump_info;
> +#endif
> +};
> +
> /*
> * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
> */
> @@ -1084,10 +1104,7 @@ struct amdgpu_device {
>
> struct mutex benchmark_mutex;
>
> - /* reset dump register */
> - uint32_t *reset_dump_reg_list;
> - uint32_t *reset_dump_reg_value;
> - int num_regs;
> + struct amdgpu_reset_info reset_info;
>
> bool scpm_enabled;
> uint32_t scpm_status;
> @@ -1100,15 +1117,6 @@ struct amdgpu_device {
> uint32_t aid_mask;
> };
>
> -#ifdef CONFIG_DEV_COREDUMP
> -struct amdgpu_coredump_info {
> - struct amdgpu_device *adev;
> - struct amdgpu_task_info reset_task_info;
> - struct timespec64 reset_time;
> - bool reset_vram_lost;
> -};
> -#endif
> -
> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> {
> return container_of(ddev, struct amdgpu_device, ddev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index a4faea4fa0b5..3136a0774dd9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -2016,8 +2016,8 @@ static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
> if (ret)
> return ret;
>
> - for (i = 0; i < adev->num_regs; i++) {
> - sprintf(reg_offset, "0x%x\n", adev->reset_dump_reg_list[i]);
> + for (i = 0; i < adev->reset_info.num_regs; i++) {
> + sprintf(reg_offset, "0x%x\n", adev->reset_info.reset_dump_reg_list[i]);
> up_read(&adev->reset_domain->sem);
> if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
> return -EFAULT;
> @@ -2074,9 +2074,9 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
> if (ret)
> goto error_free;
>
> - swap(adev->reset_dump_reg_list, tmp);
> - swap(adev->reset_dump_reg_value, new);
> - adev->num_regs = i;
> + swap(adev->reset_info.reset_dump_reg_list, tmp);
> + swap(adev->reset_info.reset_dump_reg_value, new);
> + adev->reset_info.num_regs = i;
> up_write(&adev->reset_domain->sem);
> ret = size;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b5b879bcc5c9..96975591841d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4790,10 +4790,10 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>
> lockdep_assert_held(&adev->reset_domain->sem);
>
> - for (i = 0; i < adev->num_regs; i++) {
> - adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
> - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
> - adev->reset_dump_reg_value[i]);
> + for (i = 0; i < adev->reset_info.num_regs; i++) {
> + adev->reset_info.reset_dump_reg_value[i] = RREG32(adev->reset_info.reset_dump_reg_list[i]);
> + trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> + adev->reset_info.reset_dump_reg_value[i]);
> }
>
> return 0;
> @@ -4831,13 +4831,13 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>
> if (coredump->reset_vram_lost)
> drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> - if (coredump->adev->num_regs) {
> + if (coredump->adev->reset_info.num_regs) {
> drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
>
> - for (i = 0; i < coredump->adev->num_regs; i++)
> + for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
> drm_printf(&p, "0x%08x: 0x%08x\n",
> - coredump->adev->reset_dump_reg_list[i],
> - coredump->adev->reset_dump_reg_value[i]);
> + coredump->adev->reset_info.reset_dump_reg_list[i],
> + coredump->adev->reset_info.reset_dump_reg_value[i]);
> }
Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
- Shashank
>
> return count - iter.remain;
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v5 4/5] drm/amdgpu: Move coredump code to amdgpu_reset file
2023-08-17 18:20 [PATCH v5 0/5] drm/amdgpu: Rework coredump memory allocation André Almeida
` (2 preceding siblings ...)
2023-08-17 18:20 ` [PATCH v5 3/5] drm/amdgpu: Encapsulate all device reset info André Almeida
@ 2023-08-17 18:20 ` André Almeida
2023-08-21 14:22 ` Shashank Sharma
2023-08-17 18:20 ` [PATCH v5 5/5] drm/amdgpu: Create version number for coredumps André Almeida
4 siblings, 1 reply; 10+ messages in thread
From: André Almeida @ 2023-08-17 18:20 UTC (permalink / raw)
To: dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf,
Shashank Sharma, André Almeida
Giving that we use codedump just for device resets, move it's functions
and structs to a more semantic file, the amdgpu_reset.{c, h}.
Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
v5: no change
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 ---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ----------------------
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 76 +++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 10 +++
4 files changed, 86 insertions(+), 87 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 56d78ca6e917..b11187d153ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -781,15 +781,6 @@ struct amdgpu_mqd {
#define AMDGPU_PRODUCT_NAME_LEN 64
struct amdgpu_reset_domain;
-#ifdef CONFIG_DEV_COREDUMP
-struct amdgpu_coredump_info {
- struct amdgpu_device *adev;
- struct amdgpu_task_info reset_task_info;
- struct timespec64 reset_time;
- bool reset_vram_lost;
-};
-#endif
-
struct amdgpu_reset_info {
/* reset dump register */
u32 *reset_dump_reg_list;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 96975591841d..883953f2ae53 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -32,8 +32,6 @@
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
-#include <linux/devcoredump.h>
-#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>
@@ -4799,82 +4797,6 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
return 0;
}
-#ifndef CONFIG_DEV_COREDUMP
-static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
- struct amdgpu_reset_context *reset_context)
-{
-}
-#else
-static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
- size_t count, void *data, size_t datalen)
-{
- struct drm_printer p;
- struct amdgpu_coredump_info *coredump = data;
- struct drm_print_iterator iter;
- int i;
-
- iter.data = buffer;
- iter.offset = 0;
- iter.start = offset;
- iter.remain = count;
-
- p = drm_coredump_printer(&iter);
-
- drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
- drm_printf(&p, "kernel: " UTS_RELEASE "\n");
- drm_printf(&p, "module: " KBUILD_MODNAME "\n");
- drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
- if (coredump->reset_task_info.pid)
- drm_printf(&p, "process_name: %s PID: %d\n",
- coredump->reset_task_info.process_name,
- coredump->reset_task_info.pid);
-
- if (coredump->reset_vram_lost)
- drm_printf(&p, "VRAM is lost due to GPU reset!\n");
- if (coredump->adev->reset_info.num_regs) {
- drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
-
- for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
- drm_printf(&p, "0x%08x: 0x%08x\n",
- coredump->adev->reset_info.reset_dump_reg_list[i],
- coredump->adev->reset_info.reset_dump_reg_value[i]);
- }
-
- return count - iter.remain;
-}
-
-static void amdgpu_devcoredump_free(void *data)
-{
- kfree(data);
-}
-
-static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
- struct amdgpu_reset_context *reset_context)
-{
- struct amdgpu_coredump_info *coredump;
- struct drm_device *dev = adev_to_drm(adev);
-
- coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
-
- if (!coredump) {
- DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
- return;
- }
-
- coredump->reset_vram_lost = vram_lost;
-
- if (reset_context->job && reset_context->job->vm)
- coredump->reset_task_info = reset_context->job->vm->task_info;
-
- coredump->adev = adev;
-
- ktime_get_ts64(&coredump->reset_time);
-
- dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
- amdgpu_devcoredump_read, amdgpu_devcoredump_free);
-}
-#endif
-
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 5fed06ffcc6b..579b70a3cdab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -21,6 +21,9 @@
*
*/
+#include <linux/devcoredump.h>
+#include <generated/utsrelease.h>
+
#include "amdgpu_reset.h"
#include "aldebaran.h"
#include "sienna_cichlid.h"
@@ -167,5 +170,78 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
up_write(&reset_domain->sem);
}
+#ifndef CONFIG_DEV_COREDUMP
+void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+ struct amdgpu_reset_context *reset_context)
+{
+}
+#else
+static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
+ size_t count, void *data, size_t datalen)
+{
+ struct drm_printer p;
+ struct amdgpu_coredump_info *coredump = data;
+ struct drm_print_iterator iter;
+ int i;
+
+ iter.data = buffer;
+ iter.offset = 0;
+ iter.start = offset;
+ iter.remain = count;
+
+ p = drm_coredump_printer(&iter);
+
+ drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
+ drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+ drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+ drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
+ if (coredump->reset_task_info.pid)
+ drm_printf(&p, "process_name: %s PID: %d\n",
+ coredump->reset_task_info.process_name,
+ coredump->reset_task_info.pid);
+
+ if (coredump->reset_vram_lost)
+ drm_printf(&p, "VRAM is lost due to GPU reset!\n");
+ if (coredump->adev->reset_info.num_regs) {
+ drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
+
+ for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
+ drm_printf(&p, "0x%08x: 0x%08x\n",
+ coredump->adev->reset_info.reset_dump_reg_list[i],
+ coredump->adev->reset_info.reset_dump_reg_value[i]);
+ }
+
+ return count - iter.remain;
+}
+static void amdgpu_devcoredump_free(void *data)
+{
+ kfree(data);
+}
+void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+ struct amdgpu_reset_context *reset_context)
+{
+ struct amdgpu_coredump_info *coredump;
+ struct drm_device *dev = adev_to_drm(adev);
+
+ coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
+
+ if (!coredump) {
+ DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
+ return;
+ }
+
+ coredump->reset_vram_lost = vram_lost;
+
+ if (reset_context->job && reset_context->job->vm)
+ coredump->reset_task_info = reset_context->job->vm->task_info;
+
+ coredump->adev = adev;
+
+ ktime_get_ts64(&coredump->reset_time);
+
+ dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
+ amdgpu_devcoredump_read, amdgpu_devcoredump_free);
+}
+#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index f4a501ff87d9..01e8183ade4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -87,6 +87,14 @@ struct amdgpu_reset_domain {
atomic_t reset_res;
};
+#ifdef CONFIG_DEV_COREDUMP
+struct amdgpu_coredump_info {
+ struct amdgpu_device *adev;
+ struct amdgpu_task_info reset_task_info;
+ struct timespec64 reset_time;
+ bool reset_vram_lost;
+};
+#endif
int amdgpu_reset_init(struct amdgpu_device *adev);
int amdgpu_reset_fini(struct amdgpu_device *adev);
@@ -126,4 +134,6 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+ struct amdgpu_reset_context *reset_context);
#endif
--
2.41.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v5 4/5] drm/amdgpu: Move coredump code to amdgpu_reset file
2023-08-17 18:20 ` [PATCH v5 4/5] drm/amdgpu: Move coredump code to amdgpu_reset file André Almeida
@ 2023-08-21 14:22 ` Shashank Sharma
0 siblings, 0 replies; 10+ messages in thread
From: Shashank Sharma @ 2023-08-21 14:22 UTC (permalink / raw)
To: André Almeida, dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf
On 17/08/2023 20:20, André Almeida wrote:
> Giving that we use codedump just for device resets, move it's functions
> and structs to a more semantic file, the amdgpu_reset.{c, h}.
>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
> v5: no change
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ----------------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 76 +++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 10 +++
> 4 files changed, 86 insertions(+), 87 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 56d78ca6e917..b11187d153ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -781,15 +781,6 @@ struct amdgpu_mqd {
> #define AMDGPU_PRODUCT_NAME_LEN 64
> struct amdgpu_reset_domain;
>
> -#ifdef CONFIG_DEV_COREDUMP
> -struct amdgpu_coredump_info {
> - struct amdgpu_device *adev;
> - struct amdgpu_task_info reset_task_info;
> - struct timespec64 reset_time;
> - bool reset_vram_lost;
> -};
> -#endif
> -
> struct amdgpu_reset_info {
> /* reset dump register */
> u32 *reset_dump_reg_list;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 96975591841d..883953f2ae53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -32,8 +32,6 @@
> #include <linux/slab.h>
> #include <linux/iommu.h>
> #include <linux/pci.h>
> -#include <linux/devcoredump.h>
> -#include <generated/utsrelease.h>
> #include <linux/pci-p2pdma.h>
> #include <linux/apple-gmux.h>
>
> @@ -4799,82 +4797,6 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> return 0;
> }
>
> -#ifndef CONFIG_DEV_COREDUMP
> -static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> - struct amdgpu_reset_context *reset_context)
> -{
> -}
> -#else
> -static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> - size_t count, void *data, size_t datalen)
> -{
> - struct drm_printer p;
> - struct amdgpu_coredump_info *coredump = data;
> - struct drm_print_iterator iter;
> - int i;
> -
> - iter.data = buffer;
> - iter.offset = 0;
> - iter.start = offset;
> - iter.remain = count;
> -
> - p = drm_coredump_printer(&iter);
> -
> - drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
> - drm_printf(&p, "kernel: " UTS_RELEASE "\n");
> - drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> - drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
> - if (coredump->reset_task_info.pid)
> - drm_printf(&p, "process_name: %s PID: %d\n",
> - coredump->reset_task_info.process_name,
> - coredump->reset_task_info.pid);
> -
> - if (coredump->reset_vram_lost)
> - drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> - if (coredump->adev->reset_info.num_regs) {
> - drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
> -
> - for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
> - drm_printf(&p, "0x%08x: 0x%08x\n",
> - coredump->adev->reset_info.reset_dump_reg_list[i],
> - coredump->adev->reset_info.reset_dump_reg_value[i]);
> - }
> -
> - return count - iter.remain;
> -}
> -
> -static void amdgpu_devcoredump_free(void *data)
> -{
> - kfree(data);
> -}
> -
> -static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> - struct amdgpu_reset_context *reset_context)
> -{
> - struct amdgpu_coredump_info *coredump;
> - struct drm_device *dev = adev_to_drm(adev);
> -
> - coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
> -
> - if (!coredump) {
> - DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
> - return;
> - }
> -
> - coredump->reset_vram_lost = vram_lost;
> -
> - if (reset_context->job && reset_context->job->vm)
> - coredump->reset_task_info = reset_context->job->vm->task_info;
> -
> - coredump->adev = adev;
> -
> - ktime_get_ts64(&coredump->reset_time);
> -
> - dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
> - amdgpu_devcoredump_read, amdgpu_devcoredump_free);
> -}
> -#endif
> -
> int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> struct amdgpu_reset_context *reset_context)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 5fed06ffcc6b..579b70a3cdab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -21,6 +21,9 @@
> *
> */
>
> +#include <linux/devcoredump.h>
> +#include <generated/utsrelease.h>
> +
> #include "amdgpu_reset.h"
> #include "aldebaran.h"
> #include "sienna_cichlid.h"
> @@ -167,5 +170,78 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
> up_write(&reset_domain->sem);
> }
>
> +#ifndef CONFIG_DEV_COREDUMP
> +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> + struct amdgpu_reset_context *reset_context)
> +{
> +}
> +#else
> +static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> + size_t count, void *data, size_t datalen)
> +{
> + struct drm_printer p;
> + struct amdgpu_coredump_info *coredump = data;
> + struct drm_print_iterator iter;
> + int i;
> +
> + iter.data = buffer;
> + iter.offset = 0;
> + iter.start = offset;
> + iter.remain = count;
> +
> + p = drm_coredump_printer(&iter);
> +
> + drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
> + drm_printf(&p, "kernel: " UTS_RELEASE "\n");
> + drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> + drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
> + if (coredump->reset_task_info.pid)
> + drm_printf(&p, "process_name: %s PID: %d\n",
> + coredump->reset_task_info.process_name,
> + coredump->reset_task_info.pid);
> +
> + if (coredump->reset_vram_lost)
> + drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> + if (coredump->adev->reset_info.num_regs) {
> + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
> +
> + for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
> + drm_printf(&p, "0x%08x: 0x%08x\n",
> + coredump->adev->reset_info.reset_dump_reg_list[i],
> + coredump->adev->reset_info.reset_dump_reg_value[i]);
> + }
> +
> + return count - iter.remain;
> +}
>
> +static void amdgpu_devcoredump_free(void *data)
> +{
> + kfree(data);
> +}
>
> +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> + struct amdgpu_reset_context *reset_context)
> +{
> + struct amdgpu_coredump_info *coredump;
> + struct drm_device *dev = adev_to_drm(adev);
> +
> + coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
> +
> + if (!coredump) {
> + DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
> + return;
> + }
> +
> + coredump->reset_vram_lost = vram_lost;
> +
> + if (reset_context->job && reset_context->job->vm)
> + coredump->reset_task_info = reset_context->job->vm->task_info;
> +
> + coredump->adev = adev;
> +
> + ktime_get_ts64(&coredump->reset_time);
> +
> + dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
> + amdgpu_devcoredump_read, amdgpu_devcoredump_free);
> +}
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index f4a501ff87d9..01e8183ade4b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -87,6 +87,14 @@ struct amdgpu_reset_domain {
> atomic_t reset_res;
> };
>
> +#ifdef CONFIG_DEV_COREDUMP
> +struct amdgpu_coredump_info {
> + struct amdgpu_device *adev;
> + struct amdgpu_task_info reset_task_info;
> + struct timespec64 reset_time;
> + bool reset_vram_lost;
> +};
> +#endif
>
> int amdgpu_reset_init(struct amdgpu_device *adev);
> int amdgpu_reset_fini(struct amdgpu_device *adev);
> @@ -126,4 +134,6 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
>
> void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
>
> +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> + struct amdgpu_reset_context *reset_context);
> #endif
Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
- Shashank
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v5 5/5] drm/amdgpu: Create version number for coredumps
2023-08-17 18:20 [PATCH v5 0/5] drm/amdgpu: Rework coredump memory allocation André Almeida
` (3 preceding siblings ...)
2023-08-17 18:20 ` [PATCH v5 4/5] drm/amdgpu: Move coredump code to amdgpu_reset file André Almeida
@ 2023-08-17 18:20 ` André Almeida
2023-08-21 14:24 ` Shashank Sharma
4 siblings, 1 reply; 10+ messages in thread
From: André Almeida @ 2023-08-17 18:20 UTC (permalink / raw)
To: dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf,
Shashank Sharma, André Almeida
Even if there's nothing currently parsing amdgpu's coredump files, if
we eventually have such tools they will be glad to find a version field
to properly read the file.
Create a version number to be displayed on top of coredump file, to be
incremented when the file format or content get changed.
Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
v5: new patch
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 +++
2 files changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 579b70a3cdab..e92c81ff27be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -192,6 +192,7 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
p = drm_coredump_printer(&iter);
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
+ drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 01e8183ade4b..ec3a409ec509 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -88,6 +88,9 @@ struct amdgpu_reset_domain {
};
#ifdef CONFIG_DEV_COREDUMP
+
+#define AMDGPU_COREDUMP_VERSION "1"
+
struct amdgpu_coredump_info {
struct amdgpu_device *adev;
struct amdgpu_task_info reset_task_info;
--
2.41.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v5 5/5] drm/amdgpu: Create version number for coredumps
2023-08-17 18:20 ` [PATCH v5 5/5] drm/amdgpu: Create version number for coredumps André Almeida
@ 2023-08-21 14:24 ` Shashank Sharma
0 siblings, 0 replies; 10+ messages in thread
From: Shashank Sharma @ 2023-08-21 14:24 UTC (permalink / raw)
To: André Almeida, dri-devel, amd-gfx, linux-kernel
Cc: kernel-dev, alexander.deucher, christian.koenig,
pierre-eric.pelloux-prayer, 'Marek Olšák',
Samuel Pitoiset, Bas Nieuwenhuizen, Timur Kristóf
On 17/08/2023 20:20, André Almeida wrote:
> Even if there's nothing currently parsing amdgpu's coredump files, if
> we eventually have such tools they will be glad to find a version field
> to properly read the file.
>
> Create a version number to be displayed on top of coredump file, to be
> incremented when the file format or content get changed.
>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
> v5: new patch
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 +++
> 2 files changed, 4 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 579b70a3cdab..e92c81ff27be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -192,6 +192,7 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> p = drm_coredump_printer(&iter);
>
> drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
> + drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
> drm_printf(&p, "kernel: " UTS_RELEASE "\n");
> drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 01e8183ade4b..ec3a409ec509 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -88,6 +88,9 @@ struct amdgpu_reset_domain {
> };
>
> #ifdef CONFIG_DEV_COREDUMP
> +
> +#define AMDGPU_COREDUMP_VERSION "1"
> +
Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
- Shashank
> struct amdgpu_coredump_info {
> struct amdgpu_device *adev;
> struct amdgpu_task_info reset_task_info;
^ permalink raw reply [flat|nested] 10+ messages in thread