AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
@ 2022-02-16  9:49 Somalapuram Amaranath
  2022-02-16  9:49 ` [PATCH v6 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Somalapuram Amaranath @ 2022-02-16  9:49 UTC (permalink / raw)
  To: amd-gfx
  Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig,
	shashank.sharma

List of register populated for dump collection during the GPU reset.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b85b67a88a3d..57965316873b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1097,6 +1097,11 @@ struct amdgpu_device {
 
 	struct amdgpu_reset_control     *reset_cntl;
 	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
+
+	/* reset dump register */
+	uint32_t			*reset_dump_reg_list;
+	int                             n_regs;
+	struct mutex			reset_dump_mutex;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 164d6a9e9fbb..faf985c7cb93 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
 			amdgpu_debugfs_sclk_set, "%llu\n");
 
+static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
+				char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char reg_offset[11];
+	int i, r, len = 0;
+
+	if (*pos)
+		return 0;
+
+	if (adev->n_regs == 0)
+		return 0;
+
+	for (i = 0; i < adev->n_regs; i++) {
+		sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
+		r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
+
+		if (r)
+			return -EFAULT;
+
+		len += strlen(reg_offset);
+	}
+
+	r = copy_to_user(buf + len, "\n", 1);
+
+	if (r)
+		return -EFAULT;
+
+	len++;
+	*pos += len;
+
+	return len;
+}
+
+static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
+			const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	char *reg_offset, *reg, reg_temp[11];
+	static int alloc_count;
+	int ret, i = 0, len = 0;
+
+	do {
+		reg_offset = reg_temp;
+		memset(reg_offset,  0, 11);
+		ret = copy_from_user(reg_offset, buf + len, min(11, ((int)size-len)));
+
+		if (ret)
+			goto failed;
+
+		reg = strsep(&reg_offset, " ");
+
+		if (alloc_count <= i) {
+			adev->reset_dump_reg_list =  krealloc_array(
+							adev->reset_dump_reg_list, 1,
+							sizeof(uint32_t), GFP_KERNEL);
+			alloc_count++;
+		}
+
+		ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
+
+		if (ret)
+			goto failed;
+
+		len += strlen(reg) + 1;
+		i++;
+
+	} while (len < size);
+
+	adev->n_regs = i;
+
+	return size;
+
+failed:
+	mutex_lock(&adev->reset_dump_mutex);
+	kfree(adev->reset_dump_reg_list);
+	adev->reset_dump_reg_list = NULL;
+	alloc_count = 0;
+	adev->n_regs = 0;
+	mutex_unlock(&adev->reset_dump_mutex);
+	return -EFAULT;
+}
+
+
+
+static const struct file_operations amdgpu_reset_dump_register_list = {
+	.owner = THIS_MODULE,
+	.read = amdgpu_reset_dump_register_list_read,
+	.write = amdgpu_reset_dump_register_list_write,
+	.llseek = default_llseek
+};
+
 int amdgpu_debugfs_init(struct amdgpu_device *adev)
 {
 	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
@@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 	if (!debugfs_initialized())
 		return 0;
 
+	mutex_init(&adev->reset_dump_mutex);
 	ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
 				  &fops_ib_preempt);
 	if (IS_ERR(ent)) {
@@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 			    &amdgpu_debugfs_test_ib_fops);
 	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
 			    &amdgpu_debugfs_vm_info_fops);
+	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
+			    &amdgpu_reset_dump_register_list);
 
 	adev->debugfs_vbios_blob.data = adev->bios;
 	adev->debugfs_vbios_blob.size = adev->bios_size;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v6 2/2] drm/amdgpu: add reset register dump trace on GPU reset
  2022-02-16  9:49 [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
@ 2022-02-16  9:49 ` Somalapuram Amaranath
  2022-02-16 10:11 ` [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
  2022-02-16 10:15 ` Lazar, Lijo
  2 siblings, 0 replies; 21+ messages in thread
From: Somalapuram Amaranath @ 2022-02-16  9:49 UTC (permalink / raw)
  To: amd-gfx
  Cc: alexander.deucher, Somalapuram Amaranath, christian.koenig,
	shashank.sharma

Dump the list of register values to trace event on GPU reset.

Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 16 ++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e651b959141..247e4f97de33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4534,6 +4534,21 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
+static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
+{
+	uint32_t reg_value;
+	int i;
+
+	mutex_lock(&adev->reset_dump_mutex);
+	for (i = 0; i < adev->n_regs; i++) {
+		reg_value = RREG32(adev->reset_dump_reg_list[i]);
+		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
+	}
+	mutex_unlock(&adev->reset_dump_mutex);
+
+	return 0;
+}
+
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			 struct amdgpu_reset_context *reset_context)
 {
@@ -4567,8 +4582,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 				tmp_adev->gmc.xgmi.pending_reset = false;
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
-			} else
+			} else {
+				amdgpu_reset_reg_dumps(tmp_adev);
 				r = amdgpu_asic_reset(tmp_adev);
+			}
 
 			if (r) {
 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index d855cb53c7e0..b9637925e85c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -537,6 +537,22 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
 		      __entry->seqno)
 );
 
+TRACE_EVENT(amdgpu_reset_reg_dumps,
+	    TP_PROTO(uint32_t address, uint32_t value),
+	    TP_ARGS(address, value),
+	    TP_STRUCT__entry(
+			     __field(uint32_t, address)
+			     __field(uint32_t, value)
+			     ),
+	    TP_fast_assign(
+			   __entry->address = address;
+			   __entry->value = value;
+			   ),
+	    TP_printk("amdgpu register dump 0x%x: 0x%x",
+		      __entry->address,
+		      __entry->value)
+);
+
 #undef AMDGPU_JOB_GET_TIMELINE_NAME
 #endif
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16  9:49 [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
  2022-02-16  9:49 ` [PATCH v6 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
@ 2022-02-16 10:11 ` Christian König
  2022-02-16 13:11   ` Somalapuram, Amaranath
  2022-02-16 10:15 ` Lazar, Lijo
  2 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-16 10:11 UTC (permalink / raw)
  To: Somalapuram Amaranath, amd-gfx; +Cc: alexander.deucher, shashank.sharma

Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
> List of register populated for dump collection during the GPU reset.
>
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++
>   2 files changed, 100 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b85b67a88a3d..57965316873b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>   
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
> +
> +	/* reset dump register */
> +	uint32_t			*reset_dump_reg_list;
> +	int                             n_regs;
> +	struct mutex			reset_dump_mutex;

I think we should rather use the reset lock for this instead of 
introducing just another mutex.

>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..faf985c7cb93 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
> +				char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char reg_offset[11];
> +	int i, r, len = 0;
> +
> +	if (*pos)
> +		return 0;
> +
> +	if (adev->n_regs == 0)
> +		return 0;
> +
> +	for (i = 0; i < adev->n_regs; i++) {
> +		sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
> +		r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
> +
> +		if (r)
> +			return -EFAULT;
> +
> +		len += strlen(reg_offset);
> +	}

You need to hold the lock protecting adev->reset_dump_reg_list and 
adev->n_regs while accessing those.

(BTW: num_regs instead of n_regs would match more what we use elsewhere, 
but is not a must have).

> +
> +	r = copy_to_user(buf + len, "\n", 1);
> +
> +	if (r)
> +		return -EFAULT;
> +
> +	len++;
> +	*pos += len;
> +
> +	return len;
> +}
> +
> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
> +			const char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char *reg_offset, *reg, reg_temp[11];
> +	static int alloc_count;
> +	int ret, i = 0, len = 0;
> +
> +	do {
> +		reg_offset = reg_temp;
> +		memset(reg_offset,  0, 11);
> +		ret = copy_from_user(reg_offset, buf + len, min(11, ((int)size-len)));
> +
> +		if (ret)
> +			goto failed;
> +
> +		reg = strsep(&reg_offset, " ");
> +
> +		if (alloc_count <= i) {

> +			adev->reset_dump_reg_list =  krealloc_array(
> +							adev->reset_dump_reg_list, 1,
> +							sizeof(uint32_t), GFP_KERNEL);
> +			alloc_count++;
> +		}
> +
> +		ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);

This here is modifying adev->reset_dump_reg_list as well and so must be 
protected by a lock as well.

The tricky part is that we can't allocate memory while holding this lock 
(because we need it during reset as well).

One solution for this is to read the register list into a local array 
first and when that's done swap the local array with the one in 
adev->reset_dump_reg_list while holding the lock.

Regards,
Christian.

> +
> +		if (ret)
> +			goto failed;
> +
> +		len += strlen(reg) + 1;
> +		i++;
> +
> +	} while (len < size);
> +
> +	adev->n_regs = i;
> +
> +	return size;
> +
> +failed:
> +	mutex_lock(&adev->reset_dump_mutex);
> +	kfree(adev->reset_dump_reg_list);
> +	adev->reset_dump_reg_list = NULL;
> +	alloc_count = 0;
> +	adev->n_regs = 0;
> +	mutex_unlock(&adev->reset_dump_mutex);
> +	return -EFAULT;
> +}
> +
> +
> +
> +static const struct file_operations amdgpu_reset_dump_register_list = {
> +	.owner = THIS_MODULE,
> +	.read = amdgpu_reset_dump_register_list_read,
> +	.write = amdgpu_reset_dump_register_list_write,
> +	.llseek = default_llseek
> +};
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   	if (!debugfs_initialized())
>   		return 0;
>   
> +	mutex_init(&adev->reset_dump_mutex);
>   	ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>   				  &fops_ib_preempt);
>   	if (IS_ERR(ent)) {
> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   			    &amdgpu_debugfs_test_ib_fops);
>   	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>   			    &amdgpu_debugfs_vm_info_fops);
> +	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
> +			    &amdgpu_reset_dump_register_list);
>   
>   	adev->debugfs_vbios_blob.data = adev->bios;
>   	adev->debugfs_vbios_blob.size = adev->bios_size;


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16  9:49 [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
  2022-02-16  9:49 ` [PATCH v6 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
  2022-02-16 10:11 ` [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
@ 2022-02-16 10:15 ` Lazar, Lijo
  2022-02-16 10:34   ` Somalapuram, Amaranath
  2 siblings, 1 reply; 21+ messages in thread
From: Lazar, Lijo @ 2022-02-16 10:15 UTC (permalink / raw)
  To: Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma



On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
> List of register populated for dump collection during the GPU reset.
> 
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++
>   2 files changed, 100 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b85b67a88a3d..57965316873b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>   
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
> +
> +	/* reset dump register */
> +	uint32_t			*reset_dump_reg_list;
> +	int                             n_regs;
> +	struct mutex			reset_dump_mutex;
>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..faf985c7cb93 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
> +				char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char reg_offset[11];
> +	int i, r, len = 0;
> +
> +	if (*pos)
> +		return 0;
> +
> +	if (adev->n_regs == 0)
> +		return 0;
> +
> +	for (i = 0; i < adev->n_regs; i++) {
> +		sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
> +		r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
> +
> +		if (r)
> +			return -EFAULT;
> +
> +		len += strlen(reg_offset);
> +	}
> +
> +	r = copy_to_user(buf + len, "\n", 1);
> +
> +	if (r)
> +		return -EFAULT;
> +
> +	len++;
> +	*pos += len;
> +
> +	return len;
> +}
> +
> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
> +			const char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char *reg_offset, *reg, reg_temp[11];
> +	static int alloc_count;

This being static what happens when it is called on a second device?

Thanks,
Lijo

> +	int ret, i = 0, len = 0;
> +
> +	do {
> +		reg_offset = reg_temp;
> +		memset(reg_offset,  0, 11);
> +		ret = copy_from_user(reg_offset, buf + len, min(11, ((int)size-len)));
> +
> +		if (ret)
> +			goto failed;
> +
> +		reg = strsep(&reg_offset, " ");
> +
> +		if (alloc_count <= i) {
> +			adev->reset_dump_reg_list =  krealloc_array(
> +							adev->reset_dump_reg_list, 1,
> +							sizeof(uint32_t), GFP_KERNEL);
> +			alloc_count++;
> +		}
> +
> +		ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
> +
> +		if (ret)
> +			goto failed;
> +
> +		len += strlen(reg) + 1;
> +		i++;
> +
> +	} while (len < size);
> +
> +	adev->n_regs = i;
> +
> +	return size;
> +
> +failed:
> +	mutex_lock(&adev->reset_dump_mutex);
> +	kfree(adev->reset_dump_reg_list);
> +	adev->reset_dump_reg_list = NULL;
> +	alloc_count = 0;
> +	adev->n_regs = 0;
> +	mutex_unlock(&adev->reset_dump_mutex);
> +	return -EFAULT;
> +}
> +
> +
> +
> +static const struct file_operations amdgpu_reset_dump_register_list = {
> +	.owner = THIS_MODULE,
> +	.read = amdgpu_reset_dump_register_list_read,
> +	.write = amdgpu_reset_dump_register_list_write,
> +	.llseek = default_llseek
> +};
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   	if (!debugfs_initialized())
>   		return 0;
>   
> +	mutex_init(&adev->reset_dump_mutex);
>   	ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>   				  &fops_ib_preempt);
>   	if (IS_ERR(ent)) {
> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   			    &amdgpu_debugfs_test_ib_fops);
>   	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>   			    &amdgpu_debugfs_vm_info_fops);
> +	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
> +			    &amdgpu_reset_dump_register_list);
>   
>   	adev->debugfs_vbios_blob.data = adev->bios;
>   	adev->debugfs_vbios_blob.size = adev->bios_size;
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 10:15 ` Lazar, Lijo
@ 2022-02-16 10:34   ` Somalapuram, Amaranath
  2022-02-16 10:43     ` Lazar, Lijo
  0 siblings, 1 reply; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-16 10:34 UTC (permalink / raw)
  To: Lazar, Lijo, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma


On 2/16/2022 3:45 PM, Lazar, Lijo wrote:
>
>
> On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
>> List of register populated for dump collection during the GPU reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++
>>   2 files changed, 100 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b85b67a88a3d..57965316873b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>         struct amdgpu_reset_control     *reset_cntl;
>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>> +
>> +    /* reset dump register */
>> +    uint32_t            *reset_dump_reg_list;
>> +    int                             n_regs;
>> +    struct mutex            reset_dump_mutex;
>>   };
>>     static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>> *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..faf985c7cb93 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>> +                char __user *buf, size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char reg_offset[11];
>> +    int i, r, len = 0;
>> +
>> +    if (*pos)
>> +        return 0;
>> +
>> +    if (adev->n_regs == 0)
>> +        return 0;
>> +
>> +    for (i = 0; i < adev->n_regs; i++) {
>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>> +
>> +        if (r)
>> +            return -EFAULT;
>> +
>> +        len += strlen(reg_offset);
>> +    }
>> +
>> +    r = copy_to_user(buf + len, "\n", 1);
>> +
>> +    if (r)
>> +        return -EFAULT;
>> +
>> +    len++;
>> +    *pos += len;
>> +
>> +    return len;
>> +}
>> +
>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>> +            const char __user *buf, size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char *reg_offset, *reg, reg_temp[11];
>> +    static int alloc_count;
>
> This being static what happens when it is called on a second device?
>
> Thanks,
> Lijo
>
I tried to avoid adding to adev. It wont work for multiple devices.
>> +    int ret, i = 0, len = 0;
>> +
>> +    do {
>> +        reg_offset = reg_temp;
>> +        memset(reg_offset,  0, 11);
>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>> ((int)size-len)));
>> +
>> +        if (ret)
>> +            goto failed;
>> +
>> +        reg = strsep(&reg_offset, " ");
>> +
>> +        if (alloc_count <= i) {
>> +            adev->reset_dump_reg_list =  krealloc_array(
>> +                            adev->reset_dump_reg_list, 1,
>> +                            sizeof(uint32_t), GFP_KERNEL);
>> +            alloc_count++;
>> +        }
>> +
>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>> +
>> +        if (ret)
>> +            goto failed;
>> +
>> +        len += strlen(reg) + 1;
>> +        i++;
>> +
>> +    } while (len < size);
>> +
>> +    adev->n_regs = i;
>> +
>> +    return size;
>> +
>> +failed:
>> +    mutex_lock(&adev->reset_dump_mutex);
>> +    kfree(adev->reset_dump_reg_list);
>> +    adev->reset_dump_reg_list = NULL;
>> +    alloc_count = 0;
>> +    adev->n_regs = 0;
>> +    mutex_unlock(&adev->reset_dump_mutex);
>> +    return -EFAULT;
>> +}
>> +
>> +
>> +
>> +static const struct file_operations amdgpu_reset_dump_register_list = {
>> +    .owner = THIS_MODULE,
>> +    .read = amdgpu_reset_dump_register_list_read,
>> +    .write = amdgpu_reset_dump_register_list_write,
>> +    .llseek = default_llseek
>> +};
>> +
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>   {
>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>> *adev)
>>       if (!debugfs_initialized())
>>           return 0;
>>   +    mutex_init(&adev->reset_dump_mutex);
>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>                     &fops_ib_preempt);
>>       if (IS_ERR(ent)) {
>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>> *adev)
>>                   &amdgpu_debugfs_test_ib_fops);
>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>                   &amdgpu_debugfs_vm_info_fops);
>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>> root, adev,
>> +                &amdgpu_reset_dump_register_list);
>>         adev->debugfs_vbios_blob.data = adev->bios;
>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 10:34   ` Somalapuram, Amaranath
@ 2022-02-16 10:43     ` Lazar, Lijo
  2022-02-16 11:00       ` Christian König
  2022-02-16 11:09       ` Somalapuram, Amaranath
  0 siblings, 2 replies; 21+ messages in thread
From: Lazar, Lijo @ 2022-02-16 10:43 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma



On 2/16/2022 4:04 PM, Somalapuram, Amaranath wrote:
> 
> On 2/16/2022 3:45 PM, Lazar, Lijo wrote:
>>
>>
>> On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
>>> List of register populated for dump collection during the GPU reset.
>>>
>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++
>>>   2 files changed, 100 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b85b67a88a3d..57965316873b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>         struct amdgpu_reset_control     *reset_cntl;
>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>> +
>>> +    /* reset dump register */
>>> +    uint32_t            *reset_dump_reg_list;
>>> +    int                             n_regs;
>>> +    struct mutex            reset_dump_mutex;
>>>   };
>>>     static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>>> *ddev)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>> +                char __user *buf, size_t size, loff_t *pos)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>> *)file_inode(f)->i_private;
>>> +    char reg_offset[11];
>>> +    int i, r, len = 0;
>>> +
>>> +    if (*pos)
>>> +        return 0;
>>> +
>>> +    if (adev->n_regs == 0)
>>> +        return 0;
>>> +
>>> +    for (i = 0; i < adev->n_regs; i++) {
>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>> +
>>> +        if (r)
>>> +            return -EFAULT;
>>> +
>>> +        len += strlen(reg_offset);
>>> +    }
>>> +
>>> +    r = copy_to_user(buf + len, "\n", 1);
>>> +
>>> +    if (r)
>>> +        return -EFAULT;
>>> +
>>> +    len++;
>>> +    *pos += len;
>>> +
>>> +    return len;
>>> +}
>>> +
>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>> +            const char __user *buf, size_t size, loff_t *pos)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>> *)file_inode(f)->i_private;
>>> +    char *reg_offset, *reg, reg_temp[11];
>>> +    static int alloc_count;
>>
>> This being static what happens when it is called on a second device?
>>
>> Thanks,
>> Lijo
>>
> I tried to avoid adding to adev. It wont work for multiple devices.

Hmm.This is not friendly for single device also. Some one could just 
parse a text file of reg offsets and do
	sudo echo offset > file

This will overwrite whatever is there. Instead you may define a syntax like	
	sudo echo 0x000 > file =>  Clears all
	sudo echo offset > file => Append to the existing set.

Taking all offsets in one go may not be needed.

Thanks,
Lijo

>>> +    int ret, i = 0, len = 0;
>>> +
>>> +    do {
>>> +        reg_offset = reg_temp;
>>> +        memset(reg_offset,  0, 11);
>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>> ((int)size-len)));
>>> +
>>> +        if (ret)
>>> +            goto failed;
>>> +
>>> +        reg = strsep(&reg_offset, " ");
>>> +
>>> +        if (alloc_count <= i) {
>>> +            adev->reset_dump_reg_list =  krealloc_array(
>>> +                            adev->reset_dump_reg_list, 1,
>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>> +            alloc_count++;
>>> +        }
>>> +
>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>> +
>>> +        if (ret)
>>> +            goto failed;
>>> +
>>> +        len += strlen(reg) + 1;
>>> +        i++;
>>> +
>>> +    } while (len < size);
>>> +
>>> +    adev->n_regs = i;
>>> +
>>> +    return size;
>>> +
>>> +failed:
>>> +    mutex_lock(&adev->reset_dump_mutex);
>>> +    kfree(adev->reset_dump_reg_list);
>>> +    adev->reset_dump_reg_list = NULL;
>>> +    alloc_count = 0;
>>> +    adev->n_regs = 0;
>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>> +    return -EFAULT;
>>> +}
>>> +
>>> +
>>> +
>>> +static const struct file_operations amdgpu_reset_dump_register_list = {
>>> +    .owner = THIS_MODULE,
>>> +    .read = amdgpu_reset_dump_register_list_read,
>>> +    .write = amdgpu_reset_dump_register_list_write,
>>> +    .llseek = default_llseek
>>> +};
>>> +
>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>   {
>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>> *adev)
>>>       if (!debugfs_initialized())
>>>           return 0;
>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>                     &fops_ib_preempt);
>>>       if (IS_ERR(ent)) {
>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>> *adev)
>>>                   &amdgpu_debugfs_test_ib_fops);
>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>                   &amdgpu_debugfs_vm_info_fops);
>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>> root, adev,
>>> +                &amdgpu_reset_dump_register_list);
>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 10:43     ` Lazar, Lijo
@ 2022-02-16 11:00       ` Christian König
  2022-02-16 11:09       ` Somalapuram, Amaranath
  1 sibling, 0 replies; 21+ messages in thread
From: Christian König @ 2022-02-16 11:00 UTC (permalink / raw)
  To: Lazar, Lijo, Somalapuram, Amaranath, Somalapuram Amaranath,
	amd-gfx
  Cc: alexander.deucher, shashank.sharma

Am 16.02.22 um 11:43 schrieb Lazar, Lijo:
>
>
> On 2/16/2022 4:04 PM, Somalapuram, Amaranath wrote:
>>
>> On 2/16/2022 3:45 PM, Lazar, Lijo wrote:
>>>
>>>
>>> On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
>>>> List of register populated for dump collection during the GPU reset.
>>>>
>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>> +++++++++++++++++++++
>>>>   2 files changed, 100 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index b85b67a88a3d..57965316873b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>> +
>>>> +    /* reset dump register */
>>>> +    uint32_t            *reset_dump_reg_list;
>>>> +    int                             n_regs;
>>>> +    struct mutex            reset_dump_mutex;
>>>>   };
>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>> drm_device *ddev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char reg_offset[11];
>>>> +    int i, r, len = 0;
>>>> +
>>>> +    if (*pos)
>>>> +        return 0;
>>>> +
>>>> +    if (adev->n_regs == 0)
>>>> +        return 0;
>>>> +
>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>>> +
>>>> +        if (r)
>>>> +            return -EFAULT;
>>>> +
>>>> +        len += strlen(reg_offset);
>>>> +    }
>>>> +
>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>> +
>>>> +    if (r)
>>>> +        return -EFAULT;
>>>> +
>>>> +    len++;
>>>> +    *pos += len;
>>>> +
>>>> +    return len;
>>>> +}
>>>> +
>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>> +    static int alloc_count;
>>>
>>> This being static what happens when it is called on a second device?

Good point. I've totally missed that.

>>>
>>> Thanks,
>>> Lijo
>>>
>> I tried to avoid adding to adev. It wont work for multiple devices.
>
> Hmm.This is not friendly for single device also. Some one could just 
> parse a text file of reg offsets and do
>     sudo echo offset > file
>
> This will overwrite whatever is there. Instead you may define a syntax 
> like
>     sudo echo 0x000 > file =>  Clears all
>     sudo echo offset > file => Append to the existing set.
>
> Taking all offsets in one go may not be needed.

Yes, completely agree. Now that you mention it the code here is severely 
broken in quite a number of ways, but see below.

>
> Thanks,
> Lijo
>
>>>> +    int ret, i = 0, len = 0;
>>>> +
>>>> +    do {
>>>> +        reg_offset = reg_temp;
>>>> +        memset(reg_offset,  0, 11);
>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>> ((int)size-len)));
>>>> +
>>>> +        if (ret)
>>>> +            goto failed;
>>>> +
>>>> +        reg = strsep(&reg_offset, " ");
>>>> +
>>>> +        if (alloc_count <= i) {

The alloc_count variable and this check is completely unnecessary.

>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>> +                            adev->reset_dump_reg_list, 1,

The memory management knows internally how much memory is allocated and 
when the buffer needs to be re-allocated for grow.

So all you need to do is is "array = krealloc_array(array, i, 
sizeof(*array), GFP_KERNEL);"

Regards,
Christian.


>>>> + sizeof(uint32_t), GFP_KERNEL);
>>>> +            alloc_count++;
>>>> +        }
>>>> +
>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>> +
>>>> +        if (ret)
>>>> +            goto failed;
>>>> +
>>>> +        len += strlen(reg) + 1;
>>>> +        i++;
>>>> +
>>>> +    } while (len < size);
>>>> +
>>>> +    adev->n_regs = i;
>>>> +
>>>> +    return size;
>>>> +
>>>> +failed:
>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>> +    kfree(adev->reset_dump_reg_list);
>>>> +    adev->reset_dump_reg_list = NULL;
>>>> +    alloc_count = 0;
>>>> +    adev->n_regs = 0;
>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>> +    return -EFAULT;
>>>> +}
>>>> +
>>>> +
>>>> +
>>>> +static const struct file_operations 
>>>> amdgpu_reset_dump_register_list = {
>>>> +    .owner = THIS_MODULE,
>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>> +    .llseek = default_llseek
>>>> +};
>>>> +
>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>   {
>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>> *adev)
>>>>       if (!debugfs_initialized())
>>>>           return 0;
>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>>                     &fops_ib_preempt);
>>>>       if (IS_ERR(ent)) {
>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>> *adev)
>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>> root, adev,
>>>> +                &amdgpu_reset_dump_register_list);
>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 10:43     ` Lazar, Lijo
  2022-02-16 11:00       ` Christian König
@ 2022-02-16 11:09       ` Somalapuram, Amaranath
  2022-02-16 13:17         ` Lazar, Lijo
  1 sibling, 1 reply; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-16 11:09 UTC (permalink / raw)
  To: Lazar, Lijo, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma


On 2/16/2022 4:13 PM, Lazar, Lijo wrote:
>
>
> On 2/16/2022 4:04 PM, Somalapuram, Amaranath wrote:
>>
>> On 2/16/2022 3:45 PM, Lazar, Lijo wrote:
>>>
>>>
>>> On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
>>>> List of register populated for dump collection during the GPU reset.
>>>>
>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>> +++++++++++++++++++++
>>>>   2 files changed, 100 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index b85b67a88a3d..57965316873b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>> +
>>>> +    /* reset dump register */
>>>> +    uint32_t            *reset_dump_reg_list;
>>>> +    int                             n_regs;
>>>> +    struct mutex            reset_dump_mutex;
>>>>   };
>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>> drm_device *ddev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char reg_offset[11];
>>>> +    int i, r, len = 0;
>>>> +
>>>> +    if (*pos)
>>>> +        return 0;
>>>> +
>>>> +    if (adev->n_regs == 0)
>>>> +        return 0;
>>>> +
>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>>> +
>>>> +        if (r)
>>>> +            return -EFAULT;
>>>> +
>>>> +        len += strlen(reg_offset);
>>>> +    }
>>>> +
>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>> +
>>>> +    if (r)
>>>> +        return -EFAULT;
>>>> +
>>>> +    len++;
>>>> +    *pos += len;
>>>> +
>>>> +    return len;
>>>> +}
>>>> +
>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>> +    static int alloc_count;
>>>
>>> This being static what happens when it is called on a second device?
>>>
>>> Thanks,
>>> Lijo
>>>
>> I tried to avoid adding to adev. It wont work for multiple devices.
>
> Hmm.This is not friendly for single device also. Some one could just 
> parse a text file of reg offsets and do
>     sudo echo offset > file
>
> This will overwrite whatever is there. Instead you may define a syntax 
> like
>     sudo echo 0x000 > file =>  Clears all
>     sudo echo offset > file => Append to the existing set.
>
> Taking all offsets in one go may not be needed.
>
> Thanks,
> Lijo
>

0x000 can be offset for some registers !
This is application responsibly; any wrong data should clear the list.
Application can read back the list for confomation.

Regards,
S.Amarnath
>>>> +    int ret, i = 0, len = 0;
>>>> +
>>>> +    do {
>>>> +        reg_offset = reg_temp;
>>>> +        memset(reg_offset,  0, 11);
>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>> ((int)size-len)));
>>>> +
>>>> +        if (ret)
>>>> +            goto failed;
>>>> +
>>>> +        reg = strsep(&reg_offset, " ");
>>>> +
>>>> +        if (alloc_count <= i) {
>>>> +            adev->reset_dump_reg_list =  krealloc_array(
>>>> +                            adev->reset_dump_reg_list, 1,
>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>> +            alloc_count++;
>>>> +        }
>>>> +
>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>> +
>>>> +        if (ret)
>>>> +            goto failed;
>>>> +
>>>> +        len += strlen(reg) + 1;
>>>> +        i++;
>>>> +
>>>> +    } while (len < size);
>>>> +
>>>> +    adev->n_regs = i;
>>>> +
>>>> +    return size;
>>>> +
>>>> +failed:
>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>> +    kfree(adev->reset_dump_reg_list);
>>>> +    adev->reset_dump_reg_list = NULL;
>>>> +    alloc_count = 0;
>>>> +    adev->n_regs = 0;
>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>> +    return -EFAULT;
>>>> +}
>>>> +
>>>> +
>>>> +
>>>> +static const struct file_operations 
>>>> amdgpu_reset_dump_register_list = {
>>>> +    .owner = THIS_MODULE,
>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>> +    .llseek = default_llseek
>>>> +};
>>>> +
>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>   {
>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>> *adev)
>>>>       if (!debugfs_initialized())
>>>>           return 0;
>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>>                     &fops_ib_preempt);
>>>>       if (IS_ERR(ent)) {
>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>> *adev)
>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>> root, adev,
>>>> +                &amdgpu_reset_dump_register_list);
>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 10:11 ` [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
@ 2022-02-16 13:11   ` Somalapuram, Amaranath
  2022-02-16 14:56     ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-16 13:11 UTC (permalink / raw)
  To: Christian König, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma

[-- Attachment #1: Type: text/plain, Size: 6989 bytes --]


On 2/16/2022 3:41 PM, Christian König wrote:
> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>> List of register populated for dump collection during the GPU reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++
>>   2 files changed, 100 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b85b67a88a3d..57965316873b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>         struct amdgpu_reset_control     *reset_cntl;
>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>> +
>> +    /* reset dump register */
>> +    uint32_t            *reset_dump_reg_list;
>> +    int                             n_regs;
>> +    struct mutex            reset_dump_mutex;
>
> I think we should rather use the reset lock for this instead of 
> introducing just another mutex.
>
>>   };
>>     static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>> *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 164d6a9e9fbb..faf985c7cb93 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>> +                char __user *buf, size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char reg_offset[11];
>> +    int i, r, len = 0;
>> +
>> +    if (*pos)
>> +        return 0;
>> +
>> +    if (adev->n_regs == 0)
>> +        return 0;
>> +
>> +    for (i = 0; i < adev->n_regs; i++) {
>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>> +
>> +        if (r)
>> +            return -EFAULT;
>> +
>> +        len += strlen(reg_offset);
>> +    }
>
> You need to hold the lock protecting adev->reset_dump_reg_list and 
> adev->n_regs while accessing those.
>
> (BTW: num_regs instead of n_regs would match more what we use 
> elsewhere, but is not a must have).
>
This is read function for user and returns only list of reg offsets, I 
did not understand correctly !
>> +
>> +    r = copy_to_user(buf + len, "\n", 1);
>> +
>> +    if (r)
>> +        return -EFAULT;
>> +
>> +    len++;
>> +    *pos += len;
>> +
>> +    return len;
>> +}
>> +
>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>> +            const char __user *buf, size_t size, loff_t *pos)
>> +{
>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>> *)file_inode(f)->i_private;
>> +    char *reg_offset, *reg, reg_temp[11];
>> +    static int alloc_count;
>> +    int ret, i = 0, len = 0;
>> +
>> +    do {
>> +        reg_offset = reg_temp;
>> +        memset(reg_offset,  0, 11);
>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>> ((int)size-len)));
>> +
>> +        if (ret)
>> +            goto failed;
>> +
>> +        reg = strsep(&reg_offset, " ");
>> +
>> +        if (alloc_count <= i) {
>
>> +            adev->reset_dump_reg_list =  krealloc_array(
>> +                            adev->reset_dump_reg_list, 1,
>> +                            sizeof(uint32_t), GFP_KERNEL);
>> +            alloc_count++;
>> +        }
>> +
>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>
> This here is modifying adev->reset_dump_reg_list as well and so must 
> be protected by a lock as well.
>
> The tricky part is that we can't allocate memory while holding this 
> lock (because we need it during reset as well).
>
> One solution for this is to read the register list into a local array 
> first and when that's done swap the local array with the one in 
> adev->reset_dump_reg_list while holding the lock.
>
> Regards,
> Christian.
>
There are 2 situations:
1st time creating list n_regs will be 0 and trace event will not be 
triggered
2nd time while updating list n_regs is already set and 
adev->reset_dump_reg_list will have some offsets address (hypothetically 
speaking *during reset + update* read values from RREG32 will mix up of 
old list and new list)
its only critical when its freed and n_regs is not 0

Regards,
S.Amarnath
>> +
>> +        if (ret)
>> +            goto failed;
>> +
>> +        len += strlen(reg) + 1;
>> +        i++;
>> +
>> +    } while (len < size);
>> +
>> +    adev->n_regs = i;
>> +
>> +    return size;
>> +
>> +failed:
>> +    mutex_lock(&adev->reset_dump_mutex);
>> +    kfree(adev->reset_dump_reg_list);
>> +    adev->reset_dump_reg_list = NULL;
>> +    alloc_count = 0;
>> +    adev->n_regs = 0;
>> +    mutex_unlock(&adev->reset_dump_mutex);
>> +    return -EFAULT;
>> +}
>> +
>> +
>> +
>> +static const struct file_operations amdgpu_reset_dump_register_list = {
>> +    .owner = THIS_MODULE,
>> +    .read = amdgpu_reset_dump_register_list_read,
>> +    .write = amdgpu_reset_dump_register_list_write,
>> +    .llseek = default_llseek
>> +};
>> +
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>   {
>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>> *adev)
>>       if (!debugfs_initialized())
>>           return 0;
>>   +    mutex_init(&adev->reset_dump_mutex);
>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>                     &fops_ib_preempt);
>>       if (IS_ERR(ent)) {
>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>> *adev)
>>                   &amdgpu_debugfs_test_ib_fops);
>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>                   &amdgpu_debugfs_vm_info_fops);
>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>> root, adev,
>> +                &amdgpu_reset_dump_register_list);
>>         adev->debugfs_vbios_blob.data = adev->bios;
>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>

[-- Attachment #2: Type: text/html, Size: 13448 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 11:09       ` Somalapuram, Amaranath
@ 2022-02-16 13:17         ` Lazar, Lijo
  2022-02-16 13:55           ` Somalapuram, Amaranath
  0 siblings, 1 reply; 21+ messages in thread
From: Lazar, Lijo @ 2022-02-16 13:17 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma



On 2/16/2022 4:39 PM, Somalapuram, Amaranath wrote:
> 
> On 2/16/2022 4:13 PM, Lazar, Lijo wrote:
>>
>>
>> On 2/16/2022 4:04 PM, Somalapuram, Amaranath wrote:
>>>
>>> On 2/16/2022 3:45 PM, Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
>>>>> List of register populated for dump collection during the GPU reset.
>>>>>
>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>> +++++++++++++++++++++
>>>>>   2 files changed, 100 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index b85b67a88a3d..57965316873b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>> +
>>>>> +    /* reset dump register */
>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>> +    int                             n_regs;
>>>>> +    struct mutex            reset_dump_mutex;
>>>>>   };
>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>> drm_device *ddev)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>> *)file_inode(f)->i_private;
>>>>> +    char reg_offset[11];
>>>>> +    int i, r, len = 0;
>>>>> +
>>>>> +    if (*pos)
>>>>> +        return 0;
>>>>> +
>>>>> +    if (adev->n_regs == 0)
>>>>> +        return 0;
>>>>> +
>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>>>> +
>>>>> +        if (r)
>>>>> +            return -EFAULT;
>>>>> +
>>>>> +        len += strlen(reg_offset);
>>>>> +    }
>>>>> +
>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>> +
>>>>> +    if (r)
>>>>> +        return -EFAULT;
>>>>> +
>>>>> +    len++;
>>>>> +    *pos += len;
>>>>> +
>>>>> +    return len;
>>>>> +}
>>>>> +
>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>> *)file_inode(f)->i_private;
>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>> +    static int alloc_count;
>>>>
>>>> This being static what happens when it is called on a second device?
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>> I tried to avoid adding to adev. It wont work for multiple devices.
>>
>> Hmm.This is not friendly for single device also. Some one could just 
>> parse a text file of reg offsets and do
>>     sudo echo offset > file
>>
>> This will overwrite whatever is there. Instead you may define a syntax 
>> like
>>     sudo echo 0x000 > file =>  Clears all
>>     sudo echo offset > file => Append to the existing set.
>>
>> Taking all offsets in one go may not be needed.
>>
>> Thanks,
>> Lijo
>>
> 
> 0x000 can be offset for some registers !

It's indeed a valid register called MM_INDEX register. The register 
doesn't have any meaning in standalone.

> This is application responsibly; any wrong data should clear the list.
> Application can read back the list for confomation.
> 

It needs to be done by user app anyway. This is more about how 
convenient the interface is. Probably you could switch to a user 
standpoint and try to add some 20-30 registers to the list. Then steps 
needed to add a revised list.

Thanks,
Lijo

> Regards,
> S.Amarnath
>>>>> +    int ret, i = 0, len = 0;
>>>>> +
>>>>> +    do {
>>>>> +        reg_offset = reg_temp;
>>>>> +        memset(reg_offset,  0, 11);
>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>> ((int)size-len)));
>>>>> +
>>>>> +        if (ret)
>>>>> +            goto failed;
>>>>> +
>>>>> +        reg = strsep(&reg_offset, " ");
>>>>> +
>>>>> +        if (alloc_count <= i) {
>>>>> +            adev->reset_dump_reg_list =  krealloc_array(
>>>>> +                            adev->reset_dump_reg_list, 1,
>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>> +            alloc_count++;
>>>>> +        }
>>>>> +
>>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>> +
>>>>> +        if (ret)
>>>>> +            goto failed;
>>>>> +
>>>>> +        len += strlen(reg) + 1;
>>>>> +        i++;
>>>>> +
>>>>> +    } while (len < size);
>>>>> +
>>>>> +    adev->n_regs = i;
>>>>> +
>>>>> +    return size;
>>>>> +
>>>>> +failed:
>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>> +    alloc_count = 0;
>>>>> +    adev->n_regs = 0;
>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>> +    return -EFAULT;
>>>>> +}
>>>>> +
>>>>> +
>>>>> +
>>>>> +static const struct file_operations 
>>>>> amdgpu_reset_dump_register_list = {
>>>>> +    .owner = THIS_MODULE,
>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>> +    .llseek = default_llseek
>>>>> +};
>>>>> +
>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>   {
>>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>> *adev)
>>>>>       if (!debugfs_initialized())
>>>>>           return 0;
>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>>>                     &fops_ib_preempt);
>>>>>       if (IS_ERR(ent)) {
>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>> *adev)
>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>> root, adev,
>>>>> +                &amdgpu_reset_dump_register_list);
>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 13:17         ` Lazar, Lijo
@ 2022-02-16 13:55           ` Somalapuram, Amaranath
  2022-02-16 14:53             ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-16 13:55 UTC (permalink / raw)
  To: Lazar, Lijo, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, christian.koenig, shashank.sharma


On 2/16/2022 6:47 PM, Lazar, Lijo wrote:
>
>
> On 2/16/2022 4:39 PM, Somalapuram, Amaranath wrote:
>>
>> On 2/16/2022 4:13 PM, Lazar, Lijo wrote:
>>>
>>>
>>> On 2/16/2022 4:04 PM, Somalapuram, Amaranath wrote:
>>>>
>>>> On 2/16/2022 3:45 PM, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
>>>>>> List of register populated for dump collection during the GPU reset.
>>>>>>
>>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>>> +++++++++++++++++++++
>>>>>>   2 files changed, 100 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> index b85b67a88a3d..57965316873b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>> +
>>>>>> +    /* reset dump register */
>>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>>> +    int                             n_regs;
>>>>>> +    struct mutex            reset_dump_mutex;
>>>>>>   };
>>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>> drm_device *ddev)
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
>>>>>> NULL,
>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct 
>>>>>> file *f,
>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>> +{
>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>> *)file_inode(f)->i_private;
>>>>>> +    char reg_offset[11];
>>>>>> +    int i, r, len = 0;
>>>>>> +
>>>>>> +    if (*pos)
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    if (adev->n_regs == 0)
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>>>>> +        r = copy_to_user(buf + len, reg_offset, 
>>>>>> strlen(reg_offset));
>>>>>> +
>>>>>> +        if (r)
>>>>>> +            return -EFAULT;
>>>>>> +
>>>>>> +        len += strlen(reg_offset);
>>>>>> +    }
>>>>>> +
>>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>>> +
>>>>>> +    if (r)
>>>>>> +        return -EFAULT;
>>>>>> +
>>>>>> +    len++;
>>>>>> +    *pos += len;
>>>>>> +
>>>>>> +    return len;
>>>>>> +}
>>>>>> +
>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file 
>>>>>> *f,
>>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>>> +{
>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>> *)file_inode(f)->i_private;
>>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>>> +    static int alloc_count;
>>>>>
>>>>> This being static what happens when it is called on a second device?
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>> I tried to avoid adding to adev. It wont work for multiple devices.
>>>
>>> Hmm.This is not friendly for single device also. Some one could just 
>>> parse a text file of reg offsets and do
>>>     sudo echo offset > file
>>>
>>> This will overwrite whatever is there. Instead you may define a 
>>> syntax like
>>>     sudo echo 0x000 > file =>  Clears all
>>>     sudo echo offset > file => Append to the existing set.
>>>
>>> Taking all offsets in one go may not be needed.
>>>
>>> Thanks,
>>> Lijo
>>>
>>
>> 0x000 can be offset for some registers !
>
> It's indeed a valid register called MM_INDEX register. The register 
> doesn't have any meaning in standalone.
>
>> This is application responsibly; any wrong data should clear the list.
>> Application can read back the list for confomation.
>>
>
> It needs to be done by user app anyway. This is more about how 
> convenient the interface is. Probably you could switch to a user 
> standpoint and try to add some 20-30 registers to the list. Then steps 
> needed to add a revised list.
>

For clear we can send text “clear”
On next write should we replace or append ? (I think with "clear" append 
is better option)

Christian which is better ?

Regards,
S.Amarnath
> Thanks,
> Lijo
>
>> Regards,
>> S.Amarnath
>>>>>> +    int ret, i = 0, len = 0;
>>>>>> +
>>>>>> +    do {
>>>>>> +        reg_offset = reg_temp;
>>>>>> +        memset(reg_offset,  0, 11);
>>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>>> ((int)size-len)));
>>>>>> +
>>>>>> +        if (ret)
>>>>>> +            goto failed;
>>>>>> +
>>>>>> +        reg = strsep(&reg_offset, " ");
>>>>>> +
>>>>>> +        if (alloc_count <= i) {
>>>>>> +            adev->reset_dump_reg_list = krealloc_array(
>>>>>> + adev->reset_dump_reg_list, 1,
>>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>>> +            alloc_count++;
>>>>>> +        }
>>>>>> +
>>>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>>> +
>>>>>> +        if (ret)
>>>>>> +            goto failed;
>>>>>> +
>>>>>> +        len += strlen(reg) + 1;
>>>>>> +        i++;
>>>>>> +
>>>>>> +    } while (len < size);
>>>>>> +
>>>>>> +    adev->n_regs = i;
>>>>>> +
>>>>>> +    return size;
>>>>>> +
>>>>>> +failed:
>>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>>> +    alloc_count = 0;
>>>>>> +    adev->n_regs = 0;
>>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>>> +    return -EFAULT;
>>>>>> +}
>>>>>> +
>>>>>> +
>>>>>> +
>>>>>> +static const struct file_operations 
>>>>>> amdgpu_reset_dump_register_list = {
>>>>>> +    .owner = THIS_MODULE,
>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>> +    .llseek = default_llseek
>>>>>> +};
>>>>>> +
>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>   {
>>>>>>       struct dentry *root = 
>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct 
>>>>>> amdgpu_device *adev)
>>>>>>       if (!debugfs_initialized())
>>>>>>           return 0;
>>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, 
>>>>>> adev,
>>>>>>                     &fops_ib_preempt);
>>>>>>       if (IS_ERR(ent)) {
>>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct 
>>>>>> amdgpu_device *adev)
>>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>> root, adev,
>>>>>> +                &amdgpu_reset_dump_register_list);
>>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 13:55           ` Somalapuram, Amaranath
@ 2022-02-16 14:53             ` Christian König
  0 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2022-02-16 14:53 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Lazar, Lijo, Somalapuram Amaranath,
	amd-gfx
  Cc: alexander.deucher, shashank.sharma

Am 16.02.22 um 14:55 schrieb Somalapuram, Amaranath:
>
> On 2/16/2022 6:47 PM, Lazar, Lijo wrote:
>>
>>
>> On 2/16/2022 4:39 PM, Somalapuram, Amaranath wrote:
>>>
>>> On 2/16/2022 4:13 PM, Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 2/16/2022 4:04 PM, Somalapuram, Amaranath wrote:
>>>>>
>>>>> On 2/16/2022 3:45 PM, Lazar, Lijo wrote:
>>>>>>
>>>>>>
>>>>>> On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:
>>>>>>> List of register populated for dump collection during the GPU 
>>>>>>> reset.
>>>>>>>
>>>>>>> Signed-off-by: Somalapuram Amaranath 
>>>>>>> <Amaranath.Somalapuram@amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>>>> +++++++++++++++++++++
>>>>>>>   2 files changed, 100 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> index b85b67a88a3d..57965316873b 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>>> +
>>>>>>> +    /* reset dump register */
>>>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>>>> +    int                             n_regs;
>>>>>>> +    struct mutex            reset_dump_mutex;
>>>>>>>   };
>>>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>>> drm_device *ddev)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
>>>>>>> NULL,
>>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct 
>>>>>>> file *f,
>>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>>> +{
>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>> *)file_inode(f)->i_private;
>>>>>>> +    char reg_offset[11];
>>>>>>> +    int i, r, len = 0;
>>>>>>> +
>>>>>>> +    if (*pos)
>>>>>>> +        return 0;
>>>>>>> +
>>>>>>> +    if (adev->n_regs == 0)
>>>>>>> +        return 0;
>>>>>>> +
>>>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>>>> +        sprintf(reg_offset, "0x%x ", 
>>>>>>> adev->reset_dump_reg_list[i]);
>>>>>>> +        r = copy_to_user(buf + len, reg_offset, 
>>>>>>> strlen(reg_offset));
>>>>>>> +
>>>>>>> +        if (r)
>>>>>>> +            return -EFAULT;
>>>>>>> +
>>>>>>> +        len += strlen(reg_offset);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>>>> +
>>>>>>> +    if (r)
>>>>>>> +        return -EFAULT;
>>>>>>> +
>>>>>>> +    len++;
>>>>>>> +    *pos += len;
>>>>>>> +
>>>>>>> +    return len;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct 
>>>>>>> file *f,
>>>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>>>> +{
>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>> *)file_inode(f)->i_private;
>>>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>>>> +    static int alloc_count;
>>>>>>
>>>>>> This being static what happens when it is called on a second device?
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>> I tried to avoid adding to adev. It wont work for multiple devices.
>>>>
>>>> Hmm.This is not friendly for single device also. Some one could 
>>>> just parse a text file of reg offsets and do
>>>>     sudo echo offset > file
>>>>
>>>> This will overwrite whatever is there. Instead you may define a 
>>>> syntax like
>>>>     sudo echo 0x000 > file =>  Clears all
>>>>     sudo echo offset > file => Append to the existing set.
>>>>
>>>> Taking all offsets in one go may not be needed.
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>
>>> 0x000 can be offset for some registers !
>>
>> It's indeed a valid register called MM_INDEX register. The register 
>> doesn't have any meaning in standalone.
>>
>>> This is application responsibly; any wrong data should clear the list.
>>> Application can read back the list for confomation.
>>>
>>
>> It needs to be done by user app anyway. This is more about how 
>> convenient the interface is. Probably you could switch to a user 
>> standpoint and try to add some 20-30 registers to the list. Then 
>> steps needed to add a revised list.
>>
>
> For clear we can send text “clear”
> On next write should we replace or append ? (I think with "clear" 
> append is better option)
>
> Christian which is better ?

I think your original idea to just always set the full list sounds 
better to me.

As I said before I think the best approach is to parse the application 
input into a full list inside the kernel first and then replace 
everything in just one go.

Clearing the list of registers to dump can then easily be done with just 
echo > file

Regards,
Christian.

>
> Regards,
> S.Amarnath
>> Thanks,
>> Lijo
>>
>>> Regards,
>>> S.Amarnath
>>>>>>> +    int ret, i = 0, len = 0;
>>>>>>> +
>>>>>>> +    do {
>>>>>>> +        reg_offset = reg_temp;
>>>>>>> +        memset(reg_offset,  0, 11);
>>>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>>>> ((int)size-len)));
>>>>>>> +
>>>>>>> +        if (ret)
>>>>>>> +            goto failed;
>>>>>>> +
>>>>>>> +        reg = strsep(&reg_offset, " ");
>>>>>>> +
>>>>>>> +        if (alloc_count <= i) {
>>>>>>> +            adev->reset_dump_reg_list = krealloc_array(
>>>>>>> + adev->reset_dump_reg_list, 1,
>>>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>>>> +            alloc_count++;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>>>> +
>>>>>>> +        if (ret)
>>>>>>> +            goto failed;
>>>>>>> +
>>>>>>> +        len += strlen(reg) + 1;
>>>>>>> +        i++;
>>>>>>> +
>>>>>>> +    } while (len < size);
>>>>>>> +
>>>>>>> +    adev->n_regs = i;
>>>>>>> +
>>>>>>> +    return size;
>>>>>>> +
>>>>>>> +failed:
>>>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>>>> +    alloc_count = 0;
>>>>>>> +    adev->n_regs = 0;
>>>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>>>> +    return -EFAULT;
>>>>>>> +}
>>>>>>> +
>>>>>>> +
>>>>>>> +
>>>>>>> +static const struct file_operations 
>>>>>>> amdgpu_reset_dump_register_list = {
>>>>>>> +    .owner = THIS_MODULE,
>>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>>> +    .llseek = default_llseek
>>>>>>> +};
>>>>>>> +
>>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       struct dentry *root = 
>>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>       if (!debugfs_initialized())
>>>>>>>           return 0;
>>>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, 
>>>>>>> adev,
>>>>>>>                     &fops_ib_preempt);
>>>>>>>       if (IS_ERR(ent)) {
>>>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>>> root, adev,
>>>>>>> + &amdgpu_reset_dump_register_list);
>>>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 13:11   ` Somalapuram, Amaranath
@ 2022-02-16 14:56     ` Christian König
  2022-02-17  7:54       ` Somalapuram, Amaranath
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-16 14:56 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma

[-- Attachment #1: Type: text/plain, Size: 7599 bytes --]

Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>
> On 2/16/2022 3:41 PM, Christian König wrote:
>
>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>> List of register populated for dump collection during the GPU reset.
>>>
>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>> +++++++++++++++++++++
>>>   2 files changed, 100 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b85b67a88a3d..57965316873b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>         struct amdgpu_reset_control     *reset_cntl;
>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>> +
>>> +    /* reset dump register */
>>> +    uint32_t            *reset_dump_reg_list;
>>> +    int                             n_regs;
>>> +    struct mutex            reset_dump_mutex;
>>
>> I think we should rather use the reset lock for this instead of 
>> introducing just another mutex.
>>
>>>   };
>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>> drm_device *ddev)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>> +                char __user *buf, size_t size, loff_t *pos)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>> *)file_inode(f)->i_private;
>>> +    char reg_offset[11];
>>> +    int i, r, len = 0;
>>> +
>>> +    if (*pos)
>>> +        return 0;
>>> +
>>> +    if (adev->n_regs == 0)
>>> +        return 0;
>>> +
>>> +    for (i = 0; i < adev->n_regs; i++) {
>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>> +
>>> +        if (r)
>>> +            return -EFAULT;
>>> +
>>> +        len += strlen(reg_offset);
>>> +    }
>>
>> You need to hold the lock protecting adev->reset_dump_reg_list and 
>> adev->n_regs while accessing those.
>>
>> (BTW: num_regs instead of n_regs would match more what we use 
>> elsewhere, but is not a must have).
>>
> This is read function for user and returns only list of reg offsets, I 
> did not understand correctly !
>>> +
>>> +    r = copy_to_user(buf + len, "\n", 1);
>>> +
>>> +    if (r)
>>> +        return -EFAULT;
>>> +
>>> +    len++;
>>> +    *pos += len;
>>> +
>>> +    return len;
>>> +}
>>> +
>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>> +            const char __user *buf, size_t size, loff_t *pos)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>> *)file_inode(f)->i_private;
>>> +    char *reg_offset, *reg, reg_temp[11];
>>> +    static int alloc_count;
>>> +    int ret, i = 0, len = 0;
>>> +
>>> +    do {
>>> +        reg_offset = reg_temp;
>>> +        memset(reg_offset,  0, 11);
>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>> ((int)size-len)));
>>> +
>>> +        if (ret)
>>> +            goto failed;
>>> +
>>> +        reg = strsep(&reg_offset, " ");
>>> +
>>> +        if (alloc_count <= i) {
>>
>>> + adev->reset_dump_reg_list =  krealloc_array(
>>> +                            adev->reset_dump_reg_list, 1,
>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>> +            alloc_count++;
>>> +        }
>>> +
>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>
>> This here is modifying adev->reset_dump_reg_list as well and so must 
>> be protected by a lock as well.
>>
>> The tricky part is that we can't allocate memory while holding this 
>> lock (because we need it during reset as well).
>>
>> One solution for this is to read the register list into a local array 
>> first and when that's done swap the local array with the one in 
>> adev->reset_dump_reg_list while holding the lock.
>>
>> Regards,
>> Christian.
>>
> There are 2 situations:
> 1st time creating list n_regs will be 0 and trace event will not be 
> triggered
> 2nd time while updating list n_regs is already set and 
> adev->reset_dump_reg_list will have some offsets address 
> (hypothetically speaking *during reset + update* read values from 
> RREG32 will mix up of old list and new list)
> its only critical when its freed and n_regs is not 0

No, that won't work like this. See you *must* always hold a lock when 
reading or writing the array.

Otherwise it is perfectly possible that one thread sees only halve of 
the updates of another thread.

The only alternative would be RCU, atomic replace and manual barrier 
handling, but that would be complete overkill for that feature.

Regards,
Christian.

>
> Regards,
> S.Amarnath
>>> +
>>> +        if (ret)
>>> +            goto failed;
>>> +
>>> +        len += strlen(reg) + 1;
>>> +        i++;
>>> +
>>> +    } while (len < size);
>>> +
>>> +    adev->n_regs = i;
>>> +
>>> +    return size;
>>> +
>>> +failed:
>>> +    mutex_lock(&adev->reset_dump_mutex);
>>> +    kfree(adev->reset_dump_reg_list);
>>> +    adev->reset_dump_reg_list = NULL;
>>> +    alloc_count = 0;
>>> +    adev->n_regs = 0;
>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>> +    return -EFAULT;
>>> +}
>>> +
>>> +
>>> +
>>> +static const struct file_operations amdgpu_reset_dump_register_list 
>>> = {
>>> +    .owner = THIS_MODULE,
>>> +    .read = amdgpu_reset_dump_register_list_read,
>>> +    .write = amdgpu_reset_dump_register_list_write,
>>> +    .llseek = default_llseek
>>> +};
>>> +
>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>   {
>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>> *adev)
>>>       if (!debugfs_initialized())
>>>           return 0;
>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>                     &fops_ib_preempt);
>>>       if (IS_ERR(ent)) {
>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>> *adev)
>>>                   &amdgpu_debugfs_test_ib_fops);
>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>                   &amdgpu_debugfs_vm_info_fops);
>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>> root, adev,
>>> +                &amdgpu_reset_dump_register_list);
>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>

[-- Attachment #2: Type: text/html, Size: 11104 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-16 14:56     ` Christian König
@ 2022-02-17  7:54       ` Somalapuram, Amaranath
  2022-02-17  8:00         ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Somalapuram, Amaranath @ 2022-02-17  7:54 UTC (permalink / raw)
  To: Christian König, Christian König, Somalapuram Amaranath,
	amd-gfx
  Cc: alexander.deucher, shashank.sharma

[-- Attachment #1: Type: text/plain, Size: 7962 bytes --]


On 2/16/2022 8:26 PM, Christian König wrote:
> Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>>
>> On 2/16/2022 3:41 PM, Christian König wrote:
>>
>>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>>> List of register populated for dump collection during the GPU reset.
>>>>
>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>> +++++++++++++++++++++
>>>>   2 files changed, 100 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index b85b67a88a3d..57965316873b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>> +
>>>> +    /* reset dump register */
>>>> +    uint32_t            *reset_dump_reg_list;
>>>> +    int                             n_regs;
>>>> +    struct mutex            reset_dump_mutex;
>>>
>>> I think we should rather use the reset lock for this instead of 
>>> introducing just another mutex.
>>>
>>>>   };
>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>> drm_device *ddev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char reg_offset[11];
>>>> +    int i, r, len = 0;
>>>> +
>>>> +    if (*pos)
>>>> +        return 0;
>>>> +
>>>> +    if (adev->n_regs == 0)
>>>> +        return 0;
>>>> +
>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>>> +
>>>> +        if (r)
>>>> +            return -EFAULT;
>>>> +
>>>> +        len += strlen(reg_offset);
>>>> +    }
>>>
>>> You need to hold the lock protecting adev->reset_dump_reg_list and 
>>> adev->n_regs while accessing those.
>>>
>>> (BTW: num_regs instead of n_regs would match more what we use 
>>> elsewhere, but is not a must have).
>>>
>> This is read function for user and returns only list of reg offsets, 
>> I did not understand correctly !
>>>> +
>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>> +
>>>> +    if (r)
>>>> +        return -EFAULT;
>>>> +
>>>> +    len++;
>>>> +    *pos += len;
>>>> +
>>>> +    return len;
>>>> +}
>>>> +
>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>> +{
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>> *)file_inode(f)->i_private;
>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>> +    static int alloc_count;
>>>> +    int ret, i = 0, len = 0;
>>>> +
>>>> +    do {
>>>> +        reg_offset = reg_temp;
>>>> +        memset(reg_offset,  0, 11);
>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>> ((int)size-len)));
>>>> +
>>>> +        if (ret)
>>>> +            goto failed;
>>>> +
>>>> +        reg = strsep(&reg_offset, " ");
>>>> +
>>>> +        if (alloc_count <= i) {
>>>
>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>> +                            adev->reset_dump_reg_list, 1,
>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>> +            alloc_count++;
>>>> +        }
>>>> +
>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>
>>> This here is modifying adev->reset_dump_reg_list as well and so must 
>>> be protected by a lock as well.
>>>
>>> The tricky part is that we can't allocate memory while holding this 
>>> lock (because we need it during reset as well).
>>>
>>> One solution for this is to read the register list into a local 
>>> array first and when that's done swap the local array with the one 
>>> in adev->reset_dump_reg_list while holding the lock.
>>>
krealloc_array should be inside lock or outside lock? this may be problem.

Regards,

S.Amarnath

>>> Regards,
>>> Christian.
>>>
>> There are 2 situations:
>> 1st time creating list n_regs will be 0 and trace event will not be 
>> triggered
>> 2nd time while updating list n_regs is already set and 
>> adev->reset_dump_reg_list will have some offsets address 
>> (hypothetically speaking *during reset + update* read values from 
>> RREG32 will mix up of old list and new list)
>> its only critical when its freed and n_regs is not 0
>
> No, that won't work like this. See you *must* always hold a lock when 
> reading or writing the array.
>
> Otherwise it is perfectly possible that one thread sees only halve of 
> the updates of another thread.
>
> The only alternative would be RCU, atomic replace and manual barrier 
> handling, but that would be complete overkill for that feature.
>
> Regards,
> Christian.
>
>>
>> Regards,
>> S.Amarnath
>>>> +
>>>> +        if (ret)
>>>> +            goto failed;
>>>> +
>>>> +        len += strlen(reg) + 1;
>>>> +        i++;
>>>> +
>>>> +    } while (len < size);
>>>> +
>>>> +    adev->n_regs = i;
>>>> +
>>>> +    return size;
>>>> +
>>>> +failed:
>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>> +    kfree(adev->reset_dump_reg_list);
>>>> +    adev->reset_dump_reg_list = NULL;
>>>> +    alloc_count = 0;
>>>> +    adev->n_regs = 0;
>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>> +    return -EFAULT;
>>>> +}
>>>> +
>>>> +
>>>> +
>>>> +static const struct file_operations 
>>>> amdgpu_reset_dump_register_list = {
>>>> +    .owner = THIS_MODULE,
>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>> +    .llseek = default_llseek
>>>> +};
>>>> +
>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>   {
>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>> *adev)
>>>>       if (!debugfs_initialized())
>>>>           return 0;
>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>>>>                     &fops_ib_preempt);
>>>>       if (IS_ERR(ent)) {
>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>> *adev)
>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>> +    debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>> root, adev,
>>>> +                &amdgpu_reset_dump_register_list);
>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>
>

[-- Attachment #2: Type: text/html, Size: 14693 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-17  7:54       ` Somalapuram, Amaranath
@ 2022-02-17  8:00         ` Christian König
  2022-02-17  9:44           ` Lazar, Lijo
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-17  8:00 UTC (permalink / raw)
  To: Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma

[-- Attachment #1: Type: text/plain, Size: 8640 bytes --]



Am 17.02.22 um 08:54 schrieb Somalapuram, Amaranath:
>
>
> On 2/16/2022 8:26 PM, Christian König wrote:
>> Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>>>
>>> On 2/16/2022 3:41 PM, Christian König wrote:
>>>
>>>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>>>> List of register populated for dump collection during the GPU reset.
>>>>>
>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>> +++++++++++++++++++++
>>>>>   2 files changed, 100 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index b85b67a88a3d..57965316873b 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>> +
>>>>> +    /* reset dump register */
>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>> +    int                             n_regs;
>>>>> +    struct mutex            reset_dump_mutex;
>>>>
>>>> I think we should rather use the reset lock for this instead of 
>>>> introducing just another mutex.
>>>>
>>>>>   };
>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>> drm_device *ddev)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
>>>>> NULL,
>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file 
>>>>> *f,
>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>> *)file_inode(f)->i_private;
>>>>> +    char reg_offset[11];
>>>>> +    int i, r, len = 0;
>>>>> +
>>>>> +    if (*pos)
>>>>> +        return 0;
>>>>> +
>>>>> +    if (adev->n_regs == 0)
>>>>> +        return 0;
>>>>> +
>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>>>> +
>>>>> +        if (r)
>>>>> +            return -EFAULT;
>>>>> +
>>>>> +        len += strlen(reg_offset);
>>>>> +    }
>>>>
>>>> You need to hold the lock protecting adev->reset_dump_reg_list and 
>>>> adev->n_regs while accessing those.
>>>>
>>>> (BTW: num_regs instead of n_regs would match more what we use 
>>>> elsewhere, but is not a must have).
>>>>
>>> This is read function for user and returns only list of reg offsets, 
>>> I did not understand correctly !
>>>>> +
>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>> +
>>>>> +    if (r)
>>>>> +        return -EFAULT;
>>>>> +
>>>>> +    len++;
>>>>> +    *pos += len;
>>>>> +
>>>>> +    return len;
>>>>> +}
>>>>> +
>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>> +{
>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>> *)file_inode(f)->i_private;
>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>> +    static int alloc_count;
>>>>> +    int ret, i = 0, len = 0;
>>>>> +
>>>>> +    do {
>>>>> +        reg_offset = reg_temp;
>>>>> +        memset(reg_offset,  0, 11);
>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>> ((int)size-len)));
>>>>> +
>>>>> +        if (ret)
>>>>> +            goto failed;
>>>>> +
>>>>> +        reg = strsep(&reg_offset, " ");
>>>>> +
>>>>> +        if (alloc_count <= i) {
>>>>
>>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>>> +                            adev->reset_dump_reg_list, 1,
>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>> +            alloc_count++;
>>>>> +        }
>>>>> +
>>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>
>>>> This here is modifying adev->reset_dump_reg_list as well and so 
>>>> must be protected by a lock as well.
>>>>
>>>> The tricky part is that we can't allocate memory while holding this 
>>>> lock (because we need it during reset as well).
>>>>
>>>> One solution for this is to read the register list into a local 
>>>> array first and when that's done swap the local array with the one 
>>>> in adev->reset_dump_reg_list while holding the lock.
>>>>
> krealloc_array should be inside lock or outside lock? this may be problem.
>

This *must* be outside the lock because we need to take the lock during 
GPU reset which has a dependency to not allocate memory or wait for 
locks under which memory is allocated.

That's why I said you need an approach which first parses the string 
from userspace, build up the register list and then swap that with the 
existing one while holding the lock.

Regards,
Christian.

> Regards,
>
> S.Amarnath
>
>>>> Regards,
>>>> Christian.
>>>>
>>> There are 2 situations:
>>> 1st time creating list n_regs will be 0 and trace event will not be 
>>> triggered
>>> 2nd time while updating list n_regs is already set and 
>>> adev->reset_dump_reg_list will have some offsets address 
>>> (hypothetically speaking *during reset + update* read values from 
>>> RREG32 will mix up of old list and new list)
>>> its only critical when its freed and n_regs is not 0
>>
>> No, that won't work like this. See you *must* always hold a lock when 
>> reading or writing the array.
>>
>> Otherwise it is perfectly possible that one thread sees only halve of 
>> the updates of another thread.
>>
>> The only alternative would be RCU, atomic replace and manual barrier 
>> handling, but that would be complete overkill for that feature.
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>> S.Amarnath
>>>>> +
>>>>> +        if (ret)
>>>>> +            goto failed;
>>>>> +
>>>>> +        len += strlen(reg) + 1;
>>>>> +        i++;
>>>>> +
>>>>> +    } while (len < size);
>>>>> +
>>>>> +    adev->n_regs = i;
>>>>> +
>>>>> +    return size;
>>>>> +
>>>>> +failed:
>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>> +    alloc_count = 0;
>>>>> +    adev->n_regs = 0;
>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>> +    return -EFAULT;
>>>>> +}
>>>>> +
>>>>> +
>>>>> +
>>>>> +static const struct file_operations 
>>>>> amdgpu_reset_dump_register_list = {
>>>>> +    .owner = THIS_MODULE,
>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>> +    .llseek = default_llseek
>>>>> +};
>>>>> +
>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>   {
>>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>> *adev)
>>>>>       if (!debugfs_initialized())
>>>>>           return 0;
>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, 
>>>>> adev,
>>>>>                     &fops_ib_preempt);
>>>>>       if (IS_ERR(ent)) {
>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>> *adev)
>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>> root, adev,
>>>>> +                &amdgpu_reset_dump_register_list);
>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>
>>

[-- Attachment #2: Type: text/html, Size: 16072 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-17  8:00         ` Christian König
@ 2022-02-17  9:44           ` Lazar, Lijo
  2022-02-17  9:48             ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Lazar, Lijo @ 2022-02-17  9:44 UTC (permalink / raw)
  To: Christian König, Somalapuram, Amaranath,
	Christian König, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma



On 2/17/2022 1:30 PM, Christian König wrote:
> 
> 
> Am 17.02.22 um 08:54 schrieb Somalapuram, Amaranath:
>>
>>
>> On 2/16/2022 8:26 PM, Christian König wrote:
>>> Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>>>>
>>>> On 2/16/2022 3:41 PM, Christian König wrote:
>>>>
>>>>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>>>>> List of register populated for dump collection during the GPU reset.
>>>>>>
>>>>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>>> +++++++++++++++++++++
>>>>>>   2 files changed, 100 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> index b85b67a88a3d..57965316873b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>> +
>>>>>> +    /* reset dump register */
>>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>>> +    int                             n_regs;
>>>>>> +    struct mutex            reset_dump_mutex;
>>>>>
>>>>> I think we should rather use the reset lock for this instead of 
>>>>> introducing just another mutex.
>>>>>
>>>>>>   };
>>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>> drm_device *ddev)
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
>>>>>> NULL,
>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct file 
>>>>>> *f,
>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>> +{
>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>> *)file_inode(f)->i_private;
>>>>>> +    char reg_offset[11];
>>>>>> +    int i, r, len = 0;
>>>>>> +
>>>>>> +    if (*pos)
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    if (adev->n_regs == 0)
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>>> +        sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>>>>> +        r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>>>>> +
>>>>>> +        if (r)
>>>>>> +            return -EFAULT;
>>>>>> +
>>>>>> +        len += strlen(reg_offset);
>>>>>> +    }
>>>>>
>>>>> You need to hold the lock protecting adev->reset_dump_reg_list and 
>>>>> adev->n_regs while accessing those.
>>>>>
>>>>> (BTW: num_regs instead of n_regs would match more what we use 
>>>>> elsewhere, but is not a must have).
>>>>>
>>>> This is read function for user and returns only list of reg offsets, 
>>>> I did not understand correctly !
>>>>>> +
>>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>>> +
>>>>>> +    if (r)
>>>>>> +        return -EFAULT;
>>>>>> +
>>>>>> +    len++;
>>>>>> +    *pos += len;
>>>>>> +
>>>>>> +    return len;
>>>>>> +}
>>>>>> +
>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>>> +{
>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>> *)file_inode(f)->i_private;
>>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>>> +    static int alloc_count;
>>>>>> +    int ret, i = 0, len = 0;
>>>>>> +
>>>>>> +    do {
>>>>>> +        reg_offset = reg_temp;
>>>>>> +        memset(reg_offset,  0, 11);
>>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>>> ((int)size-len)));
>>>>>> +
>>>>>> +        if (ret)
>>>>>> +            goto failed;
>>>>>> +
>>>>>> +        reg = strsep(&reg_offset, " ");
>>>>>> +
>>>>>> +        if (alloc_count <= i) {
>>>>>
>>>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>>>> +                            adev->reset_dump_reg_list, 1,
>>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>>> +            alloc_count++;
>>>>>> +        }
>>>>>> +
>>>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>>
>>>>> This here is modifying adev->reset_dump_reg_list as well and so 
>>>>> must be protected by a lock as well.
>>>>>
>>>>> The tricky part is that we can't allocate memory while holding this 
>>>>> lock (because we need it during reset as well).
>>>>>
>>>>> One solution for this is to read the register list into a local 
>>>>> array first and when that's done swap the local array with the one 
>>>>> in adev->reset_dump_reg_list while holding the lock.
>>>>>
>> krealloc_array should be inside lock or outside lock? this may be problem.
>>
> 
> This *must* be outside the lock because we need to take the lock during 
> GPU reset which has a dependency to not allocate memory or wait for 
> locks under which memory is allocated.
> 
> That's why I said you need an approach which first parses the string 
> from userspace, build up the register list and then swap that with the 
> existing one while holding the lock.
> 

Another approach would be to just protect debugfs write with 
down_read(&adev->reset_sem) or reset domain semaphore.

Other than that if apps are trying to read and modify the list at the 
same time, probably we should leave that to user mode since this is 
mainly a debug feature.

Thanks,
Lijo

> Regards,
> Christian.
> 
>> Regards,
>>
>> S.Amarnath
>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>> There are 2 situations:
>>>> 1st time creating list n_regs will be 0 and trace event will not be 
>>>> triggered
>>>> 2nd time while updating list n_regs is already set and 
>>>> adev->reset_dump_reg_list will have some offsets address 
>>>> (hypothetically speaking *during reset + update* read values from 
>>>> RREG32 will mix up of old list and new list)
>>>> its only critical when its freed and n_regs is not 0
>>>
>>> No, that won't work like this. See you *must* always hold a lock when 
>>> reading or writing the array.
>>>
>>> Otherwise it is perfectly possible that one thread sees only halve of 
>>> the updates of another thread.
>>>
>>> The only alternative would be RCU, atomic replace and manual barrier 
>>> handling, but that would be complete overkill for that feature.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Regards,
>>>> S.Amarnath
>>>>>> +
>>>>>> +        if (ret)
>>>>>> +            goto failed;
>>>>>> +
>>>>>> +        len += strlen(reg) + 1;
>>>>>> +        i++;
>>>>>> +
>>>>>> +    } while (len < size);
>>>>>> +
>>>>>> +    adev->n_regs = i;
>>>>>> +
>>>>>> +    return size;
>>>>>> +
>>>>>> +failed:
>>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>>> +    alloc_count = 0;
>>>>>> +    adev->n_regs = 0;
>>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>>> +    return -EFAULT;
>>>>>> +}
>>>>>> +
>>>>>> +
>>>>>> +
>>>>>> +static const struct file_operations 
>>>>>> amdgpu_reset_dump_register_list = {
>>>>>> +    .owner = THIS_MODULE,
>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>> +    .llseek = default_llseek
>>>>>> +};
>>>>>> +
>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>   {
>>>>>>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>>> *adev)
>>>>>>       if (!debugfs_initialized())
>>>>>>           return 0;
>>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, 
>>>>>> adev,
>>>>>>                     &fops_ib_preempt);
>>>>>>       if (IS_ERR(ent)) {
>>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>>>>> *adev)
>>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>> root, adev,
>>>>>> +                &amdgpu_reset_dump_register_list);
>>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-17  9:44           ` Lazar, Lijo
@ 2022-02-17  9:48             ` Christian König
  2022-02-17 10:01               ` Lazar, Lijo
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-17  9:48 UTC (permalink / raw)
  To: Lazar, Lijo, Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma

Am 17.02.22 um 10:44 schrieb Lazar, Lijo:
>
>
> On 2/17/2022 1:30 PM, Christian König wrote:
>>
>>
>> Am 17.02.22 um 08:54 schrieb Somalapuram, Amaranath:
>>>
>>>
>>> On 2/16/2022 8:26 PM, Christian König wrote:
>>>> Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>>>>>
>>>>> On 2/16/2022 3:41 PM, Christian König wrote:
>>>>>
>>>>>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>>>>>> List of register populated for dump collection during the GPU 
>>>>>>> reset.
>>>>>>>
>>>>>>> Signed-off-by: Somalapuram Amaranath 
>>>>>>> <Amaranath.Somalapuram@amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>>>> +++++++++++++++++++++
>>>>>>>   2 files changed, 100 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> index b85b67a88a3d..57965316873b 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>>> +
>>>>>>> +    /* reset dump register */
>>>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>>>> +    int                             n_regs;
>>>>>>> +    struct mutex            reset_dump_mutex;
>>>>>>
>>>>>> I think we should rather use the reset lock for this instead of 
>>>>>> introducing just another mutex.
>>>>>>
>>>>>>>   };
>>>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>>> drm_device *ddev)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
>>>>>>> NULL,
>>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct 
>>>>>>> file *f,
>>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>>> +{
>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>> *)file_inode(f)->i_private;
>>>>>>> +    char reg_offset[11];
>>>>>>> +    int i, r, len = 0;
>>>>>>> +
>>>>>>> +    if (*pos)
>>>>>>> +        return 0;
>>>>>>> +
>>>>>>> +    if (adev->n_regs == 0)
>>>>>>> +        return 0;
>>>>>>> +
>>>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>>>> +        sprintf(reg_offset, "0x%x ", 
>>>>>>> adev->reset_dump_reg_list[i]);
>>>>>>> +        r = copy_to_user(buf + len, reg_offset, 
>>>>>>> strlen(reg_offset));
>>>>>>> +
>>>>>>> +        if (r)
>>>>>>> +            return -EFAULT;
>>>>>>> +
>>>>>>> +        len += strlen(reg_offset);
>>>>>>> +    }
>>>>>>
>>>>>> You need to hold the lock protecting adev->reset_dump_reg_list 
>>>>>> and adev->n_regs while accessing those.
>>>>>>
>>>>>> (BTW: num_regs instead of n_regs would match more what we use 
>>>>>> elsewhere, but is not a must have).
>>>>>>
>>>>> This is read function for user and returns only list of reg 
>>>>> offsets, I did not understand correctly !
>>>>>>> +
>>>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>>>> +
>>>>>>> +    if (r)
>>>>>>> +        return -EFAULT;
>>>>>>> +
>>>>>>> +    len++;
>>>>>>> +    *pos += len;
>>>>>>> +
>>>>>>> +    return len;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct 
>>>>>>> file *f,
>>>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>>>> +{
>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>> *)file_inode(f)->i_private;
>>>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>>>> +    static int alloc_count;
>>>>>>> +    int ret, i = 0, len = 0;
>>>>>>> +
>>>>>>> +    do {
>>>>>>> +        reg_offset = reg_temp;
>>>>>>> +        memset(reg_offset,  0, 11);
>>>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>>>> ((int)size-len)));
>>>>>>> +
>>>>>>> +        if (ret)
>>>>>>> +            goto failed;
>>>>>>> +
>>>>>>> +        reg = strsep(&reg_offset, " ");
>>>>>>> +
>>>>>>> +        if (alloc_count <= i) {
>>>>>>
>>>>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>>>>> + adev->reset_dump_reg_list, 1,
>>>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>>>> +            alloc_count++;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>>>
>>>>>> This here is modifying adev->reset_dump_reg_list as well and so 
>>>>>> must be protected by a lock as well.
>>>>>>
>>>>>> The tricky part is that we can't allocate memory while holding 
>>>>>> this lock (because we need it during reset as well).
>>>>>>
>>>>>> One solution for this is to read the register list into a local 
>>>>>> array first and when that's done swap the local array with the 
>>>>>> one in adev->reset_dump_reg_list while holding the lock.
>>>>>>
>>> krealloc_array should be inside lock or outside lock? this may be 
>>> problem.
>>>
>>
>> This *must* be outside the lock because we need to take the lock 
>> during GPU reset which has a dependency to not allocate memory or 
>> wait for locks under which memory is allocated.
>>
>> That's why I said you need an approach which first parses the string 
>> from userspace, build up the register list and then swap that with 
>> the existing one while holding the lock.
>>
>
> Another approach would be to just protect debugfs write with 
> down_read(&adev->reset_sem) or reset domain semaphore.

No, exactly that doesn't work.

See the down_write(&adev->reset_sem) would then wait for this reader and 
the reader is allocating memory and allocating memory might wait for the 
reset to finish => deadlock.

Regards,
Christian.

>
> Other than that if apps are trying to read and modify the list at the 
> same time, probably we should leave that to user mode since this is 
> mainly a debug feature.
>
> Thanks,
> Lijo
>
>> Regards,
>> Christian.
>>
>>> Regards,
>>>
>>> S.Amarnath
>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>> There are 2 situations:
>>>>> 1st time creating list n_regs will be 0 and trace event will not 
>>>>> be triggered
>>>>> 2nd time while updating list n_regs is already set and 
>>>>> adev->reset_dump_reg_list will have some offsets address 
>>>>> (hypothetically speaking *during reset + update* read values from 
>>>>> RREG32 will mix up of old list and new list)
>>>>> its only critical when its freed and n_regs is not 0
>>>>
>>>> No, that won't work like this. See you *must* always hold a lock 
>>>> when reading or writing the array.
>>>>
>>>> Otherwise it is perfectly possible that one thread sees only halve 
>>>> of the updates of another thread.
>>>>
>>>> The only alternative would be RCU, atomic replace and manual 
>>>> barrier handling, but that would be complete overkill for that 
>>>> feature.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> Regards,
>>>>> S.Amarnath
>>>>>>> +
>>>>>>> +        if (ret)
>>>>>>> +            goto failed;
>>>>>>> +
>>>>>>> +        len += strlen(reg) + 1;
>>>>>>> +        i++;
>>>>>>> +
>>>>>>> +    } while (len < size);
>>>>>>> +
>>>>>>> +    adev->n_regs = i;
>>>>>>> +
>>>>>>> +    return size;
>>>>>>> +
>>>>>>> +failed:
>>>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>>>> +    alloc_count = 0;
>>>>>>> +    adev->n_regs = 0;
>>>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>>>> +    return -EFAULT;
>>>>>>> +}
>>>>>>> +
>>>>>>> +
>>>>>>> +
>>>>>>> +static const struct file_operations 
>>>>>>> amdgpu_reset_dump_register_list = {
>>>>>>> +    .owner = THIS_MODULE,
>>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>>> +    .llseek = default_llseek
>>>>>>> +};
>>>>>>> +
>>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       struct dentry *root = 
>>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>       if (!debugfs_initialized())
>>>>>>>           return 0;
>>>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, 
>>>>>>> adev,
>>>>>>>                     &fops_ib_preempt);
>>>>>>>       if (IS_ERR(ent)) {
>>>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>>> root, adev,
>>>>>>> + &amdgpu_reset_dump_register_list);
>>>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-17  9:48             ` Christian König
@ 2022-02-17 10:01               ` Lazar, Lijo
  2022-02-17 10:04                 ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Lazar, Lijo @ 2022-02-17 10:01 UTC (permalink / raw)
  To: Christian König, Somalapuram, Amaranath,
	Christian König, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma



On 2/17/2022 3:18 PM, Christian König wrote:
> Am 17.02.22 um 10:44 schrieb Lazar, Lijo:
>>
>>
>> On 2/17/2022 1:30 PM, Christian König wrote:
>>>
>>>
>>> Am 17.02.22 um 08:54 schrieb Somalapuram, Amaranath:
>>>>
>>>>
>>>> On 2/16/2022 8:26 PM, Christian König wrote:
>>>>> Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>>>>>>
>>>>>> On 2/16/2022 3:41 PM, Christian König wrote:
>>>>>>
>>>>>>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>>>>>>> List of register populated for dump collection during the GPU 
>>>>>>>> reset.
>>>>>>>>
>>>>>>>> Signed-off-by: Somalapuram Amaranath 
>>>>>>>> <Amaranath.Somalapuram@amd.com>
>>>>>>>> ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>>>>> +++++++++++++++++++++
>>>>>>>>   2 files changed, 100 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>> index b85b67a88a3d..57965316873b 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>>>>         struct amdgpu_reset_control     *reset_cntl;
>>>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>>>> +
>>>>>>>> +    /* reset dump register */
>>>>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>>>>> +    int                             n_regs;
>>>>>>>> +    struct mutex            reset_dump_mutex;
>>>>>>>
>>>>>>> I think we should rather use the reset lock for this instead of 
>>>>>>> introducing just another mutex.
>>>>>>>
>>>>>>>>   };
>>>>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>>>> drm_device *ddev)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, 
>>>>>>>> NULL,
>>>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct 
>>>>>>>> file *f,
>>>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>>>> +{
>>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>>> *)file_inode(f)->i_private;
>>>>>>>> +    char reg_offset[11];
>>>>>>>> +    int i, r, len = 0;
>>>>>>>> +
>>>>>>>> +    if (*pos)
>>>>>>>> +        return 0;
>>>>>>>> +
>>>>>>>> +    if (adev->n_regs == 0)
>>>>>>>> +        return 0;
>>>>>>>> +
>>>>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>>>>> +        sprintf(reg_offset, "0x%x ", 
>>>>>>>> adev->reset_dump_reg_list[i]);
>>>>>>>> +        r = copy_to_user(buf + len, reg_offset, 
>>>>>>>> strlen(reg_offset));
>>>>>>>> +
>>>>>>>> +        if (r)
>>>>>>>> +            return -EFAULT;
>>>>>>>> +
>>>>>>>> +        len += strlen(reg_offset);
>>>>>>>> +    }
>>>>>>>
>>>>>>> You need to hold the lock protecting adev->reset_dump_reg_list 
>>>>>>> and adev->n_regs while accessing those.
>>>>>>>
>>>>>>> (BTW: num_regs instead of n_regs would match more what we use 
>>>>>>> elsewhere, but is not a must have).
>>>>>>>
>>>>>> This is read function for user and returns only list of reg 
>>>>>> offsets, I did not understand correctly !
>>>>>>>> +
>>>>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>>>>> +
>>>>>>>> +    if (r)
>>>>>>>> +        return -EFAULT;
>>>>>>>> +
>>>>>>>> +    len++;
>>>>>>>> +    *pos += len;
>>>>>>>> +
>>>>>>>> +    return len;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct 
>>>>>>>> file *f,
>>>>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>>>>> +{
>>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>>> *)file_inode(f)->i_private;
>>>>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>>>>> +    static int alloc_count;
>>>>>>>> +    int ret, i = 0, len = 0;
>>>>>>>> +
>>>>>>>> +    do {
>>>>>>>> +        reg_offset = reg_temp;
>>>>>>>> +        memset(reg_offset,  0, 11);
>>>>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>>>>> ((int)size-len)));
>>>>>>>> +
>>>>>>>> +        if (ret)
>>>>>>>> +            goto failed;
>>>>>>>> +
>>>>>>>> +        reg = strsep(&reg_offset, " ");
>>>>>>>> +
>>>>>>>> +        if (alloc_count <= i) {
>>>>>>>
>>>>>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>>>>>> + adev->reset_dump_reg_list, 1,
>>>>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>>>>> +            alloc_count++;
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);
>>>>>>>
>>>>>>> This here is modifying adev->reset_dump_reg_list as well and so 
>>>>>>> must be protected by a lock as well.
>>>>>>>
>>>>>>> The tricky part is that we can't allocate memory while holding 
>>>>>>> this lock (because we need it during reset as well).
>>>>>>>
>>>>>>> One solution for this is to read the register list into a local 
>>>>>>> array first and when that's done swap the local array with the 
>>>>>>> one in adev->reset_dump_reg_list while holding the lock.
>>>>>>>
>>>> krealloc_array should be inside lock or outside lock? this may be 
>>>> problem.
>>>>
>>>
>>> This *must* be outside the lock because we need to take the lock 
>>> during GPU reset which has a dependency to not allocate memory or 
>>> wait for locks under which memory is allocated.
>>>
>>> That's why I said you need an approach which first parses the string 
>>> from userspace, build up the register list and then swap that with 
>>> the existing one while holding the lock.
>>>
>>
>> Another approach would be to just protect debugfs write with 
>> down_read(&adev->reset_sem) or reset domain semaphore.
> 
> No, exactly that doesn't work.
> 
> See the down_write(&adev->reset_sem) would then wait for this reader and 
> the reader is allocating memory and allocating memory might wait for the 
> reset to finish => deadlock.

I didn't get this part - allocating memory might wait for the reset to 
finish.

down_write() is called as one of the first steps during device reset, 
and therefore device reset hasn't started. When you say " reset to 
finish", do you mean device reset or something else?

Thanks,
Lijo

> 
> Regards,
> Christian.
> 
>>
>> Other than that if apps are trying to read and modify the list at the 
>> same time, probably we should leave that to user mode since this is 
>> mainly a debug feature.
>>
>> Thanks,
>> Lijo
>>
>>> Regards,
>>> Christian.
>>>
>>>> Regards,
>>>>
>>>> S.Amarnath
>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>> There are 2 situations:
>>>>>> 1st time creating list n_regs will be 0 and trace event will not 
>>>>>> be triggered
>>>>>> 2nd time while updating list n_regs is already set and 
>>>>>> adev->reset_dump_reg_list will have some offsets address 
>>>>>> (hypothetically speaking *during reset + update* read values from 
>>>>>> RREG32 will mix up of old list and new list)
>>>>>> its only critical when its freed and n_regs is not 0
>>>>>
>>>>> No, that won't work like this. See you *must* always hold a lock 
>>>>> when reading or writing the array.
>>>>>
>>>>> Otherwise it is perfectly possible that one thread sees only halve 
>>>>> of the updates of another thread.
>>>>>
>>>>> The only alternative would be RCU, atomic replace and manual 
>>>>> barrier handling, but that would be complete overkill for that 
>>>>> feature.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>> S.Amarnath
>>>>>>>> +
>>>>>>>> +        if (ret)
>>>>>>>> +            goto failed;
>>>>>>>> +
>>>>>>>> +        len += strlen(reg) + 1;
>>>>>>>> +        i++;
>>>>>>>> +
>>>>>>>> +    } while (len < size);
>>>>>>>> +
>>>>>>>> +    adev->n_regs = i;
>>>>>>>> +
>>>>>>>> +    return size;
>>>>>>>> +
>>>>>>>> +failed:
>>>>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>>>>> +    alloc_count = 0;
>>>>>>>> +    adev->n_regs = 0;
>>>>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>>>>> +    return -EFAULT;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +
>>>>>>>> +
>>>>>>>> +static const struct file_operations 
>>>>>>>> amdgpu_reset_dump_register_list = {
>>>>>>>> +    .owner = THIS_MODULE,
>>>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>>>> +    .llseek = default_llseek
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       struct dentry *root = 
>>>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>       if (!debugfs_initialized())
>>>>>>>>           return 0;
>>>>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, 
>>>>>>>> adev,
>>>>>>>>                     &fops_ib_preempt);
>>>>>>>>       if (IS_ERR(ent)) {
>>>>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>                   &amdgpu_debugfs_test_ib_fops);
>>>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>>>                   &amdgpu_debugfs_vm_info_fops);
>>>>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>>>> root, adev,
>>>>>>>> + &amdgpu_reset_dump_register_list);
>>>>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>>>
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-17 10:01               ` Lazar, Lijo
@ 2022-02-17 10:04                 ` Christian König
  2022-02-17 10:09                   ` Lazar, Lijo
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2022-02-17 10:04 UTC (permalink / raw)
  To: Lazar, Lijo, Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma



Am 17.02.22 um 11:01 schrieb Lazar, Lijo:
>
>
> On 2/17/2022 3:18 PM, Christian König wrote:
>> Am 17.02.22 um 10:44 schrieb Lazar, Lijo:
>>>
>>>
>>> On 2/17/2022 1:30 PM, Christian König wrote:
>>>>
>>>>
>>>> Am 17.02.22 um 08:54 schrieb Somalapuram, Amaranath:
>>>>>
>>>>>
>>>>> On 2/16/2022 8:26 PM, Christian König wrote:
>>>>>> Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>>>>>>>
>>>>>>> On 2/16/2022 3:41 PM, Christian König wrote:
>>>>>>>
>>>>>>>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>>>>>>>> List of register populated for dump collection during the GPU 
>>>>>>>>> reset.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Somalapuram Amaranath 
>>>>>>>>> <Amaranath.Somalapuram@amd.com>
>>>>>>>>> ---
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>>>>>> +++++++++++++++++++++
>>>>>>>>>   2 files changed, 100 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> index b85b67a88a3d..57965316873b 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>>>>>         struct amdgpu_reset_control *reset_cntl;
>>>>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>>>>> +
>>>>>>>>> +    /* reset dump register */
>>>>>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>>>>>> +    int                             n_regs;
>>>>>>>>> +    struct mutex            reset_dump_mutex;
>>>>>>>>
>>>>>>>> I think we should rather use the reset lock for this instead of 
>>>>>>>> introducing just another mutex.
>>>>>>>>
>>>>>>>>>   };
>>>>>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>>>>> drm_device *ddev)
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>>> @@ -1609,6 +1609,98 @@ 
>>>>>>>>> DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct 
>>>>>>>>> file *f,
>>>>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>>>>> +{
>>>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>>>> *)file_inode(f)->i_private;
>>>>>>>>> +    char reg_offset[11];
>>>>>>>>> +    int i, r, len = 0;
>>>>>>>>> +
>>>>>>>>> +    if (*pos)
>>>>>>>>> +        return 0;
>>>>>>>>> +
>>>>>>>>> +    if (adev->n_regs == 0)
>>>>>>>>> +        return 0;
>>>>>>>>> +
>>>>>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>>>>>> +        sprintf(reg_offset, "0x%x ", 
>>>>>>>>> adev->reset_dump_reg_list[i]);
>>>>>>>>> +        r = copy_to_user(buf + len, reg_offset, 
>>>>>>>>> strlen(reg_offset));
>>>>>>>>> +
>>>>>>>>> +        if (r)
>>>>>>>>> +            return -EFAULT;
>>>>>>>>> +
>>>>>>>>> +        len += strlen(reg_offset);
>>>>>>>>> +    }
>>>>>>>>
>>>>>>>> You need to hold the lock protecting adev->reset_dump_reg_list 
>>>>>>>> and adev->n_regs while accessing those.
>>>>>>>>
>>>>>>>> (BTW: num_regs instead of n_regs would match more what we use 
>>>>>>>> elsewhere, but is not a must have).
>>>>>>>>
>>>>>>> This is read function for user and returns only list of reg 
>>>>>>> offsets, I did not understand correctly !
>>>>>>>>> +
>>>>>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>>>>>> +
>>>>>>>>> +    if (r)
>>>>>>>>> +        return -EFAULT;
>>>>>>>>> +
>>>>>>>>> +    len++;
>>>>>>>>> +    *pos += len;
>>>>>>>>> +
>>>>>>>>> +    return len;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct 
>>>>>>>>> file *f,
>>>>>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>>>>>> +{
>>>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>>>> *)file_inode(f)->i_private;
>>>>>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>>>>>> +    static int alloc_count;
>>>>>>>>> +    int ret, i = 0, len = 0;
>>>>>>>>> +
>>>>>>>>> +    do {
>>>>>>>>> +        reg_offset = reg_temp;
>>>>>>>>> +        memset(reg_offset,  0, 11);
>>>>>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>>>>>> ((int)size-len)));
>>>>>>>>> +
>>>>>>>>> +        if (ret)
>>>>>>>>> +            goto failed;
>>>>>>>>> +
>>>>>>>>> +        reg = strsep(&reg_offset, " ");
>>>>>>>>> +
>>>>>>>>> +        if (alloc_count <= i) {
>>>>>>>>
>>>>>>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>>>>>>> + adev->reset_dump_reg_list, 1,
>>>>>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>>>>>> +            alloc_count++;
>>>>>>>>> +        }
>>>>>>>>> +
>>>>>>>>> +        ret = kstrtouint(reg, 16, 
>>>>>>>>> &adev->reset_dump_reg_list[i]);
>>>>>>>>
>>>>>>>> This here is modifying adev->reset_dump_reg_list as well and so 
>>>>>>>> must be protected by a lock as well.
>>>>>>>>
>>>>>>>> The tricky part is that we can't allocate memory while holding 
>>>>>>>> this lock (because we need it during reset as well).
>>>>>>>>
>>>>>>>> One solution for this is to read the register list into a local 
>>>>>>>> array first and when that's done swap the local array with the 
>>>>>>>> one in adev->reset_dump_reg_list while holding the lock.
>>>>>>>>
>>>>> krealloc_array should be inside lock or outside lock? this may be 
>>>>> problem.
>>>>>
>>>>
>>>> This *must* be outside the lock because we need to take the lock 
>>>> during GPU reset which has a dependency to not allocate memory or 
>>>> wait for locks under which memory is allocated.
>>>>
>>>> That's why I said you need an approach which first parses the 
>>>> string from userspace, build up the register list and then swap 
>>>> that with the existing one while holding the lock.
>>>>
>>>
>>> Another approach would be to just protect debugfs write with 
>>> down_read(&adev->reset_sem) or reset domain semaphore.
>>
>> No, exactly that doesn't work.
>>
>> See the down_write(&adev->reset_sem) would then wait for this reader 
>> and the reader is allocating memory and allocating memory might wait 
>> for the reset to finish => deadlock.
>
> I didn't get this part - allocating memory might wait for the reset to 
> finish.
>
> down_write() is called as one of the first steps during device reset, 
> and therefore device reset hasn't started. When you say " reset to 
> finish", do you mean device reset or something else?

I mean device reset. Holding the reset lock prevents device reset from 
starting because of the down_write() and core memory management might 
wait for this before it continues allocating memory.

Regards,
Christian.

>
> Thanks,
> Lijo
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Other than that if apps are trying to read and modify the list at 
>>> the same time, probably we should leave that to user mode since this 
>>> is mainly a debug feature.
>>>
>>> Thanks,
>>> Lijo
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Regards,
>>>>>
>>>>> S.Amarnath
>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>> There are 2 situations:
>>>>>>> 1st time creating list n_regs will be 0 and trace event will not 
>>>>>>> be triggered
>>>>>>> 2nd time while updating list n_regs is already set and 
>>>>>>> adev->reset_dump_reg_list will have some offsets address 
>>>>>>> (hypothetically speaking *during reset + update* read values 
>>>>>>> from RREG32 will mix up of old list and new list)
>>>>>>> its only critical when its freed and n_regs is not 0
>>>>>>
>>>>>> No, that won't work like this. See you *must* always hold a lock 
>>>>>> when reading or writing the array.
>>>>>>
>>>>>> Otherwise it is perfectly possible that one thread sees only 
>>>>>> halve of the updates of another thread.
>>>>>>
>>>>>> The only alternative would be RCU, atomic replace and manual 
>>>>>> barrier handling, but that would be complete overkill for that 
>>>>>> feature.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>
>>>>>>> Regards,
>>>>>>> S.Amarnath
>>>>>>>>> +
>>>>>>>>> +        if (ret)
>>>>>>>>> +            goto failed;
>>>>>>>>> +
>>>>>>>>> +        len += strlen(reg) + 1;
>>>>>>>>> +        i++;
>>>>>>>>> +
>>>>>>>>> +    } while (len < size);
>>>>>>>>> +
>>>>>>>>> +    adev->n_regs = i;
>>>>>>>>> +
>>>>>>>>> +    return size;
>>>>>>>>> +
>>>>>>>>> +failed:
>>>>>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>>>>>> +    alloc_count = 0;
>>>>>>>>> +    adev->n_regs = 0;
>>>>>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>>>>>> +    return -EFAULT;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +
>>>>>>>>> +
>>>>>>>>> +static const struct file_operations 
>>>>>>>>> amdgpu_reset_dump_register_list = {
>>>>>>>>> +    .owner = THIS_MODULE,
>>>>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>>>>> +    .llseek = default_llseek
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>>>>   {
>>>>>>>>>       struct dentry *root = 
>>>>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>       if (!debugfs_initialized())
>>>>>>>>>           return 0;
>>>>>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, 
>>>>>>>>> root, adev,
>>>>>>>>>                     &fops_ib_preempt);
>>>>>>>>>       if (IS_ERR(ent)) {
>>>>>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct 
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>> &amdgpu_debugfs_test_ib_fops);
>>>>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>>>> &amdgpu_debugfs_vm_info_fops);
>>>>>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>>>>> root, adev,
>>>>>>>>> + &amdgpu_reset_dump_register_list);
>>>>>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>>>>
>>>>>>
>>>>
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-17 10:04                 ` Christian König
@ 2022-02-17 10:09                   ` Lazar, Lijo
  2022-02-17 10:43                     ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Lazar, Lijo @ 2022-02-17 10:09 UTC (permalink / raw)
  To: Christian König, Somalapuram, Amaranath,
	Christian König, Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma



On 2/17/2022 3:34 PM, Christian König wrote:
> 
> 
> Am 17.02.22 um 11:01 schrieb Lazar, Lijo:
>>
>>
>> On 2/17/2022 3:18 PM, Christian König wrote:
>>> Am 17.02.22 um 10:44 schrieb Lazar, Lijo:
>>>>
>>>>
>>>> On 2/17/2022 1:30 PM, Christian König wrote:
>>>>>
>>>>>
>>>>> Am 17.02.22 um 08:54 schrieb Somalapuram, Amaranath:
>>>>>>
>>>>>>
>>>>>> On 2/16/2022 8:26 PM, Christian König wrote:
>>>>>>> Am 16.02.22 um 14:11 schrieb Somalapuram, Amaranath:
>>>>>>>>
>>>>>>>> On 2/16/2022 3:41 PM, Christian König wrote:
>>>>>>>>
>>>>>>>>> Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
>>>>>>>>>> List of register populated for dump collection during the GPU 
>>>>>>>>>> reset.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Somalapuram Amaranath 
>>>>>>>>>> <Amaranath.Somalapuram@amd.com>
>>>>>>>>>> ---
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>>>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 
>>>>>>>>>> +++++++++++++++++++++
>>>>>>>>>>   2 files changed, 100 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>> index b85b67a88a3d..57965316873b 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>>>>> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>>>>>>>>>>         struct amdgpu_reset_control *reset_cntl;
>>>>>>>>>>       uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>>>>>>>>> +
>>>>>>>>>> +    /* reset dump register */
>>>>>>>>>> +    uint32_t            *reset_dump_reg_list;
>>>>>>>>>> +    int                             n_regs;
>>>>>>>>>> +    struct mutex            reset_dump_mutex;
>>>>>>>>>
>>>>>>>>> I think we should rather use the reset lock for this instead of 
>>>>>>>>> introducing just another mutex.
>>>>>>>>>
>>>>>>>>>>   };
>>>>>>>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>>>>>>>> drm_device *ddev)
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>>>> index 164d6a9e9fbb..faf985c7cb93 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>>>>>>>> @@ -1609,6 +1609,98 @@ 
>>>>>>>>>> DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>>>>>>>>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>>>>>>>>               amdgpu_debugfs_sclk_set, "%llu\n");
>>>>>>>>>>   +static ssize_t amdgpu_reset_dump_register_list_read(struct 
>>>>>>>>>> file *f,
>>>>>>>>>> +                char __user *buf, size_t size, loff_t *pos)
>>>>>>>>>> +{
>>>>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>>>>> *)file_inode(f)->i_private;
>>>>>>>>>> +    char reg_offset[11];
>>>>>>>>>> +    int i, r, len = 0;
>>>>>>>>>> +
>>>>>>>>>> +    if (*pos)
>>>>>>>>>> +        return 0;
>>>>>>>>>> +
>>>>>>>>>> +    if (adev->n_regs == 0)
>>>>>>>>>> +        return 0;
>>>>>>>>>> +
>>>>>>>>>> +    for (i = 0; i < adev->n_regs; i++) {
>>>>>>>>>> +        sprintf(reg_offset, "0x%x ", 
>>>>>>>>>> adev->reset_dump_reg_list[i]);
>>>>>>>>>> +        r = copy_to_user(buf + len, reg_offset, 
>>>>>>>>>> strlen(reg_offset));
>>>>>>>>>> +
>>>>>>>>>> +        if (r)
>>>>>>>>>> +            return -EFAULT;
>>>>>>>>>> +
>>>>>>>>>> +        len += strlen(reg_offset);
>>>>>>>>>> +    }
>>>>>>>>>
>>>>>>>>> You need to hold the lock protecting adev->reset_dump_reg_list 
>>>>>>>>> and adev->n_regs while accessing those.
>>>>>>>>>
>>>>>>>>> (BTW: num_regs instead of n_regs would match more what we use 
>>>>>>>>> elsewhere, but is not a must have).
>>>>>>>>>
>>>>>>>> This is read function for user and returns only list of reg 
>>>>>>>> offsets, I did not understand correctly !
>>>>>>>>>> +
>>>>>>>>>> +    r = copy_to_user(buf + len, "\n", 1);
>>>>>>>>>> +
>>>>>>>>>> +    if (r)
>>>>>>>>>> +        return -EFAULT;
>>>>>>>>>> +
>>>>>>>>>> +    len++;
>>>>>>>>>> +    *pos += len;
>>>>>>>>>> +
>>>>>>>>>> +    return len;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct 
>>>>>>>>>> file *f,
>>>>>>>>>> +            const char __user *buf, size_t size, loff_t *pos)
>>>>>>>>>> +{
>>>>>>>>>> +    struct amdgpu_device *adev = (struct amdgpu_device 
>>>>>>>>>> *)file_inode(f)->i_private;
>>>>>>>>>> +    char *reg_offset, *reg, reg_temp[11];
>>>>>>>>>> +    static int alloc_count;
>>>>>>>>>> +    int ret, i = 0, len = 0;
>>>>>>>>>> +
>>>>>>>>>> +    do {
>>>>>>>>>> +        reg_offset = reg_temp;
>>>>>>>>>> +        memset(reg_offset,  0, 11);
>>>>>>>>>> +        ret = copy_from_user(reg_offset, buf + len, min(11, 
>>>>>>>>>> ((int)size-len)));
>>>>>>>>>> +
>>>>>>>>>> +        if (ret)
>>>>>>>>>> +            goto failed;
>>>>>>>>>> +
>>>>>>>>>> +        reg = strsep(&reg_offset, " ");
>>>>>>>>>> +
>>>>>>>>>> +        if (alloc_count <= i) {
>>>>>>>>>
>>>>>>>>>> + adev->reset_dump_reg_list =  krealloc_array(
>>>>>>>>>> + adev->reset_dump_reg_list, 1,
>>>>>>>>>> +                            sizeof(uint32_t), GFP_KERNEL);
>>>>>>>>>> +            alloc_count++;
>>>>>>>>>> +        }
>>>>>>>>>> +
>>>>>>>>>> +        ret = kstrtouint(reg, 16, 
>>>>>>>>>> &adev->reset_dump_reg_list[i]);
>>>>>>>>>
>>>>>>>>> This here is modifying adev->reset_dump_reg_list as well and so 
>>>>>>>>> must be protected by a lock as well.
>>>>>>>>>
>>>>>>>>> The tricky part is that we can't allocate memory while holding 
>>>>>>>>> this lock (because we need it during reset as well).
>>>>>>>>>
>>>>>>>>> One solution for this is to read the register list into a local 
>>>>>>>>> array first and when that's done swap the local array with the 
>>>>>>>>> one in adev->reset_dump_reg_list while holding the lock.
>>>>>>>>>
>>>>>> krealloc_array should be inside lock or outside lock? this may be 
>>>>>> problem.
>>>>>>
>>>>>
>>>>> This *must* be outside the lock because we need to take the lock 
>>>>> during GPU reset which has a dependency to not allocate memory or 
>>>>> wait for locks under which memory is allocated.
>>>>>
>>>>> That's why I said you need an approach which first parses the 
>>>>> string from userspace, build up the register list and then swap 
>>>>> that with the existing one while holding the lock.
>>>>>
>>>>
>>>> Another approach would be to just protect debugfs write with 
>>>> down_read(&adev->reset_sem) or reset domain semaphore.
>>>
>>> No, exactly that doesn't work.
>>>
>>> See the down_write(&adev->reset_sem) would then wait for this reader 
>>> and the reader is allocating memory and allocating memory might wait 
>>> for the reset to finish => deadlock.
>>
>> I didn't get this part - allocating memory might wait for the reset to 
>> finish.
>>
>> down_write() is called as one of the first steps during device reset, 
>> and therefore device reset hasn't started. When you say " reset to 
>> finish", do you mean device reset or something else?
> 
> I mean device reset. Holding the reset lock prevents device reset from 
> starting because of the down_write() and core memory management might 
> wait for this before it continues allocating memory.
> 

Is this passed to core memory management by a component like drm or ttm?

I thought reset was mainly internal/contained to device level as we 
reset for multiple reasons and system as whole shouldn't be affected.

Thanks,
Lijo

> Regards,
> Christian.
> 
>>
>> Thanks,
>> Lijo
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Other than that if apps are trying to read and modify the list at 
>>>> the same time, probably we should leave that to user mode since this 
>>>> is mainly a debug feature.
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> S.Amarnath
>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>> There are 2 situations:
>>>>>>>> 1st time creating list n_regs will be 0 and trace event will not 
>>>>>>>> be triggered
>>>>>>>> 2nd time while updating list n_regs is already set and 
>>>>>>>> adev->reset_dump_reg_list will have some offsets address 
>>>>>>>> (hypothetically speaking *during reset + update* read values 
>>>>>>>> from RREG32 will mix up of old list and new list)
>>>>>>>> its only critical when its freed and n_regs is not 0
>>>>>>>
>>>>>>> No, that won't work like this. See you *must* always hold a lock 
>>>>>>> when reading or writing the array.
>>>>>>>
>>>>>>> Otherwise it is perfectly possible that one thread sees only 
>>>>>>> halve of the updates of another thread.
>>>>>>>
>>>>>>> The only alternative would be RCU, atomic replace and manual 
>>>>>>> barrier handling, but that would be complete overkill for that 
>>>>>>> feature.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> S.Amarnath
>>>>>>>>>> +
>>>>>>>>>> +        if (ret)
>>>>>>>>>> +            goto failed;
>>>>>>>>>> +
>>>>>>>>>> +        len += strlen(reg) + 1;
>>>>>>>>>> +        i++;
>>>>>>>>>> +
>>>>>>>>>> +    } while (len < size);
>>>>>>>>>> +
>>>>>>>>>> +    adev->n_regs = i;
>>>>>>>>>> +
>>>>>>>>>> +    return size;
>>>>>>>>>> +
>>>>>>>>>> +failed:
>>>>>>>>>> +    mutex_lock(&adev->reset_dump_mutex);
>>>>>>>>>> +    kfree(adev->reset_dump_reg_list);
>>>>>>>>>> +    adev->reset_dump_reg_list = NULL;
>>>>>>>>>> +    alloc_count = 0;
>>>>>>>>>> +    adev->n_regs = 0;
>>>>>>>>>> +    mutex_unlock(&adev->reset_dump_mutex);
>>>>>>>>>> +    return -EFAULT;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +
>>>>>>>>>> +
>>>>>>>>>> +static const struct file_operations 
>>>>>>>>>> amdgpu_reset_dump_register_list = {
>>>>>>>>>> +    .owner = THIS_MODULE,
>>>>>>>>>> +    .read = amdgpu_reset_dump_register_list_read,
>>>>>>>>>> +    .write = amdgpu_reset_dump_register_list_write,
>>>>>>>>>> +    .llseek = default_llseek
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>>>>>>>>>   {
>>>>>>>>>>       struct dentry *root = 
>>>>>>>>>> adev_to_drm(adev)->primary->debugfs_root;
>>>>>>>>>> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>       if (!debugfs_initialized())
>>>>>>>>>>           return 0;
>>>>>>>>>>   +    mutex_init(&adev->reset_dump_mutex);
>>>>>>>>>>       ent = debugfs_create_file("amdgpu_preempt_ib", 0600, 
>>>>>>>>>> root, adev,
>>>>>>>>>>                     &fops_ib_preempt);
>>>>>>>>>>       if (IS_ERR(ent)) {
>>>>>>>>>> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct 
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>> &amdgpu_debugfs_test_ib_fops);
>>>>>>>>>>       debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>>>>>>>>> &amdgpu_debugfs_vm_info_fops);
>>>>>>>>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644, 
>>>>>>>>>> root, adev,
>>>>>>>>>> + &amdgpu_reset_dump_register_list);
>>>>>>>>>>         adev->debugfs_vbios_blob.data = adev->bios;
>>>>>>>>>>       adev->debugfs_vbios_blob.size = adev->bios_size;
>>>>>>>>>
>>>>>>>
>>>>>
>>>
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list
  2022-02-17 10:09                   ` Lazar, Lijo
@ 2022-02-17 10:43                     ` Christian König
  0 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2022-02-17 10:43 UTC (permalink / raw)
  To: Lazar, Lijo, Somalapuram, Amaranath, Christian König,
	Somalapuram Amaranath, amd-gfx
  Cc: alexander.deucher, shashank.sharma



Am 17.02.22 um 11:09 schrieb Lazar, Lijo:
>
>
> On 2/17/2022 3:34 PM, Christian König wrote:
>>
>> [SNIP]
>
> Is this passed to core memory management by a component like drm or ttm?

The dependency comes from multiple directions. But yes, both DRM core as 
well as TTM have that restriction.

Additional to that we have dependencies through userptr etc...

>
> I thought reset was mainly internal/contained to device level as we 
> reset for multiple reasons and system as whole shouldn't be affected.

No, device reset is vital for core memory management. Otherwise the 
whole memory management concept wouldn't work.

Regards,
Christian.

>
> Thanks,
> Lijo
>
>> Regards,
>> Christian.


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2022-02-17 10:43 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-02-16  9:49 [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Somalapuram Amaranath
2022-02-16  9:49 ` [PATCH v6 2/2] drm/amdgpu: add reset register dump trace on GPU reset Somalapuram Amaranath
2022-02-16 10:11 ` [PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list Christian König
2022-02-16 13:11   ` Somalapuram, Amaranath
2022-02-16 14:56     ` Christian König
2022-02-17  7:54       ` Somalapuram, Amaranath
2022-02-17  8:00         ` Christian König
2022-02-17  9:44           ` Lazar, Lijo
2022-02-17  9:48             ` Christian König
2022-02-17 10:01               ` Lazar, Lijo
2022-02-17 10:04                 ` Christian König
2022-02-17 10:09                   ` Lazar, Lijo
2022-02-17 10:43                     ` Christian König
2022-02-16 10:15 ` Lazar, Lijo
2022-02-16 10:34   ` Somalapuram, Amaranath
2022-02-16 10:43     ` Lazar, Lijo
2022-02-16 11:00       ` Christian König
2022-02-16 11:09       ` Somalapuram, Amaranath
2022-02-16 13:17         ` Lazar, Lijo
2022-02-16 13:55           ` Somalapuram, Amaranath
2022-02-16 14:53             ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox