[PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup()
@ 2022-12-03 22:41 Akhil P Oommen
  2022-12-03 22:41 ` [PATCH 2/4] drm/msm: Fix failure paths in msm_drm_init() Akhil P Oommen
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Akhil P Oommen @ 2022-12-03 22:41 UTC (permalink / raw)
  To: freedreno, dri-devel, linux-arm-msm
  Cc: Akhil P Oommen, Abhinav Kumar, Dan Carpenter, Daniel Vetter,
	David Airlie, Dmitry Baryshkov, Emma Anholt, Rob Clark, Sean Paul,
	linux-kernel

Fix the below kernel panic due to null pointer access:
[   18.504431] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000048
[   18.513464] Mem abort info:
[   18.516346]   ESR = 0x0000000096000005
[   18.520204]   EC = 0x25: DABT (current EL), IL = 32 bits
[   18.525706]   SET = 0, FnV = 0
[   18.528878]   EA = 0, S1PTW = 0
[   18.532117]   FSC = 0x05: level 1 translation fault
[   18.537138] Data abort info:
[   18.540110]   ISV = 0, ISS = 0x00000005
[   18.544060]   CM = 0, WnR = 0
[   18.547109] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000112826000
[   18.553738] [0000000000000048] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
[   18.562690] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
**Snip**
[   18.696758] Call trace:
[   18.699278]  adreno_gpu_cleanup+0x30/0x88
[   18.703396]  a6xx_destroy+0xc0/0x130
[   18.707066]  a6xx_gpu_init+0x308/0x424
[   18.710921]  adreno_bind+0x178/0x288
[   18.714590]  component_bind_all+0xe0/0x214
[   18.718797]  msm_drm_bind+0x1d4/0x614
[   18.722566]  try_to_bring_up_aggregate_device+0x16c/0x1b8
[   18.728105]  __component_add+0xa0/0x158
[   18.732048]  component_add+0x20/0x2c
[   18.735719]  adreno_probe+0x40/0xc0
[   18.739300]  platform_probe+0xb4/0xd4
[   18.743068]  really_probe+0xfc/0x284
[   18.746738]  __driver_probe_device+0xc0/0xec
[   18.751129]  driver_probe_device+0x48/0x110
[   18.755421]  __device_attach_driver+0xa8/0xd0
[   18.759900]  bus_for_each_drv+0x90/0xdc
[   18.763843]  __device_attach+0xfc/0x174
[   18.767786]  device_initial_probe+0x20/0x2c
[   18.772090]  bus_probe_device+0x40/0xa0
[   18.776032]  deferred_probe_work_func+0x94/0xd0
[   18.780686]  process_one_work+0x190/0x3d0
[   18.784805]  worker_thread+0x280/0x3d4
[   18.788659]  kthread+0x104/0x1c0
[   18.791981]  ret_from_fork+0x10/0x20
[   18.795654] Code: f9400408 aa0003f3 aa1f03f4 91142015 (f9402516)
[   18.801913] ---[ end trace 0000000000000000 ]---
[   18.809039] Kernel panic - not syncing: Oops: Fatal exception

Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
---

 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 382fb7f9e497..118d07e5c66c 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -1073,13 +1073,13 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 void adreno_gpu_cleanup(struct adreno_gpu *adreno_gpu)
 {
 	struct msm_gpu *gpu = &adreno_gpu->base;
-	struct msm_drm_private *priv = gpu->dev->dev_private;
+	struct msm_drm_private *priv = gpu->dev ? gpu->dev->dev_private : NULL;
 	unsigned int i;
 
 	for (i = 0; i < ARRAY_SIZE(adreno_gpu->info->fw); i++)
 		release_firmware(adreno_gpu->fw[i]);
 
-	if (pm_runtime_enabled(&priv->gpu_pdev->dev))
+	if (priv && pm_runtime_enabled(&priv->gpu_pdev->dev))
 		pm_runtime_disable(&priv->gpu_pdev->dev);
 
 	msm_gpu_cleanup(&adreno_gpu->base);
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/4] drm/msm: Fix failure paths in msm_drm_init()
  2022-12-03 22:41 [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Akhil P Oommen
@ 2022-12-03 22:41 ` Akhil P Oommen
  2022-12-03 22:41 ` [PATCH 3/4] drm/msm/a6xx: Update a6xx gpu coredump Akhil P Oommen
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Akhil P Oommen @ 2022-12-03 22:41 UTC (permalink / raw)
  To: freedreno, dri-devel, linux-arm-msm
  Cc: Akhil P Oommen, Abhinav Kumar, Daniel Vetter, David Airlie,
	Dmitry Baryshkov, Duoming Zhou, Greg Kroah-Hartman, Johannes Berg,
	Rob Clark, Sean Paul, linux-kernel

Ensure that we do drm_dev_put() when there is an early return in
msm_drm_init().

Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
---

 drivers/gpu/drm/msm/disp/msm_disp_snapshot.c |  3 +++
 drivers/gpu/drm/msm/msm_drv.c                | 11 +++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
index e75b97127c0d..b73031cd48e4 100644
--- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
+++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c
@@ -129,6 +129,9 @@ void msm_disp_snapshot_destroy(struct drm_device *drm_dev)
 	}
 
 	priv = drm_dev->dev_private;
+	if (!priv->kms)
+		return;
+
 	kms = priv->kms;
 
 	if (kms->dump_worker)
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index eb5b056ce3f7..544e041dd710 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -149,6 +149,9 @@ static void msm_irq_uninstall(struct drm_device *dev)
 	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_kms *kms = priv->kms;
 
+	if (!priv->kms)
+		return;
+
 	kms->funcs->irq_uninstall(kms);
 	if (kms->irq_requested)
 		free_irq(kms->irq, dev);
@@ -265,8 +268,6 @@ static int msm_drm_uninit(struct device *dev)
 	component_unbind_all(dev, ddev);
 
 	ddev->dev_private = NULL;
-	drm_dev_put(ddev);
-
 	destroy_workqueue(priv->wq);
 
 	return 0;
@@ -441,12 +442,12 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
 
 	ret = msm_init_vram(ddev);
 	if (ret)
-		return ret;
+		goto err_drm_dev_put;
 
 	/* Bind all our sub-components: */
 	ret = component_bind_all(dev, ddev);
 	if (ret)
-		return ret;
+		goto err_drm_dev_put;
 
 	dma_set_max_seg_size(dev, UINT_MAX);
 
@@ -541,6 +542,8 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
 
 err_msm_uninit:
 	msm_drm_uninit(dev);
+err_drm_dev_put:
+	drm_dev_put(ddev);
 	return ret;
 }
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/4] drm/msm/a6xx: Update a6xx gpu coredump
  2022-12-03 22:41 [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Akhil P Oommen
  2022-12-03 22:41 ` [PATCH 2/4] drm/msm: Fix failure paths in msm_drm_init() Akhil P Oommen
@ 2022-12-03 22:41 ` Akhil P Oommen
  2022-12-03 22:41 ` [PATCH 4/4] drm/msm/a6xx: Update ROQ size in coredump Akhil P Oommen
  2022-12-05  8:40 ` [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Dan Carpenter
  3 siblings, 0 replies; 6+ messages in thread
From: Akhil P Oommen @ 2022-12-03 22:41 UTC (permalink / raw)
  To: freedreno, dri-devel, linux-arm-msm
  Cc: Akhil P Oommen, Abhinav Kumar, Daniel Vetter, David Airlie,
	Dmitry Baryshkov, Rob Clark, Sean Paul, linux-kernel

Update gpu coredump for a660/a650 family of gpus with the extra
information available.

Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
---

 drivers/gpu/drm/msm/adreno/a6xx.xml.h       | 18 +++++++++++
 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 50 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h | 49 +++++++++++++++++++++++-----
 3 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx.xml.h b/drivers/gpu/drm/msm/adreno/a6xx.xml.h
index beea4a7fc1df..a92788019376 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx.xml.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx.xml.h
@@ -241,6 +241,9 @@ enum a6xx_shader_id {
 	A6XX_HLSQ_FRONTEND_META = 97,
 	A6XX_HLSQ_INDIRECT_META = 98,
 	A6XX_HLSQ_BACKEND_META = 99,
+	A6XX_SP_LB_6_DATA = 112,
+	A6XX_SP_LB_7_DATA = 113,
+	A6XX_HLSQ_INST_RAM_1 = 115,
 };
 
 enum a6xx_debugbus_id {
@@ -274,19 +277,32 @@ enum a6xx_debugbus_id {
 	A6XX_DBGBUS_HLSQ_SPTP = 31,
 	A6XX_DBGBUS_RB_0 = 32,
 	A6XX_DBGBUS_RB_1 = 33,
+	A6XX_DBGBUS_RB_2 = 34,
 	A6XX_DBGBUS_UCHE_WRAPPER = 36,
 	A6XX_DBGBUS_CCU_0 = 40,
 	A6XX_DBGBUS_CCU_1 = 41,
+	A6XX_DBGBUS_CCU_2 = 42,
 	A6XX_DBGBUS_VFD_0 = 56,
 	A6XX_DBGBUS_VFD_1 = 57,
 	A6XX_DBGBUS_VFD_2 = 58,
 	A6XX_DBGBUS_VFD_3 = 59,
+	A6XX_DBGBUS_VFD_4 = 60,
+	A6XX_DBGBUS_VFD_5 = 61,
 	A6XX_DBGBUS_SP_0 = 64,
 	A6XX_DBGBUS_SP_1 = 65,
+	A6XX_DBGBUS_SP_2 = 66,
 	A6XX_DBGBUS_TPL1_0 = 72,
 	A6XX_DBGBUS_TPL1_1 = 73,
 	A6XX_DBGBUS_TPL1_2 = 74,
 	A6XX_DBGBUS_TPL1_3 = 75,
+	A6XX_DBGBUS_TPL1_4 = 76,
+	A6XX_DBGBUS_TPL1_5 = 77,
+	A6XX_DBGBUS_SPTP_0 = 88,
+	A6XX_DBGBUS_SPTP_1 = 89,
+	A6XX_DBGBUS_SPTP_2 = 90,
+	A6XX_DBGBUS_SPTP_3 = 91,
+	A6XX_DBGBUS_SPTP_4 = 92,
+	A6XX_DBGBUS_SPTP_5 = 93,
 };
 
 enum a6xx_cp_perfcounter_select {
@@ -1071,6 +1087,8 @@ enum a6xx_tex_type {
 
 #define REG_A6XX_CP_MISC_CNTL					0x00000840
 
+#define REG_A6XX_CP_CHICKEN_DBG					0x00000841
+
 #define REG_A6XX_CP_APRIV_CNTL					0x00000844
 
 #define REG_A6XX_CP_ROQ_THRESHOLDS_1				0x000008c1
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
index c61b233aff09..da190b6ddba0 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
@@ -385,6 +385,9 @@ static void a6xx_get_debugbus(struct msm_gpu *gpu,
 	nr_debugbus_blocks = ARRAY_SIZE(a6xx_debugbus_blocks) +
 		(a6xx_has_gbif(to_adreno_gpu(gpu)) ? 1 : 0);
 
+	if (adreno_is_a650_family(to_adreno_gpu(gpu)))
+		nr_debugbus_blocks += ARRAY_SIZE(a650_debugbus_blocks);
+
 	a6xx_state->debugbus = state_kcalloc(a6xx_state, nr_debugbus_blocks,
 			sizeof(*a6xx_state->debugbus));
 
@@ -411,6 +414,15 @@ static void a6xx_get_debugbus(struct msm_gpu *gpu,
 
 			a6xx_state->nr_debugbus += 1;
 		}
+
+
+		if (adreno_is_a650_family(to_adreno_gpu(gpu))) {
+			for (i = 0; i < ARRAY_SIZE(a650_debugbus_blocks); i++)
+				a6xx_get_debugbus_block(gpu,
+					a6xx_state,
+					&a650_debugbus_blocks[i],
+					&a6xx_state->debugbus[i]);
+		}
 	}
 
 	/*  Dump the VBIF debugbus on applicable targets */
@@ -524,10 +536,21 @@ static void a6xx_get_cluster(struct msm_gpu *gpu,
 		struct a6xx_gpu_state_obj *obj,
 		struct a6xx_crashdumper *dumper)
 {
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	u64 *in = dumper->ptr;
 	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
 	size_t datasize;
 	int i, regcount = 0;
+	u32 id = cluster->id;
+
+	/* Skip registers that are not present on older generation */
+	if (!adreno_is_a660_family(adreno_gpu) &&
+			cluster->registers == a660_fe_cluster)
+		return;
+
+	if (adreno_is_a650_family(adreno_gpu) &&
+			cluster->registers == a6xx_ps_cluster)
+		id = CLUSTER_VPC_PS;
 
 	/* Some clusters need a selector register to be programmed too */
 	if (cluster->sel_reg)
@@ -537,7 +560,7 @@ static void a6xx_get_cluster(struct msm_gpu *gpu,
 		int j;
 
 		in += CRASHDUMP_WRITE(in, REG_A6XX_CP_APERTURE_CNTL_CD,
-			(cluster->id << 8) | (i << 4) | i);
+			(id << 8) | (i << 4) | i);
 
 		for (j = 0; j < cluster->count; j += 2) {
 			int count = RANGE(cluster->registers, j);
@@ -687,6 +710,11 @@ static void a6xx_get_crashdumper_registers(struct msm_gpu *gpu,
 	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
 	int i, regcount = 0;
 
+	/* Skip unsupported registers on older generations */
+	if (!adreno_is_a660_family(to_adreno_gpu(gpu)) &&
+			(regs->registers == a660_registers))
+		return;
+
 	/* Some blocks might need to program a selector register first */
 	if (regs->val0)
 		in += CRASHDUMP_WRITE(in, regs->val0, regs->val1);
@@ -721,6 +749,11 @@ static void a6xx_get_ahb_gpu_registers(struct msm_gpu *gpu,
 {
 	int i, regcount = 0, index = 0;
 
+	/* Skip unsupported registers on older generations */
+	if (!adreno_is_a660_family(to_adreno_gpu(gpu)) &&
+			(regs->registers == a660_registers))
+		return;
+
 	for (i = 0; i < regs->count; i += 2)
 		regcount += RANGE(regs->registers, i);
 
@@ -943,6 +976,21 @@ static void a6xx_get_indexed_registers(struct msm_gpu *gpu,
 		a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_indexed_reglist[i],
 			&a6xx_state->indexed_regs[i]);
 
+	if (adreno_is_a650_family(to_adreno_gpu(gpu))) {
+		u32 val;
+
+		val = gpu_read(gpu, REG_A6XX_CP_CHICKEN_DBG);
+		gpu_write(gpu, REG_A6XX_CP_CHICKEN_DBG, val | 4);
+
+		/* Get the contents of the CP mempool */
+		a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_cp_mempool_indexed,
+			&a6xx_state->indexed_regs[i]);
+
+		gpu_write(gpu, REG_A6XX_CP_CHICKEN_DBG, val);
+		a6xx_state->nr_indexed_regs = count;
+		return;
+	}
+
 	/* Set the CP mempool size to 0 to stabilize it while dumping */
 	mempool_size = gpu_read(gpu, REG_A6XX_CP_MEM_POOL_SIZE);
 	gpu_write(gpu, REG_A6XX_CP_MEM_POOL_SIZE, 0);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
index 2fb58b7098e4..808121c88662 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
@@ -36,16 +36,21 @@ static const u32 a6xx_fe_cluster[] = {
 	0xa00e, 0xa0ef, 0xa0f8, 0xa0f8,
 };
 
+static const u32 a660_fe_cluster[] = {
+	0x9807, 0x9807,
+};
+
 static const u32 a6xx_pc_vs_cluster[] = {
 	0x9100, 0x9108, 0x9300, 0x9306, 0x9980, 0x9981, 0x9b00, 0x9b07,
 };
 
-#define CLUSTER_FE    0
-#define CLUSTER_SP_VS 1
-#define CLUSTER_PC_VS 2
-#define CLUSTER_GRAS  3
-#define CLUSTER_SP_PS 4
-#define CLUSTER_PS    5
+#define CLUSTER_FE	0
+#define CLUSTER_SP_VS	1
+#define CLUSTER_PC_VS	2
+#define CLUSTER_GRAS	3
+#define CLUSTER_SP_PS	4
+#define CLUSTER_PS	5
+#define CLUSTER_VPC_PS	6
 
 #define CLUSTER(_id, _reg, _sel_reg, _sel_val) \
 	{ .id = _id, .name = #_id,\
@@ -67,6 +72,7 @@ static const struct a6xx_cluster {
 	CLUSTER(CLUSTER_PS, a6xx_ps_cluster, 0, 0),
 	CLUSTER(CLUSTER_FE, a6xx_fe_cluster, 0, 0),
 	CLUSTER(CLUSTER_PC_VS, a6xx_pc_vs_cluster, 0, 0),
+	CLUSTER(CLUSTER_FE, a660_fe_cluster, 0, 0),
 };
 
 static const u32 a6xx_sp_vs_hlsq_cluster[] = {
@@ -105,7 +111,7 @@ static const u32 a6xx_sp_ps_hlsq_2d_cluster[] = {
 
 static const u32 a6xx_sp_ps_sp_cluster[] = {
 	0xa980, 0xa9a8, 0xa9b0, 0xa9bc, 0xa9d0, 0xa9d3, 0xa9e0, 0xa9f3,
-	0xaa00, 0xaa00, 0xaa30, 0xaa31,
+	0xaa00, 0xaa00, 0xaa30, 0xaa31, 0xaaf2, 0xaaf2,
 };
 
 static const u32 a6xx_sp_ps_sp_2d_cluster[] = {
@@ -229,6 +235,9 @@ static const struct a6xx_shader_block {
 	SHADER(A6XX_HLSQ_DATAPATH_META, 0x40),
 	SHADER(A6XX_HLSQ_FRONTEND_META, 0x40),
 	SHADER(A6XX_HLSQ_INDIRECT_META, 0x40),
+	SHADER(A6XX_SP_LB_6_DATA, 0x200),
+	SHADER(A6XX_SP_LB_7_DATA, 0x200),
+	SHADER(A6XX_HLSQ_INST_RAM_1, 0x200),
 };
 
 static const u32 a6xx_rb_rac_registers[] = {
@@ -251,7 +260,7 @@ static const u32 a6xx_registers[] = {
 	0x0540, 0x0555,
 	/* CP */
 	0x0800, 0x0808, 0x0810, 0x0813, 0x0820, 0x0821, 0x0823, 0x0824,
-	0x0826, 0x0827, 0x0830, 0x0833, 0x0840, 0x0843, 0x084f, 0x086f,
+	0x0826, 0x0827, 0x0830, 0x0833, 0x0840, 0x0845, 0x084f, 0x086f,
 	0x0880, 0x088a, 0x08a0, 0x08ab, 0x08c0, 0x08c4, 0x08d0, 0x08dd,
 	0x08f0, 0x08f3, 0x0900, 0x0903, 0x0908, 0x0911, 0x0928, 0x093e,
 	0x0942, 0x094d, 0x0980, 0x0984, 0x098d, 0x0996, 0x0998, 0x099e,
@@ -274,6 +283,13 @@ static const u32 a6xx_registers[] = {
 	/* VFD */
 	0xa600, 0xa601, 0xa603, 0xa603, 0xa60a, 0xa60a, 0xa610, 0xa617,
 	0xa630, 0xa630,
+	/* HLSQ */
+	0xd002, 0xd003,
+};
+
+static const u32 a660_registers[] = {
+	/* UCHE */
+	0x0e3c, 0x0e3c,
 };
 
 #define REGS(_array, _sel_reg, _sel_val) \
@@ -282,6 +298,7 @@ static const u32 a6xx_registers[] = {
 
 static const struct a6xx_registers a6xx_reglist[] = {
 	REGS(a6xx_registers, 0, 0),
+	REGS(a660_registers, 0, 0),
 	REGS(a6xx_rb_rac_registers, REG_A6XX_RB_RB_SUB_BLOCK_SEL_CNTL_CD, 0),
 	REGS(a6xx_rb_rbp_registers, REG_A6XX_RB_RB_SUB_BLOCK_SEL_CNTL_CD, 9),
 };
@@ -443,4 +460,20 @@ static const struct a6xx_debugbus_block a6xx_cx_debugbus_blocks[] = {
 	DEBUGBUS(A6XX_DBGBUS_CX, 0x100),
 };
 
+static const struct a6xx_debugbus_block a650_debugbus_blocks[] = {
+	DEBUGBUS(A6XX_DBGBUS_RB_2, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_CCU_2, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_VFD_4, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_VFD_5, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_SP_2, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_TPL1_4, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_TPL1_5, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_SPTP_0, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_SPTP_1, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_SPTP_2, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_SPTP_3, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_SPTP_4, 0x100),
+	DEBUGBUS(A6XX_DBGBUS_SPTP_5, 0x100),
+};
+
 #endif
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/4] drm/msm/a6xx: Update ROQ size in coredump
  2022-12-03 22:41 [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Akhil P Oommen
  2022-12-03 22:41 ` [PATCH 2/4] drm/msm: Fix failure paths in msm_drm_init() Akhil P Oommen
  2022-12-03 22:41 ` [PATCH 3/4] drm/msm/a6xx: Update a6xx gpu coredump Akhil P Oommen
@ 2022-12-03 22:41 ` Akhil P Oommen
  2022-12-05  8:40 ` [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Dan Carpenter
  3 siblings, 0 replies; 6+ messages in thread
From: Akhil P Oommen @ 2022-12-03 22:41 UTC (permalink / raw)
  To: freedreno, dri-devel, linux-arm-msm
  Cc: Akhil P Oommen, Abhinav Kumar, Daniel Vetter, David Airlie,
	Dmitry Baryshkov, Rob Clark, Sean Paul, linux-kernel

Since the RoQ size differs between generations, calculate it dynamically
while capturing the coredump.

Signed-off-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
---

 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 11 ++++++++++-
 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h | 17 ++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
index da190b6ddba0..80e60e34ce7d 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
@@ -939,15 +939,24 @@ static void a6xx_get_registers(struct msm_gpu *gpu,
 			dumper);
 }
 
+static u32 a6xx_get_cp_roq_size(struct msm_gpu *gpu)
+{
+	/* The value at [16:31] is in 4dword units. Convert it to dwords */
+	return gpu_read(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2) >> 14;
+}
+
 /* Read a block of data from an indexed register pair */
 static void a6xx_get_indexed_regs(struct msm_gpu *gpu,
 		struct a6xx_gpu_state *a6xx_state,
-		const struct a6xx_indexed_registers *indexed,
+		struct a6xx_indexed_registers *indexed,
 		struct a6xx_gpu_state_obj *obj)
 {
 	int i;
 
 	obj->handle = (const void *) indexed;
+	if (indexed->count_fn)
+		indexed->count = indexed->count_fn(gpu);
+
 	obj->data = state_kcalloc(a6xx_state, indexed->count, sizeof(u32));
 	if (!obj->data)
 		return;
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
index 808121c88662..790f55e24533 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
@@ -383,25 +383,28 @@ static const struct a6xx_registers a6xx_gmu_reglist[] = {
 	REGS(a6xx_gmu_gx_registers, 0, 0),
 };
 
-static const struct a6xx_indexed_registers {
+static u32 a6xx_get_cp_roq_size(struct msm_gpu *gpu);
+
+static struct a6xx_indexed_registers {
 	const char *name;
 	u32 addr;
 	u32 data;
 	u32 count;
+	u32 (*count_fn)(struct msm_gpu *gpu);
 } a6xx_indexed_reglist[] = {
 	{ "CP_SQE_STAT", REG_A6XX_CP_SQE_STAT_ADDR,
-		REG_A6XX_CP_SQE_STAT_DATA, 0x33 },
+		REG_A6XX_CP_SQE_STAT_DATA, 0x33, NULL },
 	{ "CP_DRAW_STATE", REG_A6XX_CP_DRAW_STATE_ADDR,
-		REG_A6XX_CP_DRAW_STATE_DATA, 0x100 },
+		REG_A6XX_CP_DRAW_STATE_DATA, 0x100, NULL },
 	{ "CP_UCODE_DBG_DATA", REG_A6XX_CP_SQE_UCODE_DBG_ADDR,
-		REG_A6XX_CP_SQE_UCODE_DBG_DATA, 0x6000 },
+		REG_A6XX_CP_SQE_UCODE_DBG_DATA, 0x8000, NULL },
 	{ "CP_ROQ", REG_A6XX_CP_ROQ_DBG_ADDR,
-		REG_A6XX_CP_ROQ_DBG_DATA, 0x400 },
+		REG_A6XX_CP_ROQ_DBG_DATA, 0, a6xx_get_cp_roq_size},
 };
 
-static const struct a6xx_indexed_registers a6xx_cp_mempool_indexed = {
+static struct a6xx_indexed_registers a6xx_cp_mempool_indexed = {
 	"CP_MEMPOOL", REG_A6XX_CP_MEM_POOL_DBG_ADDR,
-		REG_A6XX_CP_MEM_POOL_DBG_DATA, 0x2060,
+		REG_A6XX_CP_MEM_POOL_DBG_DATA, 0x2060, NULL,
 };
 
 #define DEBUGBUS(_id, _count) { .id = _id, .name = #_id, .count = _count }
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup()
  2022-12-03 22:41 [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Akhil P Oommen
                   ` (2 preceding siblings ...)
  2022-12-03 22:41 ` [PATCH 4/4] drm/msm/a6xx: Update ROQ size in coredump Akhil P Oommen
@ 2022-12-05  8:40 ` Dan Carpenter
  2022-12-06 20:30   ` Akhil P Oommen
  3 siblings, 1 reply; 6+ messages in thread
From: Dan Carpenter @ 2022-12-05  8:40 UTC (permalink / raw)
  To: Akhil P Oommen, Jonathan Marek
  Cc: freedreno, dri-devel, linux-arm-msm, Abhinav Kumar, Daniel Vetter,
	David Airlie, Dmitry Baryshkov, Emma Anholt, Rob Clark, Sean Paul,
	linux-kernel

On Sun, Dec 04, 2022 at 04:11:41AM +0530, Akhil P Oommen wrote:
> Fix the below kernel panic due to null pointer access:
> [   18.504431] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000048
> [   18.513464] Mem abort info:
> [   18.516346]   ESR = 0x0000000096000005
> [   18.520204]   EC = 0x25: DABT (current EL), IL = 32 bits
> [   18.525706]   SET = 0, FnV = 0
> [   18.528878]   EA = 0, S1PTW = 0
> [   18.532117]   FSC = 0x05: level 1 translation fault
> [   18.537138] Data abort info:
> [   18.540110]   ISV = 0, ISS = 0x00000005
> [   18.544060]   CM = 0, WnR = 0
> [   18.547109] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000112826000
> [   18.553738] [0000000000000048] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
> [   18.562690] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
> **Snip**
> [   18.696758] Call trace:
> [   18.699278]  adreno_gpu_cleanup+0x30/0x88
> [   18.703396]  a6xx_destroy+0xc0/0x130
> [   18.707066]  a6xx_gpu_init+0x308/0x424

Fixes: 17e822f7591f ("drm/msm: fix unbalanced pm_runtime_enable in adreno_gpu_{init, cleanup}")

Let's add Jonathan to the CC list so he can Ack your patch.

Although the real issue is that a6xx_gpu_init has bad error handling.

The a6xx_destroy() function is supposed to free *everything* so then the
question becomes how do we avoid freeing something which was not
allocated?  With normal kernel style we just free things one by one
in the reverse order from how they were allocated.  See my blog for more
details:
https://staticthinking.wordpress.com/2022/04/28/free-the-last-thing-style/

However this code is written in One Function Frees Everything Style
which is difficult to review and prone to bugs.  The common mistakes are
the kind of NULL dereference that you've seen, double frees, and missing
frees.

The only way to read this code is to open a new text editor window and
line up the allocations with the frees.

  1725  static void a6xx_destroy(struct msm_gpu *gpu)
  1726  {
  1727          struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
  1728          struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
  1729  
  1730          if (a6xx_gpu->sqe_bo) {
  1731                  msm_gem_unpin_iova(a6xx_gpu->sqe_bo, gpu->aspace);
  1732                  drm_gem_object_put(a6xx_gpu->sqe_bo);

These unpin/put must be done together and should be in their own
function.  The ->sqe_bo pointer is allocated in a6xx_ucode_init().  It's
assigned to an error pointer, but then set to NULL on error or after a
free.  So this is okay.

  1733          }
  1734  
  1735          if (a6xx_gpu->shadow_bo) {

->shadow_bo is allocated in hw_init().  Should there be a call to
msm_gem_put_vaddr(a6xx_gpu->shadow)?  It's unclear.  [QUESTION #1]

  1736                  msm_gem_unpin_iova(a6xx_gpu->shadow_bo, gpu->aspace);
  1737                  drm_gem_object_put(a6xx_gpu->shadow_bo);
  1738          }
  1739  
  1740          a6xx_llc_slices_destroy(a6xx_gpu);

This has IS_ERR_OR_NULL() checks so it's okay.

  1741  
  1742          a6xx_gmu_remove(a6xx_gpu);

This uses a gmu->initialized flag which allows it to safely clean up
everything.  Fine.

  1743  
  1744          adreno_gpu_cleanup(adreno_gpu);

This function has the bug that you identified.  Let's dig into it.
(With normal kernel error handling you can read the error handling by
looking at the label name but with this style we need to jump around and
compare code from different files).

  1745  
  1746          kfree(a6xx_gpu);
  1747  }

drivers/gpu/drm/msm/adreno/adreno_gpu.c
  1079  void adreno_gpu_cleanup(struct adreno_gpu *adreno_gpu)
  1080  {
  1081          struct msm_gpu *gpu = &adreno_gpu->base;
  1082          struct msm_drm_private *priv = gpu->dev->dev_private;
  1083          unsigned int i;
  1084  
  1085          for (i = 0; i < ARRAY_SIZE(adreno_gpu->info->fw); i++)
  1086                  release_firmware(adreno_gpu->fw[i]);

This is okay.  ->fw[i] is either valid or NULL and releasing a NULL is
fine.

  1087  
  1088          if (pm_runtime_enabled(&priv->gpu_pdev->dev))

This is the bug you found.

  1089                  pm_runtime_disable(&priv->gpu_pdev->dev);
  1090  
  1091          msm_gpu_cleanup(&adreno_gpu->base);

Let's dig into msm_gpu_cleanup().

  1092  }

drivers/gpu/drm/msm/msm_gpu.c
  1006  void msm_gpu_cleanup(struct msm_gpu *gpu)
  1007  {
  1008          int i;
  1009  
  1010          DBG("%s", gpu->name);
  1011  
  1012          for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) {
  1013                  msm_ringbuffer_destroy(gpu->rb[i]);

Destroying an error pointer is fine so this is okay.

  1014                  gpu->rb[i] = NULL;
  1015          }
  1016  
  1017          msm_gem_kernel_put(gpu->memptrs_bo, gpu->aspace);
                                                    ^^^^^^^^^^^
[QUESTION #2] Passing an error pointer here will trigger a stack trace
so this is a bug.  The drivers->aspace pointer is allocated in
msm_gpu_create_private_address_space()

drivers/gpu/drm/msm/msm_gpu.c
   824  struct msm_gem_address_space *
   825  msm_gpu_create_private_address_space(struct msm_gpu *gpu, struct task_struct *task)
   826  {
   827          struct msm_gem_address_space *aspace = NULL;
   828          if (!gpu)
   829                  return NULL;
   830  
   831          /*
   832           * If the target doesn't support private address spaces then return
   833           * the global one
   834           */
   835          if (gpu->funcs->create_private_address_space) {
   836                  aspace = gpu->funcs->create_private_address_space(gpu);
   837                  if (!IS_ERR(aspace))
   838                          aspace->pid = get_pid(task_pid(task));
   839          }
   840  
   841          if (IS_ERR_OR_NULL(aspace))
                    ^^^^^^^^^^^^^^^^^^^^^^
[QUESTION #3] This check seems reversed.  It should be "if *NOT* error
or null".

   842                  aspace = msm_gem_address_space_get(gpu->aspace);
   843  
   844          return aspace;
   845  }

regards,
dan carpenter



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup()
  2022-12-05  8:40 ` [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Dan Carpenter
@ 2022-12-06 20:30   ` Akhil P Oommen
  0 siblings, 0 replies; 6+ messages in thread
From: Akhil P Oommen @ 2022-12-06 20:30 UTC (permalink / raw)
  To: Dan Carpenter, Jonathan Marek
  Cc: freedreno, dri-devel, linux-arm-msm, Abhinav Kumar, Daniel Vetter,
	David Airlie, Dmitry Baryshkov, Emma Anholt, Rob Clark, Sean Paul,
	linux-kernel

On 12/5/2022 2:10 PM, Dan Carpenter wrote:
> On Sun, Dec 04, 2022 at 04:11:41AM +0530, Akhil P Oommen wrote:
>> Fix the below kernel panic due to null pointer access:
>> [   18.504431] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000048
>> [   18.513464] Mem abort info:
>> [   18.516346]   ESR = 0x0000000096000005
>> [   18.520204]   EC = 0x25: DABT (current EL), IL = 32 bits
>> [   18.525706]   SET = 0, FnV = 0
>> [   18.528878]   EA = 0, S1PTW = 0
>> [   18.532117]   FSC = 0x05: level 1 translation fault
>> [   18.537138] Data abort info:
>> [   18.540110]   ISV = 0, ISS = 0x00000005
>> [   18.544060]   CM = 0, WnR = 0
>> [   18.547109] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000112826000
>> [   18.553738] [0000000000000048] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
>> [   18.562690] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
>> **Snip**
>> [   18.696758] Call trace:
>> [   18.699278]  adreno_gpu_cleanup+0x30/0x88
>> [   18.703396]  a6xx_destroy+0xc0/0x130
>> [   18.707066]  a6xx_gpu_init+0x308/0x424
> Fixes: 17e822f7591f ("drm/msm: fix unbalanced pm_runtime_enable in adreno_gpu_{init, cleanup}")
>
> Let's add Jonathan to the CC list so he can Ack your patch.
Thanks, will post a v2.

-Akhil.
>
> Although the real issue is that a6xx_gpu_init has bad error handling.
>
> The a6xx_destroy() function supposed to free *everything* so then the
> question becomes how do we avoid freeing something which was not
> allocated?  With normal kernel style we just free things one by one
> in the reverse order from how they were allocated.  See my blog for more
> details:
> https://staticthinking.wordpress.com/2022/04/28/free-the-last-thing-style/
Nice post. Thanks for sharing.
>
> However this code is written in One Function Frees Everything Style
> which is difficult to review and prone to bugs.  The common mistakes are
> the kind of NULL dereference that you've seen, double frees, and missing
> frees.
>
> The only way to read this code is to open a new text editor window and
> line up the allocations with the frees.
>
>   1725  static void a6xx_destroy(struct msm_gpu *gpu)
>   1726  {
>   1727          struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>   1728          struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>   1729  
>   1730          if (a6xx_gpu->sqe_bo) {
>   1731                  msm_gem_unpin_iova(a6xx_gpu->sqe_bo, gpu->aspace);
>   1732                  drm_gem_object_put(a6xx_gpu->sqe_bo);
>
> These unpin/put must be done together and should be in their own
> function.  The ->sqe_bo pointer is allocated in a6xx_ucode_init().  It's
> assigned to an error pointer, but then set to NULL on error or after a
> free.  So this is okay.
Agree. This warrants a helper function. I count 11 instances where it will be useful.

>
>   1733          }
>   1734  
>   1735          if (a6xx_gpu->shadow_bo) {
>
> ->shadow_bo is allocated in hw_init().  Should there be a call to
> msm_gem_put_vaddr(a6xx_gpu->shadow)?  It's unclear.  [QUESTION #1]
Yes. This should be freed with msm_gem_kernel_put() which takes care of freeing vaddr.
>
>   1736                  msm_gem_unpin_iova(a6xx_gpu->shadow_bo, gpu->aspace);
>   1737                  drm_gem_object_put(a6xx_gpu->shadow_bo);
>   1738          }
>   1739  
>   1740          a6xx_llc_slices_destroy(a6xx_gpu);
>
> This has IS_ERR_OR_NULL() checks so it's okay.
>
>   1741  
>   1742          a6xx_gmu_remove(a6xx_gpu);
>
> This uses a gmu->initialized flag which allows it to safely clean up
> everything.  Fine.
>
>   1743  
>   1744          adreno_gpu_cleanup(adreno_gpu);
>
> This function has the bug that you identified.  Let's dig into it.
> (With normal kernel error handling you can read the error handling by
> looking at the label name but with this style we need to jump around and
> compare code from different files).
>
>   1745  
>   1746          kfree(a6xx_gpu);
>   1747  }
>
> drivers/gpu/drm/msm/adreno/adreno_gpu.c
>   1079  void adreno_gpu_cleanup(struct adreno_gpu *adreno_gpu)
>   1080  {
>   1081          struct msm_gpu *gpu = &adreno_gpu->base;
>   1082          struct msm_drm_private *priv = gpu->dev->dev_private;
>   1083          unsigned int i;
>   1084  
>   1085          for (i = 0; i < ARRAY_SIZE(adreno_gpu->info->fw); i++)
>   1086                  release_firmware(adreno_gpu->fw[i]);
>
> This is okay.  ->fw[i] is either valid or NULL and releasing a NULL is
> fine.
>
>   1087  
>   1088          if (pm_runtime_enabled(&priv->gpu_pdev->dev))
>
> This is the bug you found.
>
>   1089                  pm_runtime_disable(&priv->gpu_pdev->dev);
>   1090  
>   1091          msm_gpu_cleanup(&adreno_gpu->base);
>
> Let's dig into msm_gpu_cleanup().
>
>   1092  }
>
> drivers/gpu/drm/msm/msm_gpu.c
>   1006  void msm_gpu_cleanup(struct msm_gpu *gpu)
>   1007  {
>   1008          int i;
>   1009  
>   1010          DBG("%s", gpu->name);
>   1011  
>   1012          for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) {
>   1013                  msm_ringbuffer_destroy(gpu->rb[i]);
>
> Destroying an error pointer is fine so this is okay.
>
>   1014                  gpu->rb[i] = NULL;
>   1015          }
>   1016  
>   1017          msm_gem_kernel_put(gpu->memptrs_bo, gpu->aspace);
>                                                     ^^^^^^^^^^^
> [QUESTION #2] Passing an error pointer here will trigger a stack trace
> so this is bug.  The drivers->aspace pointer is allocted in
> msm_gpu_create_private_address_space()
No. aspace is allocated using gpu->funcs->create_address_space() from msm_gpu_init(). If memptrs_bo is not NULL, then 'aspace' should be valid. So we don't have any issue here.
>
> drivers/gpu/drm/msm/msm_gpu.c
>    824  struct msm_gem_address_space *
>    825  msm_gpu_create_private_address_space(struct msm_gpu *gpu, struct task_struct *task)
>    826  {
>    827          struct msm_gem_address_space *aspace = NULL;
>    828          if (!gpu)
>    829                  return NULL;
>    830  
>    831          /*
>    832           * If the target doesn't support private address spaces then return
>    833           * the global one
>    834           */
>    835          if (gpu->funcs->create_private_address_space) {
>    836                  aspace = gpu->funcs->create_private_address_space(gpu);
>    837                  if (!IS_ERR(aspace))
>    838                          aspace->pid = get_pid(task_pid(task));
>    839          }
>    840  
>    841          if (IS_ERR_OR_NULL(aspace))
>                     ^^^^^^^^^^^^^^^^^^^^^^
> [QUESTION #3] This check seems reversed.  It should be if *NOT* is error
> or null.
You are confused about global aspace (which is attached to ttbr1 in gpu smmu) and private aspace (which is separate for each process and it is attached to ttbr0). Here the logic is that when private aspace creation fails, we fall back to global aspace. So this logic is correct.

-Akhil.
>
>    842                  aspace = msm_gem_address_space_get(gpu->aspace);
>    843  
>    844          return aspace;
>    845  }
>
> regards,
> dan carpenter
>
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-12-06 20:31 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-12-03 22:41 [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Akhil P Oommen
2022-12-03 22:41 ` [PATCH 2/4] drm/msm: Fix failure paths in msm_drm_init() Akhil P Oommen
2022-12-03 22:41 ` [PATCH 3/4] drm/msm/a6xx: Update a6xx gpu coredump Akhil P Oommen
2022-12-03 22:41 ` [PATCH 4/4] drm/msm/a6xx: Update ROQ size in coredump Akhil P Oommen
2022-12-05  8:40 ` [PATCH 1/4] drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup() Dan Carpenter
2022-12-06 20:30   ` Akhil P Oommen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox