All of lore.kernel.org
 help / color / mirror / Atom feed
From: j.glisse@gmail.com
To: dri-devel@lists.freedesktop.org
Cc: Jerome Glisse <jglisse@redhat.com>
Subject: [PATCH 24/24] drm/radeon: add faulty command buffer dump facilities
Date: Wed, 25 Apr 2012 15:03:29 -0400	[thread overview]
Message-ID: <1335380609-5804-25-git-send-email-j.glisse@gmail.com> (raw)
In-Reply-To: <1335380609-5804-1-git-send-email-j.glisse@gmail.com>

From: Jerome Glisse <jglisse@redhat.com>

This adds a command buffer dumping facility that will
dump the command buffer, and all associated bo, that most
likely triggered a lockup.

The idea is that we go through the unsignaled fences and dump the
ib of the oldest unsignaled fence. Dumping is a 2 step process:
on lockup detection we try to allocate a big object that will
hold all of the current state (ib pm4 packets, bo contents,
relocation table). Upon reading the radeon_lockup_blob debugfs
file the user will get this big blob and the kernel will free the memory.

The kernel side tries to handle failures, such as failing to map a
bo, as gracefully as possible by not dumping such a bo. Userspace
tools thus need to have enough logic to handle such cases.

Signed-off-by: Jerome Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/radeon/radeon.h        |   14 ++++-
 drivers/gpu/drm/radeon/radeon_cs.c     |   20 ++++--
 drivers/gpu/drm/radeon/radeon_device.c |    3 +
 drivers/gpu/drm/radeon/radeon_fence.c  |   19 +++++
 drivers/gpu/drm/radeon/radeon_gart.c   |   10 ++-
 drivers/gpu/drm/radeon/radeon_ring.c   |  118 ++++++++++++++++++++++++++++++++
 6 files changed, 173 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 7b2125b..c9f51be 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -241,6 +241,7 @@ struct radeon_fence {
 int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring);
 int radeon_fence_driver_init(struct radeon_device *rdev);
 void radeon_fence_driver_fini(struct radeon_device *rdev);
+void radeon_fence_blob_faulty_ib(struct radeon_device *rdev, int ring);
 int radeon_fence_create(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
 int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence);
 void radeon_fence_process(struct radeon_device *rdev, int ring);
@@ -569,6 +570,10 @@ void radeon_irq_kms_pflip_irq_put(struct radeon_device *rdev, int crtc);
  */
 struct radeon_cs_reloc;
 
+#define RADEON_IB_TYPE_NONE		0
+#define RADEON_IB_TYPE_CS		1
+#define RADEON_IB_TYPE_CS_VM		2
+
 struct radeon_ib {
 	struct radeon_sa_bo	sa_bo;
 	uint32_t		length_dw;
@@ -579,6 +584,7 @@ struct radeon_ib {
 	bool			is_const_ib;
 	unsigned		nrelocs;
 	struct radeon_cs_reloc	*relocs;
+	unsigned		type;
 };
 
 struct radeon_ring {
@@ -745,6 +751,7 @@ int radeon_ring_init(struct radeon_device *rdev, struct radeon_ring *cp, unsigne
 		     unsigned rptr_offs, unsigned rptr_reg, unsigned wptr_reg,
 		     u32 ptr_reg_shift, u32 ptr_reg_mask, u32 nop);
 void radeon_ring_fini(struct radeon_device *rdev, struct radeon_ring *cp);
+void radeon_lockup_build_blob(struct radeon_device *rdev, struct radeon_ib *ib);
 
 
 /*
@@ -756,6 +763,7 @@ struct radeon_cs_reloc {
 	struct radeon_bo_list		lobj;
 	uint32_t			handle;
 	uint32_t			flags;
+	uint64_t			gpu_addr;
 };
 
 struct radeon_cs_chunk {
@@ -1496,6 +1504,9 @@ struct radeon_device {
 	unsigned 		debugfs_count;
 	/* virtual memory */
 	struct radeon_vm_manager	vm_manager;
+	uint32_t			*blob;
+	unsigned			blob_size_dw;
+	struct mutex			blob_mutex;
 };
 
 int radeon_device_init(struct radeon_device *rdev,
@@ -1742,7 +1753,8 @@ void radeon_vm_unbind(struct radeon_device *rdev, struct radeon_vm *vm);
 int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 			    struct radeon_vm *vm,
 			    struct radeon_bo *bo,
-			    struct ttm_mem_reg *mem);
+			    struct ttm_mem_reg *mem,
+			    uint64_t *gpu_addr);
 void radeon_vm_bo_invalidate(struct radeon_device *rdev,
 			     struct radeon_bo *bo);
 int radeon_vm_bo_add(struct radeon_device *rdev,
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index ecef708..0c0bcaa 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -334,6 +334,7 @@ static int radeon_cs_ib_chunk(struct radeon_device *rdev,
 			      struct radeon_cs_parser *parser)
 {
 	struct radeon_cs_chunk *ib_chunk;
+	unsigned i;
 	int r;
 
 	if (parser->chunk_ib_idx == -1)
@@ -369,6 +370,10 @@ static int radeon_cs_ib_chunk(struct radeon_device *rdev,
 		DRM_ERROR("Failed to synchronize rings !\n");
 	}
 	parser->ib->vm_id = 0;
+	parser->ib->type = RADEON_IB_TYPE_CS;
+	for (i = 0; i < parser->nrelocs; ++i) {
+		parser->relocs[i].gpu_addr = parser->relocs[i].lobj.gpu_offset;
+	}
 	r = radeon_ib_schedule(rdev, parser->ib);
 	if (r) {
 		DRM_ERROR("Failed to schedule IB !\n");
@@ -379,13 +384,13 @@ static int radeon_cs_ib_chunk(struct radeon_device *rdev,
 static int radeon_bo_vm_update_pte(struct radeon_cs_parser *parser,
 				   struct radeon_vm *vm)
 {
-	struct radeon_bo_list *lobj;
-	struct radeon_bo *bo;
-	int r;
+	unsigned i;
+
+	for (i = 0; i < parser->nrelocs; ++i) {
+		struct radeon_bo *bo = parser->relocs[i].robj;
+		int r;
 
-	list_for_each_entry(lobj, &parser->validated, tv.head) {
-		bo = lobj->bo;
-		r = radeon_vm_bo_update_pte(parser->rdev, vm, bo, &bo->tbo.mem);
+		r = radeon_vm_bo_update_pte(parser->rdev, vm, bo, &bo->tbo.mem, &parser->relocs[i].gpu_addr);
 		if (r) {
 			return r;
 		}
@@ -476,17 +481,18 @@ static int radeon_cs_ib_vm_chunk(struct radeon_device *rdev,
 		 * offset inside the pool bo
 		 */
 		parser->const_ib->gpu_addr = parser->const_ib->sa_bo.offset;
+		parser->ib->type = RADEON_IB_TYPE_NONE;
 		r = radeon_ib_schedule(rdev, parser->const_ib);
 		if (r)
 			goto out;
 	}
 
-	parser->ib->vm_id = vm->id;
 	/* ib pool is bind at 0 in virtual address space to gpu_addr is the
 	 * offset inside the pool bo
 	 */
 	parser->ib->gpu_addr = parser->ib->sa_bo.offset;
 	parser->ib->is_const_ib = false;
+	parser->ib->type = RADEON_IB_TYPE_CS_VM;
 	r = radeon_ib_schedule(rdev, parser->ib);
 out:
 	if (!r) {
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 5df53dd..bf327fa 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -723,6 +723,7 @@ int radeon_device_init(struct radeon_device *rdev,
 	/* mutex initialization are all done here so we
 	 * can recall function without having locking issues */
 	mutex_init(&rdev->cs_mutex);
+	mutex_init(&rdev->blob_mutex);
 	for (i = 0; i < RADEON_NUM_RINGS; ++i)
 		mutex_init(&rdev->ring[i].mutex);
 	mutex_init(&rdev->dc_hw_i2c_mutex);
@@ -984,6 +985,8 @@ int radeon_gpu_reset(struct radeon_device *rdev)
 	int r;
 	int resched;
 
+	radeon_fence_blob_faulty_ib(rdev, RADEON_RING_TYPE_GFX_INDEX);
+
 	radeon_save_bios_scratch_regs(rdev);
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 2e56101..16cbc65 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -405,6 +405,25 @@ bool radeon_fence_set_associated_ib(struct radeon_fence *fence, struct radeon_ib
 	return isset;
 }
 
+/* Blob the ib of the first emitted fence whose seq differs from the ring's. */
+void radeon_fence_blob_faulty_ib(struct radeon_device *rdev, int ring)
+{
+	struct radeon_fence *fence;
+	unsigned long irq_flags;
+	uint32_t seq;
+
+	/* NOTE(review): callee takes a mutex and vmallocs under this irq-off lock */
+	write_lock_irqsave(&rdev->fence_lock, irq_flags);
+	seq = radeon_fence_read(rdev, ring);
+	list_for_each_entry(fence, &rdev->fence_drv[ring].emitted, list) {
+		if (fence->seq != seq && fence->ib) {
+			radeon_lockup_build_blob(rdev, fence->ib);
+			break;
+		}
+	}
+	write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+}
+
 int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
 {
 	unsigned long irq_flags;
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index b4a4982..a491edf 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -435,7 +435,7 @@ retry_id:
 	vm->id = id;
 	list_add_tail(&vm->list, &rdev->vm_manager.lru_vm);
 	return radeon_vm_bo_update_pte(rdev, vm, rdev->sa_manager.bo,
-				       &rdev->sa_manager.bo->tbo.mem);
+				       &rdev->sa_manager.bo->tbo.mem, NULL);
 }
 
 /* object have to be reserved */
@@ -542,7 +542,8 @@ static u64 radeon_vm_get_addr(struct radeon_device *rdev,
 int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 			    struct radeon_vm *vm,
 			    struct radeon_bo *bo,
-			    struct ttm_mem_reg *mem)
+			    struct ttm_mem_reg *mem,
+			    uint64_t *gpu_addr)
 {
 	struct radeon_bo_va *bo_va;
 	unsigned ngpu_pages, i;
@@ -562,6 +563,9 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
 	if (bo_va->valid)
 		return 0;
 
+	if (gpu_addr) {
+		*gpu_addr = bo_va->soffset;
+	}
 	ngpu_pages = radeon_bo_ngpu_pages(bo);
 	bo_va->flags &= ~RADEON_VM_PAGE_VALID;
 	bo_va->flags &= ~RADEON_VM_PAGE_SYSTEM;
@@ -599,7 +603,7 @@ int radeon_vm_bo_rmv(struct radeon_device *rdev,
 
 	mutex_lock(&vm->mutex);
 	mutex_lock(&rdev->cs_mutex);
-	radeon_vm_bo_update_pte(rdev, vm, bo, NULL);
+	radeon_vm_bo_update_pte(rdev, vm, bo, NULL, NULL);
 	mutex_unlock(&rdev->cs_mutex);
 	list_del(&bo_va->vm_list);
 	mutex_unlock(&vm->mutex);
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index c635aad..de93ba3 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -34,6 +34,8 @@
 #include "radeon.h"
 #include "atom.h"
 
+static int radeon_debugfs_lockup_init(struct radeon_device *rdev);
+
 /*
  * IB.
  */
@@ -116,6 +118,7 @@ void radeon_ib_free(struct radeon_device *rdev, struct radeon_ib **ib)
 	}
 
 	if (destroy) {
+		tmp->type = RADEON_IB_TYPE_NONE;
 		radeon_sa_bo_free(rdev, &tmp->sa_bo);
 		radeon_fence_unref(&tmp->fence);
 		kfree(tmp->relocs);
@@ -163,6 +166,9 @@ int radeon_ib_pool_init(struct radeon_device *rdev)
 
 	DRM_INFO("radeon: ib pool ready.\n");
 	rdev->ib_pool_ready = true;
+	if (radeon_debugfs_lockup_init(rdev)) {
+		DRM_ERROR("Failed to register debugfs file for lockup blob!\n");
+	}
 	return 0;
 }
 
@@ -465,3 +471,115 @@ void radeon_ring_fini(struct radeon_device *rdev, struct radeon_ring *ring)
 		radeon_bo_unref(&ring_obj);
 	}
 }
+
+#define BLOB_TYPE_GLOBAL	0
+#define BLOB_TYPE_CS		1
+#define BLOB_TYPE_RELOC		2
+#define BLOB_TYPE_BO		3
+#define BLOB_TYPE_CS_VM		4
+
+/* Write a type/id/size header plus size dwords of ptr; returns next offset. */
+static unsigned radeon_blob_out(u32 *blob, unsigned offset, unsigned type,
+				unsigned id, unsigned size, void *ptr)
+{
+	u32 *rec = &blob[offset];
+
+	rec[0] = type;
+	rec[1] = id;
+	rec[2] = size;
+	if (ptr)
+		memcpy(&rec[3], ptr, size * 4);
+	return offset + 3 + (ptr ? size : 0);
+}
+
+/* Build rdev->blob from ib: global header, pm4 packets, relocs, bo contents. */
+void radeon_lockup_build_blob(struct radeon_device *rdev, struct radeon_ib *ib)
+{
+	unsigned i, size_dw, offset;
+
+	/* 3 header dwords per record: global, cs, relocs and one for each bo */
+	size_dw = 9 + ib->nrelocs * 4;
+	size_dw += ib->length_dw + ib->nrelocs * 3;
+	for (i = 0; i < ib->nrelocs; ++i) {
+		struct radeon_bo *bo = ib->relocs[i].robj;
+		size_dw += (radeon_bo_size(bo) >> 2);
+	}
+	mutex_lock(&rdev->blob_mutex);
+	vfree(rdev->blob);
+	/* zeroed: space of bo we fail to map below must not leak stale memory */
+	rdev->blob = vzalloc(size_dw * 4);
+	if (rdev->blob == NULL) {
+		mutex_unlock(&rdev->blob_mutex);
+		return;
+	}
+	rdev->blob_size_dw = size_dw;
+	offset = 0;
+	offset = radeon_blob_out(rdev->blob, offset, BLOB_TYPE_GLOBAL,
+				 0, size_dw, NULL);
+	if (ib->type == RADEON_IB_TYPE_CS_VM) {
+		offset = radeon_blob_out(rdev->blob, offset, BLOB_TYPE_CS_VM,
+					 0, ib->length_dw, ib->ptr);
+	} else {
+		offset = radeon_blob_out(rdev->blob, offset, BLOB_TYPE_CS,
+					 0, ib->length_dw, ib->ptr);
+	}
+	/* relocation table: handle, gpu address (low, high), flags per entry */
+	rdev->blob[offset++] = BLOB_TYPE_RELOC;
+	rdev->blob[offset++] = 0;
+	rdev->blob[offset++] = ib->nrelocs * 4;
+	for (i = 0; i < ib->nrelocs; ++i) {
+		rdev->blob[offset++] = ib->relocs[i].handle;
+		rdev->blob[offset++] = lower_32_bits(ib->relocs[i].gpu_addr);
+		rdev->blob[offset++] = upper_32_bits(ib->relocs[i].gpu_addr);
+		rdev->blob[offset++] = ib->relocs[i].flags;
+	}
+	/* bo contents; a bo that cannot be mapped is skipped, not an error */
+	for (i = 0; i < ib->nrelocs; ++i) {
+		struct radeon_bo *bo = ib->relocs[i].robj;
+		void *ptr;
+
+		if (!radeon_bo_kmap(bo, &ptr)) {
+			offset = radeon_blob_out(rdev->blob, offset, BLOB_TYPE_BO,
+						 ib->relocs[i].handle,
+						 (radeon_bo_size(bo) >> 2),
+						 ptr);
+			radeon_bo_kunmap(bo);
+		}
+	}
+	mutex_unlock(&rdev->blob_mutex);
+}
+
+#if defined(CONFIG_DEBUG_FS)
+/* debugfs show: hand the lockup blob to userspace, freeing it once copied. */
+static int radeon_debugfs_blob(struct seq_file *m, void *data)
+{
+	struct drm_info_node *node = (struct drm_info_node *) m->private;
+	struct drm_device *dev = node->minor->dev;
+	struct radeon_device *rdev = dev->dev_private;
+
+	mutex_lock(&rdev->blob_mutex);
+	/* on overflow keep the blob: seq_file retries with a bigger buffer */
+	if (rdev->blob && !seq_write(m, rdev->blob, rdev->blob_size_dw * 4)) {
+		vfree(rdev->blob);
+		rdev->blob = NULL;
+		rdev->blob_size_dw = 0;
+	}
+	mutex_unlock(&rdev->blob_mutex);
+	return 0;
+}
+
+static struct drm_info_list radeon_debugfs_ring_info_list[] = {
+	{"radeon_lockup_blob", radeon_debugfs_blob, 0, NULL},
+};
+#endif
+
+/* Register the radeon_lockup_blob debugfs file; a no-op without debugfs. */
+static int radeon_debugfs_lockup_init(struct radeon_device *rdev)
+{
+#ifndef CONFIG_DEBUG_FS
+	return 0;
+#else
+	return radeon_debugfs_add_files(rdev, radeon_debugfs_ring_info_list,
+					ARRAY_SIZE(radeon_debugfs_ring_info_list));
+#endif
+}
-- 
1.7.7.6

  parent reply	other threads:[~2012-04-25 19:04 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-04-25 19:03 Reworking of GPU reset logic + dumping j.glisse
2012-04-25 19:03 ` [PATCH 01/24] drm/radeon: remove fence/ring/ib debugfs files j.glisse
2012-04-26  9:11   ` Christian König
2012-04-26  9:18     ` David Airlie
2012-04-26 13:36       ` Jerome Glisse
2012-04-26 13:58         ` Alex Deucher
2012-04-25 19:03 ` [PATCH 02/24] drm/radeon: make radeon_gpu_is_lockup a per ring function j.glisse
2012-04-25 19:03 ` [PATCH 03/24] drm/radeon: replace gpu_lockup with ring->ready flag j.glisse
2012-04-25 19:03 ` [PATCH 04/24] drm/radeon: use central function for IB testing j.glisse
2012-04-25 19:03 ` [PATCH 05/24] drm/radeon: rework gpu lockup detection and processing j.glisse
2012-04-25 19:03 ` [PATCH 06/24] drm/radeon: fix a critical bug in the SA code j.glisse
2012-04-25 19:03 ` [PATCH 07/24] drm/radeon: add proper locking to the SA v2 j.glisse
2012-04-25 19:03 ` [PATCH 08/24] drm/radeon: add biggest hole tracking and wakequeue to the sa v4 j.glisse
2012-04-25 19:03 ` [PATCH 09/24] drm/radeon: simplify semaphore handling j.glisse
2012-04-25 19:03 ` [PATCH 10/24] drm/radeon: return -ENOENT in fence_wait_next v2 j.glisse
2012-04-25 19:03 ` [PATCH 11/24] drm/radeon: rename fence_wait_last to fence_wait_empty j.glisse
2012-04-25 19:03 ` [PATCH 12/24] drm/radeon: rip out the ib pool v2 j.glisse
2012-04-25 19:03 ` [PATCH 13/24] drm/radeon: fix a bug with the ring syncing code j.glisse
2012-04-25 19:03 ` [PATCH 14/24] drm/radeon: rework recursive gpu reset handling j.glisse
2012-04-25 19:03 ` [PATCH 15/24] drm/radeon: remove recursive mutex implementation j.glisse
2012-04-25 19:03 ` [PATCH 16/24] drm/radeon: move lockup detection code into radeon_ring.c v2 j.glisse
2012-04-25 19:03 ` [PATCH 17/24] drm/radeon: make lockup timeout a module param j.glisse
2012-04-25 19:03 ` [PATCH 18/24] drm/radeon: unlock the ring mutex while waiting for the next fence j.glisse
2012-04-25 19:03 ` [PATCH 19/24] drm/radeon: make forcing ring activity a common function j.glisse
2012-04-25 19:03 ` [PATCH 20/24] drm/radeon: remove r300_gpu_is_lockup j.glisse
2012-04-25 19:03 ` [PATCH 21/24] drm/radeon: remove cayman_gpu_is_lockup j.glisse
2012-04-25 19:03 ` [PATCH 22/24] drm/radeon: extend ring debugfs files with fence info c2 j.glisse
2012-04-25 19:03 ` [PATCH 23/24] drm/radeon: keep the cs relocs inside the ib j.glisse
2012-04-25 19:03 ` j.glisse [this message]
2012-04-25 21:53   ` [PATCH 24/24] drm/radeon: add faulty command buffer dump facilities Luca Tettamanti
2012-04-25 22:30     ` Jerome Glisse

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1335380609-5804-25-git-send-email-j.glisse@gmail.com \
    --to=j.glisse@gmail.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=jglisse@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.