From: Oded Gabbay <oded.gabbay@gmail.com>
To: gregkh@linuxfoundation.org, linux-kernel@vger.kernel.org
Cc: Tomer Tayar <ttayar@habana.ai>
Subject: [PATCH 01/15] habanalabs: Dissociate RAZWI info from event types
Date: Thu, 28 Feb 2019 10:46:10 +0200 [thread overview]
Message-ID: <20190228084624.25288-2-oded.gabbay@gmail.com> (raw)
In-Reply-To: <20190228084624.25288-1-oded.gabbay@gmail.com>
From: Tomer Tayar <ttayar@habana.ai>
This patch provides a workaround for a H/W bug in the RAZWI logger in
Goya. The logger doesn't recognize the initiator correctly and, as a
result, accesses from one initiator are reported as if they were coming
from a different initiator.
The workaround (WA) is to print the error information from the event entries
we receive, without relying on the initiator ID recorded by the RAZWI logger.
Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
drivers/misc/habanalabs/goya/goya.c | 227 ++++++++++++++++------------
1 file changed, 127 insertions(+), 100 deletions(-)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 54218f147627..447d907bddf3 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -111,29 +111,6 @@ static u16 goya_packet_sizes[MAX_PACKET_ID] = {
[PACKET_STOP] = sizeof(struct packet_stop)
};
-static const char *goya_axi_name[GOYA_MAX_INITIATORS] = {
- "MME0",
- "MME1",
- "MME2",
- "MME3",
- "MME4",
- "MME5",
- "TPC0",
- "TPC1",
- "TPC2",
- "TPC3",
- "TPC4",
- "TPC5",
- "TPC6",
- "TPC7",
- "PCI",
- "DMA", /* HBW */
- "DMA", /* LBW */
- "PSOC",
- "CPU",
- "MMU"
-};
-
static u64 goya_mmu_regs[GOYA_MMU_REGS_NUM] = {
mmDMA_QM_0_GLBL_NON_SECURE_PROPS,
mmDMA_QM_1_GLBL_NON_SECURE_PROPS,
@@ -4554,111 +4531,161 @@ static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val)
(addr - goya->ddr_bar_cur_addr));
}
-static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
- u16 event_type, char *axi_name, int len)
+static const char *_goya_get_event_desc(u16 event_type)
{
- if (!strcmp(goya_axi_name[agent_id], "DMA"))
- if (event_type >= GOYA_ASYNC_EVENT_ID_DMA0_CH)
- snprintf(axi_name, len, "DMA %d",
- event_type - GOYA_ASYNC_EVENT_ID_DMA0_CH);
- else
- snprintf(axi_name, len, "DMA %d",
- event_type - GOYA_ASYNC_EVENT_ID_DMA0_QM);
- else
- snprintf(axi_name, len, "%s", goya_axi_name[agent_id]);
+ switch (event_type) {
+ case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
+ return "PCIe_dec";
+ case GOYA_ASYNC_EVENT_ID_TPC0_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC1_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC2_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC3_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC4_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC5_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC6_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC7_DEC:
+ return "TPC%d_dec";
+ case GOYA_ASYNC_EVENT_ID_MME_WACS:
+ return "MME_wacs";
+ case GOYA_ASYNC_EVENT_ID_MME_WACSD:
+ return "MME_wacsd";
+ case GOYA_ASYNC_EVENT_ID_CPU_AXI_SPLITTER:
+ return "CPU_axi_splitter";
+ case GOYA_ASYNC_EVENT_ID_PSOC_AXI_DEC:
+ return "PSOC_axi_dec";
+ case GOYA_ASYNC_EVENT_ID_PSOC:
+ return "PSOC";
+ case GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR:
+ return "TPC%d_krn_err";
+ case GOYA_ASYNC_EVENT_ID_TPC0_CMDQ ... GOYA_ASYNC_EVENT_ID_TPC7_CMDQ:
+ return "TPC%d_cq";
+ case GOYA_ASYNC_EVENT_ID_TPC0_QM ... GOYA_ASYNC_EVENT_ID_TPC7_QM:
+ return "TPC%d_qm";
+ case GOYA_ASYNC_EVENT_ID_MME_QM:
+ return "MME_qm";
+ case GOYA_ASYNC_EVENT_ID_MME_CMDQ:
+ return "MME_cq";
+ case GOYA_ASYNC_EVENT_ID_DMA0_QM ... GOYA_ASYNC_EVENT_ID_DMA4_QM:
+ return "DMA%d_qm";
+ case GOYA_ASYNC_EVENT_ID_DMA0_CH ... GOYA_ASYNC_EVENT_ID_DMA4_CH:
+ return "DMA%d_ch";
+ default:
+ return "N/A";
+ }
}
-static void goya_print_razwi_info(struct hl_device *hdev, u64 reg,
- bool is_hbw, bool is_read, u16 event_type)
+static void goya_get_event_desc(u16 event_type, char *desc, size_t size)
{
- u32 val, agent_id;
- char axi_name[10] = {0};
-
- val = RREG32(reg);
+ u8 index;
- if (is_hbw)
- agent_id = (val & GOYA_IRQ_HBW_AGENT_ID_MASK) >>
- GOYA_IRQ_HBW_AGENT_ID_SHIFT;
- else
- agent_id = (val & GOYA_IRQ_LBW_AGENT_ID_MASK) >>
- GOYA_IRQ_LBW_AGENT_ID_SHIFT;
-
- if (agent_id >= GOYA_MAX_INITIATORS) {
- dev_err(hdev->dev,
- "Illegal %s %s with wrong initiator id %d, H/W IRQ %d\n",
- is_read ? "read from" : "write to",
- is_hbw ? "HBW" : "LBW",
- agent_id,
- event_type);
- } else {
- goya_get_axi_name(hdev, agent_id, event_type, axi_name,
- sizeof(axi_name));
- dev_err(hdev->dev, "Illegal %s by %s %s %s, H/W IRQ %d\n",
- is_read ? "read" : "write",
- axi_name,
- is_read ? "from" : "to",
- is_hbw ? "HBW" : "LBW",
- event_type);
+ switch (event_type) {
+ case GOYA_ASYNC_EVENT_ID_TPC0_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC1_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC2_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC3_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC4_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC5_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC6_DEC:
+ case GOYA_ASYNC_EVENT_ID_TPC7_DEC:
+ index = (event_type - GOYA_ASYNC_EVENT_ID_TPC0_DEC) / 3;
+ snprintf(desc, size, _goya_get_event_desc(event_type), index);
+ break;
+ case GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR:
+ case GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR:
+ index = (event_type - GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR) / 10;
+ snprintf(desc, size, _goya_get_event_desc(event_type), index);
+ break;
+ case GOYA_ASYNC_EVENT_ID_TPC0_CMDQ ... GOYA_ASYNC_EVENT_ID_TPC7_CMDQ:
+ index = event_type - GOYA_ASYNC_EVENT_ID_TPC0_CMDQ;
+ snprintf(desc, size, _goya_get_event_desc(event_type), index);
+ break;
+ case GOYA_ASYNC_EVENT_ID_TPC0_QM ... GOYA_ASYNC_EVENT_ID_TPC7_QM:
+ index = event_type - GOYA_ASYNC_EVENT_ID_TPC0_QM;
+ snprintf(desc, size, _goya_get_event_desc(event_type), index);
+ break;
+ case GOYA_ASYNC_EVENT_ID_DMA0_QM ... GOYA_ASYNC_EVENT_ID_DMA4_QM:
+ index = event_type - GOYA_ASYNC_EVENT_ID_DMA0_QM;
+ snprintf(desc, size, _goya_get_event_desc(event_type), index);
+ break;
+ case GOYA_ASYNC_EVENT_ID_DMA0_CH ... GOYA_ASYNC_EVENT_ID_DMA4_CH:
+ index = event_type - GOYA_ASYNC_EVENT_ID_DMA0_CH;
+ snprintf(desc, size, _goya_get_event_desc(event_type), index);
+ break;
+ default:
+ snprintf(desc, size, _goya_get_event_desc(event_type));
+ break;
}
}
-static void goya_print_irq_info(struct hl_device *hdev, u16 event_type)
+static void goya_print_razwi_info(struct hl_device *hdev)
{
- struct goya_device *goya = hdev->asic_specific;
- bool is_hbw = false, is_read = false, is_info = false;
-
if (RREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD)) {
- goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_LBW_WT_ID, is_hbw,
- is_read, event_type);
+ dev_err(hdev->dev, "Illegal write to LBW\n");
WREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD, 0);
- is_info = true;
}
+
if (RREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD)) {
- is_read = true;
- goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_LBW_RD_ID, is_hbw,
- is_read, event_type);
+ dev_err(hdev->dev, "Illegal read from LBW\n");
WREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD, 0);
- is_info = true;
}
+
if (RREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD)) {
- is_hbw = true;
- goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_HBW_WT_ID, is_hbw,
- is_read, event_type);
+ dev_err(hdev->dev, "Illegal write to HBW\n");
WREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD, 0);
- is_info = true;
}
+
if (RREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD)) {
- is_hbw = true;
- is_read = true;
- goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_HBW_RD_ID, is_hbw,
- is_read, event_type);
+ dev_err(hdev->dev, "Illegal read from HBW\n");
WREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD, 0);
- is_info = true;
- }
- if (!is_info) {
- dev_err(hdev->dev,
- "Received H/W interrupt %d, no additional info\n",
- event_type);
- return;
}
+}
- if (goya->hw_cap_initialized & HW_CAP_MMU) {
- u32 val = RREG32(mmMMU_PAGE_ERROR_CAPTURE);
- u64 addr;
+static void goya_print_mmu_error_info(struct hl_device *hdev)
+{
+ struct goya_device *goya = hdev->asic_specific;
+ u64 addr;
+ u32 val;
+
+ if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+ return;
- if (val & MMU_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) {
- addr = val & MMU_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
- addr <<= 32;
- addr |= RREG32(mmMMU_PAGE_ERROR_CAPTURE_VA);
+ val = RREG32(mmMMU_PAGE_ERROR_CAPTURE);
+ if (val & MMU_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) {
+ addr = val & MMU_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
+ addr <<= 32;
+ addr |= RREG32(mmMMU_PAGE_ERROR_CAPTURE_VA);
- dev_err(hdev->dev, "MMU page fault on va 0x%llx\n",
- addr);
+ dev_err(hdev->dev, "MMU page fault on va 0x%llx\n", addr);
- WREG32(mmMMU_PAGE_ERROR_CAPTURE, 0);
- }
+ WREG32(mmMMU_PAGE_ERROR_CAPTURE, 0);
}
}
+static void goya_print_irq_info(struct hl_device *hdev, u16 event_type)
+{
+ char desc[20] = "";
+
+ goya_get_event_desc(event_type, desc, sizeof(desc));
+ dev_err(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
+ event_type, desc);
+
+ goya_print_razwi_info(hdev);
+ goya_print_mmu_error_info(hdev);
+}
+
static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
size_t irq_arr_size)
{
--
2.17.1
next prev parent reply other threads:[~2019-02-28 8:46 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-02-28 8:46 [PATCH 00/15] habanalabs fixes for merge window Oded Gabbay
2019-02-28 8:46 ` Oded Gabbay [this message]
2019-02-28 8:46 ` [PATCH 02/15] habanalabs: add MMU DRAM default page mapping Oded Gabbay
2019-02-28 8:46 ` [PATCH 03/15] habanalabs: disable CPU access on timeouts Oded Gabbay
2019-02-28 8:46 ` [PATCH 04/15] habanalabs: fix mmu cache registers init Oded Gabbay
2019-02-28 8:46 ` [PATCH 05/15] habanalabs: fix validation of WREG32 to DMA completion Oded Gabbay
2019-02-28 8:46 ` [PATCH 06/15] habanalabs: set DMA0 completion to SOB 1007 Oded Gabbay
2019-02-28 8:46 ` [PATCH 07/15] habanalabs: extend QMAN0 job timeout Oded Gabbay
2019-02-28 8:46 ` [PATCH 08/15] habanalabs: add comments in uapi/misc/habanalabs.h Oded Gabbay
2019-02-28 8:46 ` [PATCH 09/15] habanalabs: return correct error code on MMU mapping failure Oded Gabbay
2019-02-28 8:46 ` [PATCH 10/15] habanalabs: fix memory leak with CBs with unaligned size Oded Gabbay
2019-02-28 8:46 ` [PATCH 11/15] habanalabs: print pointer using %p Oded Gabbay
2019-02-28 9:31 ` Greg KH
2019-02-28 9:47 ` Oded Gabbay
2019-02-28 8:46 ` [PATCH 12/15] habanalabs: soft-reset device if context-switch fails Oded Gabbay
2019-02-28 8:46 ` [PATCH 13/15] habanalabs: fix little-endian<->cpu conversion warnings Oded Gabbay
2019-02-28 8:46 ` [PATCH 14/15] habanalabs: use NULL to initialize array of pointers Oded Gabbay
2019-02-28 8:46 ` [PATCH 15/15] habanalabs: fix little-endian<->cpu conversion warnings Oded Gabbay
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190228084624.25288-2-oded.gabbay@gmail.com \
--to=oded.gabbay@gmail.com \
--cc=gregkh@linuxfoundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ttayar@habana.ai \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.