public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Lizhi Hou <lizhi.hou@amd.com>
To: Jeffrey Hugo <quic_jhugo@quicinc.com>, <ogabbay@kernel.org>,
	<dri-devel@lists.freedesktop.org>
Cc: <linux-kernel@vger.kernel.org>, <min.ma@amd.com>,
	<max.zhen@amd.com>, <sonal.santan@amd.com>, <king.tam@amd.com>
Subject: Re: [PATCH V4 09/10] accel/amdxdna: Add error handling
Date: Fri, 18 Oct 2024 14:19:05 -0700	[thread overview]
Message-ID: <0fe13309-b0bb-6868-e1d2-2fa59f329b7b@amd.com> (raw)
In-Reply-To: <1c16640f-0bc3-2692-910f-09ea5869a5b6@quicinc.com>


On 10/18/24 14:01, Jeffrey Hugo wrote:
> On 10/11/2024 5:12 PM, Lizhi Hou wrote:
>> When there is a hardware error, the NPU firmware notifies the host through
>> a mailbox message. The message includes details of the error, such as the
>> tile and column indexes where the error occurred.
>>
>> The driver starts a thread to handle the NPU error message. The thread
>> stops the clients that are using the column where the error occurred. Then
>> the driver resets that column.
>>
>> Co-developed-by: Min Ma <min.ma@amd.com>
>> Signed-off-by: Min Ma <min.ma@amd.com>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>>   drivers/accel/amdxdna/Makefile       |   1 +
>>   drivers/accel/amdxdna/aie2_error.c   | 356 +++++++++++++++++++++++++++
>>   drivers/accel/amdxdna/aie2_message.c |  19 ++
>>   drivers/accel/amdxdna/aie2_pci.c     |  32 +++
>>   drivers/accel/amdxdna/aie2_pci.h     |   9 +
>>   5 files changed, 417 insertions(+)
>>   create mode 100644 drivers/accel/amdxdna/aie2_error.c
>>
>> diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile
>> index a688c378761f..ed6f87910880 100644
>> --- a/drivers/accel/amdxdna/Makefile
>> +++ b/drivers/accel/amdxdna/Makefile
>> @@ -2,6 +2,7 @@
>>     amdxdna-y := \
>>       aie2_ctx.o \
>> +    aie2_error.o \
>>       aie2_message.o \
>>       aie2_pci.o \
>>       aie2_psp.o \
>> diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
>> new file mode 100644
>> index 000000000000..d2787549f3b7
>> --- /dev/null
>> +++ b/drivers/accel/amdxdna/aie2_error.c
>> @@ -0,0 +1,356 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
>> + */
>> +
>> +#include <drm/drm_cache.h>
>> +#include <drm/drm_device.h>
>> +#include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> +#include <linux/dma-mapping.h>
>> +#include <linux/kthread.h>
>> +#include <linux/kernel.h>
>> +
>> +#include "aie2_msg_priv.h"
>> +#include "aie2_pci.h"
>> +#include "amdxdna_mailbox.h"
>> +#include "amdxdna_pci_drv.h"
>> +
>> +struct async_event {
>> +    struct amdxdna_dev_hdl        *ndev;
>> +    struct async_event_msg_resp    resp;
>> +    struct workqueue_struct        *wq;
>> +    struct work_struct        work;
>> +    u8                *buf;
>> +    dma_addr_t            addr;
>> +    u32                size;
>> +};
>> +
>> +struct async_events {
>> +    struct workqueue_struct        *wq;
>> +    u8                *buf;
>> +    dma_addr_t            addr;
>> +    u32                size;
>> +    u32                event_cnt;
>> +    struct async_event        event[] __counted_by(event_cnt);
>> +};
>> +
>> +/*
>> + * The enums, structs and lookup tables below are ported from the XAIE util
>> + * header file.
>> + *
>> + * The data below is defined by the AIE device and is used to decode error
>> + * messages from the device.
>> + */
>> +
>> +enum aie_module_type {
>> +    AIE_MEM_MOD = 0,
>> +    AIE_CORE_MOD,
>> +    AIE_PL_MOD,
>> +};
>> +
>> +enum aie_error_category {
>> +    AIE_ERROR_SATURATION = 0,
>> +    AIE_ERROR_FP,
>> +    AIE_ERROR_STREAM,
>> +    AIE_ERROR_ACCESS,
>> +    AIE_ERROR_BUS,
>> +    AIE_ERROR_INSTRUCTION,
>> +    AIE_ERROR_ECC,
>> +    AIE_ERROR_LOCK,
>> +    AIE_ERROR_DMA,
>> +    AIE_ERROR_MEM_PARITY,
>> +    /* Unknown is not from XAIE, added for better category */
>> +    AIE_ERROR_UNKNOWN,
>> +};
>> +
>> +/* Don't pack, unless XAIE side changed */
>> +struct aie_error {
>> +    u8            row;
>> +    u8            col;
>> +    u32            mod_type;
>> +    u8            event_id;
>> +};
>
> This looks like it is a structure to decode data from an external 
> device.  Assuming that is so, the wrong types are used here. Should be 
> the "__" types like "__u8", no?  Normal u8, etc are kernel internal 
> only types.

Yes, you are correct. I will fix this.


Thanks,

Lizhi

>
>> +
>> +struct aie_err_info {
>> +    u32            err_cnt;
>> +    u32            ret_code;
>> +    u32            rsvd;
>> +    struct aie_error    payload[] __counted_by(err_cnt);
>> +};
>> +
>> +struct aie_event_category {
>> +    u8            event_id;
>> +    enum aie_error_category category;
>> +};
>

  reply	other threads:[~2024-10-18 21:19 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-11 23:12 [PATCH V4 00/10] AMD XDNA driver Lizhi Hou
2024-10-11 23:12 ` [PATCH V4 01/10] accel/amdxdna: Add documentation for AMD NPU accelerator driver Lizhi Hou
2024-10-18 20:24   ` Jeffrey Hugo
2024-10-11 23:12 ` [PATCH V4 02/10] accel/amdxdna: Add a new driver for AMD AI Engine Lizhi Hou
2024-10-18 20:31   ` Jeffrey Hugo
2024-10-11 23:12 ` [PATCH V4 03/10] accel/amdxdna: Support hardware mailbox Lizhi Hou
2024-10-18 20:34   ` Jeffrey Hugo
2024-10-11 23:12 ` [PATCH V4 04/10] accel/amdxdna: Add hardware resource solver Lizhi Hou
2024-10-11 23:12 ` [PATCH V4 05/10] accel/amdxdna: Add hardware context Lizhi Hou
2024-10-18 20:47   ` Jeffrey Hugo
2024-10-11 23:12 ` [PATCH V4 06/10] accel/amdxdna: Add GEM buffer object management Lizhi Hou
2024-10-18 20:49   ` Jeffrey Hugo
2024-10-11 23:12 ` [PATCH V4 07/10] accel/amdxdna: Add command execution Lizhi Hou
2024-10-15  2:13   ` Matthew Brost
2024-10-17  3:53     ` Lizhi Hou
2024-10-17 15:51       ` Matthew Brost
2024-10-11 23:12 ` [PATCH V4 08/10] accel/amdxdna: Add suspend and resume Lizhi Hou
2024-10-18 20:53   ` Jeffrey Hugo
2024-10-11 23:12 ` [PATCH V4 09/10] accel/amdxdna: Add error handling Lizhi Hou
2024-10-18 21:01   ` Jeffrey Hugo
2024-10-18 21:19     ` Lizhi Hou [this message]
2024-10-11 23:12 ` [PATCH V4 10/10] accel/amdxdna: Add query functions Lizhi Hou
2024-10-18 21:07   ` Jeffrey Hugo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0fe13309-b0bb-6868-e1d2-2fa59f329b7b@amd.com \
    --to=lizhi.hou@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=king.tam@amd.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=max.zhen@amd.com \
    --cc=min.ma@amd.com \
    --cc=ogabbay@kernel.org \
    --cc=quic_jhugo@quicinc.com \
    --cc=sonal.santan@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox