* [PATCH v4 2/2] remoteproc: xlnx: add crash detection mechanism
2026-03-03 23:35 [PATCH v4 0/2] remoteproc: xlnx: remote crash recovery Tanmay Shah
2026-03-03 23:35 ` [PATCH v4 1/2] remoteproc: core: full attach detach during recovery Tanmay Shah
@ 2026-03-03 23:35 ` Tanmay Shah
2026-03-11 17:21 ` [PATCH v4 0/2] remoteproc: xlnx: remote crash recovery Mathieu Poirier
2026-03-17 21:16 ` Shah, Tanmay
3 siblings, 0 replies; 5+ messages in thread
From: Tanmay Shah @ 2026-03-03 23:35 UTC (permalink / raw)
To: andersson, mathieu.poirier; +Cc: linux-remoteproc, linux-kernel, Tanmay Shah
The remote processor reports the crash reason via the resource table
and notifies the host via a mailbox notification. The host checks this
crash reason on every mailbox notification from the remote and reports
the crash to the rproc core framework, which then starts the recovery
process.
Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
---
Changes in v4:
- Optimize crash resource memory by changing type from u32 to u8
- Introduce version field in the crash resource
- Check crash related condition before rproc state related condition
- Move crash reporting to the bottom half of the isr instead of
actual irq handler
- Introduce 16 bytes array in the crash report to store crash reason
in the string format
- Remove redundant type cast
Changes in v3:
- %s/kick/mailbox notification/
- %s/core framework/rproc core framework/
- fold simple function within zynqmp_r5_handle_rsc().
- remove spurious change
- reset crash state after reporting the crash
- document set and reset of ATTACH_ON_RECOVERY flag
- set recovery_disabled flag to false
- check condition rproc->crash_reason != NULL
Changes in v2:
- clear attach recovery boot flag during detach and stop ops
drivers/remoteproc/xlnx_r5_remoteproc.c | 71 ++++++++++++++++++++++++-
1 file changed, 70 insertions(+), 1 deletion(-)
diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c b/drivers/remoteproc/xlnx_r5_remoteproc.c
index 5a468d959f1e..9c7cf9f37294 100644
--- a/drivers/remoteproc/xlnx_r5_remoteproc.c
+++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
@@ -108,6 +108,10 @@ struct rsc_tbl_data {
const uintptr_t rsc_tbl;
} __packed;
+/**
+ * enum xlnx_rproc_fw_rsc - vendor specific resource types used by this driver
+ *
+ * @XLNX_RPROC_FW_CRASH_REASON: resource entry whose payload is a
+ *				struct xlnx_rproc_crash_report; numbered
+ *				RSC_VENDOR_START
+ */
+enum xlnx_rproc_fw_rsc {
+ XLNX_RPROC_FW_CRASH_REASON = RSC_VENDOR_START,
+};
+
/*
* Hardcoded TCM bank values. This will stay in driver to maintain backward
* compatibility with device-tree that does not have TCM information.
@@ -127,9 +131,25 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
{0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
};
+/**
+ * struct xlnx_rproc_crash_report - vendor resource carrying crash status and reason
+ *
+ * @version: version of this resource
+ * @crash_state: non-zero while the remote is reporting a crash that needs
+ *		 recovery; the host writes it back to false after reporting
+ * @crash_reason: number identifying the reason of the crash
+ * @crash_reason_str: short string description of the crash reason; the host
+ *		      prints at most 15 characters of it
+ *
+ * Payload of the XLNX_RPROC_FW_CRASH_REASON resource table entry. The host
+ * resets @crash_state, @crash_reason and the first byte of @crash_reason_str
+ * when the resource is handled and again after a crash has been reported.
+ */
+struct xlnx_rproc_crash_report {
+ u8 version;
+ u8 crash_state;
+ u8 crash_reason;
+ char crash_reason_str[16];
+} __packed;
+
/**
* struct zynqmp_r5_core - remoteproc core's internal data
*
+ * @crash_report: rproc crash state and reason
* @rsc_tbl_va: resource table virtual address
* @sram: Array of sram memories assigned to this core
* @num_sram: number of sram for this core
@@ -143,6 +163,7 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
* @ipi: pointer to mailbox information
*/
struct zynqmp_r5_core {
+ struct xlnx_rproc_crash_report *crash_report;
void __iomem *rsc_tbl_va;
struct zynqmp_sram_bank *sram;
int num_sram;
@@ -200,11 +221,27 @@ static int event_notified_idr_cb(int id, void *ptr, void *data)
*/
static void handle_event_notified(struct work_struct *work)
{
+ struct zynqmp_r5_core *r5_core;
struct mbox_info *ipi;
struct rproc *rproc;
ipi = container_of(work, struct mbox_info, mbox_work);
rproc = ipi->r5_core->rproc;
+ r5_core = ipi->r5_core;
+
+ /* report crash only if expected */
+ if (r5_core->crash_report && r5_core->crash_report->crash_state) {
+ if (rproc->state == RPROC_ATTACHED || rproc->state == RPROC_RUNNING) {
+ dev_warn(&rproc->dev, "crash reason id: %d %.15s\n",
+ r5_core->crash_report->crash_reason,
+ r5_core->crash_report->crash_reason_str);
+ rproc_report_crash(rproc, RPROC_FATAL_ERROR);
+ r5_core->crash_report->crash_state = false;
+ r5_core->crash_report->crash_reason = 0;
+ r5_core->crash_report->crash_reason_str[0] = '\0';
+ return;
+ }
+ }
/*
* We only use IPI for interrupt. The RPU firmware side may or may
@@ -438,6 +475,13 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
if (ret)
dev_err(r5_core->dev, "core force power down failed\n");
+ /*
+ * Clear attach on recovery flag during stop operation. The next state
+ * of the remote processor is expected to be "Running" state. In this
+ * state boot recovery method must take place over attach on recovery.
+ */
+ test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
+
return ret;
}
@@ -859,6 +903,9 @@ static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
static int zynqmp_r5_attach(struct rproc *rproc)
{
+ /* Enable attach on recovery method. Clear it during rproc stop. */
+ rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
+
dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
return 0;
@@ -873,9 +920,30 @@ static int zynqmp_r5_detach(struct rproc *rproc)
*/
zynqmp_r5_rproc_kick(rproc, 0);
+ clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
+
return 0;
}
+/**
+ * zynqmp_r5_handle_rsc() - handle vendor specific resource table entries
+ * @rproc: remote processor whose resource table is being parsed
+ * @rsc_type: type of the resource entry
+ * @rsc: resource entry pointer supplied by the caller (not used here)
+ * @offset: offset added to the core's rsc_tbl_va to locate the resource
+ * @avail: size available for this resource entry, in bytes
+ *
+ * Only XLNX_RPROC_FW_CRASH_REASON is recognised. For that type the location
+ * of the crash report is remembered in the core's private data and the crash
+ * state, reason and reason string are reset.
+ *
+ * Return: RSC_HANDLED if the crash report resource was recorded, RSC_IGNORED
+ * for any other resource type, -EINVAL if the entry is too small to hold a
+ * struct xlnx_rproc_crash_report.
+ */
+static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc,
+				int offset, int avail)
+{
+	struct zynqmp_r5_core *r5_core = rproc->priv;
+	struct xlnx_rproc_crash_report *report;
+
+	if (rsc_type != XLNX_RPROC_FW_CRASH_REASON)
+		return RSC_IGNORED;
+
+	/* make sure the whole report fits before any field is written */
+	if (avail < (int)sizeof(*report)) {
+		dev_err(&rproc->dev, "crash report rsc is truncated\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * NOTE(review): assumes rsc_tbl_va is already mapped when resources
+	 * are handled - confirm for the firmware-load (non-attach) path.
+	 */
+	report = r5_core->rsc_tbl_va + offset;
+
+	/* reset all values */
+	report->crash_state = false;
+	report->crash_reason = 0;
+	report->crash_reason_str[0] = '\0';
+
+	r5_core->crash_report = report;
+
+	return RSC_HANDLED;
+}
+
static const struct rproc_ops zynqmp_r5_rproc_ops = {
.prepare = zynqmp_r5_rproc_prepare,
.unprepare = zynqmp_r5_rproc_unprepare,
@@ -890,6 +958,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
.get_loaded_rsc_table = zynqmp_r5_get_loaded_rsc_table,
.attach = zynqmp_r5_attach,
.detach = zynqmp_r5_detach,
+ .handle_rsc = zynqmp_r5_handle_rsc,
};
/**
@@ -923,7 +992,7 @@ static struct zynqmp_r5_core *zynqmp_r5_add_rproc_core(struct device *cdev)
rproc_coredump_set_elf_info(r5_rproc, ELFCLASS32, EM_ARM);
- r5_rproc->recovery_disabled = true;
+ r5_rproc->recovery_disabled = false;
r5_rproc->has_iommu = false;
r5_rproc->auto_boot = false;
r5_core = r5_rproc->priv;
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v4 0/2] remoteproc: xlnx: remote crash recovery
2026-03-03 23:35 [PATCH v4 0/2] remoteproc: xlnx: remote crash recovery Tanmay Shah
2026-03-03 23:35 ` [PATCH v4 1/2] remoteproc: core: full attach detach during recovery Tanmay Shah
2026-03-03 23:35 ` [PATCH v4 2/2] remoteproc: xlnx: add crash detection mechanism Tanmay Shah
@ 2026-03-11 17:21 ` Mathieu Poirier
2026-03-17 21:16 ` Shah, Tanmay
3 siblings, 0 replies; 5+ messages in thread
From: Mathieu Poirier @ 2026-03-11 17:21 UTC (permalink / raw)
To: Tanmay Shah; +Cc: andersson, linux-remoteproc, linux-kernel
On Tue, Mar 03, 2026 at 03:35:31PM -0800, Tanmay Shah wrote:
> Remote processor can crash or hang during normal execution. Linux
> remoteproc framework supports different mechanisms to recover the
> remote processor and re-establish the RPMsg communication in such case.
>
> Crash reporting on AMD-Xilinx platform:
>
> 1) Using debugfs node
>
> User can report the crash to the core framework via debugfs node using
> following command:
>
> echo 1 > /sys/kernel/debug/remoteproc/remoteproc0/crash
>
> 2) Remote processor notifies the host about the crash state and crash
> reason via the resource table
>
> This is a platform specific method where the remote firmware contains
> vendor specific resource to update the crash state and the crash
> reason. Then the remote notifies the crash to the host via mailbox
> notification. The host then will check this resource on every mbox
> notification and reports the crash to the core framework if needed.
>
> Crash recovery mechanism on AMD-Xilinx platform:
>
> There are two mechanisms available to recover the remote processor from
> the crash. 1) boot recovery, 2) attach on recovery
>
> Remoteproc core framework will choose proper mechanism based on the
> rproc features set by the platform driver.
>
> 1) Boot recovery
>
> This is the default mechanism to recover the remote processor.
> In this method core framework will first stop the remote processor,
> load the firmware again and then starts the remote processor. On
> AMD-Xilinx platforms this method is supported. The default coredump
> method is supported.
>
> 2) Attach on recovery
>
> If RPROC_FEAT_ATTACH_ON_RECOVERY feature is enabled by the platform driver,
> then the core framework will choose this method for recovery.
>
> On versal and later platforms following is the sequence of events expected
> during remoteproc crash and attach on recovery:
>
> a) Remoteproc attach/detach flow is working, and RPMsg comm is established
> b) Remote processor (RPU) crashed (crash not reported yet)
> c) Platform management controller is instructed to stop and reload elf
> on inactive remote processor before reboot (platform specific method)
> d) Platform management controller reboots the remote processor
> e) Remote processor boots again, and detects previous crash (platform
> specific mechanism to detect the crash)
> f) Remote processor reports the crash to Linux (host) and waits for
> the recovery.
> g) Linux performs full detach and reattach to remote processor.
> h) Normal RPMsg communication is established.
>
> It is required to destroy all RPMsg related resources and recreate them
> during recovery to establish successful RPMsg communication. To achieve
> this complete rproc_detach followed by rproc_boot calls are needed. That
> is what this patch-series is fixing along with adding rproc recovery
> methods for AMD-Xilinx platforms.
>
> Change log:
>
> Changes in v3:
> - both rproc_attach_recovery() and
> rproc_boot_recovery() are called the same way.
> - remove unrelated changes
> - %s/kick/mailbox notification/
> - %s/core framework/rproc core framework/
> - fold simple function within zynqmp_r5_handle_rsc().
> - remove spurious change
> - reset crash state after reporting the crash
> - document set and reset of ATTACH_ON_RECOVERY flag
> - set recovery_disabled flag to false
> - check condition rproc->crash_reason != NULL
>
For V3 Bjorn made several comments in relation with QCOM use cases. As such I
will let him continue with this patchset.
Thanks,
Mathieu
> Changes in v2:
> - use rproc_boot instead of rproc_attach
> - move debug message early in the function
> - clear attach recovery boot flag during detach and stop ops
> Tanmay Shah (2):
> remoteproc: core: full attach detach during recovery
> remoteproc: xlnx: add crash detection mechanism
>
> drivers/remoteproc/remoteproc_core.c | 15 +++++-
> drivers/remoteproc/xlnx_r5_remoteproc.c | 71 ++++++++++++++++++++++++-
> 2 files changed, 84 insertions(+), 2 deletions(-)
>
>
> base-commit: 098493c6dced7b02545e8bd0053ef4099a2b769e
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4 0/2] remoteproc: xlnx: remote crash recovery
2026-03-03 23:35 [PATCH v4 0/2] remoteproc: xlnx: remote crash recovery Tanmay Shah
` (2 preceding siblings ...)
2026-03-11 17:21 ` [PATCH v4 0/2] remoteproc: xlnx: remote crash recovery Mathieu Poirier
@ 2026-03-17 21:16 ` Shah, Tanmay
3 siblings, 0 replies; 5+ messages in thread
From: Shah, Tanmay @ 2026-03-17 21:16 UTC (permalink / raw)
To: Tanmay Shah, andersson, mathieu.poirier; +Cc: linux-remoteproc, linux-kernel
On 3/3/2026 5:35 PM, Tanmay Shah wrote:
Hi Bjorn, requesting reviews on this patch series.
Thanks,
Tanmay
> Remote processor can crash or hang during normal execution. Linux
> remoteproc framework supports different mechanisms to recover the
> remote processor and re-establish the RPMsg communication in such case.
>
> Crash reporting on AMD-Xilinx platform:
>
> 1) Using debugfs node
>
> User can report the crash to the core framework via debugfs node using
> following command:
>
> echo 1 > /sys/kernel/debug/remoteproc/remoteproc0/crash
>
> 2) Remote processor notifies the host about the crash state and crash
> reason via the resource table
>
> This is a platform specific method where the remote firmware contains
> vendor specific resource to update the crash state and the crash
> reason. Then the remote notifies the crash to the host via mailbox
> notification. The host then will check this resource on every mbox
> notification and reports the crash to the core framework if needed.
>
> Crash recovery mechanism on AMD-Xilinx platform:
>
> There are two mechanisms available to recover the remote processor from
> the crash. 1) boot recovery, 2) attach on recovery
>
> Remoteproc core framework will choose proper mechanism based on the
> rproc features set by the platform driver.
>
> 1) Boot recovery
>
> This is the default mechanism to recover the remote processor.
> In this method core framework will first stop the remote processor,
> load the firmware again and then starts the remote processor. On
> AMD-Xilinx platforms this method is supported. The default coredump
> method is supported.
>
> 2) Attach on recovery
>
> If RPROC_FEAT_ATTACH_ON_RECOVERY feature is enabled by the platform driver,
> then the core framework will choose this method for recovery.
>
> On versal and later platforms following is the sequence of events expected
> during remoteproc crash and attach on recovery:
>
> a) Remoteproc attach/detach flow is working, and RPMsg comm is established
> b) Remote processor (RPU) crashed (crash not reported yet)
> c) Platform management controller is instructed to stop and reload elf
> on inactive remote processor before reboot (platform specific method)
> d) Platform management controller reboots the remote processor
> e) Remote processor boots again, and detects previous crash (platform
> specific mechanism to detect the crash)
> f) Remote processor reports the crash to Linux (host) and waits for
> the recovery.
> g) Linux performs full detach and reattach to remote processor.
> h) Normal RPMsg communication is established.
>
> It is required to destroy all RPMsg related resources and recreate them
> during recovery to establish successful RPMsg communication. To achieve
> this complete rproc_detach followed by rproc_boot calls are needed. That
> is what this patch-series is fixing along with adding rproc recovery
> methods for AMD-Xilinx platforms.
>
> Change log:
>
> Changes in v3:
> - both rproc_attach_recovery() and
> rproc_boot_recovery() are called the same way.
> - remove unrelated changes
> - %s/kick/mailbox notification/
> - %s/core framework/rproc core framework/
> - fold simple function within zynqmp_r5_handle_rsc().
> - remove spurious change
> - reset crash state after reporting the crash
> - document set and reset of ATTACH_ON_RECOVERY flag
> - set recovery_disabled flag to false
> - check condition rproc->crash_reason != NULL
>
> Changes in v2:
> - use rproc_boot instead of rproc_attach
> - move debug message early in the function
> - clear attach recovery boot flag during detach and stop ops
> Tanmay Shah (2):
> remoteproc: core: full attach detach during recovery
> remoteproc: xlnx: add crash detection mechanism
>
> drivers/remoteproc/remoteproc_core.c | 15 +++++-
> drivers/remoteproc/xlnx_r5_remoteproc.c | 71 ++++++++++++++++++++++++-
> 2 files changed, 84 insertions(+), 2 deletions(-)
>
>
> base-commit: 098493c6dced7b02545e8bd0053ef4099a2b769e
^ permalink raw reply [flat|nested] 5+ messages in thread