linux-trace-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
@ 2024-11-23 11:31 Shuai Xue
  2025-01-07 11:30 ` Shuai Xue
  2025-01-07 23:19 ` Bjorn Helgaas
  0 siblings, 2 replies; 8+ messages in thread
From: Shuai Xue @ 2024-11-23 11:31 UTC (permalink / raw)
  To: rostedt, lukas, linux-pci, linux-kernel, linux-edac,
	linux-trace-kernel
  Cc: bhelgaas, tony.luck, bp, xueshuai, mhiramat, mathieu.desnoyers,
	oleg, naveen, davem, anil.s.keshavamurthy, mark.rutland, peterz

Hotplug events are critical indicators for analyzing hardware health,
particularly in AI supercomputers where surprise link downs can
significantly impact system performance and reliability. The failure
characterization analysis illustrates the significance of failures
caused by the Infiniband link errors. Meta observes that 2% in a machine
learning cluster and 6% in a vision application cluster of Infiniband
failures co-occur with GPU failures, such as falling off the bus, which
may indicate a correlation with PCIe.[1]

To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
tracepoint for hotplug event to help healthy check, and generate
tracepoints for pcie hotplug event. To monitor these tracepoints in
userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
header.

The output like below:
$ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable
$ cat /sys/kernel/debug/tracing/trace_pipe
           <...>-206     [001] .....    40.373870: pci_hp_event: 0000:00:02.0 slot:10, event:Link Down

           <...>-206     [001] .....    40.374871: pci_hp_event: 0000:00:02.0 slot:10, event:Card not present

[1]https://arxiv.org/abs/2410.21680

Suggested-by: Lukas Wunner <lukas@wunner.de>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
---
changes sincel v3:
- rename TRACING_SYSTEM from pci_hotplug to pci
- add Reviewed-by tag from Lukas
- add Suggested-by tag from Lukas and Steven
---
 drivers/pci/hotplug/pciehp_ctrl.c | 33 ++++++++++++---
 drivers/pci/hotplug/trace.h       | 68 +++++++++++++++++++++++++++++++
 include/uapi/linux/pci.h          |  7 ++++
 3 files changed, 102 insertions(+), 6 deletions(-)
 create mode 100644 drivers/pci/hotplug/trace.h

diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index dcdbfcf404dd..c836462ff067 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -21,6 +21,9 @@
 #include <linux/pci.h>
 #include "pciehp.h"
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
 /* The following routines constitute the bulk of the
    hotplug controller logic
  */
@@ -239,12 +242,20 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
 	case ON_STATE:
 		ctrl->state = POWEROFF_STATE;
 		mutex_unlock(&ctrl->state_lock);
-		if (events & PCI_EXP_SLTSTA_DLLSC)
+		if (events & PCI_EXP_SLTSTA_DLLSC) {
 			ctrl_info(ctrl, "Slot(%s): Link Down\n",
 				  slot_name(ctrl));
-		if (events & PCI_EXP_SLTSTA_PDC)
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_LINK_DOWN);
+		}
+		if (events & PCI_EXP_SLTSTA_PDC) {
 			ctrl_info(ctrl, "Slot(%s): Card not present\n",
 				  slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_CARD_NOT_PRESENT);
+		}
 		pciehp_disable_slot(ctrl, SURPRISE_REMOVAL);
 		break;
 	default:
@@ -264,6 +275,9 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
 					      INDICATOR_NOOP);
 			ctrl_info(ctrl, "Slot(%s): Card not present\n",
 				  slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_CARD_NOT_PRESENT);
 		}
 		mutex_unlock(&ctrl->state_lock);
 		return;
@@ -276,12 +290,19 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
 	case OFF_STATE:
 		ctrl->state = POWERON_STATE;
 		mutex_unlock(&ctrl->state_lock);
-		if (present)
+		if (present) {
 			ctrl_info(ctrl, "Slot(%s): Card present\n",
 				  slot_name(ctrl));
-		if (link_active)
-			ctrl_info(ctrl, "Slot(%s): Link Up\n",
-				  slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_CARD_PRESENT);
+		}
+		if (link_active) {
+			ctrl_info(ctrl, "Slot(%s): Link Up\n", slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_LINK_UP);
+		}
 		ctrl->request_result = pciehp_enable_slot(ctrl);
 		break;
 	default:
diff --git a/drivers/pci/hotplug/trace.h b/drivers/pci/hotplug/trace.h
new file mode 100644
index 000000000000..5b60cd7bcffb
--- /dev/null
+++ b/drivers/pci/hotplug/trace.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_HW_EVENT_PCI_HP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HW_EVENT_PCI_HP_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pci
+
+#define PCI_HOTPLUG_EVENT					\
+	EM(PCI_HOTPLUG_LINK_UP,			"Link Up")	\
+	EM(PCI_HOTPLUG_LINK_DOWN,		"Link Down")	\
+	EM(PCI_HOTPLUG_CARD_PRESENT,		"Card present")	\
+	EMe(PCI_HOTPLUG_CARD_NOT_PRESENT,	"Card not present")
+
+/* Enums require being exported to userspace, for user tool parsing */
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+PCI_HOTPLUG_EVENT
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)	{a, b},
+#define EMe(a, b)	{a, b}
+
+TRACE_EVENT(pci_hp_event,
+
+	TP_PROTO(const char *port_name,
+		 const char *slot,
+		 const int event),
+
+	TP_ARGS(port_name, slot, event),
+
+	TP_STRUCT__entry(
+		__string(	port_name,	port_name	)
+		__string(	slot,		slot		)
+		__field(	int,		event	)
+	),
+
+	TP_fast_assign(
+		__assign_str(port_name);
+		__assign_str(slot);
+		__entry->event = event;
+	),
+
+	TP_printk("%s slot:%s, event:%s\n",
+		__get_str(port_name),
+		__get_str(slot),
+		__print_symbolic(__entry->event, PCI_HOTPLUG_EVENT)
+	)
+);
+
+#endif /* _TRACE_HW_EVENT_PCI_HP_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH  ../../drivers/pci/hotplug
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/pci.h b/include/uapi/linux/pci.h
index a769eefc5139..4f150028965d 100644
--- a/include/uapi/linux/pci.h
+++ b/include/uapi/linux/pci.h
@@ -39,4 +39,11 @@
 #define PCIIOC_MMAP_IS_MEM	(PCIIOC_BASE | 0x02)	/* Set mmap state to MEM space. */
 #define PCIIOC_WRITE_COMBINE	(PCIIOC_BASE | 0x03)	/* Enable/disable write-combining. */
 
+enum pci_hotplug_event {
+	PCI_HOTPLUG_LINK_UP,
+	PCI_HOTPLUG_LINK_DOWN,
+	PCI_HOTPLUG_CARD_PRESENT,
+	PCI_HOTPLUG_CARD_NOT_PRESENT,
+};
+
 #endif /* _UAPILINUX_PCI_H */
-- 
2.39.3


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
  2024-11-23 11:31 [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event Shuai Xue
@ 2025-01-07 11:30 ` Shuai Xue
  2025-01-07 12:53   ` Lukas Wunner
  2025-01-07 23:19 ` Bjorn Helgaas
  1 sibling, 1 reply; 8+ messages in thread
From: Shuai Xue @ 2025-01-07 11:30 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: bhelgaas, tony.luck, bp, mhiramat, mathieu.desnoyers, oleg,
	naveen, davem, anil.s.keshavamurthy, mark.rutland, peterz,
	rostedt, linux-pci, linux-kernel, linux-edac, linux-trace-kernel



在 2024/11/23 19:31, Shuai Xue 写道:
> Hotplug events are critical indicators for analyzing hardware health,
> particularly in AI supercomputers where surprise link downs can
> significantly impact system performance and reliability. The failure
> characterization analysis illustrates the significance of failures
> caused by the Infiniband link errors. Meta observes that 2% in a machine
> learning cluster and 6% in a vision application cluster of Infiniband
> failures co-occur with GPU failures, such as falling off the bus, which
> may indicate a correlation with PCIe.[1]
> 
> To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
> tracepoint for hotplug event to help healthy check, and generate
> tracepoints for pcie hotplug event. To monitor these tracepoints in
> userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
> header.
> 
> The output like below:
> $ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable
> $ cat /sys/kernel/debug/tracing/trace_pipe
>             <...>-206     [001] .....    40.373870: pci_hp_event: 0000:00:02.0 slot:10, event:Link Down
> 
>             <...>-206     [001] .....    40.374871: pci_hp_event: 0000:00:02.0 slot:10, event:Card not present
> 
> [1]https://arxiv.org/abs/2410.21680
> 
> Suggested-by: Lukas Wunner <lukas@wunner.de>
> Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> Reviewed-by: Lukas Wunner <lukas@wunner.de>
> ---
> changes sincel v3:
> - rename TRACING_SYSTEM from pci_hotplug to pci
> - add Reviewed-by tag from Lukas
> - add Suggested-by tag from Lukas and Steven
> ---
>   drivers/pci/hotplug/pciehp_ctrl.c | 33 ++++++++++++---
>   drivers/pci/hotplug/trace.h       | 68 +++++++++++++++++++++++++++++++
>   include/uapi/linux/pci.h          |  7 ++++
>   3 files changed, 102 insertions(+), 6 deletions(-)
>   create mode 100644 drivers/pci/hotplug/trace.h
> 
> diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
> index dcdbfcf404dd..c836462ff067 100644
> --- a/drivers/pci/hotplug/pciehp_ctrl.c
> +++ b/drivers/pci/hotplug/pciehp_ctrl.c
> @@ -21,6 +21,9 @@
>   #include <linux/pci.h>
>   #include "pciehp.h"
>   
> +#define CREATE_TRACE_POINTS
> +#include "trace.h"
> +
>   /* The following routines constitute the bulk of the
>      hotplug controller logic
>    */
> @@ -239,12 +242,20 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
>   	case ON_STATE:
>   		ctrl->state = POWEROFF_STATE;
>   		mutex_unlock(&ctrl->state_lock);
> -		if (events & PCI_EXP_SLTSTA_DLLSC)
> +		if (events & PCI_EXP_SLTSTA_DLLSC) {
>   			ctrl_info(ctrl, "Slot(%s): Link Down\n",
>   				  slot_name(ctrl));
> -		if (events & PCI_EXP_SLTSTA_PDC)
> +			trace_pci_hp_event(pci_name(ctrl->pcie->port),
> +					   slot_name(ctrl),
> +					   PCI_HOTPLUG_LINK_DOWN);
> +		}
> +		if (events & PCI_EXP_SLTSTA_PDC) {
>   			ctrl_info(ctrl, "Slot(%s): Card not present\n",
>   				  slot_name(ctrl));
> +			trace_pci_hp_event(pci_name(ctrl->pcie->port),
> +					   slot_name(ctrl),
> +					   PCI_HOTPLUG_CARD_NOT_PRESENT);
> +		}
>   		pciehp_disable_slot(ctrl, SURPRISE_REMOVAL);
>   		break;
>   	default:
> @@ -264,6 +275,9 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
>   					      INDICATOR_NOOP);
>   			ctrl_info(ctrl, "Slot(%s): Card not present\n",
>   				  slot_name(ctrl));
> +			trace_pci_hp_event(pci_name(ctrl->pcie->port),
> +					   slot_name(ctrl),
> +					   PCI_HOTPLUG_CARD_NOT_PRESENT);
>   		}
>   		mutex_unlock(&ctrl->state_lock);
>   		return;
> @@ -276,12 +290,19 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
>   	case OFF_STATE:
>   		ctrl->state = POWERON_STATE;
>   		mutex_unlock(&ctrl->state_lock);
> -		if (present)
> +		if (present) {
>   			ctrl_info(ctrl, "Slot(%s): Card present\n",
>   				  slot_name(ctrl));
> -		if (link_active)
> -			ctrl_info(ctrl, "Slot(%s): Link Up\n",
> -				  slot_name(ctrl));
> +			trace_pci_hp_event(pci_name(ctrl->pcie->port),
> +					   slot_name(ctrl),
> +					   PCI_HOTPLUG_CARD_PRESENT);
> +		}
> +		if (link_active) {
> +			ctrl_info(ctrl, "Slot(%s): Link Up\n", slot_name(ctrl));
> +			trace_pci_hp_event(pci_name(ctrl->pcie->port),
> +					   slot_name(ctrl),
> +					   PCI_HOTPLUG_LINK_UP);
> +		}
>   		ctrl->request_result = pciehp_enable_slot(ctrl);
>   		break;
>   	default:
> diff --git a/drivers/pci/hotplug/trace.h b/drivers/pci/hotplug/trace.h
> new file mode 100644
> index 000000000000..5b60cd7bcffb
> --- /dev/null
> +++ b/drivers/pci/hotplug/trace.h
> @@ -0,0 +1,68 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#if !defined(_TRACE_HW_EVENT_PCI_HP_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_HW_EVENT_PCI_HP_H
> +
> +#include <linux/tracepoint.h>
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM pci
> +
> +#define PCI_HOTPLUG_EVENT					\
> +	EM(PCI_HOTPLUG_LINK_UP,			"Link Up")	\
> +	EM(PCI_HOTPLUG_LINK_DOWN,		"Link Down")	\
> +	EM(PCI_HOTPLUG_CARD_PRESENT,		"Card present")	\
> +	EMe(PCI_HOTPLUG_CARD_NOT_PRESENT,	"Card not present")
> +
> +/* Enums require being exported to userspace, for user tool parsing */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)	TRACE_DEFINE_ENUM(a);
> +#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
> +
> +PCI_HOTPLUG_EVENT
> +
> +/*
> + * Now redefine the EM() and EMe() macros to map the enums to the strings
> + * that will be printed in the output.
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)	{a, b},
> +#define EMe(a, b)	{a, b}
> +
> +TRACE_EVENT(pci_hp_event,
> +
> +	TP_PROTO(const char *port_name,
> +		 const char *slot,
> +		 const int event),
> +
> +	TP_ARGS(port_name, slot, event),
> +
> +	TP_STRUCT__entry(
> +		__string(	port_name,	port_name	)
> +		__string(	slot,		slot		)
> +		__field(	int,		event	)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(port_name);
> +		__assign_str(slot);
> +		__entry->event = event;
> +	),
> +
> +	TP_printk("%s slot:%s, event:%s\n",
> +		__get_str(port_name),
> +		__get_str(slot),
> +		__print_symbolic(__entry->event, PCI_HOTPLUG_EVENT)
> +	)
> +);
> +
> +#endif /* _TRACE_HW_EVENT_PCI_HP_H */
> +
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH  ../../drivers/pci/hotplug
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE trace
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/include/uapi/linux/pci.h b/include/uapi/linux/pci.h
> index a769eefc5139..4f150028965d 100644
> --- a/include/uapi/linux/pci.h
> +++ b/include/uapi/linux/pci.h
> @@ -39,4 +39,11 @@
>   #define PCIIOC_MMAP_IS_MEM	(PCIIOC_BASE | 0x02)	/* Set mmap state to MEM space. */
>   #define PCIIOC_WRITE_COMBINE	(PCIIOC_BASE | 0x03)	/* Enable/disable write-combining. */
>   
> +enum pci_hotplug_event {
> +	PCI_HOTPLUG_LINK_UP,
> +	PCI_HOTPLUG_LINK_DOWN,
> +	PCI_HOTPLUG_CARD_PRESENT,
> +	PCI_HOTPLUG_CARD_NOT_PRESENT,
> +};
> +
>   #endif /* _UAPILINUX_PCI_H */

Hi, Lukas,

I hope you're doing well.

I would like to inquire that if there are any further actions required from my
end to proceed for this patch?

Additionally, I would appreciate if you could inform me about who will be
pick up this patch.

Thank you.

Best Regards,
Shuai

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
  2025-01-07 11:30 ` Shuai Xue
@ 2025-01-07 12:53   ` Lukas Wunner
  2025-01-08  1:47     ` Shuai Xue
  0 siblings, 1 reply; 8+ messages in thread
From: Lukas Wunner @ 2025-01-07 12:53 UTC (permalink / raw)
  To: Shuai Xue
  Cc: bhelgaas, tony.luck, bp, mhiramat, mathieu.desnoyers, oleg,
	naveen, davem, anil.s.keshavamurthy, mark.rutland, peterz,
	rostedt, linux-pci, linux-kernel, linux-edac, linux-trace-kernel

On Tue, Jan 07, 2025 at 07:30:28PM +0800, Shuai Xue wrote:
> 2024/11/23 19:31, Shuai Xue:
> > To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
> > tracepoint for hotplug event to help healthy check, and generate
> > tracepoints for pcie hotplug event. To monitor these tracepoints in
> > userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
> > header.
[...]
> I would like to inquire that if there are any further actions required
> from my end to proceed for this patch?
> 
> Additionally, I would appreciate if you could inform me about who will be
> pick up this patch.

I'm fine with this patch, as indicated by the Reviewed-by,
so no actions required from you for now.  This will be
picked up by Bjorn once he gets to it (assuming he's happy with it).
A lot of folks were on holidays and are catching up on e-mails,
so please be patient.

Thanks,

Lukas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
  2024-11-23 11:31 [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event Shuai Xue
  2025-01-07 11:30 ` Shuai Xue
@ 2025-01-07 23:19 ` Bjorn Helgaas
  2025-01-08  9:04   ` Shuai Xue
  1 sibling, 1 reply; 8+ messages in thread
From: Bjorn Helgaas @ 2025-01-07 23:19 UTC (permalink / raw)
  To: Shuai Xue
  Cc: rostedt, lukas, linux-pci, linux-kernel, linux-edac,
	linux-trace-kernel, bhelgaas, tony.luck, bp, mhiramat,
	mathieu.desnoyers, oleg, naveen, davem, anil.s.keshavamurthy,
	mark.rutland, peterz

On Sat, Nov 23, 2024 at 07:31:08PM +0800, Shuai Xue wrote:
> Hotplug events are critical indicators for analyzing hardware health,
> particularly in AI supercomputers where surprise link downs can
> significantly impact system performance and reliability. The failure
> characterization analysis illustrates the significance of failures
> caused by the Infiniband link errors. Meta observes that 2% in a machine
> learning cluster and 6% in a vision application cluster of Infiniband
> failures co-occur with GPU failures, such as falling off the bus, which
> may indicate a correlation with PCIe.[1]
> 
> To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
> tracepoint for hotplug event to help healthy check, and generate
> tracepoints for pcie hotplug event. To monitor these tracepoints in
> userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
> header.
> 
> The output like below:
> $ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable
> $ cat /sys/kernel/debug/tracing/trace_pipe
>            <...>-206     [001] .....    40.373870: pci_hp_event: 0000:00:02.0 slot:10, event:Link Down
> 
>            <...>-206     [001] .....    40.374871: pci_hp_event: 0000:00:02.0 slot:10, event:Card not present
> 
> [1]https://arxiv.org/abs/2410.21680

Doesn't apply on pci/main (v6.13-rc1); can you rebase it?

s/pcie/PCIe/ in English text.

Probably more detail than necessary about AI supercomputers,
Infiniband, vision applications, etc.  This is a very generic issue.

"Falling off the bus" doesn't really mean anything to me.  I suppose
it's another way to describe a "link down" event that leads to UR
errors when trying to access the device?

I'm guessing that monitoring these via rasdaemon requires more than
just adding "enum pci_hotplug_event"?  Or does rasdaemon read
include/uapi/linux/pci.h and automagically incorporate new events?
Maybe there's at least a rebuild involved?

Anything in the arxiv link that is specifically relevant to this patch
needs to be in the commit log itself.  But I think there's already
enough information here to motivate this change, and whatever is in
the arxiv link may be of general interest, but is probably not
required to justify, understand, or debug this useful functionality.

Bjorn

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
  2025-01-07 12:53   ` Lukas Wunner
@ 2025-01-08  1:47     ` Shuai Xue
  0 siblings, 0 replies; 8+ messages in thread
From: Shuai Xue @ 2025-01-08  1:47 UTC (permalink / raw)
  To: Lukas Wunner
  Cc: bhelgaas, tony.luck, bp, mhiramat, mathieu.desnoyers, oleg,
	naveen, davem, anil.s.keshavamurthy, mark.rutland, peterz,
	rostedt, linux-pci, linux-kernel, linux-edac, linux-trace-kernel



在 2025/1/7 20:53, Lukas Wunner 写道:
> On Tue, Jan 07, 2025 at 07:30:28PM +0800, Shuai Xue wrote:
>> 2024/11/23 19:31, Shuai Xue:
>>> To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
>>> tracepoint for hotplug event to help healthy check, and generate
>>> tracepoints for pcie hotplug event. To monitor these tracepoints in
>>> userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
>>> header.
> [...]
>> I would like to inquire that if there are any further actions required
>> from my end to proceed for this patch?
>>
>> Additionally, I would appreciate if you could inform me about who will be
>> pick up this patch.
> 
> I'm fine with this patch, as indicated by the Reviewed-by,
> so no actions required from you for now.  This will be
> picked up by Bjorn once he gets to it (assuming he's happy with it).
> A lot of folks were on holidays and are catching up on e-mails,
> so please be patient.
> 
> Thanks,
> 
> Lukas

Got it, thank you:)

Sorry for the noise.

Best Regards,
Shuai

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
  2025-01-07 23:19 ` Bjorn Helgaas
@ 2025-01-08  9:04   ` Shuai Xue
  2025-01-08 17:59     ` Bjorn Helgaas
  0 siblings, 1 reply; 8+ messages in thread
From: Shuai Xue @ 2025-01-08  9:04 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: rostedt, lukas, linux-pci, linux-kernel, linux-edac,
	linux-trace-kernel, bhelgaas, tony.luck, bp, mhiramat,
	mathieu.desnoyers, oleg, naveen, davem, anil.s.keshavamurthy,
	mark.rutland, peterz



在 2025/1/8 07:19, Bjorn Helgaas 写道:
> On Sat, Nov 23, 2024 at 07:31:08PM +0800, Shuai Xue wrote:
>> Hotplug events are critical indicators for analyzing hardware health,
>> particularly in AI supercomputers where surprise link downs can
>> significantly impact system performance and reliability. The failure
>> characterization analysis illustrates the significance of failures
>> caused by the Infiniband link errors. Meta observes that 2% in a machine
>> learning cluster and 6% in a vision application cluster of Infiniband
>> failures co-occur with GPU failures, such as falling off the bus, which
>> may indicate a correlation with PCIe.[1]
>>
>> To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
>> tracepoint for hotplug event to help healthy check, and generate
>> tracepoints for pcie hotplug event. To monitor these tracepoints in
>> userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
>> header.
>>
>> The output like below:
>> $ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable
>> $ cat /sys/kernel/debug/tracing/trace_pipe
>>             <...>-206     [001] .....    40.373870: pci_hp_event: 0000:00:02.0 slot:10, event:Link Down
>>
>>             <...>-206     [001] .....    40.374871: pci_hp_event: 0000:00:02.0 slot:10, event:Card not present
>>
>> [1]https://arxiv.org/abs/2410.21680
> 
> Doesn't apply on pci/main (v6.13-rc1); can you rebase it?

Sure. Do you mean Git repository at:

   git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
   branch main

> 
> s/pcie/PCIe/ in English text.

Will fix it.

> 
> Probably more detail than necessary about AI supercomputers,
> Infiniband, vision applications, etc.  This is a very generic issue.

Agreed. It is generic. Are you asking for the first background paragraph to be
deleted?

> 
> "Falling off the bus" doesn't really mean anything to me.  I suppose
> it's another way to describe a "link down" event that leads to UR
> errors when trying to access the device?

Sorry for the confusion. "Falling off the bus" is a common error for NVIDIA GPU
observed in production. The GPU driver will log a such message when GPU is not
accessible. And we also see many hotplug event like bellow:

[12945750.691652] pcieport 0000:42:02.0: pciehp: Slot(65): Link Down
[12945750.691655] pcieport 0000:42:02.0: pciehp: Slot(65): Card not present

> https://docs.nvidia.com/deploy/xid-errors/index.html#xid-79-gpu-has-fallen-off-the-bus

> 
> I'm guessing that monitoring these via rasdaemon requires more than
> just adding "enum pci_hotplug_event"?  Or does rasdaemon read
> include/uapi/linux/pci.h and automagically incorporate new events?
> Maybe there's at least a rebuild involved?

Yes, a rebuild is needed. Rasdaemon has a basic infrastructure to manually
register a tracepoint event handler. For example, for this new event, we can
register to handle pci_hp_event:

     rc = add_event_handler(ras, pevent, page_size, "pci", "pci_hp_event",
			   ras_pci_hp_event_handler, NULL, PCI_HOTPLUG_EVENT);

> 
> Anything in the arxiv link that is specifically relevant to this patch
> needs to be in the commit log itself.  But I think there's already
> enough information here to motivate this change, and whatever is in
> the arxiv link may be of general interest, but is probably not
> required to justify, understand, or debug this useful functionality.

I see, will remove the arxiv link.

> 
> Bjorn

Thank you for quick reply.

Best Regards,
Shuai

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
  2025-01-08  9:04   ` Shuai Xue
@ 2025-01-08 17:59     ` Bjorn Helgaas
  2025-01-09  1:40       ` Shuai Xue
  0 siblings, 1 reply; 8+ messages in thread
From: Bjorn Helgaas @ 2025-01-08 17:59 UTC (permalink / raw)
  To: Shuai Xue
  Cc: rostedt, lukas, linux-pci, linux-kernel, linux-edac,
	linux-trace-kernel, bhelgaas, tony.luck, bp, mhiramat,
	mathieu.desnoyers, oleg, naveen, davem, anil.s.keshavamurthy,
	mark.rutland, peterz

On Wed, Jan 08, 2025 at 05:04:25PM +0800, Shuai Xue wrote:
> 在 2025/1/8 07:19, Bjorn Helgaas 写道:
> > On Sat, Nov 23, 2024 at 07:31:08PM +0800, Shuai Xue wrote:
> > > Hotplug events are critical indicators for analyzing hardware health,
> > > particularly in AI supercomputers where surprise link downs can
> > > significantly impact system performance and reliability. The failure
> > > characterization analysis illustrates the significance of failures
> > > caused by the Infiniband link errors. Meta observes that 2% in a machine
> > > learning cluster and 6% in a vision application cluster of Infiniband
> > > failures co-occur with GPU failures, such as falling off the bus, which
> > > may indicate a correlation with PCIe.[1]
> > > 
> > > To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
> > > tracepoint for hotplug event to help healthy check, and generate
> > > tracepoints for pcie hotplug event. To monitor these tracepoints in
> > > userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
> > > header.
> > > 
> > > The output like below:
> > > $ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable
> > > $ cat /sys/kernel/debug/tracing/trace_pipe
> > >             <...>-206     [001] .....    40.373870: pci_hp_event: 0000:00:02.0 slot:10, event:Link Down
> > > 
> > >             <...>-206     [001] .....    40.374871: pci_hp_event: 0000:00:02.0 slot:10, event:Card not present
> > > 
> > > [1]https://arxiv.org/abs/2410.21680
> > 
> > Doesn't apply on pci/main (v6.13-rc1); can you rebase it?
> 
> Sure. Do you mean Git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
>   branch main

Yes.  The most recent -rc1 is generally a safe bet for basing patches.

> > Probably more detail than necessary about AI supercomputers,
> > Infiniband, vision applications, etc.  This is a very generic issue.
> 
> Agreed. It is generic. Are you asking for the first background paragraph to be
> deleted?

I think the important part is that hotplug and link down events are
critical indicators of hardware health.  That's enough to motivate
this patch.

> > "Falling off the bus" doesn't really mean anything to me.  I suppose
> > it's another way to describe a "link down" event that leads to UR
> > errors when trying to access the device?
> 
> Sorry for the confusion. "Falling off the bus" is a common error for
> NVIDIA GPU observed in production. The GPU driver will log a such
> message when GPU is not accessible.

Yep, I see those too, and I wish the message weren't phrased so
casually.  IIRC this is typically logged when an MMIO read returns ~0,
which happens when a UR or similar error occurs.

> > I'm guessing that monitoring these via rasdaemon requires more than
> > just adding "enum pci_hotplug_event"?  Or does rasdaemon read
> > include/uapi/linux/pci.h and automagically incorporate new events?
> > Maybe there's at least a rebuild involved?
> 
> Yes, a rebuild is needed. Rasdaemon has a basic infrastructure to manually
> register a tracepoint event handler. For example, for this new event, we can
> register to handle pci_hp_event:
> 
>     rc = add_event_handler(ras, pevent, page_size, "pci", "pci_hp_event",
> 			   ras_pci_hp_event_handler, NULL, PCI_HOTPLUG_EVENT);

I would say something like "Add enum pci_hotplug_event in
include/uapi/linux/pci.h so applications like rasdaemon can register
tracepoint event handlers for it."

Bjorn

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event
  2025-01-08 17:59     ` Bjorn Helgaas
@ 2025-01-09  1:40       ` Shuai Xue
  0 siblings, 0 replies; 8+ messages in thread
From: Shuai Xue @ 2025-01-09  1:40 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: rostedt, lukas, linux-pci, linux-kernel, linux-edac,
	linux-trace-kernel, bhelgaas, tony.luck, bp, mhiramat,
	mathieu.desnoyers, oleg, naveen, davem, anil.s.keshavamurthy,
	mark.rutland, peterz



在 2025/1/9 01:59, Bjorn Helgaas 写道:
> On Wed, Jan 08, 2025 at 05:04:25PM +0800, Shuai Xue wrote:
>> 在 2025/1/8 07:19, Bjorn Helgaas 写道:
>>> On Sat, Nov 23, 2024 at 07:31:08PM +0800, Shuai Xue wrote:
>>>> Hotplug events are critical indicators for analyzing hardware health,
>>>> particularly in AI supercomputers where surprise link downs can
>>>> significantly impact system performance and reliability. The failure
>>>> characterization analysis illustrates the significance of failures
>>>> caused by the Infiniband link errors. Meta observes that 2% in a machine
>>>> learning cluster and 6% in a vision application cluster of Infiniband
>>>> failures co-occur with GPU failures, such as falling off the bus, which
>>>> may indicate a correlation with PCIe.[1]
>>>>
>>>> To this end, define a new TRACING_SYSTEM named pci, add a generic RAS
>>>> tracepoint for hotplug event to help healthy check, and generate
>>>> tracepoints for pcie hotplug event. To monitor these tracepoints in
>>>> userspace, e.g. with rasdaemon, put `enum pci_hotplug_event` in uapi
>>>> header.
>>>>
>>>> The output like below:
>>>> $ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable
>>>> $ cat /sys/kernel/debug/tracing/trace_pipe
>>>>              <...>-206     [001] .....    40.373870: pci_hp_event: 0000:00:02.0 slot:10, event:Link Down
>>>>
>>>>              <...>-206     [001] .....    40.374871: pci_hp_event: 0000:00:02.0 slot:10, event:Card not present
>>>>
>>>> [1]https://arxiv.org/abs/2410.21680
>>>
>>> Doesn't apply on pci/main (v6.13-rc1); can you rebase it?
>>
>> Sure. Do you mean Git repository at:
>>
>>    git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
>>    branch main
> 
> Yes.  The most recent -rc1 is generally a safe bet for basing patches.

Got it. Will send a new version later.

> 
>>> Probably more detail than necessary about AI supercomputers,
>>> Infiniband, vision applications, etc.  This is a very generic issue.
>>
>> Agreed. It is generic. Are you asking for the first background paragraph to be
>> deleted?
> 
> I think the important part is that hotplug and link down events are
> critical indicators of hardware health.  That's enough to motivate
> this patch.

OK, I'd like to rewrite with a generic motivation.

> 
>>> "Falling off the bus" doesn't really mean anything to me.  I suppose
>>> it's another way to describe a "link down" event that leads to UR
>>> errors when trying to access the device?
>>
>> Sorry for the confusion. "Falling off the bus" is a common error for
>> NVIDIA GPU observed in production. The GPU driver will log a such
>> message when GPU is not accessible.
> 
> Yep, I see those too, and I wish the message weren't phrased so
> casually.  IIRC this is typically logged when an MMIO read returns ~0,
> which happens when a UR or similar error occurs.
> 
>>> I'm guessing that monitoring these via rasdaemon requires more than
>>> just adding "enum pci_hotplug_event"?  Or does rasdaemon read
>>> include/uapi/linux/pci.h and automagically incorporate new events?
>>> Maybe there's at least a rebuild involved?
>>
>> Yes, a rebuild is needed. Rasdaemon has a basic infrastructure to manually
>> register a tracepoint event handler. For example, for this new event, we can
>> register to handle pci_hp_event:
>>
>>      rc = add_event_handler(ras, pevent, page_size, "pci", "pci_hp_event",
>> 			   ras_pci_hp_event_handler, NULL, PCI_HOTPLUG_EVENT);
> 
> I would say something like "Add enum pci_hotplug_event in
> include/uapi/linux/pci.h so applications like rasdaemon can register
> tracepoint event handlers for it."

Will rewrite it.

> 
> Bjorn

Thank you for valuable comments.

Best Regards,
Shuai


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2025-01-09  1:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-23 11:31 [PATCH v4] PCI: hotplug: Add a generic RAS tracepoint for hotplug event Shuai Xue
2025-01-07 11:30 ` Shuai Xue
2025-01-07 12:53   ` Lukas Wunner
2025-01-08  1:47     ` Shuai Xue
2025-01-07 23:19 ` Bjorn Helgaas
2025-01-08  9:04   ` Shuai Xue
2025-01-08 17:59     ` Bjorn Helgaas
2025-01-09  1:40       ` Shuai Xue

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).