public inbox for linux-edac@vger.kernel.org
 help / color / mirror / Atom feed
From: Shiju Jose <shiju.jose@huawei.com>
To: Alison Schofield <alison.schofield@intel.com>
Cc: "linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>,
	"linux-cxl@vger.kernel.org" <linux-cxl@vger.kernel.org>,
	"mchehab@kernel.org" <mchehab@kernel.org>,
	Jonathan Cameron <jonathan.cameron@huawei.com>,
	Linuxarm <linuxarm@huawei.com>
Subject: RE: [RFC PATCH 2/4] rasdaemon: Add support for the CXL poison events
Date: Mon, 23 Jan 2023 11:14:05 +0000	[thread overview]
Message-ID: <f0dc812ee02e4358b8c457cb2d558615@huawei.com> (raw)
In-Reply-To: <Y84WZp4K3mLzLkwh@aschofie-mobl2>

Hi Alison,

Thanks for the feedback.

>-----Original Message-----
>From: Alison Schofield <alison.schofield@intel.com>
>Sent: 23 January 2023 05:09
>To: Shiju Jose <shiju.jose@huawei.com>
>Cc: linux-edac@vger.kernel.org; linux-cxl@vger.kernel.org;
>mchehab@kernel.org; Jonathan Cameron
><jonathan.cameron@huawei.com>; Linuxarm <linuxarm@huawei.com>
>Subject: Re: [RFC PATCH 2/4] rasdaemon: Add support for the CXL poison
>events
>
>On Thu, Jan 19, 2023 at 05:18:07PM +0000, shiju.jose@huawei.com wrote:
>> From: Shiju Jose <shiju.jose@huawei.com>
>>
>> Add support to log and record the CXL poison events.
>>
>> The corresponding Kernel patches here:
>> https://lore.kernel.org/lkml/cover.1668115235.git.alison.schofield@int
>> el.com/
>>
>> Presently RFC draft version for logging, could be extended for the
>> policy based recovery action for the frequent poison events depending
>> on the above kernel patches.
>>
>> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
>> ---
>>  Makefile.am       |   8 ++-
>>  configure.ac      |  11 ++++
>>  ras-cxl-handler.c | 162
>> ++++++++++++++++++++++++++++++++++++++++++++++
>>  ras-cxl-handler.h |  24 +++++++
>>  ras-events.c      |  15 +++++
>>  ras-events.h      |   1 +
>>  ras-record.c      |  81 +++++++++++++++++++++++
>>  ras-record.h      |  20 ++++++
>>  ras-report.c      |  83 ++++++++++++++++++++++++
>>  ras-report.h      |   2 +
>>  10 files changed, 406 insertions(+), 1 deletion(-)  create mode
>> 100644 ras-cxl-handler.c  create mode 100644 ras-cxl-handler.h
>>
>> diff --git a/Makefile.am b/Makefile.am index a322b9a..4216370 100644
>> --- a/Makefile.am
>> +++ b/Makefile.am
>> @@ -69,13 +69,19 @@ endif
>>  if WITH_AMP_NS_DECODE
>>     rasdaemon_SOURCES += non-standard-ampere.c  endif
>> +
>> +if WITH_CXL
>> +   rasdaemon_SOURCES += ras-cxl-handler.c endif
>> +
>>  rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
>>
>>  include_HEADERS = config.h  ras-events.h  ras-logger.h  ras-mc-handler.h \
>>  		  ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h
>ras-report.h \
>>  		  ras-extlog-handler.h ras-arm-handler.h ras-non-standard-
>handler.h \
>>  		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-
>page-isolation.h \
>> -		  non-standard-hisilicon.h non-standard-ampere.h ras-
>memory-failure-handler.h
>> +		  non-standard-hisilicon.h non-standard-ampere.h ras-
>memory-failure-handler.h \
>> +		  ras-cxl-handler.h
>>
>>  # This rule can't be called with more than one Makefile job (like
>> make -j8)  # I can't figure out a way to fix that diff --git
>> a/configure.ac b/configure.ac index a77991f..c18a67d 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes"
>||
>> test "x$enable_all" == "xyes"],
>AM_CONDITIONAL([WITH_AMP_NS_DECODE],
>> [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes])
>> AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"],
>> [USE_AMP_NS_DECODE="no"])
>>
>> +AC_ARG_ENABLE([cxl],
>> +    AS_HELP_STRING([--enable-cxl], [enable CXL events (currently
>> +experimental)]))
>> +
>> +AS_IF([test "x$enable_cxl" = "xyes" || test "x$enable_all" ==
>> +"xyes"], [
>> +  AC_DEFINE(HAVE_CXL,1,"have CXL events collect")
>> +  AC_SUBST([WITH_CXL])
>> +])
>> +AM_CONDITIONAL([WITH_CXL], [test x$enable_cxl = xyes || test
>> +x$enable_all == xyes]) AM_COND_IF([WITH_CXL], [USE_CXL="yes"],
>> +[USE_CXL="no"])
>> +
>>  test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
>>
>>  CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
>> @@ -201,4 +211,5 @@ compile time options summary
>>      Memory Failure      : $USE_MEMORY_FAILURE
>>      Memory CE PFA       : $USE_MEMORY_CE_PFA
>>      AMP RAS errors      : $USE_AMP_NS_DECODE
>> +    CXL events          : $USE_CXL
>>  EOF
>> diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c new file mode
>> 100644 index 0000000..11531ef
>> --- /dev/null
>> +++ b/ras-cxl-handler.c
>> @@ -0,0 +1,162 @@
>> +/*
>> + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> +modify
>> + * it under the terms of the GNU General Public License as published
>> +by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <unistd.h>
>> +#include "libtrace/kbuffer.h"
>> +#include "ras-cxl-handler.h"
>> +#include "ras-record.h"
>> +#include "ras-logger.h"
>> +#include "ras-report.h"
>> +
>> +/* Poison List: Payload out flags */
>> +#define CXL_POISON_FLAG_MORE            BIT(0)
>> +#define CXL_POISON_FLAG_OVERFLOW        BIT(1)
>> +#define CXL_POISON_FLAG_SCANNING        BIT(2)
>> +
>> +/* CXL poison - source types */
>> +enum cxl_poison_source {
>> +	CXL_POISON_SOURCE_UNKNOWN = 0,
>> +	CXL_POISON_SOURCE_EXTERNAL = 1,
>> +	CXL_POISON_SOURCE_INTERNAL = 2,
>> +	CXL_POISON_SOURCE_INJECTED = 3,
>> +	CXL_POISON_SOURCE_VENDOR = 7,
>> +};
>> +
>> +int ras_cxl_poison_event_handler(struct trace_seq *s,
>> +				 struct pevent_record *record,
>> +				 struct event_format *event, void *context) {
>> +	int len;
>> +	unsigned long long val;
>> +	struct ras_events *ras = context;
>> +	time_t now;
>> +	struct tm *tm;
>> +	struct ras_cxl_poison_event ev;
>> +
>> +	now = record->ts/user_hz + ras->uptime_diff;
>> +	tm = localtime(&now);
>> +	if (tm)
>> +		strftime(ev.timestamp, sizeof(ev.timestamp),
>> +			 "%Y-%m-%d %H:%M:%S %z", tm);
>> +	else
>> +		strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000",
>sizeof(ev.timestamp));
>> +	trace_seq_printf(s, "%s ", ev.timestamp);
>> +
>> +	ev.memdev = pevent_get_field_raw(s, event, "memdev",
>> +					   record, &len, 1);
>> +	if (!ev.memdev)
>> +		return -1;
>> +	trace_seq_printf(s, "memdev:%s ", ev.memdev);
>> +
>> +	ev.pcidev = pevent_get_field_raw(s, event, "pcidev",
>> +					   record, &len, 1);
>> +	if (!ev.pcidev)
>> +		return -1;
>> +	trace_seq_printf(s, "pcidev:%s ", ev.pcidev);
>> +
>> +	ev.region = pevent_get_field_raw(s, event, "region",
>> +					   record, &len, 1);
>> +	if (!ev.region)
>> +		return -1;
>> +	trace_seq_printf(s, "region:%s ", ev.region);
>
>Hi Shiju,
>
>Does the above work OK when the region name is assigned a NULL string?
>That's what happens in the trace code (__assign_str(region, "");) when region
>is NULL.
>
I checked. It work ok with NULL string in the region name because pevent_get_field_raw()
returns the pointer to the data field.  
   
>Not sure if this is different since you last tested w posted poison set. Latest
>are here:
>
>https://lore.kernel.org/linux-
>cxl/de11785ff05844299b40b100f8e0f56c7eef7f08.1674070170.git.alison.schofie
>ld@intel.com/

I tested today with the v5 patch set, it worked ok.
 
>
>Alison
>
>> +
>> +	ev.uuid = pevent_get_field_raw(s, event, "uuid",
...
>>
>> --
>> 2.25.1
>>
Thanks,
Shiju

  reply	other threads:[~2023-01-23 11:14 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-19 17:18 [RFC PATCH 0/4] rasdaemon: Add support for the CXL error events shiju.jose
2023-01-19 17:18 ` [RFC PATCH 1/4] rasdaemon: Move definition for BIT and BIT_ULL to a common file shiju.jose
2023-01-20 14:28   ` Jonathan Cameron
2023-01-19 17:18 ` [RFC PATCH 2/4] rasdaemon: Add support for the CXL poison events shiju.jose
2023-01-23  5:08   ` Alison Schofield
2023-01-23 11:14     ` Shiju Jose [this message]
2023-01-23 12:21   ` Jonathan Cameron
2023-01-19 17:18 ` [RFC PATCH 3/4] rasdaemon: Add support for the CXL AER uncorrectable errors shiju.jose
2023-01-20 16:21   ` Dave Jiang
2023-01-20 18:11     ` Shiju Jose
2023-01-23 12:22   ` Jonathan Cameron
2023-01-19 17:18 ` [RFC PATCH 4/4] rasdaemon: Add support for the CXL AER correctable errors shiju.jose
2023-01-23 12:24   ` Jonathan Cameron

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f0dc812ee02e4358b8c457cb2d558615@huawei.com \
    --to=shiju.jose@huawei.com \
    --cc=alison.schofield@intel.com \
    --cc=jonathan.cameron@huawei.com \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-edac@vger.kernel.org \
    --cc=linuxarm@huawei.com \
    --cc=mchehab@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox