linux-acpi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Len Brown <lenb@kernel.org>
To: linux-acpi@vger.kernel.org
Cc: Huang Ying <ying.huang@intel.com>, Len Brown <len.brown@intel.com>
Subject: [PATCH 50/60] ACPI, APEI, Use ERST for hardware error persisting before panic
Date: Mon, 25 Oct 2010 02:20:58 -0400	[thread overview]
Message-ID: <d7a625a122d5990cb7b4fe131ac76005b4f88ff5.1287987547.git.len.brown@intel.com> (raw)
In-Reply-To: <1287987668-17584-1-git-send-email-lenb@kernel.org>
In-Reply-To: <a210080195c95ebca2a517ee3057d71607aa65e0.1287987547.git.len.brown@intel.com>

From: Huang Ying <ying.huang@intel.com>

Corrected/Recoverable hardware errors can usually be saved into disk
and/or sent to network for logging.  But if uncorrected/fatal hardware
errors occur, system may not reliable enough for desk and/or network
accessing and needs to go panic as soon as possible for error
containment.  In this situation, ERST can be used to save the valuable
error records into some persistent storage such as flash.

This patch implements herr_persist interface using ERST.  So that
hardware error device core can use ERST to save hardware error record
before panic.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/apei/Kconfig |    1 +
 drivers/acpi/apei/cper.c  |   19 +++++
 drivers/acpi/apei/erst.c  |  169 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cper.h      |    1 +
 4 files changed, 190 insertions(+), 0 deletions(-)

diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index fca34cc..e7501d6 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -1,6 +1,7 @@
 config ACPI_APEI
 	bool "ACPI Platform Error Interface (APEI)"
 	depends on X86
+	select HERR_DEV_CORE
 	help
 	  APEI allows to report errors (for example from the chipset)
 	  to the operating system. This improves NMI handling
diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
index f4cf2fc..ef72fb1 100644
--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -29,6 +29,25 @@
 #include <linux/time.h>
 #include <linux/cper.h>
 #include <linux/acpi.h>
+#include <linux/herror_record.h>
+
+int herr_severity_to_cper(int herr_severity)
+{
+	switch (herr_severity) {
+	case HERR_SEV_NONE:
+		return CPER_SEV_INFORMATIONAL;
+	case HERR_SEV_CORRECTED:
+		return CPER_SEV_CORRECTED;
+	case HERR_SEV_RECOVERABLE:
+		return CPER_SEV_RECOVERABLE;
+	case HERR_SEV_FATAL:
+		return CPER_SEV_FATAL;
+	default:
+		BUG();
+		return CPER_SEV_FATAL;
+	}
+}
+EXPORT_SYMBOL_GPL(herr_severity_to_cper);
 
 /*
  * CPER record ID need to be unique even after reboot, because record
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index addd47f..2c37855 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -33,6 +33,7 @@
 #include <linux/uaccess.h>
 #include <linux/cper.h>
 #include <linux/nmi.h>
+#include <linux/herror.h>
 #include <linux/hardirq.h>
 #include <acpi/apei.h>
 
@@ -88,6 +89,12 @@ static struct erst_erange {
  */
 static DEFINE_SPINLOCK(erst_lock);
 
+static void *erst_buf;
+static unsigned int erst_buf_len;
+
+/* Prevent erst_buf from being accessed simultaneously */
+static DEFINE_MUTEX(erst_buf_mutex);
+
 static inline int erst_errno(int command_status)
 {
 	switch (command_status) {
@@ -774,6 +781,12 @@ static int __erst_write_to_nvram(const struct cper_record_header *record)
 	return -ENOSYS;
 }
 
+static int __erst_write_herr_record_to_nvram(const struct herr_record *ercd)
+{
+	/* do not print message, because printk is not safe for NMI */
+	return -ENOSYS;
+}
+
 static int __erst_read_to_erange_from_nvram(u64 record_id, u64 *offset)
 {
 	pr_unimpl_nvram();
@@ -910,6 +923,156 @@ out:
 }
 EXPORT_SYMBOL_GPL(erst_clear);
 
+#define CPER_CREATOR_ERST						\
+	UUID_LE(0xEACBBA0C, 0x803A, 0x4096, 0xB1, 0x1D, 0xC3, 0xC7,	\
+		0x6E, 0xE7, 0x94, 0xF9)
+
+#define CPER_SEC_HERR_RECORD						\
+	UUID_LE(0x633AB656, 0x6703, 0x11DF, 0x87, 0xCF, 0x00, 0x19,	\
+		0xD1, 0x2A, 0x29, 0xEF)
+
+static ssize_t erst_herr_record_to_cper(struct cper_record_header *crcd,
+					size_t buf_size,
+					const struct herr_record *ercd)
+{
+	struct cper_section_descriptor *csec;
+	unsigned int crcd_len;
+	void *csec_data;
+
+	crcd_len = sizeof(*crcd) + sizeof(*csec) + ercd->length;
+	if (crcd_len > buf_size)
+		return crcd_len;
+
+	memset(crcd, 0, crcd_len);
+	memcpy(crcd->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	crcd->revision = CPER_RECORD_REV;
+	crcd->signature_end = CPER_SIG_END;
+	crcd->error_severity = herr_severity_to_cper(ercd->severity);
+	/* timestamp, platform_id, partition_id is invalid */
+	crcd->validation_bits = 0;
+	crcd->creator_id = CPER_CREATOR_ERST;
+	crcd->section_count = 1;
+	crcd->record_length = crcd_len;
+	crcd->record_id = ercd->id;
+
+	csec = (struct cper_section_descriptor *)(crcd + 1);
+	csec_data = csec + 1;
+
+	csec->section_length = ercd->length;
+	csec->revision = CPER_SEC_REV;
+	csec->section_type = CPER_SEC_HERR_RECORD;
+	csec->section_severity = crcd->error_severity;
+	csec->section_offset = (void *)csec_data - (void *)crcd;
+
+	memcpy(csec_data, ercd, ercd->length);
+
+	return crcd_len;
+}
+
+static int erst_write_herr_record(const struct herr_record *ercd)
+{
+	struct cper_record_header *crcd;
+	ssize_t crcd_len;
+	unsigned long flags;
+	int rc;
+
+	if (!spin_trylock_irqsave(&erst_lock, flags))
+		return -EBUSY;
+
+	if (erst_erange.attr & ERST_RANGE_NVRAM) {
+		rc = __erst_write_herr_record_to_nvram(ercd);
+		goto out;
+	}
+
+	rc = -EINVAL;
+	crcd_len = erst_herr_record_to_cper(erst_erange.vaddr,
+					    erst_erange.size, ercd);
+	if (crcd_len > erst_erange.size)
+		goto out;
+	crcd = erst_erange.vaddr;
+	/* signature for serialization system */
+	memcpy(&crcd->persistence_information, "ER", 2);
+	rc = __erst_write_to_storage(0);
+out:
+	spin_unlock_irqrestore(&erst_lock, flags);
+
+	return rc;
+}
+
+static ssize_t erst_persist_peek_user(u64 *record_id, char __user *ubuf,
+				      size_t usize)
+{
+	int rc, pos;
+	ssize_t len, clen;
+	u64 id;
+	struct cper_record_header *crcd;
+	struct cper_section_descriptor *csec;
+	struct herr_record *ercd;
+
+	if (mutex_lock_interruptible(&erst_buf_mutex) != 0)
+		return -EINTR;
+	erst_get_record_id_begin(&pos);
+retry_next:
+	len = 0;
+	rc = erst_get_record_id_next(&pos, &id);
+	if (rc)
+		goto out;
+	/* no more record */
+	if (id == APEI_ERST_INVALID_RECORD_ID)
+		goto out;
+retry:
+	rc = clen = erst_read(id, erst_buf, erst_buf_len);
+	/* someone else has cleared the record, try next one */
+	if (rc == -ENOENT)
+		goto retry_next;
+	else if (rc < 0)
+		goto out;
+	else if (clen > erst_buf_len) {
+		void *p;
+		rc = -ENOMEM;
+		p = kmalloc(clen, GFP_KERNEL);
+		if (!p)
+			goto out;
+		kfree(erst_buf);
+		erst_buf = p;
+		erst_buf_len = clen;
+		goto retry;
+	}
+
+	crcd = erst_buf;
+	csec = (struct cper_section_descriptor *)(crcd + 1);
+	if (crcd->section_count != 1 ||
+	    uuid_le_cmp(crcd->creator_id, CPER_CREATOR_ERST) ||
+	    uuid_le_cmp(csec->section_type, CPER_SEC_HERR_RECORD))
+		goto retry_next;
+
+	ercd = (struct herr_record *)(csec + 1);
+	len = ercd->length;
+
+	rc = -EINVAL;
+	if (len > usize)
+		goto out;
+
+	ercd->flags |= HERR_RCD_PREV | HERR_RCD_PERSIST;
+
+	rc = -EFAULT;
+	if (copy_to_user(ubuf, ercd, len))
+		goto out;
+	*record_id = id;
+	rc = 0;
+out:
+	erst_get_record_id_end();
+	mutex_unlock(&erst_buf_mutex);
+	return rc ? rc : len;
+}
+
+static struct herr_persist erst_persist = {
+	.name		= "ERST",
+	.in		= erst_write_herr_record,
+	.peek_user	= erst_persist_peek_user,
+	.clear		= erst_clear,
+};
+
 static int __init setup_erst_disable(char *str)
 {
 	erst_disable = 1;
@@ -1007,11 +1170,17 @@ static int __init erst_init(void)
 	if (!erst_erange.vaddr)
 		goto err_release_erange;
 
+	rc = herr_persist_register(&erst_persist);
+	if (rc)
+		goto err_unmap_erange;
+
 	pr_info(ERST_PFX
 	"Error Record Serialization Table (ERST) support is initialized.\n");
 
 	return 0;
 
+err_unmap_erange:
+	iounmap(erst_erange.vaddr);
 err_release_erange:
 	release_mem_region(erst_erange.base, erst_erange.size);
 err_unmap_reg:
diff --git a/include/linux/cper.h b/include/linux/cper.h
index bf972f8..7e00f1e 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -309,6 +309,7 @@ struct cper_sec_mem_err {
 /* Reset to default packing */
 #pragma pack()
 
+int herr_severity_to_cper(int herr_severity);
 u64 cper_next_record_id(void);
 
 #endif
-- 
1.7.3.2.90.gd4c43


  parent reply	other threads:[~2010-10-25  8:11 UTC|newest]

Thread overview: 62+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-10-25  6:20 ACPI patches for 2.6.37-merge Len Brown
2010-10-25  6:20 ` [PATCH 01/60] ACPI / ACPICA: Defer enabling of runtime GPEs (v3) Len Brown
2010-10-25  6:20   ` [PATCH 02/60] ACPICA: Fix acpi_os_read_pci_configuration prototype Len Brown
2010-10-25  6:20   ` [PATCH 03/60] ACPICA: Revert "Revert "Enable multi-byte EC transfers Len Brown
2010-10-25  6:20   ` [PATCH 04/60] ACPICA/ACPI: Add new host interfaces for _OSI support Len Brown
2010-10-25  6:20   ` [PATCH 05/60] ACPICA: Update version to 20100806 Len Brown
2010-10-25  6:20   ` [PATCH 06/60] ACPICA: Obsolete the acpi_os_derive_pci_id OSL interface Len Brown
2010-10-25  6:20   ` [PATCH 07/60] ACPICA: Add ACPI_INLINE configuration parameter Len Brown
2010-10-25  6:20   ` [PATCH 08/60] ACPICA: Make acpi_thread_id no longer configurable, always u64 Len Brown
2010-10-25  6:20   ` [PATCH 09/60] ACPICA: Update math module; no functional change Len Brown
2010-10-25  6:20   ` [PATCH 10/60] ACPICA: Make acpi_gbl_system_awake_and_running publically available Len Brown
2010-10-25  6:20   ` [PATCH 11/60] ACPICA: iASL/Disassembler: Write ACPI errors to stderr instead of output file Len Brown
2010-10-25  6:20   ` [PATCH 12/60] ACPICA: Add repair for _HID and _CID strings Len Brown
2010-10-25  6:20   ` [PATCH 13/60] ACPICA: Increase configurability of error messages Len Brown
2010-10-25  6:20   ` [PATCH 14/60] ACPICA: Update version to 20100915 Len Brown
2010-10-25  6:20   ` [PATCH 15/60] PNP: log PNP resources, as we do for PCI Len Brown
2010-10-25  6:20   ` [PATCH 16/60] PNPACPI: cope with invalid device IDs Len Brown
2010-10-25  6:20   ` [PATCH 17/60] ACPI: Remove unused #define ACPI_PROCESSOR_FILE_POWER Len Brown
2010-10-25  6:20   ` [PATCH 18/60] ACPI: Do not export hid/modalias sysfs file for ACPI objects without a HID Len Brown
2010-10-25  6:20   ` [PATCH 19/60] ACPI/PNP: A HID value of an object never changes -> make it const Len Brown
2010-10-25  6:20   ` [PATCH 20/60] acpi-cpufreq: fix a memleak when unloading driver Len Brown
2010-10-25  6:20   ` [PATCH 21/60] ACPI / PM: Fix problems with acpi_pm_device_sleep_state() Len Brown
2010-10-25  6:20   ` [PATCH 22/60] ACPI: add FW_BUG to OSI(Linux) message Len Brown
2010-10-25  6:20   ` [PATCH 23/60] ACPI ac/battery/sbs: sysfs I/F always built in, procfs I/F disabled by default Len Brown
2010-10-25  6:20   ` [PATCH 24/60] ACPI fan: remove deprecated procfs I/F Len Brown
2010-10-25  6:20   ` [PATCH 25/60] ACPI thermal: " Len Brown
2010-10-25  6:20   ` [PATCH 26/60] ACPI video: " Len Brown
2010-10-25  6:20   ` [PATCH 27/60] ACPI processor: make /proc/acpi/processor/*/throttle depends on CONFIG_ACPI_PROCFS Len Brown
2010-10-25  6:20   ` [PATCH 28/60] ACPI: remove unused declaration of proc_fs.h Len Brown
2010-10-25  6:20   ` [PATCH 29/60] ACPICA: Comment update; no functional change Len Brown
2010-10-25  6:20   ` [PATCH 30/60] ACPICA: Change type of _TZ from ThermalZone to Device Len Brown
2010-10-25  6:20   ` [PATCH 31/60] ACPICA: Eliminate duplicate code in acpi_ut_execute_* functions Len Brown
2010-10-25  6:20   ` [PATCH 32/60] ACPICA: Add Vista SP2 to supported _OSI strings Len Brown
2010-10-25  6:20   ` [PATCH 33/60] ACPICA: Clear PCIEXP_WAKE_STS when clearing ACPI events Len Brown
2010-10-25  6:20   ` [PATCH 34/60] ACPICA: Update version to 20101013 Len Brown
2010-10-25  6:20   ` [PATCH 35/60] ACPI: Only processor needs CPU_IDLE Len Brown
2010-10-25  6:20   ` [PATCH 36/60] ACPI: delete dedicated MAINTAINERS entries for ACPI EC and BATTERY drivers Len Brown
2010-10-25  6:20   ` [PATCH 37/60] ACPI: remove dead code Len Brown
2010-10-25  6:20   ` [PATCH 38/60] ACPI: static sleep_states[] and acpi_gts_bfs_check Len Brown
2010-10-25  6:20   ` [PATCH 39/60] ACPI: thermal: remove unused limit code Len Brown
2010-10-25  6:20   ` [PATCH 40/60] ACPI dock: move some functions to .init.text Len Brown
2010-10-25  6:20   ` [PATCH 41/60] ACPI: Make Embedded Controller command timeout delay configurable Len Brown
2010-10-25  6:20   ` [PATCH 42/60] ACPI battery: support percentage battery remaining capacity Len Brown
2010-10-25  6:20   ` [PATCH 43/60] Subject: [PATCH] ACPICA: Fix Scope() op in module level code Len Brown
2010-10-25  6:20   ` [PATCH 44/60] ACPI, APEI, Add ERST record ID cache Len Brown
2010-10-25  8:56     ` Ingo Molnar
2010-10-25  6:20   ` [PATCH 45/60] Add lock-less version of bitmap_set/clear Len Brown
2010-10-25  6:20   ` [PATCH 46/60] lock-less NULL terminated single list implementation Len Brown
2010-10-25  6:20   ` [PATCH 47/60] lock-less general memory allocator Len Brown
2010-10-25  6:20   ` [PATCH 48/60] Hardware error device core Len Brown
2010-10-25  6:20   ` [PATCH 49/60] Hardware error record persistent support Len Brown
2010-10-25  6:20   ` Len Brown [this message]
2010-10-25  6:20   ` [PATCH 51/60] ACPI, APEI, Report GHES error record with hardware error device core Len Brown
2010-10-25  6:21   ` [PATCH 52/60] ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Len Brown
2010-10-25  6:21   ` [PATCH 53/60] ACPI / PM: Fix reference counting of power resources Len Brown
2010-10-25  6:21   ` [PATCH 54/60] ACPI / Battery: Return -ENODEV for unknown values in get_property() Len Brown
2010-10-25  6:21   ` [PATCH 55/60] ACPI: Fix ioremap size for MMIO reads and writes Len Brown
2010-10-25  6:21   ` [PATCH 56/60] ACPI: Maintain a list of ACPI memory mapped I/O remappings Len Brown
2010-10-25  6:21   ` [PATCH 57/60] ACPI: Add interfaces for ioremapping/iounmapping ACPI registers Len Brown
2010-10-25  6:21   ` [PATCH 58/60] ACPI: Pre-map 'system event' related register blocks Len Brown
2010-10-25  6:21   ` [PATCH 59/60] ACPI: Convert simple locking to RCU based locking Len Brown
2010-10-25  6:21   ` [PATCH 60/60] ACPI: Page based coalescing of I/O remappings optimization Len Brown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d7a625a122d5990cb7b4fe131ac76005b4f88ff5.1287987547.git.len.brown@intel.com \
    --to=lenb@kernel.org \
    --cc=len.brown@intel.com \
    --cc=linux-acpi@vger.kernel.org \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).