All of lore.kernel.org
 help / color / mirror / Atom feed
From: Borislav Petkov <bp@alien8.de>
To: Ingo Molnar <mingo@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>, Jiri Kosina <jkosina@suse.cz>,
	x86-ml <x86@kernel.org>, linux-edac <linux-edac@vger.kernel.org>,
	lkml <linux-kernel@vger.kernel.org>,
	"Rafael J. Wysocki" <rjw@rjwysocki.net>,
	Len Brown <lenb@kernel.org>
Subject: [GIT PULL] RAS for 4.2
Date: Mon, 4 May 2015 11:33:47 +0200	[thread overview]
Message-ID: <20150504093347.GC3829@pd.tnic> (raw)

Hi Ingo,

please queue for 4.2.

Thanks.

@Len, @Rafael: Guys, I'm sending this through tip even though strictly
speaking it is ACPI.

I also am guessing to the point of being almost right, though, that
you're secretly happy I'm doing that because it is one less issue you
have to deal with. :-) :-)

Let me know if you still want to pick that up though.

Thanks.

---
The following changes since commit b787f68c36d49bb1d9236f403813641efa74a031:

  Linux 4.1-rc1 (2015-04-26 17:59:10 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git tags/ras_for_4.2

for you to fetch changes up to 6fe9e7c26a97105645fd24f264f1b94e21aade3e:

  GHES: Make NMI handler have a single reader (2015-04-27 21:35:33 +0200)

----------------------------------------------------------------
GHES: Seriously speedup and cleanup NMI handler (Jiri Kosina and Borislav Petkov)

This is the result of us seeing this during boot

[   24.332560] INFO: NMI handler (ghes_notify_nmi) took too long to run: 3.265 msecs
[   24.332567] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.946 msecs
[   24.332568] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.948 msecs

and a report of people running perf stat and the machine softlocking.
"hest_disable" was helping in this case, which made us look at that
particular NMI handler. It was grabbing a lock each time it is run and
on each CPU. But this is not needed as the GHES sources are global and
they need only a single reader.

This patchset does that and cleans up the handler in the process.

----------------------------------------------------------------
Borislav Petkov (4):
      GHES: Carve out error queueing in a separate function
      GHES: Carve out the panic functionality
      GHES: Panic right after detection
      GHES: Elliminate double-loop in the NMI handler

Jiri Kosina (1):
      GHES: Make NMI handler have a single reader

 drivers/acpi/apei/ghes.c | 108 ++++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 53 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e82d0976a5d0..2bfd53cbfe80 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -729,10 +729,10 @@ static struct llist_head ghes_estatus_llist;
 static struct irq_work ghes_proc_irq_work;
 
 /*
- * NMI may be triggered on any CPU, so ghes_nmi_lock is used for
- * mutual exclusion.
+ * NMI may be triggered on any CPU, so ghes_in_nmi is used for
+ * having only one concurrent reader.
  */
-static DEFINE_RAW_SPINLOCK(ghes_nmi_lock);
+static atomic_t ghes_in_nmi = ATOMIC_INIT(0);
 
 static LIST_HEAD(ghes_nmi);
 
@@ -797,73 +797,75 @@ static void ghes_print_queued_estatus(void)
 	}
 }
 
+/* Save estatus for further processing in IRQ context */
+static void __process_error(struct ghes *ghes)
+{
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+	u32 len, node_len;
+	struct ghes_estatus_node *estatus_node;
+	struct acpi_hest_generic_status *estatus;
+
+	if (ghes_estatus_cached(ghes->estatus))
+		return;
+
+	len = cper_estatus_len(ghes->estatus);
+	node_len = GHES_ESTATUS_NODE_LEN(len);
+
+	estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
+	if (!estatus_node)
+		return;
+
+	estatus_node->ghes = ghes;
+	estatus_node->generic = ghes->generic;
+	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+	memcpy(estatus, ghes->estatus, len);
+	llist_add(&estatus_node->llnode, &ghes_estatus_llist);
+#endif
+}
+
+static void __ghes_panic(struct ghes *ghes)
+{
+	oops_begin();
+	ghes_print_queued_estatus();
+	__ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
+
+	/* reboot to log the error! */
+	if (panic_timeout == 0)
+		panic_timeout = ghes_panic_timeout;
+	panic("Fatal hardware error!");
+}
+
 static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 {
-	struct ghes *ghes, *ghes_global = NULL;
-	int sev, sev_global = -1;
-	int ret = NMI_DONE;
+	struct ghes *ghes;
+	int sev, ret = NMI_DONE;
+
+	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
+		return ret;
 
-	raw_spin_lock(&ghes_nmi_lock);
 	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
 		if (ghes_read_estatus(ghes, 1)) {
 			ghes_clear_estatus(ghes);
 			continue;
 		}
-		sev = ghes_severity(ghes->estatus->error_severity);
-		if (sev > sev_global) {
-			sev_global = sev;
-			ghes_global = ghes;
-		}
-		ret = NMI_HANDLED;
-	}
-
-	if (ret == NMI_DONE)
-		goto out;
 
-	if (sev_global >= GHES_SEV_PANIC) {
-		oops_begin();
-		ghes_print_queued_estatus();
-		__ghes_print_estatus(KERN_EMERG, ghes_global->generic,
-				     ghes_global->estatus);
-		/* reboot to log the error! */
-		if (panic_timeout == 0)
-			panic_timeout = ghes_panic_timeout;
-		panic("Fatal hardware error!");
-	}
+		sev = ghes_severity(ghes->estatus->error_severity);
+		if (sev >= GHES_SEV_PANIC)
+			__ghes_panic(ghes);
 
-	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
-#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-		u32 len, node_len;
-		struct ghes_estatus_node *estatus_node;
-		struct acpi_hest_generic_status *estatus;
-#endif
 		if (!(ghes->flags & GHES_TO_CLEAR))
 			continue;
-#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-		if (ghes_estatus_cached(ghes->estatus))
-			goto next;
-		/* Save estatus for further processing in IRQ context */
-		len = cper_estatus_len(ghes->estatus);
-		node_len = GHES_ESTATUS_NODE_LEN(len);
-		estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
-						      node_len);
-		if (estatus_node) {
-			estatus_node->ghes = ghes;
-			estatus_node->generic = ghes->generic;
-			estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
-			memcpy(estatus, ghes->estatus, len);
-			llist_add(&estatus_node->llnode, &ghes_estatus_llist);
-		}
-next:
-#endif
+
+		__process_error(ghes);
 		ghes_clear_estatus(ghes);
+
+		ret = NMI_HANDLED;
 	}
+
 #ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
 	irq_work_queue(&ghes_proc_irq_work);
 #endif
-
-out:
-	raw_spin_unlock(&ghes_nmi_lock);
+	atomic_dec(&ghes_in_nmi);
 	return ret;
 }
 

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

             reply	other threads:[~2015-05-04  9:34 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-05-04  9:33 Borislav Petkov [this message]
2015-05-04 13:36 ` [GIT PULL] RAS for 4.2 Rafael J. Wysocki
2015-05-04 13:16   ` Borislav Petkov
2015-05-04 14:02     ` Rafael J. Wysocki
2015-05-04 21:04       ` Rafael J. Wysocki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150504093347.GC3829@pd.tnic \
    --to=bp@alien8.de \
    --cc=jkosina@suse.cz \
    --cc=lenb@kernel.org \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=rjw@rjwysocki.net \
    --cc=tony.luck@intel.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.