stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: stable@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	patches@lists.linux.dev, Ashok Raj <ashok.raj@intel.com>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: [PATCH 6.1 11/30] x86/smp: Cure kexec() vs. mwait_play_dead() breakage
Date: Thu, 29 Jun 2023 20:43:30 +0200	[thread overview]
Message-ID: <20230629184152.118895023@linuxfoundation.org> (raw)
In-Reply-To: <20230629184151.651069086@linuxfoundation.org>

From: Thomas Gleixner <tglx@linutronix.de>

commit d7893093a7417527c0d73c9832244e65c9d0114f upstream.

TLDR: It's a mess.

When kexec() is executed on a system with offline CPUs, which are parked in
mwait_play_dead() it can end up in a triple fault during the bootup of the
kexec kernel or cause hard to diagnose data corruption.

The reason is that kexec() eventually overwrites the previous kernel's text,
page tables, data and stack. If it writes to the cache line which is
monitored by a previously offlined CPU, MWAIT resumes execution and ends
up executing the wrong text, dereferencing overwritten page tables or
corrupting the kexec kernels data.

Cure this by bringing the offlined CPUs out of MWAIT into HLT.

Write to the monitored cache line of each offline CPU, which makes MWAIT
resume execution. The written control word tells the offlined CPUs to issue
HLT, which does not have the MWAIT problem.

That does not help, if a stray NMI, MCE or SMI hits the offlined CPUs as
those make it come out of HLT.

A follow up change will put them into INIT, which protects at least against
NMI and SMI.

Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
Reported-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Ashok Raj <ashok.raj@intel.com>
Reviewed-by: Ashok Raj <ashok.raj@intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/smp.h |    2 +
 arch/x86/kernel/smp.c      |    5 +++
 arch/x86/kernel/smpboot.c  |   59 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)

--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
 int wbinvd_on_all_cpus(void);
 void cond_wakeup_cpu0(void);
 
+void smp_kick_mwait_play_dead(void);
+
 void native_smp_send_reschedule(int cpu);
 void native_send_call_func_ipi(const struct cpumask *mask);
 void native_send_call_func_single_ipi(int cpu);
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/gfp.h>
+#include <linux/kexec.h>
 
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int w
 	if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
 		return;
 
+	/* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+	if (kexec_in_progress)
+		smp_kick_mwait_play_dead();
+
 	/*
 	 * 1) Send an IPI on the reboot vector to all other CPUs.
 	 *
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
 #include <linux/tboot.h>
 #include <linux/gfp.h>
 #include <linux/cpuidle.h>
+#include <linux/kexec.h>
 #include <linux/numa.h>
 #include <linux/pgtable.h>
 #include <linux/overflow.h>
@@ -104,6 +105,9 @@ struct mwait_cpu_dead {
 	unsigned int	status;
 };
 
+#define CPUDEAD_MWAIT_WAIT	0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT	0x4A17DEAD
+
 /*
  * Cache line aligned data for mwait_play_dead(). Separate on purpose so
  * that it's unlikely to be touched by other CPUs.
@@ -166,6 +170,10 @@ static void smp_callin(void)
 {
 	int cpuid;
 
+	/* Mop up eventual mwait_play_dead() wreckage */
+	this_cpu_write(mwait_cpu_dead.status, 0);
+	this_cpu_write(mwait_cpu_dead.control, 0);
+
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
 	 * cpu_callout_mask guarantees we don't get here before
@@ -1795,6 +1803,10 @@ static inline void mwait_play_dead(void)
 			(highest_subcstate - 1);
 	}
 
+	/* Set up state for the kexec() hack below */
+	md->status = CPUDEAD_MWAIT_WAIT;
+	md->control = CPUDEAD_MWAIT_WAIT;
+
 	wbinvd();
 
 	while (1) {
@@ -1812,10 +1824,57 @@ static inline void mwait_play_dead(void)
 		mb();
 		__mwait(eax, 0);
 
+		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+			/*
+			 * Kexec is about to happen. Don't go back into mwait() as
+			 * the kexec kernel might overwrite text and data including
+			 * page tables and stack. So mwait() would resume when the
+			 * monitor cache line is written to and then the CPU goes
+			 * south due to overwritten text, page tables and stack.
+			 *
+			 * Note: This does _NOT_ protect against a stray MCE, NMI,
+			 * SMI. They will resume execution at the instruction
+			 * following the HLT instruction and run into the problem
+			 * which this is trying to prevent.
+			 */
+			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+			while(1)
+				native_halt();
+		}
+
 		cond_wakeup_cpu0();
 	}
 }
 
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+	u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+	struct mwait_cpu_dead *md;
+	unsigned int cpu, i;
+
+	for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+		md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+		/* Does it sit in mwait_play_dead() ? */
+		if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+			continue;
+
+		/* Wait up to 5ms */
+		for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+			/* Bring it out of mwait */
+			WRITE_ONCE(md->control, newstate);
+			udelay(5);
+		}
+
+		if (READ_ONCE(md->status) != newstate)
+			pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+	}
+}
+
 void hlt_play_dead(void)
 {
 	if (__this_cpu_read(cpu_info.x86) >= 4)



  parent reply	other threads:[~2023-06-29 18:45 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-29 18:43 [PATCH 6.1 00/30] 6.1.37-rc1 review Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 01/30] mm/mmap: Fix error path in do_vmi_align_munmap() Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 02/30] mm/mmap: Fix error return " Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 03/30] mptcp: ensure listener is unhashed before updating the sk status Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 04/30] mm, hwpoison: try to recover from copy-on write faults Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 05/30] mm, hwpoison: when copy-on-write hits poison, take page offline Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 06/30] x86/microcode/AMD: Load late on both threads too Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 07/30] x86/smp: Make stop_other_cpus() more robust Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 08/30] x86/smp: Dont access non-existing CPUID leaf Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 09/30] x86/smp: Remove pointless wmb()s from native_stop_other_cpus() Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 10/30] x86/smp: Use dedicated cache-line for mwait_play_dead() Greg Kroah-Hartman
2023-06-29 18:43 ` Greg Kroah-Hartman [this message]
2023-06-29 18:43 ` [PATCH 6.1 12/30] can: isotp: isotp_sendmsg(): fix return error fix on TX path Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 13/30] maple_tree: fix potential out-of-bounds access in mas_wr_end_piv() Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 14/30] mm: introduce new lock_mm_and_find_vma() page fault helper Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 15/30] mm: make the page fault mmap locking killable Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 16/30] arm64/mm: Convert to using lock_mm_and_find_vma() Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 17/30] powerpc/mm: " Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 18/30] mips/mm: " Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 19/30] riscv/mm: " Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 20/30] arm/mm: " Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 21/30] mm/fault: convert remaining simple cases to lock_mm_and_find_vma() Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 22/30] powerpc/mm: convert coprocessor fault " Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 23/30] mm: make find_extend_vma() fail if write lock not held Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 24/30] execve: expand new process stack manually ahead of time Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 25/30] mm: always expand the stack with the mmap write lock held Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 26/30] fbdev: fix potential OOB read in fast_imageblit() Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 27/30] HID: hidraw: fix data race on device refcount Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 28/30] HID: wacom: Use ktime_t rather than int when dealing with timestamps Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 29/30] HID: logitech-hidpp: add HIDPP_QUIRK_DELAYED_INIT for the T651 Greg Kroah-Hartman
2023-06-29 18:43 ` [PATCH 6.1 30/30] Revert "thermal/drivers/mediatek: Use devm_of_iomap to avoid resource leak in mtk_thermal_probe" Greg Kroah-Hartman
2023-06-29 21:56 ` [PATCH 6.1 00/30] 6.1.37-rc1 review ogasawara takeshi
2023-06-29 22:25 ` Daniel Díaz
2023-06-30  5:18   ` Greg Kroah-Hartman
2023-06-30  5:21     ` Daniel Díaz
2023-06-30  5:30       ` Greg Kroah-Hartman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230629184152.118895023@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=ashok.raj@intel.com \
    --cc=patches@lists.linux.dev \
    --cc=stable@vger.kernel.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).