public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Alok Kataria <akataria@vmware.com>
To: kexec@lists.infradead.org, the arch/x86 maintainers <x86@kernel.org>
Cc: LKML <linux-kernel@vger.kernel.org>, Dan Hecht <dhecht@vmware.com>
Subject: [RFC PATCH] Bug during kexec...not all cpus are stopped
Date: Fri, 08 Oct 2010 13:34:47 -0700	[thread overview]
Message-ID: <1286570087.8769.27.camel@ank32.eng.vmware.com> (raw)


Before starting the new kernel, kexec calls machine_shutdown to stop all
the cpus, which internally calls native_smp_send_stop. AFAIU, kexec
expects that all the cpus are halted once that call returns.
However, looking at the code for native_smp_send_stop, it assumes that all
the processors will have processed the REBOOT IPI within 1 second of the
IPI being sent:
native_smp_send_stop()
---------------------------------------------------------
	apic->send_IPI_allbutself(REBOOT_VECTOR);

        /* Don't wait longer than a second */
        wait = USEC_PER_SEC;
        while (num_online_cpus() > 1 && wait--)
        	udelay(1);
---------------------------------------------------------

It just returns after that 1 second irrespective of whether all cpus
were halted or not. This brings up an issue in the kexec case, since we
can have the BSP starting the new kernel while the APs are still
processing the REBOOT IPI.

Many distribution kernels use kexec to load the newly installed kernel
during the installation phase. In a virtualized environment with the host
heavily overcommitted, we have seen some instances where a vcpu fails to
process the IPI in the allotted 1 sec; as a result, the APs end up
accessing uninitialized state (the BSP has already gone ahead with
setting up the new state), causing GPFs.

IMO, kexec expects machine_shutdown to return only after all cpus are
stopped.

The patch below should fix the issue. Comments?

--
machine_shutdown now takes a parameter "wait"; if it is true, it waits
until all the cpus are halted. All the callers except kexec still
fall back to the earlier behavior of the shutdown call, where it just
waits for a maximum of 1 sec before returning.

Signed-off-by: Alok N Kataria <akataria@vmware.com>

Index: linux-2.6/arch/x86/include/asm/reboot.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/reboot.h	2010-03-26 16:57:18.000000000 -0700
+++ linux-2.6/arch/x86/include/asm/reboot.h	2010-10-07 16:41:58.000000000 -0700
@@ -9,7 +9,7 @@ struct machine_ops {
 	void (*restart)(char *cmd);
 	void (*halt)(void);
 	void (*power_off)(void);
-	void (*shutdown)(void);
+	void (*shutdown)(int wait);
 	void (*crash_shutdown)(struct pt_regs *);
 	void (*emergency_restart)(void);
 };
@@ -17,7 +17,7 @@ struct machine_ops {
 extern struct machine_ops machine_ops;
 
 void native_machine_crash_shutdown(struct pt_regs *regs);
-void native_machine_shutdown(void);
+void native_machine_shutdown(int wait);
 void machine_real_restart(const unsigned char *code, int length);
 
 typedef void (*nmi_shootdown_cb)(int, struct die_args*);
Index: linux-2.6/arch/x86/include/asm/smp.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/smp.h	2010-08-03 12:43:47.000000000 -0700
+++ linux-2.6/arch/x86/include/asm/smp.h	2010-10-07 16:37:41.000000000 -0700
@@ -50,7 +50,7 @@ struct smp_ops {
 	void (*smp_prepare_cpus)(unsigned max_cpus);
 	void (*smp_cpus_done)(unsigned max_cpus);
 
-	void (*smp_send_stop)(void);
+	void (*smp_send_stop)(int wait);
 	void (*smp_send_reschedule)(int cpu);
 
 	int (*cpu_up)(unsigned cpu);
@@ -71,9 +71,9 @@ extern void set_cpu_sibling_map(int cpu)
 #endif
 extern struct smp_ops smp_ops;
 
-static inline void smp_send_stop(void)
+static inline void smp_send_stop(int wait)
 {
-	smp_ops.smp_send_stop();
+	smp_ops.smp_send_stop(wait);
 }
 
 static inline void smp_prepare_boot_cpu(void)
Index: linux-2.6/arch/x86/kernel/kvmclock.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/kvmclock.c	2010-08-03 12:43:47.000000000 -0700
+++ linux-2.6/arch/x86/kernel/kvmclock.c	2010-10-07 16:43:28.000000000 -0700
@@ -174,10 +174,10 @@ static void kvm_crash_shutdown(struct pt
 }
 #endif
 
-static void kvm_shutdown(void)
+static void kvm_shutdown(int wait)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
-	native_machine_shutdown();
+	native_machine_shutdown(wait);
 }
 
 void __init kvmclock_init(void)
Index: linux-2.6/arch/x86/kernel/reboot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/reboot.c	2010-08-03 12:43:47.000000000 -0700
+++ linux-2.6/arch/x86/kernel/reboot.c	2010-10-07 16:45:24.000000000 -0700
@@ -616,7 +616,7 @@ static void native_machine_emergency_res
 	}
 }
 
-void native_machine_shutdown(void)
+void native_machine_shutdown(int wait)
 {
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
@@ -641,7 +641,7 @@ void native_machine_shutdown(void)
 	/* O.K Now that I'm on the appropriate processor,
 	 * stop all of the others.
 	 */
-	smp_send_stop();
+	smp_send_stop(wait);
 #endif
 
 	lapic_shutdown();
@@ -670,14 +670,14 @@ static void native_machine_restart(char 
 	printk("machine restart\n");
 
 	if (!reboot_force)
-		machine_shutdown();
+		machine_shutdown(0);
 	__machine_emergency_restart(0);
 }
 
 static void native_machine_halt(void)
 {
 	/* stop other cpus and apics */
-	machine_shutdown();
+	machine_shutdown(0);
 
 	tboot_shutdown(TB_SHUTDOWN_HALT);
 
@@ -689,7 +689,7 @@ static void native_machine_power_off(voi
 {
 	if (pm_power_off) {
 		if (!reboot_force)
-			machine_shutdown();
+			machine_shutdown(0);
 		pm_power_off();
 	}
 	/* a fallback in case there is no PM info available */
@@ -712,9 +712,9 @@ void machine_power_off(void)
 	machine_ops.power_off();
 }
 
-void machine_shutdown(void)
+void machine_shutdown(int wait)
 {
-	machine_ops.shutdown();
+	machine_ops.shutdown(wait);
 }
 
 void machine_emergency_restart(void)
Index: linux-2.6/arch/x86/kernel/smp.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smp.c	2010-10-07 16:30:34.000000000 -0700
+++ linux-2.6/arch/x86/kernel/smp.c	2010-10-07 16:34:16.000000000 -0700
@@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(voi
 	irq_exit();
 }
 
-static void native_smp_send_stop(void)
+static void native_smp_send_stop(int wait)
 {
 	unsigned long flags;
-	unsigned long wait;
+	unsigned long timeout;
 
 	if (reboot_force)
 		return;
@@ -179,9 +179,9 @@ static void native_smp_send_stop(void)
 	if (num_online_cpus() > 1) {
 		apic->send_IPI_allbutself(REBOOT_VECTOR);
 
-		/* Don't wait longer than a second */
-		wait = USEC_PER_SEC;
-		while (num_online_cpus() > 1 && wait--)
+		/* Don't wait longer than a second if this not a synchronous call */
+		timeout = USEC_PER_SEC;
+		while (num_online_cpus() > 1 && (wait || timeout--))
 			udelay(1);
 	}
 
Index: linux-2.6/arch/x86/xen/enlighten.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/enlighten.c	2010-08-30 16:20:50.000000000 -0700
+++ linux-2.6/arch/x86/xen/enlighten.c	2010-10-07 16:49:00.000000000 -0700
@@ -1018,7 +1018,7 @@ static void xen_reboot(int reason)
 	struct sched_shutdown r = { .reason = reason };
 
 #ifdef CONFIG_SMP
-	smp_send_stop();
+	smp_send_stop(0);
 #endif
 
 	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
Index: linux-2.6/arch/x86/xen/smp.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/smp.c	2010-08-30 16:20:50.000000000 -0700
+++ linux-2.6/arch/x86/xen/smp.c	2010-10-07 16:46:27.000000000 -0700
@@ -400,7 +400,7 @@ static void stop_self(void *v)
 	BUG();
 }
 
-static void xen_smp_send_stop(void)
+static void xen_smp_send_stop(int wait)
 {
 	smp_call_function(stop_self, NULL, 0);
 }
Index: linux-2.6/drivers/s390/char/sclp_quiesce.c
===================================================================
--- linux-2.6.orig/drivers/s390/char/sclp_quiesce.c	2010-08-03 12:45:04.000000000 -0700
+++ linux-2.6/drivers/s390/char/sclp_quiesce.c	2010-10-07 16:49:12.000000000 -0700
@@ -29,7 +29,7 @@ static void do_machine_quiesce(void)
 {
 	psw_t quiesce_psw;
 
-	smp_send_stop();
+	smp_send_stop(0);
 	quiesce_psw.mask = PSW_BASE_BITS | PSW_MASK_WAIT;
 	quiesce_psw.addr = 0xfff;
 	__load_psw(quiesce_psw);
Index: linux-2.6/include/linux/reboot.h
===================================================================
--- linux-2.6.orig/include/linux/reboot.h	2010-08-03 12:46:08.000000000 -0700
+++ linux-2.6/include/linux/reboot.h	2010-10-07 16:44:21.000000000 -0700
@@ -51,7 +51,7 @@ extern void machine_restart(char *cmd);
 extern void machine_halt(void);
 extern void machine_power_off(void);
 
-extern void machine_shutdown(void);
+extern void machine_shutdown(int wait);
 struct pt_regs;
 extern void machine_crash_shutdown(struct pt_regs *);
 
Index: linux-2.6/include/linux/smp.h
===================================================================
--- linux-2.6.orig/include/linux/smp.h	2010-08-03 12:46:08.000000000 -0700
+++ linux-2.6/include/linux/smp.h	2010-10-07 16:49:41.000000000 -0700
@@ -43,7 +43,7 @@ int smp_call_function_single(int cpuid, 
 /*
  * stops all CPUs but the current one:
  */
-extern void smp_send_stop(void);
+extern void smp_send_stop(int wait);
 
 /*
  * sends a 'reschedule' event to another CPU:
@@ -116,7 +116,7 @@ extern unsigned int setup_max_cpus;
 
 #else /* !SMP */
 
-static inline void smp_send_stop(void) { }
+static inline void smp_send_stop(int wait) { }
 
 /*
  *	These macros fold the SMP functionality into a single CPU system
Index: linux-2.6/kernel/kexec.c
===================================================================
--- linux-2.6.orig/kernel/kexec.c	2010-10-07 14:33:57.000000000 -0700
+++ linux-2.6/kernel/kexec.c	2010-10-07 16:45:38.000000000 -0700
@@ -1538,7 +1538,7 @@ int kernel_kexec(void)
 	{
 		kernel_restart_prepare(NULL);
 		printk(KERN_EMERG "Starting new kernel\n");
-		machine_shutdown();
+		machine_shutdown(1);
 	}
 
 	machine_kexec(kexec_image);
Index: linux-2.6/kernel/panic.c
===================================================================
--- linux-2.6.orig/kernel/panic.c	2010-08-30 16:22:03.000000000 -0700
+++ linux-2.6/kernel/panic.c	2010-10-07 16:50:05.000000000 -0700
@@ -94,7 +94,7 @@ NORET_TYPE void panic(const char * fmt, 
 	 * unfortunately means it may not be hardened to work in a panic
 	 * situation.
 	 */
-	smp_send_stop();
+	smp_send_stop(0);
 
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 



             reply	other threads:[~2010-10-08 20:34 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-10-08 20:34 Alok Kataria [this message]
2010-10-11 17:09 ` [RFC PATCH] Bug during kexec...not all cpus are stopped Alok Kataria
2010-10-11 18:07   ` Eric W. Biederman
2010-10-11 19:41     ` Alok Kataria
2010-10-11 21:17       ` Eric W. Biederman
2010-10-11 21:37         ` Alok Kataria
2010-10-21 21:40           ` [tip:x86/urgent] x86, kexec: Make sure to stop all CPUs before exiting the kernel tip-bot for Alok Kataria
2010-10-11 21:39       ` [RFC PATCH] Bug during kexec...not all cpus are stopped Vivek Goyal
2010-10-11 21:47         ` Alok Kataria
2010-10-11 22:10         ` Eric W. Biederman
2010-10-12 22:17           ` Vivek Goyal
2010-10-13  0:23             ` Alok Kataria
2010-10-21 19:09               ` Alok Kataria
2010-10-21 20:26                 ` H. Peter Anvin
2010-10-21 21:10                   ` Alok Kataria
2010-10-21 21:24                     ` H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1286570087.8769.27.camel@ank32.eng.vmware.com \
    --to=akataria@vmware.com \
    --cc=dhecht@vmware.com \
    --cc=kexec@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox