All of lore.kernel.org
 help / color / mirror / Atom feed
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
To: linux-kernel@vger.kernel.org
Cc: Rusty Russell <rusty@rustcorp.com.au>
Subject: [PATCH] stopmachine: add stopmachine_timeout
Date: Mon, 14 Jul 2008 16:52:46 +0900	[thread overview]
Message-ID: <487B05CE.1050508@jp.fujitsu.com> (raw)

If stop_machine() invoked while one of onlined cpu is locked up
by some reason, stop_machine cannot finish its work because the
locked cpu cannot stop.  This means all other healthy cpus
will be blocked infinitely by one dead cpu.

This patch allows stop_machine to return -EBUSY with some printk
messages if any of stop_machine's threads cannot start running on
its target cpu.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 kernel/stop_machine.c |   52 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sysctl.c       |   15 ++++++++++++++
 2 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5b72c2b..9e36809 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,15 +35,19 @@ struct stop_machine_data {
 };
 
 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
+static atomic_t num_threads;
 static atomic_t thread_ack;
+static cpumask_t prepared_cpus;
 static struct completion finished;
 static DEFINE_MUTEX(lock);
 
+static unsigned long limit;
+unsigned long stopmachine_timeout = 5; /* secs, arbitrary */
+
 static void set_state(enum stopmachine_state newstate)
 {
 	/* Reset ack counter. */
-	atomic_set(&thread_ack, num_threads);
+	atomic_set(&thread_ack, atomic_read(&num_threads));
 	smp_wmb();
 	state = newstate;
 }
@@ -62,11 +66,14 @@ static void ack_state(void)
 
 /* This is the actual thread which stops the CPU.  It exits by itself rather
  * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
-static int stop_cpu(struct stop_machine_data *smdata)
+static int stop_cpu(void *__smdata)
 {
+	struct stop_machine_data *smdata = __smdata;
 	enum stopmachine_state curstate = STOPMACHINE_NONE;
 	int uninitialized_var(ret);
 
+	cpu_set(smp_processor_id(), prepared_cpus);
+
 	/* Simple state machine */
 	do {
 		/* Chill out and ensure we re-read stopmachine_state. */
@@ -90,6 +97,7 @@ static int stop_cpu(struct stop_machine_data *smdata)
 		}
 	} while (curstate != STOPMACHINE_EXIT);
 
+	atomic_dec(&num_threads);
 	local_irq_enable();
 	do_exit(0);
 }
@@ -106,6 +114,14 @@ int __stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
 	struct stop_machine_data active, idle;
 	struct task_struct **threads;
 
+	if (atomic_read(&num_threads)) {
+		/*
+		 * previous try was timeout, and still there are some
+		 * unfinished thread dangling stucked CPUs.
+		 */
+		return -EBUSY;
+	}
+
 	active.fn = fn;
 	active.data = data;
 	active.fnret = 0;
@@ -120,7 +136,8 @@ int __stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
 	/* Set up initial state. */
 	mutex_lock(&lock);
 	init_completion(&finished);
-	num_threads = num_online_cpus();
+	atomic_set(&num_threads, num_online_cpus());
+	limit = jiffies + msecs_to_jiffies(stopmachine_timeout * MSEC_PER_SEC);
 	set_state(STOPMACHINE_PREPARE);
 
 	for_each_online_cpu(i) {
@@ -152,10 +169,18 @@ int __stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
 
 	/* We've created all the threads.  Wake them all: hold this CPU so one
 	 * doesn't hit this CPU until we're ready. */
+	cpus_clear(prepared_cpus);
 	get_cpu();
 	for_each_online_cpu(i)
 		wake_up_process(threads[i]);
 
+	/* Wait all others come to life */
+	while (cpus_weight(prepared_cpus) != num_online_cpus() - 1) {
+		if (time_is_before_jiffies(limit))
+			goto timeout;
+		cpu_relax();
+	}
+
 	/* This will release the thread on our CPU. */
 	put_cpu();
 	wait_for_completion(&finished);
@@ -169,10 +194,29 @@ kill_threads:
 	for_each_online_cpu(i)
 		if (threads[i])
 			kthread_stop(threads[i]);
+	atomic_set(&num_threads, 0);
 	mutex_unlock(&lock);
 
 	kfree(threads);
 	return err;
+
+timeout:
+	printk(KERN_CRIT "stopmachine: Failed to stop machine in time(%lds).\n",
+			stopmachine_timeout);
+	for_each_online_cpu(i) {
+		if (!cpu_isset(i, prepared_cpus) && i != smp_processor_id())
+			printk(KERN_CRIT "stopmachine: cpu#%d seems to be "
+					"stuck.\n", i);
+		/* Unbind threads */
+		set_cpus_allowed(threads[i], cpu_online_map);
+	}
+
+	set_state(STOPMACHINE_EXIT);
+	put_cpu();
+	mutex_unlock(&lock);
+	kfree(threads);
+
+	return -EBUSY;	/* canceled */
 }
 
 int stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2911665..3c7ca98 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -146,6 +146,10 @@ extern int no_unaligned_warning;
 extern int max_lock_depth;
 #endif
 
+#ifdef CONFIG_STOP_MACHINE
+extern unsigned long stopmachine_timeout;
+#endif
+
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -813,6 +817,17 @@ static struct ctl_table kern_table[] = {
 		.child		= key_sysctls,
 	},
 #endif
+#ifdef CONFIG_STOP_MACHINE
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "stopmachine_timeout",
+		.data           = &stopmachine_timeout,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = &proc_doulongvec_minmax,
+		.strategy       = &sysctl_intvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 

             reply	other threads:[~2008-07-14  7:55 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-07-14  7:52 Hidetoshi Seto [this message]
2008-07-14  8:19 ` [PATCH] stopmachine: add stopmachine_timeout Hidetoshi Seto
2008-07-14 10:43 ` Rusty Russell
2008-07-15  1:11   ` Hidetoshi Seto
2008-07-15  7:50     ` Rusty Russell
2008-07-16  4:05       ` Hidetoshi Seto
2008-07-20  9:45         ` Rusty Russell
2008-07-22  3:28           ` [PATCH] stopmachine: allow force progress on timeout Hidetoshi Seto
2008-07-14 11:51 ` [PATCH] stopmachine: add stopmachine_timeout Christian Borntraeger
2008-07-14 12:34   ` Rusty Russell
2008-07-14 12:34   ` Rusty Russell
2008-07-14 18:56     ` Jeremy Fitzhardinge
2008-07-14 18:56     ` Jeremy Fitzhardinge
2008-07-14 21:20       ` Heiko Carstens
2008-07-15  1:14         ` Rusty Russell
2008-07-15  1:14         ` Rusty Russell
2008-07-15  2:24         ` Hidetoshi Seto
2008-07-15  2:24         ` Hidetoshi Seto
2008-07-15  2:37           ` Max Krasnyansky
2008-07-15  2:37             ` Max Krasnyansky
2008-07-15  2:24         ` Max Krasnyansky
2008-07-15  6:09           ` Heiko Carstens
2008-07-15  6:09           ` Heiko Carstens
2008-07-15  8:09           ` Rusty Russell
2008-07-15  8:39             ` Heiko Carstens
2008-07-15  8:39             ` Heiko Carstens
2008-07-15  8:51             ` Max Krasnyansky
2008-07-15  8:51             ` Max Krasnyansky
2008-07-16  9:15             ` Christian Borntraeger
2008-07-16  9:15             ` Christian Borntraeger
2008-07-15  8:09           ` Rusty Russell
2008-07-15  2:24         ` Max Krasnyansky
2008-07-14 21:20       ` Heiko Carstens
2008-07-16  4:27 ` [PATCH] stopmachine: add stopmachine_timeout v2 Hidetoshi Seto
2008-07-16  6:23   ` Max Krasnyansky
2008-07-16  6:23   ` Max Krasnyansky
2008-07-16  6:35     ` Hidetoshi Seto
2008-07-16  6:35     ` Hidetoshi Seto
2008-07-16  6:51       ` [PATCH] stopmachine: add stopmachine_timeout v3 Hidetoshi Seto
2008-07-16  7:33         ` Peter Zijlstra
2008-07-16  8:12           ` Hidetoshi Seto
2008-07-16  8:12           ` Hidetoshi Seto
2008-07-16  7:33         ` Peter Zijlstra
2008-07-16  6:51       ` Hidetoshi Seto
2008-07-16 10:11   ` [PATCH] stopmachine: add stopmachine_timeout v2 Jeremy Fitzhardinge
2008-07-17  3:40     ` Hidetoshi Seto
2008-07-17  3:40     ` Hidetoshi Seto
2008-07-17  5:37       ` Jeremy Fitzhardinge
2008-07-17  5:37       ` Jeremy Fitzhardinge
2008-07-18  4:18       ` Rusty Russell
2008-07-18  4:18       ` Rusty Russell
2008-07-16 10:11   ` Jeremy Fitzhardinge
2008-07-16  4:27 ` Hidetoshi Seto
2008-07-17  6:12 ` [PATCH] stopmachine: add stopmachine_timeout v4 Hidetoshi Seto
2008-07-17  6:12   ` Hidetoshi Seto
2008-07-17  7:09   ` Max Krasnyansky
2008-07-17  7:09   ` Max Krasnyansky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=487B05CE.1050508@jp.fujitsu.com \
    --to=seto.hidetoshi@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=rusty@rustcorp.com.au \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.