From: Lai Jiangshan <laijs@cn.fujitsu.com>
To: Rusty Russell <rusty@rustcorp.com.au>,
Benjamin Herrenschmidt <benh@kernel.crashing.org>,
Hugh Dickins <hugh.dickins@tiscali.co.uk>,
Ingo Molnar <mingo@elte.hu>,
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
Nathan Fontenot <nfont@austin.ibm.com>,
Peter Zijlstra <peterz@infradead.org>,
Andrew Morton <akpm@linux-foundation.org>,
Thomas Gleixner <tglx@linutronix.de>,
Oleg Nesterov <oleg@redhat.com>, Sachin Sant <sachinp@in.ibm.com>,
"H. Peter Anvin" <hpa@zytor.com>,
Shane Wang <shane.wang@intel.com>,
Roland McGrath <roland@redhat.com>,
linux-kernel@vger.kernel.org, Gautham R Shenoy <ego@in.ibm.com>
Subject: [PATCH 2/2] cpuhotplug: make get_online_cpus() scalability by using percpu counter
Date: Mon, 05 Apr 2010 18:38:02 +0800 [thread overview]
Message-ID: <4BB9BD8A.9040209@cn.fujitsu.com> (raw)
Current get_online_cpus() acquires a mutex lock and then
releases it. It does not scale and it hurts the cache. This patch rewrites it.
1) get_online_cpus() must be allowed to be called recursively, so I added
a get_online_cpus_nest counter to every task for the new code.
This patch still allows get_online_cpus() to be called recursively,
but when the call is not nested, get_online_cpus() will wait until
cpuhotplug has finished, so the potential starvation is avoided.
Also, the livelock of cpu_hotplug_begin() is avoided, so the comment
describing it is removed.
2) This new code uses PER_CPU counters, and these counters are protected
by RCU. The counters act like the reference counter of a module.
(Actually, all this code is stolen from module.c: try_refcount_get()
is stolen from try_module_get(), put_online_cpus() from module_put()...)
After this patch is applied, get_online_cpus() is very light and scales well
when cpuhotplug is not running. It just disables preemption, increases
the per-cpu counter, and then enables preemption.
3) Since we have try_refcount_get(), I also added a new API, try_get_online_cpus().
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
include/linux/cpu.h | 2
include/linux/sched.h | 3 +
kernel/cpu.c | 131 ++++++++++++++++++++++++++++++++------------------
kernel/fork.c | 3 +
4 files changed, 94 insertions(+), 45 deletions(-)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e287863..a32809c 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -112,6 +112,7 @@ extern struct sysdev_class cpu_sysdev_class;
extern void get_online_cpus(void);
extern void put_online_cpus(void);
+extern int try_get_online_cpus(void);
#define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
#define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
@@ -134,6 +135,7 @@ static inline void cpu_hotplug_driver_unlock(void)
#define get_online_cpus() do { } while (0)
#define put_online_cpus() do { } while (0)
+#define try_get_online_cpus() (1)
#define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46b6e5..0422ea3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1501,6 +1501,9 @@ struct task_struct {
unsigned long memsw_bytes; /* uncharged mem+swap usage */
} memcg_batch;
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ int get_online_cpus_nest;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bc1e3d5..ede02c6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
+#include <linux/percpu.h>
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -42,41 +43,82 @@ static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
-static struct {
- struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- int refcount;
-} cpu_hotplug = {
- .active_writer = NULL,
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
- .refcount = 0,
-};
+DEFINE_MUTEX(cpu_hotplug_lock);
+struct task_struct *cpu_hotplug_task;
+DEFINE_PER_CPU(int, refcount);
+
+static int try_refcount_get(void)
+{
+ preempt_disable();
+
+ if (likely(!cpu_hotplug_task)) {
+ __get_cpu_var(refcount)++;
+ preempt_enable();
+ return 1;
+ }
+
+ preempt_enable();
+ return 0;
+}
+
+int try_get_online_cpus(void)
+{
+ if (cpu_hotplug_task == current)
+ return 1;
+
+ if (current->get_online_cpus_nest || try_refcount_get()) {
+ current->get_online_cpus_nest++;
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(try_get_online_cpus);
void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug_task == current)
+ return;
+
+ if (current->get_online_cpus_nest++)
+ return;
+
+ if (likely(try_refcount_get()))
return;
- mutex_lock(&cpu_hotplug.lock);
- cpu_hotplug.refcount++;
- mutex_unlock(&cpu_hotplug.lock);
+ mutex_lock(&cpu_hotplug_lock);
+ percpu_add(refcount, 1);
+ mutex_unlock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
+static unsigned int refcount_sum(void)
+{
+ unsigned int total = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ total += per_cpu(refcount, cpu);
+
+ return total;
+}
+
void put_online_cpus(void)
{
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug_task == current)
+ return;
+
+ if (WARN_ON(!current->get_online_cpus_nest))
return;
- mutex_lock(&cpu_hotplug.lock);
- if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
- wake_up_process(cpu_hotplug.active_writer);
- mutex_unlock(&cpu_hotplug.lock);
+ if (!--current->get_online_cpus_nest) {
+ preempt_disable();
+ __get_cpu_var(refcount)--;
+ if (cpu_hotplug_task)
+ wake_up_process(cpu_hotplug_task);
+ preempt_enable();
+ }
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -85,41 +127,40 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
* refcount goes to zero.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
+ * will be blocked by the cpu_hotplug_lock
*/
static void cpu_hotplug_begin(void)
{
- cpu_hotplug.active_writer = current;
+ mutex_lock(&cpu_hotplug_lock);
+
+ /*
+ * Set cpu_hotplug_task. Wait until all running try_refcount_get()
+ * finished and all these try_refcount_get() behavior are seen.
+ */
+ cpu_hotplug_task = current;
+ synchronize_sched();
+ /* Wait for zero refcount */
for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- if (likely(!cpu_hotplug.refcount))
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!refcount_sum())
break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cpu_hotplug.lock);
schedule();
}
+
+ __set_current_state(TASK_RUNNING);
}
static void cpu_hotplug_done(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
+ /*
+ * Ensure try_refcount_get() sees the front befavior
+ * after it sees cpu_hotplug_task == NULL.
+ */
+ smp_mb();
+
+ cpu_hotplug_task = NULL;
+ mutex_unlock(&cpu_hotplug_lock);
}
#else /* #if CONFIG_HOTPLUG_CPU */
diff --git a/kernel/fork.c b/kernel/fork.c
index d67f1db..b162014 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1109,6 +1109,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.memcg = NULL;
#endif
p->stack_start = stack_start;
+#ifdef CONFIG_HOTPLUG_CPU
+ p->get_online_cpus_nest = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
next reply other threads:[~2010-04-05 10:38 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-04-05 10:38 Lai Jiangshan [this message]
2010-04-05 16:29 ` [PATCH 2/2] cpuhotplug: make get_online_cpus() scalability by using percpu counter Oleg Nesterov
2010-04-06 12:00 ` Oleg Nesterov
2010-04-07 13:35 ` Lai Jiangshan
2010-04-07 13:54 ` Oleg Nesterov
2010-04-09 12:12 ` Oleg Nesterov
2010-04-12 9:24 ` Lai Jiangshan
2010-04-12 9:28 ` Peter Zijlstra
2010-04-12 12:30 ` Lai Jiangshan
2010-04-12 12:34 ` Peter Zijlstra
2010-04-13 1:47 ` Lai Jiangshan
2010-04-12 18:16 ` Oleg Nesterov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4BB9BD8A.9040209@cn.fujitsu.com \
--to=laijs@cn.fujitsu.com \
--cc=akpm@linux-foundation.org \
--cc=benh@kernel.crashing.org \
--cc=ego@in.ibm.com \
--cc=hpa@zytor.com \
--cc=hugh.dickins@tiscali.co.uk \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=nfont@austin.ibm.com \
--cc=oleg@redhat.com \
--cc=paulmck@linux.vnet.ibm.com \
--cc=peterz@infradead.org \
--cc=roland@redhat.com \
--cc=rusty@rustcorp.com.au \
--cc=sachinp@in.ibm.com \
--cc=shane.wang@intel.com \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox