From: Lai Jiangshan <laijs@cn.fujitsu.com>
To: Rusty Russell <rusty@rustcorp.com.au>,
Benjamin Herrenschmidt <benh@kernel.crashing.org>,
Hugh Dickins <hugh.dickins@tiscali.co.uk>,
Ingo Molnar <mingo@elte.hu>,
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
Nathan Fontenot <nfont@austin.ibm.com>,
Peter Zijlstra <peterz@infradead.org>,
Andrew Morton <akpm@linux-foundation.org>,
Thomas Gleixner <tglx@linutronix.de>,
Oleg Nesterov <oleg@redhat.com>, Sachin Sant <sachinp@in.ibm.com>,
"H. Peter Anvin" <hpa@zytor.com>,
Shane Wang <shane.wang@intel.com>,
Roland McGrath <roland@redhat.com>,
linux-kernel@vger.kernel.org, Gautham R Shenoy <ego@in.ibm.com>
Subject: [PATCH 2/2] cpuhotplug: make get_online_cpus() scalability by using percpu counter
Date: Mon, 05 Apr 2010 18:38:02 +0800 [thread overview]
Message-ID: <4BB9BD8A.9040209@cn.fujitsu.com> (raw)
Current get_online_cpus() acquires a mutex lock and then
releases it. It does not scale and it hurts the cache. This patch rewrites it.
1) get_online_cpus() must be allowed to be called recursively, so I added
a get_online_cpus_nest counter to every task for the new code.
This patch still allows get_online_cpus() to be called recursively,
but when the call is not nested, get_online_cpus() will wait until
cpuhotplug has finished, so the potential starvation is avoided.
Also, the livelock of cpu_hotplug_begin() is avoided, so the comment
describing it is removed.
2) This new code uses PER_CPU counters, and these counters are protected
by RCU. The counters act like the reference counter of a module.
(Actually, all this code is stolen from module.c: try_refcount_get()
is stolen from try_module_get(), put_online_cpus() from module_put()...)
After this patch is applied, get_online_cpus() is very light and scales well
when cpuhotplug is not running. It just disables preemption, increases
the per-cpu counter, and then enables preemption.
3) Since we have try_refcount_get(), I also added a new API, try_get_online_cpus().
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
include/linux/cpu.h | 2
include/linux/sched.h | 3 +
kernel/cpu.c | 131 ++++++++++++++++++++++++++++++++------------------
kernel/fork.c | 3 +
4 files changed, 94 insertions(+), 45 deletions(-)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e287863..a32809c 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -112,6 +112,7 @@ extern struct sysdev_class cpu_sysdev_class;
extern void get_online_cpus(void);
extern void put_online_cpus(void);
+extern int try_get_online_cpus(void);
#define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
#define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
@@ -134,6 +135,7 @@ static inline void cpu_hotplug_driver_unlock(void)
#define get_online_cpus() do { } while (0)
#define put_online_cpus() do { } while (0)
+#define try_get_online_cpus() (1)
#define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46b6e5..0422ea3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1501,6 +1501,9 @@ struct task_struct {
unsigned long memsw_bytes; /* uncharged mem+swap usage */
} memcg_batch;
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ int get_online_cpus_nest;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bc1e3d5..ede02c6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
+#include <linux/percpu.h>
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -42,41 +43,82 @@ static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
-static struct {
- struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- int refcount;
-} cpu_hotplug = {
- .active_writer = NULL,
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
- .refcount = 0,
-};
+DEFINE_MUTEX(cpu_hotplug_lock);
+struct task_struct *cpu_hotplug_task;
+DEFINE_PER_CPU(int, refcount);
+
+static int try_refcount_get(void)
+{
+ preempt_disable();
+
+ if (likely(!cpu_hotplug_task)) {
+ __get_cpu_var(refcount)++;
+ preempt_enable();
+ return 1;
+ }
+
+ preempt_enable();
+ return 0;
+}
+
+int try_get_online_cpus(void)
+{
+ if (cpu_hotplug_task == current)
+ return 1;
+
+ if (current->get_online_cpus_nest || try_refcount_get()) {
+ current->get_online_cpus_nest++;
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(try_get_online_cpus);
void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug_task == current)
+ return;
+
+ if (current->get_online_cpus_nest++)
+ return;
+
+ if (likely(try_refcount_get()))
return;
- mutex_lock(&cpu_hotplug.lock);
- cpu_hotplug.refcount++;
- mutex_unlock(&cpu_hotplug.lock);
+ mutex_lock(&cpu_hotplug_lock);
+ percpu_add(refcount, 1);
+ mutex_unlock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
+static unsigned int refcount_sum(void)
+{
+ unsigned int total = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ total += per_cpu(refcount, cpu);
+
+ return total;
+}
+
void put_online_cpus(void)
{
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug_task == current)
+ return;
+
+ if (WARN_ON(!current->get_online_cpus_nest))
return;
- mutex_lock(&cpu_hotplug.lock);
- if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
- wake_up_process(cpu_hotplug.active_writer);
- mutex_unlock(&cpu_hotplug.lock);
+ if (!--current->get_online_cpus_nest) {
+ preempt_disable();
+ __get_cpu_var(refcount)--;
+ if (cpu_hotplug_task)
+ wake_up_process(cpu_hotplug_task);
+ preempt_enable();
+ }
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -85,41 +127,40 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
* refcount goes to zero.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
+ * will be blocked by the cpu_hotplug_lock
*/
static void cpu_hotplug_begin(void)
{
- cpu_hotplug.active_writer = current;
+ mutex_lock(&cpu_hotplug_lock);
+
+ /*
+ * Set cpu_hotplug_task. Wait until all running try_refcount_get()
+ * finished and all these try_refcount_get() behavior are seen.
+ */
+ cpu_hotplug_task = current;
+ synchronize_sched();
+ /* Wait for zero refcount */
for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- if (likely(!cpu_hotplug.refcount))
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!refcount_sum())
break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cpu_hotplug.lock);
schedule();
}
+
+ __set_current_state(TASK_RUNNING);
}
static void cpu_hotplug_done(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
+ /*
+ * Ensure try_refcount_get() sees the front befavior
+ * after it sees cpu_hotplug_task == NULL.
+ */
+ smp_mb();
+
+ cpu_hotplug_task = NULL;
+ mutex_unlock(&cpu_hotplug_lock);
}
#else /* #if CONFIG_HOTPLUG_CPU */
diff --git a/kernel/fork.c b/kernel/fork.c
index d67f1db..b162014 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1109,6 +1109,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.memcg = NULL;
#endif
p->stack_start = stack_start;
+#ifdef CONFIG_HOTPLUG_CPU
+ p->get_online_cpus_nest = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
next reply other threads:[~2010-04-05 10:38 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-04-05 10:38 Lai Jiangshan [this message]
2010-04-05 16:29 ` [PATCH 2/2] cpuhotplug: make get_online_cpus() scalability by using percpu counter Oleg Nesterov
2010-04-06 12:00 ` Oleg Nesterov
2010-04-07 13:35 ` Lai Jiangshan
2010-04-07 13:54 ` Oleg Nesterov
2010-04-09 12:12 ` Oleg Nesterov
2010-04-12 9:24 ` Lai Jiangshan
2010-04-12 9:28 ` Peter Zijlstra
2010-04-12 12:30 ` Lai Jiangshan
2010-04-12 12:34 ` Peter Zijlstra
2010-04-13 1:47 ` Lai Jiangshan
2010-04-12 18:16 ` Oleg Nesterov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4BB9BD8A.9040209@cn.fujitsu.com \
--to=laijs@cn.fujitsu.com \
--cc=akpm@linux-foundation.org \
--cc=benh@kernel.crashing.org \
--cc=ego@in.ibm.com \
--cc=hpa@zytor.com \
--cc=hugh.dickins@tiscali.co.uk \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=nfont@austin.ibm.com \
--cc=oleg@redhat.com \
--cc=paulmck@linux.vnet.ibm.com \
--cc=peterz@infradead.org \
--cc=roland@redhat.com \
--cc=rusty@rustcorp.com.au \
--cc=sachinp@in.ibm.com \
--cc=shane.wang@intel.com \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox