From: Rusty Russell <rusty@rustcorp.com.au>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>,
Peter Zijlstra <a.p.zijlstra@chello.nl>,
Theodore Tso <tytso@mit.edu>,
linux kernel <linux-kernel@vger.kernel.org>,
"David S. Miller" <davem@davemloft.net>,
Mingming Cao <cmm@us.ibm.com>,
linux-ext4@vger.kernel.org, Christoph Lameter <clameter@sgi.com>,
Paul Mackerras <paulus@samba.org>
Subject: Re: [PATCH] percpu_counter: Fix __percpu_counter_sum()
Date: Mon, 15 Dec 2008 23:23:52 +1030 [thread overview]
Message-ID: <200812152323.53592.rusty@rustcorp.com.au> (raw)
In-Reply-To: <20081209214921.b3944687.akpm@linux-foundation.org>
On Wednesday 10 December 2008 16:19:21 Andrew Morton wrote:
> Rusty, Christoph: talk to me. If we add a new user of local_t in core
> kernel, will we regret it?
Interesting. There's new local_t infrastructure, but it's not used.
Here's a benchmark patch showing some results. x86 is pretty close to
optimal already, though my results on Power show atomic_long is a bad choice
there.
I'll do an audit of the users, then send out some local_t cleanup patches etc.
Benchmarks for local_t variants
(This patch also fixes the x86 cpu_local_* macros, which are obviously
unused).
I chose a large array (1M longs) for the inc/add/add_return tests so
the trivalue case would show some cache pressure.
The cpu_local_inc case is always cache-hot, so it's not comparable to
the others.
Time in ns per iteration (brackets is with CONFIG_PREEMPT=y):
inc add add_return cpu_local_inc read
x86-32: 2.13 Ghz Core Duo 2
atomic_long 118 118 115 17 17
irqsave/rest 77 78 77 23 16
trivalue 45 45 127 3(6) 21
local_t 36 36 36 1(5) 17
x86-64: 2.6 GHz Dual-Core AMD Opteron 2218
atomic_long 55 60 - 6 19
irqsave/rest 54 54 - 11 19
trivalue 47 47 - 5 28
local_t 47 46 - 1 19
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
arch/x86/include/asm/local.h | 20 ++--
init/main.c | 198 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 208 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -220,16 +220,16 @@ static inline long local_sub_return(long
preempt_enable(); \
}) \
-#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
-#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
-#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
-#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
-#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
-#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
+#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
+#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
+#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var(l)))
+#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var(l)))
+#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
+#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
-#define __cpu_local_inc(l) cpu_local_inc((l))
-#define __cpu_local_dec(l) cpu_local_dec((l))
-#define __cpu_local_add(i, l) cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
+#define __cpu_local_inc(l) cpu_local_inc(l)
+#define __cpu_local_dec(l) cpu_local_dec(l)
+#define __cpu_local_add(i, l) cpu_local_add((i), l)
+#define __cpu_local_sub(i, l) cpu_local_sub((i), l)
#endif /* _ASM_X86_LOCAL_H */
diff --git a/init/main.c b/init/main.c
--- a/init/main.c
+++ b/init/main.c
@@ -539,6 +539,200 @@ void __init __weak thread_info_cache_ini
{
}
+/* There are three obvious ways to implement local_t on an arch which
+ * can't do single-instruction inc/dec etc.
+ * 1) atomic_long
+ * 2) irq_save/irq_restore
+ * 3) multiple counters.
+ *
+ * This does a very rough benchmark on each one.
+ */
+struct local1 {
+ atomic_long_t v;
+};
+struct local2 {
+ unsigned long v;
+};
+struct local3 {
+ unsigned long v[3];
+};
+
+/* Enough to put some pressure on the caches. */
+#define NUM_LOCAL_TEST (1024*1024)
+#define NUM_LOCAL_RUNS (NUM_LOCAL_TEST*32)
+/* This will make it jump around looking random */
+#define STRIDE 514001
+
+static void *test_local_variants_mem;
+
+static void init_test_local_variants(void)
+{
+ unsigned long size;
+ size = max(sizeof(struct local1),
+ max(sizeof(struct local2),
+ max(sizeof(struct local3), sizeof(local_t))))
+ * NUM_LOCAL_TEST;
+ /* Assume this works in early boot. */
+ test_local_variants_mem = alloc_bootmem_nopanic(size);
+
+ if (!test_local_variants_mem) {
+ printk("test_local_variants: failed to allocate %lu bytes\n",
+ size);
+ return;
+ }
+}
+
+static void print_result(const char *str,
+ struct timespec start, struct timespec end)
+{
+ s64 diff;
+
+ diff = ktime_to_ns(ktime_sub(timespec_to_ktime(end), timespec_to_ktime(start)));
+ printk("%s=%lli/%lli ",
+ str, diff, diff/NUM_LOCAL_RUNS);
+}
+
+static unsigned int warm_local_test_cache(const void *mem, size_t len)
+{
+ unsigned int i, total = 0;
+ for (i = 0; i < len; i++)
+ total += ((char *)mem)[i];
+ return total;
+}
+
+#define TEST_LOOP(expr) \
+ n = 0; \
+ getnstimeofday(&start); \
+ for (i = 0; i < NUM_LOCAL_RUNS; i++) { \
+ expr; \
+ n += STRIDE; \
+ n %= NUM_LOCAL_TEST; \
+ } \
+ getnstimeofday(&end);
+
+/* This doesn't test cache effects at all */
+#define NUM_PERCPU_VARS 16
+DEFINE_PER_CPU(struct local1[NUM_PERCPU_VARS], local1_test);
+DEFINE_PER_CPU(struct local2[NUM_PERCPU_VARS], local2_test);
+DEFINE_PER_CPU(struct local3[NUM_PERCPU_VARS], local3_test);
+DEFINE_PER_CPU(local_t[NUM_PERCPU_VARS], local4_test);
+
+static void test_local_variants(void)
+{
+ struct timespec start, end;
+ unsigned int i, n;
+ unsigned long total, warm_total = 0;
+ struct local1 *l1;
+ struct local2 *l2;
+ struct local3 *l3;
+ local_t *l4;
+
+ if (!test_local_variants_mem)
+ return;
+
+ printk("Running local_t variant benchmarks\n");
+ l1 = test_local_variants_mem;
+ l2 = test_local_variants_mem;
+ l3 = test_local_variants_mem;
+ l4 = test_local_variants_mem;
+
+ printk("atomic_long: ");
+ memset(l1, 0, sizeof(*l1)*NUM_LOCAL_TEST);
+ TEST_LOOP(atomic_long_inc(&l1[n].v));
+ print_result("local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+ TEST_LOOP(atomic_long_add(1234, &l1[n].v));
+ print_result("local_add", start, end);
+
+ warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+ TEST_LOOP(atomic_long_inc(&__get_cpu_var(local1_test)[n%NUM_PERCPU_VARS].v));
+ print_result("cpu_local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l1, sizeof(*l1)*NUM_LOCAL_TEST);
+ total = 0;
+ TEST_LOOP(total += atomic_long_read(&l1[n].v));
+ print_result("local_read", start, end);
+
+ printk("(total was %lu)\n", total);
+
+ printk("irqsave/restore: ");
+ memset(l2, 0, sizeof(*l2)*NUM_LOCAL_TEST);
+ TEST_LOOP(unsigned long flags;
+ local_irq_save(flags);
+ l2[n].v++;
+ local_irq_restore(flags));
+ print_result("local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l2, sizeof(*l2)*NUM_LOCAL_TEST);
+ TEST_LOOP(unsigned long flags;
+ local_irq_save(flags);
+ l2[n].v += 1234;
+ local_irq_restore(flags));
+ print_result("local_add", start, end);
+
+ warm_total += warm_local_test_cache(l2, sizeof(*l2)*NUM_LOCAL_TEST);
+ TEST_LOOP(unsigned long flags;
+ local_irq_save(flags);
+ __get_cpu_var(local2_test)[n%NUM_PERCPU_VARS].v++;
+ local_irq_restore(flags));
+ print_result("cpu_local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l2, sizeof(*l2)*NUM_LOCAL_TEST);
+ total = 0;
+ TEST_LOOP(total += l2[n].v);
+ print_result("local_read", start, end);
+ printk("(total was %lu)\n", total);
+
+ printk("trivalue: ");
+ memset(l3, 0, sizeof(*l3)*NUM_LOCAL_TEST);
+ TEST_LOOP(unsigned int idx
+ = !(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +
+ !(preempt_count() & HARDIRQ_MASK);
+ l3[n].v[idx]++);
+ print_result("local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l3, sizeof(*l3)*NUM_LOCAL_TEST);
+ TEST_LOOP(unsigned int idx
+ = !(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +
+ !(preempt_count() & HARDIRQ_MASK);
+ l3[n].v[idx] += 1234);
+ print_result("local_add", start, end);
+
+ warm_total += warm_local_test_cache(l3, sizeof(*l3)*NUM_LOCAL_TEST);
+ TEST_LOOP(unsigned int idx
+ = !(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +
+ !(preempt_count() & HARDIRQ_MASK);
+ get_cpu_var(local3_test)[n%NUM_PERCPU_VARS].v[idx]++;
+ put_cpu_var());
+ print_result("cpu_local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l3, sizeof(*l3)*NUM_LOCAL_TEST);
+ total = 0;
+ TEST_LOOP(total += l3[n].v[0] + l3[n].v[1] + l3[n].v[2]);
+ print_result("local_read", start, end);
+ printk("(total was %lu)\n", total);
+
+ printk("local_t: ");
+ memset(l4, 0, sizeof(*l4)*NUM_LOCAL_TEST);
+ TEST_LOOP(local_inc(&l4[n]));
+ print_result("local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l4, sizeof(*l4)*NUM_LOCAL_TEST);
+ TEST_LOOP(local_add(1234, &l4[n]));
+ print_result("local_add", start, end);
+
+ warm_total += warm_local_test_cache(l4, sizeof(*l4)*NUM_LOCAL_TEST);
+ TEST_LOOP(cpu_local_inc(local4_test[n%NUM_PERCPU_VARS]));
+ print_result("cpu_local_inc", start, end);
+
+ warm_total += warm_local_test_cache(l4, sizeof(*l4)*NUM_LOCAL_TEST);
+ total = 0;
+ TEST_LOOP(total += local_read(&l4[n]));
+ print_result("local_read", start, end);
+ printk("(total was %lu, warm_total %lu)\n", total, warm_total);
+}
+
asmlinkage void __init start_kernel(void)
{
char * command_line;
@@ -635,6 +829,8 @@ asmlinkage void __init start_kernel(void
*/
locking_selftest();
+ init_test_local_variants();
+
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
@@ -692,6 +888,8 @@ asmlinkage void __init start_kernel(void
acpi_early_init(); /* before LAPIC and SMP init */
ftrace_init();
+
+ test_local_variants();
/* Do the rest non-__init'ed, we're now alive */
rest_init();
next prev parent reply other threads:[~2008-12-15 12:53 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-12-03 18:40 [PATCH] percpu_counter: fix CPU unplug race in percpu_counter_destroy() Eric Dumazet
2008-12-03 20:24 ` [PATCH] percpu_counter: Fix __percpu_counter_sum() Eric Dumazet
2008-12-03 20:45 ` Peter Zijlstra
2008-12-04 6:14 ` David Miller
2008-12-07 4:22 ` Andrew Morton
2008-12-07 4:22 ` Andrew Morton
2008-12-07 10:25 ` Peter Zijlstra
2008-12-07 13:28 ` Eric Dumazet
2008-12-07 13:28 ` Eric Dumazet
2008-12-07 17:28 ` Andrew Morton
2008-12-07 18:00 ` Eric Dumazet
2008-12-07 18:00 ` Eric Dumazet
2008-12-08 4:52 ` Andrew Morton
2008-12-08 22:12 ` Theodore Tso
2008-12-08 22:20 ` Peter Zijlstra
2008-12-08 23:00 ` Theodore Tso
2008-12-08 23:05 ` Peter Zijlstra
2008-12-08 23:08 ` Peter Zijlstra
2008-12-09 8:12 ` Eric Dumazet
2008-12-09 8:12 ` Eric Dumazet
2008-12-09 8:34 ` Peter Zijlstra
2008-12-09 8:34 ` Peter Zijlstra
2008-12-10 5:09 ` Eric Dumazet
2008-12-10 5:49 ` Andrew Morton
2008-12-10 22:56 ` Eric Dumazet
2008-12-10 22:56 ` Eric Dumazet
2008-12-12 8:17 ` Rusty Russell
2008-12-12 8:22 ` Eric Dumazet
2008-12-12 8:22 ` Eric Dumazet
2008-12-12 11:08 ` [PATCH] percpu_counter: use local_t and atomic_long_t if possible Eric Dumazet
2008-12-12 11:08 ` Eric Dumazet
2008-12-12 11:29 ` Peter Zijlstra
2008-12-23 11:43 ` Peter Zijlstra
2008-12-25 13:26 ` Rusty Russell
2008-12-15 12:53 ` Rusty Russell [this message]
2008-12-16 20:16 ` [PATCH] percpu_counter: Fix __percpu_counter_sum() Ingo Molnar
2008-12-10 7:12 ` Peter Zijlstra
2008-12-08 23:07 ` Andrew Morton
2008-12-08 23:49 ` Theodore Tso
2008-12-08 22:22 ` Andrew Morton
2008-12-08 22:44 ` Mingming Cao
2008-12-08 22:44 ` Mingming Cao
2008-12-07 22:24 ` [PATCH] atomic: fix a typo in atomic_long_xchg() Eric Dumazet
2008-12-07 22:24 ` Eric Dumazet
2008-12-07 15:28 ` [PATCH] percpu_counter: Fix __percpu_counter_sum() Theodore Tso
2008-12-08 4:42 ` Andrew Morton
2008-12-08 17:55 ` Mingming Cao
2008-12-08 17:55 ` Mingming Cao
2008-12-11 16:32 ` [stable] " Greg KH
2008-12-08 17:44 ` Mingming Cao
2008-12-08 17:44 ` Mingming Cao
2008-12-04 6:13 ` [PATCH] percpu_counter: fix CPU unplug race in percpu_counter_destroy() David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200812152323.53592.rusty@rustcorp.com.au \
--to=rusty@rustcorp.com.au \
--cc=a.p.zijlstra@chello.nl \
--cc=akpm@linux-foundation.org \
--cc=clameter@sgi.com \
--cc=cmm@us.ibm.com \
--cc=dada1@cosmosbay.com \
--cc=davem@davemloft.net \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=paulus@samba.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.