From: Chuck Ebbert <76306.1226@compuserve.com>
To: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>, Linus Torvalds <torvalds@osdl.org>,
Andrew Morton <akpm@osdl.org>,
linux-kernel <linux-kernel@vger.kernel.org>
Subject: Re: [patch 2.6.14-rc4] i386: spinlock optimization
Date: Fri, 14 Oct 2005 21:24:19 -0400 [thread overview]
Message-ID: <200510142128_MC3-1-ACAD-8CD3@compuserve.com> (raw)
In-Reply-To: <20051014180414.GD3502@verdi.suse.de>
On Fri, 14 Oct 2005 at 20:04:14 +0200, Andi Kleen wrote:
> I doubt your change adds any noticeable delay on a aggressive
> OOO CPU, which are pretty much all modern x86s. It's probably
> a nop.
Well it certainly improves the spinlock fairness when owner releases
the lock and tries to re-acquire it. And on the test machine with a slow
bus my program runs faster even though the spinlock ping-pongs more often
with the new code than with the old. ("Total runs" == number of times
spinlock changes hands during test run; "run length" == how many cycles
in a row the current owner held the spinlock.) Timer interrupts skew these
results a bit by stealing the CPU, so don't take them too literally.
--------------------------------------------
Dual Pentium II Xeon 350 running 2.6.14-rc4:
--------------------------------------------
$ gcc -O2 -Wall -o spintest.o -DSETAFFINITY=2 -DFAST_SPINLOCK=0 -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using old code for lock
CPU clocks: 2066947815
Parent did: 5033294 of 10000001 iterations
Run length histogram:
1: 4
2: 105
3: 1161
4: 502
5: 113
6: 130
7: 56
8: 54
9: 81
10: 60
11: 72
12: 57
13: 72
14: 43
15: 85
>=16: 94661
Total runs 97256, longest was 2713 iterations, average was 102
$ gcc -O2 -Wall -o spintest.o -DSETAFFINITY=2 -DFAST_SPINLOCK=1 -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using new code for lock
CPU clocks: 2818166922
Parent did: 4148727 of 10000001 iterations
Run length histogram:
1: 52346
2: 383050
3: 137185
4: 42970
5: 36475
6: 8407
7: 11177
8: 2666
9: 1033
10: 791
11: 763
12: 870
13: 726
14: 1075
15: 1041
>=16: 68272
Total runs 748847, longest was 3488 iterations, average was 13
-----------------------------------------------------------------
Dual Pentium II OverDrive 333 with Socket 8 bus running 2.6.13.4:
-----------------------------------------------------------------
$ gcc -O2 -Wall -DSETAFFINITY=3 -DFAST_SPINLOCK=0 -o spintest.o -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using old code for lock
CPU clocks: 5635093038
Parent did: 4825431 of 10000001 iterations
Run length histogram:
1: 2264420
2: 1137639
3: 565597
4: 7039
5: 560015
6: 5832
7: 240
8: 128
9: 82
10: 37
11: 24
12: 11
13: 16
14: 20
15: 21
>=16: 3312
Total runs 4544433, longest was 31193 iterations, average was 2
$ gcc -O2 -Wall -DSETAFFINITY=3 -DFAST_SPINLOCK=1 -o spintest.o -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using new code for lock
CPU clocks: 5250078921
Parent did: 4831071 of 10000001 iterations
Run length histogram:
1: 2792457
2: 2596109
3: 201958
4: 28941
5: 82932
6: 8071
7: 518
8: 207
9: 75
10: 58
11: 38
12: 11
13: 6
14: 6
15: 8
>=16: 3254
Total runs 5714649, longest was 29512 iterations, average was 1
-------------
Test program:
-------------
/* spinlock test
*
* Tests Linux spinlock fairness on SMP i386 machine.
* 32-bit spinlocks instead of Linux's 8-bit locks.
*/
/* number of tests */
#ifndef ITERS
#define ITERS 10000000
#endif
/* cpu to run parent process; child will use !PARENT_CPU */
#ifndef PARENT_CPU
#define PARENT_CPU 1
#endif
/* use faster spinlock code */
#ifndef FAST_SPINLOCK
#define FAST_SPINLOCK 0
#endif
/* change this to match your version of glibc -- "3" means use 3 args */
#ifndef SETAFFINITY
#define SETAFFINITY 3
#endif
#if SETAFFINITY == 3
#define setaffinity(pid, mask) sched_setaffinity((pid), sizeof(mask), &(mask))
#else /* 2 args */
#define setaffinity(pid, mask) sched_setaffinity((pid), &(mask))
#endif
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <asm/user.h>
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define RDTSCLL(r) asm volatile("rdtsc" : "=A" (r))
unsigned long long tsc1, tsc2;
cpu_set_t cpuset0, cpuset1;
int test = 1; /* for testing -- starts unlocked */
int strt = 0; /* sync startup -- starts locked */
unsigned int i, runs;
int stk[2048];
int shrd_iters __attribute__ ((aligned(64)));
struct stats
{
unsigned int iters;
unsigned int prev;
unsigned int run, max_run;
unsigned int hist[16];
};
struct stats p __attribute__ ((aligned(64)));
struct stats c __attribute__ ((aligned(64)));
static inline void __raw_spin_lock(int *l)
{
asm volatile(
"\n1:\t"
"lock ; decl %0\n\t"
"jns 3f\n"
"2:\t"
"rep;nop\n\t"
"cmpl $0,%0\n\t"
#if FAST_SPINLOCK == 1
"jg 1b\n\t"
"jmp 2b\n"
#else
"jle 2b\n\t"
"jmp 1b\n"
#endif
"3:\n\t"
:"=m" (*l) : : "memory");
}
static inline void __raw_spin_unlock(int *l)
{
asm volatile("movl $1,%0"
: "=m" (*l) : : "memory");
}
int do_child(void *vp)
{
CPU_SET(!PARENT_CPU, &cpuset1);
if (unlikely(setaffinity(0, cpuset1)))
perror("child setaffinity");
__raw_spin_unlock(&strt); /* release parent */
again:
__raw_spin_lock(&test);
c.iters++;
if (shrd_iters == c.prev)
c.run++;
else {
if (c.run > c.max_run)
c.max_run = c.run;
c.hist[c.run <= 15 ? c.run : 15]++;
c.run = 0;
}
if (likely(++shrd_iters < ITERS)) {
c.prev = shrd_iters;
__raw_spin_unlock(&test);
goto again;
}
__raw_spin_unlock(&test);
return 0;
}
int main(int argc, char * const argv[])
{
CPU_SET(PARENT_CPU, &cpuset0);
if (unlikely(setaffinity(0, cpuset0)))
perror("parent setaffinity");
clone(do_child, &stk[2047], CLONE_VM, 0);
__raw_spin_lock(&strt); /* wait for child to init */
RDTSCLL(tsc1);
again:
__raw_spin_lock(&test);
p.iters++;
if (shrd_iters == p.prev)
p.run++;
else {
if (p.run > p.max_run)
p.max_run = p.run;
p.hist[p.run <= 15 ? p.run : 15]++;
p.run = 0;
}
if (likely(++shrd_iters < ITERS)) {
p.prev = shrd_iters;
__raw_spin_unlock(&test);
goto again;
}
__raw_spin_unlock(&test);
RDTSCLL(tsc2);
printf("Parent CPU %d, child CPU %d, ", PARENT_CPU, !PARENT_CPU);
printf("using %s code for lock\n", FAST_SPINLOCK == 1 ? "new" : "old");
printf("CPU clocks: %llu\n", tsc2 - tsc1);
printf("Parent did: %u of %u iterations\n", p.iters, shrd_iters);
printf("Run length histogram:\n");
for (i = 0; i < 16; i++) {
printf ("%s%2u: %u\n", i == 15 ? ">=" : " ",
i + 1, p.hist[i] + c.hist[i]);
runs += p.hist[i] + c.hist[i];
}
printf("Total runs %u, longest was %u iterations, average was %u\n",
runs, p.max_run > c.max_run ? p.max_run : c.max_run,
shrd_iters / runs);
return 0;
}
__
Chuck
next reply other threads:[~2005-10-15 1:33 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-10-15 1:24 Chuck Ebbert [this message]
2005-10-15 15:09 ` [patch 2.6.14-rc4] i386: spinlock optimization Ingo Molnar
-- strict thread matches above, loose matches on Subject: below --
2005-10-14 17:47 Chuck Ebbert
2005-10-14 18:04 ` Andi Kleen
2005-10-14 18:24 ` linux-os (Dick Johnson)
2005-10-14 18:32 ` Andrew Morton
2005-10-14 18:55 ` linux-os (Dick Johnson)
2005-10-14 18:37 ` Linus Torvalds
2005-10-14 18:56 ` linux-os (Dick Johnson)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200510142128_MC3-1-ACAD-8CD3@compuserve.com \
--to=76306.1226@compuserve.com \
--cc=ak@suse.de \
--cc=akpm@osdl.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=torvalds@osdl.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.