public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* Re: [patch 2.6.14-rc4] i386: spinlock optimization
@ 2005-10-15  1:24 Chuck Ebbert
  2005-10-15 15:09 ` Ingo Molnar
  0 siblings, 1 reply; 9+ messages in thread
From: Chuck Ebbert @ 2005-10-15  1:24 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Ingo Molnar, Linus Torvalds, Andrew Morton, linux-kernel

In-Reply-To: <20051014180414.GD3502@verdi.suse.de>

On Fri, 14 Oct 2005 at 20:04:14 +0200, Andi Kleen wrote:
> I doubt your change adds any noticeable delay on a aggressive
> OOO CPU, which are pretty much all modern x86s. It's probably
> a nop.

 Well it certainly improves the spinlock fairness when owner releases
the lock and tries to re-acquire it.  And on the test machine with a slow
bus my program runs faster even though the spinlock ping-pongs more often
with the new code than with the old.  ("Total runs" == number of times
spinlock changes hands during test run; "run length" == how many cycles
in a row the current owner held the spinlock.) Timer interrupts skew these
results a bit by stealing the CPU, so don't take them too literally.


--------------------------------------------
Dual Pentium II Xeon 350 running 2.6.14-rc4:
--------------------------------------------

$ gcc -O2 -Wall -o spintest.o -DSETAFFINITY=2 -DFAST_SPINLOCK=0 -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using old code for lock
CPU clocks: 2066947815
Parent did: 5033294 of 10000001 iterations
Run length histogram:
   1: 4
   2: 105
   3: 1161
   4: 502
   5: 113
   6: 130
   7: 56
   8: 54
   9: 81
  10: 60
  11: 72
  12: 57
  13: 72
  14: 43
  15: 85
>=16: 94661
Total runs 97256, longest was 2713 iterations, average was 102

$ gcc -O2 -Wall -o spintest.o -DSETAFFINITY=2 -DFAST_SPINLOCK=1 -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using new code for lock
CPU clocks: 2818166922
Parent did: 4148727 of 10000001 iterations
Run length histogram:
   1: 52346
   2: 383050
   3: 137185
   4: 42970
   5: 36475
   6: 8407
   7: 11177
   8: 2666
   9: 1033
  10: 791
  11: 763
  12: 870
  13: 726
  14: 1075
  15: 1041
>=16: 68272
Total runs 748847, longest was 3488 iterations, average was 13

-----------------------------------------------------------------
Dual Pentium II OverDrive 333 with Socket 8 bus running 2.6.13.4:
-----------------------------------------------------------------

$ gcc -O2 -Wall -DSETAFFINITY=3 -DFAST_SPINLOCK=0 -o spintest.o -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using old code for lock
CPU clocks: 5635093038
Parent did: 4825431 of 10000001 iterations
Run length histogram:
   1: 2264420
   2: 1137639
   3: 565597
   4: 7039
   5: 560015
   6: 5832
   7: 240
   8: 128
   9: 82
  10: 37
  11: 24
  12: 11
  13: 16
  14: 20
  15: 21
>=16: 3312
Total runs 4544433, longest was 31193 iterations, average was 2

$ gcc -O2 -Wall -DSETAFFINITY=3 -DFAST_SPINLOCK=1 -o spintest.o -mcpu=pentium2 spintest.c
$ ./spintest.o
Parent CPU 1, child CPU 0, using new code for lock
CPU clocks: 5250078921
Parent did: 4831071 of 10000001 iterations
Run length histogram:
   1: 2792457
   2: 2596109
   3: 201958
   4: 28941
   5: 82932
   6: 8071
   7: 518
   8: 207
   9: 75
  10: 58
  11: 38
  12: 11
  13: 6
  14: 6
  15: 8
>=16: 3254
Total runs 5714649, longest was 29512 iterations, average was 1

-------------
Test program:
-------------

/* spinlock test
 *
 * Tests Linux spinlock fairness on SMP i386 machine.
 * 32-bit spinlocks instead of Linux's 8-bit locks.
 */

/* number of tests  */
#ifndef ITERS
#define ITERS 10000000
#endif

/* cpu to run parent process; child will use !PARENT_CPU  */
#ifndef PARENT_CPU
#define PARENT_CPU 1
#endif

/* use faster spinlock code  */
#ifndef FAST_SPINLOCK
#define FAST_SPINLOCK 0
#endif

/* change this to match your version of glibc -- "3" means use 3 args */
#ifndef SETAFFINITY
#define SETAFFINITY 3
#endif

#if SETAFFINITY == 3
#define setaffinity(pid, mask) sched_setaffinity((pid), sizeof(mask), &(mask))
#else /* 2 args  */
#define setaffinity(pid, mask) sched_setaffinity((pid), &(mask))
#endif

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sched.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <asm/user.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

#define RDTSCLL(r) asm volatile("rdtsc" : "=A" (r))

unsigned long long tsc1, tsc2;
cpu_set_t cpuset0, cpuset1;
int test = 1; /* for testing -- starts unlocked  */
int strt = 0; /* sync startup  -- starts locked  */
unsigned int i, runs;
int stk[2048];
int shrd_iters __attribute__ ((aligned(64)));
struct stats
{
	unsigned int iters;
	unsigned int prev;
	unsigned int run, max_run;
	unsigned int hist[16];
};
struct stats p __attribute__ ((aligned(64)));
struct stats c __attribute__ ((aligned(64)));

static inline void __raw_spin_lock(int *l)
{
	asm volatile(
		"\n1:\t"
		"lock ; decl %0\n\t"
		"jns 3f\n"
		"2:\t"
		"rep;nop\n\t"
		"cmpl $0,%0\n\t"
#if FAST_SPINLOCK == 1
		"jg 1b\n\t"
		"jmp 2b\n"
#else
		"jle 2b\n\t"
		"jmp 1b\n"
#endif
		"3:\n\t"
		:"=m" (*l) : : "memory");
}

static inline void __raw_spin_unlock(int *l)
{
	asm volatile("movl $1,%0"
		     : "=m" (*l) : : "memory");
}

int do_child(void *vp)
{
	CPU_SET(!PARENT_CPU, &cpuset1);
	if (unlikely(setaffinity(0, cpuset1)))
		perror("child setaffinity");

	__raw_spin_unlock(&strt); /* release parent  */
again:
	__raw_spin_lock(&test);
	c.iters++;
	if (shrd_iters == c.prev)
		c.run++;
	else {
		if (c.run > c.max_run)
			c.max_run = c.run;
		c.hist[c.run <= 15 ? c.run : 15]++;
		c.run = 0;
	}
	if (likely(++shrd_iters < ITERS)) {
		c.prev = shrd_iters;
		__raw_spin_unlock(&test);
		goto again;
	}
	__raw_spin_unlock(&test);

	return 0;
}

int main(int argc, char * const argv[])
{
	CPU_SET(PARENT_CPU, &cpuset0);
	if (unlikely(setaffinity(0, cpuset0)))
		perror("parent setaffinity");

	clone(do_child, &stk[2047], CLONE_VM, 0);
	__raw_spin_lock(&strt); /* wait for child to init  */

	RDTSCLL(tsc1);
again:
	__raw_spin_lock(&test);
	p.iters++;
	if (shrd_iters == p.prev)
		p.run++;
	else {
		if (p.run > p.max_run)
			p.max_run = p.run;
		p.hist[p.run <= 15 ? p.run : 15]++;
		p.run = 0;
	}
	if (likely(++shrd_iters < ITERS)) {
		p.prev = shrd_iters;
		__raw_spin_unlock(&test);
		goto again;
	}
	__raw_spin_unlock(&test);
	RDTSCLL(tsc2);

	printf("Parent CPU %d, child CPU %d, ", PARENT_CPU, !PARENT_CPU);
	printf("using %s code for lock\n", FAST_SPINLOCK == 1 ? "new" : "old");
	printf("CPU clocks: %llu\n", tsc2 - tsc1);
	printf("Parent did: %u of %u iterations\n", p.iters, shrd_iters);
	printf("Run length histogram:\n");
	for (i = 0; i < 16; i++) {
		printf ("%s%2u: %u\n", i == 15 ? ">=" : "  ",
			i + 1, p.hist[i] + c.hist[i]);
		runs += p.hist[i] + c.hist[i];
	}
	printf("Total runs %u, longest was %u iterations, average was %u\n",
		runs, p.max_run > c.max_run ? p.max_run : c.max_run,
		shrd_iters / runs);

	return 0;
}
__
Chuck

^ permalink raw reply	[flat|nested] 9+ messages in thread
* [patch 2.6.14-rc4] i386: spinlock optimization
@ 2005-10-14 17:47 Chuck Ebbert
  2005-10-14 18:04 ` Andi Kleen
  2005-10-14 18:24 ` linux-os (Dick Johnson)
  0 siblings, 2 replies; 9+ messages in thread
From: Chuck Ebbert @ 2005-10-14 17:47 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andi Kleen, Andrew Morton, Linus Torvalds, Ingo Molnar,
	Chuck Ebbert

Attempt to acquire spinlock sooner after spinning and then noticing
it has become available.  Also adds a slight delay before testing the
spinlock again when it's not available, reducing bus traffic.

This change makes spinlocks fairer in the case where the owner drops
the lock and then immediately tries to take it again.

Signed-Off-By: Chuck Ebbert <76306.1226@compuserve.com>
---

 include/asm-i386/spinlock.h |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

--- 2.6.14-rc4a.orig/include/asm-i386/spinlock.h
+++ 2.6.14-rc4a/include/asm-i386/spinlock.h
@@ -28,8 +28,8 @@
 	"2:\t" \
 	"rep;nop\n\t" \
 	"cmpb $0,%0\n\t" \
-	"jle 2b\n\t" \
-	"jmp 1b\n" \
+	"jg 1b\n\t" \
+	"jmp 2b\n" \
 	"3:\n\t"
 
 #define __raw_spin_lock_string_flags \

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2005-10-15 15:09 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-10-15  1:24 [patch 2.6.14-rc4] i386: spinlock optimization Chuck Ebbert
2005-10-15 15:09 ` Ingo Molnar
  -- strict thread matches above, loose matches on Subject: below --
2005-10-14 17:47 Chuck Ebbert
2005-10-14 18:04 ` Andi Kleen
2005-10-14 18:24 ` linux-os (Dick Johnson)
2005-10-14 18:32   ` Andrew Morton
2005-10-14 18:55     ` linux-os (Dick Johnson)
2005-10-14 18:37   ` Linus Torvalds
2005-10-14 18:56     ` linux-os (Dick Johnson)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox