From: Parag Warudkar <kernel-stuff@comcast.net>
To: linux-kernel@vger.kernel.org
Subject: X86_64 Ctx switch times - 32bit vs 64bit
Date: Thu, 5 May 2005 21:38:57 -0400 [thread overview]
Message-ID: <200505052138.57821.kernel-stuff@comcast.net> (raw)
[-- Attachment #1: Type: text/plain, Size: 608 bytes --]
I was experimenting with the attached program (taken from an IBM
developerWorks article) to measure context switch times on an AMD64 machine.
With a 64bit binary I get an average of 5 to 8 usec/cswitch, whereas the same
program compiled as 32bit consistently gives >= 10 usec/cswitch - sometimes
even 13 usec/cswitch.
Are there more context switching overheads when running 32bit programs on a
64bit kernel?
Kernel version is 2.6.11-gentoo x86_64.
64bit compile - g++ -O2 -pthread csfast5.cpp -ocsfast64
32bit compile - g++ -m32 -O2 -pthread csfast5.cpp -ocsfast32
Run - ./csfast{32/64} -t 40 -c4 10
Parag
[-- Attachment #2: csfast5.cpp --]
[-- Type: text/x-c++src, Size: 11598 bytes --]
//
// Platform abstraction layer.
//
// Maps a small set of names (SLEEP, CRITS, LOCK, UNLOCK, THREAD_T, the
// path separator and the timing helpers) onto either the Win32 API or
// POSIX threads, so the rest of the file is written once.
//
#ifdef _WIN32
// The target-version macro only has an effect when it is defined BEFORE
// <windows.h> is included, so it must come first. Guarded so that a value
// supplied by the build system is respected.
# ifndef _WIN32_WINNT
#  define _WIN32_WINNT 0x0500
# endif
# include <windows.h>
// Lets the shared error-reporting code say "errno" on both platforms.
# define errno GetLastError()
# define SLEEP(n) Sleep(1000*(n))
# define CRITS CRITICAL_SECTION
# define LOCK EnterCriticalSection
# define UNLOCK LeaveCriticalSection
# define SLASHC '\\'
# define SLASHSTR "\\"
// Name of the synchronization primitive, used in the result output.
char *facility = "CRITSECT";
// One handle per worker thread; allocated in do_threads().
HANDLE *th_handles;
typedef HANDLE THREAD_T;
void tstart(LARGE_INTEGER *);
void tend(LARGE_INTEGER *);
double tval(LARGE_INTEGER *, LARGE_INTEGER *);
#else
// WINAPI is a calling-convention keyword on Win32 only; expand to nothing.
# define WINAPI
# include <unistd.h>
# include <stdlib.h>
# include <string.h>
# include <errno.h>
# include <sys/types.h>
# include <sys/wait.h>
# include <sys/time.h>
# include <fcntl.h>
# include <pthread.h>
# define SLEEP(n) sleep(n)
# define CRITS pthread_mutex_t
# define LOCK pthread_mutex_lock
# define UNLOCK pthread_mutex_unlock
# define SLASHC '/'
# define SLASHSTR "/"
// Name of the synchronization primitive, used in the result output.
char *facility = "mutex_lock";
// One handle per worker thread; allocated in do_threads().
pthread_t *th_handles;
typedef pthread_t THREAD_T;
void tstart(struct timeval *);
void tend(struct timeval *);
double tval(struct timeval *, struct timeval *);
#endif
//
// Per-thread record. do_threads() allocates an array of these and hands
// one element to each worker thread as its start argument.
//
typedef struct thrdmem {
// Thread id reported by CreateThread on Win32; this file never assigns it
// on the pthread path.
unsigned long thrdId;
// Timestamps captured by tstart()/tend() around the worker's timed loop.
#ifdef _WIN32
LARGE_INTEGER _tstart;
LARGE_INTEGER _tend;
#else
struct timeval _tstart;
struct timeval _tend;
#endif
// Index of this thread (0 .. nthreads-1); also the first lock it takes.
int threadnum;
// Number of loop iterations the worker has completed.
unsigned long tcounter;
} thrdmem_t;
// Array of locks the threads hand over to one another; allocated in do_threads().
CRITS *crits;
// Number of locks (-c option); main() raises it to at least nthreads + 1.
int ncrits;
// Array of per-thread records; allocated in do_threads().
thrdmem_t *thrdm;
// Number of worker threads (-t option); main() enforces a minimum of 2.
int nthreads = 2;
// Debug verbosity; incremented once for every -d / -debug on the command line.
int showme = 0;
// Non-zero when -csv was given: results are printed as one CSV record.
int csv = 0;
int Unlock(int);
int Lock(int);
void *Malloc(size_t);
#include <stdio.h>
#include <ctype.h>
// String-equality helpers. They expand to a bare "!strcmp" / "!strncmp", so
// "equal(a,b)" is true when the strings match; note that "!equal(a,b)"
// therefore expands to "!!strcmp(a,b)", i.e. true when they differ.
#define equal !strcmp
#define equaln !strncmp
// Default number of loop iterations per thread.
#define MAXCOUNT 100000
//
// csfast [-d] [-csv] [-t nthreads] [-c ncrits] [maxcount]
//
// This program does pthread_mutexes and Win2k Critical Sections
// These are the fastest thread synchronization primitives on the
// respective platforms.
//
// We create nthreads execution environments and ncrits locks
// (ncrits > nthreads) and pass a token back and forth
// between them as fast as we can. We count the number of hand-offs and
// the elapsed time, and report the average time per context switch.
//
// Prints the command-line synopsis.
void USAGE();
// Runs the whole benchmark; returns 0 on success and 1 on failure.
int do_threads();
// Parses a decimal / octal (leading 0) / hex (leading 0x) number with
// optional K or M multiplier suffixes.
size_t atoik(char *s);
// Number of loop iterations each thread performs (positional argument).
unsigned long maxcount = MAXCOUNT;
// Program name with any directory prefix removed; points at applnamebuf.
char *applname;
char applnamebuf[256];
unsigned long thrdId; // Thread ID
int main(int ac, char *av[])
{
int ret = 0;
//strcpy(applnamebuf,av[0]);
if(strrchr(av[0],SLASHC))
strcpy(applnamebuf, strrchr(av[0],SLASHC)+1);
else
strcpy(applnamebuf, av[0]);
#ifdef _WIN32
{
char *q;
if((q=strrchr(applnamebuf, '.')))
if(!equal(q+1,"exe"))
strcat(applnamebuf,".exe");
}
#endif
applname = applnamebuf;
if(ac == 1) {
USAGE();
return 0;
}
while(ac > 1) {
if(equal(av[1],"-debug") || equal(av[1],"-d")) {
ac--;
av++;
showme++;
}
else if(equal(av[1],"-csv")) {
ac--;
av++;
csv = 1;
}
else if(equaln(av[1], "-t",2)) {
if(av[1][2] == 0) {
ac--;
av++;
nthreads = atoik(av[1]);
}
else {
nthreads = atoik(&av[1][2]);
}
//if(nthreads > 1000) nthreads = 1000;
if(nthreads < 2) nthreads = 2;
ac--;
av++;
}
else if(equaln(av[1], "-c",2)) {
if(av[1][2] == 0) {
ac--;
av++;
ncrits = atoik(av[1]);
}
else {
ncrits = atoik(&av[1][2]);
}
ac--;
av++;
}
else if(isdigit(av[1][0])) {
maxcount = atoik(av[1]);
ac--;
av++;
if(maxcount == 0)
maxcount = 1;
}
}
//
// There has to be at least 1 more critical section than threads.
//
if(ncrits <= nthreads)
ncrits = nthreads + 1;
ret = do_threads();
return ret;
}
//
// Prints the command-line synopsis to stdout, including the -csv switch
// that the argument parser accepts.
//
void USAGE()
{
    printf("%s [-d [-d [-d]]] [-csv] [-t nthreads] [-c ncrits] [maximum count]\n",
           applname);
}
//
// Worker thread body.
//
// var points at this thread's thrdmem_t. The thread first takes the lock
// whose index equals its thread number, pauses so the other workers can do
// the same, then performs maxcount iterations of "take the next lock,
// release the current one", walking round the ring of ncrits locks.
// The timed interval is stored in the thrdmem_t and the iteration count is
// accumulated in tcounter. Always returns 0.
//
// Debug levels: showme > 0 prints a per-thread summary at the end,
// showme == 2 prints the thread number on every iteration and
// showme > 2 prints thread number, iteration and running counter.
//
unsigned long WINAPI threadrun(void * var)
{
    thrdmem_t *t = (thrdmem_t *)var;
    int tnum = t->threadnum;
    int k = tnum;                      // lock currently held
    unsigned long counterA = (unsigned long)tnum;
    unsigned long i;                   // same width as maxcount

    Lock(k);
    // Give every other thread time to start and take its first lock.
#ifdef _WIN32
    Sleep(100);
#else
    sleep(1);
#endif
    tstart(&t->_tstart);
    for(i = 0; i < maxcount; i++) {
        int k1 = k + 1;                // next lock in the ring
        if(k1 >= ncrits)
            k1 = 0;
        Lock(k1);
        Unlock(k);
        // The most verbose level has to be tested first; with the tests
        // the other way round it could never be reached.
        if(showme > 2) {
            printf("T%d: i=%lu %lu\n", tnum, i, counterA); fflush(stdout);
        }
        else if(showme > 1) {
            printf("T%d\n", tnum); fflush(stdout);
        }
        counterA += (unsigned long)nthreads;
        k = k1;
        t->tcounter++;
    }
    Unlock(k);
    tend(&t->_tend);
    if(showme > 0) {
        // Don't let my printf's interfere with the timing of other threads.
        SLEEP(2+(nthreads/40));
        double tim = tval(&t->_tstart, &t->_tend);
        printf("%lu %s/thread Context switches in %7.3f sec ",
            maxcount, facility, tim);
        // Divide in floating point so a large maxcount * nthreads cannot
        // wrap around in integer arithmetic.
        printf("%7.3f usec/cswitch",
            (tim*1e6)/((double)maxcount*(double)nthreads));
        printf("\n");
        fflush(stdout);
    }
#ifdef _WIN32
    ExitThread(0);
#endif
    return 0;
}
int Unlock(int k)
{
UNLOCK((CRITS *)&crits[k]);
return 1;
}
int Lock(int k)
{
LOCK((CRITS *)&crits[k]);
return 1;
}
//
// Runs the benchmark.
//
//  - creates ncrits (+1 spare) locks for use by the threads,
//  - creates nthreads (+1 spare) thread records and thread handles,
//  - starts nthreads workers which pass a token back and forth,
//  - waits for all of them, checks each did exactly maxcount iterations,
//  - prints the average (and, for -csv, minimum and maximum) time per
//    context switch.
//
// Returns 0 on success and 1 if a thread could not be created or joined,
// or if a thread did not complete its work.
//
int do_threads()
{
    int i;
    size_t mem;
    double sum = 0.0;
    double maxv, minv;
    double avg;
    double tim;
    double switches;    // total hand-offs, kept in floating point

    mem = (size_t)(ncrits+1) * sizeof(CRITS);
    crits = (CRITS *) Malloc(mem);
    mem = (size_t)(nthreads+1) * sizeof(thrdmem_t);
    thrdm = (thrdmem_t *) Malloc(mem);
    mem = (size_t)(nthreads+1) * sizeof(THREAD_T);
    th_handles = (THREAD_T *) Malloc(mem);

    for(i = 0; i < ncrits + 1; i++) {
#ifdef _WIN32
        InitializeCriticalSection(&crits[i]);
#else
        pthread_mutex_init(&crits[i],NULL);
#endif
    }

    for(i = 0; i < nthreads; i++) {
        thrdm[i].threadnum = i;
#ifdef _WIN32
        // The fifth argument is the DWORD creation-flags word, so pass 0
        // rather than a null pointer constant.
        if((th_handles[i] = CreateThread(NULL, 8192, threadrun,
            (void *)&thrdm[i], 0, &thrdId)) == NULL) {
            printf("Creation of %d thread failed err=%lu\n",
                i,(unsigned long)errno);
            fflush(stdout);
            return 1;
        }
        thrdm[i].thrdId = thrdId;
#else
        int terr;
        // threadrun has the Win32-style signature; cast it to the
        // start-routine type pthread_create expects.
# define DEC ( void *(*)(void*) )
        terr = pthread_create(&th_handles[i], NULL,
            DEC threadrun, (void *)&thrdm[i]);
        if(terr) {
            printf("pthread_create %d failed: err=%d\n", i,terr);
            printf("%s\n", strerror(terr));
            fflush(stdout);
            return 1;
        }
#endif
    }

    for(i = 0; i < nthreads; i++) {
#ifdef _WIN32
        if(WaitForSingleObject(th_handles[i],INFINITE) == WAIT_FAILED) {
            printf("WaitForSingleObject FAILED: err=%lu\n",
                (unsigned long)errno);
            fflush(stdout);
            return 1;
        }
#else
        // pthread_join reports failure through its return value and does
        // not set errno, so print the returned code.
        int jerr = pthread_join(th_handles[i],NULL);
        if(jerr) {
            printf("pthread_join FAILED: err=%d\n",jerr);
            fflush(stdout);
            return 1;
        }
#endif
    }

    // Check that all threads actually completed their tasks.
    if(thrdm[0].tcounter != maxcount) {
        printf("Thread 0 did %lu out of %lu work\n",
            thrdm[0].tcounter,maxcount);
        fflush(stdout);
        return 1;
    }
    for(i = 1; i < nthreads; i++) {
        if(thrdm[i].tcounter != thrdm[0].tcounter) {
            // Report the count of the thread that is actually short.
            printf("Thread %d did %lu out of %lu work\n",
                i,thrdm[i].tcounter,maxcount);
            fflush(stdout);
            return 1;
        }
    }

    // Average, minimum and maximum of the per-thread elapsed times.
    maxv = minv = tval(&thrdm[0]._tstart, &thrdm[0]._tend);
    for(i = 0; i < nthreads; i++) {
        tim = tval(&thrdm[i]._tstart, &thrdm[i]._tend);
        sum += tim;
        if(tim < minv)
            minv = tim;
        if(tim > maxv)
            maxv = tim;
    }
    avg = sum/nthreads;

    // Computed in floating point so that a large maxcount * nthreads
    // cannot wrap around in integer arithmetic.
    switches = (double)maxcount * (double)nthreads;

    if(csv) {
        printf("\"%s\",%lu,%d,%d,",
            facility, maxcount, nthreads, ncrits);
        printf("%.6f,%.6f,%.6f",
            (avg*1e6)/switches,
            (minv*1e6)/switches,
            (maxv*1e6)/switches);
        fflush(stdout);
    }
    else {
        printf("AVG: %lu %s t=%d c=%d in %7.3f sec ",
            maxcount, facility, nthreads, ncrits, avg);
        printf("%7.3f usec/cswitch",
            (avg*1e6)/switches);
        fflush(stdout);
    }
    printf("\n");
    return 0;
}
#include <ctype.h>
//
// Converts the leading number in s to a size_t.
//
// A leading "0x"/"0X" selects base 16, any other leading "0" selects
// base 8, otherwise the number is decimal. The digits may be followed by
// any number of 'K' (x1024) or 'M' (x1024*1024) suffixes, in either case;
// parsing stops at the first character that is neither.
// No overflow checking is done.
//
size_t atoik(char *s)
{
    size_t ret = 0;
    size_t base = 10;

    if(*s == '0') {
        base = 8;
        s++;
        if(*s == 'x' || *s == 'X') {
            base = 16;
            s++;
        }
    }

    // <ctype.h> functions require a value representable as unsigned char,
    // so convert before every call.
    for(; isxdigit((unsigned char)*s); s++) {
        unsigned char c = (unsigned char)*s;

        if(isdigit(c))
            ret = base*ret + (size_t)(c - '0');
        else if(base == 16)
            // 'A'..'F' stand for 10..15; the +10 was missing, which made
            // "0xA" parse as 0.
            ret = base*ret + (size_t)(toupper(c) - 'A' + 10);
        else
            break;      // a letter ends a decimal or octal number
    }

    for(; isalpha((unsigned char)*s); s++) {
        switch(toupper((unsigned char)*s)) {
        case 'K': ret *= 1024; break;
        case 'M': ret *= 1024*1024; break;
        default:
            return ret;
        }
    }
    return ret;
}
#ifdef _WIN32
// Performance-counter frequency (ticks per second), filled in by the first
// call to tstart() and read by tval().
static LARGE_INTEGER freq; // GLOBAL
// Non-zero until the frequency has been queried.
static int tfirst = 1;

//
// Stores the current performance-counter value in *t. The first call also
// queries the counter frequency.
//
void tstart(LARGE_INTEGER *t)
{
    if(tfirst == 0) {
        QueryPerformanceCounter(t);
        return;
    }
    QueryPerformanceFrequency(&freq);
    tfirst = 0;
    QueryPerformanceCounter(t);
}
//
// Stores the current performance-counter value in *t, marking the end of a
// timed interval.
//
void tend(LARGE_INTEGER *t)
{
    (void)QueryPerformanceCounter(t);
}
//
// Returns the time from *t1 to *t2 in seconds: the counter difference
// divided by the frequency recorded by tstart().
//
double tval(LARGE_INTEGER *t1, LARGE_INTEGER *t2)
{
    double ticks = (double)t2->QuadPart - (double)t1->QuadPart;
    double ticks_per_sec = (double)freq.QuadPart;

    return ticks/ticks_per_sec;
}
#else
//
// Stores the current wall-clock time in *t, marking the start of a timed
// interval.
//
void tstart(struct timeval *t)
{
    (void)gettimeofday(t, NULL);
}
//
// Stores the current wall-clock time in *t, marking the end of a timed
// interval.
//
void tend(struct timeval *t)
{
    (void)gettimeofday(t, NULL);
}
//
// Returns the time from *tv1 to *tv2 in seconds. Each timestamp is first
// converted to fractional seconds, then the two are subtracted.
//
double tval(struct timeval *tv1, struct timeval *tv2)
{
    const double usec_per_sec = 1000*1000;
    double begin = (double)tv1->tv_sec + (double)tv1->tv_usec/usec_per_sec;
    double finish = (double)tv2->tv_sec + (double)tv2->tv_usec/usec_per_sec;

    return finish - begin;
}
#endif
//
// Allocates sz zero-filled bytes and never returns NULL: on failure it
// prints a message and terminates the program with exit status 1.
// When debugging is enabled (showme != 0) the request size and resulting
// address are traced to stdout.
//
void *Malloc(size_t sz)
{
    void *p;

    // size_t is printed through unsigned long / %lu: passing it to %d is
    // a type mismatch on 64-bit targets, and %lu is understood by every
    // C library this file is built against.
    if(showme) printf("Malloc(%lu)=", (unsigned long)sz);

    p = calloc(1, sz);
    if(p == NULL) {
        (void)printf("malloc(%lu) failed\n", (unsigned long)sz);
        fflush(stdout);
        exit(1);
    }

    // %p prints the full pointer; casting it to unsigned int dropped the
    // upper half of the address on 64-bit builds.
    if(showme) {
        printf("%p\n", p);
        fflush(stdout);
    }
    return p;
}
// typedef struct _RTL_CRITICAL_SECTION {
// PRTL_CRITICAL_SECTION_DEBUG DebugInfo;
// LONG LockCount;
// LONG RecursionCount;
// HANDLE OwningThread;
// HANDLE LockSemaphore;
// ULONG_PTR SpinCount;
// } RTL_CRITICAL_SECTION, *PRTL_CRITICAL_SECTION;
next reply other threads:[~2005-05-06 1:41 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-05-06 1:38 Parag Warudkar [this message]
2005-05-07 14:40 ` X86_64 Ctx switch times - 32bit vs 64bit Andi Kleen
2005-05-10 1:31 ` Parag Warudkar
2005-05-10 1:48 ` Andi Kleen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200505052138.57821.kernel-stuff@comcast.net \
--to=kernel-stuff@comcast.net \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.