//
// cpulat.cpp
// Authored by Gianni Mariani 13-Jun-1999
//
// This is intended to run on a dual CPU PII or better system.
// This measures the latency in CPU clock cycles from one
// CPU making a change to memory to the other CPU being able to
// react. This latency is a critical factor in shared memory
// parallel communications.
//
//
// LICENSE - GPL - GNU Public License
//
// Description
// This program sets up a page of shared memory and then forks.
// Each process will share the page of memory and will use it
// to do raw shared memory spin locks using a simple while()
// loop.
//
// The child process goes into a spin and the parent process
// breaks its spin. Both processes read their respective
// time stamps just after these events. The parent process waits
// for the child to place the counter value in the shared memory
// and then measures the difference. The roles are then reversed
// and this interaction is measured 10 times.
//
// If the time stamp counters are not set up correctly by the OS
// you will get weird behaviour.
//
// This is the output of a run on a Tyan S1837 with PIII 450's using
// a stock RH 6.0 (2.2.5-15smp) Linux kernel.
//
// diff = 165
// odiff = 187
// diff = 201
// odiff = 175
// diff = 194
// odiff = 166
// diff = 192
// odiff = 182
// diff = 186
// odiff = 169
// diff = 181
// odiff = 177
// diff = 193
// odiff = 170
// diff = 162
// odiff = 219
// diff = 193
// odiff = 168
//
// compile line : g++ -O2 -o cpulat cpulat.cpp
//

#include <cstddef>
#include <cstdio>
#include <cstdlib>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

using namespace std;

//
// This snippet of __asm__ magic was taken from
// a news posting. I don't know who the original
// author is. I did do a regular net search.
typedef unsigned long long u64;

// Reads the CPU time stamp counter with the x86 RDTSC instruction and
// returns it as a single 64-bit value.
//
// RDTSC delivers the low 32 bits in EAX and the high 32 bits in EDX. The
// two halves are captured separately and combined by hand because the "A"
// constraint only describes the EDX:EAX pair on 32-bit x86; on x86-64 it
// selects a single register and the upper half of the counter is lost.
inline u64 rdtsc(void)
{
    unsigned int lo;
    unsigned int hi;
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
    return (static_cast<u64>(hi) << 32) | lo;
}

#define LOOPS 10

// Cycle budget for one complete measurement round. All pauses below are
// fractions of this value. NOTE(review): 450000000 looks like "one second
// on a 450 MHz CPU" (the original comments call a quarter of it "about
// 1/4 second"); on faster machines the pauses are simply shorter.
static const u64 ROUND_CYCLES = 450000000ULL;

// Busy-waits until the time stamp counter reaches `deadline`.
static inline void spin_until(u64 deadline)
{
    while (rdtsc() < deadline)
        ;
}

// Creates one page of memory backed by an already-unlinked temporary file
// and maps it MAP_SHARED so that it stays shared across fork().
//
// Returns the mapped address, or NULL after reporting the failure with
// perror(). The descriptor is closed on every path; the mapping does not
// need it once mmap() has succeeded.
static volatile char * map_shared_page(void)
{
    // mkstemp() creates and opens the file atomically, unlike the
    // tmpnam() + open() pair which can be raced by another process.
    char path[] = "/tmp/cpulatXXXXXX";
    const int fd = mkstemp(path);
    if (fd == -1) {
        perror(path);
        return NULL;
    }

    // Delete the file now, no need to have it hang around in
    // the directory.
    unlink(path);

    const int pgsz = getpagesize();
    void * mem = MAP_FAILED;

    // Extend the file to one full page by writing its last byte, then map
    // it. Both processes read and write the page, so ask for both
    // protections instead of relying on PROT_WRITE implying PROT_READ.
    if (lseek(fd, pgsz - 1, SEEK_SET) != static_cast<off_t>(-1)
        && write(fd, "", 1) == 1) {
        mem = mmap(NULL, static_cast<size_t>(pgsz),
                   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    }

    // Report before close() so errno still belongs to the failing call.
    if (mem == MAP_FAILED)
        perror(path);

    close(fd);

    return (mem == MAP_FAILED) ? NULL : static_cast<volatile char *>(mem);
}

// Child side of the ping-pong. Never returns: it calls exit(0) once it has
// answered the round whose flag value equals LOOPS.
//
// flags[0] is written by the parent and watched here; flags[1] is written
// here and watched by the parent. stamps[0] receives the counter value read
// right after this process notices a change of flags[0]; stamps[1] receives
// the counter value read right after this process writes flags[1].
static void run_child(volatile char * flags, volatile u64 * stamps)
{
    int seen = 0;   // last value observed in flags[0]
    int reply = 1;  // next value to publish in flags[1]

    for (;;) {
        // spin and wait for the other process to modify memory.
        while (flags[0] == seen)
            ;

        // First thing - get the cpu clock time, then publish it.
        const u64 woke = rdtsc();
        stamps[0] = woke;
        seen = flags[0];

        // Give the parent time to print and start spinning on flags[1].
        spin_until(woke + ROUND_CYCLES / 2);

        // Break the other CPU out of its spin and immediately take the
        // time, so the parent can compare it with its own wake-up stamp.
        flags[1] = static_cast<char>(reply);
        const u64 released = rdtsc();
        stamps[1] = released;

        ++reply;

        // If we've been around enough times, bail.
        if (seen == LOOPS)
            exit(0);
    }
}

// Parent side of the ping-pong. Prints one "diff" line (parent -> child
// latency) and one "odiff" line (child -> parent latency) per measured
// round and returns the process exit status.
//
// The last of the LOOPS rounds only releases the child so that it can
// terminate; the parent leaves right after writing the flag, so LOOPS - 1
// pairs of lines are printed.
static int run_parent(volatile char * flags, volatile u64 * stamps)
{
    int reply_seen = 0;  // last value observed in flags[1]

    // Wait for the other process to get created and spun up.
    spin_until(rdtsc() + ROUND_CYCLES / 4);

    for (int round = 1; round <= LOOPS; ++round) {
        // Break the other process out of its spin and immediately
        // get the time.
        flags[0] = static_cast<char>(round);
        const u64 sent = rdtsc();

        if (round == LOOPS)
            return 0;

        // Let the child record its wake-up stamp before reading it.
        spin_until(sent + ROUND_CYCLES / 4);

        // Print out latency to the other CPU from this CPU. The value is
        // printed signed on purpose: unsynchronised counters can make it
        // negative.
        printf("diff = %lld\n", static_cast<long long>(stamps[0] - sent));

        // spin and wait for the other process, then immediately
        // get the time.
        while (flags[1] == reply_seen)
            ;
        const u64 woke = rdtsc();

        reply_seen = flags[1];

        // Let the child store its release stamp before reading it.
        spin_until(sent + (ROUND_CYCLES * 3) / 4);

        // Print out latency to this CPU from the other CPU.
        printf("odiff = %lld\n", static_cast<long long>(woke - stamps[1]));

        // Finish the round on a fixed schedule before starting the next.
        spin_until(sent + ROUND_CYCLES);
    }

    return 0;
}

// Sets up the shared page, forks, and runs the two halves of the
// measurement. Returns 1 if the page cannot be set up or fork() fails,
// 0 otherwise.
//
// Page layout (unchanged from the original program): byte 0 is the
// parent -> child flag, byte 1 the child -> parent flag, and the two u64
// time stamps live at byte offsets 8 and 16.
//
// NOTE(review): the flags are plain volatile bytes with no atomics or
// fences, exactly as in the original; the polling loops rely on the
// hardware's cache coherence, which is what the program measures.
int main()
{
    volatile char * const flags = map_shared_page();
    if (flags == NULL)
        return 1;

    volatile u64 * const stamps =
        reinterpret_cast<volatile u64 *>(flags) + 1;

    flags[0] = 0;
    flags[1] = 0;

    const pid_t procid = fork();
    if (procid == -1) {
        perror("fork()");
        return 1;
    }

    if (procid == 0)
        run_child(flags, stamps);  // does not return

    return run_parent(flags, stamps);
}