X86_64 Ctx switch times - 32bit vs 64bit

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I was experimenting with the attached program (taken from an IBM 
Developerworks article) to measure context switch times on an AMD64 machine.

With a 64bit binary I get average 5 to 8 usec/cswitch, whereas the same 
program compiled as 32bit consistently gives >= 10 usec/cswitch - sometimes 
even 13 usec/cswitch.

Are there more context switching overheads when running 32bit programs on a 
64bit kernel?

Kernel version is 2.6.11-gentoo x86_64.
64bit compile  - g++ -O2 -pthread csfast5.cpp -ocsfast64
32bit compile  - g++ -m32 -O2 -pthread csfast5.cpp -ocsfast32
Run - ./csfast{32/64} -t 40 -c4 10

Parag
//
// Platform abstraction layer.
//
// Both branches provide the same set of names so the rest of the file can be
// written once:
//   SLEEP(n)            - sleep for n seconds
//   CRITS               - the lock type (CRITICAL_SECTION / pthread_mutex_t)
//   LOCK / UNLOCK       - acquire / release a CRITS
//   SLASHC / SLASHSTR   - path separator as a char / as a string
//   facility            - lock facility name printed in the results
//   th_handles          - array of thread handles (allocated in do_threads)
//   THREAD_T            - the thread handle type
//   tstart / tend / tval - timer helpers, defined near the end of the file
//
#ifdef _WIN32
#    include <windows.h>
// On Win32 "errno" is redirected to the last Win32 error code.
#    define errno        GetLastError()
// Sleep() takes milliseconds; SLEEP() takes seconds.
#    define SLEEP(n)        Sleep(1000*(n))
#    define CRITS CRITICAL_SECTION
#    define LOCK    EnterCriticalSection
#    define UNLOCK LeaveCriticalSection
// NOTE(review): this is defined after <windows.h> has already been included,
// so it presumably has no effect on what that header declares - confirm
// whether it was meant to precede the include.
#    define _WIN32_WINNT    0x0500
#    define SLASHC    '\\'
#    define SLASHSTR    "\\"

    char *facility = "CRITSECT";
    HANDLE *th_handles;
    typedef HANDLE THREAD_T;

    // Timer helpers based on LARGE_INTEGER performance-counter values.
    void tstart(LARGE_INTEGER *);
    void tend(LARGE_INTEGER *);
    double tval(LARGE_INTEGER *, LARGE_INTEGER *);

#else
// WINAPI is a Win32 calling-convention decoration; it expands to nothing here.
#    define WINAPI
#    include <unistd.h>
#    include <stdlib.h>
#    include <string.h>
#    include <errno.h>
#    include <sys/types.h>
#    include <sys/wait.h>
#    include <sys/time.h>
#    include <fcntl.h>
#    include <pthread.h>
#    define SLEEP(n)        sleep(n)
#    define CRITS pthread_mutex_t
#    define LOCK    pthread_mutex_lock
#    define UNLOCK    pthread_mutex_unlock
#    define SLASHC    '/'
#    define SLASHSTR    "/"

    char *facility = "mutex_lock";
    pthread_t *th_handles;
    typedef pthread_t THREAD_T;

    // Timer helpers based on struct timeval (gettimeofday).
    void tstart(struct timeval *);
    void tend(struct timeval *);
    double tval(struct timeval *, struct timeval *);

#endif

//
// Per-thread bookkeeping.  do_threads() allocates one of these per thread
// and passes its address to threadrun() as the thread argument.
//
typedef struct thrdmem {
    unsigned long thrdId;       // thread id stored by do_threads() (Win32 branch only)
#ifdef _WIN32
    LARGE_INTEGER _tstart;      // timestamp taken by tstart() before the loop
    LARGE_INTEGER _tend;        // timestamp taken by tend() after the loop
#else
    struct timeval _tstart;     // timestamp taken by tstart() before the loop
    struct timeval _tend;       // timestamp taken by tend() after the loop
#endif
    int threadnum;              // 0-based thread index; also the first lock taken
    unsigned long tcounter;     // number of loop iterations completed
} thrdmem_t;
// Array of locks the threads cycle through; allocated in do_threads().
CRITS *crits;
// Number of locks in use; set with -c and forced above nthreads in main().
int ncrits;
// Array of per-thread records; allocated in do_threads().
thrdmem_t *thrdm;

// Number of worker threads; set with -t, never less than 2.
int nthreads = 2;
// Debug verbosity; incremented once for every -d / -debug on the command line.
int showme = 0;
// Non-zero when -csv was given: the summary is printed as a CSV line.
int csv = 0;

// Release / acquire crits[k]; both always return 1.
int Unlock(int);
int Lock(int);

// malloc() wrapper that zero-fills the block and exits the process on failure.
void *Malloc(size_t);

#include <stdio.h>
#include <ctype.h>

// String-equality helpers: equal(a,b) expands to !strcmp(a,b) and
// equaln(a,b,n) to !strncmp(a,b,n).  Because the expansion starts with a bare
// '!', writing !equal(a,b) yields !!strcmp(a,b), i.e. "strings differ".
#define equal    !strcmp
#define equaln    !strncmp
// Default number of loop iterations per thread.
#define MAXCOUNT    100000


//
// csfast [-d] [-t nthreads] [-c ncrits] [maxcount]
//
// This program does pthread_mutexes and Win2k Critical Sections
// These are the fastest thread synchronization primitives on the
// respective platforms.
//
// We create nthreads execution environments and ncrits locks
// (ncrits > nthreads) and pass a token back and forth
// between them as fast as we can. We count the number of times and
// produce a context switches per second number.
//


// Print the command-line synopsis.
void USAGE();
// Create the locks and threads, wait for them, and print the summary.
// Returns 0 on success and 1 on failure.
int do_threads();
// Parse a number (decimal, 0-prefixed octal or 0x-prefixed hex) with an
// optional K/M multiplier suffix.
size_t atoik(char *s);

// Loop iterations per thread; overridden by the bare numeric argument.
unsigned long maxcount = MAXCOUNT;

// Program name used in the usage message; main() points it at applnamebuf.
char *applname;
char applnamebuf[256];

// Receives the thread id from CreateThread() in the Win32 branch of
// do_threads(), which then copies it into thrdm[i].thrdId.
unsigned long thrdId;        // Thread ID

//
// Program entry point.
//
// Derives the program name from av[0], parses the command line into the
// globals showme, csv, nthreads, ncrits and maxcount, then runs the benchmark.
//
//   -d | -debug     increase debug verbosity (may be repeated)
//   -csv            print the summary as a CSV line
//   -t N | -tN      number of threads (values below 2 are raised to 2)
//   -c N | -cN      number of locks (raised to nthreads+1 if too small)
//   N               iterations per thread (0 is raised to 1)
//
// Returns 0 after printing usage when no arguments are given, 1 on a
// malformed command line, otherwise whatever do_threads() returns.
//
int main(int ac, char *av[])
{
    int ret = 0;
    char *slash;
    const char *base;

    // Strip any leading directory from the program name.  The copy is
    // bounded so that an over-long av[0] cannot overflow applnamebuf.
    slash = strrchr(av[0],SLASHC);
    base = slash ? slash + 1 : av[0];
    strncpy(applnamebuf, base, sizeof(applnamebuf) - 1);
    applnamebuf[sizeof(applnamebuf) - 1] = '\0';
#ifdef _WIN32
    {
        char *q;

        // Same condition as before (a '.' is present and the extension is
        // not "exe"), but only append when ".exe" still fits in the buffer.
        if((q=strrchr(applnamebuf, '.')))
            if(strcmp(q+1,"exe") != 0 &&
               strlen(applnamebuf) + 4 < sizeof(applnamebuf))
                strcat(applnamebuf,".exe");
    }
#endif
    applname = applnamebuf;

    if(ac == 1) {
        USAGE();
        return 0;
    }
    while(ac > 1) {
        char *arg = av[1];

        if(equal(arg,"-debug") || equal(arg,"-d")) {
            showme++;
        }
        else if(equal(arg,"-csv")) {
            csv = 1;
        }
        else if(equaln(arg, "-t",2)) {
            char *val = arg + 2;

            if(*val == 0) {
                // Value is in the next argument; it must actually exist.
                if(ac < 3) {
                    USAGE();
                    return 1;
                }
                ac--;
                av++;
                val = av[1];
            }
            nthreads = (int)atoik(val);
            //if(nthreads > 1000) nthreads = 1000;
            if(nthreads < 2) nthreads = 2;
        }
        else if(equaln(arg, "-c",2)) {
            char *val = arg + 2;

            if(*val == 0) {
                if(ac < 3) {
                    USAGE();
                    return 1;
                }
                ac--;
                av++;
                val = av[1];
            }
            ncrits = (int)atoik(val);
        }
        else if(isdigit((unsigned char)arg[0])) {
            maxcount = atoik(arg);
            if(maxcount == 0)
                maxcount = 1;
        }
        else {
            // Previously an unrecognized argument was never consumed and
            // the loop spun forever; reject it instead.
            printf("%s: unrecognized argument '%s'\n", applname, arg);
            USAGE();
            return 1;
        }
        // Consume the argument that was just handled.
        ac--;
        av++;
    }
    //
    // There has to be at least 1 more critical section than threads.
    //
    if(ncrits <= nthreads)
        ncrits = nthreads + 1;

    ret = do_threads();
    return ret;
}

//
// Write the one-line command synopsis, prefixed with the program name,
// to standard output.
//
void USAGE()
{
    printf("%s [-d [-d [-d]]] [-t nthreads] [-c ncrits] [maximum count]\n",
           applname);
}

//
// Thread body.
//
// var points at this thread's thrdmem_t.  The thread first takes the lock
// whose index equals its thread number, pauses so the other threads can do
// the same, and then runs maxcount iterations of "take the next lock,
// release the current one", wrapping around at ncrits.  The loop is timed
// with tstart()/tend() into the thread's record, and tcounter is bumped once
// per iteration so do_threads() can verify the work was done.
//
// Debug output inside the loop depends on showme:
//   showme == 2   prints "T<n>" every iteration
//   showme >= 3   prints the iteration number and running counter as well
// With showme > 0 a per-thread timing line is printed at the end.
//
// Always returns 0.
//
unsigned long WINAPI threadrun(void * var)
{
    // Same width as maxcount so the loop terminates for any maxcount.
    unsigned long i;
    thrdmem_t *t = (thrdmem_t *)var;

    int tnum = t->threadnum;
    int k = tnum;
    int k1;
    // Debug-only counter; unsigned so a long run wraps instead of overflowing.
    unsigned long counterA = (unsigned long)tnum;

    Lock(k);
#ifdef _WIN32
    Sleep(100);
#else
    sleep(1);
#endif

    tstart(&t->_tstart);

    for(i = 0; i < maxcount; i++) {
        k1 = k + 1;
        if(k1 >= ncrits)
            k1 = 0;
        Lock(k1);
        Unlock(k);
        // Test the most verbose level first; checking "> 1" first made the
        // "> 2" branch unreachable.
        if(showme > 2) {
            printf("T%d: i=%lu %lu\n", tnum,i,counterA); fflush(stdout);
        }
        else if(showme > 1) {
            printf("T%d\n",tnum); fflush(stdout);
        }
        counterA += (unsigned long)nthreads;

        k = k1;
        t->tcounter++;
    }
    Unlock(k);
    tend(&t->_tend);

    if(showme > 0) {
        double tim;

        // Don't let my printf's interfere with the timing of other threads.
        SLEEP(2+(nthreads/40));
        tim = tval(&t->_tstart, &t->_tend);

        printf("%lu %s/thread Context switches in %7.3f sec ",
            maxcount, facility, tim);

        // Divide in floating point so the product cannot wrap.
        printf("%7.3f usec/cswitch",
            (tim*1e6)/((double)maxcount*nthreads));
        printf("\n");
        fflush(stdout);
    }
#ifdef _WIN32
    ExitThread(0);
#endif
    return 0;
}
int Unlock(int k)
{
    UNLOCK((CRITS *)&crits[k]);
    return 1;
}
int Lock(int k)
{
    LOCK((CRITS *)&crits[k]);
    return 1;
}

//
// Run the benchmark.
//
// Allocates and initializes ncrits+1 locks, nthreads+1 thread records and
// nthreads+1 thread handles, starts nthreads threads running threadrun(),
// waits for all of them, checks that every thread completed maxcount
// iterations, and prints the average time per context switch (as a CSV line
// when csv is set).
//
// Returns 0 on success, 1 if a thread could not be created or joined or if
// any thread did not complete its work.
//
int do_threads()
{
    int i;
    size_t mem;
    double sum = 0.0;
    double maxv, minv;
    double avg = 0.0;
    double tim;

    //
    // creates ncrits critical sections for use by the threads.
    // creates nthreads thread memories
    // creates nthreads threads and passes a token back and forth.
    //

    mem = (ncrits+1) * sizeof(CRITS);
    //mem = ((mem + 4095)/4096) * 4096;
    crits      = (CRITS     *) Malloc(mem);

    mem = (nthreads+1)*sizeof(thrdmem_t);
    //mem = ((mem + 4095)/4096) * 4096;
    thrdm      = (thrdmem_t *) Malloc(mem);

    mem = (nthreads+1)*sizeof(THREAD_T);
    //mem = ((mem + 4095)/4096) * 4096;
    th_handles = (THREAD_T  *) Malloc(mem);

    for(i = 0; i < ncrits + 1; i++) {
#ifdef _WIN32
        InitializeCriticalSection(&crits[i]);
#else
        pthread_mutex_init(&crits[i],NULL);
#endif
    }

    //printf("%d Threads\n",nthreads); fflush(stdout);
    for(i = 0; i < nthreads; i++) {
        thrdm[i].threadnum = i;
#ifdef _WIN32
        //printf("\b\b\b\b%4d",i); fflush(stdout);
        //if((th_handles[i] = CreateThread(NULL, 4096, threadrun,
        // Creation flags are 0: the thread starts running immediately.
        if((th_handles[i] = CreateThread(NULL, 8192, threadrun,
                    (void *)&thrdm[i], 0, &thrdId)) == NULL) {
            printf("Creation of %d thread failed err=%d\n", i,errno);
            fflush(stdout);
            return 1;
        }
        thrdm[i].thrdId = thrdId;
#else
        int terr;

        // threadrun() is declared with the Win32-style signature, so it is
        // cast to the pthread start-routine type here.
#        define DEC    ( void *(*)(void*) )
        terr = pthread_create(&th_handles[i], NULL,
                    DEC  threadrun, (void *)&thrdm[i]);
        if(terr) {
            printf("pthread_create %d failed: err=%d\n", i,terr);
            printf("%s\n", strerror(terr));
            fflush(stdout);
            return 1;
        }
#endif
    }
    //printf("\n"); fflush(stdout);

    for(i = 0; i < nthreads; i++) {
        //printf("\b\b\b\b%4d",i); fflush(stdout);
#ifdef _WIN32
        if(WaitForSingleObject(th_handles[i],INFINITE) == WAIT_FAILED) {
            printf("WaitForSingleObject FAILED: err=%d\n",errno);
            fflush(stdout);
            return 1;
        }
#else
        // pthread_join() reports failure through its return value, not
        // through errno.
        int jerr = pthread_join(th_handles[i],NULL);

        if(jerr) {
            printf("pthread_join FAILED: err=%d\n",jerr);
            fflush(stdout);
            return 1;
        }
#endif
    }

    //  Check that all threads actually completed their tasks.
    if(thrdm[0].tcounter != maxcount) {
        printf("Thread 0 did %lu out of %lu work\n",
                thrdm[0].tcounter,maxcount);
        fflush(stdout);
        return 1;
    }
    for(i = 1; i < nthreads; i++) {
        if(thrdm[i].tcounter != thrdm[0].tcounter) {
            // Report the offending thread's own counter.
            printf("Thread %d did %lu out of %lu work\n",
                    i,thrdm[i].tcounter,maxcount);
            fflush(stdout);
            return 1;
        }
    }
#ifdef _WIN32
    //printf("All Complete\n"); fflush(stdout);
#endif

    // Gather min / max / average of the per-thread elapsed times.
    maxv = minv = tval(&thrdm[0]._tstart, &thrdm[0]._tend);
    for(i = 0; i < nthreads; i++) {
        tim = tval(&thrdm[i]._tstart, &thrdm[i]._tend);
        sum  += tim;
        if(tim < minv)
            minv = tim;
        if(tim > maxv)
            maxv = tim;
    }

    avg = sum/nthreads;
    if(csv) {
        printf("\"%s\",%lu,%d,%d,",
            facility, maxcount, nthreads, ncrits);
        printf("%.6f,%.6f,%.6f",
            (avg*1e6)/(maxcount*nthreads),
            (minv*1e6)/(maxcount*nthreads),
            (maxv*1e6)/(maxcount*nthreads));
        fflush(stdout);
    }
    else {
        printf("AVG: %lu %s t=%d c=%d in %7.3f sec ",
            maxcount, facility, nthreads, ncrits, avg);
        printf("%7.3f usec/cswitch",
            (avg*1e6)/(maxcount*nthreads));
        fflush(stdout);
    }

    printf("\n");

    return 0;
}

#include <ctype.h>

//
// Convert a numeric string to a size_t.
//
// A leading "0x"/"0X" selects hexadecimal, a leading "0" selects octal,
// anything else is decimal.  Conversion stops at the first character that is
// not a valid digit in the selected base.  The digits may be followed by any
// number of multiplier suffixes: 'K' (x1024) or 'M' (x1024*1024), in either
// case; any other letter ends the conversion.
//
// Returns the converted value, or 0 if the string has no leading digits.
// Overflow is not detected.
//
size_t atoik(char *s)
{
    size_t ret = 0;
    size_t base = 10;

    if(*s == '0') {
        base = 8;
        s++;
        if(*s == 'x' || *s == 'X') {
            base = 16;
            s++;
        }
    }

    // <ctype.h> functions need an unsigned char value, hence the casts.
    for(; isxdigit((unsigned char)*s); s++) {
        unsigned char c = (unsigned char)*s;
        size_t digit;

        if(isdigit(c))
            digit = (size_t)(c - '0');
        else
            // Hex letters A..F are worth 10..15, not 0..5.
            digit = (size_t)(toupper(c) - 'A') + 10;

        // Stops on letters in decimal/octal and on 8 or 9 in octal.
        if(digit >= base)
            break;
        ret = base*ret + digit;
    }
    for(; isalpha((unsigned char)*s); s++) {
        switch(toupper((unsigned char)*s)) {
        case 'K': ret *= 1024; break;
        case 'M': ret *= 1024*1024; break;
        default:
            return ret;
        }
    }
    return ret;
}

#ifdef _WIN32
// Performance-counter frequency, filled in by the first call to tstart()
// and used by tval() to convert counter differences to seconds.
static LARGE_INTEGER freq; // GLOBAL
// Non-zero until tstart() has queried the frequency.
static int tfirst = 1;

//
// Store the current performance-counter value in *t.
// The first call also queries the counter frequency into freq.
//
// NOTE(review): tfirst and freq are plain statics with no locking, and
// threadrun() calls this from every thread, so several threads may run the
// first-call branch at the same time.
//
void tstart(LARGE_INTEGER *t)
{
    if(tfirst) {
        QueryPerformanceFrequency(&freq);
        tfirst = 0;
    }
    QueryPerformanceCounter(t);
}
//
// Store the current performance-counter value in *t (end-of-interval
// timestamp).
//
void tend(LARGE_INTEGER *t)
{
    (void)QueryPerformanceCounter(t);
}

//
// Return the time between two performance-counter samples, in seconds:
// the counter difference (t2 - t1) divided by the frequency held in freq.
//
double tval(LARGE_INTEGER *t1, LARGE_INTEGER *t2)
{
    double ticks = (double)t2->QuadPart - (double)t1->QuadPart;
    double ticks_per_sec = (double)freq.QuadPart;

    return ticks / ticks_per_sec;
}
#else

//
// Store the current wall-clock time in *t (start-of-interval timestamp).
// The result of gettimeofday() is not checked.
//
void tstart(struct timeval *t)
{
    (void)gettimeofday(t, NULL);
}
//
// Store the current wall-clock time in *t (end-of-interval timestamp).
// The result of gettimeofday() is not checked.
//
void tend(struct timeval *t)
{
    (void)gettimeofday(t, NULL);
}

//
// Return the elapsed time from *tv1 to *tv2 in seconds.
//
// The seconds and microseconds fields are subtracted as integers before
// converting to double.  Converting each absolute timestamp to double first
// and then subtracting throws away sub-microsecond precision, because a
// double holding a present-day epoch value has only about 0.24 usec of
// resolution left.
//
double tval(struct timeval *tv1, struct timeval *tv2)
{
    double secs  = (double)(tv2->tv_sec  - tv1->tv_sec);
    double usecs = (double)(tv2->tv_usec - tv1->tv_usec);

    return secs + usecs/(1000*1000);
}
#endif
//
// Allocate sz bytes of zero-filled memory.
//
// On allocation failure a message is printed and the process exits with
// status 1, so the function never returns NULL.  When showme is set, the
// requested size and the resulting address are printed.
//
void *Malloc(size_t sz)
{
    char *p;

    // size_t is printed through unsigned long and the address with %p;
    // the previous "%d" and "(unsigned int)p" were wrong on 64-bit targets.
    if(showme) printf("Malloc(%lu)=", (unsigned long)sz);
    p = (char *)malloc(sz);
    if(p == NULL) {
        (void)printf("malloc(%lu) failed\n", (unsigned long)sz);
        fflush(stdout);
        exit(1);
    }
    memset(p, '\0', sz);
    if(showme) {
        printf("%p\n", (void *)p);
        fflush(stdout);
    }
    return (void *)p;
}

// typedef struct _RTL_CRITICAL_SECTION {
//     PRTL_CRITICAL_SECTION_DEBUG DebugInfo;
//     LONG LockCount;
//     LONG RecursionCount;
//     HANDLE OwningThread;        
//     HANDLE LockSemaphore;
//     ULONG_PTR SpinCount;        
// } RTL_CRITICAL_SECTION, *PRTL_CRITICAL_SECTION;

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux