FC3 x86_64 math functions slowdown

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi all

At work I have several x86_64 based (Opteron, Athlon 64, Xeon 64 CPUs) machines running FC3 and SuSE 9.

I have noticed a significant (up to 5 times !!!) slowdown of some math functions in /lib64/libm.so.6 (glibc-2.3.5) shipped with FC3 compared to the version (glibc-2.3.3) shipped with SuSE 9.1. (I do not have any machines running FC1 or FC2 to test them too.)

To make it clear I used a simple benchmarking program (see the source at the bottom of this message)
on two identically configured machines running SuSE 9.1 and FC3.

It basically tries to measure the time a call to a math function takes by calling them multiple times.

Both 32 and 64 bit executables were used to show that the problem is in the 64 bit library. Note that there is no notable change in the performance of the 32 bit library.

The results are 100% reproducible and invariant to compiler version and options used.

Both double and float versions of each function were used to show the huge difference in performance
of tan() and tanf().

Another strange thing is that in both versions of the library the 64bit sin() function family is two times slower than the 32bit while for all other functions the 64bit ones are faster or close to the speed of the 32bit ones.

The following table summarizes the results.

                     glibc 2.3.3     glibc 2.3.5
 Math function       32 bit  64 bit  32 bit  64 bit

 f1 = log(f2)        0.10    0.03    0.11    0.08 !! (3 times slower)
 f1 = logf(f2)       0.10    0.03    0.11    0.09 !! (same here)

 f1 = tan(f2)        0.07    0.08    0.07    0.35 !!! (even worse, 5 times slower)
 f1 = tanf(f2)       0.07    0.06    0.07    0.09 ! (this is crazy: 1.5 times slower than glibc-2.3.3 but 4 times faster than "tan()" of glibc-2.3.5)

 f1 = exp(f2)        0.07    0.03    0.07    0.27 !!! (almost 4 times slower)
 f1 = expf(f2)       0.07    0.03    0.07    0.27 !!! (same here)

 f1 = sin(f2)        0.03    0.05    0.03    0.06 * (this is strange: 64bit version is 2 times slower than 32bit)
 f1 = sinf(f2)       0.03    0.05    0.03    0.06 * (same here)

 f1 = cos(f2)        0.05    0.06    0.05    0.06
 f1 = cosf(f2)       0.05    0.06    0.05    0.06
 f1 = cosl(f2)       0.05    0.06    0.06    0.06

 f1 = sqrt(f2)       0.02    0.01    0.02    0.01
 f1 = sqrtf(f2)      0.02    0.01    0.02    0.01





SYSTEM INFO OF THE TEST MACHINES :

System 1 : Suse 9.1 (x86-64)

galactix # uname -a
Linux galactix 2.6.4-54.5-default #1 Fri May 7 16:47:49 UTC 2004 x86_64 x86_64 x86_64 GNU/Linux

galactix # rpm -q glibc
glibc-2.3.3-63

galactix # cat /proc/cpuinfo

processor       : 0
vendor_id       : AuthenticAMD
cpu family      : 15
model           : 5
model name      : AMD Opteron(tm) Processor 246
stepping        : 8
cpu MHz         : 1992.158
cache size      : 1024 KB
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext lm 3dnowext 3dnow
bogomips        : 3915.77
TLB size        : 1088 4K pages
clflush size    : 64
address sizes   : 40 bits physical, 48 bits virtual
power management: ts ttp

System 2 : FC3 (x86-64)

neron # uname -a
Linux neron.localdomain 2.6.10-1.770_FC3 #1 Thu Feb 24 18:09:38 EST 2005 x86_64
x86_64 x86_64 GNU/Linux

neron # rpm -q glibc
glibc-2.3.5-0.fc3.1

neron # cat /proc/cpuinfo

processor       : 0
vendor_id       : AuthenticAMD
cpu family      : 15
model           : 5
model name      : AMD Opteron(tm) Processor 246
stepping        : 8
cpu MHz         : 1994.595
cache size      : 1024 KB
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 clflush mmx fxsr sse sse2 pni syscall nx mmxext lm 3dnowext 3dnow
bogomips        : 3923.96
TLB size        : 1088 4K pages
clflush size    : 64
cache_alignment : 64
address sizes   : 40 bits physical, 48 bits virtual
power management: ts ttp



SOURCE CODE OF BENCHMARKING PROGRAM

galactix # cat metro.c

----------------- CUT HERE --------------------
#include <stdio.h>
#include <limits.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>

/* Iteration count assigned to n for the "Null Loop" baseline in main();
 * the math-function runs there use one tenth of it. */
#define BASEN 50000000
/* Spread threshold used by the loop() macro: a result is flagged with "?"
 * when (number of trials) * (max - min clicks) exceeds this fraction of
 * the summed clicks. */
#define WARNRANGE 0.4

/* One node of a singly linked stack of ints. */
struct stacknode {
       int val;                 /* payload stored by push() */
       struct stacknode *next;  /* node below this one, or NULL at the bottom */
};
typedef struct stacknode *Stackp;

/* Top of the stack; NULL while the stack is empty (file-scope objects
 * start out zero-initialized). */
Stackp  stackroot;

/*
 * Push i onto the stack headed by stackroot.
 *
 * The node is heap-allocated and released again by pop().  If the
 * allocation fails the program prints a diagnostic and exits, instead of
 * writing through a NULL pointer.
 */
void push(int i)
{
       Stackp p = malloc(sizeof *p);

       if (p == NULL) {
               fprintf(stderr, "push: out of memory\n");
               exit(EXIT_FAILURE);
       }
       p->val = i;
       p->next = stackroot;
       stackroot = p;
}

/*
 * Remove the top node of the stack and return the value it held.
 *
 * Popping an empty stack is a caller error; it is reported with a
 * diagnostic and the program exits, instead of dereferencing NULL.
 */
int pop(void)
{
       Stackp p = stackroot;
       int     i;

       if (p == NULL) {
               fprintf(stderr, "pop: stack is empty\n");
               exit(EXIT_FAILURE);
       }
       i = p->val;
       stackroot = p->next;
       free(p);
       return i;
}

#include <time.h>

/*
 * Return the current clock() reading narrowed to int.
 * Callers subtract two readings to get the clicks spent in between.
 */
int jobclicks()
{
       clock_t ticks = clock();

       return (int) ticks;
}

/* Turn the macro argument into a string literal; loop() uses it to print
 * the benchmarked statement as the label of its result row. */
#define quoted(TEXT) #TEXT
/* Alternative spelling left disabled by the author -- presumably for
 * preprocessors that substitute parameters inside string literals. */
/*#define quoted(TEXT) "TEXT"*/

/*
 * One timed trial: execute CODE n times, then record how many clock
 * clicks elapsed since loop1start.
 *
 * Expands to plain statements (no enclosing block) and relies on these
 * names being declared in the calling scope: i, n, loop1ctr, loop1start,
 * loop1next, thisclicks, sumclicks, minclicks, maxclicks.
 *
 * Effects: bumps the trial counter loop1ctr, adds the trial's clicks to
 * sumclicks, widens minclicks/maxclicks, prints the trial's clicks, and
 * makes the end of this trial the start of the next (loop1start).
 */
#define loop1(CODE) loop1ctr++; \
       for (i = 0; i < n; i++) { CODE; } \
       loop1next = jobclicks(); \
       thisclicks = loop1next - loop1start; \
       sumclicks += thisclicks; \
       if (thisclicks < minclicks) minclicks = thisclicks; \
       if (thisclicks > maxclicks) maxclicks = thisclicks; \
       printf("%7d ", loop1next - loop1start); \
       loop1start = loop1next;

/*
 * Benchmark one statement and print a result row.
 *
 * Prints the text of CODE as the row label, resets the min/max/sum click
 * statistics and the trial counter, then runs five loop1(CODE) trials.
 * The i0 assignments between trials are not part of the measured loop
 * body; their clicks are still charged to the following trial because
 * the clock is only read at the end of each loop1().
 * NOTE(review): their purpose is not evident from this file -- presumably
 * they perturb processor state between trials; confirm with the author.
 *
 * After the trials:
 *  - queststr becomes "?" when loop1ctr * (maxclicks - minclicks) exceeds
 *    WARNRANGE * sumclicks, i.e. the trials disagree noticeably;
 *  - lastmics is set to the average microseconds per execution of CODE
 *    (total clicks scaled by CLOCKS_PER_SEC, n and the trial count);
 *  - lastmics - basemics is printed, followed by queststr.
 *
 * In addition to the names loop1() needs, the calling scope must declare
 * i0, i1, i2, i3, queststr, lastmics and basemics.
 */
#define loop(CODE) printf("  %-30s", quoted(CODE)); \
       minclicks = INT_MAX; maxclicks = -1; sumclicks = 0; \
       loop1ctr = 0; \
       loop1start = jobclicks(); \
       loop1(CODE) \
       loop1(CODE) \
       i0 = i1 + i2 + i3; \
       loop1(CODE) \
       i0 = i1 + i2 + i3 - i1 - i2 - i3; \
       loop1(CODE) \
       i0 = i1 + i2 + i3 + i1*i2 + i2*i3 + i1*i3; \
       loop1(CODE) \
       queststr = ""; \
       if (loop1ctr * (maxclicks - minclicks) > WARNRANGE *  sumclicks) \
               queststr = "?"; \
lastmics = sumclicks * 1000000.0 / ((double) CLOCKS_PER_SEC * n * loop1ctr); \
       printf("%10.2f%s\n", lastmics - basemics, queststr);

/* Print a section heading together with the current iteration count n
 * (read from the calling scope).  The expansion already ends in ';', so
 * an invocation may be written with or without a trailing semicolon. */
#define title(TEXT) printf("%s (n=%d)\n", TEXT, n);

/* The experiment */

/* Return the single argument unchanged. */
int sum1(int a)
{
       int result = a;

       return result;
}
/* Return the sum of the two arguments. */
int sum2(int a, int b)
{
       int total = a;

       total += b;
       return total;
}
/* Return the sum of the three arguments, added left to right. */
int sum3(int a, int b, int c)
{
       int total = a;

       total += b;
       total += c;
       return total;
}

/*
 * Benchmark driver.
 *
 * 1. Times an empty loop body ("Null Loop") with n = BASEN iterations and
 *    stores the resulting microseconds-per-iteration in basemics; loop()
 *    subtracts basemics from every later result.
 * 2. Times each math call with n = BASEN / 10 iterations, always with the
 *    argument f2 = 5.0 and the result stored into f1.
 * 3. Prints the total processor time consumed, in seconds.
 *
 * The locals below that look unused are referenced by name from the
 * loop()/loop1()/title() macros, so their names must not change.
 *
 * Returns 0.
 */
int main(void)
{
       int     loop1start, loop1next, loop1ctr;
       double  lastmics, basemics;
       int     minclicks, maxclicks, sumclicks, thisclicks, startclicks;
       int     i, n, basen;
       /* volatile: every read and write in the timed statements must
        * really be performed. */
       volatile int    i0, i1, i2, i3;
       volatile float  f0, f1, f2, f3;
       const char      *queststr;

       setbuf(stdout, (char *) 0);     /* No buffering to watch output */
       printf(" Operation Clicks for each trial ");
       printf("   Mics/N\n");
       startclicks = jobclicks();

       basen = BASEN;
       n = basen;
       title("Null Loop")
       i0 = i1 = i2 = i3 = 5;
       f0 = f1 = f2 = f3 = 5.0;
       basemics = 0.0;         /* nothing to subtract while measuring the baseline */
       loop({})
       basemics = lastmics;

       /* The math calls cost more than the empty loop, so run fewer
        * iterations of them. */
       n = basen/10;
       title("Math Functions");
       f2 = 5.0;
       /* f1 and f2 are float: the double and long double variants receive
        * a converted argument and their result is narrowed back to float. */
       loop(f1 = log(f2))
       loop(f1 = logf(f2))
       loop(f1 = tan(f2))
       loop(f1 = tanf(f2))
       loop(f1 = exp(f2))
       loop(f1 = expf(f2))
       loop(f1 = sin(f2))
       loop(f1 = sinf(f2))
       loop(f1 = cos(f2))
       loop(f1 = cosf(f2))
       loop(f1 = cosl(f2))
       loop(f1 = sqrt(f2))
       loop(f1 = sqrtf(f2))

       /* Subtract as int and scale in double: float cannot represent
        * click counts above 2^24 exactly. */
       printf("Total Seconds:%10.2f\n",
              (double) (jobclicks() - startclicks) / CLOCKS_PER_SEC);
       return 0;
}
--------- CUT HERE --------


Hope it's helpful

  D. Angelis


[Index of Archives]     [Current Fedora Users]     [Fedora Desktop]     [Fedora SELinux]     [Yosemite News]     [Yosemite Photos]     [KDE Users]     [Fedora Tools]     [Fedora Docs]

  Powered by Linux