Hi all
At work I have several x86_64-based machines (Opteron, Athlon 64 and Xeon 64 CPUs)
running FC3 and SuSE 9.
I have noticed a significant (up to 5 times!) slowdown of some math
functions in /lib64/libm.so.6 from the glibc-2.3.5 shipped with FC3,
compared to the version (glibc-2.3.3) shipped with SuSE 9.1. (I do not
have any machines running FC1 or FC2, so I could not test those.)
To make this clear, I ran a simple benchmarking program (see the source at the
bottom of this message)
on two identically configured machines running SuSE 9.1 and FC3.
It basically tries to measure the time a call to a math function takes
by calling them multiple times.
Both 32 and 64 bit executables were used to show that the problem is in
the 64 bit library.
Note that there is no notable change in the performance of the 32 bit
library.
The results are 100% reproducible and invariant to compiler version and
options used.
Both double and float versions of each function were used to show the
huge difference in performance
of tan() and tanf().
Another strange thing is that in both versions of the library the 64bit
sin() function family is two times slower than the 32bit
while for all other functions the 64bit ones are faster or close to the
speed of the 32bit ones.
The following table summarizes the results (microseconds per call, i.e. the
"Mics/N" column printed by the benchmark):

                     glibc 2.3.3         glibc 2.3.5
Math function       32 bit   64 bit     32 bit   64 bit
f1 = log(f2)         0.10     0.03       0.11     0.08   !!  (3 times slower)
f1 = logf(f2)        0.10     0.03       0.11     0.09   !!  (same here)
f1 = tan(f2)         0.07     0.08       0.07     0.35   !!! (even worse, 5 times slower)
f1 = tanf(f2)        0.07     0.06       0.07     0.09   !   (this is crazy: 1.5 times slower than glibc-2.3.3 but 4 times faster than "tan()" of glibc-2.3.5)
f1 = exp(f2)         0.07     0.03       0.07     0.27   !!! (almost 4 times slower)
f1 = expf(f2)        0.07     0.03       0.07     0.27   !!! (same here)
f1 = sin(f2)         0.03     0.05       0.03     0.06   *   (this is strange: 64bit version is 2 times slower than 32bit)
f1 = sinf(f2)        0.03     0.05       0.03     0.06   *   (same here)
f1 = cos(f2)         0.05     0.06       0.05     0.06
f1 = cosf(f2)        0.05     0.06       0.05     0.06
f1 = cosl(f2)        0.05     0.06       0.06     0.06
f1 = sqrt(f2)        0.02     0.01       0.02     0.01
f1 = sqrtf(f2)       0.02     0.01       0.02     0.01
SYSTEM INFO OF THE TEST MACHINES :
System 1 : Suse 9.1 (x86-64)
galactix # uname -a
Linux galactix 2.6.4-54.5-default #1 Fri May 7 16:47:49 UTC 2004 x86_64
x86_64 x86_64 GNU/Linux
galactix # rpm -q glibc
glibc-2.3.3-63
galactix # cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 15
model : 5
model name : AMD Opteron(tm) Processor 246
stepping : 8
cpu MHz : 1992.158
cache size : 1024 KB
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov
pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext lm 3dnowext 3dnow
bogomips : 3915.77
TLB size : 1088 4K pages
clflush size : 64
address sizes : 40 bits physical, 48 bits virtual
power management: ts ttp
System 2 : FC3 (x86-64)
neron # uname -a
Linux neron.localdomain 2.6.10-1.770_FC3 #1 Thu Feb 24 18:09:38 EST 2005
x86_64
x86_64 x86_64 GNU/Linux
neron # rpm -q glibc
glibc-2.3.5-0.fc3.1
neron # cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 15
model : 5
model name : AMD Opteron(tm) Processor 246
stepping : 8
cpu MHz : 1994.595
cache size : 1024 KB
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov
pat pse36 clflush mmx fxsr sse sse2 pni syscall nx mmxext lm 3dnowext 3dnow
bogomips : 3923.96
TLB size : 1088 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 40 bits physical, 48 bits virtual
power management: ts ttp
SOURCE CODE OF BENCHMARKING PROGRAM
galactix # cat metro.c
----------------- CUT HERE --------------------
#include <stdio.h>
#include <limits.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
/* Iteration count per trial for the null-loop calibration; main() uses
   BASEN/10 iterations per trial for the math-function measurements. */
#define BASEN 50000000
/* Stability threshold used by the loop macro: a result is flagged with "?"
   when the gap between the slowest and the fastest trial exceeds this
   fraction of the mean trial time. */
#define WARNRANGE 0.4
/* One node of a singly linked stack of ints. */
struct stacknode {
    int val;                 /* value stored in this node */
    struct stacknode *next;  /* node pushed before this one, or NULL */
};
typedef struct stacknode *Stackp;

/* Top of the stack. As a file-scope object it starts out as a null pointer,
   which is also the "stack is empty" state. */
Stackp stackroot;

/*
 * push: store i in a newly allocated node on top of the stack.
 * Prints a diagnostic and exits with EXIT_FAILURE if the allocation fails,
 * rather than writing through a null pointer.
 */
void push(int i)
{
    Stackp p = malloc(sizeof *p);

    if (p == NULL) {
        fprintf(stderr, "push: out of memory\n");
        exit(EXIT_FAILURE);
    }
    p->val = i;
    p->next = stackroot;
    stackroot = p;
}

/*
 * pop: remove the top node, free it and return the value it held.
 * Prints a diagnostic and exits with EXIT_FAILURE when the stack is empty,
 * rather than reading through a null stackroot.
 */
int pop(void)
{
    Stackp p = stackroot;
    int i;

    if (p == NULL) {
        fprintf(stderr, "pop: stack is empty\n");
        exit(EXIT_FAILURE);
    }
    i = p->val;
    stackroot = p->next;
    free(p);
    return i;
}
#include <time.h>
/* jobclicks: return the current value of clock() converted to int.
   Divide a difference of two readings by CLOCKS_PER_SEC to get seconds. */
int jobclicks()
{
    clock_t ticks = clock();

    return (int) ticks;
}
/* quoted(EXPR): expand to a string literal holding the source text of EXPR,
   using the preprocessor's # (stringify) operator.
   An earlier variant expanded to "EXPR" instead -- presumably for
   preprocessors that substitute macro parameters inside string literals. */
#define quoted(EXPR) #EXPR
/*
 * loop1(BODY): run one timed trial of BODY.
 *
 * Executes BODY n times, reads the clock, prints the clicks spent since
 * loop1start, and folds that figure into sumclicks / minclicks / maxclicks.
 * loop1ctr counts the trials run and loop1start is advanced to the new
 * reading so the next trial starts from it.
 *
 * All of loop1ctr, i, n, loop1start, loop1next, thisclicks, sumclicks,
 * minclicks and maxclicks are taken from the scope where the macro is
 * expanded.  The expansion is a plain sequence of statements ending in ';'
 * so that invocations can be written back to back without separators.
 */
#define loop1(BODY) \
    loop1ctr += 1; \
    for (i = 0; i < n; ++i) { BODY; } \
    loop1next = jobclicks(); \
    thisclicks = loop1next - loop1start; \
    loop1start = loop1next; \
    printf("%7d ", thisclicks); \
    sumclicks = sumclicks + thisclicks; \
    minclicks = (thisclicks < minclicks) ? thisclicks : minclicks; \
    maxclicks = (thisclicks > maxclicks) ? thisclicks : maxclicks;
/*
 * loop(CODE): benchmark the statement CODE and print one result line.
 *
 * Prints the source text of CODE, then runs five loop1(CODE) trials (each
 * prints its own click count) and finally prints the mean cost of one
 * iteration in microseconds with basemics subtracted, followed by "?" when
 * the trials disagree too much: the max-min spread exceeds WARNRANGE times
 * the mean trial time.  The unadjusted per-iteration figure is left in
 * lastmics.
 *
 * The macro reads and writes these variables of the expanding scope:
 * minclicks, maxclicks, sumclicks, loop1ctr, loop1start, lastmics, basemics,
 * queststr, n, i0..i3, plus everything loop1 uses.
 *
 * The integer assignments to i0 between trials are unrelated to CODE;
 * presumably they vary the work done between measurements -- their result
 * is not used.
 *
 * Every line of the definition, including the lastmics expression, must end
 * in a line continuation; a missing one silently truncates the macro.
 */
#define loop(CODE) printf(" %-30s", quoted(CODE)); \
    minclicks = INT_MAX; maxclicks = -1; sumclicks = 0; \
    loop1ctr = 0; \
    loop1start = jobclicks(); \
    loop1(CODE) \
    loop1(CODE) \
    i0 = i1 + i2 + i3; \
    loop1(CODE) \
    i0 = i1 + i2 + i3 - i1 - i2 - i3; \
    loop1(CODE) \
    i0 = i1 + i2 + i3 + i1*i2 + i2*i3 + i1*i3; \
    loop1(CODE) \
    queststr = ""; \
    if (loop1ctr * (maxclicks - minclicks) > WARNRANGE * sumclicks) \
        queststr = "?"; \
    lastmics = sumclicks * 1000000.0 / ((double) CLOCKS_PER_SEC * n * loop1ctr); \
    printf("%10.2f%s\n", lastmics - basemics, queststr);
/* title(LABEL): print LABEL followed by the current iteration count n, which
   is an int taken from the expanding scope.  The expansion already ends in
   ';', so it can be used with or without a trailing semicolon. */
#define title(LABEL) printf("%s (n=%d)\n", (LABEL), n);
/* The experiment */
/* sum1: return the single argument unchanged. */
int sum1(int a)
{
    int total = a;

    return total;
}
/* sum2: return the sum of the two arguments. */
int sum2(int a, int b)
{
    int total = a;

    total += b;
    return total;
}
/* sum3: return the sum of the three arguments, added left to right. */
int sum3(int a, int b, int c)
{
    int total = a + b;

    total += c;
    return total;
}
int main()
{
int loop1start, loop1next, loop1ctr;
double lastmics, basemics;
int minclicks, maxclicks, sumclicks, thisclicks, startclicks;
int i, n, basen;
volatile int i0, i1, i2, i3, i4;
volatile float f0, f1, f2, f3;
int *v;
char *queststr;
char s[100];
char fname[20];
FILE *fp;
char s0123456789[] = "0123456789";
char sa123456789[] = "a123456789";
char s12345[] = "12345";
char s123_45[] = "123.45";
char sd[] = "%d";
char sdn[] = "%d\n";
char sf[] = "%f";
char sf62[] = "%f6.2";
setbuf(stdout, (char *) 0); /* No buffering to watch output */
printf(" Operation Clicks for each
trial ");
printf(" Mics/N\n");
startclicks = jobclicks();
basen = BASEN;
n = basen;
title("Null Loop")
i0 = i1 = i2 = i3 = 5;
f0 = f1 = f2 = f3 = 5.0;
basemics = 0.0;
loop({})
basemics = lastmics;
n = basen/10;
/* n = basen;*/
title("Math Functions");
f2 = 5.0;
loop(f1 = log(f2))
loop(f1 = logf(f2))
loop(f1 = tan(f2))
loop(f1 = tanf(f2))
loop(f1 = exp(f2))
loop(f1 = expf(f2))
loop(f1 = sin(f2))
loop(f1 = sinf(f2))
loop(f1 = cos(f2))
loop(f1 = cosf(f2))
loop(f1 = cosl(f2))
loop(f1 = sqrt(f2))
loop(f1 = sqrtf(f2))
printf("Total Seconds:%10.2f\n", ((float)
jobclicks()-startclicks) / CLOCKS_PER_SEC);
return 0;
}
--------- CUT HERE --------
Hope it's helpful
D. Angelis