Re: [PATCH] fix i386 memcpy

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tuesday 29 March 2005 23:22, Denis Vlasenko wrote:
> This patch shortens non-constant memcpy() by two bytes
> and fixes spurious out-of-line constant memcpy().
> 
> The patch is run-tested (I am running a patched kernel right now).
> 
> The benchmark and code-generation test programs will be mailed as a reply.
/* Compile with: gcc -Os -fomit-frame-pointer -falign-functions=32 */
/* results:
processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 11
model name      : Intel(R) Celeron
stepping        : 1
cpu MHz         : 1196.236
cache size      : 256 KB
movsl_X wins    : N<=3
rep_movsl wins  : N>=6
('assign' wins always at the cost of much larger code)
*/

#include <time.h>
#include <stdio.h>

/*
 * Number of 32-bit words copied per call.  Every benchmark wrapper below
 * passes N*4 as the byte count, and movsl_X() emits exactly N movsl
 * instructions.
 */
#define N 5

/*
 * MOVSLk expands to k back-to-back asm statements, each a bare "movsl"
 * with no operands and no clobbers.  The expansion has no trailing
 * semicolon; the user supplies it.
 */
#define MOVSL1 __asm__ __volatile__("movsl")
#define MOVSL2 MOVSL1;MOVSL1
#define MOVSL3 MOVSL2;MOVSL1
#define MOVSL4 MOVSL3;MOVSL1
#define MOVSL5 MOVSL4;MOVSL1
#define MOVSL6 MOVSL5;MOVSL1
#define MOVSL7 MOVSL6;MOVSL1
#define MOVSL8 MOVSL7;MOVSL1
#define MOVSL9 MOVSL8;MOVSL1
#define MOVSL10 MOVSL9;MOVSL1
#define MOVSL11 MOVSL10;MOVSL1
#define MOVSL12 MOVSL11;MOVSL1
#define MOVSL13 MOVSL12;MOVSL1
#define MOVSL14 MOVSL13;MOVSL1
#define MOVSL15 MOVSL14;MOVSL1
#define MOVSL16 MOVSL15;MOVSL1
#define MOVSL17 MOVSL16;MOVSL1
#define MOVSL18 MOVSL17;MOVSL1
#define MOVSL19 MOVSL18;MOVSL1

/*
 * Two-level expansion: MOVSL(N) first expands its argument (N -> 5) and
 * only then pastes it, yielding MOVSL5.  Pasting directly would produce
 * the undefined name MOVSLN.
 */
#define MOVSL_(n) MOVSL##n
#define MOVSL(n) MOVSL_(n)

/*
 * Copy n/4 32-bit words from 'from' to 'to' using "rep ; movsl".
 *
 * Only whole words are copied: the trailing n%4 bytes, if any, are left
 * untouched.  Returns 'to'.
 *
 * The pointers and the count are passed as read-write operands of the same
 * asm statement that executes the string instruction, so the compiler knows
 * edi/esi/ecx are both consumed and modified, and the "memory" clobber tells
 * it that memory is read and written behind its back.
 * Assumes the direction flag is clear on entry -- TODO confirm for the
 * target environment.
 */
static inline void * rep_movsl(void * to, const void * from, size_t n)
{
	void *dst = to;
	const void *src = from;
	size_t words = n / 4;

	__asm__ __volatile__(
		"rep ; movsl"
		: "+D" (dst), "+S" (src), "+c" (words)
		:
		: "memory"
	);
	return to;
}

/*
 * Copy N 32-bit words from 'from' to 'to' with N unrolled movsl
 * instructions (N is the file-level benchmark constant).
 *
 * The 'n' argument is ignored: the amount copied is fixed at N*4 bytes at
 * compile time.  Returns 'to'.
 */
static inline void * movsl_X(void * to, const void * from, size_t n)
{
	void *dst = to;
	const void *src = from;

	/*
	 * Empty asm whose only job is to make the compiler place the
	 * destination in edi and the source in esi.
	 */
	__asm__ __volatile__(
		""
		: "+D" (dst), "+S" (src)
		:
		: "memory"
	);
	/*
	 * NOTE(review): the movsl statements emitted by MOVSL() carry no
	 * operands, so they rely on edi/esi still holding the values loaded
	 * above; the compiler is not told about that dependency.
	 */
	MOVSL(N);
	return to;
}

/*
 * Copy n bytes from 'from' to 'to' using plain 32-bit assignments when n is
 * 4, 8, 12, 16 or 20; any other n is handed to rep_movsl().
 *
 * The moves go through 'unsigned int' so that each one is a 4-byte access
 * even where 'unsigned long' is wider than 32 bits (it is the same size on
 * i386, so the generated code there is unchanged).
 * Returns 'to' for the unrolled cases, otherwise whatever rep_movsl() returns.
 */
static inline void * assign(void * to, const void * from, size_t n)
{
	unsigned int *dst = to;
	const unsigned int *src = from;

	switch (n) {
		case 4:
			dst[0] = src[0];
			return to;
		case 8:
			dst[0] = src[0];
			dst[1] = src[1];
			return to;
		case 12:
			dst[0] = src[0];
			dst[1] = src[1];
			dst[2] = src[2];
			return to;
		case 16:
			dst[0] = src[0];
			dst[1] = src[1];
			dst[2] = src[2];
			dst[3] = src[3];
			return to;
		case 20:
			dst[0] = src[0];
			dst[1] = src[1];
			dst[2] = src[2];
			dst[3] = src[3];
			dst[4] = src[4];
			return to;
		default:
			return rep_movsl(to, from, n);
	}
}


/*
 * Benchmark buffers.  Note that the wrappers pass f as the destination
 * ('to') and t as the source ('from').
 */
char f[256];
char t[256];

/* Pointer aliases of the buffers, used by the "Indirect" variants. */
char *fp = f;
char *tp = t;

/* Copy N*4 bytes, addressing the global arrays directly. */
void r()
{
	rep_movsl(f, t, N*4);
}

void m()
{
	movsl_X(f, t, N*4);
}

void a()
{
	assign(f, t, N*4);
}

/* Same copies, but with the addresses loaded from fp/tp. */
void rp()
{
	rep_movsl(fp, tp, N*4);
}

void mp()
{
	movsl_X(fp, tp, N*4);
}

void ap()
{
	assign(fp, tp, N*4);
}

/*
 * Count how many times f() can be called between two consecutive changes
 * of the value returned by time(0).
 *
 * The first loop spins until the clock ticks over, which both warms the
 * cache and lines the measurement up with a tick boundary.  The second loop
 * then runs until the next tick, calling f() sixteen times per clock check.
 * Returns the number of calls made in the second loop.
 */
int measure(void (*f)()) {
    time_t tick;
    int calls;

    /* cache hot */
    for (tick = time(0); time(0) == tick; )
	f();

    calls = 0;
    for (tick = time(0); time(0) == tick; calls += 16) {
	f(); f(); f(); f(); f(); f(); f(); f();
	f(); f(); f(); f(); f(); f(); f(); f();
    }
    return calls;
}

/*
 * Run every copy variant through measure() and print the call rates, first
 * for the directly addressed global arrays, then for the pointer variants.
 */
int main() {
    int rate;

    printf("On global array:\n");
    rate = measure(r);
    printf("rep movsl(%d) per sec: %d\n", N, rate);
    rate = measure(m);
    printf("  movsl_X(%d) per sec: %d\n", N, rate);
    rate = measure(a);
    printf("   assign(%d) per sec: %d\n", N, rate);

    printf("Indirect:\n");
    rate = measure(rp);
    printf("rep movsl(%d) per sec: %d\n", N, rate);
    rate = measure(mp);
    printf("  movsl_X(%d) per sec: %d\n", N, rate);
    rate = measure(ap);
    printf("   assign(%d) per sec: %d\n", N, rate);

    return 0;
}
/* Compile with: gcc -Os -fomit-frame-pointer */
/* Check for correctness/size: objdump -r -d <file.o> | $PAGER */

/*
 * Local size_t definition: this test program includes no headers.
 * NOTE(review): 'unsigned int' presumably matches the i386 target's
 * size_t -- confirm before building for anything else.
 */
typedef unsigned int size_t;

/*
 * memcpy() body intended for a compile-time-constant length n (selected by
 * the memcpy() macro through __builtin_constant_p).
 *
 *  - n of 0..6 or 8 is done with at most two plain moves, no string ops;
 *  - otherwise whole 32-bit words are copied with "rep ; movsl" when
 *    n >= 20, or with one to four individual movsl instructions below that
 *    (which avoids clobbering ecx and is smaller);
 *  - the remaining n%4 bytes are copied with movsw and/or movsb.
 *
 * Returns 'to'.
 *
 * Every asm statement takes the running destination/source pointers as
 * read-write "D"/"S" operands and clobbers "memory".  That is what tells the
 * compiler that edi/esi must hold the current pointers at each instruction
 * and that memory is accessed; bare operand-less asm statements would only
 * work as long as the compiler happened to leave those registers alone.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
	void *edi = to;		/* running destination pointer */
	const void *esi = from;	/* running source pointer */

#if 1	/* want to do small copies with non-string ops? */
	switch (n) {
		case 0: return to;
		case 1: *(char*)to = *(const char*)from; return to;
		case 2: *(short*)to = *(const short*)from; return to;
		case 4: *(int*)to = *(const int*)from; return to;
#if 1	/* including those doable with two moves? */
		case 3: *(short*)to = *(const short*)from;
			*((char*)to+2) = *((const char*)from+2); return to;
		case 5: *(int*)to = *(const int*)from;
			*((char*)to+4) = *((const char*)from+4); return to;
		case 6: *(int*)to = *(const int*)from;
			*((short*)to+2) = *((const short*)from+2); return to;
		case 8: *(int*)to = *(const int*)from;
			*((int*)to+1) = *((const int*)from+1); return to;
#endif
	}
#else
	if (!n) return to;
#endif
	if (n >= 5*4) {
		/* large block: use rep prefix */
		size_t ecx = n/4;
		__asm__ __volatile__(
			"rep ; movsl"
			: "+c" (ecx), "+D" (edi), "+S" (esi)
			:
			: "memory"
		);
	} else {
		/* small block: don't clobber ecx + smaller code */
		if (n >= 4*4) __asm__ __volatile__("movsl"
			: "+D" (edi), "+S" (esi) : : "memory");
		if (n >= 3*4) __asm__ __volatile__("movsl"
			: "+D" (edi), "+S" (esi) : : "memory");
		if (n >= 2*4) __asm__ __volatile__("movsl"
			: "+D" (edi), "+S" (esi) : : "memory");
		if (n >= 1*4) __asm__ __volatile__("movsl"
			: "+D" (edi), "+S" (esi) : : "memory");
	}
	switch (n % 4) {
		/* tail */
		case 0: return to;
		case 1: __asm__ __volatile__("movsb"
			: "+D" (edi), "+S" (esi) : : "memory");
			return to;
		case 2: __asm__ __volatile__("movsw"
			: "+D" (edi), "+S" (esi) : : "memory");
			return to;
		default: __asm__ __volatile__("movsw\n\tmovsb"
			: "+D" (edi), "+S" (esi) : : "memory");
			return to;
	}
}

/*
 * memcpy() body for a length that is not a compile-time constant.
 *
 * Copies n/4 32-bit words with "rep ; movsl", then reloads ecx with n,
 * masks it down to n & 3 and, unless that is zero, copies the leftover
 * bytes with "rep ; movsb".  Returns 'to'.
 */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
/* Dummy outputs: they mark ecx, edi and esi as modified by the asm. */
int d0, d1, d2;
__asm__ __volatile__(
	"rep ; movsl\n\t"
	"movl %4,%%ecx\n\t"	/* %4 is the original byte count n */
	"andl $3,%%ecx\n\t"	/* ecx = number of tail bytes */
	"jz 1f\n\t"	/* pay 2 byte penalty for a chance to skip microcoded rep */
	"rep ; movsb\n\t"
	"1:"
	/* early-clobber outputs, so operand %4 is never placed in ecx/edi/esi */
	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
	/* ecx = n/4, edi = to, esi = from on entry */
	: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
	: "memory");
return (to);
}

/*
 * Dispatch on whether the length is a compile-time constant:
 * __constant_memcpy() when __builtin_constant_p(n) is true, the generic
 * __memcpy() otherwise.  Only one of the two branches is evaluated.
 */
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
 __constant_memcpy((t),(f),(n)) : \
 __memcpy((t),(f),(n)))

/*
 * Code-generation fixtures: one function per constant length (0..29 and
 * 3000), each placed in its own section "ffNN" -- presumably so that every
 * expansion of memcpy() can be looked at separately in the objdump output.
 * The functions are declared int but contain no return statement; nothing
 * in the visible code uses their value.
 */
int f00(char *a, char *b) __attribute__ ((section ("ff00"))); int f00(char *a, char *b) { memcpy(a,b,0); }
int f01(char *a, char *b) __attribute__ ((section ("ff01"))); int f01(char *a, char *b) { memcpy(a,b,1); }
int f02(char *a, char *b) __attribute__ ((section ("ff02"))); int f02(char *a, char *b) { memcpy(a,b,2); }
int f03(char *a, char *b) __attribute__ ((section ("ff03"))); int f03(char *a, char *b) { memcpy(a,b,3); }
int f04(char *a, char *b) __attribute__ ((section ("ff04"))); int f04(char *a, char *b) { memcpy(a,b,4); }
int f05(char *a, char *b) __attribute__ ((section ("ff05"))); int f05(char *a, char *b) { memcpy(a,b,5); }
int f06(char *a, char *b) __attribute__ ((section ("ff06"))); int f06(char *a, char *b) { memcpy(a,b,6); }
int f07(char *a, char *b) __attribute__ ((section ("ff07"))); int f07(char *a, char *b) { memcpy(a,b,7); }
int f08(char *a, char *b) __attribute__ ((section ("ff08"))); int f08(char *a, char *b) { memcpy(a,b,8); }
int f09(char *a, char *b) __attribute__ ((section ("ff09"))); int f09(char *a, char *b) { memcpy(a,b,9); }
int f10(char *a, char *b) __attribute__ ((section ("ff10"))); int f10(char *a, char *b) { memcpy(a,b,10); }
int f11(char *a, char *b) __attribute__ ((section ("ff11"))); int f11(char *a, char *b) { memcpy(a,b,11); }
int f12(char *a, char *b) __attribute__ ((section ("ff12"))); int f12(char *a, char *b) { memcpy(a,b,12); }
int f13(char *a, char *b) __attribute__ ((section ("ff13"))); int f13(char *a, char *b) { memcpy(a,b,13); }
int f14(char *a, char *b) __attribute__ ((section ("ff14"))); int f14(char *a, char *b) { memcpy(a,b,14); }
int f15(char *a, char *b) __attribute__ ((section ("ff15"))); int f15(char *a, char *b) { memcpy(a,b,15); }
int f16(char *a, char *b) __attribute__ ((section ("ff16"))); int f16(char *a, char *b) { memcpy(a,b,16); }
int f17(char *a, char *b) __attribute__ ((section ("ff17"))); int f17(char *a, char *b) { memcpy(a,b,17); }
int f18(char *a, char *b) __attribute__ ((section ("ff18"))); int f18(char *a, char *b) { memcpy(a,b,18); }
int f19(char *a, char *b) __attribute__ ((section ("ff19"))); int f19(char *a, char *b) { memcpy(a,b,19); }
int f20(char *a, char *b) __attribute__ ((section ("ff20"))); int f20(char *a, char *b) { memcpy(a,b,20); }
int f21(char *a, char *b) __attribute__ ((section ("ff21"))); int f21(char *a, char *b) { memcpy(a,b,21); }
int f22(char *a, char *b) __attribute__ ((section ("ff22"))); int f22(char *a, char *b) { memcpy(a,b,22); }
int f23(char *a, char *b) __attribute__ ((section ("ff23"))); int f23(char *a, char *b) { memcpy(a,b,23); }
int f24(char *a, char *b) __attribute__ ((section ("ff24"))); int f24(char *a, char *b) { memcpy(a,b,24); }
int f25(char *a, char *b) __attribute__ ((section ("ff25"))); int f25(char *a, char *b) { memcpy(a,b,25); }
int f26(char *a, char *b) __attribute__ ((section ("ff26"))); int f26(char *a, char *b) { memcpy(a,b,26); }
int f27(char *a, char *b) __attribute__ ((section ("ff27"))); int f27(char *a, char *b) { memcpy(a,b,27); }
int f28(char *a, char *b) __attribute__ ((section ("ff28"))); int f28(char *a, char *b) { memcpy(a,b,28); }
int f29(char *a, char *b) __attribute__ ((section ("ff29"))); int f29(char *a, char *b) { memcpy(a,b,29); }
int f3k(char *a, char *b) __attribute__ ((section ("ff3k"))); int f3k(char *a, char *b) { memcpy(a,b,3000); }

/*
 * Code-generation fixture: constant-length copies of 0..25 and 3000 bytes
 * issued back to back on the same pair of pointers -- presumably to inspect
 * how consecutive inlined expansions of memcpy() combine.
 * Declared int but contains no return statement; nothing in the visible
 * code uses its value.
 */
int f(char *a, char *b) {
memcpy(a,b,0);
memcpy(a,b,1); 
memcpy(a,b,2); 
memcpy(a,b,3); 
memcpy(a,b,4); 
memcpy(a,b,5); 
memcpy(a,b,6); 
memcpy(a,b,7); 
memcpy(a,b,8); 
memcpy(a,b,9); 
memcpy(a,b,10);
memcpy(a,b,11);
memcpy(a,b,12);
memcpy(a,b,13);
memcpy(a,b,14);
memcpy(a,b,15);
memcpy(a,b,16);
memcpy(a,b,17);
memcpy(a,b,18);
memcpy(a,b,19);
memcpy(a,b,20);
memcpy(a,b,21);
memcpy(a,b,22);
memcpy(a,b,23);
memcpy(a,b,24);
memcpy(a,b,25);
memcpy(a,b,3000);
}

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux