[PATCH] fix i386 memcpy

This patch shortens the non-constant memcpy() by two bytes
and stops constant-size memcpy() from spuriously being emitted
as an out-of-line call.
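
To illustrate the second fix: the old code routed every constant size
up to 128 bytes through __builtin_memcpy(), which gcc may fall back to
compiling as an out-of-line call to memcpy(). A minimal sketch of the
kind of call site affected (function name is made up, assuming memcpy()
expands to __constant_memcpy() from this header):

	/* constant n == 6: with the patch this is one 32-bit move plus
	 * one 16-bit move (case 6 in the switch below) instead of
	 * risking a call to the out-of-line memcpy() */
	void copy6(void *to, const void *from)
	{
		memcpy(to, from, 6);
	}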

The patch is run-tested (I am running the patched kernel right now).

A benchmark and code-generation test program will be mailed as a reply.

# size vmlinux.org vmlinux
   text    data     bss     dec     hex filename
3954591 1553426  236544 5744561  57a7b1 vmlinux.org
3952615 1553426  236544 5742585  579ff9 vmlinux

Example of changes (part of dump_fpu() body):
old.............................................. new......................
8d 83 40 02 00 00 lea    0x240(%ebx),%eax         8d b3 40 02 00 00 lea    0x240(%ebx),%esi
74 31             je     c0108b27 <dump_fpu+0x9c> 74 2e             je     c0108b1d <dump_fpu+0x92>
6a 1c             push   $0x1c                    8b 7d 0c          mov    0xc(%ebp),%edi
50                push   %eax                     b9 07 00 00 00    mov    $0x7,%ecx
56                push   %esi                     f3 a5             repz movsl %ds:(%esi),%es:(%edi)
e8 49 21 10 00    call   c020ac48 <memcpy>        8b 55 0c          mov    0xc(%ebp),%edx
83 c4 0c          add    $0xc,%esp                83 c2 1c          add    $0x1c,%edx
83 c6 1c          add    $0x1c,%esi               8d 83 60 02 00 00 lea    0x260(%ebx),%eax
81 c3 60 02 00 00 add    $0x260,%ebx              b9 07 00 00 00    mov    $0x7,%ecx
bf 07 00 00 00    mov    $0x7,%edi                89 d7             mov    %edx,%edi
6a 0a             push   $0xa                     89 c6             mov    %eax,%esi
53                push   %ebx                     a5                movsl  %ds:(%esi),%es:(%edi)
56                push   %esi                     a5                movsl  %ds:(%esi),%es:(%edi)
e8 2f 21 10 00    call   c020ac48 <memcpy>        66 a5             movsw  %ds:(%esi),%es:(%edi)
83 c4 0c          add    $0xc,%esp                83 c2 0a          add    $0xa,%edx
83 c6 0a          add    $0xa,%esi                83 c0 10          add    $0x10,%eax
83 c3 10          add    $0x10,%ebx               49                dec    %ecx
4f                dec    %edi                     79 ef             jns    c0108b0a <dump_fpu+0x7f>
79 eb             jns    c0108b10 <dump_fpu+0x85> eb 0a             jmp    c0108b27 <dump_fpu+0x9c>
eb 0c             jmp    c0108b33 <dump_fpu+0xa8> 8b 7d 0c          mov    0xc(%ebp),%edi
6a 6c             push   $0x6c                    b9 1b 00 00 00    mov    $0x1b,%ecx
50                push   %eax                     f3 a5             repz movsl %ds:(%esi),%es:(%edi)
56                push   %esi                     8b 45 f0          mov    0xfffffff0(%ebp),%eax
e8 18 21 10 00    call   c020ac48 <memcpy>        5a                pop    %edx
83 c4 0c          add    $0xc,%esp                5b                pop    %ebx
8b 45 f0          mov    0xfffffff0(%ebp),%eax
8d 65 f4          lea    0xfffffff4(%ebp),%esp
5b                pop    %ebx
5e                pop    %esi
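
The constant sizes visible in the new column are 0x1c, 0x0a and 0x6c
bytes. A rough sketch of equivalent call sites and what the patched
__constant_memcpy() turns them into (function names are made up):

	/* 0x1c = 28 = 7*4 bytes, >= 5 dwords: mov $0x7,%ecx; rep movsl */
	void copy28(void *to, const void *from) { memcpy(to, from, 0x1c); }

	/* 0x0a = 10 bytes, < 5 dwords: movsl; movsl; movsw (small block,
	 * no ecx clobber) */
	void copy10(void *to, const void *from) { memcpy(to, from, 0x0a); }

	/* 0x6c = 108 = 27*4 bytes: mov $0x1b,%ecx; rep movsl */
	void copy108(void *to, const void *from) { memcpy(to, from, 0x6c); }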

--
vda
--- linux-2.6.11.src/include/asm-i386/string.h.orig	Thu Mar  3 09:31:08 2005
+++ linux-2.6.11.src/include/asm-i386/string.h	Tue Mar 29 22:05:00 2005
@@ -198,46 +198,75 @@ static inline void * __memcpy(void * to,
 int d0, d1, d2;
 __asm__ __volatile__(
 	"rep ; movsl\n\t"
-	"testb $2,%b4\n\t"
-	"je 1f\n\t"
-	"movsw\n"
-	"1:\ttestb $1,%b4\n\t"
-	"je 2f\n\t"
-	"movsb\n"
-	"2:"
+	"movl %4,%%ecx\n\t"
+	"andl $3,%%ecx\n\t"
+	"jz 1f\n\t"	/* pay 2 byte penalty for a chance to skip microcoded rep */
+	"rep ; movsb\n\t"
+	"1:"
 	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-	:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
+	: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
 	: "memory");
 return (to);
 }
 
 /*
- * This looks horribly ugly, but the compiler can optimize it totally,
+ * This looks ugly, but the compiler can optimize it totally,
  * as the count is constant.
  */
 static inline void * __constant_memcpy(void * to, const void * from, size_t n)
 {
-	if (n <= 128)
-		return __builtin_memcpy(to, from, n);
-
-#define COMMON(x) \
-__asm__ __volatile__( \
-	"rep ; movsl" \
-	x \
-	: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
-	: "0" (n/4),"1" ((long) to),"2" ((long) from) \
-	: "memory");
-{
-	int d0, d1, d2;
+#if 1	/* want to do small copies with non-string ops? */
+	switch (n) {
+		case 0: return to;
+		case 1: *(char*)to = *(char*)from; return to;
+		case 2: *(short*)to = *(short*)from; return to;
+		case 4: *(int*)to = *(int*)from; return to;
+#if 1	/* including those doable with two moves? */
+		case 3: *(short*)to = *(short*)from;
+			*((char*)to+2) = *((char*)from+2); return to;
+		case 5: *(int*)to = *(int*)from;
+			*((char*)to+4) = *((char*)from+4); return to;
+		case 6: *(int*)to = *(int*)from;
+			*((short*)to+2) = *((short*)from+2); return to;
+		case 8: *(int*)to = *(int*)from;
+			*((int*)to+1) = *((int*)from+1); return to;
+#endif
+	}
+#else
+	if (!n) return to;
+#endif
+	{
+		/* load esi/edi */
+		int esi, edi;
+		__asm__ __volatile__(
+			""
+			: "=&D" (edi), "=&S" (esi)
+			: "0" ((long) to),"1" ((long) from)
+			: "memory"
+		);
+	}
+	if (n >= 5*4) {
+		/* large block: use rep prefix */
+		int ecx;
+		__asm__ __volatile__(
+			"rep ; movsl"
+			: "=&c" (ecx)
+			: "0" (n/4)
+		);
+	} else {
+		/* small block: don't clobber ecx + smaller code */
+		if (n >= 4*4) __asm__ __volatile__("movsl");
+		if (n >= 3*4) __asm__ __volatile__("movsl");
+		if (n >= 2*4) __asm__ __volatile__("movsl");
+		if (n >= 1*4) __asm__ __volatile__("movsl");
+	}
 	switch (n % 4) {
-		case 0: COMMON(""); return to;
-		case 1: COMMON("\n\tmovsb"); return to;
-		case 2: COMMON("\n\tmovsw"); return to;
-		default: COMMON("\n\tmovsw\n\tmovsb"); return to;
+		/* tail */
+		case 0: return to;
+		case 1: __asm__ __volatile__("movsb"); return to;
+		case 2: __asm__ __volatile__("movsw"); return to;
+		default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
 	}
-}
-  
-#undef COMMON
 }
 
 #define __HAVE_ARCH_MEMCPY
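
Not the benchmark mentioned above, just a quick user-space sanity check
of the new non-constant tail handling (the asm is copied verbatim from
the patch; the harness around it is made up):

	/* gcc -O2 -m32 memcpy-test.c */
	#include <stdio.h>
	#include <string.h>

	static inline void *test_memcpy(void *to, const void *from, size_t n)
	{
		int d0, d1, d2;
		__asm__ __volatile__(
			"rep ; movsl\n\t"
			"movl %4,%%ecx\n\t"
			"andl $3,%%ecx\n\t"
			"jz 1f\n\t"
			"rep ; movsb\n\t"
			"1:"
			: "=&c" (d0), "=&D" (d1), "=&S" (d2)
			: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
			: "memory");
		return to;
	}

	int main(void)
	{
		char src[16] = "0123456789abcde", dst[16];
		size_t n;

		for (n = 0; n <= 15; n++) {	/* covers every n%4 tail */
			memset(dst, 0, sizeof(dst));
			test_memcpy(dst, src, n);
			if (memcmp(dst, src, n))
				printf("FAIL at n=%u\n", (unsigned) n);
		}
		printf("done\n");
		return 0;
	}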
