Re: [BUG mm] "fixed" i386 memcpy inlining buggy

On Tuesday 05 April 2005 19:34, Christophe Saout wrote:
> the new i386 memcpy macro is a ticking timebomb.
> 
> I've been debugging a new mISDN crash, just to find out that a memcpy
> was not inlined correctly.
> 
> Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).

Updated patch against 2.6.11 follows. This one, like the original
patch, has been run-tested too.

This time I took no chances: esi/edi contents are
explicitly propagated from one asm() block to the next.
I didn't do that before, not expecting that gcc could be
soooo incredibly clever. Sorry.

Christophe, does this one look/compile OK?
--
vda

--- linux-2.6.11.src/include/asm-i386/string.h.orig	Thu Mar  3 09:31:08 2005
+++ linux-2.6.11.src/include/asm-i386/string.h	Wed Apr  6 19:08:39 2005
@@ -198,47 +198,80 @@ static inline void * __memcpy(void * to,
 int d0, d1, d2;
 __asm__ __volatile__(
 	"rep ; movsl\n\t"
-	"testb $2,%b4\n\t"
-	"je 1f\n\t"
-	"movsw\n"
-	"1:\ttestb $1,%b4\n\t"
-	"je 2f\n\t"
-	"movsb\n"
-	"2:"
+	"movl %4,%%ecx\n\t"
+	"andl $3,%%ecx\n\t"
+#if 1	/* want to pay 2 byte penalty for a chance to skip microcoded rep? */
+	"jz 1f\n\t"
+#endif
+	"rep ; movsb\n\t"
+	"1:"
 	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-	:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
+	: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
 	: "memory");
 return (to);
 }
 
 /*
- * This looks horribly ugly, but the compiler can optimize it totally,
+ * This looks ugly, but the compiler can optimize it totally,
  * as the count is constant.
  */
 static inline void * __constant_memcpy(void * to, const void * from, size_t n)
 {
-	if (n <= 128)
-		return __builtin_memcpy(to, from, n);
-
-#define COMMON(x) \
-__asm__ __volatile__( \
-	"rep ; movsl" \
-	x \
-	: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
-	: "0" (n/4),"1" ((long) to),"2" ((long) from) \
-	: "memory");
-{
-	int d0, d1, d2;
+	long esi, edi;
+	if (!n) return to;
+#if 1	/* want to do small copies with non-string ops? */
+	switch (n) {
+		case 1: *(char*)to = *(char*)from; return to;
+		case 2: *(short*)to = *(short*)from; return to;
+		case 4: *(int*)to = *(int*)from; return to;
+#if 1	/* including those doable with two moves? */
+		case 3: *(short*)to = *(short*)from;
+			*((char*)to+2) = *((char*)from+2); return to;
+		case 5: *(int*)to = *(int*)from;
+			*((char*)to+4) = *((char*)from+4); return to;
+		case 6: *(int*)to = *(int*)from;
+			*((short*)to+2) = *((short*)from+2); return to;
+		case 8: *(int*)to = *(int*)from;
+			*((int*)to+1) = *((int*)from+1); return to;
+#endif
+	}
+#endif
+	esi = (long) from;
+	edi = (long) to;
+	if (n >= 5*4) {
+		/* large block: use rep prefix */
+		int ecx;
+		__asm__ __volatile__(
+			"rep ; movsl"
+			: "=&c" (ecx), "=&D" (edi), "=&S" (esi)
+			: "0" (n/4), "1" (edi),"2" (esi)
+			: "memory"
+		);
+	} else {
+		/* small block: don't clobber ecx + smaller code */
+		if (n >= 4*4) __asm__ __volatile__("movsl"
+			:"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+		if (n >= 3*4) __asm__ __volatile__("movsl"
+			:"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+		if (n >= 2*4) __asm__ __volatile__("movsl"
+			:"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+		if (n >= 1*4) __asm__ __volatile__("movsl"
+			:"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+	}
 	switch (n % 4) {
-		case 0: COMMON(""); return to;
-		case 1: COMMON("\n\tmovsb"); return to;
-		case 2: COMMON("\n\tmovsw"); return to;
-		default: COMMON("\n\tmovsw\n\tmovsb"); return to;
+		/* tail */
+		case 0: return to;
+		case 1: __asm__ __volatile__("movsb"
+			:"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+			return to;
+		case 2: __asm__ __volatile__("movsw"
+			:"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+			return to;
+		default: __asm__ __volatile__("movsw\n\tmovsb"
+			:"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+			return to;
 	}
 }
-  
-#undef COMMON
-}
 
 #define __HAVE_ARCH_MEMCPY

References:
- Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
  - From: Denis Vlasenko <[email protected]>
- [BUG mm] "fixed" i386 memcpy inlining buggy
  - From: Christophe Saout <[email protected]>

Prev by Date: Re: Kernel SCM saga..
Next by Date: return value of ptep_get_and_clear
Previous by thread: Re: [BUG mm] "fixed" i386 memcpy inlining buggy
Next by thread: Delay in a tasklet.
Index(es):
- Date
- Thread

[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]