working around gcc bogons (was: Re: Whirlpool oopses in 2.6.11 and 2.6.12-rc2)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[resending to lkml for wider audience]

> > > modprobe tcrypt hangs the box on both kernels.
> > > The last printks are:
> > > 
> > > <wp256 test runs ok>
> > > 
> > > testing wp384
> > > NN<n>Unable to handle kernel paging request at virtual address eXXXXXXX
> > > 
> > > Nothing is printed after this and system locks up solid.
> > > No Sysrq-B.
> > > 
> > > IIRC, 2.6.9 was okay.
> > 
> > Update: it does not oops on another machine. CPU or .config related,
> > I'll look into it...
> 
> Any update?  This is candidate for -stable fixing if it's an actual bug.

Yes. wp512_process_buffer() was using 3k of stack if compiled with -O2.
The appenede wp512.c (sans table at top) is instrumented to show it.
Use "make crypto/wp512.s".

The meat of the matter is this:

		L[0]  = C0[BYTE7(K[0])] XEND
		X(L[0]) C1[BYTE6(K[7])] XEND
		X(L[0]) C2[BYTE5(K[6])] XEND
		X(L[0]) C3[BYTE4(K[5])] XEND
		X(L[0]) C4[BYTE3(K[4])] XEND
		X(L[0]) C5[BYTE2(K[3])] XEND
		X(L[0]) C6[BYTE1(K[2])] XEND
		X(L[0]) C7[BYTE0(K[1])] XEND
		X(L[0]) rc[r];

It can be done either as v = c1 ^ c2 ^ c3 ^ c4;
or as v = c1; v ^= c2; v ^= c3; v ^= c4; macros
X and XEND allow to test both ways.

gcc 3.2.3 (IIRC, the box is at home) eats
~3k of stack for first method. gcc 3.4.1
seems to do better, but gcc 3.2.3 is in wide
use.

There is more.

#define BYTE7(v) ((u8)((v) >> 56))
gcc produce full u32 load, shift and zero extend
for this one.

#define BYTE7(v) (((u8*)&v)[7])
gcc does simple byte load.

There is even more.

#define BYTE3(v) ((u8)((u32)(v) >> 24))
                       ^^^^^
Without this, gcc produces:
        shrdl   $8, %edx, %eax <=== not needed
        andl    $255, %eax

This is all seen on i386 only. I expect this to be different
on each arch.

I'd like to generate good code, yet without heavy tailoring
for gcc versions and CPU architectures.

What shall I do?
--
vda

/**
 * The core Whirlpool transform.
 */

static void wp512_process_buffer(struct wp512_ctx *wctx) {
	int i, r;
	u64 K[8];        /* the round key */
	u64 block[8];    /* mu(buffer) */
	u64 state[8];    /* the cipher state */
	u64 L[8];

	for (i = 0; i < 8; i++) {
		block[i] = be64_to_cpu( ((__be64*)wctx->buffer)[i] );
	}

	state[0] = block[0] ^ (K[0] = wctx->hash[0]);
	state[1] = block[1] ^ (K[1] = wctx->hash[1]);
	state[2] = block[2] ^ (K[2] = wctx->hash[2]);
	state[3] = block[3] ^ (K[3] = wctx->hash[3]);
	state[4] = block[4] ^ (K[4] = wctx->hash[4]);
	state[5] = block[5] ^ (K[5] = wctx->hash[5]);
	state[6] = block[6] ^ (K[6] = wctx->hash[6]);
	state[7] = block[7] ^ (K[7] = wctx->hash[7]);


// gcc optimizer bug: first method is noticeably
// worse than second: loads full u32, shifts and
// zero-extends low u8 to u32
#if 0
 #define BYTE7(v) ((u8)((v) >> 56))
 #define BYTE6(v) ((u8)((v) >> 48))
 #define BYTE5(v) ((u8)((v) >> 40))
 #define BYTE4(v) ((u8)((v) >> 32))
 // gcc optimizer bug: without (u32) below will emit
 // spurious shrd insns
 #define BYTE3(v) ((u8)((u32)(v) >> 24))
 #define BYTE2(v) ((u8)((u32)(v) >> 16))
 #define BYTE1(v) ((u8)((u32)(v) >>  8))
 #define BYTE0(v) ((u8)(v))
#else
// little-endian
 #define BYTE7(v) (((u8*)&v)[7])
 #define BYTE6(v) (((u8*)&v)[6])
 #define BYTE5(v) (((u8*)&v)[5])
 #define BYTE4(v) (((u8*)&v)[4])
 #define BYTE3(v) (((u8*)&v)[3])
 #define BYTE2(v) (((u8*)&v)[2])
 #define BYTE1(v) (((u8*)&v)[1])
 #define BYTE0(v) (((u8*)&v)[0])
#endif

// gcc -O2 optimizer bug: second method
// causes excessive spills (~3K stack used)
#if 1
 #define X(a) a ^=
 #define XEND ;
#else
 #define X(a) ^
 #define XEND
#endif
	for (r = 1; r <= WHIRLPOOL_ROUNDS; r++) {
asm("#1");
		L[0]  = C0[BYTE7(K[0])] XEND
		X(L[0]) C1[BYTE6(K[7])] XEND
		X(L[0]) C2[BYTE5(K[6])] XEND
		X(L[0]) C3[BYTE4(K[5])] XEND
		X(L[0]) C4[BYTE3(K[4])] XEND
		X(L[0]) C5[BYTE2(K[3])] XEND
		X(L[0]) C6[BYTE1(K[2])] XEND
		X(L[0]) C7[BYTE0(K[1])] XEND
		X(L[0]) rc[r];
asm("#2");

		L[1]  = C0[BYTE7(K[1])] XEND
		X(L[1]) C1[BYTE6(K[0])] XEND
		X(L[1]) C2[BYTE5(K[7])] XEND
		X(L[1]) C3[BYTE4(K[6])] XEND
		X(L[1]) C4[BYTE3(K[5])] XEND
		X(L[1]) C5[BYTE2(K[4])] XEND
		X(L[1]) C6[BYTE1(K[3])] XEND
		X(L[1]) C7[BYTE0(K[2])];

		L[2]  = C0[BYTE7(K[2])] XEND
		X(L[2]) C1[BYTE6(K[1])] XEND
		X(L[2]) C2[BYTE5(K[0])] XEND
		X(L[2]) C3[BYTE4(K[7])] XEND
		X(L[2]) C4[BYTE3(K[6])] XEND
		X(L[2]) C5[BYTE2(K[5])] XEND
		X(L[2]) C6[BYTE1(K[4])] XEND
		X(L[2]) C7[BYTE0(K[3])];

		L[3]  = C0[BYTE7(K[3])] XEND
		X(L[3]) C1[BYTE6(K[2])] XEND
		X(L[3]) C2[BYTE5(K[1])] XEND
		X(L[3]) C3[BYTE4(K[0])] XEND
		X(L[3]) C4[BYTE3(K[7])] XEND
		X(L[3]) C5[BYTE2(K[6])] XEND
		X(L[3]) C6[BYTE1(K[5])] XEND
		X(L[3]) C7[BYTE0(K[4])];

		L[4]  = C0[BYTE7(K[4])] XEND
		X(L[4]) C1[BYTE6(K[3])] XEND
		X(L[4]) C2[BYTE5(K[2])] XEND
		X(L[4]) C3[BYTE4(K[1])] XEND
		X(L[4]) C4[BYTE3(K[0])] XEND
		X(L[4]) C5[BYTE2(K[7])] XEND
		X(L[4]) C6[BYTE1(K[6])] XEND
		X(L[4]) C7[BYTE0(K[5])];

		L[5]  = C0[BYTE7(K[5])] XEND
		X(L[5]) C1[BYTE6(K[4])] XEND
		X(L[5]) C2[BYTE5(K[3])] XEND
		X(L[5]) C3[BYTE4(K[2])] XEND
		X(L[5]) C4[BYTE3(K[1])] XEND
		X(L[5]) C5[BYTE2(K[0])] XEND
		X(L[5]) C6[BYTE1(K[7])] XEND
		X(L[5]) C7[BYTE0(K[6])];

		L[6]  = C0[BYTE7(K[6])] XEND
		X(L[6]) C1[BYTE6(K[5])] XEND
		X(L[6]) C2[BYTE5(K[4])] XEND
		X(L[6]) C3[BYTE4(K[3])] XEND
		X(L[6]) C4[BYTE3(K[2])] XEND
		X(L[6]) C5[BYTE2(K[1])] XEND
		X(L[6]) C6[BYTE1(K[0])] XEND
		X(L[6]) C7[BYTE0(K[7])];

		L[7]  = C0[BYTE7(K[7])] XEND
		X(L[7]) C1[BYTE6(K[6])] XEND
		X(L[7]) C2[BYTE5(K[5])] XEND
		X(L[7]) C3[BYTE4(K[4])] XEND
		X(L[7]) C4[BYTE3(K[3])] XEND
		X(L[7]) C5[BYTE2(K[2])] XEND
		X(L[7]) C6[BYTE1(K[1])] XEND
		X(L[7]) C7[BYTE0(K[0])];

		K[0] = L[0];
		K[1] = L[1];
		K[2] = L[2];
		K[3] = L[3];
		K[4] = L[4];
		K[5] = L[5];
		K[6] = L[6];
		K[7] = L[7];

		L[0]  = C0[BYTE7(state[0])] XEND
		X(L[0]) C1[BYTE6(state[7])] XEND
		X(L[0]) C2[BYTE5(state[6])] XEND
		X(L[0]) C3[BYTE4(state[5])] XEND
		X(L[0]) C4[BYTE3(state[4])] XEND
		X(L[0]) C5[BYTE2(state[3])] XEND
		X(L[0]) C6[BYTE1(state[2])] XEND
		X(L[0]) C7[BYTE0(state[1])] XEND
		X(L[0]) K[0];

		L[1]  = C0[BYTE7(state[1])] XEND
		X(L[1]) C1[BYTE6(state[0])] XEND
		X(L[1]) C2[BYTE5(state[7])] XEND
		X(L[1]) C3[BYTE4(state[6])] XEND
		X(L[1]) C4[BYTE3(state[5])] XEND
		X(L[1]) C5[BYTE2(state[4])] XEND
		X(L[1]) C6[BYTE1(state[3])] XEND
		X(L[1]) C7[BYTE0(state[2])] XEND
		X(L[1]) K[1];

		L[2]  = C0[BYTE7(state[2])] XEND
		X(L[2]) C1[BYTE6(state[1])] XEND
		X(L[2]) C2[BYTE5(state[0])] XEND
		X(L[2]) C3[BYTE4(state[7])] XEND
		X(L[2]) C4[BYTE3(state[6])] XEND
		X(L[2]) C5[BYTE2(state[5])] XEND
		X(L[2]) C6[BYTE1(state[4])] XEND
		X(L[2]) C7[BYTE0(state[3])] XEND
		X(L[2]) K[2];

		L[3]  = C0[BYTE7(state[3])] XEND
		X(L[3]) C1[BYTE6(state[2])] XEND
		X(L[3]) C2[BYTE5(state[1])] XEND
		X(L[3]) C3[BYTE4(state[0])] XEND
		X(L[3]) C4[BYTE3(state[7])] XEND
		X(L[3]) C5[BYTE2(state[6])] XEND
		X(L[3]) C6[BYTE1(state[5])] XEND
		X(L[3]) C7[BYTE0(state[4])] XEND
		X(L[3]) K[3];

		L[4]  = C0[BYTE7(state[4])] XEND
		X(L[4]) C1[BYTE6(state[3])] XEND
		X(L[4]) C2[BYTE5(state[2])] XEND
		X(L[4]) C3[BYTE4(state[1])] XEND
		X(L[4]) C4[BYTE3(state[0])] XEND
		X(L[4]) C5[BYTE2(state[7])] XEND
		X(L[4]) C6[BYTE1(state[6])] XEND
		X(L[4]) C7[BYTE0(state[5])] XEND
		X(L[4]) K[4];

		L[5]  = C0[BYTE7(state[5])] XEND
		X(L[5]) C1[BYTE6(state[4])] XEND
		X(L[5]) C2[BYTE5(state[3])] XEND
		X(L[5]) C3[BYTE4(state[2])] XEND
		X(L[5]) C4[BYTE3(state[1])] XEND
		X(L[5]) C5[BYTE2(state[0])] XEND
		X(L[5]) C6[BYTE1(state[7])] XEND
		X(L[5]) C7[BYTE0(state[6])] XEND
		X(L[5]) K[5];

		L[6]  = C0[BYTE7(state[6])] XEND
		X(L[6]) C1[BYTE6(state[5])] XEND
		X(L[6]) C2[BYTE5(state[4])] XEND
		X(L[6]) C3[BYTE4(state[3])] XEND
		X(L[6]) C4[BYTE3(state[2])] XEND
		X(L[6]) C5[BYTE2(state[1])] XEND
		X(L[6]) C6[BYTE1(state[0])] XEND
		X(L[6]) C7[BYTE0(state[7])] XEND
		X(L[6]) K[6];

		L[7]  = C0[BYTE7(state[7])] XEND
		X(L[7]) C1[BYTE6(state[6])] XEND
		X(L[7]) C2[BYTE5(state[5])] XEND
		X(L[7]) C3[BYTE4(state[4])] XEND
		X(L[7]) C4[BYTE3(state[3])] XEND
		X(L[7]) C5[BYTE2(state[2])] XEND
		X(L[7]) C6[BYTE1(state[1])] XEND
		X(L[7]) C7[BYTE0(state[0])] XEND
		X(L[7]) K[7];

		state[0] = L[0];
		state[1] = L[1];
		state[2] = L[2];
		state[3] = L[3];
		state[4] = L[4];
		state[5] = L[5];
		state[6] = L[6];
		state[7] = L[7];
	}
	/*
	* apply the Miyaguchi-Preneel compression function:
	*/
	wctx->hash[0] ^= state[0] ^ block[0];
	wctx->hash[1] ^= state[1] ^ block[1];
	wctx->hash[2] ^= state[2] ^ block[2];
	wctx->hash[3] ^= state[3] ^ block[3];
	wctx->hash[4] ^= state[4] ^ block[4];
	wctx->hash[5] ^= state[5] ^ block[5];
	wctx->hash[6] ^= state[6] ^ block[6];
	wctx->hash[7] ^= state[7] ^ block[7];
}

static void wp512_init(void *ctx) {
	int i;
	struct wp512_ctx *wctx = ctx;

	memset(wctx->bitLength, 0, 32);
	wctx->bufferBits = wctx->bufferPos = 0;
	wctx->buffer[0] = 0;
	for (i = 0; i < 8; i++) {
		wctx->hash[i] = 0L;
	}
}

static void wp512_update(void *ctx, const u8 *source, unsigned int len)
{

	struct wp512_ctx *wctx = ctx;
	int sourcePos    = 0;
	unsigned int bits_len = len * 8; // convert to number of bits
	int sourceGap    = (8 - ((int)bits_len & 7)) & 7;
	int bufferRem    = wctx->bufferBits & 7;
	int i;
	u32 b, carry;
	u8 *buffer       = wctx->buffer;
	u8 *bitLength    = wctx->bitLength;
	int bufferBits   = wctx->bufferBits;
	int bufferPos    = wctx->bufferPos;

	u64 value = bits_len;
	for (i = 31, carry = 0; i >= 0 && (carry != 0 || value != 0ULL); i--) {
		carry += bitLength[i] + ((u32)value & 0xff);
		bitLength[i] = (u8)carry;
		carry >>= 8;
		value >>= 8;
	}
	while (bits_len > 8) {
		b = ((source[sourcePos] << sourceGap) & 0xff) |
		((source[sourcePos + 1] & 0xff) >> (8 - sourceGap));
		buffer[bufferPos++] |= (u8)(b >> bufferRem);
		bufferBits += 8 - bufferRem;
		if (bufferBits == WP512_BLOCK_SIZE * 8) {
			wp512_process_buffer(wctx);
			bufferBits = bufferPos = 0;
		}
		buffer[bufferPos] = b << (8 - bufferRem);
		bufferBits += bufferRem;
		bits_len -= 8;
		sourcePos++;
	}
	if (bits_len > 0) {
		b = (source[sourcePos] << sourceGap) & 0xff;
		buffer[bufferPos] |= b >> bufferRem;
	} else {
		b = 0;
	}
	if (bufferRem + bits_len < 8) {
		bufferBits += bits_len;
	} else {
		bufferPos++;
		bufferBits += 8 - bufferRem;
		bits_len -= 8 - bufferRem;
		if (bufferBits == WP512_BLOCK_SIZE * 8) {
			wp512_process_buffer(wctx);
			bufferBits = bufferPos = 0;
		}
		buffer[bufferPos] = b << (8 - bufferRem);
		bufferBits += (int)bits_len;
	}

	wctx->bufferBits   = bufferBits;
	wctx->bufferPos    = bufferPos;
}

static void wp512_final(void *ctx, u8 *out)
{
	struct wp512_ctx *wctx = ctx;
	int i;
   	u8 *buffer      = wctx->buffer;
   	u8 *bitLength   = wctx->bitLength;
   	int bufferBits  = wctx->bufferBits;
   	int bufferPos   = wctx->bufferPos;

   	buffer[bufferPos] |= 0x80U >> (bufferBits & 7);
   	bufferPos++;
   	if (bufferPos > WP512_BLOCK_SIZE - WP512_LENGTHBYTES) {
   		if (bufferPos < WP512_BLOCK_SIZE) {
	   	memset(&buffer[bufferPos], 0, WP512_BLOCK_SIZE - bufferPos);
   		}
   		wp512_process_buffer(wctx);
   		bufferPos = 0;
   	}
   	if (bufferPos < WP512_BLOCK_SIZE - WP512_LENGTHBYTES) {
   		memset(&buffer[bufferPos], 0,
			  (WP512_BLOCK_SIZE - WP512_LENGTHBYTES) - bufferPos);
   	}
   	bufferPos = WP512_BLOCK_SIZE - WP512_LENGTHBYTES;
   	memcpy(&buffer[WP512_BLOCK_SIZE - WP512_LENGTHBYTES],
		   bitLength, WP512_LENGTHBYTES);
   	wp512_process_buffer(wctx);
   	for (i = 0; i < WP512_DIGEST_SIZE/8; i++) {
		((__be64*)out)[i] = cpu_to_be64(wctx->hash[i]);
   	}
   	wctx->bufferBits   = bufferBits;
   	wctx->bufferPos    = bufferPos;
}

static void wp384_final(void *ctx, u8 *out)
{
	struct wp512_ctx *wctx = ctx;
	u8 D[64];

	wp512_final (wctx, D);
	memcpy (out, D, WP384_DIGEST_SIZE);
	memset (D, 0, WP512_DIGEST_SIZE);
}

static void wp256_final(void *ctx, u8 *out)
{
	struct wp512_ctx *wctx = ctx;
	u8 D[64];

	wp512_final (wctx, D);
	memcpy (out, D, WP256_DIGEST_SIZE);
	memset (D, 0, WP512_DIGEST_SIZE);
}

static struct crypto_alg wp512 = {
	.cra_name	=	"wp512",
	.cra_flags	=	CRYPTO_ALG_TYPE_DIGEST,
	.cra_blocksize	=	WP512_BLOCK_SIZE,
	.cra_ctxsize	=	sizeof(struct wp512_ctx),
	.cra_module	=	THIS_MODULE,
	.cra_list       =       LIST_HEAD_INIT(wp512.cra_list),	
	.cra_u		=	{ .digest = {
	.dia_digestsize	=	WP512_DIGEST_SIZE,
	.dia_init   	= 	wp512_init,
	.dia_update 	=	wp512_update,
	.dia_final  	=	wp512_final } }
};

static struct crypto_alg wp384 = {
	.cra_name	=	"wp384",
	.cra_flags	=	CRYPTO_ALG_TYPE_DIGEST,
	.cra_blocksize	=	WP512_BLOCK_SIZE,
	.cra_ctxsize	=	sizeof(struct wp512_ctx),
	.cra_module	=	THIS_MODULE,
	.cra_list       =       LIST_HEAD_INIT(wp384.cra_list),	
	.cra_u		=	{ .digest = {
	.dia_digestsize	=	WP384_DIGEST_SIZE,
	.dia_init   	= 	wp512_init,
	.dia_update 	=	wp512_update,
	.dia_final  	=	wp384_final } }
};

static struct crypto_alg wp256 = {
	.cra_name	=	"wp256",
	.cra_flags	=	CRYPTO_ALG_TYPE_DIGEST,
	.cra_blocksize	=	WP512_BLOCK_SIZE,
	.cra_ctxsize	=	sizeof(struct wp512_ctx),
	.cra_module	=	THIS_MODULE,
	.cra_list       =       LIST_HEAD_INIT(wp256.cra_list),
	.cra_u		=	{ .digest = {
	.dia_digestsize	=	WP256_DIGEST_SIZE,
	.dia_init   	= 	wp512_init,
	.dia_update 	=	wp512_update,
	.dia_final  	=	wp256_final } }
};

static int __init init(void)
{
	int ret = 0;

	ret = crypto_register_alg(&wp512);

	if (ret < 0)
		goto out;

	ret = crypto_register_alg(&wp384);
	if (ret < 0)
	{
		crypto_unregister_alg(&wp512);
		goto out;
	}

	ret = crypto_register_alg(&wp256);
	if (ret < 0)
	{
		crypto_unregister_alg(&wp512);
		crypto_unregister_alg(&wp384);
	}
out:
	return ret;
}

static void __exit fini(void)
{
	crypto_unregister_alg(&wp512);
	crypto_unregister_alg(&wp384);
	crypto_unregister_alg(&wp256);
}

MODULE_ALIAS("wp384");
MODULE_ALIAS("wp256");

module_init(init);
module_exit(fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Whirlpool Message Digest Algorithm");

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux