[PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+

Add x86-optimized implementation of the SHA-1 hash function, taken from
Nettle under the LGPL.  This code will be enabled on kernels compiled for
486es or better; kernels which support 386es will use the generic
implementation (since we need BSWAP).

We disable building lib/sha1.o when an optimized implementation is
available, as the library link order for x86 (and x86_64) would otherwise
ignore the optimized version.  The existing optimized implementation for ARM
does not do this; the library link order for that architecture appears to
favor the arch/arm/ version automatically.  I've left this situation alone
since I'm not familiar with the ARM code, but a !ARM condition could be
added to CONFIG_SHA1_GENERIC if it makes sense.

The code has been tested with tcrypt and the NIST test vectors.

Signed-off-by: Benjamin Gilbert <[email protected]>
---

 arch/i386/kernel/i386_ksyms.c |    5 +
 arch/i386/lib/Makefile        |    1 
 arch/i386/lib/sha1.S          |  299 +++++++++++++++++++++++++++++++++++++++++
 include/linux/cryptohash.h    |    9 +
 lib/Kconfig                   |   13 ++
 lib/Makefile                  |    3 
 6 files changed, 328 insertions(+), 2 deletions(-)

diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index e3d4b73..812bc4e 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -1,4 +1,5 @@
 #include <linux/module.h>
+#include <linux/cryptohash.h>
 #include <asm/checksum.h>
 #include <asm/desc.h>
 
@@ -28,3 +29,7 @@ EXPORT_SYMBOL(__read_lock_failed);
 #endif
 
 EXPORT_SYMBOL(csum_partial);
+
+#ifdef CONFIG_SHA1_X86
+EXPORT_SYMBOL(sha_transform);
+#endif
diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile
index 22d8ac5..69f4845 100644
--- a/arch/i386/lib/Makefile
+++ b/arch/i386/lib/Makefile
@@ -6,6 +6,7 @@
 lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \
 	bitops.o semaphore.o
 
+lib-$(CONFIG_SHA1_X86) += sha1.o
 lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
 
 obj-$(CONFIG_SMP)	+= msr-on-cpu.o
diff --git a/arch/i386/lib/sha1.S b/arch/i386/lib/sha1.S
new file mode 100644
index 0000000..28aa4b7
--- /dev/null
+++ b/arch/i386/lib/sha1.S
@@ -0,0 +1,299 @@
+/*
+ * x86-optimized SHA1 hash algorithm (i486 and above)
+ *
+ * Originally from Nettle
+ * Ported from M4 to cpp by Benjamin Gilbert <[email protected]>
+ *
+ * Copyright (C) 2004, Niels M�r
+ * Copyright (C) 2006-2007 Carnegie Mellon University
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License as
+ * published by the Free Software Foundation.  A copy of the GNU Lesser General
+ * Public License should have been distributed along with this library in the
+ * file LICENSE.LGPL.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ * for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/* Register usage */
+#define SA	%eax
+#define SB	%ebx
+#define SC	%ecx
+#define SD	%edx
+#define SE	%ebp
+#define DATA	%esp
+#define TMP	%edi
+#define TMP2	%esi			/* Used by SWAP and F3 */
+#define TMP3	64(%esp)
+
+/* Constants */
+#define K1VALUE	$0x5A827999		/* Rounds  0-19 */
+#define K2VALUE	$0x6ED9EBA1		/* Rounds 20-39 */
+#define K3VALUE	$0x8F1BBCDC		/* Rounds 40-59 */
+#define K4VALUE	$0xCA62C1D6		/* Rounds 60-79 */
+
+/* Convert stack offsets in words to offsets in bytes */
+#define OFFSET(i) 4*(i)
+
+/* Reads the input via TMP2 into register, byteswaps it, and stores it in
+   the DATA array. */
+#define SWAP(index, register)					\
+	movl	OFFSET(index)(TMP2), register;			\
+	bswap	register;					\
+	movl	register, OFFSET(index)(DATA)
+
+/* Sets the workspace word at the given index to TMP. */
+#define CLEAR(index)						\
+	movl	TMP, OFFSET(index)(DATA)
+
+/* pushl/popl wrappers that update the DWARF unwind table */
+#define PUSH(regname)						\
+	pushl			%regname;			\
+	CFI_ADJUST_CFA_OFFSET	4;				\
+	CFI_REL_OFFSET		regname, 0
+
+#define POP(regname)						\
+	popl			%regname;			\
+	CFI_ADJUST_CFA_OFFSET	-4;				\
+	CFI_RESTORE		regname
+
+/*
+ * expand(i) is the expansion function
+ *
+ *   W[i] = (W[i - 16] ^ W[i - 14] ^ W[i - 8] ^ W[i - 3]) <<< 1
+ *
+ * where W[i] is stored in DATA[i mod 16].
+ *
+ * Result is stored back in W[i], and also left in TMP, the only
+ * register that is used.
+ */
+#define EXPAND(i)						\
+	movl	OFFSET(i % 16)(DATA), TMP;			\
+	xorl	OFFSET((i + 2) % 16)(DATA), TMP;		\
+	xorl	OFFSET((i + 8) % 16)(DATA), TMP;		\
+	xorl	OFFSET((i + 13) % 16)(DATA), TMP;		\
+	roll	$1, TMP;					\
+	movl	TMP, OFFSET(i % 16)(DATA)
+
+/*
+ * The f functions,
+ *
+ *  f1(x,y,z) = z ^ (x & (y ^ z))
+ *  f2(x,y,z) = x ^ y ^ z
+ *  f3(x,y,z) = (x & y) | (z & (x | y))
+ *  f4 = f2
+ *
+ * The macro Fk(x,y,z) computes = fk(x,y,z).
+ * Result is left in TMP.
+ */
+#define F1(x,y,z)						\
+	movl	z, TMP;						\
+	xorl	y, TMP;						\
+	andl	x, TMP;						\
+	xorl	z, TMP
+
+#define F2(x,y,z)						\
+	movl	x, TMP;						\
+	xorl	y, TMP;						\
+	xorl	z, TMP
+
+#define F3(x,y,z)						\
+	movl	x, TMP2;					\
+	andl	y, TMP2;					\
+	movl	x, TMP;						\
+	orl	y, TMP;						\
+	andl	z, TMP;						\
+	orl	TMP2, TMP
+
+/*
+ * The form of one sha1 round is
+ *
+ *   a' = e + a <<< 5 + f( b, c, d ) + k + w;
+ *   b' = a;
+ *   c' = b <<< 30;
+ *   d' = c;
+ *   e' = d;
+ *
+ * where <<< denotes rotation. We permute our variables, so that we
+ * instead get
+ *
+ *   e += a <<< 5 + f( b, c, d ) + k + w;
+ *   b <<<= 30
+ *
+ * Using the TMP register for the rotate could be avoided, by rotating
+ * %a in place, adding, and then rotating back.
+ */
+#define ROUND(a,b,c,d,e,f,k,w)					\
+	addl	k, e;						\
+	addl	w, e;						\
+	f(b,c,d);						\
+	addl	TMP, e;						\
+	movl	a, TMP;						\
+	roll	$5, TMP;					\
+	addl	TMP, e;						\
+	roll	$30, b;
+
+/* sha_transform(__u32 *digest, const char *in, __u32 *W) */
+/* The workspace argument is ignored; we don't have enough registers to handle
+   a workspace that isn't on our own stack. */
+.text
+ENTRY(sha_transform)
+	CFI_STARTPROC
+	/* save all registers that need to be saved */
+	PUSH(ebx)		/* 80(%esp) */
+	PUSH(ebp)		/* 76(%esp) */
+	PUSH(esi)		/* 72(%esp) */
+	PUSH(edi)		/* 68(%esp) */
+
+	subl	$68, %esp	/* %esp = W */
+	CFI_ADJUST_CFA_OFFSET	68
+
+	/* Load and byteswap data */
+	movl	92(%esp), TMP2
+
+	SWAP( 0, %eax); SWAP( 1, %ebx); SWAP( 2, %ecx); SWAP( 3, %edx)
+	SWAP( 4, %eax); SWAP( 5, %ebx); SWAP( 6, %ecx); SWAP( 7, %edx)
+	SWAP( 8, %eax); SWAP( 9, %ebx); SWAP(10, %ecx); SWAP(11, %edx)
+	SWAP(12, %eax); SWAP(13, %ebx); SWAP(14, %ecx); SWAP(15, %edx)
+
+	/* load the state vector */
+	movl	88(%esp),TMP
+	movl	(TMP),   SA
+	movl	4(TMP),  SB
+	movl	8(TMP),  SC
+	movl	12(TMP), SD
+	movl	16(TMP), SE
+
+	movl	K1VALUE, TMP2
+	ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET( 0)(DATA))
+	ROUND(SE, SA, SB, SC, SD, F1, TMP2, OFFSET( 1)(DATA))
+	ROUND(SD, SE, SA, SB, SC, F1, TMP2, OFFSET( 2)(DATA))
+	ROUND(SC, SD, SE, SA, SB, F1, TMP2, OFFSET( 3)(DATA))
+	ROUND(SB, SC, SD, SE, SA, F1, TMP2, OFFSET( 4)(DATA))
+
+	ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET( 5)(DATA))
+	ROUND(SE, SA, SB, SC, SD, F1, TMP2, OFFSET( 6)(DATA))
+	ROUND(SD, SE, SA, SB, SC, F1, TMP2, OFFSET( 7)(DATA))
+	ROUND(SC, SD, SE, SA, SB, F1, TMP2, OFFSET( 8)(DATA))
+	ROUND(SB, SC, SD, SE, SA, F1, TMP2, OFFSET( 9)(DATA))
+
+	ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET(10)(DATA))
+	ROUND(SE, SA, SB, SC, SD, F1, TMP2, OFFSET(11)(DATA))
+	ROUND(SD, SE, SA, SB, SC, F1, TMP2, OFFSET(12)(DATA))
+	ROUND(SC, SD, SE, SA, SB, F1, TMP2, OFFSET(13)(DATA))
+	ROUND(SB, SC, SD, SE, SA, F1, TMP2, OFFSET(14)(DATA))
+
+	ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET(15)(DATA))
+	EXPAND(16); ROUND(SE, SA, SB, SC, SD, F1, TMP2, TMP)
+	EXPAND(17); ROUND(SD, SE, SA, SB, SC, F1, TMP2, TMP)
+	EXPAND(18); ROUND(SC, SD, SE, SA, SB, F1, TMP2, TMP)
+	EXPAND(19); ROUND(SB, SC, SD, SE, SA, F1, TMP2, TMP)
+
+	/* TMP2 is free to use in these rounds */
+	movl	K2VALUE, TMP2
+	EXPAND(20); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(21); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(22); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(23); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(24); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	EXPAND(25); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(26); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(27); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(28); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(29); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	EXPAND(30); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(31); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(32); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(33); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(34); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	EXPAND(35); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(36); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(37); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(38); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(39); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	/* We have to put this constant on the stack */
+	movl	K3VALUE, TMP3
+	EXPAND(40); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+	EXPAND(41); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+	EXPAND(42); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+	EXPAND(43); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+	EXPAND(44); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+	EXPAND(45); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+	EXPAND(46); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+	EXPAND(47); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+	EXPAND(48); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+	EXPAND(49); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+	EXPAND(50); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+	EXPAND(51); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+	EXPAND(52); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+	EXPAND(53); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+	EXPAND(54); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+	EXPAND(55); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+	EXPAND(56); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+	EXPAND(57); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+	EXPAND(58); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+	EXPAND(59); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+	movl	K4VALUE, TMP2
+	EXPAND(60); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(61); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(62); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(63); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(64); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	EXPAND(65); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(66); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(67); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(68); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(69); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	EXPAND(70); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(71); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(72); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(73); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(74); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	EXPAND(75); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+	EXPAND(76); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+	EXPAND(77); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+	EXPAND(78); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+	EXPAND(79); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+	/* Update the state vector */
+	movl	88(%esp),TMP
+	addl	SA, (TMP)
+	addl	SB, 4(TMP)
+	addl	SC, 8(TMP)
+	addl	SD, 12(TMP)
+	addl	SE, 16(TMP)
+
+	/* Clear the workspace for security */
+	xorl	TMP, TMP
+	CLEAR( 0); CLEAR( 1); CLEAR( 2); CLEAR( 3);
+	CLEAR( 4); CLEAR( 5); CLEAR( 6); CLEAR( 7);
+	CLEAR( 8); CLEAR( 9); CLEAR(10); CLEAR(11);
+	CLEAR(12); CLEAR(13); CLEAR(14); CLEAR(15);
+
+	addl	$68, %esp
+	CFI_ADJUST_CFA_OFFSET	-68
+	POP(edi)
+	POP(esi)
+	POP(ebp)
+	POP(ebx)
+	ret
+	CFI_ENDPROC
+ENDPROC(sha_transform)
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index a172401..0da331c 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -1,8 +1,15 @@
 #ifndef __CRYPTOHASH_H
 #define __CRYPTOHASH_H
 
+#include <linux/linkage.h>
+
 #define SHA_DIGEST_WORDS 5
+
+#if defined(CONFIG_SHA1_X86)
+#define SHA_WORKSPACE_WORDS 0
+#else
 #define SHA_WORKSPACE_WORDS 80
+#endif
 
 /**
  * sha_init - initialize the vectors for a SHA1 digest
@@ -17,7 +24,7 @@ static inline void sha_init(__u32 *buf)
 	buf[4] = 0xc3d2e1f0;
 }
 
-void sha_transform(__u32 *digest, const char *data, __u32 *W);
+asmlinkage void sha_transform(__u32 *digest, const char *data, __u32 *W);
 
 __u32 half_md4_transform(__u32 buf[4], __u32 const in[8]);
 
diff --git a/lib/Kconfig b/lib/Kconfig
index 2e7ae6b..69fdb64 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -124,4 +124,17 @@ config HAS_DMA
 	depends on !NO_DMA
 	default y
 
+#
+# Optimized SHA-1 support is autoconfigured
+#
+config SHA1_X86
+	bool
+	depends on (X86 || UML_X86) && !64BIT && X86_BSWAP
+	default y
+
+config SHA1_GENERIC
+	bool
+	depends on !SHA1_X86
+	default y
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index c8c8e20..f67be29 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,10 +5,11 @@
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o
+	 irq_regs.o reciprocal_div.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
+lib-$(CONFIG_SHA1_GENERIC) += sha1.o
 
 lib-y	+= kobject.o kref.o kobject_uevent.o klist.o
 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Follow-Ups:
- Re: [PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+
  - From: Matt Mackall <[email protected]>
- Re: [PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+
  - From: Jan Engelhardt <[email protected]>
References:
- [PATCH 0/3] Add optimized SHA-1 implementations for x86 and x86_64
  - From: Benjamin Gilbert <[email protected]>
Prev by Date: Re: [PATCH 1/1] ide: ide-generic, add another device exception
Next by Date: Re: [PATCH 0/2] i386: Fix two more NMI watchdog bugs
Previous by thread: Re: [PATCH 3/3] [CRYPTO] Add optimized SHA-1 implementation for x86_64
Next by thread: Re: [PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]