Home
Reading
Searching
Subscribe
Sponsors
Statistics
Posting
Contact
Spam
Lists
Links
About
Hosting
Filtering
Features Download
Marketing
Archives
FAQ
Blog
 
Gmane
From: Huang, Ying <ying.huang <at> intel.com>
Subject: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
Newsgroups: gmane.linux.kernel.cryptoapi
Date: Wednesday 9th April 2008 06:41:02 UTC (over 9 years ago)
This patch improves the performance of the AES x86-64 implementation. The
average improvement is more than 6.3% and the maximum improvement is
more than 10.2% on an Intel Core 2 CPU. The performance gain is
achieved via the following methods:

- Two additional temporary registers are used to hold a subset of
  the state, so that dependencies between instructions are reduced.

- The expanded key is loaded via two 64-bit loads instead of four 32-bit loads.

This patch is based on 2.6.25-rc8-mm1.

The attached file contains the test data gathered via: modprobe tcrypt mode=200

- dmesg_1_core-stockn:	stock kernel data
- dmesg_1_core-op4n:	patched kernel data
- percent.txt:		(time_patched - time_stock) / time_stock * 100

Signed-off-by: Huang Ying <ying.huang@intel.com>

---
 arch/x86/crypto/aes-x86_64-asm_64.S |  101 ++++++++++++++++++++----------------
 include/crypto/aes.h                |    1 
 2 files changed, 58 insertions(+), 44 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -46,70 +46,81 @@
 #define R7	%rbp
 #define R7E	%ebp
 #define R8	%r8
+#define R8E	%r8d
 #define R9	%r9
+#define R9E	%r9d
 #define R10	%r10
 #define R11	%r11
+#define R12	%r12
+#define R12E	%r12d
+#define R16	%rsp
 
 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	movq	r1,r2;			\
-	movq	r3,r4;			\
-	leaq	BASE+KEY+48+4(r8),r9;	\
-	movq	r10,r11;		\
-	movl	(r7),r5 ## E;		\
-	movl	4(r7),r1 ## E;		\
-	movl	8(r7),r6 ## E;		\
-	movl	12(r7),r7 ## E;		\
-	movl	BASE+0(r8),r10 ## E;	\
-	xorl	-48(r9),r5 ## E;	\
-	xorl	-44(r9),r1 ## E;	\
-	xorl	-40(r9),r6 ## E;	\
-	xorl	-36(r9),r7 ## E;	\
-	cmpl	$24,r10 ## E;		\
+FUNC:	subq	$24,r11;		\
+	movl	(r6),r4 ## E;		\
+	leaq	BASE+KEY+48+8(r7),r8;	\
+	movq	r1,(r11);		\
+	movq	r9,r10;			\
+	movl	4(r6),r1 ## E;		\
+	movq	r2,8(r11);		\
+	movl	8(r6),r5 ## E;		\
+	movq	r3,16(r11);		\
+	movl	12(r6),r6 ## E;		\
+	movl	BASE+0(r7),r9 ## E;	\
+	xorl	-48(r8),r4 ## E;	\
+	xorl	-44(r8),r1 ## E;	\
+	xorl	-40(r8),r5 ## E;	\
+	xorl	-36(r8),r6 ## E;	\
+	cmpl	$24,r9 ## E;		\
 	jb	B128;			\
-	leaq	32(r9),r9;		\
+	leaq	32(r8),r8;		\
 	je	B192;			\
-	leaq	32(r9),r9;
+	leaq	32(r8),r8;
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
-	movq	r1,r2;			\
-	movq	r3,r4;			\
-	movl	r5 ## E,(r9);		\
-	movl	r6 ## E,4(r9);		\
-	movl	r7 ## E,8(r9);		\
-	movl	r8 ## E,12(r9);		\
+	movq	(r9),r1;		\
+	movl	r4 ## E,(r8);		\
+	movq	8(r9),r2;		\
+	movl	r5 ## E,4(r8);		\
+	movq	16(r9),r3;		\
+	movl	r6 ## E,8(r8);		\
+	addq	$24,r9;			\
+	movl	r7 ## E,12(r8);		\
 	ret;
 
-#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
+#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,ra,rb,rc,rd) \
 	movzbl	r2 ## H,r5 ## E;	\
 	movzbl	r2 ## L,r6 ## E;	\
+	movl	r4 ## E,r8 ## E;	\
+	shrl	$16,r4 ## E;		\
 	movl	TAB+1024(,r5,4),r5 ## E;\
-	movw	r4 ## X,r2 ## X;	\
 	movl	TAB(,r6,4),r6 ## E;	\
-	roll	$16,r2 ## E;		\
-	shrl	$16,r4 ## E;		\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
-	xorl	OFFSET(r8),ra ## E;	\
-	xorl	OFFSET+4(r8),rb ## E;	\
+	movq	OFFSET(r11),r10;	\
+	shrl	$16,r2 ## E;		\
+	movl	r3 ## E,r9 ## E;	\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
-	movzbl	r1 ## L,r7 ## E;	\
 	movzbl	r1 ## H,r4 ## E;	\
-	movl	TAB+1024(,r4,4),r4 ## E;\
-	movw	r3 ## X,r1 ## X;	\
-	roll	$16,r1 ## E;		\
+	movzbl	r1 ## L,r7 ## E;	\
 	shrl	$16,r3 ## E;		\
+	movl	TAB+1024(,r4,4),r4 ## E;\
 	xorl	TAB(,r7,4),r5 ## E;	\
+	shrl	$16,r1 ## E;		\
 	movzbl	r3 ## H,r7 ## E;	\
 	movzbl	r3 ## L,r3 ## E;	\
 	xorl	TAB+3072(,r7,4),r4 ## E;\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
-	shrl	$16,r1 ## E;		\
+	xorl	r10 ## E,ra ## E;	\
+	movl	r9 ## E,r1 ## E;	\
+	movq	OFFSET+8(r11),r9;	\
+	shrq	$32,r10;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
@@ -118,38 +129,40 @@ FUNC:	movq	r1,r2;			\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
-	shrl	$16,r2 ## E;		\
+	xorl	r9 ## E, rc ## E;	\
+	movl	r8 ## E,r2 ## E;	\
+	shrq	$32,r9;			\
+	xorl	r10 ## E,rb ## E;	\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\
+	xorl	r9 ## E, rd ## E;	\
 	movzbl	r2 ## L,r2 ## E;	\
-	xorl	OFFSET+8(r8),rc ## E;	\
-	xorl	OFFSET+12(r8),rd ## E;	\
-	xorl	TAB+1024(,r1,4),r3 ## E;\
-	xorl	TAB(,r2,4),r4 ## E;
+	xorl	TAB(,r2,4),r4 ## E;	\
+	xorl	TAB+1024(,r1,4),r3 ## E;
 
 #define move_regs(r1,r2,r3,r4) \
 	movl	r3 ## E,r1 ## E;	\
 	movl	r4 ## E,r2 ## E;
 
 #define entry(FUNC,KEY,B128,B192) \
-	prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+	prologue(FUNC,KEY,B128,B192,R2,R7,R12,R1,R3,R4,R6,R10,R5,R11,R16)
 
-#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
+#define return epilogue(R2,R7,R12,R5,R6,R3,R4,R11,R16)
 
 #define encrypt_round(TAB,OFFSET) \
-	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
+	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
 	move_regs(R1,R2,R5,R6)
 
 #define encrypt_final(TAB,OFFSET) \
-	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
+	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4)
 
 #define decrypt_round(TAB,OFFSET) \
-	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
+	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
 	move_regs(R1,R2,R5,R6)
 
 #define decrypt_final(TAB,OFFSET) \
-	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
+	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4)
 
 /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
 
--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -19,6 +19,7 @@
 
 struct crypto_aes_ctx {
 	u32 key_length;
+	u32 _pad1;
 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
 };
 
CD: 11ms