diff options
Diffstat (limited to 'ANDROID_3.4.5/arch/x86/crypto/sha1_ssse3_asm.S')
-rw-r--r-- | ANDROID_3.4.5/arch/x86/crypto/sha1_ssse3_asm.S | 558 |
1 files changed, 0 insertions, 558 deletions
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * Syntax: GNU as, AT&T operand order, preprocessed by cpp.
 *
 * Incoming arguments. They alias REG_C/REG_B/REG_E below, so they are copied
 * into HASH_PTR/BUFFER_PTR/BUFFER_END before the round registers are loaded.
 */
#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

/* physical registers initially backing the five SHA-1 working variables */
#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%ebp
#define REG_E	%edx

/* scalar scratch registers */
#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

/* vector scratch registers */
#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

/* eight vectors holding the last 32 message schedule words */
#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/*
 * we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a
 * circular buffer at the bottom of the stack workspace
 */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16

/*
 * This macro emits a complete SHA-1 transform function that processes CNT
 * consecutive 64-byte blocks.
 * param: function's name
 *
 * In:    CTX (%rdi) = pointer to five 32-bit hash words, updated in place
 *        BUF (%rsi) = pointer to the input data
 *        CNT (%rdx) = number of 64-byte blocks; 0 returns immediately
 * Saved: %rbx, %rbp, %r12 (pushed on entry, popped on exit)
 * Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, %xmm0-%xmm10, flags
 * Stack: 64-byte workspace, 16-byte aligned, zeroed again before returning
 *
 * Requires the xmm_mov and W_PRECALC_{00_15,16_31,32_79} macros to be
 * defined at expansion time.
 */
.macro SHA1_VECTOR_ASM  name
	.global	\name
	.type	\name, @function
	.align 32
\name:
	/*
	 * The main loop tests for the end only after a block has been
	 * processed, so a zero block count has to be filtered out here.
	 */
	test	CNT, CNT
	jz	2f

	push	%rbx
	push	%rbp
	push	%r12

	mov	%rsp, %r12		# remember unaligned stack pointer
	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END		# BUFFER_END = BUF + 64 * CNT

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace: overwrite all 8 quadwords (64 bytes) with zero
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax		# 32-bit write clears all of %rax
	rep stosq

	mov	%r12, %rsp		# deallocate workspace

	pop	%r12
	pop	%rbp
	pop	%rbx
2:
	ret

	.size	\name, .-\name
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block and loops
 * over all blocks. The message schedule runs W_PRECALC_AHEAD rounds ahead of
 * the scalar rounds, so rounds 64-79 already pre-compute the first 16 values
 * of the following block.
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	/* prime the WK() window with the first W_PRECALC_AHEAD values */
  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
	jne	1b
.endm

/*
 * Bind the symbolic names A-E, T1 and T2 to their initial physical
 * registers. The names are assembler symbols, so the round macros can
 * re-bind them without emitting any instructions.
 */
.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

/*
 * Copy the working variables from the registers they are currently bound to
 * back into their initial physical registers, so the loop body, which was
 * assembled against the initial binding, can run again.
 */
.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

/* Exchange the registers two symbolic names refer to (assembly-time only). */
.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

/*
 * F1: T1 = ((c ^ d) & b) ^ d
 * The value of c is copied first and the names are swapped, so the copy
 * becomes the new \c and the original register is consumed as T1.
 */
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

/* F2: T1 = d ^ c ^ b (d is preserved through the same copy-and-rename) */
.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

/* F3: T1 = ((c | b) & d) | (b & c); uses T2 as a second scratch register */
.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

/* F4 is the same function as F2 */
.macro F4  b, c, d
	F2 \b, \c, \d
.endm

/* val += hash; hash = val  -- fold a working variable into the stored hash */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 *
 * One step of the message schedule is interleaved after each F evaluation.
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# ((a <<r 5) >>r 7) => (a <<r 30)

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

/*
 * Emit one step of the message schedule for round \r. Sets K_XMM to the byte
 * offset of the round constant vector for that round and dispatches to the
 * 00-15, 16-31 or 32-79 variant. Round numbers 80..(80 + W_PRECALC_AHEAD - 1)
 * wrap around to rounds 0-15 of the next block.
 */
.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i<32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

/* Bind the schedule window names to their initial vector registers. */
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

/*
 * Advance the schedule window by four words: every name moves one register
 * further back and W is re-bound to the register that held W_minus_28.
 */
.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

/*
 * Expanding this macro defines the SSSE3 flavour of the message schedule and
 * routes the generic W_PRECALC_* names to it.
 */
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
 * dependency, but improves for 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3

/* SHA-1 round constants for rounds 0-19, 20-39, 40-59 and 60-79 */
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

/*
 * Each round constant replicated across four 32-bit lanes. The message
 * schedule addresses the rows as K_XMM(K_BASE) with K_XMM = 0, 16, 32 or 48.
 */
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

/* byte shuffle control that reverses the byte order within each 32-bit lane */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

/* select the SSSE3 message schedule and the legacy SSE unaligned load */
W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                       unsigned int rounds);
 *
 * Exactly three register arguments are consumed: digest in %rdi, data in
 * %rsi and rounds, the number of 64-byte blocks to process, in %rdx.
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

#ifdef SHA1_ENABLE_AVX_SUPPORT

/*
 * Expanding this macro replaces the SSSE3 message schedule with the AVX
 * flavour: the generic W_PRECALC_* names are purged and redefined to route
 * to the three-operand VEX encoded variants below.
 */
.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

/* message scheduling pre-compute for rounds 0-15 (AVX) */
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31 (AVX) */
.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79 (AVX) */
.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

/* select the AVX message schedule and the VEX encoded unaligned load */
W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/*
 * AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
 *                                     unsigned int rounds);
 *
 * Same register arguments as the SSSE3 variant: digest in %rdi, data in
 * %rsi and rounds, the number of 64-byte blocks to process, in %rdx.
 */
SHA1_VECTOR_ASM     sha1_transform_avx

#endif