summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2013-05-29 11:39:47 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2013-05-29 11:39:47 +1000
commit5f1b6ac5172febc682f2199b2141b060da3596b9 (patch)
tree524ae95ac66907f68c06db0e7e87595a7b19731c /arch
parent725b50704205d9049c05db0fc0f5d5f6c58d7e11 (diff)
parentbbb85b25688fcdc70b895711870c23de7b12721b (diff)
Merge remote-tracking branch 'crypto/master'
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/boot/dts/imx28.dtsi2
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S643
-rw-r--r--arch/x86/crypto/crct10dif-pclmul_glue.c151
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c57
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c58
6 files changed, 902 insertions, 11 deletions
diff --git a/arch/arm/boot/dts/imx28.dtsi b/arch/arm/boot/dts/imx28.dtsi
index 600f7cb51f3e..077d0eb1a8b3 100644
--- a/arch/arm/boot/dts/imx28.dtsi
+++ b/arch/arm/boot/dts/imx28.dtsi
@@ -699,7 +699,7 @@
dcp@80028000 {
reg = <0x80028000 0x2000>;
interrupts = <52 53 54>;
- status = "disabled";
+ compatible = "fsl-dcp";
};
pxp@8002a000 {
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a3a0ed80f17c..94cb151adc1d 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
@@ -87,3 +88,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
new file mode 100644
index 000000000000..35e97569d05f
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -0,0 +1,643 @@
+########################################################################
+# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+#
+# Copyright (c) 2013, Intel Corporation
+#
+# Authors:
+# Erdinc Ozturk <erdinc.ozturk@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# James Guilford <james.guilford@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+# Function API:
+# UINT16 crc_t10dif_pcl(
+# UINT16 init_crc, //initial CRC value, 16 bits
+# const unsigned char *buf, //buffer pointer to calculate CRC on
+# UINT64 len //buffer length in bytes (64-bit data)
+# );
+#
+# Reference paper titled "Fast CRC Computation for Generic
+# Polynomials Using PCLMULQDQ Instruction"
+# URL: http://www.intel.com/content/dam/www/public/us/en/documents
+# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+#
+#
+
+#include <linux/linkage.h>
+
+.text
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+
+#define arg1_low32 %edi
+
+ENTRY(crc_t10dif_pcl)
+.align 16
+
+ # adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl $16, arg1_low32
+
+ # Allocate Stack Space
+ mov %rsp, %rcx
+ sub $16*2, %rsp
+ # align stack to 16 byte boundary
+ and $~(0x10 - 1), %rsp
+
+ # check if smaller than 256
+ cmp $256, arg3
+
+ # for sizes less than 128, we can't fold 64B at a time...
+ jl _less_than_128
+
+
+ # load the initial crc value
+ movd arg1_low32, %xmm10 # initial crc
+
+ # crc value does not need to be byte-reflected, but it needs
+ # to be moved to the high part of the register.
+ # because data will be byte-reflected and will align with
+ # initial crc at correct place.
+ pslldq $12, %xmm10
+
+ movdqa SHUF_MASK(%rip), %xmm11
+ # receive the initial 64B data, xor the initial crc value
+ movdqu 16*0(arg2), %xmm0
+ movdqu 16*1(arg2), %xmm1
+ movdqu 16*2(arg2), %xmm2
+ movdqu 16*3(arg2), %xmm3
+ movdqu 16*4(arg2), %xmm4
+ movdqu 16*5(arg2), %xmm5
+ movdqu 16*6(arg2), %xmm6
+ movdqu 16*7(arg2), %xmm7
+
+ pshufb %xmm11, %xmm0
+ # XOR the initial_crc value
+ pxor %xmm10, %xmm0
+ pshufb %xmm11, %xmm1
+ pshufb %xmm11, %xmm2
+ pshufb %xmm11, %xmm3
+ pshufb %xmm11, %xmm4
+ pshufb %xmm11, %xmm5
+ pshufb %xmm11, %xmm6
+ pshufb %xmm11, %xmm7
+
+ movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
+ #imm value of pclmulqdq instruction
+ #will determine which constant to use
+
+ #################################################################
+ # we subtract 256 instead of 128 to save one instruction from the loop
+ sub $256, arg3
+
+ # at this section of the code, there is 64*x+y (0<=y<64) bytes of
+ # buffer. The _fold_64_B_loop will fold 64B at a time
+ # until we have 64+y Bytes of buffer
+
+
+ # fold 64B at a time. This section of the code folds 4 xmm
+ # registers in parallel
+_fold_64_B_loop:
+
+ # update the buffer pointer
+ add $128, arg2 # buf += 64#
+
+ movdqu 16*0(arg2), %xmm9
+ movdqu 16*1(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm13
+ pclmulqdq $0x0 , %xmm10, %xmm0
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0 , %xmm10, %xmm1
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm0
+ xorps %xmm8 , %xmm0
+ pxor %xmm12, %xmm1
+ xorps %xmm13, %xmm1
+
+ movdqu 16*2(arg2), %xmm9
+ movdqu 16*3(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm2, %xmm8
+ movdqa %xmm3, %xmm13
+ pclmulqdq $0x0, %xmm10, %xmm2
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0, %xmm10, %xmm3
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm2
+ xorps %xmm8 , %xmm2
+ pxor %xmm12, %xmm3
+ xorps %xmm13, %xmm3
+
+ movdqu 16*4(arg2), %xmm9
+ movdqu 16*5(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm4, %xmm8
+ movdqa %xmm5, %xmm13
+ pclmulqdq $0x0, %xmm10, %xmm4
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0, %xmm10, %xmm5
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm4
+ xorps %xmm8 , %xmm4
+ pxor %xmm12, %xmm5
+ xorps %xmm13, %xmm5
+
+ movdqu 16*6(arg2), %xmm9
+ movdqu 16*7(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm6 , %xmm8
+ movdqa %xmm7 , %xmm13
+ pclmulqdq $0x0 , %xmm10, %xmm6
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0 , %xmm10, %xmm7
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm6
+ xorps %xmm8 , %xmm6
+ pxor %xmm12, %xmm7
+ xorps %xmm13, %xmm7
+
+ sub $128, arg3
+
+ # check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ##################################################################
+
+
+ add $128, arg2
+ # at this point, the buffer pointer is pointing at the last y Bytes
+ # of the buffer the 64B of folded data is in 4 of the xmm
+ # registers: xmm0, xmm1, xmm2, xmm3
+
+
+ # fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa rk9(%rip), %xmm10
+ movdqa %xmm0, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm0
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm0, %xmm7
+
+ movdqa rk11(%rip), %xmm10
+ movdqa %xmm1, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm1
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm1, %xmm7
+
+ movdqa rk13(%rip), %xmm10
+ movdqa %xmm2, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm2
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm2, %xmm7
+
+ movdqa rk15(%rip), %xmm10
+ movdqa %xmm3, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm3
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm3, %xmm7
+
+ movdqa rk17(%rip), %xmm10
+ movdqa %xmm4, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm4
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm4, %xmm7
+
+ movdqa rk19(%rip), %xmm10
+ movdqa %xmm5, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm5
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm5, %xmm7
+
+ movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
+ #imm value of pclmulqdq instruction
+ #will determine which constant to use
+ movdqa %xmm6, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm6
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm6, %xmm7
+
+
+ # instead of 64, we add 48 to the loop counter to save 1 instruction
+ # from the loop instead of a cmp instruction, we use the negative
+ # flag with the jl instruction
+ add $128-16, arg3
+ jl _final_reduction_for_128
+
+ # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
+ # and the rest is in memory. We can fold 16 bytes at a time if y>=16
+ # continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm7
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ movdqu (arg2), %xmm0
+ pshufb %xmm11, %xmm0
+ pxor %xmm0 , %xmm7
+ add $16, arg2
+ sub $16, arg3
+ # instead of a cmp instruction, we utilize the flags with the
+ # jge instruction equivalent of: cmp arg3, 16-16
+ # check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ #now we have 16+z bytes left to reduce, where 0<= z < 16.
+ #first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ # check if any more data to fold. If not, compute the CRC of
+ # the final 128 bits
+ add $16, arg3
+ je _128_done
+
+ # here we are getting data that is less than 16 bytes.
+ # since we know that there was data before the pointer, we can
+ # offset the input pointer before the actual point, to receive
+ # exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa %xmm7, %xmm2
+
+ movdqu -16(arg2, arg3), %xmm1
+ pshufb %xmm11, %xmm1
+
+ # get rid of the extra data that was loaded before
+ # load the shift constant
+ lea pshufb_shf_table+16(%rip), %rax
+ sub arg3, %rax
+ movdqu (%rax), %xmm0
+
+ # shift xmm2 to the left by arg3 bytes
+ pshufb %xmm0, %xmm2
+
+ # shift xmm7 to the right by 16-arg3 bytes
+ pxor mask1(%rip), %xmm0
+ pshufb %xmm0, %xmm7
+ pblendvb %xmm2, %xmm1 #xmm0 is implicit
+
+ # fold 16 Bytes
+ movdqa %xmm1, %xmm2
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm7
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm2, %xmm7
+
+_128_done:
+ # compute crc of a 128-bit value
+ movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
+ movdqa %xmm7, %xmm0
+
+ #64b fold
+ pclmulqdq $0x1, %xmm10, %xmm7
+ pslldq $8 , %xmm0
+ pxor %xmm0, %xmm7
+
+ #32b fold
+ movdqa %xmm7, %xmm0
+
+ pand mask2(%rip), %xmm0
+
+ psrldq $12, %xmm7
+ pclmulqdq $0x10, %xmm10, %xmm7
+ pxor %xmm0, %xmm7
+
+ #barrett reduction
+_barrett:
+ movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x01, %xmm10, %xmm7
+ pslldq $4, %xmm7
+ pclmulqdq $0x11, %xmm10, %xmm7
+
+ pslldq $4, %xmm7
+ pxor %xmm0, %xmm7
+ pextrd $1, %xmm7, %eax
+
+_cleanup:
+ # scale the result back to 16 bits
+ shr $16, %eax
+ mov %rcx, %rsp
+ ret
+
+########################################################################
+
+.align 16
+_less_than_128:
+
+ # check if there is enough buffer to be able to fold 16B at a time
+ cmp $32, arg3
+ jl _less_than_32
+ movdqa SHUF_MASK(%rip), %xmm11
+
+ # now if there is, load the constants
+ movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
+
+ movd arg1_low32, %xmm0 # get the initial crc value
+ pslldq $12, %xmm0 # align it to its correct place
+ movdqu (arg2), %xmm7 # load the plaintext
+ pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ pxor %xmm0, %xmm7
+
+
+ # update the buffer pointer
+ add $16, arg2
+
+ # update the counter. subtract 32 instead of 16 to save one
+ # instruction from the loop
+ sub $32, arg3
+
+ jmp _16B_reduction_loop
+
+
+.align 16
+_less_than_32:
+ # mov initial crc to the return value. this is necessary for
+ # zero-length buffers.
+ mov arg1_low32, %eax
+ test arg3, arg3
+ je _cleanup
+
+ movdqa SHUF_MASK(%rip), %xmm11
+
+ movd arg1_low32, %xmm0 # get the initial crc value
+ pslldq $12, %xmm0 # align it to its correct place
+
+ cmp $16, arg3
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu (arg2), %xmm7 # load the plaintext
+ pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+ add $16, arg2
+ sub $16, arg3
+ movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+.align 16
+_less_than_16_left:
+ # use stack space to load data less than 16 bytes, zero-out
+ # the 16B in memory first.
+
+ pxor %xmm1, %xmm1
+ mov %rsp, %r11
+ movdqa %xmm1, (%r11)
+
+ cmp $4, arg3
+ jl _only_less_than_4
+
+ # backup the counter value
+ mov arg3, %r9
+ cmp $8, arg3
+ jl _less_than_8_left
+
+ # load 8 Bytes
+ mov (arg2), %rax
+ mov %rax, (%r11)
+ add $8, %r11
+ sub $8, arg3
+ add $8, arg2
+_less_than_8_left:
+
+ cmp $4, arg3
+ jl _less_than_4_left
+
+ # load 4 Bytes
+ mov (arg2), %eax
+ mov %eax, (%r11)
+ add $4, %r11
+ sub $4, arg3
+ add $4, arg2
+_less_than_4_left:
+
+ cmp $2, arg3
+ jl _less_than_2_left
+
+ # load 2 Bytes
+ mov (arg2), %ax
+ mov %ax, (%r11)
+ add $2, %r11
+ sub $2, arg3
+ add $2, arg2
+_less_than_2_left:
+ cmp $1, arg3
+ jl _zero_left
+
+ # load 1 Byte
+ mov (arg2), %al
+ mov %al, (%r11)
+_zero_left:
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ # shl r9, 4
+ lea pshufb_shf_table+16(%rip), %rax
+ sub %r9, %rax
+ movdqu (%rax), %xmm0
+ pxor mask1(%rip), %xmm0
+
+ pshufb %xmm0, %xmm7
+ jmp _128_done
+
+.align 16
+_exact_16_left:
+ movdqu (arg2), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp $3, arg3
+ jl _only_less_than_3
+
+ # load 3 Bytes
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ mov 1(arg2), %al
+ mov %al, 1(%r11)
+
+ mov 2(arg2), %al
+ mov %al, 2(%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $5, %xmm7
+
+ jmp _barrett
+_only_less_than_3:
+ cmp $2, arg3
+ jl _only_less_than_2
+
+ # load 2 Bytes
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ mov 1(arg2), %al
+ mov %al, 1(%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $6, %xmm7
+
+ jmp _barrett
+_only_less_than_2:
+
+ # load 1 Byte
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $7, %xmm7
+
+ jmp _barrett
+
+ENDPROC(crc_t10dif_pcl)
+
+.data
+
+# precomputed constants
+# these constants are precomputed from the poly:
+# 0x8bb70000 (0x8bb7 scaled to 32 bits)
+.align 16
+# Q = 0x18BB70000
+# rk1 = 2^(32*3) mod Q << 32
+# rk2 = 2^(32*5) mod Q << 32
+# rk3 = 2^(32*15) mod Q << 32
+# rk4 = 2^(32*17) mod Q << 32
+# rk5 = 2^(32*3) mod Q << 32
+# rk6 = 2^(32*2) mod Q << 32
+# rk7 = floor(2^64/Q)
+# rk8 = Q
+rk1:
+.quad 0x2d56000000000000
+rk2:
+.quad 0x06df000000000000
+rk3:
+.quad 0x9d9d000000000000
+rk4:
+.quad 0x7cf5000000000000
+rk5:
+.quad 0x2d56000000000000
+rk6:
+.quad 0x1368000000000000
+rk7:
+.quad 0x00000001f65a57f8
+rk8:
+.quad 0x000000018bb70000
+
+rk9:
+.quad 0xceae000000000000
+rk10:
+.quad 0xbfd6000000000000
+rk11:
+.quad 0x1e16000000000000
+rk12:
+.quad 0x713c000000000000
+rk13:
+.quad 0xf7f9000000000000
+rk14:
+.quad 0x80a6000000000000
+rk15:
+.quad 0x044c000000000000
+rk16:
+.quad 0xe658000000000000
+rk17:
+.quad 0xad18000000000000
+rk18:
+.quad 0xa497000000000000
+rk19:
+.quad 0x6ee3000000000000
+rk20:
+.quad 0xe7b5000000000000
+
+
+
+mask1:
+.octa 0x80808080808080808080808080808080
+mask2:
+.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+.octa 0x000102030405060708090A0B0C0D0E0F
+
+pshufb_shf_table:
+# use these values for shift constants for the pshufb instruction
+# different alignments result in values as shown:
+# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
+# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
+# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
+# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
+# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
+# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
+# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
+# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
+# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
+# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
+# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+.octa 0x8f8e8d8c8b8a89888786858483828100
+.octa 0x000e0d0c0b0a09080706050403020100
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
new file mode 100644
index 000000000000..7845d7fd54c0
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -0,0 +1,151 @@
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/i387.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
+ size_t len);
+
+struct chksum_desc_ctx {
+ __u16 crc;
+};
+
+/*
+ * Steps through buffer one byte at at time, calculates reflected
+ * crc using table.
+ */
+
+static int chksum_init(struct shash_desc *desc)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = 0;
+
+ return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ if (irq_fpu_usable()) {
+ kernel_fpu_begin();
+ ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
+ kernel_fpu_end();
+ } else
+ ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
+ return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__u16 *)out = ctx->crc;
+ return 0;
+}
+
+static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ if (irq_fpu_usable()) {
+ kernel_fpu_begin();
+ *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
+ kernel_fpu_end();
+ } else
+ *(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
+ return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(&ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int length, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(&ctx->crc, data, length, out);
+}
+
+static struct shash_alg alg = {
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = chksum_init,
+ .update = chksum_update,
+ .final = chksum_final,
+ .finup = chksum_finup,
+ .digest = chksum_digest,
+ .descsize = sizeof(struct chksum_desc_ctx),
+ .base = {
+ .cra_name = "crct10dif",
+ .cra_driver_name = "crct10dif-pclmul",
+ .cra_priority = 200,
+ .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static const struct x86_cpu_id crct10dif_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
+
+static int __init crct10dif_intel_mod_init(void)
+{
+ if (!x86_match_cpu(crct10dif_cpu_id))
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_intel_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_intel_mod_init);
+module_exit(crct10dif_intel_mod_fini);
+
+MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crct10dif");
+MODULE_ALIAS("crct10dif-pclmul");
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 597d4da69656..50226c4b86ed 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
return 0;
}
-static struct shash_alg alg = {
+static int sha224_ssse3_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA224_H0;
+ sctx->state[1] = SHA224_H1;
+ sctx->state[2] = SHA224_H2;
+ sctx->state[3] = SHA224_H3;
+ sctx->state[4] = SHA224_H4;
+ sctx->state[5] = SHA224_H5;
+ sctx->state[6] = SHA224_H6;
+ sctx->state[7] = SHA224_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_ssse3_final(desc, D);
+
+ memcpy(hash, D, SHA224_DIGEST_SIZE);
+ memset(D, 0, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_ssse3_init,
.update = sha256_ssse3_update,
@@ -204,7 +233,24 @@ static struct shash_alg alg = {
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
-};
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_ssse3_init,
+ .update = sha256_ssse3_update,
+ .final = sha224_ssse3_final,
+ .export = sha256_ssse3_export,
+ .import = sha256_ssse3_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-ssse3",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
#ifdef CONFIG_AS_AVX
static bool __init avx_usable(void)
@@ -227,7 +273,7 @@ static bool __init avx_usable(void)
static int __init sha256_ssse3_mod_init(void)
{
- /* test for SSE3 first */
+ /* test for SSSE3 first */
if (cpu_has_ssse3)
sha256_transform_asm = sha256_transform_ssse3;
@@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void)
else
#endif
pr_info("Using SSSE3 optimized SHA-256 implementation\n");
- return crypto_register_shash(&alg);
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
pr_info("Neither AVX nor SSSE3 is available/usable.\n");
@@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void)
static void __exit sha256_ssse3_mod_fini(void)
{
- crypto_unregister_shash(&alg);
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_init(sha256_ssse3_mod_init);
@@ -273,3 +319,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS("sha256");
+MODULE_ALIAS("sha384");
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 6cbd8df348d2..f30cd10293f0 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
return 0;
}
-static struct shash_alg alg = {
+static int sha384_ssse3_init(struct shash_desc *desc)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA384_H0;
+ sctx->state[1] = SHA384_H1;
+ sctx->state[2] = SHA384_H2;
+ sctx->state[3] = SHA384_H3;
+ sctx->state[4] = SHA384_H4;
+ sctx->state[5] = SHA384_H5;
+ sctx->state[6] = SHA384_H6;
+ sctx->state[7] = SHA384_H7;
+
+ sctx->count[0] = sctx->count[1] = 0;
+
+ return 0;
+}
+
+static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
+{
+ u8 D[SHA512_DIGEST_SIZE];
+
+ sha512_ssse3_final(desc, D);
+
+ memcpy(hash, D, SHA384_DIGEST_SIZE);
+ memset(D, 0, SHA512_DIGEST_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
.digestsize = SHA512_DIGEST_SIZE,
.init = sha512_ssse3_init,
.update = sha512_ssse3_update,
@@ -211,7 +241,24 @@ static struct shash_alg alg = {
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
-};
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_ssse3_init,
+ .update = sha512_ssse3_update,
+ .final = sha384_ssse3_final,
+ .export = sha512_ssse3_export,
+ .import = sha512_ssse3_import,
+ .descsize = sizeof(struct sha512_state),
+ .statesize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-ssse3",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
#ifdef CONFIG_AS_AVX
static bool __init avx_usable(void)
@@ -234,7 +281,7 @@ static bool __init avx_usable(void)
static int __init sha512_ssse3_mod_init(void)
{
- /* test for SSE3 first */
+ /* test for SSSE3 first */
if (cpu_has_ssse3)
sha512_transform_asm = sha512_transform_ssse3;
@@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void)
else
#endif
pr_info("Using SSSE3 optimized SHA-512 implementation\n");
- return crypto_register_shash(&alg);
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
pr_info("Neither AVX nor SSSE3 is available/usable.\n");
@@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void)
static void __exit sha512_ssse3_mod_fini(void)
{
- crypto_unregister_shash(&alg);
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_init(sha512_ssse3_mod_init);
@@ -280,3 +327,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS("sha512");
+MODULE_ALIAS("sha384");