Compare commits

...

6 Commits

Author SHA1 Message Date
Potuz 94768a3190 Added AVX tests 2021-11-19 15:26:06 -03:00
Potuz 662d4ec6e9 First Benchmarks with Shani 2021-11-19 10:03:12 -03:00
Potuz b206bffe15 flat array merkleyzer 2021-11-18 22:16:03 -03:00
Potuz 18b83bf445 add SSE3 version for 2 chunks 2021-11-17 11:16:08 -03:00
Potuz cff4a01ea0 add AVX2 version 2021-11-17 11:08:02 -03:00
Potuz 3a1901d7aa Working custom assembly hasher with sha_ni extensions 2021-11-17 10:58:31 -03:00
14 changed files with 4219 additions and 1 deletion

View File

@@ -17,6 +17,7 @@ go_library(
importpath = "github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil",
visibility = [
"//beacon-chain:__subpackages__",
"//crypto/hash:__subpackages__",
"//proto/migration:__subpackages__",
"//proto/prysm/v1alpha1:__subpackages__",
"//proto/testing:__subpackages__",

View File

@@ -54,6 +54,236 @@ func ValidatorRootWithHasher(hasher ssz.HashFn, validator *ethpb.Validator) ([32
}
return ssz.BitwiseMerkleizeArrays(hasher, fieldRoots, uint64(len(fieldRoots)), uint64(len(fieldRoots)))
}
func merkleizeFlatArrayAVX(vec [][32]byte,
depth uint8,
hasher func([][32]byte, [][32]byte, uint64),
zero_hash_array [][32]byte) [32]byte {
if depth == 0 && len(vec) == 1 {
return vec[0]
}
if len(vec) == 0 {
panic("Can't have empty vec")
}
// allocate the buffer for the full flat tree (sizes are hardcoded since this is benchmark code)
layer := (len(vec) + 1) / 2
length := 0
for {
length += layer - 1
if layer == 1 {
break
}
layer = (layer + 1) / 2
}
length += int(depth)
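// Worked example: with len(vec) == 5 and depth == 3 the loop above visits layers of
// 3, 2 and 1 nodes, so length = 2 + 1 + 0 = 3, and the depth padding adds 3 more slots.
// The resulting 6-entry buffer holds the 3 first-level parents (the odd leaf paired with
// zero_hash_array[0]), the 2 second-level nodes (the odd one paired with zero_hash_array[1])
// and the single depth-3 root that is returned at the end.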
hash_tree := make([][32]byte, length)
first := uint64(0)
height := uint8(1)
last := uint64(len(vec)+1) / 2
if len(vec) > 1 {
hasher(hash_tree, vec, last)
}
if len(vec)%2 == 1 {
hash_tree[last-1] = hash.Hash2ChunksAVX(vec[len(vec)-1], zero_hash_array[0])
}
for {
dist := last - first
if dist < 2 {
break
}
hasher(hash_tree[last:], hash_tree[first:], dist/2)
first = last
last += (dist + 1) / 2
if dist%2 != 0 {
hash_tree[last-1] = hash.Hash2ChunksAVX(hash_tree[first-1], zero_hash_array[height])
}
height++
}
for {
if height >= depth {
break
}
hash_tree[last] = hash.Hash2ChunksAVX(hash_tree[last-1], zero_hash_array[height])
last++
height++
}
return hash_tree[last-1]
}
func merkleizeFlatArrayAVX2(vec [][32]byte,
depth uint8,
hasher func([][32]byte, [][32]byte, uint64),
zero_hash_array [][32]byte) [32]byte {
if depth == 0 && len(vec) == 1 {
return vec[0]
}
if len(vec) == 0 {
panic("Can't have empty vec")
}
// allocate the buffer for the full flat tree (sizes are hardcoded since this is benchmark code)
layer := (len(vec) + 1) / 2
length := 0
for {
length += layer - 1
if layer == 1 {
break
}
layer = (layer + 1) / 2
}
length += int(depth)
hash_tree := make([][32]byte, length)
first := uint64(0)
height := uint8(1)
last := uint64(len(vec)+1) / 2
if len(vec) > 1 {
hasher(hash_tree, vec, last)
}
if len(vec)%2 == 1 {
hash_tree[last-1] = hash.Hash2ChunksAVX2(vec[len(vec)-1], zero_hash_array[0])
}
for {
dist := last - first
if dist < 2 {
break
}
hasher(hash_tree[last:], hash_tree[first:], dist/2)
first = last
last += (dist + 1) / 2
if dist%2 != 0 {
hash_tree[last-1] = hash.Hash2ChunksAVX2(hash_tree[first-1], zero_hash_array[height])
}
height++
}
for {
if height >= depth {
break
}
hash_tree[last] = hash.Hash2ChunksAVX2(hash_tree[last-1], zero_hash_array[height])
last++
height++
}
return hash_tree[last-1]
}
func merkleizeFlatArray(vec [][32]byte,
depth uint8,
hasher func([][32]byte, [][32]byte, uint64),
zero_hash_array [][32]byte) [32]byte {
if depth == 0 && len(vec) == 1 {
return vec[0]
}
if len(vec) == 0 {
panic("Can't have empty vec")
}
// allocate the buffer for the full flat tree (sizes are hardcoded since this is benchmark code)
layer := (len(vec) + 1) / 2
length := 0
for {
length += layer - 1
if layer == 1 {
break
}
layer = (layer + 1) / 2
}
length += int(depth)
hash_tree := make([][32]byte, length)
first := uint64(0)
height := uint8(1)
last := uint64(len(vec)+1) / 2
if len(vec) > 1 {
hasher(hash_tree, vec, last)
}
if len(vec)%2 == 1 {
hash_tree[last-1] = hash.Hash2ChunksShani(vec[len(vec)-1], zero_hash_array[0])
}
for {
dist := last - first
if dist < 2 {
break
}
hasher(hash_tree[last:], hash_tree[first:], dist/2)
first = last
last += (dist + 1) / 2
if dist%2 != 0 {
hash_tree[last-1] = hash.Hash2ChunksShani(hash_tree[first-1], zero_hash_array[height])
}
height++
}
for {
if height >= depth {
break
}
hash_tree[last] = hash.Hash2ChunksShani(hash_tree[last-1], zero_hash_array[height])
last++
height++
}
return hash_tree[last-1]
}
// Uint64ListRootWithRegistryLimitShani computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit. Flat-array implementation
// using the SHA-NI extensions.
func Uint64ListRootWithRegistryLimitShani(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
// assume len(balances) is a multiple of 4 for this benchmark
lenChunks := len(balances) / 4
balancesChunks := make([][32]byte, lenChunks)
for i := 0; i < lenChunks; i++ {
binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
}
balancesRootsRoot := merkleizeFlatArray(balancesChunks, 38, hash.PotuzHasherShaniChunks, zero_hash_array)
return hash.MixinLengthShani(balancesRootsRoot, uint64(len(balances))), nil
}
// Uint64ListRootWithRegistryLimitAVX computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit. Flat-array implementation
// using the AVX extensions.
func Uint64ListRootWithRegistryLimitAVX(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
// assume len(balances) is a multiple of 4 for this benchmark
lenChunks := len(balances) / 4
balancesChunks := make([][32]byte, lenChunks)
for i := 0; i < lenChunks; i++ {
binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
}
balancesRootsRoot := merkleizeFlatArrayAVX(balancesChunks, 38, hash.PotuzHasherAVXChunks, zero_hash_array)
return hash.MixinLengthAVX(balancesRootsRoot, uint64(len(balances))), nil
}
// Uint64ListRootWithRegistryLimitAVX2 computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit. Flat-array implementation
// using the AVX2 extensions.
func Uint64ListRootWithRegistryLimitAVX2(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
// assume len(balances) is a multiple of 4 for this benchmark
lenChunks := len(balances) / 4
balancesChunks := make([][32]byte, lenChunks)
for i := 0; i < lenChunks; i++ {
binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
}
balancesRootsRoot := merkleizeFlatArrayAVX2(balancesChunks, 38, hash.PotuzHasherAVX2Chunks, zero_hash_array)
return hash.MixinLengthAVX2(balancesRootsRoot, uint64(len(balances))), nil
}
// Uint64ListRootWithRegistryLimit computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit.
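Below is a minimal sketch (not part of this diff) of how the flat-array helpers above might be called. It assumes the stateutil import path declared in the BUILD file, a CPU that supports the SHA-NI extensions at runtime, and it builds the zero-hash table with the standard library's sha256 purely for illustration.

package main

import (
    "crypto/sha256"
    "fmt"

    "github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil"
)

func main() {
    // zeroHashes[0] is the all-zero leaf chunk; each higher entry is the root of an
    // all-zero subtree: zeroHashes[i+1] = SHA-256(zeroHashes[i] || zeroHashes[i]).
    // The merkleizers above pad up to depth 38, so 39 entries give some headroom.
    zeroHashes := make([][32]byte, 39)
    for i := 0; i < 38; i++ {
        concat := append(zeroHashes[i][:], zeroHashes[i][:]...)
        zeroHashes[i+1] = sha256.Sum256(concat)
    }

    // The helpers assume len(balances) is a multiple of 4 (four uint64 per 32-byte chunk).
    balances := make([]uint64, 1<<16)
    for i := range balances {
        balances[i] = 32_000_000_000
    }

    root, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zeroHashes)
    if err != nil {
        panic(err)
    }
    fmt.Printf("balances hash tree root: %x\n", root[:])
}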

View File

@@ -1,8 +1,13 @@
load("@prysm//tools/go:def.bzl", "go_library", "go_test")
load("@prysm//crypto/hash:yasm.bzl", "yasm_library")
go_library(
name = "go_default_library",
srcs = ["hash.go"],
srcs = [
"hash.go",
"custom_hasher/hasher.h",
],
cgo = True,
importpath = "github.com/prysmaticlabs/prysm/crypto/hash",
visibility = ["//visibility:public"],
deps = [
@@ -13,6 +18,7 @@ go_library(
"@org_golang_google_protobuf//proto:go_default_library",
"@org_golang_x_crypto//sha3:go_default_library",
],
cdeps = [":custom_hasher"],
)
go_test(
@@ -27,6 +33,29 @@ go_test(
"//proto/testing:go_default_library",
"//testing/assert:go_default_library",
"//testing/require:go_default_library",
"//beacon-chain/state/stateutil:go_default_library",
"@com_github_google_gofuzz//:go_default_library",
],
)
cc_library(
name = "custom_hasher",
srcs = [
"custom_hasher/hasher.h",
],
hdrs = [ "custom_hasher/hasher.h" ],
visibility = ["//visibility:public"],
deps = [ ":asm" ],
)
yasm_library(
name = "asm",
srcs = [
"custom_hasher/assembly/reg_sizes.asm",
"custom_hasher/assembly/sha256_avx_one_block.asm",
"custom_hasher/assembly/sha256_avx.asm",
"custom_hasher/assembly/sha256_avx2.asm",
"custom_hasher/assembly/sha256_shani.asm",
"custom_hasher/assembly/transpose_avx2.asm",
],
)

View File

@@ -0,0 +1,300 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
; define d and w variants for registers
%ifndef _REG_SIZES_ASM_
%define _REG_SIZES_ASM_
%define raxd eax
%define raxw ax
%define raxb al
%define rbxd ebx
%define rbxw bx
%define rbxb bl
%define rcxd ecx
%define rcxw cx
%define rcxb cl
%define rdxd edx
%define rdxw dx
%define rdxb dl
%define rsid esi
%define rsiw si
%define rsib sil
%define rdid edi
%define rdiw di
%define rdib dil
%define rbpd ebp
%define rbpw bp
%define rbpb bpl
%define zmm0x xmm0
%define zmm1x xmm1
%define zmm2x xmm2
%define zmm3x xmm3
%define zmm4x xmm4
%define zmm5x xmm5
%define zmm6x xmm6
%define zmm7x xmm7
%define zmm8x xmm8
%define zmm9x xmm9
%define zmm10x xmm10
%define zmm11x xmm11
%define zmm12x xmm12
%define zmm13x xmm13
%define zmm14x xmm14
%define zmm15x xmm15
%define zmm16x xmm16
%define zmm17x xmm17
%define zmm18x xmm18
%define zmm19x xmm19
%define zmm20x xmm20
%define zmm21x xmm21
%define zmm22x xmm22
%define zmm23x xmm23
%define zmm24x xmm24
%define zmm25x xmm25
%define zmm26x xmm26
%define zmm27x xmm27
%define zmm28x xmm28
%define zmm29x xmm29
%define zmm30x xmm30
%define zmm31x xmm31
%define ymm0x xmm0
%define ymm1x xmm1
%define ymm2x xmm2
%define ymm3x xmm3
%define ymm4x xmm4
%define ymm5x xmm5
%define ymm6x xmm6
%define ymm7x xmm7
%define ymm8x xmm8
%define ymm9x xmm9
%define ymm10x xmm10
%define ymm11x xmm11
%define ymm12x xmm12
%define ymm13x xmm13
%define ymm14x xmm14
%define ymm15x xmm15
%define ymm16x xmm16
%define ymm17x xmm17
%define ymm18x xmm18
%define ymm19x xmm19
%define ymm20x xmm20
%define ymm21x xmm21
%define ymm22x xmm22
%define ymm23x xmm23
%define ymm24x xmm24
%define ymm25x xmm25
%define ymm26x xmm26
%define ymm27x xmm27
%define ymm28x xmm28
%define ymm29x xmm29
%define ymm30x xmm30
%define ymm31x xmm31
%define xmm0x xmm0
%define xmm1x xmm1
%define xmm2x xmm2
%define xmm3x xmm3
%define xmm4x xmm4
%define xmm5x xmm5
%define xmm6x xmm6
%define xmm7x xmm7
%define xmm8x xmm8
%define xmm9x xmm9
%define xmm10x xmm10
%define xmm11x xmm11
%define xmm12x xmm12
%define xmm13x xmm13
%define xmm14x xmm14
%define xmm15x xmm15
%define xmm16x xmm16
%define xmm17x xmm17
%define xmm18x xmm18
%define xmm19x xmm19
%define xmm20x xmm20
%define xmm21x xmm21
%define xmm22x xmm22
%define xmm23x xmm23
%define xmm24x xmm24
%define xmm25x xmm25
%define xmm26x xmm26
%define xmm27x xmm27
%define xmm28x xmm28
%define xmm29x xmm29
%define xmm30x xmm30
%define xmm31x xmm31
%define zmm0y ymm0
%define zmm1y ymm1
%define zmm2y ymm2
%define zmm3y ymm3
%define zmm4y ymm4
%define zmm5y ymm5
%define zmm6y ymm6
%define zmm7y ymm7
%define zmm8y ymm8
%define zmm9y ymm9
%define zmm10y ymm10
%define zmm11y ymm11
%define zmm12y ymm12
%define zmm13y ymm13
%define zmm14y ymm14
%define zmm15y ymm15
%define zmm16y ymm16
%define zmm17y ymm17
%define zmm18y ymm18
%define zmm19y ymm19
%define zmm20y ymm20
%define zmm21y ymm21
%define zmm22y ymm22
%define zmm23y ymm23
%define zmm24y ymm24
%define zmm25y ymm25
%define zmm26y ymm26
%define zmm27y ymm27
%define zmm28y ymm28
%define zmm29y ymm29
%define zmm30y ymm30
%define zmm31y ymm31
%define xmm0y ymm0
%define xmm1y ymm1
%define xmm2y ymm2
%define xmm3y ymm3
%define xmm4y ymm4
%define xmm5y ymm5
%define xmm6y ymm6
%define xmm7y ymm7
%define xmm8y ymm8
%define xmm9y ymm9
%define xmm10y ymm10
%define xmm11y ymm11
%define xmm12y ymm12
%define xmm13y ymm13
%define xmm14y ymm14
%define xmm15y ymm15
%define xmm16y ymm16
%define xmm17y ymm17
%define xmm18y ymm18
%define xmm19y ymm19
%define xmm20y ymm20
%define xmm21y ymm21
%define xmm22y ymm22
%define xmm23y ymm23
%define xmm24y ymm24
%define xmm25y ymm25
%define xmm26y ymm26
%define xmm27y ymm27
%define xmm28y ymm28
%define xmm29y ymm29
%define xmm30y ymm30
%define xmm31y ymm31
%define xmm0z zmm0
%define xmm1z zmm1
%define xmm2z zmm2
%define xmm3z zmm3
%define xmm4z zmm4
%define xmm5z zmm5
%define xmm6z zmm6
%define xmm7z zmm7
%define xmm8z zmm8
%define xmm9z zmm9
%define xmm10z zmm10
%define xmm11z zmm11
%define xmm12z zmm12
%define xmm13z zmm13
%define xmm14z zmm14
%define xmm15z zmm15
%define xmm16z zmm16
%define xmm17z zmm17
%define xmm18z zmm18
%define xmm19z zmm19
%define xmm20z zmm20
%define xmm21z zmm21
%define xmm22z zmm22
%define xmm23z zmm23
%define xmm24z zmm24
%define xmm25z zmm25
%define xmm26z zmm26
%define xmm27z zmm27
%define xmm28z zmm28
%define xmm29z zmm29
%define xmm30z zmm30
%define xmm31z zmm31
%define ymm0z zmm0
%define ymm1z zmm1
%define ymm2z zmm2
%define ymm3z zmm3
%define ymm4z zmm4
%define ymm5z zmm5
%define ymm6z zmm6
%define ymm7z zmm7
%define ymm8z zmm8
%define ymm9z zmm9
%define ymm10z zmm10
%define ymm11z zmm11
%define ymm12z zmm12
%define ymm13z zmm13
%define ymm14z zmm14
%define ymm15z zmm15
%define ymm16z zmm16
%define ymm17z zmm17
%define ymm18z zmm18
%define ymm19z zmm19
%define ymm20z zmm20
%define ymm21z zmm21
%define ymm22z zmm22
%define ymm23z zmm23
%define ymm24z zmm24
%define ymm25z zmm25
%define ymm26z zmm26
%define ymm27z zmm27
%define ymm28z zmm28
%define ymm29z zmm29
%define ymm30z zmm30
%define ymm31z zmm31
%define DWORD(reg) reg %+ d
%define WORD(reg) reg %+ w
%define BYTE(reg) reg %+ b
%define XWORD(reg) reg %+ x
%define YWORD(reg) reg %+ y
%define ZWORD(reg) reg %+ z
%endif ;; _REG_SIZES_ASM_

View File

@@ -0,0 +1,612 @@
;; sha256_avx.asm
; *
; * This file is part of Mammon.
; * mammon is a greedy and selfish ETH consensus client.
; *
; * Copyright (c) 2021 - Reimundo Heluani (potuz) potuz@potuz.net
; *
; * This program is free software: you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation, either version 3 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;
; This is an implementation optimized for 64-byte inputs, based on Intel's code
; whose copyright follows
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
;; code to compute quad SHA256 using AVX
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG
;; Stack must be aligned to 16 bytes before call
;; Windows clobbers: rax rdx r8 r9 r10 r11
;; Windows preserves: rcx rsi rdi rbp r12 r13 r14 r15
;;
;; Linux clobbers: rax rsi r8 r9 r10 r11
;; Linux preserves: rcx rdx rdi rbp r12 r13 r14 r15
;;
;; clobbers xmm0-15
extern sha256_1_avx
%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define NUM_BLKS r8 ; 3rd arg
%define TBL rsi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define NUM_BLKS rdx ; 3rd arg
%define TBL rcx
%endif
%define ROUND rbx
%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11
%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7
%define a0 xmm8
%define a1 xmm9
%define a2 xmm10
%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9
%define T1 xmm14
%define TMP xmm15
%define SHA256_DIGEST_WORD_SIZE 4
%define NUM_SHA256_DIGEST_WORDS 8
%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
%define ROUNDS 64*SZ4
; Define stack usage
struc STACK
_DATA: resb SZ4 * 16
_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS
_RBX: resb 8
resb 16
endstruc
%define VMOVPS vmovups
; transpose r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0,t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
%endmacro
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
vpslld %%tmp, %%reg, (32-(%%imm))
vpsrld %%reg, %%reg, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
;vmovdqa %%tmp, %%reg
vpslld %%tmp, %%src, (32-(%%imm))
vpsrld %%reg, %%src, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; PRORD dst/src, amt
%macro PRORD 2
PRORD %1, %2, TMP
%endmacro
; PRORD_nd dst, src, amt
%macro PRORD_nd 3
PRORD_nd %1, %3, TMP, %2
%endmacro
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i %2
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1
vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ4 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i %2
vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA]
vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA]
vmovdqa a0, %%T1
PRORD %%T1, 18-7
vmovdqa a2, a1
PRORD a1, 19-17
vpxor %%T1, %%T1, a0
PRORD %%T1, 7
vpxor a1, a1, a2
PRORD a1, 17
vpsrld a0, a0, 3
vpxor %%T1, %%T1, a0
vpsrld a2, a2, 10
vpxor a1, a1, a2
vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA]
vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA]
vpaddd %%T1, %%T1, a1
ROUND_00_15 %%T1, %%i
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro PADDING_ROUND_00_15 1
%define %%T1 %1
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ4 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
section .data
default rel
align 64
K256_4:
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
dq 0x7137449171374491, 0x7137449171374491
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x243185be243185be, 0x243185be243185be
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
dq 0x76f988da76f988da, 0x76f988da76f988da
dq 0x983e5152983e5152, 0x983e5152983e5152
dq 0xa831c66da831c66d, 0xa831c66da831c66d
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
dq 0x06ca635106ca6351, 0x06ca635106ca6351
dq 0x1429296714292967, 0x1429296714292967
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
dq 0x53380d1353380d13, 0x53380d1353380d13
dq 0x650a7354650a7354, 0x650a7354650a7354
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
dq 0x92722c8592722c85, 0x92722c8592722c85
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
dq 0xd192e819d192e819, 0xd192e819d192e819
dq 0xd6990624d6990624, 0xd6990624d6990624
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
dq 0x106aa070106aa070, 0x106aa070106aa070
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
dq 0x1e376c081e376c08, 0x1e376c081e376c08
dq 0x2748774c2748774c, 0x2748774c2748774c
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
dq 0x84c8781484c87814, 0x84c8781484c87814
dq 0x8cc702088cc70208, 0x8cc702088cc70208
dq 0x90befffa90befffa, 0x90befffa90befffa
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
PADDING_4:
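; PADDING_4 holds the precomputed K[t] + W[t] values for the single fixed SHA-256
; padding block of a 64-byte message (0x80 byte, zeros, 512-bit length), so the
; padding rounds skip the message schedule entirely;
; e.g. 0xc28a2f98 = 0x428a2f98 + 0x80000000.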
dq 0xc28a2f98c28a2f98, 0xc28a2f98c28a2f98
dq 0x7137449171374491, 0x7137449171374491
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x243185be243185be, 0x243185be243185be
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0xc19bf374c19bf374, 0xc19bf374c19bf374
dq 0x649b69c1649b69c1, 0x649b69c1649b69c1
dq 0xf0fe4786f0fe4786, 0xf0fe4786f0fe4786
dq 0x0fe1edc60fe1edc6, 0x0fe1edc60fe1edc6
dq 0x240cf254240cf254, 0x240cf254240cf254
dq 0x4fe9346f4fe9346f, 0x4fe9346f4fe9346f
dq 0x6cc984be6cc984be, 0x6cc984be6cc984be
dq 0x61b9411e61b9411e, 0x61b9411e61b9411e
dq 0x16f988fa16f988fa, 0x16f988fa16f988fa
dq 0xf2c65152f2c65152, 0xf2c65152f2c65152
dq 0xa88e5a6da88e5a6d, 0xa88e5a6da88e5a6d
dq 0xb019fc65b019fc65, 0xb019fc65b019fc65
dq 0xb9d99ec7b9d99ec7, 0xb9d99ec7b9d99ec7
dq 0x9a1231c39a1231c3, 0x9a1231c39a1231c3
dq 0xe70eeaa0e70eeaa0, 0xe70eeaa0e70eeaa0
dq 0xfdb1232bfdb1232b, 0xfdb1232bfdb1232b
dq 0xc7353eb0c7353eb0, 0xc7353eb0c7353eb0
dq 0x3069bad53069bad5, 0x3069bad53069bad5
dq 0xcb976d5fcb976d5f, 0xcb976d5fcb976d5f
dq 0x5a0f118f5a0f118f, 0x5a0f118f5a0f118f
dq 0xdc1eeefddc1eeefd, 0xdc1eeefddc1eeefd
dq 0x0a35b6890a35b689, 0x0a35b6890a35b689
dq 0xde0b7a04de0b7a04, 0xde0b7a04de0b7a04
dq 0x58f4ca9d58f4ca9d, 0x58f4ca9d58f4ca9d
dq 0xe15d5b16e15d5b16, 0xe15d5b16e15d5b16
dq 0x007f3e86007f3e86, 0x007f3e86007f3e86
dq 0x3708898037088980, 0x3708898037088980
dq 0xa507ea32a507ea32, 0xa507ea32a507ea32
dq 0x6fab95376fab9537, 0x6fab95376fab9537
dq 0x1740611017406110, 0x1740611017406110
dq 0x0d8cd6f10d8cd6f1, 0x0d8cd6f10d8cd6f1
dq 0xcdaa3b6dcdaa3b6d, 0xcdaa3b6dcdaa3b6d
dq 0xc0bbbe37c0bbbe37, 0xc0bbbe37c0bbbe37
dq 0x83613bda83613bda, 0x83613bda83613bda
dq 0xdb48a363db48a363, 0xdb48a363db48a363
dq 0x0b02e9310b02e931, 0x0b02e9310b02e931
dq 0x6fd15ca76fd15ca7, 0x6fd15ca76fd15ca7
dq 0x521afaca521afaca, 0x521afaca521afaca
dq 0x3133843131338431, 0x3133843131338431
dq 0x6ed41a956ed41a95, 0x6ed41a956ed41a95
dq 0x6d4378906d437890, 0x6d4378906d437890
dq 0xc39c91f2c39c91f2, 0xc39c91f2c39c91f2
dq 0x9eccabbd9eccabbd, 0x9eccabbd9eccabbd
dq 0xb5c9a0e6b5c9a0e6, 0xb5c9a0e6b5c9a0e6
dq 0x532fb63c532fb63c, 0x532fb63c532fb63c
dq 0xd2c741c6d2c741c6, 0xd2c741c6d2c741c6
dq 0x07237ea307237ea3, 0x07237ea307237ea3
dq 0xa4954b68a4954b68, 0xa4954b68a4954b68
dq 0x4c191d764c191d76, 0x4c191d764c191d76
DIGEST_4:
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
PSHUFFLE_BYTE_FLIP_MASK:
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
section .text
global sha256_4_avx:function
align 16
sha256_4_avx:
endbranch64
; outer calling routine saves all the XMM registers
sub rsp, STACK_size
mov [rsp + _RBX],rbx
.hash_4_blocks:
cmp NUM_BLKS, 4
jl .hash_1_block
xor ROUND, ROUND
;; Load the pre-transposed incoming digest.
lea TBL,[rel DIGEST_4]
vmovdqa a,[TBL + 0*SZ4]
vmovdqa b,[TBL + 1*SZ4]
vmovdqa c,[TBL + 2*SZ4]
vmovdqa d,[TBL + 3*SZ4]
vmovdqa e,[TBL + 4*SZ4]
vmovdqa f,[TBL + 5*SZ4]
vmovdqa g,[TBL + 6*SZ4]
vmovdqa h,[TBL + 7*SZ4]
lea TBL,[rel K256_4]
%assign i 0
%rep 4
vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
VMOVPS TT2,[DATA_PTR + 0*64 + i*16]
VMOVPS TT1,[DATA_PTR + 1*64 + i*16]
VMOVPS TT4,[DATA_PTR + 2*64 + i*16]
VMOVPS TT3,[DATA_PTR + 3*64 + i*16]
TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
vpshufb TT0, TT0, TMP
vpshufb TT1, TT1, TMP
vpshufb TT2, TT2, TMP
vpshufb TT3, TT3, TMP
ROUND_00_15 TT0,(i*4+0)
ROUND_00_15 TT1,(i*4+1)
ROUND_00_15 TT2,(i*4+2)
ROUND_00_15 TT3,(i*4+3)
%assign i (i+1)
%endrep
%assign i (i*4)
jmp .Lrounds_16_xx
align 16
.Lrounds_16_xx:
%rep 16
ROUND_16_XX T1, i
%assign i (i+1)
%endrep
cmp ROUND,ROUNDS
jb .Lrounds_16_xx
;; add old digest
lea TBL,[rel DIGEST_4]
vpaddd a, a, [TBL + 0*SZ4]
vpaddd b, b, [TBL + 1*SZ4]
vpaddd c, c, [TBL + 2*SZ4]
vpaddd d, d, [TBL + 3*SZ4]
vpaddd e, e, [TBL + 4*SZ4]
vpaddd f, f, [TBL + 5*SZ4]
vpaddd g, g, [TBL + 6*SZ4]
vpaddd h, h, [TBL + 7*SZ4]
;; rounds with padding
;; save old digest
vmovdqa [rsp + _DIGEST + 0*SZ4], a
vmovdqa [rsp + _DIGEST + 1*SZ4], b
vmovdqa [rsp + _DIGEST + 2*SZ4], c
vmovdqa [rsp + _DIGEST + 3*SZ4], d
vmovdqa [rsp + _DIGEST + 4*SZ4], e
vmovdqa [rsp + _DIGEST + 5*SZ4], f
vmovdqa [rsp + _DIGEST + 6*SZ4], g
vmovdqa [rsp + _DIGEST + 7*SZ4], h
lea TBL,[rel PADDING_4]
xor ROUND,ROUND
jmp .Lrounds_padding
align 16
.Lrounds_padding:
%rep 64
PADDING_ROUND_00_15 T1
%endrep
;; add old digest
vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
;; transpose the digest and convert to little endian to get the registers correctly
TRANSPOSE a, b, c, d, TT0, TT1
TRANSPOSE e, f, g, h, TT2, TT1
vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
vpshufb TT0, TMP
vpshufb TT2, TMP
vpshufb b, TMP
vpshufb f, TMP
vpshufb a, TMP
vpshufb e, TMP
vpshufb d, TMP
vpshufb h, TMP
;; write to output
vmovdqu [OUTPUT_PTR + 0*SZ4],TT0
vmovdqu [OUTPUT_PTR + 1*SZ4],TT2
vmovdqu [OUTPUT_PTR + 2*SZ4],b
vmovdqu [OUTPUT_PTR + 3*SZ4],f
vmovdqu [OUTPUT_PTR + 4*SZ4],a
vmovdqu [OUTPUT_PTR + 5*SZ4],e
vmovdqu [OUTPUT_PTR + 6*SZ4],d
vmovdqu [OUTPUT_PTR + 7*SZ4],h
; update pointers and loop
add DATA_PTR, 64*4
add OUTPUT_PTR, 32*4
sub NUM_BLKS, 4
jmp .hash_4_blocks
.hash_1_block:
test NUM_BLKS,NUM_BLKS
jz .done_hash
call sha256_1_avx
add DATA_PTR, 64
add OUTPUT_PTR, 32
dec NUM_BLKS
jmp .hash_1_block
.done_hash:
mov rbx,[rsp + _RBX]
add rsp, STACK_size
ret
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

View File

@@ -0,0 +1,797 @@
;; sha256_avx2.asm
; *
; * This file is part of Mammon.
; * mammon is a greedy and selfish ETH consensus client.
; *
; * Copyright (c) 2021 - Reimundo Heluani (potuz) potuz@potuz.net
; *
; * This program is free software: you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation, either version 3 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;
; This is an implementation optimized for 64-byte inputs, based on Intel's code
; whose copyright follows
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
;; code to compute oct SHA256 using SSE-256
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG
;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; ymm0-15
;; Stack must be aligned to 32 bytes before call
;; Windows clobbers: rax rdx rsi rdi r8 r9 r10 r11 r12 r13 r14
;; Windows preserves: rcx rbp r15
;;
;; Linux clobbers: rax rcx rdx rsi r8 r9 r10 r11 r12 r13 r14
;; Linux preserves: rdi rbp r15
;;
;; clobbers ymm0-15
%include "transpose_avx2.asm"
extern sha256_4_avx
section .data
default rel
align 64
K256_8:
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
dq 0x7137449171374491, 0x7137449171374491
dq 0x7137449171374491, 0x7137449171374491
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x243185be243185be, 0x243185be243185be
dq 0x243185be243185be, 0x243185be243185be
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
dq 0x76f988da76f988da, 0x76f988da76f988da
dq 0x76f988da76f988da, 0x76f988da76f988da
dq 0x983e5152983e5152, 0x983e5152983e5152
dq 0x983e5152983e5152, 0x983e5152983e5152
dq 0xa831c66da831c66d, 0xa831c66da831c66d
dq 0xa831c66da831c66d, 0xa831c66da831c66d
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
dq 0x06ca635106ca6351, 0x06ca635106ca6351
dq 0x06ca635106ca6351, 0x06ca635106ca6351
dq 0x1429296714292967, 0x1429296714292967
dq 0x1429296714292967, 0x1429296714292967
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
dq 0x53380d1353380d13, 0x53380d1353380d13
dq 0x53380d1353380d13, 0x53380d1353380d13
dq 0x650a7354650a7354, 0x650a7354650a7354
dq 0x650a7354650a7354, 0x650a7354650a7354
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
dq 0x92722c8592722c85, 0x92722c8592722c85
dq 0x92722c8592722c85, 0x92722c8592722c85
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
dq 0xd192e819d192e819, 0xd192e819d192e819
dq 0xd192e819d192e819, 0xd192e819d192e819
dq 0xd6990624d6990624, 0xd6990624d6990624
dq 0xd6990624d6990624, 0xd6990624d6990624
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
dq 0x106aa070106aa070, 0x106aa070106aa070
dq 0x106aa070106aa070, 0x106aa070106aa070
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
dq 0x1e376c081e376c08, 0x1e376c081e376c08
dq 0x1e376c081e376c08, 0x1e376c081e376c08
dq 0x2748774c2748774c, 0x2748774c2748774c
dq 0x2748774c2748774c, 0x2748774c2748774c
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
dq 0x84c8781484c87814, 0x84c8781484c87814
dq 0x84c8781484c87814, 0x84c8781484c87814
dq 0x8cc702088cc70208, 0x8cc702088cc70208
dq 0x8cc702088cc70208, 0x8cc702088cc70208
dq 0x90befffa90befffa, 0x90befffa90befffa
dq 0x90befffa90befffa, 0x90befffa90befffa
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
PADDING_8:
ddq 0xc28a2f98c28a2f98c28a2f98c28a2f98
ddq 0xc28a2f98c28a2f98c28a2f98c28a2f98
ddq 0x71374491713744917137449171374491
ddq 0x71374491713744917137449171374491
ddq 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
ddq 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
ddq 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
ddq 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
ddq 0x3956c25b3956c25b3956c25b3956c25b
ddq 0x3956c25b3956c25b3956c25b3956c25b
ddq 0x59f111f159f111f159f111f159f111f1
ddq 0x59f111f159f111f159f111f159f111f1
ddq 0x923f82a4923f82a4923f82a4923f82a4
ddq 0x923f82a4923f82a4923f82a4923f82a4
ddq 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
ddq 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
ddq 0xd807aa98d807aa98d807aa98d807aa98
ddq 0xd807aa98d807aa98d807aa98d807aa98
ddq 0x12835b0112835b0112835b0112835b01
ddq 0x12835b0112835b0112835b0112835b01
ddq 0x243185be243185be243185be243185be
ddq 0x243185be243185be243185be243185be
ddq 0x550c7dc3550c7dc3550c7dc3550c7dc3
ddq 0x550c7dc3550c7dc3550c7dc3550c7dc3
ddq 0x72be5d7472be5d7472be5d7472be5d74
ddq 0x72be5d7472be5d7472be5d7472be5d74
ddq 0x80deb1fe80deb1fe80deb1fe80deb1fe
ddq 0x80deb1fe80deb1fe80deb1fe80deb1fe
ddq 0x9bdc06a79bdc06a79bdc06a79bdc06a7
ddq 0x9bdc06a79bdc06a79bdc06a79bdc06a7
ddq 0xc19bf374c19bf374c19bf374c19bf374
ddq 0xc19bf374c19bf374c19bf374c19bf374
ddq 0x649b69c1649b69c1649b69c1649b69c1
ddq 0x649b69c1649b69c1649b69c1649b69c1
ddq 0xf0fe4786f0fe4786f0fe4786f0fe4786
ddq 0xf0fe4786f0fe4786f0fe4786f0fe4786
ddq 0x0fe1edc60fe1edc60fe1edc60fe1edc6
ddq 0x0fe1edc60fe1edc60fe1edc60fe1edc6
ddq 0x240cf254240cf254240cf254240cf254
ddq 0x240cf254240cf254240cf254240cf254
ddq 0x4fe9346f4fe9346f4fe9346f4fe9346f
ddq 0x4fe9346f4fe9346f4fe9346f4fe9346f
ddq 0x6cc984be6cc984be6cc984be6cc984be
ddq 0x6cc984be6cc984be6cc984be6cc984be
ddq 0x61b9411e61b9411e61b9411e61b9411e
ddq 0x61b9411e61b9411e61b9411e61b9411e
ddq 0x16f988fa16f988fa16f988fa16f988fa
ddq 0x16f988fa16f988fa16f988fa16f988fa
ddq 0xf2c65152f2c65152f2c65152f2c65152
ddq 0xf2c65152f2c65152f2c65152f2c65152
ddq 0xa88e5a6da88e5a6da88e5a6da88e5a6d
ddq 0xa88e5a6da88e5a6da88e5a6da88e5a6d
ddq 0xb019fc65b019fc65b019fc65b019fc65
ddq 0xb019fc65b019fc65b019fc65b019fc65
ddq 0xb9d99ec7b9d99ec7b9d99ec7b9d99ec7
ddq 0xb9d99ec7b9d99ec7b9d99ec7b9d99ec7
ddq 0x9a1231c39a1231c39a1231c39a1231c3
ddq 0x9a1231c39a1231c39a1231c39a1231c3
ddq 0xe70eeaa0e70eeaa0e70eeaa0e70eeaa0
ddq 0xe70eeaa0e70eeaa0e70eeaa0e70eeaa0
ddq 0xfdb1232bfdb1232bfdb1232bfdb1232b
ddq 0xfdb1232bfdb1232bfdb1232bfdb1232b
ddq 0xc7353eb0c7353eb0c7353eb0c7353eb0
ddq 0xc7353eb0c7353eb0c7353eb0c7353eb0
ddq 0x3069bad53069bad53069bad53069bad5
ddq 0x3069bad53069bad53069bad53069bad5
ddq 0xcb976d5fcb976d5fcb976d5fcb976d5f
ddq 0xcb976d5fcb976d5fcb976d5fcb976d5f
ddq 0x5a0f118f5a0f118f5a0f118f5a0f118f
ddq 0x5a0f118f5a0f118f5a0f118f5a0f118f
ddq 0xdc1eeefddc1eeefddc1eeefddc1eeefd
ddq 0xdc1eeefddc1eeefddc1eeefddc1eeefd
ddq 0x0a35b6890a35b6890a35b6890a35b689
ddq 0x0a35b6890a35b6890a35b6890a35b689
ddq 0xde0b7a04de0b7a04de0b7a04de0b7a04
ddq 0xde0b7a04de0b7a04de0b7a04de0b7a04
ddq 0x58f4ca9d58f4ca9d58f4ca9d58f4ca9d
ddq 0x58f4ca9d58f4ca9d58f4ca9d58f4ca9d
ddq 0xe15d5b16e15d5b16e15d5b16e15d5b16
ddq 0xe15d5b16e15d5b16e15d5b16e15d5b16
ddq 0x007f3e86007f3e86007f3e86007f3e86
ddq 0x007f3e86007f3e86007f3e86007f3e86
ddq 0x37088980370889803708898037088980
ddq 0x37088980370889803708898037088980
ddq 0xa507ea32a507ea32a507ea32a507ea32
ddq 0xa507ea32a507ea32a507ea32a507ea32
ddq 0x6fab95376fab95376fab95376fab9537
ddq 0x6fab95376fab95376fab95376fab9537
ddq 0x17406110174061101740611017406110
ddq 0x17406110174061101740611017406110
ddq 0x0d8cd6f10d8cd6f10d8cd6f10d8cd6f1
ddq 0x0d8cd6f10d8cd6f10d8cd6f10d8cd6f1
ddq 0xcdaa3b6dcdaa3b6dcdaa3b6dcdaa3b6d
ddq 0xcdaa3b6dcdaa3b6dcdaa3b6dcdaa3b6d
ddq 0xc0bbbe37c0bbbe37c0bbbe37c0bbbe37
ddq 0xc0bbbe37c0bbbe37c0bbbe37c0bbbe37
ddq 0x83613bda83613bda83613bda83613bda
ddq 0x83613bda83613bda83613bda83613bda
ddq 0xdb48a363db48a363db48a363db48a363
ddq 0xdb48a363db48a363db48a363db48a363
ddq 0x0b02e9310b02e9310b02e9310b02e931
ddq 0x0b02e9310b02e9310b02e9310b02e931
ddq 0x6fd15ca76fd15ca76fd15ca76fd15ca7
ddq 0x6fd15ca76fd15ca76fd15ca76fd15ca7
ddq 0x521afaca521afaca521afaca521afaca
ddq 0x521afaca521afaca521afaca521afaca
ddq 0x31338431313384313133843131338431
ddq 0x31338431313384313133843131338431
ddq 0x6ed41a956ed41a956ed41a956ed41a95
ddq 0x6ed41a956ed41a956ed41a956ed41a95
ddq 0x6d4378906d4378906d4378906d437890
ddq 0x6d4378906d4378906d4378906d437890
ddq 0xc39c91f2c39c91f2c39c91f2c39c91f2
ddq 0xc39c91f2c39c91f2c39c91f2c39c91f2
ddq 0x9eccabbd9eccabbd9eccabbd9eccabbd
ddq 0x9eccabbd9eccabbd9eccabbd9eccabbd
ddq 0xb5c9a0e6b5c9a0e6b5c9a0e6b5c9a0e6
ddq 0xb5c9a0e6b5c9a0e6b5c9a0e6b5c9a0e6
ddq 0x532fb63c532fb63c532fb63c532fb63c
ddq 0x532fb63c532fb63c532fb63c532fb63c
ddq 0xd2c741c6d2c741c6d2c741c6d2c741c6
ddq 0xd2c741c6d2c741c6d2c741c6d2c741c6
ddq 0x07237ea307237ea307237ea307237ea3
ddq 0x07237ea307237ea307237ea307237ea3
ddq 0xa4954b68a4954b68a4954b68a4954b68
ddq 0xa4954b68a4954b68a4954b68a4954b68
ddq 0x4c191d764c191d764c191d764c191d76
ddq 0x4c191d764c191d764c191d764c191d76
DIGEST_8:
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
PSHUFFLE_BYTE_FLIP_MASK:
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
STACK_ALIGNMENT_MASK:
dq 0xffffffffffffffe0
section .text
%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define NUM_BLKS r8 ; 3rd arg
%define TBL rsi
%define reg1 rdi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define NUM_BLKS rdx ; 3rd arg
%define TBL rcx
%define reg1 r8
%endif
%define ROUND rax
%define inp0 r9
%define inp1 r10
%define inp2 r11
%define inp3 r12
%define inp4 r13
%define inp5 r14
%define inp6 reg1
%define inp7 reg2
; ymm0 a
; ymm1 b
; ymm2 c
; ymm3 d
; ymm4 e
; ymm5 f
; ymm6 g TMP0
; ymm7 h TMP1
; ymm8 T1 TT0
; ymm9 TT1
; ymm10 TT2
; ymm11 TT3
; ymm12 a0 TT4
; ymm13 a1 TT5
; ymm14 a2 TT6
; ymm15 TMP TT7
%define a ymm0
%define b ymm1
%define c ymm2
%define d ymm3
%define e ymm4
%define f ymm5
%define g ymm6
%define h ymm7
%define T1 ymm8
%define a0 ymm12
%define a1 ymm13
%define a2 ymm14
%define TMP ymm15
%define TMP0 ymm6
%define TMP1 ymm7
%define TT0 ymm8
%define TT1 ymm9
%define TT2 ymm10
%define TT3 ymm11
%define TT4 ymm12
%define TT5 ymm13
%define TT6 ymm14
%define TT7 ymm15
%define SHA256_DIGEST_WORD_SIZE 4;
%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
%define ROUNDS 64*SZ8
; Define stack usage
;; Assume stack aligned to 32 bytes before call
;; Therefore FRAMESZ mod 32 must be 32-8 = 24
struc stack_frame
.data resb 16*SZ8
.digest resb 8*SZ8
.ytmp resb 4*SZ8
.regsave resb 4*64
endstruc
%define FRAMESZ stack_frame_size
%define _DIGEST stack_frame.digest
%define _YTMP stack_frame.ytmp
%define _RSAVE stack_frame.regsave
%define YTMP0 rsp + _YTMP + 0*SZ8
%define YTMP1 rsp + _YTMP + 1*SZ8
%define YTMP2 rsp + _YTMP + 2*SZ8
%define YTMP3 rsp + _YTMP + 3*SZ8
%define R12 rsp + _RSAVE + 0*64
%define R13 rsp + _RSAVE + 1*64
%define R14 rsp + _RSAVE + 2*64
%define R15 rsp + _RSAVE + 3*64
%define VMOVPS vmovups
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
vpslld %%tmp, %%reg, (32-(%%imm))
vpsrld %%reg, %%reg, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
;vmovdqa %%tmp, %%reg
vpslld %%tmp, %%src, (32-(%%imm))
vpsrld %%reg, %%src, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; PRORD dst/src, amt
%macro PRORD 2
PRORD %1, %2, TMP
%endmacro
; PRORD_nd dst, src, amt
%macro PRORD_nd 3
PRORD_nd %1, %3, TMP, %2
%endmacro
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i %2
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1 ; save current temp message
vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ8 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i %2
vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
vmovdqa a0, %%T1
PRORD %%T1, 18-7
vmovdqa a2, a1
PRORD a1, 19-17
vpxor %%T1, %%T1, a0
PRORD %%T1, 7
vpxor a1, a1, a2
PRORD a1, 17
vpsrld a0, a0, 3
vpxor %%T1, %%T1, a0
vpsrld a2, a2, 10
vpxor a1, a1, a2
vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] ; + W[i-16]
vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] ; + W[i-7]
vpaddd %%T1, %%T1, a1
ROUND_00_15 %%T1, %%i
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro PADDING_ROUND_00_15 1
%define %%T1 %1
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ8 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
global sha256_8_avx2:function
align 16
sha256_8_avx2:
endbranch64
; outer calling routine saves all the XMM registers
push rbp
mov rbp,rsp
and rsp, [rel STACK_ALIGNMENT_MASK]
sub rsp, FRAMESZ
mov [R12], r12
mov [R13], r13
mov [R14], r14
mov [R15], r15
.hash_8_blocks:
cmp NUM_BLKS, 8
jl .hash_4_blocks
xor ROUND, ROUND
lea TBL,[rel DIGEST_8]
vmovdqa a,[TBL + 0*32]
vmovdqa b,[TBL + 1*32]
vmovdqa c,[TBL + 2*32]
vmovdqa d,[TBL + 3*32]
vmovdqa e,[TBL + 4*32]
vmovdqa f,[TBL + 5*32]
vmovdqa g,[TBL + 6*32]
vmovdqa h,[TBL + 7*32]
lea TBL,[rel K256_8]
%assign i 0
%rep 2
TRANSPOSE8_U32_LOAD8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, \
DATA_PTR + 0*64, \
DATA_PTR + 1*64, \
DATA_PTR + 2*64, \
DATA_PTR + 3*64, \
DATA_PTR + 4*64, \
DATA_PTR + 5*64, \
DATA_PTR + 6*64, \
DATA_PTR + 7*64, \
i*32
vmovdqa [YTMP0], g
vmovdqa [YTMP1], h
TRANSPOSE8_U32_PRELOADED TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
vmovdqa TMP1, [rel PSHUFFLE_BYTE_FLIP_MASK]
vmovdqa g, [YTMP0]
vpshufb TT0, TT0, TMP1
vpshufb TT1, TT1, TMP1
vpshufb TT2, TT2, TMP1
vpshufb TT3, TT3, TMP1
vpshufb TT4, TT4, TMP1
vpshufb TT5, TT5, TMP1
vpshufb TT6, TT6, TMP1
vpshufb TT7, TT7, TMP1
vmovdqa h, [YTMP1]
vmovdqa [YTMP0], TT4
vmovdqa [YTMP1], TT5
vmovdqa [YTMP2], TT6
vmovdqa [YTMP3], TT7
ROUND_00_15 TT0,(i*8+0)
vmovdqa TT0, [YTMP0]
ROUND_00_15 TT1,(i*8+1)
vmovdqa TT1, [YTMP1]
ROUND_00_15 TT2,(i*8+2)
vmovdqa TT2, [YTMP2]
ROUND_00_15 TT3,(i*8+3)
vmovdqa TT3, [YTMP3]
ROUND_00_15 TT0,(i*8+4)
ROUND_00_15 TT1,(i*8+5)
ROUND_00_15 TT2,(i*8+6)
ROUND_00_15 TT3,(i*8+7)
%assign i (i+1)
%endrep
%assign i (i*8)
jmp .Lrounds_16_xx
align 16
.Lrounds_16_xx:
%rep 16
ROUND_16_XX T1, i
%assign i (i+1)
%endrep
cmp ROUND,ROUNDS
jb .Lrounds_16_xx
;; add old digest
lea TBL,[rel DIGEST_8]
vpaddd a, a, [TBL + 0*SZ8]
vpaddd b, b, [TBL + 1*SZ8]
vpaddd c, c, [TBL + 2*SZ8]
vpaddd d, d, [TBL + 3*SZ8]
vpaddd e, e, [TBL + 4*SZ8]
vpaddd f, f, [TBL + 5*SZ8]
vpaddd g, g, [TBL + 6*SZ8]
vpaddd h, h, [TBL + 7*SZ8]
;; rounds with padding
;; save old digest
vmovdqa [rsp + _DIGEST + 0*SZ8], a
vmovdqa [rsp + _DIGEST + 1*SZ8], b
vmovdqa [rsp + _DIGEST + 2*SZ8], c
vmovdqa [rsp + _DIGEST + 3*SZ8], d
vmovdqa [rsp + _DIGEST + 4*SZ8], e
vmovdqa [rsp + _DIGEST + 5*SZ8], f
vmovdqa [rsp + _DIGEST + 6*SZ8], g
vmovdqa [rsp + _DIGEST + 7*SZ8], h
lea TBL,[rel PADDING_8]
xor ROUND,ROUND
jmp .Lrounds_padding
align 16
.Lrounds_padding:
%rep 64
PADDING_ROUND_00_15 T1
%endrep
;; add old digest
vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
;; transpose the digest and convert to little endian to get the registers correctly
TRANSPOSE8_U32 a, b, c, d, e, f, g, h, TT0, TT1
vmovdqa TT0, [rel PSHUFFLE_BYTE_FLIP_MASK]
vpshufb a, a, TT0
vpshufb b, b, TT0
vpshufb c, c, TT0
vpshufb d, d, TT0
vpshufb e, e, TT0
vpshufb f, f, TT0
vpshufb g, g, TT0
vpshufb h, h, TT0
;; write to output
vmovdqu [OUTPUT_PTR + 0*32],a
vmovdqu [OUTPUT_PTR + 1*32],b
vmovdqu [OUTPUT_PTR + 2*32],c
vmovdqu [OUTPUT_PTR + 3*32],d
vmovdqu [OUTPUT_PTR + 4*32],e
vmovdqu [OUTPUT_PTR + 5*32],f
vmovdqu [OUTPUT_PTR + 6*32],g
vmovdqu [OUTPUT_PTR + 7*32],h
; update pointers and loop
add DATA_PTR, 64*8
add OUTPUT_PTR, 32*8
sub NUM_BLKS, 8
jmp .hash_8_blocks
.hash_4_blocks:
call sha256_4_avx
mov r12,[R12]
mov r13,[R13]
mov r14,[R14]
mov r15,[R15]
mov rsp,rbp
pop rbp
ret
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

View File

@@ -0,0 +1,646 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .data
default rel
align 64
K256:
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
DIGEST:
dd 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
dd 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
PADDING:
dd 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
dd 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
dd 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
dd 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374
dd 0x649b69c1, 0xf0fe4786, 0xfe1edc6, 0x240cf254
dd 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa
dd 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7
dd 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0
dd 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd
dd 0xa35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16
dd 0x7f3e86, 0x37088980, 0xa507ea32, 0x6fab9537
dd 0x17406110, 0xd8cd6f1, 0xcdaa3b6d, 0xc0bbbe37
dd 0x83613bda, 0xdb48a363, 0xb02e931, 0x6fd15ca7
dd 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890
dd 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c
dd 0xd2c741c6, 0x7237ea3, 0xa4954b68, 0x4c191d76
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
section .text
%define VMOVDQ vmovdqu ;; assume buffers not aligned
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
%macro MY_ROR 2
shld %1,%1,(32-(%2))
%endm
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
VMOVDQ %1, %2
vpshufb %1, %1, %3
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7
%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define XTMP5 xmm11
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13
%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define d r8d ; 3rd
%define TBL rsi
%define c edi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define c edx ; 3rd arg
%define TBL rcx
%define d r8d
%endif
%define a eax
%define b ebx
%define e r9d
%define f r10d
%define g r11d
%define h r12d
%define y0 r13d
%define y1 r14d
%define y2 r15d
struc STACK
_XFER: resb 32
_DIGEST: resb 32
%ifdef WINABI
_XMM_SAVE: reso 8
resb 16 ; alignment
%endif
endstruc
; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm
; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
%macro FOUR_ROUNDS_AND_SCHED 0
;; compute s0 four at a time and s1 two at a time
;; compute W[-16] + W[-7] 4 at a time
;vmovdqa XTMP0, X3
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
;vmovdqa XTMP1, X1
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
;; compute s0
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpsrld XTMP2, XTMP1, 7
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpslld XTMP3, XTMP1, (32-7)
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
mov y0, e ; y0 = e
mov y1, a ; y1 = a
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
vpsrld XTMP2, XTMP1,18
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
vpslld XTMP1, XTMP1, (32-18)
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
vpxor XTMP3, XTMP3, XTMP1
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
;; compute low s1
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
mov y0, e ; y0 = e
mov y1, a ; y1 = a
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
xor y2, g ; y2 = f^g
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
vpxor XTMP2, XTMP2, XTMP3
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
;; compute high s1
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
vpxor XTMP2, XTMP2, XTMP3
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
rotate_Xs
%endm
;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_1_avx(unsigned char *output, const unsigned char *input)
;; arg 1 : pointer to the 32-byte output digest
;; arg 2 : pointer to the 64-byte input block
section .text
global sha256_1_avx:function
align 32
sha256_1_avx:
endbranch64
push rbx
%ifdef WINABI
push r8
push rsi
push rdi
%else
push rdx
%endif
push rbp
push r12
push r13
push r14
push r15
sub rsp,STACK_size
%ifdef WINABI
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif
vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
vmovdqa SHUF_00BA, [rel _SHUF_00BA]
vmovdqa SHUF_DC00, [rel _SHUF_DC00]
.hash_1_block:
;; load initial digest
lea TBL,[rel DIGEST]
mov a, [TBL + 0*4]
mov b, [TBL + 1*4]
mov c, [TBL + 2*4]
mov d, [TBL + 3*4]
mov e, [TBL + 4*4]
mov f, [TBL + 5*4]
mov g, [TBL + 6*4]
mov h, [TBL + 7*4]
lea TBL,[rel K256]
;; byte swap first 16 dwords
COPY_XMM_AND_BSWAP X0, [DATA_PTR + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [DATA_PTR + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [DATA_PTR + 2*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X3, [DATA_PTR + 3*16], BYTE_FLIP_MASK
;; schedule 48 input dwords, by doing 3 rounds of 16 each
%rep 3
align 16
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 2*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 3*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 4*16
FOUR_ROUNDS_AND_SCHED
%endrep
%rep 2
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
vpaddd XFER, X1, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 2*16
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
vmovdqa X0, X2
vmovdqa X1, X3
%endrep
; add old digest
lea TBL,[rel DIGEST]
add a, [TBL + 0*4]
add b, [TBL + 1*4]
add c, [TBL + 2*4]
add d, [TBL + 3*4]
add e, [TBL + 4*4]
add f, [TBL + 5*4]
add g, [TBL + 6*4]
add h, [TBL + 7*4]
; rounds with padding
; save old digest
;
mov [rsp + _DIGEST + 0*4], a
mov [rsp + _DIGEST + 1*4], b
mov [rsp + _DIGEST + 2*4], c
mov [rsp + _DIGEST + 3*4], d
mov [rsp + _DIGEST + 4*4], e
mov [rsp + _DIGEST + 5*4], f
mov [rsp + _DIGEST + 6*4], g
mov [rsp + _DIGEST + 7*4], h
lea TBL,[rel PADDING]
%assign i 0
%rep 64
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [TBL + i] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%assign i (i+4)
%endrep
;; add the previous digest
add a, [rsp + _DIGEST + 0*4]
add b, [rsp + _DIGEST + 1*4]
add c, [rsp + _DIGEST + 2*4]
add d, [rsp + _DIGEST + 3*4]
add e, [rsp + _DIGEST + 4*4]
add f, [rsp + _DIGEST + 5*4]
add g, [rsp + _DIGEST + 6*4]
add h, [rsp + _DIGEST + 7*4]
;; shuffle the bytes to little endian
bswap a
bswap b
bswap c
bswap d
bswap e
bswap f
bswap g
bswap h
;; write resulting hash
mov [OUTPUT_PTR + 0*4], a
mov [OUTPUT_PTR + 1*4], b
mov [OUTPUT_PTR + 2*4], c
mov [OUTPUT_PTR + 3*4], d
mov [OUTPUT_PTR + 4*4], e
mov [OUTPUT_PTR + 5*4], f
mov [OUTPUT_PTR + 6*4], g
mov [OUTPUT_PTR + 7*4], h
%ifdef WINABI
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif
add rsp, STACK_size
pop r15
pop r14
pop r13
pop r12
pop rbp
%ifdef WINABI
pop rdi
pop rsi
pop r8
%else
pop rdx
%endif
pop rbx
ret
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
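The .Lrounds_padding stage works because every message handled here is exactly one 64-byte block, so the second compression block is a constant: 0x80, 55 zero bytes, then the 64-bit big-endian bit length 512. The PADDING table above therefore stores K[t] plus the already-expanded message-schedule word W[t] of that fixed block, letting the padding rounds skip the schedule entirely. A short Go sketch (my own reconstruction, not part of this change) regenerates the first table entries from K256:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// First four K constants of SHA-256, as in the K256 table above; only the
	// first four PADDING entries are reproduced here.
	k := []uint32{0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5}
	// Message schedule of the constant padding block of a 64-byte message:
	// W[0] = 0x80000000, W[1..14] = 0, W[15] = 0x00000200 (512 bits).
	var w [64]uint32
	w[0], w[15] = 0x80000000, 0x00000200
	s0 := func(x uint32) uint32 { return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3) }
	s1 := func(x uint32) uint32 { return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10) }
	for t := 16; t < 64; t++ {
		w[t] = s1(w[t-2]) + w[t-7] + s0(w[t-15]) + w[t-16]
	}
	for t, kt := range k {
		// e.g. t = 0: 0x428a2f98 + 0x80000000 = 0xc28a2f98, the first PADDING word.
		fmt.Printf("0x%08x\n", kt+w[t])
	}
}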

File diff suppressed because it is too large

View File

@@ -0,0 +1,192 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
%ifndef _TRANSPOSE_AVX2_ASM_
%define _TRANSPOSE_AVX2_ASM_
%include "reg_sizes.asm"
; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE
;
; r0-r7 [out] ymm registers which will contain the data to be transposed
; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetched for all 8 lanes
; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
%macro TRANSPOSE8_U32_LOAD8 17
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%addr0 %9
%define %%addr1 %10
%define %%addr2 %11
%define %%addr3 %12
%define %%addr4 %13
%define %%addr5 %14
%define %%addr6 %15
%define %%addr7 %16
%define %%ptr_offset %17
; Expected output data
;
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
vmovups XWORD(%%r0),[%%addr0+%%ptr_offset]
vmovups XWORD(%%r1),[%%addr1+%%ptr_offset]
vmovups XWORD(%%r2),[%%addr2+%%ptr_offset]
vmovups XWORD(%%r3),[%%addr3+%%ptr_offset]
vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16]
vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16]
vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16]
vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16]
vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01
vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01
vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01
vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01
%endmacro
; 8x8 32-BIT TRANSPOSE
;
; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called.
;
; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0])
; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. ymm4 = [e4-e7 a4-a7])
; t0-t1 [clobbered] ymm temporary registers
%macro TRANSPOSE8_U32_PRELOADED 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
;
; process top half (r0..r3)
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2}
vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
;; process bottom half (r4..r7)
vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4}
vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6}
vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4}
vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6}
vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
%endmacro
%macro TRANSPOSE8_U32 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
; process top half (r0..r3) {a...d}
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
; use r2 in place of t0
; process bottom half (r4..r7) {e...h}
vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
%endmacro
%endif ;; _TRANSPOSE_AVX2_ASM_
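In the 8-lane hasher, registers a..h each hold one digest word for all eight lanes, so TRANSPOSE8_U32 is in effect a plain 8x8 32-bit transpose that turns eight "same word, all lanes" rows into eight contiguous per-lane digests before they are byte-swapped and stored to OUTPUT_PTR. A scalar Go equivalent of my reading of the macro (a sketch, not part of this change):

package main

import "fmt"

// transpose8x8 is the scalar counterpart of TRANSPOSE8_U32: element [i][j]
// of the input ends up at [j][i] of the output. In the hasher, row i holds
// state word i (a..h) for all eight lanes, and the transposed rows are the
// eight contiguous digests written out.
func transpose8x8(in [8][8]uint32) (out [8][8]uint32) {
	for i := 0; i < 8; i++ {
		for j := 0; j < 8; j++ {
			out[j][i] = in[i][j]
		}
	}
	return out
}

func main() {
	var m [8][8]uint32
	for i := range m {
		for j := range m[i] {
			m[i][j] = uint32(i*8 + j)
		}
	}
	fmt.Println(transpose8x8(m)[0]) // first digest row: [0 8 16 24 32 40 48 56]
}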

View File

@@ -0,0 +1,61 @@
#include <cpuid.h>
#include "hasher.hpp"
extern "C" void sha256_1_avx(unsigned char* output, const unsigned char* input);
namespace {
constexpr auto CPUID_LEAF = 7;
}
namespace prysm {
void Hasher::sha256_sse(unsigned char* output, const unsigned char* input, std::size_t blocks) {
while (blocks) {
sha256_1_avx(output, input);
input += 2*constants::BYTES_PER_CHUNK;
output += constants::BYTES_PER_CHUNK;
blocks--;
}
}
const Hasher::IMPL Hasher::implemented() {
IMPL ret = IMPL::NONE;
std::uint32_t a, b, c, d; // NOLINT
__get_cpuid_count(CPUID_LEAF, 0, &a, &b, &c, &d);
if (b & bit_SHA) ret = ret | IMPL::SHA;
if (b & bit_AVX2) ret = ret | IMPL::AVX2;
__get_cpuid(1, &a, &b, &c, &d);
if (c & bit_AVX) ret = ret | IMPL::AVX;
if (c & bit_SSE3) ret = ret | IMPL::SSE;
return ret;
}
Hasher::SHA256_hasher Hasher::best_sha256_implementation() {
auto impl = implemented();
if (!!(impl & IMPL::SHA)) return &::sha256_shani;
if (!!(impl & IMPL::AVX2)) return &::sha256_8_avx2;
if (!!(impl & IMPL::AVX)) return &::sha256_4_avx;
return &sha256_sse;
}
Hasher::Hasher(Hasher::IMPL impl) {
switch (impl) {
case IMPL::SHA:
_hash_64b_blocks = sha256_shani;
break;
case IMPL::AVX2:
_hash_64b_blocks = sha256_8_avx2;
break;
case IMPL::AVX:
_hash_64b_blocks = sha256_4_avx;
break;
case IMPL::SSE:
_hash_64b_blocks = &sha256_sse;
break;
default:
_hash_64b_blocks = best_sha256_implementation();
}
}
} // namespace prysm
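implemented() probes CPUID leaf 7 (EBX: SHA, AVX2) and leaf 1 (ECX: AVX, SSE3), and best_sha256_implementation() picks the widest routine available. A rough Go analogue of that dispatch, assuming the external golang.org/x/sys/cpu package (which exposes AVX2/AVX/SSE3 but, as far as I know, not the SHA extensions, so that branch is left out):

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// pickImpl mirrors best_sha256_implementation above, minus the SHA-NI branch,
// which would need a raw CPUID probe (leaf 7, EBX bit 29) or cgo.
func pickImpl() string {
	switch {
	case cpu.X86.HasAVX2:
		return "AVX2" // sha256_8_avx2, eight blocks per call
	case cpu.X86.HasAVX:
		return "AVX" // sha256_4_avx, four blocks per call
	case cpu.X86.HasSSE3:
		return "SSE" // one block at a time
	default:
		return "NONE"
	}
}

func main() { fmt.Println(pickImpl()) }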

View File

@@ -0,0 +1,9 @@
#ifndef __CUSTOM_HASHER__
#define __CUSTOM_HASHER__
#include <stdint.h>
extern void sha256_4_avx(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_8_avx2(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_shani(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_1_avx(unsigned char* output, const unsigned char* input);
#endif

View File

@@ -1,7 +1,10 @@
// Package hash includes all hash-function related helpers for Prysm.
package hash
// #include "custom_hasher/hasher.h"
import "C"
import (
"encoding/binary"
"errors"
"hash"
"reflect"
@@ -133,3 +136,85 @@ func FastSum64(data []byte) uint64 {
func FastSum256(data []byte) [32]byte {
return highwayhash.Sum(data, fastSumHashKey[:])
}
// ------------------------------------
// No abstraction in these functions; they are experimental, kept around until we
// get a feel for whether this approach is worth pursuing.
func PotuzHasherAVX2Chunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_8_avx2((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}
func PotuzHasherAVXChunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_4_avx((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}
func PotuzHasherShaniChunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_shani((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}
func PotuzHasherShani(dst []byte, inp []byte, count uint64) {
C.sha256_shani((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}
func PotuzHasherAVX(dst []byte, inp []byte, count uint64) {
C.sha256_4_avx((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}
func PotuzHasherAVX2(dst []byte, inp []byte, count uint64) {
C.sha256_8_avx2((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}
func PotuzHasher2Chunks(dst []byte, inp []byte) {
C.sha256_1_avx((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]))
}
// no check of the chunks length!
func Hash2ChunksAVX(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])
C.sha256_1_avx((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]))
return buf
}
// no check of the chunks length!
func Hash2ChunksAVX2(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])
C.sha256_1_avx((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]))
return buf
}
// no check of the chunks length!
func Hash2ChunksShani(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])
C.sha256_shani((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]), C.ulong(1))
return buf
}
func MixinLengthAVX(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksAVX(root, val)
}
func MixinLengthAVX2(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksAVX2(root, val)
}
func MixinLengthShani(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksShani(root, val)
}
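The MixinLength* helpers implement SSZ length mixing: the list length is written as a little-endian uint64 into an otherwise zero 32-byte chunk and hashed together with the root. A portable reference using only the standard library (a sketch, not part of this change), handy for checking the accelerated paths:

package main

import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"
)

// mixinLengthReference is the portable counterpart of the MixinLength*
// helpers: SHA-256(root || uint64_le(length) zero-padded to 32 bytes).
func mixinLengthReference(root [32]byte, length uint64) [32]byte {
	var chunk [64]byte
	copy(chunk[:32], root[:])
	binary.LittleEndian.PutUint64(chunk[32:40], length)
	return sha256.Sum256(chunk[:])
}

func main() {
	var root [32]byte
	fmt.Printf("%x\n", mixinLengthReference(root, 400000))
}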

View File

@@ -2,9 +2,11 @@ package hash_test
import (
"encoding/hex"
"math/rand"
"testing"
fuzz "github.com/google/gofuzz"
"github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil"
"github.com/prysmaticlabs/prysm/crypto/bls"
"github.com/prysmaticlabs/prysm/crypto/hash"
"github.com/prysmaticlabs/prysm/encoding/bytesutil"
@@ -104,3 +106,122 @@ func BenchmarkHashProto(b *testing.B) {
}
}
}
// -------------------------------------------------------------
// Comment out the tests that use instructions your CPU does not support
// (they fault with an illegal instruction) if you want to run the benchmarks.
/*
func TestCustomHash_Shani(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)
hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}
hash.PotuzHasherShani(root, hash0, 1)
assert.DeepEqual(t, hashOf1[:], root)
}
*/
func TestCustomHash_Avx2(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)
hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}
hash.PotuzHasherAVX2(root, hash0, 1)
assert.DeepEqual(t, hashOf1[:], root)
}
func TestCustomHash_SSE(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)
hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}
hash.PotuzHasher2Chunks(root, hash0)
assert.DeepEqual(t, hashOf1[:], root)
}
/*
func BenchmarkHashBalanceAVX2(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX2(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitAVX2(balances, zero_hash_array)
require.NoError(b, err)
}
}
*/
func BenchmarkHashBalanceAVX(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitAVX(balances, zero_hash_array)
require.NoError(b, err)
}
}
/*
func BenchmarkHashBalanceShani(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksShani(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zero_hash_array)
require.NoError(b, err)
}
}
*/
func BenchmarkHashBalancePrysm(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimit(balances)
require.NoError(b, err)
}
}
/*
func TestHashBalancesShani(t *testing.T) {
zero_hash_array := make([][32]byte, 45)
for i := 1; i < 45; i++ {
zero_hash_array[i] = hash.Hash2ChunksShani(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
root1, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zero_hash_array)
require.NoError(t, err)
root2, err := stateutil.Uint64ListRootWithRegistryLimit(balances)
require.NoError(t, err)
assert.DeepEqual(t, root1, root2)
}
*/
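The benchmarks above seed zero_hash_array with the accelerated two-chunk hashers, which ties even the setup to a specific instruction set. Building the same table with the standard library keeps setup portable (a sketch, not part of this change):

package main

import (
	"crypto/sha256"
	"fmt"
)

// zeroHashes builds the SSZ zero-hash table with the standard library:
// level 0 is the all-zero chunk, level i is SHA-256(level[i-1] || level[i-1]).
func zeroHashes(depth int) [][32]byte {
	zh := make([][32]byte, depth)
	for i := 1; i < depth; i++ {
		var buf [64]byte
		copy(buf[:32], zh[i-1][:])
		copy(buf[32:], zh[i-1][:])
		zh[i] = sha256.Sum256(buf[:])
	}
	return zh
}

func main() {
	zh := zeroHashes(40)
	fmt.Printf("%x\n", zh[1]) // f5a5fd42..., the hash of 64 zero bytes
}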

crypto/hash/yasm.bzl Normal file
View File

@@ -0,0 +1,110 @@
load("@rules_cc//cc:toolchain_utils.bzl", "find_cpp_toolchain")
def _obj_yasm(ctx, arch, opts, src):
yasm_bin = ctx.attr.yasm_bin
out = ctx.actions.declare_file(src.basename.replace(src.extension, "o"))
opts = arch + ['-o', out.path] + opts + [src.path]
inputs = []
for i in ctx.attr.srcs + ctx.attr.hdrs + ctx.attr.deps:
if hasattr(i, "files"):
inputs += i.files.to_list()
else:
inputs.append(i)
ctx.actions.run(
outputs = [out],
inputs = inputs,
arguments = opts,
executable = yasm_bin,
mnemonic = 'YasmCompile',
)
return out
def _library_yasm(ctx, mysrc):
output_file = ctx.actions.declare_file(ctx.label.name + ".a")
cc_toolchain = find_cpp_toolchain(ctx)
feature_configuration = cc_common.configure_features(
ctx = ctx,
cc_toolchain = cc_toolchain,
requested_features = ctx.features,
unsupported_features = ctx.disabled_features,
)
linker_input = cc_common.create_linker_input(
owner = ctx.label,
libraries = depset(direct = [
cc_common.create_library_to_link(
actions = ctx.actions,
static_library = output_file,
cc_toolchain = cc_toolchain,
feature_configuration = feature_configuration,
),
]),
)
compilation_context = cc_common.create_compilation_context()
linking_context = cc_common.create_linking_context(linker_inputs = depset(direct = [linker_input]))
ctx.actions.run(
executable = ctx.attr.ar_bin,
arguments = ['r', output_file.path] + [i.path for i in mysrc],
inputs = mysrc,
outputs = [output_file],
mnemonic = "Archiving",
)
return CcInfo(compilation_context = compilation_context, linking_context = linking_context)
def _yasm_library_impl(ctx):
opts = ctx.attr.copts
deps = [_obj_yasm(ctx, ctx.attr.yasm_arch, opts, src)
for target in ctx.attr.srcs for src in target.files.to_list()]
for i in ctx.attr.hdrs:
if hasattr(i, "files"):
deps += i.files.to_list()
else:
deps.append(i)
cc_info = _library_yasm(ctx, deps)
return [cc_info]
YASM_BIN_DEFAULT = "/usr/bin/yasm"
AR_BIN_DEFAULT = "/usr/bin/ar"
YASM_ARCH_OPTS = ["-f", "elf64", "-m", "amd64"]
_yasm_library = rule(
implementation=_yasm_library_impl,
attrs={
"srcs": attr.label_list(allow_files=True),
"hdrs": attr.label_list(allow_files=True),
"deps": attr.label_list(allow_files=True),
"copts": attr.string_list(),
"yasm_bin": attr.string(default=""),
"ar_bin": attr.string(default=""),
"yasm_arch": attr.string_list(),
"_cc_toolchain": attr.label(default = Label("@bazel_tools//tools/cpp:current_cc_toolchain")),
},
fragments = ["cpp"],
toolchains = ["@bazel_tools//tools/cpp:toolchain_type"],
)
def yasm_library(name, srcs, hdrs=[], deps=[], copts=[],
yasm_bin=YASM_BIN_DEFAULT, ar_bin=AR_BIN_DEFAULT):
_yasm_library(
name = name,
srcs = srcs,
hdrs = hdrs,
deps = deps,
copts = copts,
yasm_bin = yasm_bin,
ar_bin = ar_bin,
yasm_arch = YASM_ARCH_OPTS,
)