Mirror of https://github.com/OffchainLabs/prysm.git (synced 2026-01-09 21:38:05 -05:00)

Compare commits: fix-bid-ch ... custom_has (6 commits)

94768a3190
662d4ec6e9
b206bffe15
18b83bf445
cff4a01ea0
3a1901d7aa
@@ -17,6 +17,7 @@ go_library(
     importpath = "github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil",
     visibility = [
         "//beacon-chain:__subpackages__",
+        "//crypto/hash:__subpackages__",
         "//proto/migration:__subpackages__",
         "//proto/prysm/v1alpha1:__subpackages__",
         "//proto/testing:__subpackages__",
@@ -54,6 +54,236 @@ func ValidatorRootWithHasher(hasher ssz.HashFn, validator *ethpb.Validator) ([32
	}
	return ssz.BitwiseMerkleizeArrays(hasher, fieldRoots, uint64(len(fieldRoots)), uint64(len(fieldRoots)))
}

func merkleizeFlatArrayAVX(vec [][32]byte,
	depth uint8,
	hasher func([][32]byte, [][32]byte, uint64),
	zero_hash_array [][32]byte) [32]byte {

	if depth == 0 && len(vec) == 1 {
		return vec[0]
	}
	if len(vec) == 0 {
		panic("Can't have empty vec")
	}

	// allocate the scratch buffer: enough room for every internal node of
	// the tree, including the zero-padded levels up to the full depth
	layer := (len(vec) + 1) / 2
	length := 0
	for {
		length += layer - 1
		if layer == 1 {
			break
		}
		layer = (layer + 1) / 2
	}
	length += int(depth)
	hash_tree := make([][32]byte, length)

	first := uint64(0)
	height := uint8(1)
	last := uint64(len(vec)+1) / 2
	if len(vec) > 1 {
		hasher(hash_tree, vec, last)
	}
	if len(vec)%2 == 1 {
		hash_tree[last-1] = hash.Hash2ChunksAVX(vec[len(vec)-1], zero_hash_array[0])
	}
	for {
		dist := last - first
		if dist < 2 {
			break
		}
		hasher(hash_tree[last:], hash_tree[first:], dist/2)
		first = last
		last += (dist + 1) / 2

		if dist%2 != 0 {
			hash_tree[last-1] = hash.Hash2ChunksAVX(hash_tree[first-1], zero_hash_array[height])
		}
		height++
	}
	for {
		if height >= depth {
			break
		}
		hash_tree[last] = hash.Hash2ChunksAVX(hash_tree[last-1], zero_hash_array[height])
		last++
		height++
	}
	return hash_tree[last-1]
}

func merkleizeFlatArrayAVX2(vec [][32]byte,
	depth uint8,
	hasher func([][32]byte, [][32]byte, uint64),
	zero_hash_array [][32]byte) [32]byte {

	if depth == 0 && len(vec) == 1 {
		return vec[0]
	}
	if len(vec) == 0 {
		panic("Can't have empty vec")
	}

	// allocate the scratch buffer: enough room for every internal node of
	// the tree, including the zero-padded levels up to the full depth
	layer := (len(vec) + 1) / 2
	length := 0
	for {
		length += layer - 1
		if layer == 1 {
			break
		}
		layer = (layer + 1) / 2
	}
	length += int(depth)
	hash_tree := make([][32]byte, length)

	first := uint64(0)
	height := uint8(1)
	last := uint64(len(vec)+1) / 2
	if len(vec) > 1 {
		hasher(hash_tree, vec, last)
	}
	if len(vec)%2 == 1 {
		hash_tree[last-1] = hash.Hash2ChunksAVX2(vec[len(vec)-1], zero_hash_array[0])
	}
	for {
		dist := last - first
		if dist < 2 {
			break
		}
		hasher(hash_tree[last:], hash_tree[first:], dist/2)
		first = last
		last += (dist + 1) / 2

		if dist%2 != 0 {
			hash_tree[last-1] = hash.Hash2ChunksAVX2(hash_tree[first-1], zero_hash_array[height])
		}
		height++
	}
	for {
		if height >= depth {
			break
		}
		hash_tree[last] = hash.Hash2ChunksAVX2(hash_tree[last-1], zero_hash_array[height])
		last++
		height++
	}
	return hash_tree[last-1]
}

func merkleizeFlatArray(vec [][32]byte,
	depth uint8,
	hasher func([][32]byte, [][32]byte, uint64),
	zero_hash_array [][32]byte) [32]byte {

	if depth == 0 && len(vec) == 1 {
		return vec[0]
	}
	if len(vec) == 0 {
		panic("Can't have empty vec")
	}

	// allocate the scratch buffer: enough room for every internal node of
	// the tree, including the zero-padded levels up to the full depth
	layer := (len(vec) + 1) / 2
	length := 0
	for {
		length += layer - 1
		if layer == 1 {
			break
		}
		layer = (layer + 1) / 2
	}
	length += int(depth)
	hash_tree := make([][32]byte, length)

	first := uint64(0)
	height := uint8(1)
	last := uint64(len(vec)+1) / 2
	if len(vec) > 1 {
		hasher(hash_tree, vec, last)
	}
	if len(vec)%2 == 1 {
		hash_tree[last-1] = hash.Hash2ChunksShani(vec[len(vec)-1], zero_hash_array[0])
	}
	for {
		dist := last - first
		if dist < 2 {
			break
		}
		hasher(hash_tree[last:], hash_tree[first:], dist/2)
		first = last
		last += (dist + 1) / 2

		if dist%2 != 0 {
			hash_tree[last-1] = hash.Hash2ChunksShani(hash_tree[first-1], zero_hash_array[height])
		}
		height++
	}
	for {
		if height >= depth {
			break
		}
		hash_tree[last] = hash.Hash2ChunksShani(hash_tree[last-1], zero_hash_array[height])
		last++
		height++
	}
	return hash_tree[last-1]
}
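
All three variants above share the same flat-array walk and differ only in the two-chunk hash primitive; the hasher callback they take compresses count adjacent pairs of 32-byte chunks from the input slice into the first count slots of the output. A portable pure-Go model of that contract, using the standard library's crypto/sha256 rather than the assembly-backed PotuzHasher*Chunks implementations (the name hashPairsGeneric is illustrative, not part of the diff):

package stateutil

import "crypto/sha256"

// hashPairsGeneric is a portable stand-in for the hasher callback: it
// writes H(in[2i] || in[2i+1]) into out[i] for i = 0..count-1, which is
// what the AVX/AVX2/SHA-NI hashers do several pairs at a time.
func hashPairsGeneric(out, in [][32]byte, count uint64) {
	var buf [64]byte
	for i := uint64(0); i < count; i++ {
		copy(buf[:32], in[2*i][:])
		copy(buf[32:], in[2*i+1][:])
		out[i] = sha256.Sum256(buf[:])
	}
}

With that contract in mind, the main loop simply hashes each layer pair-wise in place and patches an odd tail node with the matching zero-subtree hash.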

// Uint64ListRootWithRegistryLimitShani computes the HashTreeRoot Merkleization
// of a list of uint64 values, mixed in with the registry limit. Flat array
// implementation using the SHA (Shani) extensions.
func Uint64ListRootWithRegistryLimitShani(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
	// assume len(balances) is a multiple of 4 for this benchmark
	lenChunks := len(balances) / 4
	balancesChunks := make([][32]byte, lenChunks)
	for i := 0; i < lenChunks; i++ {
		binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
		binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
		binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
		binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
	}
	balancesRootsRoot := merkleizeFlatArray(balancesChunks, 38, hash.PotuzHasherShaniChunks, zero_hash_array)

	return hash.MixinLengthShani(balancesRootsRoot, uint64(len(balances))), nil
}

// Uint64ListRootWithRegistryLimitAVX computes the HashTreeRoot Merkleization
// of a list of uint64 values, mixed in with the registry limit. Flat array
// implementation using the AVX extensions.
func Uint64ListRootWithRegistryLimitAVX(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
	// assume len(balances) is a multiple of 4 for this benchmark
	lenChunks := len(balances) / 4
	balancesChunks := make([][32]byte, lenChunks)
	for i := 0; i < lenChunks; i++ {
		binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
		binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
		binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
		binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
	}
	balancesRootsRoot := merkleizeFlatArrayAVX(balancesChunks, 38, hash.PotuzHasherAVXChunks, zero_hash_array)

	return hash.MixinLengthAVX(balancesRootsRoot, uint64(len(balances))), nil
}

// Uint64ListRootWithRegistryLimitAVX2 computes the HashTreeRoot Merkleization
// of a list of uint64 values, mixed in with the registry limit. Flat array
// implementation using the AVX2 extensions.
func Uint64ListRootWithRegistryLimitAVX2(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
	// assume len(balances) is a multiple of 4 for this benchmark
	lenChunks := len(balances) / 4
	balancesChunks := make([][32]byte, lenChunks)
	for i := 0; i < lenChunks; i++ {
		binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
		binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
		binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
		binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
	}
	balancesRootsRoot := merkleizeFlatArrayAVX2(balancesChunks, 38, hash.PotuzHasherAVX2Chunks, zero_hash_array)

	return hash.MixinLengthAVX2(balancesRootsRoot, uint64(len(balances))), nil
}
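
The fixed depth of 38 follows from the spec constants: the balances list is capped at VALIDATOR_REGISTRY_LIMIT = 2^40 uint64 entries, and packing four 8-byte balances per 32-byte chunk leaves at most 2^38 leaf chunks, hence a 38-level tree before the length mix-in. Since the merkleize functions pair zero_hash_array[0] directly with a raw leaf, the array must start at the all-zero chunk and hash upward. A minimal sketch of how a caller could precompute it (portable sha256 here; the function name is illustrative, not the package's API):

// buildZeroHashes returns zh where zh[0] is the all-zero 32-byte chunk
// and zh[i] = H(zh[i-1] || zh[i-1]) is the root of an all-zero subtree
// of height i. An illustrative sketch, not part of this diff.
func buildZeroHashes(levels int) [][32]byte {
	zh := make([][32]byte, levels)
	var buf [64]byte
	for i := 1; i < levels; i++ {
		copy(buf[:32], zh[i-1][:])
		copy(buf[32:], zh[i-1][:])
		zh[i] = sha256.Sum256(buf[:])
	}
	return zh
}

A benchmark would then call, for example, Uint64ListRootWithRegistryLimitShani(balances, buildZeroHashes(38)); indices 0 through 37 cover every level the walk touches.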

// Uint64ListRootWithRegistryLimit computes the HashTreeRoot Merkleization of
// a list of uint64 and mixed with registry limit.

@@ -1,8 +1,13 @@
 load("@prysm//tools/go:def.bzl", "go_library", "go_test")
+load("@prysm//crypto/hash:yasm.bzl", "yasm_library")
 
 go_library(
     name = "go_default_library",
-    srcs = ["hash.go"],
+    srcs = [
+        "hash.go",
+        "custom_hasher/hasher.h",
+    ],
+    cgo = True,
     importpath = "github.com/prysmaticlabs/prysm/crypto/hash",
     visibility = ["//visibility:public"],
     deps = [
@@ -13,6 +18,7 @@ go_library(
         "@org_golang_google_protobuf//proto:go_default_library",
         "@org_golang_x_crypto//sha3:go_default_library",
     ],
+    cdeps = [":custom_hasher"],
 )
 
 go_test(
@@ -27,6 +33,29 @@ go_test(
         "//proto/testing:go_default_library",
         "//testing/assert:go_default_library",
         "//testing/require:go_default_library",
+        "//beacon-chain/state/stateutil:go_default_library",
         "@com_github_google_gofuzz//:go_default_library",
     ],
 )
+
+cc_library(
+    name = "custom_hasher",
+    srcs = [
+        "custom_hasher/hasher.h",
+    ],
+    hdrs = ["custom_hasher/hasher.h"],
+    visibility = ["//visibility:public"],
+    deps = [":asm"],
+)
+
+yasm_library(
+    name = "asm",
+    srcs = [
+        "custom_hasher/assembly/reg_sizes.asm",
+        "custom_hasher/assembly/sha256_avx_one_block.asm",
+        "custom_hasher/assembly/sha256_avx.asm",
+        "custom_hasher/assembly/sha256_avx2.asm",
+        "custom_hasher/assembly/sha256_shani.asm",
+        "custom_hasher/assembly/transpose_avx2.asm",
+    ],
+)
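
With cgo = True and cdeps wiring the go_library to the assembly-backed cc_library, hash.go can call the C entry points directly. The real declarations live in custom_hasher/hasher.h, which is not part of this diff; the function name and signature below are purely hypothetical illustrations of the shape such a binding could take:

package hash

/*
#include "custom_hasher/hasher.h"
*/
import "C"
import "unsafe"

// hash2ChunksSketch is a hypothetical cgo wrapper sketch: hash a 64-byte
// buffer (two 32-byte chunks) into a 32-byte digest through an assembly
// implementation exposed by the C header. sha256_1_block is an assumed
// name for illustration only.
func hash2ChunksSketch(chunks *[64]byte) [32]byte {
	var out [32]byte
	C.sha256_1_block(
		(*C.uchar)(unsafe.Pointer(&out[0])),
		(*C.uchar)(unsafe.Pointer(&chunks[0])))
	return out
}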
crypto/hash/custom_hasher/assembly/reg_sizes.asm (new file, 300 lines)
@@ -0,0 +1,300 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; define d and w variants for registers

%ifndef _REG_SIZES_ASM_
%define _REG_SIZES_ASM_

%define raxd eax
%define raxw ax
%define raxb al

%define rbxd ebx
%define rbxw bx
%define rbxb bl

%define rcxd ecx
%define rcxw cx
%define rcxb cl

%define rdxd edx
%define rdxw dx
%define rdxb dl

%define rsid esi
%define rsiw si
%define rsib sil

%define rdid edi
%define rdiw di
%define rdib dil

%define rbpd ebp
%define rbpw bp
%define rbpb bpl

%define zmm0x xmm0
%define zmm1x xmm1
%define zmm2x xmm2
%define zmm3x xmm3
%define zmm4x xmm4
%define zmm5x xmm5
%define zmm6x xmm6
%define zmm7x xmm7
%define zmm8x xmm8
%define zmm9x xmm9
%define zmm10x xmm10
%define zmm11x xmm11
%define zmm12x xmm12
%define zmm13x xmm13
%define zmm14x xmm14
%define zmm15x xmm15
%define zmm16x xmm16
%define zmm17x xmm17
%define zmm18x xmm18
%define zmm19x xmm19
%define zmm20x xmm20
%define zmm21x xmm21
%define zmm22x xmm22
%define zmm23x xmm23
%define zmm24x xmm24
%define zmm25x xmm25
%define zmm26x xmm26
%define zmm27x xmm27
%define zmm28x xmm28
%define zmm29x xmm29
%define zmm30x xmm30
%define zmm31x xmm31

%define ymm0x xmm0
%define ymm1x xmm1
%define ymm2x xmm2
%define ymm3x xmm3
%define ymm4x xmm4
%define ymm5x xmm5
%define ymm6x xmm6
%define ymm7x xmm7
%define ymm8x xmm8
%define ymm9x xmm9
%define ymm10x xmm10
%define ymm11x xmm11
%define ymm12x xmm12
%define ymm13x xmm13
%define ymm14x xmm14
%define ymm15x xmm15
%define ymm16x xmm16
%define ymm17x xmm17
%define ymm18x xmm18
%define ymm19x xmm19
%define ymm20x xmm20
%define ymm21x xmm21
%define ymm22x xmm22
%define ymm23x xmm23
%define ymm24x xmm24
%define ymm25x xmm25
%define ymm26x xmm26
%define ymm27x xmm27
%define ymm28x xmm28
%define ymm29x xmm29
%define ymm30x xmm30
%define ymm31x xmm31

%define xmm0x xmm0
%define xmm1x xmm1
%define xmm2x xmm2
%define xmm3x xmm3
%define xmm4x xmm4
%define xmm5x xmm5
%define xmm6x xmm6
%define xmm7x xmm7
%define xmm8x xmm8
%define xmm9x xmm9
%define xmm10x xmm10
%define xmm11x xmm11
%define xmm12x xmm12
%define xmm13x xmm13
%define xmm14x xmm14
%define xmm15x xmm15
%define xmm16x xmm16
%define xmm17x xmm17
%define xmm18x xmm18
%define xmm19x xmm19
%define xmm20x xmm20
%define xmm21x xmm21
%define xmm22x xmm22
%define xmm23x xmm23
%define xmm24x xmm24
%define xmm25x xmm25
%define xmm26x xmm26
%define xmm27x xmm27
%define xmm28x xmm28
%define xmm29x xmm29
%define xmm30x xmm30
%define xmm31x xmm31

%define zmm0y ymm0
%define zmm1y ymm1
%define zmm2y ymm2
%define zmm3y ymm3
%define zmm4y ymm4
%define zmm5y ymm5
%define zmm6y ymm6
%define zmm7y ymm7
%define zmm8y ymm8
%define zmm9y ymm9
%define zmm10y ymm10
%define zmm11y ymm11
%define zmm12y ymm12
%define zmm13y ymm13
%define zmm14y ymm14
%define zmm15y ymm15
%define zmm16y ymm16
%define zmm17y ymm17
%define zmm18y ymm18
%define zmm19y ymm19
%define zmm20y ymm20
%define zmm21y ymm21
%define zmm22y ymm22
%define zmm23y ymm23
%define zmm24y ymm24
%define zmm25y ymm25
%define zmm26y ymm26
%define zmm27y ymm27
%define zmm28y ymm28
%define zmm29y ymm29
%define zmm30y ymm30
%define zmm31y ymm31

%define xmm0y ymm0
%define xmm1y ymm1
%define xmm2y ymm2
%define xmm3y ymm3
%define xmm4y ymm4
%define xmm5y ymm5
%define xmm6y ymm6
%define xmm7y ymm7
%define xmm8y ymm8
%define xmm9y ymm9
%define xmm10y ymm10
%define xmm11y ymm11
%define xmm12y ymm12
%define xmm13y ymm13
%define xmm14y ymm14
%define xmm15y ymm15
%define xmm16y ymm16
%define xmm17y ymm17
%define xmm18y ymm18
%define xmm19y ymm19
%define xmm20y ymm20
%define xmm21y ymm21
%define xmm22y ymm22
%define xmm23y ymm23
%define xmm24y ymm24
%define xmm25y ymm25
%define xmm26y ymm26
%define xmm27y ymm27
%define xmm28y ymm28
%define xmm29y ymm29
%define xmm30y ymm30
%define xmm31y ymm31

%define xmm0z zmm0
%define xmm1z zmm1
%define xmm2z zmm2
%define xmm3z zmm3
%define xmm4z zmm4
%define xmm5z zmm5
%define xmm6z zmm6
%define xmm7z zmm7
%define xmm8z zmm8
%define xmm9z zmm9
%define xmm10z zmm10
%define xmm11z zmm11
%define xmm12z zmm12
%define xmm13z zmm13
%define xmm14z zmm14
%define xmm15z zmm15
%define xmm16z zmm16
%define xmm17z zmm17
%define xmm18z zmm18
%define xmm19z zmm19
%define xmm20z zmm20
%define xmm21z zmm21
%define xmm22z zmm22
%define xmm23z zmm23
%define xmm24z zmm24
%define xmm25z zmm25
%define xmm26z zmm26
%define xmm27z zmm27
%define xmm28z zmm28
%define xmm29z zmm29
%define xmm30z zmm30
%define xmm31z zmm31

%define ymm0z zmm0
%define ymm1z zmm1
%define ymm2z zmm2
%define ymm3z zmm3
%define ymm4z zmm4
%define ymm5z zmm5
%define ymm6z zmm6
%define ymm7z zmm7
%define ymm8z zmm8
%define ymm9z zmm9
%define ymm10z zmm10
%define ymm11z zmm11
%define ymm12z zmm12
%define ymm13z zmm13
%define ymm14z zmm14
%define ymm15z zmm15
%define ymm16z zmm16
%define ymm17z zmm17
%define ymm18z zmm18
%define ymm19z zmm19
%define ymm20z zmm20
%define ymm21z zmm21
%define ymm22z zmm22
%define ymm23z zmm23
%define ymm24z zmm24
%define ymm25z zmm25
%define ymm26z zmm26
%define ymm27z zmm27
%define ymm28z zmm28
%define ymm29z zmm29
%define ymm30z zmm30
%define ymm31z zmm31

%define DWORD(reg) reg %+ d
%define WORD(reg)  reg %+ w
%define BYTE(reg)  reg %+ b

%define XWORD(reg) reg %+ x
%define YWORD(reg) reg %+ y
%define ZWORD(reg) reg %+ z
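; Example: DWORD(rax) expands via the %+ token-paste operator to raxd,
; which the table above maps to eax; likewise YWORD(zmm3) becomes zmm3y = ymm3.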

%endif ;; _REG_SIZES_ASM_
crypto/hash/custom_hasher/assembly/sha256_avx.asm (new file, 612 lines)
@@ -0,0 +1,612 @@
;; sha256_avx.asm
; *
; * This file is part of Mammon.
; * mammon is a greedy and selfish ETH consensus client.
; *
; * Copyright (c) 2021 - Reimundo Heluani (potuz) potuz@potuz.net
; *
; * This program is free software: you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation, either version 3 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;
; This implementation is a 64 bytes optimized implementation based on Intel's code
; whose copyright follows
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; code to compute quad SHA256 using AVX
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG

;; Stack must be aligned to 16 bytes before call
;; Windows clobbers: rax rdx r8 r9 r10 r11
;; Windows preserves: rcx rsi rdi rbp r12 r13 r14 r15
;;
;; Linux clobbers: rax rsi r8 r9 r10 r11
;; Linux preserves: rcx rdx rdi rbp r12 r13 r14 r15
;;
;; clobbers xmm0-15

extern sha256_1_avx

%ifdef WINABI
%define OUTPUT_PTR  rcx  ; 1st arg
%define DATA_PTR    rdx  ; 2nd arg
%define NUM_BLKS    r8   ; 3rd arg
%define TBL         rsi
%else
%define OUTPUT_PTR  rdi  ; 1st arg
%define DATA_PTR    rsi  ; 2nd arg
%define NUM_BLKS    rdx  ; 3rd arg
%define TBL         rcx
%endif

%define ROUND rbx

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7

%define a0 xmm8
%define a1 xmm9
%define a2 xmm10

%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9

%define T1  xmm14
%define TMP xmm15

%define SHA256_DIGEST_WORD_SIZE 4
%define NUM_SHA256_DIGEST_WORDS 8
%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
%define ROUNDS 64*SZ4

; Define stack usage
struc STACK
_DATA:   resb SZ4 * 16
_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS
_RBX:    resb 8
         resb 16
endstruc

%define VMOVPS vmovups

; transpose r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0,t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
    vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b1 b0 a1 a0}
    vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b3 b2 a3 a2}

    vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d1 d0 c1 c0}
    vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d3 d2 c3 c2}

    vshufps %%r1, %%t0, %%t1, 0xDD  ; r1 = {d1 c1 b1 a1}

    vshufps %%r3, %%r0, %%r2, 0xDD  ; r3 = {d3 c3 b3 a3}

    vshufps %%r0, %%r0, %%r2, 0x88  ; r0 = {d2 c2 b2 a2}
    vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d0 c0 b0 a0}
%endmacro
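
In plain terms, the macro interleaves four 4-lane vectors so that word j of every input message ends up in output row j, i.e. a 4x4 transpose of 32-bit words, letting one xmm register hold the same round word of four messages. A small pure-Go model of the same data movement (the function name is illustrative):

// transpose4x4 models what TRANSPOSE computes: in[i][j] holds word j of
// message i; out[j][i] gathers word j of every message into one row.
func transpose4x4(in [4][4]uint32) (out [4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			out[j][i] = in[i][j]
		}
	}
	return out
}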

%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
    vpslld %%tmp, %%reg, (32-(%%imm))
    vpsrld %%reg, %%reg, %%imm
    vpor   %%reg, %%reg, %%tmp
%endmacro
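
PRORD emulates a 32-bit rotate right: AVX (before AVX-512) has no vector rotate instruction, so it is built from two shifts and an OR. The scalar equivalent, as a sketch:

// prord mirrors the macro's vpslld/vpsrld/vpor sequence: rotate the
// 32-bit word x right by n bits.
func prord(x uint32, n uint) uint32 {
	return x>>n | x<<(32-n)
}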

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
    ;vmovdqa %%tmp, %%reg
    vpslld %%tmp, %%src, (32-(%%imm))
    vpsrld %%reg, %%src, %%imm
    vpor   %%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
    PRORD %1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
    PRORD_nd %1, %3, TMP, %2
%endmacro

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i  %2
    PRORD_nd a0, e, (11-6)  ; sig1: a0 = (e >> 5)

    vpxor a2, f, g   ; ch: a2 = f^g
    vpand a2, a2, e  ; ch: a2 = (f^g)&e
    vpxor a2, a2, g  ; a2 = ch

    PRORD_nd a1, e, 25  ; sig1: a1 = (e >> 25)
    vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1
    vpaddd %%T1, %%T1, [TBL + ROUND]  ; T1 = W + K
    vpxor a0, a0, e  ; sig1: a0 = e ^ (e >> 5)
    PRORD a0, 6      ; sig1: a0 = (e >> 6) ^ (e >> 11)
    vpaddd h, h, a2  ; h = h + ch
    PRORD_nd a2, a, (13-2)  ; sig0: a2 = (a >> 11)
    vpaddd h, h, %%T1  ; h = h + ch + W + K
    vpxor a0, a0, a1   ; a0 = sigma1
    PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
    vpxor %%T1, a, c   ; maj: T1 = a^c
    add ROUND, SZ4     ; ROUND++
    vpand %%T1, %%T1, b  ; maj: T1 = (a^c)&b
    vpaddd h, h, a0

    vpaddd d, d, h

    vpxor a2, a2, a    ; sig0: a2 = a ^ (a >> 11)
    PRORD a2, 2        ; sig0: a2 = (a >> 2) ^ (a >> 13)
    vpxor a2, a2, a1   ; a2 = sig0
    vpand a1, a, c     ; maj: a1 = a&c
    vpor  a1, a1, %%T1 ; a1 = maj
    vpaddd h, h, a1    ; h = h + ch + W + K + maj
    vpaddd h, h, a2    ; h = h + ch + W + K + maj + sigma0

    ROTATE_ARGS
%endm
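
The macro is one SHA-256 round run on four message lanes at once; per lane it is the standard FIPS 180-4 round function. A scalar reference of the same round, reusing prord from the sketch above (names are illustrative):

// sha256Round is a scalar model of the round the macro performs; wk is
// W[i] + K[i], already summed as in the [TBL + ROUND] load above.
func sha256Round(a, b, c, d, e, f, g, h, wk uint32) (uint32, uint32, uint32, uint32, uint32, uint32, uint32, uint32) {
	ch := (f^g)&e ^ g  // same vpxor/vpand/vpxor shape as the macro
	maj := (a^c)&b | a&c
	sigma1 := prord(e, 6) ^ prord(e, 11) ^ prord(e, 25)
	sigma0 := prord(a, 2) ^ prord(a, 13) ^ prord(a, 22)
	t1 := h + sigma1 + ch + wk
	t2 := sigma0 + maj
	// rotate the working variables, as ROTATE_ARGS does
	return t1 + t2, a, b, c, d + t1, e, f, g
}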

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i  %2
    vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA]
    vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA]
    vmovdqa a0, %%T1
    PRORD %%T1, 18-7
    vmovdqa a2, a1
    PRORD a1, 19-17
    vpxor %%T1, %%T1, a0
    PRORD %%T1, 7
    vpxor a1, a1, a2
    PRORD a1, 17
    vpsrld a0, a0, 3
    vpxor %%T1, %%T1, a0
    vpsrld a2, a2, 10
    vpxor a1, a1, a2
    vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA]
    vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA]
    vpaddd %%T1, %%T1, a1

    ROUND_00_15 %%T1, %%i
%endm

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro PADDING_ROUND_00_15 1
%define %%T1 %1
    PRORD_nd a0, e, (11-6)  ; sig1: a0 = (e >> 5)

    vpxor a2, f, g   ; ch: a2 = f^g
    vpand a2, a2, e  ; ch: a2 = (f^g)&e
    vpxor a2, a2, g  ; a2 = ch

    PRORD_nd a1, e, 25  ; sig1: a1 = (e >> 25)
    vmovdqa %%T1, [TBL + ROUND]  ; T1 = W + K
    vpxor a0, a0, e  ; sig1: a0 = e ^ (e >> 5)
    PRORD a0, 6      ; sig1: a0 = (e >> 6) ^ (e >> 11)
    vpaddd h, h, a2  ; h = h + ch
    PRORD_nd a2, a, (13-2)  ; sig0: a2 = (a >> 11)
    vpaddd h, h, %%T1  ; h = h + ch + W + K
    vpxor a0, a0, a1   ; a0 = sigma1
    PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
    vpxor %%T1, a, c   ; maj: T1 = a^c
    add ROUND, SZ4     ; ROUND++
    vpand %%T1, %%T1, b  ; maj: T1 = (a^c)&b
    vpaddd h, h, a0

    vpaddd d, d, h

    vpxor a2, a2, a    ; sig0: a2 = a ^ (a >> 11)
    PRORD a2, 2        ; sig0: a2 = (a >> 2) ^ (a >> 13)
    vpxor a2, a2, a1   ; a2 = sig0
    vpand a1, a, c     ; maj: a1 = a&c
    vpor  a1, a1, %%T1 ; a1 = maj
    vpaddd h, h, a1    ; h = h + ch + W + K + maj
    vpaddd h, h, a2    ; h = h + ch + W + K + maj + sigma0

    ROTATE_ARGS
%endm
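
;; Note: PADDING_ROUND_00_15 differs from ROUND_00_15 only in loading
;; [TBL + ROUND] directly as W + K instead of adding a freshly scheduled W.
;; Every input here is exactly 64 bytes, so the second (padding) block is the
;; same constant block for all messages and its whole message schedule can be
;; precomputed and folded into the PADDING_4 table below; e.g. its first entry
;; 0xc28a2f98 = K[0] + the first padding word (0x428a2f98 + 0x80000000).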

section .data
default rel
align 64

K256_4:
    dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
    dq 0x7137449171374491, 0x7137449171374491
    dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
    dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
    dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
    dq 0x59f111f159f111f1, 0x59f111f159f111f1
    dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
    dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
    dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
    dq 0x12835b0112835b01, 0x12835b0112835b01
    dq 0x243185be243185be, 0x243185be243185be
    dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
    dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
    dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
    dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
    dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
    dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
    dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
    dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
    dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
    dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
    dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
    dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
    dq 0x76f988da76f988da, 0x76f988da76f988da
    dq 0x983e5152983e5152, 0x983e5152983e5152
    dq 0xa831c66da831c66d, 0xa831c66da831c66d
    dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
    dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
    dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
    dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
    dq 0x06ca635106ca6351, 0x06ca635106ca6351
    dq 0x1429296714292967, 0x1429296714292967
    dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
    dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
    dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
    dq 0x53380d1353380d13, 0x53380d1353380d13
    dq 0x650a7354650a7354, 0x650a7354650a7354
    dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
    dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
    dq 0x92722c8592722c85, 0x92722c8592722c85
    dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
    dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
    dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
    dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
    dq 0xd192e819d192e819, 0xd192e819d192e819
    dq 0xd6990624d6990624, 0xd6990624d6990624
    dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
    dq 0x106aa070106aa070, 0x106aa070106aa070
    dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
    dq 0x1e376c081e376c08, 0x1e376c081e376c08
    dq 0x2748774c2748774c, 0x2748774c2748774c
    dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
    dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
    dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
    dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
    dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
    dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
    dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
    dq 0x84c8781484c87814, 0x84c8781484c87814
    dq 0x8cc702088cc70208, 0x8cc702088cc70208
    dq 0x90befffa90befffa, 0x90befffa90befffa
    dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
    dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
    dq 0xc67178f2c67178f2, 0xc67178f2c67178f2

PADDING_4:
    dq 0xc28a2f98c28a2f98, 0xc28a2f98c28a2f98
    dq 0x7137449171374491, 0x7137449171374491
    dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
    dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
    dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
    dq 0x59f111f159f111f1, 0x59f111f159f111f1
    dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
    dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
    dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
    dq 0x12835b0112835b01, 0x12835b0112835b01
    dq 0x243185be243185be, 0x243185be243185be
    dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
    dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
    dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
    dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
    dq 0xc19bf374c19bf374, 0xc19bf374c19bf374
    dq 0x649b69c1649b69c1, 0x649b69c1649b69c1
    dq 0xf0fe4786f0fe4786, 0xf0fe4786f0fe4786
    dq 0x0fe1edc60fe1edc6, 0x0fe1edc60fe1edc6
    dq 0x240cf254240cf254, 0x240cf254240cf254
    dq 0x4fe9346f4fe9346f, 0x4fe9346f4fe9346f
    dq 0x6cc984be6cc984be, 0x6cc984be6cc984be
    dq 0x61b9411e61b9411e, 0x61b9411e61b9411e
    dq 0x16f988fa16f988fa, 0x16f988fa16f988fa
    dq 0xf2c65152f2c65152, 0xf2c65152f2c65152
    dq 0xa88e5a6da88e5a6d, 0xa88e5a6da88e5a6d
    dq 0xb019fc65b019fc65, 0xb019fc65b019fc65
    dq 0xb9d99ec7b9d99ec7, 0xb9d99ec7b9d99ec7
    dq 0x9a1231c39a1231c3, 0x9a1231c39a1231c3
    dq 0xe70eeaa0e70eeaa0, 0xe70eeaa0e70eeaa0
    dq 0xfdb1232bfdb1232b, 0xfdb1232bfdb1232b
    dq 0xc7353eb0c7353eb0, 0xc7353eb0c7353eb0
    dq 0x3069bad53069bad5, 0x3069bad53069bad5
    dq 0xcb976d5fcb976d5f, 0xcb976d5fcb976d5f
    dq 0x5a0f118f5a0f118f, 0x5a0f118f5a0f118f
    dq 0xdc1eeefddc1eeefd, 0xdc1eeefddc1eeefd
    dq 0x0a35b6890a35b689, 0x0a35b6890a35b689
    dq 0xde0b7a04de0b7a04, 0xde0b7a04de0b7a04
    dq 0x58f4ca9d58f4ca9d, 0x58f4ca9d58f4ca9d
    dq 0xe15d5b16e15d5b16, 0xe15d5b16e15d5b16
    dq 0x007f3e86007f3e86, 0x007f3e86007f3e86
    dq 0x3708898037088980, 0x3708898037088980
    dq 0xa507ea32a507ea32, 0xa507ea32a507ea32
    dq 0x6fab95376fab9537, 0x6fab95376fab9537
    dq 0x1740611017406110, 0x1740611017406110
    dq 0x0d8cd6f10d8cd6f1, 0x0d8cd6f10d8cd6f1
    dq 0xcdaa3b6dcdaa3b6d, 0xcdaa3b6dcdaa3b6d
    dq 0xc0bbbe37c0bbbe37, 0xc0bbbe37c0bbbe37
    dq 0x83613bda83613bda, 0x83613bda83613bda
    dq 0xdb48a363db48a363, 0xdb48a363db48a363
    dq 0x0b02e9310b02e931, 0x0b02e9310b02e931
    dq 0x6fd15ca76fd15ca7, 0x6fd15ca76fd15ca7
    dq 0x521afaca521afaca, 0x521afaca521afaca
    dq 0x3133843131338431, 0x3133843131338431
    dq 0x6ed41a956ed41a95, 0x6ed41a956ed41a95
    dq 0x6d4378906d437890, 0x6d4378906d437890
    dq 0xc39c91f2c39c91f2, 0xc39c91f2c39c91f2
    dq 0x9eccabbd9eccabbd, 0x9eccabbd9eccabbd
    dq 0xb5c9a0e6b5c9a0e6, 0xb5c9a0e6b5c9a0e6
    dq 0x532fb63c532fb63c, 0x532fb63c532fb63c
    dq 0xd2c741c6d2c741c6, 0xd2c741c6d2c741c6
    dq 0x07237ea307237ea3, 0x07237ea307237ea3
    dq 0xa4954b68a4954b68, 0xa4954b68a4954b68
    dq 0x4c191d764c191d76, 0x4c191d764c191d76

DIGEST_4:
    dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
    dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
    dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
    dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
    dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
    dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
    dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
    dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

PSHUFFLE_BYTE_FLIP_MASK:
    dq 0x0405060700010203, 0x0c0d0e0f08090a0b
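    ; Used with vpshufb, this mask reverses the bytes inside each 32-bit dword,
    ; converting little-endian loads to the big-endian word order SHA-256 uses.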

section .text

global sha256_4_avx:function
align 16
sha256_4_avx:
    endbranch64
    ; outer calling routine saves all the XMM registers
    sub rsp, STACK_size
    mov [rsp + _RBX], rbx

.hash_4_blocks:
    cmp NUM_BLKS, 4
    jl .hash_1_block

    xor ROUND, ROUND

    ;; Load the pre-transposed incoming digest.
    lea TBL,[rel DIGEST_4]
    vmovdqa a,[TBL + 0*SZ4]
    vmovdqa b,[TBL + 1*SZ4]
    vmovdqa c,[TBL + 2*SZ4]
    vmovdqa d,[TBL + 3*SZ4]
    vmovdqa e,[TBL + 4*SZ4]
    vmovdqa f,[TBL + 5*SZ4]
    vmovdqa g,[TBL + 6*SZ4]
    vmovdqa h,[TBL + 7*SZ4]

    lea TBL,[rel K256_4]

%assign i 0
%rep 4
    vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
    VMOVPS TT2,[DATA_PTR + 0*64 + i*16]
    VMOVPS TT1,[DATA_PTR + 1*64 + i*16]
    VMOVPS TT4,[DATA_PTR + 2*64 + i*16]
    VMOVPS TT3,[DATA_PTR + 3*64 + i*16]
    TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
    vpshufb TT0, TT0, TMP
    vpshufb TT1, TT1, TMP
    vpshufb TT2, TT2, TMP
    vpshufb TT3, TT3, TMP
    ROUND_00_15 TT0,(i*4+0)
    ROUND_00_15 TT1,(i*4+1)
    ROUND_00_15 TT2,(i*4+2)
    ROUND_00_15 TT3,(i*4+3)
%assign i (i+1)
%endrep

%assign i (i*4)

    jmp .Lrounds_16_xx
align 16
.Lrounds_16_xx:
%rep 16
    ROUND_16_XX T1, i
%assign i (i+1)
%endrep

    cmp ROUND,ROUNDS
    jb .Lrounds_16_xx

    ;; add old digest
    lea TBL,[rel DIGEST_4]
    vpaddd a, a, [TBL + 0*SZ4]
    vpaddd b, b, [TBL + 1*SZ4]
    vpaddd c, c, [TBL + 2*SZ4]
    vpaddd d, d, [TBL + 3*SZ4]
    vpaddd e, e, [TBL + 4*SZ4]
    vpaddd f, f, [TBL + 5*SZ4]
    vpaddd g, g, [TBL + 6*SZ4]
    vpaddd h, h, [TBL + 7*SZ4]

    ;; rounds with padding

    ;; save old digest

    vmovdqa [rsp + _DIGEST + 0*SZ4], a
    vmovdqa [rsp + _DIGEST + 1*SZ4], b
    vmovdqa [rsp + _DIGEST + 2*SZ4], c
    vmovdqa [rsp + _DIGEST + 3*SZ4], d
    vmovdqa [rsp + _DIGEST + 4*SZ4], e
    vmovdqa [rsp + _DIGEST + 5*SZ4], f
    vmovdqa [rsp + _DIGEST + 6*SZ4], g
    vmovdqa [rsp + _DIGEST + 7*SZ4], h

    lea TBL,[rel PADDING_4]
    xor ROUND,ROUND
    jmp .Lrounds_padding

align 16
.Lrounds_padding:
%rep 64
    PADDING_ROUND_00_15 T1
%endrep
    ;; add old digest
    vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
    vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
    vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
    vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
    vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
    vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
    vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
    vpaddd h, h, [rsp + _DIGEST + 7*SZ4]

    ;; transpose the digest and convert to little endian to get the registers correctly

    TRANSPOSE a, b, c, d, TT0, TT1
    TRANSPOSE e, f, g, h, TT2, TT1

    vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
    vpshufb TT0, TMP
    vpshufb TT2, TMP
    vpshufb b, TMP
    vpshufb f, TMP
    vpshufb a, TMP
    vpshufb e, TMP
    vpshufb d, TMP
    vpshufb h, TMP

    ;; write to output

    vmovdqu [OUTPUT_PTR + 0*SZ4],TT0
    vmovdqu [OUTPUT_PTR + 1*SZ4],TT2
    vmovdqu [OUTPUT_PTR + 2*SZ4],b
    vmovdqu [OUTPUT_PTR + 3*SZ4],f
    vmovdqu [OUTPUT_PTR + 4*SZ4],a
    vmovdqu [OUTPUT_PTR + 5*SZ4],e
    vmovdqu [OUTPUT_PTR + 6*SZ4],d
    vmovdqu [OUTPUT_PTR + 7*SZ4],h

    ; update pointers and loop

    add DATA_PTR, 64*4
    add OUTPUT_PTR, 32*4
    sub NUM_BLKS, 4
    jmp .hash_4_blocks

.hash_1_block:
    test NUM_BLKS,NUM_BLKS
    jz .done_hash
    call sha256_1_avx
    add DATA_PTR, 64
    add OUTPUT_PTR, 32
    dec NUM_BLKS
    jmp .hash_1_block

.done_hash:
    mov rbx,[rsp + _RBX]
    add rsp, STACK_size
    ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
crypto/hash/custom_hasher/assembly/sha256_avx2.asm (new file, 797 lines)
@@ -0,0 +1,797 @@
|
||||
;; sha256_avx2.asm
|
||||
; *
|
||||
; * This file is part of Mammon.
|
||||
; * mammon is a greedy and selfish ETH consensus client.
|
||||
; *
|
||||
; * Copyright (c) 2021 - Reimundo Heluani (potuz) potuz@potuz.net
|
||||
; *
|
||||
; * This program is free software: you can redistribute it and/or modify
|
||||
; * it under the terms of the GNU General Public License as published by
|
||||
; * the Free Software Foundation, either version 3 of the License, or
|
||||
; * (at your option) any later version.
|
||||
; *
|
||||
; * This program is distributed in the hope that it will be useful,
|
||||
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
; * GNU General Public License for more details.
|
||||
; *
|
||||
; You should have received a copy of the GNU General Public License
|
||||
; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
;
|
||||
; This implementation is a 64 bytes optimized implementation based on Intel's code
|
||||
; whose copyright follows
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;
|
||||
;; Copyright (c) 2012-2021, Intel Corporation
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright notice,
|
||||
;; this list of conditions and the following disclaimer.
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;; * Neither the name of Intel Corporation nor the names of its contributors
|
||||
;; may be used to endorse or promote products derived from this software
|
||||
;; without specific prior written permission.
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;
|
||||
|
||||
;; code to compute oct SHA256 using SSE-256
|
||||
;; outer calling routine takes care of save and restore of XMM registers
|
||||
;; Logic designed/laid out by JDG
|
||||
|
||||
;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; ymm0-15
|
||||
;; Stack must be aligned to 32 bytes before call
|
||||
;; Windows clobbers: rax rdx rsi rdi r8 r9 r10 r11 r12 r13 r14
|
||||
;; Windows preserves: rcx rbp r15
|
||||
;;
|
||||
;; Linux clobbers: rax rcx rdx rsi r8 r9 r10 r11 r12 r13 r14
|
||||
;; Linux preserves: rdi rbp r15
|
||||
;;
|
||||
;; clobbers ymm0-15
|
||||
|
||||
%include "transpose_avx2.asm"
|
||||
|
||||
extern sha256_4_avx
|
||||
|
||||
section .data
|
||||
default rel
|
||||
align 64
|
||||
|
||||
K256_8:
|
||||
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
|
||||
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
|
||||
dq 0x7137449171374491, 0x7137449171374491
|
||||
dq 0x7137449171374491, 0x7137449171374491
|
||||
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
|
||||
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
|
||||
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
|
||||
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
|
||||
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
|
||||
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
|
||||
dq 0x59f111f159f111f1, 0x59f111f159f111f1
|
||||
dq 0x59f111f159f111f1, 0x59f111f159f111f1
|
||||
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
|
||||
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
|
||||
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
|
||||
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
|
||||
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
|
||||
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
|
||||
dq 0x12835b0112835b01, 0x12835b0112835b01
|
||||
dq 0x12835b0112835b01, 0x12835b0112835b01
|
||||
dq 0x243185be243185be, 0x243185be243185be
|
||||
dq 0x243185be243185be, 0x243185be243185be
|
||||
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
|
||||
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
|
||||
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
|
||||
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
|
||||
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
|
||||
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
|
||||
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
|
||||
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
|
||||
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
|
||||
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
|
||||
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
|
||||
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
|
||||
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
|
||||
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
|
||||
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
|
||||
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
|
||||
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
|
||||
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
|
||||
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
|
||||
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
|
||||
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
|
||||
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
|
||||
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
|
||||
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
|
||||
dq 0x76f988da76f988da, 0x76f988da76f988da
|
||||
dq 0x76f988da76f988da, 0x76f988da76f988da
|
||||
dq 0x983e5152983e5152, 0x983e5152983e5152
|
||||
dq 0x983e5152983e5152, 0x983e5152983e5152
|
||||
dq 0xa831c66da831c66d, 0xa831c66da831c66d
|
||||
dq 0xa831c66da831c66d, 0xa831c66da831c66d
|
||||
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
|
||||
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
|
||||
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
|
||||
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
|
||||
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
|
||||
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
|
||||
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
|
||||
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
|
||||
dq 0x06ca635106ca6351, 0x06ca635106ca6351
|
||||
dq 0x06ca635106ca6351, 0x06ca635106ca6351
|
||||
dq 0x1429296714292967, 0x1429296714292967
|
||||
dq 0x1429296714292967, 0x1429296714292967
|
||||
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
|
||||
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
|
||||
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
|
||||
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
|
||||
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
|
||||
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
|
||||
dq 0x53380d1353380d13, 0x53380d1353380d13
|
||||
dq 0x53380d1353380d13, 0x53380d1353380d13
|
||||
dq 0x650a7354650a7354, 0x650a7354650a7354
|
||||
dq 0x650a7354650a7354, 0x650a7354650a7354
|
||||
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
|
||||
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
|
||||
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
|
||||
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
|
||||
dq 0x92722c8592722c85, 0x92722c8592722c85
|
||||
dq 0x92722c8592722c85, 0x92722c8592722c85
|
||||
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
|
||||
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
|
||||
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
|
||||
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
|
||||
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
|
||||
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
|
||||
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
|
||||
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
|
||||
dq 0xd192e819d192e819, 0xd192e819d192e819
|
||||
dq 0xd192e819d192e819, 0xd192e819d192e819
|
||||
dq 0xd6990624d6990624, 0xd6990624d6990624
|
||||
dq 0xd6990624d6990624, 0xd6990624d6990624
|
||||
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
|
||||
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
|
||||
dq 0x106aa070106aa070, 0x106aa070106aa070
|
||||
dq 0x106aa070106aa070, 0x106aa070106aa070
|
||||
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
|
||||
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
|
||||
dq 0x1e376c081e376c08, 0x1e376c081e376c08
|
||||
dq 0x1e376c081e376c08, 0x1e376c081e376c08
|
||||
dq 0x2748774c2748774c, 0x2748774c2748774c
|
||||
dq 0x2748774c2748774c, 0x2748774c2748774c
|
||||
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
|
||||
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
|
||||
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
|
||||
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
|
||||
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
|
||||
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
|
||||
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
|
||||
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
|
||||
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
|
||||
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
|
||||
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
|
||||
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
|
||||
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
|
||||
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
|
||||
dq 0x84c8781484c87814, 0x84c8781484c87814
|
||||
dq 0x84c8781484c87814, 0x84c8781484c87814
|
||||
dq 0x8cc702088cc70208, 0x8cc702088cc70208
|
||||
dq 0x8cc702088cc70208, 0x8cc702088cc70208
|
||||
dq 0x90befffa90befffa, 0x90befffa90befffa
|
||||
dq 0x90befffa90befffa, 0x90befffa90befffa
|
||||
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
|
||||
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
|
||||
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
|
||||
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
|
||||
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
|
||||
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
|
||||
|
||||
PADDING_8:
|
||||
|
||||
ddq 0xc28a2f98c28a2f98c28a2f98c28a2f98
|
||||
ddq 0xc28a2f98c28a2f98c28a2f98c28a2f98
|
||||
ddq 0x71374491713744917137449171374491
|
||||
ddq 0x71374491713744917137449171374491
|
||||
ddq 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
|
||||
ddq 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
|
||||
ddq 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
|
||||
ddq 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
|
||||
ddq 0x3956c25b3956c25b3956c25b3956c25b
|
||||
ddq 0x3956c25b3956c25b3956c25b3956c25b
|
||||
ddq 0x59f111f159f111f159f111f159f111f1
|
||||
ddq 0x59f111f159f111f159f111f159f111f1
|
||||
ddq 0x923f82a4923f82a4923f82a4923f82a4
|
||||
ddq 0x923f82a4923f82a4923f82a4923f82a4
|
||||
ddq 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
|
||||
ddq 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
|
||||
ddq 0xd807aa98d807aa98d807aa98d807aa98
|
||||
ddq 0xd807aa98d807aa98d807aa98d807aa98
|
||||
ddq 0x12835b0112835b0112835b0112835b01
|
||||
ddq 0x12835b0112835b0112835b0112835b01
|
||||
ddq 0x243185be243185be243185be243185be
|
||||
ddq 0x243185be243185be243185be243185be
|
||||
ddq 0x550c7dc3550c7dc3550c7dc3550c7dc3
|
||||
ddq 0x550c7dc3550c7dc3550c7dc3550c7dc3
|
||||
ddq 0x72be5d7472be5d7472be5d7472be5d74
|
||||
ddq 0x72be5d7472be5d7472be5d7472be5d74
|
||||
ddq 0x80deb1fe80deb1fe80deb1fe80deb1fe
|
||||
ddq 0x80deb1fe80deb1fe80deb1fe80deb1fe
|
||||
ddq 0x9bdc06a79bdc06a79bdc06a79bdc06a7
|
||||
ddq 0x9bdc06a79bdc06a79bdc06a79bdc06a7
|
||||
ddq 0xc19bf374c19bf374c19bf374c19bf374
|
||||
ddq 0xc19bf374c19bf374c19bf374c19bf374
|
||||
ddq 0x649b69c1649b69c1649b69c1649b69c1
|
||||
ddq 0x649b69c1649b69c1649b69c1649b69c1
|
||||
ddq 0xf0fe4786f0fe4786f0fe4786f0fe4786
|
||||
ddq 0xf0fe4786f0fe4786f0fe4786f0fe4786
|
||||
ddq 0x0fe1edc60fe1edc60fe1edc60fe1edc6
|
||||
ddq 0x0fe1edc60fe1edc60fe1edc60fe1edc6
|
||||
ddq 0x240cf254240cf254240cf254240cf254
|
||||
ddq 0x240cf254240cf254240cf254240cf254
|
||||
ddq 0x4fe9346f4fe9346f4fe9346f4fe9346f
|
||||
ddq 0x4fe9346f4fe9346f4fe9346f4fe9346f
|
||||
ddq 0x6cc984be6cc984be6cc984be6cc984be
|
||||
ddq 0x6cc984be6cc984be6cc984be6cc984be
|
||||
ddq 0x61b9411e61b9411e61b9411e61b9411e
|
||||
ddq 0x61b9411e61b9411e61b9411e61b9411e
|
||||
ddq 0x16f988fa16f988fa16f988fa16f988fa
ddq 0x16f988fa16f988fa16f988fa16f988fa
ddq 0xf2c65152f2c65152f2c65152f2c65152
ddq 0xf2c65152f2c65152f2c65152f2c65152
ddq 0xa88e5a6da88e5a6da88e5a6da88e5a6d
ddq 0xa88e5a6da88e5a6da88e5a6da88e5a6d
ddq 0xb019fc65b019fc65b019fc65b019fc65
ddq 0xb019fc65b019fc65b019fc65b019fc65
ddq 0xb9d99ec7b9d99ec7b9d99ec7b9d99ec7
ddq 0xb9d99ec7b9d99ec7b9d99ec7b9d99ec7
ddq 0x9a1231c39a1231c39a1231c39a1231c3
ddq 0x9a1231c39a1231c39a1231c39a1231c3
ddq 0xe70eeaa0e70eeaa0e70eeaa0e70eeaa0
ddq 0xe70eeaa0e70eeaa0e70eeaa0e70eeaa0
ddq 0xfdb1232bfdb1232bfdb1232bfdb1232b
ddq 0xfdb1232bfdb1232bfdb1232bfdb1232b
ddq 0xc7353eb0c7353eb0c7353eb0c7353eb0
ddq 0xc7353eb0c7353eb0c7353eb0c7353eb0
ddq 0x3069bad53069bad53069bad53069bad5
ddq 0x3069bad53069bad53069bad53069bad5
ddq 0xcb976d5fcb976d5fcb976d5fcb976d5f
ddq 0xcb976d5fcb976d5fcb976d5fcb976d5f
ddq 0x5a0f118f5a0f118f5a0f118f5a0f118f
ddq 0x5a0f118f5a0f118f5a0f118f5a0f118f
ddq 0xdc1eeefddc1eeefddc1eeefddc1eeefd
ddq 0xdc1eeefddc1eeefddc1eeefddc1eeefd
ddq 0x0a35b6890a35b6890a35b6890a35b689
ddq 0x0a35b6890a35b6890a35b6890a35b689
ddq 0xde0b7a04de0b7a04de0b7a04de0b7a04
ddq 0xde0b7a04de0b7a04de0b7a04de0b7a04
ddq 0x58f4ca9d58f4ca9d58f4ca9d58f4ca9d
ddq 0x58f4ca9d58f4ca9d58f4ca9d58f4ca9d
ddq 0xe15d5b16e15d5b16e15d5b16e15d5b16
ddq 0xe15d5b16e15d5b16e15d5b16e15d5b16
ddq 0x007f3e86007f3e86007f3e86007f3e86
ddq 0x007f3e86007f3e86007f3e86007f3e86
ddq 0x37088980370889803708898037088980
ddq 0x37088980370889803708898037088980
ddq 0xa507ea32a507ea32a507ea32a507ea32
ddq 0xa507ea32a507ea32a507ea32a507ea32
ddq 0x6fab95376fab95376fab95376fab9537
ddq 0x6fab95376fab95376fab95376fab9537
ddq 0x17406110174061101740611017406110
ddq 0x17406110174061101740611017406110
ddq 0x0d8cd6f10d8cd6f10d8cd6f10d8cd6f1
ddq 0x0d8cd6f10d8cd6f10d8cd6f10d8cd6f1
ddq 0xcdaa3b6dcdaa3b6dcdaa3b6dcdaa3b6d
ddq 0xcdaa3b6dcdaa3b6dcdaa3b6dcdaa3b6d
ddq 0xc0bbbe37c0bbbe37c0bbbe37c0bbbe37
ddq 0xc0bbbe37c0bbbe37c0bbbe37c0bbbe37
ddq 0x83613bda83613bda83613bda83613bda
ddq 0x83613bda83613bda83613bda83613bda
ddq 0xdb48a363db48a363db48a363db48a363
ddq 0xdb48a363db48a363db48a363db48a363
ddq 0x0b02e9310b02e9310b02e9310b02e931
ddq 0x0b02e9310b02e9310b02e9310b02e931
ddq 0x6fd15ca76fd15ca76fd15ca76fd15ca7
ddq 0x6fd15ca76fd15ca76fd15ca76fd15ca7
ddq 0x521afaca521afaca521afaca521afaca
ddq 0x521afaca521afaca521afaca521afaca
ddq 0x31338431313384313133843131338431
ddq 0x31338431313384313133843131338431
ddq 0x6ed41a956ed41a956ed41a956ed41a95
ddq 0x6ed41a956ed41a956ed41a956ed41a95
ddq 0x6d4378906d4378906d4378906d437890
ddq 0x6d4378906d4378906d4378906d437890
ddq 0xc39c91f2c39c91f2c39c91f2c39c91f2
ddq 0xc39c91f2c39c91f2c39c91f2c39c91f2
ddq 0x9eccabbd9eccabbd9eccabbd9eccabbd
ddq 0x9eccabbd9eccabbd9eccabbd9eccabbd
ddq 0xb5c9a0e6b5c9a0e6b5c9a0e6b5c9a0e6
ddq 0xb5c9a0e6b5c9a0e6b5c9a0e6b5c9a0e6
ddq 0x532fb63c532fb63c532fb63c532fb63c
ddq 0x532fb63c532fb63c532fb63c532fb63c
ddq 0xd2c741c6d2c741c6d2c741c6d2c741c6
ddq 0xd2c741c6d2c741c6d2c741c6d2c741c6
ddq 0x07237ea307237ea307237ea307237ea3
ddq 0x07237ea307237ea307237ea307237ea3
ddq 0xa4954b68a4954b68a4954b68a4954b68
ddq 0xa4954b68a4954b68a4954b68a4954b68
ddq 0x4c191d764c191d764c191d764c191d76
ddq 0x4c191d764c191d764c191d764c191d76

DIGEST_8:
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
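(Aside, not part of this diff: DIGEST_8 is the standard SHA-256 initial state H0..H7 from FIPS 180-4, with each 32-bit word broadcast to all eight lanes of a ymm register so that eight independent digests start from the same IV. A minimal Go rendering of the same table, for reference:)

package sha256ref

// SHA-256 initial hash values (FIPS 180-4, section 5.3.3); DIGEST_8
// above stores each of these words 8 times, one copy per AVX2 lane.
var sha256IV = [8]uint32{
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
}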

PSHUFFLE_BYTE_FLIP_MASK:
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
dq 0x0405060700010203, 0x0c0d0e0f08090a0b

STACK_ALIGNMENT_MASK:
dq 0xffffffffffffffe0
section .text

%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define NUM_BLKS r8 ; 3rd arg
%define TBL rsi
%define reg1 rdi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define NUM_BLKS rdx ; 3rd arg
%define TBL rcx
%define reg1 r8
%endif

%define ROUND rax

%define inp0 r9
%define inp1 r10
%define inp2 r11
%define inp3 r12
%define inp4 r13
%define inp5 r14
%define inp6 reg1
%define inp7 reg2


; ymm0 a
; ymm1 b
; ymm2 c
; ymm3 d
; ymm4 e
; ymm5 f
; ymm6 g TMP0
; ymm7 h TMP1
; ymm8 T1 TT0
; ymm9 TT1
; ymm10 TT2
; ymm11 TT3
; ymm12 a0 TT4
; ymm13 a1 TT5
; ymm14 a2 TT6
; ymm15 TMP TT7

%define a ymm0
%define b ymm1
%define c ymm2
%define d ymm3
%define e ymm4
%define f ymm5
%define g ymm6
%define h ymm7

%define T1 ymm8

%define a0 ymm12
%define a1 ymm13
%define a2 ymm14
%define TMP ymm15

%define TMP0 ymm6
%define TMP1 ymm7

%define TT0 ymm8
%define TT1 ymm9
%define TT2 ymm10
%define TT3 ymm11
%define TT4 ymm12
%define TT5 ymm13
%define TT6 ymm14
%define TT7 ymm15

%define SHA256_DIGEST_WORD_SIZE 4;
%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
%define ROUNDS 64*SZ8

; Define stack usage

;; Assume stack aligned to 32 bytes before call
;; Therefore FRAMESZ mod 32 must be 32-8 = 24
struc stack_frame
.data resb 16*SZ8
.digest resb 8*SZ8
.ytmp resb 4*SZ8
.regsave resb 4*64
endstruc
%define FRAMESZ stack_frame_size
%define _DIGEST stack_frame.digest
%define _YTMP stack_frame.ytmp
%define _RSAVE stack_frame.regsave

%define YTMP0 rsp + _YTMP + 0*SZ8
%define YTMP1 rsp + _YTMP + 1*SZ8
%define YTMP2 rsp + _YTMP + 2*SZ8
%define YTMP3 rsp + _YTMP + 3*SZ8
%define R12 rsp + _RSAVE + 0*64
%define R13 rsp + _RSAVE + 1*64
%define R14 rsp + _RSAVE + 2*64
%define R15 rsp + _RSAVE + 3*64


%define VMOVPS vmovups


%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
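(Aside, not part of this diff: ROTATE_ARGS is a pure preprocessor rename, so no data moves at runtime; over one round it is equivalent to rotating the eight working variables one slot, as in this Go sketch:)

// rotateArgs mirrors what ROTATE_ARGS does with %xdefine renames: the
// old h slides into the a slot and every other variable shifts down one.
// v holds [a b c d e f g h].
func rotateArgs(v [8]uint32) [8]uint32 {
	return [8]uint32{v[7], v[0], v[1], v[2], v[3], v[4], v[5], v[6]}
}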

; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
vpslld %%tmp, %%reg, (32-(%%imm))
vpsrld %%reg, %%reg, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
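(Aside, not part of this diff: AVX/AVX2 have no packed 32-bit rotate instruction, so PRORD builds a rotate-right out of two shifts and an OR. The scalar meaning of what each lane computes, continuing the sha256ref sketch:)

// prord is a 32-bit rotate-right by n, exactly the
// vpslld/vpsrld/vpor triple above applied per lane.
func prord(x uint32, n uint) uint32 {
	return (x >> n) | (x << (32 - n))
}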

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
;vmovdqa %%tmp, %%reg
vpslld %%tmp, %%src, (32-(%%imm))
vpsrld %%reg, %%src, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
PRORD %1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
PRORD_nd %1, %3, TMP, %2
%endmacro

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i %2
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)

vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch

PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1 ; save current temp message
vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ8 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0

vpaddd d, d, h

vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0

ROTATE_ARGS
%endm
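(Aside, not part of this diff: ROUND_00_15 evaluates one standard SHA-256 round for eight lanes at once. The per-lane math it implements, written out in plain Go and continuing the sha256ref sketch:)

import "math/bits"

func ch(e, f, g uint32) uint32  { return ((f ^ g) & e) ^ g } // same form as the vpxor/vpand/vpxor above
func maj(a, b, c uint32) uint32 { return ((a ^ c) & b) | (a & c) }

func bigSigma1(e uint32) uint32 {
	return bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
}

func bigSigma0(a uint32) uint32 {
	return bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
}

// round applies one SHA-256 round; wk is W[i]+K[i] (the macro adds the
// round constant at [TBL + ROUND] to the message word). The returned
// tuple is already rotated, matching ROTATE_ARGS.
func round(a, b, c, d, e, f, g, h, wk uint32) (uint32, uint32, uint32, uint32, uint32, uint32, uint32, uint32) {
	t1 := h + bigSigma1(e) + ch(e, f, g) + wk
	t2 := bigSigma0(a) + maj(a, b, c)
	return t1 + t2, a, b, c, d + t1, e, f, g
}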


;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i %2
vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
vmovdqa a0, %%T1
PRORD %%T1, 18-7
vmovdqa a2, a1
PRORD a1, 19-17
vpxor %%T1, %%T1, a0
PRORD %%T1, 7
vpxor a1, a1, a2
PRORD a1, 17
vpsrld a0, a0, 3
vpxor %%T1, %%T1, a0
vpsrld a2, a2, 10
vpxor a1, a1, a2
vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] ; + W[i-16]
vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] ; + W[i-7]
vpaddd %%T1, %%T1, a1

ROUND_00_15 %%T1, %%i

%endm
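(Aside, not part of this diff: ROUND_16_XX extends the message schedule before reusing ROUND_00_15. Per lane it computes the standard recurrence, continuing the sha256ref sketch; the rotate amounts 7/18 and 17/19 plus logical shifts by 3 and 10 match the PRORD/vpsrld mix above:)

func sigma0(x uint32) uint32 {
	return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3)
}

func sigma1(x uint32) uint32 {
	return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10)
}

// scheduleWord computes w[i] = w[i-16] + sigma0(w[i-15]) + w[i-7] + sigma1(w[i-2]).
func scheduleWord(w []uint32, i int) uint32 {
	return w[i-16] + sigma0(w[i-15]) + w[i-7] + sigma1(w[i-2])
}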

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro PADDING_ROUND_00_15 1
%define %%T1 %1
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)

vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch

PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ8 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0

vpaddd d, d, h

vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0

ROTATE_ARGS
%endm


global sha256_8_avx2:function
align 16
sha256_8_avx2:
endbranch64
; outer calling routine saves all the XMM registers
push rbp
mov rbp,rsp
and rsp, [rel STACK_ALIGNMENT_MASK]
sub rsp, FRAMESZ
mov [R12], r12
mov [R13], r13
mov [R14], r14
mov [R15], r15

.hash_8_blocks:
cmp NUM_BLKS, 8
jl .hash_4_blocks
xor ROUND, ROUND

lea TBL,[rel DIGEST_8]
vmovdqa a,[TBL + 0*32]
vmovdqa b,[TBL + 1*32]
vmovdqa c,[TBL + 2*32]
vmovdqa d,[TBL + 3*32]
vmovdqa e,[TBL + 4*32]
vmovdqa f,[TBL + 5*32]
vmovdqa g,[TBL + 6*32]
vmovdqa h,[TBL + 7*32]

lea TBL,[rel K256_8]

%assign i 0
%rep 2
TRANSPOSE8_U32_LOAD8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, \
DATA_PTR + 0*64, \
DATA_PTR + 1*64, \
DATA_PTR + 2*64, \
DATA_PTR + 3*64, \
DATA_PTR + 4*64, \
DATA_PTR + 5*64, \
DATA_PTR + 6*64, \
DATA_PTR + 7*64, \
i*32

vmovdqa [YTMP0], g
vmovdqa [YTMP1], h
TRANSPOSE8_U32_PRELOADED TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
vmovdqa TMP1, [rel PSHUFFLE_BYTE_FLIP_MASK]
vmovdqa g, [YTMP0]
vpshufb TT0, TT0, TMP1
vpshufb TT1, TT1, TMP1
vpshufb TT2, TT2, TMP1
vpshufb TT3, TT3, TMP1
vpshufb TT4, TT4, TMP1
vpshufb TT5, TT5, TMP1
vpshufb TT6, TT6, TMP1
vpshufb TT7, TT7, TMP1
vmovdqa h, [YTMP1]
vmovdqa [YTMP0], TT4
vmovdqa [YTMP1], TT5
vmovdqa [YTMP2], TT6
vmovdqa [YTMP3], TT7
ROUND_00_15 TT0,(i*8+0)
vmovdqa TT0, [YTMP0]
ROUND_00_15 TT1,(i*8+1)
vmovdqa TT1, [YTMP1]
ROUND_00_15 TT2,(i*8+2)
vmovdqa TT2, [YTMP2]
ROUND_00_15 TT3,(i*8+3)
vmovdqa TT3, [YTMP3]
ROUND_00_15 TT0,(i*8+4)
ROUND_00_15 TT1,(i*8+5)
ROUND_00_15 TT2,(i*8+6)
ROUND_00_15 TT3,(i*8+7)
%assign i (i+1)
%endrep

%assign i (i*8)

jmp .Lrounds_16_xx
align 16
.Lrounds_16_xx:
%rep 16
ROUND_16_XX T1, i
%assign i (i+1)
%endrep

cmp ROUND,ROUNDS
jb .Lrounds_16_xx

;; add old digest
lea TBL,[rel DIGEST_8]
vpaddd a, a, [TBL + 0*SZ8]
vpaddd b, b, [TBL + 1*SZ8]
vpaddd c, c, [TBL + 2*SZ8]
vpaddd d, d, [TBL + 3*SZ8]
vpaddd e, e, [TBL + 4*SZ8]
vpaddd f, f, [TBL + 5*SZ8]
vpaddd g, g, [TBL + 6*SZ8]
vpaddd h, h, [TBL + 7*SZ8]

;; rounds with padding

;; save old digest
vmovdqa [rsp + _DIGEST + 0*SZ8], a
vmovdqa [rsp + _DIGEST + 1*SZ8], b
vmovdqa [rsp + _DIGEST + 2*SZ8], c
vmovdqa [rsp + _DIGEST + 3*SZ8], d
vmovdqa [rsp + _DIGEST + 4*SZ8], e
vmovdqa [rsp + _DIGEST + 5*SZ8], f
vmovdqa [rsp + _DIGEST + 6*SZ8], g
vmovdqa [rsp + _DIGEST + 7*SZ8], h


lea TBL,[rel PADDING_8]
xor ROUND,ROUND
jmp .Lrounds_padding

align 16
.Lrounds_padding:
%rep 64
PADDING_ROUND_00_15 T1
%endrep
;; add old digest
vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
vpaddd h, h, [rsp + _DIGEST + 7*SZ8]


;; transpose the digest and convert to little endian to get the registers correctly

TRANSPOSE8_U32 a, b, c, d, e, f, g, h, TT0, TT1
vmovdqa TT0, [rel PSHUFFLE_BYTE_FLIP_MASK]
vpshufb a, a, TT0
vpshufb b, b, TT0
vpshufb c, c, TT0
vpshufb d, d, TT0
vpshufb e, e, TT0
vpshufb f, f, TT0
vpshufb g, g, TT0
vpshufb h, h, TT0

;; write to output

vmovdqu [OUTPUT_PTR + 0*32],a
vmovdqu [OUTPUT_PTR + 1*32],b
vmovdqu [OUTPUT_PTR + 2*32],c
vmovdqu [OUTPUT_PTR + 3*32],d
vmovdqu [OUTPUT_PTR + 4*32],e
vmovdqu [OUTPUT_PTR + 5*32],f
vmovdqu [OUTPUT_PTR + 6*32],g
vmovdqu [OUTPUT_PTR + 7*32],h

; update pointers and loop

add DATA_PTR, 64*8
add OUTPUT_PTR, 32*8
sub NUM_BLKS, 8

jmp .hash_8_blocks

.hash_4_blocks:

call sha256_4_avx

mov r12,[R12]
mov r13,[R13]
mov r14,[R14]
mov r15,[R15]

mov rsp,rbp
pop rbp
ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
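(Aside, not part of this diff: the digest registers are reloaded from DIGEST_8 for every batch, so each "block" is an independent 64-byte message. As the cgo wrappers and the C++ sha256_sse loop later in this diff suggest, the calling contract appears to be: block i maps input[64*i:64*(i+1)] to its 32-byte SHA-256 digest at output[32*i:32*(i+1)], with the padding block handled internally via PADDING_8. A standard-library reference model, continuing the sha256ref sketch:)

import "crypto/sha256"

// sha256NwayRef models the contract of sha256_8_avx2/sha256_4_avx as
// used by this diff's wrappers, one digest per 64-byte block.
func sha256NwayRef(output, input []byte, blocks int) {
	for i := 0; i < blocks; i++ {
		d := sha256.Sum256(input[64*i : 64*(i+1)])
		copy(output[32*i:], d[:])
	}
}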
646 crypto/hash/custom_hasher/assembly/sha256_avx_one_block.asm Normal file
@@ -0,0 +1,646 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .data
default rel
align 64
K256:
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

DIGEST:
dd 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
dd 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19

PADDING:
dd 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
dd 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
dd 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
dd 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374
dd 0x649b69c1, 0xf0fe4786, 0xfe1edc6, 0x240cf254
dd 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa
dd 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7
dd 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0
dd 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd
dd 0xa35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16
dd 0x7f3e86, 0x37088980, 0xa507ea32, 0x6fab9537
dd 0x17406110, 0xd8cd6f1, 0xcdaa3b6d, 0xc0bbbe37
dd 0x83613bda, 0xdb48a363, 0xb02e931, 0x6fd15ca7
dd 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890
dd 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c
dd 0xd2c741c6, 0x7237ea3, 0xa4954b68, 0x4c191d76


PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
dq 0x0405060700010203, 0x0c0d0e0f08090a0b

; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF

; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100

section .text

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

%macro MY_ROR 2
shld %1,%1,(32-(%2))
%endm

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
VMOVDQ %1, %2
vpshufb %1, %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define XTMP5 xmm11

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13

%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define d r8d ; 3rd
%define TBL rsi
%define c edi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define c edx ; 3rd arg
%define TBL rcx
%define d r8d
%endif


%define a eax
%define b ebx

%define e r9d
%define f r10d
%define g r11d
%define h r12d

%define y0 r13d
%define y1 r14d
%define y2 r15d


struc STACK
_XFER: resb 32
_DIGEST: resb 32
%ifdef WINABI
_XMM_SAVE: reso 8
resb 16 ; alignment
%endif
endstruc

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
;; compute s0 four at a time and s1 two at a time
;; compute W[-16] + W[-7] 4 at a time
;vmovdqa XTMP0, X3
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
;vmovdqa XTMP1, X1
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
;; compute s0
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g

MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH

mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a

vpsrld XTMP2, XTMP1, 7

or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c

vpslld XTMP3, XTMP1, (32-7)

and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0

vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7

or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS

mov y0, e ; y0 = e
mov y1, a ; y1 = a

MY_ROR y0, (25-11) ; y0 = e >> (25-11)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y1, (22-13) ; y1 = a >> (22-13)

vpsrld XTMP2, XTMP1,18

xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g

vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3

MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)

vpslld XTMP1, XTMP1, (32-18)

xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g

vpxor XTMP3, XTMP3, XTMP1

add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)

vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18

mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a

vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0

or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
;; compute low s1
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}

mov y0, e ; y0 = e
mov y1, a ; y1 = a
MY_ROR y0, (25-11) ; y0 = e >> (25-11)

;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}

xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))

vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}

xor y2, g ; y2 = f^g

vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}

xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e

vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}

MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
vpxor XTMP2, XTMP2, XTMP3
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
;; compute high s1
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))

vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}

xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g

vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}

xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))

vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}

xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g

vpxor XTMP2, XTMP2, XTMP3

MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%endm


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_1_avx(unsigned char *output, const unsigned char *input)
;; arg 1 : pointer to the 32-byte output digest
;; arg 2 : pointer to the 64-byte input block
section .text
global sha256_1_avx:function
align 32
sha256_1_avx:
endbranch64
push rbx
%ifdef WINABI
push r8
push rsi
push rdi
%else
push rdx
%endif
push rbp
push r12
push r13
push r14
push r15

sub rsp,STACK_size
%ifdef WINABI
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif
vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
vmovdqa SHUF_00BA, [rel _SHUF_00BA]
vmovdqa SHUF_DC00, [rel _SHUF_DC00]

.hash_1_block:
;; load initial digest
lea TBL,[rel DIGEST]
mov a, [TBL + 0*4]
mov b, [TBL + 1*4]
mov c, [TBL + 2*4]
mov d, [TBL + 3*4]
mov e, [TBL + 4*4]
mov f, [TBL + 5*4]
mov g, [TBL + 6*4]
mov h, [TBL + 7*4]

lea TBL,[rel K256]

;; byte swap first 16 dwords
COPY_XMM_AND_BSWAP X0, [DATA_PTR + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [DATA_PTR + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [DATA_PTR + 2*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X3, [DATA_PTR + 3*16], BYTE_FLIP_MASK

;; schedule 48 input dwords, by doing 3 rounds of 16 each
%rep 3
align 16
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED

vpaddd XFER, X0, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED

vpaddd XFER, X0, [TBL + 2*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED

vpaddd XFER, X0, [TBL + 3*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 4*16
FOUR_ROUNDS_AND_SCHED
%endrep

%rep 2
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3

vpaddd XFER, X1, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 2*16
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3

vmovdqa X0, X2
vmovdqa X1, X3

%endrep

; add old digest

lea TBL,[rel DIGEST]
add a, [TBL + 0*4]
add b, [TBL + 1*4]
add c, [TBL + 2*4]
add d, [TBL + 3*4]
add e, [TBL + 4*4]
add f, [TBL + 5*4]
add g, [TBL + 6*4]
add h, [TBL + 7*4]


; rounds with padding

; save old digest
;
mov [rsp + _DIGEST + 0*4], a
mov [rsp + _DIGEST + 1*4], b
mov [rsp + _DIGEST + 2*4], c
mov [rsp + _DIGEST + 3*4], d
mov [rsp + _DIGEST + 4*4], e
mov [rsp + _DIGEST + 5*4], f
mov [rsp + _DIGEST + 6*4], g
mov [rsp + _DIGEST + 7*4], h

lea TBL,[rel PADDING]

%assign i 0
%rep 64
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [TBL + i] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%assign i (i+4)
%endrep

;; add the previous digest
add a, [rsp + _DIGEST + 0*4]
add b, [rsp + _DIGEST + 1*4]
add c, [rsp + _DIGEST + 2*4]
add d, [rsp + _DIGEST + 3*4]
add e, [rsp + _DIGEST + 4*4]
add f, [rsp + _DIGEST + 5*4]
add g, [rsp + _DIGEST + 6*4]
add h, [rsp + _DIGEST + 7*4]

;; shuffle the bytes to little endian
bswap a
bswap b
bswap c
bswap d
bswap e
bswap f
bswap g
bswap h

;; write resulting hash
mov [OUTPUT_PTR + 0*4], a
mov [OUTPUT_PTR + 1*4], b
mov [OUTPUT_PTR + 2*4], c
mov [OUTPUT_PTR + 3*4], d
mov [OUTPUT_PTR + 4*4], e
mov [OUTPUT_PTR + 5*4], f
mov [OUTPUT_PTR + 6*4], g
mov [OUTPUT_PTR + 7*4], h

%ifdef WINABI
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif

add rsp, STACK_size

pop r15
pop r14
pop r13
pop r12
pop rbp
%ifdef WINABI
pop rdi
pop rsi
pop r8
%else
pop rdx
%endif
pop rbx

ret


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
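(Aside, not part of this diff: the PADDING table above is the precomputed W[i]+K[i] sequence for the fixed second block of a 64-byte message, 0x80 followed by zeros and the 64-bit length 512, which is why the padding rounds read [TBL + i] directly instead of running the message schedule. For instance, K[0]+0x80000000 = 0x428a2f98+0x80000000 = 0xc28a2f98, the first table entry. The test vector used later in this diff can be reproduced with the standard library, continuing the sha256ref sketch:)

import (
	"crypto/sha256"
	"fmt"
)

// ExampleZeroChunks reproduces hashOf1 from the tests further down:
// hashing 64 zero bytes (two zero 32-byte chunks) gives the well-known
// zero-subtree root at depth 1.
func ExampleZeroChunks() {
	var chunks [64]byte
	fmt.Printf("%x\n", sha256.Sum256(chunks[:]))
	// Output: f5a5fd42d16a20302798ef6ed309979b43003d2320d9f0e8ea9831a92759fb4b
}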
1025 crypto/hash/custom_hasher/assembly/sha256_shani.asm Normal file
File diff suppressed because it is too large
192 crypto/hash/custom_hasher/assembly/transpose_avx2.asm Normal file
@@ -0,0 +1,192 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%ifndef _TRANSPOSE_AVX2_ASM_
%define _TRANSPOSE_AVX2_ASM_

%include "reg_sizes.asm"

; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE
;
; r0-r7 [out] ymm registers which will contain the data to be transposed
; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetched for all 8 lanes
; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
%macro TRANSPOSE8_U32_LOAD8 17
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%addr0 %9
%define %%addr1 %10
%define %%addr2 %11
%define %%addr3 %12
%define %%addr4 %13
%define %%addr5 %14
%define %%addr6 %15
%define %%addr7 %16
%define %%ptr_offset %17

; Expected output data
;
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}

vmovups XWORD(%%r0),[%%addr0+%%ptr_offset]
vmovups XWORD(%%r1),[%%addr1+%%ptr_offset]
vmovups XWORD(%%r2),[%%addr2+%%ptr_offset]
vmovups XWORD(%%r3),[%%addr3+%%ptr_offset]
vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16]
vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16]
vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16]
vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16]

vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01
vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01
vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01
vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01

%endmacro

; 8x8 32-BIT TRANSPOSE
;
; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called.
;
; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0])
; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. ymm4 = [e4-e7 a4-a7])
; t0-t1 [clobbered] ymm temporary registers
%macro TRANSPOSE8_U32_PRELOADED 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
;
; process top half (r0..r3)
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2}

vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}

;; process bottom half (r4..r7)
vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4}
vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6}
vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4}
vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6}

vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
%endmacro

%macro TRANSPOSE8_U32 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
; process top half (r0..r3) {a...d}
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}

; use r2 in place of t0
; process bottom half (r4..r7) {e...h}
vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}


vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
%endmacro

%endif ;; _TRANSPOSE_AVX2_ASM_
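(Aside, not part of this diff: per the in/out comments above, these macros implement an 8x8 transpose of 32-bit words, turning eight lane-major message streams into word-major form so the SHA rounds can work on one message word across all lanes. The plain-Go meaning, continuing the sha256ref sketch:)

// transpose8x8 is what the vshufps/vperm2f128 sequences compute:
// out[i][j] = in[j][i] for an 8x8 matrix of 32-bit words.
func transpose8x8(in [8][8]uint32) (out [8][8]uint32) {
	for i := 0; i < 8; i++ {
		for j := 0; j < 8; j++ {
			out[i][j] = in[j][i]
		}
	}
	return out
}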
61 crypto/hash/custom_hasher/hasher.cpp Normal file
@@ -0,0 +1,61 @@
#include <cpuid.h>

#include "hasher.hpp"

extern "C" void sha256_1_avx(unsigned char* output, const unsigned char* input);

namespace {
constexpr auto CPUID_LEAF = 7;
}

namespace prysm {
void Hasher::sha256_sse(unsigned char* output, const unsigned char* input, std::size_t blocks) {
while (blocks) {
sha256_1_avx(output, input);
input += 2*constants::BYTES_PER_CHUNK;
output += constants::BYTES_PER_CHUNK;
blocks--;
}
}

const Hasher::IMPL Hasher::implemented() {
IMPL ret = IMPL::NONE;
std::uint32_t a, b, c, d; // NOLINT
__get_cpuid_count(CPUID_LEAF, 0, &a, &b, &c, &d);
if (b & bit_SHA) ret = ret | IMPL::SHA;
if (b & bit_AVX2) ret = ret | IMPL::AVX2;

__get_cpuid(1, &a, &b, &c, &d);
if (c & bit_AVX) ret = ret | IMPL::AVX;
if (c & bit_SSE3) ret = ret | IMPL::SSE;

return ret;
}

Hasher::SHA256_hasher Hasher::best_sha256_implementation() {
auto impl = implemented();
if (!!(impl & IMPL::SHA)) return &::sha256_shani;
if (!!(impl & IMPL::AVX2)) return &::sha256_8_avx2;
if (!!(impl & IMPL::AVX)) return &::sha256_4_avx;
return &sha256_sse;
}

Hasher::Hasher(Hasher::IMPL impl) {
switch (impl) {
case IMPL::SHA:
_hash_64b_blocks = sha256_shani;
break;
case IMPL::AVX2:
_hash_64b_blocks = sha256_8_avx2;
break;
case IMPL::AVX:
_hash_64b_blocks = sha256_4_avx;
break;
case IMPL::SSE:
_hash_64b_blocks = &sha256_sse;
break;
default:
_hash_64b_blocks = best_sha256_implementation();
}
}
} // namespace prysm
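(Aside, not part of this diff: the dispatch order prefers SHA-NI, then 8-way AVX2, then 4-way AVX, with the one-block AVX routine looped as the "SSE" fallback. A Go-side sketch of the same priority scheme, continuing the sha256ref sketch; the boolean parameters stand in for real CPUID feature checks and are placeholders, not an existing API:)

// hashFn matches the 64-bytes-in, 32-bytes-out-per-block contract of
// the assembly back ends.
type hashFn func(output, input []byte, blocks uint64)

// bestSHA256 mirrors Hasher::best_sha256_implementation; shani, avx2,
// avx and sse are whatever concrete implementations the caller wires in.
func bestSHA256(hasSHA, hasAVX2, hasAVX bool, shani, avx2, avx, sse hashFn) hashFn {
	switch {
	case hasSHA:
		return shani
	case hasAVX2:
		return avx2
	case hasAVX:
		return avx
	default:
		return sse
	}
}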
9 crypto/hash/custom_hasher/hasher.h Normal file
@@ -0,0 +1,9 @@
#ifndef __CUSTOM_HASHER__
#define __CUSTOM_HASHER__

#include <stdint.h>
extern void sha256_4_avx(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_8_avx2(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_shani(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_1_avx(unsigned char* output, const unsigned char* input);
#endif
@@ -1,7 +1,10 @@
// Package hashutil includes all hash-function related helpers for Prysm.
package hash

// #include "custom_hasher/hasher.h"
import "C"
import (
"encoding/binary"
"errors"
"hash"
"reflect"
@@ -133,3 +136,85 @@ func FastSum64(data []byte) uint64 {
func FastSum256(data []byte) [32]byte {
return highwayhash.Sum(data, fastSumHashKey[:])
}

// ------------------------------------
// No abstraction in these functions; they are just for experimenting until
// we get a feel for whether this approach is worth pursuing.
func PotuzHasherAVX2Chunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_8_avx2((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}

func PotuzHasherAVXChunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_4_avx((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}

func PotuzHasherShaniChunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_shani((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}

func PotuzHasherShani(dst []byte, inp []byte, count uint64) {
C.sha256_shani((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}

func PotuzHasherAVX(dst []byte, inp []byte, count uint64) {
C.sha256_4_avx((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}

func PotuzHasherAVX2(dst []byte, inp []byte, count uint64) {
C.sha256_8_avx2((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}

func PotuzHasher2Chunks(dst []byte, inp []byte) {
C.sha256_1_avx((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]))
}

// no check of the chunks length!
func Hash2ChunksAVX(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])

C.sha256_1_avx((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]))
return buf
}

// no check of the chunks length!
func Hash2ChunksAVX2(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])

C.sha256_1_avx((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]))
return buf
}

// no check of the chunks length!
func Hash2ChunksShani(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])

C.sha256_shani((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]), C.ulong(1))
return buf
}

func MixinLengthAVX(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksAVX(root, val)
}

func MixinLengthAVX2(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksAVX2(root, val)
}

func MixinLengthShani(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksShani(root, val)
}
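(Aside, not part of this diff: MixinLength* is the SSZ length mix-in, hashing the root together with the little-endian length left-aligned in a 32-byte chunk. A standard-library cross-check, continuing the sha256ref sketch:)

import (
	"crypto/sha256"
	"encoding/binary"
)

// mixinLengthRef reproduces MixinLengthAVX/AVX2/Shani with crypto/sha256:
// SHA-256(root || uint64_le(length) || 24 zero bytes).
func mixinLengthRef(root [32]byte, length uint64) [32]byte {
	var buf [64]byte
	copy(buf[:32], root[:])
	binary.LittleEndian.PutUint64(buf[32:40], length)
	return sha256.Sum256(buf[:])
}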

@@ -2,9 +2,11 @@ package hash_test

import (
"encoding/hex"
"math/rand"
"testing"

fuzz "github.com/google/gofuzz"
"github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil"
"github.com/prysmaticlabs/prysm/crypto/bls"
"github.com/prysmaticlabs/prysm/crypto/hash"
"github.com/prysmaticlabs/prysm/encoding/bytesutil"
@@ -104,3 +106,122 @@ func BenchmarkHashProto(b *testing.B) {
}
}
}

// -------------------------------------------------------------
// Remove the tests that raise illegal instructions on your CPU if you want
// to run the benchmarks.
/*
func TestCustomHash_Shani(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)

hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}

hash.PotuzHasherShani(root, hash0, 1)
assert.DeepEqual(t, hashOf1[:], root)
}
*/
func TestCustomHash_Avx2(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)

hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}

hash.PotuzHasherAVX2(root, hash0, 1)
assert.DeepEqual(t, hashOf1[:], root)
}

func TestCustomHash_SSE(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)

hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}

hash.PotuzHasher2Chunks(root, hash0)
assert.DeepEqual(t, hashOf1[:], root)
}

/*
func BenchmarkHashBalanceAVX2(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX2(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitAVX2(balances, zero_hash_array)
require.NoError(b, err)
}
}
*/
func BenchmarkHashBalanceAVX(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitAVX(balances, zero_hash_array)
require.NoError(b, err)
}
}

/*
func BenchmarkHashBalanceShani(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksShani(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zero_hash_array)
require.NoError(b, err)
}
}
*/
func BenchmarkHashBalancePrysm(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimit(balances)
require.NoError(b, err)
}
}

/*
func TestHashBalancesShani(t *testing.T) {
zero_hash_array := make([][32]byte, 45)
for i := 1; i < 45; i++ {
zero_hash_array[i] = hash.Hash2ChunksShani(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)

for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
root1, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zero_hash_array)
require.NoError(t, err)
root2, err := stateutil.Uint64ListRootWithRegistryLimit(balances)
require.NoError(t, err)
assert.DeepEqual(t, root1, root2)
}
*/

110 crypto/hash/yasm.bzl Normal file
@@ -0,0 +1,110 @@
load("@rules_cc//cc:toolchain_utils.bzl", "find_cpp_toolchain")


def _obj_yasm(ctx, arch, opts, src):
yasm_bin = ctx.attr.yasm_bin
out = ctx.actions.declare_file(src.basename.replace(src.extension, "o"))
opts = arch + ['-o', out.path] + opts + [src.path]
inputs = []

for i in ctx.attr.srcs + ctx.attr.hdrs + ctx.attr.deps:
if hasattr(i, "files"):
inputs += i.files.to_list()
else:
inputs.append(i)

ctx.actions.run(
outputs = [out],
inputs = inputs,
arguments = opts,
executable = yasm_bin,
mnemonic = 'YasmCompile',
)

return out

def _library_yasm(ctx, mysrc):
output_file = ctx.actions.declare_file(ctx.label.name + ".a")

cc_toolchain = find_cpp_toolchain(ctx)

feature_configuration = cc_common.configure_features(
ctx = ctx,
cc_toolchain = cc_toolchain,
requested_features = ctx.features,
unsupported_features = ctx.disabled_features,
)

linker_input = cc_common.create_linker_input(
owner = ctx.label,
libraries = depset(direct = [
cc_common.create_library_to_link(
actions = ctx.actions,
static_library = output_file,
cc_toolchain = cc_toolchain,
feature_configuration = feature_configuration,
),
]),
)

compilation_context = cc_common.create_compilation_context()
linking_context = cc_common.create_linking_context(linker_inputs = depset(direct = [linker_input]))

ctx.actions.run(
executable = ctx.attr.ar_bin,
arguments = ['r', output_file.path] + [i.path for i in mysrc],
inputs = mysrc,
outputs = [output_file],
mnemonic = "Archiving",
)

return CcInfo(compilation_context = compilation_context, linking_context = linking_context)

def _yasm_library_impl(ctx):
opts = ctx.attr.copts
deps = [_obj_yasm(ctx, ctx.attr.yasm_arch, opts, src)
for target in ctx.attr.srcs for src in target.files.to_list()]
for i in ctx.attr.hdrs:
if hasattr(i, "files"):
deps += i.files.to_list()
else:
deps.append(i)

cc_info = _library_yasm(ctx, deps)

return [cc_info]


YASM_BIN_DEFAULT = "/usr/bin/yasm"
AR_BIN_DEFAULT = "/usr/bin/ar"
YASM_ARCH_OPTS = ["-f", "elf64", "-m", "amd64"]


_yasm_library = rule(
implementation=_yasm_library_impl,
attrs={
"srcs": attr.label_list(allow_files=True),
"hdrs": attr.label_list(allow_files=True),
"deps": attr.label_list(allow_files=True),
"copts": attr.string_list(),
"yasm_bin": attr.string(default=""),
"ar_bin": attr.string(default=""),
"yasm_arch": attr.string_list(),
"_cc_toolchain": attr.label(default = Label("@bazel_tools//tools/cpp:current_cc_toolchain")),
},
fragments = ["cpp"],
toolchains = ["@bazel_tools//tools/cpp:toolchain_type"],
)


def yasm_library(name, srcs, hdrs=[], deps=[], copts=[],
yasm_bin=YASM_BIN_DEFAULT, ar_bin=AR_BIN_DEFAULT):
_yasm_library(
name = name,
srcs = srcs,
hdrs = hdrs,
deps = deps,
copts = copts,
yasm_bin = yasm_bin,
ar_bin = ar_bin,
yasm_arch = YASM_ARCH_OPTS,
)