Compare commits

...

6 Commits

Author SHA1 Message Date
Potuz 94768a3190 Added AVX tests 2021-11-19 15:26:06 -03:00
Potuz 662d4ec6e9 First Benchmarks with Shani 2021-11-19 10:03:12 -03:00
Potuz b206bffe15 flat array merkleyzer 2021-11-18 22:16:03 -03:00
Potuz 18b83bf445 add SSE3 version for 2 chunks 2021-11-17 11:16:08 -03:00
Potuz cff4a01ea0 add AVX2 version 2021-11-17 11:08:02 -03:00
Potuz 3a1901d7aa Working custom assembly hasher with sha_ni extensions 2021-11-17 10:58:31 -03:00
14 changed files with 4219 additions and 1 deletion

View File

@@ -17,6 +17,7 @@ go_library(
importpath = "github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil",
visibility = [
"//beacon-chain:__subpackages__",
"//crypto/hash:__subpackages__",
"//proto/migration:__subpackages__",
"//proto/prysm/v1alpha1:__subpackages__",
"//proto/testing:__subpackages__",

View File

@@ -54,6 +54,236 @@ func ValidatorRootWithHasher(hasher ssz.HashFn, validator *ethpb.Validator) ([32
}
return ssz.BitwiseMerkleizeArrays(hasher, fieldRoots, uint64(len(fieldRoots)), uint64(len(fieldRoots)))
}
func merkleizeFlatArrayAVX(vec [][32]byte,
depth uint8,
hasher func([][32]byte, [][32]byte, uint64),
zero_hash_array [][32]byte) [32]byte {
if depth == 0 && len(vec) == 1 {
return vec[0]
}
if len(vec) == 0 {
panic("Can't have empty vec")
}
// allocate the buffer for the full flat tree (sizes are hardcoded since this is benchmark code)
layer := (len(vec) + 1) / 2
length := 0
for {
length += layer - 1
if layer == 1 {
break
}
layer = (layer + 1) / 2
}
length += int(depth)
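// Worked example: with len(vec) == 5 and depth == 3 the loop above visits layers of
// 3, 2 and 1 nodes, so length = 2 + 1 + 0 = 3, and the depth padding adds 3 more slots.
// The resulting 6-entry buffer holds the 3 first-level parents (the odd leaf paired with
// zero_hash_array[0]), the 2 second-level nodes (the odd one paired with zero_hash_array[1])
// and the single depth-3 root that is returned at the end.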
hash_tree := make([][32]byte, length)
first := uint64(0)
height := uint8(1)
last := uint64(len(vec)+1) / 2
if len(vec) > 1 {
hasher(hash_tree, vec, last)
}
if len(vec)%2 == 1 {
hash_tree[last-1] = hash.Hash2ChunksAVX(vec[len(vec)-1], zero_hash_array[0])
}
for {
dist := last - first
if dist < 2 {
break
}
hasher(hash_tree[last:], hash_tree[first:], dist/2)
first = last
last += (dist + 1) / 2
if dist%2 != 0 {
hash_tree[last-1] = hash.Hash2ChunksAVX(hash_tree[first-1], zero_hash_array[height])
}
height++
}
for {
if height >= depth {
break
}
hash_tree[last] = hash.Hash2ChunksAVX(hash_tree[last-1], zero_hash_array[height])
last++
height++
}
return hash_tree[last-1]
}
func merkleizeFlatArrayAVX2(vec [][32]byte,
depth uint8,
hasher func([][32]byte, [][32]byte, uint64),
zero_hash_array [][32]byte) [32]byte {
if depth == 0 && len(vec) == 1 {
return vec[0]
}
if len(vec) == 0 {
panic("Can't have empty vec")
}
// allocate the buffer for the full flat tree (sizes are hardcoded since this is benchmark code)
layer := (len(vec) + 1) / 2
length := 0
for {
length += layer - 1
if layer == 1 {
break
}
layer = (layer + 1) / 2
}
length += int(depth)
hash_tree := make([][32]byte, length)
first := uint64(0)
height := uint8(1)
last := uint64(len(vec)+1) / 2
if len(vec) > 1 {
hasher(hash_tree, vec, last)
}
if len(vec)%2 == 1 {
hash_tree[last-1] = hash.Hash2ChunksAVX2(vec[len(vec)-1], zero_hash_array[0])
}
for {
dist := last - first
if dist < 2 {
break
}
hasher(hash_tree[last:], hash_tree[first:], dist/2)
first = last
last += (dist + 1) / 2
if dist%2 != 0 {
hash_tree[last-1] = hash.Hash2ChunksAVX2(hash_tree[first-1], zero_hash_array[height])
}
height++
}
for {
if height >= depth {
break
}
hash_tree[last] = hash.Hash2ChunksAVX2(hash_tree[last-1], zero_hash_array[height])
last++
height++
}
return hash_tree[last-1]
}
func merkleizeFlatArray(vec [][32]byte,
depth uint8,
hasher func([][32]byte, [][32]byte, uint64),
zero_hash_array [][32]byte) [32]byte {
if depth == 0 && len(vec) == 1 {
return vec[0]
}
if len(vec) == 0 {
panic("Can't have empty vec")
}
// allocate the buffer for the full flat tree (sizes are hardcoded since this is benchmark code)
layer := (len(vec) + 1) / 2
length := 0
for {
length += layer - 1
if layer == 1 {
break
}
layer = (layer + 1) / 2
}
length += int(depth)
hash_tree := make([][32]byte, length)
first := uint64(0)
height := uint8(1)
last := uint64(len(vec)+1) / 2
if len(vec) > 1 {
hasher(hash_tree, vec, last)
}
if len(vec)%2 == 1 {
hash_tree[last-1] = hash.Hash2ChunksShani(vec[len(vec)-1], zero_hash_array[0])
}
for {
dist := last - first
if dist < 2 {
break
}
hasher(hash_tree[last:], hash_tree[first:], dist/2)
first = last
last += (dist + 1) / 2
if dist%2 != 0 {
hash_tree[last-1] = hash.Hash2ChunksShani(hash_tree[first-1], zero_hash_array[height])
}
height++
}
for {
if height >= depth {
break
}
hash_tree[last] = hash.Hash2ChunksShani(hash_tree[last-1], zero_hash_array[height])
last++
height++
}
return hash_tree[last-1]
}
// Uint64ListRootWithRegistryLimitShani computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit. Flat-array implementation
// using the SHA-NI extensions.
func Uint64ListRootWithRegistryLimitShani(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
// assume len(balances) is a multiple of 4 for this benchmark
lenChunks := len(balances) / 4
balancesChunks := make([][32]byte, lenChunks)
for i := 0; i < lenChunks; i++ {
binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
}
balancesRootsRoot := merkleizeFlatArray(balancesChunks, 38, hash.PotuzHasherShaniChunks, zero_hash_array)
return hash.MixinLengthShani(balancesRootsRoot, uint64(len(balances))), nil
}
// Uint64ListRootWithRegistryLimitAVX computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit. Flat-array implementation
// using the AVX extensions.
func Uint64ListRootWithRegistryLimitAVX(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
// assume len(balances) is a multiple of 4 for this benchmark
lenChunks := len(balances) / 4
balancesChunks := make([][32]byte, lenChunks)
for i := 0; i < lenChunks; i++ {
binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
}
balancesRootsRoot := merkleizeFlatArrayAVX(balancesChunks, 38, hash.PotuzHasherAVXChunks, zero_hash_array)
return hash.MixinLengthAVX(balancesRootsRoot, uint64(len(balances))), nil
}
// Uint64ListRootWithRegistryLimitAVX2 computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit. Flat-array implementation
// using the AVX2 extensions.
func Uint64ListRootWithRegistryLimitAVX2(balances []uint64, zero_hash_array [][32]byte) ([32]byte, error) {
// assume len(balances) is a multiple of 4 for this benchmark
lenChunks := len(balances) / 4
balancesChunks := make([][32]byte, lenChunks)
for i := 0; i < lenChunks; i++ {
binary.LittleEndian.PutUint64(balancesChunks[i][:], balances[4*i])
binary.LittleEndian.PutUint64(balancesChunks[i][8:], balances[4*i+1])
binary.LittleEndian.PutUint64(balancesChunks[i][16:], balances[4*i+2])
binary.LittleEndian.PutUint64(balancesChunks[i][24:], balances[4*i+3])
}
balancesRootsRoot := merkleizeFlatArrayAVX2(balancesChunks, 38, hash.PotuzHasherAVX2Chunks, zero_hash_array)
return hash.MixinLengthAVX2(balancesRootsRoot, uint64(len(balances))), nil
}
// Uint64ListRootWithRegistryLimit computes the HashTreeRoot Merkleization of
// a list of uint64 mixed in with the registry limit.
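Below is a minimal sketch (not part of this diff) of how the flat-array helpers above might be called. It assumes the stateutil import path declared in the BUILD file, a CPU that supports the SHA-NI extensions at runtime, and it builds the zero-hash table with the standard library's sha256 purely for illustration.

package main

import (
    "crypto/sha256"
    "fmt"

    "github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil"
)

func main() {
    // zeroHashes[0] is the all-zero leaf chunk; each higher entry is the root of an
    // all-zero subtree: zeroHashes[i+1] = SHA-256(zeroHashes[i] || zeroHashes[i]).
    // The merkleizers above pad up to depth 38, so 39 entries give some headroom.
    zeroHashes := make([][32]byte, 39)
    for i := 0; i < 38; i++ {
        concat := append(zeroHashes[i][:], zeroHashes[i][:]...)
        zeroHashes[i+1] = sha256.Sum256(concat)
    }

    // The helpers assume len(balances) is a multiple of 4 (four uint64 per 32-byte chunk).
    balances := make([]uint64, 1<<16)
    for i := range balances {
        balances[i] = 32_000_000_000
    }

    root, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zeroHashes)
    if err != nil {
        panic(err)
    }
    fmt.Printf("balances hash tree root: %x\n", root[:])
}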

View File

@@ -1,8 +1,13 @@
load("@prysm//tools/go:def.bzl", "go_library", "go_test")
load("@prysm//crypto/hash:yasm.bzl", "yasm_library")
go_library(
name = "go_default_library",
srcs = ["hash.go"],
srcs = [
"hash.go",
"custom_hasher/hasher.h",
],
cgo = True,
importpath = "github.com/prysmaticlabs/prysm/crypto/hash",
visibility = ["//visibility:public"],
deps = [
@@ -13,6 +18,7 @@ go_library(
"@org_golang_google_protobuf//proto:go_default_library",
"@org_golang_x_crypto//sha3:go_default_library",
],
cdeps = [":custom_hasher"],
)
go_test(
@@ -27,6 +33,29 @@ go_test(
"//proto/testing:go_default_library",
"//testing/assert:go_default_library",
"//testing/require:go_default_library",
"//beacon-chain/state/stateutil:go_default_library",
"@com_github_google_gofuzz//:go_default_library",
],
)
cc_library(
name = "custom_hasher",
srcs = [
"custom_hasher/hasher.h",
],
hdrs = [ "custom_hasher/hasher.h" ],
visibility = ["//visibility:public"],
deps = [ ":asm" ],
)
yasm_library(
name = "asm",
srcs = [
"custom_hasher/assembly/reg_sizes.asm",
"custom_hasher/assembly/sha256_avx_one_block.asm",
"custom_hasher/assembly/sha256_avx.asm",
"custom_hasher/assembly/sha256_avx2.asm",
"custom_hasher/assembly/sha256_shani.asm",
"custom_hasher/assembly/transpose_avx2.asm",
],
)

View File

@@ -0,0 +1,300 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
; define d and w variants for registers
%ifndef _REG_SIZES_ASM_
%define _REG_SIZES_ASM_
%define raxd eax
%define raxw ax
%define raxb al
%define rbxd ebx
%define rbxw bx
%define rbxb bl
%define rcxd ecx
%define rcxw cx
%define rcxb cl
%define rdxd edx
%define rdxw dx
%define rdxb dl
%define rsid esi
%define rsiw si
%define rsib sil
%define rdid edi
%define rdiw di
%define rdib dil
%define rbpd ebp
%define rbpw bp
%define rbpb bpl
%define zmm0x xmm0
%define zmm1x xmm1
%define zmm2x xmm2
%define zmm3x xmm3
%define zmm4x xmm4
%define zmm5x xmm5
%define zmm6x xmm6
%define zmm7x xmm7
%define zmm8x xmm8
%define zmm9x xmm9
%define zmm10x xmm10
%define zmm11x xmm11
%define zmm12x xmm12
%define zmm13x xmm13
%define zmm14x xmm14
%define zmm15x xmm15
%define zmm16x xmm16
%define zmm17x xmm17
%define zmm18x xmm18
%define zmm19x xmm19
%define zmm20x xmm20
%define zmm21x xmm21
%define zmm22x xmm22
%define zmm23x xmm23
%define zmm24x xmm24
%define zmm25x xmm25
%define zmm26x xmm26
%define zmm27x xmm27
%define zmm28x xmm28
%define zmm29x xmm29
%define zmm30x xmm30
%define zmm31x xmm31
%define ymm0x xmm0
%define ymm1x xmm1
%define ymm2x xmm2
%define ymm3x xmm3
%define ymm4x xmm4
%define ymm5x xmm5
%define ymm6x xmm6
%define ymm7x xmm7
%define ymm8x xmm8
%define ymm9x xmm9
%define ymm10x xmm10
%define ymm11x xmm11
%define ymm12x xmm12
%define ymm13x xmm13
%define ymm14x xmm14
%define ymm15x xmm15
%define ymm16x xmm16
%define ymm17x xmm17
%define ymm18x xmm18
%define ymm19x xmm19
%define ymm20x xmm20
%define ymm21x xmm21
%define ymm22x xmm22
%define ymm23x xmm23
%define ymm24x xmm24
%define ymm25x xmm25
%define ymm26x xmm26
%define ymm27x xmm27
%define ymm28x xmm28
%define ymm29x xmm29
%define ymm30x xmm30
%define ymm31x xmm31
%define xmm0x xmm0
%define xmm1x xmm1
%define xmm2x xmm2
%define xmm3x xmm3
%define xmm4x xmm4
%define xmm5x xmm5
%define xmm6x xmm6
%define xmm7x xmm7
%define xmm8x xmm8
%define xmm9x xmm9
%define xmm10x xmm10
%define xmm11x xmm11
%define xmm12x xmm12
%define xmm13x xmm13
%define xmm14x xmm14
%define xmm15x xmm15
%define xmm16x xmm16
%define xmm17x xmm17
%define xmm18x xmm18
%define xmm19x xmm19
%define xmm20x xmm20
%define xmm21x xmm21
%define xmm22x xmm22
%define xmm23x xmm23
%define xmm24x xmm24
%define xmm25x xmm25
%define xmm26x xmm26
%define xmm27x xmm27
%define xmm28x xmm28
%define xmm29x xmm29
%define xmm30x xmm30
%define xmm31x xmm31
%define zmm0y ymm0
%define zmm1y ymm1
%define zmm2y ymm2
%define zmm3y ymm3
%define zmm4y ymm4
%define zmm5y ymm5
%define zmm6y ymm6
%define zmm7y ymm7
%define zmm8y ymm8
%define zmm9y ymm9
%define zmm10y ymm10
%define zmm11y ymm11
%define zmm12y ymm12
%define zmm13y ymm13
%define zmm14y ymm14
%define zmm15y ymm15
%define zmm16y ymm16
%define zmm17y ymm17
%define zmm18y ymm18
%define zmm19y ymm19
%define zmm20y ymm20
%define zmm21y ymm21
%define zmm22y ymm22
%define zmm23y ymm23
%define zmm24y ymm24
%define zmm25y ymm25
%define zmm26y ymm26
%define zmm27y ymm27
%define zmm28y ymm28
%define zmm29y ymm29
%define zmm30y ymm30
%define zmm31y ymm31
%define xmm0y ymm0
%define xmm1y ymm1
%define xmm2y ymm2
%define xmm3y ymm3
%define xmm4y ymm4
%define xmm5y ymm5
%define xmm6y ymm6
%define xmm7y ymm7
%define xmm8y ymm8
%define xmm9y ymm9
%define xmm10y ymm10
%define xmm11y ymm11
%define xmm12y ymm12
%define xmm13y ymm13
%define xmm14y ymm14
%define xmm15y ymm15
%define xmm16y ymm16
%define xmm17y ymm17
%define xmm18y ymm18
%define xmm19y ymm19
%define xmm20y ymm20
%define xmm21y ymm21
%define xmm22y ymm22
%define xmm23y ymm23
%define xmm24y ymm24
%define xmm25y ymm25
%define xmm26y ymm26
%define xmm27y ymm27
%define xmm28y ymm28
%define xmm29y ymm29
%define xmm30y ymm30
%define xmm31y ymm31
%define xmm0z zmm0
%define xmm1z zmm1
%define xmm2z zmm2
%define xmm3z zmm3
%define xmm4z zmm4
%define xmm5z zmm5
%define xmm6z zmm6
%define xmm7z zmm7
%define xmm8z zmm8
%define xmm9z zmm9
%define xmm10z zmm10
%define xmm11z zmm11
%define xmm12z zmm12
%define xmm13z zmm13
%define xmm14z zmm14
%define xmm15z zmm15
%define xmm16z zmm16
%define xmm17z zmm17
%define xmm18z zmm18
%define xmm19z zmm19
%define xmm20z zmm20
%define xmm21z zmm21
%define xmm22z zmm22
%define xmm23z zmm23
%define xmm24z zmm24
%define xmm25z zmm25
%define xmm26z zmm26
%define xmm27z zmm27
%define xmm28z zmm28
%define xmm29z zmm29
%define xmm30z zmm30
%define xmm31z zmm31
%define ymm0z zmm0
%define ymm1z zmm1
%define ymm2z zmm2
%define ymm3z zmm3
%define ymm4z zmm4
%define ymm5z zmm5
%define ymm6z zmm6
%define ymm7z zmm7
%define ymm8z zmm8
%define ymm9z zmm9
%define ymm10z zmm10
%define ymm11z zmm11
%define ymm12z zmm12
%define ymm13z zmm13
%define ymm14z zmm14
%define ymm15z zmm15
%define ymm16z zmm16
%define ymm17z zmm17
%define ymm18z zmm18
%define ymm19z zmm19
%define ymm20z zmm20
%define ymm21z zmm21
%define ymm22z zmm22
%define ymm23z zmm23
%define ymm24z zmm24
%define ymm25z zmm25
%define ymm26z zmm26
%define ymm27z zmm27
%define ymm28z zmm28
%define ymm29z zmm29
%define ymm30z zmm30
%define ymm31z zmm31
%define DWORD(reg) reg %+ d
%define WORD(reg) reg %+ w
%define BYTE(reg) reg %+ b
%define XWORD(reg) reg %+ x
%define YWORD(reg) reg %+ y
%define ZWORD(reg) reg %+ z
%endif ;; _REG_SIZES_ASM_

View File

@@ -0,0 +1,612 @@
;; sha256_avx.asm
; *
; * This file is part of Mammon.
; * mammon is a greedy and selfish ETH consensus client.
; *
; * Copyright (c) 2021 - Reimundo Heluani (potuz) potuz@potuz.net
; *
; * This program is free software: you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation, either version 3 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;
; This is an implementation optimized for 64-byte inputs, based on Intel's code
; whose copyright follows
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
;; code to compute quad SHA256 using AVX
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG
;; Stack must be aligned to 16 bytes before call
;; Windows clobbers: rax rdx r8 r9 r10 r11
;; Windows preserves: rcx rsi rdi rbp r12 r13 r14 r15
;;
;; Linux clobbers: rax rsi r8 r9 r10 r11
;; Linux preserves: rcx rdx rdi rbp r12 r13 r14 r15
;;
;; clobbers xmm0-15
extern sha256_1_avx
%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define NUM_BLKS r8 ; 3rd arg
%define TBL rsi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define NUM_BLKS rdx ; 3rd arg
%define TBL rcx
%endif
%define ROUND rbx
%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11
%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7
%define a0 xmm8
%define a1 xmm9
%define a2 xmm10
%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9
%define T1 xmm14
%define TMP xmm15
%define SHA256_DIGEST_WORD_SIZE 4
%define NUM_SHA256_DIGEST_WORDS 8
%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
%define ROUNDS 64*SZ4
; Define stack usage
struc STACK
_DATA: resb SZ4 * 16
_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS
_RBX: resb 8
resb 16
endstruc
%define VMOVPS vmovups
; transpose r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0,t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
%endmacro
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
vpslld %%tmp, %%reg, (32-(%%imm))
vpsrld %%reg, %%reg, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
;vmovdqa %%tmp, %%reg
vpslld %%tmp, %%src, (32-(%%imm))
vpsrld %%reg, %%src, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; PRORD dst/src, amt
%macro PRORD 2
PRORD %1, %2, TMP
%endmacro
; PRORD_nd dst, src, amt
%macro PRORD_nd 3
PRORD_nd %1, %3, TMP, %2
%endmacro
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i %2
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1
vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ4 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i %2
vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA]
vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA]
vmovdqa a0, %%T1
PRORD %%T1, 18-7
vmovdqa a2, a1
PRORD a1, 19-17
vpxor %%T1, %%T1, a0
PRORD %%T1, 7
vpxor a1, a1, a2
PRORD a1, 17
vpsrld a0, a0, 3
vpxor %%T1, %%T1, a0
vpsrld a2, a2, 10
vpxor a1, a1, a2
vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA]
vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA]
vpaddd %%T1, %%T1, a1
ROUND_00_15 %%T1, %%i
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro PADDING_ROUND_00_15 1
%define %%T1 %1
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ4 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
section .data
default rel
align 64
K256_4:
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
dq 0x7137449171374491, 0x7137449171374491
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x243185be243185be, 0x243185be243185be
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
dq 0x76f988da76f988da, 0x76f988da76f988da
dq 0x983e5152983e5152, 0x983e5152983e5152
dq 0xa831c66da831c66d, 0xa831c66da831c66d
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
dq 0x06ca635106ca6351, 0x06ca635106ca6351
dq 0x1429296714292967, 0x1429296714292967
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
dq 0x53380d1353380d13, 0x53380d1353380d13
dq 0x650a7354650a7354, 0x650a7354650a7354
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
dq 0x92722c8592722c85, 0x92722c8592722c85
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
dq 0xd192e819d192e819, 0xd192e819d192e819
dq 0xd6990624d6990624, 0xd6990624d6990624
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
dq 0x106aa070106aa070, 0x106aa070106aa070
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
dq 0x1e376c081e376c08, 0x1e376c081e376c08
dq 0x2748774c2748774c, 0x2748774c2748774c
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
dq 0x84c8781484c87814, 0x84c8781484c87814
dq 0x8cc702088cc70208, 0x8cc702088cc70208
dq 0x90befffa90befffa, 0x90befffa90befffa
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
PADDING_4:
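; PADDING_4 holds the precomputed K[t] + W[t] values for the single fixed SHA-256
; padding block of a 64-byte message (0x80 byte, zeros, 512-bit length), so the
; padding rounds skip the message schedule entirely;
; e.g. 0xc28a2f98 = 0x428a2f98 + 0x80000000.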
dq 0xc28a2f98c28a2f98, 0xc28a2f98c28a2f98
dq 0x7137449171374491, 0x7137449171374491
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x243185be243185be, 0x243185be243185be
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0xc19bf374c19bf374, 0xc19bf374c19bf374
dq 0x649b69c1649b69c1, 0x649b69c1649b69c1
dq 0xf0fe4786f0fe4786, 0xf0fe4786f0fe4786
dq 0x0fe1edc60fe1edc6, 0x0fe1edc60fe1edc6
dq 0x240cf254240cf254, 0x240cf254240cf254
dq 0x4fe9346f4fe9346f, 0x4fe9346f4fe9346f
dq 0x6cc984be6cc984be, 0x6cc984be6cc984be
dq 0x61b9411e61b9411e, 0x61b9411e61b9411e
dq 0x16f988fa16f988fa, 0x16f988fa16f988fa
dq 0xf2c65152f2c65152, 0xf2c65152f2c65152
dq 0xa88e5a6da88e5a6d, 0xa88e5a6da88e5a6d
dq 0xb019fc65b019fc65, 0xb019fc65b019fc65
dq 0xb9d99ec7b9d99ec7, 0xb9d99ec7b9d99ec7
dq 0x9a1231c39a1231c3, 0x9a1231c39a1231c3
dq 0xe70eeaa0e70eeaa0, 0xe70eeaa0e70eeaa0
dq 0xfdb1232bfdb1232b, 0xfdb1232bfdb1232b
dq 0xc7353eb0c7353eb0, 0xc7353eb0c7353eb0
dq 0x3069bad53069bad5, 0x3069bad53069bad5
dq 0xcb976d5fcb976d5f, 0xcb976d5fcb976d5f
dq 0x5a0f118f5a0f118f, 0x5a0f118f5a0f118f
dq 0xdc1eeefddc1eeefd, 0xdc1eeefddc1eeefd
dq 0x0a35b6890a35b689, 0x0a35b6890a35b689
dq 0xde0b7a04de0b7a04, 0xde0b7a04de0b7a04
dq 0x58f4ca9d58f4ca9d, 0x58f4ca9d58f4ca9d
dq 0xe15d5b16e15d5b16, 0xe15d5b16e15d5b16
dq 0x007f3e86007f3e86, 0x007f3e86007f3e86
dq 0x3708898037088980, 0x3708898037088980
dq 0xa507ea32a507ea32, 0xa507ea32a507ea32
dq 0x6fab95376fab9537, 0x6fab95376fab9537
dq 0x1740611017406110, 0x1740611017406110
dq 0x0d8cd6f10d8cd6f1, 0x0d8cd6f10d8cd6f1
dq 0xcdaa3b6dcdaa3b6d, 0xcdaa3b6dcdaa3b6d
dq 0xc0bbbe37c0bbbe37, 0xc0bbbe37c0bbbe37
dq 0x83613bda83613bda, 0x83613bda83613bda
dq 0xdb48a363db48a363, 0xdb48a363db48a363
dq 0x0b02e9310b02e931, 0x0b02e9310b02e931
dq 0x6fd15ca76fd15ca7, 0x6fd15ca76fd15ca7
dq 0x521afaca521afaca, 0x521afaca521afaca
dq 0x3133843131338431, 0x3133843131338431
dq 0x6ed41a956ed41a95, 0x6ed41a956ed41a95
dq 0x6d4378906d437890, 0x6d4378906d437890
dq 0xc39c91f2c39c91f2, 0xc39c91f2c39c91f2
dq 0x9eccabbd9eccabbd, 0x9eccabbd9eccabbd
dq 0xb5c9a0e6b5c9a0e6, 0xb5c9a0e6b5c9a0e6
dq 0x532fb63c532fb63c, 0x532fb63c532fb63c
dq 0xd2c741c6d2c741c6, 0xd2c741c6d2c741c6
dq 0x07237ea307237ea3, 0x07237ea307237ea3
dq 0xa4954b68a4954b68, 0xa4954b68a4954b68
dq 0x4c191d764c191d76, 0x4c191d764c191d76
DIGEST_4:
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
PSHUFFLE_BYTE_FLIP_MASK:
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
section .text
global sha256_4_avx:function
align 16
sha256_4_avx:
endbranch64
; outer calling routine saves all the XMM registers
sub rsp, STACK_size
mov [rsp + _RBX],rbx
.hash_4_blocks:
cmp NUM_BLKS, 4
jl .hash_1_block
xor ROUND, ROUND
;; Load the pre-transposed incoming digest.
lea TBL,[rel DIGEST_4]
vmovdqa a,[TBL + 0*SZ4]
vmovdqa b,[TBL + 1*SZ4]
vmovdqa c,[TBL + 2*SZ4]
vmovdqa d,[TBL + 3*SZ4]
vmovdqa e,[TBL + 4*SZ4]
vmovdqa f,[TBL + 5*SZ4]
vmovdqa g,[TBL + 6*SZ4]
vmovdqa h,[TBL + 7*SZ4]
lea TBL,[rel K256_4]
%assign i 0
%rep 4
vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
VMOVPS TT2,[DATA_PTR + 0*64 + i*16]
VMOVPS TT1,[DATA_PTR + 1*64 + i*16]
VMOVPS TT4,[DATA_PTR + 2*64 + i*16]
VMOVPS TT3,[DATA_PTR + 3*64 + i*16]
TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
vpshufb TT0, TT0, TMP
vpshufb TT1, TT1, TMP
vpshufb TT2, TT2, TMP
vpshufb TT3, TT3, TMP
ROUND_00_15 TT0,(i*4+0)
ROUND_00_15 TT1,(i*4+1)
ROUND_00_15 TT2,(i*4+2)
ROUND_00_15 TT3,(i*4+3)
%assign i (i+1)
%endrep
%assign i (i*4)
jmp .Lrounds_16_xx
align 16
.Lrounds_16_xx:
%rep 16
ROUND_16_XX T1, i
%assign i (i+1)
%endrep
cmp ROUND,ROUNDS
jb .Lrounds_16_xx
;; add old digest
lea TBL,[rel DIGEST_4]
vpaddd a, a, [TBL + 0*SZ4]
vpaddd b, b, [TBL + 1*SZ4]
vpaddd c, c, [TBL + 2*SZ4]
vpaddd d, d, [TBL + 3*SZ4]
vpaddd e, e, [TBL + 4*SZ4]
vpaddd f, f, [TBL + 5*SZ4]
vpaddd g, g, [TBL + 6*SZ4]
vpaddd h, h, [TBL + 7*SZ4]
;; rounds with padding
;; save old digest
vmovdqa [rsp + _DIGEST + 0*SZ4], a
vmovdqa [rsp + _DIGEST + 1*SZ4], b
vmovdqa [rsp + _DIGEST + 2*SZ4], c
vmovdqa [rsp + _DIGEST + 3*SZ4], d
vmovdqa [rsp + _DIGEST + 4*SZ4], e
vmovdqa [rsp + _DIGEST + 5*SZ4], f
vmovdqa [rsp + _DIGEST + 6*SZ4], g
vmovdqa [rsp + _DIGEST + 7*SZ4], h
lea TBL,[rel PADDING_4]
xor ROUND,ROUND
jmp .Lrounds_padding
align 16
.Lrounds_padding:
%rep 64
PADDING_ROUND_00_15 T1
%endrep
;; add old digest
vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
;; transpose the digest and convert to little endian to get the registers correctly
TRANSPOSE a, b, c, d, TT0, TT1
TRANSPOSE e, f, g, h, TT2, TT1
vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
vpshufb TT0, TMP
vpshufb TT2, TMP
vpshufb b, TMP
vpshufb f, TMP
vpshufb a, TMP
vpshufb e, TMP
vpshufb d, TMP
vpshufb h, TMP
;; write to output
vmovdqu [OUTPUT_PTR + 0*SZ4],TT0
vmovdqu [OUTPUT_PTR + 1*SZ4],TT2
vmovdqu [OUTPUT_PTR + 2*SZ4],b
vmovdqu [OUTPUT_PTR + 3*SZ4],f
vmovdqu [OUTPUT_PTR + 4*SZ4],a
vmovdqu [OUTPUT_PTR + 5*SZ4],e
vmovdqu [OUTPUT_PTR + 6*SZ4],d
vmovdqu [OUTPUT_PTR + 7*SZ4],h
; update pointers and loop
add DATA_PTR, 64*4
add OUTPUT_PTR, 32*4
sub NUM_BLKS, 4
jmp .hash_4_blocks
.hash_1_block:
test NUM_BLKS,NUM_BLKS
jz .done_hash
call sha256_1_avx
add DATA_PTR, 64
add OUTPUT_PTR, 32
dec NUM_BLKS
jmp .hash_1_block
.done_hash:
mov rbx,[rsp + _RBX]
add rsp, STACK_size
ret
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

View File

@@ -0,0 +1,797 @@
;; sha256_avx2.asm
; *
; * This file is part of Mammon.
; * mammon is a greedy and selfish ETH consensus client.
; *
; * Copyright (c) 2021 - Reimundo Heluani (potuz) potuz@potuz.net
; *
; * This program is free software: you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation, either version 3 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;
; This is an implementation optimized for 64-byte inputs, based on Intel's code
; whose copyright follows
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
;; code to compute oct SHA256 using SSE-256
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG
;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; ymm0-15
;; Stack must be aligned to 32 bytes before call
;; Windows clobbers: rax rdx rsi rdi r8 r9 r10 r11 r12 r13 r14
;; Windows preserves: rcx rbp r15
;;
;; Linux clobbers: rax rcx rdx rsi r8 r9 r10 r11 r12 r13 r14
;; Linux preserves: rdi rbp r15
;;
;; clobbers ymm0-15
%include "transpose_avx2.asm"
extern sha256_4_avx
section .data
default rel
align 64
K256_8:
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
dq 0x7137449171374491, 0x7137449171374491
dq 0x7137449171374491, 0x7137449171374491
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x59f111f159f111f1, 0x59f111f159f111f1
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x12835b0112835b01, 0x12835b0112835b01
dq 0x243185be243185be, 0x243185be243185be
dq 0x243185be243185be, 0x243185be243185be
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
dq 0x76f988da76f988da, 0x76f988da76f988da
dq 0x76f988da76f988da, 0x76f988da76f988da
dq 0x983e5152983e5152, 0x983e5152983e5152
dq 0x983e5152983e5152, 0x983e5152983e5152
dq 0xa831c66da831c66d, 0xa831c66da831c66d
dq 0xa831c66da831c66d, 0xa831c66da831c66d
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
dq 0x06ca635106ca6351, 0x06ca635106ca6351
dq 0x06ca635106ca6351, 0x06ca635106ca6351
dq 0x1429296714292967, 0x1429296714292967
dq 0x1429296714292967, 0x1429296714292967
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
dq 0x53380d1353380d13, 0x53380d1353380d13
dq 0x53380d1353380d13, 0x53380d1353380d13
dq 0x650a7354650a7354, 0x650a7354650a7354
dq 0x650a7354650a7354, 0x650a7354650a7354
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
dq 0x92722c8592722c85, 0x92722c8592722c85
dq 0x92722c8592722c85, 0x92722c8592722c85
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
dq 0xd192e819d192e819, 0xd192e819d192e819
dq 0xd192e819d192e819, 0xd192e819d192e819
dq 0xd6990624d6990624, 0xd6990624d6990624
dq 0xd6990624d6990624, 0xd6990624d6990624
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
dq 0x106aa070106aa070, 0x106aa070106aa070
dq 0x106aa070106aa070, 0x106aa070106aa070
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
dq 0x1e376c081e376c08, 0x1e376c081e376c08
dq 0x1e376c081e376c08, 0x1e376c081e376c08
dq 0x2748774c2748774c, 0x2748774c2748774c
dq 0x2748774c2748774c, 0x2748774c2748774c
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
dq 0x84c8781484c87814, 0x84c8781484c87814
dq 0x84c8781484c87814, 0x84c8781484c87814
dq 0x8cc702088cc70208, 0x8cc702088cc70208
dq 0x8cc702088cc70208, 0x8cc702088cc70208
dq 0x90befffa90befffa, 0x90befffa90befffa
dq 0x90befffa90befffa, 0x90befffa90befffa
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
PADDING_8:
ddq 0xc28a2f98c28a2f98c28a2f98c28a2f98
ddq 0xc28a2f98c28a2f98c28a2f98c28a2f98
ddq 0x71374491713744917137449171374491
ddq 0x71374491713744917137449171374491
ddq 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
ddq 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
ddq 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
ddq 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
ddq 0x3956c25b3956c25b3956c25b3956c25b
ddq 0x3956c25b3956c25b3956c25b3956c25b
ddq 0x59f111f159f111f159f111f159f111f1
ddq 0x59f111f159f111f159f111f159f111f1
ddq 0x923f82a4923f82a4923f82a4923f82a4
ddq 0x923f82a4923f82a4923f82a4923f82a4
ddq 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
ddq 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
ddq 0xd807aa98d807aa98d807aa98d807aa98
ddq 0xd807aa98d807aa98d807aa98d807aa98
ddq 0x12835b0112835b0112835b0112835b01
ddq 0x12835b0112835b0112835b0112835b01
ddq 0x243185be243185be243185be243185be
ddq 0x243185be243185be243185be243185be
ddq 0x550c7dc3550c7dc3550c7dc3550c7dc3
ddq 0x550c7dc3550c7dc3550c7dc3550c7dc3
ddq 0x72be5d7472be5d7472be5d7472be5d74
ddq 0x72be5d7472be5d7472be5d7472be5d74
ddq 0x80deb1fe80deb1fe80deb1fe80deb1fe
ddq 0x80deb1fe80deb1fe80deb1fe80deb1fe
ddq 0x9bdc06a79bdc06a79bdc06a79bdc06a7
ddq 0x9bdc06a79bdc06a79bdc06a79bdc06a7
ddq 0xc19bf374c19bf374c19bf374c19bf374
ddq 0xc19bf374c19bf374c19bf374c19bf374
ddq 0x649b69c1649b69c1649b69c1649b69c1
ddq 0x649b69c1649b69c1649b69c1649b69c1
ddq 0xf0fe4786f0fe4786f0fe4786f0fe4786
ddq 0xf0fe4786f0fe4786f0fe4786f0fe4786
ddq 0x0fe1edc60fe1edc60fe1edc60fe1edc6
ddq 0x0fe1edc60fe1edc60fe1edc60fe1edc6
ddq 0x240cf254240cf254240cf254240cf254
ddq 0x240cf254240cf254240cf254240cf254
ddq 0x4fe9346f4fe9346f4fe9346f4fe9346f
ddq 0x4fe9346f4fe9346f4fe9346f4fe9346f
ddq 0x6cc984be6cc984be6cc984be6cc984be
ddq 0x6cc984be6cc984be6cc984be6cc984be
ddq 0x61b9411e61b9411e61b9411e61b9411e
ddq 0x61b9411e61b9411e61b9411e61b9411e
ddq 0x16f988fa16f988fa16f988fa16f988fa
ddq 0x16f988fa16f988fa16f988fa16f988fa
ddq 0xf2c65152f2c65152f2c65152f2c65152
ddq 0xf2c65152f2c65152f2c65152f2c65152
ddq 0xa88e5a6da88e5a6da88e5a6da88e5a6d
ddq 0xa88e5a6da88e5a6da88e5a6da88e5a6d
ddq 0xb019fc65b019fc65b019fc65b019fc65
ddq 0xb019fc65b019fc65b019fc65b019fc65
ddq 0xb9d99ec7b9d99ec7b9d99ec7b9d99ec7
ddq 0xb9d99ec7b9d99ec7b9d99ec7b9d99ec7
ddq 0x9a1231c39a1231c39a1231c39a1231c3
ddq 0x9a1231c39a1231c39a1231c39a1231c3
ddq 0xe70eeaa0e70eeaa0e70eeaa0e70eeaa0
ddq 0xe70eeaa0e70eeaa0e70eeaa0e70eeaa0
ddq 0xfdb1232bfdb1232bfdb1232bfdb1232b
ddq 0xfdb1232bfdb1232bfdb1232bfdb1232b
ddq 0xc7353eb0c7353eb0c7353eb0c7353eb0
ddq 0xc7353eb0c7353eb0c7353eb0c7353eb0
ddq 0x3069bad53069bad53069bad53069bad5
ddq 0x3069bad53069bad53069bad53069bad5
ddq 0xcb976d5fcb976d5fcb976d5fcb976d5f
ddq 0xcb976d5fcb976d5fcb976d5fcb976d5f
ddq 0x5a0f118f5a0f118f5a0f118f5a0f118f
ddq 0x5a0f118f5a0f118f5a0f118f5a0f118f
ddq 0xdc1eeefddc1eeefddc1eeefddc1eeefd
ddq 0xdc1eeefddc1eeefddc1eeefddc1eeefd
ddq 0x0a35b6890a35b6890a35b6890a35b689
ddq 0x0a35b6890a35b6890a35b6890a35b689
ddq 0xde0b7a04de0b7a04de0b7a04de0b7a04
ddq 0xde0b7a04de0b7a04de0b7a04de0b7a04
ddq 0x58f4ca9d58f4ca9d58f4ca9d58f4ca9d
ddq 0x58f4ca9d58f4ca9d58f4ca9d58f4ca9d
ddq 0xe15d5b16e15d5b16e15d5b16e15d5b16
ddq 0xe15d5b16e15d5b16e15d5b16e15d5b16
ddq 0x007f3e86007f3e86007f3e86007f3e86
ddq 0x007f3e86007f3e86007f3e86007f3e86
ddq 0x37088980370889803708898037088980
ddq 0x37088980370889803708898037088980
ddq 0xa507ea32a507ea32a507ea32a507ea32
ddq 0xa507ea32a507ea32a507ea32a507ea32
ddq 0x6fab95376fab95376fab95376fab9537
ddq 0x6fab95376fab95376fab95376fab9537
ddq 0x17406110174061101740611017406110
ddq 0x17406110174061101740611017406110
ddq 0x0d8cd6f10d8cd6f10d8cd6f10d8cd6f1
ddq 0x0d8cd6f10d8cd6f10d8cd6f10d8cd6f1
ddq 0xcdaa3b6dcdaa3b6dcdaa3b6dcdaa3b6d
ddq 0xcdaa3b6dcdaa3b6dcdaa3b6dcdaa3b6d
ddq 0xc0bbbe37c0bbbe37c0bbbe37c0bbbe37
ddq 0xc0bbbe37c0bbbe37c0bbbe37c0bbbe37
ddq 0x83613bda83613bda83613bda83613bda
ddq 0x83613bda83613bda83613bda83613bda
ddq 0xdb48a363db48a363db48a363db48a363
ddq 0xdb48a363db48a363db48a363db48a363
ddq 0x0b02e9310b02e9310b02e9310b02e931
ddq 0x0b02e9310b02e9310b02e9310b02e931
ddq 0x6fd15ca76fd15ca76fd15ca76fd15ca7
ddq 0x6fd15ca76fd15ca76fd15ca76fd15ca7
ddq 0x521afaca521afaca521afaca521afaca
ddq 0x521afaca521afaca521afaca521afaca
ddq 0x31338431313384313133843131338431
ddq 0x31338431313384313133843131338431
ddq 0x6ed41a956ed41a956ed41a956ed41a95
ddq 0x6ed41a956ed41a956ed41a956ed41a95
ddq 0x6d4378906d4378906d4378906d437890
ddq 0x6d4378906d4378906d4378906d437890
ddq 0xc39c91f2c39c91f2c39c91f2c39c91f2
ddq 0xc39c91f2c39c91f2c39c91f2c39c91f2
ddq 0x9eccabbd9eccabbd9eccabbd9eccabbd
ddq 0x9eccabbd9eccabbd9eccabbd9eccabbd
ddq 0xb5c9a0e6b5c9a0e6b5c9a0e6b5c9a0e6
ddq 0xb5c9a0e6b5c9a0e6b5c9a0e6b5c9a0e6
ddq 0x532fb63c532fb63c532fb63c532fb63c
ddq 0x532fb63c532fb63c532fb63c532fb63c
ddq 0xd2c741c6d2c741c6d2c741c6d2c741c6
ddq 0xd2c741c6d2c741c6d2c741c6d2c741c6
ddq 0x07237ea307237ea307237ea307237ea3
ddq 0x07237ea307237ea307237ea307237ea3
ddq 0xa4954b68a4954b68a4954b68a4954b68
ddq 0xa4954b68a4954b68a4954b68a4954b68
ddq 0x4c191d764c191d764c191d764c191d76
ddq 0x4c191d764c191d764c191d764c191d76
DIGEST_8:
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
dd 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
PSHUFFLE_BYTE_FLIP_MASK:
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
STACK_ALIGNMENT_MASK:
dq 0xffffffffffffffe0
section .text
%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define NUM_BLKS r8 ; 3rd arg
%define TBL rsi
%define reg1 rdi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define NUM_BLKS rdx ; 3rd arg
%define TBL rcx
%define reg1 r8
%endif
%define ROUND rax
%define inp0 r9
%define inp1 r10
%define inp2 r11
%define inp3 r12
%define inp4 r13
%define inp5 r14
%define inp6 reg1
%define inp7 reg2
; ymm0 a
; ymm1 b
; ymm2 c
; ymm3 d
; ymm4 e
; ymm5 f
; ymm6 g TMP0
; ymm7 h TMP1
; ymm8 T1 TT0
; ymm9 TT1
; ymm10 TT2
; ymm11 TT3
; ymm12 a0 TT4
; ymm13 a1 TT5
; ymm14 a2 TT6
; ymm15 TMP TT7
%define a ymm0
%define b ymm1
%define c ymm2
%define d ymm3
%define e ymm4
%define f ymm5
%define g ymm6
%define h ymm7
%define T1 ymm8
%define a0 ymm12
%define a1 ymm13
%define a2 ymm14
%define TMP ymm15
%define TMP0 ymm6
%define TMP1 ymm7
%define TT0 ymm8
%define TT1 ymm9
%define TT2 ymm10
%define TT3 ymm11
%define TT4 ymm12
%define TT5 ymm13
%define TT6 ymm14
%define TT7 ymm15
%define SHA256_DIGEST_WORD_SIZE 4;
%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
%define ROUNDS 64*SZ8
; Define stack usage
;; Assume stack aligned to 32 bytes before call
;; Therefore FRAMESZ mod 32 must be 32-8 = 24
struc stack_frame
.data resb 16*SZ8
.digest resb 8*SZ8
.ytmp resb 4*SZ8
.regsave resb 4*64
endstruc
%define FRAMESZ stack_frame_size
%define _DIGEST stack_frame.digest
%define _YTMP stack_frame.ytmp
%define _RSAVE stack_frame.regsave
%define YTMP0 rsp + _YTMP + 0*SZ8
%define YTMP1 rsp + _YTMP + 1*SZ8
%define YTMP2 rsp + _YTMP + 2*SZ8
%define YTMP3 rsp + _YTMP + 3*SZ8
%define R12 rsp + _RSAVE + 0*64
%define R13 rsp + _RSAVE + 1*64
%define R14 rsp + _RSAVE + 2*64
%define R15 rsp + _RSAVE + 3*64
%define VMOVPS vmovups
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
vpslld %%tmp, %%reg, (32-(%%imm))
vpsrld %%reg, %%reg, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
;vmovdqa %%tmp, %%reg
vpslld %%tmp, %%src, (32-(%%imm))
vpsrld %%reg, %%src, %%imm
vpor %%reg, %%reg, %%tmp
%endmacro
; PRORD dst/src, amt
%macro PRORD 2
PRORD %1, %2, TMP
%endmacro
; PRORD_nd dst, src, amt
%macro PRORD_nd 3
PRORD_nd %1, %3, TMP, %2
%endmacro
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i %2
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1 ; save current temp message
vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ8 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i %2
vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
vmovdqa a0, %%T1
PRORD %%T1, 18-7
vmovdqa a2, a1
PRORD a1, 19-17
vpxor %%T1, %%T1, a0
PRORD %%T1, 7
vpxor a1, a1, a2
PRORD a1, 17
vpsrld a0, a0, 3
vpxor %%T1, %%T1, a0
vpsrld a2, a2, 10
vpxor a1, a1, a2
vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] ; + W[i-16]
vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] ; + W[i-7]
vpaddd %%T1, %%T1, a1
ROUND_00_15 %%T1, %%i
%endm
;; arguments passed implicitly in preprocessor symbols i, a...h
%macro PADDING_ROUND_00_15 1
%define %%T1 %1
PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
vpxor a2, f, g ; ch: a2 = f^g
vpand a2, a2, e ; ch: a2 = (f^g)&e
vpxor a2, a2, g ; a2 = ch
PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
vmovdqa %%T1, [TBL + ROUND] ; T1 = W + K
vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd h, h, a2 ; h = h + ch
PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
vpaddd h, h, %%T1 ; h = h + ch + W + K
vpxor a0, a0, a1 ; a0 = sigma1
PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
vpxor %%T1, a, c ; maj: T1 = a^c
add ROUND, SZ8 ; ROUND++
vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
vpaddd h, h, a0
vpaddd d, d, h
vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a2, a2, a1 ; a2 = sig0
vpand a1, a, c ; maj: a1 = a&c
vpor a1, a1, %%T1 ; a1 = maj
vpaddd h, h, a1 ; h = h + ch + W + K + maj
vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
%endm
global sha256_8_avx2:function
align 16
sha256_8_avx2:
endbranch64
; outer calling routine saves all the XMM registers
push rbp
mov rbp,rsp
and rsp, [rel STACK_ALIGNMENT_MASK]
sub rsp, FRAMESZ
mov [R12], r12
mov [R13], r13
mov [R14], r14
mov [R15], r15
.hash_8_blocks:
cmp NUM_BLKS, 8
jl .hash_4_blocks
xor ROUND, ROUND
lea TBL,[rel DIGEST_8]
vmovdqa a,[TBL + 0*32]
vmovdqa b,[TBL + 1*32]
vmovdqa c,[TBL + 2*32]
vmovdqa d,[TBL + 3*32]
vmovdqa e,[TBL + 4*32]
vmovdqa f,[TBL + 5*32]
vmovdqa g,[TBL + 6*32]
vmovdqa h,[TBL + 7*32]
lea TBL,[rel K256_8]
%assign i 0
%rep 2
TRANSPOSE8_U32_LOAD8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, \
DATA_PTR + 0*64, \
DATA_PTR + 1*64, \
DATA_PTR + 2*64, \
DATA_PTR + 3*64, \
DATA_PTR + 4*64, \
DATA_PTR + 5*64, \
DATA_PTR + 6*64, \
DATA_PTR + 7*64, \
i*32
vmovdqa [YTMP0], g
vmovdqa [YTMP1], h
TRANSPOSE8_U32_PRELOADED TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
vmovdqa TMP1, [rel PSHUFFLE_BYTE_FLIP_MASK]
vmovdqa g, [YTMP0]
vpshufb TT0, TT0, TMP1
vpshufb TT1, TT1, TMP1
vpshufb TT2, TT2, TMP1
vpshufb TT3, TT3, TMP1
vpshufb TT4, TT4, TMP1
vpshufb TT5, TT5, TMP1
vpshufb TT6, TT6, TMP1
vpshufb TT7, TT7, TMP1
vmovdqa h, [YTMP1]
vmovdqa [YTMP0], TT4
vmovdqa [YTMP1], TT5
vmovdqa [YTMP2], TT6
vmovdqa [YTMP3], TT7
ROUND_00_15 TT0,(i*8+0)
vmovdqa TT0, [YTMP0]
ROUND_00_15 TT1,(i*8+1)
vmovdqa TT1, [YTMP1]
ROUND_00_15 TT2,(i*8+2)
vmovdqa TT2, [YTMP2]
ROUND_00_15 TT3,(i*8+3)
vmovdqa TT3, [YTMP3]
ROUND_00_15 TT0,(i*8+4)
ROUND_00_15 TT1,(i*8+5)
ROUND_00_15 TT2,(i*8+6)
ROUND_00_15 TT3,(i*8+7)
%assign i (i+1)
%endrep
%assign i (i*8)
jmp .Lrounds_16_xx
align 16
.Lrounds_16_xx:
%rep 16
ROUND_16_XX T1, i
%assign i (i+1)
%endrep
cmp ROUND,ROUNDS
jb .Lrounds_16_xx
;; add old digest
lea TBL,[rel DIGEST_8]
vpaddd a, a, [TBL + 0*SZ8]
vpaddd b, b, [TBL + 1*SZ8]
vpaddd c, c, [TBL + 2*SZ8]
vpaddd d, d, [TBL + 3*SZ8]
vpaddd e, e, [TBL + 4*SZ8]
vpaddd f, f, [TBL + 5*SZ8]
vpaddd g, g, [TBL + 6*SZ8]
vpaddd h, h, [TBL + 7*SZ8]
;; rounds with padding
;; save old digest
vmovdqa [rsp + _DIGEST + 0*SZ8], a
vmovdqa [rsp + _DIGEST + 1*SZ8], b
vmovdqa [rsp + _DIGEST + 2*SZ8], c
vmovdqa [rsp + _DIGEST + 3*SZ8], d
vmovdqa [rsp + _DIGEST + 4*SZ8], e
vmovdqa [rsp + _DIGEST + 5*SZ8], f
vmovdqa [rsp + _DIGEST + 6*SZ8], g
vmovdqa [rsp + _DIGEST + 7*SZ8], h
lea TBL,[rel PADDING_8]
xor ROUND,ROUND
jmp .Lrounds_padding
align 16
.Lrounds_padding:
%rep 64
PADDING_ROUND_00_15 T1
%endrep
;; add old digest
vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
;; transpose the digest and convert to little endian to get the registers correctly
TRANSPOSE8_U32 a, b, c, d, e, f, g, h, TT0, TT1
vmovdqa TT0, [rel PSHUFFLE_BYTE_FLIP_MASK]
vpshufb a, a, TT0
vpshufb b, b, TT0
vpshufb c, c, TT0
vpshufb d, d, TT0
vpshufb e, e, TT0
vpshufb f, f, TT0
vpshufb g, g, TT0
vpshufb h, h, TT0
;; write to output
vmovdqu [OUTPUT_PTR + 0*32],a
vmovdqu [OUTPUT_PTR + 1*32],b
vmovdqu [OUTPUT_PTR + 2*32],c
vmovdqu [OUTPUT_PTR + 3*32],d
vmovdqu [OUTPUT_PTR + 4*32],e
vmovdqu [OUTPUT_PTR + 5*32],f
vmovdqu [OUTPUT_PTR + 6*32],g
vmovdqu [OUTPUT_PTR + 7*32],h
; update pointers and loop
add DATA_PTR, 64*8
add OUTPUT_PTR, 32*8
sub NUM_BLKS, 8
jmp .hash_8_blocks
.hash_4_blocks:
call sha256_4_avx
mov r12,[R12]
mov r13,[R13]
mov r14,[R14]
mov r15,[R15]
mov rsp,rbp
pop rbp
ret
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

View File

@@ -0,0 +1,646 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .data
default rel
align 64
K256:
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
DIGEST:
dd 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
dd 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
PADDING:
dd 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
dd 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
dd 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
dd 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374
dd 0x649b69c1, 0xf0fe4786, 0xfe1edc6, 0x240cf254
dd 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa
dd 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7
dd 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0
dd 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd
dd 0xa35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16
dd 0x7f3e86, 0x37088980, 0xa507ea32, 0x6fab9537
dd 0x17406110, 0xd8cd6f1, 0xcdaa3b6d, 0xc0bbbe37
dd 0x83613bda, 0xdb48a363, 0xb02e931, 0x6fd15ca7
dd 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890
dd 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c
dd 0xd2c741c6, 0x7237ea3, 0xa4954b68, 0x4c191d76
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
section .text
%define VMOVDQ vmovdqu ;; assume buffers not aligned
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
%macro MY_ROR 2
shld %1,%1,(32-(%2))
%endm
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
VMOVDQ %1, %2
vpshufb %1, %1, %3
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7
%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define XTMP5 xmm11
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13
%ifdef WINABI
%define OUTPUT_PTR rcx ; 1st arg
%define DATA_PTR rdx ; 2nd arg
%define d r8d ; 3rd
%define TBL rsi
%define c edi
%else
%define OUTPUT_PTR rdi ; 1st arg
%define DATA_PTR rsi ; 2nd arg
%define c edx ; 3rd arg
%define TBL rcx
%define d r8d
%endif
%define a eax
%define b ebx
%define e r9d
%define f r10d
%define g r11d
%define h r12d
%define y0 r13d
%define y1 r14d
%define y2 r15d
struc STACK
_XFER: resb 32
_DIGEST: resb 32
%ifdef WINABI
_XMM_SAVE: reso 8
resb 16 ; alignment
%endif
endstruc
; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm
; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
%macro FOUR_ROUNDS_AND_SCHED 0
;; compute s0 four at a time and s1 two at a time
;; compute W[-16] + W[-7] 4 at a time
;vmovdqa XTMP0, X3
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
;vmovdqa XTMP1, X1
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
;; compute s0
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpsrld XTMP2, XTMP1, 7
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpslld XTMP3, XTMP1, (32-7)
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
mov y0, e ; y0 = e
mov y1, a ; y1 = a
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
vpsrld XTMP2, XTMP1,18
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
vpslld XTMP1, XTMP1, (32-18)
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
vpxor XTMP3, XTMP3, XTMP1
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
;; compute low s1
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
mov y0, e ; y0 = e
mov y1, a ; y1 = a
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
xor y2, g ; y2 = f^g
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
vpxor XTMP2, XTMP2, XTMP3
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
;; compute high s1
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
mov y2, f ; y2 = f
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
xor y1, a ; y1 = a ^ (a >> (22-13)
xor y2, g ; y2 = f^g
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
vpxor XTMP2, XTMP2, XTMP3
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
rotate_Xs
%endm
;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_1_avx(unsigned char *output, const unsigned char *input)
;; arg 1 : pointer to the 32-byte output digest
;; arg 2 : pointer to the 64-byte input block
section .text
global sha256_1_avx:function
align 32
sha256_1_avx:
endbranch64
push rbx
%ifdef WINABI
push r8
push rsi
push rdi
%else
push rdx
%endif
push rbp
push r12
push r13
push r14
push r15
sub rsp,STACK_size
%ifdef WINABI
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif
vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
vmovdqa SHUF_00BA, [rel _SHUF_00BA]
vmovdqa SHUF_DC00, [rel _SHUF_DC00]
.hash_1_block:
;; load initial digest
lea TBL,[rel DIGEST]
mov a, [TBL + 0*4]
mov b, [TBL + 1*4]
mov c, [TBL + 2*4]
mov d, [TBL + 3*4]
mov e, [TBL + 4*4]
mov f, [TBL + 5*4]
mov g, [TBL + 6*4]
mov h, [TBL + 7*4]
lea TBL,[rel K256]
;; byte swap first 16 dwords
COPY_XMM_AND_BSWAP X0, [DATA_PTR + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [DATA_PTR + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [DATA_PTR + 2*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X3, [DATA_PTR + 3*16], BYTE_FLIP_MASK
;; schedule 48 input dwords, by doing 3 rounds of 16 each
%rep 3
align 16
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 2*16]
vmovdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
vpaddd XFER, X0, [TBL + 3*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 4*16
FOUR_ROUNDS_AND_SCHED
%endrep
%rep 2
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp + _XFER], XFER
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
vpaddd XFER, X1, [TBL + 1*16]
vmovdqa [rsp + _XFER], XFER
add TBL, 2*16
DO_ROUND 0
DO_ROUND 1
DO_ROUND 2
DO_ROUND 3
vmovdqa X0, X2
vmovdqa X1, X3
%endrep
; add old digest
lea TBL,[rel DIGEST]
add a, [TBL + 0*4]
add b, [TBL + 1*4]
add c, [TBL + 2*4]
add d, [TBL + 3*4]
add e, [TBL + 4*4]
add f, [TBL + 5*4]
add g, [TBL + 6*4]
add h, [TBL + 7*4]
; rounds with padding
; save old digest
;
mov [rsp + _DIGEST + 0*4], a
mov [rsp + _DIGEST + 1*4], b
mov [rsp + _DIGEST + 2*4], c
mov [rsp + _DIGEST + 3*4], d
mov [rsp + _DIGEST + 4*4], e
mov [rsp + _DIGEST + 5*4], f
mov [rsp + _DIGEST + 6*4], g
mov [rsp + _DIGEST + 7*4], h
lea TBL,[rel PADDING]
%assign i 0
%rep 64
mov y0, e ; y0 = e
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
mov y1, a ; y1 = a
xor y0, e ; y0 = e ^ (e >> (25-11))
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
mov y2, f ; y2 = f
xor y1, a ; y1 = a ^ (a >> (22-13)
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [TBL + i] ; y2 = k + w + S1 + CH
mov y0, a ; y0 = a
add h, y2 ; h = h + S1 + CH + k + w
mov y2, a ; y2 = a
or y0, c ; y0 = a|c
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%assign i (i+4)
%endrep
;; add the previous digest
add a, [rsp + _DIGEST + 0*4]
add b, [rsp + _DIGEST + 1*4]
add c, [rsp + _DIGEST + 2*4]
add d, [rsp + _DIGEST + 3*4]
add e, [rsp + _DIGEST + 4*4]
add f, [rsp + _DIGEST + 5*4]
add g, [rsp + _DIGEST + 6*4]
add h, [rsp + _DIGEST + 7*4]
;; shuffle the bytes to little endian
bswap a
bswap b
bswap c
bswap d
bswap e
bswap f
bswap g
bswap h
;; write resulting hash
mov [OUTPUT_PTR + 0*4], a
mov [OUTPUT_PTR + 1*4], b
mov [OUTPUT_PTR + 2*4], c
mov [OUTPUT_PTR + 3*4], d
mov [OUTPUT_PTR + 4*4], e
mov [OUTPUT_PTR + 5*4], f
mov [OUTPUT_PTR + 6*4], g
mov [OUTPUT_PTR + 7*4], h
%ifdef WINABI
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif
add rsp, STACK_size
pop r15
pop r14
pop r13
pop r12
pop rbp
%ifdef WINABI
pop rdi
pop rsi
pop r8
%else
pop rdx
%endif
pop rbx
ret
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
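The .Lrounds_padding stage works because every message handled here is exactly one 64-byte block, so the second compression block is a constant: 0x80, 55 zero bytes, then the 64-bit big-endian bit length 512. The PADDING table above therefore stores K[t] plus the already-expanded message-schedule word W[t] of that fixed block, letting the padding rounds skip the schedule entirely. A short Go sketch (my own reconstruction, not part of this change) regenerates the first table entries from K256:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// First four K constants of SHA-256, as in the K256 table above; only the
	// first four PADDING entries are reproduced here.
	k := []uint32{0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5}
	// Message schedule of the constant padding block of a 64-byte message:
	// W[0] = 0x80000000, W[1..14] = 0, W[15] = 0x00000200 (512 bits).
	var w [64]uint32
	w[0], w[15] = 0x80000000, 0x00000200
	s0 := func(x uint32) uint32 { return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3) }
	s1 := func(x uint32) uint32 { return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10) }
	for t := 16; t < 64; t++ {
		w[t] = s1(w[t-2]) + w[t-7] + s0(w[t-15]) + w[t-16]
	}
	for t, kt := range k {
		// e.g. t = 0: 0x428a2f98 + 0x80000000 = 0xc28a2f98, the first PADDING word.
		fmt.Printf("0x%08x\n", kt+w[t])
	}
}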

File diff suppressed because it is too large

View File

@@ -0,0 +1,192 @@
;;
;; Copyright (c) 2012-2021, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
%ifndef _TRANSPOSE_AVX2_ASM_
%define _TRANSPOSE_AVX2_ASM_
%include "reg_sizes.asm"
; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE
;
; r0-r7 [out] ymm registers which will contain the data to be transposed
; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetched for all 8 lanes
; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
%macro TRANSPOSE8_U32_LOAD8 17
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%addr0 %9
%define %%addr1 %10
%define %%addr2 %11
%define %%addr3 %12
%define %%addr4 %13
%define %%addr5 %14
%define %%addr6 %15
%define %%addr7 %16
%define %%ptr_offset %17
; Expected output data
;
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
vmovups XWORD(%%r0),[%%addr0+%%ptr_offset]
vmovups XWORD(%%r1),[%%addr1+%%ptr_offset]
vmovups XWORD(%%r2),[%%addr2+%%ptr_offset]
vmovups XWORD(%%r3),[%%addr3+%%ptr_offset]
vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16]
vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16]
vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16]
vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16]
vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01
vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01
vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01
vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01
%endmacro
; 8x8 32-BIT TRANSPOSE
;
; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called.
;
; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0])
; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. ymm4 = [e4-e7 a4-a7])
; t0-t1 [clobbered] ymm temporary registers
%macro TRANSPOSE8_U32_PRELOADED 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
;
; process top half (r0..r3)
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2}
vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
;; process bottom half (r4..r7)
vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4}
vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6}
vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4}
vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6}
vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
%endmacro
%macro TRANSPOSE8_U32 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
; process top half (r0..r3) {a...d}
vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
; use r2 in place of t0
; process bottom half (r4..r7) {e...h}
vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
%endmacro
%endif ;; _TRANSPOSE_AVX2_ASM_
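In the 8-lane hasher, registers a..h each hold one digest word for all eight lanes, so TRANSPOSE8_U32 is in effect a plain 8x8 32-bit transpose that turns eight "same word, all lanes" rows into eight contiguous per-lane digests before they are byte-swapped and stored to OUTPUT_PTR. A scalar Go equivalent of my reading of the macro (a sketch, not part of this change):

package main

import "fmt"

// transpose8x8 is the scalar counterpart of TRANSPOSE8_U32: element [i][j]
// of the input ends up at [j][i] of the output. In the hasher, row i holds
// state word i (a..h) for all eight lanes, and the transposed rows are the
// eight contiguous digests written out.
func transpose8x8(in [8][8]uint32) (out [8][8]uint32) {
	for i := 0; i < 8; i++ {
		for j := 0; j < 8; j++ {
			out[j][i] = in[i][j]
		}
	}
	return out
}

func main() {
	var m [8][8]uint32
	for i := range m {
		for j := range m[i] {
			m[i][j] = uint32(i*8 + j)
		}
	}
	fmt.Println(transpose8x8(m)[0]) // first digest row: [0 8 16 24 32 40 48 56]
}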

View File

@@ -0,0 +1,61 @@
#include <cpuid.h>
#include "hasher.hpp"
extern "C" void sha256_1_avx(unsigned char* output, const unsigned char* input);
namespace {
constexpr auto CPUID_LEAF = 7;
}
namespace prysm {
void Hasher::sha256_sse(unsigned char* output, const unsigned char* input, std::size_t blocks) {
while (blocks) {
sha256_1_avx(output, input);
input += 2*constants::BYTES_PER_CHUNK;
output += constants::BYTES_PER_CHUNK;
blocks--;
}
}
const Hasher::IMPL Hasher::implemented() {
IMPL ret = IMPL::NONE;
std::uint32_t a, b, c, d; // NOLINT
__get_cpuid_count(CPUID_LEAF, 0, &a, &b, &c, &d);
if (b & bit_SHA) ret = ret | IMPL::SHA;
if (b & bit_AVX2) ret = ret | IMPL::AVX2;
__get_cpuid(1, &a, &b, &c, &d);
if (c & bit_AVX) ret = ret | IMPL::AVX;
if (c & bit_SSE3) ret = ret | IMPL::SSE;
return ret;
}
Hasher::SHA256_hasher Hasher::best_sha256_implementation() {
auto impl = implemented();
if (!!(impl & IMPL::SHA)) return &::sha256_shani;
if (!!(impl & IMPL::AVX2)) return &::sha256_8_avx2;
if (!!(impl & IMPL::AVX)) return &::sha256_4_avx;
return &sha256_sse;
}
Hasher::Hasher(Hasher::IMPL impl) {
switch (impl) {
case IMPL::SHA:
_hash_64b_blocks = sha256_shani;
break;
case IMPL::AVX2:
_hash_64b_blocks = sha256_8_avx2;
break;
case IMPL::AVX:
_hash_64b_blocks = sha256_4_avx;
break;
case IMPL::SSE:
_hash_64b_blocks = &sha256_sse;
break;
default:
_hash_64b_blocks = best_sha256_implementation();
}
}
} // namespace prysm
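implemented() probes CPUID leaf 7 (EBX: SHA, AVX2) and leaf 1 (ECX: AVX, SSE3), and best_sha256_implementation() picks the widest routine available. A rough Go analogue of that dispatch, assuming the external golang.org/x/sys/cpu package (which exposes AVX2/AVX/SSE3 but, as far as I know, not the SHA extensions, so that branch is left out):

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// pickImpl mirrors best_sha256_implementation above, minus the SHA-NI branch,
// which would need a raw CPUID probe (leaf 7, EBX bit 29) or cgo.
func pickImpl() string {
	switch {
	case cpu.X86.HasAVX2:
		return "AVX2" // sha256_8_avx2, eight blocks per call
	case cpu.X86.HasAVX:
		return "AVX" // sha256_4_avx, four blocks per call
	case cpu.X86.HasSSE3:
		return "SSE" // one block at a time
	default:
		return "NONE"
	}
}

func main() { fmt.Println(pickImpl()) }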

View File

@@ -0,0 +1,9 @@
#ifndef __CUSTOM_HASHER__
#define __CUSTOM_HASHER__
#include <stdint.h>
extern void sha256_4_avx(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_8_avx2(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_shani(unsigned char* output, const unsigned char* input, uint64_t blocks);
extern void sha256_1_avx(unsigned char* output, const unsigned char* input);
#endif

View File

@@ -1,7 +1,10 @@
// Package hash includes all hash-function related helpers for Prysm.
package hash
// #include "custom_hasher/hasher.h"
import "C"
import (
"encoding/binary"
"errors"
"hash"
"reflect"
@@ -133,3 +136,85 @@ func FastSum64(data []byte) uint64 {
func FastSum256(data []byte) [32]byte {
return highwayhash.Sum(data, fastSumHashKey[:])
}
// ------------------------------------
// No abstraction in these functions; they are experimental, kept around until we
// get a feel for whether this approach is worth pursuing.
func PotuzHasherAVX2Chunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_8_avx2((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}
func PotuzHasherAVXChunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_4_avx((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}
func PotuzHasherShaniChunks(dst [][32]byte, inp [][32]byte, count uint64) {
C.sha256_shani((*C.uchar)(&dst[0][0]), (*C.uchar)(&inp[0][0]), C.ulong(count))
}
func PotuzHasherShani(dst []byte, inp []byte, count uint64) {
C.sha256_shani((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}
func PotuzHasherAVX(dst []byte, inp []byte, count uint64) {
C.sha256_4_avx((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}
func PotuzHasherAVX2(dst []byte, inp []byte, count uint64) {
C.sha256_8_avx2((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]), C.ulong(count))
}
func PotuzHasher2Chunks(dst []byte, inp []byte) {
C.sha256_1_avx((*C.uchar)(&dst[0]), (*C.uchar)(&inp[0]))
}
// no check of the chunks length!
func Hash2ChunksAVX(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])
C.sha256_1_avx((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]))
return buf
}
// no check of the chunks length!
func Hash2ChunksAVX2(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])
C.sha256_1_avx((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]))
return buf
}
// no check of the chunks length!
func Hash2ChunksShani(first [32]byte, second [32]byte) [32]byte {
buf := [32]byte{}
chunks := make([]byte, 64)
copy(chunks, first[:])
copy(chunks[32:], second[:])
C.sha256_shani((*C.uchar)(&buf[0]), (*C.uchar)(&chunks[0]), C.ulong(1))
return buf
}
func MixinLengthAVX(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksAVX(root, val)
}
func MixinLengthAVX2(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksAVX2(root, val)
}
func MixinLengthShani(root [32]byte, length uint64) [32]byte {
val := [32]byte{}
binary.LittleEndian.PutUint64(val[:], length)
return Hash2ChunksShani(root, val)
}
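The MixinLength* helpers implement SSZ length mixing: the list length is written as a little-endian uint64 into an otherwise zero 32-byte chunk and hashed together with the root. A portable reference using only the standard library (a sketch, not part of this change), handy for checking the accelerated paths:

package main

import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"
)

// mixinLengthReference is the portable counterpart of the MixinLength*
// helpers: SHA-256(root || uint64_le(length) zero-padded to 32 bytes).
func mixinLengthReference(root [32]byte, length uint64) [32]byte {
	var chunk [64]byte
	copy(chunk[:32], root[:])
	binary.LittleEndian.PutUint64(chunk[32:40], length)
	return sha256.Sum256(chunk[:])
}

func main() {
	var root [32]byte
	fmt.Printf("%x\n", mixinLengthReference(root, 400000))
}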

View File

@@ -2,9 +2,11 @@ package hash_test
import (
"encoding/hex"
"math/rand"
"testing"
fuzz "github.com/google/gofuzz"
"github.com/prysmaticlabs/prysm/beacon-chain/state/stateutil"
"github.com/prysmaticlabs/prysm/crypto/bls"
"github.com/prysmaticlabs/prysm/crypto/hash"
"github.com/prysmaticlabs/prysm/encoding/bytesutil"
@@ -104,3 +106,122 @@ func BenchmarkHashProto(b *testing.B) {
}
}
}
// -------------------------------------------------------------
// Comment out the tests that use instructions your CPU does not support
// (they fault with an illegal instruction) if you want to run the benchmarks.
/*
func TestCustomHash_Shani(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)
hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}
hash.PotuzHasherShani(root, hash0, 1)
assert.DeepEqual(t, hashOf1[:], root)
}
*/
func TestCustomHash_Avx2(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)
hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}
hash.PotuzHasherAVX2(root, hash0, 1)
assert.DeepEqual(t, hashOf1[:], root)
}
func TestCustomHash_SSE(t *testing.T) {
hash0 := make([]byte, 64)
root := make([]byte, 32)
hashOf1 := [32]byte{245, 165, 253, 66, 209, 106, 32, 48, 39, 152, 239, 110, 211, 9, 151, 155, 67, 0, 61, 35, 32, 217, 240, 232, 234, 152, 49, 169, 39, 89, 251, 75}
hash.PotuzHasher2Chunks(root, hash0)
assert.DeepEqual(t, hashOf1[:], root)
}
/*
func BenchmarkHashBalanceAVX2(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX2(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitAVX2(balances, zero_hash_array)
require.NoError(b, err)
}
}
*/
func BenchmarkHashBalanceAVX(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitAVX(balances, zero_hash_array)
require.NoError(b, err)
}
}
/*
func BenchmarkHashBalanceShani(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksShani(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zero_hash_array)
require.NoError(b, err)
}
}
*/
func BenchmarkHashBalancePrysm(b *testing.B) {
zero_hash_array := make([][32]byte, 40)
for i := 1; i < 40; i++ {
zero_hash_array[i] = hash.Hash2ChunksAVX(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := stateutil.Uint64ListRootWithRegistryLimit(balances)
require.NoError(b, err)
}
}
/*
func TestHashBalancesShani(t *testing.T) {
zero_hash_array := make([][32]byte, 45)
for i := 1; i < 45; i++ {
zero_hash_array[i] = hash.Hash2ChunksShani(zero_hash_array[i-1], zero_hash_array[i-1])
}
balances := make([]uint64, 400000)
for i := 0; i < len(balances); i++ {
balances[i] = rand.Uint64()
}
root1, err := stateutil.Uint64ListRootWithRegistryLimitShani(balances, zero_hash_array)
require.NoError(t, err)
root2, err := stateutil.Uint64ListRootWithRegistryLimit(balances)
require.NoError(t, err)
assert.DeepEqual(t, root1, root2)
}
*/
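The benchmarks above seed zero_hash_array with the accelerated two-chunk hashers, which ties even the setup to a specific instruction set. Building the same table with the standard library keeps setup portable (a sketch, not part of this change):

package main

import (
	"crypto/sha256"
	"fmt"
)

// zeroHashes builds the SSZ zero-hash table with the standard library:
// level 0 is the all-zero chunk, level i is SHA-256(level[i-1] || level[i-1]).
func zeroHashes(depth int) [][32]byte {
	zh := make([][32]byte, depth)
	for i := 1; i < depth; i++ {
		var buf [64]byte
		copy(buf[:32], zh[i-1][:])
		copy(buf[32:], zh[i-1][:])
		zh[i] = sha256.Sum256(buf[:])
	}
	return zh
}

func main() {
	zh := zeroHashes(40)
	fmt.Printf("%x\n", zh[1]) // f5a5fd42..., the hash of 64 zero bytes
}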

crypto/hash/yasm.bzl Normal file
View File

@@ -0,0 +1,110 @@
load("@rules_cc//cc:toolchain_utils.bzl", "find_cpp_toolchain")
def _obj_yasm(ctx, arch, opts, src):
yasm_bin = ctx.attr.yasm_bin
out = ctx.actions.declare_file(src.basename.replace(src.extension, "o"))
opts = arch + ['-o', out.path] + opts + [src.path]
inputs = []
for i in ctx.attr.srcs + ctx.attr.hdrs + ctx.attr.deps:
if hasattr(i, "files"):
inputs += i.files.to_list()
else:
inputs.append(i)
ctx.actions.run(
outputs = [out],
inputs = inputs,
arguments = opts,
executable = yasm_bin,
mnemonic = 'YasmCompile',
)
return out
def _library_yasm(ctx, mysrc):
output_file = ctx.actions.declare_file(ctx.label.name + ".a")
cc_toolchain = find_cpp_toolchain(ctx)
feature_configuration = cc_common.configure_features(
ctx = ctx,
cc_toolchain = cc_toolchain,
requested_features = ctx.features,
unsupported_features = ctx.disabled_features,
)
linker_input = cc_common.create_linker_input(
owner = ctx.label,
libraries = depset(direct = [
cc_common.create_library_to_link(
actions = ctx.actions,
static_library = output_file,
cc_toolchain = cc_toolchain,
feature_configuration = feature_configuration,
),
]),
)
compilation_context = cc_common.create_compilation_context()
linking_context = cc_common.create_linking_context(linker_inputs = depset(direct = [linker_input]))
ctx.actions.run(
executable = ctx.attr.ar_bin,
arguments = ['r', output_file.path] + [i.path for i in mysrc],
inputs = mysrc,
outputs = [output_file],
mnemonic = "Archiving",
)
return CcInfo(compilation_context = compilation_context, linking_context = linking_context)
def _yasm_library_impl(ctx):
opts = ctx.attr.copts
deps = [_obj_yasm(ctx, ctx.attr.yasm_arch, opts, src)
for target in ctx.attr.srcs for src in target.files.to_list()]
for i in ctx.attr.hdrs:
if hasattr(i, "files"):
deps += i.files.to_list()
else:
deps.append(i)
cc_info = _library_yasm(ctx, deps)
return [cc_info]
YASM_BIN_DEFAULT = "/usr/bin/yasm"
AR_BIN_DEFAULT = "/usr/bin/ar"
YASM_ARCH_OPTS = ["-f", "elf64", "-m", "amd64"]
_yasm_library = rule(
implementation=_yasm_library_impl,
attrs={
"srcs": attr.label_list(allow_files=True),
"hdrs": attr.label_list(allow_files=True),
"deps": attr.label_list(allow_files=True),
"copts": attr.string_list(),
"yasm_bin": attr.string(default=""),
"ar_bin": attr.string(default=""),
"yasm_arch": attr.string_list(),
"_cc_toolchain": attr.label(default = Label("@bazel_tools//tools/cpp:current_cc_toolchain")),
},
fragments = ["cpp"],
toolchains = ["@bazel_tools//tools/cpp:toolchain_type"],
)
def yasm_library(name, srcs, hdrs=[], deps=[], copts=[],
yasm_bin=YASM_BIN_DEFAULT, ar_bin=AR_BIN_DEFAULT):
_yasm_library(
name = name,
srcs = srcs,
hdrs = hdrs,
deps = deps,
copts = copts,
yasm_bin = yasm_bin,
ar_bin = ar_bin,
yasm_arch = YASM_ARCH_OPTS,
)