From c2af4bc4c227f6f34b648004972c99676333dc9e Mon Sep 17 00:00:00 2001 From: lukechampine Date: Thu, 30 Jul 2020 13:54:11 -0400 Subject: [PATCH] add AVX2 implementation --- README.md | 31 +- avo/gen.go | 239 ++++++++ blake3.go | 360 ++---------- blake3_amd64.s | 1311 +++++++++++++++++++++++++++++++++++++++++++ blake3_test.go | 25 +- compress_amd64.go | 76 +++ compress_generic.go | 150 +++++ compress_noasm.go | 64 +++ go.mod | 2 + go.sum | 2 + 10 files changed, 1945 insertions(+), 315 deletions(-) create mode 100644 avo/gen.go create mode 100644 blake3_amd64.s create mode 100644 compress_amd64.go create mode 100644 compress_generic.go create mode 100644 compress_noasm.go create mode 100644 go.sum diff --git a/README.md b/README.md index f426db0..c5147ae 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,29 @@ go get lukechampine.com/blake3 ``` `blake3` implements the [BLAKE3 cryptographic hash function](https://github.com/BLAKE3-team/BLAKE3). +This implementation aims to be performant without sacrificing (too much) +readability, in the hopes of eventually landing in `x/crypto`. -This implementation is a port of the Rust reference implementation, refactored -into more idiomatic Go style and with a handful of performance tweaks. -Performance is not great, not terrible. Eventually an assembly-optimized -implementation will be merged into `x/crypto`, and then you should switch to -that. In the meantime, you can use this package for code that needs BLAKE3 -compatibility and doesn't need to be blazing fast. +The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s. +There is a separate code path for small inputs (up to 64 bytes) that runs in +~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by +an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 chunks in parallel, +achieving throughput of ~2600 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it +will be possible to compress 16 chunks in parallel, which should roughly double +throughput for sufficiently large inputs. + +Contributions are greatly appreciated. +[All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134) + + +## Benchmarks + +Tested on an i5-7600K @ 3.80GHz. + +``` +BenchmarkSum256/64 105 ns/op 609.51 MB/s +BenchmarkSum256/1024 1778 ns/op 576.00 MB/s +BenchmarkSum256/65536 24785 ns/op 2644.15 MB/s +BenchmarkWrite 389 ns/op 2631.78 MB/s +BenchmarkXOF 1591 ns/op 643.80 MB/s +``` diff --git a/avo/gen.go b/avo/gen.go new file mode 100644 index 0000000..482f6c7 --- /dev/null +++ b/avo/gen.go @@ -0,0 +1,239 @@ +// +build ignore + +package main + +import ( + "fmt" + + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . 
"github.com/mmcloughlin/avo/reg" +) + +func main() { + genGlobals() + genCompressChunksAVX2() + + Generate() +} + +var globals struct { + iv Mem + blockLen Mem + stride1024 Mem + incrementCounter Mem + setFlags Mem + shuffleRot8 Mem + shuffleRot16 Mem +} + +func genGlobals() { + globals.iv = GLOBL("iv", RODATA|NOPTR) + DATA(0*4, U32(0x6A09E667)) + DATA(1*4, U32(0xBB67AE85)) + DATA(2*4, U32(0x3C6EF372)) + DATA(3*4, U32(0xA54FF53A)) + + globals.blockLen = GLOBL("block_len", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(64)) + } + + globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(i*1024)) + } + globals.incrementCounter = GLOBL("increment_counter", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*8, U64(i)) + } + globals.setFlags = GLOBL("set_flags", RODATA|NOPTR) + for i := 0; i < 16; i++ { + if i == 0 { + DATA(i*4, U32(1)) + } else if i == 15 { + DATA(i*4, U32(2)) + } else { + DATA(i*4, U32(0)) + } + } + globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(0x00030201+0x04040404*i)) + } + globals.shuffleRot16 = GLOBL("shuffle_rot16", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(0x01000302+0x04040404*i)) + } +} + +func genCompressChunksAVX2() { + TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)") + cvs := Mem{Base: Load(Param("cvs"), GP64())} + buf := Mem{Base: Load(Param("buf"), GP64())} + key := Mem{Base: Load(Param("key"), GP64())} + counter, _ := Param("counter").Resolve() + flags, _ := Param("flags").Resolve() + + vs := [16]VecVirtual{ + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + } + // stack space for transposed message vectors + var mv [16]Mem + for i := range mv { + mv[i] = AllocLocal(32) + } + // stack space for spilled vs[8] register + spillMem := AllocLocal(32) + + Comment("Load key") + for i := 0; i < 8; i++ { + VPBROADCASTD(key.Offset(i*4), vs[i]) + } + + Comment("Initialize counter") + counterLo := AllocLocal(32) + counterHi := AllocLocal(32) + VPBROADCASTQ(counter.Addr, vs[12]) + VPBROADCASTQ(counter.Addr, vs[13]) + VPADDQ(globals.incrementCounter.Offset(0*32), vs[12], vs[12]) + VPADDQ(globals.incrementCounter.Offset(1*32), vs[13], vs[13]) + VPUNPCKLDQ(vs[13], vs[12], vs[14]) + VPUNPCKHDQ(vs[13], vs[12], vs[15]) + VPUNPCKLDQ(vs[15], vs[14], vs[12]) + VPUNPCKHDQ(vs[15], vs[14], vs[13]) + VPERMQ(Imm(0xd8), vs[12], vs[12]) + VPERMQ(Imm(0xd8), vs[13], vs[13]) + VMOVDQU(vs[12], counterLo) + VMOVDQU(vs[13], counterHi) + + Comment("Initialize flags") + chunkFlags := AllocLocal(16 * 4) + VPBROADCASTD(flags.Addr, vs[14]) + VPOR(globals.setFlags.Offset(0*32), vs[14], vs[15]) + VMOVDQU(vs[15], chunkFlags.Offset(0*32)) + VPOR(globals.setFlags.Offset(1*32), vs[14], vs[15]) + VMOVDQU(vs[15], chunkFlags.Offset(1*32)) + + Comment("Loop index") + loop := GP64() + XORQ(loop, loop) + Label("loop") + + Comment("Load transposed block") + VMOVDQU(globals.stride1024, vs[9]) + for i := 0; i < 16; i++ { + VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 + VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10]) + VMOVDQU(vs[10], mv[i]) + } + ADDQ(Imm(64), buf.Base) + + Comment("Reload state vectors (other than CVs)") + for i := 0; i < 4; i++ { + VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i]) + } + VMOVDQU(counterLo, vs[12]) + VMOVDQU(counterHi, vs[13]) + VMOVDQU(globals.blockLen, vs[14]) + 
VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) + + VMOVDQU(vs[8], spillMem) // spill + for i := 0; i < 7; i++ { + Comment(fmt.Sprintf("Round %v", i+1)) + round(vs, mv, vs[8], spillMem) + // permute + mv = [16]Mem{ + mv[2], mv[6], mv[3], mv[10], + mv[7], mv[0], mv[4], mv[13], + mv[1], mv[11], mv[12], mv[5], + mv[9], mv[14], mv[15], mv[8], + } + } + + Comment("Finalize CVs") + VMOVDQU(spillMem, vs[8]) // reload + for i := range vs[:8] { + VPXOR(vs[i], vs[i+8], vs[i]) + } + + Comment("Loop") + INCQ(loop) + CMPQ(loop, U32(16)) + JNE(LabelRef("loop")) + + Comment("Finished; transpose CVs") + src, dst := vs[:8], vs[8:] + // interleave uint32s + for i := 0; i < 8; i += 2 { + VPUNPCKLDQ(src[i+1], src[i], dst[i+0]) + VPUNPCKHDQ(src[i+1], src[i], dst[i+1]) + } + // interleave groups of two uint32s + for i := 0; i < 4; i++ { + j := i*2 - i%2 // j := 0,1,4,5 + VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0]) + VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1]) + } + // interleave groups of four uint32s + for i := 0; i < 4; i++ { + VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0]) + VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) + } + for i, v := range dst { + VMOVDQU(v, cvs.Offset(i*32)) + } + + RET() +} + +func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) { + g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem) + g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem) + g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem) + g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7], tmp, spillMem) + g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9], tmp, spillMem) + g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem) + g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem) + g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem) +} + +func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) { + // Helper function for performing rotations. Also manages c, tmp and + // spillMem: if c == tmp, we need to spill and reload c using spillMem. 
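+	// Rotations by 8 and 16 bits are byte-aligned, so a single VPSHUFB with
+	// the appropriate byte-shuffle mask suffices; rotations by 7 and 12 bits
+	// fall back to the usual shift/shift/or sequence, which needs tmp as
+	// scratch space.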
+ rotr := func(v VecVirtual, n uint64, dst VecVirtual) { + switch n { + case 8, 16: + shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n] + VPSHUFB(shuf, v, dst) + if c == tmp { + VMOVDQU(spillMem, c) + } + case 7, 12: + if c == tmp { + VMOVDQU(c, spillMem) + } + VPSRLD(Imm(n), v, tmp) + VPSLLD(Imm(32-n), v, dst) + VPOR(dst, tmp, dst) + } + } + + VPADDD(a, b, a) + VPADDD(mx, a, a) + VPXOR(d, a, d) + rotr(d, 16, d) + VPADDD(c, d, c) + VPXOR(b, c, b) + rotr(b, 12, b) + VPADDD(a, b, a) + VPADDD(my, a, a) + VPXOR(d, a, d) + rotr(d, 8, d) + VPADDD(c, d, c) + VPXOR(b, c, b) + rotr(b, 7, b) +} diff --git a/blake3.go b/blake3.go index c6609fb..d06112f 100644 --- a/blake3.go +++ b/blake3.go @@ -10,12 +10,6 @@ import ( "math/bits" ) -const ( - blockSize = 64 - chunkSize = 1024 -) - -// flags const ( flagChunkStart = 1 << iota flagChunkEnd @@ -24,6 +18,9 @@ const ( flagKeyedHash flagDeriveKeyContext flagDeriveKeyMaterial + + blockSize = 64 + chunkSize = 1024 ) var iv = [8]uint32{ @@ -31,332 +28,82 @@ var iv = [8]uint32{ 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, } -// helper functions for converting between bytes and BLAKE3 "words" - -func bytesToWords(bytes [64]byte, words *[16]uint32) { - words[0] = binary.LittleEndian.Uint32(bytes[0:]) - words[1] = binary.LittleEndian.Uint32(bytes[4:]) - words[2] = binary.LittleEndian.Uint32(bytes[8:]) - words[3] = binary.LittleEndian.Uint32(bytes[12:]) - words[4] = binary.LittleEndian.Uint32(bytes[16:]) - words[5] = binary.LittleEndian.Uint32(bytes[20:]) - words[6] = binary.LittleEndian.Uint32(bytes[24:]) - words[7] = binary.LittleEndian.Uint32(bytes[28:]) - words[8] = binary.LittleEndian.Uint32(bytes[32:]) - words[9] = binary.LittleEndian.Uint32(bytes[36:]) - words[10] = binary.LittleEndian.Uint32(bytes[40:]) - words[11] = binary.LittleEndian.Uint32(bytes[44:]) - words[12] = binary.LittleEndian.Uint32(bytes[48:]) - words[13] = binary.LittleEndian.Uint32(bytes[52:]) - words[14] = binary.LittleEndian.Uint32(bytes[56:]) - words[15] = binary.LittleEndian.Uint32(bytes[60:]) -} - -func wordsToBytes(words [16]uint32, block *[64]byte) { - binary.LittleEndian.PutUint32(block[0:], words[0]) - binary.LittleEndian.PutUint32(block[4:], words[1]) - binary.LittleEndian.PutUint32(block[8:], words[2]) - binary.LittleEndian.PutUint32(block[12:], words[3]) - binary.LittleEndian.PutUint32(block[16:], words[4]) - binary.LittleEndian.PutUint32(block[20:], words[5]) - binary.LittleEndian.PutUint32(block[24:], words[6]) - binary.LittleEndian.PutUint32(block[28:], words[7]) - binary.LittleEndian.PutUint32(block[32:], words[8]) - binary.LittleEndian.PutUint32(block[36:], words[9]) - binary.LittleEndian.PutUint32(block[40:], words[10]) - binary.LittleEndian.PutUint32(block[44:], words[11]) - binary.LittleEndian.PutUint32(block[48:], words[12]) - binary.LittleEndian.PutUint32(block[52:], words[13]) - binary.LittleEndian.PutUint32(block[56:], words[14]) - binary.LittleEndian.PutUint32(block[60:], words[15]) -} - -func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { - a += b + mx - d = bits.RotateLeft32(d^a, -16) - c += d - b = bits.RotateLeft32(b^c, -12) - a += b + my - d = bits.RotateLeft32(d^a, -8) - c += d - b = bits.RotateLeft32(b^c, -7) - return a, b, c, d -} - -// A node represents a chunk or parent in the BLAKE3 Merkle tree. In BLAKE3 -// terminology, the elements of the bottom layer (aka "leaves") of the tree are -// called chunk nodes, and the elements of upper layers (aka "interior nodes") -// are called parent nodes. 
-// -// Computing a BLAKE3 hash involves splitting the input into chunk nodes, then -// repeatedly merging these nodes into parent nodes, until only a single "root" -// node remains. The root node can then be used to generate up to 2^64 - 1 bytes -// of pseudorandom output. +// A node represents a chunk or parent in the BLAKE3 Merkle tree. type node struct { - // the chaining value from the previous state - cv [8]uint32 - // the current state + cv [8]uint32 // chaining value from previous node block [16]uint32 counter uint64 blockLen uint32 flags uint32 } -// compress is the core hash function, generating 16 pseudorandom words from a -// node. -func (n node) compress() [16]uint32 { - // NOTE: we unroll all of the rounds, as well as the permutations that occur - // between rounds. - - // round 1 (also initializes state) - // columns - s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) - s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) - s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) - s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) - // diagonals - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15]) - - // round 2 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8]) - - // round 3 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1]) - - // round 4 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6]) - - // round 5 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1]) - s3, s4, 
s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4]) - - // round 6 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7]) - - // round 7 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13]) - - // finalization - return [16]uint32{ - s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11, - s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15, - s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3], - s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7], - } -} - -// chainingValue returns the first 8 words of the compressed node. This is used -// in two places. First, when a chunk node is being constructed, its cv is -// overwritten with this value after each block of input is processed. Second, -// when two nodes are merged into a parent, each of their chaining values -// supplies half of the new node's block. -func (n node) chainingValue() (cv [8]uint32) { - full := n.compress() - copy(cv[:], full[:8]) - return -} - -// chunkState manages the state involved in hashing a single chunk of input. -type chunkState struct { - n node - block [blockSize]byte - blockLen int - bytesConsumed int -} - -// chunkCounter is the index of this chunk, i.e. the number of chunks that have -// been processed prior to this one. -func (cs *chunkState) chunkCounter() uint64 { - return cs.n.counter -} - -func (cs *chunkState) complete() bool { - return cs.bytesConsumed == chunkSize -} - -// update incorporates input into the chunkState. -func (cs *chunkState) update(input []byte) { - for len(input) > 0 { - // If the block buffer is full, compress it and clear it. More - // input is coming, so this compression is not flagChunkEnd. - if cs.blockLen == blockSize { - // copy the chunk block (bytes) into the node block and chain it. - bytesToWords(cs.block, &cs.n.block) - cs.n.cv = cs.n.chainingValue() - // clear the start flag for all but the first block - cs.n.flags &^= flagChunkStart - cs.blockLen = 0 - } - - // Copy input bytes into the chunk block. - n := copy(cs.block[cs.blockLen:], input) - cs.blockLen += n - cs.bytesConsumed += n - input = input[n:] - } -} - -// compiles to memclr -func clear(b []byte) { - for i := range b { - b[i] = 0 - } -} - -// node returns a node containing the chunkState's current state, with the -// ChunkEnd flag set. 
-func (cs *chunkState) node() node { - n := cs.n - // pad the remaining space in the block with zeros - clear(cs.block[cs.blockLen:]) - bytesToWords(cs.block, &n.block) - n.blockLen = uint32(cs.blockLen) - n.flags |= flagChunkEnd - return n -} - -func newChunkState(iv [8]uint32, chunkCounter uint64, flags uint32) chunkState { - return chunkState{ - n: node{ - cv: iv, - counter: chunkCounter, - blockLen: blockSize, - // compress the first block with the start flag set - flags: flags | flagChunkStart, - }, - } -} - // parentNode returns a node that incorporates the chaining values of two child // nodes. func parentNode(left, right [8]uint32, key [8]uint32, flags uint32) node { - var blockWords [16]uint32 - copy(blockWords[:8], left[:]) - copy(blockWords[8:], right[:]) - return node{ + n := node{ cv: key, - block: blockWords, counter: 0, // counter is reset for parents - blockLen: blockSize, // block is full: 8 words from left, 8 from right + blockLen: blockSize, // block is full flags: flags | flagParent, } + copy(n.block[:8], left[:]) + copy(n.block[8:], right[:]) + return n } // Hasher implements hash.Hash. type Hasher struct { - cs chunkState key [8]uint32 flags uint32 size int // output size, for Sum // log(n) set of Merkle subtree roots, at most one per height. - stack [54][8]uint32 // 2^54 * chunkSize = 2^64 - used uint64 // bit vector indicating which stack elems are valid; also number of chunks added + stack [51][8]uint32 // 2^51 * 8 * chunkSize = 2^64 + counter uint64 // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied + + buf [8 * chunkSize]byte + buflen int } func (h *Hasher) hasSubtreeAtHeight(i int) bool { - return h.used&(1< 0 { - // If the current chunk is complete, finalize it and add it to the tree, - // then reset the chunk state (but keep incrementing the counter across - // chunks). - if h.cs.complete() { - cv := h.cs.node().chainingValue() - h.addChunkChainingValue(cv) - h.cs = newChunkState(h.key, h.cs.chunkCounter()+1, h.flags) + if h.buflen == len(h.buf) { + n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags) + h.pushSubtree(chainingValue(n)) + h.buflen = 0 } - - // Compress input bytes into the current chunk state. - n := chunkSize - h.cs.bytesConsumed - if n > len(p) { - n = len(p) - } - h.cs.update(p[:n]) + n := copy(h.buf[h.buflen:], p) + h.buflen += n p = p[n:] } return lenp, nil @@ -377,6 +124,18 @@ func (h *Hasher) Sum(b []byte) (sum []byte) { return } +// Reset implements hash.Hash. +func (h *Hasher) Reset() { + h.counter = 0 + h.buflen = 0 +} + +// BlockSize implements hash.Hash. +func (h *Hasher) BlockSize() int { return 64 } + +// Size implements hash.Hash. +func (h *Hasher) Size() int { return h.size } + // XOF returns an OutputReader initialized with the current hash state. func (h *Hasher) XOF() *OutputReader { return &OutputReader{ @@ -386,7 +145,6 @@ func (h *Hasher) XOF() *OutputReader { func newHasher(key [8]uint32, flags uint32, size int) *Hasher { return &Hasher{ - cs: newChunkState(key, 0, flags), key: key, flags: flags, size: size, @@ -394,7 +152,7 @@ func newHasher(key [8]uint32, flags uint32, size int) *Hasher { } // New returns a Hasher for the specified size and key. If key is nil, the hash -// is unkeyed. +// is unkeyed. Otherwise, len(key) must be 32. 
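+// For example, New(32, nil) returns an unkeyed Hasher whose Sum output
+// matches Sum256.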
func New(size int, key []byte) *Hasher { if key == nil { return newHasher(iv, 0, size) @@ -408,21 +166,30 @@ func New(size int, key []byte) *Hasher { // Sum256 and Sum512 always use the same hasher state, so we can save some time // when hashing small inputs by constructing the hasher ahead of time. -var defaultHasher = newHasher(iv, 0, 0) +var defaultHasher = New(0, nil) // Sum256 returns the unkeyed BLAKE3 hash of b, truncated to 256 bits. func Sum256(b []byte) (out [32]byte) { - h := *defaultHasher - h.Write(b) - h.XOF().Read(out[:]) + out512 := Sum512(b) + copy(out[:], out512[:]) return } // Sum512 returns the unkeyed BLAKE3 hash of b, truncated to 512 bits. func Sum512(b []byte) (out [64]byte) { - h := *defaultHasher - h.Write(b) - h.XOF().Read(out[:]) + var n node + if len(b) <= blockSize { + hashBlock(&out, b) + return + } else if len(b) <= chunkSize { + n = compressChunk(b, &iv, 0, 0) + n.flags |= flagRoot + } else { + h := *defaultHasher + h.Write(b) + n = h.rootNode() + } + wordsToBytes(compressNode(n), &out) return } @@ -473,10 +240,8 @@ func (or *OutputReader) Read(p []byte) (int, error) { for len(p) > 0 { if or.off%blockSize == 0 { or.n.counter = or.off / blockSize - words := or.n.compress() - wordsToBytes(words, &or.block) + wordsToBytes(compressNode(or.n), &or.block) } - n := copy(p, or.block[or.off%blockSize:]) p = p[n:] or.off += uint64(n) @@ -510,8 +275,7 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) { or.off = off or.n.counter = uint64(off) / blockSize if or.off%blockSize != 0 { - words := or.n.compress() - wordsToBytes(words, &or.block) + wordsToBytes(compressNode(or.n), &or.block) } // NOTE: or.off >= 2^63 will result in a negative return value. // Nothing we can do about this. diff --git a/blake3_amd64.s b/blake3_amd64.s new file mode 100644 index 0000000..f467663 --- /dev/null +++ b/blake3_amd64.s @@ -0,0 +1,1311 @@ +// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT. 
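+
+// Register and stack layout (established in avo/gen.go):
+//   Y0-Y7        transposed chaining values for the 8 chunks
+//   Y8-Y15       remaining compression state (Y8 is spilled to 512(SP))
+//   0-480(SP)    transposed message vectors for the current block
+//   544/576(SP)  low/high dwords of the per-chunk counters
+//   608(SP)      flags for each of the 16 blocks in a chunk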
+ +#include "textflag.h" + +DATA iv<>+0(SB)/4, $0x6a09e667 +DATA iv<>+4(SB)/4, $0xbb67ae85 +DATA iv<>+8(SB)/4, $0x3c6ef372 +DATA iv<>+12(SB)/4, $0xa54ff53a +GLOBL iv<>(SB), RODATA|NOPTR, $16 + +DATA block_len<>+0(SB)/4, $0x00000040 +DATA block_len<>+4(SB)/4, $0x00000040 +DATA block_len<>+8(SB)/4, $0x00000040 +DATA block_len<>+12(SB)/4, $0x00000040 +DATA block_len<>+16(SB)/4, $0x00000040 +DATA block_len<>+20(SB)/4, $0x00000040 +DATA block_len<>+24(SB)/4, $0x00000040 +DATA block_len<>+28(SB)/4, $0x00000040 +GLOBL block_len<>(SB), RODATA|NOPTR, $32 + +DATA stride_1024<>+0(SB)/4, $0x00000000 +DATA stride_1024<>+4(SB)/4, $0x00000400 +DATA stride_1024<>+8(SB)/4, $0x00000800 +DATA stride_1024<>+12(SB)/4, $0x00000c00 +DATA stride_1024<>+16(SB)/4, $0x00001000 +DATA stride_1024<>+20(SB)/4, $0x00001400 +DATA stride_1024<>+24(SB)/4, $0x00001800 +DATA stride_1024<>+28(SB)/4, $0x00001c00 +GLOBL stride_1024<>(SB), RODATA|NOPTR, $32 + +DATA increment_counter<>+0(SB)/8, $0x0000000000000000 +DATA increment_counter<>+8(SB)/8, $0x0000000000000001 +DATA increment_counter<>+16(SB)/8, $0x0000000000000002 +DATA increment_counter<>+24(SB)/8, $0x0000000000000003 +DATA increment_counter<>+32(SB)/8, $0x0000000000000004 +DATA increment_counter<>+40(SB)/8, $0x0000000000000005 +DATA increment_counter<>+48(SB)/8, $0x0000000000000006 +DATA increment_counter<>+56(SB)/8, $0x0000000000000007 +GLOBL increment_counter<>(SB), RODATA|NOPTR, $64 + +DATA set_flags<>+0(SB)/4, $0x00000001 +DATA set_flags<>+4(SB)/4, $0x00000000 +DATA set_flags<>+8(SB)/4, $0x00000000 +DATA set_flags<>+12(SB)/4, $0x00000000 +DATA set_flags<>+16(SB)/4, $0x00000000 +DATA set_flags<>+20(SB)/4, $0x00000000 +DATA set_flags<>+24(SB)/4, $0x00000000 +DATA set_flags<>+28(SB)/4, $0x00000000 +DATA set_flags<>+32(SB)/4, $0x00000000 +DATA set_flags<>+36(SB)/4, $0x00000000 +DATA set_flags<>+40(SB)/4, $0x00000000 +DATA set_flags<>+44(SB)/4, $0x00000000 +DATA set_flags<>+48(SB)/4, $0x00000000 +DATA set_flags<>+52(SB)/4, $0x00000000 +DATA set_flags<>+56(SB)/4, $0x00000000 +DATA set_flags<>+60(SB)/4, $0x00000002 +GLOBL set_flags<>(SB), RODATA|NOPTR, $64 + +DATA shuffle_rot8<>+0(SB)/4, $0x00030201 +DATA shuffle_rot8<>+4(SB)/4, $0x04070605 +DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09 +DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d +DATA shuffle_rot8<>+16(SB)/4, $0x10131211 +DATA shuffle_rot8<>+20(SB)/4, $0x14171615 +DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19 +DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d +GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32 + +DATA shuffle_rot16<>+0(SB)/4, $0x01000302 +DATA shuffle_rot16<>+4(SB)/4, $0x05040706 +DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a +DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e +DATA shuffle_rot16<>+16(SB)/4, $0x11101312 +DATA shuffle_rot16<>+20(SB)/4, $0x15141716 +DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a +DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e +GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32 + +// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) +// Requires: AVX, AVX2 +TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40 + MOVQ cvs+0(FP), AX + MOVQ buf+8(FP), CX + MOVQ key+16(FP), DX + + // Load key + VPBROADCASTD (DX), Y0 + VPBROADCASTD 4(DX), Y1 + VPBROADCASTD 8(DX), Y2 + VPBROADCASTD 12(DX), Y3 + VPBROADCASTD 16(DX), Y4 + VPBROADCASTD 20(DX), Y5 + VPBROADCASTD 24(DX), Y6 + VPBROADCASTD 28(DX), Y7 + + // Initialize counter + VPBROADCASTQ counter+24(FP), Y12 + VPBROADCASTQ counter+24(FP), Y13 + VPADDQ increment_counter<>+0(SB), Y12, Y12 + VPADDQ increment_counter<>+32(SB), Y13, Y13 + 
VPUNPCKLDQ Y13, Y12, Y14 + VPUNPCKHDQ Y13, Y12, Y15 + VPUNPCKLDQ Y15, Y14, Y12 + VPUNPCKHDQ Y15, Y14, Y13 + VPERMQ $0xd8, Y12, Y12 + VPERMQ $0xd8, Y13, Y13 + VMOVDQU Y12, 544(SP) + VMOVDQU Y13, 576(SP) + + // Initialize flags + VPBROADCASTD flags+32(FP), Y14 + VPOR set_flags<>+0(SB), Y14, Y15 + VMOVDQU Y15, 608(SP) + VPOR set_flags<>+32(SB), Y14, Y15 + VMOVDQU Y15, 640(SP) + + // Loop index + XORQ DX, DX + +loop: + // Load transposed block + VMOVDQU stride_1024<>+0(SB), Y9 + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, (CX)(Y9*1), Y10 + VMOVDQU Y10, (SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 4(CX)(Y9*1), Y10 + VMOVDQU Y10, 32(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 8(CX)(Y9*1), Y10 + VMOVDQU Y10, 64(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 12(CX)(Y9*1), Y10 + VMOVDQU Y10, 96(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 16(CX)(Y9*1), Y10 + VMOVDQU Y10, 128(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 20(CX)(Y9*1), Y10 + VMOVDQU Y10, 160(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 24(CX)(Y9*1), Y10 + VMOVDQU Y10, 192(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 28(CX)(Y9*1), Y10 + VMOVDQU Y10, 224(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 32(CX)(Y9*1), Y10 + VMOVDQU Y10, 256(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 36(CX)(Y9*1), Y10 + VMOVDQU Y10, 288(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 40(CX)(Y9*1), Y10 + VMOVDQU Y10, 320(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 44(CX)(Y9*1), Y10 + VMOVDQU Y10, 352(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 48(CX)(Y9*1), Y10 + VMOVDQU Y10, 384(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 52(CX)(Y9*1), Y10 + VMOVDQU Y10, 416(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 56(CX)(Y9*1), Y10 + VMOVDQU Y10, 448(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 60(CX)(Y9*1), Y10 + VMOVDQU Y10, 480(SP) + ADDQ $0x40, CX + + // Reload state vectors (other than CVs) + VPBROADCASTD iv<>+0(SB), Y8 + VPBROADCASTD iv<>+4(SB), Y9 + VPBROADCASTD iv<>+8(SB), Y10 + VPBROADCASTD iv<>+12(SB), Y11 + VMOVDQU 544(SP), Y12 + VMOVDQU 576(SP), Y13 + VMOVDQU block_len<>+0(SB), Y14 + VPBROADCASTD 608(SP)(DX*4), Y15 + VMOVDQU Y8, 512(SP) + + // Round 1 + VPADDD Y0, Y4, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + 
VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 256(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 2 + VPADDD Y0, Y4, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 224(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + 
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 3 + VPADDD Y0, Y4, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + 
VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 160(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 352(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 480(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 4 + VPADDD Y0, Y4, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 
+ VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 5 + VPADDD Y0, Y4, Y0 + VPADDD 384(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 480(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 320(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + 
VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 6 + VPADDD Y0, Y4, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + 
VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 192(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 7 + VPADDD Y0, Y4, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 480(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + 
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Finalize CVs + VMOVDQU 512(SP), Y8 + VPXOR Y0, Y8, Y0 + VPXOR Y1, Y9, Y1 + VPXOR Y2, Y10, Y2 + VPXOR Y3, Y11, Y3 + VPXOR Y4, Y12, Y4 + VPXOR Y5, Y13, Y5 + VPXOR Y6, Y14, Y6 + VPXOR Y7, Y15, Y7 + + // Loop + INCQ DX + CMPQ DX, $0x00000010 + JNE loop + + // Finished; transpose CVs + VPUNPCKLDQ Y1, Y0, Y8 + VPUNPCKHDQ Y1, Y0, Y9 + VPUNPCKLDQ Y3, Y2, Y10 + VPUNPCKHDQ Y3, Y2, Y11 + VPUNPCKLDQ Y5, Y4, Y12 + VPUNPCKHDQ Y5, Y4, Y13 + VPUNPCKLDQ Y7, Y6, Y14 + VPUNPCKHDQ Y7, Y6, Y15 + VPUNPCKLQDQ Y10, Y8, Y0 + VPUNPCKHQDQ Y10, Y8, Y1 + VPUNPCKLQDQ Y11, Y9, Y2 + VPUNPCKHQDQ Y11, Y9, Y3 + VPUNPCKLQDQ Y14, Y12, Y4 + VPUNPCKHQDQ Y14, Y12, Y5 + VPUNPCKLQDQ Y15, Y13, Y6 + VPUNPCKHQDQ Y15, Y13, Y7 + VPERM2I128 $0x20, Y4, Y0, Y8 + VPERM2I128 $0x31, Y4, Y0, Y12 + VPERM2I128 $0x20, Y5, Y1, Y9 + VPERM2I128 $0x31, Y5, Y1, Y13 + VPERM2I128 $0x20, Y6, Y2, Y10 + VPERM2I128 $0x31, Y6, Y2, Y14 + VPERM2I128 $0x20, Y7, Y3, Y11 + VPERM2I128 $0x31, Y7, Y3, Y15 + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) + VMOVDQU Y10, 64(AX) + VMOVDQU Y11, 96(AX) + VMOVDQU Y12, 128(AX) + VMOVDQU Y13, 160(AX) + VMOVDQU Y14, 192(AX) + VMOVDQU Y15, 224(AX) + RET diff --git a/blake3_test.go b/blake3_test.go index 
fdf0ed7..7bfe6af 100644 --- a/blake3_test.go +++ b/blake3_test.go @@ -63,7 +63,7 @@ func TestVectors(t *testing.T) { subKey := make([]byte, len(vec.DeriveKey)/2) blake3.DeriveKey(subKey, ctx, in) if out := toHex(subKey); out != vec.DeriveKey { - t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], subKey[:10]) + t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], out[:10]) } } } @@ -150,7 +150,7 @@ func TestSum(t *testing.T) { h.Write(in) h.Sum(exp256[:0]) if got256 := blake3.Sum256(in); exp256 != got256 { - t.Errorf("Sum256 output did not match Sum output:\n\texpected: %v...\n\t got: %v...", exp256[:10], got256[:10]) + t.Errorf("Sum256 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp256[:5], got256[:5]) } var exp512 [64]byte @@ -158,7 +158,7 @@ func TestSum(t *testing.T) { h.Write(in) h.Sum(exp512[:0]) if got512 := blake3.Sum512(in); exp512 != got512 { - t.Errorf("Sum512 output did not match Sum output:\n\texpected: %v...\n\t got: %v...", exp512[:10], got512[:10]) + t.Errorf("Sum512 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp512[:5], got512[:5]) } } } @@ -190,13 +190,20 @@ func (nopReader) Read(p []byte) (int, error) { return len(p), nil } func BenchmarkWrite(b *testing.B) { b.ReportAllocs() - b.SetBytes(1) - io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N)) + b.SetBytes(1024) + io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N*1024)) +} + +func BenchmarkXOF(b *testing.B) { + b.ReportAllocs() + b.SetBytes(1024) + io.CopyN(ioutil.Discard, blake3.New(0, nil).XOF(), int64(b.N*1024)) } func BenchmarkSum256(b *testing.B) { b.Run("64", func(b *testing.B) { b.ReportAllocs() + b.SetBytes(64) buf := make([]byte, 64) for i := 0; i < b.N; i++ { blake3.Sum256(buf) @@ -204,6 +211,7 @@ func BenchmarkSum256(b *testing.B) { }) b.Run("1024", func(b *testing.B) { b.ReportAllocs() + b.SetBytes(1024) buf := make([]byte, 1024) for i := 0; i < b.N; i++ { blake3.Sum256(buf) @@ -211,15 +219,10 @@ func BenchmarkSum256(b *testing.B) { }) b.Run("65536", func(b *testing.B) { b.ReportAllocs() + b.SetBytes(65536) buf := make([]byte, 65536) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) } - -func BenchmarkXOF(b *testing.B) { - b.ReportAllocs() - b.SetBytes(1) - io.CopyN(ioutil.Discard, blake3.New(0, nil).XOF(), int64(b.N)) -} diff --git a/compress_amd64.go b/compress_amd64.go new file mode 100644 index 0000000..cfe414b --- /dev/null +++ b/compress_amd64.go @@ -0,0 +1,76 @@ +package blake3 + +import ( + "unsafe" + + "golang.org/x/sys/cpu" +) + +//go:generate go run avo/gen.go -out blake3_amd64.s + +//go:noescape +func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) + +func compressNode(n node) (out [16]uint32) { + compressNodeGeneric(&out, n) + return +} + +func compressBufferLarge(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { + var cvs [8][8]uint32 + compressChunksAVX2(&cvs, buf, key, counter, flags) + numChunks := uint64(buflen / chunkSize) + if buflen%chunkSize != 0 { + // use non-asm for remainder + partialChunk := buf[buflen-buflen%chunkSize : buflen] + cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags)) + numChunks++ + } + return mergeSubtrees(cvs[:numChunks], key, flags) +} + +func compressBuffer(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { + switch { + case cpu.X86.HasAVX2 && buflen >= 
chunkSize*2: + return compressBufferLarge(buf, buflen, key, counter, flags) + default: + return compressBufferGeneric(buf, buflen, key, counter, flags) + } +} + +func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node { + n := node{ + cv: *key, + counter: counter, + blockLen: blockSize, + flags: flags | flagChunkStart, + } + blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:] + for len(chunk) > blockSize { + copy(blockBytes, chunk) + chunk = chunk[blockSize:] + n.cv = chainingValue(n) + n.flags &^= flagChunkStart + } + // pad last block with zeros + n.block = [16]uint32{} + copy(blockBytes, chunk) + n.blockLen = uint32(len(chunk)) + n.flags |= flagChunkEnd + return n +} + +func wordsToBytes(words [16]uint32, block *[64]byte) { + *block = *(*[64]byte)(unsafe.Pointer(&words)) +} + +func hashBlock(out *[64]byte, buf []byte) { + var block [16]uint32 + copy((*[64]byte)(unsafe.Pointer(&block))[:], buf) + compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{ + cv: iv, + block: block, + blockLen: uint32(len(buf)), + flags: flagChunkStart | flagChunkEnd | flagRoot, + }) +} diff --git a/compress_generic.go b/compress_generic.go new file mode 100644 index 0000000..0b4dca6 --- /dev/null +++ b/compress_generic.go @@ -0,0 +1,150 @@ +package blake3 + +import ( + "bytes" + "math/bits" +) + +func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { + a += b + mx + d = bits.RotateLeft32(d^a, -16) + c += d + b = bits.RotateLeft32(b^c, -12) + a += b + my + d = bits.RotateLeft32(d^a, -8) + c += d + b = bits.RotateLeft32(b^c, -7) + return a, b, c, d +} + +func compressNodeGeneric(out *[16]uint32, n node) { + // NOTE: we unroll all of the rounds, as well as the permutations that occur + // between rounds. + + // round 1 (also initializes state) + // columns + s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) + s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) + s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) + s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) + // diagonals + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15]) + + // round 2 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8]) + + // round 3 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1]) + + // round 4 + s0, s4, s8, s12 = g(s0, 
s4, s8, s12, n.block[10], n.block[7]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6]) + + // round 5 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4]) + + // round 6 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7]) + + // round 7 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13]) + + // finalization + *out = [16]uint32{ + s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11, + s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15, + s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3], + s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7], + } +} + +func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) { + if buflen <= chunkSize { + return compressChunk(buf[:buflen], key, counter, flags) + } + cvs := make([][8]uint32, 0, 8) + for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; { + n := compressChunk(bb.Next(chunkSize), key, counter, flags) + cvs = append(cvs, chainingValue(n)) + counter++ + } + return mergeSubtrees(cvs, key, flags) +} + +func chainingValue(n node) (cv [8]uint32) { + full := compressNode(n) + copy(cv[:], full[:]) + return +} + +func mergeSubtrees(cvs [][8]uint32, key *[8]uint32, flags uint32) node { + parent := func(l, r [8]uint32) [8]uint32 { + return chainingValue(parentNode(l, r, *key, flags)) + } + switch len(cvs) { + case 8: + cvs[6] = parent(cvs[6], cvs[7]) + fallthrough + case 7: + cvs[4], cvs[5] = parent(cvs[4], cvs[5]), cvs[6] + fallthrough + case 6: + cvs[4] = parent(cvs[4], cvs[5]) + fallthrough + case 5: + fallthrough + case 4: + cvs[2] = parent(cvs[2], cvs[3]) + fallthrough + case 3: + cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[2] + } + if len(cvs) > 4 { + cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[4] + } + return parentNode(cvs[0], cvs[1], 
*key, flags) +} diff --git a/compress_noasm.go b/compress_noasm.go new file mode 100644 index 0000000..847a519 --- /dev/null +++ b/compress_noasm.go @@ -0,0 +1,64 @@ +// +build !amd64 + +package blake3 + +import "encoding/binary" + +func compressNode(n node) (out [16]uint32) { + compressNodeGeneric(&out, n) + return +} + +func compressBuffer(buf *[8192]byte, length int, key *[8]uint32, counter uint64, flags uint32) node { + return compressBufferGeneric(buf, length, key, counter, flags) +} + +func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node { + n := node{ + cv: *key, + counter: counter, + blockLen: blockSize, + flags: flags | flagChunkStart, + } + var block [blockSize]byte + for len(chunk) > blockSize { + copy(block[:], chunk) + chunk = chunk[blockSize:] + bytesToWords(block, &n.block) + n.cv = chainingValue(n) + n.flags &^= flagChunkStart + } + // pad last block with zeros + block = [blockSize]byte{} + n.blockLen = uint32(len(chunk)) + copy(block[:], chunk) + bytesToWords(block, &n.block) + n.flags |= flagChunkEnd + return n +} + +func hashBlock(out *[64]byte, buf []byte) { + var block [64]byte + var words [16]uint32 + copy(block[:], buf) + bytesToWords(block, &words) + compressNodeGeneric(&words, node{ + cv: iv, + block: words, + blockLen: uint32(len(buf)), + flags: flagChunkStart | flagChunkEnd | flagRoot, + }) + wordsToBytes(words, out) +} + +func bytesToWords(bytes [64]byte, words *[16]uint32) { + for i := range words { + words[i] = binary.LittleEndian.Uint32(bytes[4*i:]) + } +} + +func wordsToBytes(words [16]uint32, block *[64]byte) { + for i, w := range words { + binary.LittleEndian.PutUint32(block[4*i:], w) + } +} diff --git a/go.mod b/go.mod index 51832fc..46beb99 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module lukechampine.com/blake3 go 1.13 + +require golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..4ad15a4 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
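
For reference, a small usage sketch (not part of the patch) that exercises the new code path. It uses only the API visible in the diffs above (`blake3.Sum256`, `blake3.New`, and the `XOF()` reader); per `compressBuffer` in compress_amd64.go, the assembly routine is selected only when the CPU reports AVX2 support and the input spans at least two chunks (2 KB).

```go
package main

import (
	"fmt"
	"io"

	"lukechampine.com/blake3"
)

func main() {
	// An 8 KB input fills the whole buffer that compressChunksAVX2
	// processes as 8 chunks in parallel; anything under 2 KB falls
	// back to compressBufferGeneric.
	buf := make([]byte, 8192)

	// One-shot hashing, as in BenchmarkSum256.
	sum := blake3.Sum256(buf)
	fmt.Printf("Sum256: %x\n", sum[:])

	// Streaming hash plus XOF output, mirroring BenchmarkWrite and BenchmarkXOF.
	h := blake3.New(0, nil)
	h.Write(buf)
	var xof [64]byte
	io.ReadFull(h.XOF(), xof[:])
	fmt.Printf("XOF:    %x\n", xof[:])
}
```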