From 221995220f45e56b5fd6cf5736881182a69f6a72 Mon Sep 17 00:00:00 2001 From: lukechampine Date: Sun, 2 Aug 2020 15:09:07 -0400 Subject: [PATCH] add AVX2 optimization for XOF --- README.md | 11 +- avo/gen.go | 188 +++++-- blake3.go | 16 +- blake3_amd64.s | 1272 ++++++++++++++++++++++++++++++++++++++++++- compress_amd64.go | 21 +- compress_generic.go | 7 + compress_noasm.go | 8 + 7 files changed, 1446 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index c5147ae..87d2bb5 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,11 @@ readability, in the hopes of eventually landing in `x/crypto`. The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s. There is a separate code path for small inputs (up to 64 bytes) that runs in ~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by -an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 chunks in parallel, -achieving throughput of ~2600 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it -will be possible to compress 16 chunks in parallel, which should roughly double -throughput for sufficiently large inputs. +an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 nodes in parallel, +achieving throughput of ~2600 MB/s. AVX2 is also used for BLAKE3's extendable output function, +enabling it to stream pseudorandom bytes at ~3500 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it +will be possible to compress 16 nodes in parallel, which should roughly double +the current performance. Contributions are greatly appreciated. [All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134) @@ -33,5 +34,5 @@ BenchmarkSum256/64 105 ns/op 609.51 MB/s BenchmarkSum256/1024 1778 ns/op 576.00 MB/s BenchmarkSum256/65536 24785 ns/op 2644.15 MB/s BenchmarkWrite 389 ns/op 2631.78 MB/s -BenchmarkXOF 1591 ns/op 643.80 MB/s +BenchmarkXOF 293 ns/op 3492.94 MB/s ``` diff --git a/avo/gen.go b/avo/gen.go index 482f6c7..6639927 100644 --- a/avo/gen.go +++ b/avo/gen.go @@ -12,6 +12,7 @@ import ( func main() { genGlobals() + genCompressBlocksAVX2() genCompressChunksAVX2() Generate() @@ -38,7 +39,6 @@ func genGlobals() { for i := 0; i < 8; i++ { DATA(i*4, U32(64)) } - globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*4, U32(i*1024)) @@ -67,6 +67,80 @@ func genGlobals() { } } +func genCompressBlocksAVX2() { + TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)") + out := Mem{Base: Load(Param("out"), GP64())} + block := Mem{Base: Load(Param("block"), GP64())} + cv := Mem{Base: Load(Param("cv"), GP64())} + counter, _ := Param("counter").Resolve() + blockLen, _ := Param("blockLen").Resolve() + flags, _ := Param("flags").Resolve() + + vs := [16]VecVirtual{ + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + } + + // stack space for message vectors + var mv [16]Mem + for i := range mv { + mv[i] = AllocLocal(32) + } + // stack space for spilled vs[8] register + spillMem := AllocLocal(32) + + Comment("Load block") + for i := 0; i < 16; i++ { + VPBROADCASTD(block.Offset(i*4), vs[0]) + VMOVDQU(vs[0], mv[i]) + } + + Comment("Initialize state vectors") + for i, v := range vs { + switch i { + case 0, 1, 2, 3, 4, 5, 6, 7: // 
cv + VPBROADCASTD(cv.Offset(i*4), v) + case 8, 9, 10, 11: // iv + VPBROADCASTD(globals.iv.Offset((i-8)*4), v) + case 12: // counter + loadCounter(counter.Addr, vs[12:14], vs[14:16]) + case 14: // blockLen + VPBROADCASTD(blockLen.Addr, v) + case 15: // flags + VPBROADCASTD(flags.Addr, v) + } + } + + performRounds(vs, mv, spillMem) + + Comment("Finalize CVs") + for i := 8; i < 16; i++ { + VMOVDQU(vs[i], mv[i]) + } + for i := range vs[:8] { + VPXOR(vs[i], vs[i+8], vs[i]) + } + transpose(vs[:8], vs[8:]) + for i, v := range vs[8:] { + VMOVDQU(v, out.Offset(i*64)) + } + for i := 8; i < 16; i++ { + VMOVDQU(mv[i], vs[i]) + } + for i, v := range vs[8:] { + VPBROADCASTD(cv.Offset(i*4), vs[0]) + VPXOR(vs[0], v, v) + } + transpose(vs[8:], vs[:8]) + for i, v := range vs[:8] { + VMOVDQU(v, out.Offset(i*64+32)) + } + + RET() +} + func genCompressChunksAVX2() { TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)") cvs := Mem{Base: Load(Param("cvs"), GP64())} @@ -97,16 +171,7 @@ func genCompressChunksAVX2() { Comment("Initialize counter") counterLo := AllocLocal(32) counterHi := AllocLocal(32) - VPBROADCASTQ(counter.Addr, vs[12]) - VPBROADCASTQ(counter.Addr, vs[13]) - VPADDQ(globals.incrementCounter.Offset(0*32), vs[12], vs[12]) - VPADDQ(globals.incrementCounter.Offset(1*32), vs[13], vs[13]) - VPUNPCKLDQ(vs[13], vs[12], vs[14]) - VPUNPCKHDQ(vs[13], vs[12], vs[15]) - VPUNPCKLDQ(vs[15], vs[14], vs[12]) - VPUNPCKHDQ(vs[15], vs[14], vs[13]) - VPERMQ(Imm(0xd8), vs[12], vs[12]) - VPERMQ(Imm(0xd8), vs[13], vs[13]) + loadCounter(counter.Addr, vs[12:14], vs[14:16]) VMOVDQU(vs[12], counterLo) VMOVDQU(vs[13], counterHi) @@ -141,21 +206,9 @@ func genCompressChunksAVX2() { VMOVDQU(globals.blockLen, vs[14]) VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) - VMOVDQU(vs[8], spillMem) // spill - for i := 0; i < 7; i++ { - Comment(fmt.Sprintf("Round %v", i+1)) - round(vs, mv, vs[8], spillMem) - // permute - mv = [16]Mem{ - mv[2], mv[6], mv[3], mv[10], - mv[7], mv[0], mv[4], mv[13], - mv[1], mv[11], mv[12], mv[5], - mv[9], mv[14], mv[15], mv[8], - } - } + performRounds(vs, mv, spillMem) Comment("Finalize CVs") - VMOVDQU(spillMem, vs[8]) // reload for i := range vs[:8] { VPXOR(vs[i], vs[i+8], vs[i]) } @@ -166,39 +219,37 @@ func genCompressChunksAVX2() { JNE(LabelRef("loop")) Comment("Finished; transpose CVs") - src, dst := vs[:8], vs[8:] - // interleave uint32s - for i := 0; i < 8; i += 2 { - VPUNPCKLDQ(src[i+1], src[i], dst[i+0]) - VPUNPCKHDQ(src[i+1], src[i], dst[i+1]) - } - // interleave groups of two uint32s - for i := 0; i < 4; i++ { - j := i*2 - i%2 // j := 0,1,4,5 - VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0]) - VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1]) - } - // interleave groups of four uint32s - for i := 0; i < 4; i++ { - VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0]) - VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) - } - for i, v := range dst { + transpose(vs[:8], vs[8:]) + for i, v := range vs[8:] { VMOVDQU(v, cvs.Offset(i*32)) } RET() } -func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) { - g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem) - g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem) - g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem) - g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7], tmp, spillMem) - g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9], tmp, spillMem) - g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem) - g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem) 
-	g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem)
+func performRounds(sv [16]VecVirtual, mv [16]Mem, spillMem Mem) {
+	tmp := sv[8]
+	VMOVDQU(sv[8], spillMem) // spill
+	for i := 0; i < 7; i++ {
+		Comment(fmt.Sprintf("Round %v", i+1))
+		g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem)
+		g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem)
+		g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem)
+		g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7], tmp, spillMem)
+		g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9], tmp, spillMem)
+		g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem)
+		g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem)
+		g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem)
+
+		// permute
+		mv = [16]Mem{
+			mv[2], mv[6], mv[3], mv[10],
+			mv[7], mv[0], mv[4], mv[13],
+			mv[1], mv[11], mv[12], mv[5],
+			mv[9], mv[14], mv[15], mv[8],
+		}
+	}
+	VMOVDQU(spillMem, sv[8]) // reload
 }
 
 func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {
@@ -237,3 +288,38 @@ func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {
 	VPXOR(b, c, b)
 	rotr(b, 7, b)
 }
+
+func loadCounter(counter Mem, dst, scratch []VecVirtual) {
+	// fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so
+	// that dst[0] contains low 32 bits and dst[1] contains high 32 bits.
+	VPBROADCASTQ(counter, dst[0])
+	VPBROADCASTQ(counter, dst[1])
+	VPADDQ(globals.incrementCounter.Offset(0*32), dst[0], dst[0])
+	VPADDQ(globals.incrementCounter.Offset(1*32), dst[1], dst[1])
+	VPUNPCKLDQ(dst[1], dst[0], scratch[0])
+	VPUNPCKHDQ(dst[1], dst[0], scratch[1])
+	VPUNPCKLDQ(scratch[1], scratch[0], dst[0])
+	VPUNPCKHDQ(scratch[1], scratch[0], dst[1])
+	const perm = 0<<0 | 2<<2 | 1<<4 | 3<<6
+	VPERMQ(Imm(perm), dst[0], dst[0])
+	VPERMQ(Imm(perm), dst[1], dst[1])
+}
+
+func transpose(src, dst []VecVirtual) {
+	// interleave uint32s
+	for i := 0; i < 8; i += 2 {
+		VPUNPCKLDQ(src[i+1], src[i], dst[i+0])
+		VPUNPCKHDQ(src[i+1], src[i], dst[i+1])
+	}
+	// interleave groups of two uint32s
+	for i := 0; i < 4; i++ {
+		j := i*2 - i%2 // j := 0,1,4,5
+		VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0])
+		VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1])
+	}
+	// interleave groups of four uint32s
+	for i := 0; i < 4; i++ {
+		VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0])
+		VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
+	}
+}
diff --git a/blake3.go b/blake3.go
index d06112f..da439b3 100644
--- a/blake3.go
+++ b/blake3.go
@@ -223,9 +223,9 @@ func DeriveKey(subKey []byte, ctx string, srcKey []byte) {
 // An OutputReader produces a seekable stream of 2^64 - 1 pseudorandom output
 // bytes.
 type OutputReader struct {
-	n     node
-	block [blockSize]byte
-	off   uint64
+	n   node
+	buf [8 * blockSize]byte
+	off uint64
 }
 
 // Read implements io.Reader. 
Callers may assume that Read returns len(p), nil @@ -238,11 +238,11 @@ func (or *OutputReader) Read(p []byte) (int, error) { } lenp := len(p) for len(p) > 0 { - if or.off%blockSize == 0 { + if or.off%(8*blockSize) == 0 { or.n.counter = or.off / blockSize - wordsToBytes(compressNode(or.n), &or.block) + compressBlocks(&or.buf, or.n) } - n := copy(p, or.block[or.off%blockSize:]) + n := copy(p, or.buf[or.off%(8*blockSize):]) p = p[n:] or.off += uint64(n) } @@ -274,8 +274,8 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) { } or.off = off or.n.counter = uint64(off) / blockSize - if or.off%blockSize != 0 { - wordsToBytes(compressNode(or.n), &or.block) + if or.off%(8*blockSize) != 0 { + compressBlocks(&or.buf, or.n) } // NOTE: or.off >= 2^63 will result in a negative return value. // Nothing we can do about this. diff --git a/blake3_amd64.s b/blake3_amd64.s index f467663..2ab9ea0 100644 --- a/blake3_amd64.s +++ b/blake3_amd64.s @@ -76,6 +76,1260 @@ DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32 +// func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) +// Requires: AVX, AVX2 +TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40 + MOVQ out+0(FP), AX + MOVQ block+8(FP), CX + MOVQ cv+16(FP), DX + + // Load block + VPBROADCASTD (CX), Y0 + VMOVDQU Y0, (SP) + VPBROADCASTD 4(CX), Y0 + VMOVDQU Y0, 32(SP) + VPBROADCASTD 8(CX), Y0 + VMOVDQU Y0, 64(SP) + VPBROADCASTD 12(CX), Y0 + VMOVDQU Y0, 96(SP) + VPBROADCASTD 16(CX), Y0 + VMOVDQU Y0, 128(SP) + VPBROADCASTD 20(CX), Y0 + VMOVDQU Y0, 160(SP) + VPBROADCASTD 24(CX), Y0 + VMOVDQU Y0, 192(SP) + VPBROADCASTD 28(CX), Y0 + VMOVDQU Y0, 224(SP) + VPBROADCASTD 32(CX), Y0 + VMOVDQU Y0, 256(SP) + VPBROADCASTD 36(CX), Y0 + VMOVDQU Y0, 288(SP) + VPBROADCASTD 40(CX), Y0 + VMOVDQU Y0, 320(SP) + VPBROADCASTD 44(CX), Y0 + VMOVDQU Y0, 352(SP) + VPBROADCASTD 48(CX), Y0 + VMOVDQU Y0, 384(SP) + VPBROADCASTD 52(CX), Y0 + VMOVDQU Y0, 416(SP) + VPBROADCASTD 56(CX), Y0 + VMOVDQU Y0, 448(SP) + VPBROADCASTD 60(CX), Y0 + VMOVDQU Y0, 480(SP) + + // Initialize state vectors + VPBROADCASTD (DX), Y0 + VPBROADCASTD 4(DX), Y1 + VPBROADCASTD 8(DX), Y2 + VPBROADCASTD 12(DX), Y3 + VPBROADCASTD 16(DX), Y4 + VPBROADCASTD 20(DX), Y5 + VPBROADCASTD 24(DX), Y6 + VPBROADCASTD 28(DX), Y7 + VPBROADCASTD iv<>+0(SB), Y8 + VPBROADCASTD iv<>+4(SB), Y9 + VPBROADCASTD iv<>+8(SB), Y10 + VPBROADCASTD iv<>+12(SB), Y11 + VPBROADCASTQ counter+24(FP), Y12 + VPBROADCASTQ counter+24(FP), Y13 + VPADDQ increment_counter<>+0(SB), Y12, Y12 + VPADDQ increment_counter<>+32(SB), Y13, Y13 + VPUNPCKLDQ Y13, Y12, Y14 + VPUNPCKHDQ Y13, Y12, Y15 + VPUNPCKLDQ Y15, Y14, Y12 + VPUNPCKHDQ Y15, Y14, Y13 + VPERMQ $0xd8, Y12, Y12 + VPERMQ $0xd8, Y13, Y13 + VPBROADCASTD blockLen+32(FP), Y14 + VPBROADCASTD flags+36(FP), Y15 + VMOVDQU Y8, 512(SP) + + // Round 1 + VPADDD Y0, Y4, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + 
VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 256(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 2 + VPADDD Y0, Y4, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + 
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 224(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 3 + VPADDD Y0, Y4, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 
320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 160(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 352(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 480(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 4 + VPADDD Y0, Y4, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR 
Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 5 + VPADDD Y0, Y4, Y0 + VPADDD 384(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD 
$0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 480(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 320(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 6 + VPADDD Y0, Y4, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, 
Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 192(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 7 + VPADDD Y0, Y4, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 480(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 
512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VMOVDQU 512(SP), Y8 + + // Finalize CVs + VMOVDQU Y8, 256(SP) + VMOVDQU Y9, 288(SP) + VMOVDQU Y10, 320(SP) + VMOVDQU Y11, 352(SP) + VMOVDQU Y12, 384(SP) + VMOVDQU Y13, 416(SP) + VMOVDQU Y14, 448(SP) + VMOVDQU Y15, 480(SP) + VPXOR Y0, Y8, Y0 + VPXOR Y1, Y9, Y1 + VPXOR Y2, Y10, Y2 + VPXOR Y3, Y11, Y3 + VPXOR Y4, Y12, Y4 + VPXOR Y5, Y13, Y5 + VPXOR 
Y6, Y14, Y6 + VPXOR Y7, Y15, Y7 + VPUNPCKLDQ Y1, Y0, Y8 + VPUNPCKHDQ Y1, Y0, Y9 + VPUNPCKLDQ Y3, Y2, Y10 + VPUNPCKHDQ Y3, Y2, Y11 + VPUNPCKLDQ Y5, Y4, Y12 + VPUNPCKHDQ Y5, Y4, Y13 + VPUNPCKLDQ Y7, Y6, Y14 + VPUNPCKHDQ Y7, Y6, Y15 + VPUNPCKLQDQ Y10, Y8, Y0 + VPUNPCKHQDQ Y10, Y8, Y1 + VPUNPCKLQDQ Y11, Y9, Y2 + VPUNPCKHQDQ Y11, Y9, Y3 + VPUNPCKLQDQ Y14, Y12, Y4 + VPUNPCKHQDQ Y14, Y12, Y5 + VPUNPCKLQDQ Y15, Y13, Y6 + VPUNPCKHQDQ Y15, Y13, Y7 + VPERM2I128 $0x20, Y4, Y0, Y8 + VPERM2I128 $0x31, Y4, Y0, Y12 + VPERM2I128 $0x20, Y5, Y1, Y9 + VPERM2I128 $0x31, Y5, Y1, Y13 + VPERM2I128 $0x20, Y6, Y2, Y10 + VPERM2I128 $0x31, Y6, Y2, Y14 + VPERM2I128 $0x20, Y7, Y3, Y11 + VPERM2I128 $0x31, Y7, Y3, Y15 + VMOVDQU Y8, (AX) + VMOVDQU Y9, 64(AX) + VMOVDQU Y10, 128(AX) + VMOVDQU Y11, 192(AX) + VMOVDQU Y12, 256(AX) + VMOVDQU Y13, 320(AX) + VMOVDQU Y14, 384(AX) + VMOVDQU Y15, 448(AX) + VMOVDQU 256(SP), Y8 + VMOVDQU 288(SP), Y9 + VMOVDQU 320(SP), Y10 + VMOVDQU 352(SP), Y11 + VMOVDQU 384(SP), Y12 + VMOVDQU 416(SP), Y13 + VMOVDQU 448(SP), Y14 + VMOVDQU 480(SP), Y15 + VPBROADCASTD (DX), Y0 + VPXOR Y0, Y8, Y8 + VPBROADCASTD 4(DX), Y0 + VPXOR Y0, Y9, Y9 + VPBROADCASTD 8(DX), Y0 + VPXOR Y0, Y10, Y10 + VPBROADCASTD 12(DX), Y0 + VPXOR Y0, Y11, Y11 + VPBROADCASTD 16(DX), Y0 + VPXOR Y0, Y12, Y12 + VPBROADCASTD 20(DX), Y0 + VPXOR Y0, Y13, Y13 + VPBROADCASTD 24(DX), Y0 + VPXOR Y0, Y14, Y14 + VPBROADCASTD 28(DX), Y0 + VPXOR Y0, Y15, Y15 + VPUNPCKLDQ Y9, Y8, Y0 + VPUNPCKHDQ Y9, Y8, Y1 + VPUNPCKLDQ Y11, Y10, Y2 + VPUNPCKHDQ Y11, Y10, Y3 + VPUNPCKLDQ Y13, Y12, Y4 + VPUNPCKHDQ Y13, Y12, Y5 + VPUNPCKLDQ Y15, Y14, Y6 + VPUNPCKHDQ Y15, Y14, Y7 + VPUNPCKLQDQ Y2, Y0, Y8 + VPUNPCKHQDQ Y2, Y0, Y9 + VPUNPCKLQDQ Y3, Y1, Y10 + VPUNPCKHQDQ Y3, Y1, Y11 + VPUNPCKLQDQ Y6, Y4, Y12 + VPUNPCKHQDQ Y6, Y4, Y13 + VPUNPCKLQDQ Y7, Y5, Y14 + VPUNPCKHQDQ Y7, Y5, Y15 + VPERM2I128 $0x20, Y12, Y8, Y0 + VPERM2I128 $0x31, Y12, Y8, Y4 + VPERM2I128 $0x20, Y13, Y9, Y1 + VPERM2I128 $0x31, Y13, Y9, Y5 + VPERM2I128 $0x20, Y14, Y10, Y2 + VPERM2I128 $0x31, Y14, Y10, Y6 + VPERM2I128 $0x20, Y15, Y11, Y3 + VPERM2I128 $0x31, Y15, Y11, Y7 + VMOVDQU Y0, 32(AX) + VMOVDQU Y1, 96(AX) + VMOVDQU Y2, 160(AX) + VMOVDQU Y3, 224(AX) + VMOVDQU Y4, 288(AX) + VMOVDQU Y5, 352(AX) + VMOVDQU Y6, 416(AX) + VMOVDQU Y7, 480(AX) + RET + // func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) // Requires: AVX, AVX2 TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40 @@ -1258,17 +2512,17 @@ loop: VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 + VMOVDQU 512(SP), Y8 // Finalize CVs - VMOVDQU 512(SP), Y8 - VPXOR Y0, Y8, Y0 - VPXOR Y1, Y9, Y1 - VPXOR Y2, Y10, Y2 - VPXOR Y3, Y11, Y3 - VPXOR Y4, Y12, Y4 - VPXOR Y5, Y13, Y5 - VPXOR Y6, Y14, Y6 - VPXOR Y7, Y15, Y7 + VPXOR Y0, Y8, Y0 + VPXOR Y1, Y9, Y1 + VPXOR Y2, Y10, Y2 + VPXOR Y3, Y11, Y3 + VPXOR Y4, Y12, Y4 + VPXOR Y5, Y13, Y5 + VPXOR Y6, Y14, Y6 + VPXOR Y7, Y15, Y7 // Loop INCQ DX diff --git a/compress_amd64.go b/compress_amd64.go index cfe414b..ea19611 100644 --- a/compress_amd64.go +++ b/compress_amd64.go @@ -11,6 +11,9 @@ import ( //go:noescape func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) +//go:noescape +func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) + func compressNode(n node) (out [16]uint32) { compressNodeGeneric(&out, n) return @@ -60,10 +63,6 @@ func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) n return n } -func 
wordsToBytes(words [16]uint32, block *[64]byte) {
-	*block = *(*[64]byte)(unsafe.Pointer(&words))
-}
-
 func hashBlock(out *[64]byte, buf []byte) {
 	var block [16]uint32
 	copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)
@@ -74,3 +73,17 @@ func hashBlock(out *[64]byte, buf []byte) {
 		flags:    flagChunkStart | flagChunkEnd | flagRoot,
 	})
 }
+
+func compressBlocks(out *[512]byte, n node) {
+	switch {
+	case cpu.X86.HasAVX2:
+		compressBlocksAVX2(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+	default:
+		compressBlocksGeneric((*[8][64]byte)(unsafe.Pointer(out)), n)
+	}
+
+}
+
+func wordsToBytes(words [16]uint32, block *[64]byte) {
+	*block = *(*[64]byte)(unsafe.Pointer(&words))
+}
diff --git a/compress_generic.go b/compress_generic.go
index 0b4dca6..4581b8a 100644
--- a/compress_generic.go
+++ b/compress_generic.go
@@ -115,6 +115,13 @@ func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter
 	return mergeSubtrees(cvs, key, flags)
 }
 
+func compressBlocksGeneric(outs *[8][64]byte, n node) {
+	for i := range outs {
+		wordsToBytes(compressNode(n), &outs[i])
+		n.counter++
+	}
+}
+
 func chainingValue(n node) (cv [8]uint32) {
 	full := compressNode(n)
 	copy(cv[:], full[:])
diff --git a/compress_noasm.go b/compress_noasm.go
index 847a519..5efa191 100644
--- a/compress_noasm.go
+++ b/compress_noasm.go
@@ -51,6 +51,14 @@ func hashBlock(out *[64]byte, buf []byte) {
 	wordsToBytes(words, out)
 }
 
+func compressBlocks(out *[512]byte, n node) {
+	var outs [8][64]byte
+	compressBlocksGeneric(&outs, n)
+	for i := range outs {
+		copy(out[i*64:], outs[i][:])
+	}
+}
+
 func bytesToWords(bytes [64]byte, words *[16]uint32) {
 	for i := range words {
 		words[i] = binary.LittleEndian.Uint32(bytes[4*i:])
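
Note on the contract being accelerated: the AVX2 routine must emit exactly the bytes that eight sequential single-block compressions of the same node would, with the counter incremented once per 64-byte block, which is what `compressBlocksGeneric` spells out above. The sketch below is a minimal, hypothetical in-package test of that equivalence; it assumes a `_test.go` file inside package `blake3` where the unexported `node`, `iv`, `blockSize`, `flagRoot`, `compressBlocks`, `compressNode`, and `wordsToBytes` identifiers are visible, and is not part of the patch.

```go
package blake3

import (
	"bytes"
	"testing"
)

// TestCompressBlocksEquivalence checks that compressBlocks fills its 512-byte
// output with the same bytes as eight single-node compressions whose counter
// increases by one per 64-byte block (the behavior of compressBlocksGeneric).
func TestCompressBlocksEquivalence(t *testing.T) {
	n := node{
		cv:       iv,        // package-level IV, assumed visible here
		counter:  12345,     // arbitrary starting counter
		blockLen: blockSize, // 64
		flags:    flagRoot,
	}
	var got [512]byte
	compressBlocks(&got, n)

	for i := 0; i < 8; i++ {
		var want [64]byte
		wordsToBytes(compressNode(n), &want)
		if !bytes.Equal(got[i*64:(i+1)*64], want[:]) {
			t.Fatalf("block %d differs from single-block compression", i)
		}
		n.counter++
	}
}
```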
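
For callers, the effect of the patch is simply a faster `OutputReader`. A usage sketch follows; it assumes the package's existing `New` and `Hasher.XOF` entry points (not shown in this diff) and relies only on the `Read`/`Seek` behavior of `OutputReader` changed above.

```go
package main

import (
	"fmt"
	"io"

	"lukechampine.com/blake3"
)

func main() {
	// New and Hasher.XOF are assumed entry points from the existing package
	// API; this patch only changes how OutputReader refills its buffer.
	h := blake3.New(32, nil)
	h.Write([]byte("some input"))
	xof := h.XOF()

	// Large reads now cross block boundaries inside a 512-byte (8-block)
	// buffer filled by a single compressBlocks call.
	out := make([]byte, 4096)
	if _, err := io.ReadFull(xof, out); err != nil {
		panic(err)
	}

	// The stream is still seekable; seeking recomputes the 8-block group
	// containing the requested offset.
	if _, err := xof.Seek(1<<20, io.SeekStart); err != nil {
		panic(err)
	}
	io.ReadFull(xof, out[:64])
	fmt.Printf("%x\n", out[:8])
}
```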