diff --git a/README.md b/README.md
index 87d2bb5..a012737 100644
--- a/README.md
+++ b/README.md
@@ -12,14 +12,9 @@ go get lukechampine.com/blake3
 This implementation aims to be performant without sacrificing (too much)
 readability, in the hopes of eventually landing in `x/crypto`.
 
-The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s.
-There is a separate code path for small inputs (up to 64 bytes) that runs in
-~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by
-an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 nodes in parallel,
-achieving throughput of ~2600 MB/s. AVX2 is also used for BLAKE3's extendable output function,
-enabling it to stream pseudorandom bytes at ~3500 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it
-will be possible to compress 16 nodes in parallel, which should roughly double
-the current performance.
+In addition to the pure-Go implementation, this package also contains AVX-512
+and AVX2 routines (generated by [`avo`](https://github.com/mmcloughlin/avo))
+that greatly increase performance for large inputs and outputs.
 
 Contributions are greatly appreciated.
 [All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134)
@@ -27,12 +22,45 @@ Contributions are greatly appreciated.
 
 ## Benchmarks
 
-Tested on an i5-7600K @ 3.80GHz.
+Tested on a 2020 MacBook Air (i5-7600K @ 3.80GHz). Benchmarks will improve as
+soon as I get access to a beefier AVX-512 machine. :wink:
+
+### AVX-512
 
 ```
-BenchmarkSum256/64 105 ns/op 609.51 MB/s
-BenchmarkSum256/1024 1778 ns/op 576.00 MB/s
-BenchmarkSum256/65536 24785 ns/op 2644.15 MB/s
-BenchmarkWrite 389 ns/op 2631.78 MB/s
-BenchmarkXOF 293 ns/op 3492.94 MB/s
+BenchmarkSum256/64 120 ns/op 533.00 MB/s
+BenchmarkSum256/1024 2229 ns/op 459.36 MB/s
+BenchmarkSum256/65536 16245 ns/op 4034.11 MB/s
+BenchmarkWrite 245 ns/op 4177.38 MB/s
+BenchmarkXOF 246 ns/op 4159.30 MB/s
 ```
+
+### AVX2
+
+```
+BenchmarkSum256/64 120 ns/op 533.00 MB/s
+BenchmarkSum256/1024 2229 ns/op 459.36 MB/s
+BenchmarkSum256/65536 31137 ns/op 2104.76 MB/s
+BenchmarkWrite 487 ns/op 2103.12 MB/s
+BenchmarkXOF 329 ns/op 3111.27 MB/s
+```
+
+### Pure Go
+
+```
+BenchmarkSum256/64 120 ns/op 533.00 MB/s
+BenchmarkSum256/1024 2229 ns/op 459.36 MB/s
+BenchmarkSum256/65536 133505 ns/op 490.89 MB/s
+BenchmarkWrite 2022 ns/op 506.36 MB/s
+BenchmarkXOF 1914 ns/op 534.98 MB/s
+```
+
+## Shortcomings
+
+There is no assembly routine for single-block compressions. This is most
+noticeable for ~1KB inputs.
+
+Each assembly routine inlines all 7 rounds, causing thousands of lines of
+duplicated code. Ideally the routines could be merged such that only a single
+routine is generated for AVX-512 and AVX2, without sacrificing too much
+performance.
diff --git a/avo/gen.go b/avo/gen.go
index 6639927..c40c52a 100644
--- a/avo/gen.go
+++ b/avo/gen.go
@@ -6,26 +6,28 @@ import (
 	"fmt"
 
 	. "github.com/mmcloughlin/avo/build"
+	"github.com/mmcloughlin/avo/ir"
 	. "github.com/mmcloughlin/avo/operand"
 	. 
"github.com/mmcloughlin/avo/reg" ) func main() { genGlobals() + genCompressBlocksAVX512() + genCompressChunksAVX512() genCompressBlocksAVX2() genCompressChunksAVX2() + genCompressParentsAVX2() Generate() } var globals struct { - iv Mem - blockLen Mem - stride1024 Mem - incrementCounter Mem - setFlags Mem - shuffleRot8 Mem - shuffleRot16 Mem + iv Mem + seq Mem + seq64 Mem // for loadCounter + shuffleRot8 Mem + shuffleRot16 Mem } func genGlobals() { @@ -35,28 +37,14 @@ func genGlobals() { DATA(2*4, U32(0x3C6EF372)) DATA(3*4, U32(0xA54FF53A)) - globals.blockLen = GLOBL("block_len", RODATA|NOPTR) - for i := 0; i < 8; i++ { - DATA(i*4, U32(64)) + globals.seq = GLOBL("seq", RODATA|NOPTR) + for i := 0; i < 16; i++ { + DATA(i*4, U32(i)) } - globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR) - for i := 0; i < 8; i++ { - DATA(i*4, U32(i*1024)) - } - globals.incrementCounter = GLOBL("increment_counter", RODATA|NOPTR) + globals.seq64 = GLOBL("seq64", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*8, U64(i)) } - globals.setFlags = GLOBL("set_flags", RODATA|NOPTR) - for i := 0; i < 16; i++ { - if i == 0 { - DATA(i*4, U32(1)) - } else if i == 15 { - DATA(i*4, U32(2)) - } else { - DATA(i*4, U32(0)) - } - } globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*4, U32(0x00030201+0x04040404*i)) @@ -67,6 +55,186 @@ func genGlobals() { } } +func genCompressBlocksAVX512() { + TEXT("compressBlocksAVX512", NOSPLIT, "func(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)") + out := Mem{Base: Load(Param("out"), GP64())} + block := Mem{Base: Load(Param("block"), GP64())} + cv := Mem{Base: Load(Param("cv"), GP64())} + counter, _ := Param("counter").Resolve() + blockLen, _ := Param("blockLen").Resolve() + flags, _ := Param("flags").Resolve() + + Comment("Initialize block vectors") + var vs, mv [16]VecVirtual + for i := range vs { + vs[i], mv[i] = ZMM(), ZMM() + VPBROADCASTD_Z(block.Offset(i*4), mv[i]) + } + + Comment("Initialize state vectors") + for i, v := range vs { + switch i { + case 0, 1, 2, 3, 4, 5, 6, 7: // cv + VPBROADCASTD_Z(cv.Offset(i*4), v) + case 8, 9, 10, 11: // iv + VPBROADCASTD_Z(globals.iv.Offset((i-8)*4), v) + case 12: // counter + VPBROADCASTD_Z(counter.Addr, vs[12]) + VPADDD_Z(globals.seq, vs[12], vs[12]) + // set a 1 bit in K1 for each overflowed counter in vs[12] + VPCMPUD(Imm(1), globals.seq, vs[12], K1) + // add 1 to each counter in vs[13] for each 1 bit in K1 + VPBROADCASTD_Z(counter.Addr.Offset(1*4), vs[13]) + VPADDD_ZBK(globals.seq.Offset(4), vs[13], K1, vs[13]) + case 14: // blockLen + VPBROADCASTD_Z(blockLen.Addr, v) + case 15: // flags + VPBROADCASTD_Z(flags.Addr, v) + } + } + + performRoundsAVX512(vs, mv) + + Comment("Finalize CVs") + for i, v := range vs[:8] { + VPXORD_Z(v, vs[i+8], v) + } + for i, v := range vs[8:] { + VPXORD_ZB(cv.Offset(i*4), v, v) + } + stride := ZMM() + VMOVDQU32_Z(globals.seq, stride) + VPSLLD_Z(Imm(6), stride, stride) // stride of 64 + for i, v := range vs { + KXNORD(K1, K1, K1) // fastest way to set all bits to 1 + VPSCATTERDD_Z(v, K1, out.Offset(i*4).Idx(stride, 1)) + } + + RET() +} + +func genCompressChunksAVX512() { + TEXT("compressChunksAVX512", NOSPLIT, "func(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)") + cvs := Mem{Base: Load(Param("cvs"), GP64())} + buf := Mem{Base: Load(Param("buf"), GP64())} + key := Mem{Base: Load(Param("key"), GP64())} + counter, _ := Param("counter").Resolve() + flags, _ := Param("flags").Resolve() + + var 
vs, mv [16]VecVirtual + for i := range vs { + vs[i], mv[i] = ZMM(), ZMM() + } + + Comment("Initialize counter") + counterLo := AllocLocal(64) + counterHi := AllocLocal(64) + VPBROADCASTD_Z(counter.Addr, vs[0]) + VPADDD_Z(globals.seq, vs[0], vs[0]) + VPCMPUD(Imm(1), globals.seq, vs[0], K1) + VPBROADCASTD_Z(counter.Addr.Offset(4), vs[1]) + VPADDD_ZBK(globals.seq.Offset(4), vs[1], K1, vs[1]) + VMOVDQU32_Z(vs[0], counterLo) + VMOVDQU32_Z(vs[1], counterHi) + + Comment("Initialize flags") + chunkFlags := AllocLocal(16 * 4) + VPBROADCASTD_Z(flags.Addr, vs[0]) + VMOVDQU32_Z(vs[0], chunkFlags) + ORL(Imm(1), chunkFlags.Offset(0*4)) + ORL(Imm(2), chunkFlags.Offset(15*4)) + + Comment("Load key") + for i := 0; i < 8; i++ { + VPBROADCASTD_Z(key.Offset(i*4), vs[i]) + } + + Comment("Loop index") + loop := GP64() + XORQ(loop, loop) + Label("loop") + + Comment("Load transposed block") + VMOVDQU32_Z(globals.seq, vs[8]) + VPSLLD_Z(Imm(10), vs[8], vs[8]) // stride of 1024 + for i, m := range mv { + KXNORD(K1, K1, K1) + VPGATHERDD_Z(buf.Offset(i*4).Idx(vs[8], 1), K1, m) + } + ADDQ(Imm(64), buf.Base) + + Comment("Reload state vectors (other than CVs)") + for i := 0; i < 4; i++ { + VPBROADCASTD_Z(globals.iv.Offset(i*4), vs[8+i]) + } + VMOVDQU32_Z(counterLo, vs[12]) + VMOVDQU32_Z(counterHi, vs[13]) + VPBROADCASTD_Z(globals.seq.Offset(4), vs[14]) + VPSLLD_Z(Imm(6), vs[14], vs[14]) // 64 + VPBROADCASTD_Z(chunkFlags.Idx(loop, 4), vs[15]) + + performRoundsAVX512(vs, mv) + + Comment("Finalize CVs") + for i := range vs[:8] { + VPXORD_Z(vs[i], vs[i+8], vs[i]) + } + + Comment("Loop") + INCQ(loop) + CMPQ(loop, U32(16)) + JNE(LabelRef("loop")) + + Comment("Finished; transpose CVs") + VMOVDQU32_Z(globals.seq, vs[8]) + VPSLLD_Z(Imm(5), vs[8], vs[8]) // stride of 32 + for i, v := range vs[:8] { + KXNORD(K1, K1, K1) // fastest way to set all bits to 1 + VPSCATTERDD_Z(v, K1, cvs.Offset(i*4).Idx(vs[8], 1)) + } + + RET() +} + +func performRoundsAVX512(vs, mv [16]VecVirtual) { + g := func(a, b, c, d, mx, my VecVirtual) { + VPADDD_Z(a, b, a) + VPADDD_Z(mx, a, a) + VPXORD_Z(d, a, d) + VPRORD_Z(Imm(16), d, d) + VPADDD_Z(c, d, c) + VPXORD_Z(b, c, b) + VPRORD_Z(Imm(12), b, b) + VPADDD_Z(a, b, a) + VPADDD_Z(my, a, a) + VPXORD_Z(d, a, d) + VPRORD_Z(Imm(8), d, d) + VPADDD_Z(c, d, c) + VPXORD_Z(b, c, b) + VPRORD_Z(Imm(7), b, b) + } + + for i := 0; i < 7; i++ { + Comment(fmt.Sprintf("Round %v", i+1)) + g(vs[0], vs[4], vs[8], vs[12], mv[0], mv[1]) + g(vs[1], vs[5], vs[9], vs[13], mv[2], mv[3]) + g(vs[2], vs[6], vs[10], vs[14], mv[4], mv[5]) + g(vs[3], vs[7], vs[11], vs[15], mv[6], mv[7]) + g(vs[0], vs[5], vs[10], vs[15], mv[8], mv[9]) + g(vs[1], vs[6], vs[11], vs[12], mv[10], mv[11]) + g(vs[2], vs[7], vs[8], vs[13], mv[12], mv[13]) + g(vs[3], vs[4], vs[9], vs[14], mv[14], mv[15]) + + // permute + mv = [16]VecVirtual{ + mv[2], mv[6], mv[3], mv[10], + mv[7], mv[0], mv[4], mv[13], + mv[1], mv[11], mv[12], mv[5], + mv[9], mv[14], mv[15], mv[8], + } + } +} + func genCompressBlocksAVX2() { TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)") out := Mem{Base: Load(Param("out"), GP64())} @@ -76,20 +244,12 @@ func genCompressBlocksAVX2() { blockLen, _ := Param("blockLen").Resolve() flags, _ := Param("flags").Resolve() - vs := [16]VecVirtual{ - YMM(), YMM(), YMM(), YMM(), - YMM(), YMM(), YMM(), YMM(), - YMM(), YMM(), YMM(), YMM(), - YMM(), YMM(), YMM(), YMM(), - } - - // stack space for message vectors + var vs [16]VecVirtual var mv [16]Mem - for i := range mv { + 
for i := range vs { + vs[i] = YMM() mv[i] = AllocLocal(32) } - // stack space for spilled vs[8] register - spillMem := AllocLocal(32) Comment("Load block") for i := 0; i < 16; i++ { @@ -113,7 +273,7 @@ func genCompressBlocksAVX2() { } } - performRounds(vs, mv, spillMem) + performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := 8; i < 16; i++ { @@ -149,19 +309,12 @@ func genCompressChunksAVX2() { counter, _ := Param("counter").Resolve() flags, _ := Param("flags").Resolve() - vs := [16]VecVirtual{ - YMM(), YMM(), YMM(), YMM(), - YMM(), YMM(), YMM(), YMM(), - YMM(), YMM(), YMM(), YMM(), - YMM(), YMM(), YMM(), YMM(), - } - // stack space for transposed message vectors + var vs [16]VecVirtual var mv [16]Mem - for i := range mv { + for i := range vs { + vs[i] = YMM() mv[i] = AllocLocal(32) } - // stack space for spilled vs[8] register - spillMem := AllocLocal(32) Comment("Load key") for i := 0; i < 8; i++ { @@ -178,10 +331,10 @@ func genCompressChunksAVX2() { Comment("Initialize flags") chunkFlags := AllocLocal(16 * 4) VPBROADCASTD(flags.Addr, vs[14]) - VPOR(globals.setFlags.Offset(0*32), vs[14], vs[15]) - VMOVDQU(vs[15], chunkFlags.Offset(0*32)) - VPOR(globals.setFlags.Offset(1*32), vs[14], vs[15]) - VMOVDQU(vs[15], chunkFlags.Offset(1*32)) + VMOVDQU(vs[14], chunkFlags.Offset(0*32)) + VMOVDQU(vs[14], chunkFlags.Offset(1*32)) + ORL(Imm(1), chunkFlags.Offset(0*4)) + ORL(Imm(2), chunkFlags.Offset(15*4)) Comment("Loop index") loop := GP64() @@ -189,7 +342,8 @@ func genCompressChunksAVX2() { Label("loop") Comment("Load transposed block") - VMOVDQU(globals.stride1024, vs[9]) + VMOVDQU(globals.seq, vs[9]) + VPSLLD(Imm(10), vs[9], vs[9]) // stride of 1024 for i := 0; i < 16; i++ { VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10]) @@ -203,10 +357,11 @@ func genCompressChunksAVX2() { } VMOVDQU(counterLo, vs[12]) VMOVDQU(counterHi, vs[13]) - VMOVDQU(globals.blockLen, vs[14]) + VPBROADCASTD(globals.seq.Offset(4), vs[14]) + VPSLLD(Imm(6), vs[14], vs[14]) // 64 VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) - performRounds(vs, mv, spillMem) + performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { @@ -227,19 +382,112 @@ func genCompressChunksAVX2() { RET() } -func performRounds(sv [16]VecVirtual, mv [16]Mem, spillMem Mem) { +func genCompressParentsAVX2() { + TEXT("compressParentsAVX2", NOSPLIT, "func(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)") + parents := Mem{Base: Load(Param("parents"), GP64())} + cvs := Mem{Base: Load(Param("cvs"), GP64())} + key := Mem{Base: Load(Param("key"), GP64())} + flags, _ := Param("flags").Resolve() + + var vs [16]VecVirtual + var mv [16]Mem + for i := range vs { + vs[i] = YMM() + mv[i] = AllocLocal(32) + } + + Comment("Load transposed block") + VMOVDQU(globals.seq, vs[9]) + VPSLLD(Imm(6), vs[9], vs[9]) // stride of 64 + for i := 0; i < 16; i++ { + VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 + VPGATHERDD(vs[8], cvs.Offset(i*4).Idx(vs[9], 1), vs[10]) + VMOVDQU(vs[10], mv[i]) + } + + Comment("Initialize state vectors") + for i, v := range vs { + switch i { + case 0, 1, 2, 3, 4, 5, 6, 7: // cv + VPBROADCASTD(key.Offset(i*4), v) + case 8, 9, 10, 11: // iv + VPBROADCASTD(globals.iv.Offset((i-8)*4), v) + case 12, 13: // counter + VPXOR(v, v, v) + case 14: // blockLen + VPBROADCASTD(globals.seq.Offset(1*4), v) + VPSLLD(Imm(6), v, v) // 64 + case 15: // flags + ORL(Imm(4), flags.Addr) // flagParent + VPBROADCASTD(flags.Addr, v) + } + } + + 
performRoundsAVX2(vs, mv) + + Comment("Finalize CVs") + for i := range vs[:8] { + VPXOR(vs[i], vs[i+8], vs[i]) + } + transpose(vs[:8], vs[8:]) + for i, v := range vs[8:] { + VMOVDQU(v, parents.Offset(i*32)) + } + + RET() +} + +func performRoundsAVX2(sv [16]VecVirtual, mv [16]Mem) { + spillMem := AllocLocal(32) tmp := sv[8] + g := func(a, b, c, d VecVirtual, mx, my Mem) { + // Helper function for performing rotations. Also manages c, tmp and + // spillMem: if c == tmp, we need to spill and reload c using spillMem. + rotr := func(v VecVirtual, n uint64, dst VecVirtual) { + switch n { + case 8, 16: + shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n] + VPSHUFB(shuf, v, dst) + if c == tmp { + VMOVDQU(spillMem, c) + } + case 7, 12: + if c == tmp { + VMOVDQU(c, spillMem) + } + VPSRLD(Imm(n), v, tmp) + VPSLLD(Imm(32-n), v, dst) + VPOR(dst, tmp, dst) + } + } + + VPADDD(a, b, a) + VPADDD(mx, a, a) + VPXOR(d, a, d) + rotr(d, 16, d) + VPADDD(c, d, c) + VPXOR(b, c, b) + rotr(b, 12, b) + VPADDD(a, b, a) + VPADDD(my, a, a) + VPXOR(d, a, d) + rotr(d, 8, d) + VPADDD(c, d, c) + VPXOR(b, c, b) + rotr(b, 7, b) + } + VMOVDQU(sv[8], spillMem) // spill for i := 0; i < 7; i++ { Comment(fmt.Sprintf("Round %v", i+1)) - g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem) - g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem) - g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem) - g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7], tmp, spillMem) - g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9], tmp, spillMem) - g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem) - g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem) - g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem) + g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1]) + g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3]) + g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5]) + g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7]) + g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9]) + g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11]) + g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13]) + g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15]) // permute mv = [16]Mem{ @@ -252,50 +500,13 @@ func performRounds(sv [16]VecVirtual, mv [16]Mem, spillMem Mem) { VMOVDQU(spillMem, sv[8]) // reload } -func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) { - // Helper function for performing rotations. Also manages c, tmp and - // spillMem: if c == tmp, we need to spill and reload c using spillMem. - rotr := func(v VecVirtual, n uint64, dst VecVirtual) { - switch n { - case 8, 16: - shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n] - VPSHUFB(shuf, v, dst) - if c == tmp { - VMOVDQU(spillMem, c) - } - case 7, 12: - if c == tmp { - VMOVDQU(c, spillMem) - } - VPSRLD(Imm(n), v, tmp) - VPSLLD(Imm(32-n), v, dst) - VPOR(dst, tmp, dst) - } - } - - VPADDD(a, b, a) - VPADDD(mx, a, a) - VPXOR(d, a, d) - rotr(d, 16, d) - VPADDD(c, d, c) - VPXOR(b, c, b) - rotr(b, 12, b) - VPADDD(a, b, a) - VPADDD(my, a, a) - VPXOR(d, a, d) - rotr(d, 8, d) - VPADDD(c, d, c) - VPXOR(b, c, b) - rotr(b, 7, b) -} - func loadCounter(counter Mem, dst, scratch []VecVirtual) { // fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so // that dst[0] contains low 32 bits and dst[1] contains high 32 bits. 
VPBROADCASTQ(counter, dst[0]) VPBROADCASTQ(counter, dst[1]) - VPADDQ(globals.incrementCounter.Offset(0*32), dst[0], dst[0]) - VPADDQ(globals.incrementCounter.Offset(1*32), dst[1], dst[1]) + VPADDQ(globals.seq64.Offset(0*4), dst[0], dst[0]) + VPADDQ(globals.seq64.Offset(8*4), dst[1], dst[1]) VPUNPCKLDQ(dst[1], dst[0], scratch[0]) VPUNPCKHDQ(dst[1], dst[0], scratch[1]) VPUNPCKLDQ(scratch[1], scratch[0], dst[0]) @@ -323,3 +534,152 @@ func transpose(src, dst []VecVirtual) { VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) } } + +// AVX-512 is not currently supported by avo, so we need to manually define the +// instructions we need + +type maskReg = LabelRef // hack; avo doesn't allow custom Op types + +const K0 maskReg = "K0" +const K1 maskReg = "K1" +const K2 maskReg = "K2" + +func VMOVDQU32_Z(src, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VMOVDQU32", + Operands: []Op{src, dst}, + Inputs: []Op{src}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPBROADCASTD_Z(src, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPBROADCASTD", + Operands: []Op{src, dst}, + Inputs: []Op{src}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPGATHERDD_Z(src, mask, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPGATHERDD", + Operands: []Op{src, mask, dst}, + Inputs: []Op{src, mask}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPSCATTERDD_Z(src, mask, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPSCATTERDD", + Operands: []Op{src, mask, dst}, + Inputs: []Op{src, mask}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPORD_Z(x, y, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPORD", + Operands: []Op{x, y, dst}, + Inputs: []Op{x, y}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPXORD_Z(x, y, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPXORD", + Operands: []Op{x, y, dst}, + Inputs: []Op{x, y}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPXORD_ZB(x, y, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPXORD.BCST", + Operands: []Op{x, y, dst}, + Inputs: []Op{x, y}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPRORD_Z(n, src, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPRORD", + Operands: []Op{n, src, dst}, + Inputs: []Op{n, src}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPSLLD_Z(n, src, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPSLLD", + Operands: []Op{n, src, dst}, + Inputs: []Op{n, src}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPADDD_Z(x, y, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPADDD", + Operands: []Op{x, y, dst}, + Inputs: []Op{x, y}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPADDD_ZB(x, y, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPADDD.BCST", + Operands: []Op{x, y, dst}, + Inputs: []Op{x, y}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPADDD_ZBK(x, y, mask, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPADDD.BCST", + Operands: []Op{x, y, mask, dst}, + Inputs: []Op{x, y, mask}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func KXNORD(x, y, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "KXNORD", + Operands: []Op{x, y, dst}, + Inputs: []Op{x, y}, + Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} + +func VPCMPUD(pred, x, y, dst Op) { + Instruction(&ir.Instruction{ + Opcode: "VPCMPUD", + Operands: []Op{pred, x, y, dst}, + Inputs: []Op{pred, x, y}, + 
Outputs: []Op{dst}, + ISA: []string{"AVX512F"}, + }) +} diff --git a/blake3.go b/blake3.go index da439b3..7bf5dc5 100644 --- a/blake3.go +++ b/blake3.go @@ -21,6 +21,8 @@ const ( blockSize = 64 chunkSize = 1024 + + maxSIMD = 16 // AVX-512 vectors can store 16 words ) var iv = [8]uint32{ @@ -58,10 +60,10 @@ type Hasher struct { size int // output size, for Sum // log(n) set of Merkle subtree roots, at most one per height. - stack [51][8]uint32 // 2^51 * 8 * chunkSize = 2^64 + stack [50][8]uint32 // 2^50 * maxSIMD * chunkSize = 2^64 counter uint64 // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied - buf [8 * chunkSize]byte + buf [maxSIMD * chunkSize]byte buflen int } @@ -83,7 +85,7 @@ func (h *Hasher) pushSubtree(cv [8]uint32) { // rootNode computes the root of the Merkle tree. It does not modify the // stack. func (h *Hasher) rootNode() node { - n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags) + n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*maxSIMD, h.flags) for i := bits.TrailingZeros64(h.counter); i < bits.Len64(h.counter); i++ { if h.hasSubtreeAtHeight(i) { n = parentNode(h.stack[i], chainingValue(n), h.key, h.flags) @@ -98,7 +100,7 @@ func (h *Hasher) Write(p []byte) (int, error) { lenp := len(p) for len(p) > 0 { if h.buflen == len(h.buf) { - n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags) + n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*maxSIMD, h.flags) h.pushSubtree(chainingValue(n)) h.buflen = 0 } @@ -119,8 +121,16 @@ func (h *Hasher) Sum(b []byte) (sum []byte) { sum = make([]byte, total) copy(sum, b) } - // Read into the appended portion of sum - h.XOF().Read(sum[len(b):]) + // Read into the appended portion of sum. Use a low-latency-low-throughput + // path for small digests (requiring a single compression), and a + // high-latency-high-throughput path for large digests. + if dst := sum[len(b):]; len(dst) <= 64 { + var out [64]byte + wordsToBytes(compressNode(h.rootNode()), &out) + copy(dst, out[:]) + } else { + h.XOF().Read(dst) + } return } @@ -224,7 +234,7 @@ func DeriveKey(subKey []byte, ctx string, srcKey []byte) { // bytes. type OutputReader struct { n node - buf [8 * blockSize]byte + buf [maxSIMD * blockSize]byte off uint64 } @@ -238,11 +248,11 @@ func (or *OutputReader) Read(p []byte) (int, error) { } lenp := len(p) for len(p) > 0 { - if or.off%(8*blockSize) == 0 { + if or.off%(maxSIMD*blockSize) == 0 { or.n.counter = or.off / blockSize compressBlocks(&or.buf, or.n) } - n := copy(p, or.buf[or.off%(8*blockSize):]) + n := copy(p, or.buf[or.off%(maxSIMD*blockSize):]) p = p[n:] or.off += uint64(n) } @@ -274,7 +284,7 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) { } or.off = off or.n.counter = uint64(off) / blockSize - if or.off%(8*blockSize) != 0 { + if or.off%(maxSIMD*blockSize) != 0 { compressBlocks(&or.buf, or.n) } // NOTE: or.off >= 2^63 will result in a negative return value. 
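
As a scalar reference for the vectorized counter setup above (the VPADDD/VPCMPUD/K1 sequence in genCompressBlocksAVX512 and genCompressChunksAVX512, and loadCounter on the AVX2 side), here is an illustrative sketch of the per-lane counter computation; it is not part of the package, just a restatement of what the assembly does:

```go
// Sketch: each SIMD lane i hashes with counter+i, stored as separate low and
// high 32-bit words. The assembly adds the seq vector to the broadcast low
// word, builds a mask (K1) of lanes whose low word overflowed via VPCMPUD,
// and carries into the high word with a masked broadcast add.
func laneCounters(counter uint64, lanes int) (lo, hi []uint32) {
	lo = make([]uint32, lanes)
	hi = make([]uint32, lanes)
	for i := range lo {
		lo[i] = uint32(counter) + uint32(i) // VPADDD with seq
		hi[i] = uint32(counter >> 32)
		if lo[i] < uint32(i) { // overflow: VPCMPUD $1 (unsigned <) against seq
			hi[i]++ // masked VPADDD.BCST of seq[1] == 1
		}
	}
	return
}
```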
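
The chunkFlags table built on the stack in compressChunksAVX512/AVX2 replaces the old set_flags global: every block of a 16-block chunk carries the caller's flags, with the chunk-start bit OR'd into block 0 and the chunk-end bit into block 15. A scalar sketch (the values 1 and 2 are taken from the ORL immediates in the generator; the flag names are descriptive, not the package's identifiers):

```go
// Sketch: per-block flags for one 16-block chunk. Values 1 (chunk start) and
// 2 (chunk end) mirror the ORL $0x01 / ORL $0x02 stores into chunkFlags; the
// main loop then broadcasts chunkFlags[block] into the flags state vector.
func chunkBlockFlags(flags uint32) [16]uint32 {
	var cf [16]uint32
	for i := range cf {
		cf[i] = flags
	}
	cf[0] |= 1   // chunk-start flag, first block only
	cf[15] |= 2  // chunk-end flag, last block only
	return cf
}
```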
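
For context on the two output paths distinguished in Hasher.Sum above (a single root compression for digests of 64 bytes or less, versus the XOF path, whose OutputReader buffer holds maxSIMD*blockSize = 1024 bytes and is refilled by compressBlocks), a hedged usage sketch; blake3.New's signature is assumed from the package's existing API:

```go
package main

import (
	"fmt"

	"lukechampine.com/blake3"
)

func main() {
	msg := []byte("hello world")

	// Small digest: Sum256 (and Hasher.Sum for <=64 bytes) needs only a
	// single root compression, so it skips the XOF buffering entirely.
	sum := blake3.Sum256(msg)
	fmt.Printf("%x\n", sum)

	// Large output: the XOF streams arbitrary-length output; the OutputReader
	// buffer is refilled 1024 bytes at a time, so long reads amortize the
	// per-call overhead of the assembly routine.
	h := blake3.New(32, nil) // assumed signature: output size, optional key
	h.Write(msg)
	out := make([]byte, 4096)
	h.XOF().Read(out)
	fmt.Printf("%x...\n", out[:8])
}
```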
diff --git a/blake3_amd64.s b/blake3_amd64.s index 2ab9ea0..4a6d93b 100644 --- a/blake3_amd64.s +++ b/blake3_amd64.s @@ -8,53 +8,33 @@ DATA iv<>+8(SB)/4, $0x3c6ef372 DATA iv<>+12(SB)/4, $0xa54ff53a GLOBL iv<>(SB), RODATA|NOPTR, $16 -DATA block_len<>+0(SB)/4, $0x00000040 -DATA block_len<>+4(SB)/4, $0x00000040 -DATA block_len<>+8(SB)/4, $0x00000040 -DATA block_len<>+12(SB)/4, $0x00000040 -DATA block_len<>+16(SB)/4, $0x00000040 -DATA block_len<>+20(SB)/4, $0x00000040 -DATA block_len<>+24(SB)/4, $0x00000040 -DATA block_len<>+28(SB)/4, $0x00000040 -GLOBL block_len<>(SB), RODATA|NOPTR, $32 +DATA seq<>+0(SB)/4, $0x00000000 +DATA seq<>+4(SB)/4, $0x00000001 +DATA seq<>+8(SB)/4, $0x00000002 +DATA seq<>+12(SB)/4, $0x00000003 +DATA seq<>+16(SB)/4, $0x00000004 +DATA seq<>+20(SB)/4, $0x00000005 +DATA seq<>+24(SB)/4, $0x00000006 +DATA seq<>+28(SB)/4, $0x00000007 +DATA seq<>+32(SB)/4, $0x00000008 +DATA seq<>+36(SB)/4, $0x00000009 +DATA seq<>+40(SB)/4, $0x0000000a +DATA seq<>+44(SB)/4, $0x0000000b +DATA seq<>+48(SB)/4, $0x0000000c +DATA seq<>+52(SB)/4, $0x0000000d +DATA seq<>+56(SB)/4, $0x0000000e +DATA seq<>+60(SB)/4, $0x0000000f +GLOBL seq<>(SB), RODATA|NOPTR, $64 -DATA stride_1024<>+0(SB)/4, $0x00000000 -DATA stride_1024<>+4(SB)/4, $0x00000400 -DATA stride_1024<>+8(SB)/4, $0x00000800 -DATA stride_1024<>+12(SB)/4, $0x00000c00 -DATA stride_1024<>+16(SB)/4, $0x00001000 -DATA stride_1024<>+20(SB)/4, $0x00001400 -DATA stride_1024<>+24(SB)/4, $0x00001800 -DATA stride_1024<>+28(SB)/4, $0x00001c00 -GLOBL stride_1024<>(SB), RODATA|NOPTR, $32 - -DATA increment_counter<>+0(SB)/8, $0x0000000000000000 -DATA increment_counter<>+8(SB)/8, $0x0000000000000001 -DATA increment_counter<>+16(SB)/8, $0x0000000000000002 -DATA increment_counter<>+24(SB)/8, $0x0000000000000003 -DATA increment_counter<>+32(SB)/8, $0x0000000000000004 -DATA increment_counter<>+40(SB)/8, $0x0000000000000005 -DATA increment_counter<>+48(SB)/8, $0x0000000000000006 -DATA increment_counter<>+56(SB)/8, $0x0000000000000007 -GLOBL increment_counter<>(SB), RODATA|NOPTR, $64 - -DATA set_flags<>+0(SB)/4, $0x00000001 -DATA set_flags<>+4(SB)/4, $0x00000000 -DATA set_flags<>+8(SB)/4, $0x00000000 -DATA set_flags<>+12(SB)/4, $0x00000000 -DATA set_flags<>+16(SB)/4, $0x00000000 -DATA set_flags<>+20(SB)/4, $0x00000000 -DATA set_flags<>+24(SB)/4, $0x00000000 -DATA set_flags<>+28(SB)/4, $0x00000000 -DATA set_flags<>+32(SB)/4, $0x00000000 -DATA set_flags<>+36(SB)/4, $0x00000000 -DATA set_flags<>+40(SB)/4, $0x00000000 -DATA set_flags<>+44(SB)/4, $0x00000000 -DATA set_flags<>+48(SB)/4, $0x00000000 -DATA set_flags<>+52(SB)/4, $0x00000000 -DATA set_flags<>+56(SB)/4, $0x00000000 -DATA set_flags<>+60(SB)/4, $0x00000002 -GLOBL set_flags<>(SB), RODATA|NOPTR, $64 +DATA seq64<>+0(SB)/8, $0x0000000000000000 +DATA seq64<>+8(SB)/8, $0x0000000000000001 +DATA seq64<>+16(SB)/8, $0x0000000000000002 +DATA seq64<>+24(SB)/8, $0x0000000000000003 +DATA seq64<>+32(SB)/8, $0x0000000000000004 +DATA seq64<>+40(SB)/8, $0x0000000000000005 +DATA seq64<>+48(SB)/8, $0x0000000000000006 +DATA seq64<>+56(SB)/8, $0x0000000000000007 +GLOBL seq64<>(SB), RODATA|NOPTR, $64 DATA shuffle_rot8<>+0(SB)/4, $0x00030201 DATA shuffle_rot8<>+4(SB)/4, $0x04070605 @@ -76,6 +56,1821 @@ DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32 +// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) +// Requires: AVX512F +TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40 + MOVQ 
out+0(FP), AX + MOVQ block+8(FP), CX + MOVQ cv+16(FP), DX + + // Initialize block vectors + VPBROADCASTD (CX), Z1 + VPBROADCASTD 4(CX), Z3 + VPBROADCASTD 8(CX), Z5 + VPBROADCASTD 12(CX), Z7 + VPBROADCASTD 16(CX), Z9 + VPBROADCASTD 20(CX), Z11 + VPBROADCASTD 24(CX), Z13 + VPBROADCASTD 28(CX), Z15 + VPBROADCASTD 32(CX), Z17 + VPBROADCASTD 36(CX), Z19 + VPBROADCASTD 40(CX), Z21 + VPBROADCASTD 44(CX), Z23 + VPBROADCASTD 48(CX), Z25 + VPBROADCASTD 52(CX), Z27 + VPBROADCASTD 56(CX), Z29 + VPBROADCASTD 60(CX), Z31 + + // Initialize state vectors + VPBROADCASTD (DX), Z0 + VPBROADCASTD 4(DX), Z2 + VPBROADCASTD 8(DX), Z4 + VPBROADCASTD 12(DX), Z6 + VPBROADCASTD 16(DX), Z8 + VPBROADCASTD 20(DX), Z10 + VPBROADCASTD 24(DX), Z12 + VPBROADCASTD 28(DX), Z14 + VPBROADCASTD iv<>+0(SB), Z16 + VPBROADCASTD iv<>+4(SB), Z18 + VPBROADCASTD iv<>+8(SB), Z20 + VPBROADCASTD iv<>+12(SB), Z22 + VPBROADCASTD counter+24(FP), Z24 + VPADDD seq<>+0(SB), Z24, Z24 + VPCMPUD $0x01, seq<>+0(SB), Z24, K1 + VPBROADCASTD counter+28(FP), Z26 + VPADDD.BCST seq<>+4(SB), Z26, K1, Z26 + VPBROADCASTD blockLen+32(FP), Z28 + VPBROADCASTD flags+36(FP), Z30 + + // Round 1 + VPADDD Z0, Z8, Z0 + VPADDD Z1, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z3, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z5, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z7, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z9, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z11, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z17, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z19, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z25, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z27, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z29, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z31, 
Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 2 + VPADDD Z0, Z8, Z0 + VPADDD Z5, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z13, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z7, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z15, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z1, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z9, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z27, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z3, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z23, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z19, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z29, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z31, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 3 + VPADDD Z0, Z8, Z0 + VPADDD Z7, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z9, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z27, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z5, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, 
Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z29, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z13, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z11, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z19, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z1, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z23, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z31, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z3, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 4 + VPADDD Z0, Z8, Z0 + VPADDD Z21, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z15, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z19, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z29, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z7, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z27, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z31, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z9, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z1, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z5, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, 
Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z11, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z17, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z3, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 5 + VPADDD Z0, Z8, Z0 + VPADDD Z25, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z27, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z19, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z31, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z21, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z29, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z15, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z5, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z7, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z1, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z3, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z9, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 6 + VPADDD Z0, Z8, Z0 + VPADDD Z19, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z29, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, 
Z10, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z17, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z25, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z31, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z3, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z27, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z7, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z1, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z5, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z13, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z9, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 7 + VPADDD Z0, Z8, Z0 + VPADDD Z23, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z31, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z1, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z3, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z19, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z29, Z0, Z0 + VPXORD Z30, Z0, 
Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z21, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z5, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z7, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z9, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z27, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Finalize CVs + VPXORD Z0, Z16, Z0 + VPXORD Z2, Z18, Z2 + VPXORD Z4, Z20, Z4 + VPXORD Z6, Z22, Z6 + VPXORD Z8, Z24, Z8 + VPXORD Z10, Z26, Z10 + VPXORD Z12, Z28, Z12 + VPXORD Z14, Z30, Z14 + VPXORD.BCST (DX), Z16, Z16 + VPXORD.BCST 4(DX), Z18, Z18 + VPXORD.BCST 8(DX), Z20, Z20 + VPXORD.BCST 12(DX), Z22, Z22 + VPXORD.BCST 16(DX), Z24, Z24 + VPXORD.BCST 20(DX), Z26, Z26 + VPXORD.BCST 24(DX), Z28, Z28 + VPXORD.BCST 28(DX), Z30, Z30 + VMOVDQU32 seq<>+0(SB), Z1 + VPSLLD $0x06, Z1, Z1 + KXNORD K1, K1, K1 + VPSCATTERDD Z0, K1, (AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z2, K1, 4(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z4, K1, 8(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z6, K1, 12(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z8, K1, 16(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z10, K1, 20(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z12, K1, 24(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z14, K1, 28(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z16, K1, 32(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z18, K1, 36(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z20, K1, 40(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z22, K1, 44(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z24, K1, 48(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z26, K1, 52(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z28, K1, 56(AX)(Z1*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z30, K1, 60(AX)(Z1*1) + RET + +// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32) +// Requires: AVX512F +TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-40 + MOVQ cvs+0(FP), AX + MOVQ buf+8(FP), CX + MOVQ key+16(FP), DX + + // Initialize counter + VPBROADCASTD counter+24(FP), Z0 + VPADDD seq<>+0(SB), Z0, Z0 + VPCMPUD $0x01, seq<>+0(SB), Z0, K1 + VPBROADCASTD counter+28(FP), Z2 + VPADDD.BCST seq<>+4(SB), Z2, K1, Z2 + VMOVDQU32 Z0, (SP) + VMOVDQU32 Z2, 64(SP) + + // Initialize flags + VPBROADCASTD flags+32(FP), Z0 + VMOVDQU32 Z0, 128(SP) + ORL $0x01, 128(SP) + ORL $0x02, 188(SP) + + // Load key + VPBROADCASTD (DX), Z0 + VPBROADCASTD 4(DX), Z2 + VPBROADCASTD 8(DX), Z4 + VPBROADCASTD 12(DX), Z6 + VPBROADCASTD 16(DX), Z8 + VPBROADCASTD 20(DX), Z10 + VPBROADCASTD 24(DX), Z12 + VPBROADCASTD 28(DX), Z14 + + // Loop index + XORQ DX, DX + +loop: + // Load transposed block + VMOVDQU32 seq<>+0(SB), Z16 + VPSLLD $0x0a, Z16, Z16 + 
KXNORD K1, K1, K1 + VPGATHERDD (CX)(Z16*1), K1, Z1 + KXNORD K1, K1, K1 + VPGATHERDD 4(CX)(Z16*1), K1, Z3 + KXNORD K1, K1, K1 + VPGATHERDD 8(CX)(Z16*1), K1, Z5 + KXNORD K1, K1, K1 + VPGATHERDD 12(CX)(Z16*1), K1, Z7 + KXNORD K1, K1, K1 + VPGATHERDD 16(CX)(Z16*1), K1, Z9 + KXNORD K1, K1, K1 + VPGATHERDD 20(CX)(Z16*1), K1, Z11 + KXNORD K1, K1, K1 + VPGATHERDD 24(CX)(Z16*1), K1, Z13 + KXNORD K1, K1, K1 + VPGATHERDD 28(CX)(Z16*1), K1, Z15 + KXNORD K1, K1, K1 + VPGATHERDD 32(CX)(Z16*1), K1, Z17 + KXNORD K1, K1, K1 + VPGATHERDD 36(CX)(Z16*1), K1, Z19 + KXNORD K1, K1, K1 + VPGATHERDD 40(CX)(Z16*1), K1, Z21 + KXNORD K1, K1, K1 + VPGATHERDD 44(CX)(Z16*1), K1, Z23 + KXNORD K1, K1, K1 + VPGATHERDD 48(CX)(Z16*1), K1, Z25 + KXNORD K1, K1, K1 + VPGATHERDD 52(CX)(Z16*1), K1, Z27 + KXNORD K1, K1, K1 + VPGATHERDD 56(CX)(Z16*1), K1, Z29 + KXNORD K1, K1, K1 + VPGATHERDD 60(CX)(Z16*1), K1, Z31 + ADDQ $0x40, CX + + // Reload state vectors (other than CVs) + VPBROADCASTD iv<>+0(SB), Z16 + VPBROADCASTD iv<>+4(SB), Z18 + VPBROADCASTD iv<>+8(SB), Z20 + VPBROADCASTD iv<>+12(SB), Z22 + VMOVDQU32 (SP), Z24 + VMOVDQU32 64(SP), Z26 + VPBROADCASTD seq<>+4(SB), Z28 + VPSLLD $0x06, Z28, Z28 + VPBROADCASTD 128(SP)(DX*4), Z30 + + // Round 1 + VPADDD Z0, Z8, Z0 + VPADDD Z1, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z3, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z5, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z7, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z9, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z11, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z17, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z19, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z25, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z27, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z29, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, 
Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z31, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 2 + VPADDD Z0, Z8, Z0 + VPADDD Z5, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z13, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z7, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z15, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z1, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z9, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z27, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z3, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z23, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z19, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z29, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z31, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 3 + VPADDD Z0, Z8, Z0 + VPADDD Z7, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z9, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z27, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, 
Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z5, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z29, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z13, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z11, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z19, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z1, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z23, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z31, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z3, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 4 + VPADDD Z0, Z8, Z0 + VPADDD Z21, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z15, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z19, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z29, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z7, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z27, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z31, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z9, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z1, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z5, 
Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z11, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z17, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z3, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 5 + VPADDD Z0, Z8, Z0 + VPADDD Z25, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z27, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z19, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z31, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z21, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z29, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z15, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z5, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z7, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z1, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z3, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z9, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 6 + VPADDD Z0, Z8, Z0 + VPADDD Z19, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z29, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, 
Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z23, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z17, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z25, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z31, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z3, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z27, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z7, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z1, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z21, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z5, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z13, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z9, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Round 7 + VPADDD Z0, Z8, Z0 + VPADDD Z23, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z0, Z8, Z0 + VPADDD Z31, Z0, Z0 + VPXORD Z24, Z0, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z16, Z24, Z16 + VPXORD Z8, Z16, Z8 + VPRORD $0x07, Z8, Z8 + VPADDD Z2, Z10, Z2 + VPADDD Z11, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z2, Z10, Z2 + VPADDD Z1, Z2, Z2 + VPXORD Z26, Z2, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z18, Z26, Z18 + VPXORD Z10, Z18, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z4, Z12, Z4 + VPADDD Z3, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z4, Z12, Z4 + VPADDD Z19, Z4, Z4 + VPXORD Z28, Z4, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z20, Z28, Z20 + VPXORD Z12, Z20, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z6, Z14, Z6 + VPADDD Z17, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z6, Z14, Z6 + VPADDD Z13, Z6, Z6 + VPXORD Z30, Z6, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z22, Z30, Z22 + VPXORD Z14, Z22, 
Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z0, Z10, Z0 + VPADDD Z29, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x10, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x0c, Z10, Z10 + VPADDD Z0, Z10, Z0 + VPADDD Z21, Z0, Z0 + VPXORD Z30, Z0, Z30 + VPRORD $0x08, Z30, Z30 + VPADDD Z20, Z30, Z20 + VPXORD Z10, Z20, Z10 + VPRORD $0x07, Z10, Z10 + VPADDD Z2, Z12, Z2 + VPADDD Z5, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x10, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x0c, Z12, Z12 + VPADDD Z2, Z12, Z2 + VPADDD Z25, Z2, Z2 + VPXORD Z24, Z2, Z24 + VPRORD $0x08, Z24, Z24 + VPADDD Z22, Z24, Z22 + VPXORD Z12, Z22, Z12 + VPRORD $0x07, Z12, Z12 + VPADDD Z4, Z14, Z4 + VPADDD Z7, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x10, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x0c, Z14, Z14 + VPADDD Z4, Z14, Z4 + VPADDD Z9, Z4, Z4 + VPXORD Z26, Z4, Z26 + VPRORD $0x08, Z26, Z26 + VPADDD Z16, Z26, Z16 + VPXORD Z14, Z16, Z14 + VPRORD $0x07, Z14, Z14 + VPADDD Z6, Z8, Z6 + VPADDD Z15, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x10, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x0c, Z8, Z8 + VPADDD Z6, Z8, Z6 + VPADDD Z27, Z6, Z6 + VPXORD Z28, Z6, Z28 + VPRORD $0x08, Z28, Z28 + VPADDD Z18, Z28, Z18 + VPXORD Z8, Z18, Z8 + VPRORD $0x07, Z8, Z8 + + // Finalize CVs + VPXORD Z0, Z16, Z0 + VPXORD Z2, Z18, Z2 + VPXORD Z4, Z20, Z4 + VPXORD Z6, Z22, Z6 + VPXORD Z8, Z24, Z8 + VPXORD Z10, Z26, Z10 + VPXORD Z12, Z28, Z12 + VPXORD Z14, Z30, Z14 + + // Loop + INCQ DX + CMPQ DX, $0x00000010 + JNE loop + + // Finished; transpose CVs + VMOVDQU32 seq<>+0(SB), Z16 + VPSLLD $0x05, Z16, Z16 + KXNORD K1, K1, K1 + VPSCATTERDD Z0, K1, (AX)(Z16*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z2, K1, 4(AX)(Z16*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z4, K1, 8(AX)(Z16*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z6, K1, 12(AX)(Z16*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z8, K1, 16(AX)(Z16*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z10, K1, 20(AX)(Z16*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z12, K1, 24(AX)(Z16*1) + KXNORD K1, K1, K1 + VPSCATTERDD Z14, K1, 28(AX)(Z16*1) + RET + // func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) // Requires: AVX, AVX2 TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40 @@ -132,8 +1927,8 @@ TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40 VPBROADCASTD iv<>+12(SB), Y11 VPBROADCASTQ counter+24(FP), Y12 VPBROADCASTQ counter+24(FP), Y13 - VPADDQ increment_counter<>+0(SB), Y12, Y12 - VPADDQ increment_counter<>+32(SB), Y13, Y13 + VPADDQ seq64<>+0(SB), Y12, Y12 + VPADDQ seq64<>+32(SB), Y13, Y13 VPUNPCKLDQ Y13, Y12, Y14 VPUNPCKHDQ Y13, Y12, Y15 VPUNPCKLDQ Y15, Y14, Y12 @@ -1350,30 +3145,31 @@ TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40 // Initialize counter VPBROADCASTQ counter+24(FP), Y12 VPBROADCASTQ counter+24(FP), Y13 - VPADDQ increment_counter<>+0(SB), Y12, Y12 - VPADDQ increment_counter<>+32(SB), Y13, Y13 + VPADDQ seq64<>+0(SB), Y12, Y12 + VPADDQ seq64<>+32(SB), Y13, Y13 VPUNPCKLDQ Y13, Y12, Y14 VPUNPCKHDQ Y13, Y12, Y15 VPUNPCKLDQ Y15, Y14, Y12 VPUNPCKHDQ Y15, Y14, Y13 VPERMQ $0xd8, Y12, Y12 VPERMQ $0xd8, Y13, Y13 - VMOVDQU Y12, 544(SP) - VMOVDQU Y13, 576(SP) + VMOVDQU Y12, 512(SP) + VMOVDQU Y13, 544(SP) // Initialize flags VPBROADCASTD flags+32(FP), Y14 - VPOR set_flags<>+0(SB), Y14, Y15 - VMOVDQU Y15, 608(SP) - VPOR set_flags<>+32(SB), Y14, Y15 - VMOVDQU Y15, 640(SP) + VMOVDQU Y14, 576(SP) + VMOVDQU Y14, 608(SP) + ORL $0x01, 576(SP) + ORL $0x02, 636(SP) // Loop index XORQ DX, DX loop: // Load transposed block - VMOVDQU 
stride_1024<>+0(SB), Y9 + VMOVDQU seq<>+0(SB), Y9 + VPSLLD $0x0a, Y9, Y9 VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, (CX)(Y9*1), Y10 VMOVDQU Y10, (SP) @@ -1429,10 +3225,1220 @@ loop: VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 - VMOVDQU 544(SP), Y12 - VMOVDQU 576(SP), Y13 - VMOVDQU block_len<>+0(SB), Y14 - VPBROADCASTD 608(SP)(DX*4), Y15 + VMOVDQU 512(SP), Y12 + VMOVDQU 544(SP), Y13 + VPBROADCASTD seq<>+4(SB), Y14 + VPSLLD $0x06, Y14, Y14 + VPBROADCASTD 576(SP)(DX*4), Y15 + VMOVDQU Y8, 640(SP) + + // Round 1 + VPADDD Y0, Y4, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 256(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 
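+	// Note on the rotates and register use in these generated AVX2 rounds:
+	// the 16- and 8-bit rotates are done with VPSHUFB using the shuffle_rot16
+	// and shuffle_rot8 byte masks, while the 12- and 7-bit rotates fall back
+	// to VPSRLD/VPSLLD/VPOR, since AVX2 has no VPRORD. With only 16 YMM
+	// registers available, one state word is kept spilled at 640(SP), and Y8
+	// is shared between that word (reloaded and stored around each use) and
+	// the scratch register for the shift-based rotates.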
+ VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 2 + VPADDD Y0, Y4, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 224(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + 
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 3 + VPADDD Y0, Y4, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 160(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 352(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 
480(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 4 + VPADDD Y0, Y4, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR 
Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 5 + VPADDD Y0, Y4, Y0 + VPADDD 384(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 480(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 320(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD 
$0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 6 + VPADDD Y0, Y4, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, 
Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 192(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 7 + VPADDD Y0, Y4, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 480(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 
640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 640(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 640(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VMOVDQU 640(SP), Y8 + + // Finalize CVs + VPXOR Y0, Y8, Y0 + VPXOR Y1, Y9, Y1 + VPXOR Y2, Y10, Y2 + VPXOR Y3, Y11, Y3 + VPXOR Y4, Y12, Y4 + VPXOR Y5, Y13, Y5 + VPXOR Y6, Y14, Y6 + VPXOR Y7, Y15, Y7 + + // Loop + INCQ DX + CMPQ DX, $0x00000010 + JNE loop + + // Finished; transpose CVs + VPUNPCKLDQ Y1, Y0, Y8 + VPUNPCKHDQ Y1, Y0, Y9 + VPUNPCKLDQ Y3, Y2, Y10 + VPUNPCKHDQ Y3, Y2, Y11 + VPUNPCKLDQ Y5, Y4, Y12 + VPUNPCKHDQ Y5, Y4, Y13 + VPUNPCKLDQ Y7, Y6, Y14 + VPUNPCKHDQ Y7, Y6, Y15 + VPUNPCKLQDQ Y10, Y8, Y0 + VPUNPCKHQDQ Y10, Y8, Y1 + VPUNPCKLQDQ Y11, Y9, Y2 + VPUNPCKHQDQ Y11, Y9, Y3 + VPUNPCKLQDQ Y14, Y12, Y4 + VPUNPCKHQDQ Y14, Y12, Y5 + VPUNPCKLQDQ Y15, Y13, Y6 + VPUNPCKHQDQ Y15, Y13, Y7 + VPERM2I128 $0x20, Y4, Y0, Y8 + VPERM2I128 $0x31, Y4, Y0, Y12 + VPERM2I128 $0x20, Y5, Y1, Y9 + VPERM2I128 $0x31, Y5, Y1, Y13 + VPERM2I128 $0x20, Y6, Y2, Y10 + VPERM2I128 $0x31, Y6, Y2, Y14 + VPERM2I128 $0x20, Y7, Y3, Y11 + VPERM2I128 $0x31, Y7, Y3, Y15 + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) + VMOVDQU Y10, 64(AX) + VMOVDQU Y11, 96(AX) + VMOVDQU Y12, 128(AX) + VMOVDQU Y13, 160(AX) + VMOVDQU Y14, 192(AX) + VMOVDQU Y15, 224(AX) + RET + +// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32) +// Requires: AVX, AVX2 +TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-32 + MOVQ parents+0(FP), AX + MOVQ cvs+8(FP), CX + MOVQ key+16(FP), DX + + // Load transposed block + VMOVDQU seq<>+0(SB), Y9 + VPSLLD $0x06, Y9, Y9 + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, (CX)(Y9*1), Y10 + VMOVDQU Y10, (SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 4(CX)(Y9*1), Y10 + VMOVDQU Y10, 32(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 8(CX)(Y9*1), Y10 + VMOVDQU Y10, 64(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 12(CX)(Y9*1), Y10 + VMOVDQU Y10, 96(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 16(CX)(Y9*1), Y10 + VMOVDQU Y10, 128(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 20(CX)(Y9*1), Y10 + VMOVDQU Y10, 160(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 24(CX)(Y9*1), Y10 + VMOVDQU Y10, 192(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 28(CX)(Y9*1), Y10 + VMOVDQU Y10, 224(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 32(CX)(Y9*1), Y10 + VMOVDQU Y10, 256(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 36(CX)(Y9*1), Y10 + VMOVDQU Y10, 288(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 40(CX)(Y9*1), Y10 + VMOVDQU Y10, 320(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 44(CX)(Y9*1), Y10 + VMOVDQU Y10, 352(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 48(CX)(Y9*1), Y10 + VMOVDQU Y10, 384(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 52(CX)(Y9*1), Y10 + VMOVDQU Y10, 416(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 56(CX)(Y9*1), Y10 + VMOVDQU Y10, 448(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 60(CX)(Y9*1), Y10 + VMOVDQU Y10, 480(SP) + + // 
Initialize state vectors + VPBROADCASTD (DX), Y0 + VPBROADCASTD 4(DX), Y1 + VPBROADCASTD 8(DX), Y2 + VPBROADCASTD 12(DX), Y3 + VPBROADCASTD 16(DX), Y4 + VPBROADCASTD 20(DX), Y5 + VPBROADCASTD 24(DX), Y6 + VPBROADCASTD 28(DX), Y7 + VPBROADCASTD iv<>+0(SB), Y8 + VPBROADCASTD iv<>+4(SB), Y9 + VPBROADCASTD iv<>+8(SB), Y10 + VPBROADCASTD iv<>+12(SB), Y11 + VPXOR Y12, Y12, Y12 + VPXOR Y13, Y13, Y13 + VPBROADCASTD seq<>+4(SB), Y14 + VPSLLD $0x06, Y14, Y14 + ORL $0x04, flags+24(FP) + VPBROADCASTD flags+24(FP), Y15 VMOVDQU Y8, 512(SP) // Round 1 @@ -2515,21 +5521,14 @@ loop: VMOVDQU 512(SP), Y8 // Finalize CVs - VPXOR Y0, Y8, Y0 - VPXOR Y1, Y9, Y1 - VPXOR Y2, Y10, Y2 - VPXOR Y3, Y11, Y3 - VPXOR Y4, Y12, Y4 - VPXOR Y5, Y13, Y5 - VPXOR Y6, Y14, Y6 - VPXOR Y7, Y15, Y7 - - // Loop - INCQ DX - CMPQ DX, $0x00000010 - JNE loop - - // Finished; transpose CVs + VPXOR Y0, Y8, Y0 + VPXOR Y1, Y9, Y1 + VPXOR Y2, Y10, Y2 + VPXOR Y3, Y11, Y3 + VPXOR Y4, Y12, Y4 + VPXOR Y5, Y13, Y5 + VPXOR Y6, Y14, Y6 + VPXOR Y7, Y15, Y7 VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 diff --git a/blake3_test.go b/blake3_test.go index 7bfe6af..e08de27 100644 --- a/blake3_test.go +++ b/blake3_test.go @@ -33,7 +33,7 @@ var testVectors = func() (vecs struct { }() var testInput = func() []byte { - input := make([]byte, 1<<15) + input := make([]byte, 1e6) for i := range input { input[i] = byte(i % 251) } diff --git a/compress_amd64.go b/compress_amd64.go index ea19611..fa2eb11 100644 --- a/compress_amd64.go +++ b/compress_amd64.go @@ -1,27 +1,32 @@ package blake3 -import ( - "unsafe" - - "golang.org/x/sys/cpu" -) +import "unsafe" //go:generate go run avo/gen.go -out blake3_amd64.s //go:noescape -func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) +func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32) + +//go:noescape +func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32) + +//go:noescape +func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) //go:noescape func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) +//go:noescape +func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32) + func compressNode(n node) (out [16]uint32) { compressNodeGeneric(&out, n) return } -func compressBufferLarge(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { - var cvs [8][8]uint32 - compressChunksAVX2(&cvs, buf, key, counter, flags) +func compressBufferAVX512(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { + var cvs [maxSIMD][8]uint32 + compressChunksAVX512(&cvs, buf, key, counter, flags) numChunks := uint64(buflen / chunkSize) if buflen%chunkSize != 0 { // use non-asm for remainder @@ -29,13 +34,33 @@ func compressBufferLarge(buf *[8192]byte, buflen int, key *[8]uint32, counter ui cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags)) numChunks++ } - return mergeSubtrees(cvs[:numChunks], key, flags) + return mergeSubtrees(&cvs, numChunks, key, flags) } -func compressBuffer(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { +func compressBufferAVX2(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node 
{ + var cvs [maxSIMD][8]uint32 + cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs)) + bufHalves := (*[2][8 * chunkSize]byte)(unsafe.Pointer(buf)) + compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags) + numChunks := uint64(buflen / chunkSize) + if numChunks > 8 { + compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags) + } + if buflen%chunkSize != 0 { + // use non-asm for remainder + partialChunk := buf[buflen-buflen%chunkSize : buflen] + cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags)) + numChunks++ + } + return mergeSubtrees(&cvs, numChunks, key, flags) +} + +func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { switch { - case cpu.X86.HasAVX2 && buflen >= chunkSize*2: - return compressBufferLarge(buf, buflen, key, counter, flags) + case haveAVX512 && buflen >= chunkSize*2: + return compressBufferAVX512(buf, buflen, key, counter, flags) + case haveAVX2 && buflen >= chunkSize*2: + return compressBufferAVX2(buf, buflen, key, counter, flags) default: return compressBufferGeneric(buf, buflen, key, counter, flags) } @@ -74,14 +99,36 @@ func hashBlock(out *[64]byte, buf []byte) { }) } -func compressBlocks(out *[512]byte, n node) { +func compressBlocks(out *[maxSIMD * blockSize]byte, n node) { switch { - case cpu.X86.HasAVX2: - compressBlocksAVX2(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags) + case haveAVX512: + compressBlocksAVX512(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags) + case haveAVX2: + outs := (*[2][512]byte)(unsafe.Pointer(out)) + compressBlocksAVX2(&outs[0], &n.block, &n.cv, n.counter, n.blockLen, n.flags) + compressBlocksAVX2(&outs[1], &n.block, &n.cv, n.counter+8, n.blockLen, n.flags) default: - compressBlocksGeneric((*[8][64]byte)(unsafe.Pointer(out)), n) + outs := (*[maxSIMD][64]byte)(unsafe.Pointer(out)) + compressBlocksGeneric(outs, n) } +} +func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node { + if !haveAVX2 { + return mergeSubtreesGeneric(cvs, numCVs, key, flags) + } + for numCVs > 2 { + if numCVs%2 == 0 { + compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags) + } else { + keep := cvs[numCVs-1] + compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags) + cvs[numCVs/2] = keep + numCVs++ + } + numCVs /= 2 + } + return parentNode(cvs[0], cvs[1], *key, flags) } func wordsToBytes(words [16]uint32, block *[64]byte) { diff --git a/compress_generic.go b/compress_generic.go index 4581b8a..b033b65 100644 --- a/compress_generic.go +++ b/compress_generic.go @@ -5,19 +5,19 @@ import ( "math/bits" ) -func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { - a += b + mx - d = bits.RotateLeft32(d^a, -16) - c += d - b = bits.RotateLeft32(b^c, -12) - a += b + my - d = bits.RotateLeft32(d^a, -8) - c += d - b = bits.RotateLeft32(b^c, -7) - return a, b, c, d -} - func compressNodeGeneric(out *[16]uint32, n node) { + g := func(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { + a += b + mx + d = bits.RotateLeft32(d^a, -16) + c += d + b = bits.RotateLeft32(b^c, -12) + a += b + my + d = bits.RotateLeft32(d^a, -8) + c += d + b = bits.RotateLeft32(b^c, -7) + return a, b, c, d + } + // NOTE: we unroll all of the rounds, as well as the permutations that occur // between rounds. 
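The unrolled round bodies are elided from this hunk, so the standalone sketch below (illustrative only, not part of the diff; the package name and the `round` helper are mine) shows what each of the seven unrolled rounds in `compressNodeGeneric` computes, reusing the `g` quarter-round exactly as defined above. The column/diagonal ordering is the standard BLAKE3 round structure, the same ordering the generated AVX2 and AVX-512 rounds follow, with each state word widened to a vector so that 8 or 16 inputs are mixed at once.

```go
package blake3sketch // illustrative only; not part of the package

import "math/bits"

// g mirrors the quarter-round closure defined in compressNodeGeneric above.
func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
	a += b + mx
	d = bits.RotateLeft32(d^a, -16)
	c += d
	b = bits.RotateLeft32(b^c, -12)
	a += b + my
	d = bits.RotateLeft32(d^a, -8)
	c += d
	b = bits.RotateLeft32(b^c, -7)
	return a, b, c, d
}

// round is a loop-based view of one unrolled round: g mixes the four columns
// of the 4x4 state, then the four diagonals, consuming all 16 message words.
func round(v, m *[16]uint32) {
	// Columns.
	v[0], v[4], v[8], v[12] = g(v[0], v[4], v[8], v[12], m[0], m[1])
	v[1], v[5], v[9], v[13] = g(v[1], v[5], v[9], v[13], m[2], m[3])
	v[2], v[6], v[10], v[14] = g(v[2], v[6], v[10], v[14], m[4], m[5])
	v[3], v[7], v[11], v[15] = g(v[3], v[7], v[11], v[15], m[6], m[7])
	// Diagonals.
	v[0], v[5], v[10], v[15] = g(v[0], v[5], v[10], v[15], m[8], m[9])
	v[1], v[6], v[11], v[12] = g(v[1], v[6], v[11], v[12], m[10], m[11])
	v[2], v[7], v[8], v[13] = g(v[2], v[7], v[8], v[13], m[12], m[13])
	v[3], v[4], v[9], v[14] = g(v[3], v[4], v[9], v[14], m[14], m[15])
}
```

Between rounds, BLAKE3 permutes the message words with a fixed permutation; unrolling both the rounds and the permutations, as the NOTE says, lets those permuted indices be baked in as constants instead of repeatedly permuting a slice at run time.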
@@ -102,56 +102,42 @@ func compressNodeGeneric(out *[16]uint32, n node) { } } -func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) { - if buflen <= chunkSize { - return compressChunk(buf[:buflen], key, counter, flags) - } - cvs := make([][8]uint32, 0, 8) - for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; { - n := compressChunk(bb.Next(chunkSize), key, counter, flags) - cvs = append(cvs, chainingValue(n)) - counter++ - } - return mergeSubtrees(cvs, key, flags) -} - -func compressBlocksGeneric(outs *[8][64]byte, n node) { - for i := range outs { - wordsToBytes(compressNode(n), &outs[i]) - n.counter++ - } -} - func chainingValue(n node) (cv [8]uint32) { full := compressNode(n) copy(cv[:], full[:]) return } -func mergeSubtrees(cvs [][8]uint32, key *[8]uint32, flags uint32) node { - parent := func(l, r [8]uint32) [8]uint32 { - return chainingValue(parentNode(l, r, *key, flags)) +func compressBufferGeneric(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) { + if buflen <= chunkSize { + return compressChunk(buf[:buflen], key, counter, flags) } - switch len(cvs) { - case 8: - cvs[6] = parent(cvs[6], cvs[7]) - fallthrough - case 7: - cvs[4], cvs[5] = parent(cvs[4], cvs[5]), cvs[6] - fallthrough - case 6: - cvs[4] = parent(cvs[4], cvs[5]) - fallthrough - case 5: - fallthrough - case 4: - cvs[2] = parent(cvs[2], cvs[3]) - fallthrough - case 3: - cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[2] + var cvs [maxSIMD][8]uint32 + var numCVs uint64 + for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; numCVs++ { + cvs[numCVs] = chainingValue(compressChunk(bb.Next(chunkSize), key, counter+numCVs, flags)) } - if len(cvs) > 4 { - cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[4] + return mergeSubtrees(&cvs, numCVs, key, flags) +} + +func compressBlocksGeneric(outs *[maxSIMD][64]byte, n node) { + for i := range outs { + wordsToBytes(compressNode(n), &outs[i]) + n.counter++ + } +} + +func mergeSubtreesGeneric(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node { + for numCVs > 2 { + rem := numCVs / 2 + for i := range cvs[:rem] { + cvs[i] = chainingValue(parentNode(cvs[i*2], cvs[i*2+1], *key, flags)) + } + if numCVs%2 != 0 { + cvs[rem] = cvs[rem*2] + rem++ + } + numCVs = rem } return parentNode(cvs[0], cvs[1], *key, flags) } diff --git a/compress_noasm.go b/compress_noasm.go index 5efa191..0d30ba2 100644 --- a/compress_noasm.go +++ b/compress_noasm.go @@ -9,8 +9,8 @@ func compressNode(n node) (out [16]uint32) { return } -func compressBuffer(buf *[8192]byte, length int, key *[8]uint32, counter uint64, flags uint32) node { - return compressBufferGeneric(buf, length, key, counter, flags) +func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { + return compressBufferGeneric(buf, buflen, key, counter, flags) } func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node { @@ -51,14 +51,18 @@ func hashBlock(out *[64]byte, buf []byte) { wordsToBytes(words, out) } -func compressBlocks(out *[512]byte, n node) { - var outs [8][64]byte +func compressBlocks(out *[maxSIMD * blockSize]byte, n node) { + var outs [maxSIMD][64]byte compressBlocksGeneric(&outs, n) for i := range outs { copy(out[i*64:], outs[i][:]) } } +func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node { + return mergeSubtreesGeneric(cvs, numCVs, key, flags) +} + func bytesToWords(bytes [64]byte, words 
*[16]uint32) { for i := range words { words[i] = binary.LittleEndian.Uint32(bytes[4*i:]) diff --git a/cpu.go b/cpu.go new file mode 100644 index 0000000..76fee54 --- /dev/null +++ b/cpu.go @@ -0,0 +1,10 @@ +// +build !darwin + +package blake3 + +import "github.com/klauspost/cpuid" + +var ( + haveAVX2 = cpuid.CPU.AVX2() + haveAVX512 = cpuid.CPU.AVX512F() +) diff --git a/cpu_darwin.go b/cpu_darwin.go new file mode 100644 index 0000000..e2c48d6 --- /dev/null +++ b/cpu_darwin.go @@ -0,0 +1,22 @@ +package blake3 + +import ( + "syscall" + + "github.com/klauspost/cpuid" +) + +var ( + haveAVX2 bool + haveAVX512 bool +) + +func init() { + haveAVX2 = cpuid.CPU.AVX2() + haveAVX512 = cpuid.CPU.AVX512F() + if !haveAVX512 { + // On some Macs, AVX512 detection is buggy, so fallback to sysctl + b, _ := syscall.Sysctl("hw.optional.avx512f") + haveAVX512 = b[0] == 1 + } +} diff --git a/go.mod b/go.mod index 46beb99..b793307 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,6 @@ module lukechampine.com/blake3 go 1.13 -require golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 +require ( + github.com/klauspost/cpuid v1.3.1 +) diff --git a/go.sum b/go.sum index 4ad15a4..f568b65 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,2 @@ -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0= -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s= +github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4= diff --git a/testdata/vectors.json b/testdata/vectors.json index 2a70bf1..115d242 100644 --- a/testdata/vectors.json +++ b/testdata/vectors.json @@ -126,6 +126,12 @@ "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" + }, + { + "input_len": 100000, + "hash": "d93c23eedaf165a7e0be908ba86f1a7a520d568d2d13cde787c8580c5c72cc54902b765d0e69ff7f278ef2f8bb839b673f0db20afa0566c78965ad819674822fd11a507251555fc6daec7437074bc7b7307dfe122411b3676a932b5b0360d5ad495f8e7431d3d025fac5b4e955ce893a3504f2569f838eea47cf1bb21c4ae659db522f", + "keyed_hash": "74c836d008247adebbc032d1bced2e71d19050b5c39fa03c43d4160ad8d170732f3b73e374a4500825c13d2c8c9384ce12c033adc49245ce42f50d5b48237397b8447bd414b0693bef98518db8a3494e6e8e3abc931f92f472d938f07eac97d1cc69b375426bce26c5e829b5b41cacbb5543544977749d503fa78309e7a158640e579c", + "derive_key": "039c0c0d76eacefea9c8d042698bd012d3cef4091ed5c5a7e32a30e4d51718930a99481bb11214d9e9e79e58d11875a789447731a887aa77499843148d35b1752c6314af6d36559341bd6895c5ee0a452c99cb47a9b22dfe36042932fc9a423d245b91b6246c85e4b0d415cbece3e0545d6e242853da7f3dd1f9b0f146ec72706b8c28" } ] }
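The pairwise subtree merging introduced above (in `mergeSubtreesGeneric`, and in the AVX2 `mergeSubtrees` that feeds eight parent nodes at a time to `compressParentsAVX2`) is easiest to see on a toy model. The sketch below is shape-only and not part of the package: strings stand in for 8-word chaining values, and the local `parent` helper stands in for `chainingValue(parentNode(l, r, key, flags))`.

```go
package main

import "fmt"

// mergeShape mimics the reduction loop in mergeSubtreesGeneric, but on strings,
// so the resulting tree shape is visible.
func mergeShape(cvs []string) string {
	parent := func(l, r string) string { return "(" + l + " " + r + ")" }
	numCVs := len(cvs)
	for numCVs > 2 {
		rem := numCVs / 2
		for i := 0; i < rem; i++ {
			cvs[i] = parent(cvs[i*2], cvs[i*2+1])
		}
		if numCVs%2 != 0 {
			// A leftover CV has no partner yet; carry it up unmerged.
			cvs[rem] = cvs[numCVs-1]
			rem++
		}
		numCVs = rem
	}
	return parent(cvs[0], cvs[1])
}

func main() {
	// Five chunk CVs: the fifth is merged only at the root.
	fmt.Println(mergeShape([]string{"c0", "c1", "c2", "c3", "c4"}))
	// Prints: (((c0 c1) (c2 c3)) c4)
}
```

The odd-count handling is the same trick the AVX2 path preserves with its `keep` variable: a chaining value without a partner is carried up a level unmerged until one appears.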