From c2af4bc4c227f6f34b648004972c99676333dc9e Mon Sep 17 00:00:00 2001 From: lukechampine Date: Thu, 30 Jul 2020 13:54:11 -0400 Subject: [PATCH] add AVX2 implementation --- README.md | 31 +- avo/gen.go | 239 ++++++++ blake3.go | 360 ++---------- blake3_amd64.s | 1311 +++++++++++++++++++++++++++++++++++++++++++ blake3_test.go | 25 +- compress_amd64.go | 76 +++ compress_generic.go | 150 +++++ compress_noasm.go | 64 +++ go.mod | 2 + go.sum | 2 + 10 files changed, 1945 insertions(+), 315 deletions(-) create mode 100644 avo/gen.go create mode 100644 blake3_amd64.s create mode 100644 compress_amd64.go create mode 100644 compress_generic.go create mode 100644 compress_noasm.go create mode 100644 go.sum diff --git a/README.md b/README.md index f426db0..c5147ae 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,29 @@ go get lukechampine.com/blake3 ``` `blake3` implements the [BLAKE3 cryptographic hash function](https://github.com/BLAKE3-team/BLAKE3). +This implementation aims to be performant without sacrificing (too much) +readability, in the hopes of eventually landing in `x/crypto`. -This implementation is a port of the Rust reference implementation, refactored -into more idiomatic Go style and with a handful of performance tweaks. -Performance is not great, not terrible. Eventually an assembly-optimized -implementation will be merged into `x/crypto`, and then you should switch to -that. In the meantime, you can use this package for code that needs BLAKE3 -compatibility and doesn't need to be blazing fast. +The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s. +There is a separate code path for small inputs (up to 64 bytes) that runs in +~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by +an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 chunks in parallel, +achieving throughput of ~2600 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it +will be possible to compress 16 chunks in parallel, which should roughly double +throughput for sufficiently large inputs. + +Contributions are greatly appreciated. +[All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134) + + +## Benchmarks + +Tested on an i5-7600K @ 3.80GHz. + +``` +BenchmarkSum256/64 105 ns/op 609.51 MB/s +BenchmarkSum256/1024 1778 ns/op 576.00 MB/s +BenchmarkSum256/65536 24785 ns/op 2644.15 MB/s +BenchmarkWrite 389 ns/op 2631.78 MB/s +BenchmarkXOF 1591 ns/op 643.80 MB/s +``` diff --git a/avo/gen.go b/avo/gen.go new file mode 100644 index 0000000..482f6c7 --- /dev/null +++ b/avo/gen.go @@ -0,0 +1,239 @@ +// +build ignore + +package main + +import ( + "fmt" + + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . 
"github.com/mmcloughlin/avo/reg" +) + +func main() { + genGlobals() + genCompressChunksAVX2() + + Generate() +} + +var globals struct { + iv Mem + blockLen Mem + stride1024 Mem + incrementCounter Mem + setFlags Mem + shuffleRot8 Mem + shuffleRot16 Mem +} + +func genGlobals() { + globals.iv = GLOBL("iv", RODATA|NOPTR) + DATA(0*4, U32(0x6A09E667)) + DATA(1*4, U32(0xBB67AE85)) + DATA(2*4, U32(0x3C6EF372)) + DATA(3*4, U32(0xA54FF53A)) + + globals.blockLen = GLOBL("block_len", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(64)) + } + + globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(i*1024)) + } + globals.incrementCounter = GLOBL("increment_counter", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*8, U64(i)) + } + globals.setFlags = GLOBL("set_flags", RODATA|NOPTR) + for i := 0; i < 16; i++ { + if i == 0 { + DATA(i*4, U32(1)) + } else if i == 15 { + DATA(i*4, U32(2)) + } else { + DATA(i*4, U32(0)) + } + } + globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(0x00030201+0x04040404*i)) + } + globals.shuffleRot16 = GLOBL("shuffle_rot16", RODATA|NOPTR) + for i := 0; i < 8; i++ { + DATA(i*4, U32(0x01000302+0x04040404*i)) + } +} + +func genCompressChunksAVX2() { + TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)") + cvs := Mem{Base: Load(Param("cvs"), GP64())} + buf := Mem{Base: Load(Param("buf"), GP64())} + key := Mem{Base: Load(Param("key"), GP64())} + counter, _ := Param("counter").Resolve() + flags, _ := Param("flags").Resolve() + + vs := [16]VecVirtual{ + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + YMM(), YMM(), YMM(), YMM(), + } + // stack space for transposed message vectors + var mv [16]Mem + for i := range mv { + mv[i] = AllocLocal(32) + } + // stack space for spilled vs[8] register + spillMem := AllocLocal(32) + + Comment("Load key") + for i := 0; i < 8; i++ { + VPBROADCASTD(key.Offset(i*4), vs[i]) + } + + Comment("Initialize counter") + counterLo := AllocLocal(32) + counterHi := AllocLocal(32) + VPBROADCASTQ(counter.Addr, vs[12]) + VPBROADCASTQ(counter.Addr, vs[13]) + VPADDQ(globals.incrementCounter.Offset(0*32), vs[12], vs[12]) + VPADDQ(globals.incrementCounter.Offset(1*32), vs[13], vs[13]) + VPUNPCKLDQ(vs[13], vs[12], vs[14]) + VPUNPCKHDQ(vs[13], vs[12], vs[15]) + VPUNPCKLDQ(vs[15], vs[14], vs[12]) + VPUNPCKHDQ(vs[15], vs[14], vs[13]) + VPERMQ(Imm(0xd8), vs[12], vs[12]) + VPERMQ(Imm(0xd8), vs[13], vs[13]) + VMOVDQU(vs[12], counterLo) + VMOVDQU(vs[13], counterHi) + + Comment("Initialize flags") + chunkFlags := AllocLocal(16 * 4) + VPBROADCASTD(flags.Addr, vs[14]) + VPOR(globals.setFlags.Offset(0*32), vs[14], vs[15]) + VMOVDQU(vs[15], chunkFlags.Offset(0*32)) + VPOR(globals.setFlags.Offset(1*32), vs[14], vs[15]) + VMOVDQU(vs[15], chunkFlags.Offset(1*32)) + + Comment("Loop index") + loop := GP64() + XORQ(loop, loop) + Label("loop") + + Comment("Load transposed block") + VMOVDQU(globals.stride1024, vs[9]) + for i := 0; i < 16; i++ { + VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 + VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10]) + VMOVDQU(vs[10], mv[i]) + } + ADDQ(Imm(64), buf.Base) + + Comment("Reload state vectors (other than CVs)") + for i := 0; i < 4; i++ { + VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i]) + } + VMOVDQU(counterLo, vs[12]) + VMOVDQU(counterHi, vs[13]) + VMOVDQU(globals.blockLen, vs[14]) + 
VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) + + VMOVDQU(vs[8], spillMem) // spill + for i := 0; i < 7; i++ { + Comment(fmt.Sprintf("Round %v", i+1)) + round(vs, mv, vs[8], spillMem) + // permute + mv = [16]Mem{ + mv[2], mv[6], mv[3], mv[10], + mv[7], mv[0], mv[4], mv[13], + mv[1], mv[11], mv[12], mv[5], + mv[9], mv[14], mv[15], mv[8], + } + } + + Comment("Finalize CVs") + VMOVDQU(spillMem, vs[8]) // reload + for i := range vs[:8] { + VPXOR(vs[i], vs[i+8], vs[i]) + } + + Comment("Loop") + INCQ(loop) + CMPQ(loop, U32(16)) + JNE(LabelRef("loop")) + + Comment("Finished; transpose CVs") + src, dst := vs[:8], vs[8:] + // interleave uint32s + for i := 0; i < 8; i += 2 { + VPUNPCKLDQ(src[i+1], src[i], dst[i+0]) + VPUNPCKHDQ(src[i+1], src[i], dst[i+1]) + } + // interleave groups of two uint32s + for i := 0; i < 4; i++ { + j := i*2 - i%2 // j := 0,1,4,5 + VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0]) + VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1]) + } + // interleave groups of four uint32s + for i := 0; i < 4; i++ { + VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0]) + VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) + } + for i, v := range dst { + VMOVDQU(v, cvs.Offset(i*32)) + } + + RET() +} + +func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) { + g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem) + g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem) + g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem) + g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7], tmp, spillMem) + g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9], tmp, spillMem) + g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem) + g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem) + g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem) +} + +func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) { + // Helper function for performing rotations. Also manages c, tmp and + // spillMem: if c == tmp, we need to spill and reload c using spillMem. 
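+	// Rotations by 8 and 16 bits are byte-aligned, so a single VPSHUFB with
+	// the appropriate byte-shuffle mask suffices; rotations by 7 and 12 bits
+	// fall back to the usual shift/shift/or sequence, which needs tmp as
+	// scratch space.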
+ rotr := func(v VecVirtual, n uint64, dst VecVirtual) { + switch n { + case 8, 16: + shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n] + VPSHUFB(shuf, v, dst) + if c == tmp { + VMOVDQU(spillMem, c) + } + case 7, 12: + if c == tmp { + VMOVDQU(c, spillMem) + } + VPSRLD(Imm(n), v, tmp) + VPSLLD(Imm(32-n), v, dst) + VPOR(dst, tmp, dst) + } + } + + VPADDD(a, b, a) + VPADDD(mx, a, a) + VPXOR(d, a, d) + rotr(d, 16, d) + VPADDD(c, d, c) + VPXOR(b, c, b) + rotr(b, 12, b) + VPADDD(a, b, a) + VPADDD(my, a, a) + VPXOR(d, a, d) + rotr(d, 8, d) + VPADDD(c, d, c) + VPXOR(b, c, b) + rotr(b, 7, b) +} diff --git a/blake3.go b/blake3.go index c6609fb..d06112f 100644 --- a/blake3.go +++ b/blake3.go @@ -10,12 +10,6 @@ import ( "math/bits" ) -const ( - blockSize = 64 - chunkSize = 1024 -) - -// flags const ( flagChunkStart = 1 << iota flagChunkEnd @@ -24,6 +18,9 @@ const ( flagKeyedHash flagDeriveKeyContext flagDeriveKeyMaterial + + blockSize = 64 + chunkSize = 1024 ) var iv = [8]uint32{ @@ -31,332 +28,82 @@ var iv = [8]uint32{ 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, } -// helper functions for converting between bytes and BLAKE3 "words" - -func bytesToWords(bytes [64]byte, words *[16]uint32) { - words[0] = binary.LittleEndian.Uint32(bytes[0:]) - words[1] = binary.LittleEndian.Uint32(bytes[4:]) - words[2] = binary.LittleEndian.Uint32(bytes[8:]) - words[3] = binary.LittleEndian.Uint32(bytes[12:]) - words[4] = binary.LittleEndian.Uint32(bytes[16:]) - words[5] = binary.LittleEndian.Uint32(bytes[20:]) - words[6] = binary.LittleEndian.Uint32(bytes[24:]) - words[7] = binary.LittleEndian.Uint32(bytes[28:]) - words[8] = binary.LittleEndian.Uint32(bytes[32:]) - words[9] = binary.LittleEndian.Uint32(bytes[36:]) - words[10] = binary.LittleEndian.Uint32(bytes[40:]) - words[11] = binary.LittleEndian.Uint32(bytes[44:]) - words[12] = binary.LittleEndian.Uint32(bytes[48:]) - words[13] = binary.LittleEndian.Uint32(bytes[52:]) - words[14] = binary.LittleEndian.Uint32(bytes[56:]) - words[15] = binary.LittleEndian.Uint32(bytes[60:]) -} - -func wordsToBytes(words [16]uint32, block *[64]byte) { - binary.LittleEndian.PutUint32(block[0:], words[0]) - binary.LittleEndian.PutUint32(block[4:], words[1]) - binary.LittleEndian.PutUint32(block[8:], words[2]) - binary.LittleEndian.PutUint32(block[12:], words[3]) - binary.LittleEndian.PutUint32(block[16:], words[4]) - binary.LittleEndian.PutUint32(block[20:], words[5]) - binary.LittleEndian.PutUint32(block[24:], words[6]) - binary.LittleEndian.PutUint32(block[28:], words[7]) - binary.LittleEndian.PutUint32(block[32:], words[8]) - binary.LittleEndian.PutUint32(block[36:], words[9]) - binary.LittleEndian.PutUint32(block[40:], words[10]) - binary.LittleEndian.PutUint32(block[44:], words[11]) - binary.LittleEndian.PutUint32(block[48:], words[12]) - binary.LittleEndian.PutUint32(block[52:], words[13]) - binary.LittleEndian.PutUint32(block[56:], words[14]) - binary.LittleEndian.PutUint32(block[60:], words[15]) -} - -func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { - a += b + mx - d = bits.RotateLeft32(d^a, -16) - c += d - b = bits.RotateLeft32(b^c, -12) - a += b + my - d = bits.RotateLeft32(d^a, -8) - c += d - b = bits.RotateLeft32(b^c, -7) - return a, b, c, d -} - -// A node represents a chunk or parent in the BLAKE3 Merkle tree. In BLAKE3 -// terminology, the elements of the bottom layer (aka "leaves") of the tree are -// called chunk nodes, and the elements of upper layers (aka "interior nodes") -// are called parent nodes. 
-// -// Computing a BLAKE3 hash involves splitting the input into chunk nodes, then -// repeatedly merging these nodes into parent nodes, until only a single "root" -// node remains. The root node can then be used to generate up to 2^64 - 1 bytes -// of pseudorandom output. +// A node represents a chunk or parent in the BLAKE3 Merkle tree. type node struct { - // the chaining value from the previous state - cv [8]uint32 - // the current state + cv [8]uint32 // chaining value from previous node block [16]uint32 counter uint64 blockLen uint32 flags uint32 } -// compress is the core hash function, generating 16 pseudorandom words from a -// node. -func (n node) compress() [16]uint32 { - // NOTE: we unroll all of the rounds, as well as the permutations that occur - // between rounds. - - // round 1 (also initializes state) - // columns - s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) - s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) - s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) - s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) - // diagonals - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15]) - - // round 2 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8]) - - // round 3 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1]) - - // round 4 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6]) - - // round 5 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1]) - s3, s4, 
s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4]) - - // round 6 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7]) - - // round 7 - s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15]) - s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0]) - s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9]) - s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6]) - s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10]) - s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12]) - s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4]) - s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13]) - - // finalization - return [16]uint32{ - s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11, - s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15, - s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3], - s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7], - } -} - -// chainingValue returns the first 8 words of the compressed node. This is used -// in two places. First, when a chunk node is being constructed, its cv is -// overwritten with this value after each block of input is processed. Second, -// when two nodes are merged into a parent, each of their chaining values -// supplies half of the new node's block. -func (n node) chainingValue() (cv [8]uint32) { - full := n.compress() - copy(cv[:], full[:8]) - return -} - -// chunkState manages the state involved in hashing a single chunk of input. -type chunkState struct { - n node - block [blockSize]byte - blockLen int - bytesConsumed int -} - -// chunkCounter is the index of this chunk, i.e. the number of chunks that have -// been processed prior to this one. -func (cs *chunkState) chunkCounter() uint64 { - return cs.n.counter -} - -func (cs *chunkState) complete() bool { - return cs.bytesConsumed == chunkSize -} - -// update incorporates input into the chunkState. -func (cs *chunkState) update(input []byte) { - for len(input) > 0 { - // If the block buffer is full, compress it and clear it. More - // input is coming, so this compression is not flagChunkEnd. - if cs.blockLen == blockSize { - // copy the chunk block (bytes) into the node block and chain it. - bytesToWords(cs.block, &cs.n.block) - cs.n.cv = cs.n.chainingValue() - // clear the start flag for all but the first block - cs.n.flags &^= flagChunkStart - cs.blockLen = 0 - } - - // Copy input bytes into the chunk block. - n := copy(cs.block[cs.blockLen:], input) - cs.blockLen += n - cs.bytesConsumed += n - input = input[n:] - } -} - -// compiles to memclr -func clear(b []byte) { - for i := range b { - b[i] = 0 - } -} - -// node returns a node containing the chunkState's current state, with the -// ChunkEnd flag set. 
-func (cs *chunkState) node() node { - n := cs.n - // pad the remaining space in the block with zeros - clear(cs.block[cs.blockLen:]) - bytesToWords(cs.block, &n.block) - n.blockLen = uint32(cs.blockLen) - n.flags |= flagChunkEnd - return n -} - -func newChunkState(iv [8]uint32, chunkCounter uint64, flags uint32) chunkState { - return chunkState{ - n: node{ - cv: iv, - counter: chunkCounter, - blockLen: blockSize, - // compress the first block with the start flag set - flags: flags | flagChunkStart, - }, - } -} - // parentNode returns a node that incorporates the chaining values of two child // nodes. func parentNode(left, right [8]uint32, key [8]uint32, flags uint32) node { - var blockWords [16]uint32 - copy(blockWords[:8], left[:]) - copy(blockWords[8:], right[:]) - return node{ + n := node{ cv: key, - block: blockWords, counter: 0, // counter is reset for parents - blockLen: blockSize, // block is full: 8 words from left, 8 from right + blockLen: blockSize, // block is full flags: flags | flagParent, } + copy(n.block[:8], left[:]) + copy(n.block[8:], right[:]) + return n } // Hasher implements hash.Hash. type Hasher struct { - cs chunkState key [8]uint32 flags uint32 size int // output size, for Sum // log(n) set of Merkle subtree roots, at most one per height. - stack [54][8]uint32 // 2^54 * chunkSize = 2^64 - used uint64 // bit vector indicating which stack elems are valid; also number of chunks added + stack [51][8]uint32 // 2^51 * 8 * chunkSize = 2^64 + counter uint64 // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied + + buf [8 * chunkSize]byte + buflen int } func (h *Hasher) hasSubtreeAtHeight(i int) bool { - return h.used&(1< 0 { - // If the current chunk is complete, finalize it and add it to the tree, - // then reset the chunk state (but keep incrementing the counter across - // chunks). - if h.cs.complete() { - cv := h.cs.node().chainingValue() - h.addChunkChainingValue(cv) - h.cs = newChunkState(h.key, h.cs.chunkCounter()+1, h.flags) + if h.buflen == len(h.buf) { + n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags) + h.pushSubtree(chainingValue(n)) + h.buflen = 0 } - - // Compress input bytes into the current chunk state. - n := chunkSize - h.cs.bytesConsumed - if n > len(p) { - n = len(p) - } - h.cs.update(p[:n]) + n := copy(h.buf[h.buflen:], p) + h.buflen += n p = p[n:] } return lenp, nil @@ -377,6 +124,18 @@ func (h *Hasher) Sum(b []byte) (sum []byte) { return } +// Reset implements hash.Hash. +func (h *Hasher) Reset() { + h.counter = 0 + h.buflen = 0 +} + +// BlockSize implements hash.Hash. +func (h *Hasher) BlockSize() int { return 64 } + +// Size implements hash.Hash. +func (h *Hasher) Size() int { return h.size } + // XOF returns an OutputReader initialized with the current hash state. func (h *Hasher) XOF() *OutputReader { return &OutputReader{ @@ -386,7 +145,6 @@ func (h *Hasher) XOF() *OutputReader { func newHasher(key [8]uint32, flags uint32, size int) *Hasher { return &Hasher{ - cs: newChunkState(key, 0, flags), key: key, flags: flags, size: size, @@ -394,7 +152,7 @@ func newHasher(key [8]uint32, flags uint32, size int) *Hasher { } // New returns a Hasher for the specified size and key. If key is nil, the hash -// is unkeyed. +// is unkeyed. Otherwise, len(key) must be 32. 
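+// For example, New(32, nil) returns an unkeyed Hasher whose Sum output
+// matches Sum256.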
func New(size int, key []byte) *Hasher { if key == nil { return newHasher(iv, 0, size) @@ -408,21 +166,30 @@ func New(size int, key []byte) *Hasher { // Sum256 and Sum512 always use the same hasher state, so we can save some time // when hashing small inputs by constructing the hasher ahead of time. -var defaultHasher = newHasher(iv, 0, 0) +var defaultHasher = New(0, nil) // Sum256 returns the unkeyed BLAKE3 hash of b, truncated to 256 bits. func Sum256(b []byte) (out [32]byte) { - h := *defaultHasher - h.Write(b) - h.XOF().Read(out[:]) + out512 := Sum512(b) + copy(out[:], out512[:]) return } // Sum512 returns the unkeyed BLAKE3 hash of b, truncated to 512 bits. func Sum512(b []byte) (out [64]byte) { - h := *defaultHasher - h.Write(b) - h.XOF().Read(out[:]) + var n node + if len(b) <= blockSize { + hashBlock(&out, b) + return + } else if len(b) <= chunkSize { + n = compressChunk(b, &iv, 0, 0) + n.flags |= flagRoot + } else { + h := *defaultHasher + h.Write(b) + n = h.rootNode() + } + wordsToBytes(compressNode(n), &out) return } @@ -473,10 +240,8 @@ func (or *OutputReader) Read(p []byte) (int, error) { for len(p) > 0 { if or.off%blockSize == 0 { or.n.counter = or.off / blockSize - words := or.n.compress() - wordsToBytes(words, &or.block) + wordsToBytes(compressNode(or.n), &or.block) } - n := copy(p, or.block[or.off%blockSize:]) p = p[n:] or.off += uint64(n) @@ -510,8 +275,7 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) { or.off = off or.n.counter = uint64(off) / blockSize if or.off%blockSize != 0 { - words := or.n.compress() - wordsToBytes(words, &or.block) + wordsToBytes(compressNode(or.n), &or.block) } // NOTE: or.off >= 2^63 will result in a negative return value. // Nothing we can do about this. diff --git a/blake3_amd64.s b/blake3_amd64.s new file mode 100644 index 0000000..f467663 --- /dev/null +++ b/blake3_amd64.s @@ -0,0 +1,1311 @@ +// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT. 
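+
+// Register and stack layout (established in avo/gen.go):
+//   Y0-Y7        transposed chaining values for the 8 chunks
+//   Y8-Y15       remaining compression state (Y8 is spilled to 512(SP))
+//   0-480(SP)    transposed message vectors for the current block
+//   544/576(SP)  low/high dwords of the per-chunk counters
+//   608(SP)      flags for each of the 16 blocks in a chunk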
+ +#include "textflag.h" + +DATA iv<>+0(SB)/4, $0x6a09e667 +DATA iv<>+4(SB)/4, $0xbb67ae85 +DATA iv<>+8(SB)/4, $0x3c6ef372 +DATA iv<>+12(SB)/4, $0xa54ff53a +GLOBL iv<>(SB), RODATA|NOPTR, $16 + +DATA block_len<>+0(SB)/4, $0x00000040 +DATA block_len<>+4(SB)/4, $0x00000040 +DATA block_len<>+8(SB)/4, $0x00000040 +DATA block_len<>+12(SB)/4, $0x00000040 +DATA block_len<>+16(SB)/4, $0x00000040 +DATA block_len<>+20(SB)/4, $0x00000040 +DATA block_len<>+24(SB)/4, $0x00000040 +DATA block_len<>+28(SB)/4, $0x00000040 +GLOBL block_len<>(SB), RODATA|NOPTR, $32 + +DATA stride_1024<>+0(SB)/4, $0x00000000 +DATA stride_1024<>+4(SB)/4, $0x00000400 +DATA stride_1024<>+8(SB)/4, $0x00000800 +DATA stride_1024<>+12(SB)/4, $0x00000c00 +DATA stride_1024<>+16(SB)/4, $0x00001000 +DATA stride_1024<>+20(SB)/4, $0x00001400 +DATA stride_1024<>+24(SB)/4, $0x00001800 +DATA stride_1024<>+28(SB)/4, $0x00001c00 +GLOBL stride_1024<>(SB), RODATA|NOPTR, $32 + +DATA increment_counter<>+0(SB)/8, $0x0000000000000000 +DATA increment_counter<>+8(SB)/8, $0x0000000000000001 +DATA increment_counter<>+16(SB)/8, $0x0000000000000002 +DATA increment_counter<>+24(SB)/8, $0x0000000000000003 +DATA increment_counter<>+32(SB)/8, $0x0000000000000004 +DATA increment_counter<>+40(SB)/8, $0x0000000000000005 +DATA increment_counter<>+48(SB)/8, $0x0000000000000006 +DATA increment_counter<>+56(SB)/8, $0x0000000000000007 +GLOBL increment_counter<>(SB), RODATA|NOPTR, $64 + +DATA set_flags<>+0(SB)/4, $0x00000001 +DATA set_flags<>+4(SB)/4, $0x00000000 +DATA set_flags<>+8(SB)/4, $0x00000000 +DATA set_flags<>+12(SB)/4, $0x00000000 +DATA set_flags<>+16(SB)/4, $0x00000000 +DATA set_flags<>+20(SB)/4, $0x00000000 +DATA set_flags<>+24(SB)/4, $0x00000000 +DATA set_flags<>+28(SB)/4, $0x00000000 +DATA set_flags<>+32(SB)/4, $0x00000000 +DATA set_flags<>+36(SB)/4, $0x00000000 +DATA set_flags<>+40(SB)/4, $0x00000000 +DATA set_flags<>+44(SB)/4, $0x00000000 +DATA set_flags<>+48(SB)/4, $0x00000000 +DATA set_flags<>+52(SB)/4, $0x00000000 +DATA set_flags<>+56(SB)/4, $0x00000000 +DATA set_flags<>+60(SB)/4, $0x00000002 +GLOBL set_flags<>(SB), RODATA|NOPTR, $64 + +DATA shuffle_rot8<>+0(SB)/4, $0x00030201 +DATA shuffle_rot8<>+4(SB)/4, $0x04070605 +DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09 +DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d +DATA shuffle_rot8<>+16(SB)/4, $0x10131211 +DATA shuffle_rot8<>+20(SB)/4, $0x14171615 +DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19 +DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d +GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32 + +DATA shuffle_rot16<>+0(SB)/4, $0x01000302 +DATA shuffle_rot16<>+4(SB)/4, $0x05040706 +DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a +DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e +DATA shuffle_rot16<>+16(SB)/4, $0x11101312 +DATA shuffle_rot16<>+20(SB)/4, $0x15141716 +DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a +DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e +GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32 + +// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) +// Requires: AVX, AVX2 +TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40 + MOVQ cvs+0(FP), AX + MOVQ buf+8(FP), CX + MOVQ key+16(FP), DX + + // Load key + VPBROADCASTD (DX), Y0 + VPBROADCASTD 4(DX), Y1 + VPBROADCASTD 8(DX), Y2 + VPBROADCASTD 12(DX), Y3 + VPBROADCASTD 16(DX), Y4 + VPBROADCASTD 20(DX), Y5 + VPBROADCASTD 24(DX), Y6 + VPBROADCASTD 28(DX), Y7 + + // Initialize counter + VPBROADCASTQ counter+24(FP), Y12 + VPBROADCASTQ counter+24(FP), Y13 + VPADDQ increment_counter<>+0(SB), Y12, Y12 + VPADDQ increment_counter<>+32(SB), Y13, Y13 + 
VPUNPCKLDQ Y13, Y12, Y14 + VPUNPCKHDQ Y13, Y12, Y15 + VPUNPCKLDQ Y15, Y14, Y12 + VPUNPCKHDQ Y15, Y14, Y13 + VPERMQ $0xd8, Y12, Y12 + VPERMQ $0xd8, Y13, Y13 + VMOVDQU Y12, 544(SP) + VMOVDQU Y13, 576(SP) + + // Initialize flags + VPBROADCASTD flags+32(FP), Y14 + VPOR set_flags<>+0(SB), Y14, Y15 + VMOVDQU Y15, 608(SP) + VPOR set_flags<>+32(SB), Y14, Y15 + VMOVDQU Y15, 640(SP) + + // Loop index + XORQ DX, DX + +loop: + // Load transposed block + VMOVDQU stride_1024<>+0(SB), Y9 + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, (CX)(Y9*1), Y10 + VMOVDQU Y10, (SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 4(CX)(Y9*1), Y10 + VMOVDQU Y10, 32(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 8(CX)(Y9*1), Y10 + VMOVDQU Y10, 64(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 12(CX)(Y9*1), Y10 + VMOVDQU Y10, 96(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 16(CX)(Y9*1), Y10 + VMOVDQU Y10, 128(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 20(CX)(Y9*1), Y10 + VMOVDQU Y10, 160(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 24(CX)(Y9*1), Y10 + VMOVDQU Y10, 192(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 28(CX)(Y9*1), Y10 + VMOVDQU Y10, 224(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 32(CX)(Y9*1), Y10 + VMOVDQU Y10, 256(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 36(CX)(Y9*1), Y10 + VMOVDQU Y10, 288(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 40(CX)(Y9*1), Y10 + VMOVDQU Y10, 320(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 44(CX)(Y9*1), Y10 + VMOVDQU Y10, 352(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 48(CX)(Y9*1), Y10 + VMOVDQU Y10, 384(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 52(CX)(Y9*1), Y10 + VMOVDQU Y10, 416(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 56(CX)(Y9*1), Y10 + VMOVDQU Y10, 448(SP) + VPCMPEQD Y8, Y8, Y8 + VPGATHERDD Y8, 60(CX)(Y9*1), Y10 + VMOVDQU Y10, 480(SP) + ADDQ $0x40, CX + + // Reload state vectors (other than CVs) + VPBROADCASTD iv<>+0(SB), Y8 + VPBROADCASTD iv<>+4(SB), Y9 + VPBROADCASTD iv<>+8(SB), Y10 + VPBROADCASTD iv<>+12(SB), Y11 + VMOVDQU 544(SP), Y12 + VMOVDQU 576(SP), Y13 + VMOVDQU block_len<>+0(SB), Y14 + VPBROADCASTD 608(SP)(DX*4), Y15 + VMOVDQU Y8, 512(SP) + + // Round 1 + VPADDD Y0, Y4, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + 
VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 256(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 2 + VPADDD Y0, Y4, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 224(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + 
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 32(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 3 + VPADDD Y0, Y4, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 416(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + 
VPADDD 224(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 192(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 160(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 352(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 480(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 4 + VPADDD Y0, Y4, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 448(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 
+ VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 128(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD (SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 160(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 5 + VPADDD Y0, Y4, Y0 + VPADDD 384(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 288(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 480(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 320(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + 
VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 448(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 224(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 64(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 96(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD (SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 6 + VPADDD Y0, Y4, Y0 + VPADDD 288(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 352(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 256(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 384(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + 
VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 480(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 32(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 416(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 96(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 320(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 64(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 192(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 128(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Round 7 + VPADDD Y0, Y4, Y0 + VPADDD 352(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y0, Y4, Y0 + VPADDD 480(SP), Y0, Y0 + VPXOR Y12, Y0, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y12, Y8 + VPXOR Y4, Y8, Y4 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y1, Y5, Y1 + VPADDD 160(SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y5, Y1 + VPADDD (SP), Y1, Y1 + VPXOR Y13, Y1, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VPADDD Y9, Y13, Y9 + VPXOR Y5, Y9, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y2, Y6, Y2 + VPADDD 32(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y6, Y2 + VPADDD 288(SP), Y2, Y2 + VPXOR Y14, Y2, Y14 + 
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y10, Y14, Y10 + VPXOR Y6, Y10, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y3, Y7, Y3 + VPADDD 256(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y7, Y3 + VPADDD 192(SP), Y3, Y3 + VPXOR Y15, Y3, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y11, Y15, Y11 + VPXOR Y7, Y11, Y7 + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y0, Y5, Y0 + VPADDD 448(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x0c, Y5, Y8 + VPSLLD $0x14, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y0, Y5, Y0 + VPADDD 320(SP), Y0, Y0 + VPXOR Y15, Y0, Y15 + VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 + VPADDD Y10, Y15, Y10 + VPXOR Y5, Y10, Y5 + VPSRLD $0x07, Y5, Y8 + VPSLLD $0x19, Y5, Y5 + VPOR Y5, Y8, Y5 + VPADDD Y1, Y6, Y1 + VPADDD 64(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x0c, Y6, Y8 + VPSLLD $0x14, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y1, Y6, Y1 + VPADDD 384(SP), Y1, Y1 + VPXOR Y12, Y1, Y12 + VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 + VPADDD Y11, Y12, Y11 + VPXOR Y6, Y11, Y6 + VPSRLD $0x07, Y6, Y8 + VPSLLD $0x19, Y6, Y6 + VPOR Y6, Y8, Y6 + VPADDD Y2, Y7, Y2 + VPADDD 96(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x0c, Y7, Y8 + VPSLLD $0x14, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y2, Y7, Y2 + VPADDD 128(SP), Y2, Y2 + VPXOR Y13, Y2, Y13 + VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 + VMOVDQU 512(SP), Y8 + VPADDD Y8, Y13, Y8 + VPXOR Y7, Y8, Y7 + VMOVDQU Y8, 512(SP) + VPSRLD $0x07, Y7, Y8 + VPSLLD $0x19, Y7, Y7 + VPOR Y7, Y8, Y7 + VPADDD Y3, Y4, Y3 + VPADDD 224(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x0c, Y4, Y8 + VPSLLD $0x14, Y4, Y4 + VPOR Y4, Y8, Y4 + VPADDD Y3, Y4, Y3 + VPADDD 416(SP), Y3, Y3 + VPXOR Y14, Y3, Y14 + VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 + VPADDD Y9, Y14, Y9 + VPXOR Y4, Y9, Y4 + VPSRLD $0x07, Y4, Y8 + VPSLLD $0x19, Y4, Y4 + VPOR Y4, Y8, Y4 + + // Finalize CVs + VMOVDQU 512(SP), Y8 + VPXOR Y0, Y8, Y0 + VPXOR Y1, Y9, Y1 + VPXOR Y2, Y10, Y2 + VPXOR Y3, Y11, Y3 + VPXOR Y4, Y12, Y4 + VPXOR Y5, Y13, Y5 + VPXOR Y6, Y14, Y6 + VPXOR Y7, Y15, Y7 + + // Loop + INCQ DX + CMPQ DX, $0x00000010 + JNE loop + + // Finished; transpose CVs + VPUNPCKLDQ Y1, Y0, Y8 + VPUNPCKHDQ Y1, Y0, Y9 + VPUNPCKLDQ Y3, Y2, Y10 + VPUNPCKHDQ Y3, Y2, Y11 + VPUNPCKLDQ Y5, Y4, Y12 + VPUNPCKHDQ Y5, Y4, Y13 + VPUNPCKLDQ Y7, Y6, Y14 + VPUNPCKHDQ Y7, Y6, Y15 + VPUNPCKLQDQ Y10, Y8, Y0 + VPUNPCKHQDQ Y10, Y8, Y1 + VPUNPCKLQDQ Y11, Y9, Y2 + VPUNPCKHQDQ Y11, Y9, Y3 + VPUNPCKLQDQ Y14, Y12, Y4 + VPUNPCKHQDQ Y14, Y12, Y5 + VPUNPCKLQDQ Y15, Y13, Y6 + VPUNPCKHQDQ Y15, Y13, Y7 + VPERM2I128 $0x20, Y4, Y0, Y8 + VPERM2I128 $0x31, Y4, Y0, Y12 + VPERM2I128 $0x20, Y5, Y1, Y9 + VPERM2I128 $0x31, Y5, Y1, Y13 + VPERM2I128 $0x20, Y6, Y2, Y10 + VPERM2I128 $0x31, Y6, Y2, Y14 + VPERM2I128 $0x20, Y7, Y3, Y11 + VPERM2I128 $0x31, Y7, Y3, Y15 + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) + VMOVDQU Y10, 64(AX) + VMOVDQU Y11, 96(AX) + VMOVDQU Y12, 128(AX) + VMOVDQU Y13, 160(AX) + VMOVDQU Y14, 192(AX) + VMOVDQU Y15, 224(AX) + RET diff --git a/blake3_test.go b/blake3_test.go index 
fdf0ed7..7bfe6af 100644 --- a/blake3_test.go +++ b/blake3_test.go @@ -63,7 +63,7 @@ func TestVectors(t *testing.T) { subKey := make([]byte, len(vec.DeriveKey)/2) blake3.DeriveKey(subKey, ctx, in) if out := toHex(subKey); out != vec.DeriveKey { - t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], subKey[:10]) + t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], out[:10]) } } } @@ -150,7 +150,7 @@ func TestSum(t *testing.T) { h.Write(in) h.Sum(exp256[:0]) if got256 := blake3.Sum256(in); exp256 != got256 { - t.Errorf("Sum256 output did not match Sum output:\n\texpected: %v...\n\t got: %v...", exp256[:10], got256[:10]) + t.Errorf("Sum256 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp256[:5], got256[:5]) } var exp512 [64]byte @@ -158,7 +158,7 @@ func TestSum(t *testing.T) { h.Write(in) h.Sum(exp512[:0]) if got512 := blake3.Sum512(in); exp512 != got512 { - t.Errorf("Sum512 output did not match Sum output:\n\texpected: %v...\n\t got: %v...", exp512[:10], got512[:10]) + t.Errorf("Sum512 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp512[:5], got512[:5]) } } } @@ -190,13 +190,20 @@ func (nopReader) Read(p []byte) (int, error) { return len(p), nil } func BenchmarkWrite(b *testing.B) { b.ReportAllocs() - b.SetBytes(1) - io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N)) + b.SetBytes(1024) + io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N*1024)) +} + +func BenchmarkXOF(b *testing.B) { + b.ReportAllocs() + b.SetBytes(1024) + io.CopyN(ioutil.Discard, blake3.New(0, nil).XOF(), int64(b.N*1024)) } func BenchmarkSum256(b *testing.B) { b.Run("64", func(b *testing.B) { b.ReportAllocs() + b.SetBytes(64) buf := make([]byte, 64) for i := 0; i < b.N; i++ { blake3.Sum256(buf) @@ -204,6 +211,7 @@ func BenchmarkSum256(b *testing.B) { }) b.Run("1024", func(b *testing.B) { b.ReportAllocs() + b.SetBytes(1024) buf := make([]byte, 1024) for i := 0; i < b.N; i++ { blake3.Sum256(buf) @@ -211,15 +219,10 @@ func BenchmarkSum256(b *testing.B) { }) b.Run("65536", func(b *testing.B) { b.ReportAllocs() + b.SetBytes(65536) buf := make([]byte, 65536) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) } - -func BenchmarkXOF(b *testing.B) { - b.ReportAllocs() - b.SetBytes(1) - io.CopyN(ioutil.Discard, blake3.New(0, nil).XOF(), int64(b.N)) -} diff --git a/compress_amd64.go b/compress_amd64.go new file mode 100644 index 0000000..cfe414b --- /dev/null +++ b/compress_amd64.go @@ -0,0 +1,76 @@ +package blake3 + +import ( + "unsafe" + + "golang.org/x/sys/cpu" +) + +//go:generate go run avo/gen.go -out blake3_amd64.s + +//go:noescape +func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) + +func compressNode(n node) (out [16]uint32) { + compressNodeGeneric(&out, n) + return +} + +func compressBufferLarge(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { + var cvs [8][8]uint32 + compressChunksAVX2(&cvs, buf, key, counter, flags) + numChunks := uint64(buflen / chunkSize) + if buflen%chunkSize != 0 { + // use non-asm for remainder + partialChunk := buf[buflen-buflen%chunkSize : buflen] + cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags)) + numChunks++ + } + return mergeSubtrees(cvs[:numChunks], key, flags) +} + +func compressBuffer(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { + switch { + case cpu.X86.HasAVX2 && buflen >= 
chunkSize*2: + return compressBufferLarge(buf, buflen, key, counter, flags) + default: + return compressBufferGeneric(buf, buflen, key, counter, flags) + } +} + +func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node { + n := node{ + cv: *key, + counter: counter, + blockLen: blockSize, + flags: flags | flagChunkStart, + } + blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:] + for len(chunk) > blockSize { + copy(blockBytes, chunk) + chunk = chunk[blockSize:] + n.cv = chainingValue(n) + n.flags &^= flagChunkStart + } + // pad last block with zeros + n.block = [16]uint32{} + copy(blockBytes, chunk) + n.blockLen = uint32(len(chunk)) + n.flags |= flagChunkEnd + return n +} + +func wordsToBytes(words [16]uint32, block *[64]byte) { + *block = *(*[64]byte)(unsafe.Pointer(&words)) +} + +func hashBlock(out *[64]byte, buf []byte) { + var block [16]uint32 + copy((*[64]byte)(unsafe.Pointer(&block))[:], buf) + compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{ + cv: iv, + block: block, + blockLen: uint32(len(buf)), + flags: flagChunkStart | flagChunkEnd | flagRoot, + }) +} diff --git a/compress_generic.go b/compress_generic.go new file mode 100644 index 0000000..0b4dca6 --- /dev/null +++ b/compress_generic.go @@ -0,0 +1,150 @@ +package blake3 + +import ( + "bytes" + "math/bits" +) + +func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { + a += b + mx + d = bits.RotateLeft32(d^a, -16) + c += d + b = bits.RotateLeft32(b^c, -12) + a += b + my + d = bits.RotateLeft32(d^a, -8) + c += d + b = bits.RotateLeft32(b^c, -7) + return a, b, c, d +} + +func compressNodeGeneric(out *[16]uint32, n node) { + // NOTE: we unroll all of the rounds, as well as the permutations that occur + // between rounds. + + // round 1 (also initializes state) + // columns + s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) + s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) + s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) + s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) + // diagonals + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15]) + + // round 2 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8]) + + // round 3 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1]) + + // round 4 + s0, s4, s8, s12 = g(s0, 
s4, s8, s12, n.block[10], n.block[7]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6]) + + // round 5 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4]) + + // round 6 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7]) + + // round 7 + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13]) + + // finalization + *out = [16]uint32{ + s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11, + s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15, + s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3], + s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7], + } +} + +func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) { + if buflen <= chunkSize { + return compressChunk(buf[:buflen], key, counter, flags) + } + cvs := make([][8]uint32, 0, 8) + for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; { + n := compressChunk(bb.Next(chunkSize), key, counter, flags) + cvs = append(cvs, chainingValue(n)) + counter++ + } + return mergeSubtrees(cvs, key, flags) +} + +func chainingValue(n node) (cv [8]uint32) { + full := compressNode(n) + copy(cv[:], full[:]) + return +} + +func mergeSubtrees(cvs [][8]uint32, key *[8]uint32, flags uint32) node { + parent := func(l, r [8]uint32) [8]uint32 { + return chainingValue(parentNode(l, r, *key, flags)) + } + switch len(cvs) { + case 8: + cvs[6] = parent(cvs[6], cvs[7]) + fallthrough + case 7: + cvs[4], cvs[5] = parent(cvs[4], cvs[5]), cvs[6] + fallthrough + case 6: + cvs[4] = parent(cvs[4], cvs[5]) + fallthrough + case 5: + fallthrough + case 4: + cvs[2] = parent(cvs[2], cvs[3]) + fallthrough + case 3: + cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[2] + } + if len(cvs) > 4 { + cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[4] + } + return parentNode(cvs[0], cvs[1], 
*key, flags) +} diff --git a/compress_noasm.go b/compress_noasm.go new file mode 100644 index 0000000..847a519 --- /dev/null +++ b/compress_noasm.go @@ -0,0 +1,64 @@ +// +build !amd64 + +package blake3 + +import "encoding/binary" + +func compressNode(n node) (out [16]uint32) { + compressNodeGeneric(&out, n) + return +} + +func compressBuffer(buf *[8192]byte, length int, key *[8]uint32, counter uint64, flags uint32) node { + return compressBufferGeneric(buf, length, key, counter, flags) +} + +func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node { + n := node{ + cv: *key, + counter: counter, + blockLen: blockSize, + flags: flags | flagChunkStart, + } + var block [blockSize]byte + for len(chunk) > blockSize { + copy(block[:], chunk) + chunk = chunk[blockSize:] + bytesToWords(block, &n.block) + n.cv = chainingValue(n) + n.flags &^= flagChunkStart + } + // pad last block with zeros + block = [blockSize]byte{} + n.blockLen = uint32(len(chunk)) + copy(block[:], chunk) + bytesToWords(block, &n.block) + n.flags |= flagChunkEnd + return n +} + +func hashBlock(out *[64]byte, buf []byte) { + var block [64]byte + var words [16]uint32 + copy(block[:], buf) + bytesToWords(block, &words) + compressNodeGeneric(&words, node{ + cv: iv, + block: words, + blockLen: uint32(len(buf)), + flags: flagChunkStart | flagChunkEnd | flagRoot, + }) + wordsToBytes(words, out) +} + +func bytesToWords(bytes [64]byte, words *[16]uint32) { + for i := range words { + words[i] = binary.LittleEndian.Uint32(bytes[4*i:]) + } +} + +func wordsToBytes(words [16]uint32, block *[64]byte) { + for i, w := range words { + binary.LittleEndian.PutUint32(block[4*i:], w) + } +} diff --git a/go.mod b/go.mod index 51832fc..46beb99 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module lukechampine.com/blake3 go 1.13 + +require golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..4ad15a4 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
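
For reference, a small usage sketch (not part of the patch) that exercises the new code path. It uses only the API visible in the diffs above (`blake3.Sum256`, `blake3.New`, and the `XOF()` reader); per `compressBuffer` in compress_amd64.go, the assembly routine is selected only when the CPU reports AVX2 support and the input spans at least two chunks (2 KB).

```go
package main

import (
	"fmt"
	"io"

	"lukechampine.com/blake3"
)

func main() {
	// An 8 KB input fills the whole buffer that compressChunksAVX2
	// processes as 8 chunks in parallel; anything under 2 KB falls
	// back to compressBufferGeneric.
	buf := make([]byte, 8192)

	// One-shot hashing, as in BenchmarkSum256.
	sum := blake3.Sum256(buf)
	fmt.Printf("Sum256: %x\n", sum[:])

	// Streaming hash plus XOF output, mirroring BenchmarkWrite and BenchmarkXOF.
	h := blake3.New(0, nil)
	h.Write(buf)
	var xof [64]byte
	io.ReadFull(h.XOF(), xof[:])
	fmt.Printf("XOF:    %x\n", xof[:])
}
```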