add AVX2 optimization for XOF

2020-08-02 15:09:07 -04:00 · 2020-08-02 15:09:07 -04:00 · 221995220f
parent c2af4bc4c2
commit 221995220f
7 changed files with 1446 additions and 77 deletions
--- a/README.md
+++ b/README.md
@ -15,10 +15,11 @@ readability, in the hopes of eventually landing in `x/crypto`.
 The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s.
 There is a separate code path for small inputs (up to 64 bytes) that runs in
 ~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by
-an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 chunks in parallel,
-achieving throughput of ~2600 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it
-will be possible to compress 16 chunks in parallel, which should roughly double
-throughput for sufficiently large inputs.
+an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 nodes in parallel,
+achieving throughput of ~2600 MB/s. AVX2 is also used for BLAKE3's extendable output function,
+enabling it to stream pseudorandom bytes at ~3500 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it
+will be possible to compress 16 nodes in parallel, which should roughly double
+the current performance.

 Contributions are greatly appreciated.
 [All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134)
@ -33,5 +34,5 @@ BenchmarkSum256/64           105 ns/op       609.51 MB/s
 BenchmarkSum256/1024        1778 ns/op       576.00 MB/s
 BenchmarkSum256/65536      24785 ns/op      2644.15 MB/s
 BenchmarkWrite               389 ns/op      2631.78 MB/s
-BenchmarkXOF                1591 ns/op       643.80 MB/s
+BenchmarkXOF                 293 ns/op      3492.94 MB/s
 ```
--- a/avo/gen.go
+++ b/avo/gen.go
@ -12,6 +12,7 @@ import (

 func main() {
 	genGlobals()
+	genCompressBlocksAVX2()
 	genCompressChunksAVX2()

 	Generate()
@ -38,7 +39,6 @@ func genGlobals() {
 	for i := 0; i < 8; i++ {
 		DATA(i*4, U32(64))
 	}
-
 	globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR)
 	for i := 0; i < 8; i++ {
 		DATA(i*4, U32(i*1024))
@ -67,6 +67,80 @@ func genGlobals() {
 	}
 }

+func genCompressBlocksAVX2() { // emits compressBlocksAVX2: one block/cv pair compressed at 8 consecutive counter values, 64 output bytes per counter
+	TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)")
+	out := Mem{Base: Load(Param("out"), GP64())}
+	block := Mem{Base: Load(Param("block"), GP64())}
+	cv := Mem{Base: Load(Param("cv"), GP64())}
+	counter, _ := Param("counter").Resolve() // scalar params are read in place through .Addr below; the second result of Resolve is discarded
+	blockLen, _ := Param("blockLen").Resolve()
+	flags, _ := Param("flags").Resolve()
+
+	vs := [16]VecVirtual{ // one vector register per state word (see "Initialize state vectors" below)
+		YMM(), YMM(), YMM(), YMM(),
+		YMM(), YMM(), YMM(), YMM(),
+		YMM(), YMM(), YMM(), YMM(),
+		YMM(), YMM(), YMM(), YMM(),
+	}
+
+	// stack space for message vectors (one 32-byte slot per message word)
+	var mv [16]Mem
+	for i := range mv {
+		mv[i] = AllocLocal(32)
+	}
+	// stack space for spilled vs[8] register
+	spillMem := AllocLocal(32)
+
+	Comment("Load block")
+	for i := 0; i < 16; i++ { // broadcast each message word to every lane and park it on the stack
+		VPBROADCASTD(block.Offset(i*4), vs[0]) // vs[0] is only a temporary here; it is initialized for real below
+		VMOVDQU(vs[0], mv[i])
+	}
+
+	Comment("Initialize state vectors")
+	for i, v := range vs {
+		switch i {
+		case 0, 1, 2, 3, 4, 5, 6, 7: // cv word i, broadcast to every lane
+			VPBROADCASTD(cv.Offset(i*4), v)
+		case 8, 9, 10, 11: // iv word i-8, broadcast to every lane
+			VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
+		case 12: // counter: fills vs[12] (low words) and vs[13] (high words); vs[14] and vs[15] serve as scratch until they are set below
+			loadCounter(counter.Addr, vs[12:14], vs[14:16])
+		case 14: // blockLen
+			VPBROADCASTD(blockLen.Addr, v)
+		case 15: // flags
+			VPBROADCASTD(flags.Addr, v)
+		}
+	}
+
+	performRounds(vs, mv, spillMem) // emits the 7 rounds; spills and reloads vs[8] internally
+
+	Comment("Finalize CVs")
+	for i := 8; i < 16; i++ { // stash vs[8:] in message slots (the message is not read again), because the transpose below writes into vs[8:]
+		VMOVDQU(vs[i], mv[i])
+	}
+	for i := range vs[:8] { // first half of the output: vs[i] ^ vs[i+8]
+		VPXOR(vs[i], vs[i+8], vs[i])
+	}
+	transpose(vs[:8], vs[8:]) // result lands in vs[8:]; vs[:8] is clobbered
+	for i, v := range vs[8:] { // each register goes to the first 32 bytes of one 64-byte output block
+		VMOVDQU(v, out.Offset(i*64))
+	}
+	for i := 8; i < 16; i++ { // restore vs[8:] from the stash
+		VMOVDQU(mv[i], vs[i])
+	}
+	for i, v := range vs[8:] { // second half of the output: vs[i+8] ^ cv[i]
+		VPBROADCASTD(cv.Offset(i*4), vs[0]) // vs[0] is free to use as a temporary at this point
+		VPXOR(vs[0], v, v)
+	}
+	transpose(vs[8:], vs[:8]) // result lands in vs[:8]
+	for i, v := range vs[:8] { // each register goes to the last 32 bytes of one 64-byte output block
+		VMOVDQU(v, out.Offset(i*64+32))
+	}
+
+	RET()
+}
+
 func genCompressChunksAVX2() {
 	TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)")
 	cvs := Mem{Base: Load(Param("cvs"), GP64())}
@ -97,16 +171,7 @@ func genCompressChunksAVX2() {
 	Comment("Initialize counter")
 	counterLo := AllocLocal(32)
 	counterHi := AllocLocal(32)
-	VPBROADCASTQ(counter.Addr, vs[12])
-	VPBROADCASTQ(counter.Addr, vs[13])
-	VPADDQ(globals.incrementCounter.Offset(0*32), vs[12], vs[12])
-	VPADDQ(globals.incrementCounter.Offset(1*32), vs[13], vs[13])
-	VPUNPCKLDQ(vs[13], vs[12], vs[14])
-	VPUNPCKHDQ(vs[13], vs[12], vs[15])
-	VPUNPCKLDQ(vs[15], vs[14], vs[12])
-	VPUNPCKHDQ(vs[15], vs[14], vs[13])
-	VPERMQ(Imm(0xd8), vs[12], vs[12])
-	VPERMQ(Imm(0xd8), vs[13], vs[13])
+	loadCounter(counter.Addr, vs[12:14], vs[14:16])
 	VMOVDQU(vs[12], counterLo)
 	VMOVDQU(vs[13], counterHi)

@ -141,21 +206,9 @@ func genCompressChunksAVX2() {
 	VMOVDQU(globals.blockLen, vs[14])
 	VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])

-	VMOVDQU(vs[8], spillMem) // spill
-	for i := 0; i < 7; i++ {
-		Comment(fmt.Sprintf("Round %v", i+1))
-		round(vs, mv, vs[8], spillMem)
-		// permute
-		mv = [16]Mem{
-			mv[2], mv[6], mv[3], mv[10],
-			mv[7], mv[0], mv[4], mv[13],
-			mv[1], mv[11], mv[12], mv[5],
-			mv[9], mv[14], mv[15], mv[8],
-		}
-	}
+	performRounds(vs, mv, spillMem)

 	Comment("Finalize CVs")
-	VMOVDQU(spillMem, vs[8]) // reload
 	for i := range vs[:8] {
 		VPXOR(vs[i], vs[i+8], vs[i])
 	}
@ -166,31 +219,19 @@ func genCompressChunksAVX2() {
 	JNE(LabelRef("loop"))

 	Comment("Finished; transpose CVs")
-	src, dst := vs[:8], vs[8:]
-	// interleave uint32s
-	for i := 0; i < 8; i += 2 {
-		VPUNPCKLDQ(src[i+1], src[i], dst[i+0])
-		VPUNPCKHDQ(src[i+1], src[i], dst[i+1])
-	}
-	// interleave groups of two uint32s
-	for i := 0; i < 4; i++ {
-		j := i*2 - i%2 // j := 0,1,4,5
-		VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0])
-		VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1])
-	}
-	// interleave groups of four uint32s
-	for i := 0; i < 4; i++ {
-		VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0])
-		VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
-	}
-	for i, v := range dst {
+	transpose(vs[:8], vs[8:])
+	for i, v := range vs[8:] {
 		VMOVDQU(v, cvs.Offset(i*32))
 	}

 	RET()
 }

-func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) {
+func performRounds(sv [16]VecVirtual, mv [16]Mem, spillMem Mem) {
+	tmp := sv[8]
+	VMOVDQU(sv[8], spillMem) // spill
+	for i := 0; i < 7; i++ {
+		Comment(fmt.Sprintf("Round %v", i+1))
 		g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem)
 		g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem)
 		g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem)
@ -199,6 +240,16 @@ func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) {
 		g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem)
 		g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem)
 		g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem)
+
+		// permute
+		mv = [16]Mem{
+			mv[2], mv[6], mv[3], mv[10],
+			mv[7], mv[0], mv[4], mv[13],
+			mv[1], mv[11], mv[12], mv[5],
+			mv[9], mv[14], mv[15], mv[8],
+		}
+	}
+	VMOVDQU(spillMem, sv[8]) // reload
 }

 func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {
@ -237,3 +288,38 @@ func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {
 	VPXOR(b, c, b)
 	rotr(b, 7, b)
 }
+
+func loadCounter(counter Mem, dst, scratch []VecVirtual) { // dst and scratch must each hold at least two registers; scratch is clobbered
+	// fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so
+	// that dst[0] contains low 32 bits and dst[1] contains high 32 bits.
+	VPBROADCASTQ(counter, dst[0])
+	VPBROADCASTQ(counter, dst[1])
+	VPADDQ(globals.incrementCounter.Offset(0*32), dst[0], dst[0]) // per-lane increments from the first 32 bytes of the table
+	VPADDQ(globals.incrementCounter.Offset(1*32), dst[1], dst[1]) // per-lane increments from the second 32 bytes of the table
+	VPUNPCKLDQ(dst[1], dst[0], scratch[0]) // first pass: interleave 32-bit words of dst[0] and dst[1] into scratch
+	VPUNPCKHDQ(dst[1], dst[0], scratch[1])
+	VPUNPCKLDQ(scratch[1], scratch[0], dst[0]) // second pass: interleave again, back into dst
+	VPUNPCKHDQ(scratch[1], scratch[0], dst[1])
+	const perm = 0<<0 | 2<<2 | 1<<4 | 3<<6 // = 0xd8: output quadwords take input quadwords 0, 2, 1, 3
+	VPERMQ(Imm(perm), dst[0], dst[0])
+	VPERMQ(Imm(perm), dst[1], dst[1])
+}
+
+func transpose(src, dst []VecVirtual) { // emits an 8-register transpose from src into dst; both need 8 registers, and src is overwritten as intermediate storage
+	// interleave uint32s
+	for i := 0; i < 8; i += 2 {
+		VPUNPCKLDQ(src[i+1], src[i], dst[i+0])
+		VPUNPCKHDQ(src[i+1], src[i], dst[i+1])
+	}
+	// interleave groups of two uint32s
+	for i := 0; i < 4; i++ {
+		j := i*2 - i%2 // j := 0,1,4,5
+		VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0]) // this stage writes back into src
+		VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1])
+	}
+	// interleave groups of four uint32s
+	for i := 0; i < 4; i++ {
+		VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0]) // 0x20: combine the low 128-bit halves of both inputs
+		VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) // 0x31: combine the high 128-bit halves of both inputs
+	}
+}
--- a/blake3.go
+++ b/blake3.go
@ -224,7 +224,7 @@ func DeriveKey(subKey []byte, ctx string, srcKey []byte) {
 // bytes.
 type OutputReader struct {
 	n   node
-	block [blockSize]byte
+	buf [8 * blockSize]byte
 	off uint64
 }

@ -238,11 +238,11 @@ func (or *OutputReader) Read(p []byte) (int, error) {
 	}
 	lenp := len(p)
 	for len(p) > 0 {
-		if or.off%blockSize == 0 {
+		if or.off%(8*blockSize) == 0 {
 			or.n.counter = or.off / blockSize
-			wordsToBytes(compressNode(or.n), &or.block)
+			compressBlocks(&or.buf, or.n)
 		}
-		n := copy(p, or.block[or.off%blockSize:])
+		n := copy(p, or.buf[or.off%(8*blockSize):])
 		p = p[n:]
 		or.off += uint64(n)
 	}
@ -274,8 +274,8 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) {
 	}
 	or.off = off
 	or.n.counter = uint64(off) / blockSize
-	if or.off%blockSize != 0 {
-		wordsToBytes(compressNode(or.n), &or.block)
+	if or.off%(8*blockSize) != 0 {
+		compressBlocks(&or.buf, or.n)
 	}
 	// NOTE: or.off >= 2^63 will result in a negative return value.
 	// Nothing we can do about this.
--- a/blake3_amd64.s
+++ b/blake3_amd64.s
--- a/compress_amd64.go
+++ b/compress_amd64.go
@ -11,6 +11,9 @@ import (
 //go:noescape
 func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)

+//go:noescape
+func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) // body is assembly generated by avo/gen.go; parameter names mirror its TEXT signature
+
 func compressNode(n node) (out [16]uint32) {
 	compressNodeGeneric(&out, n)
 	return
@ -60,10 +63,6 @@ func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) n
 	return n
 }

-func wordsToBytes(words [16]uint32, block *[64]byte) {
-	*block = *(*[64]byte)(unsafe.Pointer(&words))
-}
-
 func hashBlock(out *[64]byte, buf []byte) {
 	var block [16]uint32
 	copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)
@ -74,3 +73,17 @@ func hashBlock(out *[64]byte, buf []byte) {
 		flags:    flagChunkStart | flagChunkEnd | flagRoot,
 	})
 }
+
+func compressBlocks(out *[512]byte, n node) { // fills out from n, using the AVX2 routine when the CPU supports it
+	switch {
+	case cpu.X86.HasAVX2:
+		compressBlocksAVX2(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+	default:
+		compressBlocksGeneric((*[8][64]byte)(unsafe.Pointer(out)), n) // reinterpret out as eight 64-byte blocks (same 512 bytes, no copy)
+	}
+
+}
+
+func wordsToBytes(words [16]uint32, block *[64]byte) { // copies the in-memory bytes of words into block
+	*block = *(*[64]byte)(unsafe.Pointer(&words)) // reinterpreting cast: no per-word byte-order conversion is performed
+}
--- a/compress_generic.go
+++ b/compress_generic.go
@ -115,6 +115,13 @@ func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter
 	return mergeSubtrees(cvs, key, flags)
 }

+func compressBlocksGeneric(outs *[8][64]byte, n node) { // portable path: compresses n once per entry of outs, at consecutive counter values
+	for i := range outs {
+		wordsToBytes(compressNode(n), &outs[i])
+		n.counter++ // n was passed by value, so the caller's node is not modified
+	}
+}
+
 func chainingValue(n node) (cv [8]uint32) {
 	full := compressNode(n)
 	copy(cv[:], full[:])
--- a/compress_noasm.go
+++ b/compress_noasm.go
@ -51,6 +51,14 @@ func hashBlock(out *[64]byte, buf []byte) {
 	wordsToBytes(words, out)
 }

+func compressBlocks(out *[512]byte, n node) { // delegates to compressBlocksGeneric, then flattens its 8 blocks into out
+	var outs [8][64]byte // temporary with the shape compressBlocksGeneric expects
+	compressBlocksGeneric(&outs, n)
+	for i := range outs {
+		copy(out[i*64:], outs[i][:]) // block i occupies bytes [i*64, i*64+64) of out
+	}
+}
+
 func bytesToWords(bytes [64]byte, words *[16]uint32) {
 	for i := range words {
 		words[i] = binary.LittleEndian.Uint32(bytes[4*i:])