add AVX512 implementations

parent 221995220f
commit 7a6b5a0fe1

README.md (56 lines changed)
@@ -12,14 +12,9 @@ go get lukechampine.com/blake3
 This implementation aims to be performant without sacrificing (too much)
 readability, in the hopes of eventually landing in `x/crypto`.
 
-The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s.
-There is a separate code path for small inputs (up to 64 bytes) that runs in
-~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by
-an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 nodes in parallel,
-achieving throughput of ~2600 MB/s. AVX2 is also used for BLAKE3's extendable output function,
-enabling it to stream pseudorandom bytes at ~3500 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it
-will be possible to compress 16 nodes in parallel, which should roughly double
-the current performance.
+In addition to the pure-Go implementation, this package also contains AVX-512
+and AVX2 routines (generated by [`avo`](https://github.com/mmcloughlin/avo))
+that greatly increase performance for large inputs and outputs.
 
 Contributions are greatly appreciated.
 [All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134)
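For context, a minimal usage sketch of the API this package exposes. `Sum256`, `Hasher.Write`/`Sum`, and the XOF `OutputReader` all appear in the `blake3.go` changes below; the `New(size, key)` signature is assumed from the package documentation rather than shown in this diff:

```go
package main

import (
    "fmt"

    "lukechampine.com/blake3"
)

func main() {
    msg := []byte("hello world")

    // One-shot 256-bit digest.
    fmt.Printf("%x\n", blake3.Sum256(msg))

    // Incremental hashing; assumed signature New(digestSize, key), with
    // key == nil selecting the plain (unkeyed) hash.
    h := blake3.New(32, nil)
    h.Write(msg)
    fmt.Printf("%x\n", h.Sum(nil))

    // Extendable output: stream as many pseudorandom bytes as needed.
    var stream [64]byte
    h.XOF().Read(stream[:])
    fmt.Printf("%x\n", stream[:])
}
```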
@@ -27,12 +22,45 @@ Contributions are greatly appreciated.
 
 ## Benchmarks
 
-Tested on an i5-7600K @ 3.80GHz.
+Tested on a 2020 MacBook Air (i5-7600K @ 3.80GHz). Benchmarks will improve as
+soon as I get access to a beefier AVX-512 machine. :wink:
+
+### AVX-512
 
 ```
-BenchmarkSum256/64        105 ns/op   609.51 MB/s
-BenchmarkSum256/1024     1778 ns/op   576.00 MB/s
-BenchmarkSum256/65536   24785 ns/op  2644.15 MB/s
-BenchmarkWrite            389 ns/op  2631.78 MB/s
-BenchmarkXOF              293 ns/op  3492.94 MB/s
+BenchmarkSum256/64        120 ns/op   533.00 MB/s
+BenchmarkSum256/1024     2229 ns/op   459.36 MB/s
+BenchmarkSum256/65536   16245 ns/op  4034.11 MB/s
+BenchmarkWrite            245 ns/op  4177.38 MB/s
+BenchmarkXOF              246 ns/op  4159.30 MB/s
 ```
+
+### AVX2
+
+```
+BenchmarkSum256/64        120 ns/op   533.00 MB/s
+BenchmarkSum256/1024     2229 ns/op   459.36 MB/s
+BenchmarkSum256/65536   31137 ns/op  2104.76 MB/s
+BenchmarkWrite            487 ns/op  2103.12 MB/s
+BenchmarkXOF              329 ns/op  3111.27 MB/s
+```
+
+### Pure Go
+
+```
+BenchmarkSum256/64        120 ns/op   533.00 MB/s
+BenchmarkSum256/1024     2229 ns/op   459.36 MB/s
+BenchmarkSum256/65536  133505 ns/op   490.89 MB/s
+BenchmarkWrite           2022 ns/op   506.36 MB/s
+BenchmarkXOF             1914 ns/op   534.98 MB/s
+```
+
+## Shortcomings
+
+There is no assembly routine for single-block compressions. This is most
+noticeable for ~1KB inputs.
+
+Each assembly routine inlines all 7 rounds, causing thousands of lines of
+duplicated code. Ideally the routines could be merged such that only a single
+routine is generated for AVX-512 and AVX2, without sacrificing too much
+performance.
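The rows above are standard Go benchmark output (`go test -bench .`). A benchmark of roughly this shape — a hypothetical reconstruction, not the package's actual test code — produces the `BenchmarkSum256` rows:

```go
package blake3_test

import (
    "testing"

    "lukechampine.com/blake3"
)

// SetBytes is what makes `go test` report the MB/s column above.
func BenchmarkSum256(b *testing.B) {
    buf := make([]byte, 65536)
    b.SetBytes(int64(len(buf)))
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        blake3.Sum256(buf)
    }
}
```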
avo/gen.go (564 lines changed)
@@ -6,26 +6,28 @@ import (
 	"fmt"
 
 	. "github.com/mmcloughlin/avo/build"
+	"github.com/mmcloughlin/avo/ir"
 	. "github.com/mmcloughlin/avo/operand"
 	. "github.com/mmcloughlin/avo/reg"
 )
 
 func main() {
 	genGlobals()
+	genCompressBlocksAVX512()
+	genCompressChunksAVX512()
 	genCompressBlocksAVX2()
 	genCompressChunksAVX2()
+	genCompressParentsAVX2()
 
 	Generate()
 }
 
 var globals struct {
-	iv               Mem
-	blockLen         Mem
-	stride1024       Mem
-	incrementCounter Mem
-	setFlags         Mem
-	shuffleRot8      Mem
-	shuffleRot16     Mem
+	iv           Mem
+	seq          Mem
+	seq64        Mem // for loadCounter
+	shuffleRot8  Mem
+	shuffleRot16 Mem
 }
 
 func genGlobals() {
@@ -35,28 +37,14 @@ func genGlobals() {
 	DATA(2*4, U32(0x3C6EF372))
 	DATA(3*4, U32(0xA54FF53A))
 
-	globals.blockLen = GLOBL("block_len", RODATA|NOPTR)
-	for i := 0; i < 8; i++ {
-		DATA(i*4, U32(64))
+	globals.seq = GLOBL("seq", RODATA|NOPTR)
+	for i := 0; i < 16; i++ {
+		DATA(i*4, U32(i))
 	}
-	globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR)
-	for i := 0; i < 8; i++ {
-		DATA(i*4, U32(i*1024))
-	}
-	globals.incrementCounter = GLOBL("increment_counter", RODATA|NOPTR)
+	globals.seq64 = GLOBL("seq64", RODATA|NOPTR)
 	for i := 0; i < 8; i++ {
 		DATA(i*8, U64(i))
 	}
-	globals.setFlags = GLOBL("set_flags", RODATA|NOPTR)
-	for i := 0; i < 16; i++ {
-		if i == 0 {
-			DATA(i*4, U32(1))
-		} else if i == 15 {
-			DATA(i*4, U32(2))
-		} else {
-			DATA(i*4, U32(0))
-		}
-	}
 	globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR)
 	for i := 0; i < 8; i++ {
 		DATA(i*4, U32(0x00030201+0x04040404*i))
@@ -67,6 +55,186 @@ func genGlobals() {
 	}
 }
 
+func genCompressBlocksAVX512() {
+	TEXT("compressBlocksAVX512", NOSPLIT, "func(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)")
+	out := Mem{Base: Load(Param("out"), GP64())}
+	block := Mem{Base: Load(Param("block"), GP64())}
+	cv := Mem{Base: Load(Param("cv"), GP64())}
+	counter, _ := Param("counter").Resolve()
+	blockLen, _ := Param("blockLen").Resolve()
+	flags, _ := Param("flags").Resolve()
+
+	Comment("Initialize block vectors")
+	var vs, mv [16]VecVirtual
+	for i := range vs {
+		vs[i], mv[i] = ZMM(), ZMM()
+		VPBROADCASTD_Z(block.Offset(i*4), mv[i])
+	}
+
+	Comment("Initialize state vectors")
+	for i, v := range vs {
+		switch i {
+		case 0, 1, 2, 3, 4, 5, 6, 7: // cv
+			VPBROADCASTD_Z(cv.Offset(i*4), v)
+		case 8, 9, 10, 11: // iv
+			VPBROADCASTD_Z(globals.iv.Offset((i-8)*4), v)
+		case 12: // counter
+			VPBROADCASTD_Z(counter.Addr, vs[12])
+			VPADDD_Z(globals.seq, vs[12], vs[12])
+			// set a 1 bit in K1 for each overflowed counter in vs[12]
+			VPCMPUD(Imm(1), globals.seq, vs[12], K1)
+			// add 1 to each counter in vs[13] for each 1 bit in K1
+			VPBROADCASTD_Z(counter.Addr.Offset(1*4), vs[13])
+			VPADDD_ZBK(globals.seq.Offset(4), vs[13], K1, vs[13])
+		case 14: // blockLen
+			VPBROADCASTD_Z(blockLen.Addr, v)
+		case 15: // flags
+			VPBROADCASTD_Z(flags.Addr, v)
+		}
+	}
+
+	performRoundsAVX512(vs, mv)
+
+	Comment("Finalize CVs")
+	for i, v := range vs[:8] {
+		VPXORD_Z(v, vs[i+8], v)
+	}
+	for i, v := range vs[8:] {
+		VPXORD_ZB(cv.Offset(i*4), v, v)
+	}
+	stride := ZMM()
+	VMOVDQU32_Z(globals.seq, stride)
+	VPSLLD_Z(Imm(6), stride, stride) // stride of 64
+	for i, v := range vs {
+		KXNORD(K1, K1, K1) // fastest way to set all bits to 1
+		VPSCATTERDD_Z(v, K1, out.Offset(i*4).Idx(stride, 1))
+	}
+
+	RET()
+}
+
+func genCompressChunksAVX512() {
+	TEXT("compressChunksAVX512", NOSPLIT, "func(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)")
+	cvs := Mem{Base: Load(Param("cvs"), GP64())}
+	buf := Mem{Base: Load(Param("buf"), GP64())}
+	key := Mem{Base: Load(Param("key"), GP64())}
+	counter, _ := Param("counter").Resolve()
+	flags, _ := Param("flags").Resolve()
+
+	var vs, mv [16]VecVirtual
+	for i := range vs {
+		vs[i], mv[i] = ZMM(), ZMM()
+	}
+
+	Comment("Initialize counter")
+	counterLo := AllocLocal(64)
+	counterHi := AllocLocal(64)
+	VPBROADCASTD_Z(counter.Addr, vs[0])
+	VPADDD_Z(globals.seq, vs[0], vs[0])
+	VPCMPUD(Imm(1), globals.seq, vs[0], K1)
+	VPBROADCASTD_Z(counter.Addr.Offset(4), vs[1])
+	VPADDD_ZBK(globals.seq.Offset(4), vs[1], K1, vs[1])
+	VMOVDQU32_Z(vs[0], counterLo)
+	VMOVDQU32_Z(vs[1], counterHi)
+
+	Comment("Initialize flags")
+	chunkFlags := AllocLocal(16 * 4)
+	VPBROADCASTD_Z(flags.Addr, vs[0])
+	VMOVDQU32_Z(vs[0], chunkFlags)
+	ORL(Imm(1), chunkFlags.Offset(0*4))
+	ORL(Imm(2), chunkFlags.Offset(15*4))
+
+	Comment("Load key")
+	for i := 0; i < 8; i++ {
+		VPBROADCASTD_Z(key.Offset(i*4), vs[i])
+	}
+
+	Comment("Loop index")
+	loop := GP64()
+	XORQ(loop, loop)
+	Label("loop")
+
+	Comment("Load transposed block")
+	VMOVDQU32_Z(globals.seq, vs[8])
+	VPSLLD_Z(Imm(10), vs[8], vs[8]) // stride of 1024
+	for i, m := range mv {
+		KXNORD(K1, K1, K1)
+		VPGATHERDD_Z(buf.Offset(i*4).Idx(vs[8], 1), K1, m)
+	}
+	ADDQ(Imm(64), buf.Base)
+
+	Comment("Reload state vectors (other than CVs)")
+	for i := 0; i < 4; i++ {
+		VPBROADCASTD_Z(globals.iv.Offset(i*4), vs[8+i])
+	}
+	VMOVDQU32_Z(counterLo, vs[12])
+	VMOVDQU32_Z(counterHi, vs[13])
+	VPBROADCASTD_Z(globals.seq.Offset(4), vs[14])
+	VPSLLD_Z(Imm(6), vs[14], vs[14]) // 64
+	VPBROADCASTD_Z(chunkFlags.Idx(loop, 4), vs[15])
+
+	performRoundsAVX512(vs, mv)
+
+	Comment("Finalize CVs")
+	for i := range vs[:8] {
+		VPXORD_Z(vs[i], vs[i+8], vs[i])
+	}
+
+	Comment("Loop")
+	INCQ(loop)
+	CMPQ(loop, U32(16))
+	JNE(LabelRef("loop"))
+
+	Comment("Finished; transpose CVs")
+	VMOVDQU32_Z(globals.seq, vs[8])
+	VPSLLD_Z(Imm(5), vs[8], vs[8]) // stride of 32
+	for i, v := range vs[:8] {
+		KXNORD(K1, K1, K1) // fastest way to set all bits to 1
+		VPSCATTERDD_Z(v, K1, cvs.Offset(i*4).Idx(vs[8], 1))
+	}
+
+	RET()
+}
+
+func performRoundsAVX512(vs, mv [16]VecVirtual) {
+	g := func(a, b, c, d, mx, my VecVirtual) {
+		VPADDD_Z(a, b, a)
+		VPADDD_Z(mx, a, a)
+		VPXORD_Z(d, a, d)
+		VPRORD_Z(Imm(16), d, d)
+		VPADDD_Z(c, d, c)
+		VPXORD_Z(b, c, b)
+		VPRORD_Z(Imm(12), b, b)
+		VPADDD_Z(a, b, a)
+		VPADDD_Z(my, a, a)
+		VPXORD_Z(d, a, d)
+		VPRORD_Z(Imm(8), d, d)
+		VPADDD_Z(c, d, c)
+		VPXORD_Z(b, c, b)
+		VPRORD_Z(Imm(7), b, b)
+	}
+
+	for i := 0; i < 7; i++ {
+		Comment(fmt.Sprintf("Round %v", i+1))
+		g(vs[0], vs[4], vs[8], vs[12], mv[0], mv[1])
+		g(vs[1], vs[5], vs[9], vs[13], mv[2], mv[3])
+		g(vs[2], vs[6], vs[10], vs[14], mv[4], mv[5])
+		g(vs[3], vs[7], vs[11], vs[15], mv[6], mv[7])
+		g(vs[0], vs[5], vs[10], vs[15], mv[8], mv[9])
+		g(vs[1], vs[6], vs[11], vs[12], mv[10], mv[11])
+		g(vs[2], vs[7], vs[8], vs[13], mv[12], mv[13])
+		g(vs[3], vs[4], vs[9], vs[14], mv[14], mv[15])
+
+		// permute
+		mv = [16]VecVirtual{
+			mv[2], mv[6], mv[3], mv[10],
+			mv[7], mv[0], mv[4], mv[13],
+			mv[1], mv[11], mv[12], mv[5],
+			mv[9], mv[14], mv[15], mv[8],
+		}
+	}
+}
+
 func genCompressBlocksAVX2() {
 	TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)")
 	out := Mem{Base: Load(Param("out"), GP64())}
@@ -76,20 +244,12 @@ func genCompressBlocksAVX2() {
 	blockLen, _ := Param("blockLen").Resolve()
 	flags, _ := Param("flags").Resolve()
 
-	vs := [16]VecVirtual{
-		YMM(), YMM(), YMM(), YMM(),
-		YMM(), YMM(), YMM(), YMM(),
-		YMM(), YMM(), YMM(), YMM(),
-		YMM(), YMM(), YMM(), YMM(),
-	}
-
 	// stack space for message vectors
+	var vs [16]VecVirtual
 	var mv [16]Mem
-	for i := range mv {
+	for i := range vs {
+		vs[i] = YMM()
 		mv[i] = AllocLocal(32)
 	}
-	// stack space for spilled vs[8] register
-	spillMem := AllocLocal(32)
 
 	Comment("Load block")
 	for i := 0; i < 16; i++ {
@@ -113,7 +273,7 @@ func genCompressBlocksAVX2() {
 		}
 	}
 
-	performRounds(vs, mv, spillMem)
+	performRoundsAVX2(vs, mv)
 
 	Comment("Finalize CVs")
 	for i := 8; i < 16; i++ {
@@ -149,19 +309,12 @@ func genCompressChunksAVX2() {
 	counter, _ := Param("counter").Resolve()
 	flags, _ := Param("flags").Resolve()
 
-	vs := [16]VecVirtual{
-		YMM(), YMM(), YMM(), YMM(),
-		YMM(), YMM(), YMM(), YMM(),
-		YMM(), YMM(), YMM(), YMM(),
-		YMM(), YMM(), YMM(), YMM(),
-	}
 	// stack space for transposed message vectors
+	var vs [16]VecVirtual
 	var mv [16]Mem
-	for i := range mv {
+	for i := range vs {
+		vs[i] = YMM()
 		mv[i] = AllocLocal(32)
 	}
-	// stack space for spilled vs[8] register
-	spillMem := AllocLocal(32)
 
 	Comment("Load key")
 	for i := 0; i < 8; i++ {
@@ -178,10 +331,10 @@ func genCompressChunksAVX2() {
 	Comment("Initialize flags")
 	chunkFlags := AllocLocal(16 * 4)
 	VPBROADCASTD(flags.Addr, vs[14])
-	VPOR(globals.setFlags.Offset(0*32), vs[14], vs[15])
-	VMOVDQU(vs[15], chunkFlags.Offset(0*32))
-	VPOR(globals.setFlags.Offset(1*32), vs[14], vs[15])
-	VMOVDQU(vs[15], chunkFlags.Offset(1*32))
+	VMOVDQU(vs[14], chunkFlags.Offset(0*32))
+	VMOVDQU(vs[14], chunkFlags.Offset(1*32))
+	ORL(Imm(1), chunkFlags.Offset(0*4))
+	ORL(Imm(2), chunkFlags.Offset(15*4))
 
 	Comment("Loop index")
 	loop := GP64()
@@ -189,7 +342,8 @@ func genCompressChunksAVX2() {
 	Label("loop")
 
 	Comment("Load transposed block")
-	VMOVDQU(globals.stride1024, vs[9])
+	VMOVDQU(globals.seq, vs[9])
+	VPSLLD(Imm(10), vs[9], vs[9]) // stride of 1024
 	for i := 0; i < 16; i++ {
 		VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1
 		VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10])
@@ -203,10 +357,11 @@ func genCompressChunksAVX2() {
 	}
 	VMOVDQU(counterLo, vs[12])
 	VMOVDQU(counterHi, vs[13])
-	VMOVDQU(globals.blockLen, vs[14])
+	VPBROADCASTD(globals.seq.Offset(4), vs[14])
+	VPSLLD(Imm(6), vs[14], vs[14]) // 64
 	VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])
 
-	performRounds(vs, mv, spillMem)
+	performRoundsAVX2(vs, mv)
 
 	Comment("Finalize CVs")
 	for i := range vs[:8] {
@@ -227,19 +382,112 @@ func genCompressChunksAVX2() {
 	RET()
 }
 
-func performRounds(sv [16]VecVirtual, mv [16]Mem, spillMem Mem) {
+func genCompressParentsAVX2() {
+	TEXT("compressParentsAVX2", NOSPLIT, "func(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)")
+	parents := Mem{Base: Load(Param("parents"), GP64())}
+	cvs := Mem{Base: Load(Param("cvs"), GP64())}
+	key := Mem{Base: Load(Param("key"), GP64())}
+	flags, _ := Param("flags").Resolve()
+
+	var vs [16]VecVirtual
+	var mv [16]Mem
+	for i := range vs {
+		vs[i] = YMM()
+		mv[i] = AllocLocal(32)
+	}
+
+	Comment("Load transposed block")
+	VMOVDQU(globals.seq, vs[9])
+	VPSLLD(Imm(6), vs[9], vs[9]) // stride of 64
+	for i := 0; i < 16; i++ {
+		VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1
+		VPGATHERDD(vs[8], cvs.Offset(i*4).Idx(vs[9], 1), vs[10])
+		VMOVDQU(vs[10], mv[i])
+	}
+
+	Comment("Initialize state vectors")
+	for i, v := range vs {
+		switch i {
+		case 0, 1, 2, 3, 4, 5, 6, 7: // cv
+			VPBROADCASTD(key.Offset(i*4), v)
+		case 8, 9, 10, 11: // iv
+			VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
+		case 12, 13: // counter
+			VPXOR(v, v, v)
+		case 14: // blockLen
+			VPBROADCASTD(globals.seq.Offset(1*4), v)
+			VPSLLD(Imm(6), v, v) // 64
+		case 15: // flags
+			ORL(Imm(4), flags.Addr) // flagParent
+			VPBROADCASTD(flags.Addr, v)
+		}
+	}
+
+	performRoundsAVX2(vs, mv)
+
+	Comment("Finalize CVs")
+	for i := range vs[:8] {
+		VPXOR(vs[i], vs[i+8], vs[i])
+	}
+	transpose(vs[:8], vs[8:])
+	for i, v := range vs[8:] {
+		VMOVDQU(v, parents.Offset(i*32))
+	}
+
+	RET()
+}
+
+func performRoundsAVX2(sv [16]VecVirtual, mv [16]Mem) {
+	spillMem := AllocLocal(32)
 	tmp := sv[8]
+	g := func(a, b, c, d VecVirtual, mx, my Mem) {
+		// Helper function for performing rotations. Also manages c, tmp and
+		// spillMem: if c == tmp, we need to spill and reload c using spillMem.
+		rotr := func(v VecVirtual, n uint64, dst VecVirtual) {
+			switch n {
+			case 8, 16:
+				shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n]
+				VPSHUFB(shuf, v, dst)
+				if c == tmp {
+					VMOVDQU(spillMem, c)
+				}
+			case 7, 12:
+				if c == tmp {
+					VMOVDQU(c, spillMem)
+				}
+				VPSRLD(Imm(n), v, tmp)
+				VPSLLD(Imm(32-n), v, dst)
+				VPOR(dst, tmp, dst)
+			}
+		}
+
+		VPADDD(a, b, a)
+		VPADDD(mx, a, a)
+		VPXOR(d, a, d)
+		rotr(d, 16, d)
+		VPADDD(c, d, c)
+		VPXOR(b, c, b)
+		rotr(b, 12, b)
+		VPADDD(a, b, a)
+		VPADDD(my, a, a)
+		VPXOR(d, a, d)
+		rotr(d, 8, d)
+		VPADDD(c, d, c)
+		VPXOR(b, c, b)
+		rotr(b, 7, b)
+	}
 
 	VMOVDQU(sv[8], spillMem) // spill
 	for i := 0; i < 7; i++ {
 		Comment(fmt.Sprintf("Round %v", i+1))
-		g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem)
-		g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem)
-		g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem)
-		g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7], tmp, spillMem)
-		g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9], tmp, spillMem)
-		g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem)
-		g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem)
-		g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem)
+		g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1])
+		g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3])
+		g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5])
+		g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7])
+		g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9])
+		g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11])
+		g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13])
+		g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15])
 
 		// permute
 		mv = [16]Mem{
@@ -252,50 +500,13 @@ func performRounds(sv [16]VecVirtual, mv [16]Mem, spillMem Mem) {
 	VMOVDQU(spillMem, sv[8]) // reload
 }
 
-func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {
-	// Helper function for performing rotations. Also manages c, tmp and
-	// spillMem: if c == tmp, we need to spill and reload c using spillMem.
-	rotr := func(v VecVirtual, n uint64, dst VecVirtual) {
-		switch n {
-		case 8, 16:
-			shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n]
-			VPSHUFB(shuf, v, dst)
-			if c == tmp {
-				VMOVDQU(spillMem, c)
-			}
-		case 7, 12:
-			if c == tmp {
-				VMOVDQU(c, spillMem)
-			}
-			VPSRLD(Imm(n), v, tmp)
-			VPSLLD(Imm(32-n), v, dst)
-			VPOR(dst, tmp, dst)
-		}
-	}
-
-	VPADDD(a, b, a)
-	VPADDD(mx, a, a)
-	VPXOR(d, a, d)
-	rotr(d, 16, d)
-	VPADDD(c, d, c)
-	VPXOR(b, c, b)
-	rotr(b, 12, b)
-	VPADDD(a, b, a)
-	VPADDD(my, a, a)
-	VPXOR(d, a, d)
-	rotr(d, 8, d)
-	VPADDD(c, d, c)
-	VPXOR(b, c, b)
-	rotr(b, 7, b)
-}
-
 func loadCounter(counter Mem, dst, scratch []VecVirtual) {
 	// fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so
 	// that dst[0] contains low 32 bits and dst[1] contains high 32 bits.
 	VPBROADCASTQ(counter, dst[0])
 	VPBROADCASTQ(counter, dst[1])
-	VPADDQ(globals.incrementCounter.Offset(0*32), dst[0], dst[0])
-	VPADDQ(globals.incrementCounter.Offset(1*32), dst[1], dst[1])
+	VPADDQ(globals.seq64.Offset(0*4), dst[0], dst[0])
+	VPADDQ(globals.seq64.Offset(8*4), dst[1], dst[1])
 	VPUNPCKLDQ(dst[1], dst[0], scratch[0])
 	VPUNPCKHDQ(dst[1], dst[0], scratch[1])
 	VPUNPCKLDQ(scratch[1], scratch[0], dst[0])
@@ -323,3 +534,152 @@ func transpose(src, dst []VecVirtual) {
 		VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
 	}
 }
+
+// AVX-512 is not currently supported by avo, so we need to manually define the
+// instructions we need
+
+type maskReg = LabelRef // hack; avo doesn't allow custom Op types
+
+const K0 maskReg = "K0"
+const K1 maskReg = "K1"
+const K2 maskReg = "K2"
+
+func VMOVDQU32_Z(src, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VMOVDQU32",
+		Operands: []Op{src, dst},
+		Inputs:   []Op{src},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPBROADCASTD_Z(src, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPBROADCASTD",
+		Operands: []Op{src, dst},
+		Inputs:   []Op{src},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPGATHERDD_Z(src, mask, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPGATHERDD",
+		Operands: []Op{src, mask, dst},
+		Inputs:   []Op{src, mask},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPSCATTERDD_Z(src, mask, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPSCATTERDD",
+		Operands: []Op{src, mask, dst},
+		Inputs:   []Op{src, mask},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPORD_Z(x, y, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPORD",
+		Operands: []Op{x, y, dst},
+		Inputs:   []Op{x, y},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPXORD_Z(x, y, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPXORD",
+		Operands: []Op{x, y, dst},
+		Inputs:   []Op{x, y},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPXORD_ZB(x, y, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPXORD.BCST",
+		Operands: []Op{x, y, dst},
+		Inputs:   []Op{x, y},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPRORD_Z(n, src, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPRORD",
+		Operands: []Op{n, src, dst},
+		Inputs:   []Op{n, src},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPSLLD_Z(n, src, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPSLLD",
+		Operands: []Op{n, src, dst},
+		Inputs:   []Op{n, src},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPADDD_Z(x, y, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPADDD",
+		Operands: []Op{x, y, dst},
+		Inputs:   []Op{x, y},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPADDD_ZB(x, y, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPADDD.BCST",
+		Operands: []Op{x, y, dst},
+		Inputs:   []Op{x, y},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPADDD_ZBK(x, y, mask, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPADDD.BCST",
+		Operands: []Op{x, y, mask, dst},
+		Inputs:   []Op{x, y, mask},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func KXNORD(x, y, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "KXNORD",
+		Operands: []Op{x, y, dst},
+		Inputs:   []Op{x, y},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
+
+func VPCMPUD(pred, x, y, dst Op) {
+	Instruction(&ir.Instruction{
+		Opcode:   "VPCMPUD",
+		Operands: []Op{pred, x, y, dst},
+		Inputs:   []Op{pred, x, y},
+		Outputs:  []Op{dst},
+		ISA:      []string{"AVX512F"},
+	})
+}
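A scalar model of the counter setup in `genCompressBlocksAVX512`/`genCompressChunksAVX512` above may help: lane i of the 16-wide state holds the 64-bit value counter+i split into 32-bit halves, and the `VPCMPUD`/`VPADDD.BCST`-under-mask pair is a vectorized carry — a lane's low half wrapped exactly when (lo32 + i) < i. This is an illustrative sketch, not generated code:

```go
package main

import "fmt"

// counterLanes computes what the AVX-512 counter initialization computes:
// per-lane low/high halves of counter+i, with carry into the high word.
func counterLanes(counter uint64) (lo, hi [16]uint32) {
    for i := range lo {
        lo[i] = uint32(counter) + uint32(i)
        hi[i] = uint32(counter >> 32)
        if lo[i] < uint32(i) { // overflow: the K1 bit for this lane
            hi[i]++
        }
    }
    return
}

func main() {
    lo, hi := counterLanes(0xFFFFFFFE) // low word wraps starting at lane 2
    fmt.Println(lo[:4], hi[:4])        // [4294967294 4294967295 0 1] [0 0 1 1]
}
```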
blake3.go (30 lines changed)
@@ -21,6 +21,8 @@ const (
 
 	blockSize = 64
 	chunkSize = 1024
+
+	maxSIMD = 16 // AVX-512 vectors can store 16 words
 )
 
 var iv = [8]uint32{
@@ -58,10 +60,10 @@ type Hasher struct {
 	size int // output size, for Sum
 
 	// log(n) set of Merkle subtree roots, at most one per height.
-	stack   [51][8]uint32 // 2^51 * 8 * chunkSize = 2^64
+	stack   [50][8]uint32 // 2^50 * maxSIMD * chunkSize = 2^64
 	counter uint64 // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied
 
-	buf    [8 * chunkSize]byte
+	buf    [maxSIMD * chunkSize]byte
 	buflen int
 }
 
@@ -83,7 +85,7 @@ func (h *Hasher) pushSubtree(cv [8]uint32) {
 // rootNode computes the root of the Merkle tree. It does not modify the
 // stack.
 func (h *Hasher) rootNode() node {
-	n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags)
+	n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*maxSIMD, h.flags)
 	for i := bits.TrailingZeros64(h.counter); i < bits.Len64(h.counter); i++ {
 		if h.hasSubtreeAtHeight(i) {
 			n = parentNode(h.stack[i], chainingValue(n), h.key, h.flags)
@@ -98,7 +100,7 @@ func (h *Hasher) Write(p []byte) (int, error) {
 	lenp := len(p)
 	for len(p) > 0 {
 		if h.buflen == len(h.buf) {
-			n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags)
+			n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*maxSIMD, h.flags)
 			h.pushSubtree(chainingValue(n))
 			h.buflen = 0
 		}
@@ -119,8 +121,16 @@ func (h *Hasher) Sum(b []byte) (sum []byte) {
 		sum = make([]byte, total)
 		copy(sum, b)
 	}
-	// Read into the appended portion of sum
-	h.XOF().Read(sum[len(b):])
+	// Read into the appended portion of sum. Use a low-latency-low-throughput
+	// path for small digests (requiring a single compression), and a
+	// high-latency-high-throughput path for large digests.
+	if dst := sum[len(b):]; len(dst) <= 64 {
+		var out [64]byte
+		wordsToBytes(compressNode(h.rootNode()), &out)
+		copy(dst, out[:])
+	} else {
+		h.XOF().Read(dst)
+	}
 	return
 }
 
@@ -224,7 +234,7 @@ func DeriveKey(subKey []byte, ctx string, srcKey []byte) {
 // bytes.
 type OutputReader struct {
 	n   node
-	buf [8 * blockSize]byte
+	buf [maxSIMD * blockSize]byte
 	off uint64
 }
 
@@ -238,11 +248,11 @@ func (or *OutputReader) Read(p []byte) (int, error) {
 	}
 	lenp := len(p)
 	for len(p) > 0 {
-		if or.off%(8*blockSize) == 0 {
+		if or.off%(maxSIMD*blockSize) == 0 {
 			or.n.counter = or.off / blockSize
 			compressBlocks(&or.buf, or.n)
 		}
-		n := copy(p, or.buf[or.off%(8*blockSize):])
+		n := copy(p, or.buf[or.off%(maxSIMD*blockSize):])
 		p = p[n:]
 		or.off += uint64(n)
 	}
@@ -274,7 +284,7 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) {
 	}
 	or.off = off
 	or.n.counter = uint64(off) / blockSize
-	if or.off%(8*blockSize) != 0 {
+	if or.off%(maxSIMD*blockSize) != 0 {
 		compressBlocks(&or.buf, or.n)
 	}
 	// NOTE: or.off >= 2^63 will result in a negative return value.
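A toy model of the `Hasher` stack discipline changed above: `counter` doubles as a bit vector, with bit i set when `stack[i]` holds a subtree root covering 2^i buffers, so each push merges equal-height subtrees exactly like carries in binary addition. This is an illustrative sketch; `pushSubtree`'s actual body is not shown in this diff:

```go
package main

import "fmt"

// merge is a stand-in for parentNode/chainingValue.
func merge(l, r string) string { return "(" + l + "+" + r + ")" }

type stack struct {
    roots   [50]string
    counter uint64
}

func (s *stack) push(cv string) {
    i := 0
    for s.counter&(1<<i) != 0 { // same-height subtree present: merge up
        cv = merge(s.roots[i], cv)
        i++
    }
    s.roots[i] = cv
    s.counter++
}

func main() {
    var s stack
    for _, c := range []string{"a", "b", "c", "d", "e"} {
        s.push(c)
    }
    // counter == 5 == 0b101: heights 0 and 2 are occupied.
    fmt.Println(s.roots[0], s.roots[2]) // e ((a+b)+(c+d))
}
```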
blake3_amd64.s (3151 lines changed)

(File diff suppressed because it is too large.)
@@ -33,7 +33,7 @@ var testVectors = func() (vecs struct {
 }()
 
 var testInput = func() []byte {
-	input := make([]byte, 1<<15)
+	input := make([]byte, 1e6)
 	for i := range input {
 		input[i] = byte(i % 251)
 	}
@@ -1,27 +1,32 @@
 package blake3
 
-import (
-	"unsafe"
-
-	"golang.org/x/sys/cpu"
-)
+import "unsafe"
 
 //go:generate go run avo/gen.go -out blake3_amd64.s
 
 //go:noescape
-func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
+func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)
+
+//go:noescape
+func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)
+
+//go:noescape
+func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
 
 //go:noescape
 func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
 
+//go:noescape
+func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
+
 func compressNode(n node) (out [16]uint32) {
 	compressNodeGeneric(&out, n)
 	return
 }
 
-func compressBufferLarge(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
-	var cvs [8][8]uint32
-	compressChunksAVX2(&cvs, buf, key, counter, flags)
+func compressBufferAVX512(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+	var cvs [maxSIMD][8]uint32
+	compressChunksAVX512(&cvs, buf, key, counter, flags)
 	numChunks := uint64(buflen / chunkSize)
 	if buflen%chunkSize != 0 {
 		// use non-asm for remainder
@@ -29,13 +34,33 @@ func compressBufferLarge(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
 		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
 		numChunks++
 	}
-	return mergeSubtrees(cvs[:numChunks], key, flags)
+	return mergeSubtrees(&cvs, numChunks, key, flags)
 }
 
-func compressBuffer(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+func compressBufferAVX2(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+	var cvs [maxSIMD][8]uint32
+	cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs))
+	bufHalves := (*[2][8 * chunkSize]byte)(unsafe.Pointer(buf))
+	compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags)
+	numChunks := uint64(buflen / chunkSize)
+	if numChunks > 8 {
+		compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags)
+	}
+	if buflen%chunkSize != 0 {
+		// use non-asm for remainder
+		partialChunk := buf[buflen-buflen%chunkSize : buflen]
+		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
+		numChunks++
+	}
+	return mergeSubtrees(&cvs, numChunks, key, flags)
+}
+
+func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
 	switch {
-	case cpu.X86.HasAVX2 && buflen >= chunkSize*2:
-		return compressBufferLarge(buf, buflen, key, counter, flags)
+	case haveAVX512 && buflen >= chunkSize*2:
+		return compressBufferAVX512(buf, buflen, key, counter, flags)
+	case haveAVX2 && buflen >= chunkSize*2:
+		return compressBufferAVX2(buf, buflen, key, counter, flags)
 	default:
 		return compressBufferGeneric(buf, buflen, key, counter, flags)
 	}
@@ -74,14 +99,36 @@ func hashBlock(out *[64]byte, buf []byte) {
 	})
 }
 
-func compressBlocks(out *[512]byte, n node) {
+func compressBlocks(out *[maxSIMD * blockSize]byte, n node) {
 	switch {
-	case cpu.X86.HasAVX2:
-		compressBlocksAVX2(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+	case haveAVX512:
+		compressBlocksAVX512(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+	case haveAVX2:
+		outs := (*[2][512]byte)(unsafe.Pointer(out))
+		compressBlocksAVX2(&outs[0], &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+		compressBlocksAVX2(&outs[1], &n.block, &n.cv, n.counter+8, n.blockLen, n.flags)
 	default:
-		compressBlocksGeneric((*[8][64]byte)(unsafe.Pointer(out)), n)
+		outs := (*[maxSIMD][64]byte)(unsafe.Pointer(out))
+		compressBlocksGeneric(outs, n)
 	}
 }
 
+func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
+	if !haveAVX2 {
+		return mergeSubtreesGeneric(cvs, numCVs, key, flags)
+	}
+	for numCVs > 2 {
+		if numCVs%2 == 0 {
+			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
+		} else {
+			keep := cvs[numCVs-1]
+			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
+			cvs[numCVs/2] = keep
+			numCVs++
+		}
+		numCVs /= 2
+	}
+	return parentNode(cvs[0], cvs[1], *key, flags)
+}
+
 func wordsToBytes(words [16]uint32, block *[64]byte) {
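The `cvHalves`/`bufHalves` trick in `compressBufferAVX2` above reinterprets one 16-wide array as two 8-wide halves, so each half can be fed to the existing 8-lane AVX2 routine without copying. A self-contained illustration of the same pointer cast:

```go
package main

import (
    "fmt"
    "unsafe"
)

func main() {
    var buf [16]int32
    for i := range buf {
        buf[i] = int32(i)
    }
    // Reinterpret *[16]T as *[2][8]T; both views alias the same memory.
    halves := (*[2][8]int32)(unsafe.Pointer(&buf))
    fmt.Println(halves[0]) // [0 1 2 3 4 5 6 7]
    fmt.Println(halves[1]) // [8 9 10 11 12 13 14 15]
}
```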
@@ -5,19 +5,19 @@ import (
 	"math/bits"
 )
 
-func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
-	a += b + mx
-	d = bits.RotateLeft32(d^a, -16)
-	c += d
-	b = bits.RotateLeft32(b^c, -12)
-	a += b + my
-	d = bits.RotateLeft32(d^a, -8)
-	c += d
-	b = bits.RotateLeft32(b^c, -7)
-	return a, b, c, d
-}
-
 func compressNodeGeneric(out *[16]uint32, n node) {
+	g := func(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
+		a += b + mx
+		d = bits.RotateLeft32(d^a, -16)
+		c += d
+		b = bits.RotateLeft32(b^c, -12)
+		a += b + my
+		d = bits.RotateLeft32(d^a, -8)
+		c += d
+		b = bits.RotateLeft32(b^c, -7)
+		return a, b, c, d
+	}
+
 	// NOTE: we unroll all of the rounds, as well as the permutations that occur
 	// between rounds.
 
@@ -102,56 +102,42 @@ func compressNodeGeneric(out *[16]uint32, n node) {
 	}
 }
 
-func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) {
-	if buflen <= chunkSize {
-		return compressChunk(buf[:buflen], key, counter, flags)
-	}
-	cvs := make([][8]uint32, 0, 8)
-	for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; {
-		n := compressChunk(bb.Next(chunkSize), key, counter, flags)
-		cvs = append(cvs, chainingValue(n))
-		counter++
-	}
-	return mergeSubtrees(cvs, key, flags)
-}
-
-func compressBlocksGeneric(outs *[8][64]byte, n node) {
-	for i := range outs {
-		wordsToBytes(compressNode(n), &outs[i])
-		n.counter++
-	}
-}
-
 func chainingValue(n node) (cv [8]uint32) {
 	full := compressNode(n)
 	copy(cv[:], full[:])
 	return
 }
 
-func mergeSubtrees(cvs [][8]uint32, key *[8]uint32, flags uint32) node {
-	parent := func(l, r [8]uint32) [8]uint32 {
-		return chainingValue(parentNode(l, r, *key, flags))
+func compressBufferGeneric(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) {
+	if buflen <= chunkSize {
+		return compressChunk(buf[:buflen], key, counter, flags)
 	}
-	switch len(cvs) {
-	case 8:
-		cvs[6] = parent(cvs[6], cvs[7])
-		fallthrough
-	case 7:
-		cvs[4], cvs[5] = parent(cvs[4], cvs[5]), cvs[6]
-		fallthrough
-	case 6:
-		cvs[4] = parent(cvs[4], cvs[5])
-		fallthrough
-	case 5:
-		fallthrough
-	case 4:
-		cvs[2] = parent(cvs[2], cvs[3])
-		fallthrough
-	case 3:
-		cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[2]
+	var cvs [maxSIMD][8]uint32
+	var numCVs uint64
+	for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; numCVs++ {
+		cvs[numCVs] = chainingValue(compressChunk(bb.Next(chunkSize), key, counter+numCVs, flags))
 	}
-	if len(cvs) > 4 {
-		cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[4]
+	return mergeSubtrees(&cvs, numCVs, key, flags)
+}
+
+func compressBlocksGeneric(outs *[maxSIMD][64]byte, n node) {
+	for i := range outs {
+		wordsToBytes(compressNode(n), &outs[i])
+		n.counter++
 	}
+}
+
+func mergeSubtreesGeneric(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
+	for numCVs > 2 {
+		rem := numCVs / 2
+		for i := range cvs[:rem] {
+			cvs[i] = chainingValue(parentNode(cvs[i*2], cvs[i*2+1], *key, flags))
+		}
+		if numCVs%2 != 0 {
+			cvs[rem] = cvs[rem*2]
+			rem++
+		}
+		numCVs = rem
+	}
 	return parentNode(cvs[0], cvs[1], *key, flags)
 }
@@ -9,8 +9,8 @@ func compressNode(n node) (out [16]uint32) {
 	return
 }
 
-func compressBuffer(buf *[8192]byte, length int, key *[8]uint32, counter uint64, flags uint32) node {
-	return compressBufferGeneric(buf, length, key, counter, flags)
+func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+	return compressBufferGeneric(buf, buflen, key, counter, flags)
 }
 
 func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
@@ -51,14 +51,18 @@ func hashBlock(out *[64]byte, buf []byte) {
 	wordsToBytes(words, out)
 }
 
-func compressBlocks(out *[512]byte, n node) {
-	var outs [8][64]byte
+func compressBlocks(out *[maxSIMD * blockSize]byte, n node) {
+	var outs [maxSIMD][64]byte
 	compressBlocksGeneric(&outs, n)
 	for i := range outs {
 		copy(out[i*64:], outs[i][:])
 	}
 }
 
+func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
+	return mergeSubtreesGeneric(cvs, numCVs, key, flags)
+}
+
 func bytesToWords(bytes [64]byte, words *[16]uint32) {
 	for i := range words {
 		words[i] = binary.LittleEndian.Uint32(bytes[4*i:])
@@ -0,0 +1,10 @@
+// +build !darwin
+
+package blake3
+
+import "github.com/klauspost/cpuid"
+
+var (
+	haveAVX2   = cpuid.CPU.AVX2()
+	haveAVX512 = cpuid.CPU.AVX512F()
+)
@@ -0,0 +1,22 @@
+package blake3
+
+import (
+	"syscall"
+
+	"github.com/klauspost/cpuid"
+)
+
+var (
+	haveAVX2   bool
+	haveAVX512 bool
+)
+
+func init() {
+	haveAVX2 = cpuid.CPU.AVX2()
+	haveAVX512 = cpuid.CPU.AVX512F()
+	if !haveAVX512 {
+		// On some Macs, AVX512 detection is buggy, so fallback to sysctl
+		b, _ := syscall.Sysctl("hw.optional.avx512f")
+		haveAVX512 = b[0] == 1
+	}
+}
go.mod (4 lines changed)
@@ -2,4 +2,6 @@ module lukechampine.com/blake3
 
 go 1.13
 
-require golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5
+require (
+	github.com/klauspost/cpuid v1.3.1
+)
go.sum (4 lines changed)
@@ -1,2 +1,2 @@
-golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0=
-golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s=
+github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4=
@@ -126,6 +126,12 @@
 		"hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f",
 		"keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec",
 		"derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b"
 	},
+	{
+		"input_len": 100000,
+		"hash": "d93c23eedaf165a7e0be908ba86f1a7a520d568d2d13cde787c8580c5c72cc54902b765d0e69ff7f278ef2f8bb839b673f0db20afa0566c78965ad819674822fd11a507251555fc6daec7437074bc7b7307dfe122411b3676a932b5b0360d5ad495f8e7431d3d025fac5b4e955ce893a3504f2569f838eea47cf1bb21c4ae659db522f",
+		"keyed_hash": "74c836d008247adebbc032d1bced2e71d19050b5c39fa03c43d4160ad8d170732f3b73e374a4500825c13d2c8c9384ce12c033adc49245ce42f50d5b48237397b8447bd414b0693bef98518db8a3494e6e8e3abc931f92f472d938f07eac97d1cc69b375426bce26c5e829b5b41cacbb5543544977749d503fa78309e7a158640e579c",
+		"derive_key": "039c0c0d76eacefea9c8d042698bd012d3cef4091ed5c5a7e32a30e4d51718930a99481bb11214d9e9e79e58d11875a789447731a887aa77499843148d35b1752c6314af6d36559341bd6895c5ee0a452c99cb47a9b22dfe36042932fc9a423d245b91b6246c85e4b0d415cbece3e0545d6e242853da7f3dd1f9b0f146ec72706b8c28"
+	}
 ]
 }