add AVX2 implementation
This commit is contained in:
parent
6c1c802729
commit
c2af4bc4c2
31
README.md
31
README.md
|
@ -9,10 +9,29 @@ go get lukechampine.com/blake3
|
||||||
```
|
```
|
||||||
|
|
||||||
`blake3` implements the [BLAKE3 cryptographic hash function](https://github.com/BLAKE3-team/BLAKE3).
|
`blake3` implements the [BLAKE3 cryptographic hash function](https://github.com/BLAKE3-team/BLAKE3).
|
||||||
|
This implementation aims to be performant without sacrificing (too much)
|
||||||
|
readability, in the hopes of eventually landing in `x/crypto`.
|
||||||
|
|
||||||
This implementation is a port of the Rust reference implementation, refactored
|
The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s.
|
||||||
into more idiomatic Go style and with a handful of performance tweaks.
|
There is a separate code path for small inputs (up to 64 bytes) that runs in
|
||||||
Performance is not great, not terrible. Eventually an assembly-optimized
|
~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by
|
||||||
implementation will be merged into `x/crypto`, and then you should switch to
|
an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 chunks in parallel,
|
||||||
that. In the meantime, you can use this package for code that needs BLAKE3
|
achieving throughput of ~2600 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it
|
||||||
compatibility and doesn't need to be blazing fast.
|
will be possible to compress 16 chunks in parallel, which should roughly double
|
||||||
|
throughput for sufficiently large inputs.
|
||||||
|
|
||||||
|
Contributions are greatly appreciated.
|
||||||
|
[All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134)
|
||||||
|
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
Tested on an i5-7600K @ 3.80GHz.
|
||||||
|
|
||||||
|
```
|
||||||
|
BenchmarkSum256/64 105 ns/op 609.51 MB/s
|
||||||
|
BenchmarkSum256/1024 1778 ns/op 576.00 MB/s
|
||||||
|
BenchmarkSum256/65536 24785 ns/op 2644.15 MB/s
|
||||||
|
BenchmarkWrite 389 ns/op 2631.78 MB/s
|
||||||
|
BenchmarkXOF 1591 ns/op 643.80 MB/s
|
||||||
|
```
|
||||||
|
|
|
@ -0,0 +1,239 @@
|
||||||
|
// +build ignore
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
. "github.com/mmcloughlin/avo/build"
|
||||||
|
. "github.com/mmcloughlin/avo/operand"
|
||||||
|
. "github.com/mmcloughlin/avo/reg"
|
||||||
|
)
|
||||||
|
|
||||||
|
// main drives the generator: it first declares the read-only data tables,
// then emits the AVX2 chunk-compression routine that refers to them, and
// finally hands control to Generate from avo's build package.
func main() {
|
||||||
|
// Must run before genCompressChunksAVX2, which reads the fields of globals
// that this call fills in.
genGlobals()
|
||||||
|
genCompressChunksAVX2()
|
||||||
|
|
||||||
|
// NOTE(review): presumably this is the call that writes out the assembly
// (and stubs) for everything declared above — confirm against avo's docs.
Generate()
|
||||||
|
}
|
||||||
|
|
||||||
|
var globals struct {
|
||||||
|
iv Mem
|
||||||
|
blockLen Mem
|
||||||
|
stride1024 Mem
|
||||||
|
incrementCounter Mem
|
||||||
|
setFlags Mem
|
||||||
|
shuffleRot8 Mem
|
||||||
|
shuffleRot16 Mem
|
||||||
|
}
|
||||||
|
|
||||||
|
func genGlobals() {
|
||||||
|
globals.iv = GLOBL("iv", RODATA|NOPTR)
|
||||||
|
DATA(0*4, U32(0x6A09E667))
|
||||||
|
DATA(1*4, U32(0xBB67AE85))
|
||||||
|
DATA(2*4, U32(0x3C6EF372))
|
||||||
|
DATA(3*4, U32(0xA54FF53A))
|
||||||
|
|
||||||
|
globals.blockLen = GLOBL("block_len", RODATA|NOPTR)
|
||||||
|
for i := 0; i < 8; i++ {
|
||||||
|
DATA(i*4, U32(64))
|
||||||
|
}
|
||||||
|
|
||||||
|
globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR)
|
||||||
|
for i := 0; i < 8; i++ {
|
||||||
|
DATA(i*4, U32(i*1024))
|
||||||
|
}
|
||||||
|
globals.incrementCounter = GLOBL("increment_counter", RODATA|NOPTR)
|
||||||
|
for i := 0; i < 8; i++ {
|
||||||
|
DATA(i*8, U64(i))
|
||||||
|
}
|
||||||
|
globals.setFlags = GLOBL("set_flags", RODATA|NOPTR)
|
||||||
|
for i := 0; i < 16; i++ {
|
||||||
|
if i == 0 {
|
||||||
|
DATA(i*4, U32(1))
|
||||||
|
} else if i == 15 {
|
||||||
|
DATA(i*4, U32(2))
|
||||||
|
} else {
|
||||||
|
DATA(i*4, U32(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR)
|
||||||
|
for i := 0; i < 8; i++ {
|
||||||
|
DATA(i*4, U32(0x00030201+0x04040404*i))
|
||||||
|
}
|
||||||
|
globals.shuffleRot16 = GLOBL("shuffle_rot16", RODATA|NOPTR)
|
||||||
|
for i := 0; i < 8; i++ {
|
||||||
|
DATA(i*4, U32(0x01000302+0x04040404*i))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func genCompressChunksAVX2() {
|
||||||
|
TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)")
|
||||||
|
cvs := Mem{Base: Load(Param("cvs"), GP64())}
|
||||||
|
buf := Mem{Base: Load(Param("buf"), GP64())}
|
||||||
|
key := Mem{Base: Load(Param("key"), GP64())}
|
||||||
|
counter, _ := Param("counter").Resolve()
|
||||||
|
flags, _ := Param("flags").Resolve()
|
||||||
|
|
||||||
|
vs := [16]VecVirtual{
|
||||||
|
YMM(), YMM(), YMM(), YMM(),
|
||||||
|
YMM(), YMM(), YMM(), YMM(),
|
||||||
|
YMM(), YMM(), YMM(), YMM(),
|
||||||
|
YMM(), YMM(), YMM(), YMM(),
|
||||||
|
}
|
||||||
|
// stack space for transposed message vectors
|
||||||
|
var mv [16]Mem
|
||||||
|
for i := range mv {
|
||||||
|
mv[i] = AllocLocal(32)
|
||||||
|
}
|
||||||
|
// stack space for spilled vs[8] register
|
||||||
|
spillMem := AllocLocal(32)
|
||||||
|
|
||||||
|
Comment("Load key")
|
||||||
|
for i := 0; i < 8; i++ {
|
||||||
|
VPBROADCASTD(key.Offset(i*4), vs[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
Comment("Initialize counter")
|
||||||
|
counterLo := AllocLocal(32)
|
||||||
|
counterHi := AllocLocal(32)
|
||||||
|
VPBROADCASTQ(counter.Addr, vs[12])
|
||||||
|
VPBROADCASTQ(counter.Addr, vs[13])
|
||||||
|
VPADDQ(globals.incrementCounter.Offset(0*32), vs[12], vs[12])
|
||||||
|
VPADDQ(globals.incrementCounter.Offset(1*32), vs[13], vs[13])
|
||||||
|
VPUNPCKLDQ(vs[13], vs[12], vs[14])
|
||||||
|
VPUNPCKHDQ(vs[13], vs[12], vs[15])
|
||||||
|
VPUNPCKLDQ(vs[15], vs[14], vs[12])
|
||||||
|
VPUNPCKHDQ(vs[15], vs[14], vs[13])
|
||||||
|
VPERMQ(Imm(0xd8), vs[12], vs[12])
|
||||||
|
VPERMQ(Imm(0xd8), vs[13], vs[13])
|
||||||
|
VMOVDQU(vs[12], counterLo)
|
||||||
|
VMOVDQU(vs[13], counterHi)
|
||||||
|
|
||||||
|
Comment("Initialize flags")
|
||||||
|
chunkFlags := AllocLocal(16 * 4)
|
||||||
|
VPBROADCASTD(flags.Addr, vs[14])
|
||||||
|
VPOR(globals.setFlags.Offset(0*32), vs[14], vs[15])
|
||||||
|
VMOVDQU(vs[15], chunkFlags.Offset(0*32))
|
||||||
|
VPOR(globals.setFlags.Offset(1*32), vs[14], vs[15])
|
||||||
|
VMOVDQU(vs[15], chunkFlags.Offset(1*32))
|
||||||
|
|
||||||
|
Comment("Loop index")
|
||||||
|
loop := GP64()
|
||||||
|
XORQ(loop, loop)
|
||||||
|
Label("loop")
|
||||||
|
|
||||||
|
Comment("Load transposed block")
|
||||||
|
VMOVDQU(globals.stride1024, vs[9])
|
||||||
|
for i := 0; i < 16; i++ {
|
||||||
|
VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1
|
||||||
|
VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10])
|
||||||
|
VMOVDQU(vs[10], mv[i])
|
||||||
|
}
|
||||||
|
ADDQ(Imm(64), buf.Base)
|
||||||
|
|
||||||
|
Comment("Reload state vectors (other than CVs)")
|
||||||
|
for i := 0; i < 4; i++ {
|
||||||
|
VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i])
|
||||||
|
}
|
||||||
|
VMOVDQU(counterLo, vs[12])
|
||||||
|
VMOVDQU(counterHi, vs[13])
|
||||||
|
VMOVDQU(globals.blockLen, vs[14])
|
||||||
|
VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])
|
||||||
|
|
||||||
|
VMOVDQU(vs[8], spillMem) // spill
|
||||||
|
for i := 0; i < 7; i++ {
|
||||||
|
Comment(fmt.Sprintf("Round %v", i+1))
|
||||||
|
round(vs, mv, vs[8], spillMem)
|
||||||
|
// permute
|
||||||
|
mv = [16]Mem{
|
||||||
|
mv[2], mv[6], mv[3], mv[10],
|
||||||
|
mv[7], mv[0], mv[4], mv[13],
|
||||||
|
mv[1], mv[11], mv[12], mv[5],
|
||||||
|
mv[9], mv[14], mv[15], mv[8],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Comment("Finalize CVs")
|
||||||
|
VMOVDQU(spillMem, vs[8]) // reload
|
||||||
|
for i := range vs[:8] {
|
||||||
|
VPXOR(vs[i], vs[i+8], vs[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
Comment("Loop")
|
||||||
|
INCQ(loop)
|
||||||
|
CMPQ(loop, U32(16))
|
||||||
|
JNE(LabelRef("loop"))
|
||||||
|
|
||||||
|
Comment("Finished; transpose CVs")
|
||||||
|
src, dst := vs[:8], vs[8:]
|
||||||
|
// interleave uint32s
|
||||||
|
for i := 0; i < 8; i += 2 {
|
||||||
|
VPUNPCKLDQ(src[i+1], src[i], dst[i+0])
|
||||||
|
VPUNPCKHDQ(src[i+1], src[i], dst[i+1])
|
||||||
|
}
|
||||||
|
// interleave groups of two uint32s
|
||||||
|
for i := 0; i < 4; i++ {
|
||||||
|
j := i*2 - i%2 // j := 0,1,4,5
|
||||||
|
VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0])
|
||||||
|
VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1])
|
||||||
|
}
|
||||||
|
// interleave groups of four uint32s
|
||||||
|
for i := 0; i < 4; i++ {
|
||||||
|
VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0])
|
||||||
|
VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
|
||||||
|
}
|
||||||
|
for i, v := range dst {
|
||||||
|
VMOVDQU(v, cvs.Offset(i*32))
|
||||||
|
}
|
||||||
|
|
||||||
|
RET()
|
||||||
|
}
|
||||||
|
|
||||||
|
func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) {
|
||||||
|
g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem)
|
||||||
|
g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem)
|
||||||
|
g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem)
|
||||||
|
g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7], tmp, spillMem)
|
||||||
|
g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9], tmp, spillMem)
|
||||||
|
g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem)
|
||||||
|
g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem)
|
||||||
|
g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem)
|
||||||
|
}
|
||||||
|
|
||||||
|
func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {
|
||||||
|
// Helper function for performing rotations. Also manages c, tmp and
|
||||||
|
// spillMem: if c == tmp, we need to spill and reload c using spillMem.
|
||||||
|
rotr := func(v VecVirtual, n uint64, dst VecVirtual) {
|
||||||
|
switch n {
|
||||||
|
case 8, 16:
|
||||||
|
shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n]
|
||||||
|
VPSHUFB(shuf, v, dst)
|
||||||
|
if c == tmp {
|
||||||
|
VMOVDQU(spillMem, c)
|
||||||
|
}
|
||||||
|
case 7, 12:
|
||||||
|
if c == tmp {
|
||||||
|
VMOVDQU(c, spillMem)
|
||||||
|
}
|
||||||
|
VPSRLD(Imm(n), v, tmp)
|
||||||
|
VPSLLD(Imm(32-n), v, dst)
|
||||||
|
VPOR(dst, tmp, dst)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
VPADDD(a, b, a)
|
||||||
|
VPADDD(mx, a, a)
|
||||||
|
VPXOR(d, a, d)
|
||||||
|
rotr(d, 16, d)
|
||||||
|
VPADDD(c, d, c)
|
||||||
|
VPXOR(b, c, b)
|
||||||
|
rotr(b, 12, b)
|
||||||
|
VPADDD(a, b, a)
|
||||||
|
VPADDD(my, a, a)
|
||||||
|
VPXOR(d, a, d)
|
||||||
|
rotr(d, 8, d)
|
||||||
|
VPADDD(c, d, c)
|
||||||
|
VPXOR(b, c, b)
|
||||||
|
rotr(b, 7, b)
|
||||||
|
}
|
360
blake3.go
360
blake3.go
|
@ -10,12 +10,6 @@ import (
|
||||||
"math/bits"
|
"math/bits"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
|
||||||
blockSize = 64
|
|
||||||
chunkSize = 1024
|
|
||||||
)
|
|
||||||
|
|
||||||
// flags
|
|
||||||
const (
|
const (
|
||||||
flagChunkStart = 1 << iota
|
flagChunkStart = 1 << iota
|
||||||
flagChunkEnd
|
flagChunkEnd
|
||||||
|
@ -24,6 +18,9 @@ const (
|
||||||
flagKeyedHash
|
flagKeyedHash
|
||||||
flagDeriveKeyContext
|
flagDeriveKeyContext
|
||||||
flagDeriveKeyMaterial
|
flagDeriveKeyMaterial
|
||||||
|
|
||||||
|
blockSize = 64
|
||||||
|
chunkSize = 1024
|
||||||
)
|
)
|
||||||
|
|
||||||
var iv = [8]uint32{
|
var iv = [8]uint32{
|
||||||
|
@ -31,332 +28,82 @@ var iv = [8]uint32{
|
||||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
|
||||||
}
|
}
|
||||||
|
|
||||||
// helper functions for converting between bytes and BLAKE3 "words"
|
// A node represents a chunk or parent in the BLAKE3 Merkle tree.
|
||||||
|
|
||||||
// bytesToWords decodes a 64-byte block into sixteen little-endian 32-bit
// words, storing word i (taken from bytes[4*i : 4*i+4]) in words[i].
func bytesToWords(bytes [64]byte, words *[16]uint32) {
	for i := range words {
		words[i] = binary.LittleEndian.Uint32(bytes[4*i:])
	}
}
|
|
||||||
|
|
||||||
// wordsToBytes encodes sixteen 32-bit words into a 64-byte block, writing
// words[i] in little-endian order at block[4*i : 4*i+4].
func wordsToBytes(words [16]uint32, block *[64]byte) {
	for i, w := range words {
		binary.LittleEndian.PutUint32(block[4*i:], w)
	}
}
|
|
||||||
|
|
||||||
// g is the quarter-round mixing function. It mixes the four state words a, b,
// c and d with the two message words mx and my and returns the updated
// (a, b, c, d). All additions wrap modulo 2^32.
func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
	// First half-step: absorb mx, rotate right by 16 and 12.
	a = a + b + mx
	d ^= a
	d = bits.RotateLeft32(d, -16)
	c = c + d
	b ^= c
	b = bits.RotateLeft32(b, -12)

	// Second half-step: absorb my, rotate right by 8 and 7.
	a = a + b + my
	d ^= a
	d = bits.RotateLeft32(d, -8)
	c = c + d
	b ^= c
	b = bits.RotateLeft32(b, -7)

	return a, b, c, d
}
|
|
||||||
|
|
||||||
// A node represents a chunk or parent in the BLAKE3 Merkle tree. In BLAKE3
|
|
||||||
// terminology, the elements of the bottom layer (aka "leaves") of the tree are
|
|
||||||
// called chunk nodes, and the elements of upper layers (aka "interior nodes")
|
|
||||||
// are called parent nodes.
|
|
||||||
//
|
|
||||||
// Computing a BLAKE3 hash involves splitting the input into chunk nodes, then
|
|
||||||
// repeatedly merging these nodes into parent nodes, until only a single "root"
|
|
||||||
// node remains. The root node can then be used to generate up to 2^64 - 1 bytes
|
|
||||||
// of pseudorandom output.
|
|
||||||
type node struct {
|
type node struct {
|
||||||
// the chaining value from the previous state
|
cv [8]uint32 // chaining value from previous node
|
||||||
cv [8]uint32
|
|
||||||
// the current state
|
|
||||||
block [16]uint32
|
block [16]uint32
|
||||||
counter uint64
|
counter uint64
|
||||||
blockLen uint32
|
blockLen uint32
|
||||||
flags uint32
|
flags uint32
|
||||||
}
|
}
|
||||||
|
|
||||||
// compress is the core hash function, generating 16 pseudorandom words from a
|
|
||||||
// node.
|
|
||||||
func (n node) compress() [16]uint32 {
|
|
||||||
// NOTE: we unroll all of the rounds, as well as the permutations that occur
|
|
||||||
// between rounds.
|
|
||||||
|
|
||||||
// round 1 (also initializes state)
|
|
||||||
// columns
|
|
||||||
s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1])
|
|
||||||
s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3])
|
|
||||||
s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5])
|
|
||||||
s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7])
|
|
||||||
// diagonals
|
|
||||||
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9])
|
|
||||||
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11])
|
|
||||||
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13])
|
|
||||||
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15])
|
|
||||||
|
|
||||||
// round 2
|
|
||||||
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6])
|
|
||||||
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10])
|
|
||||||
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0])
|
|
||||||
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13])
|
|
||||||
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11])
|
|
||||||
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5])
|
|
||||||
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14])
|
|
||||||
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8])
|
|
||||||
|
|
||||||
// round 3
|
|
||||||
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4])
|
|
||||||
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12])
|
|
||||||
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2])
|
|
||||||
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14])
|
|
||||||
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5])
|
|
||||||
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0])
|
|
||||||
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15])
|
|
||||||
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1])
|
|
||||||
|
|
||||||
// round 4
|
|
||||||
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7])
|
|
||||||
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9])
|
|
||||||
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3])
|
|
||||||
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15])
|
|
||||||
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0])
|
|
||||||
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2])
|
|
||||||
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8])
|
|
||||||
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6])
|
|
||||||
|
|
||||||
// round 5
|
|
||||||
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13])
|
|
||||||
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11])
|
|
||||||
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10])
|
|
||||||
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8])
|
|
||||||
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2])
|
|
||||||
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3])
|
|
||||||
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1])
|
|
||||||
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4])
|
|
||||||
|
|
||||||
// round 6
|
|
||||||
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14])
|
|
||||||
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5])
|
|
||||||
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12])
|
|
||||||
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1])
|
|
||||||
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3])
|
|
||||||
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10])
|
|
||||||
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6])
|
|
||||||
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7])
|
|
||||||
|
|
||||||
// round 7
|
|
||||||
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15])
|
|
||||||
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0])
|
|
||||||
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9])
|
|
||||||
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6])
|
|
||||||
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10])
|
|
||||||
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12])
|
|
||||||
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4])
|
|
||||||
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13])
|
|
||||||
|
|
||||||
// finalization
|
|
||||||
return [16]uint32{
|
|
||||||
s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11,
|
|
||||||
s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15,
|
|
||||||
s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3],
|
|
||||||
s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7],
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// chainingValue returns the first 8 words of the compressed node. This is used
|
|
||||||
// in two places. First, when a chunk node is being constructed, its cv is
|
|
||||||
// overwritten with this value after each block of input is processed. Second,
|
|
||||||
// when two nodes are merged into a parent, each of their chaining values
|
|
||||||
// supplies half of the new node's block.
|
|
||||||
func (n node) chainingValue() (cv [8]uint32) {
|
|
||||||
full := n.compress()
|
|
||||||
copy(cv[:], full[:8])
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// chunkState manages the state involved in hashing a single chunk of input.
|
|
||||||
type chunkState struct {
|
|
||||||
n node
|
|
||||||
block [blockSize]byte
|
|
||||||
blockLen int
|
|
||||||
bytesConsumed int
|
|
||||||
}
|
|
||||||
|
|
||||||
// chunkCounter is the index of this chunk, i.e. the number of chunks that have
|
|
||||||
// been processed prior to this one.
|
|
||||||
func (cs *chunkState) chunkCounter() uint64 {
|
|
||||||
return cs.n.counter
|
|
||||||
}
|
|
||||||
|
|
||||||
func (cs *chunkState) complete() bool {
|
|
||||||
return cs.bytesConsumed == chunkSize
|
|
||||||
}
|
|
||||||
|
|
||||||
// update incorporates input into the chunkState.
|
|
||||||
func (cs *chunkState) update(input []byte) {
|
|
||||||
for len(input) > 0 {
|
|
||||||
// If the block buffer is full, compress it and clear it. More
|
|
||||||
// input is coming, so this compression is not flagChunkEnd.
|
|
||||||
if cs.blockLen == blockSize {
|
|
||||||
// copy the chunk block (bytes) into the node block and chain it.
|
|
||||||
bytesToWords(cs.block, &cs.n.block)
|
|
||||||
cs.n.cv = cs.n.chainingValue()
|
|
||||||
// clear the start flag for all but the first block
|
|
||||||
cs.n.flags &^= flagChunkStart
|
|
||||||
cs.blockLen = 0
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy input bytes into the chunk block.
|
|
||||||
n := copy(cs.block[cs.blockLen:], input)
|
|
||||||
cs.blockLen += n
|
|
||||||
cs.bytesConsumed += n
|
|
||||||
input = input[n:]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// clear sets every byte of b to zero. An empty or nil slice is a no-op.
//
// The loop is deliberately written as a plain range-and-assign-zero; per the
// original note, this form compiles to memclr, so do not restructure it.
|
|
||||||
func clear(b []byte) {
|
|
||||||
for i := range b {
|
|
||||||
b[i] = 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// node returns a node containing the chunkState's current state, with the
|
|
||||||
// ChunkEnd flag set.
|
|
||||||
func (cs *chunkState) node() node {
|
|
||||||
n := cs.n
|
|
||||||
// pad the remaining space in the block with zeros
|
|
||||||
clear(cs.block[cs.blockLen:])
|
|
||||||
bytesToWords(cs.block, &n.block)
|
|
||||||
n.blockLen = uint32(cs.blockLen)
|
|
||||||
n.flags |= flagChunkEnd
|
|
||||||
return n
|
|
||||||
}
|
|
||||||
|
|
||||||
func newChunkState(iv [8]uint32, chunkCounter uint64, flags uint32) chunkState {
|
|
||||||
return chunkState{
|
|
||||||
n: node{
|
|
||||||
cv: iv,
|
|
||||||
counter: chunkCounter,
|
|
||||||
blockLen: blockSize,
|
|
||||||
// compress the first block with the start flag set
|
|
||||||
flags: flags | flagChunkStart,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// parentNode returns a node that incorporates the chaining values of two child
|
// parentNode returns a node that incorporates the chaining values of two child
|
||||||
// nodes.
|
// nodes.
|
||||||
func parentNode(left, right [8]uint32, key [8]uint32, flags uint32) node {
|
func parentNode(left, right [8]uint32, key [8]uint32, flags uint32) node {
|
||||||
var blockWords [16]uint32
|
n := node{
|
||||||
copy(blockWords[:8], left[:])
|
|
||||||
copy(blockWords[8:], right[:])
|
|
||||||
return node{
|
|
||||||
cv: key,
|
cv: key,
|
||||||
block: blockWords,
|
|
||||||
counter: 0, // counter is reset for parents
|
counter: 0, // counter is reset for parents
|
||||||
blockLen: blockSize, // block is full: 8 words from left, 8 from right
|
blockLen: blockSize, // block is full
|
||||||
flags: flags | flagParent,
|
flags: flags | flagParent,
|
||||||
}
|
}
|
||||||
|
copy(n.block[:8], left[:])
|
||||||
|
copy(n.block[8:], right[:])
|
||||||
|
return n
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hasher implements hash.Hash.
|
// Hasher implements hash.Hash.
|
||||||
type Hasher struct {
|
type Hasher struct {
|
||||||
cs chunkState
|
|
||||||
key [8]uint32
|
key [8]uint32
|
||||||
flags uint32
|
flags uint32
|
||||||
size int // output size, for Sum
|
size int // output size, for Sum
|
||||||
|
|
||||||
// log(n) set of Merkle subtree roots, at most one per height.
|
// log(n) set of Merkle subtree roots, at most one per height.
|
||||||
stack [54][8]uint32 // 2^54 * chunkSize = 2^64
|
stack [51][8]uint32 // 2^51 * 8 * chunkSize = 2^64
|
||||||
used uint64 // bit vector indicating which stack elems are valid; also number of chunks added
|
counter uint64 // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied
|
||||||
|
|
||||||
|
buf [8 * chunkSize]byte
|
||||||
|
buflen int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *Hasher) hasSubtreeAtHeight(i int) bool {
|
func (h *Hasher) hasSubtreeAtHeight(i int) bool {
|
||||||
return h.used&(1<<i) != 0
|
return h.counter&(1<<i) != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// addChunkChainingValue appends a chunk to the right edge of the Merkle tree.
|
func (h *Hasher) pushSubtree(cv [8]uint32) {
|
||||||
func (h *Hasher) addChunkChainingValue(cv [8]uint32) {
|
|
||||||
// seek to first open stack slot, merging subtrees as we go
|
// seek to first open stack slot, merging subtrees as we go
|
||||||
i := 0
|
i := 0
|
||||||
for ; h.hasSubtreeAtHeight(i); i++ {
|
for h.hasSubtreeAtHeight(i) {
|
||||||
cv = parentNode(h.stack[i], cv, h.key, h.flags).chainingValue()
|
cv = chainingValue(parentNode(h.stack[i], cv, h.key, h.flags))
|
||||||
|
i++
|
||||||
}
|
}
|
||||||
h.stack[i] = cv
|
h.stack[i] = cv
|
||||||
h.used++
|
h.counter++
|
||||||
}
|
}
|
||||||
|
|
||||||
// rootNode computes the root of the Merkle tree. It does not modify the
|
// rootNode computes the root of the Merkle tree. It does not modify the
|
||||||
// chainStack.
|
// stack.
|
||||||
func (h *Hasher) rootNode() node {
|
func (h *Hasher) rootNode() node {
|
||||||
n := h.cs.node()
|
n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags)
|
||||||
for i := bits.TrailingZeros64(h.used); i < bits.Len64(h.used); i++ {
|
for i := bits.TrailingZeros64(h.counter); i < bits.Len64(h.counter); i++ {
|
||||||
if h.hasSubtreeAtHeight(i) {
|
if h.hasSubtreeAtHeight(i) {
|
||||||
n = parentNode(h.stack[i], n.chainingValue(), h.key, h.flags)
|
n = parentNode(h.stack[i], chainingValue(n), h.key, h.flags)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n.flags |= flagRoot
|
n.flags |= flagRoot
|
||||||
return n
|
return n
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset implements hash.Hash.
|
|
||||||
func (h *Hasher) Reset() {
|
|
||||||
h.cs = newChunkState(h.key, 0, h.flags)
|
|
||||||
h.used = 0
|
|
||||||
}
|
|
||||||
|
|
||||||
// BlockSize implements hash.Hash.
|
|
||||||
func (h *Hasher) BlockSize() int { return 64 }
|
|
||||||
|
|
||||||
// Size implements hash.Hash.
|
|
||||||
func (h *Hasher) Size() int { return h.size }
|
|
||||||
|
|
||||||
// Write implements hash.Hash.
|
// Write implements hash.Hash.
|
||||||
func (h *Hasher) Write(p []byte) (int, error) {
|
func (h *Hasher) Write(p []byte) (int, error) {
|
||||||
lenp := len(p)
|
lenp := len(p)
|
||||||
for len(p) > 0 {
|
for len(p) > 0 {
|
||||||
// If the current chunk is complete, finalize it and add it to the tree,
|
if h.buflen == len(h.buf) {
|
||||||
// then reset the chunk state (but keep incrementing the counter across
|
n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*8, h.flags)
|
||||||
// chunks).
|
h.pushSubtree(chainingValue(n))
|
||||||
if h.cs.complete() {
|
h.buflen = 0
|
||||||
cv := h.cs.node().chainingValue()
|
|
||||||
h.addChunkChainingValue(cv)
|
|
||||||
h.cs = newChunkState(h.key, h.cs.chunkCounter()+1, h.flags)
|
|
||||||
}
|
}
|
||||||
|
n := copy(h.buf[h.buflen:], p)
|
||||||
// Compress input bytes into the current chunk state.
|
h.buflen += n
|
||||||
n := chunkSize - h.cs.bytesConsumed
|
|
||||||
if n > len(p) {
|
|
||||||
n = len(p)
|
|
||||||
}
|
|
||||||
h.cs.update(p[:n])
|
|
||||||
p = p[n:]
|
p = p[n:]
|
||||||
}
|
}
|
||||||
return lenp, nil
|
return lenp, nil
|
||||||
|
@ -377,6 +124,18 @@ func (h *Hasher) Sum(b []byte) (sum []byte) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reset implements hash.Hash.
|
||||||
|
func (h *Hasher) Reset() {
|
||||||
|
h.counter = 0
|
||||||
|
h.buflen = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// BlockSize implements hash.Hash.
|
||||||
|
func (h *Hasher) BlockSize() int { return 64 }
|
||||||
|
|
||||||
|
// Size implements hash.Hash.
|
||||||
|
func (h *Hasher) Size() int { return h.size }
|
||||||
|
|
||||||
// XOF returns an OutputReader initialized with the current hash state.
|
// XOF returns an OutputReader initialized with the current hash state.
|
||||||
func (h *Hasher) XOF() *OutputReader {
|
func (h *Hasher) XOF() *OutputReader {
|
||||||
return &OutputReader{
|
return &OutputReader{
|
||||||
|
@ -386,7 +145,6 @@ func (h *Hasher) XOF() *OutputReader {
|
||||||
|
|
||||||
func newHasher(key [8]uint32, flags uint32, size int) *Hasher {
|
func newHasher(key [8]uint32, flags uint32, size int) *Hasher {
|
||||||
return &Hasher{
|
return &Hasher{
|
||||||
cs: newChunkState(key, 0, flags),
|
|
||||||
key: key,
|
key: key,
|
||||||
flags: flags,
|
flags: flags,
|
||||||
size: size,
|
size: size,
|
||||||
|
@ -394,7 +152,7 @@ func newHasher(key [8]uint32, flags uint32, size int) *Hasher {
|
||||||
}
|
}
|
||||||
|
|
||||||
// New returns a Hasher for the specified size and key. If key is nil, the hash
|
// New returns a Hasher for the specified size and key. If key is nil, the hash
|
||||||
// is unkeyed.
|
// is unkeyed. Otherwise, len(key) must be 32.
|
||||||
func New(size int, key []byte) *Hasher {
|
func New(size int, key []byte) *Hasher {
|
||||||
if key == nil {
|
if key == nil {
|
||||||
return newHasher(iv, 0, size)
|
return newHasher(iv, 0, size)
|
||||||
|
@ -408,21 +166,30 @@ func New(size int, key []byte) *Hasher {
|
||||||
|
|
||||||
// Sum256 and Sum512 always use the same hasher state, so we can save some time
|
// Sum256 and Sum512 always use the same hasher state, so we can save some time
|
||||||
// when hashing small inputs by constructing the hasher ahead of time.
|
// when hashing small inputs by constructing the hasher ahead of time.
|
||||||
var defaultHasher = newHasher(iv, 0, 0)
|
var defaultHasher = New(0, nil)
|
||||||
|
|
||||||
// Sum256 returns the unkeyed BLAKE3 hash of b, truncated to 256 bits.
|
// Sum256 returns the unkeyed BLAKE3 hash of b, truncated to 256 bits.
|
||||||
func Sum256(b []byte) (out [32]byte) {
|
func Sum256(b []byte) (out [32]byte) {
|
||||||
h := *defaultHasher
|
out512 := Sum512(b)
|
||||||
h.Write(b)
|
copy(out[:], out512[:])
|
||||||
h.XOF().Read(out[:])
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sum512 returns the unkeyed BLAKE3 hash of b, truncated to 512 bits.
|
// Sum512 returns the unkeyed BLAKE3 hash of b, truncated to 512 bits.
|
||||||
func Sum512(b []byte) (out [64]byte) {
|
func Sum512(b []byte) (out [64]byte) {
|
||||||
h := *defaultHasher
|
var n node
|
||||||
h.Write(b)
|
if len(b) <= blockSize {
|
||||||
h.XOF().Read(out[:])
|
hashBlock(&out, b)
|
||||||
|
return
|
||||||
|
} else if len(b) <= chunkSize {
|
||||||
|
n = compressChunk(b, &iv, 0, 0)
|
||||||
|
n.flags |= flagRoot
|
||||||
|
} else {
|
||||||
|
h := *defaultHasher
|
||||||
|
h.Write(b)
|
||||||
|
n = h.rootNode()
|
||||||
|
}
|
||||||
|
wordsToBytes(compressNode(n), &out)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -473,10 +240,8 @@ func (or *OutputReader) Read(p []byte) (int, error) {
|
||||||
for len(p) > 0 {
|
for len(p) > 0 {
|
||||||
if or.off%blockSize == 0 {
|
if or.off%blockSize == 0 {
|
||||||
or.n.counter = or.off / blockSize
|
or.n.counter = or.off / blockSize
|
||||||
words := or.n.compress()
|
wordsToBytes(compressNode(or.n), &or.block)
|
||||||
wordsToBytes(words, &or.block)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
n := copy(p, or.block[or.off%blockSize:])
|
n := copy(p, or.block[or.off%blockSize:])
|
||||||
p = p[n:]
|
p = p[n:]
|
||||||
or.off += uint64(n)
|
or.off += uint64(n)
|
||||||
|
@ -510,8 +275,7 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) {
|
||||||
or.off = off
|
or.off = off
|
||||||
or.n.counter = uint64(off) / blockSize
|
or.n.counter = uint64(off) / blockSize
|
||||||
if or.off%blockSize != 0 {
|
if or.off%blockSize != 0 {
|
||||||
words := or.n.compress()
|
wordsToBytes(compressNode(or.n), &or.block)
|
||||||
wordsToBytes(words, &or.block)
|
|
||||||
}
|
}
|
||||||
// NOTE: or.off >= 2^63 will result in a negative return value.
|
// NOTE: or.off >= 2^63 will result in a negative return value.
|
||||||
// Nothing we can do about this.
|
// Nothing we can do about this.
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -63,7 +63,7 @@ func TestVectors(t *testing.T) {
|
||||||
subKey := make([]byte, len(vec.DeriveKey)/2)
|
subKey := make([]byte, len(vec.DeriveKey)/2)
|
||||||
blake3.DeriveKey(subKey, ctx, in)
|
blake3.DeriveKey(subKey, ctx, in)
|
||||||
if out := toHex(subKey); out != vec.DeriveKey {
|
if out := toHex(subKey); out != vec.DeriveKey {
|
||||||
t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], subKey[:10])
|
t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], out[:10])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -150,7 +150,7 @@ func TestSum(t *testing.T) {
|
||||||
h.Write(in)
|
h.Write(in)
|
||||||
h.Sum(exp256[:0])
|
h.Sum(exp256[:0])
|
||||||
if got256 := blake3.Sum256(in); exp256 != got256 {
|
if got256 := blake3.Sum256(in); exp256 != got256 {
|
||||||
t.Errorf("Sum256 output did not match Sum output:\n\texpected: %v...\n\t got: %v...", exp256[:10], got256[:10])
|
t.Errorf("Sum256 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp256[:5], got256[:5])
|
||||||
}
|
}
|
||||||
|
|
||||||
var exp512 [64]byte
|
var exp512 [64]byte
|
||||||
|
@ -158,7 +158,7 @@ func TestSum(t *testing.T) {
|
||||||
h.Write(in)
|
h.Write(in)
|
||||||
h.Sum(exp512[:0])
|
h.Sum(exp512[:0])
|
||||||
if got512 := blake3.Sum512(in); exp512 != got512 {
|
if got512 := blake3.Sum512(in); exp512 != got512 {
|
||||||
t.Errorf("Sum512 output did not match Sum output:\n\texpected: %v...\n\t got: %v...", exp512[:10], got512[:10])
|
t.Errorf("Sum512 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp512[:5], got512[:5])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -190,13 +190,20 @@ func (nopReader) Read(p []byte) (int, error) { return len(p), nil }
|
||||||
|
|
||||||
func BenchmarkWrite(b *testing.B) {
|
func BenchmarkWrite(b *testing.B) {
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
b.SetBytes(1)
|
b.SetBytes(1024)
|
||||||
io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N))
|
io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N*1024))
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkXOF(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(1024)
|
||||||
|
io.CopyN(ioutil.Discard, blake3.New(0, nil).XOF(), int64(b.N*1024))
|
||||||
}
|
}
|
||||||
|
|
||||||
func BenchmarkSum256(b *testing.B) {
|
func BenchmarkSum256(b *testing.B) {
|
||||||
b.Run("64", func(b *testing.B) {
|
b.Run("64", func(b *testing.B) {
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(64)
|
||||||
buf := make([]byte, 64)
|
buf := make([]byte, 64)
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
blake3.Sum256(buf)
|
blake3.Sum256(buf)
|
||||||
|
@ -204,6 +211,7 @@ func BenchmarkSum256(b *testing.B) {
|
||||||
})
|
})
|
||||||
b.Run("1024", func(b *testing.B) {
|
b.Run("1024", func(b *testing.B) {
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(1024)
|
||||||
buf := make([]byte, 1024)
|
buf := make([]byte, 1024)
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
blake3.Sum256(buf)
|
blake3.Sum256(buf)
|
||||||
|
@ -211,15 +219,10 @@ func BenchmarkSum256(b *testing.B) {
|
||||||
})
|
})
|
||||||
b.Run("65536", func(b *testing.B) {
|
b.Run("65536", func(b *testing.B) {
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(65536)
|
||||||
buf := make([]byte, 65536)
|
buf := make([]byte, 65536)
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
blake3.Sum256(buf)
|
blake3.Sum256(buf)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func BenchmarkXOF(b *testing.B) {
|
|
||||||
b.ReportAllocs()
|
|
||||||
b.SetBytes(1)
|
|
||||||
io.CopyN(ioutil.Discard, blake3.New(0, nil).XOF(), int64(b.N))
|
|
||||||
}
|
|
||||||
|
|
|
@ -0,0 +1,76 @@
|
||||||
|
package blake3
|
||||||
|
|
||||||
|
import (
|
||||||
|
"unsafe"
|
||||||
|
|
||||||
|
"golang.org/x/sys/cpu"
|
||||||
|
)
|
||||||
|
|
||||||
|
//go:generate go run avo/gen.go -out blake3_amd64.s
|
||||||
|
|
||||||
|
// compressChunksAVX2 has no Go body; it is provided by the assembly file
// produced by the go:generate directive in this file (blake3_amd64.s).
//
// NOTE(review): judging from its only caller, it presumably writes one
// chaining value per 1024-byte chunk of buf into cvs, numbering the chunks
// counter, counter+1, ... — confirm against avo/gen.go.
//
//go:noescape
func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
|
||||||
|
|
||||||
|
func compressNode(n node) (out [16]uint32) {
|
||||||
|
compressNodeGeneric(&out, n)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func compressBufferLarge(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
|
||||||
|
var cvs [8][8]uint32
|
||||||
|
compressChunksAVX2(&cvs, buf, key, counter, flags)
|
||||||
|
numChunks := uint64(buflen / chunkSize)
|
||||||
|
if buflen%chunkSize != 0 {
|
||||||
|
// use non-asm for remainder
|
||||||
|
partialChunk := buf[buflen-buflen%chunkSize : buflen]
|
||||||
|
cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
|
||||||
|
numChunks++
|
||||||
|
}
|
||||||
|
return mergeSubtrees(cvs[:numChunks], key, flags)
|
||||||
|
}
|
||||||
|
|
||||||
|
func compressBuffer(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
|
||||||
|
switch {
|
||||||
|
case cpu.X86.HasAVX2 && buflen >= chunkSize*2:
|
||||||
|
return compressBufferLarge(buf, buflen, key, counter, flags)
|
||||||
|
default:
|
||||||
|
return compressBufferGeneric(buf, buflen, key, counter, flags)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
|
||||||
|
n := node{
|
||||||
|
cv: *key,
|
||||||
|
counter: counter,
|
||||||
|
blockLen: blockSize,
|
||||||
|
flags: flags | flagChunkStart,
|
||||||
|
}
|
||||||
|
blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:]
|
||||||
|
for len(chunk) > blockSize {
|
||||||
|
copy(blockBytes, chunk)
|
||||||
|
chunk = chunk[blockSize:]
|
||||||
|
n.cv = chainingValue(n)
|
||||||
|
n.flags &^= flagChunkStart
|
||||||
|
}
|
||||||
|
// pad last block with zeros
|
||||||
|
n.block = [16]uint32{}
|
||||||
|
copy(blockBytes, chunk)
|
||||||
|
n.blockLen = uint32(len(chunk))
|
||||||
|
n.flags |= flagChunkEnd
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
// wordsToBytes stores the in-memory representation of words into block.
// No byte swapping is performed, so the layout is the host's native one.
func wordsToBytes(words [16]uint32, block *[64]byte) {
	copy(block[:], (*[64]byte)(unsafe.Pointer(&words))[:])
}
|
||||||
|
|
||||||
|
func hashBlock(out *[64]byte, buf []byte) {
|
||||||
|
var block [16]uint32
|
||||||
|
copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)
|
||||||
|
compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{
|
||||||
|
cv: iv,
|
||||||
|
block: block,
|
||||||
|
blockLen: uint32(len(buf)),
|
||||||
|
flags: flagChunkStart | flagChunkEnd | flagRoot,
|
||||||
|
})
|
||||||
|
}
|
|
@ -0,0 +1,150 @@
|
||||||
|
package blake3
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"math/bits"
|
||||||
|
)
|
||||||
|
|
||||||
|
// g is the mixing step applied to one column or diagonal of the state. It
// mixes message words mx and my into (a, b, c, d) using additions, xors and
// right-rotations by 16, 12, 8 and 7 bits, and returns the updated words.
func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
	// first half: absorb mx
	a = a + b + mx
	d ^= a
	d = bits.RotateLeft32(d, -16)
	c = c + d
	b ^= c
	b = bits.RotateLeft32(b, -12)

	// second half: absorb my
	a = a + b + my
	d ^= a
	d = bits.RotateLeft32(d, -8)
	c = c + d
	b ^= c
	b = bits.RotateLeft32(b, -7)

	return a, b, c, d
}
|
||||||
|
|
||||||
|
// compressNodeGeneric is the portable compression function. It mixes the
// node's chaining value, counter, block length, flags and 16 message words
// through seven rounds of g and writes the 16 resulting words to out.
//
// The working state s0..s15 is seeded in round 1 from n.cv (s0..s7), the
// first four iv words (s8..s11), the low and high halves of n.counter
// (s12, s13), n.blockLen (s14) and n.flags (s15). Each round applies g to the
// four columns and then to the four diagonals; the message-word indices are
// written out per round instead of permuting the block between rounds.
func compressNodeGeneric(out *[16]uint32, n node) {
	// NOTE: we unroll all of the rounds, as well as the permutations that occur
	// between rounds.

	// round 1 (also initializes state)
	// columns
	s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1])
	s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3])
	s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5])
	s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7])
	// diagonals
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15])

	// round 2
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8])

	// round 3
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1])

	// round 4
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6])

	// round 5
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4])

	// round 6
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7])

	// round 7
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13])

	// finalization: the first eight output words xor the two halves of the
	// state together; the last eight xor the second half with the input
	// chaining value.
	*out = [16]uint32{
		s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11,
		s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15,
		s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3],
		s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7],
	}
}
|
||||||
|
|
||||||
|
func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) {
|
||||||
|
if buflen <= chunkSize {
|
||||||
|
return compressChunk(buf[:buflen], key, counter, flags)
|
||||||
|
}
|
||||||
|
cvs := make([][8]uint32, 0, 8)
|
||||||
|
for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; {
|
||||||
|
n := compressChunk(bb.Next(chunkSize), key, counter, flags)
|
||||||
|
cvs = append(cvs, chainingValue(n))
|
||||||
|
counter++
|
||||||
|
}
|
||||||
|
return mergeSubtrees(cvs, key, flags)
|
||||||
|
}
|
||||||
|
|
||||||
|
func chainingValue(n node) (cv [8]uint32) {
|
||||||
|
full := compressNode(n)
|
||||||
|
copy(cv[:], full[:])
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeSubtrees combines between two and eight chaining values into a single
// parent node. Values are paired so that, at every level, the left subtree
// covers the largest power-of-two number of leaves that still leaves at
// least one leaf on the right.
//
// The returned value is the top-level parent node itself, not its chaining
// value. cvs is used as scratch space and is overwritten. The function
// indexes cvs[0] and cvs[1] unconditionally, so len(cvs) must be at least 2.
func mergeSubtrees(cvs [][8]uint32, key *[8]uint32, flags uint32) node {
	// parent returns the chaining value of the parent of l and r.
	parent := func(l, r [8]uint32) [8]uint32 {
		return chainingValue(parentNode(l, r, *key, flags))
	}
	// Each case reduces the slice by one level and falls through to the
	// handling for the next smaller count.
	switch len(cvs) {
	case 8:
		cvs[6] = parent(cvs[6], cvs[7])
		fallthrough
	case 7:
		cvs[4], cvs[5] = parent(cvs[4], cvs[5]), cvs[6]
		fallthrough
	case 6:
		cvs[4] = parent(cvs[4], cvs[5])
		fallthrough
	case 5:
		// cvs[4] already holds the whole right-hand side
		fallthrough
	case 4:
		cvs[2] = parent(cvs[2], cvs[3])
		fallthrough
	case 3:
		cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[2]
	}
	// With more than four inputs, cvs[0] and cvs[1] now hold the two halves of
	// the left four-leaf subtree and cvs[4] holds everything to its right.
	if len(cvs) > 4 {
		cvs[0], cvs[1] = parent(cvs[0], cvs[1]), cvs[4]
	}
	return parentNode(cvs[0], cvs[1], *key, flags)
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
// +build !amd64
|
||||||
|
|
||||||
|
package blake3
|
||||||
|
|
||||||
|
import "encoding/binary"
|
||||||
|
|
||||||
|
func compressNode(n node) (out [16]uint32) {
|
||||||
|
compressNodeGeneric(&out, n)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// compressBuffer compresses the first length bytes of buf into a single node.
// On non-amd64 targets it always uses the portable implementation.
func compressBuffer(buf *[8192]byte, length int, key *[8]uint32, counter uint64, flags uint32) node {
	return compressBufferGeneric(buf, length, key, counter, flags)
}
|
||||||
|
|
||||||
|
func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
|
||||||
|
n := node{
|
||||||
|
cv: *key,
|
||||||
|
counter: counter,
|
||||||
|
blockLen: blockSize,
|
||||||
|
flags: flags | flagChunkStart,
|
||||||
|
}
|
||||||
|
var block [blockSize]byte
|
||||||
|
for len(chunk) > blockSize {
|
||||||
|
copy(block[:], chunk)
|
||||||
|
chunk = chunk[blockSize:]
|
||||||
|
bytesToWords(block, &n.block)
|
||||||
|
n.cv = chainingValue(n)
|
||||||
|
n.flags &^= flagChunkStart
|
||||||
|
}
|
||||||
|
// pad last block with zeros
|
||||||
|
block = [blockSize]byte{}
|
||||||
|
n.blockLen = uint32(len(chunk))
|
||||||
|
copy(block[:], chunk)
|
||||||
|
bytesToWords(block, &n.block)
|
||||||
|
n.flags |= flagChunkEnd
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
func hashBlock(out *[64]byte, buf []byte) {
|
||||||
|
var block [64]byte
|
||||||
|
var words [16]uint32
|
||||||
|
copy(block[:], buf)
|
||||||
|
bytesToWords(block, &words)
|
||||||
|
compressNodeGeneric(&words, node{
|
||||||
|
cv: iv,
|
||||||
|
block: words,
|
||||||
|
blockLen: uint32(len(buf)),
|
||||||
|
flags: flagChunkStart | flagChunkEnd | flagRoot,
|
||||||
|
})
|
||||||
|
wordsToBytes(words, out)
|
||||||
|
}
|
||||||
|
|
||||||
|
// bytesToWords decodes bytes as sixteen little-endian 32-bit words into words.
func bytesToWords(bytes [64]byte, words *[16]uint32) {
	for i := 0; i < len(words); i++ {
		off := 4 * i
		words[i] = binary.LittleEndian.Uint32(bytes[off : off+4])
	}
}
|
||||||
|
|
||||||
|
// wordsToBytes encodes words into block as sixteen little-endian 32-bit values.
func wordsToBytes(words [16]uint32, block *[64]byte) {
	for i := 0; i < len(words); i++ {
		off := 4 * i
		binary.LittleEndian.PutUint32(block[off:off+4], words[i])
	}
}
|
2
go.mod
2
go.mod
|
@ -1,3 +1,5 @@
|
||||||
module lukechampine.com/blake3
|
module lukechampine.com/blake3
|
||||||
|
|
||||||
go 1.13
|
go 1.13
|
||||||
|
|
||||||
|
require golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5
|
||||||
|
|
Reference in New Issue