add AVX2 optimization for XOF

commit 221995220f
parent c2af4bc4c2

README.md (11 lines changed)

@@ -15,10 +15,11 @@ readability, in the hopes of eventually landing in `x/crypto`.
 The pure-Go code is fairly well-optimized, achieving throughput of ~600 MB/s.
 There is a separate code path for small inputs (up to 64 bytes) that runs in
 ~100 ns. On CPUs with AVX2 support, larger inputs (>=2 KB) are handled by
-an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 chunks in parallel,
-achieving throughput of ~2600 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it
-will be possible to compress 16 chunks in parallel, which should roughly double
-throughput for sufficiently large inputs.
+an [`avo`](https://github.com/mmcloughlin/avo)-generated assembly routine that compresses 8 nodes in parallel,
+achieving throughput of ~2600 MB/s. AVX2 is also used for BLAKE3's extendable output function,
+enabling it to stream pseudorandom bytes at ~3500 MB/s. Once [AVX-512 support](https://github.com/mmcloughlin/avo/issues/20) is added to `avo`, it
+will be possible to compress 16 nodes in parallel, which should roughly double
+the current performance.
 
 Contributions are greatly appreciated.
 [All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134)

@@ -33,5 +34,5 @@ BenchmarkSum256/64    105 ns/op    609.51 MB/s
 BenchmarkSum256/1024       1778 ns/op     576.00 MB/s
 BenchmarkSum256/65536     24785 ns/op    2644.15 MB/s
 BenchmarkWrite              389 ns/op    2631.78 MB/s
-BenchmarkXOF               1591 ns/op     643.80 MB/s
+BenchmarkXOF                293 ns/op    3492.94 MB/s
 ```
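As context for the README numbers above: the XOF output is read through the package's `OutputReader` type, which this commit modifies in blake3.go below. A minimal usage sketch, assuming the published `lukechampine.com/blake3` API in which `Hasher` exposes an `XOF() *OutputReader` method:

```go
package main

import (
	"fmt"

	"lukechampine.com/blake3"
)

func main() {
	h := blake3.New(32, nil) // 32-byte digest size, no key
	h.Write([]byte("seed material"))
	xof := h.XOF() // OutputReader implements io.ReadSeeker
	stream := make([]byte, 1024)
	xof.Read(stream) // fills stream with pseudorandom bytes
	fmt.Printf("%x\n", stream[:16])
}
```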
avo/gen.go (172 lines changed)

@@ -12,6 +12,7 @@ import (
 
 func main() {
 	genGlobals()
+	genCompressBlocksAVX2()
 	genCompressChunksAVX2()
 
 	Generate()

@@ -38,7 +39,6 @@ func genGlobals() {
 	for i := 0; i < 8; i++ {
 		DATA(i*4, U32(64))
 	}
 
 	globals.stride1024 = GLOBL("stride_1024", RODATA|NOPTR)
 	for i := 0; i < 8; i++ {
 		DATA(i*4, U32(i*1024))

@@ -67,6 +67,80 @@ func genGlobals() {
 	}
 }
 
+func genCompressBlocksAVX2() {
+	TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)")
+	out := Mem{Base: Load(Param("out"), GP64())}
+	block := Mem{Base: Load(Param("block"), GP64())}
+	cv := Mem{Base: Load(Param("cv"), GP64())}
+	counter, _ := Param("counter").Resolve()
+	blockLen, _ := Param("blockLen").Resolve()
+	flags, _ := Param("flags").Resolve()
+
+	vs := [16]VecVirtual{
+		YMM(), YMM(), YMM(), YMM(),
+		YMM(), YMM(), YMM(), YMM(),
+		YMM(), YMM(), YMM(), YMM(),
+		YMM(), YMM(), YMM(), YMM(),
+	}
+
+	// stack space for message vectors
+	var mv [16]Mem
+	for i := range mv {
+		mv[i] = AllocLocal(32)
+	}
+	// stack space for spilled vs[8] register
+	spillMem := AllocLocal(32)
+
+	Comment("Load block")
+	for i := 0; i < 16; i++ {
+		VPBROADCASTD(block.Offset(i*4), vs[0])
+		VMOVDQU(vs[0], mv[i])
+	}
+
+	Comment("Initialize state vectors")
+	for i, v := range vs {
+		switch i {
+		case 0, 1, 2, 3, 4, 5, 6, 7: // cv
+			VPBROADCASTD(cv.Offset(i*4), v)
+		case 8, 9, 10, 11: // iv
+			VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
+		case 12: // counter
+			loadCounter(counter.Addr, vs[12:14], vs[14:16])
+		case 14: // blockLen
+			VPBROADCASTD(blockLen.Addr, v)
+		case 15: // flags
+			VPBROADCASTD(flags.Addr, v)
+		}
+	}
+
+	performRounds(vs, mv, spillMem)
+
+	Comment("Finalize CVs")
+	for i := 8; i < 16; i++ {
+		VMOVDQU(vs[i], mv[i])
+	}
+	for i := range vs[:8] {
+		VPXOR(vs[i], vs[i+8], vs[i])
+	}
+	transpose(vs[:8], vs[8:])
+	for i, v := range vs[8:] {
+		VMOVDQU(v, out.Offset(i*64))
+	}
+	for i := 8; i < 16; i++ {
+		VMOVDQU(mv[i], vs[i])
+	}
+	for i, v := range vs[8:] {
+		VPBROADCASTD(cv.Offset(i*4), vs[0])
+		VPXOR(vs[0], v, v)
+	}
+	transpose(vs[8:], vs[:8])
+	for i, v := range vs[:8] {
+		VMOVDQU(v, out.Offset(i*64+32))
+	}
+
+	RET()
+}
+
 func genCompressChunksAVX2() {
 	TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)")
 	cvs := Mem{Base: Load(Param("cvs"), GP64())}

@@ -97,16 +171,7 @@ func genCompressChunksAVX2() {
 	Comment("Initialize counter")
 	counterLo := AllocLocal(32)
 	counterHi := AllocLocal(32)
-	VPBROADCASTQ(counter.Addr, vs[12])
-	VPBROADCASTQ(counter.Addr, vs[13])
-	VPADDQ(globals.incrementCounter.Offset(0*32), vs[12], vs[12])
-	VPADDQ(globals.incrementCounter.Offset(1*32), vs[13], vs[13])
-	VPUNPCKLDQ(vs[13], vs[12], vs[14])
-	VPUNPCKHDQ(vs[13], vs[12], vs[15])
-	VPUNPCKLDQ(vs[15], vs[14], vs[12])
-	VPUNPCKHDQ(vs[15], vs[14], vs[13])
-	VPERMQ(Imm(0xd8), vs[12], vs[12])
-	VPERMQ(Imm(0xd8), vs[13], vs[13])
+	loadCounter(counter.Addr, vs[12:14], vs[14:16])
 	VMOVDQU(vs[12], counterLo)
 	VMOVDQU(vs[13], counterHi)

@@ -141,21 +206,9 @@ func genCompressChunksAVX2() {
 	VMOVDQU(globals.blockLen, vs[14])
 	VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])
 
-	VMOVDQU(vs[8], spillMem) // spill
-	for i := 0; i < 7; i++ {
-		Comment(fmt.Sprintf("Round %v", i+1))
-		round(vs, mv, vs[8], spillMem)
-		// permute
-		mv = [16]Mem{
-			mv[2], mv[6], mv[3], mv[10],
-			mv[7], mv[0], mv[4], mv[13],
-			mv[1], mv[11], mv[12], mv[5],
-			mv[9], mv[14], mv[15], mv[8],
-		}
-	}
+	performRounds(vs, mv, spillMem)
 
 	Comment("Finalize CVs")
-	VMOVDQU(spillMem, vs[8]) // reload
 	for i := range vs[:8] {
 		VPXOR(vs[i], vs[i+8], vs[i])
 	}

@@ -166,31 +219,19 @@ func genCompressChunksAVX2() {
 	JNE(LabelRef("loop"))
 
 	Comment("Finished; transpose CVs")
-	src, dst := vs[:8], vs[8:]
-	// interleave uint32s
-	for i := 0; i < 8; i += 2 {
-		VPUNPCKLDQ(src[i+1], src[i], dst[i+0])
-		VPUNPCKHDQ(src[i+1], src[i], dst[i+1])
-	}
-	// interleave groups of two uint32s
-	for i := 0; i < 4; i++ {
-		j := i*2 - i%2 // j := 0,1,4,5
-		VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0])
-		VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1])
-	}
-	// interleave groups of four uint32s
-	for i := 0; i < 4; i++ {
-		VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0])
-		VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
-	}
-	for i, v := range dst {
+	transpose(vs[:8], vs[8:])
+	for i, v := range vs[8:] {
 		VMOVDQU(v, cvs.Offset(i*32))
 	}
 
 	RET()
 }
 
-func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) {
+func performRounds(sv [16]VecVirtual, mv [16]Mem, spillMem Mem) {
+	tmp := sv[8]
+	VMOVDQU(sv[8], spillMem) // spill
+	for i := 0; i < 7; i++ {
+		Comment(fmt.Sprintf("Round %v", i+1))
 	g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1], tmp, spillMem)
 	g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3], tmp, spillMem)
 	g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5], tmp, spillMem)

@@ -199,6 +240,16 @@ func round(sv [16]VecVirtual, mv [16]Mem, tmp VecVirtual, spillMem Mem) {
 	g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11], tmp, spillMem)
 	g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13], tmp, spillMem)
 	g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15], tmp, spillMem)
+
+		// permute
+		mv = [16]Mem{
+			mv[2], mv[6], mv[3], mv[10],
+			mv[7], mv[0], mv[4], mv[13],
+			mv[1], mv[11], mv[12], mv[5],
+			mv[9], mv[14], mv[15], mv[8],
+		}
+	}
+	VMOVDQU(spillMem, sv[8]) // reload
 }
 
 func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {

@@ -237,3 +288,38 @@ func g(a, b, c, d VecVirtual, mx, my Mem, tmp VecVirtual, spillMem Mem) {
 	VPXOR(b, c, b)
 	rotr(b, 7, b)
 }
+
+func loadCounter(counter Mem, dst, scratch []VecVirtual) {
+	// fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so
+	// that dst[0] contains low 32 bits and dst[1] contains high 32 bits.
+	VPBROADCASTQ(counter, dst[0])
+	VPBROADCASTQ(counter, dst[1])
+	VPADDQ(globals.incrementCounter.Offset(0*32), dst[0], dst[0])
+	VPADDQ(globals.incrementCounter.Offset(1*32), dst[1], dst[1])
+	VPUNPCKLDQ(dst[1], dst[0], scratch[0])
+	VPUNPCKHDQ(dst[1], dst[0], scratch[1])
+	VPUNPCKLDQ(scratch[1], scratch[0], dst[0])
+	VPUNPCKHDQ(scratch[1], scratch[0], dst[1])
+	const perm = 0<<0 | 2<<2 | 1<<4 | 3<<6
+	VPERMQ(Imm(perm), dst[0], dst[0])
+	VPERMQ(Imm(perm), dst[1], dst[1])
+}
+
+func transpose(src, dst []VecVirtual) {
+	// interleave uint32s
+	for i := 0; i < 8; i += 2 {
+		VPUNPCKLDQ(src[i+1], src[i], dst[i+0])
+		VPUNPCKHDQ(src[i+1], src[i], dst[i+1])
+	}
+	// interleave groups of two uint32s
+	for i := 0; i < 4; i++ {
+		j := i*2 - i%2 // j := 0,1,4,5
+		VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0])
+		VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1])
+	}
+	// interleave groups of four uint32s
+	for i := 0; i < 4; i++ {
+		VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0])
+		VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
+	}
+}
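The new `loadCounter` and `transpose` helpers are dense shuffle sequences. As a reading aid, here is a scalar Go model of the data movement the generated code performs, along with the between-rounds message permutation from `performRounds`; these functions are illustrative only and are not part of the repo:

```go
package main

import "fmt"

// Model of loadCounter: produce the low and high 32-bit halves of
// counter+0 ... counter+7, which the generated code builds with
// VPBROADCASTQ/VPADDQ followed by an unpack/permute transpose.
func loadCounterModel(counter uint64) (lo, hi [8]uint32) {
	for i := range lo {
		c := counter + uint64(i)
		lo[i] = uint32(c)
		hi[i] = uint32(c >> 32)
	}
	return
}

// Model of transpose: vs[i] holds state word i for all 8 nodes
// ("register-major"); the unpack/VPERM2I128 sequence converts this to
// "node-major" layout, where row i holds all 8 words of node i.
func transposeModel(src [8][8]uint32) (dst [8][8]uint32) {
	for i := range dst {
		for j := range dst[i] {
			dst[i][j] = src[j][i]
		}
	}
	return
}

// Model of the permutation applied between rounds in performRounds:
// new mv[i] = old mv[perm[i]], the standard BLAKE3 message schedule.
var perm = [16]int{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}

func permuteModel(mv [16]uint32) (out [16]uint32) {
	for i, j := range perm {
		out[i] = mv[j]
	}
	return
}

func main() {
	lo, hi := loadCounterModel(1<<32 - 2)
	fmt.Println(lo[:4], hi[:4]) // counters cross the 32-bit boundary at i=2
}
```

Note that in the generated code the permutation reassigns the `mv` array of stack slots at generation time, so it costs no vector moves at run time: the next round's `g` calls simply read from different memory operands.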
blake3.go (12 lines changed)

@@ -224,7 +224,7 @@ func DeriveKey(subKey []byte, ctx string, srcKey []byte) {
 // bytes.
 type OutputReader struct {
 	n     node
-	block [blockSize]byte
+	buf   [8 * blockSize]byte
 	off   uint64
 }

@@ -238,11 +238,11 @@ func (or *OutputReader) Read(p []byte) (int, error) {
 	}
 	lenp := len(p)
 	for len(p) > 0 {
-		if or.off%blockSize == 0 {
+		if or.off%(8*blockSize) == 0 {
 			or.n.counter = or.off / blockSize
-			wordsToBytes(compressNode(or.n), &or.block)
+			compressBlocks(&or.buf, or.n)
 		}
-		n := copy(p, or.block[or.off%blockSize:])
+		n := copy(p, or.buf[or.off%(8*blockSize):])
 		p = p[n:]
 		or.off += uint64(n)
 	}

@@ -274,8 +274,8 @@ func (or *OutputReader) Seek(offset int64, whence int) (int64, error) {
 	}
 	or.off = off
 	or.n.counter = uint64(off) / blockSize
-	if or.off%blockSize != 0 {
-		wordsToBytes(compressNode(or.n), &or.block)
+	if or.off%(8*blockSize) != 0 {
+		compressBlocks(&or.buf, or.n)
 	}
 	// NOTE: or.off >= 2^63 will result in a negative return value.
 	// Nothing we can do about this.
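The net effect of this change: output is now produced eight blocks (512 bytes) at a time instead of one block per compression. A scalar sketch of the new Read loop; `xofModel` and `fillBuf` are hypothetical stand-ins for `OutputReader` and `compressBlocks`:

```go
package main

import "fmt"

const blockSize = 64 // as in the package

type xofModel struct {
	buf [8 * blockSize]byte
	off uint64
}

// read mirrors the updated OutputReader.Read: the 512-byte buffer is
// refilled only when off crosses an 8-block boundary, with the node
// counter set to the first block of the new group.
func (x *xofModel) read(p []byte, fillBuf func(counter uint64, buf *[8 * blockSize]byte)) int {
	lenp := len(p)
	for len(p) > 0 {
		if x.off%(8*blockSize) == 0 {
			fillBuf(x.off/blockSize, &x.buf)
		}
		n := copy(p, x.buf[x.off%(8*blockSize):])
		p = p[n:]
		x.off += uint64(n)
	}
	return lenp
}

func main() {
	x := new(xofModel)
	out := make([]byte, 700)
	x.read(out, func(counter uint64, buf *[8 * blockSize]byte) {
		for i := range buf {
			buf[i] = byte(counter) // stand-in for real block output
		}
	})
	fmt.Println(out[511], out[512]) // 0 8: second fill starts at block counter 8
}
```

Note that `or.n.counter` is still measured in single blocks; only the refill granularity changes.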
blake3_amd64.s (1256 lines changed)

File diff suppressed because it is too large.
compress_amd64.go

@@ -11,6 +11,9 @@ import (
 //go:noescape
 func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
 
+//go:noescape
+func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
+
 func compressNode(n node) (out [16]uint32) {
 	compressNodeGeneric(&out, n)
 	return

@@ -60,10 +63,6 @@ func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) n
 	return n
 }
 
-func wordsToBytes(words [16]uint32, block *[64]byte) {
-	*block = *(*[64]byte)(unsafe.Pointer(&words))
-}
-
 func hashBlock(out *[64]byte, buf []byte) {
 	var block [16]uint32
 	copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)

@@ -74,3 +73,17 @@ func hashBlock(out *[64]byte, buf []byte) {
 		flags: flagChunkStart | flagChunkEnd | flagRoot,
 	})
 }
+
+func compressBlocks(out *[512]byte, n node) {
+	switch {
+	case cpu.X86.HasAVX2:
+		compressBlocksAVX2(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+	default:
+		compressBlocksGeneric((*[8][64]byte)(unsafe.Pointer(out)), n)
+	}
+
+}
+
+func wordsToBytes(words [16]uint32, block *[64]byte) {
+	*block = *(*[64]byte)(unsafe.Pointer(&words))
+}
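One way to gain confidence in the dispatch above is an internal test comparing the two code paths on the same node. A hypothetical sketch (the field and flag names are taken from this diff; the test itself is not part of the commit):

```go
package blake3

import (
	"testing"
	"unsafe"

	"golang.org/x/sys/cpu"
)

func TestCompressBlocksAVX2(t *testing.T) {
	if !cpu.X86.HasAVX2 {
		t.Skip("AVX2 not supported")
	}
	n := node{counter: 42, blockLen: 64, flags: flagRoot} // arbitrary inputs
	var want, got [512]byte
	compressBlocksGeneric((*[8][64]byte)(unsafe.Pointer(&want)), n)
	compressBlocksAVX2(&got, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
	if want != got {
		t.Fatal("AVX2 and generic compressBlocks disagree")
	}
}
```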
compress_generic.go

@@ -115,6 +115,13 @@ func compressBufferGeneric(buf *[8192]byte, buflen int, key *[8]uint32, counter
 	return mergeSubtrees(cvs, key, flags)
 }
 
+func compressBlocksGeneric(outs *[8][64]byte, n node) {
+	for i := range outs {
+		wordsToBytes(compressNode(n), &outs[i])
+		n.counter++
+	}
+}
+
 func chainingValue(n node) (cv [8]uint32) {
 	full := compressNode(n)
 	copy(cv[:], full[:])
compress_noasm.go

@@ -51,6 +51,14 @@ func hashBlock(out *[64]byte, buf []byte) {
 	wordsToBytes(words, out)
 }
 
+func compressBlocks(out *[512]byte, n node) {
+	var outs [8][64]byte
+	compressBlocksGeneric(&outs, n)
+	for i := range outs {
+		copy(out[i*64:], outs[i][:])
+	}
+}
+
 func bytesToWords(bytes [64]byte, words *[16]uint32) {
 	for i := range words {
 		words[i] = binary.LittleEndian.Uint32(bytes[4*i:])