From 855147f46122a9fc5fda5fc743cc22eba03457f5 Mon Sep 17 00:00:00 2001 From: renthraysk Date: Sun, 12 Jan 2020 18:20:37 +0000 Subject: [PATCH 1/3] g() takes and returns state values, inlines. --- blake3.go | 243 ++++++++++++++++++++--------------------------------- gen/gen.go | 28 +++--- 2 files changed, 101 insertions(+), 170 deletions(-) diff --git a/blake3.go b/blake3.go index 46fbc5a..7019ca6 100644 --- a/blake3.go +++ b/blake3.go @@ -45,19 +45,16 @@ func wordsToBytes(words []uint32, bytes []byte) { } } -// The g function, split into two parts so that the compiler will inline it. -func gx(state *[16]uint32, a, b, c, d int, mx uint32) { - state[a] += state[b] + mx - state[d] = bits.RotateLeft32(state[d]^state[a], -16) - state[c] += state[d] - state[b] = bits.RotateLeft32(state[b]^state[c], -12) -} - -func gy(state *[16]uint32, a, b, c, d int, my uint32) { - state[a] += state[b] + my - state[d] = bits.RotateLeft32(state[d]^state[a], -8) - state[c] += state[d] - state[b] = bits.RotateLeft32(state[b]^state[c], -7) +func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { + a += b + mx + d = bits.RotateLeft32(d^a, -16) + c += d + b = bits.RotateLeft32(b^c, -12) + a += b + my + d = bits.RotateLeft32(d^a, -8) + c += d + b = bits.RotateLeft32(b^c, -7) + return a, b, c, d } // A node represents a chunk or parent in the BLAKE3 Merkle tree. In BLAKE3 @@ -83,172 +80,114 @@ type node struct { // node. When nodes are being merged into parents, only the first 8 words are // used. When the root node is being used to generate output, the full 16 words // are used. -func (n node) compress() (state [16]uint32) { - state = [16]uint32{ - n.cv[0], n.cv[1], n.cv[2], n.cv[3], - n.cv[4], n.cv[5], n.cv[6], n.cv[7], - iv[0], iv[1], iv[2], iv[3], - uint32(n.counter), uint32(n.counter >> 32), n.blockLen, n.flags, - } - - // round1 - - // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[0]) - gy(&state, 0, 4, 8, 12, n.block[1]) - gx(&state, 1, 5, 9, 13, n.block[2]) - gy(&state, 1, 5, 9, 13, n.block[3]) - gx(&state, 2, 6, 10, 14, n.block[4]) - gy(&state, 2, 6, 10, 14, n.block[5]) - gx(&state, 3, 7, 11, 15, n.block[6]) - gy(&state, 3, 7, 11, 15, n.block[7]) +func (n node) compress() (s [16]uint32) { + // round1 rather than init s and mix, do both. + // mix the columns. + s[0], s[4], s[8], s[12] = g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) + s[1], s[5], s[9], s[13] = g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) + s[2], s[6], s[10], s[14] = g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) + s[3], s[7], s[11], s[15] = g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[8]) - gy(&state, 0, 5, 10, 15, n.block[9]) - gx(&state, 1, 6, 11, 12, n.block[10]) - gy(&state, 1, 6, 11, 12, n.block[11]) - gx(&state, 2, 7, 8, 13, n.block[12]) - gy(&state, 2, 7, 8, 13, n.block[13]) - gx(&state, 3, 4, 9, 14, n.block[14]) - gy(&state, 3, 4, 9, 14, n.block[15]) - - // round2 + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[8], n.block[9]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[10], n.block[11]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[12], n.block[13]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[14], n.block[15]) + // round 2 // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[2]) - gy(&state, 0, 4, 8, 12, n.block[6]) - gx(&state, 1, 5, 9, 13, n.block[3]) - gy(&state, 1, 5, 9, 13, n.block[10]) - gx(&state, 2, 6, 10, 14, n.block[7]) - gy(&state, 2, 6, 10, 14, n.block[0]) - gx(&state, 3, 7, 11, 15, n.block[4]) - gy(&state, 3, 7, 11, 15, n.block[13]) + s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[2], n.block[6]) + s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[3], n.block[10]) + s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[7], n.block[0]) + s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[4], n.block[13]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[1]) - gy(&state, 0, 5, 10, 15, n.block[11]) - gx(&state, 1, 6, 11, 12, n.block[12]) - gy(&state, 1, 6, 11, 12, n.block[5]) - gx(&state, 2, 7, 8, 13, n.block[9]) - gy(&state, 2, 7, 8, 13, n.block[14]) - gx(&state, 3, 4, 9, 14, n.block[15]) - gy(&state, 3, 4, 9, 14, n.block[8]) - - // round3 + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[1], n.block[11]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[12], n.block[5]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[9], n.block[14]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[15], n.block[8]) + // round 3 // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[3]) - gy(&state, 0, 4, 8, 12, n.block[4]) - gx(&state, 1, 5, 9, 13, n.block[10]) - gy(&state, 1, 5, 9, 13, n.block[12]) - gx(&state, 2, 6, 10, 14, n.block[13]) - gy(&state, 2, 6, 10, 14, n.block[2]) - gx(&state, 3, 7, 11, 15, n.block[7]) - gy(&state, 3, 7, 11, 15, n.block[14]) + s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[3], n.block[4]) + s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[10], n.block[12]) + s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[13], n.block[2]) + s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[7], n.block[14]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[6]) - gy(&state, 0, 5, 10, 15, n.block[5]) - gx(&state, 1, 6, 11, 12, n.block[9]) - gy(&state, 1, 6, 11, 12, n.block[0]) - gx(&state, 2, 7, 8, 13, n.block[11]) - gy(&state, 2, 7, 8, 13, n.block[15]) - gx(&state, 3, 4, 9, 14, n.block[8]) - gy(&state, 3, 4, 9, 14, n.block[1]) - - // round4 + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[6], n.block[5]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[9], n.block[0]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[11], n.block[15]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[8], n.block[1]) + // round 4 // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[10]) - gy(&state, 0, 4, 8, 12, n.block[7]) - gx(&state, 1, 5, 9, 13, n.block[12]) - gy(&state, 1, 5, 9, 13, n.block[9]) - gx(&state, 2, 6, 10, 14, n.block[14]) - gy(&state, 2, 6, 10, 14, n.block[3]) - gx(&state, 3, 7, 11, 15, n.block[13]) - gy(&state, 3, 7, 11, 15, n.block[15]) + s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[10], n.block[7]) + s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[12], n.block[9]) + s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[14], n.block[3]) + s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[13], n.block[15]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[4]) - gy(&state, 0, 5, 10, 15, n.block[0]) - gx(&state, 1, 6, 11, 12, n.block[11]) - gy(&state, 1, 6, 11, 12, n.block[2]) - gx(&state, 2, 7, 8, 13, n.block[5]) - gy(&state, 2, 7, 8, 13, n.block[8]) - gx(&state, 3, 4, 9, 14, n.block[1]) - gy(&state, 3, 4, 9, 14, n.block[6]) - - // round5 + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[4], n.block[0]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[11], n.block[2]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[5], n.block[8]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[1], n.block[6]) + // round 5 // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[12]) - gy(&state, 0, 4, 8, 12, n.block[13]) - gx(&state, 1, 5, 9, 13, n.block[9]) - gy(&state, 1, 5, 9, 13, n.block[11]) - gx(&state, 2, 6, 10, 14, n.block[15]) - gy(&state, 2, 6, 10, 14, n.block[10]) - gx(&state, 3, 7, 11, 15, n.block[14]) - gy(&state, 3, 7, 11, 15, n.block[8]) + s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[12], n.block[13]) + s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[9], n.block[11]) + s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[15], n.block[10]) + s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[14], n.block[8]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[7]) - gy(&state, 0, 5, 10, 15, n.block[2]) - gx(&state, 1, 6, 11, 12, n.block[5]) - gy(&state, 1, 6, 11, 12, n.block[3]) - gx(&state, 2, 7, 8, 13, n.block[0]) - gy(&state, 2, 7, 8, 13, n.block[1]) - gx(&state, 3, 4, 9, 14, n.block[6]) - gy(&state, 3, 4, 9, 14, n.block[4]) - - // round6 + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[7], n.block[2]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[5], n.block[3]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[0], n.block[1]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[6], n.block[4]) + // round 6 // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[9]) - gy(&state, 0, 4, 8, 12, n.block[14]) - gx(&state, 1, 5, 9, 13, n.block[11]) - gy(&state, 1, 5, 9, 13, n.block[5]) - gx(&state, 2, 6, 10, 14, n.block[8]) - gy(&state, 2, 6, 10, 14, n.block[12]) - gx(&state, 3, 7, 11, 15, n.block[15]) - gy(&state, 3, 7, 11, 15, n.block[1]) + s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[9], n.block[14]) + s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[11], n.block[5]) + s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[8], n.block[12]) + s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[15], n.block[1]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[13]) - gy(&state, 0, 5, 10, 15, n.block[3]) - gx(&state, 1, 6, 11, 12, n.block[0]) - gy(&state, 1, 6, 11, 12, n.block[10]) - gx(&state, 2, 7, 8, 13, n.block[2]) - gy(&state, 2, 7, 8, 13, n.block[6]) - gx(&state, 3, 4, 9, 14, n.block[4]) - gy(&state, 3, 4, 9, 14, n.block[7]) - - // round7 + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[13], n.block[3]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[0], n.block[10]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[2], n.block[6]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[4], n.block[7]) + // round 7 // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[11]) - gy(&state, 0, 4, 8, 12, n.block[15]) - gx(&state, 1, 5, 9, 13, n.block[5]) - gy(&state, 1, 5, 9, 13, n.block[0]) - gx(&state, 2, 6, 10, 14, n.block[1]) - gy(&state, 2, 6, 10, 14, n.block[9]) - gx(&state, 3, 7, 11, 15, n.block[8]) - gy(&state, 3, 7, 11, 15, n.block[6]) + s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[11], n.block[15]) + s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[5], n.block[0]) + s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[1], n.block[9]) + s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[8], n.block[6]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[14]) - gy(&state, 0, 5, 10, 15, n.block[10]) - gx(&state, 1, 6, 11, 12, n.block[2]) - gy(&state, 1, 6, 11, 12, n.block[12]) - gx(&state, 2, 7, 8, 13, n.block[3]) - gy(&state, 2, 7, 8, 13, n.block[4]) - gx(&state, 3, 4, 9, 14, n.block[7]) - gy(&state, 3, 4, 9, 14, n.block[13]) + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[14], n.block[10]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[2], n.block[12]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[3], n.block[4]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[7], n.block[13]) - for i := range n.cv { - state[i] ^= state[i+8] - state[i+8] ^= n.cv[i] - } + s[0] ^= s[0+8] + s[1] ^= s[1+8] + s[2] ^= s[2+8] + s[3] ^= s[3+8] + s[4] ^= s[4+8] + s[5] ^= s[5+8] + s[6] ^= s[6+8] + s[7] ^= s[7+8] + s[0+8] ^= n.cv[0] + s[1+8] ^= n.cv[1] + s[2+8] ^= n.cv[2] + s[3+8] ^= n.cv[3] + s[4+8] ^= n.cv[4] + s[5+8] ^= n.cv[5] + s[6+8] ^= n.cv[6] + s[7+8] ^= n.cv[7] return } diff --git a/gen/gen.go b/gen/gen.go index 5c3179d..7999b37 100644 --- a/gen/gen.go +++ b/gen/gen.go @@ -24,26 +24,18 @@ func main() { fmt.Printf(`// round%d // Mix the columns. - gx(&state, 0, 4, 8, 12, n.block[%d]) - gy(&state, 0, 4, 8, 12, n.block[%d]) - gx(&state, 1, 5, 9, 13, n.block[%d]) - gy(&state, 1, 5, 9, 13, n.block[%d]) - gx(&state, 2, 6, 10, 14, n.block[%d]) - gy(&state, 2, 6, 10, 14, n.block[%d]) - gx(&state, 3, 7, 11, 15, n.block[%d]) - gy(&state, 3, 7, 11, 15, n.block[%d]) + s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[%d], n.block[%d]) + s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[%d], n.block[%d]) + s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[%d], n.block[%d]) + s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[%d], n.block[%d]) // Mix the diagonals. - gx(&state, 0, 5, 10, 15, n.block[%d]) - gy(&state, 0, 5, 10, 15, n.block[%d]) - gx(&state, 1, 6, 11, 12, n.block[%d]) - gy(&state, 1, 6, 11, 12, n.block[%d]) - gx(&state, 2, 7, 8, 13, n.block[%d]) - gy(&state, 2, 7, 8, 13, n.block[%d]) - gx(&state, 3, 4, 9, 14, n.block[%d]) - gy(&state, 3, 4, 9, 14, n.block[%d]) - -`, x, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15]) + s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[%d], n.block[%d]) + s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[%d], n.block[%d]) + s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[%d], n.block[%d]) + s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[%d], n.block[%d]) +`, x, + m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15]) permute(&m) } } From 7786c919ad6084af5ab9e973c8ae30ddfcb2e434 Mon Sep 17 00:00:00 2001 From: renthraysk Date: Sun, 12 Jan 2020 18:59:44 +0000 Subject: [PATCH 2/3] Improve reg usage in compress() [16]uint32 to vars --- blake3.go | 149 +++++++++++++++++++++++++---------------------------- gen/gen.go | 16 +++--- 2 files changed, 77 insertions(+), 88 deletions(-) diff --git a/blake3.go b/blake3.go index 7019ca6..efb9bb1 100644 --- a/blake3.go +++ b/blake3.go @@ -80,115 +80,104 @@ type node struct { // node. When nodes are being merged into parents, only the first 8 words are // used. When the root node is being used to generate output, the full 16 words // are used. -func (n node) compress() (s [16]uint32) { +func (n node) compress() [16]uint32 { // round1 rather than init s and mix, do both. // mix the columns. - s[0], s[4], s[8], s[12] = g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) - s[1], s[5], s[9], s[13] = g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) - s[2], s[6], s[10], s[14] = g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) - s[3], s[7], s[11], s[15] = g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) + s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) + s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) + s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) + s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[8], n.block[9]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[10], n.block[11]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[12], n.block[13]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[14], n.block[15]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15]) + // round2 - // round 2 // Mix the columns. - s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[2], n.block[6]) - s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[3], n.block[10]) - s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[7], n.block[0]) - s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[4], n.block[13]) + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[1], n.block[11]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[12], n.block[5]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[9], n.block[14]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[15], n.block[8]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8]) + // round3 - // round 3 // Mix the columns. - s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[3], n.block[4]) - s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[10], n.block[12]) - s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[13], n.block[2]) - s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[7], n.block[14]) + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[6], n.block[5]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[9], n.block[0]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[11], n.block[15]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[8], n.block[1]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1]) + // round4 - // round 4 // Mix the columns. - s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[10], n.block[7]) - s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[12], n.block[9]) - s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[14], n.block[3]) - s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[13], n.block[15]) + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[4], n.block[0]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[11], n.block[2]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[5], n.block[8]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[1], n.block[6]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6]) + // round5 - // round 5 // Mix the columns. - s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[12], n.block[13]) - s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[9], n.block[11]) - s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[15], n.block[10]) - s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[14], n.block[8]) + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[7], n.block[2]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[5], n.block[3]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[0], n.block[1]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[6], n.block[4]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4]) + // round6 - // round 6 // Mix the columns. - s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[9], n.block[14]) - s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[11], n.block[5]) - s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[8], n.block[12]) - s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[15], n.block[1]) + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[13], n.block[3]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[0], n.block[10]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[2], n.block[6]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[4], n.block[7]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7]) + // round7 - // round 7 // Mix the columns. - s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[11], n.block[15]) - s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[5], n.block[0]) - s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[1], n.block[9]) - s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[8], n.block[6]) + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[14], n.block[10]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[2], n.block[12]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[3], n.block[4]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[7], n.block[13]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13]) - s[0] ^= s[0+8] - s[1] ^= s[1+8] - s[2] ^= s[2+8] - s[3] ^= s[3+8] - s[4] ^= s[4+8] - s[5] ^= s[5+8] - s[6] ^= s[6+8] - s[7] ^= s[7+8] - s[0+8] ^= n.cv[0] - s[1+8] ^= n.cv[1] - s[2+8] ^= n.cv[2] - s[3+8] ^= n.cv[3] - s[4+8] ^= n.cv[4] - s[5+8] ^= n.cv[5] - s[6+8] ^= n.cv[6] - s[7+8] ^= n.cv[7] - return + return [16]uint32{ + s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11, + s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15, + s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3], + s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7], + } } // chainingValue returns the first 8 words of the compressed node. This is used diff --git a/gen/gen.go b/gen/gen.go index 7999b37..b020dd7 100644 --- a/gen/gen.go +++ b/gen/gen.go @@ -24,16 +24,16 @@ func main() { fmt.Printf(`// round%d // Mix the columns. - s[0], s[4], s[8], s[12] = g(s[0], s[4], s[8], s[12], n.block[%d], n.block[%d]) - s[1], s[5], s[9], s[13] = g(s[1], s[5], s[9], s[13], n.block[%d], n.block[%d]) - s[2], s[6], s[10], s[14] = g(s[2], s[6], s[10], s[14], n.block[%d], n.block[%d]) - s[3], s[7], s[11], s[15] = g(s[3], s[7], s[11], s[15], n.block[%d], n.block[%d]) + s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[%d], n.block[%d]) + s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[%d], n.block[%d]) + s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[%d], n.block[%d]) + s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[%d], n.block[%d]) // Mix the diagonals. - s[0], s[5], s[10], s[15] = g(s[0], s[5], s[10], s[15], n.block[%d], n.block[%d]) - s[1], s[6], s[11], s[12] = g(s[1], s[6], s[11], s[12], n.block[%d], n.block[%d]) - s[2], s[7], s[8], s[13] = g(s[2], s[7], s[8], s[13], n.block[%d], n.block[%d]) - s[3], s[4], s[9], s[14] = g(s[3], s[4], s[9], s[14], n.block[%d], n.block[%d]) + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[%d], n.block[%d]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[%d], n.block[%d]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[%d], n.block[%d]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[%d], n.block[%d]) `, x, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15]) permute(&m) From 4553d55fb5bc68a1dd73e8cf5c291a57343845ca Mon Sep 17 00:00:00 2001 From: renthraysk Date: Sun, 12 Jan 2020 22:01:00 +0000 Subject: [PATCH 3/3] Tidy up generated code & comments --- blake3.go | 19 ++++++++++--------- gen/gen.go | 24 +++++++++++++++++++++--- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/blake3.go b/blake3.go index efb9bb1..3d6374e 100644 --- a/blake3.go +++ b/blake3.go @@ -80,9 +80,9 @@ type node struct { // node. When nodes are being merged into parents, only the first 8 words are // used. When the root node is being used to generate output, the full 16 words // are used. -func (n node) compress() [16]uint32 { +func (n node) compress() (state [16]uint32) { // round1 rather than init s and mix, do both. - // mix the columns. + // Mix the columns. s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) @@ -93,8 +93,8 @@ func (n node) compress() [16]uint32 { s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15]) - // round2 + // round 2 // Mix the columns. s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10]) @@ -106,8 +106,8 @@ func (n node) compress() [16]uint32 { s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8]) - // round3 + // round 3 // Mix the columns. s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12]) @@ -119,8 +119,8 @@ func (n node) compress() [16]uint32 { s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1]) - // round4 + // round 4 // Mix the columns. s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9]) @@ -132,8 +132,8 @@ func (n node) compress() [16]uint32 { s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6]) - // round5 + // round 5 // Mix the columns. s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11]) @@ -145,8 +145,8 @@ func (n node) compress() [16]uint32 { s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4]) - // round6 + // round 6 // Mix the columns. s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5]) @@ -158,8 +158,8 @@ func (n node) compress() [16]uint32 { s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7]) - // round7 + // round 7 // Mix the columns. s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0]) @@ -172,12 +172,13 @@ func (n node) compress() [16]uint32 { s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13]) - return [16]uint32{ + state = [16]uint32{ s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11, s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15, s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3], s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7], } + return } // chainingValue returns the first 8 words of the compressed node. This is used diff --git a/gen/gen.go b/gen/gen.go index b020dd7..15efbd6 100644 --- a/gen/gen.go +++ b/gen/gen.go @@ -20,9 +20,26 @@ func main() { for i := range m { m[i] = uint32(i) } - for x := 1; x < 8; x++ { - fmt.Printf(`// round%d - + + fmt.Printf(` // round %d rather than init state and mix, do both. + // Mix the columns. + s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[%d], n.block[%d]) + s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[%d], n.block[%d]) + s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[%d], n.block[%d]) + s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[%d], n.block[%d]) + + // Mix the diagonals. + s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[%d], n.block[%d]) + s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[%d], n.block[%d]) + s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[%d], n.block[%d]) + s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[%d], n.block[%d]) + +`, 1, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15]) + + permute(&m) + + for x := 2; x < 8; x++ { + fmt.Printf(` // round %d // Mix the columns. s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[%d], n.block[%d]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[%d], n.block[%d]) @@ -34,6 +51,7 @@ func main() { s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[%d], n.block[%d]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[%d], n.block[%d]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[%d], n.block[%d]) + `, x, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15]) permute(&m)