Merge pull request #1 from renthraysk/alternate-g-inline

Change g() to take and return its operands by value instead of through a pointer to the state array; this allows g() to be inlined and keeps more of compress()'s state in registers.
This commit is contained in:
renthraysk 2020-01-12 22:16:27 +00:00 committed by GitHub
commit dc64d4b5a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 135 additions and 196 deletions

215
blake3.go
View File

@ -45,19 +45,16 @@ func wordsToBytes(words []uint32, bytes []byte) {
}
}
// The g function, split into two parts so that the compiler will inline it.
// gx performs the first half of the mixing step on the four words of state
// selected by the indices a, b, c and d, updating state in place. The message
// word mx is added into state[a] along with state[b]; state[d] and state[b]
// are then each XORed with the freshly updated neighbouring word and rotated.
// The body is kept this short on purpose so that the compiler will inline it.
func gx(state *[16]uint32, a, b, c, d int, mx uint32) {
state[a] += state[b] + mx
// A negative count makes RotateLeft32 rotate right (assuming bits is
// math/bits), so this rotates right by 16 bits.
state[d] = bits.RotateLeft32(state[d]^state[a], -16)
state[c] += state[d]
// Rotate right by 12 bits.
state[b] = bits.RotateLeft32(state[b]^state[c], -12)
}
func gy(state *[16]uint32, a, b, c, d int, my uint32) {
state[a] += state[b] + my
state[d] = bits.RotateLeft32(state[d]^state[a], -8)
state[c] += state[d]
state[b] = bits.RotateLeft32(state[b]^state[c], -7)
// g mixes the four state words a, b, c and d with the two message words mx
// and my, and returns the updated words in the order (a, b, c, d). All
// arithmetic is on uint32 and therefore wraps on overflow.
//
// The operands are taken and returned by value rather than through a pointer
// into the state array; this is intended to let the compiler inline g and keep
// the caller's working state in registers.
func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
// First half: fold in mx. A negative count makes RotateLeft32 rotate right
// (assuming bits is math/bits), so these are right rotations by 16 and 12.
a += b + mx
d = bits.RotateLeft32(d^a, -16)
c += d
b = bits.RotateLeft32(b^c, -12)
// Second half: fold in my, with right rotations by 8 and 7.
a += b + my
d = bits.RotateLeft32(d^a, -8)
c += d
b = bits.RotateLeft32(b^c, -7)
return a, b, c, d
}
// A node represents a chunk or parent in the BLAKE3 Merkle tree. In BLAKE3
@ -84,170 +81,102 @@ type node struct {
// used. When the root node is being used to generate output, the full 16 words
// are used.
func (n node) compress() (state [16]uint32) {
state = [16]uint32{
n.cv[0], n.cv[1], n.cv[2], n.cv[3],
n.cv[4], n.cv[5], n.cv[6], n.cv[7],
iv[0], iv[1], iv[2], iv[3],
uint32(n.counter), uint32(n.counter >> 32), n.blockLen, n.flags,
}
// round1
// round1 rather than init s and mix, do both.
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[0])
gy(&state, 0, 4, 8, 12, n.block[1])
gx(&state, 1, 5, 9, 13, n.block[2])
gy(&state, 1, 5, 9, 13, n.block[3])
gx(&state, 2, 6, 10, 14, n.block[4])
gy(&state, 2, 6, 10, 14, n.block[5])
gx(&state, 3, 7, 11, 15, n.block[6])
gy(&state, 3, 7, 11, 15, n.block[7])
s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1])
s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3])
s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5])
s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[8])
gy(&state, 0, 5, 10, 15, n.block[9])
gx(&state, 1, 6, 11, 12, n.block[10])
gy(&state, 1, 6, 11, 12, n.block[11])
gx(&state, 2, 7, 8, 13, n.block[12])
gy(&state, 2, 7, 8, 13, n.block[13])
gx(&state, 3, 4, 9, 14, n.block[14])
gy(&state, 3, 4, 9, 14, n.block[15])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15])
// round 2
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[2])
gy(&state, 0, 4, 8, 12, n.block[6])
gx(&state, 1, 5, 9, 13, n.block[3])
gy(&state, 1, 5, 9, 13, n.block[10])
gx(&state, 2, 6, 10, 14, n.block[7])
gy(&state, 2, 6, 10, 14, n.block[0])
gx(&state, 3, 7, 11, 15, n.block[4])
gy(&state, 3, 7, 11, 15, n.block[13])
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6])
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10])
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0])
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[1])
gy(&state, 0, 5, 10, 15, n.block[11])
gx(&state, 1, 6, 11, 12, n.block[12])
gy(&state, 1, 6, 11, 12, n.block[5])
gx(&state, 2, 7, 8, 13, n.block[9])
gy(&state, 2, 7, 8, 13, n.block[14])
gx(&state, 3, 4, 9, 14, n.block[15])
gy(&state, 3, 4, 9, 14, n.block[8])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8])
// round 3
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[3])
gy(&state, 0, 4, 8, 12, n.block[4])
gx(&state, 1, 5, 9, 13, n.block[10])
gy(&state, 1, 5, 9, 13, n.block[12])
gx(&state, 2, 6, 10, 14, n.block[13])
gy(&state, 2, 6, 10, 14, n.block[2])
gx(&state, 3, 7, 11, 15, n.block[7])
gy(&state, 3, 7, 11, 15, n.block[14])
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4])
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12])
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2])
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[6])
gy(&state, 0, 5, 10, 15, n.block[5])
gx(&state, 1, 6, 11, 12, n.block[9])
gy(&state, 1, 6, 11, 12, n.block[0])
gx(&state, 2, 7, 8, 13, n.block[11])
gy(&state, 2, 7, 8, 13, n.block[15])
gx(&state, 3, 4, 9, 14, n.block[8])
gy(&state, 3, 4, 9, 14, n.block[1])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1])
// round 4
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[10])
gy(&state, 0, 4, 8, 12, n.block[7])
gx(&state, 1, 5, 9, 13, n.block[12])
gy(&state, 1, 5, 9, 13, n.block[9])
gx(&state, 2, 6, 10, 14, n.block[14])
gy(&state, 2, 6, 10, 14, n.block[3])
gx(&state, 3, 7, 11, 15, n.block[13])
gy(&state, 3, 7, 11, 15, n.block[15])
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7])
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9])
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3])
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[4])
gy(&state, 0, 5, 10, 15, n.block[0])
gx(&state, 1, 6, 11, 12, n.block[11])
gy(&state, 1, 6, 11, 12, n.block[2])
gx(&state, 2, 7, 8, 13, n.block[5])
gy(&state, 2, 7, 8, 13, n.block[8])
gx(&state, 3, 4, 9, 14, n.block[1])
gy(&state, 3, 4, 9, 14, n.block[6])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6])
// round 5
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[12])
gy(&state, 0, 4, 8, 12, n.block[13])
gx(&state, 1, 5, 9, 13, n.block[9])
gy(&state, 1, 5, 9, 13, n.block[11])
gx(&state, 2, 6, 10, 14, n.block[15])
gy(&state, 2, 6, 10, 14, n.block[10])
gx(&state, 3, 7, 11, 15, n.block[14])
gy(&state, 3, 7, 11, 15, n.block[8])
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13])
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11])
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10])
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[7])
gy(&state, 0, 5, 10, 15, n.block[2])
gx(&state, 1, 6, 11, 12, n.block[5])
gy(&state, 1, 6, 11, 12, n.block[3])
gx(&state, 2, 7, 8, 13, n.block[0])
gy(&state, 2, 7, 8, 13, n.block[1])
gx(&state, 3, 4, 9, 14, n.block[6])
gy(&state, 3, 4, 9, 14, n.block[4])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4])
// round 6
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[9])
gy(&state, 0, 4, 8, 12, n.block[14])
gx(&state, 1, 5, 9, 13, n.block[11])
gy(&state, 1, 5, 9, 13, n.block[5])
gx(&state, 2, 6, 10, 14, n.block[8])
gy(&state, 2, 6, 10, 14, n.block[12])
gx(&state, 3, 7, 11, 15, n.block[15])
gy(&state, 3, 7, 11, 15, n.block[1])
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14])
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5])
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12])
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[13])
gy(&state, 0, 5, 10, 15, n.block[3])
gx(&state, 1, 6, 11, 12, n.block[0])
gy(&state, 1, 6, 11, 12, n.block[10])
gx(&state, 2, 7, 8, 13, n.block[2])
gy(&state, 2, 7, 8, 13, n.block[6])
gx(&state, 3, 4, 9, 14, n.block[4])
gy(&state, 3, 4, 9, 14, n.block[7])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7])
// round 7
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[11])
gy(&state, 0, 4, 8, 12, n.block[15])
gx(&state, 1, 5, 9, 13, n.block[5])
gy(&state, 1, 5, 9, 13, n.block[0])
gx(&state, 2, 6, 10, 14, n.block[1])
gy(&state, 2, 6, 10, 14, n.block[9])
gx(&state, 3, 7, 11, 15, n.block[8])
gy(&state, 3, 7, 11, 15, n.block[6])
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15])
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0])
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9])
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[14])
gy(&state, 0, 5, 10, 15, n.block[10])
gx(&state, 1, 6, 11, 12, n.block[2])
gy(&state, 1, 6, 11, 12, n.block[12])
gx(&state, 2, 7, 8, 13, n.block[3])
gy(&state, 2, 7, 8, 13, n.block[4])
gx(&state, 3, 4, 9, 14, n.block[7])
gy(&state, 3, 4, 9, 14, n.block[13])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13])
for i := range n.cv {
state[i] ^= state[i+8]
state[i+8] ^= n.cv[i]
state = [16]uint32{
s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11,
s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15,
s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3],
s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7],
}
return
}

View File

@ -20,30 +20,40 @@ func main() {
for i := range m {
m[i] = uint32(i)
}
for x := 1; x < 8; x++ {
fmt.Printf(`// round%d
fmt.Printf(` // round %d rather than init state and mix, do both.
// Mix the columns.
gx(&state, 0, 4, 8, 12, n.block[%d])
gy(&state, 0, 4, 8, 12, n.block[%d])
gx(&state, 1, 5, 9, 13, n.block[%d])
gy(&state, 1, 5, 9, 13, n.block[%d])
gx(&state, 2, 6, 10, 14, n.block[%d])
gy(&state, 2, 6, 10, 14, n.block[%d])
gx(&state, 3, 7, 11, 15, n.block[%d])
gy(&state, 3, 7, 11, 15, n.block[%d])
s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[%d], n.block[%d])
s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[%d], n.block[%d])
s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[%d], n.block[%d])
s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[%d], n.block[%d])
// Mix the diagonals.
gx(&state, 0, 5, 10, 15, n.block[%d])
gy(&state, 0, 5, 10, 15, n.block[%d])
gx(&state, 1, 6, 11, 12, n.block[%d])
gy(&state, 1, 6, 11, 12, n.block[%d])
gx(&state, 2, 7, 8, 13, n.block[%d])
gy(&state, 2, 7, 8, 13, n.block[%d])
gx(&state, 3, 4, 9, 14, n.block[%d])
gy(&state, 3, 4, 9, 14, n.block[%d])
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[%d], n.block[%d])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[%d], n.block[%d])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[%d], n.block[%d])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[%d], n.block[%d])
`, x, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15])
`, 1, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15])
permute(&m)
for x := 2; x < 8; x++ {
fmt.Printf(` // round %d
// Mix the columns.
s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[%d], n.block[%d])
s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[%d], n.block[%d])
s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[%d], n.block[%d])
s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[%d], n.block[%d])
// Mix the diagonals.
s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[%d], n.block[%d])
s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[%d], n.block[%d])
s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[%d], n.block[%d])
s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[%d], n.block[%d])
`, x,
m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15])
permute(&m)
}
}