upgrade to avo@v0.4.0 (AVX-512 support, woo!)

This commit is contained in:
lukechampine 2021-11-12 23:09:50 -05:00
parent 09d3897aaa
commit bb7ece4161
2 changed files with 56 additions and 205 deletions

View File

@ -1,3 +1,4 @@
//go:build ignore
// +build ignore
package main
@ -6,7 +7,6 @@ import (
"fmt"
. "github.com/mmcloughlin/avo/build"
"github.com/mmcloughlin/avo/ir"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
)
@ -68,28 +68,28 @@ func genCompressBlocksAVX512() {
var vs, mv [16]VecVirtual
for i := range vs {
vs[i], mv[i] = ZMM(), ZMM()
VPBROADCASTD_Z(block.Offset(i*4), mv[i])
VPBROADCASTD(block.Offset(i*4), mv[i])
}
Comment("Initialize state vectors")
for i, v := range vs {
switch i {
case 0, 1, 2, 3, 4, 5, 6, 7: // cv
VPBROADCASTD_Z(cv.Offset(i*4), v)
VPBROADCASTD(cv.Offset(i*4), v)
case 8, 9, 10, 11: // iv
VPBROADCASTD_Z(globals.iv.Offset((i-8)*4), v)
VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
case 12: // counter
VPBROADCASTD_Z(counter.Addr, vs[12])
VPADDD_Z(globals.seq, vs[12], vs[12])
VPBROADCASTD(counter.Addr, vs[12])
VPADDD(globals.seq, vs[12], vs[12])
// set a 1 bit in K1 for each overflowed counter in vs[12]
VPCMPUD(Imm(1), globals.seq, vs[12], K1)
// add 1 to each counter in vs[13] for each 1 bit in K1
VPBROADCASTD_Z(counter.Addr.Offset(1*4), vs[13])
VPADDD_ZBK(globals.seq.Offset(4), vs[13], K1, vs[13])
VPBROADCASTD(counter.Addr.Offset(1*4), vs[13])
VPADDD_BCST(globals.seq.Offset(4), vs[13], K1, vs[13])
case 14: // blockLen
VPBROADCASTD_Z(blockLen.Addr, v)
VPBROADCASTD(blockLen.Addr, v)
case 15: // flags
VPBROADCASTD_Z(flags.Addr, v)
VPBROADCASTD(flags.Addr, v)
}
}
@ -97,17 +97,17 @@ func genCompressBlocksAVX512() {
Comment("Finalize CVs")
for i, v := range vs[:8] {
VPXORD_Z(v, vs[i+8], v)
VPXORD(v, vs[i+8], v)
}
for i, v := range vs[8:] {
VPXORD_ZB(cv.Offset(i*4), v, v)
VPXORD_BCST(cv.Offset(i*4), v, v)
}
stride := ZMM()
VMOVDQU32_Z(globals.seq, stride)
VPSLLD_Z(Imm(6), stride, stride) // stride of 64
VMOVDQU32(globals.seq, stride)
VPSLLD(Imm(6), stride, stride) // stride of 64
for i, v := range vs {
KXNORD(K1, K1, K1) // fastest way to set all bits to 1
VPSCATTERDD_Z(v, K1, out.Offset(i*4).Idx(stride, 1))
VPSCATTERDD(v, K1, out.Offset(i*4).Idx(stride, 1))
}
RET()
@ -129,24 +129,24 @@ func genCompressChunksAVX512() {
Comment("Initialize counter")
counterLo := AllocLocal(64)
counterHi := AllocLocal(64)
VPBROADCASTD_Z(counter.Addr, vs[0])
VPADDD_Z(globals.seq, vs[0], vs[0])
VPBROADCASTD(counter.Addr, vs[0])
VPADDD(globals.seq, vs[0], vs[0])
VPCMPUD(Imm(1), globals.seq, vs[0], K1)
VPBROADCASTD_Z(counter.Addr.Offset(4), vs[1])
VPADDD_ZBK(globals.seq.Offset(4), vs[1], K1, vs[1])
VMOVDQU32_Z(vs[0], counterLo)
VMOVDQU32_Z(vs[1], counterHi)
VPBROADCASTD(counter.Addr.Offset(4), vs[1])
VPADDD_BCST(globals.seq.Offset(4), vs[1], K1, vs[1])
VMOVDQU32(vs[0], counterLo)
VMOVDQU32(vs[1], counterHi)
Comment("Initialize flags")
chunkFlags := AllocLocal(16 * 4)
VPBROADCASTD_Z(flags.Addr, vs[0])
VMOVDQU32_Z(vs[0], chunkFlags)
VPBROADCASTD(flags.Addr, vs[0])
VMOVDQU32(vs[0], chunkFlags)
ORL(Imm(1), chunkFlags.Offset(0*4))
ORL(Imm(2), chunkFlags.Offset(15*4))
Comment("Load key")
for i := 0; i < 8; i++ {
VPBROADCASTD_Z(key.Offset(i*4), vs[i])
VPBROADCASTD(key.Offset(i*4), vs[i])
}
Comment("Loop index")
@ -155,29 +155,29 @@ func genCompressChunksAVX512() {
Label("loop")
Comment("Load transposed block")
VMOVDQU32_Z(globals.seq, vs[8])
VPSLLD_Z(Imm(10), vs[8], vs[8]) // stride of 1024
VMOVDQU32(globals.seq, vs[8])
VPSLLD(Imm(10), vs[8], vs[8]) // stride of 1024
for i, m := range mv {
KXNORD(K1, K1, K1)
VPGATHERDD_Z(buf.Offset(i*4).Idx(vs[8], 1), K1, m)
VPGATHERDD(buf.Offset(i*4).Idx(vs[8], 1), K1, m)
}
ADDQ(Imm(64), buf.Base)
Comment("Reload state vectors (other than CVs)")
for i := 0; i < 4; i++ {
VPBROADCASTD_Z(globals.iv.Offset(i*4), vs[8+i])
VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i])
}
VMOVDQU32_Z(counterLo, vs[12])
VMOVDQU32_Z(counterHi, vs[13])
VPBROADCASTD_Z(globals.seq.Offset(4), vs[14])
VPSLLD_Z(Imm(6), vs[14], vs[14]) // 64
VPBROADCASTD_Z(chunkFlags.Idx(loop, 4), vs[15])
VMOVDQU32(counterLo, vs[12])
VMOVDQU32(counterHi, vs[13])
VPBROADCASTD(globals.seq.Offset(4), vs[14])
VPSLLD(Imm(6), vs[14], vs[14]) // 64
VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])
performRoundsAVX512(vs, mv)
Comment("Finalize CVs")
for i := range vs[:8] {
VPXORD_Z(vs[i], vs[i+8], vs[i])
VPXORD(vs[i], vs[i+8], vs[i])
}
Comment("Loop")
@ -186,11 +186,11 @@ func genCompressChunksAVX512() {
JNE(LabelRef("loop"))
Comment("Finished; transpose CVs")
VMOVDQU32_Z(globals.seq, vs[8])
VPSLLD_Z(Imm(5), vs[8], vs[8]) // stride of 32
VMOVDQU32(globals.seq, vs[8])
VPSLLD(Imm(5), vs[8], vs[8]) // stride of 32
for i, v := range vs[:8] {
KXNORD(K1, K1, K1) // fastest way to set all bits to 1
VPSCATTERDD_Z(v, K1, cvs.Offset(i*4).Idx(vs[8], 1))
VPSCATTERDD(v, K1, cvs.Offset(i*4).Idx(vs[8], 1))
}
RET()
@ -198,20 +198,20 @@ func genCompressChunksAVX512() {
func performRoundsAVX512(vs, mv [16]VecVirtual) {
g := func(a, b, c, d, mx, my VecVirtual) {
VPADDD_Z(a, b, a)
VPADDD_Z(mx, a, a)
VPXORD_Z(d, a, d)
VPRORD_Z(Imm(16), d, d)
VPADDD_Z(c, d, c)
VPXORD_Z(b, c, b)
VPRORD_Z(Imm(12), b, b)
VPADDD_Z(a, b, a)
VPADDD_Z(my, a, a)
VPXORD_Z(d, a, d)
VPRORD_Z(Imm(8), d, d)
VPADDD_Z(c, d, c)
VPXORD_Z(b, c, b)
VPRORD_Z(Imm(7), b, b)
VPADDD(a, b, a)
VPADDD(mx, a, a)
VPXORD(d, a, d)
VPRORD(Imm(16), d, d)
VPADDD(c, d, c)
VPXORD(b, c, b)
VPRORD(Imm(12), b, b)
VPADDD(a, b, a)
VPADDD(my, a, a)
VPXORD(d, a, d)
VPRORD(Imm(8), d, d)
VPADDD(c, d, c)
VPXORD(b, c, b)
VPRORD(Imm(7), b, b)
}
for i := 0; i < 7; i++ {
@ -534,152 +534,3 @@ func transpose(src, dst []VecVirtual) {
VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
}
}
// avo does not currently support AVX-512, so the instructions this generator
// needs are defined by hand below.

// maskReg stands in for an AVX-512 mask register operand. It is an alias of
// LabelRef as a workaround, because avo does not allow custom Op types.
type maskReg = LabelRef

// Mask register operands, each spelled as the register's name.
const (
	K0 maskReg = "K0"
	K1 maskReg = "K1"
	K2 maskReg = "K2"
)
// VMOVDQU32_Z builds a VMOVDQU32 instruction with operands (src, dst) and
// hands it to Instruction. src is recorded as the only input and dst as the
// only output; the instruction is tagged as requiring AVX512F.
func VMOVDQU32_Z(src, dst Op) {
	inst := &ir.Instruction{Opcode: "VMOVDQU32", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{src, dst}
	inst.Inputs = []Op{src}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPBROADCASTD_Z builds a VPBROADCASTD instruction with operands (src, dst)
// and hands it to Instruction. src is recorded as the only input and dst as
// the only output; the instruction is tagged as requiring AVX512F.
func VPBROADCASTD_Z(src, dst Op) {
	inst := &ir.Instruction{Opcode: "VPBROADCASTD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{src, dst}
	inst.Inputs = []Op{src}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPGATHERDD_Z builds a VPGATHERDD instruction with operands
// (src, mask, dst) and hands it to Instruction. src and mask are recorded as
// inputs and dst as the only output; the instruction is tagged as requiring
// AVX512F.
//
// NOTE(review): mask is declared as an input only. Whether the instruction
// also modifies the mask register is not modeled here — confirm against the
// instruction reference before relying on the mask's value afterwards.
func VPGATHERDD_Z(src, mask, dst Op) {
	inst := &ir.Instruction{Opcode: "VPGATHERDD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{src, mask, dst}
	inst.Inputs = []Op{src, mask}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPSCATTERDD_Z builds a VPSCATTERDD instruction with operands
// (src, mask, dst) and hands it to Instruction. src and mask are recorded as
// inputs and dst as the only output; the instruction is tagged as requiring
// AVX512F.
//
// NOTE(review): mask is declared as an input only. Whether the instruction
// also modifies the mask register is not modeled here — confirm against the
// instruction reference before relying on the mask's value afterwards.
func VPSCATTERDD_Z(src, mask, dst Op) {
	inst := &ir.Instruction{Opcode: "VPSCATTERDD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{src, mask, dst}
	inst.Inputs = []Op{src, mask}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPORD_Z builds a VPORD instruction with operands (x, y, dst) and hands it
// to Instruction. x and y are recorded as inputs and dst as the only output;
// the instruction is tagged as requiring AVX512F.
func VPORD_Z(x, y, dst Op) {
	inst := &ir.Instruction{Opcode: "VPORD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{x, y, dst}
	inst.Inputs = []Op{x, y}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPXORD_Z builds a VPXORD instruction with operands (x, y, dst) and hands it
// to Instruction. x and y are recorded as inputs and dst as the only output;
// the instruction is tagged as requiring AVX512F.
func VPXORD_Z(x, y, dst Op) {
	inst := &ir.Instruction{Opcode: "VPXORD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{x, y, dst}
	inst.Inputs = []Op{x, y}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPXORD_ZB builds an instruction with the ".BCST"-suffixed opcode
// "VPXORD.BCST" and operands (x, y, dst), then hands it to Instruction. x and
// y are recorded as inputs and dst as the only output; the instruction is
// tagged as requiring AVX512F.
func VPXORD_ZB(x, y, dst Op) {
	inst := &ir.Instruction{Opcode: "VPXORD.BCST", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{x, y, dst}
	inst.Inputs = []Op{x, y}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPRORD_Z builds a VPRORD instruction with operands (n, src, dst) and hands
// it to Instruction. n and src are recorded as inputs and dst as the only
// output; the instruction is tagged as requiring AVX512F.
func VPRORD_Z(n, src, dst Op) {
	inst := &ir.Instruction{Opcode: "VPRORD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{n, src, dst}
	inst.Inputs = []Op{n, src}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPSLLD_Z builds a VPSLLD instruction with operands (n, src, dst) and hands
// it to Instruction. n and src are recorded as inputs and dst as the only
// output; the instruction is tagged as requiring AVX512F.
func VPSLLD_Z(n, src, dst Op) {
	inst := &ir.Instruction{Opcode: "VPSLLD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{n, src, dst}
	inst.Inputs = []Op{n, src}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPADDD_Z builds a VPADDD instruction with operands (x, y, dst) and hands it
// to Instruction. x and y are recorded as inputs and dst as the only output;
// the instruction is tagged as requiring AVX512F.
func VPADDD_Z(x, y, dst Op) {
	inst := &ir.Instruction{Opcode: "VPADDD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{x, y, dst}
	inst.Inputs = []Op{x, y}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPADDD_ZB builds an instruction with the ".BCST"-suffixed opcode
// "VPADDD.BCST" and operands (x, y, dst), then hands it to Instruction. x and
// y are recorded as inputs and dst as the only output; the instruction is
// tagged as requiring AVX512F.
func VPADDD_ZB(x, y, dst Op) {
	inst := &ir.Instruction{Opcode: "VPADDD.BCST", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{x, y, dst}
	inst.Inputs = []Op{x, y}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPADDD_ZBK is the masked counterpart of the "VPADDD.BCST" helper: it builds
// an instruction with that opcode and operands (x, y, mask, dst), then hands
// it to Instruction. x, y and mask are recorded as inputs and dst as the only
// output; the instruction is tagged as requiring AVX512F.
func VPADDD_ZBK(x, y, mask, dst Op) {
	inst := &ir.Instruction{Opcode: "VPADDD.BCST", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{x, y, mask, dst}
	inst.Inputs = []Op{x, y, mask}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// KXNORD builds a KXNORD instruction with operands (x, y, dst) and hands it
// to Instruction. x and y are recorded as inputs and dst as the only output;
// the instruction is tagged as requiring AVX512F.
func KXNORD(x, y, dst Op) {
	inst := &ir.Instruction{Opcode: "KXNORD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{x, y, dst}
	inst.Inputs = []Op{x, y}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}
// VPCMPUD builds a VPCMPUD instruction with operands (pred, x, y, dst) and
// hands it to Instruction. pred, x and y are recorded as inputs and dst as
// the only output; the instruction is tagged as requiring AVX512F.
func VPCMPUD(pred, x, y, dst Op) {
	inst := &ir.Instruction{Opcode: "VPCMPUD", ISA: []string{"AVX512F"}}
	inst.Operands = []Op{pred, x, y, dst}
	inst.Inputs = []Op{pred, x, y}
	inst.Outputs = []Op{dst}
	Instruction(inst)
}

View File

@ -57,7 +57,7 @@ DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32
// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX512F
// Requires: AVX512BW, AVX512F
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
MOVQ out+0(FP), AX
MOVQ block+8(FP), CX
@ -954,8 +954,8 @@ TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
RET
// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX512F
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-40
// Requires: AVX512BW, AVX512F
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36
MOVQ cvs+0(FP), AX
MOVQ buf+8(FP), CX
MOVQ key+16(FP), DX
@ -3127,7 +3127,7 @@ TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36
MOVQ cvs+0(FP), AX
MOVQ buf+8(FP), CX
MOVQ key+16(FP), DX
@ -4363,7 +4363,7 @@ loop:
// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-32
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28
MOVQ parents+0(FP), AX
MOVQ cvs+8(FP), CX
MOVQ key+16(FP), DX