From bb7ece4161ea910c6c385c17799b17d658075fcd Mon Sep 17 00:00:00 2001 From: lukechampine Date: Fri, 12 Nov 2021 23:09:50 -0500 Subject: [PATCH] upgrade to avo@v0.4.0 (AVX-512 support, woo!) --- avo/gen.go | 251 ++++++++++--------------------------------------- blake3_amd64.s | 10 +- 2 files changed, 56 insertions(+), 205 deletions(-) diff --git a/avo/gen.go b/avo/gen.go index c40c52a..fbdd110 100644 --- a/avo/gen.go +++ b/avo/gen.go @@ -1,3 +1,4 @@ +//go:build ignore // +build ignore package main @@ -6,7 +7,6 @@ import ( "fmt" . "github.com/mmcloughlin/avo/build" - "github.com/mmcloughlin/avo/ir" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" ) @@ -68,28 +68,28 @@ func genCompressBlocksAVX512() { var vs, mv [16]VecVirtual for i := range vs { vs[i], mv[i] = ZMM(), ZMM() - VPBROADCASTD_Z(block.Offset(i*4), mv[i]) + VPBROADCASTD(block.Offset(i*4), mv[i]) } Comment("Initialize state vectors") for i, v := range vs { switch i { case 0, 1, 2, 3, 4, 5, 6, 7: // cv - VPBROADCASTD_Z(cv.Offset(i*4), v) + VPBROADCASTD(cv.Offset(i*4), v) case 8, 9, 10, 11: // iv - VPBROADCASTD_Z(globals.iv.Offset((i-8)*4), v) + VPBROADCASTD(globals.iv.Offset((i-8)*4), v) case 12: // counter - VPBROADCASTD_Z(counter.Addr, vs[12]) - VPADDD_Z(globals.seq, vs[12], vs[12]) + VPBROADCASTD(counter.Addr, vs[12]) + VPADDD(globals.seq, vs[12], vs[12]) // set a 1 bit in K1 for each overflowed counter in vs[12] VPCMPUD(Imm(1), globals.seq, vs[12], K1) // add 1 to each counter in vs[13] for each 1 bit in K1 - VPBROADCASTD_Z(counter.Addr.Offset(1*4), vs[13]) - VPADDD_ZBK(globals.seq.Offset(4), vs[13], K1, vs[13]) + VPBROADCASTD(counter.Addr.Offset(1*4), vs[13]) + VPADDD_BCST(globals.seq.Offset(4), vs[13], K1, vs[13]) case 14: // blockLen - VPBROADCASTD_Z(blockLen.Addr, v) + VPBROADCASTD(blockLen.Addr, v) case 15: // flags - VPBROADCASTD_Z(flags.Addr, v) + VPBROADCASTD(flags.Addr, v) } } @@ -97,17 +97,17 @@ func genCompressBlocksAVX512() { Comment("Finalize CVs") for i, v := range vs[:8] { - VPXORD_Z(v, vs[i+8], v) + VPXORD(v, vs[i+8], v) } for i, v := range vs[8:] { - VPXORD_ZB(cv.Offset(i*4), v, v) + VPXORD_BCST(cv.Offset(i*4), v, v) } stride := ZMM() - VMOVDQU32_Z(globals.seq, stride) - VPSLLD_Z(Imm(6), stride, stride) // stride of 64 + VMOVDQU32(globals.seq, stride) + VPSLLD(Imm(6), stride, stride) // stride of 64 for i, v := range vs { KXNORD(K1, K1, K1) // fastest way to set all bits to 1 - VPSCATTERDD_Z(v, K1, out.Offset(i*4).Idx(stride, 1)) + VPSCATTERDD(v, K1, out.Offset(i*4).Idx(stride, 1)) } RET() @@ -129,24 +129,24 @@ func genCompressChunksAVX512() { Comment("Initialize counter") counterLo := AllocLocal(64) counterHi := AllocLocal(64) - VPBROADCASTD_Z(counter.Addr, vs[0]) - VPADDD_Z(globals.seq, vs[0], vs[0]) + VPBROADCASTD(counter.Addr, vs[0]) + VPADDD(globals.seq, vs[0], vs[0]) VPCMPUD(Imm(1), globals.seq, vs[0], K1) - VPBROADCASTD_Z(counter.Addr.Offset(4), vs[1]) - VPADDD_ZBK(globals.seq.Offset(4), vs[1], K1, vs[1]) - VMOVDQU32_Z(vs[0], counterLo) - VMOVDQU32_Z(vs[1], counterHi) + VPBROADCASTD(counter.Addr.Offset(4), vs[1]) + VPADDD_BCST(globals.seq.Offset(4), vs[1], K1, vs[1]) + VMOVDQU32(vs[0], counterLo) + VMOVDQU32(vs[1], counterHi) Comment("Initialize flags") chunkFlags := AllocLocal(16 * 4) - VPBROADCASTD_Z(flags.Addr, vs[0]) - VMOVDQU32_Z(vs[0], chunkFlags) + VPBROADCASTD(flags.Addr, vs[0]) + VMOVDQU32(vs[0], chunkFlags) ORL(Imm(1), chunkFlags.Offset(0*4)) ORL(Imm(2), chunkFlags.Offset(15*4)) Comment("Load key") for i := 0; i < 8; i++ { - VPBROADCASTD_Z(key.Offset(i*4), vs[i]) + VPBROADCASTD(key.Offset(i*4), vs[i]) } Comment("Loop index") @@ -155,29 +155,29 @@ func genCompressChunksAVX512() { Label("loop") Comment("Load transposed block") - VMOVDQU32_Z(globals.seq, vs[8]) - VPSLLD_Z(Imm(10), vs[8], vs[8]) // stride of 1024 + VMOVDQU32(globals.seq, vs[8]) + VPSLLD(Imm(10), vs[8], vs[8]) // stride of 1024 for i, m := range mv { KXNORD(K1, K1, K1) - VPGATHERDD_Z(buf.Offset(i*4).Idx(vs[8], 1), K1, m) + VPGATHERDD(buf.Offset(i*4).Idx(vs[8], 1), K1, m) } ADDQ(Imm(64), buf.Base) Comment("Reload state vectors (other than CVs)") for i := 0; i < 4; i++ { - VPBROADCASTD_Z(globals.iv.Offset(i*4), vs[8+i]) + VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i]) } - VMOVDQU32_Z(counterLo, vs[12]) - VMOVDQU32_Z(counterHi, vs[13]) - VPBROADCASTD_Z(globals.seq.Offset(4), vs[14]) - VPSLLD_Z(Imm(6), vs[14], vs[14]) // 64 - VPBROADCASTD_Z(chunkFlags.Idx(loop, 4), vs[15]) + VMOVDQU32(counterLo, vs[12]) + VMOVDQU32(counterHi, vs[13]) + VPBROADCASTD(globals.seq.Offset(4), vs[14]) + VPSLLD(Imm(6), vs[14], vs[14]) // 64 + VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) performRoundsAVX512(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { - VPXORD_Z(vs[i], vs[i+8], vs[i]) + VPXORD(vs[i], vs[i+8], vs[i]) } Comment("Loop") @@ -186,11 +186,11 @@ func genCompressChunksAVX512() { JNE(LabelRef("loop")) Comment("Finished; transpose CVs") - VMOVDQU32_Z(globals.seq, vs[8]) - VPSLLD_Z(Imm(5), vs[8], vs[8]) // stride of 32 + VMOVDQU32(globals.seq, vs[8]) + VPSLLD(Imm(5), vs[8], vs[8]) // stride of 32 for i, v := range vs[:8] { KXNORD(K1, K1, K1) // fastest way to set all bits to 1 - VPSCATTERDD_Z(v, K1, cvs.Offset(i*4).Idx(vs[8], 1)) + VPSCATTERDD(v, K1, cvs.Offset(i*4).Idx(vs[8], 1)) } RET() @@ -198,20 +198,20 @@ func genCompressChunksAVX512() { func performRoundsAVX512(vs, mv [16]VecVirtual) { g := func(a, b, c, d, mx, my VecVirtual) { - VPADDD_Z(a, b, a) - VPADDD_Z(mx, a, a) - VPXORD_Z(d, a, d) - VPRORD_Z(Imm(16), d, d) - VPADDD_Z(c, d, c) - VPXORD_Z(b, c, b) - VPRORD_Z(Imm(12), b, b) - VPADDD_Z(a, b, a) - VPADDD_Z(my, a, a) - VPXORD_Z(d, a, d) - VPRORD_Z(Imm(8), d, d) - VPADDD_Z(c, d, c) - VPXORD_Z(b, c, b) - VPRORD_Z(Imm(7), b, b) + VPADDD(a, b, a) + VPADDD(mx, a, a) + VPXORD(d, a, d) + VPRORD(Imm(16), d, d) + VPADDD(c, d, c) + VPXORD(b, c, b) + VPRORD(Imm(12), b, b) + VPADDD(a, b, a) + VPADDD(my, a, a) + VPXORD(d, a, d) + VPRORD(Imm(8), d, d) + VPADDD(c, d, c) + VPXORD(b, c, b) + VPRORD(Imm(7), b, b) } for i := 0; i < 7; i++ { @@ -534,152 +534,3 @@ func transpose(src, dst []VecVirtual) { VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) } } - -// AVX-512 is not currently supported by avo, so we need to manually define the -// instructions we need - -type maskReg = LabelRef // hack; avo doesn't allow custom Op types - -const K0 maskReg = "K0" -const K1 maskReg = "K1" -const K2 maskReg = "K2" - -func VMOVDQU32_Z(src, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VMOVDQU32", - Operands: []Op{src, dst}, - Inputs: []Op{src}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPBROADCASTD_Z(src, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPBROADCASTD", - Operands: []Op{src, dst}, - Inputs: []Op{src}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPGATHERDD_Z(src, mask, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPGATHERDD", - Operands: []Op{src, mask, dst}, - Inputs: []Op{src, mask}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPSCATTERDD_Z(src, mask, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPSCATTERDD", - Operands: []Op{src, mask, dst}, - Inputs: []Op{src, mask}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPORD_Z(x, y, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPORD", - Operands: []Op{x, y, dst}, - Inputs: []Op{x, y}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPXORD_Z(x, y, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPXORD", - Operands: []Op{x, y, dst}, - Inputs: []Op{x, y}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPXORD_ZB(x, y, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPXORD.BCST", - Operands: []Op{x, y, dst}, - Inputs: []Op{x, y}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPRORD_Z(n, src, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPRORD", - Operands: []Op{n, src, dst}, - Inputs: []Op{n, src}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPSLLD_Z(n, src, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPSLLD", - Operands: []Op{n, src, dst}, - Inputs: []Op{n, src}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPADDD_Z(x, y, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPADDD", - Operands: []Op{x, y, dst}, - Inputs: []Op{x, y}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPADDD_ZB(x, y, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPADDD.BCST", - Operands: []Op{x, y, dst}, - Inputs: []Op{x, y}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPADDD_ZBK(x, y, mask, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPADDD.BCST", - Operands: []Op{x, y, mask, dst}, - Inputs: []Op{x, y, mask}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func KXNORD(x, y, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "KXNORD", - Operands: []Op{x, y, dst}, - Inputs: []Op{x, y}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} - -func VPCMPUD(pred, x, y, dst Op) { - Instruction(&ir.Instruction{ - Opcode: "VPCMPUD", - Operands: []Op{pred, x, y, dst}, - Inputs: []Op{pred, x, y}, - Outputs: []Op{dst}, - ISA: []string{"AVX512F"}, - }) -} diff --git a/blake3_amd64.s b/blake3_amd64.s index 4a6d93b..ca38ca6 100644 --- a/blake3_amd64.s +++ b/blake3_amd64.s @@ -57,7 +57,7 @@ DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32 // func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) -// Requires: AVX512F +// Requires: AVX512BW, AVX512F TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40 MOVQ out+0(FP), AX MOVQ block+8(FP), CX @@ -954,8 +954,8 @@ TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40 RET // func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32) -// Requires: AVX512F -TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-40 +// Requires: AVX512BW, AVX512F +TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36 MOVQ cvs+0(FP), AX MOVQ buf+8(FP), CX MOVQ key+16(FP), DX @@ -3127,7 +3127,7 @@ TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40 // func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) // Requires: AVX, AVX2 -TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40 +TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36 MOVQ cvs+0(FP), AX MOVQ buf+8(FP), CX MOVQ key+16(FP), DX @@ -4363,7 +4363,7 @@ loop: // func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32) // Requires: AVX, AVX2 -TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-32 +TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28 MOVQ parents+0(FP), AX MOVQ cvs+8(FP), CX MOVQ key+16(FP), DX