upgrade to avo@v0.4.0 (AVX-512 support, woo!)
This commit is contained in:
parent
09d3897aaa
commit
bb7ece4161
251
avo/gen.go
251
avo/gen.go
|
@ -1,3 +1,4 @@
|
||||||
|
//go:build ignore
|
||||||
// +build ignore
|
// +build ignore
|
||||||
|
|
||||||
package main
|
package main
|
||||||
|
@ -6,7 +7,6 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
. "github.com/mmcloughlin/avo/build"
|
. "github.com/mmcloughlin/avo/build"
|
||||||
"github.com/mmcloughlin/avo/ir"
|
|
||||||
. "github.com/mmcloughlin/avo/operand"
|
. "github.com/mmcloughlin/avo/operand"
|
||||||
. "github.com/mmcloughlin/avo/reg"
|
. "github.com/mmcloughlin/avo/reg"
|
||||||
)
|
)
|
||||||
|
@ -68,28 +68,28 @@ func genCompressBlocksAVX512() {
|
||||||
var vs, mv [16]VecVirtual
|
var vs, mv [16]VecVirtual
|
||||||
for i := range vs {
|
for i := range vs {
|
||||||
vs[i], mv[i] = ZMM(), ZMM()
|
vs[i], mv[i] = ZMM(), ZMM()
|
||||||
VPBROADCASTD_Z(block.Offset(i*4), mv[i])
|
VPBROADCASTD(block.Offset(i*4), mv[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
Comment("Initialize state vectors")
|
Comment("Initialize state vectors")
|
||||||
for i, v := range vs {
|
for i, v := range vs {
|
||||||
switch i {
|
switch i {
|
||||||
case 0, 1, 2, 3, 4, 5, 6, 7: // cv
|
case 0, 1, 2, 3, 4, 5, 6, 7: // cv
|
||||||
VPBROADCASTD_Z(cv.Offset(i*4), v)
|
VPBROADCASTD(cv.Offset(i*4), v)
|
||||||
case 8, 9, 10, 11: // iv
|
case 8, 9, 10, 11: // iv
|
||||||
VPBROADCASTD_Z(globals.iv.Offset((i-8)*4), v)
|
VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
|
||||||
case 12: // counter
|
case 12: // counter
|
||||||
VPBROADCASTD_Z(counter.Addr, vs[12])
|
VPBROADCASTD(counter.Addr, vs[12])
|
||||||
VPADDD_Z(globals.seq, vs[12], vs[12])
|
VPADDD(globals.seq, vs[12], vs[12])
|
||||||
// set a 1 bit in K1 for each overflowed counter in vs[12]
|
// set a 1 bit in K1 for each overflowed counter in vs[12]
|
||||||
VPCMPUD(Imm(1), globals.seq, vs[12], K1)
|
VPCMPUD(Imm(1), globals.seq, vs[12], K1)
|
||||||
// add 1 to each counter in vs[13] for each 1 bit in K1
|
// add 1 to each counter in vs[13] for each 1 bit in K1
|
||||||
VPBROADCASTD_Z(counter.Addr.Offset(1*4), vs[13])
|
VPBROADCASTD(counter.Addr.Offset(1*4), vs[13])
|
||||||
VPADDD_ZBK(globals.seq.Offset(4), vs[13], K1, vs[13])
|
VPADDD_BCST(globals.seq.Offset(4), vs[13], K1, vs[13])
|
||||||
case 14: // blockLen
|
case 14: // blockLen
|
||||||
VPBROADCASTD_Z(blockLen.Addr, v)
|
VPBROADCASTD(blockLen.Addr, v)
|
||||||
case 15: // flags
|
case 15: // flags
|
||||||
VPBROADCASTD_Z(flags.Addr, v)
|
VPBROADCASTD(flags.Addr, v)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,17 +97,17 @@ func genCompressBlocksAVX512() {
|
||||||
|
|
||||||
Comment("Finalize CVs")
|
Comment("Finalize CVs")
|
||||||
for i, v := range vs[:8] {
|
for i, v := range vs[:8] {
|
||||||
VPXORD_Z(v, vs[i+8], v)
|
VPXORD(v, vs[i+8], v)
|
||||||
}
|
}
|
||||||
for i, v := range vs[8:] {
|
for i, v := range vs[8:] {
|
||||||
VPXORD_ZB(cv.Offset(i*4), v, v)
|
VPXORD_BCST(cv.Offset(i*4), v, v)
|
||||||
}
|
}
|
||||||
stride := ZMM()
|
stride := ZMM()
|
||||||
VMOVDQU32_Z(globals.seq, stride)
|
VMOVDQU32(globals.seq, stride)
|
||||||
VPSLLD_Z(Imm(6), stride, stride) // stride of 64
|
VPSLLD(Imm(6), stride, stride) // stride of 64
|
||||||
for i, v := range vs {
|
for i, v := range vs {
|
||||||
KXNORD(K1, K1, K1) // fastest way to set all bits to 1
|
KXNORD(K1, K1, K1) // fastest way to set all bits to 1
|
||||||
VPSCATTERDD_Z(v, K1, out.Offset(i*4).Idx(stride, 1))
|
VPSCATTERDD(v, K1, out.Offset(i*4).Idx(stride, 1))
|
||||||
}
|
}
|
||||||
|
|
||||||
RET()
|
RET()
|
||||||
|
@ -129,24 +129,24 @@ func genCompressChunksAVX512() {
|
||||||
Comment("Initialize counter")
|
Comment("Initialize counter")
|
||||||
counterLo := AllocLocal(64)
|
counterLo := AllocLocal(64)
|
||||||
counterHi := AllocLocal(64)
|
counterHi := AllocLocal(64)
|
||||||
VPBROADCASTD_Z(counter.Addr, vs[0])
|
VPBROADCASTD(counter.Addr, vs[0])
|
||||||
VPADDD_Z(globals.seq, vs[0], vs[0])
|
VPADDD(globals.seq, vs[0], vs[0])
|
||||||
VPCMPUD(Imm(1), globals.seq, vs[0], K1)
|
VPCMPUD(Imm(1), globals.seq, vs[0], K1)
|
||||||
VPBROADCASTD_Z(counter.Addr.Offset(4), vs[1])
|
VPBROADCASTD(counter.Addr.Offset(4), vs[1])
|
||||||
VPADDD_ZBK(globals.seq.Offset(4), vs[1], K1, vs[1])
|
VPADDD_BCST(globals.seq.Offset(4), vs[1], K1, vs[1])
|
||||||
VMOVDQU32_Z(vs[0], counterLo)
|
VMOVDQU32(vs[0], counterLo)
|
||||||
VMOVDQU32_Z(vs[1], counterHi)
|
VMOVDQU32(vs[1], counterHi)
|
||||||
|
|
||||||
Comment("Initialize flags")
|
Comment("Initialize flags")
|
||||||
chunkFlags := AllocLocal(16 * 4)
|
chunkFlags := AllocLocal(16 * 4)
|
||||||
VPBROADCASTD_Z(flags.Addr, vs[0])
|
VPBROADCASTD(flags.Addr, vs[0])
|
||||||
VMOVDQU32_Z(vs[0], chunkFlags)
|
VMOVDQU32(vs[0], chunkFlags)
|
||||||
ORL(Imm(1), chunkFlags.Offset(0*4))
|
ORL(Imm(1), chunkFlags.Offset(0*4))
|
||||||
ORL(Imm(2), chunkFlags.Offset(15*4))
|
ORL(Imm(2), chunkFlags.Offset(15*4))
|
||||||
|
|
||||||
Comment("Load key")
|
Comment("Load key")
|
||||||
for i := 0; i < 8; i++ {
|
for i := 0; i < 8; i++ {
|
||||||
VPBROADCASTD_Z(key.Offset(i*4), vs[i])
|
VPBROADCASTD(key.Offset(i*4), vs[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
Comment("Loop index")
|
Comment("Loop index")
|
||||||
|
@ -155,29 +155,29 @@ func genCompressChunksAVX512() {
|
||||||
Label("loop")
|
Label("loop")
|
||||||
|
|
||||||
Comment("Load transposed block")
|
Comment("Load transposed block")
|
||||||
VMOVDQU32_Z(globals.seq, vs[8])
|
VMOVDQU32(globals.seq, vs[8])
|
||||||
VPSLLD_Z(Imm(10), vs[8], vs[8]) // stride of 1024
|
VPSLLD(Imm(10), vs[8], vs[8]) // stride of 1024
|
||||||
for i, m := range mv {
|
for i, m := range mv {
|
||||||
KXNORD(K1, K1, K1)
|
KXNORD(K1, K1, K1)
|
||||||
VPGATHERDD_Z(buf.Offset(i*4).Idx(vs[8], 1), K1, m)
|
VPGATHERDD(buf.Offset(i*4).Idx(vs[8], 1), K1, m)
|
||||||
}
|
}
|
||||||
ADDQ(Imm(64), buf.Base)
|
ADDQ(Imm(64), buf.Base)
|
||||||
|
|
||||||
Comment("Reload state vectors (other than CVs)")
|
Comment("Reload state vectors (other than CVs)")
|
||||||
for i := 0; i < 4; i++ {
|
for i := 0; i < 4; i++ {
|
||||||
VPBROADCASTD_Z(globals.iv.Offset(i*4), vs[8+i])
|
VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i])
|
||||||
}
|
}
|
||||||
VMOVDQU32_Z(counterLo, vs[12])
|
VMOVDQU32(counterLo, vs[12])
|
||||||
VMOVDQU32_Z(counterHi, vs[13])
|
VMOVDQU32(counterHi, vs[13])
|
||||||
VPBROADCASTD_Z(globals.seq.Offset(4), vs[14])
|
VPBROADCASTD(globals.seq.Offset(4), vs[14])
|
||||||
VPSLLD_Z(Imm(6), vs[14], vs[14]) // 64
|
VPSLLD(Imm(6), vs[14], vs[14]) // 64
|
||||||
VPBROADCASTD_Z(chunkFlags.Idx(loop, 4), vs[15])
|
VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])
|
||||||
|
|
||||||
performRoundsAVX512(vs, mv)
|
performRoundsAVX512(vs, mv)
|
||||||
|
|
||||||
Comment("Finalize CVs")
|
Comment("Finalize CVs")
|
||||||
for i := range vs[:8] {
|
for i := range vs[:8] {
|
||||||
VPXORD_Z(vs[i], vs[i+8], vs[i])
|
VPXORD(vs[i], vs[i+8], vs[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
Comment("Loop")
|
Comment("Loop")
|
||||||
|
@ -186,11 +186,11 @@ func genCompressChunksAVX512() {
|
||||||
JNE(LabelRef("loop"))
|
JNE(LabelRef("loop"))
|
||||||
|
|
||||||
Comment("Finished; transpose CVs")
|
Comment("Finished; transpose CVs")
|
||||||
VMOVDQU32_Z(globals.seq, vs[8])
|
VMOVDQU32(globals.seq, vs[8])
|
||||||
VPSLLD_Z(Imm(5), vs[8], vs[8]) // stride of 32
|
VPSLLD(Imm(5), vs[8], vs[8]) // stride of 32
|
||||||
for i, v := range vs[:8] {
|
for i, v := range vs[:8] {
|
||||||
KXNORD(K1, K1, K1) // fastest way to set all bits to 1
|
KXNORD(K1, K1, K1) // fastest way to set all bits to 1
|
||||||
VPSCATTERDD_Z(v, K1, cvs.Offset(i*4).Idx(vs[8], 1))
|
VPSCATTERDD(v, K1, cvs.Offset(i*4).Idx(vs[8], 1))
|
||||||
}
|
}
|
||||||
|
|
||||||
RET()
|
RET()
|
||||||
|
@ -198,20 +198,20 @@ func genCompressChunksAVX512() {
|
||||||
|
|
||||||
func performRoundsAVX512(vs, mv [16]VecVirtual) {
|
func performRoundsAVX512(vs, mv [16]VecVirtual) {
|
||||||
g := func(a, b, c, d, mx, my VecVirtual) {
|
g := func(a, b, c, d, mx, my VecVirtual) {
|
||||||
VPADDD_Z(a, b, a)
|
VPADDD(a, b, a)
|
||||||
VPADDD_Z(mx, a, a)
|
VPADDD(mx, a, a)
|
||||||
VPXORD_Z(d, a, d)
|
VPXORD(d, a, d)
|
||||||
VPRORD_Z(Imm(16), d, d)
|
VPRORD(Imm(16), d, d)
|
||||||
VPADDD_Z(c, d, c)
|
VPADDD(c, d, c)
|
||||||
VPXORD_Z(b, c, b)
|
VPXORD(b, c, b)
|
||||||
VPRORD_Z(Imm(12), b, b)
|
VPRORD(Imm(12), b, b)
|
||||||
VPADDD_Z(a, b, a)
|
VPADDD(a, b, a)
|
||||||
VPADDD_Z(my, a, a)
|
VPADDD(my, a, a)
|
||||||
VPXORD_Z(d, a, d)
|
VPXORD(d, a, d)
|
||||||
VPRORD_Z(Imm(8), d, d)
|
VPRORD(Imm(8), d, d)
|
||||||
VPADDD_Z(c, d, c)
|
VPADDD(c, d, c)
|
||||||
VPXORD_Z(b, c, b)
|
VPXORD(b, c, b)
|
||||||
VPRORD_Z(Imm(7), b, b)
|
VPRORD(Imm(7), b, b)
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; i < 7; i++ {
|
for i := 0; i < 7; i++ {
|
||||||
|
@ -534,152 +534,3 @@ func transpose(src, dst []VecVirtual) {
|
||||||
VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
|
VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// AVX-512 is not currently supported by avo, so we need to manually define the
|
|
||||||
// instructions we need
|
|
||||||
|
|
||||||
type maskReg = LabelRef // hack; avo doesn't allow custom Op types
|
|
||||||
|
|
||||||
const K0 maskReg = "K0"
|
|
||||||
const K1 maskReg = "K1"
|
|
||||||
const K2 maskReg = "K2"
|
|
||||||
|
|
||||||
func VMOVDQU32_Z(src, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VMOVDQU32",
|
|
||||||
Operands: []Op{src, dst},
|
|
||||||
Inputs: []Op{src},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPBROADCASTD_Z(src, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPBROADCASTD",
|
|
||||||
Operands: []Op{src, dst},
|
|
||||||
Inputs: []Op{src},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPGATHERDD_Z(src, mask, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPGATHERDD",
|
|
||||||
Operands: []Op{src, mask, dst},
|
|
||||||
Inputs: []Op{src, mask},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPSCATTERDD_Z(src, mask, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPSCATTERDD",
|
|
||||||
Operands: []Op{src, mask, dst},
|
|
||||||
Inputs: []Op{src, mask},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPORD_Z(x, y, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPORD",
|
|
||||||
Operands: []Op{x, y, dst},
|
|
||||||
Inputs: []Op{x, y},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPXORD_Z(x, y, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPXORD",
|
|
||||||
Operands: []Op{x, y, dst},
|
|
||||||
Inputs: []Op{x, y},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPXORD_ZB(x, y, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPXORD.BCST",
|
|
||||||
Operands: []Op{x, y, dst},
|
|
||||||
Inputs: []Op{x, y},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPRORD_Z(n, src, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPRORD",
|
|
||||||
Operands: []Op{n, src, dst},
|
|
||||||
Inputs: []Op{n, src},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPSLLD_Z(n, src, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPSLLD",
|
|
||||||
Operands: []Op{n, src, dst},
|
|
||||||
Inputs: []Op{n, src},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPADDD_Z(x, y, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPADDD",
|
|
||||||
Operands: []Op{x, y, dst},
|
|
||||||
Inputs: []Op{x, y},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPADDD_ZB(x, y, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPADDD.BCST",
|
|
||||||
Operands: []Op{x, y, dst},
|
|
||||||
Inputs: []Op{x, y},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPADDD_ZBK(x, y, mask, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPADDD.BCST",
|
|
||||||
Operands: []Op{x, y, mask, dst},
|
|
||||||
Inputs: []Op{x, y, mask},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func KXNORD(x, y, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "KXNORD",
|
|
||||||
Operands: []Op{x, y, dst},
|
|
||||||
Inputs: []Op{x, y},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func VPCMPUD(pred, x, y, dst Op) {
|
|
||||||
Instruction(&ir.Instruction{
|
|
||||||
Opcode: "VPCMPUD",
|
|
||||||
Operands: []Op{pred, x, y, dst},
|
|
||||||
Inputs: []Op{pred, x, y},
|
|
||||||
Outputs: []Op{dst},
|
|
||||||
ISA: []string{"AVX512F"},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
|
@ -57,7 +57,7 @@ DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
|
||||||
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32
|
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32
|
||||||
|
|
||||||
// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
|
// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
|
||||||
// Requires: AVX512F
|
// Requires: AVX512BW, AVX512F
|
||||||
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
|
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
|
||||||
MOVQ out+0(FP), AX
|
MOVQ out+0(FP), AX
|
||||||
MOVQ block+8(FP), CX
|
MOVQ block+8(FP), CX
|
||||||
|
@ -954,8 +954,8 @@ TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
|
// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
|
||||||
// Requires: AVX512F
|
// Requires: AVX512BW, AVX512F
|
||||||
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-40
|
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36
|
||||||
MOVQ cvs+0(FP), AX
|
MOVQ cvs+0(FP), AX
|
||||||
MOVQ buf+8(FP), CX
|
MOVQ buf+8(FP), CX
|
||||||
MOVQ key+16(FP), DX
|
MOVQ key+16(FP), DX
|
||||||
|
@ -3127,7 +3127,7 @@ TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
|
||||||
|
|
||||||
// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
|
// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
|
||||||
// Requires: AVX, AVX2
|
// Requires: AVX, AVX2
|
||||||
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40
|
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36
|
||||||
MOVQ cvs+0(FP), AX
|
MOVQ cvs+0(FP), AX
|
||||||
MOVQ buf+8(FP), CX
|
MOVQ buf+8(FP), CX
|
||||||
MOVQ key+16(FP), DX
|
MOVQ key+16(FP), DX
|
||||||
|
@ -4363,7 +4363,7 @@ loop:
|
||||||
|
|
||||||
// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
|
// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
|
||||||
// Requires: AVX, AVX2
|
// Requires: AVX, AVX2
|
||||||
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-32
|
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28
|
||||||
MOVQ parents+0(FP), AX
|
MOVQ parents+0(FP), AX
|
||||||
MOVQ cvs+8(FP), CX
|
MOVQ cvs+8(FP), CX
|
||||||
MOVQ key+16(FP), DX
|
MOVQ key+16(FP), DX
|
||||||
|
|
Reference in New Issue