//go:build ignore
// +build ignore

package main
import (
	"fmt"

	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/reg"
)
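
// main generates the AVX-512 and AVX2 BLAKE3 compression routines. Generate()
// parses avo's standard flags, so the program would typically be run as, e.g.,
//
//	go run gen.go -out blake3_amd64.s
//
// (the output filename here is illustrative, not prescribed by this file).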
func main() {
	genGlobals()
	genCompressBlocksAVX512()
	genCompressChunksAVX512()
	genCompressBlocksAVX2()
	genCompressChunksAVX2()
	genCompressParentsAVX2()

	Generate()
}
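
// globals holds references to the read-only data sections emitted by
// genGlobals; they are shared by all of the generated routines.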
var globals struct {
	iv           Mem
	seq          Mem
	seq64        Mem // for loadCounter
	shuffleRot8  Mem
	shuffleRot16 Mem
}
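
// genGlobals emits the shared read-only data: the first four words of the
// BLAKE3 IV, the sequences 0..15 (uint32) and 0..7 (uint64), and the VPSHUFB
// masks that rotate each 32-bit lane right by 8 and 16 bits.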
func genGlobals() {
	globals.iv = GLOBL("iv", RODATA|NOPTR)
	DATA(0*4, U32(0x6A09E667))
	DATA(1*4, U32(0xBB67AE85))
	DATA(2*4, U32(0x3C6EF372))
	DATA(3*4, U32(0xA54FF53A))

	globals.seq = GLOBL("seq", RODATA|NOPTR)
	for i := 0; i < 16; i++ {
		DATA(i*4, U32(i))
	}
	globals.seq64 = GLOBL("seq64", RODATA|NOPTR)
	for i := 0; i < 8; i++ {
		DATA(i*8, U64(i))
	}
	globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR)
	for i := 0; i < 8; i++ {
		DATA(i*4, U32(0x00030201+0x04040404*i))
	}
	globals.shuffleRot16 = GLOBL("shuffle_rot16", RODATA|NOPTR)
	for i := 0; i < 8; i++ {
		DATA(i*4, U32(0x01000302+0x04040404*i))
	}
}
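
// genCompressBlocksAVX512 generates a routine that runs 16 copies of the
// compression function in parallel, one per ZMM lane: every lane shares the
// same block, CV, blockLen, and flags, but lane i uses counter+i. Each lane's
// full 16-word output state is scattered to a 64-byte slot of out.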
func genCompressBlocksAVX512() {
	TEXT("compressBlocksAVX512", NOSPLIT, "func(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)")
	out := Mem{Base: Load(Param("out"), GP64())}
	block := Mem{Base: Load(Param("block"), GP64())}
	cv := Mem{Base: Load(Param("cv"), GP64())}
	counter, _ := Param("counter").Resolve()
	blockLen, _ := Param("blockLen").Resolve()
	flags, _ := Param("flags").Resolve()

	Comment("Initialize block vectors")
	var vs, mv [16]VecVirtual
	for i := range vs {
		vs[i], mv[i] = ZMM(), ZMM()
		VPBROADCASTD(block.Offset(i*4), mv[i])
	}

	Comment("Initialize state vectors")
	for i, v := range vs {
		switch i {
		case 0, 1, 2, 3, 4, 5, 6, 7: // cv
			VPBROADCASTD(cv.Offset(i*4), v)
		case 8, 9, 10, 11: // iv
			VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
		case 12: // counter
			VPBROADCASTD(counter.Addr, vs[12])
			VPADDD(globals.seq, vs[12], vs[12])
			// set a 1 bit in K1 for each overflowed counter in vs[12]
			VPCMPUD(Imm(1), globals.seq, vs[12], K1)
			// add 1 to each counter in vs[13] for each 1 bit in K1
			VPBROADCASTD(counter.Addr.Offset(1*4), vs[13])
			VPADDD_BCST(globals.seq.Offset(4), vs[13], K1, vs[13])
		case 14: // blockLen
			VPBROADCASTD(blockLen.Addr, v)
		case 15: // flags
			VPBROADCASTD(flags.Addr, v)
		}
	}

	performRoundsAVX512(vs, mv)

	Comment("Finalize CVs")
	for i, v := range vs[:8] {
		VPXORD(v, vs[i+8], v)
	}
	for i, v := range vs[8:] {
		VPXORD_BCST(cv.Offset(i*4), v, v)
	}
	stride := ZMM()
	VMOVDQU32(globals.seq, stride)
	VPSLLD(Imm(6), stride, stride) // stride of 64
	for i, v := range vs {
		KXNORD(K1, K1, K1) // fastest way to set all bits to 1
		VPSCATTERDD(v, K1, out.Offset(i*4).Idx(stride, 1))
	}

	RET()
}
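
// genCompressChunksAVX512 generates a routine that hashes 16 independent
// 1024-byte chunks in parallel, one per ZMM lane. It iterates over the 16
// blocks of each chunk, gathering transposed message words, applying the
// chunkStart and chunkEnd flags to the first and last block, and chaining
// each block's output CV into the next; the final CVs are scattered to cvs.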
func genCompressChunksAVX512() {
	TEXT("compressChunksAVX512", NOSPLIT, "func(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)")
	cvs := Mem{Base: Load(Param("cvs"), GP64())}
	buf := Mem{Base: Load(Param("buf"), GP64())}
	key := Mem{Base: Load(Param("key"), GP64())}
	counter, _ := Param("counter").Resolve()
	flags, _ := Param("flags").Resolve()

	var vs, mv [16]VecVirtual
	for i := range vs {
		vs[i], mv[i] = ZMM(), ZMM()
	}

	Comment("Initialize counter")
	counterLo := AllocLocal(64)
	counterHi := AllocLocal(64)
	VPBROADCASTD(counter.Addr, vs[0])
	VPADDD(globals.seq, vs[0], vs[0])
	VPCMPUD(Imm(1), globals.seq, vs[0], K1)
	VPBROADCASTD(counter.Addr.Offset(4), vs[1])
	VPADDD_BCST(globals.seq.Offset(4), vs[1], K1, vs[1])
	VMOVDQU32(vs[0], counterLo)
	VMOVDQU32(vs[1], counterHi)

	Comment("Initialize flags")
	chunkFlags := AllocLocal(16 * 4)
	VPBROADCASTD(flags.Addr, vs[0])
	VMOVDQU32(vs[0], chunkFlags)
	ORL(Imm(1), chunkFlags.Offset(0*4))  // chunkStart
	ORL(Imm(2), chunkFlags.Offset(15*4)) // chunkEnd

	Comment("Load key")
	for i := 0; i < 8; i++ {
		VPBROADCASTD(key.Offset(i*4), vs[i])
	}

	Comment("Loop index")
	loop := GP64()
	XORQ(loop, loop)
	Label("loop")

	Comment("Load transposed block")
	VMOVDQU32(globals.seq, vs[8])
	VPSLLD(Imm(10), vs[8], vs[8]) // stride of 1024
	for i, m := range mv {
		KXNORD(K1, K1, K1)
		VPGATHERDD(buf.Offset(i*4).Idx(vs[8], 1), K1, m)
	}
	ADDQ(Imm(64), buf.Base)

	Comment("Reload state vectors (other than CVs)")
	for i := 0; i < 4; i++ {
		VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i])
	}
	VMOVDQU32(counterLo, vs[12])
	VMOVDQU32(counterHi, vs[13])
	VPBROADCASTD(globals.seq.Offset(4), vs[14])
	VPSLLD(Imm(6), vs[14], vs[14]) // 64
	VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])

	performRoundsAVX512(vs, mv)

	Comment("Finalize CVs")
	for i := range vs[:8] {
		VPXORD(vs[i], vs[i+8], vs[i])
	}

	Comment("Loop")
	INCQ(loop)
	CMPQ(loop, U32(16))
	JNE(LabelRef("loop"))

	Comment("Finished; transpose CVs")
	VMOVDQU32(globals.seq, vs[8])
	VPSLLD(Imm(5), vs[8], vs[8]) // stride of 32
	for i, v := range vs[:8] {
		KXNORD(K1, K1, K1) // fastest way to set all bits to 1
		VPSCATTERDD(v, K1, cvs.Offset(i*4).Idx(vs[8], 1))
	}

	RET()
}
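
// performRoundsAVX512 emits the 7 rounds of the compression function. g is
// the quarter-round: each round applies it to the four columns and then the
// four diagonals of the 4x4 state, and the message vectors are permuted
// between rounds.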
func performRoundsAVX512(vs, mv [16]VecVirtual) {
	g := func(a, b, c, d, mx, my VecVirtual) {
		VPADDD(a, b, a)
		VPADDD(mx, a, a)
		VPXORD(d, a, d)
		VPRORD(Imm(16), d, d)
		VPADDD(c, d, c)
		VPXORD(b, c, b)
		VPRORD(Imm(12), b, b)
		VPADDD(a, b, a)
		VPADDD(my, a, a)
		VPXORD(d, a, d)
		VPRORD(Imm(8), d, d)
		VPADDD(c, d, c)
		VPXORD(b, c, b)
		VPRORD(Imm(7), b, b)
	}

	for i := 0; i < 7; i++ {
		Comment(fmt.Sprintf("Round %v", i+1))
		g(vs[0], vs[4], vs[8], vs[12], mv[0], mv[1])
		g(vs[1], vs[5], vs[9], vs[13], mv[2], mv[3])
		g(vs[2], vs[6], vs[10], vs[14], mv[4], mv[5])
		g(vs[3], vs[7], vs[11], vs[15], mv[6], mv[7])
		g(vs[0], vs[5], vs[10], vs[15], mv[8], mv[9])
		g(vs[1], vs[6], vs[11], vs[12], mv[10], mv[11])
		g(vs[2], vs[7], vs[8], vs[13], mv[12], mv[13])
		g(vs[3], vs[4], vs[9], vs[14], mv[14], mv[15])

		// permute
		mv = [16]VecVirtual{
			mv[2], mv[6], mv[3], mv[10],
			mv[7], mv[0], mv[4], mv[13],
			mv[1], mv[11], mv[12], mv[5],
			mv[9], mv[14], mv[15], mv[8],
		}
	}
}
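
// genCompressBlocksAVX2 is the 8-lane AVX2 analogue of
// genCompressBlocksAVX512. With only 16 YMM registers, the message vectors
// live in stack slots, and the output states are transposed and stored to
// out in two register-sized halves.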
func genCompressBlocksAVX2() {
	TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)")
	out := Mem{Base: Load(Param("out"), GP64())}
	block := Mem{Base: Load(Param("block"), GP64())}
	cv := Mem{Base: Load(Param("cv"), GP64())}
	counter, _ := Param("counter").Resolve()
	blockLen, _ := Param("blockLen").Resolve()
	flags, _ := Param("flags").Resolve()

	var vs [16]VecVirtual
	var mv [16]Mem
	for i := range vs {
		vs[i] = YMM()
		mv[i] = AllocLocal(32)
	}

	Comment("Load block")
	for i := 0; i < 16; i++ {
		VPBROADCASTD(block.Offset(i*4), vs[0])
		VMOVDQU(vs[0], mv[i])
	}

	Comment("Initialize state vectors")
	for i, v := range vs {
		switch i {
		case 0, 1, 2, 3, 4, 5, 6, 7: // cv
			VPBROADCASTD(cv.Offset(i*4), v)
		case 8, 9, 10, 11: // iv
			VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
		case 12: // counter
			loadCounter(counter.Addr, vs[12:14], vs[14:16])
		case 14: // blockLen
			VPBROADCASTD(blockLen.Addr, v)
		case 15: // flags
			VPBROADCASTD(flags.Addr, v)
		}
	}

	performRoundsAVX2(vs, mv)

	Comment("Finalize CVs")
	for i := 8; i < 16; i++ {
		VMOVDQU(vs[i], mv[i])
	}
	for i := range vs[:8] {
		VPXOR(vs[i], vs[i+8], vs[i])
	}
	transpose(vs[:8], vs[8:])
	for i, v := range vs[8:] {
		VMOVDQU(v, out.Offset(i*64))
	}
	for i := 8; i < 16; i++ {
		VMOVDQU(mv[i], vs[i])
	}
	for i, v := range vs[8:] {
		VPBROADCASTD(cv.Offset(i*4), vs[0])
		VPXOR(vs[0], v, v)
	}
	transpose(vs[8:], vs[:8])
	for i, v := range vs[:8] {
		VMOVDQU(v, out.Offset(i*64+32))
	}

	RET()
}
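
// genCompressChunksAVX2 is the 8-lane AVX2 analogue of
// genCompressChunksAVX512, hashing 8 independent 1024-byte chunks in
// parallel and writing the 8 resulting CVs to cvs.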
func genCompressChunksAVX2() {
	TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)")
	cvs := Mem{Base: Load(Param("cvs"), GP64())}
	buf := Mem{Base: Load(Param("buf"), GP64())}
	key := Mem{Base: Load(Param("key"), GP64())}
	counter, _ := Param("counter").Resolve()
	flags, _ := Param("flags").Resolve()

	var vs [16]VecVirtual
	var mv [16]Mem
	for i := range vs {
		vs[i] = YMM()
		mv[i] = AllocLocal(32)
	}

	Comment("Load key")
	for i := 0; i < 8; i++ {
		VPBROADCASTD(key.Offset(i*4), vs[i])
	}

	Comment("Initialize counter")
	counterLo := AllocLocal(32)
	counterHi := AllocLocal(32)
	loadCounter(counter.Addr, vs[12:14], vs[14:16])
	VMOVDQU(vs[12], counterLo)
	VMOVDQU(vs[13], counterHi)

	Comment("Initialize flags")
	chunkFlags := AllocLocal(16 * 4)
	VPBROADCASTD(flags.Addr, vs[14])
	VMOVDQU(vs[14], chunkFlags.Offset(0*32))
	VMOVDQU(vs[14], chunkFlags.Offset(1*32))
	ORL(Imm(1), chunkFlags.Offset(0*4))  // chunkStart
	ORL(Imm(2), chunkFlags.Offset(15*4)) // chunkEnd

	Comment("Loop index")
	loop := GP64()
	XORQ(loop, loop)
	Label("loop")

	Comment("Load transposed block")
	VMOVDQU(globals.seq, vs[9])
	VPSLLD(Imm(10), vs[9], vs[9]) // stride of 1024
	for i := 0; i < 16; i++ {
		VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1
		VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10])
		VMOVDQU(vs[10], mv[i])
	}
	ADDQ(Imm(64), buf.Base)

	Comment("Reload state vectors (other than CVs)")
	for i := 0; i < 4; i++ {
		VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i])
	}
	VMOVDQU(counterLo, vs[12])
	VMOVDQU(counterHi, vs[13])
	VPBROADCASTD(globals.seq.Offset(4), vs[14])
	VPSLLD(Imm(6), vs[14], vs[14]) // 64
	VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15])

	performRoundsAVX2(vs, mv)

	Comment("Finalize CVs")
	for i := range vs[:8] {
		VPXOR(vs[i], vs[i+8], vs[i])
	}

	Comment("Loop")
	INCQ(loop)
	CMPQ(loop, U32(16))
	JNE(LabelRef("loop"))

	Comment("Finished; transpose CVs")
	transpose(vs[:8], vs[8:])
	for i, v := range vs[8:] {
		VMOVDQU(v, cvs.Offset(i*32))
	}

	RET()
}
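
// genCompressParentsAVX2 generates a routine that compresses 8 pairs of
// child CVs into 8 parent CVs in one pass: each parent's message block is a
// pair of adjacent CVs from cvs, with blockLen fixed at 64, the counter at
// zero, and the parent flag set.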
func genCompressParentsAVX2() {
	TEXT("compressParentsAVX2", NOSPLIT, "func(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)")
	parents := Mem{Base: Load(Param("parents"), GP64())}
	cvs := Mem{Base: Load(Param("cvs"), GP64())}
	key := Mem{Base: Load(Param("key"), GP64())}
	flags, _ := Param("flags").Resolve()

	var vs [16]VecVirtual
	var mv [16]Mem
	for i := range vs {
		vs[i] = YMM()
		mv[i] = AllocLocal(32)
	}

	Comment("Load transposed block")
	VMOVDQU(globals.seq, vs[9])
	VPSLLD(Imm(6), vs[9], vs[9]) // stride of 64
	for i := 0; i < 16; i++ {
		VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1
		VPGATHERDD(vs[8], cvs.Offset(i*4).Idx(vs[9], 1), vs[10])
		VMOVDQU(vs[10], mv[i])
	}

	Comment("Initialize state vectors")
	for i, v := range vs {
		switch i {
		case 0, 1, 2, 3, 4, 5, 6, 7: // cv
			VPBROADCASTD(key.Offset(i*4), v)
		case 8, 9, 10, 11: // iv
			VPBROADCASTD(globals.iv.Offset((i-8)*4), v)
		case 12, 13: // counter
			VPXOR(v, v, v)
		case 14: // blockLen
			VPBROADCASTD(globals.seq.Offset(1*4), v)
			VPSLLD(Imm(6), v, v) // 64
		case 15: // flags
			ORL(Imm(4), flags.Addr) // flagParent
			VPBROADCASTD(flags.Addr, v)
		}
	}

	performRoundsAVX2(vs, mv)

	Comment("Finalize CVs")
	for i := range vs[:8] {
		VPXOR(vs[i], vs[i+8], vs[i])
	}
	transpose(vs[:8], vs[8:])
	for i, v := range vs[8:] {
		VMOVDQU(v, parents.Offset(i*32))
	}

	RET()
}
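
// performRoundsAVX2 emits the 7 rounds of the compression function for the
// AVX2 routines. AVX2 lacks VPRORD, so 16- and 8-bit rotations use VPSHUFB
// byte shuffles and 12- and 7-bit rotations use a shift/shift/OR sequence
// that needs a scratch register. sv[8] serves as that scratch: its live
// value is kept in spillMem, and it is reloaded and re-spilled around the
// quarter-rounds that actually operate on it.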
func performRoundsAVX2(sv [16]VecVirtual, mv [16]Mem) {
	spillMem := AllocLocal(32)
	tmp := sv[8]
	g := func(a, b, c, d VecVirtual, mx, my Mem) {
		// Helper function for performing rotations. Also manages c, tmp and
		// spillMem: if c == tmp, we need to spill and reload c using spillMem.
		rotr := func(v VecVirtual, n uint64, dst VecVirtual) {
			switch n {
			case 8, 16:
				shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n]
				VPSHUFB(shuf, v, dst)
				if c == tmp {
					VMOVDQU(spillMem, c)
				}
			case 7, 12:
				if c == tmp {
					VMOVDQU(c, spillMem)
				}
				VPSRLD(Imm(n), v, tmp)
				VPSLLD(Imm(32-n), v, dst)
				VPOR(dst, tmp, dst)
			}
		}

		VPADDD(a, b, a)
		VPADDD(mx, a, a)
		VPXOR(d, a, d)
		rotr(d, 16, d)
		VPADDD(c, d, c)
		VPXOR(b, c, b)
		rotr(b, 12, b)
		VPADDD(a, b, a)
		VPADDD(my, a, a)
		VPXOR(d, a, d)
		rotr(d, 8, d)
		VPADDD(c, d, c)
		VPXOR(b, c, b)
		rotr(b, 7, b)
	}

	VMOVDQU(sv[8], spillMem) // spill
	for i := 0; i < 7; i++ {
		Comment(fmt.Sprintf("Round %v", i+1))
		g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1])
		g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3])
		g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5])
		g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7])
		g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9])
		g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11])
		g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13])
		g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15])

		// permute
		mv = [16]Mem{
			mv[2], mv[6], mv[3], mv[10],
			mv[7], mv[0], mv[4], mv[13],
			mv[1], mv[11], mv[12], mv[5],
			mv[9], mv[14], mv[15], mv[8],
		}
	}
	VMOVDQU(spillMem, sv[8]) // reload
}
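
// loadCounter computes the eight 64-bit counter values counter+0 through
// counter+7 and de-interleaves them so that dst[0] holds the eight low
// 32-bit words and dst[1] the eight high words.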
func loadCounter(counter Mem, dst, scratch []VecVirtual) {
	// fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so
	// that dst[0] contains low 32 bits and dst[1] contains high 32 bits.
	VPBROADCASTQ(counter, dst[0])
	VPBROADCASTQ(counter, dst[1])
	VPADDQ(globals.seq64.Offset(0*8), dst[0], dst[0])
	VPADDQ(globals.seq64.Offset(4*8), dst[1], dst[1])
	VPUNPCKLDQ(dst[1], dst[0], scratch[0])
	VPUNPCKHDQ(dst[1], dst[0], scratch[1])
	VPUNPCKLDQ(scratch[1], scratch[0], dst[0])
	VPUNPCKHDQ(scratch[1], scratch[0], dst[1])
	const perm = 0<<0 | 2<<2 | 1<<4 | 3<<6
	VPERMQ(Imm(perm), dst[0], dst[0])
	VPERMQ(Imm(perm), dst[1], dst[1])
}
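
// transpose performs an 8x8 transpose of the uint32s in src, writing the
// result to dst: dst[i] receives the i'th uint32 of every src vector. It
// interleaves in three stages of doubling width: 32-bit, 64-bit, then
// 128-bit lanes.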
func transpose(src, dst []VecVirtual) {
	// interleave uint32s
	for i := 0; i < 8; i += 2 {
		VPUNPCKLDQ(src[i+1], src[i], dst[i+0])
		VPUNPCKHDQ(src[i+1], src[i], dst[i+1])
	}
	// interleave groups of two uint32s
	for i := 0; i < 4; i++ {
		j := i*2 - i%2 // j := 0,1,4,5
		VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0])
		VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1])
	}
	// interleave groups of four uint32s
	for i := 0; i < 4; i++ {
		VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0])
		VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4])
	}
}