// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.

#include "textflag.h"
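// Editor's note (not part of the generated output): the read-only tables below
// hold the BLAKE3 IV words, a broadcast 64-byte block length, the byte offsets
// of the eight 1024-byte chunks used by the gathers, per-lane counter
// increments, the per-block flag words, and the VPSHUFB masks that implement
// the 16-bit and 8-bit right rotations.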
DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16

DATA block_len<>+0(SB)/4, $0x00000040
DATA block_len<>+4(SB)/4, $0x00000040
DATA block_len<>+8(SB)/4, $0x00000040
DATA block_len<>+12(SB)/4, $0x00000040
DATA block_len<>+16(SB)/4, $0x00000040
DATA block_len<>+20(SB)/4, $0x00000040
DATA block_len<>+24(SB)/4, $0x00000040
DATA block_len<>+28(SB)/4, $0x00000040
GLOBL block_len<>(SB), RODATA|NOPTR, $32

DATA stride_1024<>+0(SB)/4, $0x00000000
DATA stride_1024<>+4(SB)/4, $0x00000400
DATA stride_1024<>+8(SB)/4, $0x00000800
DATA stride_1024<>+12(SB)/4, $0x00000c00
DATA stride_1024<>+16(SB)/4, $0x00001000
DATA stride_1024<>+20(SB)/4, $0x00001400
DATA stride_1024<>+24(SB)/4, $0x00001800
DATA stride_1024<>+28(SB)/4, $0x00001c00
GLOBL stride_1024<>(SB), RODATA|NOPTR, $32

DATA increment_counter<>+0(SB)/8, $0x0000000000000000
DATA increment_counter<>+8(SB)/8, $0x0000000000000001
DATA increment_counter<>+16(SB)/8, $0x0000000000000002
DATA increment_counter<>+24(SB)/8, $0x0000000000000003
DATA increment_counter<>+32(SB)/8, $0x0000000000000004
DATA increment_counter<>+40(SB)/8, $0x0000000000000005
DATA increment_counter<>+48(SB)/8, $0x0000000000000006
DATA increment_counter<>+56(SB)/8, $0x0000000000000007
GLOBL increment_counter<>(SB), RODATA|NOPTR, $64

DATA set_flags<>+0(SB)/4, $0x00000001
DATA set_flags<>+4(SB)/4, $0x00000000
DATA set_flags<>+8(SB)/4, $0x00000000
DATA set_flags<>+12(SB)/4, $0x00000000
DATA set_flags<>+16(SB)/4, $0x00000000
DATA set_flags<>+20(SB)/4, $0x00000000
DATA set_flags<>+24(SB)/4, $0x00000000
DATA set_flags<>+28(SB)/4, $0x00000000
DATA set_flags<>+32(SB)/4, $0x00000000
DATA set_flags<>+36(SB)/4, $0x00000000
DATA set_flags<>+40(SB)/4, $0x00000000
DATA set_flags<>+44(SB)/4, $0x00000000
DATA set_flags<>+48(SB)/4, $0x00000000
DATA set_flags<>+52(SB)/4, $0x00000000
DATA set_flags<>+56(SB)/4, $0x00000000
DATA set_flags<>+60(SB)/4, $0x00000002
GLOBL set_flags<>(SB), RODATA|NOPTR, $64

DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32

DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32

// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
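// Editor's note (summary inferred from the code below, not generated text):
// this routine compresses eight independent 1024-byte chunks of buf in
// parallel, one chunk per 32-bit lane of the YMM registers. Each chunk is
// processed as sixteen 64-byte blocks (the main loop runs 16 times), and the
// resulting chaining values are transposed back into cvs on exit.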
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40
MOVQ cvs+0(FP), AX
MOVQ buf+8(FP), CX
MOVQ key+16(FP), DX

// Load key
VPBROADCASTD (DX), Y0
VPBROADCASTD 4(DX), Y1
VPBROADCASTD 8(DX), Y2
VPBROADCASTD 12(DX), Y3
VPBROADCASTD 16(DX), Y4
VPBROADCASTD 20(DX), Y5
VPBROADCASTD 24(DX), Y6
VPBROADCASTD 28(DX), Y7

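// Editor's note: each key word is broadcast across all eight lanes, so every
// chunk starts from the same eight-word input chaining value.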
// Initialize counter
VPBROADCASTQ counter+24(FP), Y12
VPBROADCASTQ counter+24(FP), Y13
VPADDQ increment_counter<>+0(SB), Y12, Y12
VPADDQ increment_counter<>+32(SB), Y13, Y13
VPUNPCKLDQ Y13, Y12, Y14
VPUNPCKHDQ Y13, Y12, Y15
VPUNPCKLDQ Y15, Y14, Y12
VPUNPCKHDQ Y15, Y14, Y13
VPERMQ $0xd8, Y12, Y12
VPERMQ $0xd8, Y13, Y13
VMOVDQU Y12, 544(SP)
VMOVDQU Y13, 576(SP)

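// Editor's note: the two VPADDQ instructions compute counter+0..counter+7 as
// 64-bit values; the unpack/permute sequence then splits them into a vector of
// low words (saved at 544(SP)) and a vector of high words (saved at 576(SP)).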
// Initialize flags
VPBROADCASTD flags+32(FP), Y14
VPOR set_flags<>+0(SB), Y14, Y15
VMOVDQU Y15, 608(SP)
VPOR set_flags<>+32(SB), Y14, Y15
VMOVDQU Y15, 640(SP)

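// Editor's note: 608(SP) and 640(SP) together form a 16-entry table of
// per-block flag words. Every entry is the caller's flags; the first entry is
// additionally OR'd with 0x1 (CHUNK_START) and the last with 0x2 (CHUNK_END).
// The block loop below indexes this table with DX.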
// Loop index
XORQ DX, DX

loop:
// Load transposed block
VMOVDQU stride_1024<>+0(SB), Y9
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, (CX)(Y9*1), Y10
VMOVDQU Y10, (SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 4(CX)(Y9*1), Y10
VMOVDQU Y10, 32(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 8(CX)(Y9*1), Y10
VMOVDQU Y10, 64(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 12(CX)(Y9*1), Y10
VMOVDQU Y10, 96(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 16(CX)(Y9*1), Y10
VMOVDQU Y10, 128(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 20(CX)(Y9*1), Y10
VMOVDQU Y10, 160(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 24(CX)(Y9*1), Y10
VMOVDQU Y10, 192(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 28(CX)(Y9*1), Y10
VMOVDQU Y10, 224(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 32(CX)(Y9*1), Y10
VMOVDQU Y10, 256(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 36(CX)(Y9*1), Y10
VMOVDQU Y10, 288(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 40(CX)(Y9*1), Y10
VMOVDQU Y10, 320(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 44(CX)(Y9*1), Y10
VMOVDQU Y10, 352(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 48(CX)(Y9*1), Y10
VMOVDQU Y10, 384(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 52(CX)(Y9*1), Y10
VMOVDQU Y10, 416(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 56(CX)(Y9*1), Y10
VMOVDQU Y10, 448(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 60(CX)(Y9*1), Y10
VMOVDQU Y10, 480(SP)
ADDQ $0x40, CX

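// Editor's note: each VPGATHERDD pulls the same 4-byte message word of the
// current block from all eight chunks (Y9 holds the 1024-byte chunk offsets),
// building the sixteen transposed message vectors m0..m15 at (SP)..480(SP).
// The all-ones mask in Y8 is re-created before every gather because
// VPGATHERDD clears its mask register on completion.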
// Reload state vectors (other than CVs)
VPBROADCASTD iv<>+0(SB), Y8
VPBROADCASTD iv<>+4(SB), Y9
VPBROADCASTD iv<>+8(SB), Y10
VPBROADCASTD iv<>+12(SB), Y11
VMOVDQU 544(SP), Y12
VMOVDQU 576(SP), Y13
VMOVDQU block_len<>+0(SB), Y14
VPBROADCASTD 608(SP)(DX*4), Y15
VMOVDQU Y8, 512(SP)

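// Editor's note: the 16-word compression state lives in Y0-Y15 except for
// state word 8, which is spilled to 512(SP) because only sixteen YMM registers
// are available; Y0-Y7 carry the chaining values between blocks. Rounds 1-7
// below are the standard BLAKE3 round function (column step then diagonal
// step), with the message-schedule permutation baked into the stack offsets.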
// Round 1
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD (SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 32(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 64(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 96(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 128(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 160(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 192(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 224(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 256(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 288(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 320(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 352(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 384(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 416(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 448(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 480(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
|
||
|
// Round 2
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 64(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 192(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 96(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 320(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 224(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD (SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 128(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 416(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 32(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 352(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 384(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 160(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 288(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 448(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 480(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 256(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
|
||
|
// Round 3
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 96(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 128(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 320(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 384(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 416(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 64(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 224(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 448(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 192(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 160(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 288(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD (SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 352(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 480(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 256(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 32(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
|
||
|
// Round 4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 320(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 224(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 384(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 288(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 448(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 96(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 416(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 480(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 128(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD (SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 352(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 64(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 160(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 256(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 32(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 192(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
|
||
|
// Round 5
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 384(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 416(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 288(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 352(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 480(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 320(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 448(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 256(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 224(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 64(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 160(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 96(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD (SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 32(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 192(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 128(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
|
||
|
// Round 6
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 288(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 448(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 352(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 160(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 256(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 384(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 480(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 32(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 416(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 96(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD (SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 320(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 64(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 192(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 128(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 224(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
|
||
|
// Round 7
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 352(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y0, Y4, Y0
|
||
|
VPADDD 480(SP), Y0, Y0
|
||
|
VPXOR Y12, Y0, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y12, Y8
|
||
|
VPXOR Y4, Y8, Y4
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD 160(SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y5, Y1
|
||
|
VPADDD (SP), Y1, Y1
|
||
|
VPXOR Y13, Y1, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VPADDD Y9, Y13, Y9
|
||
|
VPXOR Y5, Y9, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 32(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y6, Y2
|
||
|
VPADDD 288(SP), Y2, Y2
|
||
|
VPXOR Y14, Y2, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y10, Y14, Y10
|
||
|
VPXOR Y6, Y10, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 256(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y7, Y3
|
||
|
VPADDD 192(SP), Y3, Y3
|
||
|
VPXOR Y15, Y3, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y11, Y15, Y11
|
||
|
VPXOR Y7, Y11, Y7
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 448(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x0c, Y5, Y8
|
||
|
VPSLLD $0x14, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y0, Y5, Y0
|
||
|
VPADDD 320(SP), Y0, Y0
|
||
|
VPXOR Y15, Y0, Y15
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
||
|
VPADDD Y10, Y15, Y10
|
||
|
VPXOR Y5, Y10, Y5
|
||
|
VPSRLD $0x07, Y5, Y8
|
||
|
VPSLLD $0x19, Y5, Y5
|
||
|
VPOR Y5, Y8, Y5
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 64(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x0c, Y6, Y8
|
||
|
VPSLLD $0x14, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y1, Y6, Y1
|
||
|
VPADDD 384(SP), Y1, Y1
|
||
|
VPXOR Y12, Y1, Y12
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
||
|
VPADDD Y11, Y12, Y11
|
||
|
VPXOR Y6, Y11, Y6
|
||
|
VPSRLD $0x07, Y6, Y8
|
||
|
VPSLLD $0x19, Y6, Y6
|
||
|
VPOR Y6, Y8, Y6
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 96(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x0c, Y7, Y8
|
||
|
VPSLLD $0x14, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y2, Y7, Y2
|
||
|
VPADDD 128(SP), Y2, Y2
|
||
|
VPXOR Y13, Y2, Y13
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
||
|
VMOVDQU 512(SP), Y8
|
||
|
VPADDD Y8, Y13, Y8
|
||
|
VPXOR Y7, Y8, Y7
|
||
|
VMOVDQU Y8, 512(SP)
|
||
|
VPSRLD $0x07, Y7, Y8
|
||
|
VPSLLD $0x19, Y7, Y7
|
||
|
VPOR Y7, Y8, Y7
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 224(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x0c, Y4, Y8
|
||
|
VPSLLD $0x14, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
|
||
|
VPADDD Y3, Y4, Y3
|
||
|
VPADDD 416(SP), Y3, Y3
|
||
|
VPXOR Y14, Y3, Y14
|
||
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
||
|
VPADDD Y9, Y14, Y9
|
||
|
VPXOR Y4, Y9, Y4
|
||
|
VPSRLD $0x07, Y4, Y8
|
||
|
VPSLLD $0x19, Y4, Y4
|
||
|
VPOR Y4, Y8, Y4
// Finalize CVs
VMOVDQU 512(SP), Y8
VPXOR Y0, Y8, Y0
VPXOR Y1, Y9, Y1
VPXOR Y2, Y10, Y2
VPXOR Y3, Y11, Y3
VPXOR Y4, Y12, Y4
VPXOR Y5, Y13, Y5
VPXOR Y6, Y14, Y6
VPXOR Y7, Y15, Y7

// Loop
INCQ DX
CMPQ DX, $0x00000010
JNE loop

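// Editor's note: after the 16th block the eight chaining values sit in Y0-Y7
// in word-sliced (transposed) form; the unpack/permute sequence below
// re-transposes them so that each chunk's eight-word CV is written
// contiguously into cvs.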
// Finished; transpose CVs
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKHDQ Y1, Y0, Y9
VPUNPCKLDQ Y3, Y2, Y10
VPUNPCKHDQ Y3, Y2, Y11
VPUNPCKLDQ Y5, Y4, Y12
VPUNPCKHDQ Y5, Y4, Y13
VPUNPCKLDQ Y7, Y6, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y10, Y8, Y0
VPUNPCKHQDQ Y10, Y8, Y1
VPUNPCKLQDQ Y11, Y9, Y2
VPUNPCKHQDQ Y11, Y9, Y3
VPUNPCKLQDQ Y14, Y12, Y4
VPUNPCKHQDQ Y14, Y12, Y5
VPUNPCKLQDQ Y15, Y13, Y6
VPUNPCKHQDQ Y15, Y13, Y7
VPERM2I128 $0x20, Y4, Y0, Y8
VPERM2I128 $0x31, Y4, Y0, Y12
VPERM2I128 $0x20, Y5, Y1, Y9
VPERM2I128 $0x31, Y5, Y1, Y13
VPERM2I128 $0x20, Y6, Y2, Y10
VPERM2I128 $0x31, Y6, Y2, Y14
VPERM2I128 $0x20, Y7, Y3, Y11
VPERM2I128 $0x31, Y7, Y3, Y15
VMOVDQU Y8, (AX)
VMOVDQU Y9, 32(AX)
VMOVDQU Y10, 64(AX)
VMOVDQU Y11, 96(AX)
VMOVDQU Y12, 128(AX)
VMOVDQU Y13, 160(AX)
VMOVDQU Y14, 192(AX)
VMOVDQU Y15, 224(AX)
RET