// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.
#include "textflag.h"
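// iv<> holds the first four words of the BLAKE3 IV (the same constants as the
// SHA-256 IV); they seed state rows 8-11 for every block.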
DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16
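// block_len<> broadcasts the 64-byte block length across all eight lanes; it
// becomes state row 14.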
DATA block_len<>+0(SB)/4, $0x00000040
DATA block_len<>+4(SB)/4, $0x00000040
DATA block_len<>+8(SB)/4, $0x00000040
DATA block_len<>+12(SB)/4, $0x00000040
DATA block_len<>+16(SB)/4, $0x00000040
DATA block_len<>+20(SB)/4, $0x00000040
DATA block_len<>+24(SB)/4, $0x00000040
DATA block_len<>+28(SB)/4, $0x00000040
GLOBL block_len<>(SB), RODATA|NOPTR, $32
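// stride_1024<> holds the byte offsets of the eight 1024-byte chunks within
// buf, used as VPGATHERDD indices.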
DATA stride_1024<>+0(SB)/4, $0x00000000
DATA stride_1024<>+4(SB)/4, $0x00000400
DATA stride_1024<>+8(SB)/4, $0x00000800
DATA stride_1024<>+12(SB)/4, $0x00000c00
DATA stride_1024<>+16(SB)/4, $0x00001000
DATA stride_1024<>+20(SB)/4, $0x00001400
DATA stride_1024<>+24(SB)/4, $0x00001800
DATA stride_1024<>+28(SB)/4, $0x00001c00
GLOBL stride_1024<>(SB), RODATA|NOPTR, $32
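// increment_counter<> holds the per-lane additions 0..7 applied to the
// starting chunk counter, one 64-bit counter per chunk.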
DATA increment_counter<>+0(SB)/8, $0x0000000000000000
DATA increment_counter<>+8(SB)/8, $0x0000000000000001
DATA increment_counter<>+16(SB)/8, $0x0000000000000002
DATA increment_counter<>+24(SB)/8, $0x0000000000000003
DATA increment_counter<>+32(SB)/8, $0x0000000000000004
DATA increment_counter<>+40(SB)/8, $0x0000000000000005
DATA increment_counter<>+48(SB)/8, $0x0000000000000006
DATA increment_counter<>+56(SB)/8, $0x0000000000000007
GLOBL increment_counter<>(SB), RODATA|NOPTR, $64
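// set_flags<> gives the extra flag bits for each of a chunk's 16 blocks:
// CHUNK_START (0x01) for block 0, CHUNK_END (0x02) for block 15, none for the
// rest.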
DATA set_flags<>+0(SB)/4, $0x00000001
DATA set_flags<>+4(SB)/4, $0x00000000
DATA set_flags<>+8(SB)/4, $0x00000000
DATA set_flags<>+12(SB)/4, $0x00000000
DATA set_flags<>+16(SB)/4, $0x00000000
DATA set_flags<>+20(SB)/4, $0x00000000
DATA set_flags<>+24(SB)/4, $0x00000000
DATA set_flags<>+28(SB)/4, $0x00000000
DATA set_flags<>+32(SB)/4, $0x00000000
DATA set_flags<>+36(SB)/4, $0x00000000
DATA set_flags<>+40(SB)/4, $0x00000000
DATA set_flags<>+44(SB)/4, $0x00000000
DATA set_flags<>+48(SB)/4, $0x00000000
DATA set_flags<>+52(SB)/4, $0x00000000
DATA set_flags<>+56(SB)/4, $0x00000000
DATA set_flags<>+60(SB)/4, $0x00000002
GLOBL set_flags<>(SB), RODATA|NOPTR, $64
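// shuffle_rot8<> is a VPSHUFB mask that rotates every 32-bit lane right by 8
// bits.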
DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32
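// shuffle_rot16<> is a VPSHUFB mask that rotates every 32-bit lane right by
// 16 bits.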
DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32
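// compressChunksAVX2 compresses eight full 1024-byte chunks in parallel.
// Each YMM register holds one 32-bit state word for all eight chunks (lane i
// belongs to chunk i). The loop below runs the compression function once per
// 64-byte block, sixteen times per chunk, and the resulting chaining values
// are transposed back into *cvs at the end.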
// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40
MOVQ cvs+0(FP), AX
MOVQ buf+8(FP), CX
MOVQ key+16(FP), DX
// Load key
VPBROADCASTD (DX), Y0
VPBROADCASTD 4(DX), Y1
VPBROADCASTD 8(DX), Y2
VPBROADCASTD 12(DX), Y3
VPBROADCASTD 16(DX), Y4
VPBROADCASTD 20(DX), Y5
VPBROADCASTD 24(DX), Y6
VPBROADCASTD 28(DX), Y7
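// Y0-Y7 now hold the key words broadcast across the lanes; they serve as the
// chaining value (state rows 0-7) and are carried from block to block.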
// Initialize counter: per-lane counters counter+0 .. counter+7, split into
// low and high dwords (state rows 12 and 13) and spilled to 544(SP)/576(SP)
VPBROADCASTQ counter+24(FP), Y12
VPBROADCASTQ counter+24(FP), Y13
VPADDQ increment_counter<>+0(SB), Y12, Y12
VPADDQ increment_counter<>+32(SB), Y13, Y13
VPUNPCKLDQ Y13, Y12, Y14
VPUNPCKHDQ Y13, Y12, Y15
VPUNPCKLDQ Y15, Y14, Y12
VPUNPCKHDQ Y15, Y14, Y13
VPERMQ $0xd8, Y12, Y12
VPERMQ $0xd8, Y13, Y13
VMOVDQU Y12, 544(SP)
VMOVDQU Y13, 576(SP)
// Initialize flags: OR the caller's flags into the per-block
// CHUNK_START/CHUNK_END table and spill the 16 per-block flag words to
// 608(SP) and 640(SP)
VPBROADCASTD flags+32(FP), Y14
VPOR set_flags<>+0(SB), Y14, Y15
VMOVDQU Y15, 608(SP)
VPOR set_flags<>+32(SB), Y14, Y15
VMOVDQU Y15, 640(SP)
// Loop index: DX counts the 16 blocks of each chunk
XORQ DX, DX
loop:
// Load transposed block: for each of the 16 message words, gather that word
// from the current block of all eight chunks (using the stride_1024 offsets)
// into one register and spill it to 32*i(SP). VPGATHERDD consumes its mask,
// so the all-ones mask in Y8 is rebuilt before every gather.
VMOVDQU stride_1024<>+0(SB), Y9
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, (CX)(Y9*1), Y10
VMOVDQU Y10, (SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 4(CX)(Y9*1), Y10
VMOVDQU Y10, 32(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 8(CX)(Y9*1), Y10
VMOVDQU Y10, 64(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 12(CX)(Y9*1), Y10
VMOVDQU Y10, 96(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 16(CX)(Y9*1), Y10
VMOVDQU Y10, 128(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 20(CX)(Y9*1), Y10
VMOVDQU Y10, 160(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 24(CX)(Y9*1), Y10
VMOVDQU Y10, 192(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 28(CX)(Y9*1), Y10
VMOVDQU Y10, 224(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 32(CX)(Y9*1), Y10
VMOVDQU Y10, 256(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 36(CX)(Y9*1), Y10
VMOVDQU Y10, 288(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 40(CX)(Y9*1), Y10
VMOVDQU Y10, 320(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 44(CX)(Y9*1), Y10
VMOVDQU Y10, 352(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 48(CX)(Y9*1), Y10
VMOVDQU Y10, 384(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 52(CX)(Y9*1), Y10
VMOVDQU Y10, 416(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 56(CX)(Y9*1), Y10
VMOVDQU Y10, 448(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 60(CX)(Y9*1), Y10
VMOVDQU Y10, 480(SP)
ADDQ $0x40, CX
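// Advance buf so the same offsets address the next 64-byte block of every chunk.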
// Reload state vectors (other than CVs)
VPBROADCASTD iv<>+0(SB), Y8
VPBROADCASTD iv<>+4(SB), Y9
VPBROADCASTD iv<>+8(SB), Y10
VPBROADCASTD iv<>+12(SB), Y11
VMOVDQU 544(SP), Y12
VMOVDQU 576(SP), Y13
VMOVDQU block_len<>+0(SB), Y14
VPBROADCASTD 608(SP)(DX*4), Y15
VMOVDQU Y8, 512(SP)
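// Y8 doubles as a scratch register below, so state row 8 is kept at 512(SP)
// and reloaded around each use.
// Each round applies the G function to the four columns and then the four
// diagonals of the 4x4 state, on all eight lanes at once. One G step is:
//   a += b + m[x]; d = (d ^ a) >>> 16; c += d; b = (b ^ c) >>> 12
//   a += b + m[y]; d = (d ^ a) >>> 8;  c += d; b = (b ^ c) >>> 7
// The 16- and 8-bit rotates use VPSHUFB with the masks above; the 12- and
// 7-bit rotates use a VPSRLD/VPSLLD pair combined with VPOR. The message
// words are permuted between rounds, which is why each round reads the
// 32*i(SP) slots in a different order.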
// Round 1
VPADDD Y0, Y4, Y0
VPADDD (SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 256(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 2
VPADDD Y0, Y4, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 224(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD (SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 3
VPADDD Y0, Y4, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 160(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 352(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 4
VPADDD Y0, Y4, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD (SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 5
VPADDD Y0, Y4, Y0
VPADDD 384(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 320(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD (SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 6
VPADDD Y0, Y4, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 192(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 7
VPADDD Y0, Y4, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 480(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD (SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Finalize CVs: XOR the upper half of the state into the lower half; the new
// chaining values stay in Y0-Y7 as input for the next block
VMOVDQU 512(SP), Y8
VPXOR Y0, Y8, Y0
VPXOR Y1, Y9, Y1
VPXOR Y2, Y10, Y2
VPXOR Y3, Y11, Y3
VPXOR Y4, Y12, Y4
VPXOR Y5, Y13, Y5
VPXOR Y6, Y14, Y6
VPXOR Y7, Y15, Y7
// Loop until all 16 blocks of each chunk have been compressed
INCQ DX
CMPQ DX, $0x00000010
JNE loop
// Finished; transpose the CVs from word-major order (one register per CV
// word) back to chunk-major order and store them to *cvs
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKHDQ Y1, Y0, Y9
VPUNPCKLDQ Y3, Y2, Y10
VPUNPCKHDQ Y3, Y2, Y11
VPUNPCKLDQ Y5, Y4, Y12
VPUNPCKHDQ Y5, Y4, Y13
VPUNPCKLDQ Y7, Y6, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y10, Y8, Y0
VPUNPCKHQDQ Y10, Y8, Y1
VPUNPCKLQDQ Y11, Y9, Y2
VPUNPCKHQDQ Y11, Y9, Y3
VPUNPCKLQDQ Y14, Y12, Y4
VPUNPCKHQDQ Y14, Y12, Y5
VPUNPCKLQDQ Y15, Y13, Y6
VPUNPCKHQDQ Y15, Y13, Y7
VPERM2I128 $0x20, Y4, Y0, Y8
VPERM2I128 $0x31, Y4, Y0, Y12
VPERM2I128 $0x20, Y5, Y1, Y9
VPERM2I128 $0x31, Y5, Y1, Y13
VPERM2I128 $0x20, Y6, Y2, Y10
VPERM2I128 $0x31, Y6, Y2, Y14
VPERM2I128 $0x20, Y7, Y3, Y11
VPERM2I128 $0x31, Y7, Y3, Y15
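// Y8-Y15 now hold one finished 8-word chaining value per chunk, ready to store.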
VMOVDQU Y8, (AX)
VMOVDQU Y9, 32(AX)
VMOVDQU Y10, 64(AX)
VMOVDQU Y11, 96(AX)
VMOVDQU Y12, 128(AX)
VMOVDQU Y13, 160(AX)
VMOVDQU Y14, 192(AX)
VMOVDQU Y15, 224(AX)
RET