// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.

#include "textflag.h"

DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16

DATA block_len<>+0(SB)/4, $0x00000040
DATA block_len<>+4(SB)/4, $0x00000040
DATA block_len<>+8(SB)/4, $0x00000040
DATA block_len<>+12(SB)/4, $0x00000040
DATA block_len<>+16(SB)/4, $0x00000040
DATA block_len<>+20(SB)/4, $0x00000040
DATA block_len<>+24(SB)/4, $0x00000040
DATA block_len<>+28(SB)/4, $0x00000040
GLOBL block_len<>(SB), RODATA|NOPTR, $32

DATA stride_1024<>+0(SB)/4, $0x00000000
DATA stride_1024<>+4(SB)/4, $0x00000400
DATA stride_1024<>+8(SB)/4, $0x00000800
DATA stride_1024<>+12(SB)/4, $0x00000c00
DATA stride_1024<>+16(SB)/4, $0x00001000
DATA stride_1024<>+20(SB)/4, $0x00001400
DATA stride_1024<>+24(SB)/4, $0x00001800
DATA stride_1024<>+28(SB)/4, $0x00001c00
GLOBL stride_1024<>(SB), RODATA|NOPTR, $32

DATA increment_counter<>+0(SB)/8, $0x0000000000000000
DATA increment_counter<>+8(SB)/8, $0x0000000000000001
DATA increment_counter<>+16(SB)/8, $0x0000000000000002
DATA increment_counter<>+24(SB)/8, $0x0000000000000003
DATA increment_counter<>+32(SB)/8, $0x0000000000000004
DATA increment_counter<>+40(SB)/8, $0x0000000000000005
DATA increment_counter<>+48(SB)/8, $0x0000000000000006
DATA increment_counter<>+56(SB)/8, $0x0000000000000007
GLOBL increment_counter<>(SB), RODATA|NOPTR, $64

DATA set_flags<>+0(SB)/4, $0x00000001
DATA set_flags<>+4(SB)/4, $0x00000000
DATA set_flags<>+8(SB)/4, $0x00000000
DATA set_flags<>+12(SB)/4, $0x00000000
DATA set_flags<>+16(SB)/4, $0x00000000
DATA set_flags<>+20(SB)/4, $0x00000000
DATA set_flags<>+24(SB)/4, $0x00000000
DATA set_flags<>+28(SB)/4, $0x00000000
DATA set_flags<>+32(SB)/4, $0x00000000
DATA set_flags<>+36(SB)/4, $0x00000000
DATA set_flags<>+40(SB)/4, $0x00000000
DATA set_flags<>+44(SB)/4, $0x00000000
DATA set_flags<>+48(SB)/4, $0x00000000
DATA set_flags<>+52(SB)/4, $0x00000000
DATA set_flags<>+56(SB)/4, $0x00000000
DATA set_flags<>+60(SB)/4, $0x00000002
GLOBL set_flags<>(SB), RODATA|NOPTR, $64

DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32

DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32

// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40
    MOVQ cvs+0(FP), AX
    MOVQ buf+8(FP), CX
    MOVQ key+16(FP), DX

    // Load key
    VPBROADCASTD (DX), Y0
    VPBROADCASTD 4(DX), Y1
    VPBROADCASTD 8(DX), Y2
    VPBROADCASTD 12(DX), Y3
    VPBROADCASTD 16(DX), Y4
    VPBROADCASTD 20(DX), Y5
    VPBROADCASTD 24(DX), Y6
    VPBROADCASTD 28(DX), Y7

    // Initialize counter
    VPBROADCASTQ counter+24(FP), Y12
    VPBROADCASTQ counter+24(FP), Y13
    VPADDQ increment_counter<>+0(SB), Y12, Y12
    VPADDQ increment_counter<>+32(SB), Y13, Y13
    VPUNPCKLDQ Y13, Y12, Y14
    VPUNPCKHDQ Y13, Y12, Y15
    VPUNPCKLDQ Y15, Y14, Y12
    VPUNPCKHDQ Y15, Y14, Y13
    VPERMQ $0xd8, Y12, Y12
    VPERMQ $0xd8, Y13, Y13
    VMOVDQU Y12, 544(SP)
    VMOVDQU Y13, 576(SP)

    // Initialize flags
    VPBROADCASTD flags+32(FP), Y14
    VPOR set_flags<>+0(SB), Y14, Y15
    VMOVDQU Y15, 608(SP)
    VPOR set_flags<>+32(SB), Y14, Y15
    VMOVDQU Y15, 640(SP)

    // Loop index
    XORQ DX, DX

loop:
    // Load transposed block
    VMOVDQU stride_1024<>+0(SB), Y9
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, (CX)(Y9*1), Y10
    VMOVDQU Y10, (SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 4(CX)(Y9*1), Y10
    VMOVDQU Y10, 32(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 8(CX)(Y9*1), Y10
    VMOVDQU Y10, 64(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 12(CX)(Y9*1), Y10
    VMOVDQU Y10, 96(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 16(CX)(Y9*1), Y10
    VMOVDQU Y10, 128(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 20(CX)(Y9*1), Y10
    VMOVDQU Y10, 160(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 24(CX)(Y9*1), Y10
    VMOVDQU Y10, 192(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 28(CX)(Y9*1), Y10
    VMOVDQU Y10, 224(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 32(CX)(Y9*1), Y10
    VMOVDQU Y10, 256(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 36(CX)(Y9*1), Y10
    VMOVDQU Y10, 288(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 40(CX)(Y9*1), Y10
    VMOVDQU Y10, 320(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 44(CX)(Y9*1), Y10
    VMOVDQU Y10, 352(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 48(CX)(Y9*1), Y10
    VMOVDQU Y10, 384(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 52(CX)(Y9*1), Y10
    VMOVDQU Y10, 416(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 56(CX)(Y9*1), Y10
    VMOVDQU Y10, 448(SP)
    VPCMPEQD Y8, Y8, Y8
    VPGATHERDD Y8, 60(CX)(Y9*1), Y10
    VMOVDQU Y10, 480(SP)
    ADDQ $0x40, CX

    // Reload state vectors (other than CVs)
    VPBROADCASTD iv<>+0(SB), Y8
    VPBROADCASTD iv<>+4(SB), Y9
    VPBROADCASTD iv<>+8(SB), Y10
    VPBROADCASTD iv<>+12(SB), Y11
    VMOVDQU 544(SP), Y12
    VMOVDQU 576(SP), Y13
    VMOVDQU block_len<>+0(SB), Y14
    VPBROADCASTD 608(SP)(DX*4), Y15
    VMOVDQU Y8, 512(SP)

    // Round 1
    VPADDD Y0, Y4, Y0
    VPADDD (SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y0, Y4, Y0
    VPADDD 32(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y1, Y5, Y1
    VPADDD 64(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y5, Y1
    VPADDD 96(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y2, Y6, Y2
    VPADDD 128(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y6, Y2
    VPADDD 160(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y3, Y7, Y3
    VPADDD 192(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y7, Y3
    VPADDD 224(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y0, Y5, Y0
    VPADDD 256(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y0, Y5, Y0
    VPADDD 288(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y6, Y1
    VPADDD 320(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y1, Y6, Y1
    VPADDD 352(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y7, Y2
    VPADDD 384(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y2, Y7, Y2
    VPADDD 416(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y4, Y3
    VPADDD 448(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y3, Y4, Y3
    VPADDD 480(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4

    // Round 2
    VPADDD Y0, Y4, Y0
    VPADDD 64(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y0, Y4, Y0
    VPADDD 192(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y1, Y5, Y1
    VPADDD 96(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y5, Y1
    VPADDD 320(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y2, Y6, Y2
    VPADDD 224(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y6, Y2
    VPADDD (SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y3, Y7, Y3
    VPADDD 128(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y7, Y3
    VPADDD 416(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y0, Y5, Y0
    VPADDD 32(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y0, Y5, Y0
    VPADDD 352(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y6, Y1
    VPADDD 384(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y1, Y6, Y1
    VPADDD 160(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y7, Y2
    VPADDD 288(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y2, Y7, Y2
    VPADDD 448(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y4, Y3
    VPADDD 480(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y3, Y4, Y3
    VPADDD 256(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4

    // Round 3
    VPADDD Y0, Y4, Y0
    VPADDD 96(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y0, Y4, Y0
    VPADDD 128(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y1, Y5, Y1
    VPADDD 320(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y5, Y1
    VPADDD 384(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y2, Y6, Y2
    VPADDD 416(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y6, Y2
    VPADDD 64(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y3, Y7, Y3
    VPADDD 224(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y7, Y3
    VPADDD 448(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y0, Y5, Y0
    VPADDD 192(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y0, Y5, Y0
    VPADDD 160(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y6, Y1
    VPADDD 288(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y1, Y6, Y1
    VPADDD (SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y7, Y2
    VPADDD 352(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y2, Y7, Y2
    VPADDD 480(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y4, Y3
    VPADDD 256(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y3, Y4, Y3
    VPADDD 32(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4

    // Round 4
    VPADDD Y0, Y4, Y0
    VPADDD 320(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y0, Y4, Y0
    VPADDD 224(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y1, Y5, Y1
    VPADDD 384(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y5, Y1
    VPADDD 288(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y2, Y6, Y2
    VPADDD 448(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y6, Y2
    VPADDD 96(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y3, Y7, Y3
    VPADDD 416(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y7, Y3
    VPADDD 480(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y0, Y5, Y0
    VPADDD 128(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y0, Y5, Y0
    VPADDD (SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y6, Y1
    VPADDD 352(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y1, Y6, Y1
    VPADDD 64(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y7, Y2
    VPADDD 160(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y2, Y7, Y2
    VPADDD 256(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y4, Y3
    VPADDD 32(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y3, Y4, Y3
    VPADDD 192(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4

    // Round 5
    VPADDD Y0, Y4, Y0
    VPADDD 384(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y0, Y4, Y0
    VPADDD 416(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y1, Y5, Y1
    VPADDD 288(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y5, Y1
    VPADDD 352(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y2, Y6, Y2
    VPADDD 480(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y6, Y2
    VPADDD 320(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y3, Y7, Y3
    VPADDD 448(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y7, Y3
    VPADDD 256(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y0, Y5, Y0
    VPADDD 224(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y0, Y5, Y0
    VPADDD 64(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y6, Y1
    VPADDD 160(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y1, Y6, Y1
    VPADDD 96(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y7, Y2
    VPADDD (SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y2, Y7, Y2
    VPADDD 32(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y4, Y3
    VPADDD 192(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y3, Y4, Y3
    VPADDD 128(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4

    // Round 6
    VPADDD Y0, Y4, Y0
    VPADDD 288(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y0, Y4, Y0
    VPADDD 448(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y1, Y5, Y1
    VPADDD 352(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y5, Y1
    VPADDD 160(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y2, Y6, Y2
    VPADDD 256(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y6, Y2
    VPADDD 384(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y3, Y7, Y3
    VPADDD 480(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y7, Y3
    VPADDD 32(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y0, Y5, Y0
    VPADDD 416(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y0, Y5, Y0
    VPADDD 96(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y6, Y1
    VPADDD (SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y1, Y6, Y1
    VPADDD 320(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y7, Y2
    VPADDD 64(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y2, Y7, Y2
    VPADDD 192(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y4, Y3
    VPADDD 128(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y3, Y4, Y3
    VPADDD 224(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4

    // Round 7
    VPADDD Y0, Y4, Y0
    VPADDD 352(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y0, Y4, Y0
    VPADDD 480(SP), Y0, Y0
    VPXOR Y12, Y0, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y12, Y8
    VPXOR Y4, Y8, Y4
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y1, Y5, Y1
    VPADDD 160(SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y5, Y1
    VPADDD (SP), Y1, Y1
    VPXOR Y13, Y1, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VPADDD Y9, Y13, Y9
    VPXOR Y5, Y9, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y2, Y6, Y2
    VPADDD 32(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y6, Y2
    VPADDD 288(SP), Y2, Y2
    VPXOR Y14, Y2, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y10, Y14, Y10
    VPXOR Y6, Y10, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y3, Y7, Y3
    VPADDD 256(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y7, Y3
    VPADDD 192(SP), Y3, Y3
    VPXOR Y15, Y3, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y11, Y15, Y11
    VPXOR Y7, Y11, Y7
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y0, Y5, Y0
    VPADDD 448(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x0c, Y5, Y8
    VPSLLD $0x14, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y0, Y5, Y0
    VPADDD 320(SP), Y0, Y0
    VPXOR Y15, Y0, Y15
    VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
    VPADDD Y10, Y15, Y10
    VPXOR Y5, Y10, Y5
    VPSRLD $0x07, Y5, Y8
    VPSLLD $0x19, Y5, Y5
    VPOR Y5, Y8, Y5
    VPADDD Y1, Y6, Y1
    VPADDD 64(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x0c, Y6, Y8
    VPSLLD $0x14, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y1, Y6, Y1
    VPADDD 384(SP), Y1, Y1
    VPXOR Y12, Y1, Y12
    VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
    VPADDD Y11, Y12, Y11
    VPXOR Y6, Y11, Y6
    VPSRLD $0x07, Y6, Y8
    VPSLLD $0x19, Y6, Y6
    VPOR Y6, Y8, Y6
    VPADDD Y2, Y7, Y2
    VPADDD 96(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x0c, Y7, Y8
    VPSLLD $0x14, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y2, Y7, Y2
    VPADDD 128(SP), Y2, Y2
    VPXOR Y13, Y2, Y13
    VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
    VMOVDQU 512(SP), Y8
    VPADDD Y8, Y13, Y8
    VPXOR Y7, Y8, Y7
    VMOVDQU Y8, 512(SP)
    VPSRLD $0x07, Y7, Y8
    VPSLLD $0x19, Y7, Y7
    VPOR Y7, Y8, Y7
    VPADDD Y3, Y4, Y3
    VPADDD 224(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x0c, Y4, Y8
    VPSLLD $0x14, Y4, Y4
    VPOR Y4, Y8, Y4
    VPADDD Y3, Y4, Y3
    VPADDD 416(SP), Y3, Y3
    VPXOR Y14, Y3, Y14
    VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
    VPADDD Y9, Y14, Y9
    VPXOR Y4, Y9, Y4
    VPSRLD $0x07, Y4, Y8
    VPSLLD $0x19, Y4, Y4
    VPOR Y4, Y8, Y4

    // Finalize CVs
    VMOVDQU 512(SP), Y8
    VPXOR Y0, Y8, Y0
    VPXOR Y1, Y9, Y1
    VPXOR Y2, Y10, Y2
    VPXOR Y3, Y11, Y3
    VPXOR Y4, Y12, Y4
    VPXOR Y5, Y13, Y5
    VPXOR Y6, Y14, Y6
    VPXOR Y7, Y15, Y7

    // Loop
    INCQ DX
    CMPQ DX, $0x00000010
    JNE  loop

    // Finished; transpose CVs
    VPUNPCKLDQ Y1, Y0, Y8
    VPUNPCKHDQ Y1, Y0, Y9
    VPUNPCKLDQ Y3, Y2, Y10
    VPUNPCKHDQ Y3, Y2, Y11
    VPUNPCKLDQ Y5, Y4, Y12
    VPUNPCKHDQ Y5, Y4, Y13
    VPUNPCKLDQ Y7, Y6, Y14
    VPUNPCKHDQ Y7, Y6, Y15
    VPUNPCKLQDQ Y10, Y8, Y0
    VPUNPCKHQDQ Y10, Y8, Y1
    VPUNPCKLQDQ Y11, Y9, Y2
    VPUNPCKHQDQ Y11, Y9, Y3
    VPUNPCKLQDQ Y14, Y12, Y4
    VPUNPCKHQDQ Y14, Y12, Y5
    VPUNPCKLQDQ Y15, Y13, Y6
    VPUNPCKHQDQ Y15, Y13, Y7
    VPERM2I128 $0x20, Y4, Y0, Y8
    VPERM2I128 $0x31, Y4, Y0, Y12
    VPERM2I128 $0x20, Y5, Y1, Y9
    VPERM2I128 $0x31, Y5, Y1, Y13
    VPERM2I128 $0x20, Y6, Y2, Y10
    VPERM2I128 $0x31, Y6, Y2, Y14
    VPERM2I128 $0x20, Y7, Y3, Y11
    VPERM2I128 $0x31, Y7, Y3, Y15
    VMOVDQU Y8, (AX)
    VMOVDQU Y9, 32(AX)
    VMOVDQU Y10, 64(AX)
    VMOVDQU Y11, 96(AX)
    VMOVDQU Y12, 128(AX)
    VMOVDQU Y13, 160(AX)
    VMOVDQU Y14, 192(AX)
    VMOVDQU Y15, 224(AX)
    RET