// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.

#include "textflag.h"

DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16

DATA block_len<>+0(SB)/4, $0x00000040
DATA block_len<>+4(SB)/4, $0x00000040
DATA block_len<>+8(SB)/4, $0x00000040
DATA block_len<>+12(SB)/4, $0x00000040
DATA block_len<>+16(SB)/4, $0x00000040
DATA block_len<>+20(SB)/4, $0x00000040
DATA block_len<>+24(SB)/4, $0x00000040
DATA block_len<>+28(SB)/4, $0x00000040
GLOBL block_len<>(SB), RODATA|NOPTR, $32

DATA stride_1024<>+0(SB)/4, $0x00000000
DATA stride_1024<>+4(SB)/4, $0x00000400
DATA stride_1024<>+8(SB)/4, $0x00000800
DATA stride_1024<>+12(SB)/4, $0x00000c00
DATA stride_1024<>+16(SB)/4, $0x00001000
DATA stride_1024<>+20(SB)/4, $0x00001400
DATA stride_1024<>+24(SB)/4, $0x00001800
DATA stride_1024<>+28(SB)/4, $0x00001c00
GLOBL stride_1024<>(SB), RODATA|NOPTR, $32

DATA increment_counter<>+0(SB)/8, $0x0000000000000000
DATA increment_counter<>+8(SB)/8, $0x0000000000000001
DATA increment_counter<>+16(SB)/8, $0x0000000000000002
DATA increment_counter<>+24(SB)/8, $0x0000000000000003
DATA increment_counter<>+32(SB)/8, $0x0000000000000004
DATA increment_counter<>+40(SB)/8, $0x0000000000000005
DATA increment_counter<>+48(SB)/8, $0x0000000000000006
DATA increment_counter<>+56(SB)/8, $0x0000000000000007
GLOBL increment_counter<>(SB), RODATA|NOPTR, $64

DATA set_flags<>+0(SB)/4, $0x00000001
DATA set_flags<>+4(SB)/4, $0x00000000
DATA set_flags<>+8(SB)/4, $0x00000000
DATA set_flags<>+12(SB)/4, $0x00000000
DATA set_flags<>+16(SB)/4, $0x00000000
DATA set_flags<>+20(SB)/4, $0x00000000
DATA set_flags<>+24(SB)/4, $0x00000000
DATA set_flags<>+28(SB)/4, $0x00000000
DATA set_flags<>+32(SB)/4, $0x00000000
DATA set_flags<>+36(SB)/4, $0x00000000
DATA set_flags<>+40(SB)/4, $0x00000000
DATA set_flags<>+44(SB)/4, $0x00000000
DATA set_flags<>+48(SB)/4, $0x00000000
DATA set_flags<>+52(SB)/4, $0x00000000
DATA set_flags<>+56(SB)/4, $0x00000000
DATA set_flags<>+60(SB)/4, $0x00000002
GLOBL set_flags<>(SB), RODATA|NOPTR, $64

DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32

DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32

// func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
	MOVQ out+0(FP), AX
	MOVQ block+8(FP), CX
	MOVQ cv+16(FP), DX

	// Load block
	VPBROADCASTD (CX), Y0
	VMOVDQU      Y0, (SP)
	VPBROADCASTD 4(CX), Y0
	VMOVDQU      Y0, 32(SP)
	VPBROADCASTD 8(CX), Y0
	VMOVDQU      Y0, 64(SP)
	VPBROADCASTD 12(CX), Y0
	VMOVDQU      Y0, 96(SP)
	VPBROADCASTD 16(CX), Y0
	VMOVDQU      Y0, 128(SP)
	VPBROADCASTD 20(CX), Y0
	VMOVDQU      Y0, 160(SP)
	VPBROADCASTD 24(CX), Y0
	VMOVDQU      Y0, 192(SP)
	VPBROADCASTD 28(CX), Y0
	VMOVDQU      Y0, 224(SP)
	VPBROADCASTD 32(CX), Y0
	VMOVDQU      Y0, 256(SP)
	VPBROADCASTD
36(CX), Y0 VMOVDQU Y0, 288(SP) VPBROADCASTD 40(CX), Y0 VMOVDQU Y0, 320(SP) VPBROADCASTD 44(CX), Y0 VMOVDQU Y0, 352(SP) VPBROADCASTD 48(CX), Y0 VMOVDQU Y0, 384(SP) VPBROADCASTD 52(CX), Y0 VMOVDQU Y0, 416(SP) VPBROADCASTD 56(CX), Y0 VMOVDQU Y0, 448(SP) VPBROADCASTD 60(CX), Y0 VMOVDQU Y0, 480(SP) // Initialize state vectors VPBROADCASTD (DX), Y0 VPBROADCASTD 4(DX), Y1 VPBROADCASTD 8(DX), Y2 VPBROADCASTD 12(DX), Y3 VPBROADCASTD 16(DX), Y4 VPBROADCASTD 20(DX), Y5 VPBROADCASTD 24(DX), Y6 VPBROADCASTD 28(DX), Y7 VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VPBROADCASTQ counter+24(FP), Y12 VPBROADCASTQ counter+24(FP), Y13 VPADDQ increment_counter<>+0(SB), Y12, Y12 VPADDQ increment_counter<>+32(SB), Y13, Y13 VPUNPCKLDQ Y13, Y12, Y14 VPUNPCKHDQ Y13, Y12, Y15 VPUNPCKLDQ Y15, Y14, Y12 VPUNPCKHDQ Y15, Y14, Y13 VPERMQ $0xd8, Y12, Y12 VPERMQ $0xd8, Y13, Y13 VPBROADCASTD blockLen+32(FP), Y14 VPBROADCASTD flags+36(FP), Y15 VMOVDQU Y8, 512(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 
512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR 
Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 
VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR 
Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB 
shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 
VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 512(SP), Y8 // Finalize CVs VMOVDQU Y8, 256(SP) VMOVDQU Y9, 288(SP) VMOVDQU Y10, 320(SP) VMOVDQU Y11, 352(SP) VMOVDQU Y12, 384(SP) VMOVDQU Y13, 416(SP) VMOVDQU Y14, 448(SP) VMOVDQU Y15, 480(SP) VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 
VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 64(AX) VMOVDQU Y10, 128(AX) VMOVDQU Y11, 192(AX) VMOVDQU Y12, 256(AX) VMOVDQU Y13, 320(AX) VMOVDQU Y14, 384(AX) VMOVDQU Y15, 448(AX) VMOVDQU 256(SP), Y8 VMOVDQU 288(SP), Y9 VMOVDQU 320(SP), Y10 VMOVDQU 352(SP), Y11 VMOVDQU 384(SP), Y12 VMOVDQU 416(SP), Y13 VMOVDQU 448(SP), Y14 VMOVDQU 480(SP), Y15 VPBROADCASTD (DX), Y0 VPXOR Y0, Y8, Y8 VPBROADCASTD 4(DX), Y0 VPXOR Y0, Y9, Y9 VPBROADCASTD 8(DX), Y0 VPXOR Y0, Y10, Y10 VPBROADCASTD 12(DX), Y0 VPXOR Y0, Y11, Y11 VPBROADCASTD 16(DX), Y0 VPXOR Y0, Y12, Y12 VPBROADCASTD 20(DX), Y0 VPXOR Y0, Y13, Y13 VPBROADCASTD 24(DX), Y0 VPXOR Y0, Y14, Y14 VPBROADCASTD 28(DX), Y0 VPXOR Y0, Y15, Y15 VPUNPCKLDQ Y9, Y8, Y0 VPUNPCKHDQ Y9, Y8, Y1 VPUNPCKLDQ Y11, Y10, Y2 VPUNPCKHDQ Y11, Y10, Y3 VPUNPCKLDQ Y13, Y12, Y4 VPUNPCKHDQ Y13, Y12, Y5 VPUNPCKLDQ Y15, Y14, Y6 VPUNPCKHDQ Y15, Y14, Y7 VPUNPCKLQDQ Y2, Y0, Y8 VPUNPCKHQDQ Y2, Y0, Y9 VPUNPCKLQDQ Y3, Y1, Y10 VPUNPCKHQDQ Y3, Y1, Y11 VPUNPCKLQDQ Y6, Y4, Y12 VPUNPCKHQDQ Y6, Y4, Y13 VPUNPCKLQDQ Y7, Y5, Y14 VPUNPCKHQDQ Y7, Y5, Y15 VPERM2I128 $0x20, Y12, Y8, Y0 VPERM2I128 $0x31, Y12, Y8, Y4 VPERM2I128 $0x20, Y13, Y9, Y1 VPERM2I128 $0x31, Y13, Y9, Y5 VPERM2I128 $0x20, Y14, Y10, Y2 VPERM2I128 $0x31, Y14, Y10, Y6 VPERM2I128 $0x20, Y15, Y11, Y3 VPERM2I128 $0x31, Y15, Y11, Y7 VMOVDQU Y0, 32(AX) VMOVDQU Y1, 96(AX) VMOVDQU Y2, 160(AX) VMOVDQU Y3, 224(AX) VMOVDQU Y4, 288(AX) VMOVDQU Y5, 352(AX) VMOVDQU Y6, 416(AX) VMOVDQU Y7, 480(AX) RET // func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) // Requires: AVX, AVX2 TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40 MOVQ cvs+0(FP), AX MOVQ buf+8(FP), CX MOVQ key+16(FP), DX // Load key VPBROADCASTD (DX), Y0 VPBROADCASTD 4(DX), Y1 VPBROADCASTD 8(DX), Y2 VPBROADCASTD 12(DX), Y3 VPBROADCASTD 16(DX), Y4 VPBROADCASTD 20(DX), Y5 VPBROADCASTD 24(DX), Y6 VPBROADCASTD 28(DX), Y7 // Initialize counter VPBROADCASTQ counter+24(FP), Y12 VPBROADCASTQ counter+24(FP), Y13 VPADDQ increment_counter<>+0(SB), Y12, Y12 VPADDQ increment_counter<>+32(SB), Y13, Y13 VPUNPCKLDQ Y13, Y12, Y14 VPUNPCKHDQ Y13, Y12, Y15 VPUNPCKLDQ Y15, Y14, Y12 VPUNPCKHDQ Y15, Y14, Y13 VPERMQ $0xd8, Y12, Y12 VPERMQ $0xd8, Y13, Y13 VMOVDQU Y12, 544(SP) VMOVDQU Y13, 576(SP) // Initialize flags VPBROADCASTD flags+32(FP), Y14 VPOR set_flags<>+0(SB), Y14, Y15 VMOVDQU Y15, 608(SP) VPOR set_flags<>+32(SB), Y14, Y15 VMOVDQU Y15, 640(SP) // Loop index XORQ DX, DX loop: // Load transposed block VMOVDQU stride_1024<>+0(SB), Y9 VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, (CX)(Y9*1), Y10 VMOVDQU Y10, (SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 4(CX)(Y9*1), Y10 VMOVDQU Y10, 32(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 8(CX)(Y9*1), Y10 VMOVDQU Y10, 64(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 12(CX)(Y9*1), Y10 VMOVDQU Y10, 96(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 16(CX)(Y9*1), Y10 VMOVDQU Y10, 128(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 20(CX)(Y9*1), Y10 VMOVDQU Y10, 160(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 24(CX)(Y9*1), Y10 VMOVDQU Y10, 192(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 28(CX)(Y9*1), Y10 VMOVDQU Y10, 224(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 32(CX)(Y9*1), Y10 VMOVDQU Y10, 256(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 36(CX)(Y9*1), Y10 VMOVDQU Y10, 288(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 40(CX)(Y9*1), Y10 VMOVDQU Y10, 320(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 44(CX)(Y9*1), Y10 VMOVDQU Y10, 352(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 48(CX)(Y9*1), Y10 
VMOVDQU Y10, 384(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 52(CX)(Y9*1), Y10 VMOVDQU Y10, 416(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 56(CX)(Y9*1), Y10 VMOVDQU Y10, 448(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 60(CX)(Y9*1), Y10 VMOVDQU Y10, 480(SP) ADDQ $0x40, CX // Reload state vectors (other than CVs) VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VMOVDQU 544(SP), Y12 VMOVDQU 576(SP), Y13 VMOVDQU block_len<>+0(SB), Y14 VPBROADCASTD 608(SP)(DX*4), Y15 VMOVDQU Y8, 512(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, 
Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, 
Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 
VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, 
Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, 
Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, 
Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 512(SP), Y8 // Finalize CVs VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 // Loop INCQ DX CMPQ DX, $0x00000010 JNE loop // Finished; transpose CVs VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 32(AX) VMOVDQU Y10, 64(AX) VMOVDQU Y11, 96(AX) VMOVDQU Y12, 128(AX) VMOVDQU Y13, 160(AX) VMOVDQU Y14, 192(AX) VMOVDQU Y15, 224(AX) RET
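
// The repeated VPADDD/VPXOR/VPSHUFB/VPSRLD/VPSLLD/VPOR runs in both functions
// above each perform one half of the BLAKE3 g quarter-round on eight
// independent 32-bit lanes: the 16- and 8-bit rotations are byte shuffles
// through shuffle_rot16/shuffle_rot8, and the 12- and 7-bit rotations are
// shift/or pairs ($0x0c/$0x14 and $0x07/$0x19). A minimal per-lane Go sketch
// of that quarter-round, with illustrative names (hand-written comment, not
// part of the gen.go output):
//
//	import "math/bits"
//
//	// g mixes one column or diagonal of the 4x4 state with two message words.
//	func g(state *[16]uint32, a, b, c, d int, mx, my uint32) {
//		state[a] += state[b] + mx
//		state[d] = bits.RotateLeft32(state[d]^state[a], -16) // VPSHUFB shuffle_rot16
//		state[c] += state[d]
//		state[b] = bits.RotateLeft32(state[b]^state[c], -12) // VPSRLD $12 / VPSLLD $20
//		state[a] += state[b] + my
//		state[d] = bits.RotateLeft32(state[d]^state[a], -8) // VPSHUFB shuffle_rot8
//		state[c] += state[d]
//		state[b] = bits.RotateLeft32(state[b]^state[c], -7) // VPSRLD $7 / VPSLLD $25
//	}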