// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.

#include "textflag.h"

// iv holds the four 32-bit IV words that seed state words v8..v11.
DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16

// seq holds the 32-bit lane indices 0..15.
DATA seq<>+0(SB)/4, $0x00000000
DATA seq<>+4(SB)/4, $0x00000001
DATA seq<>+8(SB)/4, $0x00000002
DATA seq<>+12(SB)/4, $0x00000003
DATA seq<>+16(SB)/4, $0x00000004
DATA seq<>+20(SB)/4, $0x00000005
DATA seq<>+24(SB)/4, $0x00000006
DATA seq<>+28(SB)/4, $0x00000007
DATA seq<>+32(SB)/4, $0x00000008
DATA seq<>+36(SB)/4, $0x00000009
DATA seq<>+40(SB)/4, $0x0000000a
DATA seq<>+44(SB)/4, $0x0000000b
DATA seq<>+48(SB)/4, $0x0000000c
DATA seq<>+52(SB)/4, $0x0000000d
DATA seq<>+56(SB)/4, $0x0000000e
DATA seq<>+60(SB)/4, $0x0000000f
GLOBL seq<>(SB), RODATA|NOPTR, $64

// seq64 holds the 64-bit indices 0..7.
DATA seq64<>+0(SB)/8, $0x0000000000000000
DATA seq64<>+8(SB)/8, $0x0000000000000001
DATA seq64<>+16(SB)/8, $0x0000000000000002
DATA seq64<>+24(SB)/8, $0x0000000000000003
DATA seq64<>+32(SB)/8, $0x0000000000000004
DATA seq64<>+40(SB)/8, $0x0000000000000005
DATA seq64<>+48(SB)/8, $0x0000000000000006
DATA seq64<>+56(SB)/8, $0x0000000000000007
GLOBL seq64<>(SB), RODATA|NOPTR, $64

// shuffle_rot8 / shuffle_rot16 are byte-index tables. They are not referenced
// by the AVX-512 code below; presumably they are byte-shuffle masks for
// another code path in this file (TODO confirm).
DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32

DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32

// BLAKE3_G applies one quarter-round to the state vectors a, b, c, d, mixing
// in the message vectors mx and my. Right-rotations are 16, 12, 8 and 7 bits.
#define BLAKE3_G(a, b, c, d, mx, my) \
	VPADDD a, b, a; \
	VPADDD mx, a, a; \
	VPXORD d, a, d; \
	VPRORD $0x10, d, d; \
	VPADDD c, d, c; \
	VPXORD b, c, b; \
	VPRORD $0x0c, b, b; \
	VPADDD a, b, a; \
	VPADDD my, a, a; \
	VPXORD d, a, d; \
	VPRORD $0x08, d, d; \
	VPADDD c, d, c; \
	VPXORD b, c, b; \
	VPRORD $0x07, b, b

// BLAKE3_ROUND runs one full round over the state held in the even registers
// Z0..Z30 (state word i lives in Z(2*i)): four column mixes followed by four
// diagonal mixes. m0..m15 are the message registers in this round's order.
#define BLAKE3_ROUND(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15) \
	BLAKE3_G(Z0, Z8, Z16, Z24, m0, m1); \
	BLAKE3_G(Z2, Z10, Z18, Z26, m2, m3); \
	BLAKE3_G(Z4, Z12, Z20, Z28, m4, m5); \
	BLAKE3_G(Z6, Z14, Z22, Z30, m6, m7); \
	BLAKE3_G(Z0, Z10, Z20, Z30, m8, m9); \
	BLAKE3_G(Z2, Z12, Z22, Z24, m10, m11); \
	BLAKE3_G(Z4, Z14, Z16, Z26, m12, m13); \
	BLAKE3_G(Z6, Z8, Z18, Z28, m14, m15)

// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX512F
//
// Compresses the same block/cv/blockLen/flags in 16 parallel lanes, where
// lane i uses counter+i, and writes lane i's 16 output words to
// out[64*i : 64*i+64].
//
// Register use:
//   AX = out, CX = block, DX = cv
//   Z1, Z3, ..., Z31 = message words 0..15, each broadcast to every lane
//   Z0, Z2, ..., Z30 = state words 0..15, one lane per counter value
//   K1               = carry mask for the counter, later the scatter mask
//
// NOTE(review): the scatter mask is built with KXNORW rather than KXNORD.
// KXNORD is an AVX512BW instruction, while VPSCATTERDD with a ZMM index only
// consumes mask bits 15:0, so the AVX512F-only KXNORW is equivalent here and
// keeps the "Requires" line accurate. This file is generated: the macros and
// the KXNORW change must be mirrored in gen.go or regeneration will drop them.
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
	MOVQ out+0(FP), AX
	MOVQ block+8(FP), CX
	MOVQ cv+16(FP), DX

	// Initialize block vectors
	VPBROADCASTD (CX), Z1
	VPBROADCASTD 4(CX), Z3
	VPBROADCASTD 8(CX), Z5
	VPBROADCASTD 12(CX), Z7
	VPBROADCASTD 16(CX), Z9
	VPBROADCASTD 20(CX), Z11
	VPBROADCASTD 24(CX), Z13
	VPBROADCASTD 28(CX), Z15
	VPBROADCASTD 32(CX), Z17
	VPBROADCASTD 36(CX), Z19
	VPBROADCASTD 40(CX), Z21
	VPBROADCASTD 44(CX), Z23
	VPBROADCASTD 48(CX), Z25
	VPBROADCASTD 52(CX), Z27
	VPBROADCASTD 56(CX), Z29
	VPBROADCASTD 60(CX), Z31

	// Initialize state vectors
	VPBROADCASTD (DX), Z0
	VPBROADCASTD 4(DX), Z2
	VPBROADCASTD 8(DX), Z4
	VPBROADCASTD 12(DX), Z6
	VPBROADCASTD 16(DX), Z8
	VPBROADCASTD 20(DX), Z10
	VPBROADCASTD 24(DX), Z12
	VPBROADCASTD 28(DX), Z14
	VPBROADCASTD iv<>+0(SB), Z16
	VPBROADCASTD iv<>+4(SB), Z18
	VPBROADCASTD iv<>+8(SB), Z20
	VPBROADCASTD iv<>+12(SB), Z22

	// Per-lane 64-bit counter: low word = counter.lo + lane index. K1 marks
	// the lanes whose low word wrapped (sum < lane index); only those lanes
	// get +1 added to the high word.
	VPBROADCASTD counter+24(FP), Z24
	VPADDD       seq<>+0(SB), Z24, Z24
	VPCMPUD      $0x01, seq<>+0(SB), Z24, K1
	VPBROADCASTD counter+28(FP), Z26
	VPADDD.BCST  seq<>+4(SB), Z26, K1, Z26
	VPBROADCASTD blockLen+32(FP), Z28
	VPBROADCASTD flags+36(FP), Z30

	// Round 1
	BLAKE3_ROUND(Z1, Z3, Z5, Z7, Z9, Z11, Z13, Z15, Z17, Z19, Z21, Z23, Z25, Z27, Z29, Z31)

	// Round 2
	BLAKE3_ROUND(Z5, Z13, Z7, Z21, Z15, Z1, Z9, Z27, Z3, Z23, Z25, Z11, Z19, Z29, Z31, Z17)

	// Round 3
	BLAKE3_ROUND(Z7, Z9, Z21, Z25, Z27, Z5, Z15, Z29, Z13, Z11, Z19, Z1, Z23, Z31, Z17, Z3)

	// Round 4
	BLAKE3_ROUND(Z21, Z15, Z25, Z19, Z29, Z7, Z27, Z31, Z9, Z1, Z23, Z5, Z11, Z17, Z3, Z13)

	// Round 5
	BLAKE3_ROUND(Z25, Z27, Z19, Z23, Z31, Z21, Z29, Z17, Z15, Z5, Z11, Z7, Z1, Z3, Z13, Z9)

	// Round 6
	BLAKE3_ROUND(Z19, Z29, Z23, Z11, Z17, Z25, Z31, Z3, Z27, Z7, Z1, Z21, Z5, Z13, Z9, Z15)

	// Round 7
	BLAKE3_ROUND(Z23, Z31, Z11, Z1, Z3, Z19, Z17, Z13, Z29, Z21, Z5, Z25, Z7, Z9, Z15, Z27)

	// Finalize CVs: out[i] = v[i] ^ v[i+8] and out[i+8] = v[i+8] ^ cv[i].
	// The first group must run first because it reads the un-modified upper
	// half of the state.
	VPXORD      Z0, Z16, Z0
	VPXORD      Z2, Z18, Z2
	VPXORD      Z4, Z20, Z4
	VPXORD      Z6, Z22, Z6
	VPXORD      Z8, Z24, Z8
	VPXORD      Z10, Z26, Z10
	VPXORD      Z12, Z28, Z12
	VPXORD      Z14, Z30, Z14
	VPXORD.BCST (DX), Z16, Z16
	VPXORD.BCST 4(DX), Z18, Z18
	VPXORD.BCST 8(DX), Z20, Z20
	VPXORD.BCST 12(DX), Z22, Z22
	VPXORD.BCST 16(DX), Z24, Z24
	VPXORD.BCST 20(DX), Z26, Z26
	VPXORD.BCST 24(DX), Z28, Z28
	VPXORD.BCST 28(DX), Z30, Z30

	// Store transposed: Z1 (message word no longer needed) becomes the byte
	// offset of each lane's output, lane*64. The scatter clears its mask on
	// completion, so K1 is reset to all-ones (16 lanes) before every store.
	VMOVDQU32   seq<>+0(SB), Z1
	VPSLLD      $0x06, Z1, Z1
	KXNORW      K1, K1, K1
	VPSCATTERDD Z0, K1, (AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z16, K1, 32(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
	KXNORW      K1, K1, K1
	VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
	RET

// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX512F
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-40
	MOVQ cvs+0(FP), AX
	MOVQ buf+8(FP), CX
	MOVQ key+16(FP), DX

	// Initialize counter
	VPBROADCASTD counter+24(FP), Z0
	VPADDD seq<>+0(SB), Z0, Z0
	VPCMPUD $0x01, seq<>+0(SB), Z0, K1
	VPBROADCASTD counter+28(FP), Z2
	VPADDD.BCST seq<>+4(SB), Z2, K1, Z2
	VMOVDQU32 Z0, (SP)
	VMOVDQU32 Z2, 64(SP)

	// Initialize flags
	VPBROADCASTD flags+32(FP), Z0
	VMOVDQU32 Z0, 128(SP)
	ORL $0x01, 128(SP)
	ORL $0x02, 188(SP)

	// Load key
	VPBROADCASTD (DX), Z0
	VPBROADCASTD 4(DX), Z2
	VPBROADCASTD 8(DX), Z4
	VPBROADCASTD 12(DX), Z6
	VPBROADCASTD 16(DX), Z8
	VPBROADCASTD 20(DX), Z10
	VPBROADCASTD 24(DX), Z12
	VPBROADCASTD 28(DX), Z14

	// Loop index
	XORQ DX, DX

loop:
	// Load transposed block
	VMOVDQU32 seq<>+0(SB), Z16
	VPSLLD $0x0a, Z16, Z16
	KXNORD K1, K1, K1
	VPGATHERDD (CX)(Z16*1), K1, Z1
	KXNORD K1, K1, K1
	VPGATHERDD 4(CX)(Z16*1), K1, Z3
	KXNORD K1, K1, K1
	VPGATHERDD 8(CX)(Z16*1), K1, Z5
	KXNORD K1, K1, K1
	VPGATHERDD 12(CX)(Z16*1), K1, Z7
	KXNORD K1, K1, K1
	VPGATHERDD 16(CX)(Z16*1), K1, Z9
	KXNORD K1, K1, K1
	VPGATHERDD 20(CX)(Z16*1), K1, Z11
	KXNORD K1, K1, K1
	VPGATHERDD
24(CX)(Z16*1), K1, Z13 KXNORD K1, K1, K1 VPGATHERDD 28(CX)(Z16*1), K1, Z15 KXNORD K1, K1, K1 VPGATHERDD 32(CX)(Z16*1), K1, Z17 KXNORD K1, K1, K1 VPGATHERDD 36(CX)(Z16*1), K1, Z19 KXNORD K1, K1, K1 VPGATHERDD 40(CX)(Z16*1), K1, Z21 KXNORD K1, K1, K1 VPGATHERDD 44(CX)(Z16*1), K1, Z23 KXNORD K1, K1, K1 VPGATHERDD 48(CX)(Z16*1), K1, Z25 KXNORD K1, K1, K1 VPGATHERDD 52(CX)(Z16*1), K1, Z27 KXNORD K1, K1, K1 VPGATHERDD 56(CX)(Z16*1), K1, Z29 KXNORD K1, K1, K1 VPGATHERDD 60(CX)(Z16*1), K1, Z31 ADDQ $0x40, CX // Reload state vectors (other than CVs) VPBROADCASTD iv<>+0(SB), Z16 VPBROADCASTD iv<>+4(SB), Z18 VPBROADCASTD iv<>+8(SB), Z20 VPBROADCASTD iv<>+12(SB), Z22 VMOVDQU32 (SP), Z24 VMOVDQU32 64(SP), Z26 VPBROADCASTD seq<>+4(SB), Z28 VPSLLD $0x06, Z28, Z28 VPBROADCASTD 128(SP)(DX*4), Z30 // Round 1 VPADDD Z0, Z8, Z0 VPADDD Z1, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z3, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z5, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z7, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z9, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z11, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z13, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z15, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z17, Z0, Z0 VPXORD 
Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z19, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z21, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z23, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z25, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z27, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z29, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z31, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 2 VPADDD Z0, Z8, Z0 VPADDD Z5, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z13, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z7, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z21, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z15, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z1, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, 
Z14, Z6 VPADDD Z9, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z27, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z3, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z23, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z25, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z11, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z19, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z29, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z31, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z17, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 3 VPADDD Z0, Z8, Z0 VPADDD Z7, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z9, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z21, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z25, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 
VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z27, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z5, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z15, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z29, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z13, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z11, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z19, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z1, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z23, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z31, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z17, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z3, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 4 VPADDD Z0, Z8, Z0 VPADDD Z21, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z15, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 
VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z25, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z19, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z29, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z7, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z27, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z31, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z9, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z1, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z23, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z5, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z11, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z17, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z3, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z13, Z6, Z6 VPXORD Z28, 
Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 5 VPADDD Z0, Z8, Z0 VPADDD Z25, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z27, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z19, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z23, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z31, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z21, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z29, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z17, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z15, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z5, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z11, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z7, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z1, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, 
Z14, Z4 VPADDD Z3, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z13, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z9, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 6 VPADDD Z0, Z8, Z0 VPADDD Z19, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z29, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z23, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z11, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z17, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z25, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z31, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z3, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z27, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z7, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z1, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 
VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z21, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z5, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z13, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z9, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z15, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 7 VPADDD Z0, Z8, Z0 VPADDD Z23, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z31, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z11, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z1, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z3, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z19, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z17, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z13, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z29, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD 
Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Finalize CVs
	VPXORD Z0, Z16, Z0
	VPXORD Z2, Z18, Z2
	VPXORD Z4, Z20, Z4
	VPXORD Z6, Z22, Z6
	VPXORD Z8, Z24, Z8
	VPXORD Z10, Z26, Z10
	VPXORD Z12, Z28, Z12
	VPXORD Z14, Z30, Z14

	// Loop
	INCQ DX
	CMPQ DX, $0x00000010
	JNE loop

	// Finished; transpose CVs
	VMOVDQU32 seq<>+0(SB), Z16
	VPSLLD $0x05, Z16, Z16
	KXNORD K1, K1, K1
	VPSCATTERDD Z0, K1, (AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z2, K1, 4(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z4, K1, 8(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z6, K1, 12(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z8, K1, 16(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z10, K1, 20(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z12, K1, 24(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z14, K1, 28(AX)(Z16*1)
	RET

// func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX, AVX2
//
// Compresses one block under eight consecutive counter values at once. The
// same block, cv, blockLen and flags are broadcast to all eight 32-bit lanes
// of each YMM register; only the counter differs per lane (counter+0 ...
// counter+7). Lane i's 64-byte result is written to out[64*i : 64*i+64].
//
// Register and stack roles:
//   AX, CX, DX    out, block and cv pointers
//   Y0-Y7         state words 0-7, initialised from cv[0..7]
//   Y9-Y11        state words 9-11, initialised from iv[1..3]
//   512(SP)       state word 8, initialised from iv[0]. Y8 doubles as the
//                 scratch register of the shift/or rotations, so word 8 is
//                 reloaded from and written back to this slot when needed.
//   Y12, Y13      state words 12 and 13: low and high 32 bits of each lane's
//                 64-bit counter
//   Y14, Y15      state words 14 and 15: blockLen and flags
//   0..480(SP)    the 16 block words, each broadcast across a 32-byte slot
//
// Each round is eight mixing steps: four on the state columns, then four on
// the diagonals. Rotations by 16 and 8 bits use VPSHUFB with the
// shuffle_rot16/shuffle_rot8 tables; rotations by 12 and 7 bits use
// VPSRLD/VPSLLD/VPOR.
TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
	MOVQ out+0(FP), AX
	MOVQ block+8(FP), CX
	MOVQ cv+16(FP), DX

	// Load block: broadcast each word to all lanes and park it on the stack
	VPBROADCASTD (CX), Y0
	VMOVDQU Y0, (SP)
	VPBROADCASTD 4(CX), Y0
	VMOVDQU Y0, 32(SP)
	VPBROADCASTD 8(CX), Y0
	VMOVDQU Y0, 64(SP)
	VPBROADCASTD 12(CX), Y0
	VMOVDQU Y0, 96(SP)
	VPBROADCASTD 16(CX), Y0
	VMOVDQU Y0, 128(SP)
	VPBROADCASTD 20(CX), Y0
	VMOVDQU Y0, 160(SP)
	VPBROADCASTD 24(CX), Y0
	VMOVDQU Y0, 192(SP)
	VPBROADCASTD 28(CX), Y0
	VMOVDQU Y0, 224(SP)
	VPBROADCASTD 32(CX), Y0
	VMOVDQU Y0, 256(SP)
	VPBROADCASTD 36(CX), Y0
	VMOVDQU Y0, 288(SP)
	VPBROADCASTD 40(CX), Y0
	VMOVDQU Y0, 320(SP)
	VPBROADCASTD 44(CX), Y0
	VMOVDQU Y0, 352(SP)
	VPBROADCASTD 48(CX), Y0
	VMOVDQU Y0, 384(SP)
	VPBROADCASTD 52(CX), Y0
	VMOVDQU Y0, 416(SP)
	VPBROADCASTD 56(CX), Y0
	VMOVDQU Y0, 448(SP)
	VPBROADCASTD 60(CX), Y0
	VMOVDQU Y0, 480(SP)

	// Initialize state vectors
	VPBROADCASTD (DX), Y0
	VPBROADCASTD 4(DX), Y1
	VPBROADCASTD 8(DX), Y2
	VPBROADCASTD 12(DX), Y3
	VPBROADCASTD 16(DX), Y4
	VPBROADCASTD 20(DX), Y5
	VPBROADCASTD 24(DX), Y6
	VPBROADCASTD 28(DX), Y7
	VPBROADCASTD iv<>+0(SB), Y8
	VPBROADCASTD iv<>+4(SB), Y9
	VPBROADCASTD iv<>+8(SB), Y10
	VPBROADCASTD iv<>+12(SB), Y11

	// Build counter+0..7 as 64-bit values, then de-interleave them so that
	// Y12 holds the eight low halves and Y13 the eight high halves, in lane order.
	VPBROADCASTQ counter+24(FP), Y12
	VPBROADCASTQ counter+24(FP), Y13
	VPADDQ seq64<>+0(SB), Y12, Y12
	VPADDQ seq64<>+32(SB), Y13, Y13
	VPUNPCKLDQ Y13, Y12, Y14
	VPUNPCKHDQ Y13, Y12, Y15
	VPUNPCKLDQ Y15, Y14, Y12
	VPUNPCKHDQ Y15, Y14, Y13
	VPERMQ $0xd8, Y12, Y12
	VPERMQ $0xd8, Y13, Y13
	VPBROADCASTD blockLen+32(FP), Y14
	VPBROADCASTD flags+36(FP), Y15

	// Spill state word 8 so Y8 is free as rotation scratch
	VMOVDQU Y8, 512(SP)

	// Round 1 (block words 0,1 2,3 4,5 6,7 | 8,9 10,11 12,13 14,15)
	VPADDD Y0, Y4, Y0
	VPADDD (SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 32(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	VPADDD Y1, Y5, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y2, Y6, Y2
	VPADDD 128(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 160(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y3, Y7, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y0, Y5, Y0
	VPADDD 256(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 288(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y1, Y6, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y2, Y7, Y2
	VPADDD 384(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 416(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y3, Y4, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 2 (block words 2,6 3,10 7,0 4,13 | 1,11 12,5 9,14 15,8)
	VPADDD Y0, Y4, Y0
	VPADDD 64(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 192(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	VPADDD Y1, Y5, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y2, Y6, Y2
	VPADDD 224(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD (SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y3, Y7, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y0, Y5, Y0
	VPADDD 32(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 352(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y1, Y6, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y2, Y7, Y2
	VPADDD 288(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 448(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y3, Y4, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 3 (block words 3,4 10,12 13,2 7,14 | 6,5 9,0 11,15 8,1)
	VPADDD Y0, Y4, Y0
	VPADDD 96(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 128(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	VPADDD Y1, Y5, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y2, Y6, Y2
	VPADDD 416(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 64(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y3, Y7, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y0, Y5, Y0
	VPADDD 192(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 160(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y1, Y6, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y2, Y7, Y2
	VPADDD 352(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 480(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y3, Y4, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 4 (block words 10,7 12,9 14,3 13,15 | 4,0 11,2 5,8 1,6)
	VPADDD Y0, Y4, Y0
	VPADDD 320(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 224(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	VPADDD Y1, Y5, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y2, Y6, Y2
	VPADDD 448(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 96(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y3, Y7, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y0, Y5, Y0
	VPADDD 128(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD (SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y1, Y6, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y2, Y7, Y2
	VPADDD 160(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 256(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y3, Y4, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 5 (block words 12,13 9,11 15,10 14,8 | 7,2 5,3 0,1 6,4)
	VPADDD Y0, Y4, Y0
	VPADDD 384(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 416(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	VPADDD Y1, Y5, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y2, Y6, Y2
	VPADDD 480(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 320(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y3, Y7, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y0, Y5, Y0
	VPADDD 224(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 64(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y1, Y6, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y2, Y7, Y2
	VPADDD (SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 32(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y3, Y4, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 6 (block words 9,14 11,5 8,12 15,1 | 13,3 0,10 2,6 4,7)
	VPADDD Y0, Y4, Y0
	VPADDD 288(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 448(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	VPADDD Y1, Y5, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y2, Y6, Y2
	VPADDD 256(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 384(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y3, Y7, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y0, Y5, Y0
	VPADDD 416(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 96(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y1, Y6, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y2, Y7, Y2
	VPADDD 64(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 192(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y3, Y4, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 7 (block words 11,15 5,0 1,9 8,6 | 14,10 2,12 3,4 7,13)
	VPADDD Y0, Y4, Y0
	VPADDD 352(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 480(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	VPADDD Y1, Y5, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y2, Y6, Y2
	VPADDD 32(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 288(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y3, Y7, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y0, Y5, Y0
	VPADDD 448(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 320(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	VPADDD Y1, Y6, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	VPADDD Y2, Y7, Y2
	VPADDD 96(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 128(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	VPADDD Y3, Y4, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Bring state word 8 back into its register
	VMOVDQU 512(SP), Y8

	// Finalize CVs
	// Save state words 8-15 in the stack slots of block words 8-15, which
	// are no longer needed; they are reloaded for the second output half.
	VMOVDQU Y8, 256(SP)
	VMOVDQU Y9, 288(SP)
	VMOVDQU Y10, 320(SP)
	VMOVDQU Y11, 352(SP)
	VMOVDQU Y12, 384(SP)
	VMOVDQU Y13, 416(SP)
	VMOVDQU Y14, 448(SP)
	VMOVDQU Y15, 480(SP)

	// First output half: state[i] ^ state[i+8], for i = 0..7
	VPXOR Y0, Y8, Y0
	VPXOR Y1, Y9, Y1
	VPXOR Y2, Y10, Y2
	VPXOR Y3, Y11, Y3
	VPXOR Y4, Y12, Y4
	VPXOR Y5, Y13, Y5
	VPXOR Y6, Y14, Y6
	VPXOR Y7, Y15, Y7

	// 8x8 dword transpose: word-per-register becomes lane-per-register
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKHDQ Y1, Y0, Y9
	VPUNPCKLDQ Y3, Y2, Y10
	VPUNPCKHDQ Y3, Y2, Y11
	VPUNPCKLDQ Y5, Y4, Y12
	VPUNPCKHDQ Y5, Y4, Y13
	VPUNPCKLDQ Y7, Y6, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y10, Y8, Y0
	VPUNPCKHQDQ Y10, Y8, Y1
	VPUNPCKLQDQ Y11, Y9, Y2
	VPUNPCKHQDQ Y11, Y9, Y3
	VPUNPCKLQDQ Y14, Y12, Y4
	VPUNPCKHQDQ Y14, Y12, Y5
	VPUNPCKLQDQ Y15, Y13, Y6
	VPUNPCKHQDQ Y15, Y13, Y7
	VPERM2I128 $0x20, Y4, Y0, Y8
	VPERM2I128 $0x31, Y4, Y0, Y12
	VPERM2I128 $0x20, Y5, Y1, Y9
	VPERM2I128 $0x31, Y5, Y1, Y13
	VPERM2I128 $0x20, Y6, Y2, Y10
	VPERM2I128 $0x31, Y6, Y2, Y14
	VPERM2I128 $0x20, Y7, Y3, Y11
	VPERM2I128 $0x31, Y7, Y3, Y15

	// Lane i's first 32 bytes go to out[64*i]
	VMOVDQU Y8, (AX)
	VMOVDQU Y9, 64(AX)
	VMOVDQU Y10, 128(AX)
	VMOVDQU Y11, 192(AX)
	VMOVDQU Y12, 256(AX)
	VMOVDQU Y13, 320(AX)
	VMOVDQU Y14, 384(AX)
	VMOVDQU Y15, 448(AX)

	// Second output half: state[i+8] ^ cv[i], for i = 0..7
	VMOVDQU 256(SP), Y8
	VMOVDQU 288(SP), Y9
	VMOVDQU 320(SP), Y10
	VMOVDQU 352(SP), Y11
	VMOVDQU 384(SP), Y12
	VMOVDQU 416(SP), Y13
	VMOVDQU 448(SP), Y14
	VMOVDQU 480(SP), Y15
	VPBROADCASTD (DX), Y0
	VPXOR Y0, Y8, Y8
	VPBROADCASTD 4(DX), Y0
	VPXOR Y0, Y9, Y9
	VPBROADCASTD 8(DX), Y0
	VPXOR Y0, Y10, Y10
	VPBROADCASTD 12(DX), Y0
	VPXOR Y0, Y11, Y11
	VPBROADCASTD 16(DX), Y0
	VPXOR Y0, Y12, Y12
	VPBROADCASTD 20(DX), Y0
	VPXOR Y0, Y13, Y13
	VPBROADCASTD 24(DX), Y0
	VPXOR Y0, Y14, Y14
	VPBROADCASTD 28(DX), Y0
	VPXOR Y0, Y15, Y15

	// Same 8x8 transpose, now from Y8-Y15 into Y0-Y7
	VPUNPCKLDQ Y9, Y8, Y0
	VPUNPCKHDQ Y9, Y8, Y1
	VPUNPCKLDQ Y11, Y10, Y2
	VPUNPCKHDQ Y11, Y10, Y3
	VPUNPCKLDQ Y13, Y12, Y4
	VPUNPCKHDQ Y13, Y12, Y5
	VPUNPCKLDQ Y15, Y14, Y6
	VPUNPCKHDQ Y15, Y14, Y7
	VPUNPCKLQDQ Y2, Y0, Y8
	VPUNPCKHQDQ Y2, Y0, Y9
	VPUNPCKLQDQ Y3, Y1, Y10
	VPUNPCKHQDQ Y3, Y1, Y11
	VPUNPCKLQDQ Y6, Y4, Y12
	VPUNPCKHQDQ Y6, Y4, Y13
	VPUNPCKLQDQ Y7, Y5, Y14
	VPUNPCKHQDQ Y7, Y5, Y15
	VPERM2I128 $0x20, Y12, Y8, Y0
	VPERM2I128 $0x31, Y12, Y8, Y4
	VPERM2I128 $0x20, Y13, Y9, Y1
	VPERM2I128 $0x31, Y13, Y9, Y5
	VPERM2I128 $0x20, Y14, Y10, Y2
	VPERM2I128 $0x31, Y14, Y10, Y6
	VPERM2I128 $0x20, Y15, Y11, Y3
	VPERM2I128 $0x31, Y15, Y11, Y7

	// Lane i's last 32 bytes go to out[64*i+32]
	VMOVDQU Y0, 32(AX)
	VMOVDQU Y1, 96(AX)
	VMOVDQU Y2, 160(AX)
	VMOVDQU Y3, 224(AX)
	VMOVDQU Y4, 288(AX)
	VMOVDQU Y5, 352(AX)
	VMOVDQU Y6, 416(AX)
	VMOVDQU Y7, 480(AX)

	// Clear the upper halves of the YMM registers before returning, so
	// legacy-SSE code that runs next does not incur AVX/SSE transition
	// penalties. All results are already in memory.
	VZEROUPPER
	RET

// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40
	MOVQ cvs+0(FP), AX
	MOVQ buf+8(FP), CX
	MOVQ key+16(FP), DX

	// Load key
	VPBROADCASTD (DX), Y0
	VPBROADCASTD 4(DX), Y1
	VPBROADCASTD 8(DX), Y2
	VPBROADCASTD 12(DX), Y3
	VPBROADCASTD 16(DX), Y4
	VPBROADCASTD 20(DX), Y5
	VPBROADCASTD 24(DX), Y6
	VPBROADCASTD 28(DX), Y7

	// Initialize counter
	VPBROADCASTQ counter+24(FP), Y12
	VPBROADCASTQ counter+24(FP), Y13
	VPADDQ seq64<>+0(SB), Y12, Y12
	VPADDQ seq64<>+32(SB), Y13, Y13
	VPUNPCKLDQ Y13, Y12, Y14
	VPUNPCKHDQ Y13, Y12, Y15
	VPUNPCKLDQ Y15, Y14, Y12
	VPUNPCKHDQ Y15, Y14, Y13
	VPERMQ $0xd8, Y12, Y12
	VPERMQ $0xd8, Y13, Y13
	VMOVDQU Y12, 512(SP)
	VMOVDQU Y13, 544(SP)

	// Initialize flags
	VPBROADCASTD flags+32(FP), Y14
	VMOVDQU Y14, 576(SP)
	VMOVDQU Y14, 608(SP)
	ORL $0x01, 576(SP)
	ORL $0x02, 636(SP)

	// Loop index
	XORQ DX, DX

loop:
	// Load transposed block
	VMOVDQU seq<>+0(SB), Y9
	VPSLLD $0x0a, Y9, Y9
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, (CX)(Y9*1), Y10
	VMOVDQU Y10, (SP)
	VPCMPEQD
Y8, Y8, Y8 VPGATHERDD Y8, 4(CX)(Y9*1), Y10 VMOVDQU Y10, 32(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 8(CX)(Y9*1), Y10 VMOVDQU Y10, 64(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 12(CX)(Y9*1), Y10 VMOVDQU Y10, 96(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 16(CX)(Y9*1), Y10 VMOVDQU Y10, 128(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 20(CX)(Y9*1), Y10 VMOVDQU Y10, 160(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 24(CX)(Y9*1), Y10 VMOVDQU Y10, 192(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 28(CX)(Y9*1), Y10 VMOVDQU Y10, 224(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 32(CX)(Y9*1), Y10 VMOVDQU Y10, 256(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 36(CX)(Y9*1), Y10 VMOVDQU Y10, 288(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 40(CX)(Y9*1), Y10 VMOVDQU Y10, 320(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 44(CX)(Y9*1), Y10 VMOVDQU Y10, 352(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 48(CX)(Y9*1), Y10 VMOVDQU Y10, 384(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 52(CX)(Y9*1), Y10 VMOVDQU Y10, 416(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 56(CX)(Y9*1), Y10 VMOVDQU Y10, 448(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 60(CX)(Y9*1), Y10 VMOVDQU Y10, 480(SP) ADDQ $0x40, CX // Reload state vectors (other than CVs) VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VMOVDQU 512(SP), Y12 VMOVDQU 544(SP), Y13 VPBROADCASTD seq<>+4(SB), Y14 VPSLLD $0x06, Y14, Y14 VPBROADCASTD 576(SP)(DX*4), Y15 VMOVDQU Y8, 640(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, 
Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, 
Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 
VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 
3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 
VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR 
Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, 
Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, 
Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, 
Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD 
Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, 
Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, 
Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 640(SP), Y8 // Finalize CVs VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 // Loop INCQ DX CMPQ DX, $0x00000010 JNE loop // Finished; transpose CVs VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 32(AX) VMOVDQU Y10, 64(AX) VMOVDQU Y11, 96(AX) VMOVDQU Y12, 128(AX) VMOVDQU Y13, 160(AX) VMOVDQU Y14, 192(AX) VMOVDQU Y15, 224(AX) RET // func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32) // Requires: AVX, AVX2 TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-32 MOVQ parents+0(FP), AX MOVQ cvs+8(FP), CX MOVQ key+16(FP), DX // Load transposed block VMOVDQU seq<>+0(SB), Y9 VPSLLD $0x06, Y9, Y9 VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, (CX)(Y9*1), Y10 VMOVDQU Y10, (SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 4(CX)(Y9*1), Y10 VMOVDQU Y10, 32(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 8(CX)(Y9*1), Y10 VMOVDQU Y10, 64(SP) VPCMPEQD Y8, Y8, Y8 
VPGATHERDD Y8, 12(CX)(Y9*1), Y10 VMOVDQU Y10, 96(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 16(CX)(Y9*1), Y10 VMOVDQU Y10, 128(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 20(CX)(Y9*1), Y10 VMOVDQU Y10, 160(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 24(CX)(Y9*1), Y10 VMOVDQU Y10, 192(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 28(CX)(Y9*1), Y10 VMOVDQU Y10, 224(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 32(CX)(Y9*1), Y10 VMOVDQU Y10, 256(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 36(CX)(Y9*1), Y10 VMOVDQU Y10, 288(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 40(CX)(Y9*1), Y10 VMOVDQU Y10, 320(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 44(CX)(Y9*1), Y10 VMOVDQU Y10, 352(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 48(CX)(Y9*1), Y10 VMOVDQU Y10, 384(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 52(CX)(Y9*1), Y10 VMOVDQU Y10, 416(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 56(CX)(Y9*1), Y10 VMOVDQU Y10, 448(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 60(CX)(Y9*1), Y10 VMOVDQU Y10, 480(SP) // Initialize state vectors VPBROADCASTD (DX), Y0 VPBROADCASTD 4(DX), Y1 VPBROADCASTD 8(DX), Y2 VPBROADCASTD 12(DX), Y3 VPBROADCASTD 16(DX), Y4 VPBROADCASTD 20(DX), Y5 VPBROADCASTD 24(DX), Y6 VPBROADCASTD 28(DX), Y7 VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VPXOR Y12, Y12, Y12 VPXOR Y13, Y13, Y13 VPBROADCASTD seq<>+4(SB), Y14 VPSLLD $0x06, Y14, Y14 ORL $0x04, flags+24(FP) VPBROADCASTD flags+24(FP), Y15 VMOVDQU Y8, 512(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB 
shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 
512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 
VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 
VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, 
Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 
VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 
VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 
VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB 
shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, 
Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 
VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 
VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 512(SP), Y8 // Finalize CVs VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 32(AX) VMOVDQU Y10, 64(AX) VMOVDQU Y11, 96(AX) VMOVDQU Y12, 128(AX) VMOVDQU Y13, 160(AX) VMOVDQU Y14, 192(AX) VMOVDQU Y15, 224(AX) RET