;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2025 Two Orioles, LLC
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL

SECTION_RODATA 64

dup16_perm: db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
            db  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
            db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
            db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
itx_perm:   dq 0x0000000820150440, 0x0000000231372604
            dq 0x0000000ca8041551, 0x00000006b9263715
            dq 0x00000001ec9d8c62, 0x0000000bfdbfae26
            dq 0x00000005648c9d73, 0x0000000f75aebf37
deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
int_shuf1:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
int_shuf2:  db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
pw_512:     times 4 dw 512
pw_m512:    times 4 dw -512
pw_15137_6270x2x4:   times 4 dw 15137*2
                     times 4 dw 6270*2
pw_11585_m11585x2x4: times 4 dw 11585*2
pw_m11585_11585x2x4: times 4 dw -11585*2
pw_11585_11585x2:    times 4 dw 11585*2
int_mshift: db 142, 150, 0, 0, 174, 182, 0, 0
pd_8192:    dd 8192

pw_804x2:    times 2 dw 804*2
pw_1606x2:   times 2 dw 1606*2
pw_3196x2:   times 2 dw 3196*2
pw_3981x2:   times 2 dw 3981*2
pw_6270x2:   times 2 dw 6270*2
pw_7005x2:   times 2 dw 7005*2
pw_7723x2:   times 2 dw 7723*2
pw_9760x2:   times 2 dw 9760*2
pw_12140x2:  times 2 dw 12140*2
pw_12665x2:  times 2 dw 12665*2
pw_13160x2:  times 2 dw 13160*2
pw_13623x2:  times 2 dw 13623*2
pw_14053x2:  times 2 dw 14053*2
pw_14449x2:  times 2 dw 14449*2
pw_14811x2:  times 2 dw 14811*2
pw_15137x2:  times 2 dw 15137*2
pw_15426x2:  times 2 dw 15426*2
pw_15679x2:  times 2 dw 15679*2
pw_15893x2:  times 2 dw 15893*2
pw_16069x2:  times 2 dw 16069*2
pw_16207x2:  times 2 dw 16207*2
pw_16305x2:  times 2 dw 16305*2
pw_16364x2:  times 2 dw 16364*2
pw_m2404x2:  times 2 dw -2404*2
pw_m4756x2:  times 2 dw -4756*2
pw_m5520x2:  times 2 dw -5520*2
pw_m8423x2:  times 2 dw -8423*2
pw_m9102x2:  times 2 dw -9102*2
pw_m10394x2: times 2 dw -10394*2
pw_m11003x2: times 2 dw -11003*2

pw_804_16364x2:    dw 804*2, 16364*2
pw_1606_16305x2:   dw 1606*2, 16305*2
pw_3196_16069x2:   dw 3196*2, 16069*2
pw_3981_15893x2:   dw 3981*2, 15893*2
pw_7005_14811x2:   dw 7005*2, 14811*2
pw_7723_14449x2:   dw 7723*2, 14449*2
pw_9760_13160x2:   dw 9760*2, 13160*2
pw_m2404_16207x2:  dw -2404*2, 16207*2
pw_m4756_15679x2:  dw -4756*2, 15679*2
pw_m5520_15426x2:  dw -5520*2, 15426*2
pw_m8423_14053x2:  dw -8423*2, 14053*2
pw_m9102_13623x2:  dw -9102*2, 13623*2
pw_m10394_12665x2: dw -10394*2, 12665*2
pw_m11003_12140x2: dw -11003*2, 12140*2
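; Added note: the "*2" coefficients above are stored pre-doubled because
; pmulhrsw computes (a*b*2 + 32768) >> 16, so multiplying by coef*2 gives the
; rounded 14-bit-precision multiply used throughout VP9. The pw_*_*x2 pairs
; pack two such coefficients per dword for vpbroadcastd in the packed paths.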
%macro COEF_PAIR 2-3 0
%if %3 & 4
pw_%1_m%2:  dw  %1, -%2
%else
pw_%1_%2:   dw  %1,  %2
%if %3 & 2
pw_m%1_%2:  dw -%1,  %2
%else
pw_m%2_%1:  dw -%2,  %1
%endif
%endif
%if %3 & 1
pw_m%1_m%2: dw -%1, -%2
%endif
%endmacro

COEF_PAIR   804, 16364
COEF_PAIR  1606, 16305
COEF_PAIR  3196, 16069, 1
COEF_PAIR  3981, 15893
COEF_PAIR  6270, 15137, 1
COEF_PAIR  7005, 14811
COEF_PAIR  7723, 14449
COEF_PAIR  9102, 13623
COEF_PAIR  9760, 13160
COEF_PAIR 11585, 11585, 1
COEF_PAIR 12140, 11003
COEF_PAIR 12665, 10394
COEF_PAIR 13623,  9102, 1
COEF_PAIR 14053,  8423
COEF_PAIR 15137,  6270
COEF_PAIR 15426,  5520
COEF_PAIR 15679,  4756
COEF_PAIR 16069,  3196
COEF_PAIR 16207,  2404
; ADST16-only:
COEF_PAIR  2404,  9760, 2
COEF_PAIR  5520,  7005, 2
COEF_PAIR  8423,  3981, 2
COEF_PAIR 11003,   804, 2
COEF_PAIR 12140, 16364, 5
COEF_PAIR 14053, 15893, 5
COEF_PAIR 15426, 14811, 5
COEF_PAIR 16207, 13160, 5

pw_11585_m11585: dw 11585, -11585
pw_16069_m3196:  dw 16069, -3196
pw_9102_m13623:  dw 9102, -13623
pw_15137_m6270:  dw 15137, -6270
pw_6270_m15137:  dw 6270, -15137

%define pw_11585x2  pw_11585_11585x2
%define pw_m11585x2 pw_m11585_11585x2x4

SECTION .text

%define o_base pw_512 + 128
%define o(x) (r6 - (o_base) + (x))
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
; 16 = special_mul1, 32 = special_mul2, 64 = dst_in_tmp1
%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
    mova m%2, m%4
%if %7 & 16
    vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
    mova m%3, m%4
%if %7 & 32
    vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
%else
    vpdpwssd m%3, m%1, m%6
%endif
%elif %7 & 32
    vpdpwssd m%2, m%1, m%5
    mova m%3, m%4
    vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
%elif %6 < 32
    vpdpwssd m%2, m%1, m%5
    mova m%3, m%4
    vpdpwssd m%3, m%1, m%6
%elif %7 & 1
    vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
    mova m%3, m%4
    vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
%else
    vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
    mova m%3, m%4
    vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
%endif
%if %7 & 2
    psrld m%2, 14
    pslld m%3, 2
    vpshrdd m%1, m%3, m%2, 16
%elif %7 & 4
    ; compared to using shifts (as above) this has better throughput,
    ; but worse latency and requires setting up the opmask/index
    ; registers, so only use this method for the larger transforms
%if %7 & 64
    pslld m%2, 2
    vpmultishiftqb m%2{k7}, m13, m%3
%else
    pslld m%1, m%2, 2
    vpmultishiftqb m%1{k7}, m13, m%3
%endif
%else
    psrad m%2, 14
    psrad m%3, 14
%if %7 & 8 == 0
    packssdw m%1, m%3, m%2
%endif
%endif
%endmacro

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
    punpcklwd m%3, m%2, m%1
    punpckhwd m%2, m%1
%if %7 < 32
    mova m%1, m%5
    vpdpwssd m%1, m%3, m%7
    mova m%4, m%5
    vpdpwssd m%4, m%2, m%7
%else
    mova m%1, m%5
    vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
    mova m%4, m%5
    vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
%endif
    psrad m%1, 14
    psrad m%4, 14
    packssdw m%1, m%4
    mova m%4, m%5
%if %7 < 32
    vpdpwssd m%4, m%2, m%6
    mova m%2, m%5
    vpdpwssd m%2, m%3, m%6
%else
    vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
    mova m%2, m%5
    vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
%endif
    psrad m%4, 14
    psrad m%2, 14
    packssdw m%2, m%4
%endmacro
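; Added note on the shared rounding idiom: the multiply helpers accumulate
; into a copy of pd_8192 (1 << 13) with vpdpwssd and shift the 32-bit sums
; right by 14. The vpmultishiftqb variant replaces psrad+packssdw: k7
; (derived from int_mshift) selects the low word of each dword, into which
; the multishift extracts bits 14-29 of the sum, while the high word keeps
; the other operand, pre-shifted left by 2 so that its own bits 14-29 already
; sit in the upper half. The two >>14 results thus end up packed in a single
; dword, without the saturation that packssdw would apply.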
; flags: 1 = swap, 2 = invert2, 4 = invert1
%macro ADST_MULSUB_4W 10-11 0 ; dst1/src1, src2, dst2, tmp[1-2], rnd, coef[1-4], flags
    mova m%3, m%6
%if %11 & 1
    vpdpwssd m%3, m%1, [o(pw_m%8_%7)] {bcstd}
%else
    vpdpwssd m%3, m%1, [o(pw_%7_%8)] {bcstd}
%endif
%if %11 & 4
    vpbroadcastd m%4, [o(pw_m%9_%10)]
%elif %11 & 2
    vpbroadcastd m%4, [o(pw_%9_m%10)]
%elif %11 & 1
    vpbroadcastd m%4, [o(pw_%10_%9)]
%else
    vpbroadcastd m%4, [o(pw_%9_%10)]
%endif
    pmaddwd m%4, m%2
    mova m%5, m%6
%if %11 & 4
    vpdpwssd m%5, m%1, [o(pw_%8_m%7)] {bcstd}
%elif %11 & 1
    vpdpwssd m%5, m%1, [o(pw_%7_%8)] {bcstd}
%else
    vpdpwssd m%5, m%1, [o(pw_m%8_%7)] {bcstd}
%endif
%if %11 & 2
    vpbroadcastd m%1, [o(pw_%10_%9)]
%elif %11 & 1
    vpbroadcastd m%1, [o(pw_%9_m%10)]
%else
    vpbroadcastd m%1, [o(pw_m%10_%9)]
%endif
    pmaddwd m%2, m%1
    paddd m%1, m%3, m%4
    psubd m%3, m%4
    paddd m%4, m%5, m%2
    psubd m%5, m%2
    pslld m%1, 2
    pslld m%3, 2
    vpmultishiftqb m%1{k7}, m13, m%4
    vpmultishiftqb m%3{k7}, m13, m%5
%endmacro

%macro WRAP_YMM 1+
    INIT_YMM cpuname
    %1
    INIT_ZMM cpuname
%endmacro

%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
cglobal vp9_i%1_i%2_%3_add, 4, 5, 0, dst, stride, c, eob, tx2
    %undef cmp
    %define %%p1 m(vp9_i%1_%3_internal)
    lea r6, [o_base]
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea tx2q, [m(vp9_i%2_%3_internal).pass2]
%ifidn %1_%2, dct_dct
    cmp eobd, 1
    jne %%p1
%else
%if %4
    add eobd, %4
%endif
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro

%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN %1, %2, 16x16, %3
%ifidn %1_%2, dct_dct
    movd xmm0, [o(pw_11585x2)]
    pmulhrsw xmm3, xmm0, [cq]
    pxor ym2, ym2
    pmulhrsw xmm3, xmm0
    pmulhrsw xmm3, [o(pw_512)]
    mova [cq], xm2
    add r3d, 7
    vpbroadcastw ym3, xmm3
.dconly_loop:
    mova xm1, [dstq+strideq*0]
    vinserti32x4 ym1, [dstq+strideq*1], 1
    punpcklbw ym0, ym1, ym2
    punpckhbw ym1, ym2
    paddw ym0, ym3
    paddw ym1, ym3
    packuswb ym0, ym1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    lea dstq, [dstq+strideq*2]
    dec r3d
    jg .dconly_loop
    RET
%endif
%endmacro

%macro IDCT16_MAIN 0-1 0 ; idct32
%if mmsize == 64 && %1 == 0
.main_fast:
%endif
    vpbroadcastd m2, [o(pw_1606_16305x2)]
    vpbroadcastd m4, [o(pw_m10394_12665x2)]
    vpbroadcastd m11, [o(pw_7723_14449x2)]
    vpbroadcastd m12, [o(pw_m4756_15679x2)]
    pmulhrsw m8, m2              ; t8a  t15a
    vpbroadcastd m2, [o(pw_3196_16069x2)]
    pmulhrsw m0, m4              ; t9a  t14a
    vpbroadcastd m4, [o(pw_m9102_13623x2)]
    pmulhrsw m5, m11             ; t10a t13a
    vpbroadcastd m11, [o(pw_11585_11585x2)]
    pmulhrsw m1, m12             ; t11a t12a
    vbroadcasti32x4 m12, [o(pw_15137_6270x2x4)]
    pmulhrsw m7, m2              ; t4a  t7a
    pmulhrsw m3, m4              ; t5a  t6a
    pmulhrsw m9, m11             ; t0   t1
    pmulhrsw m6, m12             ; t3   t2
%if mmsize == 64 && %1 == 0
    jmp %%main2
ALIGN function_align
.main:
    punpckhwd m8, m7, m0         ; dct16 in15 in1
    punpcklwd m9, m4, m0         ; dct4  in2  in0
    punpckhwd m0, m3, m4         ; dct16 in7  in9
    punpcklwd m7, m1             ; dct8  in7  in1
    punpckhwd m1, m6             ; dct16 in3  in13
    punpcklwd m3, m5             ; dct8  in3  in5
    punpckhwd m5, m2             ; dct16 in11 in5
    punpcklwd m6, m2             ; dct4  in3  in1
    ITX_MUL2X_PACK 8, 2, 4, 10, 1606, 16305, 5    ; t8a  t15a
    ITX_MUL2X_PACK 0, 2, 4, 10, 12665, 10394, 5   ; t9a  t14a
    ITX_MUL2X_PACK 5, 2, 4, 10, 7723, 14449, 5    ; t10a t13a
    ITX_MUL2X_PACK 1, 2, 4, 10, 15679, 4756, 5    ; t11a t12a
    ITX_MUL2X_PACK 7, 2, 4, 10, 3196, 16069, 5    ; t4a  t7a
    ITX_MUL2X_PACK 3, 2, 4, 10, 13623, 9102, 5    ; t5a  t6a
    ITX_MUL2X_PACK 9, 2, 4, 10, 11585, 11585      ; t0   t1
    ITX_MUL2X_PACK 6, 2, 4, 10, 6270, 15137       ; t3   t2
%%main2:
%endif
    psubw m2, m8, m0             ; t9  t14
    paddw m8, m0                 ; t8  t15
    psubw m4, m1, m5             ; t10 t13
    paddw m1, m5                 ; t11 t12
    ITX_MUL2X_PACK 2, 0, 5, 10, 6270, 15137, (1|%1*4)   ; t9a  t14a
    ITX_MUL2X_PACK 4, 0, 5, 10, m15137, 6270, (1|%1*4)  ; t10a t13a
    vbroadcasti32x4 m5, [o(deint_shuf)]
    psubw m0, m8, m1             ; t11a t12a
    paddw m8, m1                 ; t8a  t15a
    psubw m1, m7, m3             ; t5a  t6a
    paddw m7, m3                 ; t4   t7
    pshufb m8, m5
    pshufb m7, m5
    paddw m3, m2, m4             ; t9  t14
    psubw m2, m4                 ; t10 t13
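    ; Added note: %1 != 0 when this macro is inlined into the packed fast
    ; path of the 32x32 transform; that variant appears to keep the 11585
    ; rotations as full dot products, whereas the plain 16x16 path below uses
    ; the cheaper pmulhrsw sum/difference form.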
%if %1
    vpbroadcastd m12, [o(pw_11585_11585)]
    vpbroadcastd m11, [o(pw_m11585_11585)]
    pshufb m3, m5
    ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11       ; t5   t6
    ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8    ; t11  t12
    ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8   ; t10a t13a
    packssdw m5, m11             ; t12  t13a
    packssdw m4, m0              ; t11  t10a
%else
    pshufb m0, m5
    ITX_MUL2X_PACK 1, 4, 5, 10, 11585_11585, m11585_11585, 48 ; t5 t6
    vpbroadcastd m11, [o(pw_11585x2)]
    punpckhqdq m5, m0, m2        ; t12a t13
    punpcklqdq m0, m2            ; t11a t10
    psubw m4, m5, m0
    paddw m5, m0
    pmulhrsw m4, m11             ; t11  t10a
    pmulhrsw m5, m11             ; t12  t13a
%endif
    punpckhqdq m2, m7, m1        ; t7 t6
    punpcklqdq m7, m1            ; t4 t5
    psubw m1, m9, m6             ; t3 t2
    paddw m9, m6                 ; t0 t1
    punpckhqdq m0, m8, m3        ; t15a t14
    punpcklqdq m8, m3            ; t8a  t9
    psubw m3, m9, m2             ; t7 t6
    paddw m9, m2                 ; t0 t1
    psubw m2, m1, m7             ; t4 t5
    paddw m1, m7                 ; t3 t2
    psubw m7, m9, m0             ; out15 out14
    paddw m0, m9                 ; out0  out1
    psubw m6, m1, m5             ; out12 out13
    paddw m1, m5                 ; out3  out2
    psubw m5, m2, m4             ; out11 out10
    paddw m2, m4                 ; out4  out5
    psubw m4, m3, m8             ; out8  out9
    paddw m3, m8                 ; out7  out6
%endmacro

INIT_ZMM avx512icl
INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst, 39-23

cglobal vp9_idct_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2
    mova m15, [o(itx_perm)]
    vpbroadcastd m10, [o(pd_8192)]
    vpbroadcastq m13, [o(int_mshift)]
    vpcmpub k7, m13, m10, 6
    sub eobd, 39
    jl .pass1_fast
    vpermq m0, m15, [cq+64*0]
    vpermq m1, m15, [cq+64*1]
    vpermq m2, m15, [cq+64*2]
    vpermq m3, m15, [cq+64*3]
    vpermq m4, m15, [cq+64*4]
    vpermq m5, m15, [cq+64*5]
    vpermq m6, m15, [cq+64*6]
    vpermq m7, m15, [cq+64*7]
    call .main
    vbroadcasti32x4 m12, [o(int_shuf1)]
    vbroadcasti32x4 m11, [o(int_shuf2)]
    pshufb m0, m12
    pshufb m8, m1, m11
    pshufb m2, m12
    pshufb m9, m3, m11
    pshufb m4, m12
    pshufb m14, m5, m11
    pshufb m6, m12
    pshufb m11, m7, m11
    punpckhdq m1, m0, m8
    punpckldq m0, m8
    punpckhdq m3, m2, m9
    punpckldq m2, m9
    punpckhdq m5, m4, m14
    punpckldq m4, m14
    punpckhdq m7, m6, m11
    punpckldq m6, m11
.pass1_end:
    vshufi32x4 m8, m4, m6, q3232
    vinserti32x8 m4, ym6, 1
    vshufi32x4 m6, m0, m2, q3232
    vinserti32x8 m0, ym2, 1
    vshufi32x4 m9, m5, m7, q3232
    vinserti32x8 m5, ym7, 1
    vshufi32x4 m7, m1, m3, q3232
    vinserti32x8 m1, ym3, 1
    vshufi32x4 m2, m0, m4, q3131 ; 4 5
    vshufi32x4 m0, m4, q2020     ; 0 1
    vshufi32x4 m4, m6, m8, q2020 ; 8 9
    vshufi32x4 m6, m8, q3131     ; 12 13
    vshufi32x4 m3, m1, m5, q3131 ; 6 7
    vshufi32x4 m1, m5, q2020     ; 2 3
    vshufi32x4 m5, m7, m9, q2020 ; 10 11
    vshufi32x4 m7, m9, q3131     ; 14 15
    jmp tx2q
.pass1_fast:
    mova ym3, [o(dup16_perm)]
    vbroadcasti32x4 ym9, [cq+32*0]
    vbroadcasti32x4 ym6, [cq+32*4]
    vpermb ym8, ym3, [cq+32*1]
    vpermb ym0, ym3, [cq+32*7]
    vpermb ym5, ym3, [cq+32*5]
    vpermb ym1, ym3, [cq+32*3]
    vpermb ym7, ym3, [cq+32*2]
    vpermb ym3, ym3, [cq+32*6]
    shufpd ym9, ym9, 0x0c
    shufpd ym6, ym6, 0x0c
    WRAP_YMM IDCT16_MAIN
    vbroadcasti32x4 m8, [o(int_shuf1)]
    vbroadcasti32x4 m9, [o(int_shuf2)]
    vinserti32x8 m0, ym2, 1      ; 0  1  |  4  5
    vinserti32x8 m4, ym6, 1      ; 8  9  | 12 13
    vinserti32x8 m1, ym3, 1      ; 3  2  |  7  6
    vinserti32x8 m5, ym7, 1      ; 11 10 | 15 14
    vshufi32x4 m2, m0, m4, q3131
    vshufi32x4 m0, m4, q2020
    vshufi32x4 m4, m1, m5, q2020
    vshufi32x4 m1, m5, q3131
    pshufb m2, m8
    pshufb m0, m8
    pshufb m4, m9
    pshufb m1, m9
    punpckhdq m3, m2, m1         ; 6-7
    punpckldq m2, m1             ; 4-5
    punpckhdq m1, m0, m4         ; 2-3
    punpckldq m0, m4             ; 0-1
    jmp tx2q
.pass2:
    test eobd, eobd
    jl .pass2_fast
    call .main
    jmp .pass2_end
.pass2_fast:
    punpcklqdq m9, m0, m0
    punpckhwd m8, m0, m0
    punpcklwd m7, m1, m1
    punpckhwd m1, m1
    punpcklqdq m6, m2, m2
    punpckhwd m5, m2, m2
    punpckhwd m0, m3, m3
    punpcklwd m3, m3
    call .main_fast
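    ; Added note: the final stage of pass 2 permutes the rows back into
    ; raster order, scales with pw_512 (pmulhrsw against 512 is
    ; (x + 32) >> 6, i.e. VP9's final rounding) and adds the result to the
    ; destination pixels.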
.pass2_end:
    psrldq m8, m15, 1
    psrlq m12, m15, 12
    psrldq m9, m15, 2
    psrlq m13, m15, 20
    mova m10, m8
    vpermi2q m8, m0, m2          ; 0 1 4 5
    vpermt2q m0, m12, m2
    mova m11, m9
    vpermi2q m9, m1, m3          ; 2 3 6 7
    vpermt2q m1, m13, m3
    vpbroadcastd m2, [o(pw_512)]
    vpermi2q m10, m4, m6         ; 8 9 12 13
    vpermt2q m4, m12, m6
    vpermi2q m11, m5, m7         ; 10 11 14 15
    vpermt2q m5, m13, m7
    REPX {pmulhrsw x, m2}, m0, m1, m4, m5, m8, m9, m10, m11
.pass2_end2:
    lea r3, [strideq*3]
    lea r4, [dstq+strideq*4]
    lea r5, [dstq+strideq*8]
    lea r6, [r4+strideq*8]
    mova xm3, [dstq+strideq*0]
    mova xm6, [dstq+strideq*2]
    vinserti32x4 ym3, [dstq+strideq*1], 1
    vinserti32x4 ym6, [dstq+r3], 1
    vinserti32x4 m3, [r4+strideq*0], 2
    vinserti32x4 m6, [r4+strideq*2], 2
    vinserti32x4 m3, [r4+strideq*1], 3
    vinserti32x4 m6, [r4+r3], 3
    mova xm12, [r5+strideq*0]
    mova xm13, [r5+strideq*2]
    vinserti32x4 ym12, [r5+strideq*1], 1
    vinserti32x4 ym13, [r5+r3], 1
    vinserti32x4 m12, [r6+strideq*0], 2
    vinserti32x4 m13, [r6+strideq*2], 2
    vinserti32x4 m12, [r6+strideq*1], 3
    vinserti32x4 m13, [r6+r3], 3
    pxor m7, m7
    REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    punpcklbw m2, m3, m7
    punpckhbw m3, m7
    paddw m0, m2
    paddw m8, m3
    packuswb m0, m8
    punpcklbw m2, m6, m7
    punpckhbw m6, m7
    paddw m1, m2
    paddw m9, m6
    packuswb m1, m9
    punpcklbw m2, m12, m7
    punpckhbw m12, m7
    paddw m2, m4
    paddw m10, m12
    packuswb m2, m10
    punpcklbw m3, m13, m7
    punpckhbw m13, m7
    paddw m3, m5
    paddw m11, m13
    packuswb m3, m11
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    mova [dstq+strideq*2], xm1
    vextracti32x4 [dstq+r3], ym1, 1
    vextracti32x4 [r4+strideq*0], m0, 2
    vextracti32x4 [r4+strideq*1], m0, 3
    vextracti32x4 [r4+strideq*2], m1, 2
    vextracti32x4 [r4+r3], m1, 3
    mova [r5+strideq*0], xm2
    vextracti32x4 [r5+strideq*1], ym2, 1
    mova [r5+strideq*2], xm3
    vextracti32x4 [r5+r3], ym3, 1
    vextracti32x4 [r6+strideq*0], m2, 2
    vextracti32x4 [r6+strideq*1], m2, 3
    vextracti32x4 [r6+strideq*2], m3, 2
    vextracti32x4 [r6+r3], m3, 3
    RET
ALIGN function_align
    IDCT16_MAIN
    ret

%macro IADST16_MAIN 0
%if mmsize == 64
.main_fast:
%endif
    punpcklwd m4, m3, m0         ; in7 in0
    punpcklwd m11, m1, m2        ; in3 in4
    punpckhwd m9, m2, m1         ; in5 in2
    punpckhwd m7, m0, m3         ; in1 in6
    ITX_MUL2X_PACK 4, 0, 6, 10, 11003_804, 12140_m16364, 116     ; t1a  t0a
    ITX_MUL2X_PACK 4, 5, 6, 10, m11003_804, m12140_m16364, 52    ; t9a  t8a
    ITX_MUL2X_PACK 11, 2, 6, 10, 5520_7005, 15426_m14811, 116    ; t5a  t4a
    ITX_MUL2X_PACK 11, 5, 6, 10, m5520_7005, m15426_m14811, 52   ; t13a t12a
    ITX_MUL2X_PACK 9, 1, 6, 10, 8423_3981, 14053_m15893, 116     ; t3a  t2a
    ITX_MUL2X_PACK 9, 5, 6, 10, m8423_3981, m14053_m15893, 52    ; t11a t10a
    ITX_MUL2X_PACK 7, 3, 6, 10, 2404_9760, 16207_m13160, 116     ; t7a  t6a
    ITX_MUL2X_PACK 7, 5, 6, 10, m2404_9760, m16207_m13160, 52    ; t15a t14a
%if mmsize == 64
    ; for the ymm variant we only ever use the fast path
    jmp %%main2
ALIGN function_align
.main:
    punpckhwd m8, m7, m0         ; in14 in1
    punpcklwd m0, m7             ; in0  in15
    punpcklwd m7, m6, m1         ; in12 in3
    punpckhwd m1, m6             ; in2  in13
    punpckhwd m6, m5, m2         ; in10 in5
    punpcklwd m2, m5             ; in4  in11
    punpcklwd m5, m4, m3         ; in8  in7
    punpckhwd m3, m4             ; in6  in9
    ADST_MULSUB_4W 0, 5, 4, 9, 11, 10, 804, 16364, 12140, 11003   ; t1a t0a, t9a  t8a
    ADST_MULSUB_4W 2, 7, 11, 5, 9, 10, 7005, 14811, 15426, 5520   ; t5a t4a, t13a t12a
    ADST_MULSUB_4W 1, 6, 9, 5, 7, 10, 3981, 15893, 14053, 8423    ; t3a t2a, t11a t10a
    ADST_MULSUB_4W 3, 8, 7, 5, 6, 10, 9760, 13160, 16207, 2404    ; t7a t6a, t15a t14a
%%main2:
%endif
    psubw m5, m1, m3             ; t7 t6
    paddw m6, m1, m3             ; t3 t2
    psubw m1, m0, m2             ; t5 t4
    paddw m2, m0                 ; t1 t0
    ADST_MULSUB_4W 4, 11, 8, 3, 0, 10, 3196, 16069, 16069, 3196, 1 ; t8a t9a, t12a t13a
    ADST_MULSUB_4W 9, 7, 0, 3, 11, 10, 13623, 9102, 9102, 13623, 1 ; t10a t11a, t14a t15a
    ADST_MULSUB_4W 1, 5, 11, 3, 7, 10, 6270, 15137, 15137, 6270, 2  ; out12 -out3, t7 t6
    psubw m3, m2, m6             ; t3a t2a
    paddw m2, m6                 ; -out15 out0
    ADST_MULSUB_4W 8, 0, 5, 6, 7, 10, 15137, 6270, 6270, 15137, 6   ; -out13 out2, t15a t14
    vbroadcasti32x4 m12, [o(deint_shuf)]
    paddw m0, m4, m9             ; -out1 out14
    psubw m4, m9                 ; t10 t11
    pshufb m2, m12
    pshufb m1, m12
    pshufb m8, m12
    pshufb m0, m12
    punpcklqdq m6, m1, m8        ; out12 -out13
    shufps m7, m0, m2, q1032     ; out14 -out15
%endmacro

%macro IADST16_PASS1_END 0
    shufps m0, m2, m0, q1032     ; out0 -out1
    punpckhqdq m1, m8, m1        ; out2 -out3
    mova m2, m10
    vpdpwssd m2, m5, [o(pw_m11585_m11585)] {bcstd}  ; out5
    mova m8, m10
    vpdpwssd m8, m11, [o(pw_11585_11585)] {bcstd}   ; out4
    mova m9, m10
    vpdpwssd m9, m5, [o(pw_m11585_11585)] {bcstd}   ; out10
    mova m5, m10
    vpdpwssd m5, m11, [o(pw_11585_m11585)] {bcstd}  ; out11
    mova m11, m10
    vpdpwssd m11, m3, [o(pw_m11585_m11585)] {bcstd} ; out7
    mova m14, m10
    vpdpwssd m14, m4, [o(pw_11585_11585)] {bcstd}   ; out6
    mova m12, m10
    vpdpwssd m12, m3, [o(pw_m11585_11585)] {bcstd}  ; out8
    mova m3, m10
    vpdpwssd m3, m4, [o(pw_m11585_11585)] {bcstd}   ; out9
%endmacro

INV_TXFM_16X16_FN adst, dct, 39-18
INV_TXFM_16X16_FN adst, adst

cglobal vp9_iadst_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2
    mova m15, [o(itx_perm)]
    psrlq m7, m15, 4
    vpermq m0, m15, [cq+64*0]    ; 0  1
    vpermq m1, m7, [cq+64*1]     ; 3  2
    vpermq m2, m15, [cq+64*2]    ; 4  5
    vpermq m3, m7, [cq+64*3]     ; 7  6
    vpbroadcastd m10, [o(pd_8192)]
    vpbroadcastq m13, [o(int_mshift)]
    vpcmpub k7, m13, m10, 6
    sub eobd, 39
    jl .pass1_fast
    vpermq m4, m15, [cq+64*4]    ; 8  9
    vpermq m5, m7, [cq+64*5]     ; 11 10
    vpermq m6, m15, [cq+64*6]    ; 12 13
    vpermq m7, m7, [cq+64*7]     ; 15 14
    call .main
    IADST16_PASS1_END
    REPX {psrad x, 14}, m2, m8, m9, m5, m11, m14, m12, m3
    packssdw m2, m8, m2          ; out4  out5
    packssdw m5, m9, m5          ; out10 out11
    packssdw m4, m12, m3         ; out8  out9
    packssdw m3, m14, m11        ; out6  out7
    pxor m9, m9
    punpckhwd m8, m0, m1
    punpcklwd m0, m1
    psubw m8, m9, m8
    punpckhwd m1, m0, m8
    punpcklwd m0, m8
    punpckhwd m8, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m2, m8
    punpcklwd m2, m8
    punpckhwd m8, m4, m5
    punpcklwd m4, m5
    punpckhwd m5, m4, m8
    punpcklwd m4, m8
    punpckhwd m8, m6, m7
    punpcklwd m6, m7
    psubw m8, m9, m8
    punpckhwd m7, m6, m8
    punpcklwd m6, m8
    jmp m(vp9_idct_16x16_internal).pass1_end
.pass1_fast:
    WRAP_YMM IADST16_MAIN
    WRAP_YMM IADST16_PASS1_END
    vinserti32x8 m0, ym6, 1
    vinserti32x8 m1, ym7, 1
    vinserti32x8 m8, ym12, 1
    vinserti32x8 m2, ym3, 1
    vinserti32x8 m14, ym9, 1
    vinserti32x8 m11, ym5, 1
    pslld m14, 2
    pslld m11, 2
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    vpmultishiftqb m14{k7}, m13, m8
    vpmultishiftqb m11{k7}, m13, m2
    psrlq m1, m15, 24
    pxor m2, m2
    psubw m2, m4
    punpckhwd m3, m0, m2
    punpcklwd m0, m2
    psrlq m2, m15, 28
    punpckhwd m4, m14, m11
    punpcklwd m14, m11
    mova m5, m2
    vpermi2q m2, m0, m14
    vpermt2q m0, m1, m14
    vpermi2q m1, m3, m4
    vpermt2q m3, m5, m4
    jmp tx2q
.pass2:
    pshufd m1, m1, q1032
    pshufd m3, m3, q1032
    test eobd, eobd
    jl .pass2_fast
    pshufd m5, m5, q1032
    pshufd m7, m7, q1032
    call .main
    jmp .pass2_end
.pass2_fast:
    call .main_fast
.pass2_end:
    vbroadcasti32x4 m9, [o(pw_11585_m11585x2x4)]
    vbroadcasti32x4 m10, [o(pw_m11585_11585x2x4)]
    punpckhqdq m1, m8            ; -out3 out2
    shufps m0, m2, q3210         ; -out1 out0
    pshufb m2, m11, m12
    pshufb m5, m12
    pshufb m3, m12
    pshufb m4, m12
    vbroadcasti32x4 m11, [o(pw_512)]
    vpbroadcastd m12, [o(pw_512)]
    punpcklqdq m8, m5, m2        ; t15a t7
    punpckhqdq m5, m2            ; t14a t6
    shufps m2, m3, m4, q1032     ; t2a  t10
    shufps m3, m4, q3210         ; t3a  t11
    psubsw m4, m2, m3
    paddsw m3, m2
    paddsw m2, m5, m8
    psubsw m5, m8
    pmulhrsw m4, m9              ; out8  out9
    pmulhrsw m3, m10             ; out7  out6
    pmulhrsw m2, m10             ; out5  out4
    pmulhrsw m5, m9              ; out10 out11
    pmulhrsw m6, m11
    pmulhrsw m7, m11
    pshufd m11, m11, q1032
    pmulhrsw m0, m11
    pmulhrsw m1, m11
    REPX {pmulhrsw x, m12}, m2, m3, m4, m5
    psrldq m8, m15, 2
    psrlq m12, m15, 20
    psrldq m10, m15, 1
    psrlq m13, m15, 12
    mova m9, m8
    vpermi2q m8, m0, m2          ; 0 1 4 5
    vpermt2q m0, m12, m2
    vpermi2q m9, m1, m3          ; 2 3 6 7
    vpermt2q m1, m12, m3
    mova m11, m10
    vpermi2q m10, m4, m6         ; 8 9 12 13
    vpermt2q m4, m13, m6
    vpermi2q m11, m5, m7         ; 10 11 14 15
    vpermt2q m5, m13, m7
    jmp m(vp9_idct_16x16_internal).pass2_end2
ALIGN function_align
    IADST16_MAIN
    ret

%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
    pmovzxbw m10, [dstq+%3]
    pmovzxbw m11, [r3+%4]
%if %2 < 8
    paddw m8, m%2, m%1
    psubw m9, m%2, m%1
%else
    mova m9, [rsp+64*(%2-8)]
    paddw m8, m9, m%1
    psubw m9, m%1
%endif
    pmulhrsw m8, m12
    pmulhrsw m9, m12
    paddw m8, m10
    paddw m9, m11
    packuswb m8, m9
    vpermq m8, m13, m8
    mova [dstq+%3], ym8
    vextracti32x8 [r3+%4], m8, 1
%if %2 == 3 || %2 == 7 || %2 == 11
    add dstq, r5
    sub r3, r5
%endif
%endmacro

cglobal vp9_idct_idct_32x32_add, 4, 7, 0, dst, stride, c, eob
    %undef cmp
    lea r6, [o_base]
    cmp eobd, 1
    jne .pass1
    movd xmm0, [o(pw_11585x2)]
    pmulhrsw xmm3, xmm0, [cq]
    pxor m2, m2
    pmulhrsw xmm3, xmm0
    pmulhrsw xmm3, [o(pw_512)]
    movd [cq], xm2
    add r3d, 15
    vpbroadcastw m3, xmm3
.dconly_loop:
    mova ym1, [dstq+strideq*0]
    vinserti32x8 m1, [dstq+strideq*1], 1
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    paddw m0, m3
    paddw m1, m3
    packuswb m0, m1
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    dec r3d
    jg .dconly_loop
    RET
.pass1:
    PROLOGUE 0, 7, 30, 64*16, dst, stride, c, eob
    sub eobd, 135
    jl .fast
    mova m0, [cq+64* 0]
    mova m14, [cq+64* 2]
    mova m1, [cq+64* 4]
    mova m15, [cq+64* 6]
    mova m2, [cq+64* 8]
    mova m16, [cq+64*10]
    mova m3, [cq+64*12]
    mova m17, [cq+64*14]
    mova m4, [cq+64*16]
    mova m18, [cq+64*18]
    mova m5, [cq+64*20]
    mova m19, [cq+64*22]
    mova m6, [cq+64*24]
    mova m20, [cq+64*26]
    mova m7, [cq+64*28]
    mova m21, [cq+64*30]
    call .idct16
    mova [rsp+64*0], m14
    mova [rsp+64*1], m15
    mova [rsp+64*2], m16
    mova [rsp+64*3], m17
    mova [rsp+64*4], m18
    mova [rsp+64*5], m19
    mova [rsp+64*6], m20
    mova [rsp+64*7], m21
    mova m22, [cq+64* 1]
    mova m23, [cq+64* 3]
    mova m24, [cq+64* 5]
    mova m25, [cq+64* 7]
    mova m26, [cq+64* 9]
    mova m27, [cq+64*11]
    mova m28, [cq+64*13]
    mova m29, [cq+64*15]
    mova m14, [cq+64*17]
    mova m15, [cq+64*19]
    mova m16, [cq+64*21]
    mova m17, [cq+64*23]
    mova m18, [cq+64*25]
    mova m19, [cq+64*27]
    mova m20, [cq+64*29]
    mova m21, [cq+64*31]
    call .main
    psubw m13, m0, m29           ; 31
    paddw m0, m29                ;  0
    psubw m29, m1, m28           ; 30
    paddw m1, m28                ;  1
    psubw m28, m2, m27           ; 29
    paddw m2, m27                ;  2
    psubw m27, m3, m26           ; 28
    paddw m3, m26                ;  3
    psubw m26, m4, m25           ; 27
    paddw m4, m25                ;  4
    psubw m25, m5, m24           ; 26
    paddw m5, m24                ;  5
    psubw m24, m6, m23           ; 25
    paddw m6, m23                ;  6
    psubw m23, m7, m22           ; 24
    paddw m7, m22                ;  7
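    ; Added note: transpose the 32 rows produced so far. Words, dwords and
    ; qwords are interleaved within 128-bit lanes here; the cross-lane half
    ; of the transpose is done with vinserti32x8/vshufi32x4 after the
    ; coefficient buffer has been cleared.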
    punpckhwd m8, m0, m1         ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd m0, m1             ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd m1, m2, m3         ; c4 d4 c5 d5 c6 d6 c7 d7
    punpcklwd m2, m3             ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd m22, m4, m5        ; e4 f4 e5 f5 e6 f6 e7 f7
    punpcklwd m4, m5             ; e0 f0 e1 f1 e2 f2 e3 f3
    punpckhwd m5, m6, m7         ; g4 h4 g5 h5 g6 h6 g7 h7
    punpcklwd m6, m7             ; g0 h0 g1 h1 g2 h2 g3 h3
    punpckhwd m3, m23, m24
    punpcklwd m23, m24
    punpckhwd m24, m25, m26
    punpcklwd m25, m26
    punpckhwd m26, m27, m28
    punpcklwd m27, m28
    punpckhwd m28, m29, m13
    punpcklwd m29, m13
    punpckhdq m7, m0, m2         ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq m0, m2             ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq m2, m4, m6         ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq m4, m6             ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq m6, m8, m1         ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq m8, m1             ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq m1, m22, m5        ; e6 f6 g6 h6 e7 f7 g7 h7
    punpckldq m22, m5            ; e4 f4 g4 h4 e5 f5 g5 h5
    punpckhdq m13, m23, m25
    punpckldq m23, m25
    punpckhdq m25, m27, m29
    punpckldq m27, m29
    punpckhdq m9, m3, m24
    punpckldq m3, m24
    punpckhdq m24, m26, m28
    punpckldq m26, m28
    punpcklqdq m5, m23, m27      ; d00 d08 d16 d24
    punpckhqdq m23, m27          ; d01 d09 d17 d25
    punpckhqdq m27, m13, m25     ; d03 d11 d19 d27
    punpcklqdq m13, m25          ; d02 d10 d18 d26
    punpckhqdq m25, m3, m26      ; d05 d13 d21 d29
    punpcklqdq m3, m26           ; d04 d12 d20 d28
    punpckhqdq m26, m9, m24      ; d07 d15 d23 d31
    punpcklqdq m9, m24           ; d06 d14 d22 d30
    mova [rsp+64*12], m23
    mova [rsp+64*13], m27
    mova [rsp+64*14], m25
    mova [rsp+64*15], m26
    punpckhqdq m24, m8, m22      ; a05 a13 a21 a29
    punpcklqdq m8, m22           ; a04 a12 a20 a28
    punpckhqdq m22, m0, m4       ; a01 a09 a17 a25
    punpcklqdq m0, m4            ; a00 a08 a16 a24
    punpckhqdq m23, m7, m2       ; a03 a11 a19 a27
    punpcklqdq m7, m2            ; a02 a10 a18 a26
    punpckhqdq m25, m6, m1       ; a07 a15 a23 a31
    punpcklqdq m6, m1            ; a06 a14 a22 a30
    mova m2, [rsp+64*0]
    mova m11, [rsp+64*1]
    mova m12, [rsp+64*2]
    mova m29, [rsp+64*3]
    mova m27, [rsp+64*4]
    mova m26, [rsp+64*5]
    mova m4, [rsp+64*6]
    mova m28, [rsp+64*7]
    psubw m1, m2, m21            ; 23
    paddw m2, m21                ;  8
    psubw m21, m11, m20          ; 22
    paddw m11, m20               ;  9
    psubw m20, m12, m19          ; 21
    paddw m12, m19               ; 10
    psubw m19, m29, m18          ; 20
    paddw m29, m18               ; 11
    psubw m18, m27, m17          ; 19
    paddw m27, m17               ; 12
    psubw m17, m26, m16          ; 18
    paddw m26, m16               ; 13
    paddw m16, m4, m15           ; 14
    psubw m4, m15                ; 17
    mova m15, m6
    psubw m6, m28, m14           ; 16
    paddw m28, m14               ; 15
    mova m14, m7
    punpcklwd m7, m6, m4
    punpckhwd m6, m4
    punpckhwd m4, m17, m18
    punpcklwd m17, m18
    punpckhwd m18, m19, m20
    punpcklwd m19, m20
    punpckhwd m20, m21, m1
    punpcklwd m21, m1
    punpckhwd m1, m2, m11        ; i4 j4 i5 j5 i6 j6 i7 j7
    punpcklwd m2, m11            ; i0 j0 i1 j1 i2 j2 i3 j3
    punpckhwd m11, m12, m29      ; k4 l4 k5 l5 k6 l6 k7 l7
    punpcklwd m12, m29           ; k0 l0 k1 l1 k2 l2 k3 l3
    punpckhwd m29, m27, m26      ; m4 n4 m5 n5 m6 n6 m7 n7
    punpcklwd m27, m26           ; m0 n0 m1 n1 m2 n2 m3 n3
    punpckhwd m26, m16, m28      ; o4 p4 o5 p5 o6 p6 o7 p7
    punpcklwd m16, m28           ; o0 p0 o1 p1 o2 p2 o3 p3
    punpckhdq m28, m2, m12       ; i2 j2 k2 l2 i3 j3 k3 l3
    punpckldq m2, m12            ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq m12, m27, m16      ; m2 n2 o2 p2 m3 n3 o3 p3
    punpckldq m27, m16           ; m0 n0 o0 p0 m1 n1 o1 p1
    punpckhdq m16, m1, m11       ; i6 j6 k6 l6 i7 j7 k7 l7
    punpckldq m1, m11            ; i4 j4 k4 l4 i5 j5 k5 l5
    punpckhdq m11, m29, m26      ; m6 n6 o6 p6 m7 n7 o7 p7
    punpckldq m29, m26           ; m4 n4 o4 p4 m5 n5 o5 p5
    punpckhdq m26, m19, m21
    punpckldq m19, m21
    punpckhdq m21, m6, m4
    punpckldq m6, m4
    punpckhdq m4, m18, m20
    punpckldq m18, m20
    punpckhdq m20, m7, m17
    punpckldq m7, m17
    punpcklqdq m17, m28, m12     ; b02 b10 b18 b26
    punpckhqdq m28, m12          ; b03 b11 b19 b27
    punpckhqdq m12, m2, m27      ; b01 b09 b17 b25
    punpcklqdq m2, m27           ; b00 b08 b16 b24
    punpckhqdq m27, m1, m29      ; b05 b13 b21 b29
    punpcklqdq m1, m29           ; b04 b12 b20 b28
    punpckhqdq m29, m16, m11     ; b07 b15 b23 b31
    punpcklqdq m16, m11          ; b06 b14 b22 b30
    mova [rsp+64* 8], m12
    mova [rsp+64* 9], m28
    mova [rsp+64*10], m27
    mova [rsp+64*11], m29
    punpckhqdq m27, m20, m26     ; c03 c11 c19 c27
    punpcklqdq m20, m26          ; c02 c10 c18 c26
    punpckhqdq m26, m7, m19      ; c01 c09 c17 c25
    punpcklqdq m7, m19           ; c00 c08 c16 c24
    punpckhqdq m28, m6, m18      ; c05 c13 c21 c29
    punpcklqdq m6, m18           ; c04 c12 c20 c28
    punpckhqdq m29, m21, m4      ; c07 c15 c23 c31
    punpcklqdq m21, m4           ; c06 c14 c22 c30
    mov r3d, 64*28
    pxor m4, m4
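    ; Added note: clear the coefficient buffer, 256 bytes per iteration from
    ; the top down; as in the 16x16 pass 2, the block is left zeroed for the
    ; caller.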
.zero_loop:
    mova [cq+r3+64*0], m4
    mova [cq+r3+64*1], m4
    mova [cq+r3+64*2], m4
    mova [cq+r3+64*3], m4
    sub r3d, 64*4
    jge .zero_loop
    vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24
    vinserti32x8 m0, ym2, 1      ; a00 a08 b00 b08
    vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24
    vinserti32x8 m7, ym5, 1      ; c00 c08 d00 d08
    vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28
    vinserti32x8 m1, m8, ym1, 1  ; a04 a12 b04 b12
    vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28
    vinserti32x8 m6, ym3, 1      ; c04 c12 d04 d12
    vshufi32x4 m3, m1, m6, q3131 ; 12
    vshufi32x4 m1, m6, q2020     ;  4
    vshufi32x4 m6, m4, m2, q3131 ; 24
    vshufi32x4 m4, m2, q2020     ; 16
    vshufi32x4 m2, m0, m7, q3131 ;  8
    vshufi32x4 m0, m7, q2020     ;  0
    vshufi32x4 m7, m5, m8, q3131 ; 28
    vshufi32x4 m5, m8, q2020     ; 20
    vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26
    vinserti32x8 m14, ym17, 1    ; a02 a10 b02 b10
    vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26
    vinserti32x8 m20, ym13, 1    ; c02 c10 d02 d10
    vshufi32x4 m13, m21, m9, q3232 ; c22 c30 d22 d30
    vinserti32x8 m21, ym9, 1     ; c06 c14 d06 d14
    vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30
    vinserti32x8 m15, ym16, 1    ; a06 a14 b06 b14
    vshufi32x4 m16, m14, m20, q3131 ; 10
    vshufi32x4 m14, m20, q2020   ;  2
    vshufi32x4 m20, m18, m17, q3131 ; 26
    vshufi32x4 m18, m17, q2020   ; 18
    vshufi32x4 m17, m15, m21, q3131 ; 14
    vshufi32x4 m15, m21, q2020   ;  6
    vshufi32x4 m21, m19, m13, q3131 ; 30
    vshufi32x4 m19, m13, q2020   ; 22
    call .idct16
    mova [rsp+64*0], m14
    mova [rsp+64*1], m15
    mova [rsp+64*2], m16
    mova [rsp+64*3], m17
    mova [rsp+64*4], m18
    mova [rsp+64*5], m19
    mova [rsp+64*6], m20
    mova [rsp+64*7], m21
    mova m15, [rsp+64* 8]
    mova m16, [rsp+64* 9]
    mova m17, [rsp+64*10]
    mova m19, [rsp+64*11]
    mova m20, [rsp+64*12]
    mova m21, [rsp+64*13]
    mova m13, [rsp+64*14]
    mova m18, [rsp+64*15]
    vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25
    vinserti32x8 m22, ym15, 1    ; a01 a09 b01 b09
    vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27
    vinserti32x8 m23, ym16, 1    ; a03 a11 b03 b11
    vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29
    vinserti32x8 m24, ym17, 1    ; a05 a13 b05 b13
    vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31
    vinserti32x8 m25, ym19, 1    ; a07 a15 b07 b15
    vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09
    vshufi32x4 m26, m20, q3232   ; c17 c25 d17 d25
    vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11
    vshufi32x4 m27, m21, q3232   ; c19 c27 d19 d27
    vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13
    vshufi32x4 m28, m13, q3232   ; c21 c29 d21 d29
    vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15
    vshufi32x4 m29, m18, q3232   ; c23 c31 d23 d31
    vshufi32x4 m18, m14, m26, q3131 ; 25
    vshufi32x4 m14, m26, q2020   ; 17
    vshufi32x4 m19, m15, m27, q3131 ; 27
    vshufi32x4 m15, m27, q2020   ; 19
    vshufi32x4 m20, m16, m28, q3131 ; 29
    vshufi32x4 m16, m28, q2020   ; 21
    vshufi32x4 m21, m17, m29, q3131 ; 31
    vshufi32x4 m17, m29, q2020   ; 23
    vshufi32x4 m26, m22, m8, q3131 ;  9
    vshufi32x4 m22, m8, q2020    ;  1
    vshufi32x4 m27, m23, m9, q3131 ; 11
    vshufi32x4 m23, m9, q2020    ;  3
    vshufi32x4 m28, m24, m11, q3131 ; 13
    vshufi32x4 m24, m11, q2020   ;  5
    vshufi32x4 m29, m25, m12, q3131 ; 15
    vshufi32x4 m25, m12, q2020   ;  7
    call .main
    jmp .end
.fast:
    mova m14, [o(dup16_perm)]
    pmovzxbw m9, [cq+64*0]
    pmovzxbw m6, [cq+64*8]
    vpermb m8, m14, [cq+64* 2]
    vpermb m0, m14, [cq+64*14]
    vpermb m5, m14, [cq+64*10]
    vpermb m1, m14, [cq+64* 6]
    vpermb m7, m14, [cq+64* 4]
    vpermb m3, m14, [cq+64*12]
    vpbroadcastd m10, [o(pd_8192)]
    vpbroadcastq m13, [o(int_mshift)]
    packuswb m9, m9
    packuswb m6, m6
    vpcmpub k7, m13, m10, 6
    IDCT16_MAIN 1
    vpermb m21, m14, [cq+64* 1]
    vpermb m17, m14, [cq+64*15]
    vpermb m20, m14, [cq+64* 9]
    vpermb m15, m14, [cq+64* 7]
    vpermb m18, m14, [cq+64* 5]
    vpermb m16, m14, [cq+64*11]
    vpermb m19, m14, [cq+64*13]
    vpermb m14, m14, [cq+64* 3]
    call .main_packed_fast
    punpcklwd m8, m0, m2
    punpckhwd m0, m2
    punpcklwd m2, m1, m3
    punpckhwd m1, m3
    punpcklwd m3, m4, m6
    punpckhwd m4, m6
    punpcklwd m6, m5, m7
    punpckhwd m5, m7
    punpcklwd m7, m14, m16
    punpckhwd m14, m16
    punpcklwd m16, m15, m17
    punpckhwd m15, m17
    punpcklwd m17, m19, m21
    punpckhwd m19, m21
    punpckhwd m21, m18, m20
    punpcklwd m18, m20
    punpcklwd m20, m8, m1
    punpckhwd m8, m1
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    punpcklwd m2, m3, m5
    punpckhwd m3, m5
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    punpcklwd m6, m7, m15
    punpckhwd m7, m15
    punpcklwd m15, m14, m16
    punpckhwd m14, m16
    punpckhwd m16, m18, m19
    punpcklwd m18, m19
    punpcklwd m19, m21, m17
    punpckhwd m21, m17
    punpcklwd m17, m8, m0        ; a2 a6 aa ae
    punpckhwd m8, m0             ; a3 a7 ab af
    punpcklwd m0, m20, m1        ; a0 a4 a8 ac
    punpckhwd m20, m1            ; a1 a5 a9 ad
    punpcklwd m1, m2, m5         ; b0 b4 b8 bc
    punpckhwd m2, m5             ; b1 b5 b9 bd
    punpcklwd m5, m3, m4         ; b2 b6 ba be
    punpckhwd m3, m4             ; b3 b7 bb bf
    punpcklwd m4, m6, m15        ; c0 c4 c8 cc
    punpckhwd m6, m15            ; c1 c5 c9 cd
    punpcklwd m15, m7, m14       ; c2 c6 ca ce
    punpckhwd m7, m14            ; c3 c7 cb cf
    punpcklwd m14, m18, m19      ; d0 d4 d8 dc
    punpckhwd m18, m19           ; d1 d5 d9 dd
    punpcklwd m9, m16, m21       ; d2 d6 da de
    punpckhwd m16, m21           ; d3 d7 db df
    mov r3d, 64*12
    pxor ym21, ym21
.fast_zero_loop:
    mova [cq+r3+64*0], ym21
    mova [cq+r3+64*1], ym21
    mova [cq+r3+64*2], ym21
    mova [cq+r3+64*3], ym21
    sub r3d, 64*4
    jge .fast_zero_loop
    vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
    vinserti32x8 m0, ym1, 1      ; a0 a4 b0 b4
    vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
    vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
    vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
    vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
    vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
    vshufi32x4 m4, m14, q3232    ; c8 cc d8 dc
    vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
    vshufi32x4 m20, m2, q3232    ; a9 ad b9 bd
    vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
    vshufi32x4 m6, m18, q3232    ; c9 cd d9 dd
    vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
    vshufi32x4 m15, m9, q3232    ; ca ce da de
    vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
    vshufi32x4 m7, m16, q3232    ; cb cf db df
    vshufi32x4 m22, m14, m2, q2020 ;  1
    vshufi32x4 m24, m14, m2, q3131 ;  5
    vshufi32x4 m23, m17, m9, q2020 ;  3
    vshufi32x4 m25, m17, m9, q3131 ;  7
    vshufi32x4 m16, m5, m15, q2020 ; 10
    vshufi32x4 m17, m5, m15, q3131 ; 14
    vshufi32x4 m14, m1, m18, q2020 ;  2
    vshufi32x4 m15, m1, m18, q3131 ;  6
    vshufi32x4 m1, m0, m3, q3131 ;  4
    vshufi32x4 m0, m3, q2020     ;  0
    vshufi32x4 m3, m21, m4, q3131 ; 12
    vshufi32x4 m2, m21, m4, q2020 ;  8
    vshufi32x4 m26, m20, m6, q2020 ;  9
    vshufi32x4 m28, m20, m6, q3131 ; 13
    vshufi32x4 m27, m19, m7, q2020 ; 11
    vshufi32x4 m29, m19, m7, q3131 ; 15
    call .idct16_fast
    mova [rsp+64*0], m14
    mova [rsp+64*1], m15
    mova [rsp+64*2], m16
    mova [rsp+64*3], m17
    mova [rsp+64*4], m18
    mova [rsp+64*5], m19
    mova [rsp+64*6], m20
    mova [rsp+64*7], m21
    call .main_fast
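    ; Added note: each IDCT_32x32_END call below performs the final 32-point
    ; butterfly (even-half idct16 rows 0-7 still in m0-m7, rows 8-15 reloaded
    ; from the stack; odd half from .main/.main_fast), producing rows n and
    ; 31-n together. Rows 0-15 are written forward from dstq, rows 31-16
    ; backward from r3 (dst + stride*28).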
.end:
    lea r4, [strideq*3]
    vpbroadcastd m12, [o(pw_512)]
    movshdup m13, [o(itx_perm)]
    lea r3, [dstq+r4*8]
    lea r5, [strideq+r4]         ; stride*4
    add r3, r5                   ; dst+stride*28
    IDCT_32x32_END 29, 0, strideq*0, r4
    IDCT_32x32_END 28, 1, strideq*1, strideq*2
    IDCT_32x32_END 27, 2, strideq*2, strideq*1
    IDCT_32x32_END 26, 3, r4, strideq*0
    IDCT_32x32_END 25, 4, strideq*0, r4
    IDCT_32x32_END 24, 5, strideq*1, strideq*2
    IDCT_32x32_END 23, 6, strideq*2, strideq*1
    IDCT_32x32_END 22, 7, r4, strideq*0
    IDCT_32x32_END 21, 8, strideq*0, r4
    IDCT_32x32_END 20, 9, strideq*1, strideq*2
    IDCT_32x32_END 19, 10, strideq*2, strideq*1
    IDCT_32x32_END 18, 11, r4, strideq*0
    IDCT_32x32_END 17, 12, strideq*0, r4
    IDCT_32x32_END 16, 13, strideq*1, strideq*2
    IDCT_32x32_END 15, 14, strideq*2, strideq*1
    IDCT_32x32_END 14, 15, r4, strideq*0
    RET
ALIGN function_align
.idct16_fast:
    vpbroadcastd m21, [o(pw_16305x2)]
    vpbroadcastd m8, [o(pw_1606x2)]
    vpbroadcastd m18, [o(pw_m10394x2)]
    vpbroadcastd m9, [o(pw_12665x2)]
    pmulhrsw m21, m14            ; t15a
    vpbroadcastd m19, [o(pw_14449x2)]
    pmulhrsw m14, m8             ; t8a
    vpbroadcastd m8, [o(pw_7723x2)]
    pmulhrsw m18, m17            ; t9a
    vpbroadcastd m20, [o(pw_m4756x2)]
    pmulhrsw m17, m9             ; t14a
    vpbroadcastd m9, [o(pw_15679x2)]
    pmulhrsw m19, m16            ; t13a
    vpbroadcastd m5, [o(pw_m9102x2)]
    pmulhrsw m16, m8             ; t10a
    vpbroadcastd m8, [o(pw_13623x2)]
    pmulhrsw m20, m15            ; t11a
    vpbroadcastd m7, [o(pw_16069x2)]
    pmulhrsw m15, m9             ; t12a
    vpbroadcastd m9, [o(pw_3196x2)]
    pmulhrsw m5, m3              ; t5a
    vpbroadcastd m6, [o(pw_15137x2)]
    pmulhrsw m3, m8              ; t6a
    vpbroadcastd m8, [o(pw_6270x2)]
    pmulhrsw m7, m1              ; t7a
    vpbroadcastd m4, [o(pw_11585x2)]
    pmulhrsw m1, m9              ; t4
    vpbroadcastd m10, [o(pd_8192)]
    pmulhrsw m6, m2              ; t3
    pmulhrsw m2, m8              ; t2
    pmulhrsw m4, m0              ; t0
    mova m0, m4                  ; t1
    jmp .idct16b
ALIGN function_align
.idct16:
    vpbroadcastd m10, [o(pd_8192)]
    ITX_MULSUB_2W 14, 21, 8, 9, 10, 1606, 16305  ; t8a,  t15a
    ITX_MULSUB_2W 18, 17, 8, 9, 10, 12665, 10394 ; t9a,  t14a
    ITX_MULSUB_2W 16, 19, 8, 9, 10, 7723, 14449  ; t10a, t13a
    ITX_MULSUB_2W 20, 15, 8, 9, 10, 15679, 4756  ; t11a, t12
    ITX_MULSUB_2W 5, 3, 8, 9, 10, 13623, 9102    ; t5a,  t6a
    ITX_MULSUB_2W 1, 7, 8, 9, 10, 3196, 16069    ; t4a,  t7a
    ITX_MULSUB_2W 2, 6, 8, 9, 10, 6270, 15137    ; t2,   t3
    ITX_MULSUB_2W 0, 4, 8, 9, 10, 11585, 11585   ; t1,   t0
.idct16b:
    paddw m8, m20, m16           ; t11
    psubw m20, m16               ; t10
    paddw m16, m15, m19          ; t12
    psubw m15, m19               ; t13
    psubw m19, m14, m18          ; t9
    paddw m14, m18               ; t8
    psubw m18, m21, m17          ; t14
    paddw m21, m17               ; t15
    vpbroadcastd m11, [o(pw_6270_15137)]
    vpbroadcastd m12, [o(pw_m15137_6270)]
    ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12      ; t9a,  t14a
    vpbroadcastd m11, [o(pw_m6270_m15137)]
    ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11      ; t10a, t13a
    vpbroadcastd m11, [o(pw_11585_11585)]
    vpbroadcastd m12, [o(pw_m11585_11585)]
    paddw m9, m7, m3             ; t7
    psubw m3, m7, m3             ; t6a
    paddw m7, m1, m5             ; t4
    psubw m1, m5                 ; t5a
    psubw m17, m14, m8           ; t11a
    paddw m8, m14                ; t8a
    paddw m14, m18, m15          ; t9
    psubw m18, m15               ; t10
    psubw m15, m19, m20          ; t13
    paddw m19, m20               ; t14
    paddw m20, m21, m16          ; t15a
    psubw m16, m21, m16          ; t12a
    ITX_MULSUB_2W 3, 1, 5, 21, 10, 11, 12        ; t5,   t6
    ITX_MULSUB_2W 15, 18, 5, 21, 10, 11, 12      ; t10a, t13a
    ITX_MULSUB_2W 16, 17, 5, 21, 10, 11, 12      ; t11,  t12
    psubw m5, m0, m2             ; t2
    paddw m2, m0                 ; t1
    paddw m0, m4, m6             ; t0
    psubw m4, m6                 ; t3
    psubw m6, m2, m1             ; t6
    paddw m1, m2                 ; t1
    paddw m2, m5, m3             ; t2
    psubw m5, m3                 ; t5
    paddw m3, m4, m7             ; t3
    psubw m4, m7                 ; t4
    psubw m7, m0, m9             ; t7
    paddw m0, m9                 ; t0
    psubw m21, m0, m20           ; out15
    paddw m0, m20                ; out0
    psubw m20, m1, m19           ; out14
    paddw m1, m19                ; out1
    psubw m19, m2, m18           ; out13
    paddw m2, m18                ; out2
    psubw m18, m3, m17           ; out12
    paddw m3, m17                ; out3
    psubw m17, m4, m16           ; out11
    paddw m4, m16                ; out4
    psubw m16, m5, m15           ; out10
    paddw m5, m15                ; out5
    psubw m15, m6, m14           ; out9
    paddw m6, m14                ; out6
    psubw m14, m7, m8            ; out8
    paddw m7, m8                 ; out7
    ret
ALIGN function_align
.main_fast:
    vpbroadcastd m21, [o(pw_16364x2)]
    vpbroadcastd m8, [o(pw_804x2)]
    vpbroadcastd m14, [o(pw_m11003x2)]
    vpbroadcastd m9, [o(pw_12140x2)]
    pmulhrsw m21, m22            ; t31a
    vpbroadcastd m17, [o(pw_14811x2)]
    pmulhrsw m22, m8             ; t16a
    vpbroadcastd m8, [o(pw_7005x2)]
    pmulhrsw m14, m29            ; t30a
    vpbroadcastd m18, [o(pw_m5520x2)]
    pmulhrsw m29, m9             ; t17a
    vpbroadcastd m9, [o(pw_15426x2)]
    pmulhrsw m17, m26            ; t29a
    vpbroadcastd m19, [o(pw_15893x2)]
    pmulhrsw m26, m8             ; t18a
    vpbroadcastd m8, [o(pw_3981x2)]
    pmulhrsw m18, m25            ; t19a
    vpbroadcastd m16, [o(pw_m8423x2)]
    pmulhrsw m25, m9             ; t28a
    vpbroadcastd m9, [o(pw_14053x2)]
    pmulhrsw m19, m24            ; t27a
    vpbroadcastd m15, [o(pw_13160x2)]
    pmulhrsw m24, m8             ; t20a
    vpbroadcastd m8, [o(pw_9760x2)]
    pmulhrsw m16, m27            ; t21a
    vpbroadcastd m20, [o(pw_m2404x2)]
    pmulhrsw m27, m9             ; t26a
    vpbroadcastd m9, [o(pw_16207x2)]
    pmulhrsw m15, m28            ; t25a
    pmulhrsw m28, m8             ; t22a
    pmulhrsw m20, m23            ; t23a
    pmulhrsw m23, m9             ; t24a
    jmp .main2
ALIGN function_align
.main:
    ITX_MULSUB_2W 22, 21, 8, 9, 10, 804, 16364    ; t16a, t31a
    ITX_MULSUB_2W 14, 29, 8, 9, 10, 12140, 11003  ; t17a, t30a
    ITX_MULSUB_2W 26, 17, 8, 9, 10, 7005, 14811   ; t18a, t29a
    ITX_MULSUB_2W 18, 25, 8, 9, 10, 15426, 5520   ; t19a, t28a
    ITX_MULSUB_2W 24, 19, 8, 9, 10, 3981, 15893   ; t20a, t27a
    ITX_MULSUB_2W 16, 27, 8, 9, 10, 14053, 8423   ; t21a, t26a
    ITX_MULSUB_2W 28, 15, 8, 9, 10, 9760, 13160   ; t22a, t25a
    ITX_MULSUB_2W 20, 23, 8, 9, 10, 16207, 2404   ; t23a, t24a
.main2:
    psubw m8, m22, m14           ; t17
    paddw m22, m14               ; t16
    paddw m14, m18, m26          ; t19
    psubw m18, m26               ; t18
    psubw m26, m24, m16          ; t21
    paddw m24, m16               ; t20
    psubw m16, m20, m28          ; t22
    paddw m28, m20               ; t23
    psubw m20, m23, m15          ; t25
    paddw m23, m15               ; t24
    psubw m15, m21, m29          ; t30
    paddw m21, m29               ; t31
    psubw m29, m19, m27          ; t26
    paddw m19, m27               ; t27
    paddw m27, m25, m17          ; t28
    psubw m25, m17               ; t29
    ITX_MULSUB_2W 15, 8, 9, 17, 10, 3196, 16069   ; t17a, t30a
    ITX_MULSUB_2W 25, 18, 9, 17, 10, m16069, 3196 ; t18a, t29a
    ITX_MULSUB_2W 29, 26, 9, 17, 10, 13623, 9102  ; t21a, t26a
    ITX_MULSUB_2W 20, 16, 9, 17, 10, m9102, 13623 ; t22a, t25a
    psubw m17, m21, m27          ; t28a
    paddw m21, m27               ; t31a
    psubw m27, m15, m25          ; t18
    paddw m15, m25               ; t17
    psubw m25, m20, m29          ; t21
    paddw m20, m29               ; t22
    psubw m29, m8, m18           ; t29
    paddw m8, m18                ; t30
    psubw m18, m22, m14          ; t19a
    paddw m22, m14               ; t16a
    psubw m14, m28, m24          ; t20a
    paddw m24, m28               ; t23a
    paddw m28, m16, m26          ; t25
    psubw m16, m26               ; t26
    psubw m26, m23, m19          ; t27a
    paddw m23, m19               ; t24a
    vpbroadcastd m12, [o(pw_m15137_6270)]
    vpbroadcastd m11, [o(pw_6270_15137)]
    ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12       ; t18a, t29a
    ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12       ; t19,  t28
    vpbroadcastd m11, [o(pw_m6270_m15137)]
    ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11       ; t21a, t26a
    ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11       ; t20,  t27
    vpbroadcastd m12, [o(pw_m11585_11585)]
    vpbroadcastd m11, [o(pw_11585_11585)]
    psubw m19, m27, m25          ; t26
    paddw m27, m25               ; t29
    psubw m25, m17, m26          ; t20a
    paddw m17, m26               ; t19a
    paddw m26, m18, m14          ; t28a
    psubw m18, m14               ; t27a
    paddw m14, m22, m24          ; t16
    psubw m22, m24               ; t23
    psubw m24, m29, m16          ; t21
    paddw m16, m29               ; t18
    paddw m29, m21, m23          ; t31
    psubw m21, m23               ; t24
    psubw m23, m15, m20          ; t22a
    paddw m15, m20               ; t17a
    psubw m20, m8, m28           ; t25a
    paddw m28, m8                ; t30a
    ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12        ; t20,  t27
    ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12        ; t21a, t26a
    ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12        ; t23a, t24a
    ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12        ; t22,  t25
    ret
ALIGN function_align
.main_packed_fast:
    vpbroadcastd m8, [o(pw_804_16364x2)]
    vpbroadcastd m9, [o(pw_m11003_12140x2)]
    vpbroadcastd m11, [o(pw_7005_14811x2)]
    vpbroadcastd m12, [o(pw_m5520_15426x2)]
    pmulhrsw m21, m8             ; t16a, t31a
    vpbroadcastd m8, [o(pw_3981_15893x2)]
    pmulhrsw m17, m9             ; t17a, t30a
    vpbroadcastd m9, [o(pw_m8423_14053x2)]
    pmulhrsw m20, m11            ; t18a, t29a
    vpbroadcastd m11, [o(pw_9760_13160x2)]
    pmulhrsw m15, m12            ; t19a, t28a
    vpbroadcastd m12, [o(pw_m2404_16207x2)]
    pmulhrsw m18, m8             ; t20a, t27a
    pmulhrsw m16, m9             ; t21a, t26a
    pmulhrsw m19, m11            ; t22a, t25a
    pmulhrsw m14, m12            ; t23a, t24a
    psubw m8, m21, m17           ; t17 t30
    paddw m21, m17               ; t16 t31
    psubw m17, m15, m20          ; t18 t29
    paddw m20, m15               ; t19 t28
    psubw m15, m18, m16          ; t21 t26
    paddw m18, m16               ; t20 t27
    psubw m16, m14, m19          ; t22 t25
    paddw m14, m19               ; t23 t24
    ITX_MUL2X_PACK 8, 9, 19, 10, 3196, 16069, 5    ; t17a t30a
    ITX_MUL2X_PACK 17, 9, 19, 10, m16069, 3196, 5  ; t18a t29a
    ITX_MUL2X_PACK 15, 9, 19, 10, 13623, 9102, 5   ; t21a t26a
    ITX_MUL2X_PACK 16, 9, 19, 10, m9102, 13623, 5  ; t22a t25a
    vpbroadcastd m11, [o(pw_m15137_6270)]
    psubw m19, m21, m20          ; t19a t28a
    paddw m21, m20               ; t16a t31a
    psubw m20, m14, m18          ; t20a t27a
    paddw m14, m18               ; t23a t24a
    psubw m18, m8, m17           ; t18  t29
    paddw m8, m17                ; t17  t30
    psubw m17, m16, m15          ; t21  t26
    paddw m15, m16               ; t22  t25
    ITX_MUL2X_PACK 18, 9, 16, 10, 6270_15137, 11, 20    ; t18a t29a
    ITX_MUL2X_PACK 19, 9, 16, 10, 6270_15137, 11, 20    ; t19  t28
    ITX_MUL2X_PACK 20, 9, 16, 10, 11, m6270_m15137, 36  ; t20  t27
    ITX_MUL2X_PACK 17, 9, 16, 10, 11, m6270_m15137, 36  ; t21a t26a
    vbroadcasti32x4 m9, [o(deint_shuf)]
    psubw m16, m21, m14          ; t23  t24
    paddw m14, m21               ; t16  t31
    psubw m21, m8, m15           ; t22a t25a
    paddw m15, m8                ; t17a t30a
    psubw m8, m18, m17           ; t21  t26
    paddw m18, m17               ; t18  t29
    paddw m17, m19, m20          ; t19a t28a
    psubw m19, m20               ; t20a t27a
    vpbroadcastd m11, [o(pw_m11585_11585)]
    vpbroadcastd m12, [o(pw_11585_11585)]
    REPX {pshufb x, m9}, m14, m15, m18, m17
    mova m9, m10
    vpdpwssd m9, m16, m11
    mova m20, m10
    vpdpwssd m20, m21, m11
    psrad m9, 14
    psrad m20, 14
    packssdw m9, m20             ; t23a t22
    mova m20, m10
    vpdpwssd m20, m16, m12
    mova m16, m10
    vpdpwssd m16, m21, m12
    psrad m20, 14
    psrad m16, 14
    packssdw m16, m20, m16       ; t24a t25
    ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8   ; t21a t26a
    ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8   ; t20  t27
    packssdw m11, m20            ; t27  t26a
    packssdw m8, m21             ; t20  t21a
    punpcklqdq m20, m14, m15     ; t16  t17a
    punpckhqdq m14, m15          ; t31  t30a
    punpckhqdq m15, m17, m18     ; t28a t29
    punpcklqdq m17, m18          ; t19a t18
    psubw m21, m0, m14           ; out31 out30
    paddw m0, m14                ; out0  out1
    psubw m14, m7, m20           ; out16 out17
    paddw m7, m20                ; out15 out14
    psubw m20, m1, m15           ; out28 out29
    paddw m1, m15                ; out3  out2
    psubw m15, m6, m17           ; out19 out18
    paddw m6, m17                ; out12 out13
    psubw m17, m4, m9            ; out23 out22
    paddw m4, m9                 ; out8  out9
    psubw m18, m3, m16           ; out24 out25
    paddw m3, m16                ; out7  out6
    psubw m16, m5, m8            ; out20 out21
    paddw m5, m8                 ; out11 out10
    psubw m19, m2, m11           ; out27 out26
    paddw m2, m11                ; out4  out5
    ret
%endif