;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2025 Two Orioles, LLC
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL
SECTION_RODATA 64
dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
itx_perm: dq 0x0000000820150440, 0x0000000231372604
dq 0x0000000ca8041551, 0x00000006b9263715
dq 0x00000001ec9d8c62, 0x0000000bfdbfae26
dq 0x00000005648c9d73, 0x0000000f75aebf37
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
pw_512: times 4 dw 512
pw_m512: times 4 dw -512
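; pmulhrsw against pw_512 computes (x*512*2 + 0x8000) >> 16 == (x + 32) >> 6,
; i.e. the final rounded shift applied when writing out the transform.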
pw_15137_6270x2x4: times 4 dw 15137*2
times 4 dw 6270*2
pw_11585_m11585x2x4: times 4 dw 11585*2
pw_m11585_11585x2x4: times 4 dw -11585*2
pw_11585_11585x2: times 4 dw 11585*2
int_mshift: db 142, 150, 0, 0, 174, 182, 0, 0
pd_8192: dd 8192
pw_804x2: times 2 dw 804*2
pw_1606x2: times 2 dw 1606*2
pw_3196x2: times 2 dw 3196*2
pw_3981x2: times 2 dw 3981*2
pw_6270x2: times 2 dw 6270*2
pw_7005x2: times 2 dw 7005*2
pw_7723x2: times 2 dw 7723*2
pw_9760x2: times 2 dw 9760*2
pw_12140x2: times 2 dw 12140*2
pw_12665x2: times 2 dw 12665*2
pw_13160x2: times 2 dw 13160*2
pw_13623x2: times 2 dw 13623*2
pw_14053x2: times 2 dw 14053*2
pw_14449x2: times 2 dw 14449*2
pw_14811x2: times 2 dw 14811*2
pw_15137x2: times 2 dw 15137*2
pw_15426x2: times 2 dw 15426*2
pw_15679x2: times 2 dw 15679*2
pw_15893x2: times 2 dw 15893*2
pw_16069x2: times 2 dw 16069*2
pw_16207x2: times 2 dw 16207*2
pw_16305x2: times 2 dw 16305*2
pw_16364x2: times 2 dw 16364*2
pw_m2404x2: times 2 dw -2404*2
pw_m4756x2: times 2 dw -4756*2
pw_m5520x2: times 2 dw -5520*2
pw_m8423x2: times 2 dw -8423*2
pw_m9102x2: times 2 dw -9102*2
pw_m10394x2: times 2 dw -10394*2
pw_m11003x2: times 2 dw -11003*2
pw_804_16364x2: dw 804*2, 16364*2
pw_1606_16305x2: dw 1606*2, 16305*2
pw_3196_16069x2: dw 3196*2, 16069*2
pw_3981_15893x2: dw 3981*2, 15893*2
pw_7005_14811x2: dw 7005*2, 14811*2
pw_7723_14449x2: dw 7723*2, 14449*2
pw_9760_13160x2: dw 9760*2, 13160*2
pw_m2404_16207x2: dw -2404*2, 16207*2
pw_m4756_15679x2: dw -4756*2, 15679*2
pw_m5520_15426x2: dw -5520*2, 15426*2
pw_m8423_14053x2: dw -8423*2, 14053*2
pw_m9102_13623x2: dw -9102*2, 13623*2
pw_m10394_12665x2: dw -10394*2, 12665*2
pw_m11003_12140x2: dw -11003*2, 12140*2
%macro COEF_PAIR 2-3 0
%if %3 & 4
pw_%1_m%2: dw %1, -%2
%else
pw_%1_%2: dw %1, %2
%if %3 & 2
pw_m%1_%2: dw -%1, %2
%else
pw_m%2_%1: dw -%2, %1
%endif
%endif
%if %3 & 1
pw_m%1_m%2: dw -%1, -%2
%endif
%endmacro
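; For reference, the flag bits select which sign combinations are emitted, e.g.:
;   COEF_PAIR  3196, 16069, 1 -> pw_3196_16069, pw_m16069_3196, pw_m3196_m16069
;   COEF_PAIR  2404,  9760, 2 -> pw_2404_9760, pw_m2404_9760
;   COEF_PAIR 12140, 16364, 5 -> pw_12140_m16364, pw_m12140_m16364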
COEF_PAIR 804, 16364
COEF_PAIR 1606, 16305
COEF_PAIR 3196, 16069, 1
COEF_PAIR 3981, 15893
COEF_PAIR 6270, 15137, 1
COEF_PAIR 7005, 14811
COEF_PAIR 7723, 14449
COEF_PAIR 9102, 13623
COEF_PAIR 9760, 13160
COEF_PAIR 11585, 11585, 1
COEF_PAIR 12140, 11003
COEF_PAIR 12665, 10394
COEF_PAIR 13623, 9102, 1
COEF_PAIR 14053, 8423
COEF_PAIR 15137, 6270
COEF_PAIR 15426, 5520
COEF_PAIR 15679, 4756
COEF_PAIR 16069, 3196
COEF_PAIR 16207, 2404
; ADST16-only:
COEF_PAIR 2404, 9760, 2
COEF_PAIR 5520, 7005, 2
COEF_PAIR 8423, 3981, 2
COEF_PAIR 11003, 804, 2
COEF_PAIR 12140, 16364, 5
COEF_PAIR 14053, 15893, 5
COEF_PAIR 15426, 14811, 5
COEF_PAIR 16207, 13160, 5
pw_11585_m11585: dw 11585, -11585
pw_16069_m3196: dw 16069, -3196
pw_9102_m13623: dw 9102, -13623
pw_15137_m6270: dw 15137, -6270
pw_6270_m15137: dw 6270, -15137
%define pw_11585x2 pw_11585_11585x2
%define pw_m11585x2 pw_m11585_11585x2x4
SECTION .text
%define o_base pw_512 + 128
%define o(x) (r6 - (o_base) + (x))
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
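; Each entry point loads r6 with o_base, so every constant reference made
; through o(x) becomes a plain base+displacement operand off a single register;
; the +128 bias appears to be chosen so that the most frequently used constants
; around pw_512 stay within the compressed 8-bit displacement range of EVEX loads.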
; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
; 16 = special_mul1, 32 = special_mul2, 64 = dst_in_tmp1
%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
mova m%2, m%4
%if %7 & 16
vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
mova m%3, m%4
%if %7 & 32
vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
%else
vpdpwssd m%3, m%1, m%6
%endif
%elif %7 & 32
vpdpwssd m%2, m%1, m%5
mova m%3, m%4
vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
%elif %6 < 32
vpdpwssd m%2, m%1, m%5
mova m%3, m%4
vpdpwssd m%3, m%1, m%6
%elif %7 & 1
vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
mova m%3, m%4
vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
%else
vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
mova m%3, m%4
vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
%endif
%if %7 & 2
psrld m%2, 14
pslld m%3, 2
vpshrdd m%1, m%3, m%2, 16
%elif %7 & 4
; compared to using shifts (as above) this has better throughput,
; but worse latency and requires setting up the opmask/index
; registers, so only use this method for the larger transforms
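; the accumulators already contain the +8192 rounding term, so the final
; 16-bit results are simply bits 14-29 of each dword: pslld by 2 moves those
; bits into the upper word of the destination, and vpmultishiftqb copies
; bits 14-29 of each source dword into the byte lanes selected by k7,
; packing both >>14 results into one register; the int_mshift bit offsets
; are stored with +128 so the same constant also yields the k7 merge mask
; via the unsigned byte compare against pd_8192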
%if %7 & 64
pslld m%2, 2
vpmultishiftqb m%2{k7}, m13, m%3
%else
pslld m%1, m%2, 2
vpmultishiftqb m%1{k7}, m13, m%3
%endif
%else
psrad m%2, 14
psrad m%3, 14
%if %7 & 8 == 0
packssdw m%1, m%3, m%2
%endif
%endif
%endmacro
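; Rough scalar model of the flag-free path above with immediate coefficients,
; for reference only; (x, y) denotes each interleaved 16-bit input pair:
;   lo = ( x*coef1 + y*coef2 + 8192) >> 14
;   hi = (-x*coef2 + y*coef1 + 8192) >> 14
;   dst = pack(lo, hi) per 128-bit lane (flag 1 swaps the two halves)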
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
punpcklwd m%3, m%2, m%1
punpckhwd m%2, m%1
%if %7 < 32
mova m%1, m%5
vpdpwssd m%1, m%3, m%7
mova m%4, m%5
vpdpwssd m%4, m%2, m%7
%else
mova m%1, m%5
vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
mova m%4, m%5
vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
%endif
psrad m%1, 14
psrad m%4, 14
packssdw m%1, m%4
mova m%4, m%5
%if %7 < 32
vpdpwssd m%4, m%2, m%6
mova m%2, m%5
vpdpwssd m%2, m%3, m%6
%else
vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
mova m%2, m%5
vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
%endif
psrad m%4, 14
psrad m%2, 14
packssdw m%2, m%4
%endmacro
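; Scalar equivalent of the rotation above (illustrative sketch only; the
; saturation performed by the final packssdw is omitted):
;   x = *dst1; y = *dst2;
;   *dst1 = (x*coef1 - y*coef2 + 8192) >> 14;
;   *dst2 = (x*coef2 + y*coef1 + 8192) >> 14;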
; flags: 1 = swap, 2 = invert2, 4 = invert1
%macro ADST_MULSUB_4W 10-11 0 ; dst1/src1, src2, dst2, tmp[1-2], rnd, coef[1-4], flags
mova m%3, m%6
%if %11 & 1
vpdpwssd m%3, m%1, [o(pw_m%8_%7)] {bcstd}
%else
vpdpwssd m%3, m%1, [o(pw_%7_%8)] {bcstd}
%endif
%if %11 & 4
vpbroadcastd m%4, [o(pw_m%9_%10)]
%elif %11 & 2
vpbroadcastd m%4, [o(pw_%9_m%10)]
%elif %11 & 1
vpbroadcastd m%4, [o(pw_%10_%9)]
%else
vpbroadcastd m%4, [o(pw_%9_%10)]
%endif
pmaddwd m%4, m%2
mova m%5, m%6
%if %11 & 4
vpdpwssd m%5, m%1, [o(pw_%8_m%7)] {bcstd}
%elif %11 & 1
vpdpwssd m%5, m%1, [o(pw_%7_%8)] {bcstd}
%else
vpdpwssd m%5, m%1, [o(pw_m%8_%7)] {bcstd}
%endif
%if %11 & 2
vpbroadcastd m%1, [o(pw_%10_%9)]
%elif %11 & 1
vpbroadcastd m%1, [o(pw_%9_m%10)]
%else
vpbroadcastd m%1, [o(pw_m%10_%9)]
%endif
pmaddwd m%2, m%1
paddd m%1, m%3, m%4
psubd m%3, m%4
paddd m%4, m%5, m%2
psubd m%5, m%2
pslld m%1, 2
pslld m%3, 2
vpmultishiftqb m%1{k7}, m13, m%4
vpmultishiftqb m%3{k7}, m13, m%5
%endmacro
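; ADST_MULSUB_4W performs two rotations at once on interleaved word pairs:
; dst1 collects the sum terms and dst2 the difference terms, and the final
; pslld+vpmultishiftqb pair does the (x + 8192) >> 14 rounding and packing
; (see the flag-4 path of ITX_MUL2X_PACK above).

; WRAP_YMM emits the wrapped macro with 256-bit registers, which the
; half-populated fast paths below rely on.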
%macro WRAP_YMM 1+
INIT_YMM cpuname
%1
INIT_ZMM cpuname
%endmacro
%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
cglobal vp9_i%1_i%2_%3_add, 4, 5, 0, dst, stride, c, eob, tx2
%undef cmp
%define %%p1 m(vp9_i%1_%3_internal)
lea r6, [o_base]
; If we're not taking the fast path, jump to the 1st txfm function, which in
; turn ends by jumping indirectly (via tx2q) to the 2nd txfm function.
lea tx2q, [m(vp9_i%2_%3_internal).pass2]
%ifidn %1_%2, dct_dct
cmp eobd, 1
jne %%p1
%else
%if %4
add eobd, %4
%endif
; jump to the 1st txfm function unless it's located directly after this
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
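; the label difference is zero when the internal function is emitted right at
; %%end (both are aligned to function_align), so no jmp is assembled and we
; simply fall through; any other placement gives a negative difference whose
; bit 31 is set, making "times" emit the jmp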
ALIGN function_align
%%end:
%endif
%endmacro
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, 16x16, %3
%ifidn %1_%2, dct_dct
movd xmm0, [o(pw_11585x2)]
pmulhrsw xmm3, xmm0, [cq]
pxor ym2, ym2
pmulhrsw xmm3, xmm0
pmulhrsw xmm3, [o(pw_512)]
mova [cq], xm2
add r3d, 7
vpbroadcastw ym3, xmm3
.dconly_loop:
mova xm1, [dstq+strideq*0]
vinserti32x4 ym1, [dstq+strideq*1], 1
punpcklbw ym0, ym1, ym2
punpckhbw ym1, ym2
paddw ym0, ym3
paddw ym1, ym3
packuswb ym0, ym1
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
lea dstq, [dstq+strideq*2]
dec r3d
jg .dconly_loop
RET
%endif
%endmacro
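; Scalar sketch of the .dconly path above (the saturating add to the pixels
; is omitted), for reference only:
;   dc = round(coeff[0] * 11585 / 16384)   ; pmulhrsw by pw_11585x2
;   dc = round(dc * 11585 / 16384)
;   dc = (dc + 32) >> 6                    ; pmulhrsw by pw_512
; followed by adding dc to every pixel of the 16x16 block.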
%macro IDCT16_MAIN 0-1 0 ; idct32
%if mmsize == 64 && %1 == 0
.main_fast:
%endif
vpbroadcastd m2, [o(pw_1606_16305x2)]
vpbroadcastd m4, [o(pw_m10394_12665x2)]
vpbroadcastd m11, [o(pw_7723_14449x2)]
vpbroadcastd m12, [o(pw_m4756_15679x2)]
pmulhrsw m8, m2 ; t8a t15a
vpbroadcastd m2, [o(pw_3196_16069x2)]
pmulhrsw m0, m4 ; t9a t14a
vpbroadcastd m4, [o(pw_m9102_13623x2)]
pmulhrsw m5, m11 ; t10a t13a
vpbroadcastd m11, [o(pw_11585_11585x2)]
pmulhrsw m1, m12 ; t11a t12a
vbroadcasti32x4 m12, [o(pw_15137_6270x2x4)]
pmulhrsw m7, m2 ; t4a t7a
pmulhrsw m3, m4 ; t5a t6a
pmulhrsw m9, m11 ; t0 t1
pmulhrsw m6, m12 ; t3 t2
%if mmsize == 64 && %1 == 0
jmp %%main2
ALIGN function_align
.main:
punpckhwd m8, m7, m0 ; dct16 in15 in1
punpcklwd m9, m4, m0 ; dct4 in2 in0
punpckhwd m0, m3, m4 ; dct16 in7 in9
punpcklwd m7, m1 ; dct8 in7 in1
punpckhwd m1, m6 ; dct16 in3 in13
punpcklwd m3, m5 ; dct8 in3 in5
punpckhwd m5, m2 ; dct16 in11 in5
punpcklwd m6, m2 ; dct4 in3 in1
ITX_MUL2X_PACK 8, 2, 4, 10, 1606, 16305, 5 ; t8a t15a
ITX_MUL2X_PACK 0, 2, 4, 10, 12665, 10394, 5 ; t9a t14a
ITX_MUL2X_PACK 5, 2, 4, 10, 7723, 14449, 5 ; t10a t13a
ITX_MUL2X_PACK 1, 2, 4, 10, 15679, 4756, 5 ; t11a t12a
ITX_MUL2X_PACK 7, 2, 4, 10, 3196, 16069, 5 ; t4a t7a
ITX_MUL2X_PACK 3, 2, 4, 10, 13623, 9102, 5 ; t5a t6a
ITX_MUL2X_PACK 9, 2, 4, 10, 11585, 11585 ; t0 t1
ITX_MUL2X_PACK 6, 2, 4, 10, 6270, 15137 ; t3 t2
%%main2:
%endif
psubw m2, m8, m0 ; t9 t14
paddw m8, m0 ; t8 t15
psubw m4, m1, m5 ; t10 t13
paddw m1, m5 ; t11 t12
ITX_MUL2X_PACK 2, 0, 5, 10, 6270, 15137, (1|%1*4) ; t9a t14a
ITX_MUL2X_PACK 4, 0, 5, 10, m15137, 6270, (1|%1*4) ; t10a t13a
vbroadcasti32x4 m5, [o(deint_shuf)]
psubw m0, m8, m1 ; t11a t12a
paddw m8, m1 ; t8a t15a
psubw m1, m7, m3 ; t5a t6a
paddw m7, m3 ; t4 t7
pshufb m8, m5
pshufb m7, m5
paddw m3, m2, m4 ; t9 t14
psubw m2, m4 ; t10 t13
%if %1
vpbroadcastd m12, [o(pw_11585_11585)]
vpbroadcastd m11, [o(pw_m11585_11585)]
pshufb m3, m5
ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6
ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12
ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a
packssdw m5, m11 ; t12 t13a
packssdw m4, m0 ; t11 t10a
%else
pshufb m0, m5
ITX_MUL2X_PACK 1, 4, 5, 10, 11585_11585, m11585_11585, 48 ; t5 t6
vpbroadcastd m11, [o(pw_11585x2)]
punpckhqdq m5, m0, m2 ; t12a t13
punpcklqdq m0, m2 ; t11a t10
psubw m4, m5, m0
paddw m5, m0
pmulhrsw m4, m11 ; t11 t10a
pmulhrsw m5, m11 ; t12 t13a
%endif
punpckhqdq m2, m7, m1 ; t7 t6
punpcklqdq m7, m1 ; t4 t5
psubw m1, m9, m6 ; t3 t2
paddw m9, m6 ; t0 t1
punpckhqdq m0, m8, m3 ; t15a t14
punpcklqdq m8, m3 ; t8a t9
psubw m3, m9, m2 ; t7 t6
paddw m9, m2 ; t0 t1
psubw m2, m1, m7 ; t4 t5
paddw m1, m7 ; t3 t2
psubw m7, m9, m0 ; out15 out14
paddw m0, m9 ; out0 out1
psubw m6, m1, m5 ; out12 out13
paddw m1, m5 ; out3 out2
psubw m5, m2, m4 ; out11 out10
paddw m2, m4 ; out4 out5
psubw m4, m3, m8 ; out8 out9
paddw m3, m8 ; out7 out6
%endmacro
INIT_ZMM avx512icl
INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst, 39-23
cglobal vp9_idct_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2
mova m15, [o(itx_perm)]
vpbroadcastd m10, [o(pd_8192)]
vpbroadcastq m13, [o(int_mshift)]
vpcmpub k7, m13, m10, 6
sub eobd, 39
jl .pass1_fast
vpermq m0, m15, [cq+64*0]
vpermq m1, m15, [cq+64*1]
vpermq m2, m15, [cq+64*2]
vpermq m3, m15, [cq+64*3]
vpermq m4, m15, [cq+64*4]
vpermq m5, m15, [cq+64*5]
vpermq m6, m15, [cq+64*6]
vpermq m7, m15, [cq+64*7]
call .main
vbroadcasti32x4 m12, [o(int_shuf1)]
vbroadcasti32x4 m11, [o(int_shuf2)]
pshufb m0, m12
pshufb m8, m1, m11
pshufb m2, m12
pshufb m9, m3, m11
pshufb m4, m12
pshufb m14, m5, m11
pshufb m6, m12
pshufb m11, m7, m11
punpckhdq m1, m0, m8
punpckldq m0, m8
punpckhdq m3, m2, m9
punpckldq m2, m9
punpckhdq m5, m4, m14
punpckldq m4, m14
punpckhdq m7, m6, m11
punpckldq m6, m11
.pass1_end:
vshufi32x4 m8, m4, m6, q3232
vinserti32x8 m4, ym6, 1
vshufi32x4 m6, m0, m2, q3232
vinserti32x8 m0, ym2, 1
vshufi32x4 m9, m5, m7, q3232
vinserti32x8 m5, ym7, 1
vshufi32x4 m7, m1, m3, q3232
vinserti32x8 m1, ym3, 1
vshufi32x4 m2, m0, m4, q3131 ; 4 5
vshufi32x4 m0, m4, q2020 ; 0 1
vshufi32x4 m4, m6, m8, q2020 ; 8 9
vshufi32x4 m6, m8, q3131 ; 12 13
vshufi32x4 m3, m1, m5, q3131 ; 6 7
vshufi32x4 m1, m5, q2020 ; 2 3
vshufi32x4 m5, m7, m9, q2020 ; 10 11
vshufi32x4 m7, m9, q3131 ; 14 15
jmp tx2q
.pass1_fast:
mova ym3, [o(dup16_perm)]
vbroadcasti32x4 ym9, [cq+32*0]
vbroadcasti32x4 ym6, [cq+32*4]
vpermb ym8, ym3, [cq+32*1]
vpermb ym0, ym3, [cq+32*7]
vpermb ym5, ym3, [cq+32*5]
vpermb ym1, ym3, [cq+32*3]
vpermb ym7, ym3, [cq+32*2]
vpermb ym3, ym3, [cq+32*6]
shufpd ym9, ym9, 0x0c
shufpd ym6, ym6, 0x0c
WRAP_YMM IDCT16_MAIN
vbroadcasti32x4 m8, [o(int_shuf1)]
vbroadcasti32x4 m9, [o(int_shuf2)]
vinserti32x8 m0, ym2, 1 ; 0 1 | 4 5
vinserti32x8 m4, ym6, 1 ; 8 9 | 12 13
vinserti32x8 m1, ym3, 1 ; 3 2 | 7 6
vinserti32x8 m5, ym7, 1 ; 11 10 | 15 14
vshufi32x4 m2, m0, m4, q3131
vshufi32x4 m0, m4, q2020
vshufi32x4 m4, m1, m5, q2020
vshufi32x4 m1, m5, q3131
pshufb m2, m8
pshufb m0, m8
pshufb m4, m9
pshufb m1, m9
punpckhdq m3, m2, m1 ; 6-7
punpckldq m2, m1 ; 4-5
punpckhdq m1, m0, m4 ; 2-3
punpckldq m0, m4 ; 0-1
jmp tx2q
.pass2:
test eobd, eobd
jl .pass2_fast
call .main
jmp .pass2_end
.pass2_fast:
punpcklqdq m9, m0, m0
punpckhwd m8, m0, m0
punpcklwd m7, m1, m1
punpckhwd m1, m1
punpcklqdq m6, m2, m2
punpckhwd m5, m2, m2
punpckhwd m0, m3, m3
punpcklwd m3, m3
call .main_fast
.pass2_end:
psrldq m8, m15, 1
psrlq m12, m15, 12
psrldq m9, m15, 2
psrlq m13, m15, 20
mova m10, m8
vpermi2q m8, m0, m2 ; 0 1 4 5
vpermt2q m0, m12, m2
mova m11, m9
vpermi2q m9, m1, m3 ; 2 3 6 7
vpermt2q m1, m13, m3
vpbroadcastd m2, [o(pw_512)]
vpermi2q m10, m4, m6 ; 8 9 12 13
vpermt2q m4, m12, m6
vpermi2q m11, m5, m7 ; 10 11 14 15
vpermt2q m5, m13, m7
REPX {pmulhrsw x, m2}, m0, m1, m4, m5, m8, m9, m10, m11
.pass2_end2:
lea r3, [strideq*3]
lea r4, [dstq+strideq*4]
lea r5, [dstq+strideq*8]
lea r6, [r4 +strideq*8]
mova xm3, [dstq+strideq*0]
mova xm6, [dstq+strideq*2]
vinserti32x4 ym3, [dstq+strideq*1], 1
vinserti32x4 ym6, [dstq+r3 ], 1
vinserti32x4 m3, [r4+strideq*0], 2
vinserti32x4 m6, [r4+strideq*2], 2
vinserti32x4 m3, [r4+strideq*1], 3
vinserti32x4 m6, [r4+r3 ], 3
mova xm12, [r5+strideq*0]
mova xm13, [r5+strideq*2]
vinserti32x4 ym12, [r5+strideq*1], 1
vinserti32x4 ym13, [r5+r3 ], 1
vinserti32x4 m12, [r6+strideq*0], 2
vinserti32x4 m13, [r6+strideq*2], 2
vinserti32x4 m12, [r6+strideq*1], 3
vinserti32x4 m13, [r6+r3 ], 3
pxor m7, m7
REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
punpcklbw m2, m3, m7
punpckhbw m3, m7
paddw m0, m2
paddw m8, m3
packuswb m0, m8
punpcklbw m2, m6, m7
punpckhbw m6, m7
paddw m1, m2
paddw m9, m6
packuswb m1, m9
punpcklbw m2, m12, m7
punpckhbw m12, m7
paddw m2, m4
paddw m10, m12
packuswb m2, m10
punpcklbw m3, m13, m7
punpckhbw m13, m7
paddw m3, m5
paddw m11, m13
packuswb m3, m11
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
mova [dstq+strideq*2], xm1
vextracti32x4 [dstq+r3 ], ym1, 1
vextracti32x4 [r4+strideq*0], m0, 2
vextracti32x4 [r4+strideq*1], m0, 3
vextracti32x4 [r4+strideq*2], m1, 2
vextracti32x4 [r4+r3 ], m1, 3
mova [r5+strideq*0], xm2
vextracti32x4 [r5+strideq*1], ym2, 1
mova [r5+strideq*2], xm3
vextracti32x4 [r5+r3 ], ym3, 1
vextracti32x4 [r6+strideq*0], m2, 2
vextracti32x4 [r6+strideq*1], m2, 3
vextracti32x4 [r6+strideq*2], m3, 2
vextracti32x4 [r6+r3 ], m3, 3
RET
ALIGN function_align
IDCT16_MAIN
ret
%macro IADST16_MAIN 0
%if mmsize == 64
.main_fast:
%endif
punpcklwd m4, m3, m0 ; in7 in0
punpcklwd m11, m1, m2 ; in3 in4
punpckhwd m9, m2, m1 ; in5 in2
punpckhwd m7, m0, m3 ; in1 in6
ITX_MUL2X_PACK 4, 0, 6, 10, 11003_804, 12140_m16364, 116 ; t1a t0a
ITX_MUL2X_PACK 4, 5, 6, 10, m11003_804, m12140_m16364, 52 ; t9a t8a
ITX_MUL2X_PACK 11, 2, 6, 10, 5520_7005, 15426_m14811, 116 ; t5a t4a
ITX_MUL2X_PACK 11, 5, 6, 10, m5520_7005, m15426_m14811, 52 ; t13a t12a
ITX_MUL2X_PACK 9, 1, 6, 10, 8423_3981, 14053_m15893, 116 ; t3a t2a
ITX_MUL2X_PACK 9, 5, 6, 10, m8423_3981, m14053_m15893, 52 ; t11a t10a
ITX_MUL2X_PACK 7, 3, 6, 10, 2404_9760, 16207_m13160, 116 ; t7a t6a
ITX_MUL2X_PACK 7, 5, 6, 10, m2404_9760, m16207_m13160, 52 ; t15a t14a
%if mmsize == 64 ; for the ymm variant we only ever use the fast path
jmp %%main2
ALIGN function_align
.main:
punpckhwd m8, m7, m0 ; in14 in1
punpcklwd m0, m7 ; in0 in15
punpcklwd m7, m6, m1 ; in12 in3
punpckhwd m1, m6 ; in2 in13
punpckhwd m6, m5, m2 ; in10 in5
punpcklwd m2, m5 ; in4 in11
punpcklwd m5, m4, m3 ; in8 in7
punpckhwd m3, m4 ; in6 in9
ADST_MULSUB_4W 0, 5, 4, 9, 11, 10, 804, 16364, 12140, 11003 ; t1a t0a, t9a t8a
ADST_MULSUB_4W 2, 7, 11, 5, 9, 10, 7005, 14811, 15426, 5520 ; t5a t4a, t13a t12a
ADST_MULSUB_4W 1, 6, 9, 5, 7, 10, 3981, 15893, 14053, 8423 ; t3a t2a, t11a t10a
ADST_MULSUB_4W 3, 8, 7, 5, 6, 10, 9760, 13160, 16207, 2404 ; t7a t6a, t15a t14a
%%main2:
%endif
psubw m5, m1, m3 ; t7 t6
paddw m6, m1, m3 ; t3 t2
psubw m1, m0, m2 ; t5 t4
paddw m2, m0 ; t1 t0
ADST_MULSUB_4W 4, 11, 8, 3, 0, 10, 3196, 16069, 16069, 3196, 1 ; t8a t9a, t12a t13a
ADST_MULSUB_4W 9, 7, 0, 3, 11, 10, 13623, 9102, 9102, 13623, 1 ; t10a t11a, t14a t15a
ADST_MULSUB_4W 1, 5, 11, 3, 7, 10, 6270, 15137, 15137, 6270, 2 ; out12 -out3, t7 t6
psubw m3, m2, m6 ; t3a t2a
paddw m2, m6 ; -out15 out0
ADST_MULSUB_4W 8, 0, 5, 6, 7, 10, 15137, 6270, 6270, 15137, 6 ; -out13 out2, t15a t14
vbroadcasti32x4 m12, [o(deint_shuf)]
paddw m0, m4, m9 ; -out1 out14
psubw m4, m9 ; t10 t11
pshufb m2, m12
pshufb m1, m12
pshufb m8, m12
pshufb m0, m12
punpcklqdq m6, m1, m8 ; out12 -out13
shufps m7, m0, m2, q1032 ; out14 -out15
%endmacro
%macro IADST16_PASS1_END 0
shufps m0, m2, m0, q1032 ; out0 -out1
punpckhqdq m1, m8, m1 ; out2 -out3
mova m2, m10
vpdpwssd m2, m5, [o(pw_m11585_m11585)] {bcstd} ; out5
mova m8, m10
vpdpwssd m8, m11, [o(pw_11585_11585)] {bcstd} ; out4
mova m9, m10
vpdpwssd m9, m5, [o(pw_m11585_11585)] {bcstd} ; out10
mova m5, m10
vpdpwssd m5, m11, [o(pw_11585_m11585)] {bcstd} ; out11
mova m11, m10
vpdpwssd m11, m3, [o(pw_m11585_m11585)] {bcstd} ; out7
mova m14, m10
vpdpwssd m14, m4, [o(pw_11585_11585)] {bcstd} ; out6
mova m12, m10
vpdpwssd m12, m3, [o(pw_m11585_11585)] {bcstd} ; out8
mova m3, m10
vpdpwssd m3, m4, [o(pw_m11585_11585)] {bcstd} ; out9
%endmacro
INV_TXFM_16X16_FN adst, dct, 39-18
INV_TXFM_16X16_FN adst, adst
cglobal vp9_iadst_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2
mova m15, [o(itx_perm)]
psrlq m7, m15, 4
vpermq m0, m15, [cq+64*0] ; 0 1
vpermq m1, m7, [cq+64*1] ; 3 2
vpermq m2, m15, [cq+64*2] ; 4 5
vpermq m3, m7, [cq+64*3] ; 7 6
vpbroadcastd m10, [o(pd_8192)]
vpbroadcastq m13, [o(int_mshift)]
vpcmpub k7, m13, m10, 6
sub eobd, 39
jl .pass1_fast
vpermq m4, m15, [cq+64*4] ; 8 9
vpermq m5, m7, [cq+64*5] ; 11 10
vpermq m6, m15, [cq+64*6] ; 12 13
vpermq m7, m7, [cq+64*7] ; 15 14
call .main
IADST16_PASS1_END
REPX {psrad x, 14}, m2, m8, m9, m5, m11, m14, m12, m3
packssdw m2, m8, m2 ; out4 out5
packssdw m5, m9, m5 ; out10 out11
packssdw m4, m12, m3 ; out8 out9
packssdw m3, m14, m11 ; out6 out7
pxor m9, m9
punpckhwd m8, m0, m1
punpcklwd m0, m1
psubw m8, m9, m8
punpckhwd m1, m0, m8
punpcklwd m0, m8
punpckhwd m8, m2, m3
punpcklwd m2, m3
punpckhwd m3, m2, m8
punpcklwd m2, m8
punpckhwd m8, m4, m5
punpcklwd m4, m5
punpckhwd m5, m4, m8
punpcklwd m4, m8
punpckhwd m8, m6, m7
punpcklwd m6, m7
psubw m8, m9, m8
punpckhwd m7, m6, m8
punpcklwd m6, m8
jmp m(vp9_idct_16x16_internal).pass1_end
.pass1_fast:
WRAP_YMM IADST16_MAIN
WRAP_YMM IADST16_PASS1_END
vinserti32x8 m0, ym6, 1
vinserti32x8 m1, ym7, 1
vinserti32x8 m8, ym12, 1
vinserti32x8 m2, ym3, 1
vinserti32x8 m14, ym9, 1
vinserti32x8 m11, ym5, 1
pslld m14, 2
pslld m11, 2
punpckhwd m4, m0, m1
punpcklwd m0, m1
vpmultishiftqb m14{k7}, m13, m8
vpmultishiftqb m11{k7}, m13, m2
psrlq m1, m15, 24
pxor m2, m2
psubw m2, m4
punpckhwd m3, m0, m2
punpcklwd m0, m2
psrlq m2, m15, 28
punpckhwd m4, m14, m11
punpcklwd m14, m11
mova m5, m2
vpermi2q m2, m0, m14
vpermt2q m0, m1, m14
vpermi2q m1, m3, m4
vpermt2q m3, m5, m4
jmp tx2q
.pass2:
pshufd m1, m1, q1032
pshufd m3, m3, q1032
test eobd, eobd
jl .pass2_fast
pshufd m5, m5, q1032
pshufd m7, m7, q1032
call .main
jmp .pass2_end
.pass2_fast:
call .main_fast
.pass2_end:
vbroadcasti32x4 m9, [o(pw_11585_m11585x2x4)]
vbroadcasti32x4 m10, [o(pw_m11585_11585x2x4)]
punpckhqdq m1, m8 ; -out3 out2
shufps m0, m2, q3210 ; -out1 out0
pshufb m2, m11, m12
pshufb m5, m12
pshufb m3, m12
pshufb m4, m12
vbroadcasti32x4 m11, [o(pw_512)]
vpbroadcastd m12, [o(pw_512)]
punpcklqdq m8, m5, m2 ; t15a t7
punpckhqdq m5, m2 ; t14a t6
shufps m2, m3, m4, q1032 ; t2a t10
shufps m3, m4, q3210 ; t3a t11
psubsw m4, m2, m3
paddsw m3, m2
paddsw m2, m5, m8
psubsw m5, m8
pmulhrsw m4, m9 ; out8 out9
pmulhrsw m3, m10 ; out7 out6
pmulhrsw m2, m10 ; out5 out4
pmulhrsw m5, m9 ; out10 out11
pmulhrsw m6, m11
pmulhrsw m7, m11
pshufd m11, m11, q1032
pmulhrsw m0, m11
pmulhrsw m1, m11
REPX {pmulhrsw x, m12}, m2, m3, m4, m5
psrldq m8, m15, 2
psrlq m12, m15, 20
psrldq m10, m15, 1
psrlq m13, m15, 12
mova m9, m8
vpermi2q m8, m0, m2 ; 0 1 4 5
vpermt2q m0, m12, m2
vpermi2q m9, m1, m3 ; 2 3 6 7
vpermt2q m1, m12, m3
mova m11, m10
vpermi2q m10, m4, m6 ; 8 9 12 13
vpermt2q m4, m13, m6
vpermi2q m11, m5, m7 ; 10 11 14 15
vpermt2q m5, m13, m7
jmp m(vp9_idct_16x16_internal).pass2_end2
ALIGN function_align
IADST16_MAIN
ret
%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
pmovzxbw m10, [dstq+%3]
pmovzxbw m11, [r3 +%4]
%if %2 < 8
paddw m8, m%2, m%1
psubw m9, m%2, m%1
%else
mova m9, [rsp+64*(%2-8)]
paddw m8, m9, m%1
psubw m9, m%1
%endif
pmulhrsw m8, m12
pmulhrsw m9, m12
paddw m8, m10
paddw m9, m11
packuswb m8, m9
vpermq m8, m13, m8
mova [dstq+%3], ym8
vextracti32x8 [r3 +%4], m8, 1
%if %2 == 3 || %2 == 7 || %2 == 11
add dstq, r5
sub r3, r5
%endif
%endmacro
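; Each invocation of the macro above reconstructs two output rows: row %2 from
; the even-half output (m0-m7, or the stack spill slots for rows 8-15) plus the
; odd-half register m%1, and the mirrored row (31 - %2) from their difference.
; Both are scaled by (x + 32) >> 6 via pmulhrsw with pw_512 (m12), added to the
; predicted pixels and stored; the vpermq with m13 (itx_perm's odd dwords,
; loaded via movshdup) regroups the packuswb output so that the low 256 bits
; hold one full row and the high 256 bits the mirrored row.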
cglobal vp9_idct_idct_32x32_add, 4, 7, 0, dst, stride, c, eob
%undef cmp
lea r6, [o_base]
cmp eobd, 1
jne .pass1
movd xmm0, [o(pw_11585x2)]
pmulhrsw xmm3, xmm0, [cq]
pxor m2, m2
pmulhrsw xmm3, xmm0
pmulhrsw xmm3, [o(pw_512)]
movd [cq], xm2
add r3d, 15
vpbroadcastw m3, xmm3
.dconly_loop:
mova ym1, [dstq+strideq*0]
vinserti32x8 m1, [dstq+strideq*1], 1
punpcklbw m0, m1, m2
punpckhbw m1, m2
paddw m0, m3
paddw m1, m3
packuswb m0, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
dec r3d
jg .dconly_loop
RET
.pass1:
PROLOGUE 0, 7, 30, 64*16, dst, stride, c, eob
sub eobd, 135
jl .fast
mova m0, [cq+64* 0]
mova m14, [cq+64* 2]
mova m1, [cq+64* 4]
mova m15, [cq+64* 6]
mova m2, [cq+64* 8]
mova m16, [cq+64*10]
mova m3, [cq+64*12]
mova m17, [cq+64*14]
mova m4, [cq+64*16]
mova m18, [cq+64*18]
mova m5, [cq+64*20]
mova m19, [cq+64*22]
mova m6, [cq+64*24]
mova m20, [cq+64*26]
mova m7, [cq+64*28]
mova m21, [cq+64*30]
call .idct16
mova [rsp+64*0], m14
mova [rsp+64*1], m15
mova [rsp+64*2], m16
mova [rsp+64*3], m17
mova [rsp+64*4], m18
mova [rsp+64*5], m19
mova [rsp+64*6], m20
mova [rsp+64*7], m21
mova m22, [cq+64* 1]
mova m23, [cq+64* 3]
mova m24, [cq+64* 5]
mova m25, [cq+64* 7]
mova m26, [cq+64* 9]
mova m27, [cq+64*11]
mova m28, [cq+64*13]
mova m29, [cq+64*15]
mova m14, [cq+64*17]
mova m15, [cq+64*19]
mova m16, [cq+64*21]
mova m17, [cq+64*23]
mova m18, [cq+64*25]
mova m19, [cq+64*27]
mova m20, [cq+64*29]
mova m21, [cq+64*31]
call .main
psubw m13, m0, m29 ; 31
paddw m0, m29 ; 0
psubw m29, m1, m28 ; 30
paddw m1, m28 ; 1
psubw m28, m2, m27 ; 29
paddw m2, m27 ; 2
psubw m27, m3, m26 ; 28
paddw m3, m26 ; 3
psubw m26, m4, m25 ; 27
paddw m4, m25 ; 4
psubw m25, m5, m24 ; 26
paddw m5, m24 ; 5
psubw m24, m6, m23 ; 25
paddw m6, m23 ; 6
psubw m23, m7, m22 ; 24
paddw m7, m22 ; 7
punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
punpckhwd m3, m23, m24
punpcklwd m23, m24
punpckhwd m24, m25, m26
punpcklwd m25, m26
punpckhwd m26, m27, m28
punpcklwd m27, m28
punpckhwd m28, m29, m13
punpcklwd m29, m13
punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
punpckhdq m13, m23, m25
punpckldq m23, m25
punpckhdq m25, m27, m29
punpckldq m27, m29
punpckhdq m9, m3, m24
punpckldq m3, m24
punpckhdq m24, m26, m28
punpckldq m26, m28
punpcklqdq m5, m23, m27 ; d00 d08 d16 d24
punpckhqdq m23, m27 ; d01 d09 d17 d25
punpckhqdq m27, m13, m25 ; d03 d11 d19 d27
punpcklqdq m13, m25 ; d02 d10 d18 d26
punpckhqdq m25, m3, m26 ; d05 d13 d21 d29
punpcklqdq m3, m26 ; d04 d12 d20 d28
punpckhqdq m26, m9, m24 ; d07 d15 d23 d31
punpcklqdq m9, m24 ; d06 d14 d22 d30
mova [rsp+64*12], m23
mova [rsp+64*13], m27
mova [rsp+64*14], m25
mova [rsp+64*15], m26
punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
punpcklqdq m8, m22 ; a04 a12 a20 a28
punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
punpcklqdq m0, m4 ; a00 a08 a16 a24
punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
punpcklqdq m7, m2 ; a02 a10 a18 a26
punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
punpcklqdq m6, m1 ; a06 a14 a22 a30
mova m2, [rsp+64*0]
mova m11, [rsp+64*1]
mova m12, [rsp+64*2]
mova m29, [rsp+64*3]
mova m27, [rsp+64*4]
mova m26, [rsp+64*5]
mova m4, [rsp+64*6]
mova m28, [rsp+64*7]
psubw m1, m2, m21 ; 23
paddw m2, m21 ; 8
psubw m21, m11, m20 ; 22
paddw m11, m20 ; 9
psubw m20, m12, m19 ; 21
paddw m12, m19 ; 10
psubw m19, m29, m18 ; 20
paddw m29, m18 ; 11
psubw m18, m27, m17 ; 19
paddw m27, m17 ; 12
psubw m17, m26, m16 ; 18
paddw m26, m16 ; 13
paddw m16, m4, m15 ; 14
psubw m4, m15 ; 17
mova m15, m6
psubw m6, m28, m14 ; 16
paddw m28, m14 ; 15
mova m14, m7
punpcklwd m7, m6, m4
punpckhwd m6, m4
punpckhwd m4, m17, m18
punpcklwd m17, m18
punpckhwd m18, m19, m20
punpcklwd m19, m20
punpckhwd m20, m21, m1
punpcklwd m21, m1
punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3
punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1
punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1
punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
punpckhdq m26, m19, m21
punpckldq m19, m21
punpckhdq m21, m6, m4
punpckldq m6, m4
punpckhdq m4, m18, m20
punpckldq m18, m20
punpckhdq m20, m7, m17
punpckldq m7, m17
punpcklqdq m17, m28, m12 ; b02 b10 b18 b26
punpckhqdq m28, m12 ; b03 b11 b19 b27
punpckhqdq m12, m2, m27 ; b01 b09 b17 b25
punpcklqdq m2, m27 ; b00 b08 b16 b24
punpckhqdq m27, m1, m29 ; b05 b13 b21 b29
punpcklqdq m1, m29 ; b04 b12 b20 b28
punpckhqdq m29, m16, m11 ; b07 b15 b23 b31
punpcklqdq m16, m11 ; b06 b14 b22 b30
mova [rsp+64* 8], m12
mova [rsp+64* 9], m28
mova [rsp+64*10], m27
mova [rsp+64*11], m29
punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
punpcklqdq m20, m26 ; c02 c10 c18 c26
punpckhqdq m26, m7, m19 ; c01 c09 c17 c25
punpcklqdq m7, m19 ; c00 c08 c16 c24
punpckhqdq m28, m6, m18 ; c05 c13 c21 c29
punpcklqdq m6, m18 ; c04 c12 c20 c28
punpckhqdq m29, m21, m4 ; c07 c15 c23 c31
punpcklqdq m21, m4 ; c06 c14 c22 c30
mov r3d, 64*28
pxor m4, m4
.zero_loop:
mova [cq+r3+64*0], m4
mova [cq+r3+64*1], m4
mova [cq+r3+64*2], m4
mova [cq+r3+64*3], m4
sub r3d, 64*4
jge .zero_loop
vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24
vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08
vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24
vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08
vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28
vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12
vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28
vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12
vshufi32x4 m3, m1, m6, q3131 ; 12
vshufi32x4 m1, m6, q2020 ; 4
vshufi32x4 m6, m4, m2, q3131 ; 24
vshufi32x4 m4, m2, q2020 ; 16
vshufi32x4 m2, m0, m7, q3131 ; 8
vshufi32x4 m0, m7, q2020 ; 0
vshufi32x4 m7, m5, m8, q3131 ; 28
vshufi32x4 m5, m8, q2020 ; 20
vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26
vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10
vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26
vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10
vshufi32x4 m13, m21, m9, q3232 ; c22 c30 d22 d30
vinserti32x8 m21, ym9, 1 ; c06 c14 d06 d14
vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30
vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14
vshufi32x4 m16, m14, m20, q3131 ; 10
vshufi32x4 m14, m20, q2020 ; 2
vshufi32x4 m20, m18, m17, q3131 ; 26
vshufi32x4 m18, m17, q2020 ; 18
vshufi32x4 m17, m15, m21, q3131 ; 14
vshufi32x4 m15, m21, q2020 ; 6
vshufi32x4 m21, m19, m13, q3131 ; 30
vshufi32x4 m19, m13, q2020 ; 22
call .idct16
mova [rsp+64*0], m14
mova [rsp+64*1], m15
mova [rsp+64*2], m16
mova [rsp+64*3], m17
mova [rsp+64*4], m18
mova [rsp+64*5], m19
mova [rsp+64*6], m20
mova [rsp+64*7], m21
mova m15, [rsp+64* 8]
mova m16, [rsp+64* 9]
mova m17, [rsp+64*10]
mova m19, [rsp+64*11]
mova m20, [rsp+64*12]
mova m21, [rsp+64*13]
mova m13, [rsp+64*14]
mova m18, [rsp+64*15]
vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25
vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09
vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27
vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11
vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29
vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13
vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31
vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15
vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09
vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25
vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11
vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27
vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13
vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29
vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15
vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31
vshufi32x4 m18, m14, m26, q3131 ; 25
vshufi32x4 m14, m26, q2020 ; 17
vshufi32x4 m19, m15, m27, q3131 ; 27
vshufi32x4 m15, m27, q2020 ; 19
vshufi32x4 m20, m16, m28, q3131 ; 29
vshufi32x4 m16, m28, q2020 ; 21
vshufi32x4 m21, m17, m29, q3131 ; 31
vshufi32x4 m17, m29, q2020 ; 23
vshufi32x4 m26, m22, m8, q3131 ; 9
vshufi32x4 m22, m8, q2020 ; 1
vshufi32x4 m27, m23, m9, q3131 ; 11
vshufi32x4 m23, m9, q2020 ; 3
vshufi32x4 m28, m24, m11, q3131 ; 13
vshufi32x4 m24, m11, q2020 ; 5
vshufi32x4 m29, m25, m12, q3131 ; 15
vshufi32x4 m25, m12, q2020 ; 7
call .main
jmp .end
.fast:
mova m14, [o(dup16_perm)]
pmovzxbw m9, [cq+64*0]
pmovzxbw m6, [cq+64*8]
vpermb m8, m14, [cq+64* 2]
vpermb m0, m14, [cq+64*14]
vpermb m5, m14, [cq+64*10]
vpermb m1, m14, [cq+64* 6]
vpermb m7, m14, [cq+64* 4]
vpermb m3, m14, [cq+64*12]
vpbroadcastd m10, [o(pd_8192)]
vpbroadcastq m13, [o(int_mshift)]
packuswb m9, m9
packuswb m6, m6
vpcmpub k7, m13, m10, 6
IDCT16_MAIN 1
vpermb m21, m14, [cq+64* 1]
vpermb m17, m14, [cq+64*15]
vpermb m20, m14, [cq+64* 9]
vpermb m15, m14, [cq+64* 7]
vpermb m18, m14, [cq+64* 5]
vpermb m16, m14, [cq+64*11]
vpermb m19, m14, [cq+64*13]
vpermb m14, m14, [cq+64* 3]
call .main_packed_fast
punpcklwd m8, m0, m2
punpckhwd m0, m2
punpcklwd m2, m1, m3
punpckhwd m1, m3
punpcklwd m3, m4, m6
punpckhwd m4, m6
punpcklwd m6, m5, m7
punpckhwd m5, m7
punpcklwd m7, m14, m16
punpckhwd m14, m16
punpcklwd m16, m15, m17
punpckhwd m15, m17
punpcklwd m17, m19, m21
punpckhwd m19, m21
punpckhwd m21, m18, m20
punpcklwd m18, m20
punpcklwd m20, m8, m1
punpckhwd m8, m1
punpcklwd m1, m0, m2
punpckhwd m0, m2
punpcklwd m2, m3, m5
punpckhwd m3, m5
punpcklwd m5, m4, m6
punpckhwd m4, m6
punpcklwd m6, m7, m15
punpckhwd m7, m15
punpcklwd m15, m14, m16
punpckhwd m14, m16
punpckhwd m16, m18, m19
punpcklwd m18, m19
punpcklwd m19, m21, m17
punpckhwd m21, m17
punpcklwd m17, m8, m0 ; a2 a6 aa ae
punpckhwd m8, m0 ; a3 a7 ab af
punpcklwd m0, m20, m1 ; a0 a4 a8 ac
punpckhwd m20, m1 ; a1 a5 a9 ad
punpcklwd m1, m2, m5 ; b0 b4 b8 bc
punpckhwd m2, m5 ; b1 b5 b9 bd
punpcklwd m5, m3, m4 ; b2 b6 ba be
punpckhwd m3, m4 ; b3 b7 bb bf
punpcklwd m4, m6, m15 ; c0 c4 c8 cc
punpckhwd m6, m15 ; c1 c5 c9 cd
punpcklwd m15, m7, m14 ; c2 c6 ca ce
punpckhwd m7, m14 ; c3 c7 cb cf
punpcklwd m14, m18, m19 ; d0 d4 d8 dc
punpckhwd m18, m19 ; d1 d5 d9 dd
punpcklwd m9, m16, m21 ; d2 d6 da de
punpckhwd m16, m21 ; d3 d7 db df
mov r3d, 64*12
pxor ym21, ym21
.fast_zero_loop:
mova [cq+r3+64*0], ym21
mova [cq+r3+64*1], ym21
mova [cq+r3+64*2], ym21
mova [cq+r3+64*3], ym21
sub r3d, 64*4
jge .fast_zero_loop
vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4
vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc
vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd
vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd
vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
vshufi32x4 m15, m9, q3232 ; ca ce da de
vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
vshufi32x4 m7, m16, q3232 ; cb cf db df
vshufi32x4 m22, m14, m2, q2020 ; 1
vshufi32x4 m24, m14, m2, q3131 ; 5
vshufi32x4 m23, m17, m9, q2020 ; 3
vshufi32x4 m25, m17, m9, q3131 ; 7
vshufi32x4 m16, m5, m15, q2020 ; 10
vshufi32x4 m17, m5, m15, q3131 ; 14
vshufi32x4 m14, m1, m18, q2020 ; 2
vshufi32x4 m15, m1, m18, q3131 ; 6
vshufi32x4 m1, m0, m3, q3131 ; 4
vshufi32x4 m0, m3, q2020 ; 0
vshufi32x4 m3, m21, m4, q3131 ; 12
vshufi32x4 m2, m21, m4, q2020 ; 8
vshufi32x4 m26, m20, m6, q2020 ; 9
vshufi32x4 m28, m20, m6, q3131 ; 13
vshufi32x4 m27, m19, m7, q2020 ; 11
vshufi32x4 m29, m19, m7, q3131 ; 15
call .idct16_fast
mova [rsp+64*0], m14
mova [rsp+64*1], m15
mova [rsp+64*2], m16
mova [rsp+64*3], m17
mova [rsp+64*4], m18
mova [rsp+64*5], m19
mova [rsp+64*6], m20
mova [rsp+64*7], m21
call .main_fast
.end:
lea r4, [strideq*3]
vpbroadcastd m12, [o(pw_512)]
movshdup m13, [o(itx_perm)]
lea r3, [dstq+r4*8]
lea r5, [strideq+r4] ; stride*4
add r3, r5 ; dst+stride*28
IDCT_32x32_END 29, 0, strideq*0, r4
IDCT_32x32_END 28, 1, strideq*1, strideq*2
IDCT_32x32_END 27, 2, strideq*2, strideq*1
IDCT_32x32_END 26, 3, r4 , strideq*0
IDCT_32x32_END 25, 4, strideq*0, r4
IDCT_32x32_END 24, 5, strideq*1, strideq*2
IDCT_32x32_END 23, 6, strideq*2, strideq*1
IDCT_32x32_END 22, 7, r4 , strideq*0
IDCT_32x32_END 21, 8, strideq*0, r4
IDCT_32x32_END 20, 9, strideq*1, strideq*2
IDCT_32x32_END 19, 10, strideq*2, strideq*1
IDCT_32x32_END 18, 11, r4 , strideq*0
IDCT_32x32_END 17, 12, strideq*0, r4
IDCT_32x32_END 16, 13, strideq*1, strideq*2
IDCT_32x32_END 15, 14, strideq*2, strideq*1
IDCT_32x32_END 14, 15, r4 , strideq*0
RET
ALIGN function_align
.idct16_fast:
vpbroadcastd m21, [o(pw_16305x2)]
vpbroadcastd m8, [o(pw_1606x2)]
vpbroadcastd m18, [o(pw_m10394x2)]
vpbroadcastd m9, [o(pw_12665x2)]
pmulhrsw m21, m14 ; t15a
vpbroadcastd m19, [o(pw_14449x2)]
pmulhrsw m14, m8 ; t8a
vpbroadcastd m8, [o(pw_7723x2)]
pmulhrsw m18, m17 ; t9a
vpbroadcastd m20, [o(pw_m4756x2)]
pmulhrsw m17, m9 ; t14a
vpbroadcastd m9, [o(pw_15679x2)]
pmulhrsw m19, m16 ; t13a
vpbroadcastd m5, [o(pw_m9102x2)]
pmulhrsw m16, m8 ; t10a
vpbroadcastd m8, [o(pw_13623x2)]
pmulhrsw m20, m15 ; t11a
vpbroadcastd m7, [o(pw_16069x2)]
pmulhrsw m15, m9 ; t12a
vpbroadcastd m9, [o(pw_3196x2)]
pmulhrsw m5, m3 ; t5a
vpbroadcastd m6, [o(pw_15137x2)]
pmulhrsw m3, m8 ; t6a
vpbroadcastd m8, [o(pw_6270x2)]
pmulhrsw m7, m1 ; t7a
vpbroadcastd m4, [o(pw_11585x2)]
pmulhrsw m1, m9 ; t4
vpbroadcastd m10, [o(pd_8192)]
pmulhrsw m6, m2 ; t3
pmulhrsw m2, m8 ; t2
pmulhrsw m4, m0 ; t0
mova m0, m4 ; t1
jmp .idct16b
ALIGN function_align
.idct16:
vpbroadcastd m10, [o(pd_8192)]
ITX_MULSUB_2W 14, 21, 8, 9, 10, 1606, 16305 ; t8a, t15a
ITX_MULSUB_2W 18, 17, 8, 9, 10, 12665, 10394 ; t9a, t14a
ITX_MULSUB_2W 16, 19, 8, 9, 10, 7723, 14449 ; t10a, t13a
ITX_MULSUB_2W 20, 15, 8, 9, 10, 15679, 4756 ; t11a, t12
ITX_MULSUB_2W 5, 3, 8, 9, 10, 13623, 9102 ; t5a, t6a
ITX_MULSUB_2W 1, 7, 8, 9, 10, 3196, 16069 ; t4a, t7a
ITX_MULSUB_2W 2, 6, 8, 9, 10, 6270, 15137 ; t2, t3
ITX_MULSUB_2W 0, 4, 8, 9, 10, 11585, 11585 ; t1, t0
.idct16b:
paddw m8, m20, m16 ; t11
psubw m20, m16 ; t10
paddw m16, m15, m19 ; t12
psubw m15, m19 ; t13
psubw m19, m14, m18 ; t9
paddw m14, m18 ; t8
psubw m18, m21, m17 ; t14
paddw m21, m17 ; t15
vpbroadcastd m11, [o(pw_6270_15137)]
vpbroadcastd m12, [o(pw_m15137_6270)]
ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a
vpbroadcastd m11, [o(pw_m6270_m15137)]
ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
vpbroadcastd m11, [o(pw_11585_11585)]
vpbroadcastd m12, [o(pw_m11585_11585)]
paddw m9, m7, m3 ; t7
psubw m3, m7, m3 ; t6a
paddw m7, m1, m5 ; t4
psubw m1, m5 ; t5a
psubw m17, m14, m8 ; t11a
paddw m8, m14 ; t8a
paddw m14, m18, m15 ; t9
psubw m18, m15 ; t10
psubw m15, m19, m20 ; t13
paddw m19, m20 ; t14
paddw m20, m21, m16 ; t15a
psubw m16, m21, m16 ; t12a
ITX_MULSUB_2W 3, 1, 5, 21, 10, 11, 12 ; t5, t6
ITX_MULSUB_2W 15, 18, 5, 21, 10, 11, 12 ; t10a, t13a
ITX_MULSUB_2W 16, 17, 5, 21, 10, 11, 12 ; t11, t12
psubw m5, m0, m2 ; t2
paddw m2, m0 ; t1
paddw m0, m4, m6 ; t0
psubw m4, m6 ; t3
psubw m6, m2, m1 ; t6
paddw m1, m2 ; t1
paddw m2, m5, m3 ; t2
psubw m5, m3 ; t5
paddw m3, m4, m7 ; t3
psubw m4, m7 ; t4
psubw m7, m0, m9 ; t7
paddw m0, m9 ; t0
psubw m21, m0, m20 ; out15
paddw m0, m20 ; out0
psubw m20, m1, m19 ; out14
paddw m1, m19 ; out1
psubw m19, m2, m18 ; out13
paddw m2, m18 ; out2
psubw m18, m3, m17 ; out12
paddw m3, m17 ; out3
psubw m17, m4, m16 ; out11
paddw m4, m16 ; out4
psubw m16, m5, m15 ; out10
paddw m5, m15 ; out5
psubw m15, m6, m14 ; out9
paddw m6, m14 ; out6
psubw m14, m7, m8 ; out8
paddw m7, m8 ; out7
ret
ALIGN function_align
.main_fast:
vpbroadcastd m21, [o(pw_16364x2)]
vpbroadcastd m8, [o(pw_804x2)]
vpbroadcastd m14, [o(pw_m11003x2)]
vpbroadcastd m9, [o(pw_12140x2)]
pmulhrsw m21, m22 ; t31a
vpbroadcastd m17, [o(pw_14811x2)]
pmulhrsw m22, m8 ; t16a
vpbroadcastd m8, [o(pw_7005x2)]
pmulhrsw m14, m29 ; t30a
vpbroadcastd m18, [o(pw_m5520x2)]
pmulhrsw m29, m9 ; t17a
vpbroadcastd m9, [o(pw_15426x2)]
pmulhrsw m17, m26 ; t29a
vpbroadcastd m19, [o(pw_15893x2)]
pmulhrsw m26, m8 ; t18a
vpbroadcastd m8, [o(pw_3981x2)]
pmulhrsw m18, m25 ; t19a
vpbroadcastd m16, [o(pw_m8423x2)]
pmulhrsw m25, m9 ; t28a
vpbroadcastd m9, [o(pw_14053x2)]
pmulhrsw m19, m24 ; t27a
vpbroadcastd m15, [o(pw_13160x2)]
pmulhrsw m24, m8 ; t20a
vpbroadcastd m8, [o(pw_9760x2)]
pmulhrsw m16, m27 ; t21a
vpbroadcastd m20, [o(pw_m2404x2)]
pmulhrsw m27, m9 ; t26a
vpbroadcastd m9, [o(pw_16207x2)]
pmulhrsw m15, m28 ; t25a
pmulhrsw m28, m8 ; t22a
pmulhrsw m20, m23 ; t23a
pmulhrsw m23, m9 ; t24a
jmp .main2
ALIGN function_align
.main:
ITX_MULSUB_2W 22, 21, 8, 9, 10, 804, 16364 ; t16a, t31a
ITX_MULSUB_2W 14, 29, 8, 9, 10, 12140, 11003 ; t17a, t30a
ITX_MULSUB_2W 26, 17, 8, 9, 10, 7005, 14811 ; t18a, t29a
ITX_MULSUB_2W 18, 25, 8, 9, 10, 15426, 5520 ; t19a, t28a
ITX_MULSUB_2W 24, 19, 8, 9, 10, 3981, 15893 ; t20a, t27a
ITX_MULSUB_2W 16, 27, 8, 9, 10, 14053, 8423 ; t21a, t26a
ITX_MULSUB_2W 28, 15, 8, 9, 10, 9760, 13160 ; t22a, t25a
ITX_MULSUB_2W 20, 23, 8, 9, 10, 16207, 2404 ; t23a, t24a
.main2:
psubw m8, m22, m14 ; t17
paddw m22, m14 ; t16
paddw m14, m18, m26 ; t19
psubw m18, m26 ; t18
psubw m26, m24, m16 ; t21
paddw m24, m16 ; t20
psubw m16, m20, m28 ; t22
paddw m28, m20 ; t23
psubw m20, m23, m15 ; t25
paddw m23, m15 ; t24
psubw m15, m21, m29 ; t30
paddw m21, m29 ; t31
psubw m29, m19, m27 ; t26
paddw m19, m27 ; t27
paddw m27, m25, m17 ; t28
psubw m25, m17 ; t29
ITX_MULSUB_2W 15, 8, 9, 17, 10, 3196, 16069 ; t17a, t30a
ITX_MULSUB_2W 25, 18, 9, 17, 10, m16069, 3196 ; t18a, t29a
ITX_MULSUB_2W 29, 26, 9, 17, 10, 13623, 9102 ; t21a, t26a
ITX_MULSUB_2W 20, 16, 9, 17, 10, m9102, 13623 ; t22a, t25a
psubw m17, m21, m27 ; t28a
paddw m21, m27 ; t31a
psubw m27, m15, m25 ; t18
paddw m15, m25 ; t17
psubw m25, m20, m29 ; t21
paddw m20, m29 ; t22
psubw m29, m8, m18 ; t29
paddw m8, m18 ; t30
psubw m18, m22, m14 ; t19a
paddw m22, m14 ; t16a
psubw m14, m28, m24 ; t20a
paddw m24, m28 ; t23a
paddw m28, m16, m26 ; t25
psubw m16, m26 ; t26
psubw m26, m23, m19 ; t27a
paddw m23, m19 ; t24a
vpbroadcastd m12, [o(pw_m15137_6270)]
vpbroadcastd m11, [o(pw_6270_15137)]
ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28
vpbroadcastd m11, [o(pw_m6270_m15137)]
ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27
vpbroadcastd m12, [o(pw_m11585_11585)]
vpbroadcastd m11, [o(pw_11585_11585)]
psubw m19, m27, m25 ; t26
paddw m27, m25 ; t29
psubw m25, m17, m26 ; t20a
paddw m17, m26 ; t19a
paddw m26, m18, m14 ; t28a
psubw m18, m14 ; t27a
paddw m14, m22, m24 ; t16
psubw m22, m24 ; t23
psubw m24, m29, m16 ; t21
paddw m16, m29 ; t18
paddw m29, m21, m23 ; t31
psubw m21, m23 ; t24
psubw m23, m15, m20 ; t22a
paddw m15, m20 ; t17a
psubw m20, m8, m28 ; t25a
paddw m28, m8 ; t30a
ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27
ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25
ret
ALIGN function_align
.main_packed_fast:
vpbroadcastd m8, [o(pw_804_16364x2)]
vpbroadcastd m9, [o(pw_m11003_12140x2)]
vpbroadcastd m11, [o(pw_7005_14811x2)]
vpbroadcastd m12, [o(pw_m5520_15426x2)]
pmulhrsw m21, m8 ; t16a, t31a
vpbroadcastd m8, [o(pw_3981_15893x2)]
pmulhrsw m17, m9 ; t17a, t30a
vpbroadcastd m9, [o(pw_m8423_14053x2)]
pmulhrsw m20, m11 ; t18a, t29a
vpbroadcastd m11, [o(pw_9760_13160x2)]
pmulhrsw m15, m12 ; t19a, t28a
vpbroadcastd m12, [o(pw_m2404_16207x2)]
pmulhrsw m18, m8 ; t20a, t27a
pmulhrsw m16, m9 ; t21a, t26a
pmulhrsw m19, m11 ; t22a, t25a
pmulhrsw m14, m12 ; t23a, t24a
psubw m8, m21, m17 ; t17 t30
paddw m21, m17 ; t16 t31
psubw m17, m15, m20 ; t18 t29
paddw m20, m15 ; t19 t28
psubw m15, m18, m16 ; t21 t26
paddw m18, m16 ; t20 t27
psubw m16, m14, m19 ; t22 t25
paddw m14, m19 ; t23 t24
ITX_MUL2X_PACK 8, 9, 19, 10, 3196, 16069, 5 ; t17a t30a
ITX_MUL2X_PACK 17, 9, 19, 10, m16069, 3196, 5 ; t18a t29a
ITX_MUL2X_PACK 15, 9, 19, 10, 13623, 9102, 5 ; t21a t26a
ITX_MUL2X_PACK 16, 9, 19, 10, m9102, 13623, 5 ; t22a t25a
vpbroadcastd m11, [o(pw_m15137_6270)]
psubw m19, m21, m20 ; t19a t28a
paddw m21, m20 ; t16a t31a
psubw m20, m14, m18 ; t20a t27a
paddw m14, m18 ; t23a t24a
psubw m18, m8, m17 ; t18 t29
paddw m8, m17 ; t17 t30
psubw m17, m16, m15 ; t21 t26
paddw m15, m16 ; t22 t25
ITX_MUL2X_PACK 18, 9, 16, 10, 6270_15137, 11, 20 ; t18a t29a
ITX_MUL2X_PACK 19, 9, 16, 10, 6270_15137, 11, 20 ; t19 t28
ITX_MUL2X_PACK 20, 9, 16, 10, 11, m6270_m15137, 36 ; t20 t27
ITX_MUL2X_PACK 17, 9, 16, 10, 11, m6270_m15137, 36 ; t21a t26a
vbroadcasti32x4 m9, [o(deint_shuf)]
psubw m16, m21, m14 ; t23 t24
paddw m14, m21 ; t16 t31
psubw m21, m8, m15 ; t22a t25a
paddw m15, m8 ; t17a t30a
psubw m8, m18, m17 ; t21 t26
paddw m18, m17 ; t18 t29
paddw m17, m19, m20 ; t19a t28a
psubw m19, m20 ; t20a t27a
vpbroadcastd m11, [o(pw_m11585_11585)]
vpbroadcastd m12, [o(pw_11585_11585)]
REPX {pshufb x, m9}, m14, m15, m18, m17
mova m9, m10
vpdpwssd m9, m16, m11
mova m20, m10
vpdpwssd m20, m21, m11
psrad m9, 14
psrad m20, 14
packssdw m9, m20 ; t23a t22
mova m20, m10
vpdpwssd m20, m16, m12
mova m16, m10
vpdpwssd m16, m21, m12
psrad m20, 14
psrad m16, 14
packssdw m16, m20, m16 ; t24a t25
ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a
ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27
packssdw m11, m20 ; t27 t26a
packssdw m8, m21 ; t20 t21a
punpcklqdq m20, m14, m15 ; t16 t17a
punpckhqdq m14, m15 ; t31 t30a
punpckhqdq m15, m17, m18 ; t28a t29
punpcklqdq m17, m18 ; t19a t18
psubw m21, m0, m14 ; out31 out30
paddw m0, m14 ; out0 out1
psubw m14, m7, m20 ; out16 out17
paddw m7, m20 ; out15 out14
psubw m20, m1, m15 ; out28 out29
paddw m1, m15 ; out3 out2
psubw m15, m6, m17 ; out19 out18
paddw m6, m17 ; out12 out13
psubw m17, m4, m9 ; out23 out22
paddw m4, m9 ; out8 out9
psubw m18, m3, m16 ; out24 out25
paddw m3, m16 ; out7 out6
psubw m16, m5, m8 ; out20 out21
paddw m5, m8 ; out11 out10
psubw m19, m2, m11 ; out27 out26
paddw m2, m11 ; out4 out5
ret
%endif