;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2025 Two Orioles, LLC
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
|
|
|
|
%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL
|
|
|
|
|
|
|
|
SECTION_RODATA 64
|
|
|
|
|
2025-08-01 22:43:23 +02:00
|
|
|
; The following set of constants are ordered to form the
|
2025-05-21 14:49:17 +02:00
|
|
|
; qword shuffle mask { 0, 2, 4, 6, 1, 3, 5, 7 }
|
|
|
|
%define deintq_perm pd_5520
|
|
|
|
pd_5520: dd 5520
|
|
|
|
pd_9760: dd 9760
|
|
|
|
pd_10394: dd 10394
|
|
|
|
pd_15426: dd 15426
|
|
|
|
pd_804: dd 804
|
|
|
|
pd_2404: dd 2404
|
|
|
|
pd_6270: dd 6270
|
|
|
|
pd_9102: dd 9102
|
|
|
|
pd_11585: dd 11585
|
|
|
|
pd_12665: dd 12665
|
|
|
|
pd_7723: dd 7723
|
|
|
|
pd_14811: dd 14811
|
|
|
|
pd_7005: dd 7005
|
|
|
|
pd_14053: dd 14053
|
|
|
|
pd_8423: dd 8423
|
|
|
|
pd_13623: dd 13623
|
|
|
|
|
|
|
|
pixel_clip: times 2 dw 0x7c00
|
|
|
|
pixel_clip6: dd 2031648 ; 32 + (pixel_clip << 6)
|
|
|
|
pd_532480: dd 532480 ; 8192 + (32 << 14)
|
|
|
|
pd_8192: dd 8192
|
|
|
|
|
|
|
|
pd_1606: dd 1606
|
|
|
|
pd_3196: dd 3196
|
|
|
|
pd_3981: dd 3981
|
|
|
|
pd_4756: dd 4756
|
|
|
|
pd_11003: dd 11003
|
|
|
|
pd_12140: dd 12140
|
|
|
|
pd_13160: dd 13160
|
|
|
|
pd_14449: dd 14449
|
|
|
|
pd_15137: dd 15137
|
|
|
|
pd_15679: dd 15679
|
|
|
|
pd_15893: dd 15893
|
|
|
|
pd_16069: dd 16069
|
|
|
|
pd_16207: dd 16207
|
|
|
|
pd_16305: dd 16305
|
|
|
|
pd_16364: dd 16364
|
|
|
|
|
|
|
|
SECTION .text
|
|
|
|
|
|
|
|
%define o_base (deintq_perm+128)
|
|
|
|
%define o(x) (r5 - o_base + (x))
|
|
|
|
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
|
|
|
|
|
|
|
|
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
|
|
|
|
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
|
|
|
|
; skip round/shift if rnd is not a number
|
|
|
|
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], inv_dst2
|
|
|
|
%if %8 < 32
|
|
|
|
pmulld m%4, m%1, m%8
|
|
|
|
pmulld m%3, m%2, m%8
|
|
|
|
%else
|
|
|
|
vpbroadcastd m%3, [o(pd_%8)]
|
|
|
|
pmulld m%4, m%1, m%3
|
|
|
|
pmulld m%3, m%2
|
|
|
|
%endif
|
|
|
|
%if %7 < 32
|
|
|
|
pmulld m%1, m%7
|
|
|
|
pmulld m%2, m%7
|
|
|
|
%else
|
|
|
|
vpbroadcastd m%5, [o(pd_%7)]
|
|
|
|
pmulld m%1, m%5
|
|
|
|
pmulld m%2, m%5
|
|
|
|
%endif
|
|
|
|
%if %9
|
|
|
|
psubd m%4, m%6, m%4
|
|
|
|
psubd m%2, m%4, m%2
|
|
|
|
%else
|
|
|
|
%ifnum %6
|
|
|
|
paddd m%4, m%6
|
|
|
|
%endif
|
|
|
|
paddd m%2, m%4
|
|
|
|
%endif
|
|
|
|
%ifnum %6
|
|
|
|
paddd m%1, m%6
|
|
|
|
%endif
|
|
|
|
psubd m%1, m%3
|
|
|
|
%ifnum %6
|
|
|
|
psrad m%2, 14
|
|
|
|
psrad m%1, 14
|
|
|
|
%endif
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro WRAP_YMM 1+
|
|
|
|
INIT_YMM cpuname
|
|
|
|
%1
|
|
|
|
INIT_ZMM cpuname
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro TRANSPOSE_4D 5 ; in[1-4], tmp
|
|
|
|
punpckhdq m%5, m%3, m%4 ; c2 d2 c3 d3
|
|
|
|
punpckldq m%3, m%4 ; c0 d0 c1 d1
|
|
|
|
punpckhdq m%4, m%1, m%2 ; a2 b2 a3 b3
|
|
|
|
punpckldq m%1, m%2 ; a0 b0 a1 b1
|
|
|
|
punpckhqdq m%2, m%1, m%3 ; a1 b1 c1 d1
|
|
|
|
punpcklqdq m%1, m%3 ; a0 b0 c0 d0
|
|
|
|
punpcklqdq m%3, m%4, m%5 ; a2 b2 c2 d2
|
|
|
|
punpckhqdq m%4, m%5 ; a3 b3 c3 d3
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro TRANSPOSE_4DQ 5 ; in[1-4], tmp
|
|
|
|
vshufi32x4 m%5, m%3, m%4, q3232 ; c2 c3 d2 d3
|
|
|
|
vinserti32x8 m%3, ym%4, 1 ; c0 c1 d0 d1
|
|
|
|
vshufi32x4 m%4, m%1, m%2, q3232 ; a2 a3 b2 b3
|
|
|
|
vinserti32x8 m%1, ym%2, 1 ; a0 a1 b0 b1
|
|
|
|
vshufi32x4 m%2, m%1, m%3, q3131 ; a1 b1 c1 d1
|
|
|
|
vshufi32x4 m%1, m%3, q2020 ; a0 b0 c0 d0
|
|
|
|
vshufi32x4 m%3, m%4, m%5, q2020 ; a2 b2 c2 d2
|
|
|
|
vshufi32x4 m%4, m%5, q3131 ; a3 b3 c3 d3
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
|
|
|
|
cglobal vp9_i%1_i%2_%3_add_10, 4, 5, 0, dst, stride, c, eob, tx2
|
|
|
|
%define %%p1 m(vp9_i%1_%3_internal_10)
|
|
|
|
lea r5, [o_base]
|
|
|
|
; Jump to the 1st txfm function if we're not taking the fast path, which
|
|
|
|
; in turn performs an indirect jump to the 2nd txfm function.
|
|
|
|
lea tx2q, [m(vp9_i%2_%3_internal_10).pass2]
|
|
|
|
%ifidn %1_%2, dct_dct
|
|
|
|
dec eobd
|
|
|
|
jnz %%p1
|
|
|
|
%else
|
|
|
|
%if %4
|
|
|
|
add eobd, %4
|
|
|
|
%endif
|
|
|
|
; jump to the 1st txfm function unless it's located directly after this
|
|
|
|
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
|
|
|
|
ALIGN function_align
|
|
|
|
%%end:
|
|
|
|
%endif
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
|
|
|
|
INV_TXFM_FN %1, %2, 16x16, %3
|
|
|
|
%ifidn %1_%2, dct_dct
|
|
|
|
imul r6d, [cq], 11585
|
|
|
|
vpbroadcastd ym3, [o(pixel_clip)]
|
|
|
|
mov [cq], r3d
|
|
|
|
add r6d, 8192
|
|
|
|
sar r6d, 14
|
|
|
|
imul r6d, 11585
|
|
|
|
or r3d, 8
|
|
|
|
add r6d, 532480
|
|
|
|
sar r6d, 20
|
|
|
|
vpbroadcastw ym2, r6d
|
|
|
|
paddsw ym2, ym3
|
|
|
|
.dconly_loop:
|
|
|
|
paddsw ym0, ym2, [dstq+strideq*0]
|
|
|
|
paddsw ym1, ym2, [dstq+strideq*1]
|
|
|
|
psubusw ym0, ym3
|
|
|
|
psubusw ym1, ym3
|
|
|
|
mova [dstq+strideq*0], ym0
|
|
|
|
mova [dstq+strideq*1], ym1
|
|
|
|
lea dstq, [dstq+strideq*2]
|
|
|
|
dec r3d
|
|
|
|
jg .dconly_loop
|
|
|
|
RET
|
|
|
|
%endif
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro IDCT16_PART1 0
|
|
|
|
%if mmsize == 64
|
|
|
|
.main_part1_fast:
|
|
|
|
%endif
|
|
|
|
pmulld m15, m1, [o(pd_16305)] {bcstd} ; t15a
|
|
|
|
pmulld m1, [o(pd_1606)] {bcstd} ; t8a
|
|
|
|
pmulld m9, m7, [o(pd_10394)] {bcstd} ; t9a
|
|
|
|
pmulld m7, [o(pd_12665)] {bcstd} ; t14a
|
|
|
|
pmulld m11, m5, [o(pd_14449)] {bcstd} ; t13a
|
|
|
|
pmulld m5, [o(pd_7723)] {bcstd} ; t10a
|
|
|
|
pmulld m13, m3, [o(pd_4756)] {bcstd} ; t11a
|
|
|
|
pmulld m3, [o(pd_15679)] {bcstd} ; t12a
|
|
|
|
pmulld m10, m6, [o(pd_9102)] {bcstd} ; t5a
|
|
|
|
pmulld m6, [o(pd_13623)] {bcstd} ; t6a
|
|
|
|
pmulld m14, m2, [o(pd_16069)] {bcstd} ; t7a
|
|
|
|
pmulld m2, [o(pd_3196)] {bcstd} ; t4a
|
|
|
|
pmulld m12, m4, [o(pd_15137)] {bcstd} ; t3
|
|
|
|
pmulld m4, [o(pd_6270)] {bcstd} ; t2
|
|
|
|
pmulld m0, m21
|
|
|
|
REPX {psubd x, m20, x}, m9, m13, m10
|
|
|
|
paddd m0, m20
|
|
|
|
mova m18, m0
|
|
|
|
%if mmsize == 64 ; for the ymm variant we only ever use the fast path
|
|
|
|
jmp %%main_part1b
|
|
|
|
.main_part1:
|
|
|
|
ITX_MULSUB_2D 1, 15, 16, 17, 18, _, 1606, 16305 ; t8a, t15a
|
|
|
|
ITX_MULSUB_2D 9, 7, 16, 17, 18, _, 12665, 10394 ; t9a, t14a
|
|
|
|
ITX_MULSUB_2D 5, 11, 16, 17, 18, _, 7723, 14449 ; t10a, t13a
|
|
|
|
ITX_MULSUB_2D 13, 3, 16, 17, 18, _, 15679, 4756 ; t11a, t12a
|
|
|
|
ITX_MULSUB_2D 10, 6, 16, 17, 18, _, 13623, 9102 ; t5a, t6a
|
|
|
|
ITX_MULSUB_2D 2, 14, 16, 17, 18, _, 3196, 16069 ; t4a, t7a
|
|
|
|
ITX_MULSUB_2D 4, 12, 16, 17, 18, _, 6270, 15137 ; t2, t3
|
|
|
|
pmulld m0, m21
|
|
|
|
pmulld m8, m21
|
|
|
|
REPX {paddd x, m20}, m0, m9, m13, m10
|
|
|
|
psubd m18, m0, m8 ; t1
|
|
|
|
paddd m0, m8 ; t0
|
|
|
|
%%main_part1b:
|
|
|
|
%endif
|
|
|
|
vpbroadcastd m19, [o(pd_15137)]
|
|
|
|
vpbroadcastd m16, [o(pd_6270)]
|
|
|
|
REPX {paddd x, m20}, m15, m7, m1, m11, m3, m5
|
|
|
|
REPX {psrad x, 14 }, m15, m7, m1, m9, m11, m3, m5, m13
|
|
|
|
paddd m17, m15, m7 ; t15
|
|
|
|
psubd m15, m7 ; t14
|
|
|
|
psubd m7, m3, m11 ; t13
|
|
|
|
paddd m3, m11 ; t12
|
|
|
|
psubd m11, m13, m5 ; t10
|
|
|
|
paddd m5, m13 ; t11
|
|
|
|
psubd m13, m1, m9 ; t9
|
|
|
|
paddd m1, m9 ; t8
|
|
|
|
ITX_MULSUB_2D 15, 13, 8, 9, _, 20, 16, 19 ; t9a, t14a
|
|
|
|
ITX_MULSUB_2D 7, 11, 8, 9, _, 20, 16, 19, 2 ; t13a, t10a
|
|
|
|
paddd m16, m1, m5 ; t8a
|
|
|
|
psubd m1, m5 ; t11a
|
|
|
|
paddd m8, m15, m11 ; t9
|
|
|
|
psubd m15, m11 ; t10
|
|
|
|
psubd m11, m17, m3 ; t12a
|
|
|
|
paddd m17, m3 ; t15a
|
|
|
|
psubd m9, m13, m7 ; t13
|
|
|
|
paddd m13, m7 ; t14
|
|
|
|
REPX {pmulld x, m21}, m11, m9, m1, m15
|
|
|
|
REPX {paddd x, m20}, m2, m6, m14
|
|
|
|
REPX {psrad x, 14 }, m10, m2, m6, m14
|
|
|
|
psubd m3, m2, m10 ; t5a
|
|
|
|
paddd m10, m2 ; t4
|
|
|
|
paddd m11, m20
|
|
|
|
psubd m5, m11, m1 ; t11
|
|
|
|
paddd m11, m1 ; t12
|
|
|
|
psubd m1, m14, m6 ; t6a
|
|
|
|
paddd m14, m6 ; t7
|
|
|
|
pmulld m1, m21
|
|
|
|
pmulld m3, m21
|
|
|
|
paddd m4, m20
|
|
|
|
paddd m12, m20
|
|
|
|
REPX {psrad x, 14 }, m4, m12, m0, m18
|
|
|
|
paddd m9, m20
|
|
|
|
paddd m2, m9, m15 ; t13a
|
|
|
|
psubd m9, m15 ; t10a
|
|
|
|
paddd m1, m20
|
|
|
|
psubd m6, m1, m3 ; t5
|
|
|
|
paddd m1, m3 ; t6
|
|
|
|
REPX {psrad x, 14}, m6, m1, m11, m5, m2, m9
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro IDCT16_PART2 0
|
|
|
|
psubd m3, m0, m12 ; t3
|
|
|
|
paddd m0, m12 ; t0
|
|
|
|
psubd m12, m18, m4 ; t2
|
|
|
|
paddd m18, m4 ; t1
|
|
|
|
psubd m4, m3, m10 ; t4
|
|
|
|
paddd m3, m10 ; t3
|
|
|
|
psubd m10, m12, m6 ; t5
|
|
|
|
paddd m12, m6 ; t2
|
|
|
|
psubd m6, m18, m1 ; t6
|
|
|
|
paddd m1, m18 ; t1
|
|
|
|
psubd m7, m0, m14 ; t7
|
|
|
|
paddd m0, m14 ; t0
|
|
|
|
psubd m15, m0, m17 ; out15
|
|
|
|
paddd m0, m17 ; out0
|
|
|
|
psubd m14, m1, m13 ; out14
|
|
|
|
paddd m1, m13 ; out1
|
|
|
|
psubd m13, m12, m2 ; out13
|
|
|
|
paddd m2, m12 ; out2
|
|
|
|
psubd m12, m3, m11 ; out12
|
|
|
|
paddd m3, m11 ; out3
|
|
|
|
psubd m11, m4, m5 ; out11
|
|
|
|
paddd m4, m5 ; out4
|
|
|
|
paddd m5, m10, m9 ; out5
|
|
|
|
psubd m10, m9 ; out10
|
|
|
|
psubd m9, m6, m8 ; out9
|
|
|
|
paddd m6, m8 ; out6
|
|
|
|
psubd m8, m7, m16 ; out8
|
|
|
|
paddd m7, m16 ; out7
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
INIT_ZMM avx512icl
|
|
|
|
INV_TXFM_16X16_FN dct, dct
|
|
|
|
INV_TXFM_16X16_FN dct, adst, 39-23-1
|
|
|
|
|
|
|
|
cglobal vp9_idct_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
|
|
|
|
mova m0, [cq+64* 0]
|
|
|
|
mova m1, [cq+64* 1]
|
|
|
|
mova m2, [cq+64* 2]
|
|
|
|
mova m3, [cq+64* 3]
|
|
|
|
mova m4, [cq+64* 4]
|
|
|
|
mova m5, [cq+64* 5]
|
|
|
|
mova m6, [cq+64* 6]
|
|
|
|
mova m7, [cq+64* 7]
|
|
|
|
vpbroadcastd m20, [o(pd_8192)]
|
|
|
|
vpbroadcastd m21, [o(pd_11585)]
|
|
|
|
sub eobd, 38
|
|
|
|
jl .pass1_fast
|
|
|
|
mova m8, [cq+64* 8]
|
|
|
|
mova m9, [cq+64* 9]
|
|
|
|
mova m10, [cq+64*10]
|
|
|
|
mova m11, [cq+64*11]
|
|
|
|
mova m12, [cq+64*12]
|
|
|
|
mova m13, [cq+64*13]
|
|
|
|
mova m14, [cq+64*14]
|
|
|
|
mova m15, [cq+64*15]
|
|
|
|
call .main_part1
|
|
|
|
call .main_part2
|
|
|
|
.pass1_end:
|
|
|
|
TRANSPOSE_4DQ 0, 4, 8, 12, 16
|
|
|
|
TRANSPOSE_4DQ 1, 5, 9, 13, 16
|
|
|
|
TRANSPOSE_4DQ 2, 6, 10, 14, 16
|
|
|
|
TRANSPOSE_4DQ 3, 7, 11, 15, 16
|
|
|
|
TRANSPOSE_4D 8, 9, 10, 11, 16
|
|
|
|
TRANSPOSE_4D 12, 13, 14, 15, 16
|
|
|
|
mov r6d, 64*12
|
|
|
|
jmp .pass1_transpose_end
|
|
|
|
.pass1_fast:
|
|
|
|
WRAP_YMM IDCT16_PART1
|
|
|
|
WRAP_YMM IDCT16_PART2
|
|
|
|
.pass1_fast_end:
|
|
|
|
vinserti32x8 m0, ym4, 1
|
|
|
|
vinserti32x8 m8, ym12, 1
|
|
|
|
vinserti32x8 m1, ym5, 1
|
|
|
|
vinserti32x8 m9, ym13, 1
|
|
|
|
vinserti32x8 m2, ym6, 1
|
|
|
|
vinserti32x8 m10, ym14, 1
|
|
|
|
vinserti32x8 m3, ym7, 1
|
|
|
|
vinserti32x8 m11, ym15, 1
|
|
|
|
vshufi32x4 m4, m0, m8, q3131
|
|
|
|
vshufi32x4 m0, m8, q2020
|
|
|
|
vshufi32x4 m5, m1, m9, q3131
|
|
|
|
vshufi32x4 m1, m9, q2020
|
|
|
|
vshufi32x4 m6, m2, m10, q3131
|
|
|
|
vshufi32x4 m2, m10, q2020
|
|
|
|
vshufi32x4 m7, m3, m11, q3131
|
|
|
|
vshufi32x4 m3, m11, q2020
|
|
|
|
mov r6d, 64*4
|
|
|
|
.pass1_transpose_end:
|
|
|
|
pxor m16, m16
|
|
|
|
.zero_loop:
|
|
|
|
mova [cq+r6+64*0], m16
|
|
|
|
mova [cq+r6+64*1], m16
|
|
|
|
mova [cq+r6+64*2], m16
|
|
|
|
mova [cq+r6+64*3], m16
|
|
|
|
sub r6d, 64*4
|
|
|
|
jge .zero_loop
|
|
|
|
TRANSPOSE_4D 0, 1, 2, 3, 16
|
|
|
|
TRANSPOSE_4D 4, 5, 6, 7, 16
|
|
|
|
jmp tx2q
|
|
|
|
.pass2:
|
|
|
|
test eobd, eobd
|
|
|
|
jl .pass2_fast
|
|
|
|
call .main_part1
|
|
|
|
jmp .pass2_end
|
|
|
|
.pass2_fast:
|
|
|
|
call .main_part1_fast
|
|
|
|
.pass2_end:
|
|
|
|
vpbroadcastd m3, [o(pixel_clip6)]
|
|
|
|
paddd m0, m3
|
|
|
|
paddd m18, m3
|
|
|
|
call .main_part2
|
|
|
|
REPX {psrad x, 6}, m0, m1, m2, m3
|
|
|
|
packssdw m0, m1
|
|
|
|
lea r6, [strideq*3]
|
|
|
|
packssdw m1, m2, m3
|
|
|
|
mova m2, [o(deintq_perm)]
|
|
|
|
vpbroadcastd m3, [o(pixel_clip)]
|
|
|
|
REPX {psrad x, 6}, m4, m5, m6, m7
|
|
|
|
call .write_16x4
|
|
|
|
packssdw m0, m4, m5
|
|
|
|
packssdw m1, m6, m7
|
|
|
|
REPX {psrad x, 6}, m8, m9, m10, m11
|
|
|
|
call .write_16x4
|
|
|
|
packssdw m0, m8, m9
|
|
|
|
packssdw m1, m10, m11
|
|
|
|
.pass2_end2:
|
|
|
|
REPX {psrad x, 6}, m12, m13, m14, m15
|
|
|
|
call .write_16x4
|
|
|
|
packssdw m0, m12, m13
|
|
|
|
packssdw m1, m14, m15
|
|
|
|
call .write_16x4
|
|
|
|
RET
|
|
|
|
ALIGN function_align
|
|
|
|
.write_16x4:
|
|
|
|
mova ym16, [dstq+strideq*0]
|
|
|
|
vinserti32x8 m16, [dstq+strideq*1], 1
|
|
|
|
mova ym17, [dstq+strideq*2]
|
|
|
|
vinserti32x8 m17, [dstq+r6 ], 1
|
|
|
|
vpermq m0, m2, m0
|
|
|
|
vpermq m1, m2, m1
|
|
|
|
paddsw m16, m0
|
|
|
|
paddsw m17, m1
|
|
|
|
psubusw m16, m3
|
|
|
|
psubusw m17, m3
|
|
|
|
mova [dstq+strideq*0], ym16
|
|
|
|
vextracti32x8 [dstq+strideq*1], m16, 1
|
|
|
|
mova [dstq+strideq*2], ym17
|
|
|
|
vextracti32x8 [dstq+r6 ], m17, 1
|
|
|
|
lea dstq, [dstq+strideq*4]
|
|
|
|
ret
|
|
|
|
ALIGN function_align
|
|
|
|
IDCT16_PART1
|
|
|
|
ret
|
|
|
|
ALIGN function_align
|
|
|
|
.main_part2:
|
|
|
|
IDCT16_PART2
|
|
|
|
ret
|
|
|
|
|
|
|
|
%macro IADST16_PART1 0
|
|
|
|
%if mmsize == 64
|
|
|
|
.main_part1_fast:
|
|
|
|
%endif
|
|
|
|
pmulld m15, m0, [o(pd_16364)] {bcstd} ; t1
|
|
|
|
pmulld m0, [o(pd_804)] {bcstd} ; t0
|
|
|
|
pmulld m13, m2, [o(pd_15893)] {bcstd} ; t3
|
|
|
|
pmulld m2, [o(pd_3981)] {bcstd} ; t2
|
|
|
|
pmulld m11, m4, [o(pd_14811)] {bcstd} ; t5
|
|
|
|
pmulld m4, [o(pd_7005)] {bcstd} ; t4
|
|
|
|
pmulld m9, m6, [o(pd_13160)] {bcstd} ; t7
|
|
|
|
pmulld m6, [o(pd_9760)] {bcstd} ; t6
|
|
|
|
pmulld m8, m7, [o(pd_11003)] {bcstd} ; t8
|
|
|
|
pmulld m7, [o(pd_12140)] {bcstd} ; t9
|
|
|
|
pmulld m10, m5, [o(pd_8423)] {bcstd} ; t10
|
|
|
|
pmulld m5, [o(pd_14053)] {bcstd} ; t11
|
|
|
|
pmulld m12, m3, [o(pd_5520)] {bcstd} ; t12
|
|
|
|
pmulld m3, [o(pd_15426)] {bcstd} ; t13
|
|
|
|
pmulld m14, m1, [o(pd_2404)] {bcstd} ; t14
|
|
|
|
pmulld m1, [o(pd_16207)] {bcstd} ; t15
|
|
|
|
REPX {psubd x, m20, x}, m15, m13, m11, m9
|
|
|
|
%if mmsize == 64 ; for the ymm variant we only ever use the fast path
|
|
|
|
jmp %%main_part1b
|
|
|
|
ALIGN function_align
|
|
|
|
.main_part1:
|
|
|
|
ITX_MULSUB_2D 15, 0, 16, 17, 18, _, 804, 16364 ; t1, t0
|
|
|
|
ITX_MULSUB_2D 13, 2, 16, 17, 18, _, 3981, 15893 ; t3, t2
|
|
|
|
ITX_MULSUB_2D 11, 4, 16, 17, 18, _, 7005, 14811 ; t5, t4
|
|
|
|
ITX_MULSUB_2D 9, 6, 16, 17, 18, _, 9760, 13160 ; t7, t6
|
|
|
|
ITX_MULSUB_2D 7, 8, 16, 17, 18, _, 12140, 11003 ; t9, t8
|
|
|
|
ITX_MULSUB_2D 5, 10, 16, 17, 18, _, 14053, 8423 ; t11, t10
|
|
|
|
ITX_MULSUB_2D 3, 12, 16, 17, 18, _, 15426, 5520 ; t13, t12
|
|
|
|
ITX_MULSUB_2D 1, 14, 16, 17, 18, _, 16207, 2404 ; t15, t14
|
|
|
|
REPX {paddd x, m20}, m15, m13, m11, m9
|
|
|
|
%%main_part1b:
|
|
|
|
%endif
|
|
|
|
REPX {paddd x, m20}, m0, m2, m4, m6
|
|
|
|
psubd m16, m2, m10 ; t10a
|
|
|
|
paddd m2, m10 ; t2a
|
|
|
|
psubd m10, m9, m1 ; t15a
|
|
|
|
paddd m9, m1 ; t7a
|
|
|
|
psubd m1, m13, m5 ; t11a
|
|
|
|
paddd m13, m5 ; t3a
|
|
|
|
psubd m5, m6, m14 ; t14a
|
|
|
|
paddd m6, m14 ; t6a
|
|
|
|
REPX {psrad x, 14}, m16, m10, m1, m5
|
|
|
|
psubd m14, m0, m8 ; t8a
|
|
|
|
paddd m0, m8 ; t0a
|
|
|
|
psubd m8, m15, m7 ; t9a
|
|
|
|
paddd m15, m7 ; t1a
|
|
|
|
psubd m7, m4, m12 ; t12a
|
|
|
|
paddd m4, m12 ; t4a
|
|
|
|
paddd m12, m11, m3 ; t5a
|
|
|
|
psubd m11, m3 ; t13a
|
|
|
|
REPX {psrad x, 14}, m14, m8, m7, m11
|
|
|
|
vpbroadcastd m19, [o(pd_9102)]
|
|
|
|
vpbroadcastd m18, [o(pd_13623)]
|
|
|
|
ITX_MULSUB_2D 16, 1, 3, 17, _, _, 18, 19 ; t11, t10
|
|
|
|
ITX_MULSUB_2D 10, 5, 3, 17, _, _, 19, 18 ; t14, t15
|
|
|
|
vpbroadcastd m19, [o(pd_16069)]
|
|
|
|
vpbroadcastd m18, [o(pd_3196)]
|
|
|
|
ITX_MULSUB_2D 14, 8, 3, 17, _, _, 18, 19 ; t9, t8
|
|
|
|
ITX_MULSUB_2D 11, 7, 3, 17, _, _, 19, 18 ; t12, t13
|
|
|
|
vpbroadcastd m19, [o(pd_6270)]
|
|
|
|
vpbroadcastd m18, [o(pd_15137)]
|
|
|
|
REPX {psrad x, 14}, m15, m12, m0, m4
|
|
|
|
psubd m3, m15, m12 ; t5
|
|
|
|
paddd m15, m12 ; t1
|
|
|
|
psubd m12, m0, m4 ; t4
|
|
|
|
paddd m0, m4 ; t0
|
|
|
|
REPX {psrad x, 14}, m2, m6, m13, m9
|
|
|
|
psubd m4, m2, m6 ; t6
|
|
|
|
paddd m2, m6 ; t2
|
|
|
|
psubd m6, m13, m9 ; t7
|
|
|
|
paddd m9, m13 ; t3
|
|
|
|
REPX {paddd x, m20}, m8, m14, m1, m16
|
|
|
|
psubd m13, m8, m11 ; t12a
|
|
|
|
paddd m8, m11 ; t8a
|
|
|
|
psubd m11, m14, m7 ; t13a
|
|
|
|
paddd m14, m7 ; t9a
|
|
|
|
psubd m7, m1, m10 ; t14a
|
|
|
|
paddd m1, m10 ; t10a
|
|
|
|
psubd m10, m16, m5 ; t15a
|
|
|
|
paddd m16, m5 ; t11a
|
|
|
|
REPX {psrad x, 14}, m13, m11, m7, m10
|
|
|
|
ITX_MULSUB_2D 12, 3, 5, 17, _, _, 19, 18 ; t5a, t4a
|
|
|
|
ITX_MULSUB_2D 6, 4, 5, 17, _, _, 18, 19 ; t6a, t7a
|
|
|
|
ITX_MULSUB_2D 13, 11, 5, 17, _, _, 19, 18 ; t13, t12
|
|
|
|
ITX_MULSUB_2D 10, 7, 5, 17, _, _, 18, 19 ; t14, t15
|
|
|
|
REPX {psrad x, 14}, m8, m1, m14, m16
|
|
|
|
psubd m5, m8, m1 ; t10
|
|
|
|
paddd m1, m8 ; -out1
|
|
|
|
psubd m8, m15, m9 ; t3a
|
|
|
|
paddd m15, m9 ; -out15
|
|
|
|
psubd m9, m14, m16 ; t11
|
|
|
|
paddd m14, m16 ; out14
|
|
|
|
psubd m16, m0, m2 ; t2a
|
|
|
|
paddd m0, m2 ; out0
|
|
|
|
REPX {paddd x, m20}, m11, m13, m12, m3
|
|
|
|
paddd m2, m11, m10 ; out2
|
|
|
|
psubd m11, m10 ; t14a
|
|
|
|
psubd m10, m13, m7 ; t15a
|
|
|
|
paddd m13, m7 ; -out13
|
|
|
|
psubd m7, m12, m4 ; t7
|
|
|
|
paddd m12, m4 ; out12
|
|
|
|
psubd m4, m3, m6 ; t6
|
|
|
|
paddd m3, m6 ; -out3
|
|
|
|
REPX {psrad x, 14}, m10, m7, m11, m4
|
|
|
|
REPX {pmulld x, m21}, m9, m10, m7, m8, m5, m11, m4, m16
|
|
|
|
REPX {psrad x, 14}, m2, m13, m12, m3
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro IADST16_PART2 0
|
|
|
|
paddd m9, m20
|
|
|
|
psubd m10, m20, m10
|
|
|
|
paddd m7, m20
|
|
|
|
psubd m8, m20, m8
|
|
|
|
paddd m6, m9, m5 ; out6
|
|
|
|
psubd m9, m5 ; out9
|
|
|
|
psubd m5, m10, m11 ; out5
|
|
|
|
paddd m10, m11 ; out10
|
|
|
|
psubd m11, m7, m4 ; out11
|
|
|
|
paddd m4, m7 ; out4
|
|
|
|
psubd m7, m8, m16 ; out7
|
|
|
|
paddd m8, m16 ; out8
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
%macro IADST16_PASS1_END 0
|
|
|
|
pxor m16, m16
|
|
|
|
psubd m1, m16, m1
|
|
|
|
psubd m3, m16, m3
|
|
|
|
psubd m13, m16, m13
|
|
|
|
psubd m15, m16, m15
|
|
|
|
REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
|
|
|
|
%endmacro
|
|
|
|
|
|
|
|
INV_TXFM_16X16_FN adst, dct, 39-18
|
|
|
|
INV_TXFM_16X16_FN adst, adst
|
|
|
|
|
|
|
|
cglobal vp9_iadst_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
|
|
|
|
mova m0, [cq+64* 0]
|
|
|
|
mova m1, [cq+64* 1]
|
|
|
|
mova m2, [cq+64* 2]
|
|
|
|
mova m3, [cq+64* 3]
|
|
|
|
mova m4, [cq+64* 4]
|
|
|
|
mova m5, [cq+64* 5]
|
|
|
|
mova m6, [cq+64* 6]
|
|
|
|
mova m7, [cq+64* 7]
|
|
|
|
vpbroadcastd m20, [o(pd_8192)]
|
|
|
|
vpbroadcastd m21, [o(pd_11585)]
|
|
|
|
sub eobd, 39
|
|
|
|
jl .pass1_fast
|
|
|
|
mova m8, [cq+64* 8]
|
|
|
|
mova m9, [cq+64* 9]
|
|
|
|
mova m10, [cq+64*10]
|
|
|
|
mova m11, [cq+64*11]
|
|
|
|
mova m12, [cq+64*12]
|
|
|
|
mova m13, [cq+64*13]
|
|
|
|
mova m14, [cq+64*14]
|
|
|
|
mova m15, [cq+64*15]
|
|
|
|
call .main_part1
|
|
|
|
call .main_part2
|
|
|
|
IADST16_PASS1_END
|
|
|
|
jmp m(vp9_idct_16x16_internal_10).pass1_end
|
|
|
|
.pass1_fast:
|
|
|
|
WRAP_YMM IADST16_PART1
|
|
|
|
WRAP_YMM IADST16_PART2
|
|
|
|
WRAP_YMM IADST16_PASS1_END
|
|
|
|
jmp m(vp9_idct_16x16_internal_10).pass1_fast_end
|
|
|
|
.pass2:
|
|
|
|
test eobd, eobd
|
|
|
|
jl .pass2_fast
|
|
|
|
call .main_part1
|
|
|
|
jmp .pass2_end
|
|
|
|
.pass2_fast:
|
|
|
|
call .main_part1_fast
|
|
|
|
.pass2_end:
|
|
|
|
vpbroadcastd m20, [o(pd_532480)]
|
|
|
|
call .main_part2
|
|
|
|
vpbroadcastd m16, [o(pixel_clip6)]
|
|
|
|
REPX {paddd x, m16}, m0, m2, m12, m14
|
|
|
|
REPX {psubd x, m16, x}, m1, m3, m13, m15
|
|
|
|
REPX {psrad x, 6}, m0, m1, m2, m3
|
|
|
|
packssdw m0, m1
|
|
|
|
lea r6, [strideq*3]
|
|
|
|
packssdw m1, m2, m3
|
|
|
|
mova m2, [o(deintq_perm)]
|
|
|
|
vpbroadcastd m3, [o(pixel_clip)]
|
|
|
|
REPX {psrad x, 20}, m4, m5, m6, m7
|
|
|
|
call m(vp9_idct_16x16_internal_10).write_16x4
|
|
|
|
packssdw m0, m4, m5
|
|
|
|
packssdw m1, m6, m7
|
|
|
|
paddsw m0, m3
|
|
|
|
paddsw m1, m3
|
|
|
|
REPX {psrad x, 20}, m8, m9, m10, m11
|
|
|
|
call m(vp9_idct_16x16_internal_10).write_16x4
|
|
|
|
packssdw m0, m8, m9
|
|
|
|
packssdw m1, m10, m11
|
|
|
|
paddsw m0, m3
|
|
|
|
paddsw m1, m3
|
|
|
|
jmp m(vp9_idct_16x16_internal_10).pass2_end2
|
|
|
|
ALIGN function_align
|
|
|
|
IADST16_PART1
|
|
|
|
ret
|
|
|
|
ALIGN function_align
|
|
|
|
.main_part2:
|
|
|
|
IADST16_PART2
|
|
|
|
ret
|
|
|
|
|
|
|
|
cglobal vp9_idct_idct_32x32_add_10, 4, 7, 23, 64*64, dst, stride, c, eob
|
|
|
|
%undef cmp
|
|
|
|
lea r5, [o_base]
|
|
|
|
dec eobd
|
|
|
|
jnz .pass1
|
|
|
|
imul r6d, [cq], 11585
|
|
|
|
vpbroadcastd m3, [o(pixel_clip)]
|
|
|
|
mov [cq], r3d
|
|
|
|
add r6d, 8192
|
|
|
|
sar r6d, 14
|
|
|
|
imul r6d, 11585
|
|
|
|
or r3d, 16
|
|
|
|
add r6d, 532480
|
|
|
|
sar r6d, 20
|
|
|
|
vpbroadcastw m2, r6d
|
|
|
|
paddsw m2, m3
|
|
|
|
.dconly_loop:
|
|
|
|
paddsw m0, m2, [dstq+strideq*0]
|
|
|
|
paddsw m1, m2, [dstq+strideq*1]
|
|
|
|
psubusw m0, m3
|
|
|
|
psubusw m1, m3
|
|
|
|
mova [dstq+strideq*0], m0
|
|
|
|
mova [dstq+strideq*1], m1
|
|
|
|
lea dstq, [dstq+strideq*2]
|
|
|
|
dec r3d
|
|
|
|
jg .dconly_loop
|
|
|
|
RET
|
|
|
|
.pass1:
|
|
|
|
vpbroadcastd m20, [o(pd_8192)]
|
|
|
|
vpbroadcastd m21, [o(pd_11585)]
|
|
|
|
cmp eobd, 135
|
|
|
|
jl .pass1_fast
|
|
|
|
add cq, 64
|
|
|
|
lea r4, [rsp+64*8]
|
|
|
|
cmp eobd, 579
|
|
|
|
jl .pass1_right_fast
|
|
|
|
mov r6d, 128*28
|
|
|
|
call .pass1_main
|
|
|
|
jmp .pass1_right_end
|
|
|
|
.pass1_right_fast: ; bottomright quadrant is zero
|
|
|
|
mova m0, [cq+128* 1]
|
|
|
|
mova m1, [cq+128* 3]
|
|
|
|
mova m2, [cq+128* 5]
|
|
|
|
mova m3, [cq+128* 7]
|
|
|
|
mova m4, [cq+128* 9]
|
|
|
|
mova m5, [cq+128*11]
|
|
|
|
mova m6, [cq+128*13]
|
|
|
|
mova m7, [cq+128*15]
|
|
|
|
call .main_fast
|
|
|
|
mova m0, [cq+128* 0]
|
|
|
|
mova m1, [cq+128* 2]
|
|
|
|
mova m2, [cq+128* 4]
|
|
|
|
mova m3, [cq+128* 6]
|
|
|
|
mova m4, [cq+128* 8]
|
|
|
|
mova m5, [cq+128*10]
|
|
|
|
mova m6, [cq+128*12]
|
|
|
|
mova m7, [cq+128*14]
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part1_fast
|
|
|
|
mov r6d, 128*12
|
|
|
|
call .pass1_main_end
|
|
|
|
.pass1_right_end:
|
|
|
|
mova [r4+64* 8], m0
|
|
|
|
mova [r4+64* 9], m1
|
|
|
|
mova [r4+64*10], m2
|
|
|
|
mova [r4+64*11], m3
|
|
|
|
mova [r4+64*12], m4
|
|
|
|
mova [r4+64*13], m5
|
|
|
|
mova [r4+64*14], m6
|
|
|
|
mova [r4+64*15], m7
|
|
|
|
mova [r4+64*16], m16
|
|
|
|
mova [r4+64*17], m17
|
|
|
|
mova [r4+64*18], m18
|
|
|
|
mova [r4+64*19], m19
|
|
|
|
mova [r4+64*20], m8
|
|
|
|
mova [r4+64*21], m9
|
|
|
|
mova [r4+64*22], m10
|
|
|
|
mova [r4+64*23], m11
|
|
|
|
sub cq, 64
|
|
|
|
sub r4, 64*8
|
|
|
|
mov r6d, 128*28
|
|
|
|
call .pass1_main
|
|
|
|
mova m12, [r4+64*20]
|
|
|
|
mova m13, [r4+64*21]
|
|
|
|
mova m14, [r4+64*22]
|
|
|
|
mova m15, [r4+64*23]
|
|
|
|
mova [r4+64*20], m8
|
|
|
|
mova [r4+64*21], m9
|
|
|
|
mova [r4+64*22], m10
|
|
|
|
mova [r4+64*23], m11
|
|
|
|
mova m8, [r4+64*16]
|
|
|
|
mova m9, [r4+64*17]
|
|
|
|
mova m10, [r4+64*18]
|
|
|
|
mova m11, [r4+64*19]
|
|
|
|
mova [r4+64*16], m16
|
|
|
|
mova [r4+64*17], m17
|
|
|
|
mova [r4+64*18], m18
|
|
|
|
mova [r4+64*19], m19
|
|
|
|
call .main
|
|
|
|
mova m0, [r4+64*16]
|
|
|
|
mova m1, [r4+64*17]
|
|
|
|
mova m2, [r4+64*18]
|
|
|
|
mova m3, [r4+64*19]
|
|
|
|
mova m4, [r4+64*20]
|
|
|
|
mova m5, [r4+64*21]
|
|
|
|
mova m6, [r4+64*22]
|
|
|
|
mova m7, [r4+64*23]
|
|
|
|
mova m8, [r4+64*24]
|
|
|
|
mova m9, [r4+64*25]
|
|
|
|
mova m10, [r4+64*26]
|
|
|
|
mova m11, [r4+64*27]
|
|
|
|
mova m12, [r4+64*28]
|
|
|
|
mova m13, [r4+64*29]
|
|
|
|
mova m14, [r4+64*30]
|
|
|
|
mova m15, [r4+64*31]
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part1
|
|
|
|
call .pass2_main_left
|
|
|
|
mova m8, [r4+64* 8]
|
|
|
|
mova m9, [r4+64* 9]
|
|
|
|
mova m10, [r4+64*10]
|
|
|
|
mova m11, [r4+64*11]
|
|
|
|
mova m12, [r4+64*12]
|
|
|
|
mova m13, [r4+64*13]
|
|
|
|
mova m14, [r4+64*14]
|
|
|
|
mova m15, [r4+64*15]
|
|
|
|
TRANSPOSE_4DQ 8, 10, 12, 14, 16
|
|
|
|
TRANSPOSE_4DQ 9, 11, 13, 15, 16
|
|
|
|
call .main
|
|
|
|
call .pass2_main_right
|
|
|
|
mova m8, [r4+64*24]
|
|
|
|
mova m9, [r4+64*25]
|
|
|
|
mova m10, [r4+64*26]
|
|
|
|
mova m11, [r4+64*27]
|
|
|
|
mova m12, [r4+64*28]
|
|
|
|
mova m13, [r4+64*29]
|
|
|
|
mova m14, [r4+64*30]
|
|
|
|
mova m15, [r4+64*31]
|
|
|
|
TRANSPOSE_4DQ 8, 10, 12, 14, 16
|
|
|
|
TRANSPOSE_4DQ 9, 11, 13, 15, 16
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part1
|
|
|
|
jmp .pass2_end
|
|
|
|
.pass1_fast:
|
|
|
|
mova m0, [cq+128* 1]
|
|
|
|
mova m1, [cq+128* 3]
|
|
|
|
mova m2, [cq+128* 5]
|
|
|
|
mova m3, [cq+128* 7]
|
|
|
|
mova m4, [cq+128* 9]
|
|
|
|
mova m5, [cq+128*11]
|
|
|
|
mova m6, [cq+128*13]
|
|
|
|
mova m7, [cq+128*15]
|
|
|
|
mov r4, rsp
|
|
|
|
call .main_fast
|
|
|
|
mova m0, [cq+128* 0]
|
|
|
|
mova m1, [cq+128* 2]
|
|
|
|
mova m2, [cq+128* 4]
|
|
|
|
mova m3, [cq+128* 6]
|
|
|
|
mova m4, [cq+128* 8]
|
|
|
|
mova m5, [cq+128*10]
|
|
|
|
mova m6, [cq+128*12]
|
|
|
|
mova m7, [cq+128*14]
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part1_fast
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part2
|
|
|
|
mov r6d, 128*12
|
|
|
|
call .pass1_main_end2
|
|
|
|
mova [r4+64*16], m16
|
|
|
|
mova [r4+64*17], m17
|
|
|
|
mova [r4+64*18], m18
|
|
|
|
mova [r4+64*19], m19
|
|
|
|
mova [r4+64*20], m8
|
|
|
|
mova [r4+64*21], m9
|
|
|
|
mova [r4+64*22], m10
|
|
|
|
mova [r4+64*23], m11
|
|
|
|
call .main_fast
|
|
|
|
mova m0, [r4+64*16]
|
|
|
|
mova m1, [r4+64*17]
|
|
|
|
mova m2, [r4+64*18]
|
|
|
|
mova m3, [r4+64*19]
|
|
|
|
mova m4, [r4+64*20]
|
|
|
|
mova m5, [r4+64*21]
|
|
|
|
mova m6, [r4+64*22]
|
|
|
|
mova m7, [r4+64*23]
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part1_fast
|
|
|
|
call .pass2_main_left
|
|
|
|
call .main_fast
|
|
|
|
call .pass2_main_right
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part1_fast
|
|
|
|
.pass2_end:
|
|
|
|
paddd m0, m22
|
|
|
|
paddd m18, m22
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part2
|
|
|
|
mova m20, [o(deintq_perm)]
|
|
|
|
rorx r2, strideq, 59 ; strideq*32
|
|
|
|
vpbroadcastd m21, [o(pixel_clip)]
|
|
|
|
add r2, dstq
|
|
|
|
%assign i 0
|
|
|
|
%rep 16
|
|
|
|
mova m16, [r4+64*(15-i)]
|
|
|
|
mova m17, [r4+64*(i-16)]
|
|
|
|
mova m18, [r4-64*(17+i)]
|
|
|
|
paddd m19, m %+ i, m16
|
|
|
|
psubd m0, m %+ i, m16
|
|
|
|
call .write_32x2
|
|
|
|
%assign i i+1
|
|
|
|
%endrep
|
|
|
|
RET
|
|
|
|
ALIGN function_align
|
|
|
|
.write_32x2:
|
|
|
|
paddd m16, m17, m18
|
|
|
|
psubd m17, m18
|
|
|
|
REPX {psrad x, 6}, m19, m16, m0, m17
|
|
|
|
packssdw m16, m19
|
|
|
|
packssdw m17, m0
|
|
|
|
sub r2, strideq
|
|
|
|
vpermq m16, m20, m16
|
|
|
|
vpermq m17, m20, m17
|
|
|
|
paddsw m16, [dstq]
|
|
|
|
paddsw m17, [r2 ]
|
|
|
|
psubusw m16, m21
|
|
|
|
psubusw m17, m21
|
|
|
|
mova [dstq], m16
|
|
|
|
mova [r2 ], m17
|
|
|
|
add dstq, strideq
|
|
|
|
ret
|
|
|
|
ALIGN function_align
|
|
|
|
.pass1_main:
|
|
|
|
mova m0, [cq+128* 1]
|
|
|
|
mova m1, [cq+128* 3]
|
|
|
|
mova m2, [cq+128* 5]
|
|
|
|
mova m3, [cq+128* 7]
|
|
|
|
mova m4, [cq+128* 9]
|
|
|
|
mova m5, [cq+128*11]
|
|
|
|
mova m6, [cq+128*13]
|
|
|
|
mova m7, [cq+128*15]
|
|
|
|
mova m8, [cq+128*17]
|
|
|
|
mova m9, [cq+128*19]
|
|
|
|
mova m10, [cq+128*21]
|
|
|
|
mova m11, [cq+128*23]
|
|
|
|
mova m12, [cq+128*25]
|
|
|
|
mova m13, [cq+128*27]
|
|
|
|
mova m14, [cq+128*29]
|
|
|
|
mova m15, [cq+128*31]
|
|
|
|
call .main
|
|
|
|
mova m0, [cq+128* 0]
|
|
|
|
mova m1, [cq+128* 2]
|
|
|
|
mova m2, [cq+128* 4]
|
|
|
|
mova m3, [cq+128* 6]
|
|
|
|
mova m4, [cq+128* 8]
|
|
|
|
mova m5, [cq+128*10]
|
|
|
|
mova m6, [cq+128*12]
|
|
|
|
mova m7, [cq+128*14]
|
|
|
|
mova m8, [cq+128*16]
|
|
|
|
mova m9, [cq+128*18]
|
|
|
|
mova m10, [cq+128*20]
|
|
|
|
mova m11, [cq+128*22]
|
|
|
|
mova m12, [cq+128*24]
|
|
|
|
mova m13, [cq+128*26]
|
|
|
|
mova m14, [cq+128*28]
|
|
|
|
mova m15, [cq+128*30]
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part1
|
|
|
|
.pass1_main_end:
|
|
|
|
call m(vp9_idct_16x16_internal_10).main_part2
|
|
|
|
.pass1_main_end2:
|
|
|
|
pxor m16, m16
|
|
|
|
.pass1_zero_loop:
|
|
|
|
mova [cq+r6+128*0], m16
|
|
|
|
mova [cq+r6+128*1], m16
|
|
|
|
mova [cq+r6+128*2], m16
|
|
|
|
mova [cq+r6+128*3], m16
|
|
|
|
sub r6d, 128*4
|
|
|
|
jge .pass1_zero_loop
|
|
|
|
mova m16, [r4+64*15]
|
|
|
|
mova m19, [r4+64*14]
|
|
|
|
mova m22, [r4+64*13]
|
|
|
|
mova m17, [r4+64*12]
|
|
|
|
psubd m18, m0, m16
|
|
|
|
paddd m16, m0
|
|
|
|
paddd m0, m19, m1
|
|
|
|
psubd m19, m1, m19
|
|
|
|
paddd m1, m17, m3
|
|
|
|
psubd m3, m17
|
|
|
|
paddd m17, m2, m22
|
|
|
|
psubd m2, m22
|
|
|
|
TRANSPOSE_4D 3, 2, 19, 18, 22 ; 28 29 30 31
|
|
|
|
TRANSPOSE_4D 16, 0, 17, 1, 22 ; 0 1 2 3
|
|
|
|
mova [r4+64*54], m3
|
|
|
|
mova [r4+64*55], m19
|
|
|
|
mova [r4+64*38], m2
|
|
|
|
mova [r4+64*39], m18
|
|
|
|
mova m2, [r4+64*11]
|
|
|
|
mova m19, [r4+64*10]
|
|
|
|
mova m3, [r4+64* 9]
|
|
|
|
mova m22, [r4+64* 8]
|
|
|
|
paddd m18, m4, m2
|
|
|
|
psubd m4, m2
|
|
|
|
paddd m2, m5, m19
|
|
|
|
psubd m5, m19
|
|
|
|
paddd m19, m6, m3
|
|
|
|
psubd m6, m3
|
|
|
|
paddd m3, m7, m22
|
|
|
|
psubd m7, m22
|
|
|
|
TRANSPOSE_4D 7, 6, 5, 4, 22 ; 24 25 26 27
|
|
|
|
TRANSPOSE_4D 18, 2, 19, 3, 22 ; 4 5 6 7
|
|
|
|
mova [r4+64*52], m7
|
|
|
|
mova [r4+64*53], m5
|
|
|
|
mova [r4+64*36], m6
|
|
|
|
mova [r4+64*37], m4
|
|
|
|
mova m7, [r4+64* 7]
|
|
|
|
mova m4, [r4+64* 6]
|
|
|
|
mova m5, [r4+64* 5]
|
|
|
|
mova m22, [r4+64* 4]
|
|
|
|
psubd m6, m8, m7
|
|
|
|
paddd m8, m7
|
|
|
|
psubd m7, m9, m4
|
|
|
|
paddd m4, m9
|
|
|
|
paddd m9, m10, m5
|
|
|
|
psubd m10, m5
|
|
|
|
paddd m5, m11, m22
|
|
|
|
psubd m11, m22
|
|
|
|
TRANSPOSE_4D 11, 10, 7, 6, 22 ; 20 21 22 23
|
|
|
|
TRANSPOSE_4D 8, 4, 9, 5, 22 ; 8 9 10 11
|
|
|
|
mova [r4+64*50], m11
|
|
|
|
mova [r4+64*51], m7
|
|
|
|
mova [r4+64*34], m10
|
|
|
|
mova [r4+64*35], m6
|
|
|
|
mova m6, [r4+64* 3]
|
|
|
|
mova m11, [r4+64* 2]
|
|
|
|
mova m7, [r4+64* 1]
|
|
|
|
mova m22, [r4+64* 0]
|
|
|
|
paddd m10, m12, m6
|
|
|
|
psubd m12, m6
|
|
|
|
paddd m6, m13, m11
|
|
|
|
psubd m13, m11
|
|
|
|
paddd m11, m14, m7
|
|
|
|
psubd m14, m7
|
|
|
|
paddd m7, m15, m22
|
|
|
|
psubd m15, m22
|
|
|
|
TRANSPOSE_4D 15, 14, 13, 12, 22 ; 16 17 18 19
|
|
|
|
TRANSPOSE_4D 10, 6, 11, 7, 22 ; 12 13 14 15
|
|
|
|
mova [r4+64*48], m15
|
|
|
|
mova [r4+64*49], m13
|
|
|
|
mova [r4+64*32], m14
|
|
|
|
mova [r4+64*33], m12
|
|
|
|
TRANSPOSE_4DQ 0, 2, 4, 6, 22
|
|
|
|
TRANSPOSE_4DQ 1, 3, 5, 7, 22
|
|
|
|
TRANSPOSE_4DQ 16, 18, 8, 10, 22
|
|
|
|
TRANSPOSE_4DQ 17, 19, 9, 11, 22
|
|
|
|
ret
|
|
|
|
ALIGN function_align
; Second pass, left half: finish the idct16 on the first input group,
; spill all 16 output rows to scratch, then set up the first 8 rows of
; the *next* scratch area for the shared transpose tail.
.pass2_main_left:
; bias constant from rodata (presumably pixel_clip scaled for the
; pass-2 downshift/packing — declared outside this view, TODO confirm)
vpbroadcastd m22, [o(pixel_clip6)]
paddd m0, m22 ; fold the bias into the two inputs feeding the
paddd m18, m22 ; even/DC path of main_part2
call m(vp9_idct_16x16_internal_10).main_part2
; spill the 16 idct16 output rows (m0-m15) to scratch slots 16-31
mova [r4+64*16], m0
mova [r4+64*17], m1
mova [r4+64*18], m2
mova [r4+64*19], m3
mova [r4+64*20], m4
mova [r4+64*21], m5
mova [r4+64*22], m6
mova [r4+64*23], m7
mova [r4+64*24], m8
mova [r4+64*25], m9
mova [r4+64*26], m10
mova [r4+64*27], m11
mova [r4+64*28], m12
mova [r4+64*29], m13
mova [r4+64*30], m14
mova [r4+64*31], m15
add r4, 64*32 ; advance to the upper half of the scratch buffer
; reload rows 0-7 of the new scratch area for the transpose tail
mova m0, [r4+64* 0]
mova m1, [r4+64* 1]
mova m2, [r4+64* 2]
mova m3, [r4+64* 3]
mova m4, [r4+64* 4]
mova m5, [r4+64* 5]
mova m6, [r4+64* 6]
mova m7, [r4+64* 7]
jmp .pass2_main_transpose ; shared qword-granularity transpose + ret
|
|
|
|
ALIGN function_align
; Second pass, right half: rows were already computed and spilled by the
; left-half pass, so just reload them from scratch slots 16-23 and fall
; through to the shared transpose.
.pass2_main_right:
mova m0, [r4+64*16]
mova m1, [r4+64*17]
mova m2, [r4+64*18]
mova m3, [r4+64*19]
mova m4, [r4+64*20]
mova m5, [r4+64*21]
mova m6, [r4+64*22]
mova m7, [r4+64*23]
; shared tail: transpose the 8 rows as two interleaved groups of four,
; at qword (paired-dword) granularity; m8 is clobbered as scratch
.pass2_main_transpose:
TRANSPOSE_4DQ 0, 2, 4, 6, 8 ; even rows
TRANSPOSE_4DQ 1, 3, 5, 7, 8 ; odd rows
ret
|
|
|
|
ALIGN function_align
; idct32 odd-half, stage 1, fast path: only the first 8 odd-frequency
; coefficients (m0-m7) are nonzero, so each stage-1 rotation collapses
; from a full 2x2 butterfly to a single multiply per output term.
; Uses AVX-512 embedded broadcast ({1to16}) of the dword constants.
.main_fast:
pmulld m15, m0, [o(pd_16364)] {1to16} ; t31a
pmulld m0, [o(pd_804)] {1to16} ; t16a
pmulld m8, m7, [o(pd_11003)] {1to16} ; t17a
pmulld m7, [o(pd_12140)] {1to16} ; t30a
pmulld m11, m4, [o(pd_14811)] {1to16} ; t29a
pmulld m4, [o(pd_7005)] {1to16} ; t18a
pmulld m12, m3, [o(pd_5520)] {1to16} ; t19a
pmulld m3, [o(pd_15426)] {1to16} ; t28a
pmulld m13, m2, [o(pd_15893)] {1to16} ; t27a
pmulld m2, [o(pd_3981)] {1to16} ; t20a
pmulld m10, m5, [o(pd_8423)] {1to16} ; t21a
pmulld m5, [o(pd_14053)] {1to16} ; t26a
pmulld m9, m6, [o(pd_13160)] {1to16} ; t25a
pmulld m6, [o(pd_9760)] {1to16} ; t22a
pmulld m14, m1, [o(pd_2404)] {1to16} ; t23a
pmulld m1, [o(pd_16207)] {1to16} ; t24a
; m20 holds the rounding bias for the >>14 in .main2; computing
; m20 - x merges the required negation of these four terms with that
; bias (the full path does paddd m20 on the same registers instead)
REPX {psubd x, m20, x}, m8, m12, m10, m14
jmp .main2 ; rejoin the common rounding + butterfly chain
|
|
|
|
ALIGN function_align
; idct32 odd-half, full version: 16 odd-frequency coefficients in
; m0-m15. Stage 1 is eight two-output rotations producing t16a..t31a
; in 14-bit fixed point; later stages fold pairs down and spill the
; t16..t27 group to scratch at [r4]. Register contract (set by the
; caller — TODO confirm): m20 = rounding bias for the >>14,
; m21 = the cos(pi/4) scale used on the eight cross terms.
.main:
ITX_MULSUB_2D 0, 15, 16, 17, 18, _, 804, 16364 ; t16a, t31a
ITX_MULSUB_2D 8, 7, 16, 17, 18, _, 12140, 11003 ; t17a, t30a
ITX_MULSUB_2D 4, 11, 16, 17, 18, _, 7005, 14811 ; t18a, t29a
ITX_MULSUB_2D 12, 3, 16, 17, 18, _, 15426, 5520 ; t19a, t28a
ITX_MULSUB_2D 2, 13, 16, 17, 18, _, 3981, 15893 ; t20a, t27a
ITX_MULSUB_2D 10, 5, 16, 17, 18, _, 14053, 8423 ; t21a, t26a
ITX_MULSUB_2D 6, 9, 16, 17, 18, _, 9760, 13160 ; t22a, t25a
ITX_MULSUB_2D 14, 1, 16, 17, 18, _, 16207, 2404 ; t23a, t24a
; pre-add the rounding bias on the terms the fast path handles with
; psubd (see .main_fast); the remaining terms are biased in .main2
REPX {paddd x, m20}, m8, m12, m10, m14
; common tail, shared with .main_fast
.main2:
; round and drop the 14 fraction bits of the stage-1 products
REPX {paddd x, m20}, m0, m15, m7, m4, m3, m11
REPX {psrad x, 14 }, m8, m0, m15, m7, m12, m4, m3, m11
; stage 2: first group of add/sub butterflies
psubd m16, m0, m8 ; t17
paddd m0, m8 ; t16
psubd m8, m15, m7 ; t30
paddd m15, m7 ; t31
paddd m7, m12, m4 ; t19
psubd m12, m4 ; t18
paddd m4, m3, m11 ; t28
psubd m3, m11 ; t29
; round + shift the second half of the stage-1 products
REPX {paddd x, m20}, m2, m13, m5, m6, m1, m9
REPX {psrad x, 14 }, m10, m2, m13, m5, m14, m6, m1, m9
psubd m11, m2, m10 ; t21
paddd m2, m10 ; t20
psubd m10, m13, m5 ; t26
paddd m13, m5 ; t27
psubd m5, m14, m6 ; t22
paddd m6, m14 ; t23
psubd m14, m1, m9 ; t25
paddd m9, m1 ; t24
; stage 3 rotations, first pair of cos/sin constants
vpbroadcastd m19, [o(pd_16069)]
vpbroadcastd m18, [o(pd_3196)]
ITX_MULSUB_2D 8, 16, 1, 17, _, 20, 18, 19 ; t17a, t30a
ITX_MULSUB_2D 3, 12, 1, 17, _, 20, 18, 19, 1 ; t29a, t18a
; stage 3 rotations, second pair of cos/sin constants
vpbroadcastd m19, [o(pd_9102)]
vpbroadcastd m18, [o(pd_13623)]
ITX_MULSUB_2D 10, 11, 1, 17, _, 20, 18, 19 ; t21a, t26a
ITX_MULSUB_2D 14, 5, 1, 17, _, 20, 18, 19, 1 ; t25a, t22a
; stage 4 butterflies
paddd m1, m6, m2 ; t23a
psubd m6, m2 ; t20a
psubd m2, m9, m13 ; t27a
paddd m9, m13 ; t24a
psubd m13, m15, m4 ; t28a
paddd m15, m4 ; t31a
psubd m4, m8, m12 ; t18
paddd m8, m12 ; t17
psubd m12, m0, m7 ; t19a
paddd m0, m7 ; t16a
psubd m7, m16, m3 ; t29
paddd m3, m16 ; t30
paddd m16, m5, m10 ; t22
psubd m5, m10 ; t21
psubd m10, m14, m11 ; t26
paddd m14, m11 ; t25
; stage 5 rotations (shared cos/sin pair for all four)
vpbroadcastd m19, [o(pd_15137)]
vpbroadcastd m18, [o(pd_6270)]
ITX_MULSUB_2D 13, 12, 11, 17, _, 20, 18, 19 ; t19, t28
ITX_MULSUB_2D 2, 6, 11, 17, _, 20, 18, 19, 1 ; t27, t20
ITX_MULSUB_2D 7, 4, 11, 17, _, 20, 18, 19 ; t18a, t29a
ITX_MULSUB_2D 10, 5, 11, 17, _, 20, 18, 19, 1 ; t26a, t21a
; stage 6 butterflies
psubd m11, m0, m1 ; t23
paddd m0, m1 ; t16
paddd m1, m16, m8 ; t17a
psubd m16, m8, m16 ; t22a
psubd m8, m15, m9 ; t24
paddd m15, m9 ; t31
psubd m9, m3, m14 ; t25a
paddd m14, m3 ; t30a
paddd m3, m6, m13 ; t19a
psubd m6, m13, m6 ; t20a
paddd m13, m10, m4 ; t29
psubd m10, m4, m10 ; t26
psubd m4, m12, m2 ; t27a
paddd m12, m2 ; t28a
paddd m2, m7, m5 ; t18
psubd m7, m5 ; t21
; scale the eight remaining cross terms by m21 (cos(pi/4) factor,
; value set by the caller — TODO confirm)
REPX {pmulld x, m21}, m10, m8, m4, m9, m7, m11, m6, m16
; t16..t19 and t28a..t31a are final: spill to scratch
mova [r4+64* 0], m0
mova [r4+64* 1], m1
mova [r4+64* 2], m2
mova [r4+64* 3], m3
mova [r4+64*12], m12
mova [r4+64*13], m13
mova [r4+64*14], m14
mova [r4+64*15], m15
; final stage: bias, combine, and round the scaled cross terms
REPX {paddd x, m20}, m10, m8, m4, m9
psubd m5, m10, m7 ; t21a
paddd m10, m7 ; t26a
psubd m7, m8, m11 ; t23a
paddd m8, m11 ; t24a
REPX {psrad x, 14 }, m5, m10, m7, m8
paddd m11, m4, m6 ; t27
psubd m4, m6 ; t20
psubd m6, m9, m16 ; t22
paddd m9, m16 ; t25
REPX {psrad x, 14 }, m11, m4, m6, m9
; spill t20..t27 to scratch slots 4-11
mova [r4+64* 4], m4
mova [r4+64* 5], m5
mova [r4+64* 6], m6
mova [r4+64* 7], m7
mova [r4+64* 8], m8
mova [r4+64* 9], m9
mova [r4+64*10], m10
mova [r4+64*11], m11
ret
|
|
|
|
|
|
|
|
%endif
|