;
; Simple IDCT MMX
;
; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
;
; Conversion from gcc syntax to x264asm syntax with minimal modifications
; by James Darnley <jdarnley@obe.tv>.
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with FFmpeg; if not, write to the Free Software
; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;/

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_32

SECTION_RODATA

cextern pb_80

wm1010: dw 0, 0xffff, 0, 0xffff
d40000: dd 4 << 16, 0

; 23170.475006
; 22725.260826
; 21406.727617
; 19265.545870
; 16384.000000
; 12872.826198
; 8866.956905
; 4520.335430

%define C0 23170 ; cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C1 22725 ; cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C2 21407 ; cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C3 19266 ; cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C4 16383 ; cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
%define C5 12873 ; cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C6  8867 ; cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C7  4520 ; cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5

%define ROW_SHIFT 11
%define COL_SHIFT 20 ; 6

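; The constants above are round(cos(i*M_PI/16) * sqrt(2) * (1 << 14)), i.e.
; the exact values in the comment block above; C4 is deliberately kept one
; below the exact 16384 (hence the "- 0.5" note). They can be reproduced
; with, e.g., the following illustrative C fragment (not part of the build):
;     for (int i = 0; i < 8; i++)
;         printf("%d\n", (int) (cos(i * M_PI / 16) * sqrt(2) * (1 << 14) + 0.5));
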
coeffs:
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 1
    dw 1 << (ROW_SHIFT - 1), 0

    dw  C4,  C4,  C4,  C4
    dw  C4, -C4,  C4, -C4

    dw  C2,  C6,  C2,  C6
    dw  C6, -C2,  C6, -C2

    dw  C1,  C3,  C1,  C3
    dw  C5,  C7,  C5,  C7

    dw  C3, -C7,  C3, -C7
    dw -C1, -C5, -C1, -C5

    dw  C5, -C1,  C5, -C1
    dw  C7,  C3,  C7,  C3

    dw  C7, -C5,  C7, -C5
    dw  C3, -C1,  C3, -C1

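; Byte offsets into coeffs as referenced by the macros below (qwords, word
; order as stored, low word first):
;   coeffs +  0 / + 8  row-pass rounders (Z_COND_IDCT adds the + 0 pair,
;                      DC_COND_IDCT the + 8 pair)
;   coeffs + 16   C4  C4  C4  C4     coeffs + 24   C4 -C4  C4 -C4
;   coeffs + 32   C2  C6  C2  C6     coeffs + 40   C6 -C2  C6 -C2
;   coeffs + 48   C1  C3  C1  C3     coeffs + 56   C5  C7  C5  C7
;   coeffs + 64   C3 -C7  C3 -C7     coeffs + 72  -C1 -C5 -C1 -C5
;   coeffs + 80   C5 -C1  C5 -C1     coeffs + 88   C7  C3  C7  C3
;   coeffs + 96   C7 -C5  C7 -C5     coeffs + 104  C3 -C1  C3 -C1
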
SECTION .text

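; DC_COND_IDCT: row pass over a group of two interleaved rows (32 bytes at
; blockq + %1 .. %4, laid out as shown in the register comments), with a fast
; path: if everything except the two DC words is zero (the wm1010 mask clears
; them before the test), the output is just the biased, shifted DC replicated
; across both rows. %5 is the destination, %7 the accumulator shift
; (ROW_SHIFT); %6 is unused.
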
%macro DC_COND_IDCT 7
    movq mm0, [blockq + %1] ; R4 R0 r4 r0
    movq mm1, [blockq + %2] ; R6 R2 r6 r2
    movq mm2, [blockq + %3] ; R3 R1 r3 r1
    movq mm3, [blockq + %4] ; R7 R5 r7 r5
    movq mm4, [wm1010]
    pand mm4, mm0
    por mm4, mm1
    por mm4, mm2
    por mm4, mm3
    packssdw mm4, mm4
    movd t0d, mm4
    or t0d, t0d
    jz %%1
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm5, [coeffs + 32] ; C6 C2 C6 C2
    pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
    movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
    pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
    movq mm7, [coeffs + 48] ; C3 C1 C3 C1
    pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
    paddd mm4, [coeffs + 8]
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    paddd mm4, mm5 ; A0 a0
    psubd mm6, mm5 ; A3 a3
    movq mm5, [coeffs + 56] ; C7 C5 C7 C5
    pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
    paddd mm0, [coeffs + 8]
    paddd mm1, mm0 ; A1 a1
    paddd mm0, mm0
    psubd mm0, mm1 ; A2 a2
    pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
    paddd mm7, mm5 ; B0 b0
    movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
    pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
    paddd mm7, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm7 ; A0-B0 a0-b0
    paddd mm5, mm2 ; B1 b1
    psrad mm7, %7
    psrad mm4, %7
    movq mm2, mm1 ; A1 a1
    paddd mm1, mm5 ; A1+B1 a1+b1
    psubd mm2, mm5 ; A1-B1 a1-b1
    psrad mm1, %7
    psrad mm2, %7
    packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
    movq [%5], mm7
    movq mm1, [blockq + %3] ; R3 R1 r3 r1
    movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
    movq [24 + %5], mm2
    pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
    movq mm7, [coeffs + 88] ; C3 C7 C3 C7
    pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
    movq mm2, mm0 ; A2 a2
    pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd mm4, mm7 ; B2 b2
    paddd mm2, mm4 ; A2+B2 a2+b2
    psubd mm0, mm4 ; A2-B2 a2-b2
    psrad mm2, %7
    psrad mm0, %7
    movq mm4, mm6 ; A3 a3
    paddd mm3, mm1 ; B3 b3
    paddd mm6, mm3 ; A3+B3 a3+b3
    psubd mm4, mm3 ; A3-B3 a3-b3
    psrad mm6, %7
    packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
    movq [8 + %5], mm2
    psrad mm4, %7
    packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
    movq [16 + %5], mm4
    jmp %%2
%%1:
    pslld mm0, 16
    paddd mm0, [d40000]
    psrad mm0, 13
    packssdw mm0, mm0
    movq [%5], mm0
    movq [8 + %5], mm0
    movq [16 + %5], mm0
    movq [24 + %5], mm0
%%2:
%endmacro

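; Z_COND_IDCT: same row pass as DC_COND_IDCT, but the shortcut test checks
; whether the entire 32-byte group is zero; if it is, nothing is stored and
; control jumps to %8, where a column-pass variant that ignores this group
; takes over.
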
%macro Z_COND_IDCT 8
    movq mm0, [blockq + %1] ; R4 R0 r4 r0
    movq mm1, [blockq + %2] ; R6 R2 r6 r2
    movq mm2, [blockq + %3] ; R3 R1 r3 r1
    movq mm3, [blockq + %4] ; R7 R5 r7 r5
    movq mm4, mm0
    por mm4, mm1
    por mm4, mm2
    por mm4, mm3
    packssdw mm4, mm4
    movd t0d, mm4
    or t0d, t0d
    jz %8
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm5, [coeffs + 32] ; C6 C2 C6 C2
    pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
    movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
    pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
    movq mm7, [coeffs + 48] ; C3 C1 C3 C1
    pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
    paddd mm4, [coeffs]
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    paddd mm4, mm5 ; A0 a0
    psubd mm6, mm5 ; A3 a3
    movq mm5, [coeffs + 56] ; C7 C5 C7 C5
    pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
    paddd mm0, [coeffs]
    paddd mm1, mm0 ; A1 a1
    paddd mm0, mm0
    psubd mm0, mm1 ; A2 a2
    pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
    paddd mm7, mm5 ; B0 b0
    movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
    pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
    paddd mm7, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm7 ; A0-B0 a0-b0
    paddd mm5, mm2 ; B1 b1
    psrad mm7, %7
    psrad mm4, %7
    movq mm2, mm1 ; A1 a1
    paddd mm1, mm5 ; A1+B1 a1+b1
    psubd mm2, mm5 ; A1-B1 a1-b1
    psrad mm1, %7
    psrad mm2, %7
    packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
    movq [%5], mm7
    movq mm1, [blockq + %3] ; R3 R1 r3 r1
    movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
    movq [24 + %5], mm2
    pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
    movq mm7, [coeffs + 88] ; C3 C7 C3 C7
    pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
    movq mm2, mm0 ; A2 a2
    pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd mm4, mm7 ; B2 b2
    paddd mm2, mm4 ; A2+B2 a2+b2
    psubd mm0, mm4 ; A2-B2 a2-b2
    psrad mm2, %7
    psrad mm0, %7
    movq mm4, mm6 ; A3 a3
    paddd mm3, mm1 ; B3 b3
    paddd mm6, mm3 ; A3+B3 a3+b3
    psubd mm4, mm3 ; A3-B3 a3-b3
    psrad mm6, %7
    packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
    movq [8 + %5], mm2
    psrad mm4, %7
    packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
    movq [16 + %5], mm4
%endmacro

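; IDCT1 .. IDCT8 below are column-pass variants specialised for different
; sparsity patterns of the row-pass output. IDCT1 is the full version: %1-%4
; are the four 8-byte row groups of the temporary buffer (rows 0-1, 4-5, 2-3
; and 6-7), %5 the output position inside blockq and %6 the final shift
; (COL_SHIFT). Each call produces two output columns, stored with a 16-byte
; stride (one row of the 8x8 int16 block apart).
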
%macro IDCT1 6
    movq mm0, %1 ; R4 R0 r4 r0
    movq mm1, %2 ; R6 R2 r6 r2
    movq mm2, %3 ; R3 R1 r3 r1
    movq mm3, %4 ; R7 R5 r7 r5
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm5, [coeffs + 32] ; C6 C2 C6 C2
    pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
    movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
    pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    movq mm7, [coeffs + 48] ; C3 C1 C3 C1
    pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
    paddd mm4, mm5 ; A0 a0
    psubd mm6, mm5 ; A3 a3
    movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
    paddd mm0, mm1 ; A1 a1
    psubd mm5, mm1 ; A2 a2
    movq mm1, [coeffs + 56] ; C7 C5 C7 C5
    pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
    pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
    paddd mm7, mm1 ; B0 b0
    movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
    pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
    paddd mm7, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm7 ; A0-B0 a0-b0
    paddd mm1, mm2 ; B1 b1
    psrad mm7, %6
    psrad mm4, %6
    movq mm2, mm0 ; A1 a1
    paddd mm0, mm1 ; A1+B1 a1+b1
    psubd mm2, mm1 ; A1-B1 a1-b1
    psrad mm0, %6
    psrad mm2, %6
    packssdw mm7, mm7 ; A0+B0 a0+b0
    movd [%5], mm7
    packssdw mm0, mm0 ; A1+B1 a1+b1
    movd [16 + %5], mm0
    packssdw mm2, mm2 ; A1-B1 a1-b1
    movd [96 + %5], mm2
    packssdw mm4, mm4 ; A0-B0 a0-b0
    movd [112 + %5], mm4
    movq mm0, %3 ; R3 R1 r3 r1
    movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
    pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
    movq mm7, [coeffs + 88] ; C3 C7 C3 C7
    pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
    movq mm2, mm5 ; A2 a2
    pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd mm4, mm7 ; B2 b2
    paddd mm2, mm4 ; A2+B2 a2+b2
    psubd mm5, mm4 ; A2-B2 a2-b2
    psrad mm2, %6
    psrad mm5, %6
    movq mm4, mm6 ; A3 a3
    paddd mm3, mm0 ; B3 b3
    paddd mm6, mm3 ; A3+B3 a3+b3
    psubd mm4, mm3 ; A3-B3 a3-b3
    psrad mm6, %6
    psrad mm4, %6
    packssdw mm2, mm2 ; A2+B2 a2+b2
    packssdw mm6, mm6 ; A3+B3 a3+b3
    movd [32 + %5], mm2
    packssdw mm4, mm4 ; A3-B3 a3-b3
    packssdw mm5, mm5 ; A2-B2 a2-b2
    movd [48 + %5], mm6
    movd [64 + %5], mm4
    movd [80 + %5], mm5
%endmacro

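; IDCT2: like IDCT1, but %3 (rows 2-3) is known to be zero and is never
; loaded; its products simply drop out of the B terms.
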
%macro IDCT2 6
    movq mm0, %1 ; R4 R0 r4 r0
    movq mm1, %2 ; R6 R2 r6 r2
    movq mm3, %4 ; R7 R5 r7 r5
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm5, [coeffs + 32] ; C6 C2 C6 C2
    pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
    movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
    pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    paddd mm4, mm5 ; A0 a0
    psubd mm6, mm5 ; A3 a3
    movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
    paddd mm0, mm1 ; A1 a1
    psubd mm5, mm1 ; A2 a2
    movq mm1, [coeffs + 56] ; C7 C5 C7 C5
    pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
    movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
    pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
    paddd mm1, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm1 ; A0-B0 a0-b0
    psrad mm1, %6
    psrad mm4, %6
    movq mm2, mm0 ; A1 a1
    paddd mm0, mm7 ; A1+B1 a1+b1
    psubd mm2, mm7 ; A1-B1 a1-b1
    psrad mm0, %6
    psrad mm2, %6
    packssdw mm1, mm1 ; A0+B0 a0+b0
    movd [%5], mm1
    packssdw mm0, mm0 ; A1+B1 a1+b1
    movd [16 + %5], mm0
    packssdw mm2, mm2 ; A1-B1 a1-b1
    movd [96 + %5], mm2
    packssdw mm4, mm4 ; A0-B0 a0-b0
    movd [112 + %5], mm4
    movq mm1, [coeffs + 88] ; C3 C7 C3 C7
    pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
    movq mm2, mm5 ; A2 a2
    pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd mm2, mm1 ; A2+B2 a2+b2
    psubd mm5, mm1 ; A2-B2 a2-b2
    psrad mm2, %6
    psrad mm5, %6
    movq mm1, mm6 ; A3 a3
    paddd mm6, mm3 ; A3+B3 a3+b3
    psubd mm1, mm3 ; A3-B3 a3-b3
    psrad mm6, %6
    psrad mm1, %6
    packssdw mm2, mm2 ; A2+B2 a2+b2
    packssdw mm6, mm6 ; A3+B3 a3+b3
    movd [32 + %5], mm2
    packssdw mm1, mm1 ; A3-B3 a3-b3
    packssdw mm5, mm5 ; A2-B2 a2-b2
    movd [48 + %5], mm6
    movd [64 + %5], mm1
    movd [80 + %5], mm5
%endmacro

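; IDCT3: column pass with %2 and %3 (rows 4-5 and 2-3) known zero; only %1
; and %4 are loaded.
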
%macro IDCT3 6
    movq mm0, %1 ; R4 R0 r4 r0
    movq mm3, %4 ; R7 R5 r7 r5
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm1, [coeffs + 56] ; C7 C5 C7 C5
    pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
    movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
    pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
    paddd mm1, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm1 ; A0-B0 a0-b0
    psrad mm1, %6
    psrad mm4, %6
    movq mm2, mm0 ; A1 a1
    paddd mm0, mm7 ; A1+B1 a1+b1
    psubd mm2, mm7 ; A1-B1 a1-b1
    psrad mm0, %6
    psrad mm2, %6
    packssdw mm1, mm1 ; A0+B0 a0+b0
    movd [%5], mm1
    packssdw mm0, mm0 ; A1+B1 a1+b1
    movd [16 + %5], mm0
    packssdw mm2, mm2 ; A1-B1 a1-b1
    movd [96 + %5], mm2
    packssdw mm4, mm4 ; A0-B0 a0-b0
    movd [112 + %5], mm4
    movq mm1, [coeffs + 88] ; C3 C7 C3 C7
    pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
    movq mm2, mm5 ; A2 a2
    pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd mm2, mm1 ; A2+B2 a2+b2
    psubd mm5, mm1 ; A2-B2 a2-b2
    psrad mm2, %6
    psrad mm5, %6
    movq mm1, mm6 ; A3 a3
    paddd mm6, mm3 ; A3+B3 a3+b3
    psubd mm1, mm3 ; A3-B3 a3-b3
    psrad mm6, %6
    psrad mm1, %6
    packssdw mm2, mm2 ; A2+B2 a2+b2
    packssdw mm6, mm6 ; A3+B3 a3+b3
    movd [32 + %5], mm2
    packssdw mm1, mm1 ; A3-B3 a3-b3
    packssdw mm5, mm5 ; A2-B2 a2-b2
    movd [48 + %5], mm6
    movd [64 + %5], mm1
    movd [80 + %5], mm5
%endmacro

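; IDCT4: column pass with %2 (rows 4-5) known zero; loads %1, %3 and %4.
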
%macro IDCT4 6
    movq mm0, %1 ; R4 R0 r4 r0
    movq mm2, %3 ; R3 R1 r3 r1
    movq mm3, %4 ; R7 R5 r7 r5
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    movq mm7, [coeffs + 48] ; C3 C1 C3 C1
    pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
    movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm1, [coeffs + 56] ; C7 C5 C7 C5
    pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
    pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
    paddd mm7, mm1 ; B0 b0
    movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
    pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
    paddd mm7, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm7 ; A0-B0 a0-b0
    paddd mm1, mm2 ; B1 b1
    psrad mm7, %6
    psrad mm4, %6
    movq mm2, mm0 ; A1 a1
    paddd mm0, mm1 ; A1+B1 a1+b1
    psubd mm2, mm1 ; A1-B1 a1-b1
    psrad mm0, %6
    psrad mm2, %6
    packssdw mm7, mm7 ; A0+B0 a0+b0
    movd [%5], mm7
    packssdw mm0, mm0 ; A1+B1 a1+b1
    movd [16 + %5], mm0
    packssdw mm2, mm2 ; A1-B1 a1-b1
    movd [96 + %5], mm2
    packssdw mm4, mm4 ; A0-B0 a0-b0
    movd [112 + %5], mm4
    movq mm0, %3 ; R3 R1 r3 r1
    movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
    pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
    movq mm7, [coeffs + 88] ; C3 C7 C3 C7
    pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
    movq mm2, mm5 ; A2 a2
    pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd mm4, mm7 ; B2 b2
    paddd mm2, mm4 ; A2+B2 a2+b2
    psubd mm5, mm4 ; A2-B2 a2-b2
    psrad mm2, %6
    psrad mm5, %6
    movq mm4, mm6 ; A3 a3
    paddd mm3, mm0 ; B3 b3
    paddd mm6, mm3 ; A3+B3 a3+b3
    psubd mm4, mm3 ; A3-B3 a3-b3
    psrad mm6, %6
    psrad mm4, %6
    packssdw mm2, mm2 ; A2+B2 a2+b2
    packssdw mm6, mm6 ; A3+B3 a3+b3
    movd [32 + %5], mm2
    packssdw mm4, mm4 ; A3-B3 a3-b3
    packssdw mm5, mm5 ; A2-B2 a2-b2
    movd [48 + %5], mm6
    movd [64 + %5], mm4
    movd [80 + %5], mm5
%endmacro

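; IDCT5: column pass with %2 and %4 (rows 4-5 and 6-7) known zero; only %1
; and %3 are loaded.
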
%macro IDCT5 6
    movq mm0, %1 ; R4 R0 r4 r0
    movq mm2, %3 ; R3 R1 r3 r1
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    movq mm7, [coeffs + 48] ; C3 C1 C3 C1
    pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
    movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm3, [coeffs + 64]
    pmaddwd mm3, mm2 ; -C7R3+C3R1 -C7r3+C3r1
    paddd mm7, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm7 ; A0-B0 a0-b0
    psrad mm7, %6
    psrad mm4, %6
    movq mm1, mm0 ; A1 a1
    paddd mm0, mm3 ; A1+B1 a1+b1
    psubd mm1, mm3 ; A1-B1 a1-b1
    psrad mm0, %6
    psrad mm1, %6
    packssdw mm7, mm7 ; A0+B0 a0+b0
    movd [%5], mm7
    packssdw mm0, mm0 ; A1+B1 a1+b1
    movd [16 + %5], mm0
    packssdw mm1, mm1 ; A1-B1 a1-b1
    movd [96 + %5], mm1
    packssdw mm4, mm4 ; A0-B0 a0-b0
    movd [112 + %5], mm4
    movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
    pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
    movq mm1, mm5 ; A2 a2
    paddd mm1, mm4 ; A2+B2 a2+b2
    psubd mm5, mm4 ; A2-B2 a2-b2
    psrad mm1, %6
    psrad mm5, %6
    movq mm4, mm6 ; A3 a3
    paddd mm6, mm2 ; A3+B3 a3+b3
    psubd mm4, mm2 ; A3-B3 a3-b3
    psrad mm6, %6
    psrad mm4, %6
    packssdw mm1, mm1 ; A2+B2 a2+b2
    packssdw mm6, mm6 ; A3+B3 a3+b3
    movd [32 + %5], mm1
    packssdw mm4, mm4 ; A3-B3 a3-b3
    packssdw mm5, mm5 ; A2-B2 a2-b2
    movd [48 + %5], mm6
    movd [64 + %5], mm4
    movd [80 + %5], mm5
%endmacro

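; IDCT6: column pass with %3 and %4 known zero, i.e. no odd (B) part at all.
; Both 8-byte halves of %1 and %2 are processed in one invocation (four
; columns), and since the output is symmetric without a B part, each packed
; qword is stored to the two output rows that share it.
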
%macro IDCT6 6
    movq mm0, [%1] ; R4 R0 r4 r0
    movq mm1, [%2] ; R6 R2 r6 r2
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm5, [coeffs + 32] ; C6 C2 C6 C2
    pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
    movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
    pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    paddd mm4, mm5 ; A0 a0
    psubd mm6, mm5 ; A3 a3
    movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
    paddd mm0, mm1 ; A1 a1
    psubd mm5, mm1 ; A2 a2
    movq mm2, [8 + %1] ; R4 R0 r4 r0
    movq mm3, [8 + %2] ; R6 R2 r6 r2
    movq mm1, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
    movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm7, [coeffs + 32] ; C6 C2 C6 C2
    pmaddwd mm7, mm3 ; C6R6+C2R2 C6r6+C2r2
    pmaddwd mm3, [coeffs + 40] ; -C2R6+C6R2 -C2r6+C6r2
    paddd mm7, mm1 ; A0 a0
    paddd mm1, mm1 ; 2C0 2c0
    psubd mm1, mm7 ; A3 a3
    paddd mm3, mm2 ; A1 a1
    paddd mm2, mm2 ; 2C1 2c1
    psubd mm2, mm3 ; A2 a2
    psrad mm4, %6
    psrad mm7, %6
    psrad mm3, %6
    packssdw mm4, mm7 ; A0 a0
    movq [%5], mm4
    psrad mm0, %6
    packssdw mm0, mm3 ; A1 a1
    movq [16 + %5], mm0
    movq [96 + %5], mm0
    movq [112 + %5], mm4
    psrad mm5, %6
    psrad mm6, %6
    psrad mm2, %6
    packssdw mm5, mm2 ; A2-B2 a2-b2
    movq [32 + %5], mm5
    psrad mm1, %6
    packssdw mm6, mm1 ; A3+B3 a3+b3
    movq [48 + %5], mm6
    movq [64 + %5], mm6
    movq [80 + %5], mm5
%endmacro

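; IDCT7: column pass with %4 (rows 6-7) known zero; loads %1, %2 and %3.
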
%macro IDCT7 6
    movq mm0, %1 ; R4 R0 r4 r0
    movq mm1, %2 ; R6 R2 r6 r2
    movq mm2, %3 ; R3 R1 r3 r1
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm5, [coeffs + 32] ; C6 C2 C6 C2
    pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
    movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
    pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
    movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
    movq mm7, [coeffs + 48] ; C3 C1 C3 C1
    pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
    paddd mm4, mm5 ; A0 a0
    psubd mm6, mm5 ; A3 a3
    movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
    paddd mm0, mm1 ; A1 a1
    psubd mm5, mm1 ; A2 a2
    movq mm1, [coeffs + 64]
    pmaddwd mm1, mm2 ; -C7R3+C3R1 -C7r3+C3r1
    paddd mm7, mm4 ; A0+B0 a0+b0
    paddd mm4, mm4 ; 2A0 2a0
    psubd mm4, mm7 ; A0-B0 a0-b0
    psrad mm7, %6
    psrad mm4, %6
    movq mm3, mm0 ; A1 a1
    paddd mm0, mm1 ; A1+B1 a1+b1
    psubd mm3, mm1 ; A1-B1 a1-b1
    psrad mm0, %6
    psrad mm3, %6
    packssdw mm7, mm7 ; A0+B0 a0+b0
    movd [%5], mm7
    packssdw mm0, mm0 ; A1+B1 a1+b1
    movd [16 + %5], mm0
    packssdw mm3, mm3 ; A1-B1 a1-b1
    movd [96 + %5], mm3
    packssdw mm4, mm4 ; A0-B0 a0-b0
    movd [112 + %5], mm4
    movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
    pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
    movq mm3, mm5 ; A2 a2
    paddd mm3, mm4 ; A2+B2 a2+b2
    psubd mm5, mm4 ; A2-B2 a2-b2
    psrad mm3, %6
    psrad mm5, %6
    movq mm4, mm6 ; A3 a3
    paddd mm6, mm2 ; A3+B3 a3+b3
    psubd mm4, mm2 ; A3-B3 a3-b3
    psrad mm6, %6
    packssdw mm3, mm3 ; A2+B2 a2+b2
    movd [32 + %5], mm3
    psrad mm4, %6
    packssdw mm6, mm6 ; A3+B3 a3+b3
    movd [48 + %5], mm6
    packssdw mm4, mm4 ; A3-B3 a3-b3
    packssdw mm5, mm5 ; A2-B2 a2-b2
    movd [64 + %5], mm4
    movd [80 + %5], mm5
%endmacro

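; IDCT8: column pass when only %1 (rows 0-1) is present: the result reduces
; to +/-C4 times that group, so each packed qword is stored to every output
; row that shares its value.
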
%macro IDCT8 6
    movq mm0, [%1] ; R4 R0 r4 r0
    movq mm4, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
    movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
    psrad mm4, %6
    psrad mm0, %6
    movq mm2, [8 + %1] ; R4 R0 r4 r0
    movq mm1, [coeffs + 16] ; C4 C4 C4 C4
    pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
    movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
    pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
    movq mm7, [coeffs + 32] ; C6 C2 C6 C2
    psrad mm1, %6
    packssdw mm4, mm1 ; A0 a0
    movq [%5], mm4
    psrad mm2, %6
    packssdw mm0, mm2 ; A1 a1
    movq [16 + %5], mm0
    movq [96 + %5], mm0
    movq [112 + %5], mm4
    movq [32 + %5], mm0
    movq [48 + %5], mm4
    movq [64 + %5], mm4
    movq [80 + %5], mm0
%endmacro

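; IDCT: run the row pass into the 128-byte stack buffer, recording which
; 2-row groups were entirely zero, then jump to the cheapest column-pass
; variant for that sparsity pattern (IDCT1 when every group is present, down
; to IDCT8 when only rows 0-1 are).
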
%macro IDCT 0
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%4
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%2
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1

    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

ALIGN 16
%%4:
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%6
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%5

    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

ALIGN 16
%%6:
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%7

    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

ALIGN 16
%%2:
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%3

    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

ALIGN 16
%%3:

    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

ALIGN 16
%%5:

    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, 20
    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
    jmp %%9

ALIGN 16
%%1:

    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

ALIGN 16
%%7:

    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, 20
    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20

%%9:
%endmacro

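; PUT_PIXELS_CLAMPED_HALF: clamp four rows of the idct result (starting at
; byte offset %1 into blockq) to unsigned 8 bit with packuswb and store them
; to pixelsq with line stride lsizeq (lsize3q must hold lsizeq * 3).
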
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova m0, [blockq+mmsize*0+%1]
    mova m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova m2, [blockq+mmsize*4+%1]
    mova m3, [blockq+mmsize*6+%1]
%endif
    packuswb m0, [blockq+mmsize*1+%1]
    packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb m2, [blockq+mmsize*5+%1]
    packuswb m3, [blockq+mmsize*7+%1]
    movq [pixelsq], m0
    movq [lsizeq+pixelsq], m1
    movq [2*lsizeq+pixelsq], m2
    movq [lsize3q+pixelsq], m3
%else
    movq [pixelsq], m0
    movhps [lsizeq+pixelsq], m0
    movq [2*lsizeq+pixelsq], m1
    movhps [lsize3q+pixelsq], m1
%endif
%endmacro

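; ADD_PIXELS_CLAMPED: add two rows of the idct result (at byte offset %1 into
; blockq) to the existing pixels with signed saturation, then pack back to
; unsigned 8 bit; m4 must be zero (it is used to unpack the pixel bytes to
; words).
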
%macro ADD_PIXELS_CLAMPED 1
    mova m0, [blockq+mmsize*0+%1]
    mova m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    mova m5, [blockq+mmsize*2+%1]
    mova m6, [blockq+mmsize*3+%1]
%endif
    movq m2, [pixelsq]
    movq m3, [pixelsq+lsizeq]
%if mmsize == 8
    mova m7, m2
    punpcklbw m2, m4
    punpckhbw m7, m4
    paddsw m0, m2
    paddsw m1, m7
    mova m7, m3
    punpcklbw m3, m4
    punpckhbw m7, m4
    paddsw m5, m3
    paddsw m6, m7
%else
    punpcklbw m2, m4
    punpcklbw m3, m4
    paddsw m0, m2
    paddsw m1, m3
%endif
    packuswb m0, m1
%if mmsize == 8
    packuswb m5, m6
    movq [pixelsq], m0
    movq [pixelsq+lsizeq], m5
%else
    movq [pixelsq], m0
    movhps [pixelsq+lsizeq], m0
%endif
%endmacro

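; Entry points. simple_idct (MMX) transforms the 8x8 int16 block in place;
; the SSE2 _put and _add variants additionally write the clamped result to a
; pixel buffer with line size lsize. All of them use a 128-byte stack
; temporary for the row-pass output.
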
INIT_MMX mmx

cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
    RET

INIT_XMM sse2

cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
    RET

cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor m4, m4
    ADD_PIXELS_CLAMPED 0
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
    RET
%endif