mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-02-09 14:14:39 +02:00
x86: xvid_idct: merged idct_put SSE2 versions
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
8200575d84
commit
decd5193e1
@ -292,13 +292,13 @@ SECTION .text
|
|||||||
%define TAN3 xmm13
|
%define TAN3 xmm13
|
||||||
%define TAN1 xmm14
|
%define TAN1 xmm14
|
||||||
%else
|
%else
|
||||||
%define ROW0 [r0 + 0*16]
|
%define ROW0 [BLOCK + 0*16]
|
||||||
%define REG0 xmm4
|
%define REG0 xmm4
|
||||||
%define ROW2 [r0 + 2*16]
|
%define ROW2 [BLOCK + 2*16]
|
||||||
%define REG2 xmm4
|
%define REG2 xmm4
|
||||||
%define ROW4 [r0 + 4*16]
|
%define ROW4 [BLOCK + 4*16]
|
||||||
%define REG4 xmm6
|
%define REG4 xmm6
|
||||||
%define ROW6 [r0 + 6*16]
|
%define ROW6 [BLOCK + 6*16]
|
||||||
%define REG6 xmm6
|
%define REG6 xmm6
|
||||||
%define XMMS xmm2
|
%define XMMS xmm2
|
||||||
%define SREG2 xmm7
|
%define SREG2 xmm7
|
||||||
@ -369,8 +369,71 @@ SECTION .text
|
|||||||
movdqa TAN1, [tan1]
|
movdqa TAN1, [tan1]
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
; FIRST_HALF dct, type
; Finishes rows 1, 2, 5 and 6 of the column pass: TAN3, xmm3, REG0 and xmm5
; are each arithmetically shifted right by 6.
;   type == 0 (normal): the four rows are stored as words to [dct + n*16].
;   type == 1 (put):    the rows are packed to unsigned bytes with saturation;
;                       rows 1 and 2 are stored through DEST, while the packed
;                       rows 6/5 stay in xmm5 for SECOND_HALF to store.
;   otherwise:          only "%warning Unimplemented" is emitted.
; For type != 0 this macro also defines DEST and sets r3q = 3*stride.
%macro FIRST_HALF 2 ; %1=dct %2=type(0=normal,1=put,2=add)
|
||||||
|
psraw xmm5, 6
|
||||||
|
psraw REG0, 6
|
||||||
|
psraw TAN3, 6
|
||||||
|
psraw xmm3, 6
|
||||||
|
; type 0 (plain idct): dct coeffs must still be written for AC prediction
|
||||||
|
%if %2 == 0
|
||||||
|
movdqa [%1+1*16], TAN3
|
||||||
|
movdqa [%1+2*16], xmm3
|
||||||
|
movdqa [%1+5*16], REG0
|
||||||
|
movdqa [%1+6*16], xmm5
|
||||||
|
%else
|
||||||
|
; The GPRs that held the row masks are free from here on, so the
|
||||||
|
; remaining arguments can be loaded. DEST names the register that holds dest.
|
||||||
|
%if ARCH_X86_32
|
||||||
|
%xdefine DEST r2q ; BLOCK is r0, stride r1
|
||||||
|
movifnidn DEST, destm
|
||||||
|
movifnidn strideq, stridem
|
||||||
|
%else
|
||||||
|
%xdefine DEST r0q
|
||||||
|
%endif
|
||||||
|
lea r3q, [3*strideq] ; r3q = 3*stride, also used by SECOND_HALF
|
||||||
|
%if %2 == 1
|
||||||
|
packuswb TAN3, xmm3 ; TAN3 = row 1 (low 8 bytes) | row 2 (high 8 bytes)
|
||||||
|
packuswb xmm5, REG0 ; xmm5 = row 6 (low 8 bytes) | row 5 (high 8 bytes)
|
||||||
|
movq [DEST + strideq], TAN3 ; row 1
|
||||||
|
movhps [DEST + 2*strideq], TAN3 ; row 2
|
||||||
|
; REG0 and TAN3 are now available (and likely used in second half);
; xmm5 still holds packed rows 6/5, which SECOND_HALF stores.
|
||||||
|
%else
|
||||||
|
%warning Unimplemented
|
||||||
|
%endif
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; SECOND_HALF dct, type, r0reg, r7reg, r3reg, r4reg
; Finishes rows 0, 3, 4 and 7 of the column pass: %3..%6 are each
; arithmetically shifted right by 6.
;   type == 0 (normal): %3, %5, %6, %4 are stored as words to rows 0, 3, 4, 7
;                       of dct.
;   type == 1 (put):    the rows are packed to unsigned bytes with saturation
;                       and stored through DEST, followed by the packed rows
;                       6/5 that FIRST_HALF left in xmm5. DEST is advanced by
;                       4*stride as a side effect.
;   type == 2:          only "%warning Unimplemented" is emitted.
%macro SECOND_HALF 6 ; %1=dct %2=type(0=normal,1=put,2=add) 3-6: xmms
|
||||||
|
psraw %3, 6
|
||||||
|
psraw %4, 6
|
||||||
|
psraw %5, 6
|
||||||
|
psraw %6, 6
|
||||||
|
; type 0 (plain idct): dct coeffs must still be written for AC prediction
|
||||||
|
%if %2 == 0
|
||||||
|
movdqa [%1+0*16], %3
|
||||||
|
movdqa [%1+3*16], %5
|
||||||
|
movdqa [%1+4*16], %6
|
||||||
|
movdqa [%1+7*16], %4
|
||||||
|
%elif %2 == 1
|
||||||
|
packuswb %3, %5 ; %3 = row 0 (low 8 bytes) | row 3 (high 8 bytes)
|
||||||
|
packuswb %6, %4 ; %6 = row 4 (low 8 bytes) | row 7 (high 8 bytes)
|
||||||
|
; DEST and r3q (= 3*stride) were set up by FIRST_HALF
|
||||||
|
movq [DEST], %3 ; row 0
|
||||||
|
movhps [DEST + r3q], %3 ; row 3
|
||||||
|
lea DEST, [DEST + 4*strideq] ; DEST now points at row 4
|
||||||
|
movq [DEST], %6 ; row 4
|
||||||
|
movhps [DEST + r3q], %6 ; row 7
|
||||||
|
; and now write remainder of first half (rows 6 and 5, packed in xmm5)
|
||||||
|
movq [DEST + 2*strideq], xmm5 ; row 6
|
||||||
|
movhps [DEST + strideq], xmm5 ; row 5
|
||||||
|
%elif %2 == 2
|
||||||
|
%warning Unimplemented
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
|
||||||
; IDCT pass on columns.
|
; IDCT pass on columns.
|
||||||
%macro iLLM_PASS 1 ;dct
|
%macro iLLM_PASS 2 ; %1=dct %2=type(normal,put,add)
|
||||||
movdqa xmm1, TAN3
|
movdqa xmm1, TAN3
|
||||||
movdqa xmm3, TAN1
|
movdqa xmm3, TAN1
|
||||||
pmulhw TAN3, xmm4
|
pmulhw TAN3, xmm4
|
||||||
@ -407,7 +470,7 @@ SECTION .text
|
|||||||
psubsw xmm5, REG6
|
psubsw xmm5, REG6
|
||||||
MOV32 ROW0, REG0
|
MOV32 ROW0, REG0
|
||||||
MOV32 ROW4, REG4
|
MOV32 ROW4, REG4
|
||||||
MOV32 TAN1, [r0]
|
MOV32 TAN1, [BLOCK]
|
||||||
movdqa XMMS, REG0
|
movdqa XMMS, REG0
|
||||||
psubsw REG0, REG4
|
psubsw REG0, REG4
|
||||||
paddsw REG4, XMMS
|
paddsw REG4, XMMS
|
||||||
@ -423,33 +486,22 @@ SECTION .text
|
|||||||
movdqa XMMS, REG0
|
movdqa XMMS, REG0
|
||||||
psubsw REG0, xmm3
|
psubsw REG0, xmm3
|
||||||
paddsw xmm3, XMMS
|
paddsw xmm3, XMMS
|
||||||
MOV32 [r0], TAN1
|
MOV32 [BLOCK], TAN1
|
||||||
psraw xmm5, 6
|
|
||||||
psraw REG0, 6
|
FIRST_HALF %1, %2
|
||||||
psraw TAN3, 6
|
|
||||||
psraw xmm3, 6
|
|
||||||
movdqa [%1+1*16], TAN3
|
|
||||||
movdqa [%1+2*16], xmm3
|
|
||||||
movdqa [%1+5*16], REG0
|
|
||||||
movdqa [%1+6*16], xmm5
|
|
||||||
movdqa xmm0, xmm7
|
movdqa xmm0, xmm7
|
||||||
movdqa xmm4, REG4
|
movdqa xmm4, REG4
|
||||||
psubsw xmm7, xmm1
|
psubsw xmm7, xmm1
|
||||||
psubsw REG4, TAN1
|
psubsw REG4, TAN1
|
||||||
paddsw xmm1, xmm0
|
paddsw xmm1, xmm0
|
||||||
paddsw TAN1, xmm4
|
paddsw TAN1, xmm4
|
||||||
psraw xmm1, 6
|
|
||||||
psraw xmm7, 6
|
SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
|
||||||
psraw TAN1, 6
|
|
||||||
psraw REG4, 6
|
|
||||||
movdqa [%1+0*16], xmm1
|
|
||||||
movdqa [%1+3*16], TAN1
|
|
||||||
movdqa [%1+4*16], REG4
|
|
||||||
movdqa [%1+7*16], xmm7
|
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; IDCT pass on columns, assuming rows 4-7 are zero
|
; IDCT pass on columns, assuming rows 4-7 are zero
|
||||||
%macro iLLM_PASS_SPARSE 1 ;dct
|
%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add)
|
||||||
pmulhw TAN3, xmm4
|
pmulhw TAN3, xmm4
|
||||||
paddsw TAN3, xmm4
|
paddsw TAN3, xmm4
|
||||||
movdqa xmm3, xmm6
|
movdqa xmm3, xmm6
|
||||||
@ -475,7 +527,7 @@ SECTION .text
|
|||||||
movdqa xmm6, REG0
|
movdqa xmm6, REG0
|
||||||
psubsw xmm6, SREG2
|
psubsw xmm6, SREG2
|
||||||
paddsw SREG2, REG0
|
paddsw SREG2, REG0
|
||||||
MOV32 TAN1, [r0]
|
MOV32 TAN1, [BLOCK]
|
||||||
movdqa XMMS, REG0
|
movdqa XMMS, REG0
|
||||||
psubsw REG0, xmm5
|
psubsw REG0, xmm5
|
||||||
paddsw xmm5, XMMS
|
paddsw xmm5, XMMS
|
||||||
@ -485,70 +537,92 @@ SECTION .text
|
|||||||
movdqa XMMS, REG0
|
movdqa XMMS, REG0
|
||||||
psubsw REG0, xmm3
|
psubsw REG0, xmm3
|
||||||
paddsw xmm3, XMMS
|
paddsw xmm3, XMMS
|
||||||
MOV32 [r0], TAN1
|
MOV32 [BLOCK], TAN1
|
||||||
psraw xmm5, 6
|
|
||||||
psraw REG0, 6
|
FIRST_HALF %1, %2
|
||||||
psraw TAN3, 6
|
|
||||||
psraw xmm3, 6
|
|
||||||
movdqa [%1+1*16], TAN3
|
|
||||||
movdqa [%1+2*16], xmm3
|
|
||||||
movdqa [%1+5*16], REG0
|
|
||||||
movdqa [%1+6*16], xmm5
|
|
||||||
movdqa xmm0, SREG2
|
movdqa xmm0, SREG2
|
||||||
movdqa xmm4, xmm6
|
movdqa xmm4, xmm6
|
||||||
psubsw SREG2, xmm1
|
psubsw SREG2, xmm1
|
||||||
psubsw xmm6, TAN1
|
psubsw xmm6, TAN1
|
||||||
paddsw xmm1, xmm0
|
paddsw xmm1, xmm0
|
||||||
paddsw TAN1, xmm4
|
paddsw TAN1, xmm4
|
||||||
psraw xmm1, 6
|
|
||||||
psraw SREG2, 6
|
SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
|
||||||
psraw TAN1, 6
|
|
||||||
psraw xmm6, 6
|
|
||||||
movdqa [%1+0*16], xmm1
|
|
||||||
movdqa [%1+3*16], TAN1
|
|
||||||
movdqa [%1+4*16], xmm6
|
|
||||||
movdqa [%1+7*16], SREG2
|
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_XMM sse2
|
%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
|
||||||
cglobal xvid_idct, 1, 5, 8+7*ARCH_X86_64, block
|
%if %1 == 0 || ARCH_X86_32
|
||||||
|
%define GPR0 r1d
|
||||||
|
%define GPR1 r2d
|
||||||
|
%define GPR2 r3d
|
||||||
|
%define GPR3 r4d
|
||||||
|
%define NUM_GPRS 5
|
||||||
|
%else
|
||||||
|
%define GPR0 r3d
|
||||||
|
%define GPR1 r4d
|
||||||
|
%define GPR2 r5d
|
||||||
|
%define GPR3 r6d
|
||||||
|
%define NUM_GPRS 7
|
||||||
|
%endif
|
||||||
|
%if %1 == 0
|
||||||
|
cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
|
||||||
|
%xdefine BLOCK blockq
|
||||||
|
%else
|
||||||
|
%if %1 == 1
|
||||||
|
cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
|
||||||
|
%else
|
||||||
|
cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
|
||||||
|
%endif
|
||||||
|
%if ARCH_X86_64
|
||||||
|
%xdefine BLOCK blockq
|
||||||
|
%else
|
||||||
|
mov r0q, blockm
|
||||||
|
%xdefine BLOCK r0q
|
||||||
|
%endif
|
||||||
|
%endif
|
||||||
movq mm0, [pb_127]
|
movq mm0, [pb_127]
|
||||||
iMTX_MULT r0 + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
|
iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
|
||||||
iMTX_MULT r0 + 1*16, iTab2, PUT_ODD, ROW1, 1*16
|
iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
|
||||||
iMTX_MULT r0 + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
|
iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
|
||||||
|
|
||||||
TEST_TWO_ROWS r0 + 3*16, r0 + 4*16, r1d, r2d, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
|
TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
|
||||||
JZ r1d, col1
|
JZ GPR0, col1
|
||||||
iMTX_MULT r0 + 3*16, iTab4, PUT_ODD, ROW3, 3*16
|
iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
|
||||||
.col1:
|
.col1:
|
||||||
TEST_TWO_ROWS r0 + 5*16, r0 + 6*16, r1d, r3d, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
|
TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
|
||||||
TEST_ONE_ROW r0 + 7*16, r4d, CLEAR_ODD, ROW7 ; esi
|
TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
|
||||||
|
|
||||||
iLLM_HEAD
|
iLLM_HEAD
|
||||||
JNZ r2d, 2
|
JNZ GPR1, 2
|
||||||
JNZ r1d, 3
|
JNZ GPR0, 3
|
||||||
JNZ r3d, 4
|
JNZ GPR2, 4
|
||||||
JNZ r4d, 5
|
JNZ GPR3, 5
|
||||||
iLLM_PASS_SPARSE r0
|
iLLM_PASS_SPARSE BLOCK, %1
|
||||||
jmp .6
|
jmp .6
|
||||||
.2:
|
.2:
|
||||||
iMTX_MULT r0 + 4*16, iTab1, PUT_EVEN, ROW4
|
iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
|
||||||
.3:
|
.3:
|
||||||
iMTX_MULT r0 + 5*16, iTab4, PUT_ODD, ROW5, 4*16
|
iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
|
||||||
JZ r3d, col2
|
JZ GPR2, col2
|
||||||
.4:
|
.4:
|
||||||
iMTX_MULT r0 + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
|
iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
|
||||||
.col2:
|
.col2:
|
||||||
JZ r4d, col3
|
JZ GPR3, col3
|
||||||
.5:
|
.5:
|
||||||
iMTX_MULT r0 + 7*16, iTab2, PUT_ODD, ROW7, 5*16
|
iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
|
||||||
.col3:
|
.col3:
|
||||||
%if ARCH_X86_32
|
%if ARCH_X86_32
|
||||||
iLLM_HEAD
|
iLLM_HEAD
|
||||||
%endif
|
%endif
|
||||||
iLLM_PASS r0
|
iLLM_PASS BLOCK, %1
|
||||||
.6:
|
.6:
|
||||||
RET
|
RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_XMM sse2
|
||||||
|
IDCT_SSE2 0
|
||||||
|
IDCT_SSE2 1
|
||||||
|
|
||||||
%if ARCH_X86_32
|
%if ARCH_X86_32
|
||||||
|
|
||||||
|
@ -26,11 +26,7 @@
|
|||||||
#include "idctdsp.h"
|
#include "idctdsp.h"
|
||||||
#include "xvididct.h"
|
#include "xvididct.h"
|
||||||
|
|
||||||
static void xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
|
void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
|
||||||
{
|
|
||||||
ff_xvid_idct_sse2(block);
|
|
||||||
ff_put_pixels_clamped(block, dest, line_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
|
static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
|
||||||
{
|
{
|
||||||
@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||||
c->idct_put = xvid_idct_sse2_put;
|
c->idct_put = ff_xvid_idct_put_sse2;
|
||||||
c->idct_add = xvid_idct_sse2_add;
|
c->idct_add = xvid_idct_sse2_add;
|
||||||
c->idct = ff_xvid_idct_sse2;
|
c->idct = ff_xvid_idct_sse2;
|
||||||
c->perm_type = FF_IDCT_PERM_SSE2;
|
c->perm_type = FF_IDCT_PERM_SSE2;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user