mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
Move VP3 IDCT functions from inline ASM to YASM. This fixes part of the VP3/5/6
issues on Win64. Originally committed as revision 24988 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
7e7c4b6008
commit
e9f5f020c6
@ -26,15 +26,12 @@ MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o
|
||||
MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o
|
||||
MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o
|
||||
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
|
||||
MMX-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp_mmx.o \
|
||||
x86/vp3dsp_sse2.o
|
||||
MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp_mmx.o \
|
||||
x86/vp3dsp_sse2.o \
|
||||
x86/vp56dsp_init.o
|
||||
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o
|
||||
MMX-OBJS-$(CONFIG_VP6_DECODER) += x86/vp3dsp_mmx.o \
|
||||
x86/vp3dsp_sse2.o \
|
||||
x86/vp56dsp_init.o
|
||||
YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o
|
||||
YASM-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp.o
|
||||
MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o
|
||||
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp3dsp.o \
|
||||
x86/vp56dsp.o
|
||||
MMX-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o
|
||||
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
|
||||
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o
|
||||
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
|
||||
|
@ -28,8 +28,6 @@
|
||||
#include "libavcodec/mpegvideo.h"
|
||||
#include "libavcodec/simple_idct.h"
|
||||
#include "dsputil_mmx.h"
|
||||
#include "vp3dsp_mmx.h"
|
||||
#include "vp3dsp_sse2.h"
|
||||
#include "idct_xvid.h"
|
||||
|
||||
//#undef NDEBUG
|
||||
@ -2376,6 +2374,19 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
||||
);
|
||||
}
|
||||
|
||||
void ff_vp3_idct_mmx(int16_t *input_data);
|
||||
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
|
||||
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
|
||||
|
||||
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
||||
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
||||
|
||||
void ff_vp3_idct_sse2(int16_t *input_data);
|
||||
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
|
||||
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
|
||||
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
||||
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
||||
|
618
libavcodec/x86/vp3dsp.asm
Normal file
618
libavcodec/x86/vp3dsp.asm
Normal file
@ -0,0 +1,618 @@
|
||||
;******************************************************************************
|
||||
;* MMX/SSE2-optimized functions for the VP3 decoder
|
||||
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
; MMX-optimized functions cribbed from the original VP3 source code.
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
; IDCT cosine table, addressed by the C(x) macros as [vp3_idct_data+16*(x-1)].
; Row k (k = 1..7) holds 65536*cos(k*pi/16) rounded to 16 bits, replicated in
; all 8 words so the same row serves both 8-byte (movq) and 16-byte (movdqa)
; loads. Rows 1..5 exceed 32767, so the signed pmulhw sees them as (c - 65536);
; that is why the IDCT code computes "c*i - i" and then adds i back.
vp3_idct_data: times 8 dw 64277
|
||||
times 8 dw 60547
|
||||
times 8 dw 54491
|
||||
times 8 dw 46341
|
||||
times 8 dw 36410
|
||||
times 8 dw 25080
|
||||
times 8 dw 12785
|
||||
|
||||
cextern pb_1
|
||||
cextern pb_3
|
||||
cextern pb_7
|
||||
cextern pb_1F
|
||||
cextern pb_81
|
||||
|
||||
cextern pw_8
|
||||
|
||||
cextern put_signed_pixels_clamped_mmx
|
||||
cextern add_pixels_clamped_mmx
|
||||
|
||||
SECTION .text
|
||||
|
||||
; VP3_LOOP_FILTER: byte-wise loop filter core, 8 lanes per MMX register.
; Expects r2 = bounding_values pointer (the limit is read from [r2+516]).
; Uses every register m0-m7; pavgb/pminub are used, so this needs MMX2.
; this is off by one or two for some cases when filter_limit is greater than 63
|
||||
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
|
||||
; out: p1 in mm4, p2 in mm3
|
||||
%macro VP3_LOOP_FILTER 0
|
||||
movq m7, m6
|
||||
pand m6, [pb_7] ; p0&7
|
||||
psrlw m7, 3
|
||||
pand m7, [pb_1F] ; p0>>3
|
||||
movq m3, m2 ; p2
|
||||
pxor m2, m4
|
||||
pand m2, [pb_1] ; (p2^p1)&1
|
||||
movq m5, m2
|
||||
paddb m2, m2
|
||||
paddb m2, m5 ; 3*(p2^p1)&1
|
||||
paddb m2, m6 ; extra bits lost in shifts
|
||||
pcmpeqb m0, m0 ; m0 = all ones
|
||||
pxor m1, m0 ; 255 - p3
|
||||
pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
|
||||
pxor m0, m4 ; 255 - p1
|
||||
pavgb m0, m3 ; (256 + p2-p1) >> 1
|
||||
paddb m1, [pb_3]
|
||||
pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
|
||||
pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
|
||||
paddusb m7, m1 ; d+128+1
|
||||
movq m6, [pb_81]
|
||||
psubusb m6, m7 ; m6 = magnitude of d where d is negative, else 0
|
||||
psubusb m7, [pb_81] ; m7 = magnitude of d where d is positive, else 0
|
||||
|
||||
movq m5, [r2+516] ; flim
|
||||
pminub m6, m5 ; next 8 ops: x -> min(2*min(x,flim), flim) - min(x,flim)
|
||||
pminub m7, m5
|
||||
movq m0, m6
|
||||
movq m1, m7
|
||||
paddb m6, m6
|
||||
paddb m7, m7
|
||||
pminub m6, m5
|
||||
pminub m7, m5
|
||||
psubb m6, m0
|
||||
psubb m7, m1
|
||||
paddusb m4, m7 ; p1 += positive part (saturating)
|
||||
psubusb m4, m6 ; p1 -= negative part
|
||||
psubusb m3, m7 ; p2 -= positive part
|
||||
paddusb m3, m6 ; p2 += negative part
|
||||
%endmacro
|
||||
|
||||
; STORE_4_WORDS: writes the four 16-bit words of MMX register %1 to
; [r0-1], [r0+r1-1], [r0+r1*2-1] and [r0+r3-1], lowest word first.
; Clobbers r2 and shifts %1 right by 32 bits.
%macro STORE_4_WORDS 1
|
||||
movd r2, %1
|
||||
mov [r0 -1], r2w ; word 0
|
||||
psrlq %1, 32
|
||||
shr r2, 16
|
||||
mov [r0+r1 -1], r2w ; word 1
|
||||
movd r2, %1
|
||||
mov [r0+r1*2-1], r2w ; word 2
|
||||
shr r2, 16
|
||||
mov [r0+r3 -1], r2w ; word 3
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
; void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
; r0 = src, r1 = stride, r2 = bounding_values, r3 = scratch.
; Filters 8 bytes across the edge between rows src-stride and src; only those
; two rows are written back.
cglobal vp3_v_loop_filter_mmx2, 3, 4
|
||||
%ifdef ARCH_X86_64
|
||||
movsxd r1, r1d ; stride is a 32-bit int; sign-extend for addressing
|
||||
%endif
|
||||
mov r3, r1 ; r3 = +stride
|
||||
neg r1 ; r1 = -stride
|
||||
movq m6, [r0+r1*2] ; p0 = row at src - 2*stride
|
||||
movq m4, [r0+r1 ] ; p1 = row at src - stride
|
||||
movq m2, [r0 ] ; p2 = row at src
|
||||
movq m1, [r0+r3 ] ; p3 = row at src + stride
|
||||
|
||||
VP3_LOOP_FILTER
|
||||
|
||||
movq [r0+r1], m4 ; filtered p1
|
||||
movq [r0 ], m3 ; filtered p2
|
||||
RET
|
||||
|
||||
; void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
; r0 = src, r1 = stride, r2 = bounding_values, r3 = 3*stride.
; Loads 4 bytes starting at src-2 from each of 8 rows, transposes them so the
; four pixel columns land in m6/m4/m2/m1, filters, then writes the two middle
; bytes (offsets -1 and 0) back to each row. r2 is reused as scratch by
; STORE_4_WORDS once the filter has read the limit.
cglobal vp3_h_loop_filter_mmx2, 3, 4
|
||||
%ifdef ARCH_X86_64
|
||||
movsxd r1, r1d ; stride is a 32-bit int; sign-extend for addressing
|
||||
%endif
|
||||
lea r3, [r1*3]
|
||||
|
||||
movd m6, [r0 -2] ; rows 0..3
|
||||
movd m4, [r0+r1 -2]
|
||||
movd m2, [r0+r1*2-2]
|
||||
movd m1, [r0+r3 -2]
|
||||
lea r0, [r0+r1*4 ]
|
||||
punpcklbw m6, [r0 -2] ; interleave with rows 4..7
|
||||
punpcklbw m4, [r0+r1 -2]
|
||||
punpcklbw m2, [r0+r1*2-2]
|
||||
punpcklbw m1, [r0+r3 -2]
|
||||
sub r0, r3
|
||||
sub r0, r1 ; r0 is back at src (4*stride - 3*stride - stride)
|
||||
|
||||
TRANSPOSE4x4B 6, 4, 2, 1, 0
|
||||
VP3_LOOP_FILTER
|
||||
SBUTTERFLY bw, 4, 3, 5 ; pair filtered p1/p2 bytes into words for storing
|
||||
|
||||
STORE_4_WORDS m4
|
||||
lea r0, [r0+r1*4 ]
|
||||
STORE_4_WORDS m3
|
||||
RET
|
||||
|
||||
; BeginIDCT: shared first stage of the MMX 1-D IDCT, run on four words at once.
; Reads I(0..3), J(4..7) and C(1..7); I(1) and I(2) are overwritten as scratch.
; On exit: m0 = C., m1 = H., m2 = R2, m4 = E, m5 = B.., m6 = F., m7 = G,
; and D. is stored at I(2). m3 holds no result.
; from original comments: The Macro does IDct on 4 1-D Dcts
|
||||
%macro BeginIDCT 0
|
||||
movq m2, I(3)
|
||||
movq m6, C(3)
|
||||
movq m4, m2
|
||||
movq m7, J(5)
|
||||
pmulhw m4, m6 ; r4 = c3*i3 - i3
|
||||
movq m1, C(5)
|
||||
pmulhw m6, m7 ; r6 = c3*i5 - i5
|
||||
movq m5, m1
|
||||
pmulhw m1, m2 ; r1 = c5*i3 - i3
|
||||
movq m3, I(1)
|
||||
pmulhw m5, m7 ; r5 = c5*i5 - i5
|
||||
movq m0, C(1)
|
||||
paddw m4, m2 ; r4 = c3*i3
|
||||
paddw m6, m7 ; r6 = c3*i5
|
||||
paddw m2, m1 ; r2 = c5*i3
|
||||
movq m1, J(7)
|
||||
paddw m7, m5 ; r7 = c5*i5
|
||||
movq m5, m0 ; r5 = c1
|
||||
pmulhw m0, m3 ; r0 = c1*i1 - i1
|
||||
paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5
|
||||
pmulhw m5, m1 ; r5 = c1*i7 - i7
|
||||
movq m7, C(7)
|
||||
psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3
|
||||
paddw m0, m3 ; r0 = c1*i1
|
||||
pmulhw m3, m7 ; r3 = c7*i1
|
||||
movq m2, I(2)
|
||||
pmulhw m7, m1 ; r7 = c7*i7
|
||||
paddw m5, m1 ; r5 = c1*i7
|
||||
movq m1, m2 ; r1 = i2
|
||||
pmulhw m2, C(2) ; r2 = c2*i2 - i2
|
||||
psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7
|
||||
movq m5, J(6)
|
||||
paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7
|
||||
movq m7, m5 ; r7 = i6
|
||||
psubsw m0, m4 ; r0 = A - C
|
||||
pmulhw m5, C(2) ; r5 = c2*i6 - i6
|
||||
paddw m2, m1 ; r2 = c2*i2
|
||||
pmulhw m1, C(6) ; r1 = c6*i2
|
||||
paddsw m4, m4 ; r4 = C + C
|
||||
paddsw m4, m0 ; r4 = C. = A + C
|
||||
psubsw m3, m6 ; r3 = B - D
|
||||
paddw m5, m7 ; r5 = c2*i6
|
||||
paddsw m6, m6 ; r6 = D + D
|
||||
pmulhw m7, C(6) ; r7 = c6*i6
|
||||
paddsw m6, m3 ; r6 = D. = B + D
|
||||
movq I(1), m4 ; save C. at I(1)
|
||||
psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6
|
||||
movq m4, C(4)
|
||||
movq m5, m3 ; r5 = B - D
|
||||
pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D)
|
||||
paddsw m7, m2 ; r7 = G = c6*i6 + c2*i2
|
||||
movq I(2), m6 ; save D. at I(2)
|
||||
movq m2, m0 ; r2 = A - C
|
||||
movq m6, I(0)
|
||||
pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C)
|
||||
paddw m5, m3 ; r5 = B. = c4 * (B - D)
|
||||
movq m3, J(4)
|
||||
psubsw m5, m1 ; r5 = B.. = B. - H
|
||||
paddw m2, m0 ; r2 = A. = c4 * (A - C)
|
||||
psubsw m6, m3 ; r6 = i0 - i4
|
||||
movq m0, m6
|
||||
pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4)
|
||||
paddsw m3, m3 ; r3 = i4 + i4
|
||||
paddsw m1, m1 ; r1 = H + H
|
||||
paddsw m3, m0 ; r3 = i0 + i4
|
||||
paddsw m1, m5 ; r1 = H. = B. + H
|
||||
pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4)
|
||||
paddsw m6, m0 ; r6 = F = c4 * (i0 - i4)
|
||||
psubsw m6, m2 ; r6 = F. = F - A.
|
||||
paddsw m2, m2 ; r2 = A. + A.
|
||||
movq m0, I(1) ; r0 = C.
|
||||
paddsw m2, m6 ; r2 = A.. = F + A.
|
||||
paddw m4, m3 ; r4 = E = c4 * (i0 + i4)
|
||||
psubsw m2, m1 ; r2 = R2 = A.. - H.
|
||||
%endmacro
|
||||
|
||||
; RowIDCT: BeginIDCT plus the final butterflies, with no rounding or shift.
; On exit R0 is in m0, R2..R7 are in m2..m7 and R1 is stored at I(1), which is
; the layout the Transpose macro takes as input.
; RowIDCT gets ready to transpose
|
||||
%macro RowIDCT 0
|
||||
BeginIDCT
|
||||
movq m3, I(2) ; r3 = D.
|
||||
psubsw m4, m7 ; r4 = E. = E - G
|
||||
paddsw m1, m1 ; r1 = H. + H.
|
||||
paddsw m7, m7 ; r7 = G + G
|
||||
paddsw m1, m2 ; r1 = R1 = A.. + H.
|
||||
paddsw m7, m4 ; r7 = G. = E + G
|
||||
psubsw m4, m3 ; r4 = R4 = E. - D.
|
||||
paddsw m3, m3 ; r3 = D. + D.
|
||||
psubsw m6, m5 ; r6 = R6 = F. - B..
|
||||
paddsw m5, m5 ; r5 = B.. + B..
|
||||
paddsw m3, m4 ; r3 = R3 = E. + D.
|
||||
paddsw m5, m6 ; r5 = R5 = F. + B..
|
||||
psubsw m7, m0 ; r7 = R7 = G. - C.
|
||||
paddsw m0, m0 ; r0 = C. + C.
|
||||
movq I(1), m1 ; save R1
|
||||
paddsw m0, m7 ; r0 = R0 = G. + C.
|
||||
%endmacro
|
||||
|
||||
; ColumnIDCT: BeginIDCT plus the final butterflies; each output has OC_8 added
; and is arithmetically shifted right by 4 before being stored. Results NR0..NR3
; go to I(0..3) and NR4..NR7 to J(4..7).
; Column IDCT normalizes and stores final results
|
||||
%macro ColumnIDCT 0
|
||||
BeginIDCT
|
||||
paddsw m2, OC_8 ; adjust R2 (and R1) for shift
|
||||
paddsw m1, m1 ; r1 = H. + H.
|
||||
paddsw m1, m2 ; r1 = R1 = A.. + H.
|
||||
psraw m2, 4 ; r2 = NR2
|
||||
psubsw m4, m7 ; r4 = E. = E - G
|
||||
psraw m1, 4 ; r1 = NR1
|
||||
movq m3, I(2) ; r3 = D.
|
||||
paddsw m7, m7 ; r7 = G + G
|
||||
movq I(2), m2 ; store NR2 at I2
|
||||
paddsw m7, m4 ; r7 = G. = E + G
|
||||
movq I(1), m1 ; store NR1 at I1
|
||||
psubsw m4, m3 ; r4 = R4 = E. - D.
|
||||
paddsw m4, OC_8 ; adjust R4 (and R3) for shift
|
||||
paddsw m3, m3 ; r3 = D. + D.
|
||||
paddsw m3, m4 ; r3 = R3 = E. + D.
|
||||
psraw m4, 4 ; r4 = NR4
|
||||
psubsw m6, m5 ; r6 = R6 = F. - B..
|
||||
psraw m3, 4 ; r3 = NR3
|
||||
paddsw m6, OC_8 ; adjust R6 (and R5) for shift
|
||||
paddsw m5, m5 ; r5 = B.. + B..
|
||||
paddsw m5, m6 ; r5 = R5 = F. + B..
|
||||
psraw m6, 4 ; r6 = NR6
|
||||
movq J(4), m4 ; store NR4 at J4
|
||||
psraw m5, 4 ; r5 = NR5
|
||||
movq I(3), m3 ; store NR3 at I3
|
||||
psubsw m7, m0 ; r7 = R7 = G. - C.
|
||||
paddsw m7, OC_8 ; adjust R7 (and R0) for shift
|
||||
paddsw m0, m0 ; r0 = C. + C.
|
||||
paddsw m0, m7 ; r0 = R0 = G. + C.
|
||||
psraw m7, 4 ; r7 = NR7
|
||||
movq J(6), m6 ; store NR6 at J6
|
||||
psraw m0, 4 ; r0 = NR0
|
||||
movq J(5), m5 ; store NR5 at J5
|
||||
movq J(7), m7 ; store NR7 at J7
|
||||
movq I(0), m0 ; store NR0 at I0
|
||||
%endmacro
|
||||
|
||||
; Transpose: takes the register/memory layout left by RowIDCT and writes the
; two transposed 4x4 word blocks to I(0..3) and J(4..7). I(0) doubles as a
; spill slot for r0 while the J half is computed.
; Following macro does two 4x4 transposes in place.
|
||||
;
|
||||
; At entry (we assume):
|
||||
;
|
||||
; r0 = a3 a2 a1 a0
|
||||
; I(1) = b3 b2 b1 b0
|
||||
; r2 = c3 c2 c1 c0
|
||||
; r3 = d3 d2 d1 d0
|
||||
;
|
||||
; r4 = e3 e2 e1 e0
|
||||
; r5 = f3 f2 f1 f0
|
||||
; r6 = g3 g2 g1 g0
|
||||
; r7 = h3 h2 h1 h0
|
||||
;
|
||||
; At exit, we have:
|
||||
;
|
||||
; I(0) = d0 c0 b0 a0
|
||||
; I(1) = d1 c1 b1 a1
|
||||
; I(2) = d2 c2 b2 a2
|
||||
; I(3) = d3 c3 b3 a3
|
||||
;
|
||||
; J(4) = h0 g0 f0 e0
|
||||
; J(5) = h1 g1 f1 e1
|
||||
; J(6) = h2 g2 f2 e2
|
||||
; J(7) = h3 g3 f3 e3
|
||||
;
|
||||
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
|
||||
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
|
||||
;
|
||||
; Since r1 is free at entry, we calculate the Js first.
|
||||
%macro Transpose 0
|
||||
movq m1, m4 ; r1 = e3 e2 e1 e0
|
||||
punpcklwd m4, m5 ; r4 = f1 e1 f0 e0
|
||||
movq I(0), m0 ; save a3 a2 a1 a0
|
||||
punpckhwd m1, m5 ; r1 = f3 e3 f2 e2
|
||||
movq m0, m6 ; r0 = g3 g2 g1 g0
|
||||
punpcklwd m6, m7 ; r6 = h1 g1 h0 g0
|
||||
movq m5, m4 ; r5 = f1 e1 f0 e0
|
||||
punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4
|
||||
punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5
|
||||
movq m6, m1 ; r6 = f3 e3 f2 e2
|
||||
movq J(4), m4
|
||||
punpckhwd m0, m7 ; r0 = h3 g3 h2 g2
|
||||
movq J(5), m5
|
||||
punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7
|
||||
movq m4, I(0) ; r4 = a3 a2 a1 a0
|
||||
punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6
|
||||
movq m5, I(1) ; r5 = b3 b2 b1 b0
|
||||
movq m0, m4 ; r0 = a3 a2 a1 a0
|
||||
movq J(7), m6
|
||||
punpcklwd m0, m5 ; r0 = b1 a1 b0 a0
|
||||
movq J(6), m1
|
||||
punpckhwd m4, m5 ; r4 = b3 a3 b2 a2
|
||||
movq m5, m2 ; r5 = c3 c2 c1 c0
|
||||
punpcklwd m2, m3 ; r2 = d1 c1 d0 c0
|
||||
movq m1, m0 ; r1 = b1 a1 b0 a0
|
||||
punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0
|
||||
punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1
|
||||
movq m2, m4 ; r2 = b3 a3 b2 a2
|
||||
movq I(0), m0
|
||||
punpckhwd m5, m3 ; r5 = d3 c3 d2 c2
|
||||
movq I(1), m1
|
||||
punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3
|
||||
punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2
|
||||
movq I(3), m4
|
||||
movq I(2), m2
|
||||
%endmacro
|
||||
|
||||
; VP3_IDCT_mmx: in-place 8x8 IDCT of the 16-bit block addressed by register %1.
; Runs two RowIDCT+Transpose passes (rows 0-3, then rows 4-7) followed by two
; ColumnIDCT passes (low 8 bytes of every row, then high 8 bytes).
%macro VP3_IDCT_mmx 1
|
||||
; %1 = register pointing at the coefficient block; rows are 16 bytes apart
|
||||
; I(x) / J(x) = memory operands for row x of the current pass (redefined below)
|
||||
; C(x) = x-th cosine constant row of vp3_idct_data
|
||||
; OC_8 = [pw_8], added before the final >>4 in ColumnIDCT
|
||||
; NOTE(review): the eax/ebx/ecx/edx and MaskOffset/CosineOffset notes of the
|
||||
; inline-asm version this was ported from no longer describe this macro
|
||||
; r0..r7 = mm0..mm7
|
||||
%define OC_8 [pw_8]
|
||||
%define C(x) [vp3_idct_data+16*(x-1)]
|
||||
|
||||
; at this point, function has completed dequantization + dezigzag +
|
||||
; partial transposition; now do the idct itself
|
||||
%define I(x) [%1+16* x ]
|
||||
%define J(x) [%1+16*(x-4)+8]
|
||||
RowIDCT
|
||||
Transpose
|
||||
|
||||
%define I(x) [%1+16* x +64]
|
||||
%define J(x) [%1+16*(x-4)+72]
|
||||
RowIDCT
|
||||
Transpose
|
||||
|
||||
%define I(x) [%1+16*x]
|
||||
%define J(x) [%1+16*x]
|
||||
ColumnIDCT
|
||||
|
||||
%define I(x) [%1+16*x+8]
|
||||
%define J(x) [%1+16*x+8]
|
||||
ColumnIDCT
|
||||
%endmacro
|
||||
|
||||
; VP3_1D_IDCT_SSE2: one 1-D IDCT pass over the eight 16-byte rows I(0)..I(7),
; eight words at a time. Results op0..op7 are left in m0..m7; I(1) and I(2) are
; overwritten as scratch. ADD(x) and SHIFT(x) must be defined by the caller and
; are applied to each output (they may expand to nothing).
%macro VP3_1D_IDCT_SSE2 0
|
||||
movdqa m2, I(3) ; xmm2 = i3
|
||||
movdqa m6, C(3) ; xmm6 = c3
|
||||
movdqa m4, m2 ; xmm4 = i3
|
||||
movdqa m7, I(5) ; xmm7 = i5
|
||||
pmulhw m4, m6 ; xmm4 = c3 * i3 - i3
|
||||
movdqa m1, C(5) ; xmm1 = c5
|
||||
pmulhw m6, m7 ; xmm6 = c3 * i5 - i5
|
||||
movdqa m5, m1 ; xmm5 = c5
|
||||
pmulhw m1, m2 ; xmm1 = c5 * i3 - i3
|
||||
movdqa m3, I(1) ; xmm3 = i1
|
||||
pmulhw m5, m7 ; xmm5 = c5 * i5 - i5
|
||||
movdqa m0, C(1) ; xmm0 = c1
|
||||
paddw m4, m2 ; xmm4 = c3 * i3
|
||||
paddw m6, m7 ; xmm6 = c3 * i5
|
||||
paddw m2, m1 ; xmm2 = c5 * i3
|
||||
movdqa m1, I(7) ; xmm1 = i7
|
||||
paddw m7, m5 ; xmm7 = c5 * i5
|
||||
movdqa m5, m0 ; xmm5 = c1
|
||||
pmulhw m0, m3 ; xmm0 = c1 * i1 - i1
|
||||
paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C
|
||||
pmulhw m5, m1 ; xmm5 = c1 * i7 - i7
|
||||
movdqa m7, C(7) ; xmm7 = c7
|
||||
psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D
|
||||
paddw m0, m3 ; xmm0 = c1 * i1
|
||||
pmulhw m3, m7 ; xmm3 = c7 * i1
|
||||
movdqa m2, I(2) ; xmm2 = i2
|
||||
pmulhw m7, m1 ; xmm7 = c7 * i7
|
||||
paddw m5, m1 ; xmm5 = c1 * i7
|
||||
movdqa m1, m2 ; xmm1 = i2
|
||||
pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2
|
||||
psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B
|
||||
movdqa m5, I(6) ; xmm5 = i6
|
||||
paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A
|
||||
movdqa m7, m5 ; xmm7 = i6
|
||||
psubsw m0, m4 ; xmm0 = A - C
|
||||
pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6
|
||||
paddw m2, m1 ; xmm2 = i2 * c2
|
||||
pmulhw m1, C(6) ; xmm1 = c6 * i2
|
||||
paddsw m4, m4 ; xmm4 = C + C
|
||||
paddsw m4, m0 ; xmm4 = A + C = C.
|
||||
psubsw m3, m6 ; xmm3 = B - D
|
||||
paddw m5, m7 ; xmm5 = c2 * i6
|
||||
paddsw m6, m6 ; xmm6 = D + D
|
||||
pmulhw m7, C(6) ; xmm7 = c6 * i6
|
||||
paddsw m6, m3 ; xmm6 = B + D = D.
|
||||
movdqa I(1), m4 ; Save C. at I(1)
|
||||
psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H
|
||||
movdqa m4, C(4) ; xmm4 = C4
|
||||
movdqa m5, m3 ; xmm5 = B - D
|
||||
pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D )
|
||||
paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G
|
||||
movdqa I(2), m6 ; save D. at I(2)
|
||||
movdqa m2, m0 ; xmm2 = A - C
|
||||
movdqa m6, I(0) ; xmm6 = i0
|
||||
pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C )
|
||||
paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B.
|
||||
movdqa m3, I(4) ; xmm3 = i4
|
||||
psubsw m5, m1 ; xmm5 = B. - H = B..
|
||||
paddw m2, m0 ; xmm2 = c4 * ( A - C) = A.
|
||||
psubsw m6, m3 ; xmm6 = i0 - i4
|
||||
movdqa m0, m6 ; xmm0 = i0 - i4
|
||||
pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4)
|
||||
paddsw m3, m3 ; xmm3 = i4 + i4
|
||||
paddsw m1, m1 ; xmm1 = H + H
|
||||
paddsw m3, m0 ; xmm3 = i0 + i4
|
||||
paddsw m1, m5 ; xmm1 = B. + H = H.
|
||||
pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
|
||||
paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 ) = F
|
||||
psubsw m6, m2 ; xmm6 = F - A. = F.
|
||||
paddsw m2, m2 ; xmm2 = A. + A.
|
||||
movdqa m0, I(1) ; Load C. from I(1)
|
||||
paddsw m2, m6 ; xmm2 = F + A. = A..
|
||||
paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E
|
||||
psubsw m2, m1 ; xmm2 = A.. - H. = R2
|
||||
ADD(m2) ; Adjust R2 and R1 before shifting
|
||||
paddsw m1, m1 ; xmm1 = H. + H.
|
||||
paddsw m1, m2 ; xmm1 = A.. + H. = R1
|
||||
SHIFT(m2) ; xmm2 = op2
|
||||
psubsw m4, m7 ; xmm4 = E - G = E.
|
||||
SHIFT(m1) ; xmm1 = op1
|
||||
movdqa m3, I(2) ; Load D. from I(2)
|
||||
paddsw m7, m7 ; xmm7 = G + G
|
||||
paddsw m7, m4 ; xmm7 = E + G = G.
|
||||
psubsw m4, m3 ; xmm4 = E. - D. = R4
|
||||
ADD(m4) ; Adjust R4 and R3 before shifting
|
||||
paddsw m3, m3 ; xmm3 = D. + D.
|
||||
paddsw m3, m4 ; xmm3 = E. + D. = R3
|
||||
SHIFT(m4) ; xmm4 = op4
|
||||
psubsw m6, m5 ; xmm6 = F. - B..= R6
|
||||
SHIFT(m3) ; xmm3 = op3
|
||||
ADD(m6) ; Adjust R6 and R5 before shifting
|
||||
paddsw m5, m5 ; xmm5 = B.. + B..
|
||||
paddsw m5, m6 ; xmm5 = F. + B.. = R5
|
||||
SHIFT(m6) ; xmm6 = op6
|
||||
SHIFT(m5) ; xmm5 = op5
|
||||
psubsw m7, m0 ; xmm7 = G. - C. = R7
|
||||
ADD(m7) ; Adjust R7 and R0 before shifting
|
||||
paddsw m0, m0 ; xmm0 = C. + C.
|
||||
paddsw m0, m7 ; xmm0 = G. + C.
|
||||
SHIFT(m7) ; xmm7 = op7
|
||||
SHIFT(m0) ; xmm0 = op0
|
||||
%endmacro
|
||||
|
||||
; PUT_BLOCK: stores the eight registers m%1..m%8 to rows O(0)..O(7) with
; aligned 16-byte stores. O(x) must be defined by the caller.
%macro PUT_BLOCK 8
|
||||
movdqa O(0), m%1
|
||||
movdqa O(1), m%2
|
||||
movdqa O(2), m%3
|
||||
movdqa O(3), m%4
|
||||
movdqa O(4), m%5
|
||||
movdqa O(5), m%6
|
||||
movdqa O(6), m%7
|
||||
movdqa O(7), m%8
|
||||
%endmacro
|
||||
|
||||
; VP3_IDCT_sse2: in-place 8x8 IDCT of the 16-bit block addressed by register %1.
; Pass 1 runs the 1-D IDCT with ADD/SHIFT empty, transposes and stores; pass 2
; runs it again with +[pw_8] and >>4 applied to every output, then stores.
; The block is accessed with movdqa, so %1 must be 16-byte aligned.
%macro VP3_IDCT_sse2 1
|
||||
%define I(x) [%1+16*x]
|
||||
%define O(x) [%1+16*x]
|
||||
%define C(x) [vp3_idct_data+16*(x-1)]
|
||||
%define SHIFT(x)
|
||||
%define ADD(x)
|
||||
VP3_1D_IDCT_SSE2
|
||||
%ifdef ARCH_X86_64
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 ; m8 presumably serves as the scratch register
|
||||
%else
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] ; no m8 here: block rows 0/1 presumably serve as scratch
|
||||
%endif
|
||||
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
%define SHIFT(x) psraw x, 4
|
||||
%define ADD(x) paddsw x, [pw_8]
|
||||
VP3_1D_IDCT_SSE2
|
||||
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
|
||||
%endmacro
|
||||
|
||||
; vp3_idct_funcs: emits vp3_idct_%1, vp3_idct_put_%1 and vp3_idct_add_%1.
; %1 = implementation suffix, selects the VP3_IDCT_%1 macro (mmx or sse2)
; %2 = value passed as the 4th cglobal argument (presumably the XMM register count)
; %3 = value passed as the 3rd cglobal argument of put/add (presumably the GPR count)
; put/add take (dest, line_size, block) in r0, r1, r2. They transform the block
; in place, then hand (block, dest, line_size) to the external helper in that
; order: in registers on x86-64, in the argument stack slots otherwise.
%macro vp3_idct_funcs 3
|
||||
cglobal vp3_idct_%1, 1, 1, %2
|
||||
VP3_IDCT_%1 r0
|
||||
RET
|
||||
|
||||
cglobal vp3_idct_put_%1, 3, %3, %2
|
||||
VP3_IDCT_%1 r2
|
||||
%ifdef ARCH_X86_64
|
||||
mov r3, r2 ; rotate args via r3: (dest, line_size, block) -> (block, dest, line_size)
|
||||
mov r2, r1
|
||||
mov r1, r0
|
||||
mov r0, r3
|
||||
%else
|
||||
mov r0m, r2 ; same reordering, written to the stack argument slots
|
||||
mov r1m, r0
|
||||
mov r2m, r1
|
||||
%endif
|
||||
%ifdef WIN64
|
||||
call put_signed_pixels_clamped_mmx ; NOTE(review): presumably a call so RET can undo the cglobal prologue - confirm
|
||||
RET
|
||||
%else
|
||||
jmp put_signed_pixels_clamped_mmx ; tail call
|
||||
%endif
|
||||
|
||||
cglobal vp3_idct_add_%1, 3, %3, %2
|
||||
VP3_IDCT_%1 r2
|
||||
%ifdef ARCH_X86_64
|
||||
mov r3, r2 ; rotate args via r3: (dest, line_size, block) -> (block, dest, line_size)
|
||||
mov r2, r1
|
||||
mov r1, r0
|
||||
mov r0, r3
|
||||
%else
|
||||
mov r0m, r2 ; same reordering, written to the stack argument slots
|
||||
mov r1m, r0
|
||||
mov r2m, r1
|
||||
%endif
|
||||
%ifdef WIN64
|
||||
call add_pixels_clamped_mmx ; NOTE(review): presumably a call so RET can undo the cglobal prologue - confirm
|
||||
RET
|
||||
%else
|
||||
jmp add_pixels_clamped_mmx ; tail call
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Instantiate the IDCT entry points for MMX and SSE2.
; REGS is 4 on x86-64 because the put/add wrappers use r3 as a temporary while
; reordering arguments; the 32-bit path only touches r0-r2.
; The SSE2 variant passes 9 because its x86-64 transpose also names register 8.
%ifdef ARCH_X86_64
|
||||
%define REGS 4
|
||||
%else
|
||||
%define REGS 3
|
||||
%endif
|
||||
INIT_MMX
|
||||
vp3_idct_funcs mmx, 0, REGS
|
||||
INIT_XMM
|
||||
vp3_idct_funcs sse2, 9, REGS
|
||||
%undef REGS
|
||||
|
||||
; DC_ADD: applies a DC offset to four 8-byte rows at r0, r0+r1, r0+r1*2, r0+r3.
; Each row gets m0 added and then m1 subtracted, both with unsigned byte
; saturation. Clobbers m2-m5.
%macro DC_ADD 0
|
||||
movq m2, [r0 ]
|
||||
movq m3, [r0+r1 ]
|
||||
paddusb m2, m0
|
||||
movq m4, [r0+r1*2]
|
||||
paddusb m3, m0
|
||||
movq m5, [r0+r3 ]
|
||||
paddusb m4, m0
|
||||
paddusb m5, m0
|
||||
psubusb m2, m1
|
||||
psubusb m3, m1
|
||||
movq [r0 ], m2
|
||||
psubusb m4, m1
|
||||
movq [r0+r1 ], m3
|
||||
psubusb m5, m1
|
||||
movq [r0+r1*2], m4
|
||||
movq [r0+r3 ], m5
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
; void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block)
; r0 = dest, r1 = line_size, r2 = block (then reused for the DC value),
; r3 = 3*line_size.
; Computes dc = (block[0] + 15) >> 5 and adds it, with saturation, to the 8x8
; byte block at dest.
cglobal vp3_idct_dc_add_mmx2, 3, 4
|
||||
%ifdef ARCH_X86_64
|
||||
movsxd r1, r1d ; line_size is a 32-bit int; sign-extend for addressing
|
||||
%endif
|
||||
lea r3, [r1*3]
|
||||
movsx r2, word [r2] ; r2 = block[0], sign-extended
|
||||
add r2, 15
|
||||
sar r2, 5 ; r2 = dc
|
||||
movd m0, r2
|
||||
pshufw m0, m0, 0x0 ; broadcast dc to all four words
|
||||
pxor m1, m1
|
||||
psubw m1, m0 ; m1 = -dc in every word
|
||||
packuswb m0, m0 ; m0 bytes = dc clamped to 0..255 (0 if dc is negative)
|
||||
packuswb m1, m1 ; m1 bytes = -dc clamped to 0..255 (0 if dc is positive)
|
||||
DC_ADD
|
||||
lea r0, [r0+r1*4] ; advance to rows 4..7
|
||||
DC_ADD
|
||||
RET
|
@ -1,436 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2004 the ffmpeg project
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* MMX-optimized functions cribbed from the original VP3 source code.
|
||||
*/
|
||||
|
||||
#include "libavutil/x86_cpu.h"
|
||||
#include "libavcodec/dsputil.h"
|
||||
#include "dsputil_mmx.h"
|
||||
#include "vp3dsp_mmx.h"
|
||||
|
||||
extern const uint16_t ff_vp3_idct_data[];
|
||||
|
||||
// this is off by one or two for some cases when filter_limit is greater than 63
|
||||
// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
|
||||
// out: p1 in mm4, p2 in mm3
|
||||
#define VP3_LOOP_FILTER(flim) \
|
||||
"movq %%mm6, %%mm7 \n\t" \
|
||||
"pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
|
||||
"psrlw $3, %%mm7 \n\t" \
|
||||
"pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
|
||||
"movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
|
||||
"pxor %%mm4, %%mm2 \n\t" \
|
||||
"pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
|
||||
"movq %%mm2, %%mm5 \n\t" \
|
||||
"paddb %%mm2, %%mm2 \n\t" \
|
||||
"paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
|
||||
"paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
|
||||
"pcmpeqb %%mm0, %%mm0 \n\t" \
|
||||
"pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
|
||||
"pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
|
||||
"pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
|
||||
"pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
|
||||
"paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
|
||||
"pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \
|
||||
"pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
|
||||
"paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \
|
||||
"movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \
|
||||
"psubusb %%mm7, %%mm6 \n\t" \
|
||||
"psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \
|
||||
\
|
||||
"movq "#flim", %%mm5 \n\t" \
|
||||
"pminub %%mm5, %%mm6 \n\t" \
|
||||
"pminub %%mm5, %%mm7 \n\t" \
|
||||
"movq %%mm6, %%mm0 \n\t" \
|
||||
"movq %%mm7, %%mm1 \n\t" \
|
||||
"paddb %%mm6, %%mm6 \n\t" \
|
||||
"paddb %%mm7, %%mm7 \n\t" \
|
||||
"pminub %%mm5, %%mm6 \n\t" \
|
||||
"pminub %%mm5, %%mm7 \n\t" \
|
||||
"psubb %%mm0, %%mm6 \n\t" \
|
||||
"psubb %%mm1, %%mm7 \n\t" \
|
||||
"paddusb %%mm7, %%mm4 \n\t" \
|
||||
"psubusb %%mm6, %%mm4 \n\t" \
|
||||
"psubusb %%mm7, %%mm3 \n\t" \
|
||||
"paddusb %%mm6, %%mm3 \n\t"
|
||||
|
||||
#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
|
||||
"movd "#mm", %0 \n\t" \
|
||||
"movw %w0, -1"#dst0" \n\t" \
|
||||
"psrlq $32, "#mm" \n\t" \
|
||||
"shr $16, %0 \n\t" \
|
||||
"movw %w0, -1"#dst1" \n\t" \
|
||||
"movd "#mm", %0 \n\t" \
|
||||
"movw %w0, -1"#dst2" \n\t" \
|
||||
"shr $16, %0 \n\t" \
|
||||
"movw %w0, -1"#dst3" \n\t"
|
||||
|
||||
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"movq %0, %%mm6 \n\t"
|
||||
"movq %1, %%mm4 \n\t"
|
||||
"movq %2, %%mm2 \n\t"
|
||||
"movq %3, %%mm1 \n\t"
|
||||
|
||||
VP3_LOOP_FILTER(%4)
|
||||
|
||||
"movq %%mm4, %1 \n\t"
|
||||
"movq %%mm3, %2 \n\t"
|
||||
|
||||
: "+m" (*(uint64_t*)(src - 2*stride)),
|
||||
"+m" (*(uint64_t*)(src - 1*stride)),
|
||||
"+m" (*(uint64_t*)(src + 0*stride)),
|
||||
"+m" (*(uint64_t*)(src + 1*stride))
|
||||
: "m"(*(uint64_t*)(bounding_values+129))
|
||||
);
|
||||
}
|
||||
|
||||
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
|
||||
{
|
||||
x86_reg tmp;
|
||||
|
||||
__asm__ volatile(
|
||||
"movd -2(%1), %%mm6 \n\t"
|
||||
"movd -2(%1,%3), %%mm0 \n\t"
|
||||
"movd -2(%1,%3,2), %%mm1 \n\t"
|
||||
"movd -2(%1,%4), %%mm4 \n\t"
|
||||
|
||||
TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
|
||||
VP3_LOOP_FILTER(%5)
|
||||
SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q)
|
||||
|
||||
STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
|
||||
STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)
|
||||
|
||||
: "=&r"(tmp)
|
||||
: "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
|
||||
"m"(*(uint64_t*)(bounding_values+129))
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/* from original comments: The Macro does IDct on 4 1-D Dcts */
|
||||
#define BeginIDCT() \
|
||||
"movq "I(3)", %%mm2 \n\t" \
|
||||
"movq "C(3)", %%mm6 \n\t" \
|
||||
"movq %%mm2, %%mm4 \n\t" \
|
||||
"movq "J(5)", %%mm7 \n\t" \
|
||||
"pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \
|
||||
"movq "C(5)", %%mm1 \n\t" \
|
||||
"pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \
|
||||
"movq %%mm1, %%mm5 \n\t" \
|
||||
"pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \
|
||||
"movq "I(1)", %%mm3 \n\t" \
|
||||
"pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \
|
||||
"movq "C(1)", %%mm0 \n\t" \
|
||||
"paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \
|
||||
"paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \
|
||||
"paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \
|
||||
"movq "J(7)", %%mm1 \n\t" \
|
||||
"paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \
|
||||
"movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \
|
||||
"pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \
|
||||
"paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \
|
||||
"pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \
|
||||
"movq "C(7)", %%mm7 \n\t" \
|
||||
"psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \
|
||||
"paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \
|
||||
"pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \
|
||||
"movq "I(2)", %%mm2 \n\t" \
|
||||
"pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \
|
||||
"paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \
|
||||
"movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \
|
||||
"pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \
|
||||
"psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \
|
||||
"movq "J(6)", %%mm5 \n\t" \
|
||||
"paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \
|
||||
"movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \
|
||||
"psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \
|
||||
"pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \
|
||||
"paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \
|
||||
"pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \
|
||||
"paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \
|
||||
"paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \
|
||||
"psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \
|
||||
"paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \
|
||||
"paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \
|
||||
"pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \
|
||||
"paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \
|
||||
"movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \
|
||||
"psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \
|
||||
"movq "C(4)", %%mm4 \n\t" \
|
||||
"movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \
|
||||
"pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
|
||||
"paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
|
||||
"movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \
|
||||
"movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \
|
||||
"movq "I(0)", %%mm6 \n\t" \
|
||||
"pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \
|
||||
"paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \
|
||||
"movq "J(4)", %%mm3 \n\t" \
|
||||
"psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \
|
||||
"paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \
|
||||
"psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \
|
||||
"movq %%mm6, %%mm0 \n\t" \
|
||||
"pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \
|
||||
"paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \
|
||||
"paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \
|
||||
"paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \
|
||||
"paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \
|
||||
"pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \
|
||||
"paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \
|
||||
"psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \
|
||||
"paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \
|
||||
"movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \
|
||||
"paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \
|
||||
"paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \
|
||||
"psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */
|
||||
|
||||
/* RowIDCT gets ready to transpose */
|
||||
#define RowIDCT() \
|
||||
BeginIDCT() \
|
||||
"movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
|
||||
"psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
|
||||
"paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
|
||||
"paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
|
||||
"paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
|
||||
"paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \
|
||||
"psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
|
||||
"paddsw %%mm3, %%mm3 \n\t" \
|
||||
"psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
|
||||
"paddsw %%mm5, %%mm5 \n\t" \
|
||||
"paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
|
||||
"paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
|
||||
"psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
|
||||
"paddsw %%mm0, %%mm0 \n\t" \
|
||||
"movq %%mm1, "I(1)"\n\t" /* save R1 */ \
|
||||
"paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */
|
||||
|
||||
/* Column IDCT normalizes and stores final results */
|
||||
#define ColumnIDCT() /* finish the 1-D IDCT started by BeginIDCT(), add OC_8 to each result pair, shift right by 4 and store all eight results through I()/J() */ \
|
||||
BeginIDCT() \
|
||||
"paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \
|
||||
"paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
|
||||
"paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
|
||||
"psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \
|
||||
"psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
|
||||
"psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \
|
||||
"movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
|
||||
"paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
|
||||
"movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \
|
||||
"paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \
|
||||
"movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \
|
||||
"psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
|
||||
"paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \
|
||||
"paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \
|
||||
"paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
|
||||
"psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \
|
||||
"psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
|
||||
"psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \
|
||||
"paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \
|
||||
"paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \
|
||||
"paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
|
||||
"psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \
|
||||
"movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \
|
||||
"psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \
|
||||
"movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \
|
||||
"psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
|
||||
"paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \
|
||||
"paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \
|
||||
"paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \
|
||||
"psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \
|
||||
"movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \
|
||||
"psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \
|
||||
"movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \
|
||||
"movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \
|
||||
"movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */
|
||||
|
||||
/* Following macro does two 4x4 transposes in place.
|
||||
|
||||
At entry (we assume):
|
||||
|
||||
r0 = a3 a2 a1 a0
|
||||
I(1) = b3 b2 b1 b0
|
||||
r2 = c3 c2 c1 c0
|
||||
r3 = d3 d2 d1 d0
|
||||
|
||||
r4 = e3 e2 e1 e0
|
||||
r5 = f3 f2 f1 f0
|
||||
r6 = g3 g2 g1 g0
|
||||
r7 = h3 h2 h1 h0
|
||||
|
||||
At exit, we have:
|
||||
|
||||
I(0) = d0 c0 b0 a0
|
||||
I(1) = d1 c1 b1 a1
|
||||
I(2) = d2 c2 b2 a2
|
||||
I(3) = d3 c3 b3 a3
|
||||
|
||||
J(4) = h0 g0 f0 e0
|
||||
J(5) = h1 g1 f1 e1
|
||||
J(6) = h2 g2 f2 e2
|
||||
J(7) = h3 g3 f3 e3
|
||||
|
||||
I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
|
||||
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
|
||||
|
||||
Since r1 is free at entry, we calculate the Js first. */
|
||||
#define Transpose() /* two 4x4 word transposes: r4..r7 -> J(4..7) first, then r0, I(1), r2, r3 -> I(0..3); I(0) doubles as a spill slot for r0 */ \
|
||||
"movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \
|
||||
"punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \
|
||||
"movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \
|
||||
"punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \
|
||||
"movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \
|
||||
"punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \
|
||||
"movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \
|
||||
"punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \
|
||||
"punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \
|
||||
"movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \
|
||||
"movq %%mm4, "J(4)"\n\t" /* store R4 at J4 */ \
|
||||
"punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \
|
||||
"movq %%mm5, "J(5)"\n\t" /* store R5 at J5 */ \
|
||||
"punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \
|
||||
"movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \
|
||||
"punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \
|
||||
"movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \
|
||||
"movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \
|
||||
"movq %%mm6, "J(7)"\n\t" /* store R7 at J7 */ \
|
||||
"punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \
|
||||
"movq %%mm1, "J(6)"\n\t" /* store R6 at J6 */ \
|
||||
"punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \
|
||||
"movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \
|
||||
"punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \
|
||||
"movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \
|
||||
"punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \
|
||||
"punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \
|
||||
"movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \
|
||||
"movq %%mm0, "I(0)"\n\t" /* store R0 at I0 */ \
|
||||
"punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \
|
||||
"movq %%mm1, "I(1)"\n\t" /* store R1 at I1 */ \
|
||||
"punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \
|
||||
"punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \
|
||||
"movq %%mm4, "I(3)"\n\t" /* store R3 at I3 */ \
|
||||
"movq %%mm2, "I(2)"\n\t" /* store R2 at I2 */
|
||||
|
||||
void ff_vp3_idct_mmx(int16_t *output_data)
|
||||
{
|
||||
/* eax = quantized input
|
||||
* ebx = dequantizer matrix
|
||||
* ecx = IDCT constants
|
||||
* M(I) = ecx + MaskOffset(0) + I * 8
|
||||
* C(I) = ecx + CosineOffset(32) + (I-1) * 8
|
||||
* edx = output
|
||||
* r0..r7 = mm0..mm7
|
||||
*/
|
||||
|
||||
#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
|
||||
#define OC_8 "%2"
|
||||
|
||||
/* at this point, function has completed dequantization + dezigzag +
|
||||
* partial transposition; now do the idct itself */
|
||||
#define I(x) AV_STRINGIFY(16* x )"(%0)"
|
||||
#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
|
||||
|
||||
__asm__ volatile (
|
||||
RowIDCT()
|
||||
Transpose()
|
||||
|
||||
#undef I
|
||||
#undef J
|
||||
#define I(x) AV_STRINGIFY(16* x + 64)"(%0)"
|
||||
#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"
|
||||
|
||||
RowIDCT()
|
||||
Transpose()
|
||||
|
||||
#undef I
|
||||
#undef J
|
||||
#define I(x) AV_STRINGIFY(16*x)"(%0)"
|
||||
#define J(x) AV_STRINGIFY(16*x)"(%0)"
|
||||
|
||||
ColumnIDCT()
|
||||
|
||||
#undef I
|
||||
#undef J
|
||||
#define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
|
||||
#define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
|
||||
|
||||
ColumnIDCT()
|
||||
:: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
|
||||
);
|
||||
#undef I
|
||||
#undef J
|
||||
|
||||
}
|
||||
|
||||
/* Run the in-place MMX IDCT on block, then pass block, dest and line_size
 * to ff_put_signed_pixels_clamped_mmx() to write the result to dest. */
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
|
||||
{
|
||||
ff_vp3_idct_mmx(block);
|
||||
ff_put_signed_pixels_clamped_mmx(block, dest, line_size);
|
||||
}
|
||||
|
||||
/* Run the in-place MMX IDCT on block, then pass block, dest and line_size
 * to ff_add_pixels_clamped_mmx() to combine the result with dest. */
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
|
||||
{
|
||||
ff_vp3_idct_mmx(block);
|
||||
ff_add_pixels_clamped_mmx(block, dest, line_size);
|
||||
}
|
||||
|
||||
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
|
||||
{
|
||||
int dc = (block[0] + 15) >> 5;
|
||||
|
||||
__asm__ volatile(
|
||||
"movd %3, %%mm0 \n\t"
|
||||
"pshufw $0, %%mm0, %%mm0 \n\t"
|
||||
"pxor %%mm1, %%mm1 \n\t"
|
||||
"psubw %%mm0, %%mm1 \n\t"
|
||||
"packuswb %%mm0, %%mm0 \n\t"
|
||||
"packuswb %%mm1, %%mm1 \n\t"
|
||||
|
||||
#define DC_ADD \
|
||||
"movq (%0), %%mm2 \n\t" \
|
||||
"movq (%0,%1), %%mm3 \n\t" \
|
||||
"paddusb %%mm0, %%mm2 \n\t" \
|
||||
"movq (%0,%1,2), %%mm4 \n\t" \
|
||||
"paddusb %%mm0, %%mm3 \n\t" \
|
||||
"movq (%0,%2), %%mm5 \n\t" \
|
||||
"paddusb %%mm0, %%mm4 \n\t" \
|
||||
"paddusb %%mm0, %%mm5 \n\t" \
|
||||
"psubusb %%mm1, %%mm2 \n\t" \
|
||||
"psubusb %%mm1, %%mm3 \n\t" \
|
||||
"movq %%mm2, (%0) \n\t" \
|
||||
"psubusb %%mm1, %%mm4 \n\t" \
|
||||
"movq %%mm3, (%0,%1) \n\t" \
|
||||
"psubusb %%mm1, %%mm5 \n\t" \
|
||||
"movq %%mm4, (%0,%1,2) \n\t" \
|
||||
"movq %%mm5, (%0,%2) \n\t"
|
||||
|
||||
DC_ADD
|
||||
"lea (%0,%1,4), %0 \n\t"
|
||||
DC_ADD
|
||||
|
||||
: "+r"(dest)
|
||||
: "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
|
||||
);
|
||||
}
|
@ -1,36 +0,0 @@
|
||||
/*
|
||||
* vp3dsp MMX function declarations
|
||||
* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_X86_VP3DSP_MMX_H
|
||||
#define AVCODEC_X86_VP3DSP_MMX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "libavcodec/dsputil.h"
|
||||
|
||||
void ff_vp3_idct_mmx(int16_t *data);
|
||||
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
|
||||
|
||||
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
||||
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
||||
|
||||
#endif /* AVCODEC_X86_VP3DSP_MMX_H */
|
@ -1,187 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2004 the ffmpeg project
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* SSE2-optimized functions cribbed from the original VP3 source code.
|
||||
*/
|
||||
|
||||
#include "libavcodec/dsputil.h"
|
||||
#include "dsputil_mmx.h"
|
||||
#include "vp3dsp_sse2.h"
|
||||
|
||||
/* Cosine table for the VP3 IDCT. Row i-1 (i = 1..7) holds
 * round(65536 * cos(i * pi / 16)) replicated across eight 16-bit lanes, so
 * one 16-byte row can be used directly as an SSE2 operand; the IDCT
 * functions address row i-1 as C(i) = 16*(i-1)(%1).
 * The first five values are >= 32768 and therefore negative when pmulhw
 * reads them as signed words; that is why the IDCT macros annotate those
 * products as "c * i - i" and add the input back with paddw afterwards. */
DECLARE_ALIGNED(16, const uint16_t, ff_vp3_idct_data)[7 * 8] =
|
||||
{
|
||||
64277,64277,64277,64277,64277,64277,64277,64277,
|
||||
60547,60547,60547,60547,60547,60547,60547,60547,
|
||||
54491,54491,54491,54491,54491,54491,54491,54491,
|
||||
46341,46341,46341,46341,46341,46341,46341,46341,
|
||||
36410,36410,36410,36410,36410,36410,36410,36410,
|
||||
25080,25080,25080,25080,25080,25080,25080,25080,
|
||||
12785,12785,12785,12785,12785,12785,12785,12785
|
||||
};
|
||||
|
||||
|
||||
#define VP3_1D_IDCT_SSE2(ADD, SHIFT) /* one 1-D IDCT pass over I(0..7) on eight 16-bit lanes; ADD(reg) is applied once per result pair before the second member is derived, SHIFT(reg) to every result; op0..op7 are left in xmm0..xmm7, I(1) and I(2) are used as scratch */ \
|
||||
"movdqa "I(3)", %%xmm2 \n\t" /* xmm2 = i3 */ \
|
||||
"movdqa "C(3)", %%xmm6 \n\t" /* xmm6 = c3 */ \
|
||||
"movdqa %%xmm2, %%xmm4 \n\t" /* xmm4 = i3 */ \
|
||||
"movdqa "I(5)", %%xmm7 \n\t" /* xmm7 = i5 */ \
|
||||
"pmulhw %%xmm6, %%xmm4 \n\t" /* xmm4 = c3 * i3 - i3 */ \
|
||||
"movdqa "C(5)", %%xmm1 \n\t" /* xmm1 = c5 */ \
|
||||
"pmulhw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 - i5 */ \
|
||||
"movdqa %%xmm1, %%xmm5 \n\t" /* xmm5 = c5 */ \
|
||||
"pmulhw %%xmm2, %%xmm1 \n\t" /* xmm1 = c5 * i3 - i3 */ \
|
||||
"movdqa "I(1)", %%xmm3 \n\t" /* xmm3 = i1 */ \
|
||||
"pmulhw %%xmm7, %%xmm5 \n\t" /* xmm5 = c5 * i5 - i5 */ \
|
||||
"movdqa "C(1)", %%xmm0 \n\t" /* xmm0 = c1 */ \
|
||||
"paddw %%xmm2, %%xmm4 \n\t" /* xmm4 = c3 * i3 */ \
|
||||
"paddw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 */ \
|
||||
"paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = c5 * i3 */ \
|
||||
"movdqa "I(7)", %%xmm1 \n\t" /* xmm1 = i7 */ \
|
||||
"paddw %%xmm5, %%xmm7 \n\t" /* xmm7 = c5 * i5 */ \
|
||||
"movdqa %%xmm0, %%xmm5 \n\t" /* xmm5 = c1 */ \
|
||||
"pmulhw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 - i1 */ \
|
||||
"paddsw %%xmm7, %%xmm4 \n\t" /* xmm4 = c3 * i3 + c5 * i5 = C */ \
|
||||
"pmulhw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 - i7 */ \
|
||||
"movdqa "C(7)", %%xmm7 \n\t" /* xmm7 = c7 */ \
|
||||
"psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = c3 * i5 - c5 * i3 = D */ \
|
||||
"paddw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 */ \
|
||||
"pmulhw %%xmm7, %%xmm3 \n\t" /* xmm3 = c7 * i1 */ \
|
||||
"movdqa "I(2)", %%xmm2 \n\t" /* xmm2 = i2 */ \
|
||||
"pmulhw %%xmm1, %%xmm7 \n\t" /* xmm7 = c7 * i7 */ \
|
||||
"paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 */ \
|
||||
"movdqa %%xmm2, %%xmm1 \n\t" /* xmm1 = i2 */ \
|
||||
"pmulhw "C(2)", %%xmm2 \n\t" /* xmm2 = i2 * c2 - i2 */ \
|
||||
"psubsw %%xmm5, %%xmm3 \n\t" /* xmm3 = c7 * i1 - c1 * i7 = B */ \
|
||||
"movdqa "I(6)", %%xmm5 \n\t" /* xmm5 = i6 */ \
|
||||
"paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = c1 * i1 + c7 * i7 = A */ \
|
||||
"movdqa %%xmm5, %%xmm7 \n\t" /* xmm7 = i6 */ \
|
||||
"psubsw %%xmm4, %%xmm0 \n\t" /* xmm0 = A - C */ \
|
||||
"pmulhw "C(2)", %%xmm5 \n\t" /* xmm5 = c2 * i6 - i6 */ \
|
||||
"paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = i2 * c2 */ \
|
||||
"pmulhw "C(6)", %%xmm1 \n\t" /* xmm1 = c6 * i2 */ \
|
||||
"paddsw %%xmm4, %%xmm4 \n\t" /* xmm4 = C + C */ \
|
||||
"paddsw %%xmm0, %%xmm4 \n\t" /* xmm4 = A + C = C. */ \
|
||||
"psubsw %%xmm6, %%xmm3 \n\t" /* xmm3 = B - D */ \
|
||||
"paddw %%xmm7, %%xmm5 \n\t" /* xmm5 = c2 * i6 */ \
|
||||
"paddsw %%xmm6, %%xmm6 \n\t" /* xmm6 = D + D */ \
|
||||
"pmulhw "C(6)", %%xmm7 \n\t" /* xmm7 = c6 * i6 */ \
|
||||
"paddsw %%xmm3, %%xmm6 \n\t" /* xmm6 = B + D = D. */ \
|
||||
"movdqa %%xmm4, "I(1)" \n\t" /* Save C. at I(1) */ \
|
||||
"psubsw %%xmm5, %%xmm1 \n\t" /* xmm1 = c6 * i2 - c2 * i6 = H */ \
|
||||
"movdqa "C(4)", %%xmm4 \n\t" /* xmm4 = c4 */ \
|
||||
"movdqa %%xmm3, %%xmm5 \n\t" /* xmm5 = B - D */ \
|
||||
"pmulhw %%xmm4, %%xmm3 \n\t" /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
|
||||
"paddsw %%xmm2, %%xmm7 \n\t" /* xmm7 = c2 * i2 + c6 * i6 = G */ \
|
||||
"movdqa %%xmm6, "I(2)" \n\t" /* Save D. at I(2) */ \
|
||||
"movdqa %%xmm0, %%xmm2 \n\t" /* xmm2 = A - C */ \
|
||||
"movdqa "I(0)", %%xmm6 \n\t" /* xmm6 = i0 */ \
|
||||
"pmulhw %%xmm4, %%xmm0 \n\t" /* xmm0 = ( c4 - 1 ) * ( A - C ) */ \
|
||||
"paddw %%xmm3, %%xmm5 \n\t" /* xmm5 = c4 * ( B - D ) = B. */ \
|
||||
"movdqa "I(4)", %%xmm3 \n\t" /* xmm3 = i4 */ \
|
||||
"psubsw %%xmm1, %%xmm5 \n\t" /* xmm5 = B. - H = B.. */ \
|
||||
"paddw %%xmm0, %%xmm2 \n\t" /* xmm2 = c4 * ( A - C) = A. */ \
|
||||
"psubsw %%xmm3, %%xmm6 \n\t" /* xmm6 = i0 - i4 */ \
|
||||
"movdqa %%xmm6, %%xmm0 \n\t" /* xmm0 = i0 - i4 */ \
|
||||
"pmulhw %%xmm4, %%xmm6 \n\t" /* xmm6 = (c4 - 1) * (i0 - i4) */ \
|
||||
"paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = i4 + i4 */ \
|
||||
"paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H + H */ \
|
||||
"paddsw %%xmm0, %%xmm3 \n\t" /* xmm3 = i0 + i4 */ \
|
||||
"paddsw %%xmm5, %%xmm1 \n\t" /* xmm1 = B. + H = H. */ \
|
||||
"pmulhw %%xmm3, %%xmm4 \n\t" /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
|
||||
"paddw %%xmm0, %%xmm6 \n\t" /* xmm6 = c4 * ( i0 - i4 ) = F */ \
|
||||
"psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = F - A. = F. */ \
|
||||
"paddsw %%xmm2, %%xmm2 \n\t" /* xmm2 = A. + A. */ \
|
||||
"movdqa "I(1)", %%xmm0 \n\t" /* Load C. from I(1) */ \
|
||||
"paddsw %%xmm6, %%xmm2 \n\t" /* xmm2 = F + A. = A.. */ \
|
||||
"paddw %%xmm3, %%xmm4 \n\t" /* xmm4 = c4 * ( i0 + i4 ) = E */ \
|
||||
"psubsw %%xmm1, %%xmm2 \n\t" /* xmm2 = A.. - H. = R2 */ \
|
||||
ADD(%%xmm2) /* Adjust R2 and R1 before shifting */ \
|
||||
"paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H. + H. */ \
|
||||
"paddsw %%xmm2, %%xmm1 \n\t" /* xmm1 = A.. + H. = R1 */ \
|
||||
SHIFT(%%xmm2) /* xmm2 = op2 */ \
|
||||
"psubsw %%xmm7, %%xmm4 \n\t" /* xmm4 = E - G = E. */ \
|
||||
SHIFT(%%xmm1) /* xmm1 = op1 */ \
|
||||
"movdqa "I(2)", %%xmm3 \n\t" /* Load D. from I(2) */ \
|
||||
"paddsw %%xmm7, %%xmm7 \n\t" /* xmm7 = G + G */ \
|
||||
"paddsw %%xmm4, %%xmm7 \n\t" /* xmm7 = E + G = G. */ \
|
||||
"psubsw %%xmm3, %%xmm4 \n\t" /* xmm4 = E. - D. = R4 */ \
|
||||
ADD(%%xmm4) /* Adjust R4 and R3 before shifting */ \
|
||||
"paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = D. + D. */ \
|
||||
"paddsw %%xmm4, %%xmm3 \n\t" /* xmm3 = E. + D. = R3 */ \
|
||||
SHIFT(%%xmm4) /* xmm4 = op4 */ \
|
||||
"psubsw %%xmm5, %%xmm6 \n\t" /* xmm6 = F. - B..= R6 */ \
|
||||
SHIFT(%%xmm3) /* xmm3 = op3 */ \
|
||||
ADD(%%xmm6) /* Adjust R6 and R5 before shifting */ \
|
||||
"paddsw %%xmm5, %%xmm5 \n\t" /* xmm5 = B.. + B.. */ \
|
||||
"paddsw %%xmm6, %%xmm5 \n\t" /* xmm5 = F. + B.. = R5 */ \
|
||||
SHIFT(%%xmm6) /* xmm6 = op6 */ \
|
||||
SHIFT(%%xmm5) /* xmm5 = op5 */ \
|
||||
"psubsw %%xmm0, %%xmm7 \n\t" /* xmm7 = G. - C. = R7 */ \
|
||||
ADD(%%xmm7) /* Adjust R7 and R0 before shifting */ \
|
||||
"paddsw %%xmm0, %%xmm0 \n\t" /* xmm0 = C. + C. */ \
|
||||
"paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = G. + C. = R0 */ \
|
||||
SHIFT(%%xmm7) /* xmm7 = op7 */ \
|
||||
SHIFT(%%xmm0) /* xmm0 = op0 */
|
||||
|
||||
#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) /* store the eight named registers to O(0)..O(7) in argument order, using aligned movdqa stores */ \
|
||||
"movdqa " #r0 ", " O(0) "\n\t" \
|
||||
"movdqa " #r1 ", " O(1) "\n\t" \
|
||||
"movdqa " #r2 ", " O(2) "\n\t" \
|
||||
"movdqa " #r3 ", " O(3) "\n\t" \
|
||||
"movdqa " #r4 ", " O(4) "\n\t" \
|
||||
"movdqa " #r5 ", " O(5) "\n\t" \
|
||||
"movdqa " #r6 ", " O(6) "\n\t" \
|
||||
"movdqa " #r7 ", " O(7) "\n\t"
|
||||
|
||||
/* Plug-ins for the ADD/SHIFT parameters of VP3_1D_IDCT_SSE2:
 *   NOP    expands to nothing (pass without rounding or scaling),
 *   SHIFT4 arithmetic-shifts every word of the register right by 4,
 *   ADD8   adds asm operand %2 to the register with signed saturation
 *          (ff_vp3_idct_sse2 binds %2 to ff_pw_8). */
#define NOP(xmm)
|
||||
#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t"
|
||||
#define ADD8(xmm) "paddsw %2, "#xmm"\n\t"
|
||||
|
||||
void ff_vp3_idct_sse2(int16_t *input_data)
|
||||
{
|
||||
#define I(x) AV_STRINGIFY(16*x)"(%0)"
|
||||
#define O(x) I(x)
|
||||
#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
|
||||
|
||||
__asm__ volatile (
|
||||
VP3_1D_IDCT_SSE2(NOP, NOP)
|
||||
|
||||
TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0))
|
||||
PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)
|
||||
|
||||
VP3_1D_IDCT_SSE2(ADD8, SHIFT4)
|
||||
PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
|
||||
:: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
|
||||
);
|
||||
}
|
||||
|
||||
/* Run the in-place SSE2 IDCT on block, then pass block, dest and line_size
 * to ff_put_signed_pixels_clamped_mmx() to write the result to dest. */
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
|
||||
{
|
||||
ff_vp3_idct_sse2(block);
|
||||
ff_put_signed_pixels_clamped_mmx(block, dest, line_size);
|
||||
}
|
||||
|
||||
/* Run the in-place SSE2 IDCT on block, then pass block, dest and line_size
 * to ff_add_pixels_clamped_mmx() to combine the result with dest. */
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
|
||||
{
|
||||
ff_vp3_idct_sse2(block);
|
||||
ff_add_pixels_clamped_mmx(block, dest, line_size);
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
/*
|
||||
* vp3dsp SSE2 function declarations
|
||||
* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_X86_VP3DSP_SSE2_H
|
||||
#define AVCODEC_X86_VP3DSP_SSE2_H
|
||||
|
||||
#include "libavcodec/dsputil.h"
|
||||
|
||||
void ff_vp3_idct_sse2(int16_t *input_data);
|
||||
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
|
||||
#endif /* AVCODEC_X86_VP3DSP_SSE2_H */
|
Loading…
Reference in New Issue
Block a user