mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-03-23 04:24:35 +02:00
avcodec/x86/vp8dsp: Remove obsolete MMX(EXT) functions
The only systems which benefit from these are truly ancient 32-bit x86s, as all other systems use at least the SSE2 versions. This includes all x64 CPUs, which is why this code is restricted to x86-32. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
parent
61e3cccd36
commit
6a551f1405
libavcodec/x86
@ -840,25 +840,6 @@ cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
|
||||
jg .nextrow
|
||||
REP_RET
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
|
||||
.nextrow:
|
||||
movq mm0, [srcq+srcstrideq*0+0]
|
||||
movq mm1, [srcq+srcstrideq*0+8]
|
||||
movq mm2, [srcq+srcstrideq*1+0]
|
||||
movq mm3, [srcq+srcstrideq*1+8]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
movq [dstq+dststrideq*0+0], mm0
|
||||
movq [dstq+dststrideq*0+8], mm1
|
||||
movq [dstq+dststrideq*1+0], mm2
|
||||
movq [dstq+dststrideq*1+8], mm3
|
||||
lea dstq, [dstq+dststrideq*2]
|
||||
sub heightd, 2
|
||||
jg .nextrow
|
||||
REP_RET
|
||||
%endif
|
||||
|
||||
INIT_XMM sse
|
||||
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
|
||||
.nextrow:
|
||||
@ -895,32 +876,6 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
|
||||
%4 [dst2q+strideq+%3], m5
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
|
||||
; load data
|
||||
movd m0, [blockq]
|
||||
|
||||
; calculate DC
|
||||
paddw m0, [pw_4]
|
||||
pxor m1, m1
|
||||
psraw m0, 3
|
||||
movd [blockq], m1
|
||||
psubw m1, m0
|
||||
packuswb m0, m0
|
||||
packuswb m1, m1
|
||||
punpcklbw m0, m0
|
||||
punpcklbw m1, m1
|
||||
punpcklwd m0, m0
|
||||
punpcklwd m1, m1
|
||||
|
||||
; add DC
|
||||
DEFINE_ARGS dst1, dst2, stride
|
||||
lea dst2q, [dst1q+strideq*2]
|
||||
ADD_DC m0, m1, 0, movh
|
||||
RET
|
||||
%endif
|
||||
|
||||
%macro VP8_IDCT_DC_ADD 0
|
||||
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
|
||||
; load data
|
||||
@ -971,44 +926,6 @@ VP8_IDCT_DC_ADD
|
||||
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
|
||||
; load data
|
||||
movd m0, [blockq+32*0] ; A
|
||||
movd m1, [blockq+32*2] ; C
|
||||
punpcklwd m0, [blockq+32*1] ; A B
|
||||
punpcklwd m1, [blockq+32*3] ; C D
|
||||
punpckldq m0, m1 ; A B C D
|
||||
pxor m6, m6
|
||||
|
||||
; calculate DC
|
||||
paddw m0, [pw_4]
|
||||
movd [blockq+32*0], m6
|
||||
movd [blockq+32*1], m6
|
||||
movd [blockq+32*2], m6
|
||||
movd [blockq+32*3], m6
|
||||
psraw m0, 3
|
||||
psubw m6, m0
|
||||
packuswb m0, m0
|
||||
packuswb m6, m6
|
||||
punpcklbw m0, m0 ; AABBCCDD
|
||||
punpcklbw m6, m6 ; AABBCCDD
|
||||
movq m1, m0
|
||||
movq m7, m6
|
||||
punpcklbw m0, m0 ; AAAABBBB
|
||||
punpckhbw m1, m1 ; CCCCDDDD
|
||||
punpcklbw m6, m6 ; AAAABBBB
|
||||
punpckhbw m7, m7 ; CCCCDDDD
|
||||
|
||||
; add DC
|
||||
DEFINE_ARGS dst1, dst2, stride
|
||||
lea dst2q, [dst1q+strideq*2]
|
||||
ADD_DC m0, m6, 0, mova
|
||||
ADD_DC m1, m7, 8, mova
|
||||
RET
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
|
||||
; load data
|
||||
@ -1117,7 +1034,7 @@ cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
|
||||
SWAP %4, %3
|
||||
%endmacro
|
||||
|
||||
%macro VP8_IDCT_ADD 0
|
||||
INIT_MMX sse
|
||||
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
|
||||
; load block data
|
||||
movq m0, [blockq+ 0]
|
||||
@ -1126,17 +1043,9 @@ cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
|
||||
movq m3, [blockq+24]
|
||||
movq m6, [pw_20091]
|
||||
movq m7, [pw_17734]
|
||||
%if cpuflag(sse)
|
||||
xorps xmm0, xmm0
|
||||
movaps [blockq+ 0], xmm0
|
||||
movaps [blockq+16], xmm0
|
||||
%else
|
||||
pxor m4, m4
|
||||
movq [blockq+ 0], m4
|
||||
movq [blockq+ 8], m4
|
||||
movq [blockq+16], m4
|
||||
movq [blockq+24], m4
|
||||
%endif
|
||||
|
||||
; actual IDCT
|
||||
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
|
||||
@ -1153,14 +1062,6 @@ cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
|
||||
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
|
||||
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
VP8_IDCT_ADD
|
||||
%endif
|
||||
INIT_MMX sse
|
||||
VP8_IDCT_ADD
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
|
||||
@ -1193,23 +1094,15 @@ VP8_IDCT_ADD
|
||||
SWAP %1, %4, %3
|
||||
%endmacro
|
||||
|
||||
%macro VP8_DC_WHT 0
|
||||
INIT_MMX sse
|
||||
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
|
||||
movq m0, [dc1q]
|
||||
movq m1, [dc1q+8]
|
||||
movq m2, [dc1q+16]
|
||||
movq m3, [dc1q+24]
|
||||
%if cpuflag(sse)
|
||||
xorps xmm0, xmm0
|
||||
movaps [dc1q+ 0], xmm0
|
||||
movaps [dc1q+16], xmm0
|
||||
%else
|
||||
pxor m4, m4
|
||||
movq [dc1q+ 0], m4
|
||||
movq [dc1q+ 8], m4
|
||||
movq [dc1q+16], m4
|
||||
movq [dc1q+24], m4
|
||||
%endif
|
||||
HADAMARD4_1D 0, 1, 2, 3
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
paddw m0, [pw_3]
|
||||
@ -1221,11 +1114,3 @@ cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
|
||||
SCATTER_WHT 0, 1, 0
|
||||
SCATTER_WHT 2, 3, 2
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
VP8_DC_WHT
|
||||
%endif
|
||||
INIT_MMX sse
|
||||
VP8_DC_WHT
|
||||
|
@ -112,9 +112,6 @@ void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
|
||||
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
|
||||
uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
|
||||
uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
|
||||
uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
@ -140,19 +137,6 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
|
||||
dst + 4, dststride, src + 4, srcstride, height, mx, my); \
|
||||
}
|
||||
|
||||
#if ARCH_X86_32
|
||||
TAP_W8 (mmxext, epel, h4)
|
||||
TAP_W8 (mmxext, epel, h6)
|
||||
TAP_W16(mmxext, epel, h6)
|
||||
TAP_W8 (mmxext, epel, v4)
|
||||
TAP_W8 (mmxext, epel, v6)
|
||||
TAP_W16(mmxext, epel, v6)
|
||||
TAP_W8 (mmxext, bilinear, h)
|
||||
TAP_W16(mmxext, bilinear, h)
|
||||
TAP_W8 (mmxext, bilinear, v)
|
||||
TAP_W16(mmxext, bilinear, v)
|
||||
#endif
|
||||
|
||||
TAP_W16(sse2, epel, h6)
|
||||
TAP_W16(sse2, epel, v6)
|
||||
TAP_W16(sse2, bilinear, h)
|
||||
@ -177,16 +161,8 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
|
||||
dst, dststride, tmpptr, SIZE, height, mx, my); \
|
||||
}
|
||||
|
||||
#if ARCH_X86_32
|
||||
#define HVTAPMMX(x, y) \
|
||||
HVTAP(mmxext, 8, x, y, 4, 8) \
|
||||
HVTAP(mmxext, 8, x, y, 8, 16)
|
||||
|
||||
HVTAP(mmxext, 8, 6, 6, 16, 16)
|
||||
#else
|
||||
#define HVTAPMMX(x, y) \
|
||||
HVTAP(mmxext, 8, x, y, 4, 8)
|
||||
#endif
|
||||
|
||||
HVTAPMMX(4, 4)
|
||||
HVTAPMMX(4, 6)
|
||||
@ -221,31 +197,21 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
|
||||
}
|
||||
|
||||
HVBILIN(mmxext, 8, 4, 8)
|
||||
#if ARCH_X86_32
|
||||
HVBILIN(mmxext, 8, 8, 16)
|
||||
HVBILIN(mmxext, 8, 16, 16)
|
||||
#endif
|
||||
HVBILIN(sse2, 8, 8, 16)
|
||||
HVBILIN(sse2, 8, 16, 16)
|
||||
HVBILIN(ssse3, 8, 4, 8)
|
||||
HVBILIN(ssse3, 8, 8, 16)
|
||||
HVBILIN(ssse3, 8, 16, 16)
|
||||
|
||||
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
|
||||
ptrdiff_t stride);
|
||||
void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16],
|
||||
ptrdiff_t stride);
|
||||
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
|
||||
ptrdiff_t stride);
|
||||
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
|
||||
ptrdiff_t stride);
|
||||
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
|
||||
ptrdiff_t stride);
|
||||
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
|
||||
ptrdiff_t stride);
|
||||
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
|
||||
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
|
||||
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
|
||||
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
|
||||
|
||||
#define DECLARE_LOOP_FILTER(NAME) \
|
||||
@ -284,8 +250,6 @@ void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
|
||||
ptrdiff_t s, \
|
||||
int e, int i, int hvt);
|
||||
|
||||
DECLARE_LOOP_FILTER(mmx)
|
||||
DECLARE_LOOP_FILTER(mmxext)
|
||||
DECLARE_LOOP_FILTER(sse2)
|
||||
DECLARE_LOOP_FILTER(ssse3)
|
||||
DECLARE_LOOP_FILTER(sse4)
|
||||
@ -322,10 +286,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
#if ARCH_X86_32
|
||||
c->put_vp8_epel_pixels_tab[0][0][0] =
|
||||
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
|
||||
#endif
|
||||
c->put_vp8_epel_pixels_tab[1][0][0] =
|
||||
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
|
||||
}
|
||||
@ -335,12 +295,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
VP8_MC_FUNC(2, 4, mmxext);
|
||||
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
|
||||
#if ARCH_X86_32
|
||||
VP8_LUMA_MC_FUNC(0, 16, mmxext);
|
||||
VP8_MC_FUNC(1, 8, mmxext);
|
||||
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
|
||||
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE(cpu_flags)) {
|
||||
@ -373,44 +327,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
|
||||
#if ARCH_X86_32
|
||||
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
|
||||
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
|
||||
c->vp8_idct_add = ff_vp8_idct_add_mmx;
|
||||
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
|
||||
|
||||
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
|
||||
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
|
||||
|
||||
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
|
||||
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
|
||||
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
|
||||
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
|
||||
|
||||
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
|
||||
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
|
||||
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
|
||||
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* note that 4-tap width=16 functions are missing because w=16
|
||||
* is only used for luma, and luma is always a copy or sixtap. */
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
#if ARCH_X86_32
|
||||
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
|
||||
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
|
||||
|
||||
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
|
||||
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
|
||||
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
|
||||
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
|
||||
|
||||
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
|
||||
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
|
||||
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
|
||||
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE(cpu_flags)) {
|
||||
|
@ -46,30 +46,6 @@ SECTION .text
|
||||
; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, ptrdiff_t stride, int flim);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
; macro called with 7 mm register indexes as argument, and 4 regular registers
|
||||
;
|
||||
; first 4 mm registers will carry the transposed pixel data
|
||||
; the other three are scratchspace (one would be sufficient, but this allows
|
||||
; for more spreading/pipelining and thus faster execution on OOE CPUs)
|
||||
;
|
||||
; first two regular registers are buf+4*stride and buf+5*stride
|
||||
; third is -stride, fourth is +stride
|
||||
%macro READ_8x4_INTERLEAVED 11
|
||||
; interleave 8 (A-H) rows of 4 pixels each
|
||||
movd m%1, [%8+%10*4] ; A0-3
|
||||
movd m%5, [%9+%10*4] ; B0-3
|
||||
movd m%2, [%8+%10*2] ; C0-3
|
||||
movd m%6, [%8+%10] ; D0-3
|
||||
movd m%3, [%8] ; E0-3
|
||||
movd m%7, [%9] ; F0-3
|
||||
movd m%4, [%9+%11] ; G0-3
|
||||
punpcklbw m%1, m%5 ; A/B interleaved
|
||||
movd m%5, [%9+%11*2] ; H0-3
|
||||
punpcklbw m%2, m%6 ; C/D interleaved
|
||||
punpcklbw m%3, m%7 ; E/F interleaved
|
||||
punpcklbw m%4, m%5 ; G/H interleaved
|
||||
%endmacro
|
||||
|
||||
; macro called with 7 mm register indexes as argument, and 5 regular registers
|
||||
; first 11 mean the same as READ_8x4_INTERLEAVED above
|
||||
; fifth regular register is scratchspace to reach the bottom 8 rows, it
|
||||
@ -112,26 +88,6 @@ SECTION .text
|
||||
punpcklbw m%4, m%5 ; G/H/O/P interleaved
|
||||
%endmacro
|
||||
|
||||
; write 4 mm registers of 2 dwords each
|
||||
; first four arguments are mm register indexes containing source data
|
||||
; last four are registers containing buf+4*stride, buf+5*stride,
|
||||
; -stride and +stride
|
||||
%macro WRITE_4x2D 8
|
||||
; write out (2 dwords per register)
|
||||
movd [%5+%7*4], m%1
|
||||
movd [%5+%7*2], m%2
|
||||
movd [%5], m%3
|
||||
movd [%6+%8], m%4
|
||||
punpckhdq m%1, m%1
|
||||
punpckhdq m%2, m%2
|
||||
punpckhdq m%3, m%3
|
||||
punpckhdq m%4, m%4
|
||||
movd [%6+%7*4], m%1
|
||||
movd [%5+%7], m%2
|
||||
movd [%6], m%3
|
||||
movd [%6+%8*2], m%4
|
||||
%endmacro
|
||||
|
||||
; write 4 xmm registers of 4 dwords each
|
||||
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
|
||||
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
|
||||
@ -192,42 +148,6 @@ SECTION .text
|
||||
movd [%7+%9*2], m%4
|
||||
%endmacro
|
||||
|
||||
; write 4 or 8 words in the mmx/xmm registers as 8 lines
|
||||
; 1 and 2 are the registers to write, this can be the same (for SSE2)
|
||||
; for pre-SSE4:
|
||||
; 3 is a general-purpose register that we will clobber
|
||||
; for SSE4:
|
||||
; 3 is a pointer to the destination's 5th line
|
||||
; 4 is a pointer to the destination's 4th line
|
||||
; 5/6 is -stride and +stride
|
||||
%macro WRITE_2x4W 6
|
||||
movd %3d, %1
|
||||
punpckhdq %1, %1
|
||||
mov [%4+%5*4], %3w
|
||||
shr %3, 16
|
||||
add %4, %6
|
||||
mov [%4+%5*4], %3w
|
||||
|
||||
movd %3d, %1
|
||||
add %4, %5
|
||||
mov [%4+%5*2], %3w
|
||||
shr %3, 16
|
||||
mov [%4+%5 ], %3w
|
||||
|
||||
movd %3d, %2
|
||||
punpckhdq %2, %2
|
||||
mov [%4 ], %3w
|
||||
shr %3, 16
|
||||
mov [%4+%6 ], %3w
|
||||
|
||||
movd %3d, %2
|
||||
add %4, %6
|
||||
mov [%4+%6 ], %3w
|
||||
shr %3, 16
|
||||
mov [%4+%6*2], %3w
|
||||
add %4, %5
|
||||
%endmacro
|
||||
|
||||
%macro WRITE_8W 5
|
||||
%if cpuflag(sse4)
|
||||
pextrw [%3+%4*4], %1, 0
|
||||
@ -269,29 +189,19 @@ SECTION .text
|
||||
|
||||
%macro SIMPLE_LOOPFILTER 2
|
||||
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
|
||||
%if mmsize == 8 ; mmx/mmxext
|
||||
mov cntrq, 2
|
||||
%endif
|
||||
%if cpuflag(ssse3)
|
||||
pxor m0, m0
|
||||
%endif
|
||||
SPLATB_REG m7, flim, m0 ; splat "flim" into register
|
||||
|
||||
; set up indexes to address 4 rows
|
||||
%if mmsize == 8
|
||||
DEFINE_ARGS dst1, mstride, stride, cntr, dst2
|
||||
%else
|
||||
DEFINE_ARGS dst1, mstride, stride, dst3, dst2
|
||||
%endif
|
||||
mov strideq, mstrideq
|
||||
neg mstrideq
|
||||
%ifidn %1, h
|
||||
lea dst1q, [dst1q+4*strideq-2]
|
||||
%endif
|
||||
|
||||
%if mmsize == 8 ; mmx / mmxext
|
||||
.next8px:
|
||||
%endif
|
||||
%ifidn %1, v
|
||||
; read 4 half/full rows of pixels
|
||||
mova m0, [dst1q+mstrideq*2] ; p1
|
||||
@ -301,11 +211,7 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
|
||||
%else ; h
|
||||
lea dst2q, [dst1q+ strideq]
|
||||
|
||||
%if mmsize == 8 ; mmx/mmxext
|
||||
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
|
||||
%else ; sse2
|
||||
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
|
||||
%endif
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
%endif
|
||||
|
||||
@ -380,7 +286,6 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
|
||||
inc dst1q
|
||||
SBUTTERFLY bw, 6, 4, 0
|
||||
|
||||
%if mmsize == 16 ; sse2
|
||||
%if cpuflag(sse4)
|
||||
inc dst2q
|
||||
%endif
|
||||
@ -390,35 +295,11 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
|
||||
inc dst3q
|
||||
%endif
|
||||
WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
|
||||
%else ; mmx/mmxext
|
||||
WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if mmsize == 8 ; mmx/mmxext
|
||||
; next 8 pixels
|
||||
%ifidn %1, v
|
||||
add dst1q, 8 ; advance 8 cols = pixels
|
||||
%else ; h
|
||||
lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
|
||||
%endif
|
||||
dec cntrq
|
||||
jg .next8px
|
||||
REP_RET
|
||||
%else ; sse2
|
||||
RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
SIMPLE_LOOPFILTER v, 4
|
||||
SIMPLE_LOOPFILTER h, 5
|
||||
INIT_MMX mmxext
|
||||
SIMPLE_LOOPFILTER v, 4
|
||||
SIMPLE_LOOPFILTER h, 5
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
SIMPLE_LOOPFILTER v, 3
|
||||
SIMPLE_LOOPFILTER h, 5
|
||||
@ -485,9 +366,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
|
||||
%if %2 == 8 ; chroma
|
||||
DEFINE_ARGS dst1, dst8, mstride, stride, dst2
|
||||
%elif mmsize == 8
|
||||
DEFINE_ARGS dst1, mstride, stride, dst2, cntr
|
||||
mov cntrq, 2
|
||||
%else
|
||||
DEFINE_ARGS dst1, mstride, stride, dst2, dst8
|
||||
%endif
|
||||
@ -500,9 +378,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if mmsize == 8
|
||||
.next8px:
|
||||
%endif
|
||||
; read
|
||||
lea dst2q, [dst1q+strideq]
|
||||
%ifidn %1, v
|
||||
@ -527,33 +402,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
movhps m7, [dst8q+ strideq*2]
|
||||
add dst8q, mstrideq
|
||||
%endif
|
||||
%elif mmsize == 8 ; mmx/mmxext (h)
|
||||
; read 8 rows of 8px each
|
||||
movu m0, [dst1q+mstrideq*4]
|
||||
movu m1, [dst2q+mstrideq*4]
|
||||
movu m2, [dst1q+mstrideq*2]
|
||||
movu m3, [dst1q+mstrideq ]
|
||||
movu m4, [dst1q]
|
||||
movu m5, [dst2q]
|
||||
movu m6, [dst2q+ strideq ]
|
||||
|
||||
; 8x8 transpose
|
||||
TRANSPOSE4x4B 0, 1, 2, 3, 7
|
||||
mova m_q0backup, m1
|
||||
movu m7, [dst2q+ strideq*2]
|
||||
TRANSPOSE4x4B 4, 5, 6, 7, 1
|
||||
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
|
||||
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
|
||||
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
|
||||
mova m1, m_q0backup
|
||||
mova m_q0backup, m2 ; store q0
|
||||
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
|
||||
mova m_p0backup, m5 ; store p0
|
||||
SWAP 1, 4
|
||||
SWAP 2, 4
|
||||
SWAP 6, 3
|
||||
SWAP 5, 3
|
||||
%else ; sse2 (h)
|
||||
%else ; h
|
||||
%if %2 == 16
|
||||
lea dst8q, [dst1q+ strideq*8]
|
||||
%endif
|
||||
@ -641,25 +490,9 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
psubusb m6, m5 ; q2-q1
|
||||
por m6, m4 ; abs(q2-q1)
|
||||
|
||||
%if notcpuflag(mmxext)
|
||||
mova m4, m_flimI
|
||||
pxor m3, m3
|
||||
psubusb m0, m4
|
||||
psubusb m1, m4
|
||||
psubusb m7, m4
|
||||
psubusb m6, m4
|
||||
pcmpeqb m0, m3 ; abs(p3-p2) <= I
|
||||
pcmpeqb m1, m3 ; abs(p2-p1) <= I
|
||||
pcmpeqb m7, m3 ; abs(q3-q2) <= I
|
||||
pcmpeqb m6, m3 ; abs(q2-q1) <= I
|
||||
pand m0, m1
|
||||
pand m7, m6
|
||||
pand m0, m7
|
||||
%else ; mmxext/sse2
|
||||
pmaxub m0, m1
|
||||
pmaxub m6, m7
|
||||
pmaxub m0, m6
|
||||
%endif
|
||||
|
||||
; normal_limit and high_edge_variance for p1-p0, q1-q0
|
||||
SWAP 7, 3 ; now m7 is zero
|
||||
@ -681,18 +514,8 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
psubusb m1, m3 ; p1-p0
|
||||
psubusb m6, m2 ; p0-p1
|
||||
por m1, m6 ; abs(p1-p0)
|
||||
%if notcpuflag(mmxext)
|
||||
mova m6, m1
|
||||
psubusb m1, m4
|
||||
psubusb m6, m_hevthr
|
||||
pcmpeqb m1, m7 ; abs(p1-p0) <= I
|
||||
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
|
||||
pand m0, m1
|
||||
mova m_maskres, m6
|
||||
%else ; mmxext/sse2
|
||||
pmaxub m0, m1 ; max_I
|
||||
SWAP 1, 4 ; max_hev_thresh
|
||||
%endif
|
||||
|
||||
SWAP 6, 4 ; now m6 is I
|
||||
%ifidn %1, v
|
||||
@ -712,17 +535,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
psubusb m1, m5 ; q0-q1
|
||||
psubusb m7, m4 ; q1-q0
|
||||
por m1, m7 ; abs(q1-q0)
|
||||
%if notcpuflag(mmxext)
|
||||
mova m7, m1
|
||||
psubusb m1, m6
|
||||
psubusb m7, m_hevthr
|
||||
pxor m6, m6
|
||||
pcmpeqb m1, m6 ; abs(q1-q0) <= I
|
||||
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
|
||||
mova m6, m_maskres
|
||||
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
|
||||
pand m6, m7
|
||||
%else ; mmxext/sse2
|
||||
pxor m7, m7
|
||||
pmaxub m0, m1
|
||||
pmaxub m6, m1
|
||||
@ -730,7 +542,6 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
psubusb m6, m_hevthr
|
||||
pcmpeqb m0, m7 ; max(abs(..)) <= I
|
||||
pcmpeqb m6, m7 ; !(max(abs..) > thresh)
|
||||
%endif
|
||||
%ifdef m12
|
||||
SWAP 6, 12
|
||||
%else
|
||||
@ -820,25 +631,12 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
%else
|
||||
mova m6, m_maskres
|
||||
%endif
|
||||
%if notcpuflag(mmxext)
|
||||
mova m7, [pb_1]
|
||||
%else ; mmxext/sse2
|
||||
pxor m7, m7
|
||||
%endif
|
||||
pand m0, m6
|
||||
pand m1, m6
|
||||
%if notcpuflag(mmxext)
|
||||
paddusb m0, m7
|
||||
pand m1, [pb_FE]
|
||||
pandn m7, m0
|
||||
psrlq m1, 1
|
||||
psrlq m7, 1
|
||||
SWAP 0, 7
|
||||
%else ; mmxext/sse2
|
||||
psubusb m1, [pb_1]
|
||||
pavgb m0, m7 ; a
|
||||
pavgb m1, m7 ; -a
|
||||
%endif
|
||||
psubusb m5, m0
|
||||
psubusb m2, m1
|
||||
paddusb m5, m1 ; q1-a
|
||||
@ -863,51 +661,13 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, f
|
||||
; 4x8/16 transpose
|
||||
TRANSPOSE4x4B 2, 3, 4, 5, 6
|
||||
|
||||
%if mmsize == 8 ; mmx/mmxext (h)
|
||||
WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
|
||||
%else ; sse2 (h)
|
||||
lea dst8q, [dst8q+mstrideq +2]
|
||||
WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if mmsize == 8
|
||||
%if %2 == 8 ; chroma
|
||||
%ifidn %1, h
|
||||
sub dst1q, 2
|
||||
%endif
|
||||
cmp dst1q, dst8q
|
||||
mov dst1q, dst8q
|
||||
jnz .next8px
|
||||
%else
|
||||
%ifidn %1, h
|
||||
lea dst1q, [dst1q+ strideq*8-2]
|
||||
%else ; v
|
||||
add dst1q, 8
|
||||
%endif
|
||||
dec cntrq
|
||||
jg .next8px
|
||||
%endif
|
||||
REP_RET
|
||||
%else ; mmsize == 16
|
||||
RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
INNER_LOOPFILTER v, 16
|
||||
INNER_LOOPFILTER h, 16
|
||||
INNER_LOOPFILTER v, 8
|
||||
INNER_LOOPFILTER h, 8
|
||||
|
||||
INIT_MMX mmxext
|
||||
INNER_LOOPFILTER v, 16
|
||||
INNER_LOOPFILTER h, 16
|
||||
INNER_LOOPFILTER v, 8
|
||||
INNER_LOOPFILTER h, 8
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
INNER_LOOPFILTER v, 16
|
||||
INNER_LOOPFILTER h, 16
|
||||
@ -992,9 +752,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
|
||||
%if %2 == 8 ; chroma
|
||||
DEFINE_ARGS dst1, dst8, mstride, stride, dst2
|
||||
%elif mmsize == 8
|
||||
DEFINE_ARGS dst1, mstride, stride, dst2, cntr
|
||||
mov cntrq, 2
|
||||
%else
|
||||
DEFINE_ARGS dst1, mstride, stride, dst2, dst8
|
||||
%endif
|
||||
@ -1007,9 +764,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if mmsize == 8
|
||||
.next8px:
|
||||
%endif
|
||||
; read
|
||||
lea dst2q, [dst1q+ strideq ]
|
||||
%ifidn %1, v
|
||||
@ -1034,33 +788,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
movhps m7, [dst8q+ strideq*2]
|
||||
add dst8q, mstrideq
|
||||
%endif
|
||||
%elif mmsize == 8 ; mmx/mmxext (h)
|
||||
; read 8 rows of 8px each
|
||||
movu m0, [dst1q+mstrideq*4]
|
||||
movu m1, [dst2q+mstrideq*4]
|
||||
movu m2, [dst1q+mstrideq*2]
|
||||
movu m3, [dst1q+mstrideq ]
|
||||
movu m4, [dst1q]
|
||||
movu m5, [dst2q]
|
||||
movu m6, [dst2q+ strideq ]
|
||||
|
||||
; 8x8 transpose
|
||||
TRANSPOSE4x4B 0, 1, 2, 3, 7
|
||||
mova m_q0backup, m1
|
||||
movu m7, [dst2q+ strideq*2]
|
||||
TRANSPOSE4x4B 4, 5, 6, 7, 1
|
||||
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
|
||||
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
|
||||
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
|
||||
mova m1, m_q0backup
|
||||
mova m_q0backup, m2 ; store q0
|
||||
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
|
||||
mova m_p0backup, m5 ; store p0
|
||||
SWAP 1, 4
|
||||
SWAP 2, 4
|
||||
SWAP 6, 3
|
||||
SWAP 5, 3
|
||||
%else ; sse2 (h)
|
||||
%else ; h
|
||||
%if %2 == 16
|
||||
lea dst8q, [dst1q+ strideq*8 ]
|
||||
%endif
|
||||
@ -1150,25 +878,9 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
psubusb m6, m5 ; q2-q1
|
||||
por m6, m4 ; abs(q2-q1)
|
||||
|
||||
%if notcpuflag(mmxext)
|
||||
mova m4, m_flimI
|
||||
pxor m3, m3
|
||||
psubusb m0, m4
|
||||
psubusb m1, m4
|
||||
psubusb m7, m4
|
||||
psubusb m6, m4
|
||||
pcmpeqb m0, m3 ; abs(p3-p2) <= I
|
||||
pcmpeqb m1, m3 ; abs(p2-p1) <= I
|
||||
pcmpeqb m7, m3 ; abs(q3-q2) <= I
|
||||
pcmpeqb m6, m3 ; abs(q2-q1) <= I
|
||||
pand m0, m1
|
||||
pand m7, m6
|
||||
pand m0, m7
|
||||
%else ; mmxext/sse2
|
||||
pmaxub m0, m1
|
||||
pmaxub m6, m7
|
||||
pmaxub m0, m6
|
||||
%endif
|
||||
|
||||
; normal_limit and high_edge_variance for p1-p0, q1-q0
|
||||
SWAP 7, 3 ; now m7 is zero
|
||||
@ -1190,18 +902,8 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
psubusb m1, m3 ; p1-p0
|
||||
psubusb m6, m2 ; p0-p1
|
||||
por m1, m6 ; abs(p1-p0)
|
||||
%if notcpuflag(mmxext)
|
||||
mova m6, m1
|
||||
psubusb m1, m4
|
||||
psubusb m6, m_hevthr
|
||||
pcmpeqb m1, m7 ; abs(p1-p0) <= I
|
||||
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
|
||||
pand m0, m1
|
||||
mova m_maskres, m6
|
||||
%else ; mmxext/sse2
|
||||
pmaxub m0, m1 ; max_I
|
||||
SWAP 1, 4 ; max_hev_thresh
|
||||
%endif
|
||||
|
||||
SWAP 6, 4 ; now m6 is I
|
||||
%ifidn %1, v
|
||||
@ -1221,17 +923,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
psubusb m1, m5 ; q0-q1
|
||||
psubusb m7, m4 ; q1-q0
|
||||
por m1, m7 ; abs(q1-q0)
|
||||
%if notcpuflag(mmxext)
|
||||
mova m7, m1
|
||||
psubusb m1, m6
|
||||
psubusb m7, m_hevthr
|
||||
pxor m6, m6
|
||||
pcmpeqb m1, m6 ; abs(q1-q0) <= I
|
||||
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
|
||||
mova m6, m_maskres
|
||||
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
|
||||
pand m6, m7
|
||||
%else ; mmxext/sse2
|
||||
pxor m7, m7
|
||||
pmaxub m0, m1
|
||||
pmaxub m6, m1
|
||||
@ -1239,7 +930,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
psubusb m6, m_hevthr
|
||||
pcmpeqb m0, m7 ; max(abs(..)) <= I
|
||||
pcmpeqb m6, m7 ; !(max(abs..) > thresh)
|
||||
%endif
|
||||
%ifdef m12
|
||||
SWAP 6, 12
|
||||
%else
|
||||
@ -1510,11 +1200,6 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
TRANSPOSE4x4B 1, 2, 3, 4, 0
|
||||
SBUTTERFLY bw, 5, 6, 0
|
||||
|
||||
%if mmsize == 8 ; mmx/mmxext (h)
|
||||
WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
|
||||
add dst1q, 4
|
||||
WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
|
||||
%else ; sse2 (h)
|
||||
lea dst8q, [dst8q+mstrideq+1]
|
||||
WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
|
||||
lea dst1q, [dst2q+mstrideq+4]
|
||||
@ -1528,45 +1213,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE,
|
||||
%endif
|
||||
WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if mmsize == 8
|
||||
%if %2 == 8 ; chroma
|
||||
%ifidn %1, h
|
||||
sub dst1q, 5
|
||||
%endif
|
||||
cmp dst1q, dst8q
|
||||
mov dst1q, dst8q
|
||||
jnz .next8px
|
||||
%else
|
||||
%ifidn %1, h
|
||||
lea dst1q, [dst1q+ strideq*8-5]
|
||||
%else ; v
|
||||
add dst1q, 8
|
||||
%endif
|
||||
dec cntrq
|
||||
jg .next8px
|
||||
%endif
|
||||
REP_RET
|
||||
%else ; mmsize == 16
|
||||
RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
MBEDGE_LOOPFILTER v, 16
|
||||
MBEDGE_LOOPFILTER h, 16
|
||||
MBEDGE_LOOPFILTER v, 8
|
||||
MBEDGE_LOOPFILTER h, 8
|
||||
|
||||
INIT_MMX mmxext
|
||||
MBEDGE_LOOPFILTER v, 16
|
||||
MBEDGE_LOOPFILTER h, 16
|
||||
MBEDGE_LOOPFILTER v, 8
|
||||
MBEDGE_LOOPFILTER h, 8
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
MBEDGE_LOOPFILTER v, 16
|
||||
MBEDGE_LOOPFILTER h, 16
|
||||
|
Loading…
x
Reference in New Issue
Block a user