mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
simd add_hfyu_left_prediction
2.2x faster than C on Conroe, 3.6x on Penryn. 4-6% faster huffyuv decoding when using left or plane prediction mode with YUV. Originally committed as revision 20287 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
1303d62d84
commit
2f77923d72
@ -349,7 +349,7 @@ typedef struct DSPContext {
|
|||||||
*/
|
*/
|
||||||
void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
|
void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
|
||||||
void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
|
void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
|
||||||
int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int acc);
|
int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
|
||||||
void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue);
|
void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue);
|
||||||
/* this might write to dst[w] */
|
/* this might write to dst[w] */
|
||||||
void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
|
void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
|
||||||
|
@ -2385,6 +2385,8 @@ void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
|
|||||||
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
||||||
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
||||||
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
|
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
|
||||||
|
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, uint8_t *src, int w, int left);
|
||||||
|
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, uint8_t *src, int w, int left);
|
||||||
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
||||||
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
||||||
void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
|
void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
|
||||||
@ -2951,6 +2953,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
|||||||
c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
|
c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
|
||||||
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
|
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
|
||||||
c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
|
c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
|
||||||
|
#if HAVE_YASM
|
||||||
|
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
|
||||||
|
if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe
|
||||||
|
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -21,6 +21,13 @@
|
|||||||
|
|
||||||
%include "x86inc.asm"
|
%include "x86inc.asm"
|
||||||
|
|
||||||
|
; Byte constants used as pshufb control masks by the add_hfyu_left_prediction
; functions in this file.
; Label naming: after "pb_", each character stands for one control byte in
; memory order; a hex digit is the source byte index and 'z' is -1 (pshufb
; writes zero to a destination byte whose control byte has its high bit set).
SECTION_RODATA
|
||||||
|
; 16 x 15: selects source byte 15 for every destination byte.
pb_f: times 16 db 15
|
||||||
|
; Only the first 8 bytes (-1) are defined here; the "77777777" half of the
; 16-byte mask is supplied by pb_7 below. The sse4 function loads 16 bytes
; from this label, so pb_7 must stay immediately after it.
pb_zzzzzzzz77777777: times 8 db -1
|
||||||
|
; 8 x 7: selects source byte 7 for every destination byte (8-byte registers);
; also serves as the upper half of the mask above.
pb_7: times 8 db 7
|
||||||
|
; Zero for bytes 0-3 and 8-11; byte 3 copied to bytes 4-7, byte 11 to bytes 12-15.
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
|
||||||
|
; Within each 4-byte group: zero for the first two bytes, the group's byte 1
; copied to its bytes 2 and 3 (source indices 1, 5, 9, 13).
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
|
||||||
|
|
||||||
section .text align=16
|
section .text align=16
|
||||||
|
|
||||||
%macro PSWAPD_SSE 2
|
%macro PSWAPD_SSE 2
|
||||||
@ -150,3 +157,70 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
|
|||||||
movzx r2d, byte [topq-1]
|
movzx r2d, byte [topq-1]
|
||||||
mov [left_topq], r2d
|
mov [left_topq], r2d
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
|
||||||
|
; ADD_HFYU_LEFT_LOOP is_aligned
;
; Shared body of the add_hfyu_left_prediction functions: writes to dst the
; running byte-wise (wrapping) sum of src, seeded with a carry-in, and returns
; the final running sum in eax. The macro ends in RET, so nothing placed after
; an expansion is reached by fall-through.
;
; %1 = is_aligned: nonzero -> one mova store per iteration;
;                  zero    -> two 8-byte stores (movq + movhps).
; Expected on entry:
;   srcq, dstq, wq  = source pointer, destination pointer, byte count
;   m0              = carry-in in its highest byte (byte mmsize-1)
;   m5              = pshufb mask selecting byte mmsize-1 for every byte
;   m3, m4          = pshufb masks for the 2->4 and 4->8 byte summing steps
;   m6              = pshufb mask for the 8->16 byte step (mmsize == 16 only)
; Modifies srcq, dstq, wq, eax, m0, m1, m2.
;
; The loop always runs at least once and handles mmsize bytes per pass, so src
; is read and dst is written up to the next multiple of mmsize past w.
; NOTE(review): w is an int in the C prototype but wq is used as a full-width
; index here; confirm the upper bits are well-defined on 64-bit targets.
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
|
||||||
|
add srcq, wq ; srcq = end of source
|
||||||
|
add dstq, wq ; dstq = end of destination
|
||||||
|
neg wq ; wq = -w; counts up towards zero, [ptr+wq] walks forward
|
||||||
|
%%.loop:
|
||||||
|
mova m1, [srcq+wq] ; next mmsize source bytes
|
||||||
|
mova m2, m1
|
||||||
|
psllw m1, 8 ; move the low byte of each 16-bit word into its high byte
|
||||||
|
paddb m1, m2 ; odd bytes now hold b[2k]+b[2k+1]; even bytes unchanged
|
||||||
|
mova m2, m1
|
||||||
|
pshufb m1, m3 ; byte 1 of each 4-byte group -> its bytes 2,3; rest zero
|
||||||
|
paddb m1, m2 ; prefix sums within each 4-byte group
|
||||||
|
pshufb m0, m5 ; broadcast the last byte of the previous result (the carry)
|
||||||
|
mova m2, m1
|
||||||
|
pshufb m1, m4 ; byte 3 -> bytes 4-7 (and byte 11 -> bytes 12-15); rest zero
|
||||||
|
paddb m1, m2 ; prefix sums within each 8-byte half
|
||||||
|
%if mmsize == 16
|
||||||
|
mova m2, m1
|
||||||
|
pshufb m1, m6 ; byte 7 -> bytes 8-15; rest zero
|
||||||
|
paddb m1, m2 ; prefix sums across all 16 bytes
|
||||||
|
%endif
|
||||||
|
paddb m0, m1 ; add the carry to every prefix sum -> output bytes
|
||||||
|
%if %1
|
||||||
|
mova [dstq+wq], m0
|
||||||
|
%else
|
||||||
|
; destination not known to be aligned: store low and high 8 bytes separately
movq [dstq+wq], m0
|
||||||
|
movhps [dstq+wq+8], m0
|
||||||
|
%endif
|
||||||
|
add wq, mmsize
|
||||||
|
jl %%.loop ; continue while wq is still negative
|
||||||
|
; Here wq is the number of bytes processed beyond w (0 .. mmsize-1), so
; mmsize-1-wq is the index of the last valid output byte within m0.
mov eax, mmsize-1
|
||||||
|
sub eax, wd
|
||||||
|
movd m1, eax ; shuffle control: byte 0 = that index, remaining bytes 0
|
||||||
|
pshufb m0, m1 ; byte 0 of m0 = last valid output byte
|
||||||
|
; NOTE(review): only the low byte of eax is the running sum; bytes 1-3 are
; copies of output byte 0 of the last vector. Presumably callers use only
; the low byte - verify.
movd eax, m0
|
||||||
|
RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; int ff_add_hfyu_left_prediction(uint8_t *dst, uint8_t *src, int w, int left)
;
; Variant using 8-byte registers (INIT_MMX) together with pshufb.
; dst, src and w are taken in registers; left is read through leftm.
; The low byte of left seeds the running sum; the result is produced by
; ADD_HFYU_LEFT_LOOP, which also performs the return.
|
||||||
|
INIT_MMX
|
||||||
|
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
|
||||||
|
; Secondary entry point: the sse4 variant jumps here when src is not 16-byte
; aligned. Everything below reloads its own masks and carry, so no register
; state other than dstq/srcq/wq is assumed.
.skip_prologue:
|
||||||
|
mova m5, [pb_7 GLOBAL] ; broadcast-byte-7 mask (carry propagation)
|
||||||
|
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] ; 4->8 byte summing mask (first 8 bytes used)
|
||||||
|
mova m3, [pb_zz11zz55zz99zzdd GLOBAL] ; 2->4 byte summing mask (first 8 bytes used)
|
||||||
|
movd m0, leftm
|
||||||
|
psllq m0, 56 ; keep only the low byte of left, placed in byte 7
|
||||||
|
; is_aligned = 1: single mova store per iteration.
; NOTE(review): presumably safe for any dst because the stores are 8-byte
; MMX moves - confirm how mova is defined for INIT_MMX.
ADD_HFYU_LEFT_LOOP 1
|
||||||
|
|
||||||
|
; 16-byte (INIT_XMM) variant of add_hfyu_left_prediction, same arguments as the
; ssse3 variant above: dst, src, w in registers, left read through leftm.
; Dispatch:
;   src not 16-byte aligned -> jump into the 8-byte ssse3 variant
;   dst not 16-byte aligned -> 16-byte loop with split (movq/movhps) stores
;   otherwise               -> 16-byte loop with mova stores
; NOTE(review): the C init code in this commit comments that this is "not
; really sse4, just slow on Conroe"; the name appears to be a CPU-selection
; label rather than an instruction-set requirement - confirm.
INIT_XMM
|
||||||
|
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
|
||||||
|
mova m5, [pb_f GLOBAL] ; broadcast-byte-15 mask (carry propagation)
|
||||||
|
mova m6, [pb_zzzzzzzz77777777 GLOBAL] ; 8->16 byte summing mask; upper 8 bytes come from pb_7
|
||||||
|
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] ; 4->8 byte summing mask
|
||||||
|
mova m3, [pb_zz11zz55zz99zzdd GLOBAL] ; 2->4 byte summing mask
|
||||||
|
movd m0, leftm
|
||||||
|
pslldq m0, 15 ; keep only the low byte of left, placed in byte 15
|
||||||
|
test srcq, 15
|
||||||
|
; Unaligned source: hand over to the 8-byte variant, entering after its
; prologue; that code reloads its own masks and carry.
jnz ff_add_hfyu_left_prediction_ssse3 %+ .skip_prologue
|
||||||
|
test dstq, 15
|
||||||
|
jnz .unaligned
|
||||||
|
; Both pointers 16-byte aligned. The macro ends in RET, so this expansion
; does not fall through into .unaligned.
ADD_HFYU_LEFT_LOOP 1
|
||||||
|
.unaligned:
|
||||||
|
; Aligned source, unaligned destination: split 8-byte stores.
ADD_HFYU_LEFT_LOOP 0
|
||||||
|
|
||||||
|
@ -221,6 +221,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
|
|||||||
CAT_UNDEF arg_name %+ %%i, d
|
CAT_UNDEF arg_name %+ %%i, d
|
||||||
CAT_UNDEF arg_name %+ %%i, w
|
CAT_UNDEF arg_name %+ %%i, w
|
||||||
CAT_UNDEF arg_name %+ %%i, b
|
CAT_UNDEF arg_name %+ %%i, b
|
||||||
|
CAT_UNDEF arg_name %+ %%i, m
|
||||||
CAT_UNDEF arg_name, %%i
|
CAT_UNDEF arg_name, %%i
|
||||||
%assign %%i %%i+1
|
%assign %%i %%i+1
|
||||||
%endrep
|
%endrep
|
||||||
@ -232,6 +233,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
|
|||||||
%xdefine %1d r %+ %%i %+ d
|
%xdefine %1d r %+ %%i %+ d
|
||||||
%xdefine %1w r %+ %%i %+ w
|
%xdefine %1w r %+ %%i %+ w
|
||||||
%xdefine %1b r %+ %%i %+ b
|
%xdefine %1b r %+ %%i %+ b
|
||||||
|
%xdefine %1m r %+ %%i %+ m
|
||||||
CAT_XDEFINE arg_name, %%i, %1
|
CAT_XDEFINE arg_name, %%i, %1
|
||||||
%assign %%i %%i+1
|
%assign %%i %%i+1
|
||||||
%rotate 1
|
%rotate 1
|
||||||
|
Loading…
Reference in New Issue
Block a user