You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	libswscale/x86/yuv2rgb: add ssse3 version
Tested using this command: /ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \ -vcodec rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null The fps increase from 389 to 640 on Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz Signed-off-by: Ting Fu <ting.fu@intel.com>
This commit is contained in:
		| @@ -67,6 +67,15 @@ DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL; | ||||
| #include "yuv2rgb_template.c" | ||||
| #endif /* HAVE_MMXEXT */ | ||||
|  | ||||
| //SSSE3 versions | ||||
| #if HAVE_SSSE3 | ||||
| #undef RENAME | ||||
| #undef COMPILE_TEMPLATE_MMXEXT | ||||
| #define COMPILE_TEMPLATE_MMXEXT 0 | ||||
| #define RENAME(a) a ## _ssse3 | ||||
| #include "yuv2rgb_template.c" | ||||
| #endif | ||||
|  | ||||
| #endif /* HAVE_X86ASM */ | ||||
|  | ||||
| av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) | ||||
| @@ -74,6 +83,35 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) | ||||
| #if HAVE_X86ASM | ||||
|     int cpu_flags = av_get_cpu_flags(); | ||||
|  | ||||
|     if (EXTERNAL_SSSE3(cpu_flags)) { | ||||
|         switch (c->dstFormat) { | ||||
|         case AV_PIX_FMT_RGB32: | ||||
|             if (c->srcFormat == AV_PIX_FMT_YUVA420P) { | ||||
| #if CONFIG_SWSCALE_ALPHA | ||||
|                 return yuva420_rgb32_ssse3; | ||||
| #endif | ||||
|                 break; | ||||
|             } else | ||||
|                 return yuv420_rgb32_ssse3; | ||||
|         case AV_PIX_FMT_BGR32: | ||||
|             if (c->srcFormat == AV_PIX_FMT_YUVA420P) { | ||||
| #if CONFIG_SWSCALE_ALPHA | ||||
|                 return yuva420_bgr32_ssse3; | ||||
| #endif | ||||
|                 break; | ||||
|             } else | ||||
|                 return yuv420_bgr32_ssse3; | ||||
|         case AV_PIX_FMT_RGB24: | ||||
|             return yuv420_rgb24_ssse3; | ||||
|         case AV_PIX_FMT_BGR24: | ||||
|             return yuv420_bgr24_ssse3; | ||||
|         case AV_PIX_FMT_RGB565: | ||||
|             return yuv420_rgb16_ssse3; | ||||
|         case AV_PIX_FMT_RGB555: | ||||
|             return yuv420_rgb15_ssse3; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if (EXTERNAL_MMXEXT(cpu_flags)) { | ||||
|         switch (c->dstFormat) { | ||||
|         case AV_PIX_FMT_RGB24: | ||||
|   | ||||
| @@ -25,11 +25,18 @@ | ||||
|  | ||||
| SECTION_RODATA | ||||
|  | ||||
| pw_00ff: times 4 dw 255 | ||||
| pb_f8:   times 8 db 248 | ||||
| pb_e0:   times 8 db 224 | ||||
| pb_03:   times 8 db 3 | ||||
| pb_07:   times 8 db 7 | ||||
| ; below variables are named like mask_dwXY, which means to preserve dword No.X & No.Y | ||||
| mask_dw036 : db -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0 | ||||
| mask_dw147 : db  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1 | ||||
| mask_dw25  : db  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0 | ||||
| rgb24_shuf1: db  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5, 10, 11 | ||||
| rgb24_shuf2: db 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15,  4,  5 | ||||
| rgb24_shuf3: db  4,  5, 10, 11,  0,  1,  6,  7, 12, 13,  2,  3,  8,  9, 14, 15 | ||||
| pw_00ff: times 8 dw 255 | ||||
| pb_f8:   times 16 db 248 | ||||
| pb_e0:   times 16 db 224 | ||||
| pb_03:   times 16 db 3 | ||||
| pb_07:   times 16 db 7 | ||||
|  | ||||
| mask_1101: dw -1, -1,  0, -1 | ||||
| mask_0010: dw  0,  0, -1,  0 | ||||
| @@ -49,7 +56,11 @@ SECTION .text | ||||
| ;----------------------------------------------------------------------------- | ||||
|  | ||||
| %macro MOV_H2L 1 | ||||
| psrlq %1, 32 | ||||
| %if mmsize == 8 | ||||
|     psrlq %1, 32 | ||||
| %else ; mmsize == 16 | ||||
|     psrldq %1, 8 | ||||
| %endif | ||||
| %endmacro | ||||
|  | ||||
| %macro yuv2rgb_fn 3 | ||||
| @@ -77,6 +88,7 @@ psrlq %1, 32 | ||||
| %define m_blue m1 | ||||
| %endif | ||||
|  | ||||
| %if mmsize == 8 | ||||
| %define time_num 1 | ||||
| %define reg_num 8 | ||||
| %define y_offset [pointer_c_ditherq + 8  * 8] | ||||
| @@ -87,11 +99,45 @@ psrlq %1, 32 | ||||
| %define y_coff   [pointer_c_ditherq + 3  * 8] | ||||
| %define ub_coff  [pointer_c_ditherq + 5  * 8] | ||||
| %define vr_coff  [pointer_c_ditherq + 4  * 8] | ||||
| %elif mmsize == 16 | ||||
| %define time_num 2 | ||||
| %if ARCH_X86_32 | ||||
| %define reg_num 8 | ||||
| %define my_offset [pointer_c_ditherq + 8  * 8] | ||||
| %define mu_offset [pointer_c_ditherq + 9  * 8] | ||||
| %define mv_offset [pointer_c_ditherq + 10 * 8] | ||||
| %define mug_coff  [pointer_c_ditherq + 7  * 8] | ||||
| %define mvg_coff  [pointer_c_ditherq + 6  * 8] | ||||
| %define my_coff   [pointer_c_ditherq + 3  * 8] | ||||
| %define mub_coff  [pointer_c_ditherq + 5  * 8] | ||||
| %define mvr_coff  [pointer_c_ditherq + 4  * 8] | ||||
| %else ; ARCH_X86_64 | ||||
| %define reg_num 16 | ||||
| %define y_offset m8 | ||||
| %define u_offset m9 | ||||
| %define v_offset m10 | ||||
| %define ug_coff  m11 | ||||
| %define vg_coff  m12 | ||||
| %define y_coff   m13 | ||||
| %define ub_coff  m14 | ||||
| %define vr_coff  m15 | ||||
| %endif ; ARCH_X86_32/64 | ||||
| %endif ; coeff define mmsize == 8/16 | ||||
|  | ||||
| cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters | ||||
|  | ||||
| %if ARCH_X86_64 | ||||
|     movsxd indexq, indexd | ||||
| %if mmsize == 16 | ||||
|     VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8] | ||||
|     VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8] | ||||
|     VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8] | ||||
|     VBROADCASTSD ug_coff,  [pointer_c_ditherq + 7  * 8] | ||||
|     VBROADCASTSD vg_coff,  [pointer_c_ditherq + 6  * 8] | ||||
|     VBROADCASTSD y_coff,   [pointer_c_ditherq + 3  * 8] | ||||
|     VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8] | ||||
|     VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8] | ||||
| %endif | ||||
| %endif | ||||
|     mova m_y, [py_2indexq + 2 * indexq] | ||||
|     movh m_u, [pu_indexq  +     indexq] | ||||
| @@ -108,10 +154,30 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters | ||||
|     psllw m1, 3 | ||||
|     psllw m6, 3 | ||||
|     psllw m7, 3 | ||||
| %if (ARCH_X86_32 && mmsize == 16) | ||||
|     VBROADCASTSD m2, mu_offset | ||||
|     VBROADCASTSD m3, mv_offset | ||||
|     VBROADCASTSD m4, my_offset | ||||
|     psubsw m0, m2 ; U = U - 128 | ||||
|     psubsw m1, m3 ; V = V - 128 | ||||
|     psubw  m6, m4 | ||||
|     psubw  m7, m4 | ||||
|     VBROADCASTSD m2, mug_coff | ||||
|     VBROADCASTSD m3, mvg_coff | ||||
|     VBROADCASTSD m4, my_coff | ||||
|     VBROADCASTSD m5, mub_coff | ||||
|     pmulhw m2, m0 | ||||
|     pmulhw m3, m1 | ||||
|     pmulhw m6, m4 | ||||
|     pmulhw m7, m4 | ||||
|     pmulhw m0, m5 | ||||
|     VBROADCASTSD m4, mvr_coff | ||||
|     pmulhw m1, m4 | ||||
| %else ; ARCH_X86_64 || mmsize == 8 | ||||
|     psubsw m0, u_offset ; U = U - 128 | ||||
|     psubsw m1, v_offset ; V = V - 128 | ||||
|     psubw m6, y_offset | ||||
|     psubw m7, y_offset | ||||
|     psubw  m6, y_offset | ||||
|     psubw  m7, y_offset | ||||
|     mova m2, m0 | ||||
|     mova m3, m1 | ||||
|     pmulhw m2, ug_coff | ||||
| @@ -120,6 +186,7 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters | ||||
|     pmulhw m7, y_coff | ||||
|     pmulhw m0, ub_coff | ||||
|     pmulhw m1, vr_coff | ||||
| %endif | ||||
|     paddsw m2, m3 | ||||
|     mova m3, m7 | ||||
|     mova m5, m7 | ||||
| @@ -142,6 +209,7 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters | ||||
|     punpcklbw m6, m_red  ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ... | ||||
|     mova m5, m3 | ||||
|     punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ... | ||||
| %if  mmsize == 8 | ||||
|     punpcklwd m3 ,m6     ; R0 G0 B0 R1 R2 G2 B2 R3 | ||||
|     punpckhwd m5, m6     ; R4 G4 B4 R5 R6 G6 B6 R7 | ||||
| %if cpuflag(mmxext) | ||||
| @@ -177,7 +245,33 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters | ||||
|     psrlq m5, 32 | ||||
|     movd [imageq + 20], m2 ; -- -- G7 B7 | ||||
|     movd [imageq + 18], m5 ; R6 G6 B6 R7 | ||||
| %endif | ||||
| %endif ; mmsize = 8 | ||||
| %else ; mmsize == 16 | ||||
|     pshufb m3, [rgb24_shuf1] ; r0  g0  r6  g6  r12 g12 r2  g2  r8  g8  r14 g14 r4  g4  r10 g10 | ||||
|     pshufb m6, [rgb24_shuf2] ; b10 r11 b0  r1  b6  r7  b12 r13 b2  r3  b8  r9  b14 r15 b4  r5 | ||||
|     pshufb m2, [rgb24_shuf3] ; g5  b5  g11 b11 g1  b1  g7  b7  g13 b13 g3  b3  g9  b9  g15 b15 | ||||
|     mova   m7, [mask_dw036] | ||||
|     mova   m4, [mask_dw147] | ||||
|     mova   m5, [mask_dw25] | ||||
|     pand   m0, m7, m3      ; r0  g0  --- --- --- --- r2  g2  --- --- --- --- r4  g4  --- --- | ||||
|     pand   m1, m4, m6      ; --- --- b0  r1  --- --- --- --- b2  r3  --- --- --- --- b4  r5 | ||||
|     por    m0, m1 | ||||
|     pand   m1, m5, m2      ; --- --- --- --- g1  b1  --- --- --- --- g3  b3  --- --- --- --- | ||||
|     por    m0, m1          ; r0  g0  b0  r1  g1  b1  r2  g2  b2  r3  g3  b3  r4  g4  b4  r5 | ||||
|     pand   m1, m7, m2      ; g5  b5  --- --- --- --- g7  b7  --- --- --- --- g9  b9  --- --- | ||||
|     pand   m7, m6          ; b10 r11 --- --- --- --- b12 r13 --- --- --- --- b14 r15 --- --- | ||||
|     pand   m6, m5          ; --- --- --- --- b6  r7  --- --- --- --- b8  r9  --- --- --- --- | ||||
|     por    m1, m6 | ||||
|     pand   m6, m4, m3      ; --- --- r6  g6  --- --- --- --- r8  g8  --- --- --- --- r10 g10 | ||||
|     pand   m2, m4          ; --- --- g11 b11 --- --- --- --- g13 b13 --- --- --- --- g15 b15 | ||||
|     pand   m3, m5          ; --- --- --- --- r12 g12 --- --- --- --- r14 g14 --- --- --- --- | ||||
|     por    m2, m7 | ||||
|     por    m1, m6          ; g5  b5  r6  g6  b6  r7  g7  b7  r8  g8  b8  r9  g9  b9  r10 g10 | ||||
|     por    m2, m3          ; b10 r11 g11 b11 r12 g12 b12 r13 g13 b13 r14 g14 b14 r15 g15 b15 | ||||
|     mova [imageq], m0 | ||||
|     mova [imageq + 16], m1 | ||||
|     mova [imageq + 32], m2 | ||||
| %endif ; mmsize = 16 | ||||
| %else ; PACK RGB15/16/32 | ||||
|     packuswb m0, m1 | ||||
|     packuswb m3, m5 | ||||
| @@ -196,10 +290,10 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters | ||||
| %endif | ||||
|     mova m5, m_blue | ||||
|     mova m6, m_red | ||||
|     punpckhbw m5, m_green | ||||
|     punpckhbw m5,     m_green | ||||
|     punpcklbw m_blue, m_green | ||||
|     punpckhbw m6, m_alpha | ||||
|     punpcklbw m_red, m_alpha | ||||
|     punpckhbw m6,     m_alpha | ||||
|     punpcklbw m_red,  m_alpha | ||||
|     mova m_green, m_blue | ||||
|     mova m_alpha, m5 | ||||
|     punpcklwd m_blue, m_red | ||||
| @@ -207,14 +301,23 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters | ||||
|     punpcklwd m5, m6 | ||||
|     punpckhwd m_alpha, m6 | ||||
|     mova [imageq + 0], m_blue | ||||
|     mova [imageq + 8 * time_num], m_green | ||||
|     mova [imageq + 8  * time_num], m_green | ||||
|     mova [imageq + 16 * time_num], m5 | ||||
|     mova [imageq + 24 * time_num], m_alpha | ||||
| %else ; PACK RGB15/16 | ||||
| %define depth 2 | ||||
| %define blue_dither  [pointer_c_ditherq + 2 * 8] | ||||
| %define green_dither [pointer_c_ditherq + 1 * 8] | ||||
| %define red_dither   [pointer_c_ditherq + 0 * 8] | ||||
| %if cpuflag(ssse3) | ||||
|     %define red_dither m3 | ||||
|     %define green_dither m4 | ||||
|     %define blue_dither m5 | ||||
|     VBROADCASTSD red_dither,   [pointer_c_ditherq + 0 * 8] | ||||
|     VBROADCASTSD green_dither, [pointer_c_ditherq + 1 * 8] | ||||
|     VBROADCASTSD blue_dither,  [pointer_c_ditherq + 2 * 8] | ||||
| %else ; cpuflag(mmx/mmxext) | ||||
| %define blue_dither  [pointer_c_ditherq + 2  * 8] | ||||
| %define green_dither [pointer_c_ditherq + 1  * 8] | ||||
| %define red_dither   [pointer_c_ditherq + 0  * 8] | ||||
| %endif | ||||
| %if %3 == 15 | ||||
| %define gmask pb_03 | ||||
| %define isRGB15 1 | ||||
| @@ -268,3 +371,13 @@ yuv2rgb_fn yuv,  rgb, 16 | ||||
| INIT_MMX mmxext | ||||
| yuv2rgb_fn yuv, rgb, 24 | ||||
| yuv2rgb_fn yuv, bgr, 24 | ||||
|  | ||||
| INIT_XMM ssse3 | ||||
| yuv2rgb_fn yuv,  rgb, 24 | ||||
| yuv2rgb_fn yuv,  bgr, 24 | ||||
| yuv2rgb_fn yuv,  rgb, 32 | ||||
| yuv2rgb_fn yuv,  bgr, 32 | ||||
| yuv2rgb_fn yuva, rgb, 32 | ||||
| yuv2rgb_fn yuva, bgr, 32 | ||||
| yuv2rgb_fn yuv,  rgb, 15 | ||||
| yuv2rgb_fn yuv,  rgb, 16 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user