You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	avfilter/x86/vf_spp: Port store_slice to SSE2
This allows to remove an emms_c from the filter. It also gives 25% speedup here (when timing the calls to store_slice using START/STOP_TIMER). Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
		| @@ -31,7 +31,6 @@ | ||||
|  * ported by Clément Bœsch for FFmpeg. | ||||
|  */ | ||||
|  | ||||
| #include "libavutil/emms.h" | ||||
| #include "libavutil/imgutils.h" | ||||
| #include "libavutil/mem.h" | ||||
| #include "libavutil/mem_internal.h" | ||||
| @@ -425,7 +424,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) | ||||
|                 filter(s, out->data[1], in->data[1], out->linesize[1], in->linesize[1], cw,        ch,        qp_table, qp_stride, 0, depth); | ||||
|                 filter(s, out->data[2], in->data[2], out->linesize[2], in->linesize[2], cw,        ch,        qp_table, qp_stride, 0, depth); | ||||
|             } | ||||
|             emms_c(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -18,16 +18,20 @@ | ||||
|  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  */ | ||||
|  | ||||
| #include <stdint.h> | ||||
|  | ||||
| #include "config.h" | ||||
| #include "libavutil/attributes.h" | ||||
| #include "libavutil/cpu.h" | ||||
| #include "libavutil/x86/asm.h" | ||||
| #include "libavutil/x86/cpu.h" | ||||
| #include "libavfilter/vf_spp.h" | ||||
|  | ||||
| #if HAVE_MMX_INLINE | ||||
| static void store_slice_mmx(uint8_t *dst, const int16_t *src, | ||||
|                             int dst_stride, int src_stride, | ||||
|                             int width, int height, int log2_scale, | ||||
|                             const uint8_t dither[8][8]) | ||||
| #if HAVE_SSE2_INLINE | ||||
| static void store_slice_sse2(uint8_t *dst, const int16_t *src, | ||||
|                              int dst_stride, int src_stride, | ||||
|                              int width, int height, int log2_scale, | ||||
|                              const uint8_t dither[8][8]) | ||||
| { | ||||
|     int y; | ||||
|  | ||||
| @@ -35,30 +39,25 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src, | ||||
|         uint8_t *dst1 = dst; | ||||
|         const int16_t *src1 = src; | ||||
|         __asm__ volatile( | ||||
|             "movq (%3), %%mm3           \n" | ||||
|             "movq (%3), %%mm4           \n" | ||||
|             "movd %4, %%mm2             \n" | ||||
|             "pxor %%mm0, %%mm0          \n" | ||||
|             "punpcklbw %%mm0, %%mm3     \n" | ||||
|             "punpckhbw %%mm0, %%mm4     \n" | ||||
|             "psraw %%mm2, %%mm3         \n" | ||||
|             "psraw %%mm2, %%mm4         \n" | ||||
|             "movd %5, %%mm2             \n" | ||||
|             "movq         (%3), %%xmm1  \n" | ||||
|             "movd           %4, %%xmm2  \n" | ||||
|             "pxor       %%xmm0, %%xmm0  \n" | ||||
|             "punpcklbw  %%xmm0, %%xmm1  \n" | ||||
|             "psraw      %%xmm2, %%xmm1  \n" | ||||
|             "movd           %5, %%xmm2  \n" | ||||
|             "1:                         \n" | ||||
|             "movq (%0), %%mm0           \n" | ||||
|             "movq 8(%0), %%mm1          \n" | ||||
|             "paddw %%mm3, %%mm0         \n" | ||||
|             "paddw %%mm4, %%mm1         \n" | ||||
|             "psraw %%mm2, %%mm0         \n" | ||||
|             "psraw %%mm2, %%mm1         \n" | ||||
|             "packuswb %%mm1, %%mm0      \n" | ||||
|             "movq %%mm0, (%1)           \n" | ||||
|             "movdqa       (%0), %%xmm0  \n" | ||||
|             "paddw      %%xmm1, %%xmm0  \n" | ||||
|             "psraw      %%xmm2, %%xmm0  \n" | ||||
|             "packuswb   %%xmm0, %%xmm0  \n" | ||||
|             "movq       %%xmm0, (%1)    \n" | ||||
|             "add $16, %0                \n" | ||||
|             "add $8, %1                 \n" | ||||
|             "cmp %2, %1                 \n" | ||||
|             " jb 1b                     \n" | ||||
|             : "+r" (src1), "+r"(dst1) | ||||
|             : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale) | ||||
|             XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2") | ||||
|         ); | ||||
|         src += src_stride; | ||||
|         dst += dst_stride; | ||||
| @@ -69,11 +68,11 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src, | ||||
|  | ||||
| av_cold void ff_spp_init_x86(SPPContext *s) | ||||
| { | ||||
| #if HAVE_MMX_INLINE | ||||
| #if HAVE_SSE2_INLINE | ||||
|     int cpu_flags = av_get_cpu_flags(); | ||||
|  | ||||
|     if (cpu_flags & AV_CPU_FLAG_MMX) { | ||||
|         s->store_slice = store_slice_mmx; | ||||
|     if (INLINE_SSE2(cpu_flags)) { | ||||
|         s->store_slice = store_slice_sse2; | ||||
|     } | ||||
| #endif | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user