You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	Use pextrw for SSE4 mbedge filter result writing, a speedup of 5-10 cycles on
CPUs supporting it. Originally committed as revision 24437 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
		| @@ -247,6 +247,7 @@ DECLARE_LOOP_FILTER(mmx) | |||||||
| DECLARE_LOOP_FILTER(mmxext) | DECLARE_LOOP_FILTER(mmxext) | ||||||
| DECLARE_LOOP_FILTER(sse2) | DECLARE_LOOP_FILTER(sse2) | ||||||
| DECLARE_LOOP_FILTER(ssse3) | DECLARE_LOOP_FILTER(ssse3) | ||||||
|  | DECLARE_LOOP_FILTER(sse4) | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| @@ -379,6 +380,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||||
|  |  | ||||||
|     if (mm_flags & FF_MM_SSE4) { |     if (mm_flags & FF_MM_SSE4) { | ||||||
|         c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4; |         c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4; | ||||||
|  |  | ||||||
|  |         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4; | ||||||
|  |         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse4; | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1932,10 +1932,24 @@ INNER_LOOPFILTER ssse3,  h, 6,  8, 13 | |||||||
|  |  | ||||||
| ; write 4 or 8 words in the mmx/xmm registers as 8 lines | ; write 4 or 8 words in the mmx/xmm registers as 8 lines | ||||||
| ; 1 and 2 are the registers to write, this can be the same (for SSE2) | ; 1 and 2 are the registers to write, this can be the same (for SSE2) | ||||||
|  | ; for pre-SSE4: | ||||||
| ; 3 is a general-purpose register that we will clobber | ; 3 is a general-purpose register that we will clobber | ||||||
|  | ; for SSE4: | ||||||
|  | ; 3 is a pointer to the destination's 5th line | ||||||
| ; 4 is a pointer to the destination's 4th line | ; 4 is a pointer to the destination's 4th line | ||||||
| ; 5 is -stride and +stride | ; 5/6 is -stride and +stride | ||||||
| %macro WRITE_8W 6 | ; 7 is optimization string | ||||||
|  | %macro WRITE_8W 7 | ||||||
|  | %ifidn %7, sse4 | ||||||
|  |     pextrw    [%4+%5*4], %1, 0 | ||||||
|  |     pextrw    [%3+%5*4], %1, 1 | ||||||
|  |     pextrw    [%4+%5*2], %1, 2 | ||||||
|  |     pextrw    [%4+%5  ], %1, 3 | ||||||
|  |     pextrw    [%4     ], %1, 4 | ||||||
|  |     pextrw    [%3     ], %1, 5 | ||||||
|  |     pextrw    [%3+%6  ], %1, 6 | ||||||
|  |     pextrw    [%3+%6*2], %1, 7 | ||||||
|  | %else | ||||||
|     movd             %3, %1 |     movd             %3, %1 | ||||||
| %if mmsize == 8 | %if mmsize == 8 | ||||||
|     punpckhdq        %1, %1 |     punpckhdq        %1, %1 | ||||||
| @@ -1974,6 +1988,7 @@ INNER_LOOPFILTER ssse3,  h, 6,  8, 13 | |||||||
| %if mmsize == 8 | %if mmsize == 8 | ||||||
|     add              %4, %5 |     add              %4, %5 | ||||||
| %endif | %endif | ||||||
|  | %endif | ||||||
| %endmacro | %endmacro | ||||||
|  |  | ||||||
| %macro MBEDGE_LOOPFILTER 5 | %macro MBEDGE_LOOPFILTER 5 | ||||||
| @@ -2509,14 +2524,17 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 | |||||||
| %if mmsize == 8 ; mmx/mmxext (h) | %if mmsize == 8 ; mmx/mmxext (h) | ||||||
|     WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |     WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg | ||||||
|     add         dst_reg, 4 |     add         dst_reg, 4 | ||||||
|     WRITE_8W         m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg |     WRITE_8W         m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4 | ||||||
| %else ; sse2 (h) | %else ; sse2 (h) | ||||||
|     lea        dst8_reg, [dst8_reg+mstride_reg+1] |     lea        dst8_reg, [dst8_reg+mstride_reg+1] | ||||||
|     WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |     WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 | ||||||
|     lea         dst_reg, [dst2_reg+mstride_reg+4] |     lea         dst_reg, [dst2_reg+mstride_reg+4] | ||||||
|     lea        dst8_reg, [dst8_reg+mstride_reg+4] |     lea        dst8_reg, [dst8_reg+mstride_reg+4] | ||||||
|     WRITE_8W         m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg |     WRITE_8W         m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg, %2 | ||||||
|     WRITE_8W         m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg | %ifidn %2, sse4 | ||||||
|  |     lea         dst_reg, [dst8_reg+ stride_reg] | ||||||
|  | %endif | ||||||
|  |     WRITE_8W         m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2 | ||||||
| %endif | %endif | ||||||
| %endif | %endif | ||||||
|  |  | ||||||
| @@ -2574,3 +2592,10 @@ MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 16 | |||||||
| %endif | %endif | ||||||
| MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 16 | MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 16 | ||||||
| MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 16 | MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 16 | ||||||
|  |  | ||||||
|  | %ifdef m8 | ||||||
|  | MBEDGE_LOOPFILTER sse4,   h, 5, 16, 16 | ||||||
|  | %else | ||||||
|  | MBEDGE_LOOPFILTER sse4,   h, 6, 16, 16 | ||||||
|  | %endif | ||||||
|  | MBEDGE_LOOPFILTER sse4,   h, 6,  8, 16 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user