You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	Add x86 assembly for some 10-bit H.264 intra predict functions.
Parts are inspired by the 8-bit H.264 predict code in Libav. Other parts are ported from x264 with relicensing permission from the author. Signed-off-by: Diego Biurrun <diego@biurrun.de>
This commit is contained in:
		
				
					committed by
					
						 Diego Biurrun
						Diego Biurrun
					
				
			
			
				
	
			
			
			
						parent
						
							2c6fb9f032
						
					
				
				
					commit
					a8d44f9dd5
				
			| @@ -16,7 +16,8 @@ YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \ | ||||
|                                           x86/h264_idct_10bit.o         \ | ||||
|                                           x86/h264_weight.o             \ | ||||
|  | ||||
| YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o | ||||
| YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o          \ | ||||
|                                           x86/h264_intrapred_10bit.o | ||||
| MMX-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred_init.o | ||||
|  | ||||
| YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp_yasm.o | ||||
|   | ||||
							
								
								
									
										337
									
								
								libavcodec/x86/h264_intrapred_10bit.asm
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										337
									
								
								libavcodec/x86/h264_intrapred_10bit.asm
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,337 @@ | ||||
| ;***************************************************************************** | ||||
| ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code | ||||
| ;***************************************************************************** | ||||
| ;* Copyright (C) 2005-2011 x264 project | ||||
| ;* | ||||
| ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> | ||||
| ;* | ||||
| ;* This file is part of Libav. | ||||
| ;* | ||||
| ;* Libav is free software; you can redistribute it and/or | ||||
| ;* modify it under the terms of the GNU Lesser General Public | ||||
| ;* License as published by the Free Software Foundation; either | ||||
| ;* version 2.1 of the License, or (at your option) any later version. | ||||
| ;* | ||||
| ;* Libav is distributed in the hope that it will be useful, | ||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
| ;* Lesser General Public License for more details. | ||||
| ;* | ||||
| ;* You should have received a copy of the GNU Lesser General Public | ||||
| ;* License along with Libav; if not, write to the Free Software | ||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
| ;****************************************************************************** | ||||
|  | ||||
| %include "x86inc.asm" | ||||
| %include "x86util.asm" | ||||
|  | ||||
| SECTION_RODATA | ||||
|  | ||||
| SECTION .text | ||||
|  | ||||
| cextern pw_4 | ||||
| cextern pw_1 | ||||
|  | ||||
| ; PRED4x4_LOWPASS dst, a, b, c: dst = pavgw(c, (a + b) >> 1); a (%2) is overwritten with the intermediate (a + b) >> 1 | ||||
| %macro PRED4x4_LOWPASS 4 | ||||
|     paddw       %2, %3 | ||||
|     psrlw       %2, 1 | ||||
|     pavgw       %1, %4, %2 | ||||
| %endmacro | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride): gathers l3..l0, lt and t0..t3 into one vector (each left pixel is the top word of the 8 bytes that end just before its row), low-pass filters it with PRED4x4_LOWPASS, and stores rows 3 down to 0, shifting the result by one pixel (2 bytes) per row; r0 is moved to the row above the block and r1 is reused as a row pointer | ||||
| ;----------------------------------------------------------------------------- | ||||
| %macro PRED4x4_DR 1 | ||||
| cglobal pred4x4_down_right_10_%1, 3,3 | ||||
|     sub       r0, r2 | ||||
|     lea       r1, [r0+r2*2] | ||||
|     movhps    m1, [r1-8] | ||||
|     movhps    m2, [r0+r2*1-8] | ||||
|     movhps    m4, [r0-8] | ||||
|     punpckhwd m2, m4 | ||||
|     movq      m3, [r0] | ||||
|     punpckhdq m1, m2 | ||||
|     PALIGNR   m3, m1, 10, m1 | ||||
|     mova      m1, m3 | ||||
|     movhps    m4, [r1+r2*1-8] | ||||
|     PALIGNR   m3, m4, 14, m4 | ||||
|     mova      m2, m3 | ||||
|     movhps    m4, [r1+r2*2-8] | ||||
|     PALIGNR   m3, m4, 14, m4 | ||||
|     PRED4x4_LOWPASS m0, m3, m1, m2 | ||||
|     movq      [r1+r2*2], m0 | ||||
|     psrldq    m0, 2 | ||||
|     movq      [r1+r2*1], m0 | ||||
|     psrldq    m0, 2 | ||||
|     movq      [r0+r2*2], m0 | ||||
|     psrldq    m0, 2 | ||||
|     movq      [r0+r2*1], m0 | ||||
|     RET | ||||
| %endmacro | ||||
|  | ||||
| INIT_XMM | ||||
| %define PALIGNR PALIGNR_MMX | ||||
| PRED4x4_DR sse2 | ||||
| %define PALIGNR PALIGNR_SSSE3 | ||||
| PRED4x4_DR ssse3 | ||||
| %ifdef HAVE_AVX | ||||
| INIT_AVX | ||||
| PRED4x4_DR avx | ||||
| %endif | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride): row 0 = pavgw of the top row with itself shifted by one pixel (lt shifted in); row 1 = PRED4x4_LOWPASS of the edge vector; rows 2 and 3 = rows 0 and 1 moved up by one pixel with a filtered left-edge value shifted in at pixel 0; r0 is moved to the row above the block and r1 is reused as a row pointer | ||||
| ;----------------------------------------------------------------------------- | ||||
| %macro PRED4x4_VR 1 | ||||
| cglobal pred4x4_vertical_right_10_%1, 3,3,6 | ||||
|     sub     r0, r2 | ||||
|     lea     r1, [r0+r2*2] | ||||
|     movq    m5, [r0]            ; ........t3t2t1t0 | ||||
|     movhps  m1, [r0-8] | ||||
|     PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt | ||||
|     pavgw   m5, m0 | ||||
|     movhps  m1, [r0+r2*1-8] | ||||
|     PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0 | ||||
|     mova    m1, m0 | ||||
|     movhps  m2, [r0+r2*2-8] | ||||
|     PALIGNR m0, m2, 14, m2      ; ..t3t2t1t0ltl0l1 | ||||
|     mova    m2, m0 | ||||
|     movhps  m3, [r1+r2*1-8] | ||||
|     PALIGNR m0, m3, 14, m3      ; t3t2t1t0ltl0l1l2 | ||||
|     PRED4x4_LOWPASS m3, m1, m0, m2 | ||||
|     pslldq  m1, m3, 12 | ||||
|     psrldq  m3, 4 | ||||
|     movq    [r0+r2*1], m5 | ||||
|     movq    [r0+r2*2], m3 | ||||
|     PALIGNR m5, m1, 14, m2 | ||||
|     pslldq  m1, 2 | ||||
|     movq    [r1+r2*1], m5 | ||||
|     PALIGNR m3, m1, 14, m1 | ||||
|     movq    [r1+r2*2], m3 | ||||
|     RET | ||||
| %endmacro | ||||
|  | ||||
| INIT_XMM | ||||
| %define PALIGNR PALIGNR_MMX | ||||
| PRED4x4_VR sse2 | ||||
| %define PALIGNR PALIGNR_SSSE3 | ||||
| PRED4x4_VR ssse3 | ||||
| %ifdef HAVE_AVX | ||||
| INIT_AVX | ||||
| PRED4x4_VR avx | ||||
| %endif | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride): builds the edge vector t2 t1 t0 lt l0 l1 l2 l3, interleaves the pavgw of neighbouring pixels with the PRED4x4_LOWPASS result, and stores the four rows as windows of that interleaved data offset by two pixels (4 bytes) each, row 3 taking the lowest window; r0 is moved to the row above the block and r1 is reused as a row pointer | ||||
| ;----------------------------------------------------------------------------- | ||||
| %macro PRED4x4_HD 1 | ||||
| cglobal pred4x4_horizontal_down_10_%1, 3,3 | ||||
|     sub        r0, r2 | ||||
|     lea        r1, [r0+r2*2] | ||||
|     movq       m0, [r0-8]      ; lt .. | ||||
|     movhps     m0, [r0] | ||||
|     pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. .. | ||||
|     movq       m1, [r1+r2*2-8] ; l3 | ||||
|     movq       m3, [r1+r2*1-8] | ||||
|     punpcklwd  m1, m3          ; l2 l3 | ||||
|     movq       m2, [r0+r2*2-8] ; l1 | ||||
|     movq       m3, [r0+r2*1-8] | ||||
|     punpcklwd  m2, m3          ; l0 l1 | ||||
|     punpckhdq  m1, m2          ; l0 l1 l2 l3 | ||||
|     punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3 | ||||
|     psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1 | ||||
|     psrldq     m2, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2 | ||||
|     pavgw      m5, m1, m2 | ||||
|     PRED4x4_LOWPASS m3, m1, m0, m2 | ||||
|     punpcklwd  m5, m3 | ||||
|     psrldq     m3, 8 | ||||
|     PALIGNR    m3, m5, 12, m4 | ||||
|     movq       [r1+r2*2], m5 | ||||
|     movhps     [r0+r2*2], m5 | ||||
|     psrldq     m5, 4 | ||||
|     movq       [r1+r2*1], m5 | ||||
|     movq       [r0+r2*1], m3 | ||||
|     RET | ||||
| %endmacro | ||||
|  | ||||
| INIT_XMM | ||||
| %define PALIGNR PALIGNR_MMX | ||||
| PRED4x4_HD sse2 | ||||
| %define PALIGNR PALIGNR_SSSE3 | ||||
| PRED4x4_HD ssse3 | ||||
| %ifdef HAVE_AVX | ||||
| INIT_AVX | ||||
| PRED4x4_HD avx | ||||
| %endif | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred4x4_dc(pixel *src, const pixel *topright, int stride): fills all four rows with (l0+l1+l2+l3 + t0+t1+t2+t3 + [pw_4]) >> 3, broadcast from word 0; pw_4 is external, presumably the rounding constant 4 in every word; each left pixel is the top word of the 8 bytes that end just before its row; r1 is reused as a row pointer | ||||
| ;----------------------------------------------------------------------------- | ||||
| %macro HADDD 2 ; sum junk: leaves the sum of all dwords of %1 in its low dword, %2 is scratch; HADDW below first folds adjacent word pairs into dwords via pmaddwd with [pw_1] (external, presumably all-ones words) and then calls this | ||||
| %if mmsize == 16 | ||||
|     movhlps %2, %1 | ||||
|     paddd   %1, %2 | ||||
|     pshuflw %2, %1, 0xE | ||||
|     paddd   %1, %2 | ||||
| %else | ||||
|     pshufw  %2, %1, 0xE | ||||
|     paddd   %1, %2 | ||||
| %endif | ||||
| %endmacro | ||||
|  | ||||
| %macro HADDW 2 | ||||
|     pmaddwd %1, [pw_1] | ||||
|     HADDD   %1, %2 | ||||
| %endmacro | ||||
|  | ||||
| INIT_MMX | ||||
| cglobal pred4x4_dc_10_mmxext, 3,3 | ||||
|     sub    r0, r2 | ||||
|     lea    r1, [r0+r2*2] | ||||
|     movq   m2, [r0+r2*1-8] | ||||
|     paddw  m2, [r0+r2*2-8] | ||||
|     paddw  m2, [r1+r2*1-8] | ||||
|     paddw  m2, [r1+r2*2-8] | ||||
|     psrlq  m2, 48 | ||||
|     movq   m0, [r0] | ||||
|     HADDW  m0, m1 | ||||
|     paddw  m0, [pw_4] | ||||
|     paddw  m0, m2 | ||||
|     psrlw  m0, 3 | ||||
|     SPLATW m0, m0, 0 | ||||
|     movq   [r0+r2*1], m0 | ||||
|     movq   [r0+r2*2], m0 | ||||
|     movq   [r1+r2*1], m0 | ||||
|     movq   [r1+r2*2], m0 | ||||
|     RET | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride): loads t0..t3 from the row above plus four pixels from topright (r1), low-pass filters each pixel with its two neighbours, and stores row n from filtered pixels n+1..n+4; the pxor sequence builds the right-neighbour vector with the last pixel duplicated; r1 only becomes a row pointer after topright has been read | ||||
| ;----------------------------------------------------------------------------- | ||||
| ;TODO: more AVX here | ||||
| %macro PRED4x4_DL 1 | ||||
| cglobal pred4x4_down_left_10_%1, 3,3 | ||||
|     sub        r0, r2 | ||||
|     movq       m1, [r0] | ||||
|     movhps     m1, [r1] | ||||
|     pslldq     m5, m1, 2 | ||||
|     pxor       m2, m5, m1 | ||||
|     psrldq     m2, 2 | ||||
|     pxor       m3, m1, m2 | ||||
|     PRED4x4_LOWPASS m0, m5, m3, m1 | ||||
|     lea        r1, [r0+r2*2] | ||||
|     movhps     [r1+r2*2], m0 | ||||
|     psrldq     m0, 2 | ||||
|     movq       [r0+r2*1], m0 | ||||
|     psrldq     m0, 2 | ||||
|     movq       [r0+r2*2], m0 | ||||
|     psrldq     m0, 2 | ||||
|     movq       [r1+r2*1], m0 | ||||
|     RET | ||||
| %endmacro | ||||
|  | ||||
| INIT_XMM | ||||
| PRED4x4_DL sse2 | ||||
| %ifdef HAVE_AVX | ||||
| INIT_AVX | ||||
| PRED4x4_DL avx | ||||
| %endif | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride): loads 16 bytes from the row above, replaces the upper half with four pixels from topright (r1), then rows 0 and 2 take the pavgw of horizontally adjacent pixels and rows 1 and 3 the PRED4x4_LOWPASS result, with rows 2 and 3 shifted by one pixel (2 bytes); r1 only becomes a row pointer after topright has been read | ||||
| ;----------------------------------------------------------------------------- | ||||
| %macro PRED4x4_VL 1 | ||||
| cglobal pred4x4_vertical_left_10_%1, 3,3 | ||||
|     sub        r0, r2 | ||||
|     movu       m1, [r0] | ||||
|     movhps     m1, [r1] | ||||
|     psrldq     m3, m1, 2 | ||||
|     psrldq     m2, m1, 4 | ||||
|     pavgw      m4, m3, m1 | ||||
|     PRED4x4_LOWPASS m0, m1, m2, m3 | ||||
|     lea        r1, [r0+r2*2] | ||||
|     movq       [r0+r2*1], m4 | ||||
|     movq       [r0+r2*2], m0 | ||||
|     psrldq     m4, 2 | ||||
|     psrldq     m0, 2 | ||||
|     movq       [r1+r2*1], m4 | ||||
|     movq       [r1+r2*2], m0 | ||||
|     RET | ||||
| %endmacro | ||||
|  | ||||
| INIT_XMM | ||||
| PRED4x4_VL sse2 | ||||
| %ifdef HAVE_AVX | ||||
| INIT_AVX | ||||
| PRED4x4_VL avx | ||||
| %endif | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride): gathers l0..l3 (top word of the 8 bytes that end just before each row), interleaves the pavgw and PRED4x4_LOWPASS results for rows 0 and 1 and the first two pixels of row 2, and fills the last two pixels of row 2 and all of row 3 with l3; r0 is moved to the row above the block and r1 is reused as a row pointer | ||||
| ;----------------------------------------------------------------------------- | ||||
| INIT_MMX | ||||
| cglobal pred4x4_horizontal_up_10_mmxext, 3,3 | ||||
|     sub       r0, r2 | ||||
|     lea       r1, [r0+r2*2] | ||||
|     movq      m0, [r0+r2*1-8] | ||||
|     punpckhwd m0, [r0+r2*2-8] | ||||
|     movq      m1, [r1+r2*1-8] | ||||
|     punpckhwd m1, [r1+r2*2-8] | ||||
|     punpckhdq m0, m1 | ||||
|     pshufw    m1, m1, 0xFF | ||||
|     movq      [r1+r2*2], m1 | ||||
|     movd      [r1+r2*1+4], m1 | ||||
|     pshufw    m2, m0, 11111001b | ||||
|     movq      m1, m2 | ||||
|     pavgw     m2, m0 | ||||
|  | ||||
|     pshufw    m5, m0, 11111110b | ||||
|     PRED4x4_LOWPASS m3, m0, m5, m1 | ||||
|     movq      m6, m2 | ||||
|     punpcklwd m6, m3 | ||||
|     movq      [r0+r2*1], m6 | ||||
|     psrlq     m2, 16 | ||||
|     psrlq     m3, 16 | ||||
|     punpcklwd m2, m3 | ||||
|     movq      [r0+r2*2], m2 | ||||
|     psrlq     m2, 32 | ||||
|     movd      [r1+r2*1], m2 | ||||
|     RET | ||||
|  | ||||
|  | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred8x8_vertical(pixel *src, int stride): copies the 16 bytes of the row above src into each of the 8 rows of the block (r0 = src, r1 = stride; three unrolled two-row steps plus a final pair); every access uses mova, so the rows are presumably 16-byte aligned | ||||
| ;----------------------------------------------------------------------------- | ||||
| INIT_XMM | ||||
| cglobal pred8x8_vertical_10_sse2, 2,2 | ||||
|     sub  r0, r1 | ||||
|     mova m0, [r0] | ||||
| %rep 3 | ||||
|     mova [r0+r1*1], m0 | ||||
|     mova [r0+r1*2], m0 | ||||
|     lea  r0, [r0+r1*2] | ||||
| %endrep | ||||
|     mova [r0+r1*1], m0 | ||||
|     mova [r0+r1*2], m0 | ||||
|     RET | ||||
|  | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void pred8x8_horizontal(pixel *src, int stride): r2 counts four loop iterations of two rows each; every row is filled with the pixel just left of it, taken as the top word of the 8 bytes that end before the row and broadcast to all eight words (pshuflw 0xff, then punpcklqdq); stores use mova, so the rows are presumably 16-byte aligned | ||||
| ;----------------------------------------------------------------------------- | ||||
| INIT_XMM | ||||
| cglobal pred8x8_horizontal_10_sse2, 2,3 | ||||
|     mov          r2, 4 | ||||
| .loop: | ||||
|     movq         m0, [r0+r1*0-8] | ||||
|     movq         m1, [r0+r1*1-8] | ||||
|     pshuflw      m0, m0, 0xff | ||||
|     pshuflw      m1, m1, 0xff | ||||
|     punpcklqdq   m0, m0 | ||||
|     punpcklqdq   m1, m1 | ||||
|     mova  [r0+r1*0], m0 | ||||
|     mova  [r0+r1*1], m1 | ||||
|     lea          r0, [r0+r1*2] | ||||
|     dec          r2 | ||||
|     jg .loop | ||||
|     REP_RET | ||||
| @@ -21,6 +21,31 @@ | ||||
| #include "libavutil/cpu.h" | ||||
| #include "libavcodec/h264pred.h" | ||||
|  | ||||
| #define PRED4x4(TYPE, DEPTH, OPT) \ | ||||
| void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, const uint8_t *topright, int stride); | ||||
|  | ||||
| PRED4x4(dc, 10, mmxext) | ||||
| PRED4x4(down_left, 10, sse2) | ||||
| PRED4x4(down_left, 10, avx) | ||||
| PRED4x4(down_right, 10, sse2) | ||||
| PRED4x4(down_right, 10, ssse3) | ||||
| PRED4x4(down_right, 10, avx) | ||||
| PRED4x4(vertical_left, 10, sse2) | ||||
| PRED4x4(vertical_left, 10, avx) | ||||
| PRED4x4(vertical_right, 10, sse2) | ||||
| PRED4x4(vertical_right, 10, ssse3) | ||||
| PRED4x4(vertical_right, 10, avx) | ||||
| PRED4x4(horizontal_up, 10, mmxext) | ||||
| PRED4x4(horizontal_down, 10, sse2) | ||||
| PRED4x4(horizontal_down, 10, ssse3) | ||||
| PRED4x4(horizontal_down, 10, avx) | ||||
|  | ||||
| #define PRED8x8(TYPE, DEPTH, OPT) \ | ||||
| void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); | ||||
|  | ||||
| PRED8x8(vertical, 10, sse2) | ||||
| PRED8x8(horizontal, 10, sse2) | ||||
|  | ||||
| void ff_pred16x16_vertical_mmx     (uint8_t *src, int stride); | ||||
| void ff_pred16x16_vertical_sse     (uint8_t *src, int stride); | ||||
| void ff_pred16x16_horizontal_mmx   (uint8_t *src, int stride); | ||||
| @@ -98,11 +123,8 @@ void ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int s | ||||
| void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth) | ||||
| { | ||||
|     int mm_flags = av_get_cpu_flags(); | ||||
|     const int high_depth = bit_depth > 8; | ||||
|  | ||||
|     if (high_depth) | ||||
|         return; | ||||
|  | ||||
|     if (bit_depth == 8) { | ||||
| #if HAVE_YASM | ||||
|     if (mm_flags & AV_CPU_FLAG_MMX) { | ||||
|         h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_mmx; | ||||
| @@ -226,4 +248,35 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth | ||||
|         } | ||||
|     } | ||||
| #endif | ||||
|     } else if (bit_depth == 10) { | ||||
| #if HAVE_YASM | ||||
|         if (mm_flags & AV_CPU_FLAG_MMX2) { | ||||
|             h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext; | ||||
|             h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext; | ||||
|         } | ||||
|         if (mm_flags & AV_CPU_FLAG_SSE2) { | ||||
|             h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2; | ||||
|             h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2; | ||||
|             h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_sse2; | ||||
|             h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_sse2; | ||||
|             h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_sse2; | ||||
|  | ||||
|             h->pred8x8[VERT_PRED8x8        ] = ff_pred8x8_vertical_10_sse2; | ||||
|             h->pred8x8[HOR_PRED8x8         ] = ff_pred8x8_horizontal_10_sse2; | ||||
|         } | ||||
|         if (mm_flags & AV_CPU_FLAG_SSSE3) { | ||||
|             h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3; | ||||
|             h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_ssse3; | ||||
|             h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_ssse3; | ||||
|         } | ||||
| #if HAVE_AVX | ||||
|         if (mm_flags&AV_CPU_FLAG_AVX) { | ||||
|             h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; | ||||
|             h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; | ||||
|             h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_avx; | ||||
|             h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_avx; | ||||
|         } | ||||
| #endif /* HAVE_AVX */ | ||||
| #endif /* HAVE_YASM */ | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user