From 144e4b0997ef45a06d3f880e01972352a2ad773c Mon Sep 17 00:00:00 2001
From: Derek Buitenhuis
Date: Thu, 2 Aug 2012 16:09:09 +0000
Subject: [PATCH 01/13] LICENSE: Document all GPL files

Signed-off-by: Derek Buitenhuis
---
 LICENSE | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/LICENSE b/LICENSE
index 24e0e44354..e6d25f299e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -14,6 +14,14 @@ configure to activate them. In this case, Libav's license changes to GPL v2+.
 
 Specifically, the GPL parts of Libav are
 - the X11 grabber in libavdevice/x11grab.c
+- the texi2pod.pl tool
+- the following filters in libavfilter:
+  - vf_blackframe.c
+  - vf_boxblur.c
+  - vf_cropdetect.c
+  - vf_delogo.c
+  - vf_hqdn3d.c
+  - vf_yadif.c
 
 There are a handful of files under other licensing terms, namely:
 

From 81905088a19a3d3913f9e2876de660acf3662911 Mon Sep 17 00:00:00 2001
From: Diego Biurrun
Date: Tue, 31 Jul 2012 13:18:20 +0200
Subject: [PATCH 02/13] x86: h264dsp: K&R formatting cosmetics

---
 libavcodec/x86/h264dsp_mmx.c | 458 ++++++++++++++++++-----------------
 1 file changed, 240 insertions(+), 218 deletions(-)

diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 16de15e66f..5e02a46236 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -25,8 +25,10 @@
 /***********************************/
 /* IDCT */
 
-#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride);
+#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
+void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+                                                       int16_t *block, \
+                                                       int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 10, sse2)
@@ -44,10 +46,10 @@ IDCT_ADD_FUNC(8, 10, avx)
 #endif
 
-#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t *dst, const int *block_offset, \
-     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t *dst, const int *block_offset, \
+     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
 
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
@@ -68,10 +70,11 @@ IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
 #endif
 
-#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
-void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t **dst, const int *block_offset, \
-     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t **dst, const int *block_offset, \
+     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);
+
 IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
 IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
 IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
@@ -80,7 +83,7 @@ IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
 IDCT_ADD_REP_FUNC2(, 8, 10, avx)
 #endif
 
-void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
+void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul);
 void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
 
 /***********************************/
 /* deblocking */
@@ -91,273 +94,292 @@ void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
                                        int bidir, int edges, int step,
                                        int mask_mv0, int mask_mv1, int field);
 
-#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
-                                                                int alpha, int beta, int8_t *tc0);
+#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
+                                                               int stride, \
+                                                               int alpha, \
+                                                               int beta, \
+                                                               int8_t *tc0);
 #define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
-                                                                int alpha, int beta);
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
+                                                               int stride, \
+                                                               int alpha, \
+                                                               int beta);
 
-#define LF_FUNCS(type, depth)\
-LF_FUNC (h, chroma, depth, mmx2)\
-LF_IFUNC(h, chroma_intra, depth, mmx2)\
-LF_FUNC (v, chroma, depth, mmx2)\
-LF_IFUNC(v, chroma_intra, depth, mmx2)\
-LF_FUNC (h, luma, depth, mmx2)\
-LF_IFUNC(h, luma_intra, depth, mmx2)\
-LF_FUNC (h, luma, depth, sse2)\
-LF_IFUNC(h, luma_intra, depth, sse2)\
-LF_FUNC (v, luma, depth, sse2)\
-LF_IFUNC(v, luma_intra, depth, sse2)\
-LF_FUNC (h, chroma, depth, sse2)\
-LF_IFUNC(h, chroma_intra, depth, sse2)\
-LF_FUNC (v, chroma, depth, sse2)\
-LF_IFUNC(v, chroma_intra, depth, sse2)\
-LF_FUNC (h, luma, depth, avx)\
-LF_IFUNC(h, luma_intra, depth, avx)\
-LF_FUNC (v, luma, depth, avx)\
-LF_IFUNC(v, luma_intra, depth, avx)\
-LF_FUNC (h, chroma, depth, avx)\
-LF_IFUNC(h, chroma_intra, depth, avx)\
-LF_FUNC (v, chroma, depth, avx)\
-LF_IFUNC(v, chroma_intra, depth, avx)
+#define LF_FUNCS(type, depth) \
+LF_FUNC(h, chroma, depth, mmx2) \
+LF_IFUNC(h, chroma_intra, depth, mmx2) \
+LF_FUNC(v, chroma, depth, mmx2) \
+LF_IFUNC(v, chroma_intra, depth, mmx2) \
+LF_FUNC(h, luma, depth, mmx2) \
+LF_IFUNC(h, luma_intra, depth, mmx2) \
+LF_FUNC(h, luma, depth, sse2) \
+LF_IFUNC(h, luma_intra, depth, sse2) \
+LF_FUNC(v, luma, depth, sse2) \
+LF_IFUNC(v, luma_intra, depth, sse2) \
+LF_FUNC(h, chroma, depth, sse2) \
+LF_IFUNC(h, chroma_intra, depth, sse2) \
+LF_FUNC(v, chroma, depth, sse2) \
+LF_IFUNC(v, chroma_intra, depth, sse2) \
+LF_FUNC(h, luma, depth, avx) \
+LF_IFUNC(h, luma_intra, depth, avx) \
+LF_FUNC(v, luma, depth, avx) \
+LF_IFUNC(v, luma_intra, depth, avx) \
+LF_FUNC(h, chroma, depth, avx) \
+LF_IFUNC(h, chroma_intra, depth, avx) \
+LF_FUNC(v, chroma, depth, avx) \
+LF_IFUNC(v, chroma_intra, depth, avx)
 
-LF_FUNCS( uint8_t, 8)
+LF_FUNCS(uint8_t, 8)
 LF_FUNCS(uint16_t, 10)
 
 #if ARCH_X86_32
-LF_FUNC (v8, luma, 8, mmx2)
-static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+LF_FUNC(v8, luma, 8, mmx2)
+static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0)
 {
-    if((tc0[0] & tc0[1]) >= 0)
-        ff_deblock_v8_luma_8_mmx2(pix+0, stride, alpha, beta, tc0);
-    if((tc0[2] & tc0[3]) >= 0)
-        ff_deblock_v8_luma_8_mmx2(pix+8, stride, alpha, beta, tc0+2);
+    if ((tc0[0] & tc0[1]) >= 0)
+        ff_deblock_v8_luma_8_mmx2(pix + 0, stride, alpha, beta, tc0);
+    if ((tc0[2] & tc0[3]) >= 0)
+        ff_deblock_v8_luma_8_mmx2(pix + 8, stride, alpha, beta, tc0 + 2);
 }
-LF_IFUNC(v8, luma_intra, 8, mmx2)
-static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+
+LF_IFUNC(v8, luma_intra, 8, mmx2)
+static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride,
+                                           int alpha, int beta)
 {
-    ff_deblock_v8_luma_intra_8_mmx2(pix+0, stride, alpha, beta);
-    ff_deblock_v8_luma_intra_8_mmx2(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmx2(pix + 0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmx2(pix + 8, stride, alpha, beta);
 }
 #endif /* ARCH_X86_32 */
 
-LF_FUNC (v, luma, 10, mmx2)
-LF_IFUNC(v, luma_intra, 10, mmx2)
+LF_FUNC(v, luma, 10, mmx2)
+LF_IFUNC(v, luma_intra, 10, mmx2)
 
 /***********************************/
 /* weighted prediction */
 
-#define H264_WEIGHT(W, OPT) \
-void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
-    int stride, int height, int log2_denom, int weight, int offset);
+#define H264_WEIGHT(W, OPT) \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
+                                      int height, int log2_denom, \
+                                      int weight, int offset);
 
-#define H264_BIWEIGHT(W, OPT) \
-void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
-    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
-    int weights, int offset);
+#define H264_BIWEIGHT(W, OPT) \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
+                                        int stride, int height, \
+                                        int log2_denom, int weightd, \
+                                        int weights, int offset);
 
-#define H264_BIWEIGHT_MMX(W) \
-H264_WEIGHT (W, mmx2) \
-H264_BIWEIGHT(W, mmx2)
+#define H264_BIWEIGHT_MMX(W) \
+    H264_WEIGHT(W, mmx2) \
+    H264_BIWEIGHT(W, mmx2)
 
-#define H264_BIWEIGHT_MMX_SSE(W) \
-H264_BIWEIGHT_MMX(W) \
-H264_WEIGHT (W, sse2) \
-H264_BIWEIGHT (W, sse2) \
-H264_BIWEIGHT (W, ssse3)
+#define H264_BIWEIGHT_MMX_SSE(W) \
+    H264_BIWEIGHT_MMX(W) \
+    H264_WEIGHT(W, sse2) \
+    H264_BIWEIGHT(W, sse2) \
+    H264_BIWEIGHT(W, ssse3)
 
 H264_BIWEIGHT_MMX_SSE(16)
-H264_BIWEIGHT_MMX_SSE( 8)
-H264_BIWEIGHT_MMX    ( 4)
+H264_BIWEIGHT_MMX_SSE(8)
+H264_BIWEIGHT_MMX(4)
 
-#define H264_WEIGHT_10(W, DEPTH, OPT) \
-void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
-    int stride, int height, int log2_denom, int weight, int offset);
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+                                                    int stride, \
+                                                    int height, \
+                                                    int log2_denom, \
+                                                    int weight, \
+                                                    int offset);
 
-#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
-void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
-     int weightd, int weights, int offset);
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+                                                      uint8_t *src, \
+                                                      int stride, \
+                                                      int height, \
+                                                      int log2_denom, \
+                                                      int weightd, \
+                                                      int weights, \
+                                                      int offset);
 
-#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
-H264_WEIGHT_10  (W, DEPTH, sse2) \
-H264_WEIGHT_10  (W, DEPTH, sse4) \
-H264_BIWEIGHT_10(W, DEPTH, sse2) \
-H264_BIWEIGHT_10(W, DEPTH, sse4)
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
+    H264_WEIGHT_10(W, DEPTH, sse2) \
+    H264_WEIGHT_10(W, DEPTH, sse4) \
+    H264_BIWEIGHT_10(W, DEPTH, sse2) \
+    H264_BIWEIGHT_10(W, DEPTH, sse4)
 
 H264_BIWEIGHT_10_SSE(16, 10)
-H264_BIWEIGHT_10_SSE( 8, 10)
-H264_BIWEIGHT_10_SSE( 4, 10)
+H264_BIWEIGHT_10_SSE(8, 10)
+H264_BIWEIGHT_10_SSE(4, 10)
 
-void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
+void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
+                         const int chroma_format_idc)
 {
 #if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
 
-    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
+    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2)
         c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2;
-    }
 
     if (bit_depth == 8) {
-    if (mm_flags & AV_CPU_FLAG_MMX) {
-        c->h264_idct_dc_add =
-        c->h264_idct_add = ff_h264_idct_add_8_mmx;
-        c->h264_idct8_dc_add =
-        c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
+        if (mm_flags & AV_CPU_FLAG_MMX) {
+            c->h264_idct_dc_add =
+            c->h264_idct_add = ff_h264_idct_add_8_mmx;
+            c->h264_idct8_dc_add =
+            c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
 
-        c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
-        c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
-        if (chroma_format_idc == 1)
-            c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
-        c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
-        if (mm_flags & AV_CPU_FLAG_CMOV)
-            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
-
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
-            c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmx2;
-            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
-            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx2;
-            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx2;
+            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
             if (chroma_format_idc == 1)
-                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2;
-            c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2;
+                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
+            if (mm_flags & AV_CPU_FLAG_CMOV)
+                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
 
-            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmx2;
-            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmx2;
-            if (chroma_format_idc == 1) {
-                c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmx2;
-                c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmx2;
-            }
-#if ARCH_X86_32
-            c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmx2;
-            c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmx2;
-            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2;
-            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
-#endif
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
-
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
-
-            if (mm_flags&AV_CPU_FLAG_SSE2) {
-                c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
-
-                c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
-                c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
+            if (mm_flags & AV_CPU_FLAG_MMX2) {
+                c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmx2;
+                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
+                c->h264_idct_add16 = ff_h264_idct_add16_8_mmx2;
+                c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx2;
                 if (chroma_format_idc == 1)
-                    c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
-                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
-                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
+                    c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2;
+                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx2;
 
-                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
-                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
+                c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmx2;
+                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmx2;
+                if (chroma_format_idc == 1) {
+                    c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmx2;
+                    c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmx2;
+                }
+#if ARCH_X86_32
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmx2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmx2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
+#endif /* ARCH_X86_32 */
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmx2;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmx2;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmx2;
 
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmx2;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmx2;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmx2;
+
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+                    c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
+
+                    c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
+                    c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
+                    if (chroma_format_idc == 1)
+                        c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
+                    c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
+                    c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
+
+                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
+                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
+
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
-#endif
-            }
-            if (mm_flags&AV_CPU_FLAG_SSSE3) {
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
-            }
-            if (mm_flags&AV_CPU_FLAG_AVX) {
+                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
+                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+                }
+                if (mm_flags & AV_CPU_FLAG_SSSE3) {
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
+                }
+                if (mm_flags & AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
-#endif
+                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
+                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#endif /* HAVE_ALIGNED_STACK */
+                }
             }
         }
-    }
     } else if (bit_depth == 10) {
-    if (mm_flags & AV_CPU_FLAG_MMX) {
-        if (mm_flags & AV_CPU_FLAG_MMX2) {
+        if (mm_flags & AV_CPU_FLAG_MMX) {
+            if (mm_flags & AV_CPU_FLAG_MMX2) {
 #if ARCH_X86_32
-            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmx2;
-            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmx2;
-            c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmx2;
-            c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmx2;
-            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmx2;
-            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmx2;
-#endif
-            c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2;
-            if (mm_flags&AV_CPU_FLAG_SSE2) {
-                c->h264_idct_add = ff_h264_idct_add_10_sse2;
-                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
+                c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmx2;
+                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2;
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmx2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmx2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmx2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmx2;
+#endif /* ARCH_X86_32 */
+                c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmx2;
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+                    c->h264_idct_add = ff_h264_idct_add_10_sse2;
+                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
 
-                c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
-                if (chroma_format_idc == 1)
-                    c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
-                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;
+                    c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
+                    if (chroma_format_idc == 1)
+                        c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
+                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
 #if HAVE_ALIGNED_STACK
-                c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
-                c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
-#endif
+                    c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
+                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
 
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
 
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
 
-                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
-                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
+                    c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
+                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
-#endif
-            }
-            if (mm_flags&AV_CPU_FLAG_SSE4) {
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
+                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif /* HAVE_ALIGNED_STACK */
+                }
+                if (mm_flags & AV_CPU_FLAG_SSE4) {
+                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
 
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
-            }
+                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
+                }
 #if HAVE_AVX
-            if (mm_flags&AV_CPU_FLAG_AVX) {
-                c->h264_idct_dc_add  =
-                c->h264_idct_add     = ff_h264_idct_add_10_avx;
-                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
+                if (mm_flags & AV_CPU_FLAG_AVX) {
+                    c->h264_idct_dc_add =
+                    c->h264_idct_add = ff_h264_idct_add_10_avx;
+                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
 
-                c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
-                if (chroma_format_idc == 1)
-                    c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
-                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;
+                    c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
+                    if (chroma_format_idc == 1)
+                        c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
+                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
 #if HAVE_ALIGNED_STACK
-                c->h264_idct8_add = ff_h264_idct8_add_10_avx;
-                c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
-#endif
+                    c->h264_idct8_add = ff_h264_idct8_add_10_avx;
+                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
 
-                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
-                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
+                    c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
+                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
-#endif
-            }
+                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
+                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
+#endif /* HAVE_ALIGNED_STACK */
+                }
 #endif /* HAVE_AVX */
+            }
         }
     }
-    }
-#endif
+#endif /* HAVE_YASM */
 }

From 03737412a32c3b7e52cfe661bde7947665898c19 Mon Sep 17 00:00:00 2001
From: Diego Biurrun
Date: Mon, 16 Jul 2012 00:01:00 +0200
Subject: [PATCH 03/13] x86: proresdsp: improve SIGNEXTEND macro comments

---
 libavcodec/x86/proresdsp.asm | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 70fd6862a3..bce36ac1fc 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -405,12 +405,12 @@ cglobal prores_idct_put_10, 4, 4, %1
     RET
 %endmacro
 
-%macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp
-%if cpuflag(sse4)
+%macro SIGNEXTEND 2-3
+%if cpuflag(sse4) ; dstlow, dsthigh
     movhlps %2, %1
     pmovsxwd %1, %1
     pmovsxwd %2, %2
-%else ; sse2
+%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
     pxor %3, %3
     pcmpgtw %3, %1
     mova %2, %1

From 3680b2435101a5de56821718a71c828320d535a0 Mon Sep 17 00:00:00 2001
From: Sean McGovern
Date: Thu, 2 Aug 2012 15:37:28 -0400
Subject: [PATCH 04/13] wmapro: prevent division by zero when sample rate is unspecified

This fixes Bugzilla #327:

Signed-off-by: Kostya Shishkov
---
 libavcodec/wmaprodec.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index 30a43c8585..699c1b7503 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -335,6 +335,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->avctx->sample_rate <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "invalid sample rate\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     s->num_channels = avctx->channels;
 
     if (s->num_channels < 0) {

From 7f92db14f94eed184a4fa725436d0ed44f4327ae Mon Sep 17 00:00:00 2001
From: Kostya Shishkov
Date: Thu, 2 Aug 2012 07:42:44 +0200
Subject: [PATCH 05/13] g723_1: save/restore excitation with offset to store LPC history

The same buffer with saved data is used later in LPC reconstruction,
so it should have some head space for LPC history.
---
 libavcodec/g723_1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index df14a46644..11e90e8123 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -1071,7 +1071,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         vector_ptr = p->excitation + PITCH_MAX;
 
         /* Save the excitation */
-        memcpy(p->audio, vector_ptr, FRAME_LEN * sizeof(*p->audio));
+        memcpy(p->audio + LPC_ORDER, vector_ptr, FRAME_LEN * sizeof(*p->audio));
 
         p->interp_index = comp_interp_index(p, p->pitch_lag[1],
                                             &p->sid_gain, &p->cur_gain);
@@ -1086,7 +1086,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         /* Restore the original excitation */
         memcpy(p->excitation, p->prev_excitation,
                PITCH_MAX * sizeof(*p->excitation));
-        memcpy(vector_ptr, p->audio, FRAME_LEN * sizeof(*vector_ptr));
+        memcpy(vector_ptr, p->audio + LPC_ORDER, FRAME_LEN * sizeof(*vector_ptr));
 
         /* Peform pitch postfiltering */
         if (p->postfilter)

From 8772d2511a4ac45f275eaef2b4b6b1ef132c993b Mon Sep 17 00:00:00 2001
From: Kostya Shishkov
Date: Thu, 2 Aug 2012 07:48:08 +0200
Subject: [PATCH 06/13] g723_1: fix off-by-one error in normalize_bits()

---
 libavcodec/g723_1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 11e90e8123..91f1b86170 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -274,7 +274,7 @@ static int normalize_bits(int num, int width)
     if (num < 0)
         num = ~num;
 
-    return width - av_log2(num);
+    return width - av_log2(num) - 1;
 }
 
 /**

From 8ddadea171fa38563cc1ff9a78d4cd07655c0e1b Mon Sep 17 00:00:00 2001
From: Kostya Shishkov
Date: Thu, 2 Aug 2012 07:50:49 +0200
Subject: [PATCH 07/13] g723_1: make scale_vector() behave like the reference

---
 libavcodec/g723_1.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 91f1b86170..4ad74f4448 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -282,7 +282,8 @@ static int normalize_bits(int num, int width)
  */
 static int scale_vector(int16_t *vector, int length)
 {
-    int bits, scale, max = 0;
+    int bits, max = 0;
+    int64_t scale;
     int i;
 
 
@@ -293,7 +294,7 @@ static int scale_vector(int16_t *vector, int length)
     scale = (bits == 15) ? 0x7FFF : (1 << bits);
 
     for (i = 0; i < length; i++)
-        vector[i] = (vector[i] * scale) >> 4;
+        vector[i] = av_clipl_int32(vector[i] * scale << 1) >> 4;
 
     return bits - 3;
 }

From 802bcdcb2f177b84cdf8e0197338a808fc26fbff Mon Sep 17 00:00:00 2001
From: Kostya Shishkov
Date: Thu, 2 Aug 2012 07:54:16 +0200
Subject: [PATCH 08/13] g723_1: fix upper bound parameter from inverse maximum autocorrelation

---
 libavcodec/g723_1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 4ad74f4448..883c9498a4 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -630,7 +630,10 @@ static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max,
     int i;
 
     pitch_lag = FFMIN(PITCH_MAX - 3, pitch_lag);
-    limit = FFMIN(FRAME_LEN + PITCH_MAX - offset - length, pitch_lag + 3);
+    if (dir > 0)
+        limit = FFMIN(FRAME_LEN + PITCH_MAX - offset - length, pitch_lag + 3);
+    else
+        limit = pitch_lag + 3;
 
     for (i = pitch_lag - 3; i <= limit; i++) {
         ccr = dot_product(buf, buf + dir * i, length, 1);

From 94bfdfd6f05a3ccbf048a3ea70694247c2929053 Mon Sep 17 00:00:00 2001
From: Kostya Shishkov
Date: Thu, 2 Aug 2012 19:15:51 +0200
Subject: [PATCH 09/13] g723_1: increase excitation storage by 4

Fixed codebook mode in 5300 rate may write up to SUBFRAME_LEN + 4
and that is considered normal by the reference decoder. Without that
additional padding it might overwrite first elements of LPC history.
---
 libavcodec/g723_1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 883c9498a4..0b59f81ff6 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -87,7 +87,7 @@ typedef struct g723_1_context {
 
     int16_t prev_lsp[LPC_ORDER];
     int16_t prev_excitation[PITCH_MAX];
-    int16_t excitation[PITCH_MAX + FRAME_LEN];
+    int16_t excitation[PITCH_MAX + FRAME_LEN + 4];
     int16_t synth_mem[LPC_ORDER];
     int16_t fir_mem[LPC_ORDER];
     int iir_mem[LPC_ORDER];

From d3e0766fc00734adbb589eb4c865feb8d26785ab Mon Sep 17 00:00:00 2001
From: Kostya Shishkov
Date: Thu, 2 Aug 2012 19:34:53 +0200
Subject: [PATCH 10/13] g723_1: scale output as supposed for the case with postfilter disabled

---
 libavcodec/g723_1.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 0b59f81ff6..18a5fe316a 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -1012,6 +1012,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     int16_t lpc[SUBFRAMES * LPC_ORDER];
     int16_t acb_vector[SUBFRAME_LEN];
     int16_t *vector_ptr;
+    int16_t *out;
     int bad_frame = 0, i, j, ret;
 
     if (buf_size < frame_size[dec_mode]) {
@@ -1037,6 +1038,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         return ret;
     }
 
+    out = (int16_t *)p->frame.data[0];
+
     if (p->cur_frame_type == ACTIVE_FRAME) {
         if (!bad_frame)
             p->erased_frames = 0;
@@ -1120,7 +1123,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
         memcpy(p->prev_excitation, p->excitation + FRAME_LEN,
                PITCH_MAX * sizeof(*p->excitation));
     } else {
-        memset(p->frame.data[0], 0, FRAME_LEN * 2);
+        memset(out, 0, FRAME_LEN * 2);
         av_log(avctx, AV_LOG_WARNING,
                "G.723.1: Comfort noise generation not supported yet\n");
 
@@ -1138,10 +1141,13 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
                                    0, 1, 1 << 12);
     memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio));
 
-    if (p->postfilter)
+    if (p->postfilter) {
         formant_postfilter(p, lpc, p->audio);
-
-    memcpy(p->frame.data[0], p->audio + LPC_ORDER, FRAME_LEN * 2);
+        memcpy(p->frame.data[0], p->audio + LPC_ORDER, FRAME_LEN * 2);
+    } else { // if output is not postfiltered it should be scaled by 2
+        for (i = 0; i < FRAME_LEN; i++)
+            out[i] = av_clip_int16(p->audio[LPC_ORDER + i] << 1);
+    }
 
     *got_frame_ptr = 1;
     *(AVFrame *)data = p->frame;

From ca844b7be9c69c91113094ef21d720f1ca80db60 Mon Sep 17 00:00:00 2001
From: Diego Biurrun
Date: Wed, 1 Aug 2012 15:31:43 +0200
Subject: [PATCH 11/13] x86: Use consistent 3dnowext function and macro name suffixes

Currently there is a wild mix of 3dn2/3dnow2/3dnowext.  Switching to
"3dnowext", which is a more common name of the CPU flag, as reported
e.g. by the Linux kernel, unifies this.
---
 libavcodec/x86/dsputil_mmx.c    | 14 +++++-----
 libavcodec/x86/fft.c            |  6 ++---
 libavcodec/x86/fft.h            |  6 ++---
 libavcodec/x86/fft_mmx.asm      | 46 ++++++++++++++++-----------------
 libavcodec/x86/fmtconvert.asm   |  6 ++---
 libavcodec/x86/fmtconvert_mmx.c | 10 ++++---
 libavutil/x86/x86inc.asm        |  2 +-
 7 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 827705c003..d26f6126a8 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2358,9 +2358,9 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
 }
 
 #if HAVE_6REGS
-static void vector_fmul_window_3dnow2(float *dst, const float *src0,
-                                      const float *src1, const float *win,
-                                      int len)
+static void vector_fmul_window_3dnowext(float *dst, const float *src0,
+                                        const float *src1, const float *win,
+                                        int len)
 {
     x86_reg i = -len * 4;
     x86_reg j = len * 4 - 8;
@@ -2809,11 +2809,11 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
 #endif
 }
 
-static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
-                                int mm_flags)
+static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
+                                  int mm_flags)
 {
 #if HAVE_6REGS && HAVE_INLINE_ASM
-    c->vector_fmul_window = vector_fmul_window_3dnow2;
+    c->vector_fmul_window = vector_fmul_window_3dnowext;
 #endif
 }
 
@@ -3051,7 +3051,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
         dsputil_init_3dnow(c, avctx, mm_flags);
 
     if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
-        dsputil_init_3dnow2(c, avctx, mm_flags);
+        dsputil_init_3dnowext(c, avctx, mm_flags);
 
     if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
         dsputil_init_sse(c, avctx, mm_flags);
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index f1c1c9d36b..fcde3fa797 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -34,9 +34,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
     }
     if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
         /* 3DNowEx for K7 */
-        s->imdct_calc = ff_imdct_calc_3dnow2;
-        s->imdct_half = ff_imdct_half_3dnow2;
-        s->fft_calc   = ff_fft_calc_3dnow2;
+        s->imdct_calc = ff_imdct_calc_3dnowext;
+        s->imdct_half = ff_imdct_half_3dnowext;
+        s->fft_calc   = ff_fft_calc_3dnowext;
     }
 #endif
     if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 1cefe7a9ee..6e80b95d11 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -25,12 +25,12 @@ void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
 
 void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index ac53296f70..7c0e9de311 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -93,14 +93,14 @@ cextern cos_ %+ i
 
 SECTION_TEXT
 
-%macro T2_3DN 4 ; z0, z1, mem0, mem1
+%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
     mova %1, %3
    
 mova %2, %1
    
 pfadd %1, %4
    
 pfsub %2, %4
 %endmacro
 
-%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
+%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    
 mova %5, %3
    
 pfsub %3, %4
    
 pfadd %5, %4 ; {t6,t5}
@@ -444,13 +444,13 @@ fft16_sse:
     ret
 
 
-%macro FFT48_3DN 0
+%macro FFT48_3DNOW 0
 align 16
 fft4 %+ SUFFIX:
-    T2_3DN   m0, m1, Z(0), Z(1)
+    T2_3DNOW m0, m1, Z(0), Z(1)
     mova     m2, Z(2)
     mova     m3, Z(3)
-    T4_3DN   m0, m1, m2, m3, m4, m5
+    T4_3DNOW m0, m1, m2, m3, m4, m5
     PUNPCK   m0, m1, m4
     PUNPCK   m2, m3, m5
     mova   Z(0), m0
@@ -461,14 +461,14 @@ fft4 %+ SUFFIX:
 
 align 16
 fft8 %+ SUFFIX:
-    T2_3DN   m0, m1, Z(0), Z(1)
+    T2_3DNOW m0, m1, Z(0), Z(1)
    
 mova     m2, Z(2)
    
 mova     m3, Z(3)
-    T4_3DN   m0, m1, m2, m3, m4, m5
+    T4_3DNOW m0, m1, m2, m3, m4, m5
    
 mova   Z(0), m0
    
 mova   Z(2), m2
-    T2_3DN   m4, m5, Z(4), Z(5)
-    T2_3DN   m6, m7, Z2(6), Z2(7)
+    T2_3DNOW m4, m5, Z(4), Z(5)
+    T2_3DNOW m6, m7, Z2(6), Z2(7)
    
 PSWAPD   m0, m5
    
 PSWAPD   m2, m7
    
 pxor     m0, [ps_m1p1]
    
 pxor     m2, [ps_m1p1]
    
 pfadd    m5, m0
    
 pfadd    m7, m2
    
 pfmul    m5, [ps_root2]
    
 pfmul    m7, [ps_root2]
-    T4_3DN   m1, m3, m5, m7, m0, m2
+    T4_3DNOW m1, m3, m5, m7, m0, m2
    
 mova   Z(5), m5
    
 mova  Z2(7), m7
    
 mova     m0, Z(0)
    
 mova     m2, Z(2)
-    T4_3DN   m0, m2, m4, m6, m5, m7
+    T4_3DNOW m0, m2, m4, m6, m5, m7
    
 PUNPCK   m0, m1, m5
    
 PUNPCK   m2, m3, m7
    
 mova   Z(0), m0
@@ -500,7 +500,7 @@ fft8 %+ SUFFIX:
 
 %if ARCH_X86_32
 %macro PSWAPD 2
-%if cpuflag(3dnow2)
+%if cpuflag(3dnowext)
     pswapd %1, %2
 %elifidn %1, %2
     movd [r0+12], %1
@@ -512,11 +512,11 @@ fft8 %+ SUFFIX:
 %endif
 %endmacro
 
-INIT_MMX 3dnow2
-FFT48_3DN
+INIT_MMX 3dnowext
+FFT48_3DNOW
 
 INIT_MMX 3dnow
-FFT48_3DN
+FFT48_3DNOW
 %endif
 
 %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
@@ -633,7 +633,7 @@ cglobal fft_calc, 2,5,8
 %if ARCH_X86_32
 INIT_MMX 3dnow
 FFT_CALC_FUNC
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 FFT_CALC_FUNC
 %endif
 INIT_XMM sse
@@ -727,7 +727,7 @@ cglobal imdct_calc, 3,5,3
 %if ARCH_X86_32
 INIT_MMX 3dnow
 IMDCT_CALC_FUNC
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 IMDCT_CALC_FUNC
 %endif
 
@@ -743,8 +743,8 @@ INIT_MMX 3dnow
 %define unpckhps punpckhdq
 DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
-%define pass_3dnow2 pass_3dnow
-%define pass_interleave_3dnow2 pass_interleave_3dnow
+%define pass_3dnowext pass_3dnow
+%define pass_interleave_3dnowext pass_interleave_3dnow
 %endif
 
 %ifdef PIC
@@ -813,7 +813,7 @@ DECL_FFT 5, _interleave
 INIT_MMX 3dnow
 DECL_FFT 4
 DECL_FFT 4, _interleave
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 DECL_FFT 4
 DECL_FFT 4, _interleave
 %endif
@@ -845,7 +845,7 @@ INIT_XMM sse
     PSWAPD   m5, m3
     pfmul    m2, m3
    
 pfmul    m6, m5
-%if cpuflag(3dnow2)
+%if cpuflag(3dnowext)
    
 pfpnacc  m0, m4
    
 pfpnacc  m2, m6
 %else
@@ -1018,7 +1018,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     xor   r4, r4
     sub   r4, r3
 %endif
-%if notcpuflag(3dnow2) && mmsize == 8
+%if notcpuflag(3dnowext) && mmsize == 8
     movd  m7, [ps_m1m1m1m1]
 %endif
 .pre:
@@ -1102,7 +1102,7 @@ DECL_IMDCT POSROTATESHUF
 
 INIT_MMX 3dnow
 DECL_IMDCT POSROTATESHUF_3DNOW
 
-INIT_MMX 3dnow2
+INIT_MMX 3dnowext
 DECL_IMDCT POSROTATESHUF_3DNOW
 %endif
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 4916e7af33..0fd14fefa3 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -249,7 +249,7 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
 %macro PSWAPD_SSE 2
     pshufw %1, %2, 0x4e
 %endmacro
-%macro PSWAPD_3DN1 2
+%macro PSWAPD_3DNOW 2
     movq %1, %2
     psrlq %1, 32
    
 punpckldq %1, %2
@@ -306,10 +306,10 @@ cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4,
 %define pswapd PSWAPD_SSE
 FLOAT_TO_INT16_INTERLEAVE6 sse
 %define cvtps2pi pf2id
-%define pswapd PSWAPD_3DN1
+%define pswapd PSWAPD_3DNOW
 FLOAT_TO_INT16_INTERLEAVE6 3dnow
 %undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
+FLOAT_TO_INT16_INTERLEAVE6 3dnowext
 %undef cvtps2pi
 
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index aaf634d37f..fbdc5262b9 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -46,7 +46,7 @@ void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long l
 
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
 
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
@@ -74,9 +74,11 @@ FLOAT_TO_INT16_INTERLEAVE(3dnow)
 FLOAT_TO_INT16_INTERLEAVE(sse)
 FLOAT_TO_INT16_INTERLEAVE(sse2)
 
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
+static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
+                                               long len, int channels)
+{
     if(channels==6)
-        ff_float_to_int16_interleave6_3dn2(dst, src, len);
+        ff_float_to_int16_interleave6_3dnowext(dst, src, len);
     else
         float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
@@ -126,7 +128,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
     }
     if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) {
         if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-            c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
+            c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
         }
     }
     if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 7a75951cf6..03e6c0721b 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -557,7 +557,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %assign cpuflags_mmx      (1<<0)
 %assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
 %assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
-%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2

From 8379ea5e9f6bf3d50663ffb655ba5dd6a11652b4 Mon Sep 17 00:00:00 2001
From: Mashiat Sarker Shakkhar
Date: Fri, 3 Aug 2012 20:53:35 +0600
Subject: [PATCH 12/13] vc1dec: Invoke edge_emulation regardless of MV precision

In VC-1 interlaced field pictures, chroma motion vectors can extend
beyond picture boundary even if luma vectors are bounded. The problem
shows up only for hpel interpolated MVs, and may be due to the way
motion vectors are scaled / cropped.

Thanks to Konstantin Shishkov for suggesting the fix. This fixes
long-known segfaults in MC-VC1.ts from videolan streams archive.

Signed-off-by: Kostya Shishkov
---
 libavcodec/vc1dec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index e36cc0dc54..fcb25db0bf 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -1880,8 +1880,8 @@ static void vc1_interp_mc(VC1Context *v)
     }
 
     if (v->rangeredfrm || s->h_edge_pos < 22 || v_edge_pos < 22
-        || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 16 - s->mspel * 3
-        || (unsigned)(src_y - s->mspel) > v_edge_pos - (my & 3) - 16 - s->mspel * 3) {
+        || (unsigned)(src_x - 1) > s->h_edge_pos - (mx & 3) - 16 - 3
+        || (unsigned)(src_y - 1) > v_edge_pos - (my & 3) - 16 - 3) {
         uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize;
 
         srcY -= s->mspel * (1 + s->linesize);

From 9cc74c9f6e8b645e67d45b2070db004caca09af7 Mon Sep 17 00:00:00 2001
From: Mashiat Sarker Shakkhar
Date: Fri, 3 Aug 2012 20:53:36 +0600
Subject: [PATCH 13/13] vc1dec: Remove separate scaling function for interlaced field MVs

The scaling process for obtaining direct MVs from co-located field MVs
is the same for interlaced field and progressive pictures.

Signed-off-by: Kostya Shishkov
---
 libavcodec/vc1dec.c | 30 ++++++++----------------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index fcb25db0bf..cb15dee982 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -1977,20 +1977,6 @@ static av_always_inline int scale_mv(int value, int bfrac, int inv, int qs)
 #endif
 }
 
-static av_always_inline int scale_mv_intfi(int value, int bfrac, int inv,
-                                           int qs, int qs_last)
-{
-    int n = bfrac;
-
-    if (inv)
-        n -= 256;
-    n <<= !qs_last;
-    if (!qs)
-        return (value * n + 255) >> 9;
-    else
-        return (value * n + 128) >> 8;
-}
-
 /** Reconstruct motion vector for B-frame and do motion compensation
  */
 static inline void vc1_b_mc(VC1Context *v, int dmv_x[2], int dmv_y[2],
@@ -2244,14 +2230,14 @@ static inline void vc1_pred_b_mv_intfi(VC1Context *v, int n, int *dmv_x, int *dm
     if (v->bmvtype == BMV_TYPE_DIRECT) {
         int total_opp, k, f;
         if (s->next_picture.f.mb_type[mb_pos + v->mb_off] != MB_TYPE_INTRA) {
-            s->mv[0][0][0] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
-                                            v->bfraction, 0, s->quarter_sample, v->qs_last);
-            s->mv[0][0][1] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
-                                            v->bfraction, 0, s->quarter_sample, v->qs_last);
-            s->mv[1][0][0] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
-                                            v->bfraction, 1, s->quarter_sample, v->qs_last);
-            s->mv[1][0][1] = scale_mv_intfi(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
-                                            v->bfraction, 1, s->quarter_sample, v->qs_last);
+            s->mv[0][0][0] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
+                                      v->bfraction, 0, s->quarter_sample);
+            s->mv[0][0][1] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
+                                      v->bfraction, 0, s->quarter_sample);
+            s->mv[1][0][0] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][0],
+                                      v->bfraction, 1, s->quarter_sample);
+            s->mv[1][0][1] = scale_mv(s->next_picture.f.motion_val[1][s->block_index[0] + v->blocks_off][1],
+                                      v->bfraction, 1, s->quarter_sample);
             total_opp = v->mv_f_next[0][s->block_index[0] + v->blocks_off]
                       + v->mv_f_next[0][s->block_index[1] + v->blocks_off]