	refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2.
(Any x86 other than core2 probably gets much less, since this is mostly
due to ssse3 cachesplit avoidance and I haven't written the full gamut
of other cachesplit modes.)
9-123% faster ape decoding on G4.

Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
		| @@ -648,22 +648,16 @@ static void init_filter(APEContext * ctx, APEFilter *f, int16_t * buf, int order | ||||
|     do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order); | ||||
| } | ||||
|  | ||||
| static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits) | ||||
| static void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits) | ||||
| { | ||||
|     int res; | ||||
|     int absres; | ||||
|  | ||||
|     while (count--) { | ||||
|         /* round fixedpoint scalar product */ | ||||
|         res = (ctx->dsp.scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits; | ||||
|  | ||||
|         if (*data < 0) | ||||
|             ctx->dsp.add_int16(f->coeffs, f->adaptcoeffs - order, order); | ||||
|         else if (*data > 0) | ||||
|             ctx->dsp.sub_int16(f->coeffs, f->adaptcoeffs - order, order); | ||||
|  | ||||
|         res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data)); | ||||
|         res = (res + (1 << (fracbits - 1))) >> fracbits; | ||||
|         res += *data; | ||||
|  | ||||
|         *data++ = res; | ||||
|  | ||||
|         /* Update the output history */ | ||||
|   | ||||
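The hunk above is the APE decoder's do_apply_filter(): the separate scalarproduct_int16() call and the sign-dependent add_int16()/sub_int16() coefficient adaptation are collapsed into one pass over the coefficient array via the new scalarproduct_and_madd_int16(). A minimal C sketch of why the two forms are equivalent, assuming APESIGN(x) yields +1 for negative x, -1 for positive x and 0 for zero (which is what the removed add/sub branches require); all names below are illustrative, not the decoder's own:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Reference semantics of the fused primitive (mirrors the new
     * scalarproduct_and_madd_int16_c further down): accumulate the dot
     * product over v1's pre-update values and fold in the LMS-style
     * adaptation v1[i] += mul * v3[i] in the same pass. */
    static int32_t sp_and_madd(int16_t *v1, const int16_t *v2,
                               const int16_t *v3, int order, int mul)
    {
        int32_t res = 0;
        while (order--) {
            res   += *v1 * *v2++;   /* uses the old coefficient value */
            *v1++ += mul * *v3++;   /* then adapts it */
        }
        return res;
    }

    /* Model of the old call-site logic: dot product first (argument order
     * swapped relative to the fused call, which is harmless since the dot
     * product commutes), then a sign-selected add/sub of the adaptation
     * vector. */
    static int32_t old_style(int16_t *coeffs, const int16_t *delay,
                             const int16_t *adapt, int order, int32_t sample)
    {
        int32_t res = 0;
        for (int i = 0; i < order; i++)
            res += delay[i] * coeffs[i];                            /* scalarproduct_int16 */
        if (sample < 0)
            for (int i = 0; i < order; i++) coeffs[i] += adapt[i];  /* add_int16 */
        else if (sample > 0)
            for (int i = 0; i < order; i++) coeffs[i] -= adapt[i];  /* sub_int16 */
        return res;
    }

    int main(void)
    {
        int16_t c1[16], c2[16], delay[16], adapt[16];
        for (int i = 0; i < 16; i++) {
            c1[i] = c2[i] = (int16_t)(i - 8);
            delay[i]      = (int16_t)(3 * i + 1);
            adapt[i]      = (int16_t)(i & 3);
        }
        int32_t sample = 5;                                  /* positive => mul = -1 */
        int32_t a = sp_and_madd(c1, delay, adapt, 16,
                                (sample < 0) - (sample > 0));
        int32_t b = old_style(c2, delay, adapt, 16, sample);
        printf("fused=%d split=%d coeffs_match=%d\n",
               (int)a, (int)b, !memcmp(c1, c2, sizeof(c1)));
        return 0;
    }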
| @@ -4298,18 +4298,6 @@ void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, i | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void add_int16_c(int16_t * v1, int16_t * v2, int order) | ||||
| { | ||||
|     while (order--) | ||||
|        *v1++ += *v2++; | ||||
| } | ||||
|  | ||||
| static void sub_int16_c(int16_t * v1, int16_t * v2, int order) | ||||
| { | ||||
|     while (order--) | ||||
|         *v1++ -= *v2++; | ||||
| } | ||||
|  | ||||
| static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift) | ||||
| { | ||||
|     int res = 0; | ||||
| @@ -4320,6 +4308,16 @@ static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int | ||||
|     return res; | ||||
| } | ||||
|  | ||||
| static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) | ||||
| { | ||||
|     int res = 0; | ||||
|     while (order--) { | ||||
|         res   += *v1 * *v2++; | ||||
|         *v1++ += mul * *v3++; | ||||
|     } | ||||
|     return res; | ||||
| } | ||||
|  | ||||
| #define W0 2048 | ||||
| #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ | ||||
| #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ | ||||
| @@ -4848,9 +4846,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) | ||||
|     c->vector_clipf = vector_clipf_c; | ||||
|     c->float_to_int16 = ff_float_to_int16_c; | ||||
|     c->float_to_int16_interleave = ff_float_to_int16_interleave_c; | ||||
|     c->add_int16 = add_int16_c; | ||||
|     c->sub_int16 = sub_int16_c; | ||||
|     c->scalarproduct_int16 = scalarproduct_int16_c; | ||||
|     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; | ||||
|     c->scalarproduct_float = scalarproduct_float_c; | ||||
|     c->butterflies_float = butterflies_float_c; | ||||
|     c->vector_fmul_scalar = vector_fmul_scalar_c; | ||||
|   | ||||
| @@ -560,23 +560,19 @@ typedef struct DSPContext { | ||||
|     void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize, | ||||
|            int * range, int * sum,  int edges); | ||||
|  | ||||
|     /* ape functions */ | ||||
|     /** | ||||
|      * Add contents of the second vector to the first one. | ||||
|      * @param len length of vectors, should be multiple of 16 | ||||
|      */ | ||||
|     void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len); | ||||
|     /** | ||||
|      * Add contents of the second vector to the first one. | ||||
|      * @param len length of vectors, should be multiple of 16 | ||||
|      */ | ||||
|     void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len); | ||||
|     /** | ||||
|      * Calculate scalar product of two vectors. | ||||
|      * @param len length of vectors, should be multiple of 16 | ||||
|      * @param shift number of bits to discard from product | ||||
|      */ | ||||
|     int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift); | ||||
|     /* ape functions */ | ||||
|     /** | ||||
|      * Calculate scalar product of v1 and v2, | ||||
|      * and v1[i] += v3[i] * mul | ||||
|      * @param len length of vectors, should be multiple of 16 | ||||
|      */ | ||||
|     int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul); | ||||
|  | ||||
|     /* rv30 functions */ | ||||
|     qpel_mc_func put_rv30_tpel_pixels_tab[4][16]; | ||||
|   | ||||
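For callers, the documented contract above has one subtlety worth spelling out: the returned scalar product is taken over v1's values before the madd (see the C reference in the dsputil.c hunk), and v1 is then left holding v1[i] + v3[i]*mul. A hedged usage sketch against this header (the buffer sizes and the order/mul choice are made up for illustration; per the notes above, v1 should be 16-byte aligned and len a multiple of 16):

    #include "dsputil.h"   /* DSPContext, as declared in the hunk above */

    /* One APE-style filter step: accumulate coeffs . delay while folding in a
     * subtraction of the adaptation vector (mul = -1), 32 taps (a multiple
     * of 16, as required). */
    static int32_t filter_step(DSPContext *dsp,
                               int16_t *coeffs /* 16-byte aligned */,
                               int16_t *delay, int16_t *adapt)
    {
        return dsp->scalarproduct_and_madd_int16(coeffs, delay, adapt, 32, -1);
    }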
| @@ -79,34 +79,6 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, | ||||
|     return u.score[3]; | ||||
| } | ||||
|  | ||||
| static void add_int16_altivec(int16_t * v1, int16_t * v2, int order) | ||||
| { | ||||
|     int i; | ||||
|     register vec_s16 vec, *pv; | ||||
|  | ||||
|     for(i = 0; i < order; i += 8){ | ||||
|         pv = (vec_s16*)v2; | ||||
|         vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2)); | ||||
|         vec_st(vec_add(vec_ld(0, v1), vec), 0, v1); | ||||
|         v1 += 8; | ||||
|         v2 += 8; | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order) | ||||
| { | ||||
|     int i; | ||||
|     register vec_s16 vec, *pv; | ||||
|  | ||||
|     for(i = 0; i < order; i += 8){ | ||||
|         pv = (vec_s16*)v2; | ||||
|         vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2)); | ||||
|         vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1); | ||||
|         v1 += 8; | ||||
|         v2 += 8; | ||||
|     } | ||||
| } | ||||
|  | ||||
| static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift) | ||||
| { | ||||
|     int i; | ||||
| @@ -137,10 +109,44 @@ static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order | ||||
|     return ires; | ||||
| } | ||||
|  | ||||
| static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) | ||||
| { | ||||
|     LOAD_ZERO; | ||||
|     vec_s16 *pv1 = (vec_s16*)v1; | ||||
|     vec_s16 *pv2 = (vec_s16*)v2; | ||||
|     vec_s16 *pv3 = (vec_s16*)v3; | ||||
|     register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul}; | ||||
|     register vec_s16 t0, t1, i0, i1; | ||||
|     register vec_s16 i2 = pv2[0], i3 = pv3[0]; | ||||
|     register vec_s32 res = zero_s32v; | ||||
|     register vec_u8 align = vec_lvsl(0, v2); | ||||
|     int32_t ires; | ||||
|     order >>= 4; | ||||
|     do { | ||||
|         t0 = vec_perm(i2, pv2[1], align); | ||||
|         i2 = pv2[2]; | ||||
|         t1 = vec_perm(pv2[1], i2, align); | ||||
|         i0 = pv1[0]; | ||||
|         i1 = pv1[1]; | ||||
|         res = vec_msum(t0, i0, res); | ||||
|         res = vec_msum(t1, i1, res); | ||||
|         t0 = vec_perm(i3, pv3[1], align); | ||||
|         i3 = pv3[2]; | ||||
|         t1 = vec_perm(pv3[1], i3, align); | ||||
|         pv1[0] = vec_mladd(t0, muls, i0); | ||||
|         pv1[1] = vec_mladd(t1, muls, i1); | ||||
|         pv1 += 2; | ||||
|         pv2 += 2; | ||||
|         pv3 += 2; | ||||
|     } while(--order); | ||||
|     res = vec_splat(vec_sums(res, zero_s32v), 3); | ||||
|     vec_ste(res, 0, &ires); | ||||
|     return ires; | ||||
| } | ||||
|  | ||||
| void int_init_altivec(DSPContext* c, AVCodecContext *avctx) | ||||
| { | ||||
|     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec; | ||||
|     c->add_int16 = add_int16_altivec; | ||||
|     c->sub_int16 = sub_int16_altivec; | ||||
|     c->scalarproduct_int16 = scalarproduct_int16_altivec; | ||||
|     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec; | ||||
| } | ||||
|   | ||||
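The AltiVec version above leans on two fused intrinsics -- vec_msum() for the dot-product accumulation and vec_mladd() for the in-place coefficient update -- and realigns v2/v3 with vec_perm() and a vec_lvsl()-generated permute vector, while v1 is loaded and stored as already aligned. For readers without the AltiVec PIM at hand, a scalar model of what those two intrinsics do per 8-lane vec_s16 register (illustrative C, not FFmpeg code):

    #include <stdint.h>

    /* vec_msum(a, b, acc) with vec_s16 inputs: each of the four 32-bit lanes
     * accumulates two neighbouring 16x16->32 products, i.e. a partial dot
     * product -- which is why the loop can defer the final vec_sums/vec_splat
     * reduction to the very end. */
    static void model_vec_msum(const int16_t a[8], const int16_t b[8],
                               int32_t acc[4])
    {
        for (int j = 0; j < 4; j++)
            acc[j] += a[2 * j] * b[2 * j] + a[2 * j + 1] * b[2 * j + 1];
    }

    /* vec_mladd(a, b, c) with vec_s16 inputs: elementwise a*b + c keeping the
     * low 16 bits -- exactly the wrap-around arithmetic of the int16_t
     * coefficient update v1[i] += mul * v3[i]. */
    static void model_vec_mladd(const int16_t a[8], const int16_t b[8],
                                const int16_t c[8], int16_t r[8])
    {
        for (int i = 0; i < 8; i++)
            r[i] = (int16_t)(a[i] * b[i] + c[i]);
    }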
| @@ -2384,12 +2384,11 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ | ||||
| void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); | ||||
| void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); | ||||
| void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); | ||||
| void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order); | ||||
| void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order); | ||||
| void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order); | ||||
| void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order); | ||||
| int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift); | ||||
| int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift); | ||||
| int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift); | ||||
| int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift); | ||||
| int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul); | ||||
| int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul); | ||||
| int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul); | ||||
| void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); | ||||
| int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); | ||||
| int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); | ||||
| @@ -2951,9 +2950,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | ||||
|         } | ||||
|         if(mm_flags & FF_MM_MMX2){ | ||||
| #if HAVE_YASM | ||||
|             c->add_int16 = ff_add_int16_mmx2; | ||||
|             c->sub_int16 = ff_sub_int16_mmx2; | ||||
|             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; | ||||
|             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; | ||||
| #endif | ||||
|         } | ||||
|         if(mm_flags & FF_MM_SSE){ | ||||
| @@ -2975,11 +2973,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | ||||
|             c->float_to_int16 = float_to_int16_sse2; | ||||
|             c->float_to_int16_interleave = float_to_int16_interleave_sse2; | ||||
| #if HAVE_YASM | ||||
|             c->add_int16 = ff_add_int16_sse2; | ||||
|             c->sub_int16 = ff_sub_int16_sse2; | ||||
|             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | ||||
|             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | ||||
| #endif | ||||
|         } | ||||
|         if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit | ||||
|             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; | ||||
|     } | ||||
|  | ||||
|     if (CONFIG_ENCODERS) | ||||
|   | ||||
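The init hunk above registers the new pointer in increasing priority (C, then MMX2, then SSE2) and, per the trailing "cachesplit" comment, only switches to the SSSE3 version on CPUs that report SSSE3 but neither SSE4.2 nor 3DNow! -- which at the time roughly singles out Core 2, where a misaligned load straddling a cache line is expensive enough to dominate this loop. A stand-alone sketch of that runtime selection (flag values and function-pointer names are placeholders, not libavcodec's; the real code also gates the asm behind HAVE_YASM):

    #include <stdint.h>

    /* Placeholder CPU-feature bits; the real FF_MM_* values live elsewhere. */
    #define MM_MMX2  0x01
    #define MM_SSE2  0x02
    #define MM_SSSE3 0x04
    #define MM_SSE42 0x08
    #define MM_3DNOW 0x10

    typedef int32_t (*sp_madd_fn)(int16_t *v1, int16_t *v2, int16_t *v3,
                                  int order, int mul);

    /* Pick the widest available implementation, but only take the
     * palignr-based SSSE3 one where its cache-split avoidance pays off. */
    static sp_madd_fn select_sp_madd(int flags, sp_madd_fn c_ver,
                                     sp_madd_fn mmx2_ver, sp_madd_fn sse2_ver,
                                     sp_madd_fn ssse3_ver)
    {
        sp_madd_fn fn = c_ver;
        if (flags & MM_MMX2) fn = mmx2_ver;
        if (flags & MM_SSE2) fn = sse2_ver;
        if ((flags & MM_SSSE3) && !(flags & (MM_SSE42 | MM_3DNOW)))
            fn = ssse3_ver;   /* cachesplit: Core 2 class CPUs */
        return fn;
    }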
| @@ -100,43 +100,7 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2 | ||||
|  | ||||
|  | ||||
| %macro SCALARPRODUCT 1 | ||||
| ; void add_int16(int16_t * v1, int16_t * v2, int order) | ||||
| cglobal add_int16_%1, 3,3,2, v1, v2, order | ||||
|     shl orderq, 1 | ||||
|     add v1q, orderq | ||||
|     add v2q, orderq | ||||
|     neg orderq | ||||
| .loop: | ||||
|     movu    m0, [v2q + orderq] | ||||
|     movu    m1, [v2q + orderq + mmsize] | ||||
|     paddw   m0, [v1q + orderq] | ||||
|     paddw   m1, [v1q + orderq + mmsize] | ||||
|     mova    [v1q + orderq], m0 | ||||
|     mova    [v1q + orderq + mmsize], m1 | ||||
|     add     orderq, mmsize*2 | ||||
|     jl .loop | ||||
|     REP_RET | ||||
|  | ||||
| ; void sub_int16(int16_t * v1, int16_t * v2, int order) | ||||
| cglobal sub_int16_%1, 3,3,4, v1, v2, order | ||||
|     shl orderq, 1 | ||||
|     add v1q, orderq | ||||
|     add v2q, orderq | ||||
|     neg orderq | ||||
| .loop: | ||||
|     movu    m2, [v2q + orderq] | ||||
|     movu    m3, [v2q + orderq + mmsize] | ||||
|     mova    m0, [v1q + orderq] | ||||
|     mova    m1, [v1q + orderq + mmsize] | ||||
|     psubw   m0, m2 | ||||
|     psubw   m1, m3 | ||||
|     mova    [v1q + orderq], m0 | ||||
|     mova    [v1q + orderq + mmsize], m1 | ||||
|     add     orderq, mmsize*2 | ||||
|     jl .loop | ||||
|     REP_RET | ||||
|  | ||||
| ; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) | ||||
| ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) | ||||
| cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift | ||||
|     shl orderq, 1 | ||||
|     add v1q, orderq | ||||
| @@ -165,6 +129,51 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift | ||||
|     paddd   m2, m0 | ||||
|     movd   eax, m2 | ||||
|     RET | ||||
|  | ||||
| ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) | ||||
| cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul | ||||
|     shl orderq, 1 | ||||
|     movd    m7, mulm | ||||
| %if mmsize == 16 | ||||
|     pshuflw m7, m7, 0 | ||||
|     punpcklqdq m7, m7 | ||||
| %else | ||||
|     pshufw  m7, m7, 0 | ||||
| %endif | ||||
|     pxor    m6, m6 | ||||
|     add v1q, orderq | ||||
|     add v2q, orderq | ||||
|     add v3q, orderq | ||||
|     neg orderq | ||||
| .loop: | ||||
|     movu    m0, [v2q + orderq] | ||||
|     movu    m1, [v2q + orderq + mmsize] | ||||
|     mova    m4, [v1q + orderq] | ||||
|     mova    m5, [v1q + orderq + mmsize] | ||||
|     movu    m2, [v3q + orderq] | ||||
|     movu    m3, [v3q + orderq + mmsize] | ||||
|     pmaddwd m0, m4 | ||||
|     pmaddwd m1, m5 | ||||
|     pmullw  m2, m7 | ||||
|     pmullw  m3, m7 | ||||
|     paddd   m6, m0 | ||||
|     paddd   m6, m1 | ||||
|     paddw   m2, m4 | ||||
|     paddw   m3, m5 | ||||
|     mova    [v1q + orderq], m2 | ||||
|     mova    [v1q + orderq + mmsize], m3 | ||||
|     add     orderq, mmsize*2 | ||||
|     jl .loop | ||||
| %if mmsize == 16 | ||||
|     movhlps m0, m6 | ||||
|     paddd   m6, m0 | ||||
|     pshuflw m0, m6, 0x4e | ||||
| %else | ||||
|     pshufw  m0, m6, 0x4e | ||||
| %endif | ||||
|     paddd   m6, m0 | ||||
|     movd   eax, m6 | ||||
|     RET | ||||
| %endmacro | ||||
|  | ||||
| INIT_MMX | ||||
| @@ -172,6 +181,87 @@ SCALARPRODUCT mmx2 | ||||
| INIT_XMM | ||||
| SCALARPRODUCT sse2 | ||||
|  | ||||
| %macro SCALARPRODUCT_LOOP 1 | ||||
| align 16 | ||||
| .loop%1: | ||||
|     sub     orderq, mmsize*2 | ||||
| %if %1 | ||||
|     mova    m1, m4 | ||||
|     mova    m4, [v2q + orderq] | ||||
|     mova    m0, [v2q + orderq + mmsize] | ||||
|     palignr m1, m0, %1 | ||||
|     palignr m0, m4, %1 | ||||
|     mova    m3, m5 | ||||
|     mova    m5, [v3q + orderq] | ||||
|     mova    m2, [v3q + orderq + mmsize] | ||||
|     palignr m3, m2, %1 | ||||
|     palignr m2, m5, %1 | ||||
| %else | ||||
|     mova    m0, [v2q + orderq] | ||||
|     mova    m1, [v2q + orderq + mmsize] | ||||
|     mova    m2, [v3q + orderq] | ||||
|     mova    m3, [v3q + orderq + mmsize] | ||||
| %endif | ||||
|     pmaddwd m0, [v1q + orderq] | ||||
|     pmaddwd m1, [v1q + orderq + mmsize] | ||||
|     pmullw  m2, m7 | ||||
|     pmullw  m3, m7 | ||||
|     paddw   m2, [v1q + orderq] | ||||
|     paddw   m3, [v1q + orderq + mmsize] | ||||
|     paddd   m6, m0 | ||||
|     paddd   m6, m1 | ||||
|     mova    [v1q + orderq], m2 | ||||
|     mova    [v1q + orderq + mmsize], m3 | ||||
|     jg .loop%1 | ||||
| %if %1 | ||||
|     jmp .end | ||||
| %endif | ||||
| %endmacro | ||||
|  | ||||
| ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) | ||||
| cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul | ||||
|     shl orderq, 1 | ||||
|     movd    m7, mulm | ||||
|     pshuflw m7, m7, 0 | ||||
|     punpcklqdq m7, m7 | ||||
|     pxor    m6, m6 | ||||
|     mov    r4d, v2d | ||||
|     and    r4d, 15 | ||||
|     and    v2q, ~15 | ||||
|     and    v3q, ~15 | ||||
|     mova    m4, [v2q + orderq] | ||||
|     mova    m5, [v3q + orderq] | ||||
|     ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable) | ||||
|     cmp    r4d, 0 | ||||
|     je .loop0 | ||||
|     cmp    r4d, 2 | ||||
|     je .loop2 | ||||
|     cmp    r4d, 4 | ||||
|     je .loop4 | ||||
|     cmp    r4d, 6 | ||||
|     je .loop6 | ||||
|     cmp    r4d, 8 | ||||
|     je .loop8 | ||||
|     cmp    r4d, 10 | ||||
|     je .loop10 | ||||
|     cmp    r4d, 12 | ||||
|     je .loop12 | ||||
| SCALARPRODUCT_LOOP 14 | ||||
| SCALARPRODUCT_LOOP 12 | ||||
| SCALARPRODUCT_LOOP 10 | ||||
| SCALARPRODUCT_LOOP 8 | ||||
| SCALARPRODUCT_LOOP 6 | ||||
| SCALARPRODUCT_LOOP 4 | ||||
| SCALARPRODUCT_LOOP 2 | ||||
| SCALARPRODUCT_LOOP 0 | ||||
| .end: | ||||
|     movhlps m0, m6 | ||||
|     paddd   m6, m0 | ||||
|     pshuflw m0, m6, 0x4e | ||||
|     paddd   m6, m0 | ||||
|     movd   eax, m6 | ||||
|     RET | ||||
|  | ||||
|  | ||||
|  | ||||
| ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) | ||||
|   | ||||
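The new SSSE3 entry point is where the cachesplit avoidance mentioned in the commit message happens: v2 and v3 are rounded down to 16-byte alignment, every load becomes an aligned mova, and palignr stitches each pair of neighbouring aligned blocks back together by the original misalignment (taken from v2 only, so v2 and v3 are expected to share the same offset, which the APE filter's lockstep buffer pointers apparently satisfy). Since the data is int16_t the offset is always even, hence the eight unrolled loop variants for byte shifts 0..14, reached through the linear cmp/je chain that the in-source comment justifies. A conceptual C model of the realignment trick (not the asm itself; the real loop additionally relies on the surrounding buffer layout so that the aligned accesses just outside the unaligned span are safe):

    #include <stdint.h>
    #include <string.h>

    /* Emulate one unaligned 16-byte load from v2 using only aligned 16-byte
     * accesses plus a byte-wise recombination -- what palignr does one xmm
     * register at a time. */
    static void load16_via_aligned(int16_t dst[8], const int16_t *v2)
    {
        uintptr_t addr  = (uintptr_t)v2;
        size_t    shift = addr & 15;           /* always even for int16_t data */
        const uint8_t *base = (const uint8_t *)(addr & ~(uintptr_t)15);

        uint8_t lo[16], hi[16], out[16];
        memcpy(lo, base,      16);             /* aligned block covering the start */
        memcpy(hi, base + 16, 16);             /* next aligned block */
        for (int i = 0; i < 16; i++)           /* shift the pair right by 'shift' */
            out[i] = (i + shift < 16) ? lo[i + shift] : hi[i + shift - 16];
        memcpy(dst, out, 16);
    }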