1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

h264pred: added AVX2 implementation for tm_vp8 16x16.

checkasm --bench results with 5000 runs

pred16x16_tm_vp8_c: 302.8
pred16x16_tm_vp8_mmx: 101.4
pred16x16_tm_vp8_mmxext: 95.5
pred16x16_tm_vp8_sse2: 95.1
pred16x16_tm_vp8_avx2: 38.2

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
Mirage Abeysekara 2017-03-19 01:20:53 +05:30 committed by Ronald S. Bultje
parent f3cd2302a9
commit 5eb4f95bef
2 changed files with 44 additions and 0 deletions

View File

@ -268,6 +268,43 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
jg .loop
REP_RET
%if HAVE_AVX2_EXTERNAL
;-----------------------------------------------------------------------------
; pred16x16_tm_vp8_8 (AVX2 variant), arguments: dst, stride
;
; Fills a 16x16 block of 8-bit samples so that every output byte equals
;     saturate_u8(left[y] + top[x] - topleft)
; where top is the 16-byte row directly above the block, left[y] is the byte
; immediately to the left of row y, and topleft is the byte at dst[-stride-1].
;
;   dst    : address of the first byte of the block. Rows are stored with
;            movdqa / vextracti128, so each row is assumed to be 16-byte
;            aligned (the function itself does not check this).
;   stride : byte distance between the starts of consecutive rows.
;
; Register roles:
;   m0        = top[x] - topleft as 16 words, constant across the whole loop
;   m1..m4    = the four rows produced by one loop iteration
;   stride3   = 3 * stride
;   iteration = remaining groups of four rows (4 groups -> 16 rows)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
    sub                       dstq, strideq                 ; dstq -> row above the block
    lea                   stride3q, [strideq*3]
    mov                 iterationd, 4

    ; m0 = zero-extended top row minus the broadcast top-left sample
    vpbroadcastb               xm1, [dstq-1]
    pmovzxbw                    m0, [dstq]
    pmovzxbw                    m1, xm1
    psubw                       m0, m1

.loop:
    ; rows 1 and 2: broadcast each row's left sample, widen it to words,
    ; and add the shared (top - topleft) term
    vpbroadcastb               xm1, [dstq+strideq*1-1]
    vpbroadcastb               xm2, [dstq+strideq*2-1]
    pmovzxbw                    m1, xm1
    pmovzxbw                    m2, xm2
    paddw                       m1, m0
    paddw                       m2, m0

    ; rows 3 and 4: same computation
    vpbroadcastb               xm3, [dstq+stride3q-1]
    vpbroadcastb               xm4, [dstq+strideq*4-1]
    pmovzxbw                    m3, xm3
    pmovzxbw                    m4, xm4
    paddw                       m3, m0
    paddw                       m4, m0

    ; vpackuswb saturates to bytes but works per 128-bit lane, leaving the
    ; two rows interleaved in 8-byte pieces; vpermq q3120 swaps the middle
    ; quadwords so each 128-bit half holds one complete 16-byte row
    vpackuswb                   m1, m1, m2
    vpackuswb                   m3, m3, m4
    vpermq                      m1, m1, q3120
    vpermq                      m3, m3, q3120

    ; every left-sample load of this iteration was issued above, ahead of
    ; the first store
    movdqa        [dstq+strideq*1], xm1
    vextracti128  [dstq+strideq*2], m1, 1
    movdqa       [dstq+stride3q*1], xm3
    vextracti128  [dstq+strideq*4], m3, 1

    lea                       dstq, [dstq+strideq*4]        ; advance by four rows
    dec                 iterationd
    jg .loop
    REP_RET
%endif
;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

View File

@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
}
}
}
if(EXTERNAL_AVX2(cpu_flags)){
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
}
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;