; NOTE(review): the text below is a unified git diff (a patch), not assembler
; source. Its line breaks appear to have been collapsed, so the hunks probably
; cannot be applied as-is -- TODO confirm against the original patch.
;
; What the visible hunks change (read off the '-' / '+' fragments):
;
; libavcodec/x86/qpeldsp.asm
;   * pw_15 and pw_20 grow from "times 4 dw" to "times 8 dw".
;   * In the macro that ends just before MPEG4_QPEL16_V_LOWPASS (its name is
;     outside the hunk), the third OP_MOV operand changes from m7 to m4.
;   * MPEG4_QPEL16_V_LOWPASS:
;       - the third numeric cglobal argument changes from 0 to 7 (presumably
;         the vector register count in x86inc -- confirm);
;       - the zeroed register used by punpcklbw/punpckhbw is m4 instead of m7;
;       - the row load becomes "movu m0, [r1]" followed by "mova m1, m0";
;       - under "%if mmsize == 8" the four stores at r5, r5+0x88, r5+0x110 and
;         r5+0x198 are kept; otherwise two stores at r5 and r5+0x110 are used;
;       - r5 advances by mmsize per row, the .loopv count is 16/(mmsize/2),
;         r1 advances by mmsize/2 and r5 by 17*mmsize per .loopv iteration;
;       - all QPEL_V_LOW operands are expressed as multiples of mmsize.
;   * MPEG4_QPEL8_V_LOWPASS:
;       - same cglobal change (0 -> 7); the zeroed register is m2;
;       - the row load is "movq m0, [r1]"; only the mmsize == 8 branch also
;         unpacks the high half into m1 and stores it at r5+0x48;
;       - R5 is %defined as r5 when mmsize == 8 and as rsp otherwise;
;       - the .loopv label and its "add r5, 0x48 / lea r0, [r1+4] / dec / jne"
;         tail are only assembled when mmsize == 8.
;   * After "INIT_XMM sse2" both macros are instantiated for put, avg and
;     put_no_rnd, with PW_ROUND/OP_MOV set to pw_16/PUT_OPH, pw_16/AVG_OPH and
;     pw_15/PUT_OPH respectively.
;
; libavcodec/x86/qpeldsp_init.c
;   * Four QPEL3(...) lines are added for QPEL_V and QPEL_HV at sizes 8 and 16,
;     mixing ssse3 / sse2 / mmxext arguments (the meaning of each argument is
;     defined by QPEL3, which is outside the hunk).
;   * New helper macros SET_QPEL_FUNC, SET_QPEL_FUNCS3, SET_V_QPEL_FUNCS and
;     SET_HV_QPEL_FUNCS are added. SET_QPEL_FUNC assigns
;     c->OP_qpel_pixels_tab[SIZE == 8][X+4*Y].
;   * ff_qpeldsp_init_x86() gains calls to SET_V_QPEL_FUNCS and
;     SET_HV_QPEL_FUNCS for sizes 16 and 8 with the sse2 suffix, placed after
;     the existing ff_avg_pixels16x16_sse2 assignment.
;
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm index fd97b71134..d6c8778151 100644 --- a/libavcodec/x86/qpeldsp.asm +++ b/libavcodec/x86/qpeldsp.asm @@ -26,9 +26,9 @@ SECTION_RODATA cextern pw_3 -pw_15: times 4 dw 15 +pw_15: times 8 dw 15 cextern pw_16 -pw_20: times 4 dw 20 +pw_20: times 8 dw 20 SECTION .text @@ -396,68 +396,75 @@ MPEG4_QPEL8_H_LOWPASS put_no_rnd paddw m5, m4 psraw m5, 5 packuswb m5, m5 - OP_MOV %5, m5, m7 + OP_MOV %5, m5, m4 SWAP 0,1,2,3 %endmacro %macro MPEG4_QPEL16_V_LOWPASS 1 -cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544 +cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 7, 544 mov r4d, 17 mov r5, rsp - pxor m7, m7 + pxor m4, m4 .looph: - mova m0, [r1] - mova m1, [r1] + movu m0, [r1] + mova m1, m0 +%if mmsize == 8 mova m2, [r1+8] mova m3, [r1+8] - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 + punpcklbw m0, m4 + punpckhbw m1, m4 + punpcklbw m2, m4 + punpckhbw m3, m4 mova [r5], m0 mova [r5+0x88], m1 mova [r5+0x110], m2 mova [r5+0x198], m3 - add r5, 8 +%else + punpcklbw m0, m4 + punpckhbw m1, m4 + mova [r5], m0 + mova [r5+0x110], m1 +%endif add r1, r3 + add r5, mmsize dec r4d jne .looph - mov r4d, 4 + mov r4d, 16/(mmsize/2) mov r1, r0 mov r5, rsp .loopv: - mova m0, [r5+ 0x0] - mova m1, [r5+ 0x8] - mova m2, [r5+0x10] - mova m3, [r5+0x18] - add r1, 4 - QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] - QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] + mova m0, [r5+0 * mmsize] + mova m1, [r5+1 * mmsize] + mova m2, [r5+2 * mmsize] + mova m3, [r5+3 * mmsize] + add r1, mmsize/2 + QPEL_V_LOW [r5+2*mmsize], [r5+1*mmsize], [r5+0*mmsize], [r5+4*mmsize], [r0] + QPEL_V_LOW [r5+1*mmsize], [r5+0*mmsize], [r5+0*mmsize], [r5+5*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] + QPEL_V_LOW [r5+0*mmsize], [r5+0*mmsize], [r5+1*mmsize], [r5+6*mmsize], [r0] + QPEL_V_LOW [r5+0*mmsize], 
[r5+1*mmsize], [r5+2*mmsize], [r5+7*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] - QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2] + QPEL_V_LOW [r5+1*mmsize], [r5+2*mmsize], [r5+3*mmsize], [r5+8*mmsize], [r0] + QPEL_V_LOW [r5+2*mmsize], [r5+3*mmsize], [r5+4*mmsize], [r5+9*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0] - QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2] + QPEL_V_LOW [r5+3*mmsize], [r5+4*mmsize], [r5+5*mmsize], [r5+10*mmsize], [r0] + QPEL_V_LOW [r5+4*mmsize], [r5+5*mmsize], [r5+6*mmsize], [r5+11*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0] - QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2] + QPEL_V_LOW [r5+5*mmsize], [r5+6*mmsize], [r5+7*mmsize], [r5+12*mmsize], [r0] + QPEL_V_LOW [r5+6*mmsize], [r5+7*mmsize], [r5+8*mmsize], [r5+13*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0] - QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2] + QPEL_V_LOW [r5+7*mmsize], [r5+8*mmsize], [r5+ 9*mmsize], [r5+14*mmsize], [r0] + QPEL_V_LOW [r5+8*mmsize], [r5+9*mmsize], [r5+10*mmsize], [r5+15*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0] - QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2] + QPEL_V_LOW [r5+ 9*mmsize], [r5+10*mmsize], [r5+11*mmsize], [r5+16*mmsize], [r0] + QPEL_V_LOW [r5+10*mmsize], [r5+11*mmsize], [r5+12*mmsize], [r5+16*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0] - QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2] + QPEL_V_LOW [r5+11*mmsize], [r5+12*mmsize], [r5+13*mmsize], [r5+15*mmsize], [r0] + QPEL_V_LOW [r5+12*mmsize], [r5+13*mmsize], [r5+14*mmsize], [r5+14*mmsize], [r0+r2] - add r5, 0x88 + add r5, 17*mmsize mov r0, r1 dec r4d jne .loopv @@ -488,47 +495,60 @@ 
MPEG4_QPEL16_V_LOWPASS put_no_rnd %macro MPEG4_QPEL8_V_LOWPASS 1 -cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 144 +cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 7, 144 mov r4d, 9 mov r5, rsp - pxor m7, m7 + pxor m2, m2 .looph: - mova m0, [r1] - mova m1, [r1] - punpcklbw m0, m7 - punpckhbw m1, m7 + movq m0, [r1] + add r1, r3 +%if mmsize == 8 + mova m1, m0 + punpcklbw m0, m2 + punpckhbw m1, m2 mova [r5], m0 mova [r5+0x48], m1 - add r5, 8 - add r1, r3 +%else + punpcklbw m0, m2 + mova [r5], m0 +%endif + add r5, mmsize dec r4d jne .looph +%if mmsize == 8 mov r4d, 2 mov r1, r0 mov r5, rsp .loopv: - mova m0, [r5+ 0x0] - mova m1, [r5+ 0x8] - mova m2, [r5+0x10] - mova m3, [r5+0x18] - QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] - QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] - QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2] - lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0] - QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2] +%define R5 r5 +%else +%define R5 rsp +%endif + mova m0, [R5+0 * mmsize] + mova m1, [R5+1 * mmsize] + mova m2, [R5+2 * mmsize] + mova m3, [R5+3 * mmsize] + QPEL_V_LOW [R5+2*mmsize], [R5+1*mmsize], [R5+0*mmsize], [R5+4*mmsize], [r0] + QPEL_V_LOW [R5+1*mmsize], [R5+0*mmsize], [R5+0*mmsize], [R5+5*mmsize], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [R5+0*mmsize], [R5+0*mmsize], [R5+1*mmsize], [R5+6*mmsize], [r0] + QPEL_V_LOW [R5+0*mmsize], [R5+1*mmsize], [R5+2*mmsize], [R5+7*mmsize], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [R5+1*mmsize], [R5+2*mmsize], [R5+3*mmsize], [R5+8*mmsize], [r0] + QPEL_V_LOW [R5+2*mmsize], [R5+3*mmsize], [R5+4*mmsize], [R5+8*mmsize], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [R5+3*mmsize], [R5+4*mmsize], [R5+5*mmsize], 
[R5+7*mmsize], [r0] + QPEL_V_LOW [R5+4*mmsize], [R5+5*mmsize], [R5+6*mmsize], [R5+6*mmsize], [r0+r2] + +%if mmsize == 8 add r5, 0x48 lea r0, [r1+4] dec r4d jne .loopv +%endif RET %endmacro @@ -542,3 +562,17 @@ MPEG4_QPEL8_V_LOWPASS avg %define PW_ROUND pw_15 %define OP_MOV PUT_OPH MPEG4_QPEL8_V_LOWPASS put_no_rnd + +INIT_XMM sse2 +%define PW_ROUND pw_16 +%define OP_MOV PUT_OPH +MPEG4_QPEL16_V_LOWPASS put +MPEG4_QPEL8_V_LOWPASS put +%define PW_ROUND pw_16 +%define OP_MOV AVG_OPH +MPEG4_QPEL16_V_LOWPASS avg +MPEG4_QPEL8_V_LOWPASS avg +%define PW_ROUND pw_15 +%define OP_MOV PUT_OPH +MPEG4_QPEL16_V_LOWPASS put_no_rnd +MPEG4_QPEL8_V_LOWPASS put_no_rnd diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 7bcd465d2f..025753ce17 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -271,6 +271,35 @@ QPEL3(QPEL_H, 16, 17, mmxext, mmxext, mmxext, mmxext) QPEL3(QPEL_V, 16, 17, mmxext, mmxext, mmxext, mmxext) QPEL3(QPEL_HV, 16, 17, mmxext, mmxext, mmxext, mmxext) +QPEL3(QPEL_V, 8, 9, ssse3, sse2, ssse3, mmxext) +QPEL3(QPEL_HV, 8, 9, mmxext, sse2, sse2, mmxext) +QPEL3(QPEL_V, 16, 17, ssse3, sse2, ssse3, mmxext) +QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, mmxext) + +#define SET_QPEL_FUNC(OP, X, Y, SIZE, CPU, PREFIX) \ + c->OP ## _qpel_pixels_tab[SIZE == 8][X+4*Y] = PREFIX ## OP ## _qpel ## SIZE ## _mc ## X ## Y ## _ ## CPU + +#define SET_QPEL_FUNCS3(X, Y, SIZE, CPU, PREFIX) \ + SET_QPEL_FUNC(avg, X, Y, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNC(put, X, Y, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNC(put_no_rnd, X, Y, SIZE, CPU, PREFIX) + +#define SET_V_QPEL_FUNCS(SIZE, CPU, PREFIX) \ + SET_QPEL_FUNCS3(0, 1, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(0, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(0, 3, SIZE, CPU, PREFIX) + +#define SET_HV_QPEL_FUNCS(SIZE, CPU, PREFIX) \ + SET_QPEL_FUNCS3(1, 1, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(1, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(1, 3, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(2, 1, SIZE, CPU, 
PREFIX); \ + SET_QPEL_FUNCS3(2, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(2, 3, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(3, 1, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(3, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(3, 3, SIZE, CPU, PREFIX) + #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ do { \ c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ @@ -313,6 +342,11 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c) c->put_no_rnd_qpel_pixels_tab[1][0] = c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_sse2; c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2; + + SET_V_QPEL_FUNCS (16, sse2,); + SET_HV_QPEL_FUNCS(16, sse2,); + SET_V_QPEL_FUNCS (8, sse2,); + SET_HV_QPEL_FUNCS(8, sse2,); } #endif }