1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-24 13:56:33 +02:00

Unroll loop in h264_idct_add16intra_sse2(). Basically identical to r25171, this inlines scan8[] and removes loop setup. 15% faster, 0.4% overall.

See "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on ML.

Originally committed as revision 25172 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Ronald S. Bultje 2010-09-24 14:07:23 +00:00
parent 4bca677494
commit ae11291865

View File

@ -759,50 +759,50 @@ cglobal h264_idct_add16_sse2, 5, 5, 8
add16_sse2_cycle 7, 0x26 add16_sse2_cycle 7, 0x26
RET RET
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, %macro add16intra_sse2_cycle 2
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) movzx r0, word [r4+%2]
cglobal h264_idct_add16intra_sse2, 5, 7, 8
xor r5, r5
%ifdef ARCH_X86_64
mov r10, r0
%endif
%ifdef PIC
lea r11, [scan8_mem]
%endif
.next2blocks
movzx r0, byte [scan8+r5]
movzx r0, word [r4+r0]
test r0, r0 test r0, r0
jz .try_dc jz .try%1dc
mov r0d, dword [r1+r5*4] mov r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
add r0, r10 add r0, r10
%else %else
add r0, r0m add r0, r0m
%endif %endif
call x264_add8x4_idct_sse2 call x264_add8x4_idct_sse2
add r5, 2 jmp .cycle%1end
add r2, 64 .try%1dc
cmp r5, 16
jl .next2blocks
REP_RET
.try_dc
movsx r0, word [r2 ] movsx r0, word [r2 ]
or r0w, word [r2+32] or r0w, word [r2+32]
jz .skip2blocks jz .cycle%1end
mov r0d, dword [r1+r5*4] mov r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
add r0, r10 add r0, r10
%else %else
add r0, r0m add r0, r0m
%endif %endif
call h264_idct_dc_add8_mmx2 call h264_idct_dc_add8_mmx2
.skip2blocks .cycle%1end
add r5, 2 %if %1 < 7
add r2, 64 add r2, 64
cmp r5, 16 %endif
jl .next2blocks %endmacro
REP_RET
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
;
; Fully unrolled version: the body is eight expansions of
; add16intra_sse2_cycle (cycle index 0..7) instead of a counted loop.
; Each expansion reads a 16-bit word from nnzc at [r4+%2]; if it is non-zero
; it calls x264_add8x4_idct_sse2, otherwise it checks the words at [r2] and
; [r2+32] and, if either is non-zero, calls h264_idct_dc_add8_mmx2.
; NOTE(review): register/argument mapping (r0=dst, r1=block_offset, r2=block,
; r3=stride, r4=nnzc) is assumed from the cglobal argument order - confirm
; against x86inc.
cglobal h264_idct_add16intra_sse2, 5, 7, 8
%ifdef ARCH_X86_64
; The macro overwrites r0 with the block offset and then adds r10 to form the
; destination address, so keep the original r0 (dst) in r10. Non-x86-64
; builds add r0m in the macro instead and need no copy.
mov r10, r0
%endif
; Macro arguments:
;   %1 = cycle index; the macro loads the dword at [r1+%1*8] and, for every
;        cycle except the last (%1 < 7), advances r2 by 64 bytes afterwards.
;   %2 = byte offset into nnzc (r4). These constants replace the scan8[]
;        table lookup that the loop version performed at run time.
add16intra_sse2_cycle 0, 0xc
add16intra_sse2_cycle 1, 0x14
add16intra_sse2_cycle 2, 0xe
add16intra_sse2_cycle 3, 0x16
add16intra_sse2_cycle 4, 0x1c
add16intra_sse2_cycle 5, 0x24
add16intra_sse2_cycle 6, 0x1e
add16intra_sse2_cycle 7, 0x26
RET
%macro add8_sse2_cycle 2 %macro add8_sse2_cycle 2
movzx r0, word [r4+%2] movzx r0, word [r4+%2]