mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
Unroll loop in h264_idct_add16intra_sse2(). Basically identical to r25171, this
change inlines scan8[] and removes the loop setup. 15% faster, 0.4% faster overall. See the "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on the mailing list. Originally committed as revision 25172 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
4bca677494
commit
ae11291865
@ -759,50 +759,50 @@ cglobal h264_idct_add16_sse2, 5, 5, 8
|
|||||||
add16_sse2_cycle 7, 0x26
|
add16_sse2_cycle 7, 0x26
|
||||||
RET
|
RET
|
||||||
|
|
||||||
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
|
%macro add16intra_sse2_cycle 2
|
||||||
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
movzx r0, word [r4+%2]
|
||||||
cglobal h264_idct_add16intra_sse2, 5, 7, 8
|
|
||||||
xor r5, r5
|
|
||||||
%ifdef ARCH_X86_64
|
|
||||||
mov r10, r0
|
|
||||||
%endif
|
|
||||||
%ifdef PIC
|
|
||||||
lea r11, [scan8_mem]
|
|
||||||
%endif
|
|
||||||
.next2blocks
|
|
||||||
movzx r0, byte [scan8+r5]
|
|
||||||
movzx r0, word [r4+r0]
|
|
||||||
test r0, r0
|
test r0, r0
|
||||||
jz .try_dc
|
jz .try%1dc
|
||||||
mov r0d, dword [r1+r5*4]
|
mov r0d, dword [r1+%1*8]
|
||||||
%ifdef ARCH_X86_64
|
%ifdef ARCH_X86_64
|
||||||
add r0, r10
|
add r0, r10
|
||||||
%else
|
%else
|
||||||
add r0, r0m
|
add r0, r0m
|
||||||
%endif
|
%endif
|
||||||
call x264_add8x4_idct_sse2
|
call x264_add8x4_idct_sse2
|
||||||
add r5, 2
|
jmp .cycle%1end
|
||||||
add r2, 64
|
.try%1dc
|
||||||
cmp r5, 16
|
|
||||||
jl .next2blocks
|
|
||||||
REP_RET
|
|
||||||
.try_dc
|
|
||||||
movsx r0, word [r2 ]
|
movsx r0, word [r2 ]
|
||||||
or r0w, word [r2+32]
|
or r0w, word [r2+32]
|
||||||
jz .skip2blocks
|
jz .cycle%1end
|
||||||
mov r0d, dword [r1+r5*4]
|
mov r0d, dword [r1+%1*8]
|
||||||
%ifdef ARCH_X86_64
|
%ifdef ARCH_X86_64
|
||||||
add r0, r10
|
add r0, r10
|
||||||
%else
|
%else
|
||||||
add r0, r0m
|
add r0, r0m
|
||||||
%endif
|
%endif
|
||||||
call h264_idct_dc_add8_mmx2
|
call h264_idct_dc_add8_mmx2
|
||||||
.skip2blocks
|
.cycle%1end
|
||||||
add r5, 2
|
%if %1 < 7
|
||||||
add r2, 64
|
add r2, 64
|
||||||
cmp r5, 16
|
%endif
|
||||||
jl .next2blocks
|
%endmacro
|
||||||
REP_RET
|
|
||||||
|
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
|
||||||
|
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
|
||||||
|
cglobal h264_idct_add16intra_sse2, 5, 7, 8
|
||||||
|
%ifdef ARCH_X86_64
|
||||||
|
mov r10, r0
|
||||||
|
%endif
|
||||||
|
add16intra_sse2_cycle 0, 0xc
|
||||||
|
add16intra_sse2_cycle 1, 0x14
|
||||||
|
add16intra_sse2_cycle 2, 0xe
|
||||||
|
add16intra_sse2_cycle 3, 0x16
|
||||||
|
add16intra_sse2_cycle 4, 0x1c
|
||||||
|
add16intra_sse2_cycle 5, 0x24
|
||||||
|
add16intra_sse2_cycle 6, 0x1e
|
||||||
|
add16intra_sse2_cycle 7, 0x26
|
||||||
|
RET
|
||||||
|
|
||||||
%macro add8_sse2_cycle 2
|
%macro add8_sse2_cycle 2
|
||||||
movzx r0, word [r4+%2]
|
movzx r0, word [r4+%2]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user