mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
rv40dsp x86: use only one register, for both increment and loop counter
Around 10 cycles faster for luma. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
272b252c01
commit
2130bd8f5b
@ -32,13 +32,14 @@ SECTION .text
|
||||
|
||||
; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
|
||||
%macro RV40_WCORE 4-5
|
||||
movh m4, [%3 + 0]
|
||||
movh m5, [%4 + 0]
|
||||
movh m4, [%3 + r6 + 0]
|
||||
movh m5, [%4 + r6 + 0]
|
||||
%if %0 == 4
|
||||
%define OFFSET mmsize / 2
|
||||
%define OFFSET r6 + mmsize / 2
|
||||
%else
|
||||
; 8x8 block and sse2, stride was provided
|
||||
%define OFFSET %5
|
||||
%define OFFSET r6
|
||||
add r6, r5
|
||||
%endif
|
||||
movh m6, [%3 + OFFSET]
|
||||
movh m7, [%4 + OFFSET]
|
||||
@ -99,10 +100,12 @@ SECTION .text
|
||||
packuswb m4, m6
|
||||
%if %0 == 5
|
||||
; Only called for 8x8 blocks and sse2
|
||||
movh [%2 + 0], m4
|
||||
movhps [%2 + %5], m4
|
||||
sub r6, r5
|
||||
movh [%2 + r6], m4
|
||||
add r6, r5
|
||||
movhps [%2 + r6], m4
|
||||
%else
|
||||
mova [%2], m4
|
||||
mova [%2 + r6], m4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
@ -115,26 +118,19 @@ SECTION .text
|
||||
%endif
|
||||
|
||||
; Prepare for next loop
|
||||
add r0, r5
|
||||
add r1, r5
|
||||
add r2, r5
|
||||
add r6, r5
|
||||
%else
|
||||
%ifidn %1, 8
|
||||
RV40_WCORE %2, r0, r1, r2, r5
|
||||
; Prepare 2 next lines
|
||||
lea r0, [r0 + 2 * r5]
|
||||
lea r1, [r1 + 2 * r5]
|
||||
lea r2, [r2 + 2 * r5]
|
||||
add r6, r5
|
||||
%else
|
||||
RV40_WCORE %2, r0, r1, r2
|
||||
; Prepare single next line
|
||||
add r0, r5
|
||||
add r1, r5
|
||||
add r2, r5
|
||||
add r6, r5
|
||||
%endif
|
||||
%endif
|
||||
|
||||
dec r6
|
||||
%endmacro
|
||||
|
||||
; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
|
||||
@ -145,7 +141,7 @@ SECTION .text
|
||||
; Therefore, we check here whether they are multiples of 2^9 for
|
||||
; those simplifications to occur.
|
||||
%macro RV40_WEIGHT 3
|
||||
cglobal rv40_weight_func_%1_%2, 6, 7, %3
|
||||
cglobal rv40_weight_func_%1_%2, 6, 7, 8
|
||||
%if cpuflag(ssse3)
|
||||
mova m1, [shift_round]
|
||||
%else
|
||||
@ -153,11 +149,12 @@ cglobal rv40_weight_func_%1_%2, 6, 7, %3
|
||||
%endif
|
||||
pxor m0, m0
|
||||
; Set loop counter and increments
|
||||
%if mmsize == 8
|
||||
mov r6, %2
|
||||
%else
|
||||
mov r6, (%2 * %2) / mmsize
|
||||
%endif
|
||||
mov r6, r5
|
||||
shl r6, %3
|
||||
add r0, r6
|
||||
add r1, r6
|
||||
add r2, r6
|
||||
neg r6
|
||||
|
||||
movd m2, r3
|
||||
movd m3, r4
|
||||
|
Loading…
Reference in New Issue
Block a user