You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-11-23 21:54:53 +02:00
These functions are currently always called with height either
being equal to the block size or block size+1. height is
a compile-time constant at every callsite. This makes it possible
to split this function into two to avoid the check inside
the function for whether height is odd or even.
The corresponding avg function is only used with height == block size,
so that it does not have a height parameter at all. Removing the
parameter from the put_l2 functions as well therefore simplifies
the C code.
The new functions increase the size of .text from qpel{dsp}.o
by 32B here, yet they save 464B of C code here.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
134 lines
4.0 KiB
NASM
134 lines
4.0 KiB
NASM
;******************************************************************************
;* SIMD-optimized quarterpel functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION .text
|
|
|
|
; op_avg <reg>, <mem>
; "avg" store operation: pavgb forms the rounded unsigned byte average of the
; freshly computed pixels in %1 and the bytes already at %2, and the result is
; written back to %2. %1 is left holding the averaged value.
%macro op_avg 2
    pavgb        %1, %2
    mova         %2, %1
%endmacro
|
|
|
|
; op_put <reg>, <mem>
; "put" store operation: the pixels in %1 are simply stored to %2.
; The operand order mirrors op_avg so both can be selected through OP.
%macro op_put 2
    mova         %2, %1
%endmacro
|
|
|
|
; PIXELS8_L2 <put|avg>
; Emits %1_pixels8x8_l2 and, for "put" only, put_pixels8x9_l2.
; Each output row is the byte average of a row of src1 (rows src1Stride apart)
; and a row of src2 (rows stored back to back, 8 bytes apart); the row is then
; stored to dst with op_%1.
; Register roles, following the prototypes below:
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride,
;   r5d = number of rows still to process.
%macro PIXELS8_L2 1
%define OP op_%1
%ifidn %1, put
; void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                                 ptrdiff_t dstStride, ptrdiff_t src1Stride)
cglobal put_pixels8x9_l2, 5,6
    ; Process the single extra row here, then continue in the 8-row loop
    ; of the function that follows.
    mova         m0, [r1]
    add          r1, r4
    mova         m1, [r2]
    add          r2, 8
    pavgb        m0, m1
    OP           m0, [r0]
    add          r0, r3
    ; FIXME: avoid jump if prologue is empty
    jmp          %1_pixels8x8_after_prologue_ %+ cpuname
%endif
; void ff_avg/put_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                                     ptrdiff_t dstStride, ptrdiff_t src1Stride)
cglobal %1_pixels8x8_l2, 5,6
; Entry point for put_pixels8x9_l2, which has already run its own prologue.
%1_pixels8x8_after_prologue_ %+ cpuname:
    mov          r5d, 8
.loop:
    ; Four rows per iteration, generated as two identical two-row groups that
    ; differ only in the offset into src2.
%assign %%src2_off 0
%rep 2
    mova         m0, [r1]
    mova         m1, [r1+r4]
    lea          r1, [r1+2*r4]
    pavgb        m0, [r2+%%src2_off]
    pavgb        m1, [r2+%%src2_off+8]
    OP           m0, [r0]
    OP           m1, [r0+r3]
    lea          r0, [r0+2*r3]
%assign %%src2_off %%src2_off+16
%endrep
    add          r2, 32
    sub          r5d, 4
    jnz          .loop
    RET
%endmacro
|
|
|
|
; Instantiate the 8-pixel-wide l2 functions for MMXEXT.
; The "put" expansion also emits put_pixels8x9_l2.
INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg
|
|
|
|
; PIXELS16_L2 <put|avg>
; Emits %1_pixels16x16_l2 and, for "put" only, put_pixels16x17_l2.
; Each 16-byte output row is handled as two 8-byte halves: the byte average of
; a row of src1 (rows src1Stride apart) and a row of src2 (rows stored back to
; back, 16 bytes apart) is stored to dst with op_%1.
; Register roles, following the prototypes below:
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride,
;   r5d = number of rows still to process.
%macro PIXELS16_L2 1
%define OP op_%1
%ifidn %1, put
; void ff_put_pixels16x17_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                                   ptrdiff_t dstStride, ptrdiff_t src1Stride)
cglobal put_pixels16x17_l2, 5,6
    ; Process the single extra row here, then continue in the 16-row loop
    ; of the function that follows.
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    pavgb        m0, [r2]
    pavgb        m1, [r2+8]
    add          r2, 16
    OP           m0, [r0]
    OP           m1, [r0+8]
    add          r0, r3
    ; FIXME: avoid jump if prologue is empty
    jmp          %1_pixels16x16_after_prologue_ %+ cpuname
%endif
; void ff_avg/put_pixels16x16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                                       ptrdiff_t dstStride, ptrdiff_t src1Stride)
cglobal %1_pixels16x16_l2, 5,6
; Entry point for put_pixels16x17_l2, which has already run its own prologue.
%1_pixels16x16_after_prologue_ %+ cpuname:
    mov          r5d, 16
.loop:
    ; Two rows per iteration, generated as two identical single-row groups
    ; that differ only in the offset into src2.
%assign %%src2_off 0
%rep 2
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    pavgb        m0, [r2+%%src2_off]
    pavgb        m1, [r2+%%src2_off+8]
    OP           m0, [r0]
    OP           m1, [r0+8]
    add          r0, r3
%assign %%src2_off %%src2_off+16
%endrep
    add          r2, 32
    sub          r5d, 2
    jnz          .loop
    RET
%endmacro
|
|
|
|
; Instantiate the 16-pixel-wide l2 functions for MMXEXT.
; The "put" expansion also emits put_pixels16x17_l2.
INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
|