mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
fed07efcde
The only systems which benefit from these are truly ancient 32-bit x86s, as all other systems use at least the SSE2 versions (this includes all x64 CPUs, which is why this code is restricted to x86-32). Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
395 lines
9.6 KiB
NASM
395 lines
9.6 KiB
NASM
;******************************************************************************
|
|
;* SIMD lossless video DSP utils
|
|
;* Copyright (c) 2008 Loren Merritt
|
|
;* Copyright (c) 2014 Michael Niedermayer
|
|
;* Copyright (c) 2017 Jokyo Images
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION_RODATA

; pshufb control masks. In each label name one character describes one mask
; byte: a hex digit is the index of the source byte to copy and 'z' stands for
; -1, which makes pshufb write zero to that destination byte.

cextern pb_15

; Only the low 8 bytes of this mask are emitted here. The 16-byte loads of
; this label in add_left_pred_unaligned get their upper half (eight 7s) from
; pb_7, so pb_7 must remain the very next definition.
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_ef:               times 8 db 14,15
pb_67:               times 8 db 6, 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION .text
|
|
|
|
;------------------------------------------------------------------------------
; void ff_add_median_pred(uint8_t *dst, const uint8_t *top,
;                         const uint8_t *diff, int w,
;                         int *left, int *left_top)
;
; Median prediction: for every x,
;     dst[x] = median(left, top[x], left + top[x] - topleft) + diff[x]
; where left is the previous output byte and topleft the previous top byte.
; On return *left = dst[w-1] and *left_top = top[w-1].
;
; The loop always handles whole mmsize-byte vectors, so when w is not a
; multiple of mmsize it reads and writes past w up to the next multiple.
;
; NOTE(review): wq is added to the pointers without any extension, so the
; caller presumably passes a pointer-sized w (the "int w" above may be
; stale) - confirm against the C prototype.
; LSHIFT/RSHIFT come from x86util.asm; they are used here as whole-register
; byte shifts.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    ; Set up the first vector before entering the loop:
    ;   m1 = t (top), m0 = t - tl, m3 = l (byte 0 is the running left value)
    movu        m0, [topq]
    mova        m2, m0
    movd        m4, [left_topq]
    LSHIFT      m2, 1
    mova        m1, m0
    por         m4, m2                  ; tl = (top << 1 byte) | *left_top
    movd        m3, [leftq]
    psubb       m0, m4                  ; t-tl
    ; Point past the end of each row and index with a negative offset that
    ; counts up to zero.
    add       dstq, wq
    add       topq, wq
    add      diffq, wq
    neg         wq
    jmp .skip                           ; first vector is already prepared
.loop:
    movu        m4, [topq+wq]
    mova        m0, m4
    LSHIFT      m4, 1
    por         m4, m1                  ; byte 0 of m1 = last top byte of the
                                        ; previous vector (m1 was shifted
                                        ; right mmsize-1 times below)
    mova        m1, m0                  ; t
    psubb       m0, m4                  ; t-tl
.skip:
    movu        m2, [diffq+wq]
    ; The prediction is serial (each output is the next byte's "left"), so
    ; one output byte is produced per repetition, always in byte 0.
%assign i 0
%rep mmsize
    mova        m4, m0
    paddb       m4, m3                  ; t-tl+l
    mova        m5, m3
    pmaxub      m3, m1                  ; max(l, t)
    pminub      m5, m1                  ; min(l, t)
    pminub      m3, m4
    pmaxub      m3, m5                  ; median
    paddb       m3, m2                  ; +residual
    ; Collect the results in m7: each new byte enters at the top and the
    ; older ones move down, so after mmsize repetitions they are in order.
%if i==0
    mova        m7, m3
    LSHIFT      m7, mmsize-1
%else
    mova        m6, m3
    RSHIFT      m7, 1
    LSHIFT      m6, mmsize-1
    por         m7, m6
%endif
    ; Bring the next input byte down to byte 0 (not needed after the last
    ; repetition).
%if i<mmsize-1
    RSHIFT      m0, 1
    RSHIFT      m1, 1
    RSHIFT      m2, 1
%endif
%assign i i+1
%endrep
    movu [dstq+wq], m7
    add         wq, mmsize
    jl .loop
    ; dstq/topq point one past the end of their rows here.
    movzx      r2d, byte [dstq-1]
    mov    [leftq], r2d
    movzx      r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
|
|
|
|
|
|
; ADD_LEFT_LOOP dst_is_aligned, src_is_aligned
;
; Body of the byte-wise left prediction (running byte sum of src written to
; dst); it ends in RET, so nothing after an expansion is reached by fall
; through.
;
; Registers expected on entry (set up by the function expanding the macro):
;   dstq, srcq, wq - destination, source, number of bytes
;   xm0            - initial left value in its highest byte
;   xm5            - pshufb mask that replicates the highest byte of xm0
;   m3, m4, m6     - pshufb masks for the 4-, 8- and 16-byte prefix-sum steps
;                    (m6 is only read when mmsize >= 16)
;
; Whole vectors are processed, so if w is not a multiple of mmsize the loop
; reads and writes past w up to the next multiple.
;
; Return value in eax: for mmsize == 32 the last byte written, zero-extended;
; otherwise the last valid byte is in the low byte of eax and the upper bytes
; come from byte 0 of the last vector (the shuffle index bytes above the first
; are zero) - presumably the caller only uses the low byte.
%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add       srcq, wq
    add       dstq, wq
    neg         wq                      ; negative index counting up to zero
%%.loop:
    pshufb     xm0, xm5                 ; replicate the last output byte
%if %2
    mova        m1, [srcq+wq]
%else
    movu        m1, [srcq+wq]
%endif
    ; In-register prefix sum of the source bytes, doubling the span of each
    ; partial sum at every step (2, 4, 8 and, for XMM/YMM, 16 bytes).
    psllw       m2, m1, 8
    paddb       m1, m2
    pshufb      m2, m1, m3
    paddb       m1, m2
    pshufb      m2, m1, m4
    paddb       m1, m2
%if mmsize >= 16
    pshufb      m2, m1, m6
    paddb       m1, m2
%endif
    paddb      xm0, xm1                 ; add the carried-in left value
%if %1
    mova [dstq+wq], xm0
%else
    ; unaligned destination: store the two 8-byte halves separately
    movq   [dstq+wq], xm0
    movhps [dstq+wq+8], xm0
%endif

%if mmsize == 32
    vextracti128 xm2, m1, 1             ; get second lane of the ymm
    pshufb     xm0, xm5                 ; replicate the last value of the
                                        ; first lane into every byte
    paddb      xm0, xm2
    ; store the second lane
%if %1
    mova [dstq+wq+16], xm0
%else
    movq   [dstq+wq+16], xm0
    movhps [dstq+wq+16+8], xm0
%endif
%endif
    add         wq, mmsize
    jl %%.loop
%if mmsize == 32
    movzx      eax, byte [dstq - 1]     ; dstq points one past the last byte
%else
    ; wd is how far the last vector ran past w, so the last valid byte of m0
    ; has index mmsize-1-wd; shuffle it down to byte 0 and return it.
    mov        eax, mmsize-1
    sub        eax, wd
    movd        m1, eax
    pshufb      m0, m1
    movd       eax, m0
%endif
    RET
%endmacro
|
|
|
|
;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
;
; 8-byte (MMX register) SSSE3 version. ADD_LEFT_LOOP is expanded with both
; alignment flags set, so src and dst are accessed with mova.
;
; NOTE(review): wq is used at full width inside ADD_LEFT_LOOP without any
; extension, so w is presumably pointer-sized in the C prototype - confirm.
;------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova        m5, [pb_7]              ; replicates byte 7 (last byte)
    mova        m4, [pb_zzzz3333zzzzbbbb]
    mova        m3, [pb_zz11zz55zz99zzdd]
    movd        m0, leftm
    psllq       m0, 56                  ; move left into the highest byte
    ADD_LEFT_LOOP 1, 1
|
|
|
|
; Emits add_left_pred_unaligned for the current INIT_* instruction set.
; The function tests the low bits of src and dst and jumps to the
; ADD_LEFT_LOOP variant matching their alignment. Each ADD_LEFT_LOOP expansion
; ends in RET, so the variants never fall through into each other.
%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
    mova           xm5, [pb_15]         ; replicates byte 15 (last byte)
    ; 16-byte load: the upper 8 mask bytes are the pb_7 data that directly
    ; follows this label.
    VBROADCASTI128  m6, [pb_zzzzzzzz77777777]
    VBROADCASTI128  m4, [pb_zzzz3333zzzzbbbb]
    VBROADCASTI128  m3, [pb_zz11zz55zz99zzdd]
    movd           xm0, leftm
    pslldq         xm0, 15              ; move left into the highest byte
    test          srcq, mmsize - 1
    jnz .src_unaligned
    test          dstq, mmsize - 1
    jnz .dst_unaligned
    ADD_LEFT_LOOP 1, 1                  ; src and dst aligned
.dst_unaligned:
    ADD_LEFT_LOOP 0, 1                  ; src aligned, dst not
.src_unaligned:
    ADD_LEFT_LOOP 0, 0                  ; src unaligned, dst not checked
%endmacro

INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif
|
|
|
|
;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;
; dst[i] += src[i] for i in [0, w), with byte wrap-around.
; The bulk is processed two vectors (2*mmsize bytes) per iteration using
; aligned loads and stores, so src and dst must be mmsize-aligned whenever
; w >= 2*mmsize. The remaining w % (2*mmsize) bytes are added one at a time.
;------------------------------------------------------------------------------
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov      sizeq, wq
    and      sizeq, -2*mmsize           ; bytes covered by the vector loop
    jz .2
    add       dstq, sizeq
    add       srcq, sizeq
    neg      sizeq                      ; negative index counting up to zero
.1:
    mova        m0, [srcq + sizeq]
    mova        m1, [srcq + sizeq + mmsize]
    paddb       m0, [dstq + sizeq]
    paddb       m1, [dstq + sizeq + mmsize]
    mova [dstq + sizeq], m0
    mova [dstq + sizeq + mmsize], m1
    add      sizeq, 2*mmsize
    jl .1
.2:
    and         wq, 2*mmsize-1          ; bytes left for the scalar tail
    jz .end
    add       dstq, wq
    add       srcq, wq
    neg         wq
.3:
    ; size is free again here and serves as the byte temporary
    mov      sizeb, [srcq + wq]
    add [dstq + wq], sizeb
    inc         wq
    jl .3
.end:
    REP_RET
%endmacro

INIT_XMM sse2
ADD_BYTES

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_BYTES
%endif
|
|
|
|
; ADD_HFYU_LEFT_LOOP_INT16 dst_alignment, src_alignment   (each 'a' or 'u')
;
; Body of the 16-bit left prediction: running sum of the src words, each
; result ANDed with the mask, written to dst. It ends in RET, so nothing after
; an expansion is reached by fall through.
;
; Registers expected on entry (set up by the function expanding the macro):
;   dstq, srcq - destination and source
;   wd         - number of 16-bit elements (doubled to a byte count below)
;   m0         - initial left value in its highest word
;   m5         - pshufb mask that replicates the highest word of m0
;   m3, m4     - pshufb masks for the prefix-sum steps (m4 only for mmsize 16)
;   m7         - mask replicated into every word
;
; Whole vectors are processed, so the loop may read and write past the end
; of the row up to the next multiple of mmsize bytes.
;
; Returns the last valid output word in the low 16 bits of eax.
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add         wd, wd                  ; elements -> bytes
    add       srcq, wq
    add       dstq, wq
    neg         wq                      ; negative index counting up to zero
%%.loop:
    mov%2       m1, [srcq+wq]
    ; In-register prefix sum of the source words, doubling the span of each
    ; partial sum at every step.
    mova        m2, m1
    pslld       m1, 16
    paddw       m1, m2
    mova        m2, m1

    pshufb      m1, m3
    paddw       m1, m2
    pshufb      m0, m5                  ; replicate the last output word
%if mmsize == 16
    mova        m2, m1
    pshufb      m1, m4
    paddw       m1, m2
%endif
    paddw       m0, m1                  ; add the carried-in left value
    pand        m0, m7                  ; apply the mask
%ifidn %1, a
    mova [dstq+wq], m0
%else
    ; unaligned destination: store the two 8-byte halves separately
    movq   [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add         wq, mmsize
    jl %%.loop
    ; wd is how far the last vector ran past the end, so the high byte of the
    ; last valid word has index idx = mmsize-1-wd. Build the two-byte shuffle
    ; control (idx-1, idx) to bring that word down to word 0.
    mov        eax, mmsize-1
    sub        eax, wd
    mov         wd, eax
    shl         wd, 8
    lea        eax, [wd+eax-1]          ; (idx << 8) | (idx - 1)
    movd        m1, eax
    pshufb      m0, m1
    movd       eax, m0
    RET
%endmacro
|
|
|
|
;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;
; 8-byte (MMX register) SSSE3 version. ADD_HFYU_LEFT_LOOP_INT16 is expanded
; with "a, a", so src and dst are accessed with mova.
;---------------------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova        m5, [pb_67]             ; replicates bytes 6,7 (last word)
    mova        m3, [pb_zzzz2323zzzzabab]
    movd        m0, leftm
    psllq       m0, 48                  ; move left into the highest word
    movd        m7, maskm
    SPLATW      m7 ,m7                  ; mask in every word
    ADD_HFYU_LEFT_LOOP_INT16 a, a
|
|
|
|
; add_left_pred_int16_unaligned: 16-byte SSSE3 version of the 16-bit left
; prediction, with the same arguments as add_left_pred_int16. It tests the
; low four address bits of src and dst and jumps to the
; ADD_HFYU_LEFT_LOOP_INT16 variant matching their alignment. Each expansion
; ends in RET, so the variants never fall through into each other.
INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
    mova        m5, [pb_ef]             ; replicates bytes 14,15 (last word)
    mova        m4, [pb_zzzzzzzz67676767]
    mova        m3, [pb_zzzz2323zzzzabab]
    movd        m0, leftm
    pslldq      m0, 14                  ; move left into the highest word
    movd        m7, maskm
    SPLATW      m7 ,m7                  ; mask in every word
    test      srcq, 15
    jnz .src_unaligned
    test      dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a       ; src and dst aligned
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a       ; src aligned, dst not
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u       ; src unaligned, dst not checked
|
|
|
|
|
|
;---------------------------------------------------------------------------------------------
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
;
; In-place gradient prediction of one row. With A = src[x - stride],
; B = src[x - stride - 1] and C = the previous output byte (initially
; src[-1]), every byte becomes
;     src[x] = src[x] + A - B + C
; which is evaluated per 16 bytes as a prefix sum of (src[x] + A - B) plus
; the carried-in C.
;
; The current row and the row above are accessed with aligned loads/stores
; (mova), so both must be mmsize-aligned. Whole vectors are processed, so the
; loop may run past width up to the next multiple of mmsize.
;---------------------------------------------------------------------------------------------
%macro ADD_GRADIENT_PRED 0
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
    mova       xm0, [pb_15]             ; replicates byte 15 (last byte)

    ; load src - 1 in xm1 and replicate it into every byte
    movd       xm1, [srcq-1]
%if cpuflag(avx2)
    vpbroadcastb xm1, xm1
%else
    pxor       xm2, xm2
    pshufb     xm1, xm2                 ; all-zero mask selects byte 0
%endif

    add       srcq, widthq
    neg     widthq                      ; negative index counting up to zero
    neg    strideq                      ; so src + stride is the row above

.loop:
    lea       tmpq, [srcq + strideq]
    mova        m2, [tmpq + widthq]     ; A = src[x-stride]
    movu        m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
    mova        m4, [srcq + widthq]     ; current val (src[x])

    psubb       m2, m3                  ; A - B

    ; prefix sum A-B (pslldq shifts within each 128-bit lane)
    pslldq      m3, m2, 1
    paddb       m2, m3
    pslldq      m3, m2, 2
    paddb       m2, m3
    pslldq      m3, m2, 4
    paddb       m2, m3
    pslldq      m3, m2, 8
    paddb       m2, m3

    ; prefix sum current val
    pslldq      m3, m4, 1
    paddb       m4, m3
    pslldq      m3, m4, 2
    paddb       m4, m3
    pslldq      m3, m4, 4
    paddb       m4, m3
    pslldq      m3, m4, 8
    paddb       m4, m3

    ; last sum
    paddb       m2, m4                  ; current + (A - B)

    paddb      xm1, xm2                 ; += C
    mova [srcq + widthq], xm1           ; store

    pshufb     xm1, xm0                 ; put last val in all val of xm1

%if mmsize == 32
    vextracti128 xm2, m2, 1             ; get second lane of the ymm
    paddb      xm1, xm2                 ; += C

    mova [srcq + widthq + 16], xm1      ; store
    pshufb     xm1, xm0                 ; put last val in all val of xm1
%endif

    add     widthq, mmsize
    jl .loop
    RET

%endmacro

INIT_XMM ssse3
ADD_GRADIENT_PRED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_GRADIENT_PRED
%endif
|