FFmpeg/libavcodec/x86/huffyuvdsp.asm

;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%include "libavcodec/x86/huffyuvdsp_template.asm"

;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;------------------------------------------------------------------------------

; ADD_INT16: expands to one add_int16 function for whatever instruction set the
; preceding INIT_* macro selected.
;
; cglobal arguments (x86inc convention): 4 arguments loaded, 4 GPRs, 5 vector
; registers, followed by the register names. "tmp" is a fifth name although
; only 4 GPRs are declared.
; NOTE(review): presumably INT16_LOOP saves/restores tmp itself - confirm in
; huffyuvdsp_template.asm, where INT16_LOOP is defined.
%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
%if mmsize > 8
    ; Vectors wider than 8 bytes: branch to the "u" expansion as soon as either
    ; pointer has any bit of (mmsize-1) set, i.e. is not mmsize-aligned.
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
%endif
    ; Reached by fall-through: both pointers passed the test above, or
    ; mmsize == 8 and no test was emitted at all.
    INT16_LOOP a, add
    ; NOTE(review): nothing here jumps over .unaligned, so INT16_LOOP is
    ; presumably terminated by its own return - confirm in the template.
%if mmsize > 8
.unaligned:
    ; Same loop, expanded with the "u" variant for the misaligned case.
    INT16_LOOP u, add
%endif
%endmacro

; Instantiate add_int16 once per instruction set.
; MMX version: only assembled when ARCH_X86_32 is non-zero.
%if ARCH_X86_32
INIT_MMX mmx
ADD_INT16
%endif

; SSE2 version: assembled unconditionally.
INIT_XMM sse2
ADD_INT16

; AVX2 version: only assembled when HAVE_AVX2_EXTERNAL is non-zero.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
; LEFT_BGR32: expands to one add_hfyu_left_pred_bgr32 function for the
; instruction set selected by the preceding INIT_* macro.
;
; Every 4-byte pixel of dst becomes the byte-wise (wrapping, paddb) sum of the
; pixel stored at [left] and all src pixels up to and including its position.
; On exit the last pixel written is stored back to [left].
;
; Register roles inside the loop:
;   m0 - previous output vector; its top dword is the running "left" pixel
;   m1 - src pixels being turned into an in-vector prefix sum
;   m2 - scratch copy of m1
;
; NOTE(review): LSHIFT comes from x86util.asm and is used here as a
; whole-register left shift by the given number of bytes - confirm there.
; NOTE(review): the loop body runs at least once and always handles a full
; mmsize-byte vector, so w is presumably > 0 and the buffers presumably padded
; when 4*w is not a multiple of mmsize - confirm against the callers.
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    ; w: pixel count -> byte count (4 bytes per pixel)
    shl           wq, 2
    ; low dword of m0 = the 4 bytes at [left]
    movd          m0, [leftq]
    ; point dst/src at the end of the row; wq becomes a negative offset that
    ; counts up towards zero
    lea         dstq, [dstq + wq]
    lea         srcq, [srcq + wq]
    ; move the left pixel into the top dword, where the loop expects the
    ; running value to live
    LSHIFT        m0, mmsize-4
    neg           wq
.loop:
    movu          m1, [srcq+wq]
    mova          m2, m1
%if mmsize == 8
    ; broadcast the top dword (running left pixel) to both dwords
    punpckhdq     m0, m0
%endif
    ; prefix-sum step 1: each pixel += the pixel one position below it
    LSHIFT        m1, 4
    paddb         m1, m2
%if mmsize == 16
    ; broadcast the top dword (running left pixel) to all four dwords
    pshufd        m0, m0, q3333
    ; prefix-sum step 2: each pixel += the partial sum two positions below it
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2
%endif
    ; add the running left pixel to every prefix sum and store the result
    paddb         m0, m1
    movu   [dstq+wq], m0
    add           wq, mmsize
    ; loop while the offset is still negative
    jl         .loop
    ; dstq points at the end of the row, so [dstq-4] is the final pixel
    movd          m0, [dstq-4]
    movd     [leftq], m0
    REP_RET
%endmacro

; Instantiate add_hfyu_left_pred_bgr32 once per instruction set.
; MMX version: only assembled when ARCH_X86_32 is non-zero.
%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif
; SSE2 version: assembled unconditionally.
INIT_XMM sse2
LEFT_BGR32

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top,
;                                 const uint16_t *diff, unsigned mask, int w,
;                                 int *left, int *left_top)
; (Elements are handled as 16-bit words below; NOTE(review): confirm the exact
; C prototype against the header that declares this function.)
;
; For each word x of the row:
;   pred   = median(l, t, (l + t - tl) & mask)
;   dst[x] = (pred + diff[x]) & mask
; where t = top[x], tl = top[x-1] (*left_top for x == 0) and l = dst[x-1]
; (*left for x == 0). On exit *left and *left_top receive the last word of dst
; and of top, zero-extended.
;
; Register roles:
;   mm6 - mask replicated to all four words
;   mm0 - t - tl for the words still to be processed (current one in word 0)
;   mm1 - t, same layout
;   mm2 - diff (residual), same layout
;   mm3 - l in word 0; upper words are don't-care
;   mm4/mm5 - scratch
;   mm7 - output words, assembled one per step
;
; NOTE(review): pminsw/pmaxsw compare as signed words, so this is the unsigned
; median only while masked values stay below 0x8000 - presumably guaranteed by
; the mask the callers pass; confirm.
; NOTE(review): the loop body runs at least once and always handles 4 words,
; so w is presumably > 0 and rows padded to a multiple of 4 words - confirm.
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    ; w: word count -> byte count (the 32-bit add also clears the upper half
    ; of wq on x86-64)
    add      wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    ; First group of four: tl = { *left_top, t0, t1, t2 }.
    ; NOTE(review): movd loads 32 bits, so *left_top presumably fits in 16
    ; bits; otherwise its upper half would be OR'ed into word 1.
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    ; point all three rows at their end; wq becomes a negative offset that
    ; counts up towards zero
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    ; t and t-tl are already set up for the first group
    jmp .skip
.loop:
    ; Later groups: mm1 was shifted right three times in the previous pass, so
    ; it now holds only that group's last top word in word 0; that word is the
    ; top-left neighbour of the new group's first element.
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
    ; The four words depend on each other through l, so they are processed
    ; one at a time, always in word 0.
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    ; median(l, t, grad) = max(min(max(l, t), grad), min(l, t))
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
    ; Insert word 0 of mm3 at the top of mm7, shifting earlier results down;
    ; after four steps mm7 holds the outputs in memory order.
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
    ; bring the next element of t-tl, t and diff down into word 0
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    ; loop while the offset is still negative
    jl .loop
    ; dstq/topq point at the end of their rows: write the last output word and
    ; the last top word back, zero-extended to 32 bits. r2 is overwritten as
    ; scratch here (under x86inc numbering it is the third argument, diff,
    ; which is no longer read).
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d
    RET
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 2014-01-07 14:23:13 +03:00			`;******************************************************************************`
			`;* SIMD-optimized HuffYUV functions`
			`;* Copyright (c) 2008 Loren Merritt`
x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 22:57:38 +03:00			`;* Copyright (c) 2014 Christophe Gisquet`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 2014-01-07 14:23:13 +03:00			`;*`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 00:07:36 +03:00			`;* This file is part of FFmpeg.`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 2014-01-07 14:23:13 +03:00			`;*`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 00:07:36 +03:00			`;* FFmpeg is free software; you can redistribute it and/or`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 2014-01-07 14:23:13 +03:00			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 00:07:36 +03:00			`;* FFmpeg is distributed in the hope that it will be useful,`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 2014-01-07 14:23:13 +03:00			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 00:07:36 +03:00			`;* License along with FFmpeg; if not, write to the Free Software`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 2014-01-07 14:23:13 +03:00			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

			`%include "libavutil/x86/x86util.asm"`

x86inc: Drop SECTION_TEXT macro The .text section is already 16-byte aligned by default on all supported platforms so `SECTION_TEXT` isn't any different from `SECTION .text`. 2015-05-27 21:38:14 +02:00			`SECTION .text`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 2014-01-07 14:23:13 +03:00
avcodec/huffyuvdsp(enc) : move duplicate macro to a template file 2017-11-21 10:10:52 +02:00			`%include "libavcodec/x86/huffyuvdsp_template.asm"`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00
avcodec/huffyuvdsp : reorganize add_int16 asm 2017-11-21 10:11:36 +02:00			`;------------------------------------------------------------------------------`
			`; void (add_int16)(uint16_t dst, const uint16_t *src, unsigned mask, int w);`
			`;------------------------------------------------------------------------------`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00
avcodec/huffyuvdsp : reorganize add_int16 asm 2017-11-21 10:11:36 +02:00			`%macro ADD_INT16 0`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00			`cglobal add_int16, 4,4,5, dst, src, mask, w, tmp`
avcodec/huffyuvdsp : reorganize add_int16 asm 2017-11-21 10:11:36 +02:00			`%if mmsize > 8`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00			`test srcq, mmsize-1`
			`jnz .unaligned`
			`test dstq, mmsize-1`
			`jnz .unaligned`
avcodec/huffyuvdsp : reorganize add_int16 asm 2017-11-21 10:11:36 +02:00			`%endif`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00			`INT16_LOOP a, add`
avcodec/huffyuvdsp : reorganize add_int16 asm 2017-11-21 10:11:36 +02:00			`%if mmsize > 8`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00			`.unaligned:`
			`INT16_LOOP u, add`
avcodec/huffyuvdsp : reorganize add_int16 asm 2017-11-21 10:11:36 +02:00			`%endif`
			`%endmacro`

			`%if ARCH_X86_32`
			`INIT_MMX mmx`
			`ADD_INT16`
			`%endif`

			`INIT_XMM sse2`
			`ADD_INT16`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00
avcodec/huffyuvdsp : add add_int16 AVX2 func 2017-11-21 10:14:35 +02:00			`%if HAVE_AVX2_EXTERNAL`
			`INIT_YMM avx2`
			`ADD_INT16`
			`%endif`

x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 22:57:38 +03:00			`; void add_hfyu_left_pred_bgr32(uint8_t dst, const uint8_t src,`
			`; intptr_t w, uint8_t *left)`
			`%macro LEFT_BGR32 0`
			`cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left`
			`shl wq, 2`
			`movd m0, [leftq]`
			`lea dstq, [dstq + wq]`
			`lea srcq, [srcq + wq]`
			`LSHIFT m0, mmsize-4`
			`neg wq`
			`.loop:`
			`movu m1, [srcq+wq]`
			`mova m2, m1`
			`%if mmsize == 8`
			`punpckhdq m0, m0`
			`%endif`
			`LSHIFT m1, 4`
			`paddb m1, m2`
			`%if mmsize == 16`
			`pshufd m0, m0, q3333`
			`mova m2, m1`
			`LSHIFT m1, 8`
			`paddb m1, m2`
			`%endif`
			`paddb m0, m1`
			`movu [dstq+wq], m0`
			`add wq, mmsize`
			`jl .loop`
			`movd m0, [dstq-4]`
			`movd [leftq], m0`
			`REP_RET`
			`%endmacro`

x86: huffyuvdsp: fewer functions for x86_64 When there are 2 functions that are <= SSE2, only one is needed for x86_64. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-30 12:57:56 +03:00			`%if ARCH_X86_32`
x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 22:57:38 +03:00			`INIT_MMX mmx`
			`LEFT_BGR32`
x86: huffyuvdsp: fewer functions for x86_64 When there are 2 functions that are <= SSE2, only one is needed for x86_64. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-30 12:57:56 +03:00			`%endif`
x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 22:57:38 +03:00			`INIT_XMM sse2`
			`LEFT_BGR32`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 2017-01-08 16:48:05 +02:00
			`; void add_hfyu_median_prediction_mmxext(uint8_t dst, const uint8_t top, const uint8_t diff, int mask, int w, int left, int *left_top)`
			`INIT_MMX mmxext`
			`cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top`
			`add wd, wd`
			`movd mm6, maskd`
			`SPLATW mm6, mm6`
			`movq mm0, [topq]`
			`movq mm2, mm0`
			`movd mm4, [left_topq]`
			`psllq mm2, 16`
			`movq mm1, mm0`
			`por mm4, mm2`
			`movd mm3, [leftq]`
			`psubw mm0, mm4 ; t-tl`
			`add dstq, wq`
			`add topq, wq`
			`add diffq, wq`
			`neg wq`
			`jmp .skip`
			`.loop:`
			`movq mm4, [topq+wq]`
			`movq mm0, mm4`
			`psllq mm4, 16`
			`por mm4, mm1`
			`movq mm1, mm0 ; t`
			`psubw mm0, mm4 ; t-tl`
			`.skip:`
			`movq mm2, [diffq+wq]`
			`%assign i 0`
			`%rep 4`
			`movq mm4, mm0`
			`paddw mm4, mm3 ; t-tl+l`
			`pand mm4, mm6`
			`movq mm5, mm3`
			`pmaxsw mm3, mm1`
			`pminsw mm5, mm1`
			`pminsw mm3, mm4`
			`pmaxsw mm3, mm5 ; median`
			`paddw mm3, mm2 ; +residual`
			`pand mm3, mm6`
			`%if i==0`
			`movq mm7, mm3`
			`psllq mm7, 48`
			`%else`
			`movq mm4, mm3`
			`psrlq mm7, 16`
			`psllq mm4, 48`
			`por mm7, mm4`
			`%endif`
			`%if i<3`
			`psrlq mm0, 16`
			`psrlq mm1, 16`
			`psrlq mm2, 16`
			`%endif`
			`%assign i i+1`
			`%endrep`
			`movq [dstq+wq], mm7`
			`add wq, 8`
			`jl .loop`
			`movzx r2d, word [dstq-2]`
			`mov [leftq], r2d`
			`movzx r2d, word [topq-2]`
			`mov [left_topq], r2d`
			`RET`