You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-11-23 21:54:53 +02:00
avcodec/x86/h264_chromamc: Add SSSE3 RV40 chroma motion compensation functions
The only difference between the RV40 functions and the H.264/VC-1 versions is the bias constant, which for RV40 depends on the shift parameters. This value ends up in a register, so one can reuse the H.264 code by setting up the registers for RV40 and then jumping into the relevant H.264 function, making the four new functions cheap (just 256 bytes in total). This approach uses one jump more for the no-filter case and one jump less in the one-dimensional case than an approach using separate functions.

Benchmarks:
avg_chroma_mc4_c:      167.5 ( 1.00x)
avg_chroma_mc4_mmxext:  48.1 ( 3.48x)
avg_chroma_mc4_ssse3:   31.1 ( 5.39x)
avg_chroma_mc8_c:      325.5 ( 1.00x)
avg_chroma_mc8_mmxext: 103.2 ( 3.15x)
avg_chroma_mc8_ssse3:   33.5 ( 9.71x)
put_chroma_mc4_c:      137.4 ( 1.00x)
put_chroma_mc4_mmx:     44.5 ( 3.09x)
put_chroma_mc4_ssse3:   28.4 ( 4.83x)
put_chroma_mc8_c:      271.4 ( 1.00x)
put_chroma_mc8_mmx:     99.9 ( 2.72x)
put_chroma_mc8_ssse3:   30.6 ( 8.86x)

Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -24,6 +24,7 @@
|
||||
* RV40 decoder motion compensation functions
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes_internal.h"
|
||||
#include "libavutil/common.h"
|
||||
#include "libavutil/intreadwrite.h"
|
||||
#include "h264qpel.h"
|
||||
@@ -283,7 +284,7 @@ static void avg_rv40_qpel8_mc33_c(uint8_t *dst, const uint8_t *src, ptrdiff_t st
|
||||
avg_pixels8_xy2_8_c(dst, src, stride, 8);
|
||||
}
|
||||
|
||||
static const int rv40_bias[4][4] = {
|
||||
attribute_visibility_hidden const int ff_rv40_bias[4][4] = {
|
||||
{ 0, 16, 32, 16 },
|
||||
{ 32, 28, 32, 28 },
|
||||
{ 0, 32, 16, 32 },
|
||||
@@ -300,7 +301,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst /*align 8*/,\
|
||||
const int C = (8-x) * ( y);\
|
||||
const int D = ( x) * ( y);\
|
||||
int i;\
|
||||
int bias = rv40_bias[y>>1][x>>1];\
|
||||
int bias = ff_rv40_bias[y>>1][x>>1];\
|
||||
\
|
||||
av_assert2(x<8 && y<8 && x>=0 && y>=0);\
|
||||
\
|
||||
@@ -336,7 +337,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/,\
|
||||
const int C = (8-x) * ( y);\
|
||||
const int D = ( x) * ( y);\
|
||||
int i;\
|
||||
int bias = rv40_bias[y>>1][x>>1];\
|
||||
int bias = ff_rv40_bias[y>>1][x>>1];\
|
||||
\
|
||||
av_assert2(x<8 && y<8 && x>=0 && y>=0);\
|
||||
\
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "config_components.asm"
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
@@ -64,6 +65,8 @@ pw_28: times 8 dw 28
|
||||
cextern pw_32
|
||||
cextern pw_64
|
||||
|
||||
cextern rv40_bias
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro mv0_pixels_mc8 0
|
||||
@@ -447,11 +450,12 @@ chroma_mc4_mmx_func avg, rv40
|
||||
chroma_mc2_mmx_func avg, h264
|
||||
|
||||
%macro chroma_mc8_ssse3_func 2-3
|
||||
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
|
||||
cglobal %1_%2_chroma_mc8%3, 6, 7+UNIX64, 8
|
||||
mov r6d, r5d
|
||||
or r6d, r4d
|
||||
jne .at_least_one_non_zero
|
||||
; mx == 0 AND my == 0 - no filter needed
|
||||
..@%1_%2_chroma_mc8_no_filter_ %+ cpuname:
|
||||
mv0_pixels_mc8
|
||||
RET
|
||||
|
||||
@@ -462,6 +466,8 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
|
||||
je .mx_is_zero
|
||||
|
||||
; general case, bilinear
|
||||
movdqa m5, [rnd_2d_%2]
|
||||
..@%1_%2_chroma_mc8_both_nonzero_ %+ cpuname:
|
||||
mov r6d, r4d
|
||||
shl r4d, 8
|
||||
sub r4, r6
|
||||
@@ -473,7 +479,6 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
|
||||
|
||||
movd m7, r6d
|
||||
movd m6, r4d
|
||||
movdqa m5, [rnd_2d_%2]
|
||||
movq m0, [r1 ]
|
||||
movq m1, [r1+1]
|
||||
pshuflw m7, m7, 0
|
||||
@@ -517,12 +522,13 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
|
||||
RET
|
||||
|
||||
.my_is_zero:
|
||||
movdqa m6, [rnd_1d_%2]
|
||||
..@%1_%2_chroma_mc8_my_zero_ %+ cpuname:
|
||||
mov r5d, r4d
|
||||
shl r4d, 8
|
||||
add r4, 8
|
||||
sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
|
||||
movd m7, r4d
|
||||
movdqa m6, [rnd_1d_%2]
|
||||
pshuflw m7, m7, 0
|
||||
movlhps m7, m7
|
||||
|
||||
@@ -554,12 +560,13 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
|
||||
RET
|
||||
|
||||
.mx_is_zero:
|
||||
movdqa m6, [rnd_1d_%2]
|
||||
..@%1_%2_chroma_mc8_mx_zero_ %+ cpuname:
|
||||
mov r4d, r5d
|
||||
shl r5d, 8
|
||||
add r5, 8
|
||||
sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
|
||||
movd m7, r5d
|
||||
movdqa m6, [rnd_1d_%2]
|
||||
pshuflw m7, m7, 0
|
||||
movlhps m7, m7
|
||||
|
||||
@@ -592,7 +599,9 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
|
||||
%endmacro
|
||||
|
||||
%macro chroma_mc4_ssse3_func 2
|
||||
cglobal %1_%2_chroma_mc4, 6, 7, 0
|
||||
cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
|
||||
movq m5, [pw_32]
|
||||
..@%1_%2_chroma_mc4_after_init_ %+ cpuname:
|
||||
mov r6, r4
|
||||
shl r4d, 8
|
||||
sub r4d, r6d
|
||||
@@ -604,7 +613,6 @@ cglobal %1_%2_chroma_mc4, 6, 7, 0
|
||||
|
||||
movd m7, r6d
|
||||
movd m6, r4d
|
||||
movq m5, [pw_32]
|
||||
movd m0, [r1 ]
|
||||
pshufw m7, m7, 0
|
||||
punpcklbw m0, [r1+1]
|
||||
@@ -641,16 +649,79 @@ cglobal %1_%2_chroma_mc4, 6, 7, 0
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; rv40_get_bias dst
;
; Loads one 32-bit entry of the rv40_bias table into vector register %1 and
; then applies SPLATW to it.
; Inputs : r4d and r5d (per the existing comments: mx/my, each in [0,7]).
;          The C reference indexes the same table as bias[y>>1][x>>1], so
;          r5 plays the role of y and r4 the role of x.
; Scratch: r6 always; r7 additionally in the PIC && UNIX64 variant.
; r5 is modified temporarily in the PIC && !UNIX64 variant, but restored
; before the macro ends.
; NOTE(review): rv40_bias is declared via cextern; presumably this resolves to
; the C symbol ff_rv40_bias through x86inc's symbol prefix - confirm.
%macro rv40_get_bias 1 ; dst reg
|
||||
%if !PIC || UNIX64
|
||||
; on UNIX64 we have enough volatile registers
|
||||
%if PIC && UNIX64
|
||||
; keep the table address in r7 so the indexed load below can use base+index
lea r7, [rv40_bias]
|
||||
%endif
|
||||
; work on a copy so that r5 stays intact for the code we jump into afterwards
mov r6d, r5d
|
||||
and r6d, 6 ; &~1 for mx/my=[0,7]
|
||||
; r6d = (r5 & 6)*4 + r4; after the shift below this is (r5>>1)*4 + (r4>>1),
; i.e. the flat element index into a 4x4 table
lea r6d, [r6d*4+r4d]
|
||||
sar r6d, 1
|
||||
%if PIC && UNIX64
|
||||
; entries are 4 bytes wide, hence the scale factor of 4
movd %1, [r7+4*r6]
|
||||
%else
|
||||
; non-PIC: the table symbol can be used directly as displacement
movd %1, [rv40_bias+4*r6]
|
||||
%endif
|
||||
%else ; PIC && !UNIX64, de facto WIN64
|
||||
; only r6 is available as scratch here: it holds the table address, so the
; index has to be computed in r5 itself and r5 restored afterwards
lea r6, [rv40_bias]
|
||||
; NOTE(review): presumably r5m == r5d means the argument has no memory home to
; reload from, in which case it is preserved via push/pop instead - confirm
%ifidn r5d, r5m ; always false for currently supported calling conventions
|
||||
push r5
|
||||
%endif
|
||||
and r5d, 6 ; &~1 for mx/my=[0,7]
|
||||
; same index computation as above, but done in place in r5
lea r5d, [r5d*4+r4d]
|
||||
sar r5d, 1
|
||||
movd %1, [r6+4*r5]
|
||||
; restore the original r5 value for the code that runs after this macro
%ifidn r5d, r5m
|
||||
pop r5
|
||||
%else
|
||||
mov r5d, r5m
|
||||
%endif
|
||||
%endif
|
||||
; NOTE(review): SPLATW comes from x86util.asm; presumably it broadcasts the
; low word of %1 to every word lane - confirm
SPLATW %1, %1
|
||||
%endmacro
|
||||
|
||||
; rv40_chroma_mc8_func put|avg
;
; Emits the entry point rv40_<op>_chroma_mc8 (only when CONFIG_RV40_DECODER is
; set). The function contains no filter code of its own: it loads the RV40
; bias into the registers in which the H.264 code keeps its rounding constant
; and then jumps to the matching "..@<op>_h264_chroma_mc8_*_<cpuname>" label
; inside the H.264 function generated by chroma_mc8_ssse3_func. Those labels
; are placed right after the H.264 code's own rounding-constant loads, so the
; value set up here is the one that gets used.
; The cglobal line requests the same argument/register counts as
; chroma_mc8_ssse3_func (6, 7+UNIX64, 8).
%macro rv40_chroma_mc8_func 1 ; put vs avg
|
||||
%if CONFIG_RV40_DECODER
|
||||
cglobal rv40_%1_chroma_mc8, 6, 7+UNIX64, 8
|
||||
; r4 | r5 == 0 (mx == 0 and my == 0): no filtering and no bias needed, so go
; straight to the H.264 no-filter path
mov r6d, r5d
|
||||
or r6d, r4d
|
||||
jz ..@%1_h264_chroma_mc8_no_filter_ %+ cpuname
|
||||
rv40_get_bias m5
|
||||
; the bilinear code expects bias in m5, the one-dimensional code in m6
|
||||
mova m6, m5
|
||||
; NOTE(review): the one-dimensional copy is the bias >> 3; presumably because
; the 1-D filter works at a scale 8x smaller than the bilinear one - confirm
psraw m6, 3
|
||||
; dispatch on which of the two offsets is zero (r5: my, r4: mx)
test r5d, r5d
|
||||
je ..@%1_h264_chroma_mc8_my_zero_ %+ cpuname
|
||||
test r4d, r4d
|
||||
je ..@%1_h264_chroma_mc8_mx_zero_ %+ cpuname
|
||||
; both offsets nonzero: general bilinear case
jmp ..@%1_h264_chroma_mc8_both_nonzero_ %+ cpuname
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; rv40_chroma_mc4_func put|avg
;
; Emits the entry point rv40_<op>_chroma_mc4 (only when CONFIG_RV40_DECODER is
; set). It loads the RV40 bias into m5 and jumps to the
; "..@<op>_h264_chroma_mc4_after_init_<cpuname>" label of the H.264 function
; generated by chroma_mc4_ssse3_func. That label sits directly after the H.264
; code's own "movq m5, [pw_32]", so the H.264 constant load is skipped and the
; RV40 bias is used instead.
%macro rv40_chroma_mc4_func 1 ; put vs avg
|
||||
%if CONFIG_RV40_DECODER
|
||||
; same argument/register counts as chroma_mc4_ssse3_func
cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 0
|
||||
rv40_get_bias m5
|
||||
jmp ..@%1_h264_chroma_mc4_after_init_ %+ cpuname
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Instantiate the SSSE3 chroma MC functions.
; Each rv40_* wrapper is instantiated in the same INIT_* section as the H.264
; function it jumps into, because the jump-target labels embed cpuname.
; NOTE(review): CHROMAMC_AVG is consumed by macros outside this view;
; presumably NOTHING yields plain stores ("put") and DIRECT_AVG averages with
; the destination ("avg") - confirm.

; "put" variants
%define CHROMAMC_AVG NOTHING
|
||||
; 8-pixel-wide functions
INIT_XMM ssse3
|
||||
chroma_mc8_ssse3_func put, h264, _rnd
|
||||
chroma_mc8_ssse3_func put, vc1, _nornd
|
||||
rv40_chroma_mc8_func put
|
||||
; 4-pixel-wide functions
INIT_MMX ssse3
|
||||
chroma_mc4_ssse3_func put, h264
|
||||
rv40_chroma_mc4_func put
|
||||
|
||||
; "avg" variants
%define CHROMAMC_AVG DIRECT_AVG
|
||||
; 8-pixel-wide functions
INIT_XMM ssse3
|
||||
chroma_mc8_ssse3_func avg, h264, _rnd
|
||||
chroma_mc8_ssse3_func avg, vc1, _nornd
|
||||
rv40_chroma_mc8_func avg
|
||||
; 4-pixel-wide functions
INIT_MMX ssse3
|
||||
chroma_mc4_ssse3_func avg, h264
|
||||
rv40_chroma_mc4_func avg
|
||||
|
||||
@@ -178,6 +178,12 @@ DEFINE_FN(avg, 8, ssse3)
|
||||
|
||||
DEFINE_FN(avg, 16, sse2)
|
||||
DEFINE_FN(avg, 16, ssse3)
|
||||
|
||||
/**
 * Declare the external function ff_rv40_<OP>_chroma_mc<SIZE>_<XMM> and install
 * it in the DSP context.
 *
 * Expands to a prototype followed by an assignment, so it must be used where
 * both a declaration and a statement are allowed and where a pointer named
 * `c` with <OP>_chroma_pixels_tab members is in scope. The expansion has no
 * trailing semicolon; the caller supplies it.
 *
 * @param OP   put or avg
 * @param SIZE block width, 8 or 4; selects table slot [SIZE == 4],
 *             i.e. slot 0 for 8 and slot 1 for 4
 * @param XMM  instruction-set suffix of the function name (e.g. ssse3)
 */
#define CHROMA_MC_FUNC(OP, SIZE, XMM) \
|
||||
void ff_rv40_ ## OP ## _chroma_mc ## SIZE ## _ ## XMM(uint8_t *dst, const uint8_t *src, \
|
||||
ptrdiff_t stride, int h, int x, int y);\
|
||||
c->OP ## _chroma_pixels_tab[SIZE == 4] = ff_rv40_ ## OP ## _chroma_mc ## SIZE ## _ ## XMM
|
||||
|
||||
#endif /* HAVE_X86ASM */
|
||||
|
||||
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
|
||||
@@ -204,6 +210,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
|
||||
QPEL_MC_SET(avg_, _sse2)
|
||||
}
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
CHROMA_MC_FUNC(put, 8, ssse3);
|
||||
CHROMA_MC_FUNC(put, 4, ssse3);
|
||||
CHROMA_MC_FUNC(avg, 8, ssse3);
|
||||
CHROMA_MC_FUNC(avg, 4, ssse3);
|
||||
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3;
|
||||
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3;
|
||||
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3;
|
||||
|
||||
Reference in New Issue
Block a user