You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-11-23 21:54:53 +02:00
avcodec/x86/vp3dsp: Port loop filters to SSE2
The old code operated on bytes and did lots of tricks due to their limited range; it did not completely succeed, which is why the old versions were not used when bitexact output was requested.

In contrast, the new version is much simpler: It operates on signed 16-bit words whose range is more than sufficient. This means that these functions don't need a check for bitexactness (and can be used in FATE).

Old benchmarks (for this, the AV_CODEC_FLAG_BITEXACT check has been removed from checkasm):
h_loop_filter_c: 29.8 ( 1.00x)
h_loop_filter_mmxext: 32.2 ( 0.93x)
h_loop_filter_unaligned_c: 29.9 ( 1.00x)
h_loop_filter_unaligned_mmxext: 31.4 ( 0.95x)
v_loop_filter_c: 39.3 ( 1.00x)
v_loop_filter_mmxext: 14.2 ( 2.78x)
v_loop_filter_unaligned_c: 38.9 ( 1.00x)
v_loop_filter_unaligned_mmxext: 14.3 ( 2.72x)

New benchmarks:
h_loop_filter_c: 29.2 ( 1.00x)
h_loop_filter_sse2: 28.6 ( 1.02x)
h_loop_filter_unaligned_c: 29.0 ( 1.00x)
h_loop_filter_unaligned_sse2: 26.9 ( 1.08x)
v_loop_filter_c: 38.3 ( 1.00x)
v_loop_filter_sse2: 11.0 ( 3.47x)
v_loop_filter_unaligned_c: 35.5 ( 1.00x)
v_loop_filter_unaligned_sse2: 11.2 ( 3.18x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -325,7 +325,7 @@ typedef struct Vp3DecodeContext {
|
||||
HuffTable huffman_table[5 * 16];
|
||||
|
||||
uint8_t filter_limit_values[64];
|
||||
DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
|
||||
DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
|
||||
|
||||
VP4Predictor * dc_pred_row; /* dc_pred_row[y_superblock_width * 4] */
|
||||
} Vp3DecodeContext;
|
||||
|
||||
@@ -494,5 +494,10 @@ void ff_vp3dsp_set_bounding_values(int * bounding_values_array, int filter_limit
|
||||
}
|
||||
if (value)
|
||||
bounding_values[128] = value;
|
||||
#if ARCH_X86
|
||||
bounding_values[129] = bounding_values[130] =
|
||||
bounding_values[131] = bounding_values[132] = filter_limit * 0x00020002U;
|
||||
#else
|
||||
bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202U;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -33,113 +33,125 @@ vp3_idct_data: times 8 dw 64277
|
||||
times 8 dw 25080
|
||||
times 8 dw 12785
|
||||
|
||||
pb_7: times 8 db 0x07
|
||||
pb_1F: times 8 db 0x1f
|
||||
pb_81: times 8 db 0x81
|
||||
|
||||
cextern pb_1
|
||||
cextern pb_3
|
||||
cextern pb_80
|
||||
cextern pb_FE
|
||||
|
||||
cextern pw_4
|
||||
cextern pw_8
|
||||
|
||||
SECTION .text
|
||||
|
||||
; this is off by one or two for some cases when filter_limit is greater than 63
|
||||
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
|
||||
; out: p1 in mm4, p2 in mm3
|
||||
; in: p0 in m5, p1 in m4, p2 in m2, p3 in m1, all unpacked;
|
||||
; m0 must be zeroed
|
||||
; out: p1 in m4, p2 in m2
|
||||
%macro VP3_LOOP_FILTER 0
|
||||
movq m7, m6
|
||||
pand m6, [pb_7] ; p0&7
|
||||
psrlw m7, 3
|
||||
pand m7, [pb_1F] ; p0>>3
|
||||
movq m3, m2 ; p2
|
||||
pxor m2, m4
|
||||
pand m2, [pb_1] ; (p2^p1)&1
|
||||
movq m5, m2
|
||||
paddb m2, m2
|
||||
paddb m2, m5 ; 3*(p2^p1)&1
|
||||
paddb m2, m6 ; extra bits lost in shifts
|
||||
pcmpeqb m0, m0
|
||||
pxor m1, m0 ; 255 - p3
|
||||
pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
|
||||
pxor m0, m4 ; 255 - p1
|
||||
pavgb m0, m3 ; (256 + p2-p1) >> 1
|
||||
paddb m1, [pb_3]
|
||||
pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
|
||||
pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
|
||||
paddusb m7, m1 ; d+128+1
|
||||
movq m6, [pb_81]
|
||||
psubusb m6, m7
|
||||
psubusb m7, [pb_81]
|
||||
psubw m5, m1
|
||||
mova m3, m2
|
||||
paddw m5, [pw_4]
|
||||
psubw m3, m4
|
||||
mova m1, m3
|
||||
paddw m1, m5
|
||||
mova m5, [r2+516] ; 2 * filter limit
|
||||
paddw m3, m3
|
||||
paddw m3, m1
|
||||
psraw m3, 3
|
||||
|
||||
movq m5, [r2+516] ; flim
|
||||
pminub m6, m5
|
||||
pminub m7, m5
|
||||
movq m0, m6
|
||||
movq m1, m7
|
||||
paddb m6, m6
|
||||
paddb m7, m7
|
||||
pminub m6, m5
|
||||
pminub m7, m5
|
||||
psubb m6, m0
|
||||
psubb m7, m1
|
||||
paddusb m4, m7
|
||||
psubusb m4, m6
|
||||
psubusb m3, m7
|
||||
paddusb m3, m6
|
||||
; We use that clamp(2clamp(x,2f),2f)-clamp(x,2f)
|
||||
; (with f = filter limit and clamp(x,a) clamping x to the interval [-a,a])
|
||||
; gives the desired filter value
|
||||
psubw m0, m5
|
||||
pminsw m3, m5
|
||||
pmaxsw m3, m0
|
||||
mova m1, m3
|
||||
paddw m1, m1
|
||||
pminsw m1, m5
|
||||
pmaxsw m1, m0
|
||||
psubw m1, m3
|
||||
psubw m2, m1
|
||||
paddw m4, m1
|
||||
|
||||
packuswb m4, m4
|
||||
packuswb m2, m2
|
||||
%endmacro
|
||||
|
||||
%macro STORE_4_WORDS 1
|
||||
%if ARCH_X86_64
|
||||
movq r2, %1
|
||||
mov [r0 -1], r2w
|
||||
shr r2, 16
|
||||
mov [r0+r1 -1], r2w
|
||||
shr r2, 16
|
||||
%else
|
||||
movd r2d, %1
|
||||
mov [r0 -1], r2w
|
||||
psrlq %1, 32
|
||||
shr r2, 16
|
||||
shr r2d, 16
|
||||
mov [r0+r1 -1], r2w
|
||||
movd r2d, %1
|
||||
%endif
|
||||
mov [r0+r1*2-1], r2w
|
||||
shr r2, 16
|
||||
shr r2d, 16
|
||||
mov [r0+r3 -1], r2w
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
cglobal vp3_v_loop_filter, 3, 4
|
||||
mov r3, r1
|
||||
INIT_XMM sse2
|
||||
cglobal vp3_v_loop_filter, 3, 3, 6
|
||||
movq m1, [r0+r1 ]
|
||||
neg r1
|
||||
movq m6, [r0+r1*2]
|
||||
movq m4, [r0+r1 ]
|
||||
movq m2, [r0 ]
|
||||
movq m1, [r0+r3 ]
|
||||
movq m4, [r0+r1 ]
|
||||
movq m5, [r0+r1*2]
|
||||
|
||||
pxor m0, m0
|
||||
punpcklbw m1, m0
|
||||
punpcklbw m2, m0
|
||||
punpcklbw m4, m0
|
||||
punpcklbw m5, m0
|
||||
|
||||
VP3_LOOP_FILTER
|
||||
|
||||
movq [r0+r1], m4
|
||||
movq [r0 ], m3
|
||||
movq [r0 ], m2
|
||||
RET
|
||||
|
||||
cglobal vp3_h_loop_filter, 3, 4
|
||||
%macro TRANSPOSE4x4 1
|
||||
movd %1, [r0 -2]
|
||||
movd m2, [r0+r1 -2]
|
||||
movd m3, [r0+r1*2-2]
|
||||
movd m4, [r0+r3 -2]
|
||||
punpcklbw %1, m2
|
||||
punpcklbw m3, m4
|
||||
punpcklwd %1, m3
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
cglobal vp3_h_loop_filter, 3, 4, 6
|
||||
lea r3, [r1*3]
|
||||
|
||||
movd m6, [r0 -2]
|
||||
movd m4, [r0+r1 -2]
|
||||
movd m2, [r0+r1*2-2]
|
||||
movd m1, [r0+r3 -2]
|
||||
lea r0, [r0+r1*4 ]
|
||||
punpcklbw m6, [r0 -2]
|
||||
punpcklbw m4, [r0+r1 -2]
|
||||
punpcklbw m2, [r0+r1*2-2]
|
||||
punpcklbw m1, [r0+r3 -2]
|
||||
TRANSPOSE4x4 m5
|
||||
lea r0, [r0+r1*4]
|
||||
TRANSPOSE4x4 m0
|
||||
mova m2, m5
|
||||
punpckldq m5, m0
|
||||
punpckhdq m2, m0
|
||||
pxor m0, m0
|
||||
mova m4, m5
|
||||
punpcklbw m5, m0
|
||||
punpckhbw m4, m0
|
||||
mova m1, m2
|
||||
punpcklbw m2, m0
|
||||
punpckhbw m1, m0
|
||||
|
||||
VP3_LOOP_FILTER
|
||||
|
||||
punpcklbw m4, m2
|
||||
mova m2, m4
|
||||
punpckhqdq m2, m2
|
||||
|
||||
STORE_4_WORDS m2
|
||||
sub r0, r3
|
||||
sub r0, r1
|
||||
|
||||
TRANSPOSE4x4B 6, 4, 2, 1, 0
|
||||
VP3_LOOP_FILTER
|
||||
SBUTTERFLY bw, 4, 3, 5
|
||||
|
||||
STORE_4_WORDS m4
|
||||
lea r0, [r0+r1*4 ]
|
||||
STORE_4_WORDS m3
|
||||
RET
|
||||
|
||||
%macro PAVGB_NO_RND 0
|
||||
|
||||
@@ -18,12 +18,12 @@
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/vp3dsp.h"
|
||||
|
||||
void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
@@ -31,10 +31,10 @@ void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
|
||||
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
|
||||
|
||||
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
|
||||
int *bounding_values);
|
||||
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
|
||||
int *bounding_values);
|
||||
void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
|
||||
int *bounding_values);
|
||||
void ff_vp3_h_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
|
||||
int *bounding_values);
|
||||
|
||||
void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
|
||||
const uint8_t *b, ptrdiff_t stride,
|
||||
@@ -50,15 +50,13 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
|
||||
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
|
||||
|
||||
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
|
||||
c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_mmxext;
|
||||
c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_mmxext;
|
||||
}
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
c->idct_put = ff_vp3_idct_put_sse2;
|
||||
c->idct_add = ff_vp3_idct_add_sse2;
|
||||
|
||||
c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
|
||||
c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,7 +55,7 @@ static void vp3_check_loop_filter(void)
|
||||
DECLARE_ALIGNED(8, uint8_t, hor_buf1)[HORIZONTAL_BUF_SIZE];
|
||||
DECLARE_ALIGNED(8, uint8_t, ver_buf0)[VERTICAL_BUF_SIZE];
|
||||
DECLARE_ALIGNED(8, uint8_t, ver_buf1)[VERTICAL_BUF_SIZE];
|
||||
DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
|
||||
DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
|
||||
int *const bounding_values = bounding_values_array + 127;
|
||||
VP3DSPContext vp3dsp;
|
||||
static const struct {
|
||||
@@ -72,7 +72,7 @@ static void vp3_check_loop_filter(void)
|
||||
{ TEST(v_loop_filter), 2, 1, 0, 7, 8, 0 },
|
||||
{ TEST(h_loop_filter), 0, 7, 2, 1, 8, 1 },
|
||||
};
|
||||
declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
|
||||
declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
|
||||
|
||||
ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user