1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00

avcodec/x86/vp3dsp: Port loop filters to SSE2

The old code operated on bytes and did lots of tricks
due to their limited range; it did not completely succeed,
which is why the old versions were not used when bitexact
output was requested.

In contrast, the new version is much simpler: it operates
on signed 16-bit words, whose range is more than sufficient.
This means that these functions don't need a check for bitexactness
(and can be used in FATE).

Old benchmarks (for this, the AV_CODEC_FLAG_BITEXACT check has been
removed from checkasm):
h_loop_filter_c:                                        29.8 ( 1.00x)
h_loop_filter_mmxext:                                   32.2 ( 0.93x)
h_loop_filter_unaligned_c:                              29.9 ( 1.00x)
h_loop_filter_unaligned_mmxext:                         31.4 ( 0.95x)
v_loop_filter_c:                                        39.3 ( 1.00x)
v_loop_filter_mmxext:                                   14.2 ( 2.78x)
v_loop_filter_unaligned_c:                              38.9 ( 1.00x)
v_loop_filter_unaligned_mmxext:                         14.3 ( 2.72x)

New benchmarks:
h_loop_filter_c:                                        29.2 ( 1.00x)
h_loop_filter_sse2:                                     28.6 ( 1.02x)
h_loop_filter_unaligned_c:                              29.0 ( 1.00x)
h_loop_filter_unaligned_sse2:                           26.9 ( 1.08x)
v_loop_filter_c:                                        38.3 ( 1.00x)
v_loop_filter_sse2:                                     11.0 ( 3.47x)
v_loop_filter_unaligned_c:                              35.5 ( 1.00x)
v_loop_filter_unaligned_sse2:                           11.2 ( 3.18x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-10-09 14:58:05 +02:00
parent 5d9a392bce
commit e3ca57ae8f
5 changed files with 101 additions and 86 deletions

View File

@@ -325,7 +325,7 @@ typedef struct Vp3DecodeContext {
HuffTable huffman_table[5 * 16];
uint8_t filter_limit_values[64];
DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
VP4Predictor * dc_pred_row; /* dc_pred_row[y_superblock_width * 4] */
} Vp3DecodeContext;

View File

@@ -494,5 +494,10 @@ void ff_vp3dsp_set_bounding_values(int * bounding_values_array, int filter_limit
}
if (value)
bounding_values[128] = value;
#if ARCH_X86
bounding_values[129] = bounding_values[130] =
bounding_values[131] = bounding_values[132] = filter_limit * 0x00020002U;
#else
bounding_values[129] = bounding_values[130] = filter_limit * 0x02020202U;
#endif
}

View File

@@ -33,113 +33,125 @@ vp3_idct_data: times 8 dw 64277
times 8 dw 25080
times 8 dw 12785
pb_7: times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81
cextern pb_1
cextern pb_3
cextern pb_80
cextern pb_FE
cextern pw_4
cextern pw_8
SECTION .text
; VP3_LOOP_FILTER: computes the VP3 loop-filter adjustment and applies it to
; p1 and p2.
;
; NOTE(review): no +/- markers are visible in this listing. It appears to
; interleave the removed byte-based lines (movq / p*b / pavgb) with the added
; word-based lines (mova / p*w) of the patch, so the body cannot be read as
; one straight-line program. Both register contracts are kept below; the
; added end-of-line annotations trace the word-based lines only, starting
; from the word-based input contract.
;
; Byte-based variant:
; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
; Word-based variant:
; in: p0 in m5, p1 in m4, p2 in m2, p3 in m1, all unpacked;
; m0 must be zeroed
; out: p1 in m4, p2 in m2
%macro VP3_LOOP_FILTER 0
movq m7, m6
pand m6, [pb_7] ; p0&7
psrlw m7, 3
pand m7, [pb_1F] ; p0>>3
movq m3, m2 ; p2
pxor m2, m4
pand m2, [pb_1] ; (p2^p1)&1
movq m5, m2
paddb m2, m2
paddb m2, m5 ; 3*(p2^p1)&1
paddb m2, m6 ; extra bits lost in shifts
pcmpeqb m0, m0
pxor m1, m0 ; 255 - p3
pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
pxor m0, m4 ; 255 - p1
pavgb m0, m3 ; (256 + p2-p1) >> 1
paddb m1, [pb_3]
pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
paddusb m7, m1 ; d+128+1
movq m6, [pb_81]
psubusb m6, m7
psubusb m7, [pb_81]
psubw m5, m1 ; p0 - p3
mova m3, m2 ; copy of p2
paddw m5, [pw_4] ; p0 - p3 + pw_4 (presumably 4 per word, the rounding term; pw_4 is defined elsewhere)
psubw m3, m4 ; p2 - p1
mova m1, m3
paddw m1, m5 ; (p2 - p1) + (p0 - p3) + pw_4
mova m5, [r2+516] ; 2 * filter limit
paddw m3, m3 ; 2*(p2 - p1)
paddw m3, m1 ; 3*(p2 - p1) + (p0 - p3) + pw_4
psraw m3, 3 ; x = the sum above, arithmetically shifted right by 3
movq m5, [r2+516] ; flim
pminub m6, m5
pminub m7, m5
movq m0, m6
movq m1, m7
paddb m6, m6
paddb m7, m7
pminub m6, m5
pminub m7, m5
psubb m6, m0
psubb m7, m1
paddusb m4, m7
psubusb m4, m6
psubusb m3, m7
paddusb m3, m6
; We use that clamp(2*clamp(x, 2f), 2f) - clamp(x, 2f)
; (with f = filter limit and clamp(v, a) clamping v to the interval [-a, a])
; gives the desired filter value
psubw m0, m5 ; m0 = -2f (m0 is zero on entry)
pminsw m3, m5
pmaxsw m3, m0 ; m3 = clamp(x, 2f)
mova m1, m3
paddw m1, m1 ; 2*clamp(x, 2f)
pminsw m1, m5
pmaxsw m1, m0 ; clamp(2*clamp(x, 2f), 2f)
psubw m1, m3 ; m1 = filter value
psubw m2, m1 ; p2 - filter value
paddw m4, m1 ; p1 + filter value
packuswb m4, m4 ; saturate the words to unsigned bytes
packuswb m2, m2
%endmacro
; STORE_4_WORDS: writes four 16-bit words taken from %1 to four consecutive
; rows, each at byte offset -1 from the row address: [r0-1], [r0+r1-1],
; [r0+r1*2-1] and [r0+r3-1].
; Clobbers r2; the non-x86-64 path additionally shifts %1 right by 32 bits.
; NOTE(review): `shr r2, 16` and `shr r2d, 16` appear back to back below.
; This looks like the removed and the added form of the same line in a diff
; rendering without +/- markers - confirm against the real file. The word
; numbers in the annotations assume only one shift of each pair is present.
%macro STORE_4_WORDS 1
%if ARCH_X86_64
movq r2, %1 ; low 64 bits of %1, i.e. all four words
mov [r0 -1], r2w ; word 0
shr r2, 16
mov [r0+r1 -1], r2w ; word 1
shr r2, 16 ; words 2 and 3 are now in the low 32 bits of r2
%else
movd r2d, %1 ; low 32 bits of %1 (words 0 and 1)
mov [r0 -1], r2w ; word 0
psrlq %1, 32 ; bring words 2 and 3 down to the low 32 bits of %1
shr r2, 16
shr r2d, 16
mov [r0+r1 -1], r2w ; word 1
movd r2d, %1 ; words 2 and 3
%endif
mov [r0+r1*2-1], r2w ; word 2
shr r2, 16
shr r2d, 16
mov [r0+r3 -1], r2w ; word 3
%endmacro
; vp3_v_loop_filter: loads 8 bytes from each of the four rows src-2*stride,
; src-stride, src and src+stride, runs VP3_LOOP_FILTER on them and writes the
; two middle rows (src-stride and src) back.
; Register roles assume the x86inc convention that r0, r1, r2 carry the three
; arguments in order (src, stride, bounding_values per the C prototype);
; r2 is only read inside VP3_LOOP_FILTER.
; NOTE(review): two INIT_*/cglobal headers and duplicated loads/stores appear
; below; this looks like the removed MMXEXT lines interleaved with the added
; SSE2 lines of a diff without +/- markers - confirm against the real file.
INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
mov r3, r1 ; keep +stride in r3, since r1 is negated below
INIT_XMM sse2
cglobal vp3_v_loop_filter, 3, 3, 6
movq m1, [r0+r1 ] ; row src+stride, read before r1 is negated
neg r1 ; r1 = -stride
movq m6, [r0+r1*2] ; row src-2*stride
movq m4, [r0+r1 ] ; row src-stride
movq m2, [r0 ] ; row src
movq m1, [r0+r3 ] ; row src+stride
movq m4, [r0+r1 ] ; row src-stride
movq m5, [r0+r1*2] ; row src-2*stride
pxor m0, m0 ; zero register for the byte->word unpacks
punpcklbw m1, m0 ; widen the 8 bytes of each row to 8 words
punpcklbw m2, m0
punpcklbw m4, m0
punpcklbw m5, m0
VP3_LOOP_FILTER
movq [r0+r1], m4 ; write back row src-stride
movq [r0 ], m3 ; write back row src (from m3)
movq [r0 ], m2 ; write back row src (from m2)
RET
cglobal vp3_h_loop_filter, 3, 4
; TRANSPOSE4x4 dst
; Reads 4 bytes from each of four consecutive rows, starting 2 bytes before
; r0 (rows at r0, r0+r1, r0+r1*2, r0+r3), and leaves them transposed in the
; low 16 bytes of %1: byte 0 of every row, then byte 1 of every row, etc.
; Clobbers m2, m3 and m4, so %1 must be a different register.
%macro TRANSPOSE4x4 1
    ; rows 2 and 3 -> m3 = r2[0] r3[0] r2[1] r3[1] ...
    movd          m3, [r0+r1*2-2]
    movd          m4, [r0+r3 -2]
    punpcklbw     m3, m4
    ; rows 0 and 1 -> %1 = r0[0] r1[0] r0[1] r1[1] ...
    movd          %1, [r0 -2]
    movd          m2, [r0+r1 -2]
    punpcklbw     %1, m2
    ; merge both row pairs word-wise to finish the transpose
    punpcklwd     %1, m3
%endmacro
; vp3_h_loop_filter: reads 4 bytes starting at offset -2 from each of 8
; consecutive rows, transposes them, runs VP3_LOOP_FILTER and writes two bytes
; per row back at offset -1 (via STORE_4_WORDS).
; Register roles assume the x86inc convention that r0, r1, r2 carry the three
; arguments in order (src, stride, bounding_values per the C prototype);
; r3 is set to 3*stride below.
; NOTE(review): this listing appears to interleave removed lines (m6-based
; loads, TRANSPOSE4x4B, SBUTTERFLY, stores from m3) with the added SSE2 lines
; of a diff without +/- markers, so it cannot be read as one straight-line
; program - confirm against the real file.
INIT_XMM sse2
cglobal vp3_h_loop_filter, 3, 4, 6
lea r3, [r1*3] ; r3 = 3*stride, used to address the fourth row of a group
movd m6, [r0 -2] ; 4 bytes of rows 0..3
movd m4, [r0+r1 -2]
movd m2, [r0+r1*2-2]
movd m1, [r0+r3 -2]
lea r0, [r0+r1*4 ] ; advance to row 4
punpcklbw m6, [r0 -2] ; interleave with the bytes of rows 4..7
punpcklbw m4, [r0+r1 -2]
punpcklbw m2, [r0+r1*2-2]
punpcklbw m1, [r0+r3 -2]
TRANSPOSE4x4 m5 ; m5 = transposed 4x4 bytes of rows 0..3
lea r0, [r0+r1*4] ; advance to row 4
TRANSPOSE4x4 m0 ; m0 = transposed 4x4 bytes of rows 4..7
mova m2, m5
punpckldq m5, m0 ; m5 = columns 0 and 1, each as 8 bytes (rows 0..7)
punpckhdq m2, m0 ; m2 = columns 2 and 3, each as 8 bytes (rows 0..7)
pxor m0, m0 ; zero register for the byte->word unpacks
mova m4, m5
punpcklbw m5, m0 ; column 0 as words (p0)
punpckhbw m4, m0 ; column 1 as words (p1)
mova m1, m2
punpcklbw m2, m0 ; column 2 as words (p2)
punpckhbw m1, m0 ; column 3 as words (p3)
VP3_LOOP_FILTER
punpcklbw m4, m2 ; interleave filtered p1/p2 bytes: one word per row
mova m2, m4
punpckhqdq m2, m2 ; m2 = words of rows 4..7
STORE_4_WORDS m2 ; r0 points at row 4 here
sub r0, r3
sub r0, r1 ; r0 -= 4*stride, back to row 0
TRANSPOSE4x4B 6, 4, 2, 1, 0
VP3_LOOP_FILTER
SBUTTERFLY bw, 4, 3, 5
STORE_4_WORDS m4
lea r0, [r0+r1*4 ]
STORE_4_WORDS m3
RET
%macro PAVGB_NO_RND 0

View File

@@ -18,12 +18,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@@ -31,10 +31,10 @@ void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_vp3_h_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
int *bounding_values);
void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
const uint8_t *b, ptrdiff_t stride,
@@ -50,15 +50,13 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_mmxext;
c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_mmxext;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;
}
}

View File

@@ -55,7 +55,7 @@ static void vp3_check_loop_filter(void)
DECLARE_ALIGNED(8, uint8_t, hor_buf1)[HORIZONTAL_BUF_SIZE];
DECLARE_ALIGNED(8, uint8_t, ver_buf0)[VERTICAL_BUF_SIZE];
DECLARE_ALIGNED(8, uint8_t, ver_buf1)[VERTICAL_BUF_SIZE];
DECLARE_ALIGNED(8, int, bounding_values_array)[256 + 2];
DECLARE_ALIGNED(16, int, bounding_values_array)[256 + 4];
int *const bounding_values = bounding_values_array + 127;
VP3DSPContext vp3dsp;
static const struct {
@@ -72,7 +72,7 @@ static void vp3_check_loop_filter(void)
{ TEST(v_loop_filter), 2, 1, 0, 7, 8, 0 },
{ TEST(h_loop_filter), 0, 7, 2, 1, 8, 1 },
};
declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
declare_func(void, uint8_t *src, ptrdiff_t stride, int *bounding_values);
ff_vp3dsp_init(&vp3dsp, AV_CODEC_FLAG_BITEXACT);