avcodec/x86/mpegvideoencdsp_init: Use xmm registers in SSSE3 functions
Improves performance and no longer breaks the ABI (by forgetting to call emms).

Old benchmarks:
add_8x8basis_c:     43.6 ( 1.00x)
add_8x8basis_ssse3: 12.3 ( 3.55x)

New benchmarks:
add_8x8basis_c:     43.0 ( 1.00x)
add_8x8basis_ssse3:  6.3 ( 6.79x)

Notice that the output of try_8x8basis_ssse3 changes a bit: before this
commit, it computes certain values and adds the values for i, i+1, i+4
and i+5 before right-shifting them; now it adds the values for i, i+1,
i+8 and i+9. The second pair in these lists could be avoided (by
shifting xmm0 and xmm1 before adding both together instead of only
shifting xmm0 after adding them), but the i, i+1 pairing is inherent in
using pmaddwd. This is the reason that this function is not bitexact.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
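As a standalone illustration of the bitexactness note above (toy values, not FFmpeg code): each ">> 4" is applied to the sum of two pmaddwd dwords, so regrouping which dwords are added before the shift can change the truncated total.

    #include <stdio.h>

    int main(void)
    {
        /* Four dwords as pmaddwd would produce them (each already the sum of
         * two squared terms); the values are made up purely to show the effect. */
        int t[4] = { 7, 9, 6, 10 };

        /* old: dword(i, i+1) is added to dword(i+4, i+5) before the shift
         * (modelled here as t[0]+t[2] and t[1]+t[3]) */
        int old_sum = ((t[0] + t[2]) >> 4) + ((t[1] + t[3]) >> 4); /* 0 + 1 = 1 */

        /* new: dword(i, i+1) is added to dword(i+8, i+9) before the shift
         * (modelled here as t[0]+t[1] and t[2]+t[3]) */
        int new_sum = ((t[0] + t[1]) >> 4) + ((t[2] + t[3]) >> 4); /* 1 + 1 = 2 */

        printf("old grouping: %d, new grouping: %d\n", old_sum, new_sum);
        return 0;
    }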
libavcodec/mpegvideo_enc.c
@@ -2296,7 +2296,7 @@ static av_always_inline void encode_mb_internal(MPVEncContext *const s,
      * and neither of these encoders currently supports 444. */
 #define INTERLACED_DCT(s) ((chroma_format == CHROMA_420 || chroma_format == CHROMA_422) && \
                            (s)->c.avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT)
-    int16_t weight[12][64];
+    DECLARE_ALIGNED(16, int16_t, weight)[12][64];
     int16_t orig[12][64];
     const int mb_x = s->c.mb_x;
     const int mb_y = s->c.mb_y;
@@ -4293,7 +4293,7 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
     return last_non_zero;
 }
 
-static int16_t basis[64][64];
+static DECLARE_ALIGNED(16, int16_t, basis)[64][64];
 
 static void build_basis(uint8_t *perm){
     int i, j, x, y;
@@ -4317,7 +4317,7 @@ static void build_basis(uint8_t *perm){
 static int dct_quantize_refine(MPVEncContext *const s, //FIXME breaks denoise?
                                int16_t *block, int16_t *weight, int16_t *orig,
                                int n, int qscale){
-    int16_t rem[64];
+    DECLARE_ALIGNED(16, int16_t, rem)[64];
     LOCAL_ALIGNED_16(int16_t, d1, [64]);
     const uint8_t *scantable;
     const uint8_t *perm_scantable;
libavcodec/x86/mpegvideoencdsp_init.c
@@ -35,13 +35,6 @@ int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
 #if HAVE_SSSE3_INLINE
 #define SCALE_OFFSET -1
 
-/*
- * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
- */
-#define PMULHRW(x, y, s, o) \
-    "pmulhrsw " #s ", " #x " \n\t" \
-    "pmulhrsw " #s ", " #y " \n\t"
-
 #define MAX_ABS 512
 
 static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)
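The comment removed above documents the pmulhrsw semantics that the rewritten functions now use directly. A scalar model of one 16-bit lane (a sketch for reference, not part of the patch):

    #include <stdint.h>

    /* pmulhrsw, one lane: multiply two signed 16-bit values, add the rounding
     * constant 0x4000, keep bits 15..30 of the 32-bit product. */
    static inline int16_t pmulhrsw_lane(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
    }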
@@ -52,36 +45,39 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
     scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
     __asm__ volatile(
-        "pxor %%mm7, %%mm7              \n\t"
-        "movd %4, %%mm5                 \n\t"
-        "punpcklwd %%mm5, %%mm5         \n\t"
-        "punpcklwd %%mm5, %%mm5         \n\t"
+        "pxor %%xmm2, %%xmm2            \n\t"
+        "movd %4, %%xmm3                \n\t"
+        "punpcklwd %%xmm3, %%xmm3       \n\t"
+        "pshufd $0, %%xmm3, %%xmm3      \n\t"
         ".p2align 4                     \n\t"
         "1:                             \n\t"
-        "movq (%1, %0), %%mm0           \n\t"
-        "movq 8(%1, %0), %%mm1          \n\t"
-        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-        "paddw (%2, %0), %%mm0          \n\t"
-        "paddw 8(%2, %0), %%mm1         \n\t"
-        "psraw $6, %%mm0                \n\t"
-        "psraw $6, %%mm1                \n\t"
-        "pmullw (%3, %0), %%mm0         \n\t"
-        "pmullw 8(%3, %0), %%mm1        \n\t"
-        "pmaddwd %%mm0, %%mm0           \n\t"
-        "pmaddwd %%mm1, %%mm1           \n\t"
-        "paddd %%mm1, %%mm0             \n\t"
-        "psrld $4, %%mm0                \n\t"
-        "paddd %%mm0, %%mm7             \n\t"
-        "add $16, %0                    \n\t"
-        "cmp $128, %0                   \n\t" //FIXME optimize & bench
-        " jb 1b                         \n\t"
-        "pshufw $0x0E, %%mm7, %%mm6     \n\t"
-        "paddd %%mm6, %%mm7             \n\t" // faster than phaddd on core2
-        "psrld $2, %%mm7                \n\t"
-        "movd %%mm7, %0                 \n\t"
-
+        "movdqa (%1, %0), %%xmm0        \n\t"
+        "movdqa 16(%1, %0), %%xmm1      \n\t"
+        "pmulhrsw %%xmm3, %%xmm0        \n\t"
+        "pmulhrsw %%xmm3, %%xmm1        \n\t"
+        "paddw (%2, %0), %%xmm0         \n\t"
+        "paddw 16(%2, %0), %%xmm1       \n\t"
+        "psraw $6, %%xmm0               \n\t"
+        "psraw $6, %%xmm1               \n\t"
+        "pmullw (%3, %0), %%xmm0        \n\t"
+        "pmullw 16(%3, %0), %%xmm1      \n\t"
+        "pmaddwd %%xmm0, %%xmm0         \n\t"
+        "pmaddwd %%xmm1, %%xmm1         \n\t"
+        "paddd %%xmm1, %%xmm0           \n\t"
+        "psrld $4, %%xmm0               \n\t"
+        "paddd %%xmm0, %%xmm2           \n\t"
+        "add $32, %0                    \n\t"
+        "cmp $128, %0                   \n\t" //FIXME optimize & bench
+        " jb 1b                         \n\t"
+        "pshufd $0x0E, %%xmm2, %%xmm0   \n\t"
+        "paddd %%xmm0, %%xmm2           \n\t"
+        "pshufd $0x01, %%xmm2, %%xmm0   \n\t"
+        "paddd %%xmm0, %%xmm2           \n\t"
+        "psrld $2, %%xmm2               \n\t"
+        "movd %%xmm2, %0                \n\t"
         : "+r" (i)
         : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")
     );
     return i;
 }
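For reference, a scalar sketch of what the rewritten try_8x8basis_ssse3 loop computes, with the new i, i+1, i+8, i+9 grouping spelled out. This is an illustration only: scale16 stands in for the broadcast 16-bit factor left in every word lane of xmm3 after the shift by 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT, and 16-bit wraparound is modelled with int16_t casts.

    #include <stdint.h>

    static int try_8x8basis_scalar(const int16_t rem[64], const int16_t weight[64],
                                   const int16_t basis[64], int16_t scale16)
    {
        uint32_t sum = 0;
        for (int i = 0; i < 64; i += 16) {        /* one loop pass: xmm0 = [i..i+7], xmm1 = [i+8..i+15] */
            for (int j = 0; j < 8; j += 2) {      /* one dword lane of "paddd %%xmm1, %%xmm0" */
                uint32_t group = 0;
                for (int k = 0; k < 16; k += 8) { /* xmm0 lane, then the matching xmm1 lane */
                    for (int l = 0; l < 2; l++) { /* the pair summed by pmaddwd */
                        int idx = i + j + k + l;
                        int16_t v = (int16_t)(((int32_t)basis[idx] * scale16 + 0x4000) >> 15); /* pmulhrsw */
                        v = (int16_t)(v + rem[idx]) >> 6;                                      /* paddw, psraw $6 */
                        v = (int16_t)(v * weight[idx]);                                        /* pmullw */
                        group += (int32_t)v * v;                                               /* pmaddwd */
                    }
                }
                sum += group >> 4;                /* psrld $4, then paddd into the xmm2 accumulator */
            }
        }
        return (int)(sum >> 2);                   /* horizontal add of the four lanes, psrld $2, movd */
    }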
@@ -93,24 +89,25 @@ static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int sca
     if (FFABS(scale) < 1024) {
         scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
         __asm__ volatile(
-            "movd %3, %%mm5                 \n\t"
-            "punpcklwd %%mm5, %%mm5         \n\t"
-            "punpcklwd %%mm5, %%mm5         \n\t"
+            "movd %3, %%xmm2                \n\t"
+            "punpcklwd %%xmm2, %%xmm2       \n\t"
+            "pshufd $0, %%xmm2, %%xmm2      \n\t"
             ".p2align 4                     \n\t"
             "1:                             \n\t"
-            "movq (%1, %0), %%mm0           \n\t"
-            "movq 8(%1, %0), %%mm1          \n\t"
-            PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-            "paddw (%2, %0), %%mm0          \n\t"
-            "paddw 8(%2, %0), %%mm1         \n\t"
-            "movq %%mm0, (%2, %0)           \n\t"
-            "movq %%mm1, 8(%2, %0)          \n\t"
-            "add $16, %0                    \n\t"
-            "cmp $128, %0                   \n\t" // FIXME optimize & bench
-            " jb 1b                         \n\t"
-
+            "movdqa (%1, %0), %%xmm0        \n\t"
+            "movdqa 16(%1, %0), %%xmm1      \n\t"
+            "pmulhrsw %%xmm2, %%xmm0        \n\t"
+            "pmulhrsw %%xmm2, %%xmm1        \n\t"
+            "paddw (%2, %0), %%xmm0         \n\t"
+            "paddw 16(%2, %0), %%xmm1       \n\t"
+            "movdqa %%xmm0, (%2, %0)        \n\t"
+            "movdqa %%xmm1, 16(%2, %0)      \n\t"
+            "add $32, %0                    \n\t"
+            "cmp $128, %0                   \n\t" // FIXME optimize & bench
+            " jb 1b                         \n\t"
             : "+r" (i)
             : "r"(basis), "r"(rem), "g"(scale)
+              XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
         );
     } else {
         for (i=0; i<8*8; i++) {
tests/checkasm/mpegvideoencdsp.c
@@ -39,13 +39,13 @@
 
 static void check_add_8x8basis(MpegvideoEncDSPContext *c)
 {
-    declare_func_emms(AV_CPU_FLAG_SSSE3, void, int16_t rem[64], const int16_t basis[64], int scale);
+    declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
     if (check_func(c->add_8x8basis, "add_8x8basis")) {
         // FIXME: What are the actual ranges for these values?
         int scale = sign_extend(rnd(), 12);
-        int16_t rem1[64];
-        int16_t rem2[64];
-        int16_t basis[64];
+        DECLARE_ALIGNED(16, int16_t, rem1)[64];
+        DECLARE_ALIGNED(16, int16_t, rem2)[64];
+        DECLARE_ALIGNED(16, int16_t, basis)[64];
 
         randomize_buffer_clipped(basis, -15760, 15760);
         randomize_buffers(rem1, sizeof(rem1));