diff --git a/configure b/configure index d393983687..d4232f5d42 100755 --- a/configure +++ b/configure @@ -2430,7 +2430,7 @@ hap_encoder_deps="libsnappy" hap_encoder_select="texturedspenc" hevc_decoder_select="bswapdsp cabac golomb videodsp" huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" -huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llviddsp" +huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp" iac_decoder_select="imc_decoder" imc_decoder_select="bswapdsp fft mdct sinewin" indeo3_decoder_select="hpeldsp" diff --git a/libavcodec/huffyuv.c b/libavcodec/huffyuv.c index 492155550f..e582060cc3 100644 --- a/libavcodec/huffyuv.c +++ b/libavcodec/huffyuv.c @@ -76,7 +76,6 @@ av_cold void ff_huffyuv_common_init(AVCodecContext *avctx) s->flags = avctx->flags; ff_bswapdsp_init(&s->bdsp); - ff_llviddsp_init(&s->llviddsp, avctx); s->width = avctx->width; s->height = avctx->height; diff --git a/libavcodec/huffyuvdec.c b/libavcodec/huffyuvdec.c index 1b4112ce2f..d0682040b3 100644 --- a/libavcodec/huffyuvdec.c +++ b/libavcodec/huffyuvdec.c @@ -298,6 +298,7 @@ static av_cold int decode_init(AVCodecContext *avctx) return ret; ff_huffyuvdsp_init(&s->hdsp); + ff_llviddsp_init(&s->llviddsp, avctx); memset(s->vlc, 0, 4 * sizeof(VLC)); s->interlaced = avctx->height > 288; diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c index 572de16a1f..f5bc99f196 100644 --- a/libavcodec/huffyuvenc.c +++ b/libavcodec/huffyuvenc.c @@ -43,7 +43,7 @@ static inline void diff_bytes(HYuvContext *s, uint8_t *dst, if (s->bps <= 8) { s->hencdsp.diff_bytes(dst, src0, src1, w); } else { - s->llviddsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w); + s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w); } } @@ -84,7 +84,7 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst, dst16[i] = temp - left; left = temp; } - s->llviddsp.diff_int16(dst16 + 16, src16 + 16, src16 + 15, s->n 
- 1, w - 16); + s->hencdsp.diff_int16(dst16 + 16, src16 + 16, src16 + 15, s->n - 1, w - 16); return src16[w-1]; } } @@ -158,7 +158,7 @@ static void sub_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *s if (s->bps <= 8) { s->hencdsp.sub_hfyu_median_pred(dst, src1, src2, w , left, left_top); } else { - s->llviddsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top); + s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top); } } @@ -217,7 +217,7 @@ static av_cold int encode_init(AVCodecContext *avctx) const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt); ff_huffyuv_common_init(avctx); - ff_huffyuvencdsp_init(&s->hencdsp); + ff_huffyuvencdsp_init(&s->hencdsp, avctx); avctx->extradata = av_mallocz(3*MAX_N + 4); if (s->flags&AV_CODEC_FLAG_PASS1) { diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c index fdcd0b06aa..f051021094 100644 --- a/libavcodec/huffyuvencdsp.c +++ b/libavcodec/huffyuvencdsp.c @@ -53,6 +53,32 @@ static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, dst[i + 0] = src1[i + 0] - src2[i + 0]; } +static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){ + long i; +#if !HAVE_FAST_UNALIGNED + if((long)src2 & (sizeof(long)-1)){ + for(i=0; i+3> 1) * 0x0001000100010001ULL; + unsigned long pw_msb = pw_lsb + 0x0001000100010001ULL; + + for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) { + long a = *(long*)(src1+i); + long b = *(long*)(src2+i); + *(long*)(dst+i) = ((a|pw_msb) - (b&pw_lsb)) ^ ((a^b^pw_msb)&pw_msb); + } + } + for (; idiff_bytes = diff_bytes_c; + c->diff_int16 = diff_int16_c; c->sub_hfyu_median_pred = sub_hfyu_median_pred_c; + c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c; if (ARCH_X86) - ff_huffyuvencdsp_init_x86(c); + ff_huffyuvencdsp_init_x86(c, avctx); 
} diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h index 9d09095374..141dad8692 100644 --- a/libavcodec/huffyuvencdsp.h +++ b/libavcodec/huffyuvencdsp.h @@ -21,11 +21,18 @@ #include <stdint.h> +#include "avcodec.h" + typedef struct HuffYUVEncDSPContext { void (*diff_bytes)(uint8_t *dst /* align 16 */, const uint8_t *src1 /* align 16 */, const uint8_t *src2 /* align 1 */, intptr_t w); + void (*diff_int16)(uint16_t *dst /* align 16 */, + const uint16_t *src1 /* align 16 */, + const uint16_t *src2 /* align 1 */, + unsigned mask, int w); + /** * Subtract HuffYUV's variant of median prediction. * Note, this might read from src1[-1], src2[-1]. @@ -33,9 +40,12 @@ typedef struct HuffYUVEncDSPContext { void (*sub_hfyu_median_pred)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w, int *left, int *left_top); + void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1, + const uint16_t *src2, unsigned mask, + int w, int *left, int *left_top); } HuffYUVEncDSPContext; -void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c); -void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c); +void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, AVCodecContext *avctx); +void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx); #endif /* AVCODEC_HUFFYUVENCDSP_H */ diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c index 5440ce2e31..b93d4e7214 100644 --- a/libavcodec/lossless_videodsp.c +++ b/libavcodec/lossless_videodsp.c @@ -92,32 +92,6 @@ static void add_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w dst[i] = (dst[i] + src[i]) & mask; } -static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){ - long i; -#if !HAVE_FAST_UNALIGNED - if((long)src2 & (sizeof(long)-1)){ - for(i=0; i+3<w; i+=4){ - dst[i+0] = (src1[i+0]-src2[i+0]) & mask; - dst[i+1] = (src1[i+1]-src2[i+1]) & mask; - dst[i+2] = (src1[i+2]-src2[i+2]) & mask; - dst[i+3] = (src1[i+3]-src2[i+3]) & mask; - } - }else -#endif - { - unsigned long pw_lsb = (mask >> 1) * 0x0001000100010001ULL; - unsigned long pw_msb = pw_lsb + 0x0001000100010001ULL; - - for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) { - long a 
= *(long*)(src1+i); - long b = *(long*)(src2+i); - *(long*)(dst+i) = ((a|pw_msb) - (b&pw_lsb)) ^ ((a^b^pw_msb)&pw_msb); - } - } - for (; iadd_left_pred = add_left_pred_c; c->add_int16 = add_int16_c; - c->diff_int16= diff_int16_c; c->add_hfyu_left_pred_int16 = add_hfyu_left_pred_int16_c; c->add_hfyu_median_pred_int16 = add_hfyu_median_pred_int16_c; - c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c; if (ARCH_X86) ff_llviddsp_init_x86(c, avctx); diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h index e8ba175a5d..7f3168339f 100644 --- a/libavcodec/lossless_videodsp.h +++ b/libavcodec/lossless_videodsp.h @@ -35,9 +35,7 @@ typedef struct LLVidDSPContext { intptr_t w, int left); void (*add_int16)(uint16_t *dst/*align 16*/, const uint16_t *src/*align 16*/, unsigned mask, int w); - void (*diff_int16)(uint16_t *dst/*align 16*/, const uint16_t *src1/*align 16*/, const uint16_t *src2/*align 1*/, unsigned mask, int w); - void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); void (*add_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top); int (*add_hfyu_left_pred_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned left); } LLVidDSPContext; diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c index 51ae094770..3aeff83f42 100644 --- a/libavcodec/pngenc.c +++ b/libavcodec/pngenc.c @@ -1015,7 +1015,7 @@ FF_DISABLE_DEPRECATION_WARNINGS FF_ENABLE_DEPRECATION_WARNINGS #endif - ff_huffyuvencdsp_init(&s->hdsp); + ff_huffyuvencdsp_init(&s->hdsp, avctx); #if FF_API_PRIVATE_OPT FF_DISABLE_DEPRECATION_WARNINGS diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c index fd27a15fd8..d82f6a3872 100644 --- a/libavcodec/utvideoenc.c +++ b/libavcodec/utvideoenc.c @@ -120,7 +120,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx) } 
ff_bswapdsp_init(&c->bdsp); - ff_huffyuvencdsp_init(&c->hdsp); + ff_huffyuvencdsp_init(&c->hdsp, avctx); #if FF_API_PRIVATE_OPT FF_DISABLE_DEPRECATION_WARNINGS diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index a55a1de65d..78ad202249 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -148,3 +148,116 @@ DIFF_BYTES_PROLOGUE DIFF_BYTES_BODY u, u %undef i %endif + +%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub + movd m4, maskd + SPLATW m4, m4 + add wd, wd + test wq, 2*mmsize - 1 + jz %%.tomainloop + push tmpq +%%.wordloop: + sub wq, 2 +%ifidn %2, add + mov tmpw, [srcq+wq] + add tmpw, [dstq+wq] +%else + mov tmpw, [src1q+wq] + sub tmpw, [src2q+wq] +%endif + and tmpw, maskw + mov [dstq+wq], tmpw + test wq, 2*mmsize - 1 + jnz %%.wordloop + pop tmpq +%%.tomainloop: +%ifidn %2, add + add srcq, wq +%else + add src1q, wq + add src2q, wq +%endif + add dstq, wq + neg wq + jz %%.end +%%.loop: +%ifidn %2, add + mov%1 m0, [srcq+wq] + mov%1 m1, [dstq+wq] + mov%1 m2, [srcq+wq+mmsize] + mov%1 m3, [dstq+wq+mmsize] +%else + mov%1 m0, [src1q+wq] + mov%1 m1, [src2q+wq] + mov%1 m2, [src1q+wq+mmsize] + mov%1 m3, [src2q+wq+mmsize] +%endif + p%2w m0, m1 + p%2w m2, m3 + pand m0, m4 + pand m2, m4 + mov%1 [dstq+wq] , m0 + mov%1 [dstq+wq+mmsize], m2 + add wq, 2*mmsize + jl %%.loop +%%.end: + RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp + INT16_LOOP a, sub +%endif + +INIT_XMM sse2 +cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp + test src1q, mmsize-1 + jnz .unaligned + test src2q, mmsize-1 + jnz .unaligned + test dstq, mmsize-1 + jnz .unaligned + INT16_LOOP a, sub +.unaligned: + INT16_LOOP u, sub + +INIT_MMX mmxext +cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top + add wd, wd + movd mm7, maskd + SPLATW mm7, mm7 + movq mm0, [src1q] + movq mm2, [src2q] + psllq mm0, 16 + psllq mm2, 16 + movd mm6, 
[left_topq] + por mm0, mm6 + movd mm6, [leftq] + por mm2, mm6 + xor maskq, maskq +.loop: + movq mm1, [src1q + maskq] + movq mm3, [src2q + maskq] + movq mm4, mm2 + psubw mm2, mm0 + paddw mm2, mm1 + pand mm2, mm7 + movq mm5, mm4 + pmaxsw mm4, mm1 + pminsw mm1, mm5 + pminsw mm4, mm2 + pmaxsw mm4, mm1 + psubw mm3, mm4 + pand mm3, mm7 + movq [dstq + maskq], mm3 + add maskq, 8 + movq mm0, [src1q + maskq - 2] + movq mm2, [src2q + maskq - 2] + cmp maskq, wq + jb .loop + movzx maskd, word [src1q + wq - 2] + mov [left_topq], maskd + movzx maskd, word [src2q + wq - 2] + mov [leftq], maskd + RET diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c index 9767b212da..2402021823 100644 --- a/libavcodec/x86/huffyuvencdsp_mmx.c +++ b/libavcodec/x86/huffyuvencdsp_mmx.c @@ -24,6 +24,7 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/pixdesc.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/huffyuvencdsp.h" @@ -35,6 +36,12 @@ void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); +void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w); +void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w); +void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w, int *left, int *left_top); #if HAVE_INLINE_ASM @@ -80,12 +87,14 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, #endif /* HAVE_INLINE_ASM */ -av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c) +av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx) { av_unused int cpu_flags = av_get_cpu_flags(); + const AVPixFmtDescriptor *pix_desc = 
av_pix_fmt_desc_get(avctx->pix_fmt); if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) { c->diff_bytes = ff_diff_bytes_mmx; + c->diff_int16 = ff_diff_int16_mmx; } #if HAVE_INLINE_ASM @@ -94,8 +103,13 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c) } #endif /* HAVE_INLINE_ASM */ + if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) { + c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags)) { c->diff_bytes = ff_diff_bytes_sse2; + c->diff_int16 = ff_diff_int16_sse2; } if (EXTERNAL_AVX2_FAST(cpu_flags)) { diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm index a6ce5fe62b..bcc40ec061 100644 --- a/libavcodec/x86/lossless_videodsp.asm +++ b/libavcodec/x86/lossless_videodsp.asm @@ -288,25 +288,6 @@ cglobal add_int16, 4,4,5, dst, src, mask, w, tmp .unaligned: INT16_LOOP u, add -%if ARCH_X86_32 -INIT_MMX mmx -cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp - INT16_LOOP a, sub -%endif - -INIT_XMM sse2 -cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp - test src1q, mmsize-1 - jnz .unaligned - test src2q, mmsize-1 - jnz .unaligned - test dstq, mmsize-1 - jnz .unaligned - INT16_LOOP a, sub -.unaligned: - INT16_LOOP u, sub - - %macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u) add wd, wd add srcq, wq @@ -443,42 +424,3 @@ cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_t movzx r2d, word [topq-2] mov [left_topq], r2d RET - -cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top - add wd, wd - movd mm7, maskd - SPLATW mm7, mm7 - movq mm0, [src1q] - movq mm2, [src2q] - psllq mm0, 16 - psllq mm2, 16 - movd mm6, [left_topq] - por mm0, mm6 - movd mm6, [leftq] - por mm2, mm6 - xor maskq, maskq -.loop: - movq mm1, [src1q + maskq] - movq mm3, [src2q + maskq] - movq mm4, mm2 - psubw mm2, mm0 - paddw mm2, mm1 - pand mm2, mm7 - movq mm5, mm4 - pmaxsw mm4, mm1 
- pminsw mm1, mm5 - pminsw mm4, mm2 - pmaxsw mm4, mm1 - psubw mm3, mm4 - pand mm3, mm7 - movq [dstq + maskq], mm3 - add maskq, 8 - movq mm0, [src1q + maskq - 2] - movq mm2, [src2q + maskq - 2] - cmp maskq, wq - jb .loop - movzx maskd, word [src1q + wq - 2] - mov [left_topq], maskd - movzx maskd, word [src2q + wq - 2] - mov [leftq], maskd - RET diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c index 465feef81b..2dc662d8b1 100644 --- a/libavcodec/x86/lossless_videodsp_init.c +++ b/libavcodec/x86/lossless_videodsp_init.c @@ -41,12 +41,9 @@ int ff_add_left_pred_sse4(uint8_t *dst, const uint8_t *src, void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w); void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w); -void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w); -void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w); int ff_add_hfyu_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc); int ff_add_hfyu_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc); void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top); -void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); #if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32 static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top, @@ -98,9 +95,7 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx) if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) { c->add_bytes = ff_add_bytes_mmx; - c->add_int16 = ff_add_int16_mmx; - c->diff_int16 = ff_diff_int16_mmx; } if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) { @@ -111,7 +106,6 @@ void 
ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx) if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) { c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext; - c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { @@ -119,7 +113,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx) c->add_median_pred = ff_add_median_pred_sse2; c->add_int16 = ff_add_int16_sse2; - c->diff_int16 = ff_diff_int16_sse2; } if (EXTERNAL_SSSE3(cpu_flags)) {