From e74433a8e6fc00c8dbde293c97a3e45384c2c1d9 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Tue, 14 Jan 2014 10:33:47 +0100 Subject: [PATCH] dsputil: Split clear_block*/fill_block* off into a separate context --- configure | 35 +++--- libavcodec/4xm.c | 5 +- libavcodec/Makefile | 1 + libavcodec/arm/Makefile | 3 + libavcodec/arm/blockdsp_arm.h | 26 +++++ libavcodec/arm/blockdsp_init_arm.c | 33 ++++++ libavcodec/arm/blockdsp_init_neon.c | 37 ++++++ libavcodec/arm/blockdsp_neon.S | 38 +++++++ libavcodec/arm/dsputil_init_neon.c | 8 -- libavcodec/arm/dsputil_neon.S | 16 --- libavcodec/asv.h | 2 + libavcodec/asvdec.c | 4 +- libavcodec/bink.c | 16 +-- libavcodec/blockdsp.c | 78 +++++++++++++ libavcodec/blockdsp.h | 52 +++++++++ libavcodec/cavs.c | 1 + libavcodec/cavs.h | 2 + libavcodec/cavsdec.c | 2 +- libavcodec/dnxhddec.c | 9 +- libavcodec/dnxhdenc.c | 10 +- libavcodec/dnxhdenc.h | 1 + libavcodec/dsputil.c | 36 ------ libavcodec/dsputil.h | 25 +--- libavcodec/eamad.c | 4 +- libavcodec/eatqi.c | 4 +- libavcodec/g2meet.c | 7 +- libavcodec/h261dec.c | 2 +- libavcodec/h263.h | 2 +- libavcodec/intrax8.c | 2 +- libavcodec/ituh263dec.c | 8 +- libavcodec/jvdec.c | 12 +- libavcodec/mdec.c | 5 +- libavcodec/mimic.c | 5 +- libavcodec/mjpegdec.c | 6 +- libavcodec/mjpegdec.h | 2 + libavcodec/mpeg12dec.c | 8 +- libavcodec/mpeg4videodec.c | 8 +- libavcodec/mpeg4videoenc.c | 2 +- libavcodec/mpegvideo.c | 4 +- libavcodec/mpegvideo.h | 2 + libavcodec/msmpeg4dec.c | 4 +- libavcodec/ppc/Makefile | 1 + libavcodec/ppc/blockdsp.c | 169 ++++++++++++++++++++++++++++ libavcodec/ppc/dsputil_altivec.c | 14 --- libavcodec/ppc/dsputil_ppc.c | 110 ------------------ libavcodec/vc1dec.c | 9 +- libavcodec/wmv2.c | 5 +- libavcodec/wmv2dec.c | 4 +- libavcodec/x86/Makefile | 1 + libavcodec/x86/blockdsp_mmx.c | 120 ++++++++++++++++++++ libavcodec/x86/dsputil_init.c | 17 --- libavcodec/x86/dsputil_mmx.c | 56 --------- libavcodec/x86/dsputil_x86.h | 5 - 53 files changed, 677 insertions(+), 361 
deletions(-) create mode 100644 libavcodec/arm/blockdsp_arm.h create mode 100644 libavcodec/arm/blockdsp_init_arm.c create mode 100644 libavcodec/arm/blockdsp_init_neon.c create mode 100644 libavcodec/arm/blockdsp_neon.S create mode 100644 libavcodec/blockdsp.c create mode 100644 libavcodec/blockdsp.h create mode 100644 libavcodec/ppc/blockdsp.c create mode 100644 libavcodec/x86/blockdsp_mmx.c diff --git a/configure b/configure index cc47c8ae08..c538632dbf 100755 --- a/configure +++ b/configure @@ -1530,6 +1530,7 @@ CONFIG_EXTRA=" aandcttables ac3dsp audio_frame_queue + blockdsp cabac dsputil gcrypt @@ -1705,7 +1706,7 @@ mdct_select="fft" rdft_select="fft" mpegaudio_select="mpegaudiodsp" mpegaudiodsp_select="dct" -mpegvideo_select="dsputil hpeldsp videodsp" +mpegvideo_select="blockdsp dsputil hpeldsp videodsp" mpegvideoenc_select="dsputil mpegvideo qpeldsp" # decoders / encoders @@ -1722,33 +1723,33 @@ amrnb_decoder_select="lsp" amrwb_decoder_select="lsp" amv_decoder_select="sp5x_decoder" ape_decoder_select="dsputil" -asv1_decoder_select="dsputil" +asv1_decoder_select="blockdsp dsputil" asv1_encoder_select="dsputil" -asv2_decoder_select="dsputil" +asv2_decoder_select="blockdsp dsputil" asv2_encoder_select="dsputil" atrac1_decoder_select="mdct sinewin" atrac3_decoder_select="mdct" atrac3p_decoder_select="mdct sinewin" -bink_decoder_select="dsputil hpeldsp" +bink_decoder_select="blockdsp hpeldsp" binkaudio_dct_decoder_select="mdct rdft dct sinewin" binkaudio_rdft_decoder_select="mdct rdft sinewin" -cavs_decoder_select="dsputil golomb h264chroma qpeldsp videodsp" +cavs_decoder_select="blockdsp dsputil golomb h264chroma qpeldsp videodsp" cllc_decoder_select="dsputil" comfortnoise_encoder_select="lpc" cook_decoder_select="dsputil mdct sinewin" cscd_decoder_select="lzo" cscd_decoder_suggest="zlib" dca_decoder_select="mdct" -dnxhd_decoder_select="dsputil" -dnxhd_encoder_select="aandcttables dsputil mpegvideoenc" +dnxhd_decoder_select="blockdsp dsputil" 
+dnxhd_encoder_select="aandcttables blockdsp dsputil mpegvideoenc" dvvideo_decoder_select="dsputil" dvvideo_encoder_select="dsputil" dxa_decoder_deps="zlib" eac3_decoder_select="ac3_decoder" eac3_encoder_select="ac3_encoder" -eamad_decoder_select="aandcttables dsputil mpegvideo" +eamad_decoder_select="aandcttables blockdsp dsputil mpegvideo" eatgq_decoder_select="aandcttables dsputil" -eatqi_decoder_select="aandcttables dsputil error_resilience mpegvideo" +eatqi_decoder_select="aandcttables blockdsp dsputil error_resilience mpegvideo" exr_decoder_deps="zlib" ffv1_decoder_select="golomb rangecoder" ffv1_encoder_select="rangecoder" @@ -1762,10 +1763,10 @@ flashsv_encoder_deps="zlib" flashsv2_decoder_deps="zlib" flv_decoder_select="h263_decoder" flv_encoder_select="h263_encoder" -fourxm_decoder_select="dsputil" +fourxm_decoder_select="blockdsp dsputil" fraps_decoder_select="dsputil huffman" g2m_decoder_deps="zlib" -g2m_decoder_select="dsputil" +g2m_decoder_select="blockdsp dsputil" h261_decoder_select="error_resilience mpegvideo" h261_encoder_select="aandcttables mpegvideoenc" h263_decoder_select="error_resilience h263_parser h263dsp mpegvideo qpeldsp" @@ -1783,14 +1784,14 @@ indeo3_decoder_select="hpeldsp" interplay_video_decoder_select="hpeldsp" jpegls_decoder_select="golomb mjpeg_decoder" jpegls_encoder_select="golomb" -jv_decoder_select="dsputil" +jv_decoder_select="blockdsp" lagarith_decoder_select="huffyuvdsp" ljpeg_encoder_select="aandcttables mpegvideoenc" loco_decoder_select="golomb" -mdec_decoder_select="dsputil error_resilience mpegvideo" +mdec_decoder_select="blockdsp dsputil error_resilience mpegvideo" metasound_decoder_select="lsp mdct sinewin" -mimic_decoder_select="dsputil hpeldsp" -mjpeg_decoder_select="dsputil hpeldsp" +mimic_decoder_select="blockdsp dsputil hpeldsp" +mjpeg_decoder_select="blockdsp dsputil hpeldsp" mjpeg_encoder_select="aandcttables mpegvideoenc" mjpegb_decoder_select="mjpeg_decoder" mlp_decoder_select="mlp_parser" @@ -1862,7 +1863,7 
@@ twinvq_decoder_select="mdct lsp sinewin" utvideo_decoder_select="dsputil" utvideo_encoder_select="dsputil huffman huffyuvencdsp" vble_decoder_select="huffyuvdsp" -vc1_decoder_select="error_resilience h263_decoder h264chroma h264qpel intrax8 qpeldsp" +vc1_decoder_select="blockdsp error_resilience h263_decoder h264chroma h264qpel intrax8 qpeldsp" vc1image_decoder_select="vc1_decoder" vorbis_decoder_select="mdct" vorbis_encoder_select="mdct" @@ -1883,7 +1884,7 @@ wmav2_encoder_select="mdct sinewin" wmavoice_decoder_select="lsp rdft dct mdct sinewin" wmv1_decoder_select="h263_decoder" wmv1_encoder_select="h263_encoder" -wmv2_decoder_select="h263_decoder intrax8 videodsp" +wmv2_decoder_select="blockdsp h263_decoder intrax8 videodsp" wmv2_encoder_select="h263_encoder" wmv3_decoder_select="vc1_decoder" wmv3image_decoder_select="wmv3_decoder" diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c index 3c89f1cb0c..b958e841ca 100644 --- a/libavcodec/4xm.c +++ b/libavcodec/4xm.c @@ -30,6 +30,7 @@ #include "libavutil/imgutils.h" #include "libavutil/intreadwrite.h" #include "avcodec.h" +#include "blockdsp.h" #include "bytestream.h" #include "dsputil.h" #include "get_bits.h" @@ -132,6 +133,7 @@ typedef struct CFrameBuffer { typedef struct FourXContext { AVCodecContext *avctx; DSPContext dsp; + BlockDSPContext bdsp; uint16_t *frame_buffer; uint16_t *last_frame_buffer; GetBitContext pre_gb; ///< ac/dc prefix @@ -564,7 +566,7 @@ static int decode_i_mb(FourXContext *f) int ret; int i; - f->dsp.clear_blocks(f->block[0]); + f->bdsp.clear_blocks(f->block[0]); for (i = 0; i < 6; i++) if ((ret = decode_i_block(f, f->block[i])) < 0) @@ -953,6 +955,7 @@ static av_cold int decode_init(AVCodecContext *avctx) } f->version = AV_RL32(avctx->extradata) >> 16; + ff_blockdsp_init(&f->bdsp, avctx); ff_dsputil_init(&f->dsp, avctx); f->avctx = avctx; init_vlcs(f); diff --git a/libavcodec/Makefile b/libavcodec/Makefile index a61e673d8b..c59154536f 100644 --- a/libavcodec/Makefile +++ 
b/libavcodec/Makefile @@ -28,6 +28,7 @@ OBJS = allcodecs.o \ OBJS-$(CONFIG_AANDCTTABLES) += aandcttab.o OBJS-$(CONFIG_AC3DSP) += ac3dsp.o OBJS-$(CONFIG_AUDIO_FRAME_QUEUE) += audio_frame_queue.o +OBJS-$(CONFIG_BLOCKDSP) += blockdsp.o OBJS-$(CONFIG_CABAC) += cabac.o OBJS-$(CONFIG_DCT) += dct.o dct32_fixed.o dct32_float.o OBJS-$(CONFIG_DXVA2) += dxva2.o diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 13025af9c1..381e997c0c 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -4,6 +4,7 @@ OBJS += arm/fmtconvert_init_arm.o OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \ arm/ac3dsp_arm.o +OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_arm.o \ arm/dsputil_arm.o \ arm/jrevdct_arm.o \ @@ -76,6 +77,8 @@ VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \ NEON-OBJS += arm/fmtconvert_neon.o NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o +NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \ + arm/blockdsp_neon.o NEON-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_neon.o \ arm/dsputil_neon.o \ arm/int_neon.o \ diff --git a/libavcodec/arm/blockdsp_arm.h b/libavcodec/arm/blockdsp_arm.h new file mode 100644 index 0000000000..6d9c2c3ed2 --- /dev/null +++ b/libavcodec/arm/blockdsp_arm.h @@ -0,0 +1,26 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_BLOCKDSP_ARM_H +#define AVCODEC_ARM_BLOCKDSP_ARM_H + +#include "libavcodec/blockdsp.h" + +void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth); + +#endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */ diff --git a/libavcodec/arm/blockdsp_init_arm.c b/libavcodec/arm/blockdsp_init_arm.c new file mode 100644 index 0000000000..a0c03674d7 --- /dev/null +++ b/libavcodec/arm/blockdsp_init_arm.c @@ -0,0 +1,33 @@ +/* + * ARM optimized block operations + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/blockdsp.h" +#include "blockdsp_arm.h" + +av_cold void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + ff_blockdsp_init_neon(c, high_bit_depth); +} diff --git a/libavcodec/arm/blockdsp_init_neon.c b/libavcodec/arm/blockdsp_init_neon.c new file mode 100644 index 0000000000..5081cf0cdf --- /dev/null +++ b/libavcodec/arm/blockdsp_init_neon.c @@ -0,0 +1,37 @@ +/* + * ARM NEON optimised block operations + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/blockdsp.h" +#include "blockdsp_arm.h" + +void ff_clear_block_neon(int16_t *block); +void ff_clear_blocks_neon(int16_t *blocks); + +av_cold void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth) +{ + if (!high_bit_depth) { + c->clear_block = ff_clear_block_neon; + c->clear_blocks = ff_clear_blocks_neon; + } +} diff --git a/libavcodec/arm/blockdsp_neon.S b/libavcodec/arm/blockdsp_neon.S new file mode 100644 index 0000000000..98df2c60c4 --- /dev/null +++ b/libavcodec/arm/blockdsp_neon.S @@ -0,0 +1,38 @@ +/* + * ARM NEON optimised block functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_clear_block_neon, export=1 + vmov.i16 q0, #0 + .rept 8 + vst1.16 {q0}, [r0,:128]! + .endr + bx lr +endfunc + +function ff_clear_blocks_neon, export=1 + vmov.i16 q0, #0 + .rept 8*6 + vst1.16 {q0}, [r0,:128]!
+ .endr + bx lr +endfunc diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index c9bdaa5a78..6863e05f73 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -30,9 +30,6 @@ void ff_simple_idct_neon(int16_t *data); void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data); void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data); -void ff_clear_block_neon(int16_t *block); -void ff_clear_blocks_neon(int16_t *blocks); - void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int); void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int); void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int); @@ -61,11 +58,6 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx, c->put_pixels_clamped = ff_put_pixels_clamped_neon; c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; - if (!high_bit_depth) { - c->clear_block = ff_clear_block_neon; - c->clear_blocks = ff_clear_blocks_neon; - } - c->vector_clipf = ff_vector_clipf_neon; c->vector_clip_int32 = ff_vector_clip_int32_neon; diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index e30bd10b17..d494ec7ed7 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -21,22 +21,6 @@ #include "libavutil/arm/asm.S" -function ff_clear_block_neon, export=1 - vmov.i16 q0, #0 - .rept 8 - vst1.16 {q0}, [r0,:128]! - .endr - bx lr -endfunc - -function ff_clear_blocks_neon, export=1 - vmov.i16 q0, #0 - .rept 8*6 - vst1.16 {q0}, [r0,:128]! - .endr - bx lr -endfunc - function ff_put_pixels_clamped_neon, export=1 vld1.16 {d16-d19}, [r0,:128]! 
vqmovun.s16 d0, q8 diff --git a/libavcodec/asv.h b/libavcodec/asv.h index 3e56857d22..7a4e48b58c 100644 --- a/libavcodec/asv.h +++ b/libavcodec/asv.h @@ -31,12 +31,14 @@ #include "libavutil/mem.h" #include "avcodec.h" +#include "blockdsp.h" #include "dsputil.h" #include "get_bits.h" #include "put_bits.h" typedef struct ASV1Context{ AVCodecContext *avctx; + BlockDSPContext bdsp; DSPContext dsp; PutBitContext pb; GetBitContext gb; diff --git a/libavcodec/asvdec.c b/libavcodec/asvdec.c index f160434000..5bbca46ea3 100644 --- a/libavcodec/asvdec.c +++ b/libavcodec/asvdec.c @@ -28,6 +28,7 @@ #include "asv.h" #include "avcodec.h" +#include "blockdsp.h" #include "put_bits.h" #include "internal.h" #include "mathops.h" @@ -164,7 +165,7 @@ static inline int decode_mb(ASV1Context *a, int16_t block[6][64]) { int i; - a->dsp.clear_blocks(block[0]); + a->bdsp.clear_blocks(block[0]); if (a->avctx->codec_id == AV_CODEC_ID_ASV1) { for (i = 0; i < 6; i++) { @@ -280,6 +281,7 @@ static av_cold int decode_init(AVCodecContext *avctx) } ff_asv_common_init(avctx); + ff_blockdsp_init(&a->bdsp, avctx); init_vlcs(a); ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_asv_scantab); avctx->pix_fmt = AV_PIX_FMT_YUV420P; diff --git a/libavcodec/bink.c b/libavcodec/bink.c index d1e94d98a8..e34585b8a5 100644 --- a/libavcodec/bink.c +++ b/libavcodec/bink.c @@ -24,9 +24,9 @@ #include "libavutil/imgutils.h" #include "libavutil/internal.h" #include "avcodec.h" -#include "dsputil.h" #include "binkdata.h" #include "binkdsp.h" +#include "blockdsp.h" #include "hpeldsp.h" #include "internal.h" #include "mathops.h" @@ -113,7 +113,7 @@ typedef struct Bundle { */ typedef struct BinkContext { AVCodecContext *avctx; - DSPContext dsp; + BlockDSPContext bdsp; HpelDSPContext hdsp; BinkDSPContext binkdsp; AVFrame *last; @@ -880,7 +880,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, } else { put_pixels8x8_overlapped(dst, ref, stride); } - c->dsp.clear_block(block); + 
c->bdsp.clear_block(block); v = binkb_get_value(c, BINKB_SRC_INTER_COEFS); read_residue(gb, block, v); c->binkdsp.add_pixels8(dst, block, stride); @@ -904,7 +904,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, break; case 5: v = binkb_get_value(c, BINKB_SRC_COLORS); - c->dsp.fill_block_tab[1](dst, v, stride, 8); + c->bdsp.fill_block_tab[1](dst, v, stride, 8); break; case 6: for (i = 0; i < 2; i++) @@ -1047,7 +1047,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, break; case FILL_BLOCK: v = get_value(c, BINK_SRC_COLORS); - c->dsp.fill_block_tab[0](dst, v, stride, 16); + c->bdsp.fill_block_tab[0](dst, v, stride, 16); break; case PATTERN_BLOCK: for (i = 0; i < 2; i++) @@ -1117,7 +1117,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, return AVERROR_INVALIDDATA; } c->hdsp.put_pixels_tab[1][0](dst, ref, stride, 8); - c->dsp.clear_block(block); + c->bdsp.clear_block(block); v = get_bits(gb, 7); read_residue(gb, block, v); c->binkdsp.add_pixels8(dst, block, stride); @@ -1130,7 +1130,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, break; case FILL_BLOCK: v = get_value(c, BINK_SRC_COLORS); - c->dsp.fill_block_tab[1](dst, v, stride, 8); + c->bdsp.fill_block_tab[1](dst, v, stride, 8); break; case INTER_BLOCK: xoff = get_value(c, BINK_SRC_X_OFF); @@ -1310,7 +1310,7 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->pix_fmt = c->has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P; - ff_dsputil_init(&c->dsp, avctx); + ff_blockdsp_init(&c->bdsp, avctx); ff_hpeldsp_init(&c->hdsp, avctx->flags); ff_binkdsp_init(&c->binkdsp); diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c new file mode 100644 index 0000000000..e3d2ca1fdc --- /dev/null +++ b/libavcodec/blockdsp.c @@ -0,0 +1,78 @@ +/* + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include <string.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "avcodec.h" +#include "blockdsp.h" +#include "version.h" + +static void clear_block_8_c(int16_t *block) +{ + memset(block, 0, sizeof(int16_t) * 64); +} + +static void clear_blocks_8_c(int16_t *blocks) +{ + memset(blocks, 0, sizeof(int16_t) * 6 * 64); +} + +static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h) +{ + int i; + + for (i = 0; i < h; i++) { + memset(block, value, 16); + block += line_size; + } +} + +static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h) +{ + int i; + + for (i = 0; i < h; i++) { + memset(block, value, 8); + block += line_size; + } +} + +av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx) +{ + const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8; + + c->clear_block = clear_block_8_c; + c->clear_blocks = clear_blocks_8_c; + + c->fill_block_tab[0] = fill_block16_c; + c->fill_block_tab[1] = fill_block8_c; + + if (ARCH_ARM) + ff_blockdsp_init_arm(c, high_bit_depth); + if (ARCH_PPC) + ff_blockdsp_init_ppc(c, high_bit_depth); + if (ARCH_X86) +#if FF_API_XVMC + ff_blockdsp_init_x86(c, high_bit_depth, avctx); +#else + ff_blockdsp_init_x86(c,
high_bit_depth); +#endif /* FF_API_XVMC */ +} diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h new file mode 100644 index 0000000000..32c671cf5a --- /dev/null +++ b/libavcodec/blockdsp.h @@ -0,0 +1,52 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_BLOCKDSP_H +#define AVCODEC_BLOCKDSP_H + +#include <stdint.h> + +#include "avcodec.h" +#include "version.h" + +/* add and put pixel (decoding) + * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16. + * h for op_pixels_func is limited to { width / 2, width }, + * but never larger than 16 and never smaller than 4.
*/ +typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */, + uint8_t value, int line_size, int h); + +typedef struct BlockDSPContext { + void (*clear_block)(int16_t *block /* align 16 */); + void (*clear_blocks)(int16_t *blocks /* align 16 */); + + op_fill_func fill_block_tab[2]; +} BlockDSPContext; + +void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx); + +void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth); +void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth); +#if FF_API_XVMC +void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth, + AVCodecContext *avctx); +#else +void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth); +#endif /* FF_API_XVMC */ + +#endif /* AVCODEC_BLOCKDSP_H */ diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c index 3f21dccedf..21bc1edc23 100644 --- a/libavcodec/cavs.c +++ b/libavcodec/cavs.c @@ -759,6 +759,7 @@ av_cold int ff_cavs_init(AVCodecContext *avctx) { AVSContext *h = avctx->priv_data; + ff_blockdsp_init(&h->bdsp, avctx); ff_dsputil_init(&h->dsp, avctx); ff_h264chroma_init(&h->h264chroma, 8); ff_videodsp_init(&h->vdsp, 8); diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h index 7d9b94e815..c5a10b556b 100644 --- a/libavcodec/cavs.h +++ b/libavcodec/cavs.h @@ -23,6 +23,7 @@ #define AVCODEC_CAVS_H #include "cavsdsp.h" +#include "blockdsp.h" #include "dsputil.h" #include "h264chroma.h" #include "get_bits.h" @@ -162,6 +163,7 @@ typedef struct AVSFrame { typedef struct AVSContext { AVCodecContext *avctx; DSPContext dsp; + BlockDSPContext bdsp; H264ChromaContext h264chroma; VideoDSPContext vdsp; CAVSDSPContext cdsp; diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c index a8ed1920ed..fbbd04803a 100644 --- a/libavcodec/cavsdec.c +++ b/libavcodec/cavsdec.c @@ -581,7 +581,7 @@ static int decode_residual_block(AVSContext *h, GetBitContext *gb, dequant_shift[qp], i)) < 0) return ret; h->cdsp.cavs_idct8_add(dst, block, stride); - 
h->dsp.clear_block(block); + h->bdsp.clear_block(block); return 0; } diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c index 4daee04cf0..3bd8ffecd6 100644 --- a/libavcodec/dnxhddec.c +++ b/libavcodec/dnxhddec.c @@ -25,6 +25,7 @@ #include "libavutil/imgutils.h" #include "libavutil/timer.h" #include "avcodec.h" +#include "blockdsp.h" #include "get_bits.h" #include "dnxhddata.h" #include "dsputil.h" @@ -33,6 +34,7 @@ typedef struct DNXHDContext { AVCodecContext *avctx; GetBitContext gb; + BlockDSPContext bdsp; int cid; ///< compression id unsigned int width, height; unsigned int mb_width, mb_height; @@ -133,6 +135,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, ctx->avctx->pix_fmt = AV_PIX_FMT_YUV444P10; ctx->avctx->bits_per_raw_sample = 10; if (ctx->bit_depth != 10) { + ff_blockdsp_init(&ctx->bdsp, ctx->avctx); ff_dsputil_init(&ctx->dsp, ctx->avctx); ctx->bit_depth = 10; ctx->decode_dct_block = dnxhd_decode_dct_block_10_444; @@ -142,6 +145,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P10; ctx->avctx->bits_per_raw_sample = 10; if (ctx->bit_depth != 10) { + ff_blockdsp_init(&ctx->bdsp, ctx->avctx); ff_dsputil_init(&ctx->dsp, ctx->avctx); ctx->bit_depth = 10; ctx->decode_dct_block = dnxhd_decode_dct_block_10; @@ -150,6 +154,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P; ctx->avctx->bits_per_raw_sample = 8; if (ctx->bit_depth != 8) { + ff_blockdsp_init(&ctx->bdsp, ctx->avctx); ff_dsputil_init(&ctx->dsp, ctx->avctx); ctx->bit_depth = 8; ctx->decode_dct_block = dnxhd_decode_dct_block_8; @@ -307,12 +312,12 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame, skip_bits1(&ctx->gb); for (i = 0; i < 8; i++) { - ctx->dsp.clear_block(ctx->blocks[i]); + ctx->bdsp.clear_block(ctx->blocks[i]); ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale); } if (ctx->is_444) { for (; i < 12; i++) { - 
ctx->dsp.clear_block(ctx->blocks[i]); + ctx->bdsp.clear_block(ctx->blocks[i]); ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale); } } diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c index 4b06f6cc03..c637415187 100644 --- a/libavcodec/dnxhdenc.c +++ b/libavcodec/dnxhdenc.c @@ -29,6 +29,7 @@ #include "libavutil/timer.h" #include "avcodec.h" +#include "blockdsp.h" #include "dsputil.h" #include "internal.h" #include "mpegvideo.h" @@ -305,6 +306,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx) avctx->bits_per_raw_sample = ctx->cid_table->bit_depth; + ff_blockdsp_init(&ctx->bdsp, avctx); ff_dsputil_init(&ctx->m.dsp, avctx); ff_dct_common_init(&ctx->m); if (!ctx->m.dct_quantize) @@ -556,10 +558,10 @@ void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y) ptr_v + ctx->dct_uv_offset, ctx->m.uvlinesize); } else { - dsp->clear_block(ctx->blocks[4]); - dsp->clear_block(ctx->blocks[5]); - dsp->clear_block(ctx->blocks[6]); - dsp->clear_block(ctx->blocks[7]); + ctx->bdsp.clear_block(ctx->blocks[4]); + ctx->bdsp.clear_block(ctx->blocks[5]); + ctx->bdsp.clear_block(ctx->blocks[6]); + ctx->bdsp.clear_block(ctx->blocks[7]); } } else { dsp->get_pixels(ctx->blocks[4], diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h index 215482ed5e..c3248a28d3 100644 --- a/libavcodec/dnxhdenc.h +++ b/libavcodec/dnxhdenc.h @@ -41,6 +41,7 @@ typedef struct RCEntry { typedef struct DNXHDEncContext { AVClass *class; + BlockDSPContext bdsp; MpegEncContext m; ///< Used for quantization dsp functions int cid; diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 0ef9d8c8f7..8f5ddd0eb1 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -373,26 +373,6 @@ static int sum_abs_dctelem_c(int16_t *block) return sum; } -static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h) -{ - int i; - - for (i = 0; i < h; i++) { - memset(block, value, 16); - block += line_size; - } -} - -static void fill_block8_c(uint8_t *block, 
uint8_t value, int line_size, int h) -{ - int i; - - for (i = 0; i < h; i++) { - memset(block, value, 8); - block += line_size; - } -} - #define avg2(a, b) ((a + b + 1) >> 1) #define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2) @@ -1408,16 +1388,6 @@ static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height, memcpy(last_line + (i + 1) * wrap, last_line, width + w + w); } -static void clear_block_8_c(int16_t *block) -{ - memset(block, 0, sizeof(int16_t) * 64); -} - -static void clear_blocks_8_c(int16_t *blocks) -{ - memset(blocks, 0, sizeof(int16_t) * 6 * 64); -} - /* init static data */ av_cold void ff_dsputil_static_init(void) { @@ -1487,9 +1457,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) c->pix_sum = pix_sum_c; c->pix_norm1 = pix_norm1_c; - c->fill_block_tab[0] = fill_block16_c; - c->fill_block_tab[1] = fill_block8_c; - /* TODO [0] 16 [1] 8 */ c->pix_abs[0][0] = pix_abs16_c; c->pix_abs[0][1] = pix_abs16_x2_c; @@ -1546,9 +1513,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) c->draw_edges = draw_edges_8_c; - c->clear_block = clear_block_8_c; - c->clear_blocks = clear_blocks_8_c; - switch (avctx->bits_per_raw_sample) { case 9: case 10: diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 070580fb4f..1aad789855 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -38,26 +38,6 @@ void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); -/* minimum alignment rules ;) - * If you notice errors in the align stuff, need more alignment for some ASM code - * for some CPU or need to use a function with less aligned data then send a mail - * to the libav-devel mailing list, ... - * - * !warning These alignments might not match reality, (missing attribute((align)) - * stuff somewhere possible). 
- * I (Michael) did not check them, these are just the alignments which I think - * could be reached easily ... - * - * !future video codecs might need functions with less strict alignment - */ - -/* add and put pixel (decoding) - * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16. - * h for op_pixels_func is limited to { width / 2, width }, - * but never larger than 16 and never smaller than 4. */ -typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */, - uint8_t value, int line_size, int h); - struct MpegEncContext; /* Motion estimation: * h is limited to { width / 2, width, 2 * width }, @@ -116,8 +96,7 @@ typedef struct DSPContext { int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); - void (*clear_block)(int16_t *block /* align 16 */); - void (*clear_blocks)(int16_t *blocks /* align 16 */); + int (*pix_sum)(uint8_t *pix, int line_size); int (*pix_norm1)(uint8_t *pix, int line_size); @@ -234,8 +213,6 @@ typedef struct DSPContext { */ void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len); - - op_fill_func fill_block_tab[2]; } DSPContext; void ff_dsputil_static_init(void); diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c index 22070a45f7..4bc07394c8 100644 --- a/libavcodec/eamad.c +++ b/libavcodec/eamad.c @@ -44,6 +44,7 @@ typedef struct MadContext { AVCodecContext *avctx; + BlockDSPContext bdsp; DSPContext dsp; AVFrame *last_frame; GetBitContext gb; @@ -61,6 +62,7 @@ static av_cold int decode_init(AVCodecContext *avctx) MadContext *s = avctx->priv_data; s->avctx = avctx; avctx->pix_fmt = AV_PIX_FMT_YUV420P; + ff_blockdsp_init(&s->bdsp, avctx); ff_dsputil_init(&s->dsp, avctx); ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM); ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct); @@ -207,7 +209,7 @@ static void decode_mb(MadContext *s, AVFrame *frame, int inter) int add = 
2*decode_motion(&s->gb); comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add); } else { - s->dsp.clear_block(s->block); + s->bdsp.clear_block(s->block); decode_block_intra(s, s->block); idct_put(s, frame, s->block, s->mb_x, s->mb_y, j); } diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c index 2345cc7bef..8c31f1f7ad 100644 --- a/libavcodec/eatqi.c +++ b/libavcodec/eatqi.c @@ -27,6 +27,7 @@ */ #include "avcodec.h" +#include "blockdsp.h" #include "get_bits.h" #include "aandcttab.h" #include "eaidct.h" @@ -46,6 +47,7 @@ static av_cold int tqi_decode_init(AVCodecContext *avctx) TqiContext *t = avctx->priv_data; MpegEncContext *s = &t->s; s->avctx = avctx; + ff_blockdsp_init(&s->bdsp, avctx); ff_dsputil_init(&s->dsp, avctx); ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM); ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); @@ -59,7 +61,7 @@ static av_cold int tqi_decode_init(AVCodecContext *avctx) static int tqi_decode_mb(MpegEncContext *s, int16_t (*block)[64]) { int n; - s->dsp.clear_blocks(block[0]); + s->bdsp.clear_blocks(block[0]); for (n=0; n<6; n++) if (ff_mpeg1_decode_block_intra(s, block[n], n) < 0) return -1; diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c index a741b6f0f2..9660155619 100644 --- a/libavcodec/g2meet.c +++ b/libavcodec/g2meet.c @@ -29,6 +29,7 @@ #include "libavutil/intreadwrite.h" #include "avcodec.h" +#include "blockdsp.h" #include "bytestream.h" #include "dsputil.h" #include "get_bits.h" @@ -72,6 +73,7 @@ static const uint8_t chroma_quant[64] = { }; typedef struct JPGContext { + BlockDSPContext bdsp; DSPContext dsp; ScanTable scantable; @@ -150,6 +152,7 @@ static av_cold int jpg_init(AVCodecContext *avctx, JPGContext *c) if (ret) return ret; + ff_blockdsp_init(&c->bdsp, avctx); ff_dsputil_init(&c->dsp, avctx); ff_init_scantable(c->dsp.idct_permutation, &c->scantable, ff_zigzag_direct); @@ -193,7 +196,7 @@ static int jpg_decode_block(JPGContext *c, GetBitContext *gb, const 
int is_chroma = !!plane; const uint8_t *qmat = is_chroma ? chroma_quant : luma_quant; - c->dsp.clear_block(block); + c->bdsp.clear_block(block); dc = get_vlc2(gb, c->dc_vlc[is_chroma].table, 9, 3); if (dc < 0) return AVERROR_INVALIDDATA; @@ -259,7 +262,7 @@ static int jpg_decode_data(JPGContext *c, int width, int height, for (i = 0; i < 3; i++) c->prev_dc[i] = 1024; bx = by = 0; - c->dsp.clear_blocks(c->block[0]); + c->bdsp.clear_blocks(c->block[0]); for (mb_y = 0; mb_y < mb_h; mb_y++) { for (mb_x = 0; mb_x < mb_w; mb_x++) { if (mask && !mask[mb_x * 2] && !mask[mb_x * 2 + 1] && diff --git a/libavcodec/h261dec.c b/libavcodec/h261dec.c index 0d996eb4e9..73f6a59bf9 100644 --- a/libavcodec/h261dec.c +++ b/libavcodec/h261dec.c @@ -433,7 +433,7 @@ static int h261_decode_mb(H261Context *h) intra: /* decode each block */ if (s->mb_intra || HAS_CBP(h->mtype)) { - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); for (i = 0; i < 6; i++) { if (h261_decode_block(h, s->block[i], i, cbp & 32) < 0) return SLICE_ERROR; diff --git a/libavcodec/h263.h b/libavcodec/h263.h index c6ad618fa1..dbbe7cead8 100644 --- a/libavcodec/h263.h +++ b/libavcodec/h263.h @@ -197,7 +197,7 @@ static inline int get_p_cbp(MpegEncContext * s, for (i = 0; i < 6; i++) { if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i))&1)==0 ){ s->block_last_index[i]= -1; - s->dsp.clear_block(s->block[i]); + s->bdsp.clear_block(s->block[i]); } } }else{ diff --git a/libavcodec/intrax8.c b/libavcodec/intrax8.c index 962c4608da..2bda7233f7 100644 --- a/libavcodec/intrax8.c +++ b/libavcodec/intrax8.c @@ -538,7 +538,7 @@ static int x8_decode_intra_mb(IntraX8Context* const w, const int chroma){ int sign; assert(w->orient<12); - s->dsp.clear_block(s->block[0]); + s->bdsp.clear_block(s->block[0]); if(chroma){ dc_mode=2; diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c index 55a8c45ab8..dc3de30bb0 100644 --- a/libavcodec/ituh263dec.c +++ b/libavcodec/ituh263dec.c @@ -538,7 +538,7 @@ retry: rl = 
&ff_rl_intra_aic; i = 0; s->gb= gb; - s->dsp.clear_block(block); + s->bdsp.clear_block(block); goto retry; } av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d i:%d\n", s->mb_x, s->mb_y, s->mb_intra); @@ -628,7 +628,7 @@ int ff_h263_decode_mb(MpegEncContext *s, } }while(cbpc == 20); - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); dquant = cbpc & 8; s->mb_intra = ((cbpc & 4) != 0); @@ -723,7 +723,7 @@ int ff_h263_decode_mb(MpegEncContext *s, s->mb_intra = IS_INTRA(mb_type); if(HAS_CBP(mb_type)){ - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); cbpc = get_vlc2(&s->gb, cbpc_b_vlc.table, CBPC_B_VLC_BITS, 1); if(s->mb_intra){ dquant = IS_QUANT(mb_type); @@ -797,7 +797,7 @@ int ff_h263_decode_mb(MpegEncContext *s, } }while(cbpc == 8); - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); dquant = cbpc & 4; s->mb_intra = 1; diff --git a/libavcodec/jvdec.c b/libavcodec/jvdec.c index 662a94492d..bb347e045a 100644 --- a/libavcodec/jvdec.c +++ b/libavcodec/jvdec.c @@ -28,12 +28,12 @@ #include "libavutil/intreadwrite.h" #include "avcodec.h" -#include "dsputil.h" +#include "blockdsp.h" #include "get_bits.h" #include "internal.h" typedef struct JvContext { - DSPContext dsp; + BlockDSPContext bdsp; AVFrame *frame; uint32_t palette[AVPALETTE_COUNT]; int palette_has_changed; @@ -48,7 +48,7 @@ static av_cold int decode_init(AVCodecContext *avctx) return AVERROR(ENOMEM); avctx->pix_fmt = AV_PIX_FMT_PAL8; - ff_dsputil_init(&s->dsp, avctx); + ff_blockdsp_init(&s->bdsp, avctx); return 0; } @@ -113,14 +113,14 @@ static inline void decode4x4(GetBitContext *gb, uint8_t *dst, int linesize) * Decode 8x8 block */ static inline void decode8x8(GetBitContext *gb, uint8_t *dst, int linesize, - DSPContext *dsp) + BlockDSPContext *bdsp) { int i, j, v[2]; switch (get_bits(gb, 2)) { case 1: v[0] = get_bits(gb, 8); - dsp->fill_block_tab[1](dst, v[0], linesize, 8); + bdsp->fill_block_tab[1](dst, v[0], linesize, 8); break; case 2: 
v[0] = get_bits(gb, 8); @@ -163,7 +163,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, for (i = 0; i < avctx->width; i += 8) decode8x8(&gb, s->frame->data[0] + j * s->frame->linesize[0] + i, - s->frame->linesize[0], &s->dsp); + s->frame->linesize[0], &s->bdsp); buf += video_size; } else if (video_type == 2) { diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c index d6c6060f04..b4213972b7 100644 --- a/libavcodec/mdec.c +++ b/libavcodec/mdec.c @@ -28,12 +28,14 @@ */ #include "avcodec.h" +#include "blockdsp.h" #include "mpegvideo.h" #include "mpeg12.h" #include "thread.h" typedef struct MDECContext { AVCodecContext *avctx; + BlockDSPContext bdsp; DSPContext dsp; ThreadFrame frame; GetBitContext gb; @@ -123,7 +125,7 @@ static inline int decode_mb(MDECContext *a, int16_t block[6][64]) int i, ret; const int block_index[6] = { 5, 4, 0, 1, 2, 3 }; - a->dsp.clear_blocks(block[0]); + a->bdsp.clear_blocks(block[0]); for (i = 0; i < 6; i++) { if ((ret = mdec_decode_block_intra(a, block[block_index[i]], @@ -212,6 +214,7 @@ static av_cold int decode_init(AVCodecContext *avctx) a->avctx = avctx; + ff_blockdsp_init(&a->bdsp, avctx); ff_dsputil_init(&a->dsp, avctx); ff_mpeg12_init_vlcs(); ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_zigzag_direct); diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c index 264c74afb4..179ffeae14 100644 --- a/libavcodec/mimic.c +++ b/libavcodec/mimic.c @@ -24,6 +24,7 @@ #include #include "avcodec.h" +#include "blockdsp.h" #include "internal.h" #include "get_bits.h" #include "bytestream.h" @@ -52,6 +53,7 @@ typedef struct { GetBitContext gb; ScanTable scantable; + BlockDSPContext bdsp; DSPContext dsp; HpelDSPContext hdsp; VLC vlc; @@ -145,6 +147,7 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx) av_log(avctx, AV_LOG_ERROR, "error initializing vlc table\n"); return ret; } + ff_blockdsp_init(&ctx->bdsp, avctx); ff_dsputil_init(&ctx->dsp, avctx); ff_hpeldsp_init(&ctx->hdsp, avctx->flags); 
ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable, col_zag); @@ -227,7 +230,7 @@ static int vlc_decode_block(MimicContext *ctx, int num_coeffs, int qscale) int16_t *block = ctx->dct_block; unsigned int pos; - ctx->dsp.clear_block(block); + ctx->bdsp.clear_block(block); block[0] = get_bits(&ctx->gb, 8) << 3; diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c index b1192c58c0..cd1e292870 100644 --- a/libavcodec/mjpegdec.c +++ b/libavcodec/mjpegdec.c @@ -35,6 +35,7 @@ #include "libavutil/imgutils.h" #include "libavutil/opt.h" #include "avcodec.h" +#include "blockdsp.h" #include "internal.h" #include "mjpeg.h" #include "mjpegdec.h" @@ -92,6 +93,7 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx) } s->avctx = avctx; + ff_blockdsp_init(&s->bdsp, avctx); ff_hpeldsp_init(&s->hdsp, avctx->flags); ff_dsputil_init(&s->dsp, avctx); ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct); @@ -486,7 +488,7 @@ static int decode_dc_progressive(MJpegDecodeContext *s, int16_t *block, int16_t *quant_matrix, int Al) { int val; - s->dsp.clear_block(block); + s->bdsp.clear_block(block); val = mjpeg_decode_dc(s, dc_index); if (val == 0xffff) { av_log(s->avctx, AV_LOG_ERROR, "error dc\n"); @@ -878,7 +880,7 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah, reference_data[c] + block_offset, linesize[c], 8); else { - s->dsp.clear_block(s->block); + s->bdsp.clear_block(s->block); if (decode_block(s, s->block, i, s->dc_index[i], s->ac_index[i], s->quant_matrixes[s->quant_index[c]]) < 0) { diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h index 344d2cbafc..0d1dd9ee03 100644 --- a/libavcodec/mjpegdec.h +++ b/libavcodec/mjpegdec.h @@ -33,6 +33,7 @@ #include "libavutil/pixdesc.h" #include "avcodec.h" +#include "blockdsp.h" #include "get_bits.h" #include "dsputil.h" #include "hpeldsp.h" @@ -95,6 +96,7 @@ typedef struct MJpegDecodeContext { uint8_t *last_nnz[MAX_COMPONENTS]; uint64_t 
coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode) ScanTable scantable; + BlockDSPContext bdsp; DSPContext dsp; HpelDSPContext hdsp; diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c index 1cd37fa6b7..195f9f3250 100644 --- a/libavcodec/mpeg12dec.c +++ b/libavcodec/mpeg12dec.c @@ -776,10 +776,10 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64]) av_dlog(s->avctx, "mb_type=%x\n", mb_type); // motion_type = 0; /* avoid warning */ if (IS_INTRA(mb_type)) { - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); if (!s->chroma_y_shift) - s->dsp.clear_blocks(s->block[6]); + s->bdsp.clear_blocks(s->block[6]); /* compute DCT type */ // FIXME: add an interlaced_dct coded var? @@ -1014,13 +1014,13 @@ FF_ENABLE_DEPRECATION_WARNINGS s->mb_intra = 0; if (HAS_CBP(mb_type)) { - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); cbp = get_vlc2(&s->gb, ff_mb_pat_vlc.table, MB_PAT_VLC_BITS, 1); if (mb_block_count > 6) { cbp <<= mb_block_count - 6; cbp |= get_bits(&s->gb, mb_block_count - 6); - s->dsp.clear_blocks(s->block[6]); + s->bdsp.clear_blocks(s->block[6]); } if (cbp <= 0) { av_log(s->avctx, AV_LOG_ERROR, diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c index 9405a02e4c..0e3e5803b1 100644 --- a/libavcodec/mpeg4videodec.c +++ b/libavcodec/mpeg4videodec.c @@ -1227,7 +1227,7 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64]) if (!IS_SKIP(mb_type)) { int i; - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); /* decode each block */ for (i = 0; i < 6; i++) { if (mpeg4_decode_block(ctx, block[i], i, cbp & 32, s->mb_intra, ctx->rvlc) < 0) { @@ -1305,7 +1305,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64]) } } while (cbpc == 20); - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); dquant = cbpc & 8; s->mb_intra = ((cbpc & 4) != 0); if 
(s->mb_intra) @@ -1451,7 +1451,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64]) if (modb2) { cbp = 0; } else { - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); cbp = get_bits(&s->gb, 6); } @@ -1586,7 +1586,7 @@ intra: if (!s->progressive_sequence) s->interlaced_dct = get_bits1(&s->gb); - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); /* decode each block */ for (i = 0; i < 6; i++) { if (mpeg4_decode_block(ctx, block[i], i, cbp & 32, 1, 0) < 0) diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c index 189664dc3d..b95752fe49 100644 --- a/libavcodec/mpeg4videoenc.c +++ b/libavcodec/mpeg4videoenc.c @@ -485,7 +485,7 @@ static inline int get_b_cbp(MpegEncContext *s, int16_t block[6][64], for (i = 0; i < 6; i++) { if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i)) & 1) == 0) { s->block_last_index[i] = -1; - s->dsp.clear_block(s->block[i]); + s->bdsp.clear_block(s->block[i]); } } } else { diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index aea6321518..f6fc8dc3a8 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -33,6 +33,7 @@ #include "libavutil/internal.h" #include "libavutil/timer.h" #include "avcodec.h" +#include "blockdsp.h" #include "dsputil.h" #include "internal.h" #include "mathops.h" @@ -363,7 +364,7 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type, ff_init_block_index(s); ff_update_block_index(s); - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); s->dest[0] = s->current_picture.f->data[0] + (s->mb_y * 16 * s->linesize) + s->mb_x * 16; s->dest[1] = s->current_picture.f->data[1] + (s->mb_y * (16 >> s->chroma_y_shift) * s->uvlinesize) + s->mb_x * (16 >> s->chroma_x_shift); @@ -376,6 +377,7 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type, /* init common dct for both encoder and decoder */ av_cold int ff_dct_common_init(MpegEncContext *s) { + 
ff_blockdsp_init(&s->bdsp, s->avctx); ff_dsputil_init(&s->dsp, s->avctx); ff_hpeldsp_init(&s->hdsp, s->avctx->flags); ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample); diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h index a0114fdec1..7b0673c8a1 100644 --- a/libavcodec/mpegvideo.h +++ b/libavcodec/mpegvideo.h @@ -29,6 +29,7 @@ #define AVCODEC_MPEGVIDEO_H #include "avcodec.h" +#include "blockdsp.h" #include "dsputil.h" #include "error_resilience.h" #include "get_bits.h" @@ -347,6 +348,7 @@ typedef struct MpegEncContext { int unrestricted_mv; ///< mv can point outside of the coded picture int h263_long_vectors; ///< use horrible h263v1 long vector mode + BlockDSPContext bdsp; DSPContext dsp; ///< pointers for accelerated dsp functions HpelDSPContext hdsp; QpelDSPContext qdsp; diff --git a/libavcodec/msmpeg4dec.c b/libavcodec/msmpeg4dec.c index 40660ed551..191f81a6ca 100644 --- a/libavcodec/msmpeg4dec.c +++ b/libavcodec/msmpeg4dec.c @@ -174,7 +174,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64]) } } - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); for (i = 0; i < 6; i++) { if (ff_msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0) { @@ -265,7 +265,7 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, int16_t block[6][64]) } } - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); for (i = 0; i < 6; i++) { if (ff_msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0) { diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index b78d4be8ae..bd78f8e728 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -1,5 +1,6 @@ OBJS += ppc/fmtconvert_altivec.o \ +OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o OBJS-$(CONFIG_DSPUTIL) += ppc/dsputil_ppc.o OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o diff --git a/libavcodec/ppc/blockdsp.c b/libavcodec/ppc/blockdsp.c new file mode 100644 index 
0000000000..679bc0454f --- /dev/null +++ b/libavcodec/ppc/blockdsp.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2002 Brian Foley + * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#if HAVE_ALTIVEC_H +#include <altivec.h> +#endif +#include <string.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/types_altivec.h" +#include "libavcodec/blockdsp.h" + +/* ***** WARNING ***** WARNING ***** WARNING ***** */ +/* + * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with + * a cache line size not equal to 32 bytes. Fortunately all processors used + * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte + * cache lines. This is due to the use of the 'dcbz' instruction. It simply + * clears a single cache line to zero, so you need to know the cache line + * size to use it! It's absurd, but it's fast... + * + * update 24/06/2003: Apple released the G5 yesterday, with a PPC970. + * cache line size: 128 bytes. Oups. + * The semantics of dcbz was changed, it always clears 32 bytes. So the function + * below will work, but will be slow.
So I fixed check_dcbz_effect to use dcbzl, + * which is defined to clear a cache line (as dcbz before). So we can still + * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. + * + * see <http://developer.apple.com/technotes/tn/tn2087.html> + * and <http://developer.apple.com/technotes/tn/tn2086.html> + */ +static void clear_blocks_dcbz32_ppc(int16_t *blocks) +{ + register int misal = (unsigned long) blocks & 0x00000010, i = 0; + + if (misal) { + ((unsigned long *) blocks)[0] = 0L; + ((unsigned long *) blocks)[1] = 0L; + ((unsigned long *) blocks)[2] = 0L; + ((unsigned long *) blocks)[3] = 0L; + i += 16; + } + for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32) + __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory"); + if (misal) { + ((unsigned long *) blocks)[188] = 0L; + ((unsigned long *) blocks)[189] = 0L; + ((unsigned long *) blocks)[190] = 0L; + ((unsigned long *) blocks)[191] = 0L; + i += 16; + } +} + +/* Same as above, when dcbzl clears a whole 128 bytes cache line + * i.e. the PPC970 AKA G5. */ +static void clear_blocks_dcbz128_ppc(int16_t *blocks) +{ +#if HAVE_DCBZL + register int misal = (unsigned long) blocks & 0x0000007f, i = 0; + + if (misal) { + /* We could probably also optimize this case, + * but there's not much point as the machines + * aren't available yet (2003-06-26). */ + memset(blocks, 0, sizeof(int16_t) * 6 * 64); + } else { + for (; i < sizeof(int16_t) * 6 * 64; i += 128) + __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory"); + } +#else + memset(blocks, 0, sizeof(int16_t) * 6 * 64); +#endif +} + +/* Check dcbz report how many bytes are set to 0 by dcbz. */ +/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect + * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the + * assembler knows about dcbzl ...
*/ +static long check_dcbzl_effect(void) +{ + long count = 0; +#if HAVE_DCBZL + register char *fakedata = av_malloc(1024); + register char *fakedata_middle; + register long zero = 0, i = 0; + + if (!fakedata) + return 0L; + + fakedata_middle = fakedata + 512; + + memset(fakedata, 0xFF, 1024); + + /* Below the constraint "b" seems to mean "address base register" + * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */ + __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero)); + + for (i = 0; i < 1024; i++) + if (fakedata[i] == (char) 0) + count++; + + av_free(fakedata); +#endif + + return count; +} + +#if HAVE_ALTIVEC +static void clear_block_altivec(int16_t *block) +{ + LOAD_ZERO; + vec_st(zero_s16v, 0, block); + vec_st(zero_s16v, 16, block); + vec_st(zero_s16v, 32, block); + vec_st(zero_s16v, 48, block); + vec_st(zero_s16v, 64, block); + vec_st(zero_s16v, 80, block); + vec_st(zero_s16v, 96, block); + vec_st(zero_s16v, 112, block); +} +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth) +{ + // common optimizations whether AltiVec is available or not + if (!high_bit_depth) { + switch (check_dcbzl_effect()) { + case 32: + c->clear_blocks = clear_blocks_dcbz32_ppc; + break; + case 128: + c->clear_blocks = clear_blocks_dcbz128_ppc; + break; + default: + break; + } + } + +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + if (!high_bit_depth) + c->clear_block = clear_block_altivec; +#endif /* HAVE_ALTIVEC */ +} diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index 442be6c969..c3f90e91c4 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -558,19 +558,6 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1, } } -static void clear_block_altivec(int16_t *block) -{ - LOAD_ZERO; - vec_st(zero_s16v, 0, block); - vec_st(zero_s16v, 16, block); - vec_st(zero_s16v, 32, block); - 
vec_st(zero_s16v, 48, block); - vec_st(zero_s16v, 64, block); - vec_st(zero_s16v, 80, block); - vec_st(zero_s16v, 96, block); - vec_st(zero_s16v, 112, block); -} - static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, uint8_t *src, int stride, int h) { @@ -931,7 +918,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx, if (!high_bit_depth) { c->get_pixels = get_pixels_altivec; - c->clear_block = clear_block_altivec; } c->hadamard8_diff[0] = hadamard8_diff16_altivec; diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index 698f54562f..b92fbf0a2c 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -24,124 +24,14 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/mem.h" #include "libavutil/ppc/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" #include "dsputil_altivec.h" -/* ***** WARNING ***** WARNING ***** WARNING ***** */ -/* - * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with - * a cache line size not equal to 32 bytes. Fortunately all processors used - * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte - * cache lines. This is due to the use of the 'dcbz' instruction. It simply - * clears a single cache line to zero, so you need to know the cache line - * size to use it! It's absurd, but it's fast... - * - * update 24/06/2003: Apple released the G5 yesterday, with a PPC970. - * cache line size: 128 bytes. Oups. - * The semantics of dcbz was changed, it always clears 32 bytes. So the function - * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, - * which is defined to clear a cache line (as dcbz before). So we can still - * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. 
- * - * see <http://developer.apple.com/technotes/tn/tn2087.html> - * and <http://developer.apple.com/technotes/tn/tn2086.html> - */ -static void clear_blocks_dcbz32_ppc(int16_t *blocks) -{ - register int misal = (unsigned long) blocks & 0x00000010, i = 0; - - if (misal) { - ((unsigned long *) blocks)[0] = 0L; - ((unsigned long *) blocks)[1] = 0L; - ((unsigned long *) blocks)[2] = 0L; - ((unsigned long *) blocks)[3] = 0L; - i += 16; - } - for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32) - __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory"); - if (misal) { - ((unsigned long *) blocks)[188] = 0L; - ((unsigned long *) blocks)[189] = 0L; - ((unsigned long *) blocks)[190] = 0L; - ((unsigned long *) blocks)[191] = 0L; - i += 16; - } -} - -/* Same as above, when dcbzl clears a whole 128 bytes cache line - * i.e. the PPC970 AKA G5. */ -static void clear_blocks_dcbz128_ppc(int16_t *blocks) -{ -#if HAVE_DCBZL - register int misal = (unsigned long) blocks & 0x0000007f, i = 0; - - if (misal) { - /* We could probably also optimize this case, - * but there's not much point as the machines - * aren't available yet (2003-06-26). */ - memset(blocks, 0, sizeof(int16_t) * 6 * 64); - } else { - for (; i < sizeof(int16_t) * 6 * 64; i += 128) - __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory"); - } -#else - memset(blocks, 0, sizeof(int16_t) * 6 * 64); -#endif -} - -/* Check dcbz report how many bytes are set to 0 by dcbz. */ -/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect - * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the - * assembler knows about dcbzl ... */ -static long check_dcbzl_effect(void) -{ - long count = 0; -#if HAVE_DCBZL - register char *fakedata = av_malloc(1024); - register char *fakedata_middle; - register long zero = 0, i = 0; - - if (!fakedata) - return 0L; - - fakedata_middle = fakedata + 512; - - memset(fakedata, 0xFF, 1024); - - /* Below the constraint "b" seems to mean "address base register" - * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so....
*/ - __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero)); - - for (i = 0; i < 1024; i++) - if (fakedata[i] == (char) 0) - count++; - - av_free(fakedata); -#endif - - return count; -} - av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { - // common optimizations whether AltiVec is available or not - if (!high_bit_depth) { - switch (check_dcbzl_effect()) { - case 32: - c->clear_blocks = clear_blocks_dcbz32_ppc; - break; - case 128: - c->clear_blocks = clear_blocks_dcbz128_ppc; - break; - default: - break; - } - } - if (PPC_ALTIVEC(av_get_cpu_flags())) { ff_dsputil_init_altivec(c, avctx, high_bit_depth); ff_int_init_altivec(c, avctx); diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index c8a195ce92..c36b249b29 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -3019,7 +3019,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n, int scale; int q1, q2 = 0; - s->dsp.clear_block(block); + s->bdsp.clear_block(block); /* XXX: Guard against dumb values of mquant */ mquant = (mquant < 1) ? 0 : ((mquant > 31) ? 
31 : mquant); @@ -3226,7 +3226,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n, int ttblk = ttmb & 7; int pat = 0; - s->dsp.clear_block(block); + s->bdsp.clear_block(block); if (ttmb == -1) { ttblk = ff_vc1_ttblk_to_tt[v->tt_index][get_vlc2(gb, ff_vc1_ttblk_vlc[v->tt_index].table, VC1_TTBLK_VLC_BITS, 1)]; @@ -4797,7 +4797,7 @@ static void vc1_decode_i_blocks(VC1Context *v) dst[3] = dst[2] + 8; dst[4] = s->dest[1]; dst[5] = s->dest[2]; - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); mb_pos = s->mb_x + s->mb_y * s->mb_width; s->current_picture.mb_type[mb_pos] = MB_TYPE_INTRA; s->current_picture.qscale_table[mb_pos] = v->pq; @@ -4937,7 +4937,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v) for (;s->mb_x < s->mb_width; s->mb_x++) { int16_t (*block)[64] = v->block[v->cur_blk_idx]; ff_update_block_index(s); - s->dsp.clear_blocks(block[0]); + s->bdsp.clear_blocks(block[0]); mb_pos = s->mb_x + s->mb_y * s->mb_stride; s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_INTRA; s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = 0; @@ -5603,6 +5603,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) if (ff_vc1_init_common(v) < 0) return -1; + ff_blockdsp_init(&s->bdsp, avctx); ff_h264chroma_init(&v->h264chroma, 8); ff_qpeldsp_init(&s->qdsp); ff_vc1dsp_init(&v->vc1dsp); diff --git a/libavcodec/wmv2.c b/libavcodec/wmv2.c index 003f0220c1..bd799d0e8b 100644 --- a/libavcodec/wmv2.c +++ b/libavcodec/wmv2.c @@ -28,6 +28,7 @@ av_cold void ff_wmv2_common_init(Wmv2Context * w){ MpegEncContext * const s= &w->s; + ff_blockdsp_init(&s->bdsp, s->avctx); ff_wmv2dsp_init(&w->wdsp); s->dsp.idct_permutation_type = w->wdsp.idct_perm; ff_init_scantable_permutation(s->dsp.idct_permutation, @@ -60,12 +61,12 @@ static void wmv2_add_block(Wmv2Context *w, int16_t *block1, uint8_t *dst, int st case 1: ff_simple_idct84_add(dst , stride, block1); ff_simple_idct84_add(dst + 4*stride, stride, 
w->abt_block2[n]); - s->dsp.clear_block(w->abt_block2[n]); + s->bdsp.clear_block(w->abt_block2[n]); break; case 2: ff_simple_idct48_add(dst , stride, block1); ff_simple_idct48_add(dst + 4 , stride, w->abt_block2[n]); - s->dsp.clear_block(w->abt_block2[n]); + s->bdsp.clear_block(w->abt_block2[n]); break; default: av_log(s->avctx, AV_LOG_ERROR, "internal error in WMV2 abt\n"); diff --git a/libavcodec/wmv2dec.c b/libavcodec/wmv2dec.c index 366aa1f6c1..4ebc801240 100644 --- a/libavcodec/wmv2dec.c +++ b/libavcodec/wmv2dec.c @@ -385,7 +385,7 @@ int ff_wmv2_decode_mb(MpegEncContext *s, int16_t block[6][64]) wmv2_pred_motion(w, &mx, &my); if(cbp){ - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); if(s->per_mb_rl_table){ s->rl_table_index = decode012(&s->gb); s->rl_chroma_table_index = s->rl_table_index; @@ -431,7 +431,7 @@ int ff_wmv2_decode_mb(MpegEncContext *s, int16_t block[6][64]) s->rl_chroma_table_index = s->rl_table_index; } - s->dsp.clear_blocks(s->block[0]); + s->bdsp.clear_blocks(s->block[0]); for (i = 0; i < 6; i++) { if (ff_msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0) { diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 5fddf3fb83..222a0ff9eb 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -44,6 +44,7 @@ OBJS-$(CONFIG_VP7_DECODER) += x86/vp8dsp_init.o OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o +MMX-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_mmx.o MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ x86/idct_mmx_xvid.o \ x86/idct_sse2_xvid.o \ diff --git a/libavcodec/x86/blockdsp_mmx.c b/libavcodec/x86/blockdsp_mmx.c new file mode 100644 index 0000000000..b5294242ab --- /dev/null +++ b/libavcodec/x86/blockdsp_mmx.c @@ -0,0 +1,120 @@ +/* + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/blockdsp.h" +#include "libavcodec/version.h" + +#if HAVE_INLINE_ASM + +#define CLEAR_BLOCKS(name, n) \ +static void name(int16_t *blocks) \ +{ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "mov %1, %%"REG_a" \n\t" \ + "1: \n\t" \ + "movq %%mm7, (%0, %%"REG_a") \n\t" \ + "movq %%mm7, 8(%0, %%"REG_a") \n\t" \ + "movq %%mm7, 16(%0, %%"REG_a") \n\t" \ + "movq %%mm7, 24(%0, %%"REG_a") \n\t" \ + "add $32, %%"REG_a" \n\t" \ + "js 1b \n\t" \ + :: "r"(((uint8_t *) blocks) + 128 * n), \ + "i"(-128 * n) \ + : "%"REG_a); \ +} +CLEAR_BLOCKS(clear_blocks_mmx, 6) +CLEAR_BLOCKS(clear_block_mmx, 1) + +static void clear_block_sse(int16_t *block) +{ + __asm__ volatile ( + "xorps %%xmm0, %%xmm0 \n" + "movaps %%xmm0, (%0) \n" + "movaps %%xmm0, 16(%0) \n" + "movaps %%xmm0, 32(%0) \n" + "movaps %%xmm0, 48(%0) \n" + "movaps %%xmm0, 64(%0) \n" + "movaps %%xmm0, 80(%0) \n" + "movaps %%xmm0, 96(%0) \n" + "movaps %%xmm0, 112(%0) \n" + :: "r" (block) + : "memory"); +} + +static void clear_blocks_sse(int16_t *blocks) +{ + __asm__ volatile ( +
"xorps %%xmm0, %%xmm0 \n" + "mov %1, %%"REG_a" \n" + "1: \n" + "movaps %%xmm0, (%0, %%"REG_a") \n" + "movaps %%xmm0, 16(%0, %%"REG_a") \n" + "movaps %%xmm0, 32(%0, %%"REG_a") \n" + "movaps %%xmm0, 48(%0, %%"REG_a") \n" + "movaps %%xmm0, 64(%0, %%"REG_a") \n" + "movaps %%xmm0, 80(%0, %%"REG_a") \n" + "movaps %%xmm0, 96(%0, %%"REG_a") \n" + "movaps %%xmm0, 112(%0, %%"REG_a") \n" + "add $128, %%"REG_a" \n" + "js 1b \n" + :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6) + : "%"REG_a); +} + +#endif /* HAVE_INLINE_ASM */ + +#if FF_API_XVMC +av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth, + AVCodecContext *avctx) +#else +av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth) +#endif /* FF_API_XVMC */ +{ +#if HAVE_INLINE_ASM + int cpu_flags = av_get_cpu_flags(); + + if (!high_bit_depth) { + if (INLINE_MMX(cpu_flags)) { + c->clear_block = clear_block_mmx; + c->clear_blocks = clear_blocks_mmx; + } + +#if FF_API_XVMC +FF_DISABLE_DEPRECATION_WARNINGS + /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ + if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1) + return; +FF_ENABLE_DEPRECATION_WARNINGS +#endif /* FF_API_XVMC */ + + if (INLINE_SSE(cpu_flags)) { + c->clear_block = clear_block_sse; + c->clear_blocks = clear_blocks_sse; + } + } +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c index 389e7634dd..a19b83d83c 100644 --- a/libavcodec/x86/dsputil_init.c +++ b/libavcodec/x86/dsputil_init.c @@ -19,12 +19,10 @@ #include "config.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/internal.h" #include "libavutil/x86/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" #include "libavcodec/simple_idct.h" -#include "libavcodec/version.h" #include "dsputil_x86.h" #include "idct_xvid.h" @@ -54,8 +52,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, 
c->add_pixels_clamped = ff_add_pixels_clamped_mmx; if (!high_bit_depth) { - c->clear_block = ff_clear_block_mmx; - c->clear_blocks = ff_clear_blocks_mmx; c->draw_edges = ff_draw_edges_mmx; switch (avctx->idct_algo) { @@ -103,19 +99,6 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, { #if HAVE_SSE_INLINE c->vector_clipf = ff_vector_clipf_sse; - -#if FF_API_XVMC -FF_DISABLE_DEPRECATION_WARNINGS - /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ - if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1) - return; -FF_ENABLE_DEPRECATION_WARNINGS -#endif /* FF_API_XVMC */ - - if (!high_bit_depth) { - c->clear_block = ff_clear_block_sse; - c->clear_blocks = ff_clear_blocks_sse; - } #endif /* HAVE_SSE_INLINE */ } diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index c17f8d00d5..fd74efeb3d 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -166,62 +166,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, } while (--i); } -#define CLEAR_BLOCKS(name, n) \ -void name(int16_t *blocks) \ -{ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "mov %1, %%"REG_a" \n\t" \ - "1: \n\t" \ - "movq %%mm7, (%0, %%"REG_a") \n\t" \ - "movq %%mm7, 8(%0, %%"REG_a") \n\t" \ - "movq %%mm7, 16(%0, %%"REG_a") \n\t" \ - "movq %%mm7, 24(%0, %%"REG_a") \n\t" \ - "add $32, %%"REG_a" \n\t" \ - "js 1b \n\t" \ - :: "r"(((uint8_t *) blocks) + 128 * n), \ - "i"(-128 * n) \ - : "%"REG_a); \ -} -CLEAR_BLOCKS(ff_clear_blocks_mmx, 6) -CLEAR_BLOCKS(ff_clear_block_mmx, 1) - -void ff_clear_block_sse(int16_t *block) -{ - __asm__ volatile ( - "xorps %%xmm0, %%xmm0 \n" - "movaps %%xmm0, (%0) \n" - "movaps %%xmm0, 16(%0) \n" - "movaps %%xmm0, 32(%0) \n" - "movaps %%xmm0, 48(%0) \n" - "movaps %%xmm0, 64(%0) \n" - "movaps %%xmm0, 80(%0) \n" - "movaps %%xmm0, 96(%0) \n" - "movaps %%xmm0, 112(%0) \n" - :: "r" (block) - : "memory"); -} - -void ff_clear_blocks_sse(int16_t *blocks) -{ - __asm__ 
volatile ( - "xorps %%xmm0, %%xmm0 \n" - "mov %1, %%"REG_a" \n" - "1: \n" - "movaps %%xmm0, (%0, %%"REG_a") \n" - "movaps %%xmm0, 16(%0, %%"REG_a") \n" - "movaps %%xmm0, 32(%0, %%"REG_a") \n" - "movaps %%xmm0, 48(%0, %%"REG_a") \n" - "movaps %%xmm0, 64(%0, %%"REG_a") \n" - "movaps %%xmm0, 80(%0, %%"REG_a") \n" - "movaps %%xmm0, 96(%0, %%"REG_a") \n" - "movaps %%xmm0, 112(%0, %%"REG_a") \n" - "add $128, %%"REG_a" \n" - "js 1b \n" - :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6) - : "%"REG_a); -} - /* Draw the edges of width 'w' of an image of size width, height * this MMX version can only handle w == 8 || w == 16. */ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h index a4bc8c2730..e99b6b7630 100644 --- a/libavcodec/x86/dsputil_x86.h +++ b/libavcodec/x86/dsputil_x86.h @@ -38,11 +38,6 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); -void ff_clear_block_mmx(int16_t *block); -void ff_clear_block_sse(int16_t *block); -void ff_clear_blocks_mmx(int16_t *blocks); -void ff_clear_blocks_sse(int16_t *blocks); - void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides);