1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-21 10:55:51 +02:00

lavc: introduce VideoDSPContext

Move some functions from dsputil. The idea is that videodsp contains
functions that are useful for a large and varied set of video decoders.
Currently, it contains emulated_edge_mc() and prefetch().

Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
This commit is contained in:
Ronald S. Bultje 2012-12-15 09:46:02 -08:00 committed by Luca Barbato
parent a925f723a9
commit 8c53d39e7f
37 changed files with 1189 additions and 927 deletions

10
configure vendored
View File

@ -1328,6 +1328,7 @@ CONFIG_EXTRA="
rangecoder rangecoder
rtpdec rtpdec
sinewin sinewin
videodsp
vp3dsp vp3dsp
" "
@ -1544,6 +1545,7 @@ mpeg4_decoder_select="h263_decoder mpeg4video_parser"
mpeg4_encoder_select="h263_encoder" mpeg4_encoder_select="h263_encoder"
mpeg4_vaapi_hwaccel_select="vaapi mpeg4_decoder" mpeg4_vaapi_hwaccel_select="vaapi mpeg4_decoder"
mpeg4_vdpau_decoder_select="vdpau mpeg4_decoder" mpeg4_vdpau_decoder_select="vdpau mpeg4_decoder"
mpegvideo_select="videodsp"
msmpeg4v1_decoder_select="h263_decoder" msmpeg4v1_decoder_select="h263_decoder"
msmpeg4v1_encoder_select="h263_encoder" msmpeg4v1_encoder_select="h263_encoder"
msmpeg4v2_decoder_select="h263_decoder" msmpeg4v2_decoder_select="h263_decoder"
@ -1589,12 +1591,12 @@ vc1_vdpau_decoder_select="vdpau vc1_decoder"
vc1image_decoder_select="vc1_decoder" vc1image_decoder_select="vc1_decoder"
vorbis_decoder_select="mdct" vorbis_decoder_select="mdct"
vorbis_encoder_select="mdct" vorbis_encoder_select="mdct"
vp3_decoder_select="vp3dsp" vp3_decoder_select="vp3dsp videodsp"
vp5_decoder_select="vp3dsp" vp5_decoder_select="vp3dsp videodsp"
vp6_decoder_select="huffman vp3dsp" vp6_decoder_select="huffman vp3dsp videodsp"
vp6a_decoder_select="vp6_decoder" vp6a_decoder_select="vp6_decoder"
vp6f_decoder_select="vp6_decoder" vp6f_decoder_select="vp6_decoder"
vp8_decoder_select="h264pred h264qpel" vp8_decoder_select="h264pred videodsp"
wmapro_decoder_select="mdct sinewin" wmapro_decoder_select="mdct sinewin"
wmav1_decoder_select="mdct sinewin" wmav1_decoder_select="mdct sinewin"
wmav1_encoder_select="mdct sinewin" wmav1_encoder_select="mdct sinewin"

View File

@ -67,6 +67,7 @@ OBJS-$(CONFIG_RDFT) += rdft.o $(RDFT-OBJS-yes)
OBJS-$(CONFIG_SINEWIN) += sinewin.o OBJS-$(CONFIG_SINEWIN) += sinewin.o
OBJS-$(CONFIG_VAAPI) += vaapi.o OBJS-$(CONFIG_VAAPI) += vaapi.o
OBJS-$(CONFIG_VDPAU) += vdpau.o OBJS-$(CONFIG_VDPAU) += vdpau.o
OBJS-$(CONFIG_VIDEODSP) += videodsp.o
OBJS-$(CONFIG_VP3DSP) += vp3dsp.o OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
# decoders/encoders/hardware accelerators # decoders/encoders/hardware accelerators

View File

@ -30,6 +30,8 @@ OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \ OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \
arm/rv40dsp_init_arm.o \ arm/rv40dsp_init_arm.o \
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o \
OBJS += arm/dsputil_init_arm.o \ OBJS += arm/dsputil_init_arm.o \
arm/dsputil_arm.o \ arm/dsputil_arm.o \
arm/fft_init_arm.o \ arm/fft_init_arm.o \
@ -41,6 +43,9 @@ OBJS += arm/dsputil_init_arm.o \
ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \ ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \
arm/mpegvideo_armv5te_s.o \ arm/mpegvideo_armv5te_s.o \
ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \
arm/videodsp_armv5te.o \
ARMV5TE-OBJS += arm/dsputil_init_armv5te.o \ ARMV5TE-OBJS += arm/dsputil_init_armv5te.o \
arm/simple_idct_armv5te.o \ arm/simple_idct_armv5te.o \

View File

@ -22,15 +22,7 @@
#include "config.h" #include "config.h"
#include "libavutil/arm/asm.S" #include "libavutil/arm/asm.S"
#if HAVE_ARMV5TE_EXTERNAL #if !HAVE_ARMV5TE_EXTERNAL
@ void ff_prefetch_arm(void *mem, int stride, int h)
@ Issues one pld (preload) hint per iteration, stepping the address by the
@ stride each time. Register roles assume the standard ARM calling convention
@ (r0 = mem, r1 = stride, r2 = h) -- TODO confirm against the build ABI.
function ff_prefetch_arm, export=1
@ decrement the remaining count and update the condition flags
subs r2, r2, #1
@ preload hint for the current address
pld [r0]
@ advance the address by one stride; plain add leaves the flags alone
add r0, r0, r1
@ branch back to the entry point while the count set by subs is non-zero
bne ff_prefetch_arm
@ return to the caller
bx lr
endfunc
#else
#define pld @ #define pld @
#endif #endif

View File

@ -25,8 +25,6 @@ void ff_simple_idct_armv5te(DCTELEM *data);
void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data); void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data); void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
void ff_prefetch_arm(void *mem, int stride, int h);
av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx) av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx)
{ {
if (avctx->bits_per_raw_sample <= 8 && if (avctx->bits_per_raw_sample <= 8 &&
@ -37,6 +35,4 @@ av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx)
c->idct = ff_simple_idct_armv5te; c->idct = ff_simple_idct_armv5te;
c->idct_permutation_type = FF_NO_IDCT_PERM; c->idct_permutation_type = FF_NO_IDCT_PERM;
} }
c->prefetch = ff_prefetch_arm;
} }

View File

@ -0,0 +1,33 @@
@
@ ARMv5te optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of Libav.
@
@ Libav is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ Libav is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with Libav; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
#if HAVE_ARMV5TE_EXTERNAL
@ void ff_prefetch_arm(void *mem, int stride, int h)
@ Issues one pld (preload) hint per iteration, stepping the address by the
@ stride each time. Register roles assume the standard ARM calling convention
@ (r0 = mem, r1 = stride, r2 = h) -- TODO confirm against the build ABI.
function ff_prefetch_arm, export=1
@ decrement the remaining count and update the condition flags
subs r2, r2, #1
@ preload hint for the current address
pld [r0]
@ advance the address by one stride; plain add leaves the flags alone
add r0, r0, r1
@ branch back to the entry point while the count set by subs is non-zero
bne ff_prefetch_arm
@ return to the caller
bx lr
endfunc
#endif

View File

@ -0,0 +1,29 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#include "libavcodec/videodsp.h"
#include "videodsp_arm.h"
/* ARM entry point: installs the ARMv5TE VideoDSP routines when the running
 * CPU reports ARMv5TE support; otherwise leaves ctx untouched. */
void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc)
{
    int flags = av_get_cpu_flags();

    if (!have_armv5te(flags))
        return;

    ff_videodsp_init_armv5te(ctx, bpc);
}

View File

@ -392,7 +392,7 @@ static inline void mc_dir_part(AVSContext *h,Picture *pic,
|| full_my < 0-extra_height || full_my < 0-extra_height
|| full_mx + 16/*FIXME*/ > pic_width + extra_width || full_mx + 16/*FIXME*/ > pic_width + extra_width
|| full_my + 16/*FIXME*/ > pic_height + extra_height){ || full_my + 16/*FIXME*/ > pic_height + extra_height){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->l_stride, h->l_stride, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->l_stride, h->l_stride,
16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
src_y= s->edge_emu_buffer + 2 + 2*h->l_stride; src_y= s->edge_emu_buffer + 2 + 2*h->l_stride;
emu=1; emu=1;
@ -401,14 +401,14 @@ static inline void mc_dir_part(AVSContext *h,Picture *pic,
qpix_op[luma_xy](dest_y, src_y, h->l_stride); //FIXME try variable height perhaps? qpix_op[luma_xy](dest_y, src_y, h->l_stride); //FIXME try variable height perhaps?
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->c_stride, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->c_stride,
9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
src_cb= s->edge_emu_buffer; src_cb= s->edge_emu_buffer;
} }
chroma_op(dest_cb, src_cb, h->c_stride, chroma_height, mx&7, my&7); chroma_op(dest_cb, src_cb, h->c_stride, chroma_height, mx&7, my&7);
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->c_stride, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->c_stride,
9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
src_cr= s->edge_emu_buffer; src_cr= s->edge_emu_buffer;
} }

View File

@ -2615,8 +2615,6 @@ static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
add_pixels_clamped_c(block, dest, line_size); add_pixels_clamped_c(block, dest, line_size);
} }
/* No-op with the prefetch callback signature; all three arguments are ignored. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused)
{
}
/* init static data */ /* init static data */
av_cold void ff_dsputil_static_init(void) av_cold void ff_dsputil_static_init(void)
{ {
@ -2867,8 +2865,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->shrink[2]= ff_shrink44; c->shrink[2]= ff_shrink44;
c->shrink[3]= ff_shrink88; c->shrink[3]= ff_shrink88;
c->prefetch= just_return;
memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab)); memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab)); memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
@ -2905,7 +2901,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
#define BIT_DEPTH_FUNCS(depth, dct)\ #define BIT_DEPTH_FUNCS(depth, dct)\
c->get_pixels = FUNCC(get_pixels ## dct , depth);\ c->get_pixels = FUNCC(get_pixels ## dct , depth);\
c->draw_edges = FUNCC(draw_edges , depth);\ c->draw_edges = FUNCC(draw_edges , depth);\
c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
c->clear_block = FUNCC(clear_block ## dct , depth);\ c->clear_block = FUNCC(clear_block ## dct , depth);\
c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\ c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\ c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\

View File

@ -188,15 +188,6 @@ void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
void ff_init_scantable_permutation(uint8_t *idct_permutation, void ff_init_scantable_permutation(uint8_t *idct_permutation,
int idct_permutation_type); int idct_permutation_type);
/* Declares the prototype of ff_emulated_edge_mc_<depth>() for one bit depth;
 * instantiated below for depths 8, 9 and 10. */
#define EMULATED_EDGE(depth) \
void ff_emulated_edge_mc_ ## depth (uint8_t *buf, const uint8_t *src, int linesize,\
int block_w, int block_h,\
int src_x, int src_y, int w, int h);
EMULATED_EDGE(8)
EMULATED_EDGE(9)
EMULATED_EDGE(10)
/** /**
* DSPContext. * DSPContext.
*/ */
@ -215,21 +206,6 @@ typedef struct DSPContext {
void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/); int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
/**
* Motion estimation with emulated edge values.
* @param buf pointer to destination buffer (unaligned)
* @param src pointer to pixel source (unaligned)
* @param linesize width (in pixels) for src/buf
* @param block_w number of pixels (per row) to copy to buf
* @param block_h number of pixel rows to copy to buf
* @param src_x offset of src to start of row - this may be negative
* @param src_y offset of src to top of image - this may be negative
* @param w width of src in pixels
* @param h height of src in pixels
*/
void (*emulated_edge_mc)(uint8_t *buf, const uint8_t *src, int linesize,
int block_w, int block_h,
int src_x, int src_y, int w, int h);
/** /**
* translational global motion compensation. * translational global motion compensation.
*/ */
@ -465,8 +441,6 @@ typedef struct DSPContext {
#define EDGE_TOP 1 #define EDGE_TOP 1
#define EDGE_BOTTOM 2 #define EDGE_BOTTOM 2
void (*prefetch)(void *mem, int stride, int h);
void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
/** /**

View File

@ -113,85 +113,6 @@ static void FUNCC(draw_edges)(uint8_t *_buf, int _wrap, int width, int height, i
memcpy(last_line + (i + 1) * wrap, last_line, (width + w + w) * sizeof(pixel)); // bottom memcpy(last_line + (i + 1) * wrap, last_line, (width + w + w) * sizeof(pixel)); // bottom
} }
/**
* Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
*
* Parts of the requested block that lie outside the w x h source area are
* filled by repeating the nearest row/column that lies inside it.
* @param buf destination buffer
* @param src source buffer
* @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
* @param block_w width of block
* @param block_h height of block
* @param src_x x coordinate of the top left sample of the block in the source buffer
* @param src_y y coordinate of the top left sample of the block in the source buffer
* @param w width of the source buffer
* @param h height of the source buffer
*/
void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h){
int x, y;
int start_y, start_x, end_y, end_x;
// Vertical clamp: if the block starts at or below the last source row, or ends
// at or above the first one, shift src/src_y so that exactly one block row
// still overlaps the source.
if(src_y>= h){
src+= (h-1-src_y)*linesize;
src_y=h-1;
}else if(src_y<=-block_h){
src+= (1-block_h-src_y)*linesize;
src_y=1-block_h;
}
// Horizontal clamp, same idea; offsets are scaled by sizeof(pixel) because
// src is a byte pointer.
if(src_x>= w){
src+= (w-1-src_x)*sizeof(pixel);
src_x=w-1;
}else if(src_x<=-block_w){
src+= (1-block_w-src_x)*sizeof(pixel);
src_x=1-block_w;
}
// [start, end) is the range of block rows/columns that fall inside the source.
start_y= FFMAX(0, -src_y);
start_x= FFMAX(0, -src_x);
end_y= FFMIN(block_h, h-src_y);
end_x= FFMIN(block_w, w-src_x);
assert(start_y < end_y && block_h);
assert(start_x < end_x && block_w);
// From here on w is reused as the number of in-source samples per row.
w = end_x - start_x;
// Point src at the first in-source sample and buf at the matching column.
src += start_y*linesize + start_x*sizeof(pixel);
buf += start_x*sizeof(pixel);
//top: repeat the first in-source row (src is not advanced)
for(y=0; y<start_y; y++){
memcpy(buf, src, w*sizeof(pixel));
buf += linesize;
}
// copy existing part
for(; y<end_y; y++){
memcpy(buf, src, w*sizeof(pixel));
src += linesize;
buf += linesize;
}
//bottom: step back to the last in-source row and repeat it
src -= linesize;
for(; y<block_h; y++){
memcpy(buf, src, w*sizeof(pixel));
buf += linesize;
}
// Rewind buf to the top-left sample of the block.
buf -= block_h * linesize + start_x*sizeof(pixel);
// Second pass over every row: fill the columns left and right of the copied span.
while (block_h--){
pixel *bufp = (pixel*)buf;
//left: replicate the first copied sample of the row
for(x=0; x<start_x; x++){
bufp[x] = bufp[start_x];
}
//right: replicate the last copied sample of the row
for(x=end_x; x<block_w; x++){
bufp[x] = bufp[end_x - 1];
}
buf += linesize;
}
}
#define DCTELEM_FUNCS(dctcoef, suffix) \ #define DCTELEM_FUNCS(dctcoef, suffix) \
static void FUNCC(get_pixels ## suffix)(DCTELEM *restrict _block, \ static void FUNCC(get_pixels ## suffix)(DCTELEM *restrict _block, \
const uint8_t *_pixels, \ const uint8_t *_pixels, \

View File

@ -486,11 +486,11 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
full_my < 0 - extra_height || full_my < 0 - extra_height ||
full_mx + 16 /*FIXME*/ > pic_width + extra_width || full_mx + 16 /*FIXME*/ > pic_width + extra_width ||
full_my + 16 /*FIXME*/ > pic_height + extra_height) { full_my + 16 /*FIXME*/ > pic_height + extra_height) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
src_y - (2 << pixel_shift) - 2 * h->mb_linesize, src_y - (2 << pixel_shift) - 2 * h->mb_linesize,
h->mb_linesize, h->mb_linesize,
16 + 5, 16 + 5 /*FIXME*/, full_mx - 2, 16 + 5, 16 + 5 /*FIXME*/, full_mx - 2,
full_my - 2, pic_width, pic_height); full_my - 2, pic_width, pic_height);
src_y = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; src_y = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize;
emu = 1; emu = 1;
} }
@ -505,12 +505,12 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
if (chroma_idc == 3 /* yuv444 */) { if (chroma_idc == 3 /* yuv444 */) {
src_cb = pic->f.data[1] + offset; src_cb = pic->f.data[1] + offset;
if (emu) { if (emu) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
src_cb - (2 << pixel_shift) - 2 * h->mb_linesize, src_cb - (2 << pixel_shift) - 2 * h->mb_linesize,
h->mb_linesize, h->mb_linesize,
16 + 5, 16 + 5 /*FIXME*/, 16 + 5, 16 + 5 /*FIXME*/,
full_mx - 2, full_my - 2, full_mx - 2, full_my - 2,
pic_width, pic_height); pic_width, pic_height);
src_cb = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; src_cb = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize;
} }
qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); // FIXME try variable height perhaps? qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); // FIXME try variable height perhaps?
@ -519,12 +519,12 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
src_cr = pic->f.data[2] + offset; src_cr = pic->f.data[2] + offset;
if (emu) { if (emu) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
src_cr - (2 << pixel_shift) - 2 * h->mb_linesize, src_cr - (2 << pixel_shift) - 2 * h->mb_linesize,
h->mb_linesize, h->mb_linesize,
16 + 5, 16 + 5 /*FIXME*/, 16 + 5, 16 + 5 /*FIXME*/,
full_mx - 2, full_my - 2, full_mx - 2, full_my - 2,
pic_width, pic_height); pic_width, pic_height);
src_cr = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize; src_cr = s->edge_emu_buffer + (2 << pixel_shift) + 2 * h->mb_linesize;
} }
qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); // FIXME try variable height perhaps? qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); // FIXME try variable height perhaps?
@ -546,9 +546,9 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
(my >> ysh) * h->mb_uvlinesize; (my >> ysh) * h->mb_uvlinesize;
if (emu) { if (emu) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize,
9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
src_cb = s->edge_emu_buffer; src_cb = s->edge_emu_buffer;
} }
chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_op(dest_cb, src_cb, h->mb_uvlinesize,
@ -556,9 +556,9 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic,
mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7); mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
if (emu) { if (emu) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize,
9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
src_cr = s->edge_emu_buffer; src_cr = s->edge_emu_buffer;
} }
chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */), chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
@ -735,15 +735,15 @@ static av_always_inline void prefetch_motion(H264Context *h, int list,
int off = (mx << pixel_shift) + int off = (mx << pixel_shift) +
(my + (s->mb_x & 3) * 4) * h->mb_linesize + (my + (s->mb_x & 3) * 4) * h->mb_linesize +
(64 << pixel_shift); (64 << pixel_shift);
s->dsp.prefetch(src[0] + off, s->linesize, 4); s->vdsp.prefetch(src[0] + off, s->linesize, 4);
if (chroma_idc == 3 /* yuv444 */) { if (chroma_idc == 3 /* yuv444 */) {
s->dsp.prefetch(src[1] + off, s->linesize, 4); s->vdsp.prefetch(src[1] + off, s->linesize, 4);
s->dsp.prefetch(src[2] + off, s->linesize, 4); s->vdsp.prefetch(src[2] + off, s->linesize, 4);
} else { } else {
off = ((mx >> 1) << pixel_shift) + off = ((mx >> 1) << pixel_shift) +
((my >> 1) + (s->mb_x & 7)) * s->uvlinesize + ((my >> 1) + (s->mb_x & 7)) * s->uvlinesize +
(64 << pixel_shift); (64 << pixel_shift);
s->dsp.prefetch(src[1] + off, src[2] - src[1], 2); s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
} }
} }
} }
@ -973,6 +973,7 @@ static av_cold void common_init(H264Context *h)
/* needed so that IDCT permutation is known early */ /* needed so that IDCT permutation is known early */
ff_dsputil_init(&s->dsp, s->avctx); ff_dsputil_init(&s->dsp, s->avctx);
ff_videodsp_init(&s->vdsp, 8);
memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t)); memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t));
memset(h->pps.scaling_matrix8, 16, 2 * 64 * sizeof(uint8_t)); memset(h->pps.scaling_matrix8, 16, 2 * 64 * sizeof(uint8_t));
@ -2439,6 +2440,7 @@ static int h264_set_parameter_from_sps(H264Context *h)
h->sps.chroma_format_idc); h->sps.chroma_format_idc);
s->dsp.dct_bits = h->sps.bit_depth_luma > 8 ? 32 : 16; s->dsp.dct_bits = h->sps.bit_depth_luma > 8 ? 32 : 16;
ff_dsputil_init(&s->dsp, s->avctx); ff_dsputil_init(&s->dsp, s->avctx);
ff_videodsp_init(&s->vdsp, h->sps.bit_depth_luma);
} else { } else {
av_log(s->avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n", av_log(s->avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n",
h->sps.bit_depth_luma); h->sps.bit_depth_luma);

View File

@ -60,8 +60,8 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
dest_cb = s->current_picture.f.data[1] + (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h; dest_cb = s->current_picture.f.data[1] + (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h;
dest_cr = s->current_picture.f.data[2] + (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h; dest_cr = s->current_picture.f.data[2] + (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h;
s->dsp.prefetch(dest_y + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT), s->linesize, 4); s->vdsp.prefetch(dest_y + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT), s->linesize, 4);
s->dsp.prefetch(dest_cb + (s->mb_x & 7) * s->uvlinesize + (64 << PIXEL_SHIFT), dest_cr - dest_cb, 2); s->vdsp.prefetch(dest_cb + (s->mb_x & 7) * s->uvlinesize + (64 << PIXEL_SHIFT), dest_cr - dest_cb, 2);
h->list_counts[mb_xy] = h->list_count; h->list_counts[mb_xy] = h->list_count;
@ -292,8 +292,8 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
for (p = 0; p < plane_count; p++) { for (p = 0; p < plane_count; p++) {
dest[p] = s->current_picture.f.data[p] + dest[p] = s->current_picture.f.data[p] +
((mb_x << PIXEL_SHIFT) + mb_y * s->linesize) * 16; ((mb_x << PIXEL_SHIFT) + mb_y * s->linesize) * 16;
s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT), s->vdsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT),
s->linesize, 4); s->linesize, 4);
} }
h->list_counts[mb_xy] = h->list_count; h->list_counts[mb_xy] = h->list_count;

View File

@ -175,6 +175,7 @@ const uint8_t *avpriv_mpv_find_start_code(const uint8_t *restrict p,
av_cold int ff_dct_common_init(MpegEncContext *s) av_cold int ff_dct_common_init(MpegEncContext *s)
{ {
ff_dsputil_init(&s->dsp, s->avctx); ff_dsputil_init(&s->dsp, s->avctx);
ff_videodsp_init(&s->vdsp, 8);
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_c; s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_c;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_c; s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_c;

View File

@ -36,6 +36,7 @@
#include "parser.h" #include "parser.h"
#include "mpeg12data.h" #include "mpeg12data.h"
#include "rl.h" #include "rl.h"
#include "videodsp.h"
#include "libavutil/opt.h" #include "libavutil/opt.h"
@ -358,6 +359,7 @@ typedef struct MpegEncContext {
int h263_long_vectors; ///< use horrible h263v1 long vector mode int h263_long_vectors; ///< use horrible h263v1 long vector mode
DSPContext dsp; ///< pointers for accelerated dsp functions DSPContext dsp; ///< pointers for accelerated dsp functions
VideoDSPContext vdsp;
int f_code; ///< forward MV resolution int f_code; ///< forward MV resolution
int b_code; ///< backward MV resolution for B Frames (mpeg4) int b_code; ///< backward MV resolution for B Frames (mpeg4)
int16_t (*p_mv_table_base)[2]; int16_t (*p_mv_table_base)[2];

View File

@ -1782,16 +1782,16 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
if (mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) { if (mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) {
uint8_t *ebuf = s->edge_emu_buffer + 32; uint8_t *ebuf = s->edge_emu_buffer + 32;
s->dsp.emulated_edge_mc(ebuf, ptr_y, wrap_y, 16, 16, mb_x * 16, s->vdsp.emulated_edge_mc(ebuf, ptr_y, wrap_y, 16, 16, mb_x * 16,
mb_y * 16, s->width, s->height); mb_y * 16, s->width, s->height);
ptr_y = ebuf; ptr_y = ebuf;
s->dsp.emulated_edge_mc(ebuf + 18 * wrap_y, ptr_cb, wrap_c, 8, s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y, ptr_cb, wrap_c, 8,
mb_block_height, mb_x * 8, mb_y * 8, mb_block_height, mb_x * 8, mb_y * 8,
s->width >> 1, s->height >> 1); s->width >> 1, s->height >> 1);
ptr_cb = ebuf + 18 * wrap_y; ptr_cb = ebuf + 18 * wrap_y;
s->dsp.emulated_edge_mc(ebuf + 18 * wrap_y + 8, ptr_cr, wrap_c, 8, s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y + 8, ptr_cr, wrap_c, 8,
mb_block_height, mb_x * 8, mb_y * 8, mb_block_height, mb_x * 8, mb_y * 8,
s->width >> 1, s->height >> 1); s->width >> 1, s->height >> 1);
ptr_cr = ebuf + 18 * wrap_y + 8; ptr_cr = ebuf + 18 * wrap_y + 8;
} }

View File

@ -59,7 +59,7 @@ static void gmc1_motion(MpegEncContext *s,
if(s->flags&CODEC_FLAG_EMU_EDGE){ if(s->flags&CODEC_FLAG_EMU_EDGE){
if( (unsigned)src_x >= FFMAX(s->h_edge_pos - 17, 0) if( (unsigned)src_x >= FFMAX(s->h_edge_pos - 17, 0)
|| (unsigned)src_y >= FFMAX(s->v_edge_pos - 17, 0)){ || (unsigned)src_y >= FFMAX(s->v_edge_pos - 17, 0)){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, linesize, 17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos); s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, linesize, 17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
ptr= s->edge_emu_buffer; ptr= s->edge_emu_buffer;
} }
} }
@ -98,7 +98,7 @@ static void gmc1_motion(MpegEncContext *s,
if(s->flags&CODEC_FLAG_EMU_EDGE){ if(s->flags&CODEC_FLAG_EMU_EDGE){
if( (unsigned)src_x >= FFMAX((s->h_edge_pos>>1) - 9, 0) if( (unsigned)src_x >= FFMAX((s->h_edge_pos>>1) - 9, 0)
|| (unsigned)src_y >= FFMAX((s->v_edge_pos>>1) - 9, 0)){ || (unsigned)src_y >= FFMAX((s->v_edge_pos>>1) - 9, 0)){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
ptr= s->edge_emu_buffer; ptr= s->edge_emu_buffer;
emu=1; emu=1;
} }
@ -107,7 +107,7 @@ static void gmc1_motion(MpegEncContext *s,
ptr = ref_picture[2] + offset; ptr = ref_picture[2] + offset;
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
ptr= s->edge_emu_buffer; ptr= s->edge_emu_buffer;
} }
s->dsp.gmc1(dest_cr, ptr, uvlinesize, 8, motion_x&15, motion_y&15, 128 - s->no_rounding); s->dsp.gmc1(dest_cr, ptr, uvlinesize, 8, motion_x&15, motion_y&15, 128 - s->no_rounding);
@ -195,7 +195,7 @@ static inline int hpel_motion(MpegEncContext *s,
if(s->unrestricted_mv && (s->flags&CODEC_FLAG_EMU_EDGE)){ if(s->unrestricted_mv && (s->flags&CODEC_FLAG_EMU_EDGE)){
if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&1) - 8, 0) if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&1) - 8, 0)
|| (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y&1) - 8, 0)){ || (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y&1) - 8, 0)){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, 9, 9, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, 9, 9,
src_x, src_y, s->h_edge_pos, s->v_edge_pos); src_x, src_y, s->h_edge_pos, s->v_edge_pos);
src= s->edge_emu_buffer; src= s->edge_emu_buffer;
emu=1; emu=1;
@ -285,19 +285,19 @@ if(s->quarter_sample)
"MPEG motion vector out of boundary (%d %d)\n", src_x, src_y); "MPEG motion vector out of boundary (%d %d)\n", src_x, src_y);
return; return;
} }
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize,
17, 17+field_based, 17, 17+field_based,
src_x, src_y<<field_based, src_x, src_y<<field_based,
s->h_edge_pos, s->v_edge_pos); s->h_edge_pos, s->v_edge_pos);
ptr_y = s->edge_emu_buffer; ptr_y = s->edge_emu_buffer;
if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
uint8_t *uvbuf= s->edge_emu_buffer+18*s->linesize; uint8_t *uvbuf= s->edge_emu_buffer+18*s->linesize;
s->dsp.emulated_edge_mc(uvbuf , s->vdsp.emulated_edge_mc(uvbuf ,
ptr_cb, s->uvlinesize, ptr_cb, s->uvlinesize,
9, 9+field_based, 9, 9+field_based,
uvsrc_x, uvsrc_y<<field_based, uvsrc_x, uvsrc_y<<field_based,
s->h_edge_pos>>1, s->v_edge_pos>>1); s->h_edge_pos>>1, s->v_edge_pos>>1);
s->dsp.emulated_edge_mc(uvbuf+16, s->vdsp.emulated_edge_mc(uvbuf+16,
ptr_cr, s->uvlinesize, ptr_cr, s->uvlinesize,
9, 9+field_based, 9, 9+field_based,
uvsrc_x, uvsrc_y<<field_based, uvsrc_x, uvsrc_y<<field_based,
@ -498,17 +498,17 @@ static inline void qpel_motion(MpegEncContext *s,
if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&3) - 16, 0) if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&3) - 16, 0)
|| (unsigned)src_y > FFMAX( v_edge_pos - (motion_y&3) - h , 0)){ || (unsigned)src_y > FFMAX( v_edge_pos - (motion_y&3) - h , 0)){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize,
17, 17+field_based, src_x, src_y<<field_based, 17, 17+field_based, src_x, src_y<<field_based,
s->h_edge_pos, s->v_edge_pos); s->h_edge_pos, s->v_edge_pos);
ptr_y= s->edge_emu_buffer; ptr_y= s->edge_emu_buffer;
if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
uint8_t *uvbuf= s->edge_emu_buffer + 18*s->linesize; uint8_t *uvbuf= s->edge_emu_buffer + 18*s->linesize;
s->dsp.emulated_edge_mc(uvbuf, ptr_cb, s->uvlinesize, s->vdsp.emulated_edge_mc(uvbuf, ptr_cb, s->uvlinesize,
9, 9 + field_based, 9, 9 + field_based,
uvsrc_x, uvsrc_y<<field_based, uvsrc_x, uvsrc_y<<field_based,
s->h_edge_pos>>1, s->v_edge_pos>>1); s->h_edge_pos>>1, s->v_edge_pos>>1);
s->dsp.emulated_edge_mc(uvbuf + 16, ptr_cr, s->uvlinesize, s->vdsp.emulated_edge_mc(uvbuf + 16, ptr_cr, s->uvlinesize,
9, 9 + field_based, 9, 9 + field_based,
uvsrc_x, uvsrc_y<<field_based, uvsrc_x, uvsrc_y<<field_based,
s->h_edge_pos>>1, s->v_edge_pos>>1); s->h_edge_pos>>1, s->v_edge_pos>>1);
@ -577,7 +577,7 @@ static void chroma_4mv_motion(MpegEncContext *s,
if(s->flags&CODEC_FLAG_EMU_EDGE){ if(s->flags&CODEC_FLAG_EMU_EDGE){
if( (unsigned)src_x > FFMAX((s->h_edge_pos>>1) - (dxy &1) - 8, 0) if( (unsigned)src_x > FFMAX((s->h_edge_pos>>1) - (dxy &1) - 8, 0)
|| (unsigned)src_y > FFMAX((s->v_edge_pos>>1) - (dxy>>1) - 8, 0)){ || (unsigned)src_y > FFMAX((s->v_edge_pos>>1) - (dxy>>1) - 8, 0)){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize,
9, 9, src_x, src_y, 9, 9, src_x, src_y,
s->h_edge_pos>>1, s->v_edge_pos>>1); s->h_edge_pos>>1, s->v_edge_pos>>1);
ptr= s->edge_emu_buffer; ptr= s->edge_emu_buffer;
@ -588,7 +588,7 @@ static void chroma_4mv_motion(MpegEncContext *s,
ptr = ref_picture[2] + offset; ptr = ref_picture[2] + offset;
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize,
9, 9, src_x, src_y, 9, 9, src_x, src_y,
s->h_edge_pos>>1, s->v_edge_pos>>1); s->h_edge_pos>>1, s->v_edge_pos>>1);
ptr= s->edge_emu_buffer; ptr= s->edge_emu_buffer;
@ -603,9 +603,9 @@ static inline void prefetch_motion(MpegEncContext *s, uint8_t **pix, int dir){
const int mx= (s->mv[dir][0][0]>>shift) + 16*s->mb_x + 8; const int mx= (s->mv[dir][0][0]>>shift) + 16*s->mb_x + 8;
const int my= (s->mv[dir][0][1]>>shift) + 16*s->mb_y; const int my= (s->mv[dir][0][1]>>shift) + 16*s->mb_y;
int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64; int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64;
s->dsp.prefetch(pix[0]+off, s->linesize, 4); s->vdsp.prefetch(pix[0]+off, s->linesize, 4);
off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64; off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
s->dsp.prefetch(pix[1]+off, pix[2]-pix[1], 2); s->vdsp.prefetch(pix[1]+off, pix[2]-pix[1], 2);
} }
/** /**
@ -757,7 +757,7 @@ static av_always_inline void MPV_motion_internal(MpegEncContext *s,
if(s->flags&CODEC_FLAG_EMU_EDGE){ if(s->flags&CODEC_FLAG_EMU_EDGE){
if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&3) - 8, 0) if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&3) - 8, 0)
|| (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y&3) - 8, 0)){ || (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y&3) - 8, 0)){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr,
s->linesize, 9, 9, s->linesize, 9, 9,
src_x, src_y, src_x, src_y,
s->h_edge_pos, s->v_edge_pos); s->h_edge_pos, s->v_edge_pos);

View File

@ -1,4 +1,5 @@
OBJS += ppc/dsputil_ppc.o \ OBJS += ppc/dsputil_ppc.o \
ppc/videodsp_ppc.o \
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o

View File

@ -137,21 +137,11 @@ static long check_dcbzl_effect(void)
} }
#endif #endif
static void prefetch_ppc(void *mem, int stride, int h)
{
register const uint8_t *p = mem;
do {
__asm__ volatile ("dcbt 0,%0" : : "r" (p));
p+= stride;
} while(--h);
}
void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{ {
const int high_bit_depth = avctx->bits_per_raw_sample > 8; const int high_bit_depth = avctx->bits_per_raw_sample > 8;
// Common optimizations whether AltiVec is available or not // Common optimizations whether AltiVec is available or not
c->prefetch = prefetch_ppc;
if (!high_bit_depth) { if (!high_bit_depth) {
switch (check_dcbzl_effect()) { switch (check_dcbzl_effect()) {
case 32: case 32:

View File

@ -0,0 +1,35 @@
/*
* Copyright (c) 2003-2004 Romain Dolbeau
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/videodsp.h"
/**
 * Issue a PowerPC "dcbt" cache hint for the address at the start of each of
 * h consecutive lines.
 *
 * @param mem    address of the first line to touch
 * @param stride distance in bytes between the starts of two consecutive lines
 * @param h      number of lines to touch; nothing is done when h <= 0
 */
static void prefetch_ppc(uint8_t *mem, ptrdiff_t stride, int h)
{
    const uint8_t *p = mem;

    /* A do { } while (--h) loop would run away (and eventually overflow h)
     * when called with h <= 0, so the count is tested before every touch. */
    for (; h > 0; h--) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p += stride;
    }
}
/**
 * Install the PowerPC routines into a VideoDSPContext.
 *
 * Only the prefetch pointer is replaced; bpc is not used by this function.
 */
void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc)
{
ctx->prefetch = prefetch_ppc;
}

View File

@ -725,12 +725,12 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
uint8_t *uvbuf = s->edge_emu_buffer + 22 * s->linesize; uint8_t *uvbuf = s->edge_emu_buffer + 22 * s->linesize;
srcY -= 2 + 2*s->linesize; srcY -= 2 + 2*s->linesize;
s->dsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, (width<<3)+6, (height<<3)+6, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, (width<<3)+6, (height<<3)+6,
src_x - 2, src_y - 2, s->h_edge_pos, s->v_edge_pos); src_x - 2, src_y - 2, s->h_edge_pos, s->v_edge_pos);
srcY = s->edge_emu_buffer + 2 + 2*s->linesize; srcY = s->edge_emu_buffer + 2 + 2*s->linesize;
s->dsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, (width<<2)+1, (height<<2)+1, s->vdsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, (width<<2)+1, (height<<2)+1,
uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1); uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
s->dsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, (width<<2)+1, (height<<2)+1, s->vdsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, (width<<2)+1, (height<<2)+1,
uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1); uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
srcU = uvbuf; srcU = uvbuf;
srcV = uvbuf + 16; srcV = uvbuf + 16;

View File

@ -293,9 +293,9 @@ static inline void svq3_mc_dir_part(MpegEncContext *s,
src = pic->f.data[0] + mx + my * s->linesize; src = pic->f.data[0] + mx + my * s->linesize;
if (emu) { if (emu) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src, s->linesize,
width + 1, height + 1, width + 1, height + 1,
mx, my, s->h_edge_pos, s->v_edge_pos); mx, my, s->h_edge_pos, s->v_edge_pos);
src = s->edge_emu_buffer; src = s->edge_emu_buffer;
} }
if (thirdpel) if (thirdpel)
@ -319,10 +319,10 @@ static inline void svq3_mc_dir_part(MpegEncContext *s,
src = pic->f.data[i] + mx + my * s->uvlinesize; src = pic->f.data[i] + mx + my * s->uvlinesize;
if (emu) { if (emu) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src, s->uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src, s->uvlinesize,
width + 1, height + 1, width + 1, height + 1,
mx, my, (s->h_edge_pos >> 1), mx, my, (s->h_edge_pos >> 1),
s->v_edge_pos >> 1); s->v_edge_pos >> 1);
src = s->edge_emu_buffer; src = s->edge_emu_buffer;
} }
if (thirdpel) if (thirdpel)

View File

@ -434,15 +434,15 @@ static void vc1_mc_1mv(VC1Context *v, int dir)
uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize; uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize;
srcY -= s->mspel * (1 + s->linesize); srcY -= s->mspel * (1 + s->linesize);
s->dsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize,
17 + s->mspel * 2, 17 + s->mspel * 2, 17 + s->mspel * 2, 17 + s->mspel * 2,
src_x - s->mspel, src_y - s->mspel, src_x - s->mspel, src_y - s->mspel,
s->h_edge_pos, v_edge_pos); s->h_edge_pos, v_edge_pos);
srcY = s->edge_emu_buffer; srcY = s->edge_emu_buffer;
s->dsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, 8 + 1, 8 + 1, s->vdsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, 8 + 1, 8 + 1,
uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
s->dsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 8 + 1, 8 + 1, s->vdsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 8 + 1, 8 + 1,
uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
srcU = uvbuf; srcU = uvbuf;
srcV = uvbuf + 16; srcV = uvbuf + 16;
/* if we deal with range reduction we need to scale source blocks */ /* if we deal with range reduction we need to scale source blocks */
@ -667,10 +667,10 @@ static void vc1_mc_4mv_luma(VC1Context *v, int n, int dir)
|| (unsigned)(src_y - (s->mspel << fieldmv)) > v_edge_pos - (my & 3) - ((8 + s->mspel * 2) << fieldmv)) { || (unsigned)(src_y - (s->mspel << fieldmv)) > v_edge_pos - (my & 3) - ((8 + s->mspel * 2) << fieldmv)) {
srcY -= s->mspel * (1 + (s->linesize << fieldmv)); srcY -= s->mspel * (1 + (s->linesize << fieldmv));
/* check emulate edge stride and offset */ /* check emulate edge stride and offset */
s->dsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize,
9 + s->mspel * 2, (9 + s->mspel * 2) << fieldmv, 9 + s->mspel * 2, (9 + s->mspel * 2) << fieldmv,
src_x - s->mspel, src_y - (s->mspel << fieldmv), src_x - s->mspel, src_y - (s->mspel << fieldmv),
s->h_edge_pos, v_edge_pos); s->h_edge_pos, v_edge_pos);
srcY = s->edge_emu_buffer; srcY = s->edge_emu_buffer;
/* if we deal with range reduction we need to scale source blocks */ /* if we deal with range reduction we need to scale source blocks */
if (v->rangeredfrm) { if (v->rangeredfrm) {
@ -868,12 +868,12 @@ static void vc1_mc_4mv_chroma(VC1Context *v, int dir)
|| s->h_edge_pos < 18 || v_edge_pos < 18 || s->h_edge_pos < 18 || v_edge_pos < 18
|| (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 9 || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 9
|| (unsigned)uvsrc_y > (v_edge_pos >> 1) - 9) { || (unsigned)uvsrc_y > (v_edge_pos >> 1) - 9) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer , srcU, s->uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer , srcU, s->uvlinesize,
8 + 1, 8 + 1, uvsrc_x, uvsrc_y, 8 + 1, 8 + 1, uvsrc_x, uvsrc_y,
s->h_edge_pos >> 1, v_edge_pos >> 1); s->h_edge_pos >> 1, v_edge_pos >> 1);
s->dsp.emulated_edge_mc(s->edge_emu_buffer + 16, srcV, s->uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer + 16, srcV, s->uvlinesize,
8 + 1, 8 + 1, uvsrc_x, uvsrc_y, 8 + 1, 8 + 1, uvsrc_x, uvsrc_y,
s->h_edge_pos >> 1, v_edge_pos >> 1); s->h_edge_pos >> 1, v_edge_pos >> 1);
srcU = s->edge_emu_buffer; srcU = s->edge_emu_buffer;
srcV = s->edge_emu_buffer + 16; srcV = s->edge_emu_buffer + 16;
@ -973,12 +973,12 @@ static void vc1_mc_4mv_chroma4(VC1Context *v)
|| s->h_edge_pos < 10 || v_edge_pos < (5 << fieldmv) || s->h_edge_pos < 10 || v_edge_pos < (5 << fieldmv)
|| (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 5 || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 5
|| (unsigned)uvsrc_y > v_edge_pos - (5 << fieldmv)) { || (unsigned)uvsrc_y > v_edge_pos - (5 << fieldmv)) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, srcU, s->uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcU, s->uvlinesize,
5, (5 << fieldmv), uvsrc_x, uvsrc_y, 5, (5 << fieldmv), uvsrc_x, uvsrc_y,
s->h_edge_pos >> 1, v_edge_pos); s->h_edge_pos >> 1, v_edge_pos);
s->dsp.emulated_edge_mc(s->edge_emu_buffer + 16, srcV, s->uvlinesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer + 16, srcV, s->uvlinesize,
5, (5 << fieldmv), uvsrc_x, uvsrc_y, 5, (5 << fieldmv), uvsrc_x, uvsrc_y,
s->h_edge_pos >> 1, v_edge_pos); s->h_edge_pos >> 1, v_edge_pos);
srcU = s->edge_emu_buffer; srcU = s->edge_emu_buffer;
srcV = s->edge_emu_buffer + 16; srcV = s->edge_emu_buffer + 16;
@ -1888,15 +1888,15 @@ static void vc1_interp_mc(VC1Context *v)
uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize; uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize;
srcY -= s->mspel * (1 + s->linesize); srcY -= s->mspel * (1 + s->linesize);
s->dsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize,
17 + s->mspel * 2, 17 + s->mspel * 2, 17 + s->mspel * 2, 17 + s->mspel * 2,
src_x - s->mspel, src_y - s->mspel, src_x - s->mspel, src_y - s->mspel,
s->h_edge_pos, v_edge_pos); s->h_edge_pos, v_edge_pos);
srcY = s->edge_emu_buffer; srcY = s->edge_emu_buffer;
s->dsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, 8 + 1, 8 + 1, s->vdsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, 8 + 1, 8 + 1,
uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
s->dsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 8 + 1, 8 + 1, s->vdsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 8 + 1, 8 + 1,
uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
srcU = uvbuf; srcU = uvbuf;
srcV = uvbuf + 16; srcV = uvbuf + 16;
/* if we deal with range reduction we need to scale source blocks */ /* if we deal with range reduction we need to scale source blocks */

51
libavcodec/videodsp.c Normal file
View File

@ -0,0 +1,51 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "videodsp.h"
#define BIT_DEPTH 8
#include "videodsp_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 16
#include "videodsp_template.c"
#undef BIT_DEPTH
/* No-op with the signature of VideoDSPContext.prefetch; ff_videodsp_init()
 * installs it as the default prefetch routine. */
static void just_return(uint8_t *buf, ptrdiff_t stride, int h)
{
}
/**
 * Fill a VideoDSPContext with the generic C routines, then give the
 * architecture-specific initializers a chance to replace them.
 *
 * @param ctx context to set up
 * @param bpc selects the edge emulation variant: the 8-bit template
 *            instance when bpc <= 8, the 16-bit instance otherwise; it is
 *            also forwarded to the architecture initializers
 */
void ff_videodsp_init(VideoDSPContext *ctx, int bpc)
{
    /* portable defaults */
    ctx->emulated_edge_mc = bpc > 8 ? ff_emulated_edge_mc_16
                                    : ff_emulated_edge_mc_8;
    ctx->prefetch         = just_return;

    /* each hook runs only when its ARCH_* flag is non-zero and may
     * overwrite the pointers set above */
    if (ARCH_ARM) {
        ff_videodsp_init_arm(ctx, bpc);
    }
    if (ARCH_PPC) {
        ff_videodsp_init_ppc(ctx, bpc);
    }
    if (ARCH_X86) {
        ff_videodsp_init_x86(ctx, bpc);
    }
}

71
libavcodec/videodsp.h Normal file
View File

@ -0,0 +1,71 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* Core video DSP helper functions
*/
#ifndef AVCODEC_VIDEODSP_H
#define AVCODEC_VIDEODSP_H
#include <stddef.h>
#include <stdint.h>
/**
 * Table of function pointers for the core video DSP helpers.
 * The pointers are filled in by ff_videodsp_init().
 */
typedef struct VideoDSPContext {
/**
 * Copy a rectangular area of samples to a temporary buffer and replicate
 * the border samples.
 *
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples
 * in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the
 * source buffer
 * @param src_y y coordinate of the top left sample of the block in the
 * source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void (*emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
ptrdiff_t linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
/**
 * Prefetch memory into cache (if supported by hardware).
 *
 * @param buf pointer to buffer to prefetch memory from
 * @param stride distance between two lines of buf (in bytes)
 * @param h number of lines to prefetch
 */
void (*prefetch)(uint8_t *buf, ptrdiff_t stride, int h);
} VideoDSPContext;
void ff_videodsp_init(VideoDSPContext *ctx, int bpc);
/* for internal use only (i.e. called by ff_videodsp_init()) */
void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc);
void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc);
void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc);
#endif /* AVCODEC_VIDEODSP_H */

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2002-2004 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "bit_depth_template.c"
/**
 * Copy a block_w x block_h block whose top-left sample is (src_x, src_y) of
 * a w x h source plane into buf, replicating the nearest plane border sample
 * for every block position that lies outside the plane.
 *
 * src points at sample (src_x, src_y), which may itself be outside the
 * plane; buf and src share the same linesize (in bytes). Coordinates and
 * sizes are in samples of type pixel.
 *
 * Nothing is written when the plane is empty (w <= 0 or h <= 0), because
 * there is no border sample to replicate.
 */
static void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
                                      ptrdiff_t linesize,
                                      int block_w, int block_h,
                                      int src_x, int src_y, int w, int h)
{
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* With an empty plane the clamping below can no longer produce
     * start < end, which would trip the asserts (or, without them, read
     * samples outside the plane). */
    if (w <= 0 || h <= 0)
        return;

    /* Block entirely below or above the plane: shift it (and src with it)
     * so that exactly one row of the block overlaps the plane. */
    if (src_y >= h) {
        src  += (h - 1 - src_y) * linesize;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src  += (1 - block_h - src_y) * linesize;
        src_y = 1 - block_h;
    }
    /* Same for a block entirely right or left of the plane. */
    if (src_x >= w) {
        src  += (w - 1 - src_x) * sizeof(pixel);
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += (1 - block_w - src_x) * sizeof(pixel);
        src_x = 1 - block_w;
    }

    /* [start, end) is the part of the block that lies inside the plane,
     * in block coordinates. */
    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h-src_y);
    end_x   = FFMIN(block_w, w-src_x);
    assert(start_y < end_y && block_h);
    assert(start_x < end_x && block_w);

    /* From here on w is the number of in-plane samples per row. */
    w    = end_x - start_x;
    src += start_y * linesize + start_x * sizeof(pixel);
    buf += start_x * sizeof(pixel);

    // top
    for (y = 0; y < start_y; y++) {
        memcpy(buf, src, w * sizeof(pixel));
        buf += linesize;
    }

    // copy existing part
    for (; y < end_y; y++) {
        memcpy(buf, src, w * sizeof(pixel));
        src += linesize;
        buf += linesize;
    }

    // bottom
    src -= linesize; /* step back to the last in-plane row */
    for (; y < block_h; y++) {
        memcpy(buf, src, w * sizeof(pixel));
        buf += linesize;
    }

    /* Rewind buf to the top-left corner of the block, then extend every
     * row sideways from the samples copied above. */
    buf -= block_h * linesize + start_x * sizeof(pixel);
    while (block_h--) {
        pixel *bufp = (pixel *) buf;

        // left
        for(x = 0; x < start_x; x++) {
            bufp[x] = bufp[start_x];
        }

        // right
        for (x = end_x; x < block_w; x++) {
            bufp[x] = bufp[end_x - 1];
        }
        buf += linesize;
    }
}

View File

@ -38,7 +38,7 @@
#include "internal.h" #include "internal.h"
#include "dsputil.h" #include "dsputil.h"
#include "get_bits.h" #include "get_bits.h"
#include "videodsp.h"
#include "vp3data.h" #include "vp3data.h"
#include "vp3dsp.h" #include "vp3dsp.h"
#include "xiph.h" #include "xiph.h"
@ -136,6 +136,7 @@ typedef struct Vp3DecodeContext {
AVFrame current_frame; AVFrame current_frame;
int keyframe; int keyframe;
DSPContext dsp; DSPContext dsp;
VideoDSPContext vdsp;
VP3DSPContext vp3dsp; VP3DSPContext vp3dsp;
int flipped_image; int flipped_image;
int last_slice_end; int last_slice_end;
@ -1543,7 +1544,7 @@ static void render_slice(Vp3DecodeContext *s, int slice)
uint8_t *temp= s->edge_emu_buffer; uint8_t *temp= s->edge_emu_buffer;
if(stride<0) temp -= 8*stride; if(stride<0) temp -= 8*stride;
s->dsp.emulated_edge_mc(temp, motion_source, stride, 9, 9, src_x, src_y, plane_width, plane_height); s->vdsp.emulated_edge_mc(temp, motion_source, stride, 9, 9, src_x, src_y, plane_width, plane_height);
motion_source= temp; motion_source= temp;
} }
} }
@ -1677,6 +1678,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
avctx->pix_fmt = AV_PIX_FMT_YUV420P; avctx->pix_fmt = AV_PIX_FMT_YUV420P;
avctx->chroma_sample_location = AVCHROMA_LOC_CENTER; avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
ff_dsputil_init(&s->dsp, avctx); ff_dsputil_init(&s->dsp, avctx);
ff_videodsp_init(&s->vdsp, 8);
ff_vp3dsp_init(&s->vp3dsp, avctx->flags); ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
ff_init_scantable_permutation(s->dsp.idct_permutation, s->vp3dsp.idct_perm); ff_init_scantable_permutation(s->dsp.idct_permutation, s->vp3dsp.idct_perm);

View File

@ -340,7 +340,7 @@ static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src,
if (x<0 || x+12>=s->plane_width[plane] || if (x<0 || x+12>=s->plane_width[plane] ||
y<0 || y+12>=s->plane_height[plane]) { y<0 || y+12>=s->plane_height[plane]) {
s->dsp.emulated_edge_mc(s->edge_emu_buffer, s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
src + s->block_offset[b] + (dy-2)*stride + (dx-2), src + s->block_offset[b] + (dy-2)*stride + (dx-2),
stride, 12, 12, x, y, stride, 12, 12, x, y,
s->plane_width[plane], s->plane_width[plane],
@ -674,6 +674,7 @@ av_cold void ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
avctx->pix_fmt = has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P; avctx->pix_fmt = has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
ff_dsputil_init(&s->dsp, avctx); ff_dsputil_init(&s->dsp, avctx);
ff_videodsp_init(&s->vdsp, 8);
ff_vp3dsp_init(&s->vp3dsp, avctx->flags); ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id); ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id);
ff_init_scantable_permutation(s->dsp.idct_permutation, s->vp3dsp.idct_perm); ff_init_scantable_permutation(s->dsp.idct_permutation, s->vp3dsp.idct_perm);

View File

@ -30,6 +30,7 @@
#include "dsputil.h" #include "dsputil.h"
#include "get_bits.h" #include "get_bits.h"
#include "bytestream.h" #include "bytestream.h"
#include "videodsp.h"
#include "vp3dsp.h" #include "vp3dsp.h"
#include "vp56dsp.h" #include "vp56dsp.h"
@ -94,6 +95,7 @@ typedef struct VP56Model {
struct vp56_context { struct vp56_context {
AVCodecContext *avctx; AVCodecContext *avctx;
DSPContext dsp; DSPContext dsp;
VideoDSPContext vdsp;
VP3DSPContext vp3dsp; VP3DSPContext vp3dsp;
VP56DSPContext vp56dsp; VP56DSPContext vp56dsp;
ScanTable scantable; ScanTable scantable;

View File

@ -1198,9 +1198,9 @@ void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
src += y_off * linesize + x_off; src += y_off * linesize + x_off;
if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] || if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] ||
y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) { y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
s->dsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize, s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my], block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
x_off - mx_idx, y_off - my_idx, width, height); x_off - mx_idx, y_off - my_idx, width, height);
src = td->edge_emu_buffer + mx_idx + linesize * my_idx; src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
} }
mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my); mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
@ -1248,15 +1248,15 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst
ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0); ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] || if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] ||
y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) { y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
s->dsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize, s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my], block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
x_off - mx_idx, y_off - my_idx, width, height); x_off - mx_idx, y_off - my_idx, width, height);
src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx; src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my); mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
s->dsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize, s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my], block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
x_off - mx_idx, y_off - my_idx, width, height); x_off - mx_idx, y_off - my_idx, width, height);
src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx; src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my); mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
} else { } else {
@ -1315,9 +1315,9 @@ static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, i
/* For threading, a ff_thread_await_progress here might be useful, but /* For threading, a ff_thread_await_progress here might be useful, but
* it actually slows down the decoder. Since a bad prefetch doesn't * it actually slows down the decoder. Since a bad prefetch doesn't
* generate bad decoder output, we don't run it here. */ * generate bad decoder output, we don't run it here. */
s->dsp.prefetch(src[0]+off, s->linesize, 4); s->vdsp.prefetch(src[0]+off, s->linesize, 4);
off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64; off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
s->dsp.prefetch(src[1]+off, src[2]-src[1], 2); s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
} }
} }
@ -1716,8 +1716,8 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
} }
} }
s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4); s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2); s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
if (!s->mb_layout) if (!s->mb_layout)
decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy, decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
@ -2020,7 +2020,7 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx)
s->avctx = avctx; s->avctx = avctx;
avctx->pix_fmt = AV_PIX_FMT_YUV420P; avctx->pix_fmt = AV_PIX_FMT_YUV420P;
ff_dsputil_init(&s->dsp, avctx); ff_videodsp_init(&s->vdsp, 8);
ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1); ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
ff_vp8dsp_init(&s->vp8dsp); ff_vp8dsp_init(&s->vp8dsp);

View File

@ -247,7 +247,7 @@ typedef struct VP8Context {
*/ */
int num_coeff_partitions; int num_coeff_partitions;
VP56RangeCoder coeff_partition[8]; VP56RangeCoder coeff_partition[8];
DSPContext dsp; VideoDSPContext vdsp;
VP8DSPContext vp8dsp; VP8DSPContext vp8dsp;
H264PredContext hpc; H264PredContext hpc;
vp8_mc_func put_pixels_tab[3][3][3]; vp8_mc_func put_pixels_tab[3][3][3];

View File

@ -102,7 +102,7 @@ void ff_mspel_motion(MpegEncContext *s,
if(s->flags&CODEC_FLAG_EMU_EDGE){ if(s->flags&CODEC_FLAG_EMU_EDGE){
if(src_x<1 || src_y<1 || src_x + 17 >= s->h_edge_pos if(src_x<1 || src_y<1 || src_x + 17 >= s->h_edge_pos
|| src_y + h+1 >= v_edge_pos){ || src_y + h+1 >= v_edge_pos){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr - 1 - s->linesize, s->linesize, 19, 19, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr - 1 - s->linesize, s->linesize, 19, 19,
src_x-1, src_y-1, s->h_edge_pos, s->v_edge_pos); src_x-1, src_y-1, s->h_edge_pos, s->v_edge_pos);
ptr= s->edge_emu_buffer + 1 + s->linesize; ptr= s->edge_emu_buffer + 1 + s->linesize;
emu=1; emu=1;
@ -143,7 +143,7 @@ void ff_mspel_motion(MpegEncContext *s,
offset = (src_y * uvlinesize) + src_x; offset = (src_y * uvlinesize) + src_x;
ptr = ref_picture[1] + offset; ptr = ref_picture[1] + offset;
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9,
src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
ptr= s->edge_emu_buffer; ptr= s->edge_emu_buffer;
} }
@ -151,7 +151,7 @@ void ff_mspel_motion(MpegEncContext *s,
ptr = ref_picture[2] + offset; ptr = ref_picture[2] + offset;
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9, s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9,
src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
ptr= s->edge_emu_buffer; ptr= s->edge_emu_buffer;
} }

View File

@ -19,6 +19,7 @@ OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \
x86/rv40dsp_init.o x86/rv40dsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o
@ -60,6 +61,7 @@ YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
x86/rv40dsp.o x86/rv40dsp.o
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o

View File

@ -489,577 +489,6 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
%endif %endif
RET RET
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
; x86_reg start_y, x86_reg end_y, x86_reg block_h,
; x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
; if (w > 22) {
; jump to the slow loop functions
; } else {
; jump to the fast loop functions
; }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
mov r4, r4m ; end_y
mov r5, r5m ; block_h
%endif
; start with vertical extend (top/bottom) and body pixel copy
mov w_reg, r7m
sub w_reg, r6m ; w = start_x - end_x
sub r5, r4
%if ARCH_X86_64
sub r4, r3
%else
sub r4, dword r3m
%endif
cmp w_reg, 22
jg .slow_v_extend_loop
%if ARCH_X86_32
mov r2, r2m ; linesize
%endif
sal w_reg, 7 ; w * 128
%ifdef PIC
lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
add w_reg, rax
%else
lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:
; horizontal extend (left/right)
mov w_reg, r6m ; start_x
sub r0, w_reg
%if ARCH_X86_64
mov r3, r0 ; backup of buf+block_h*linesize
mov r5, r8
%else
mov r0m, r0 ; backup of buf+block_h*linesize
mov r5, r5m
%endif
test w_reg, w_reg
jz .right_extend
cmp w_reg, 22
jg .slow_left_extend_loop
mov r1, w_reg
dec w_reg
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
sar w_reg, 1
sal w_reg, 6
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
lea rax, [.emuedge_extend_left_2]
add w_reg, rax
%else
lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
call w_reg
; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
mov r0, r0m
mov r5, r5m
%endif
mov w_reg, r7m ; end_x
mov r1, r8m ; block_w
mov r4, r1
sub r1, w_reg
jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
cmp r1, 22
jg .slow_right_extend_loop
dec r1
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
sar r1, 1
sal r1, 6
%ifdef PIC
lea rax, [.emuedge_extend_right_2]
add r1, rax
%else
lea r1, [.emuedge_extend_right_2+r1]
%endif
call r1
.h_extend_end:
RET
%if ARCH_X86_64
%define vall al
%define valh ah
%define valw ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall bl
%define valh bh
%define valw bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
; - if (%2 & 15 == 8) fills the last 8 bytes into rax
; - else if (%2 & 8) fills 8 bytes into mm0
; - if (%2 & 7 == 4) fills the last 4 bytes into rax
; - else if (%2 & 4) fills 4 bytes into mm0-1
; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax
; (note that we're using r3 for body/bottom because it's a shorter
; opcode, and then the loop fits in 128 bytes)
; - else fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
; - if (%2 & 7 == 4) fills 4 bytes into ebx
; - else if (%2 & 4) fills 4 bytes into mm0-7
; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
; - else fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx 0 ; mmx register idx
%assign %%sxidx 0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx %%sxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%src_off) == 8
mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64
%rep (%2-%%src_off)/8
movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx %%smidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%src_off) == 4
mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
mov valw2, [r1+%%src_off]
%elifidn %1, body
mov valw3, [r1+%%src_off]
%elifidn %1, bottom
mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx 0 ; mmx register idx
%assign %%dxidx 0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx %%dxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%dst_off) == 8
mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64
%rep (%2-%%dst_off)/8
movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
mov [r0+%%dst_off], valw2
%elifidn %1, body
mov [r0+%%dst_off], valw3
%elifidn %1, bottom
mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_y,r4=end_y-start_y,r5=block_h-end_y
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
%if ARCH_X86_64
test r3 , r3 ; if (!start_y)
jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
cmp dword r3m, 0
je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES top, %%n ; write bytes
add r0 , r2 ; dst += linesize
%if ARCH_X86_64
dec r3d
%else ; ARCH_X86_32
dec dword r3m
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
READ_NUM_BYTES body, %%n ; read bytes
WRITE_NUM_BYTES body, %%n ; write bytes
add r0 , r2 ; dst += linesize
add r1 , r2 ; src += linesize
dec r4d
jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
; copy bottom pixels
test r5 , r5 ; if (!block_h)
jz .emuedge_v_extend_end_ %+ %%n ; goto end
sub r1 , r2 ; src -= linesize
READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES bottom, %%n ; write bytes
add r0 , r2 ; dst += linesize
dec r5d
jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
%macro READ_V_PIXEL 2
mov vall, %2
mov valh, vall
%if %1 >= 8
movd mm0, vald
%if cpuflag(mmxext)
pshufw mm0, mm0, 0
%else ; mmx
punpcklwd mm0, mm0
punpckldq mm0, mm0
%endif ; sse
%endif ; %1 >= 8
%endmacro
%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
movd [%2+%%dst_off], mm0
%else ; %1 < 8
mov [%2+%%dst_off] , valw
mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+r1] ; read pixels
WRITE_V_PIXEL %%n, r0 ; write pixels
dec r5
jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r7/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%if ARCH_X86_64
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
dec r8
%else ; ARCH_X86_32
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
dec r5
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
%if %0 == 4
test w_reg, %4
jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
%3 %2, [r1+cnt_reg]
%3 [r0+cnt_reg], %2
add cnt_reg, %4
%if %0 == 5
sub w_reg, %4
test w_reg, %5
jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro
%macro V_COPY_ROW 2
%ifidn %1, bottom
sub r1, linesize
%endif
.%1_copy_loop:
xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
V_COPY_NPX %1, rax , mov, 8
%else ; ARCH_X86_32
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; sse
V_COPY_NPX %1, vald, mov, 4
V_COPY_NPX %1, valw, mov, 2
V_COPY_NPX %1, vall, mov, 1
mov w_reg, cnt_reg
%ifidn %1, body
add r1, linesize
%endif
add r0, linesize
dec %2
jnz .%1_copy_loop
%endmacro
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_y,r4=end_y-start_y,r5=block_h-end_y
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
push r8 ; save old value of block_h
test r3, r3
%define cnt_reg r8
jz .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, r3
%else
cmp dword r3m, 0
%define cnt_reg r2
je .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, dword r3m
%endif
.do_body_copy:
V_COPY_ROW body, r4
%if ARCH_X86_64
pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
test r5, r5
%if ARCH_X86_64
jz .v_extend_end
%else
jz .skip_bottom_extend
%endif
V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
mov r2, r2m
%endif
jmp .v_extend_end
%endmacro
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
mov r4, 8
sub r0, linesize
READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
movq [r0+r4-8], mm0
add r4, 8
cmp r4, w_reg
jle .left_extend_8px_loop
sub r4, 8
cmp r4, w_reg
jge .left_extend_loop_end
.left_extend_2px_loop:
mov [r0+r4], valw
add r4, 2
cmp r4, w_reg
jl .left_extend_2px_loop
.left_extend_loop_end:
dec r5
jnz .slow_left_extend_loop
%if ARCH_X86_32
mov r2, r2m
%endif
jmp .right_extend
%endmacro
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
lea r1, [r4-8]
sub buf_reg, linesize
READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
movq [buf_reg+r1], mm0
sub r1, 8
cmp r1, w_reg
jge .right_extend_8px_loop
add r1, 8
cmp r1, w_reg
je .right_extend_loop_end
.right_extend_2px_loop:
sub r1, 2
mov [buf_reg+r1], valw
cmp r1, w_reg
jg .right_extend_2px_loop
.right_extend_loop_end:
dec bh_reg
jnz .slow_right_extend_loop
jmp .h_extend_end
%endmacro
%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro
emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len) ; int32_t max, unsigned int len)

View File

@ -1635,78 +1635,6 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
avg_pixels16_xy2_mmx(dst, src, stride, 16); avg_pixels16_xy2_mmx(dst, src, stride, 16);
} }
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
/*
 * Signature shared by the assembly edge-emulation cores (cglobal
 * emu_edge_core in the yasm source). Every scalar argument is passed as an
 * x86_reg; the wrapper below hands over plain ints that are converted at the
 * call.
 *
 * buf/src are the destination and source pointers as prepared by the C
 * wrapper; start_y/end_y and start_x/end_x bound the rows and columns that
 * are copied from src, block_w/block_h give the full destination block size.
 *
 * Note: the assembly only instantiates the mmx variant when ARCH_X86_32 is
 * set, so ff_emu_edge_core_mmx must not be referenced on other targets.
 */
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
x86_reg linesize, x86_reg start_y,
x86_reg end_y, x86_reg block_h,
x86_reg start_x, x86_reg end_x,
x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
/*
 * Common front end for the assembly edge-emulation cores.
 *
 * Clamps the block position so that at least one row and one column of the
 * block overlaps the w x h source area, derives the [start, end) row/column
 * ranges that can be copied directly from src, and forwards everything to
 * core_fn, which performs the copy and the replication of the border pixels.
 *
 * buf      destination buffer for the block_w x block_h block
 * src      pointer to the sample at (src_x, src_y) in the source plane
 * linesize distance in bytes between rows, used for both src and buf
 * src_x/y  position of the block's top-left sample (may be outside the plane)
 * w, h     dimensions of the valid source area
 * core_fn  assembly routine doing the actual work
 */
static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              int linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    /* Vertical clamp: keep src_y within [1 - block_h, h - 1]. */
    const int clamped_y = src_y >= h        ? h - 1
                        : src_y <= -block_h ? 1 - block_h
                        : src_y;
    /* Horizontal clamp: keep src_x within [1 - block_w, w - 1]. */
    const int clamped_x = src_x >= w        ? w - 1
                        : src_x <= -block_w ? 1 - block_w
                        : src_x;
    /* Rows src has to move so that it matches the clamped position;
     * applied further down, together with start_y. */
    const int row_shift = clamped_y - src_y;
    int start_y, start_x, end_y, end_x;

    /* The horizontal correction is applied to src right away. */
    src  += clamped_x - src_x;
    src_x = clamped_x;
    src_y = clamped_y;

    /* Part of the block that lies inside the source area. */
    start_y = FFMAX(0, -src_y);
    end_y   = FFMIN(block_h, h-src_y);
    start_x = FFMAX(0, -src_x);
    end_x   = FFMIN(block_w, w-src_x);

    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    /* Point src at the first in-bounds sample and buf at the first column
     * that receives copied data; the core fills in everything else. */
    src += (row_shift + start_y) * linesize + start_x;
    buf += start_x;

    core_fn(buf, src, linesize,
            start_y, end_y, block_h,
            start_x, end_x, block_w);
}
#if ARCH_X86_32
/*
 * MMX flavour of the edge emulation: runs the shared C front end with the
 * MMX assembly core plugged in.
 */
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize,
                     block_w, block_h,
                     src_x, src_y,
                     w, h,
                     ff_emu_edge_core_mmx);
}
#endif
/*
 * SSE flavour of the edge emulation: runs the shared C front end with the
 * SSE assembly core plugged in.
 */
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize,
                     block_w, block_h,
                     src_x, src_y,
                     w, h,
                     ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM
static void gmc_mmx(uint8_t *dst, uint8_t *src, static void gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int dxx, int dxy, int dyx, int dyy,
@ -1822,21 +1750,6 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
src += 4 - h * stride; src += 4 - h * stride;
} }
} }
/*
 * PREFETCH(name, op) defines a static function `name(mem, stride, h)` that
 * issues the instruction `op` on the first byte of h consecutive rows,
 * starting at mem and advancing by stride bytes per row.
 *
 * The loop is a do/while that decrements h before testing it, so the body
 * always runs at least once and only stops when --h reaches exactly zero;
 * callers have to pass h >= 1.
 *
 * Two variants are generated: prefetch_mmxext (prefetcht0) and
 * prefetch_3dnow (prefetch). The helper macro is undefined again afterwards.
 */
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h) \
{ \
const uint8_t *p = mem; \
do { \
__asm__ volatile (#op" %0" :: "m"(*p)); \
p += stride; \
} while (--h); \
}
PREFETCH(prefetch_mmxext, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
#endif /* HAVE_INLINE_ASM */ #endif /* HAVE_INLINE_ASM */
#include "h264_qpel.c" #include "h264_qpel.c"
@ -2239,11 +2152,6 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
#endif /* HAVE_INLINE_ASM */ #endif /* HAVE_INLINE_ASM */
#if HAVE_YASM #if HAVE_YASM
#if ARCH_X86_32
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif
if (!high_bit_depth && CONFIG_H264CHROMA) { if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx; c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx; c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@ -2261,8 +2169,6 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
const int high_bit_depth = bit_depth > 8; const int high_bit_depth = bit_depth > 8;
#if HAVE_INLINE_ASM #if HAVE_INLINE_ASM
c->prefetch = prefetch_mmxext;
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
@ -2371,8 +2277,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
const int high_bit_depth = avctx->bits_per_raw_sample > 8; const int high_bit_depth = avctx->bits_per_raw_sample > 8;
#if HAVE_INLINE_ASM #if HAVE_INLINE_ASM
c->prefetch = prefetch_3dnow;
if (!high_bit_depth) { if (!high_bit_depth) {
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
@ -2452,9 +2356,6 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
c->scalarproduct_float = ff_scalarproduct_float_sse; c->scalarproduct_float = ff_scalarproduct_float_sse;
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse; c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
#endif /* HAVE_YASM */ #endif /* HAVE_YASM */
} }

612
libavcodec/x86/videodsp.asm Normal file
View File

@ -0,0 +1,612 @@
;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
; x86_reg start_y, x86_reg end_y, x86_reg block_h,
; x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
; if (w > 22) {
; jump to the slow loop functions
; } else {
; jump to the fast loop functions
; }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
; EMU_EDGE_FUNC emits the emu_edge_core entry point for the current
; instruction set. It first runs the vertical pass (top extend, body copy,
; bottom extend) and then the left and right horizontal extends. For each
; pass a width of at most 22 dispatches, via a computed call, to one of the
; fixed-width routines generated by VERTICAL_EXTEND / LEFT_EXTEND /
; RIGHT_EXTEND; larger widths jump to the SLOW_* loops.
; Argument layout (see the prototype above): r0=buf, r1=src, r2=linesize,
; r3=start_y, r4=end_y, r5=block_h, r6m=start_x, r7m=end_x, r8m=block_w.
; On x86-32 only buf and src are loaded into registers by cglobal; the rest
; is fetched from the stack as needed.
; The val* register aliases defined after RET are used by the macros that
; are expanded after this one.
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
mov r4, r4m ; end_y
mov r5, r5m ; block_h
%endif
; start with vertical extend (top/bottom) and body pixel copy
mov w_reg, r7m
sub w_reg, r6m ; w = end_x - start_x
sub r5, r4 ; r5 = block_h - end_y (rows to extend below the body)
%if ARCH_X86_64
sub r4, r3 ; r4 = end_y - start_y (body rows)
%else
sub r4, dword r3m ; r4 = end_y - start_y (body rows)
%endif
cmp w_reg, 22
jg .slow_v_extend_loop
%if ARCH_X86_32
mov r2, r2m ; linesize
%endif
sal w_reg, 7 ; w * 128
; target = .emuedge_v_extend_1 + (w - 1) * (distance between two routines);
; the shift by 7 assumes that distance is 128 (VERTICAL_EXTEND uses ALIGN 128
; and each routine is expected to fit in one such slot)
%ifdef PIC
lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
add w_reg, rax
%else
lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:
; horizontal extend (left/right)
mov w_reg, r6m ; start_x
sub r0, w_reg ; step back to column 0 of the row after the last one written
%if ARCH_X86_64
mov r3, r0 ; backup of buf+block_h*linesize
mov r5, r8 ; restore block_h saved at entry
%else
mov r0m, r0 ; backup of buf+block_h*linesize
mov r5, r5m ; reload block_h
%endif
test w_reg, w_reg
jz .right_extend ; start_x == 0: nothing to extend on the left
cmp w_reg, 22
jg .slow_left_extend_loop
mov r1, w_reg
dec w_reg
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
; ((start_x - 1) >> 1) * 64 selects the routine for start_x rounded up to an
; even width (LEFT_EXTEND emits widths 2, 4, ..., 22 with ALIGN 64)
sar w_reg, 1
sal w_reg, 6
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
lea rax, [.emuedge_extend_left_2]
add w_reg, rax
%else
lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
call w_reg
; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
mov r0, r0m
mov r5, r5m
%endif
mov w_reg, r7m ; end_x
mov r1, r8m ; block_w
mov r4, r1
sub r1, w_reg ; r1 = block_w - end_x (columns to extend on the right)
jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
cmp r1, 22
jg .slow_right_extend_loop
dec r1
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
; same even-width dispatch as for the left side, 64 bytes per routine
sar r1, 1
sal r1, 6
%ifdef PIC
lea rax, [.emuedge_extend_right_2]
add r1, rax
%else
lea r1, [.emuedge_extend_right_2+r1]
%endif
call r1
.h_extend_end:
RET
; scratch-register aliases used by the copy/extend macros expanded below
%if ARCH_X86_64
%define vall al
%define valh ah
%define valw ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall bl
%define valh bh
%define valw bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
; NOTE(review): presumably compensates rNm stack accesses inside the
; call'ed helper routines for the pushed return address - confirm against
; x86inc; RIGHT_EXTEND sets it back to 0x10
%define stack_offset 0x14
%endif
%endmacro
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
; - if (%2 & 15 == 8) fills the last 8 bytes into rax
; - else if (%2 & 8) fills 8 bytes into mm0
; - if (%2 & 7 == 4) fills the last 4 bytes into rax
; - else if (%2 & 4) fills 4 bytes into mm0-1
; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax
; (note that we're using r3 for body/bottom because it's a shorter
; opcode, and then the loop fits in 128 bytes)
; - else fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
; - if (%2 & 7 == 4) fills 4 bytes into ebx
; - else if (%2 & 4) fills 4 bytes into mm0-7
; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
; - else fills remaining bytes into ebx
; writing data out is in the same way
; READ_NUM_BYTES %1, %2: load %2 consecutive bytes starting at [r1] into
; registers, using the widest moves available (see the register assignment
; described above). %1 is one of top/body/bottom and only matters for a
; 3-byte tail, where it selects which 16-bit scratch register (valw2, valw3
; or valw4) holds the first two of those bytes.
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx 0 ; mmx register idx
%assign %%sxidx 0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx %%sxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%src_off) == 8
mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64
%rep (%2-%%src_off)/8
movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx %%smidx+1
%endrep ; (%2-%%src_off)/8
; exactly 4 bytes left: use the general-purpose register and stop (the offset
; is not advanced, so none of the 1/2/3-byte cases below match afterwards)
%if (%2-%%src_off) == 4
mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
mov valw2, [r1+%%src_off]
%elifidn %1, body
mov valw3, [r1+%%src_off]
%elifidn %1, bottom
mov valw4, [r1+%%src_off]
%endif ; %1 == top/body/bottom
mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
; WRITE_NUM_BYTES %1, %2: store %2 bytes to [r0] from the registers that a
; READ_NUM_BYTES with the same arguments filled. The sequence of cases mirrors
; the read macro one to one, so both must be kept in sync.
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx 0 ; mmx register idx
%assign %%dxidx 0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx %%dxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%dst_off) == 8
mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64
%rep (%2-%%dst_off)/8
movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx %%dmidx+1
%endrep ; (%2-%%dst_off)/8
; exactly 4 bytes left: stored from the general-purpose register, offset is
; deliberately not advanced (no 1/2/3-byte case matches afterwards)
%if (%2-%%dst_off) == 4
mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
mov [r0+%%dst_off], valw2
%elifidn %1, body
mov [r0+%%dst_off], valw3
%elifidn %1, bottom
mov [r0+%%dst_off], valw4
%endif ; %1 == top/body/bottom
mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_y,r4=end_y-start_y,r5=block_h-end_y
; r6(eax/64)/r3(ebx/32)=val_reg
; VERTICAL_EXTEND emits 22 local routines, .emuedge_v_extend_1 through
; .emuedge_v_extend_22, one per row width in bytes. Each starts on a 128-byte
; boundary, which is what the "w * 128" dispatch in EMU_EDGE_FUNC relies on.
; Every routine does three things for rows of its fixed width:
;   1. replicate the first source row start_y times (top extend),
;   2. copy the body rows from src to dst,
;   3. replicate the last source row for the remaining rows (bottom extend).
; On entry r4 and r5 no longer hold end_y and block_h themselves: the caller
; has already turned them into end_y - start_y and block_h - end_y.
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
%if ARCH_X86_64
test r3 , r3 ; if (!start_y)
jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
cmp dword r3m, 0
je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES top, %%n ; write bytes
add r0 , r2 ; dst += linesize
%if ARCH_X86_64
dec r3d
%else ; ARCH_X86_32
dec dword r3m
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
READ_NUM_BYTES body, %%n ; read bytes
WRITE_NUM_BYTES body, %%n ; write bytes
add r0 , r2 ; dst += linesize
add r1 , r2 ; src += linesize
dec r4d
jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--body_rows), r4 = end_y - start_y
; copy bottom pixels
test r5 , r5 ; if (!(block_h - end_y))
jz .emuedge_v_extend_end_ %+ %%n ; goto end
sub r1 , r2 ; src -= linesize
READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES bottom, %%n ; write bytes
add r0 , r2 ; dst += linesize
dec r5d
jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--bottom_rows), r5 = block_h - end_y
.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
; READ_V_PIXEL %1, %2: load the single byte at memory operand %2 and duplicate
; it into both bytes of valw. When %1 (the number of pixels that will be
; written) is at least 8, the 16-bit pair is additionally broadcast to all
; four words of mm0 so that 8 identical bytes can be stored at once.
%macro READ_V_PIXEL 2
mov vall, %2
mov valh, vall
%if %1 >= 8
movd mm0, vald
%if cpuflag(mmxext)
pshufw mm0, mm0, 0
%else ; mmx
punpcklwd mm0, mm0
punpckldq mm0, mm0
%endif ; mmxext/mmx
%endif ; %1 >= 8
%endmacro
; WRITE_V_PIXEL %1, %2: store %1 copies of the pixel prepared by READ_V_PIXEL
; starting at address expression %2. Groups of 8 come from mm0; a group of 4
; comes from mm0 when it was filled (%1 >= 8) and from two valw stores
; otherwise; a final group of 2 comes from valw.
; There is no 1-byte case, so %1 has to be even (the callers in this file
; step %1 by 2).
%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
movd [%2+%%dst_off], mm0
%else ; %1 < 8
mov [%2+%%dst_off] , valw
mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
; LEFT_EXTEND emits 11 local routines, .emuedge_extend_left_2 through
; .emuedge_extend_left_22 in steps of 2, each aligned to 64 bytes to match
; the "* 64" dispatch in EMU_EDGE_FUNC. A routine walks the block from the
; bottom row upwards (r0 is decremented by linesize before each row), reads
; the pixel at column start_x (r1) and writes it n times from column 0.
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+r1] ; read pixels
WRITE_V_PIXEL %%n, r0 ; write pixels
dec r5
jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r7/r6=end_x, r6/r3=val
; RIGHT_EXTEND emits 11 local routines, .emuedge_extend_right_2 through
; .emuedge_extend_right_22 in steps of 2, each aligned to 64 bytes. A routine
; walks the block from the bottom row upwards, reads the last valid pixel of
; the row (column end_x - 1, end_x being in w_reg) and writes it n times so
; that the run ends at column block_w (r4).
; x86-64 uses r3 as row pointer and r8 as row counter, x86-32 uses r0 and r5.
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%if ARCH_X86_64
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
dec r8
%else ; ARCH_X86_32
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
dec r5
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%if ARCH_X86_32
; NOTE(review): undoes the stack_offset 0x14 set at the end of EMU_EDGE_FUNC;
; presumably the code expanded after this point is reached by jmp rather
; than call - confirm against x86inc
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
; V_COPY_NPX %1, %2, %3, %4 [, %5]: copy %4 bytes from [r1+cnt_reg] to
; [r0+cnt_reg] through register %2 using move instruction %3, then advance
; cnt_reg by %4. %1 (top/body/bottom) only makes the local labels unique.
;  - 4 arguments: the copy is done once, and only if bit %4 is set in w_reg.
;  - 5 arguments: the copy is a loop; each pass subtracts %4 from w_reg and
;    repeats while (w_reg & %5) is non-zero. The first pass is unconditional,
;    so w_reg must be at least %4 on entry.
%macro V_COPY_NPX 4-5
%if %0 == 4
test w_reg, %4
jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
%3 %2, [r1+cnt_reg]
%3 [r0+cnt_reg], %2
add cnt_reg, %4
%if %0 == 5
sub w_reg, %4
test w_reg, %5
jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro
; V_COPY_ROW %1, %2: variable-width row loop used by the slow vertical path.
; %1 is top, body or bottom; %2 is the row counter operand that is
; decremented once per row. Each pass copies w_reg bytes from r1 to r0 in
; pieces of decreasing size and then advances dst by linesize. src is advanced
; only for "body"; for "bottom" it is first stepped back one row so that the
; last body row is the one being replicated.
; Note that `linesize` is used by the "bottom" step-back before it is
; (re)defined further down, so the bottom expansion relies on the definition
; left behind by an earlier expansion of this macro.
%macro V_COPY_ROW 2
%ifidn %1, bottom
sub r1, linesize
%endif
.%1_copy_loop:
xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
V_COPY_NPX %1, rax , mov, 8
%else ; ARCH_X86_32
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; sse
V_COPY_NPX %1, vald, mov, 4
V_COPY_NPX %1, valw, mov, 2
V_COPY_NPX %1, vall, mov, 1
mov w_reg, cnt_reg ; the looped copy consumed w_reg; cnt_reg now holds the row width again
%ifidn %1, body
add r1, linesize
%endif
add r0, linesize
dec %2
jnz .%1_copy_loop
%endmacro
; SLOW_V_EXTEND: vertical pass for row widths above 22 bytes. Runs
; V_COPY_ROW for the top rows (skipped when start_y is 0), the body rows and
; the bottom rows (skipped when none remain), then jumps back to
; .v_extend_end in the main function.
; On x86-64 r8 serves as byte counter for top/body, so the block_h saved
; there is pushed/popped around them; the bottom rows use r3 instead, which
; is free once the top rows are done.
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_y,r4=end_y-start_y,r5=block_h-end_y
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
push r8 ; save old value of block_h
test r3, r3
%define cnt_reg r8
jz .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, r3
%else
cmp dword r3m, 0
%define cnt_reg r2
je .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, dword r3m
%endif
.do_body_copy:
V_COPY_ROW body, r4
%if ARCH_X86_64
pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
test r5, r5 ; any rows left below the body?
%if ARCH_X86_64
jz .v_extend_end
%else
jz .skip_bottom_extend
%endif
V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
mov r2, r2m ; r2 was used as cnt_reg; reload linesize for the code that follows
%endif
jmp .v_extend_end
%endmacro
; SLOW_LEFT_EXTEND: left extend for start_x above 22. For every row, from the
; bottom up, the pixel at column start_x (w_reg) is broadcast and stored over
; columns [0, start_x): first in 8-byte steps from mm0, then in 2-byte steps
; from valw for what is left. Ends by jumping to .right_extend.
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
mov r4, 8
sub r0, linesize
READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
movq [r0+r4-8], mm0
add r4, 8
cmp r4, w_reg
jle .left_extend_8px_loop ; while the next 8-byte store still ends at or before start_x
sub r4, 8 ; r4 = number of bytes written so far
cmp r4, w_reg
jge .left_extend_loop_end
.left_extend_2px_loop:
mov [r0+r4], valw
add r4, 2
cmp r4, w_reg
jl .left_extend_2px_loop
.left_extend_loop_end:
dec r5
jnz .slow_left_extend_loop
%if ARCH_X86_32
mov r2, r2m
%endif
jmp .right_extend
%endmacro
; SLOW_RIGHT_EXTEND: right extend for block_w - end_x above 22. For every
; row, from the bottom up, the pixel at column end_x - 1 is broadcast and
; stored backwards from column block_w (r4) down to end_x (w_reg): first in
; 8-byte steps from mm0, then in 2-byte steps from valw. Ends by jumping to
; .h_extend_end.
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
lea r1, [r4-8] ; start with the last 8 columns of the row
sub buf_reg, linesize
READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
movq [buf_reg+r1], mm0
sub r1, 8
cmp r1, w_reg
jge .right_extend_8px_loop
add r1, 8 ; r1 = lowest column written so far
cmp r1, w_reg
je .right_extend_loop_end
.right_extend_2px_loop:
sub r1, 2
mov [buf_reg+r1], valw
cmp r1, w_reg
jg .right_extend_2px_loop
.right_extend_loop_end:
dec bh_reg
jnz .slow_right_extend_loop
jmp .h_extend_end
%endmacro
; emu_edge %1: assemble one complete emu_edge_core function for instruction
; set %1. The order matters: EMU_EDGE_FUNC defines the register aliases the
; following macros use, and all of them emit local labels that belong to the
; function opened by its cglobal.
%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro
; the sse version is always built, the mmx version only for 32-bit x86
emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif
; PREFETCH_FN %1: emit a prefetch(buf, stride, h) function that executes the
; prefetch instruction %1 on [buf] for h rows, adding stride to buf after
; each row. The loop body runs before h is tested, so at least one prefetch
; is always issued; "jg" ends the loop as soon as the decremented h is <= 0.
; NOTE(review): stride is added as a full-width register (strideq); this
; assumes the C prototype passes it as a pointer-sized value - confirm.
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
%1 [bufq]
add bufq, strideq
dec hd
jg .loop
REP_RET
%endmacro
; prefetcht0 variant is always built, the 3dnow variant only for 32-bit x86
INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif

View File

@ -0,0 +1,119 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/videodsp.h"
#if HAVE_YASM
/*
 * Signature shared by the edge-emulation cores. All size and position
 * arguments are passed as x86_reg. The argument order is start_y, end_y,
 * block_h, start_x, end_x, block_w; emulated_edge_mc() in this file passes
 * the clamped row/column ranges together with the block dimensions.
 * NOTE(review): the cores are presumably the assembly functions generated for
 * the mmx and sse instruction sets; the argument order must stay in sync with
 * that implementation -- confirm before changing it.
 */
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
x86_reg linesize, x86_reg start_y,
x86_reg end_y, x86_reg block_h,
x86_reg start_x, x86_reg end_x,
x86_reg block_w);
/* Core variants; the init code in this file selects between them by CPU flags. */
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
/**
 * Common front end for the x86 edge-emulation cores.
 *
 * The block position is first clamped so that the block overlaps the picture
 * by at least one row and one column. The range of block rows
 * [start_y, end_y) and block columns [start_x, end_x) that lies inside the
 * picture is then computed and handed to core_fn, with src moved to the first
 * in-picture sample and buf moved to the matching column.
 *
 * @param buf       destination buffer for the block
 * @param src       source pointer for block position (src_x, src_y)
 * @param linesize  row stride in bytes, used to step src and forwarded to
 *                  core_fn
 * @param block_w   block width, must be > 0
 * @param block_h   block height, must be > 0
 * @param src_x     horizontal block position relative to the picture
 * @param src_y     vertical block position relative to the picture
 * @param w         picture width
 * @param h         picture height
 * @param core_fn   core routine that performs the copy and edge extension
 */
static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    /* An empty picture has no sample that could be replicated. Without this
     * check the clamping below yields start >= end, which is only caught by
     * assert() and therefore reaches the core unchecked in NDEBUG builds. */
    if (w <= 0 || h <= 0) {
        return;
    }

    /* Vertical clamp: remember the row adjustment instead of applying it to
     * src right away; it is folded into the single pointer update below. */
    if (src_y >= h) {
        src_y_add = h - 1 - src_y;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
        src_y     = 1 - block_h;
    }

    /* Horizontal clamp: applied to src immediately. */
    if (src_x >= w) {
        src  += w - 1 - src_x;
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += 1 - block_w - src_x;
        src_x = 1 - block_w;
    }

    /* Part of the block that is backed by real picture samples. */
    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}
#if ARCH_X86_32
/*
 * MMX flavour of the edge emulation (32-bit x86 builds only): the shared
 * front end is specialised with the MMX core. Kept out of line so that
 * there is one real function to point at.
 */
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             ptrdiff_t linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emu_edge_core_func *const core = ff_emu_edge_core_mmx;

    emulated_edge_mc(buf, src, linesize,
                     block_w, block_h,
                     src_x, src_y,
                     w, h, core);
}
#endif
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
ptrdiff_t linesize,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */
/* Prefetch routines provided outside this C file. */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);

/**
 * Install the x86 implementations into a VideoDSPContext.
 *
 * Nothing is changed unless HAVE_YASM is set. Candidates are tried from the
 * least to the most preferred one and later assignments overwrite earlier
 * ones, so MMXEXT prefetch wins over 3DNow! and the SSE edge emulation wins
 * over MMX. The edge emulation is only installed for bpc <= 8.
 *
 * @param ctx  context whose function pointers are updated
 * @param bpc  bit depth; values above 8 leave emulated_edge_mc untouched
 */
void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_YASM
    const int cpu_flags = av_get_cpu_flags();
    const int depth_ok  = bpc <= 8;

#if ARCH_X86_32
    if (depth_ok && (cpu_flags & AV_CPU_FLAG_MMX)) {
        ctx->emulated_edge_mc = emulated_edge_mc_mmx;
    }
    if (cpu_flags & AV_CPU_FLAG_3DNOW) {
        ctx->prefetch = ff_prefetch_3dnow;
    }
#endif /* ARCH_X86_32 */

    if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
        ctx->prefetch = ff_prefetch_mmxext;
    }
    if (depth_ok && (cpu_flags & AV_CPU_FLAG_SSE)) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse;
    }
#endif /* HAVE_YASM */
}