diff --git a/configure b/configure index aa45fd77ff..00f2ebe83a 100755 --- a/configure +++ b/configure @@ -276,6 +276,8 @@ Developer options (useful when working on FFmpeg itself): Cannot be combined with --target-exec --samples=PATH location of test samples for FATE, if not set use \$FATE_SAMPLES at make invocation time. + --enable-xmm-clobber-test check XMM registers for clobbering (Win64-only; + should be used only for debugging purposes) NOTE: Object files are built at the place where configure is launched. EOF @@ -1085,6 +1087,7 @@ CONFIG_LIST=" vda vdpau version3 + xmm_clobber_test x11grab zlib " @@ -1779,7 +1782,7 @@ test_deps _muxer _demuxer \ wav \ yuv4mpegpipe=yuv4mpeg \ -ac3_fixed_test_deps="ac3_fixed_encoder ac3_decoder rm_muxer rm_demuxer" +ac3_fixed_test_deps="ac3_fixed_encoder ac3_decoder" mpg_test_deps="mpeg1system_muxer mpegps_demuxer" # default parameters @@ -3304,6 +3307,17 @@ check_ldflags -Wl,--warn-common check_ldflags -Wl,-rpath-link=libpostproc:libswresample:libswscale:libavfilter:libavdevice:libavformat:libavcodec:libavutil test_ldflags -Wl,-Bsymbolic && append SHFLAGS -Wl,-Bsymbolic +enabled xmm_clobber_test && \ + check_ldflags -Wl,--wrap,avcodec_open2 \ + -Wl,--wrap,avcodec_decode_audio4 \ + -Wl,--wrap,avcodec_decode_video2 \ + -Wl,--wrap,avcodec_decode_subtitle2 \ + -Wl,--wrap,avcodec_encode_audio2 \ + -Wl,--wrap,avcodec_encode_video \ + -Wl,--wrap,avcodec_encode_subtitle \ + -Wl,--wrap,sws_scale || \ + disable xmm_clobber_test + echo "X{};" > $TMPV if test_ldflags -Wl,--version-script,$TMPV; then append SHFLAGS '-Wl,--version-script,\$(SUBDIR)lib\$(NAME).ver' diff --git a/doc/APIchanges b/doc/APIchanges index bc68cb0b32..d6e4decdf3 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -19,18 +19,18 @@ API changes, most recent first: 2012-01-24 - xxxxxxx - lavfi 2.60.100 Add avfilter_graph_dump. 
-2012-02-01 - xxxxxxx - lavc 54.01.0 +2012-02-01 - 316fc74 - lavc 54.01.0 Add av_fast_padded_malloc() as alternative for av_realloc() when aligned memory is required. The buffer will always have FF_INPUT_BUFFER_PADDING_SIZE zero-padded bytes at the end. -2012-01-31 - xxxxxxx - lavf 54.01.0 +2012-01-31 - dd6d3b0 - lavf 54.01.0 Add avformat_get_riff_video_tags() and avformat_get_riff_audio_tags(). -2012-01-31 - xxxxxxx - lavc 54.01.0 +2012-01-31 - af08d9a - lavc 54.01.0 Add avcodec_is_open() function. -2012-01-30 - xxxxxxx - lavu 51.22.0 - intfloat.h +2012-01-30 - 8b93312 - lavu 51.22.0 - intfloat.h Add a new installed header libavutil/intfloat.h with int/float punning functions. diff --git a/doc/general.texi b/doc/general.texi index a1838296b5..dbf8775b71 100644 --- a/doc/general.texi +++ b/doc/general.texi @@ -497,6 +497,7 @@ following image formats are supported: @item Flash Screen Video v2 @tab X @tab X @item Flash Video (FLV) @tab X @tab X @tab Sorenson H.263 used in Flash +@item Forward Uncompressed @tab @tab X @item Fraps @tab @tab X @item H.261 @tab X @tab X @item H.263 / H.263-1996 @tab X @tab X diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S index b6aee867b3..df8bfbaa03 100644 --- a/libavcodec/arm/ac3dsp_armv6.S +++ b/libavcodec/arm/ac3dsp_armv6.S @@ -34,24 +34,23 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1 add r0, r0, r4, lsl #1 @ mask + band add r4, lr, r4 add r7, r7, r2 @ bap + start - ldrb r10, [r4], #1 1: ldrsh r9, [r0], #2 @ mask[band] mov r8, #0xff0 sub r9, r9, r12 @ - snr_offset - mov r11, r10 - ldrb r10, [r4], #1 @ band_start_tab[band++] + ldrb r10, [r4, #1]! 
@ band_start_tab[++band] subs r9, r9, r5 @ - floor it lt movlt r9, #0 cmp r10, r3 @ - end and r9, r9, r8, lsl #1 @ & 0x1fe0 ite gt - subgt r8, r3, r11 - suble r8, r10, r11 + subgt r8, r3, r2 + suble r8, r10, r2 + mov r2, r10 add r9, r9, r5 @ + floor => m tst r8, #1 - add r2, r7, r8 + add r11, r7, r8 bne 3f b 5f 2: @@ -65,9 +64,9 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1 ldrb lr, [r6, lr] strb r8, [r7], #1 @ bap[bin] strb lr, [r7], #1 -5: cmp r7, r2 +5: cmp r7, r11 blo 2b - cmp r3, r11 + cmp r3, r10 bgt 1b pop {r4-r11,pc} 3: diff --git a/libavcodec/bethsoftvideo.c b/libavcodec/bethsoftvideo.c index d80ac166a5..d85783b109 100644 --- a/libavcodec/bethsoftvideo.c +++ b/libavcodec/bethsoftvideo.c @@ -61,7 +61,7 @@ static int set_palette(BethsoftvidContext *ctx) palette[a] |= palette[a] >> 6 & 0x30303; } ctx->frame.palette_has_changed = 1; - return 256*3; + return 0; } static int bethsoftvid_decode_frame(AVCodecContext *avctx, @@ -88,7 +88,13 @@ static int bethsoftvid_decode_frame(AVCodecContext *avctx, switch(block_type = bytestream2_get_byte(&vid->g)){ case PALETTE_BLOCK: { - return set_palette(vid); + int ret; + *data_size = 0; + if ((ret = set_palette(vid)) < 0) { + av_log(avctx, AV_LOG_ERROR, "error reading palette\n"); + return ret; + } + return bytestream2_tell(&vid->g); } case VIDEO_YOFF_P_FRAME: yoffset = bytestream2_get_le16(&vid->g); diff --git a/libavcodec/internal.h b/libavcodec/internal.h index 72a89441c2..b7d4a6e139 100644 --- a/libavcodec/internal.h +++ b/libavcodec/internal.h @@ -130,6 +130,7 @@ int avpriv_unlock_avformat(void); * If avpkt->data is already set, avpkt->size is checked * to ensure it is large enough. * If avpkt->data is NULL, a new buffer is allocated. + * avpkt->size is set to the specified size. * All other AVPacket fields will be reset with av_init_packet(). 
* @param size the minimum required packet size * @return 0 on success, negative error code on failure diff --git a/libavcodec/libx264.c b/libavcodec/libx264.c index 7f817cbb4c..1380e0a438 100644 --- a/libavcodec/libx264.c +++ b/libavcodec/libx264.c @@ -188,12 +188,12 @@ static int X264_frame(AVCodecContext *ctx, uint8_t *buf, do { bufsize = orig_bufsize; - if (x264_encoder_encode(x4->enc, &nal, &nnal, frame? &x4->pic: NULL, &pic_out) < 0) - return -1; + if (x264_encoder_encode(x4->enc, &nal, &nnal, frame? &x4->pic: NULL, &pic_out) < 0) + return -1; - bufsize = encode_nals(ctx, buf, bufsize, nal, nnal, 0); - if (bufsize < 0) - return -1; + bufsize = encode_nals(ctx, buf, bufsize, nal, nnal, 0); + if (bufsize < 0) + return -1; } while (!bufsize && !frame && x264_encoder_delayed_frames(x4->enc)); /* FIXME: libx264 now provides DTS, but AVFrame doesn't have a field for it. */ diff --git a/libavcodec/mpc.h b/libavcodec/mpc.h index 808739fcc9..8b4deef689 100644 --- a/libavcodec/mpc.h +++ b/libavcodec/mpc.h @@ -66,8 +66,6 @@ typedef struct { int buf_size; AVLFG rnd; int frames_to_skip; - uint8_t *buffer; - int buffer_size; /* for synthesis */ DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512*2]; int synth_buf_offset[MPA_MAX_CHANNELS]; diff --git a/libavcodec/mpc7.c b/libavcodec/mpc7.c index c60a621c65..5693c4ddb2 100644 --- a/libavcodec/mpc7.c +++ b/libavcodec/mpc7.c @@ -200,34 +200,46 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr, AVPacket *avpkt) { const uint8_t *buf = avpkt->data; - int buf_size = avpkt->size; + int buf_size; MPCContext *c = avctx->priv_data; GetBitContext gb; int i, ch; int mb = -1; Band *bands = c->bands; - int off, ret; + int off, ret, last_frame, skip; int bits_used, bits_avail; memset(bands, 0, sizeof(*bands) * (c->maxbands + 1)); - if(buf_size <= 4){ - av_log(avctx, AV_LOG_ERROR, "Too small buffer passed (%i bytes)\n", buf_size); - return AVERROR(EINVAL); + + buf_size = avpkt->size & ~3; + if 
(buf_size <= 0) { + av_log(avctx, AV_LOG_ERROR, "packet size is too small (%i bytes)\n", + avpkt->size); + return AVERROR_INVALIDDATA; + } + if (buf_size != avpkt->size) { + av_log(avctx, AV_LOG_WARNING, "packet size is not a multiple of 4. " + "extra bytes at the end will be skipped.\n"); } + skip = buf[0]; + last_frame = buf[1]; + buf += 4; + buf_size -= 4; + /* get output buffer */ - c->frame.nb_samples = buf[1] ? c->lastframelen : MPC_FRAME_SIZE; + c->frame.nb_samples = last_frame ? c->lastframelen : MPC_FRAME_SIZE; if ((ret = avctx->get_buffer(avctx, &c->frame)) < 0) { av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n"); return ret; } - av_fast_padded_malloc(&c->buffer, &c->buffer_size, FFALIGN(buf_size - 1, 4)); - if (!c->buffer) + av_fast_padded_malloc(&c->bits, &c->buf_size, buf_size); + if (!c->bits) return AVERROR(ENOMEM); - c->dsp.bswap_buf((uint32_t*)c->buffer, (const uint32_t*)(buf + 4), (buf_size - 4) >> 2); - init_get_bits(&gb, c->buffer, (buf_size - 4)* 8); - skip_bits_long(&gb, buf[0]); + c->dsp.bswap_buf((uint32_t *)c->bits, (const uint32_t *)buf, buf_size >> 2); + init_get_bits(&gb, c->bits, buf_size * 8); + skip_bits_long(&gb, skip); /* read subband indexes */ for(i = 0; i <= c->maxbands; i++){ @@ -284,21 +296,21 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data, ff_mpc_dequantize_and_synth(c, mb, c->frame.data[0], 2); bits_used = get_bits_count(&gb); - bits_avail = (buf_size - 4) * 8; - if(!buf[1] && ((bits_avail < bits_used) || (bits_used + 32 <= bits_avail))){ + bits_avail = buf_size * 8; + if (!last_frame && ((bits_avail < bits_used) || (bits_used + 32 <= bits_avail))) { av_log(NULL,0, "Error decoding frame: used %i of %i bits\n", bits_used, bits_avail); return -1; } if(c->frames_to_skip){ c->frames_to_skip--; *got_frame_ptr = 0; - return buf_size; + return avpkt->size; } *got_frame_ptr = 1; *(AVFrame *)data = c->frame; - return buf_size; + return avpkt->size; } static void mpc7_decode_flush(AVCodecContext *avctx) @@ -312,8 
+324,8 @@ static void mpc7_decode_flush(AVCodecContext *avctx) static av_cold int mpc7_decode_close(AVCodecContext *avctx) { MPCContext *c = avctx->priv_data; - av_freep(&c->buffer); - c->buffer_size = 0; + av_freep(&c->bits); + c->buf_size = 0; return 0; } diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c index d8b926d33e..1916c2f938 100644 --- a/libavcodec/pcm.c +++ b/libavcodec/pcm.c @@ -194,7 +194,6 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, return -1; } - avpkt->size = frame->nb_samples * avctx->channels * sample_size; *got_packet_ptr = 1; return 0; } diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c index 725abc2f38..91bf7e174f 100644 --- a/libavcodec/ra144enc.c +++ b/libavcodec/ra144enc.c @@ -521,5 +521,5 @@ AVCodec ff_ra_144_encoder = { .close = ra144_encode_close, .sample_fmts = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE }, - .long_name = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K) encoder"), + .long_name = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K)"), }; diff --git a/libavcodec/utils.c b/libavcodec/utils.c index f21e36fd9c..74206720ca 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -919,16 +919,14 @@ int ff_alloc_packet(AVPacket *avpkt, int size) if (avpkt->data) { uint8_t *pkt_data; - int pkt_size; if (avpkt->size < size) return AVERROR(EINVAL); pkt_data = avpkt->data; - pkt_size = avpkt->size; av_init_packet(avpkt); avpkt->data = pkt_data; - avpkt->size = pkt_size; + avpkt->size = size; return 0; } else { return av_new_packet(avpkt, size); diff --git a/libavcodec/vorbis.c b/libavcodec/vorbis.c index 28176f3b12..fac8d0b2cd 100644 --- a/libavcodec/vorbis.c +++ b/libavcodec/vorbis.c @@ -156,7 +156,7 @@ void ff_vorbis_ready_floor1_list(vorbis_floor1_entry * list, int values) } } -static inline void render_line_unrolled(intptr_t x, uint8_t y, int x1, +static inline void render_line_unrolled(intptr_t x, int y, int x1, intptr_t sy, int ady, int adx, float *buf) { @@ -168,30 +168,30 @@ 
static inline void render_line_unrolled(intptr_t x, uint8_t y, int x1, if (err >= 0) { err += ady - adx; y += sy; - buf[x++] = ff_vorbis_floor1_inverse_db_table[y]; + buf[x++] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)]; } - buf[x] = ff_vorbis_floor1_inverse_db_table[y]; + buf[x] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)]; } if (x <= 0) { if (err + ady >= 0) y += sy; - buf[x] = ff_vorbis_floor1_inverse_db_table[y]; + buf[x] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)]; } } -static void render_line(int x0, uint8_t y0, int x1, int y1, float *buf) +static void render_line(int x0, int y0, int x1, int y1, float *buf) { int dy = y1 - y0; int adx = x1 - x0; int ady = FFABS(dy); int sy = dy < 0 ? -1 : 1; - buf[x0] = ff_vorbis_floor1_inverse_db_table[y0]; + buf[x0] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y0)]; if (ady*2 <= adx) { // optimized common case render_line_unrolled(x0, y0, x1, sy, ady, adx, buf); } else { int base = dy / adx; int x = x0; - uint8_t y = y0; + int y = y0; int err = -adx; ady -= FFABS(base) * adx; while (++x < x1) { @@ -201,7 +201,7 @@ static void render_line(int x0, uint8_t y0, int x1, int y1, float *buf) err -= adx; y += sy; } - buf[x] = ff_vorbis_floor1_inverse_db_table[y]; + buf[x] = ff_vorbis_floor1_inverse_db_table[av_clip_uint8(y)]; } } } @@ -210,8 +210,7 @@ void ff_vorbis_floor1_render_list(vorbis_floor1_entry * list, int values, uint16_t *y_list, int *flag, int multiplier, float *out, int samples) { - int lx, i; - uint8_t ly; + int lx, ly, i; lx = 0; ly = y_list[0] * multiplier; for (i = 1; i < values; i++) { diff --git a/libavcodec/vorbisdec.c b/libavcodec/vorbisdec.c index f1d9a79297..f71d606dad 100644 --- a/libavcodec/vorbisdec.c +++ b/libavcodec/vorbisdec.c @@ -1256,20 +1256,20 @@ static int vorbis_floor1_decode(vorbis_context *vc, floor1_flag[i] = 1; if (val >= room) { if (highroom > lowroom) { - floor1_Y_final[i] = val - lowroom + predicted; + floor1_Y_final[i] = av_clip_uint16(val - lowroom + 
predicted); } else { - floor1_Y_final[i] = predicted - val + highroom - 1; + floor1_Y_final[i] = av_clip_uint16(predicted - val + highroom - 1); } } else { if (val & 1) { - floor1_Y_final[i] = predicted - (val + 1) / 2; + floor1_Y_final[i] = av_clip_uint16(predicted - (val + 1) / 2); } else { - floor1_Y_final[i] = predicted + val / 2; + floor1_Y_final[i] = av_clip_uint16(predicted + val / 2); } } } else { floor1_flag[i] = 0; - floor1_Y_final[i] = predicted; + floor1_Y_final[i] = av_clip_uint16(predicted); } av_dlog(NULL, " Decoded floor(%d) = %u / val %u\n", diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 3b8ee56a49..dc8c66afde 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -83,3 +83,4 @@ OBJS-$(HAVE_MMX) += x86/dsputil_mmx.o \ x86/mpegvideo_mmx.o \ x86/simple_idct_mmx.o \ +OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index ad9ec2c339..966344f7c7 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -1063,7 +1063,7 @@ emu_edge mmx ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2) ; %5 = suffix %macro VECTOR_CLIP_INT32 4-5 -cglobal vector_clip_int32%5, 5,5,%2, dst, src, min, max, len +cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len %if %4 cvtsi2ss m4, minm cvtsi2ss m5, maxm diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c new file mode 100644 index 0000000000..f6e3de9496 --- /dev/null +++ b/libavcodec/x86/w64xmmtest.c @@ -0,0 +1,80 @@ +/* + * check XMM registers for clobbers on Win64 + * Copyright (c) 2012 Ronald S. Bultje + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/avcodec.h" +#include "libavutil/x86/w64xmmtest.h" + +wrap(avcodec_open2(AVCodecContext *avctx, + AVCodec *codec, + AVDictionary **options)) +{ + testxmmclobbers(avcodec_open2, avctx, codec, options); +} + +wrap(avcodec_decode_audio4(AVCodecContext *avctx, + AVFrame *frame, + int *got_frame_ptr, + AVPacket *avpkt)) +{ + testxmmclobbers(avcodec_decode_audio4, avctx, frame, + got_frame_ptr, avpkt); +} + +wrap(avcodec_decode_video2(AVCodecContext *avctx, + AVFrame *picture, + int *got_picture_ptr, + AVPacket *avpkt)) +{ + testxmmclobbers(avcodec_decode_video2, avctx, picture, + got_picture_ptr, avpkt); +} + +wrap(avcodec_decode_subtitle2(AVCodecContext *avctx, + AVSubtitle *sub, + int *got_sub_ptr, + AVPacket *avpkt)) +{ + testxmmclobbers(avcodec_decode_subtitle2, avctx, sub, + got_sub_ptr, avpkt); +} + +wrap(avcodec_encode_audio2(AVCodecContext *avctx, + AVPacket *avpkt, + const AVFrame *frame, + int *got_packet_ptr)) +{ + testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame, + got_packet_ptr); +} + +wrap(avcodec_encode_video(AVCodecContext *avctx, + uint8_t *buf, int buf_size, + const AVFrame *pict)) +{ + testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict); +} + +wrap(avcodec_encode_subtitle(AVCodecContext *avctx, + uint8_t *buf, int buf_size, + const AVSubtitle *sub)) +{ + testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub); +} diff --git a/libavformat/isom.h b/libavformat/isom.h index 40455e5d75..eb9218ef45 100644 --- a/libavformat/isom.h 
+++ b/libavformat/isom.h @@ -129,6 +129,7 @@ typedef struct MOVStreamContext { int has_palette; int64_t data_size; uint32_t tmcd_flags; ///< tmcd track flags + int64_t track_end; ///< used for dts generation in fragmented movie files } MOVStreamContext; typedef struct MOVContext { diff --git a/libavformat/mov.c b/libavformat/mov.c index e345355d95..ddbe49083a 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -1012,6 +1012,32 @@ static int mov_read_glbl(MOVContext *c, AVIOContext *pb, MOVAtom atom) return 0; } +static int mov_read_dvc1(MOVContext *c, AVIOContext *pb, MOVAtom atom) +{ + AVStream *st; + uint8_t profile_level; + + if (c->fc->nb_streams < 1) + return 0; + st = c->fc->streams[c->fc->nb_streams-1]; + + if (atom.size >= (1<<28) || atom.size < 7) + return AVERROR_INVALIDDATA; + + profile_level = avio_r8(pb); + if ((profile_level & 0xf0) != 0xc0) /* parenthesized: != binds tighter than &; only a high nibble of 0xc is accepted */ + return 0; + + av_free(st->codec->extradata); + st->codec->extradata = av_mallocz(atom.size - 7 + FF_INPUT_BUFFER_PADDING_SIZE); + if (!st->codec->extradata) + return AVERROR(ENOMEM); + st->codec->extradata_size = atom.size - 7; + avio_seek(pb, 6, SEEK_CUR); /* 1 byte read above + 6 skipped = the 7 bytes excluded from extradata */ + avio_read(pb, st->codec->extradata, st->codec->extradata_size); + return 0; +} + /** * An strf atom is a BITMAPINFOHEADER struct. 
This struct is 40 bytes itself, * but can have extradata appended at the end after the 40 bytes belonging @@ -1706,6 +1732,7 @@ static int mov_read_stts(MOVContext *c, AVIOContext *pb, MOVAtom atom) st->nb_frames= total_sample_count; if (duration) st->duration= duration; + sc->track_end = duration; return 0; } @@ -2326,7 +2353,7 @@ static int mov_read_trun(MOVContext *c, AVIOContext *pb, MOVAtom atom) if (flags & 0x001) data_offset = avio_rb32(pb); if (flags & 0x004) first_sample_flags = avio_rb32(pb); - dts = st->duration - sc->time_offset; + dts = sc->track_end - sc->time_offset; offset = frag->base_data_offset + data_offset; distance = 0; av_dlog(c->fc, "first sample flags 0x%x\n", first_sample_flags); @@ -2356,7 +2383,7 @@ static int mov_read_trun(MOVContext *c, AVIOContext *pb, MOVAtom atom) sc->data_size += sample_size; } frag->moof_offset = offset; - st->duration = dts + sc->time_offset; + st->duration = sc->track_end = dts + sc->time_offset; return 0; } @@ -2538,6 +2565,7 @@ static const MOVParseTableEntry mov_default_parse_table[] = { { MKTAG('w','f','e','x'), mov_read_wfex }, { MKTAG('c','m','o','v'), mov_read_cmov }, { MKTAG('c','h','a','n'), mov_read_chan }, /* channel layout */ +{ MKTAG('d','v','c','1'), mov_read_dvc1 }, { 0, NULL } }; diff --git a/libavutil/x86/w64xmmtest.h b/libavutil/x86/w64xmmtest.h new file mode 100644 index 0000000000..1c1ded86ec --- /dev/null +++ b/libavutil/x86/w64xmmtest.h @@ -0,0 +1,71 @@ +/* + * check XMM registers for clobbers on Win64 + * Copyright (c) 2008 Ramiro Polla + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> /* PRIx64 */ +#include <stdint.h> /* uint64_t */ +#include <stdlib.h> /* abort() */ + +#include "libavutil/bswap.h" + +#define storexmmregs(mem) \ + __asm__ volatile( \ + "movups %%xmm6 , 0x00(%0)\n\t" \ + "movups %%xmm7 , 0x10(%0)\n\t" \ + "movups %%xmm8 , 0x20(%0)\n\t" \ + "movups %%xmm9 , 0x30(%0)\n\t" \ + "movups %%xmm10, 0x40(%0)\n\t" \ + "movups %%xmm11, 0x50(%0)\n\t" \ + "movups %%xmm12, 0x60(%0)\n\t" \ + "movups %%xmm13, 0x70(%0)\n\t" \ + "movups %%xmm14, 0x80(%0)\n\t" \ + "movups %%xmm15, 0x90(%0)\n\t" \ + :: "r"(mem) : "memory") + +#define testxmmclobbers(func, ctx, ...) 
\ + uint64_t xmm[2][10][2]; \ + int ret; \ + storexmmregs(xmm[0]); \ + ret = __real_ ## func(ctx, __VA_ARGS__); \ + storexmmregs(xmm[1]); \ + if (memcmp(xmm[0], xmm[1], sizeof(xmm[0]))) { \ + int i; \ + av_log(ctx, AV_LOG_ERROR, \ + "XMM REGS CLOBBERED IN %s!\n", #func); \ + for (i = 0; i < 10; i ++) \ + if (xmm[0][i][0] != xmm[1][i][0] || \ + xmm[0][i][1] != xmm[1][i][1]) { \ + av_log(ctx, AV_LOG_ERROR, \ + "xmm%-2d = %016"PRIx64"%016"PRIx64"\n", \ + 6 + i, av_bswap64(xmm[0][i][0]), \ + av_bswap64(xmm[0][i][1])); \ + av_log(ctx, AV_LOG_ERROR, \ + " -> %016"PRIx64"%016"PRIx64"\n", \ + av_bswap64(xmm[1][i][0]), \ + av_bswap64(xmm[1][i][1])); \ + } \ + abort(); \ + } \ + return ret + +#define wrap(func) \ +int __real_ ## func; \ +int __wrap_ ## func; \ +int __wrap_ ## func diff --git a/libswscale/Makefile b/libswscale/Makefile index 77d896a76b..b761470fd1 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -25,6 +25,8 @@ MMX-OBJS-$(HAVE_YASM) += x86/input.o \ $(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) +OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o + TESTPROGS = colorspace swscale DIRS = bfin mlib ppc sparc x86 diff --git a/libswscale/bfin/internal_bfin.S b/libswscale/bfin/internal_bfin.S index cb8d71253c..eab30aa6ce 100644 --- a/libswscale/bfin/internal_bfin.S +++ b/libswscale/bfin/internal_bfin.S @@ -30,11 +30,11 @@ and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts. The following calculation is used for the conversion: - r = clipz((y-oy)*cy + crv*(v-128)) - g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) - b = clipz((y-oy)*cy + cbu*(u-128)) + r = clipz((y - oy) * cy + crv * (v - 128)) + g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) + b = clipz((y - oy) * cy + cbu * (u - 128)) -y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision. +y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision. 
New factorization to eliminate the truncation error which was @@ -47,7 +47,7 @@ occurring due to the byteop3p. 2) Scale operands up by a factor of 4 not 8 because Blackfin multiplies include a shift. -3) Compute into the accumulators cy*yx0, cy*yx1. +3) Compute into the accumulators cy * yx0, cy * yx1. 4) Compute each of the linear equations: r = clipz((y - oy) * cy + crv * (v - 128)) @@ -73,7 +73,7 @@ occurring due to the byteop3p. Where coeffs have the following layout in memory. -uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv; +uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv; coeffs is a pointer to oy. diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c index 870636ea05..3cd4f28387 100644 --- a/libswscale/bfin/swscale_bfin.c +++ b/libswscale/bfin/swscale_bfin.c @@ -27,32 +27,34 @@ #include #include "config.h" #include + #include "libswscale/rgb2rgb.h" #include "libswscale/swscale.h" #include "libswscale/swscale_internal.h" #if defined (__FDPIC__) && CONFIG_SRAM -#define L1CODE __attribute__ ((l1_text)) +#define L1CODE __attribute__((l1_text)) #else #define L1CODE #endif -int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, +int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride) L1CODE; -int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, +int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride) L1CODE; -static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]) +static int uyvytoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[], + int srcSliceY, int srcSliceH, uint8_t *dst[], + int dstStride[]) { - 
uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY; - uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2; - uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2; - uint8_t *ip = src[0] + srcStride[0]*srcSliceY; - int w = dstStride[0]; + uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY; + uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2; + uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2; + uint8_t *ip = src[0] + srcStride[0] * srcSliceY; + int w = dstStride[0]; ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]); @@ -60,14 +62,15 @@ static int uyvytoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i return srcSliceH; } -static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]) +static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t *src[], int srcStride[], + int srcSliceY, int srcSliceH, uint8_t *dst[], + int dstStride[]) { - uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY; - uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2; - uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2; - uint8_t *ip = src[0] + srcStride[0]*srcSliceY; - int w = dstStride[0]; + uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY; + uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2; + uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2; + uint8_t *ip = src[0] + srcStride[0] * srcSliceY; + int w = dstStride[0]; ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]); @@ -75,15 +78,16 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i return srcSliceH; } - void ff_bfin_get_unscaled_swscale(SwsContext *c) { if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) { - av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n"); + av_log(NULL, AV_LOG_VERBOSE, + "selecting Blackfin optimized uyvytoyv12_unscaled\n"); c->swScale = 
uyvytoyv12_unscaled; } if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) { - av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n"); + av_log(NULL, AV_LOG_VERBOSE, + "selecting Blackfin optimized yuyvtoyv12_unscaled\n"); c->swScale = yuyvtoyv12_unscaled; } } diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c index 7a7dc7f0e6..e7f657fe00 100644 --- a/libswscale/bfin/yuv2rgb_bfin.c +++ b/libswscale/bfin/yuv2rgb_bfin.c @@ -26,15 +26,16 @@ #include #include #include -#include "config.h" #include #include "libavutil/pixdesc.h" + +#include "config.h" #include "libswscale/rgb2rgb.h" #include "libswscale/swscale.h" #include "libswscale/swscale_internal.h" #if defined(__FDPIC__) && CONFIG_SRAM -#define L1CODE __attribute__ ((l1_text)) +#define L1CODE __attribute__((l1_text)) #else #define L1CODE #endif @@ -48,21 +49,20 @@ void ff_bfin_yuv2rgb565_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, void ff_bfin_yuv2rgb24_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, int w, uint32_t *coeffs) L1CODE; -typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, - int w, uint32_t *coeffs); - +typedef void (*ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, + int w, uint32_t *coeffs); static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks) { int oy; - oy = c->yOffset&0xffff; - oy = oy >> 3; // keep everything U8.0 for offset calculation + oy = c->yOffset & 0xffff; + oy = oy >> 3; // keep everything U8.0 for offset calculation - c->oc = 128*0x01010101U; - c->oy = oy*0x01010101U; + c->oc = 128 * 0x01010101U; + c->oy = oy * 0x01010101U; /* copy 64bit vector coeffs down to 32bit vector coeffs */ - c->cy = c->yCoeff; + c->cy = c->yCoeff; c->zero = 0; if (rgb) { @@ -77,7 +77,6 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks) c->cgv = c->ugCoeff; } - if (masks == 555) { c->rmask = 0x001f * 0x00010001U; c->gmask = 0x03e0 * 
0x00010001U; @@ -89,27 +88,25 @@ static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks) } } -static int core_yuv420_rgb(SwsContext *c, - uint8_t **in, int *instrides, - int srcSliceY, int srcSliceH, - uint8_t **oplanes, int *outstrides, - ltransform lcscf, int rgb, int masks) +static int core_yuv420_rgb(SwsContext *c, uint8_t **in, int *instrides, + int srcSliceY, int srcSliceH, uint8_t **oplanes, + int *outstrides, ltransform lcscf, + int rgb, int masks) { - uint8_t *py,*pu,*pv,*op; + uint8_t *py, *pu, *pv, *op; int w = instrides[0]; - int h2 = srcSliceH>>1; + int h2 = srcSliceH >> 1; int i; bfin_prepare_coefficients(c, rgb, masks); py = in[0]; - pu = in[1+(1^rgb)]; - pv = in[1+(0^rgb)]; + pu = in[1 + (1 ^ rgb)]; + pv = in[1 + (0 ^ rgb)]; - op = oplanes[0] + srcSliceY*outstrides[0]; - - for (i=0;ioy); py += instrides[0]; @@ -126,9 +123,7 @@ static int core_yuv420_rgb(SwsContext *c, return srcSliceH; } - -static int bfin_yuv420_rgb555(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_rgb555(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -136,8 +131,7 @@ static int bfin_yuv420_rgb555(SwsContext *c, outstrides, ff_bfin_yuv2rgb555_line, 1, 555); } -static int bfin_yuv420_bgr555(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_bgr555(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -145,8 +139,7 @@ static int bfin_yuv420_bgr555(SwsContext *c, outstrides, ff_bfin_yuv2rgb555_line, 0, 555); } -static int bfin_yuv420_rgb24(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_rgb24(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -154,8 +147,7 @@ static int bfin_yuv420_rgb24(SwsContext *c, outstrides, ff_bfin_yuv2rgb24_line, 1, 888); } -static int bfin_yuv420_bgr24(SwsContext *c, - uint8_t **in, 
int *instrides, +static int bfin_yuv420_bgr24(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -163,8 +155,7 @@ static int bfin_yuv420_bgr24(SwsContext *c, outstrides, ff_bfin_yuv2rgb24_line, 0, 888); } -static int bfin_yuv420_rgb565(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_rgb565(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -172,8 +163,7 @@ static int bfin_yuv420_rgb565(SwsContext *c, outstrides, ff_bfin_yuv2rgb565_line, 1, 565); } -static int bfin_yuv420_bgr565(SwsContext *c, - uint8_t **in, int *instrides, +static int bfin_yuv420_bgr565(SwsContext *c, uint8_t **in, int *instrides, int srcSliceY, int srcSliceH, uint8_t **oplanes, int *outstrides) { @@ -181,18 +171,29 @@ static int bfin_yuv420_bgr565(SwsContext *c, outstrides, ff_bfin_yuv2rgb565_line, 0, 565); } - SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c) { SwsFunc f; - switch(c->dstFormat) { - case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break; - case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break; - case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break; - case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break; - case PIX_FMT_RGB24: f = bfin_yuv420_rgb24; break; - case PIX_FMT_BGR24: f = bfin_yuv420_bgr24; break; + switch (c->dstFormat) { + case PIX_FMT_RGB555: + f = bfin_yuv420_rgb555; + break; + case PIX_FMT_BGR555: + f = bfin_yuv420_bgr555; + break; + case PIX_FMT_RGB565: + f = bfin_yuv420_rgb565; + break; + case PIX_FMT_BGR565: + f = bfin_yuv420_bgr565; + break; + case PIX_FMT_RGB24: + f = bfin_yuv420_rgb24; + break; + case PIX_FMT_BGR24: + f = bfin_yuv420_bgr24; + break; default: return 0; } diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 9a8e24f0b0..50e071a89a 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -51,6 +51,19 @@ bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV rgb_Vcoeff_12x4: times 2 dw 
RV, GV, 0, RV rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV +rgba_Ycoeff_rb: times 4 dw RY, BY +rgba_Ycoeff_br: times 4 dw BY, RY +rgba_Ycoeff_ga: times 4 dw GY, 0 +rgba_Ycoeff_ag: times 4 dw 0, GY +rgba_Ucoeff_rb: times 4 dw RU, BU +rgba_Ucoeff_br: times 4 dw BU, RU +rgba_Ucoeff_ga: times 4 dw GU, 0 +rgba_Ucoeff_ag: times 4 dw 0, GU +rgba_Vcoeff_rb: times 4 dw RV, BV +rgba_Vcoeff_br: times 4 dw BV, RV +rgba_Vcoeff_ga: times 4 dw GV, 0 +rgba_Vcoeff_ag: times 4 dw 0, GV + shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \ @@ -294,6 +307,150 @@ RGB24_FUNCS 11, 13 INIT_XMM avx RGB24_FUNCS 11, 13 +; %1 = nr. of XMM registers +; %2-5 = rgba, bgra, argb or abgr (in individual characters) +%macro RGB32_TO_Y_FN 5-6 +cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3 + mova m5, [rgba_Ycoeff_%2%4] + mova m6, [rgba_Ycoeff_%3%5] +%if %0 == 6 + jmp mangle(program_name %+ _ %+ %6 %+ ToY %+ SUFFIX).body +%else ; %0 == 6 +.body: +%if ARCH_X86_64 + movsxd wq, wd +%endif + lea srcq, [srcq+wq*4] + add wq, wq + add dstq, wq + neg wq + mova m4, [rgb_Yrnd] + pcmpeqb m7, m7 + psrlw m7, 8 ; (word) { 0x00ff } x4 +.loop: + ; FIXME check alignment and use mova + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] + pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] + pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] + pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7] + pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7] + paddd m0, m4 ; += rgb_Yrnd + paddd m2, m4 ; += rgb_Yrnd + paddd m0, m1 ; (dword) { Y[0-3] } + paddd m2, m3 ; (dword) { Y[4-7] } + psrad m0, 9 + psrad m2, 9 + packssdw m0, m2 ; (word) { Y[0-7] } + mova [dstq+wq], m0 + add wq, mmsize + jl .loop + REP_RET +%endif ; %0 == 6 +%endmacro + +; %1 = nr. 
of XMM registers +; %2-5 = rgba, bgra, argb or abgr (in individual characters) +%macro RGB32_TO_UV_FN 5-6 +cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3 +%if ARCH_X86_64 + mova m8, [rgba_Ucoeff_%2%4] + mova m9, [rgba_Ucoeff_%3%5] + mova m10, [rgba_Vcoeff_%2%4] + mova m11, [rgba_Vcoeff_%3%5] +%define coeffU1 m8 +%define coeffU2 m9 +%define coeffV1 m10 +%define coeffV2 m11 +%else ; x86-32 +%define coeffU1 [rgba_Ucoeff_%2%4] +%define coeffU2 [rgba_Ucoeff_%3%5] +%define coeffV1 [rgba_Vcoeff_%2%4] +%define coeffV2 [rgba_Vcoeff_%3%5] +%endif ; x86-64/32 +%if ARCH_X86_64 && %0 == 6 + jmp mangle(program_name %+ _ %+ %6 %+ ToUV %+ SUFFIX).body +%else ; ARCH_X86_64 && %0 == 6 +.body: +%if ARCH_X86_64 + movsxd wq, dword r5m +%else ; x86-32 + mov wq, r5m +%endif + add wq, wq + add dstUq, wq + add dstVq, wq + lea srcq, [srcq+wq*2] + neg wq + pcmpeqb m7, m7 + psrlw m7, 8 ; (word) { 0x00ff } x4 + mova m6, [rgb_UVrnd] +.loop: + ; FIXME check alignment and use mova + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] + pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] + pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] + pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] + pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] + paddd m3, m6 ; += rgb_UVrnd + paddd m1, m6 ; += rgb_UVrnd + paddd m2, m3 ; (dword) { V[0-3] } + paddd m0, m1 ; (dword) { U[0-3] } + pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7] + pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7] + pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7] + pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] + paddd m3, m6 ; += rgb_UVrnd + paddd m5, m6 ; += rgb_UVrnd + psrad m0, 9 + paddd m1, m3 ; (dword) { V[4-7] } + paddd m4, m5 ; (dword) { U[4-7] } + psrad m2, 9 + psrad m4, 9 + psrad m1, 9 + packssdw m0, m4 ; (word) { U[0-7] } + packssdw m2, m1 ; (word) { 
V[0-7] } +%if mmsize == 8 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%else ; mmsize == 16 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 +%endif ; mmsize == 8/16 + add wq, mmsize + jl .loop + REP_RET +%endif ; ARCH_X86_64 && %0 == 6 +%endmacro + +; %1 = nr. of XMM registers for rgb-to-Y func +; %2 = nr. of XMM registers for rgb-to-UV func +%macro RGB32_FUNCS 2 +RGB32_TO_Y_FN %1, r, g, b, a +RGB32_TO_Y_FN %1, b, g, r, a, rgba +RGB32_TO_Y_FN %1, a, r, g, b, rgba +RGB32_TO_Y_FN %1, a, b, g, r, rgba + +RGB32_TO_UV_FN %2, r, g, b, a +RGB32_TO_UV_FN %2, b, g, r, a, rgba +RGB32_TO_UV_FN %2, a, r, g, b, rgba +RGB32_TO_UV_FN %2, a, b, g, r, rgba +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +RGB32_FUNCS 0, 0 +%endif + +INIT_XMM sse2 +RGB32_FUNCS 8, 12 + +INIT_XMM avx +RGB32_FUNCS 8, 12 + ;----------------------------------------------------------------------------- ; YUYV/UYVY/NV12/NV21 packed pixel shuffling. ; diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c index ab5b68fb0b..1118515164 100644 --- a/libswscale/x86/swscale_mmx.c +++ b/libswscale/x86/swscale_mmx.c @@ -308,6 +308,10 @@ extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ INPUT_FUNC(yuyv, opt); \ INPUT_UV_FUNC(nv12, opt); \ INPUT_UV_FUNC(nv21, opt); \ + INPUT_FUNC(rgba, opt); \ + INPUT_FUNC(bgra, opt); \ + INPUT_FUNC(argb, opt); \ + INPUT_FUNC(abgr, opt); \ INPUT_FUNC(rgb24, opt); \ INPUT_FUNC(bgr24, opt) @@ -406,6 +410,10 @@ switch(c->dstBpc){ \ break; case_rgb(rgb24, RGB24, mmx); case_rgb(bgr24, BGR24, mmx); + case_rgb(bgra, BGRA, mmx); + case_rgb(rgba, RGBA, mmx); + case_rgb(abgr, ABGR, mmx); + case_rgb(argb, ARGB, mmx); default: break; } @@ -450,6 +458,10 @@ switch(c->dstBpc){ \ break; case_rgb(rgb24, RGB24, sse2); case_rgb(bgr24, BGR24, sse2); + case_rgb(bgra, BGRA, sse2); + case_rgb(rgba, RGBA, sse2); + case_rgb(abgr, ABGR, sse2); + case_rgb(argb, ARGB, sse2); default: break; } @@ -493,6 +505,10 @@ switch(c->dstBpc){ \ break; case_rgb(rgb24, RGB24, avx); 
case_rgb(bgr24, BGR24, avx); + case_rgb(bgra, BGRA, avx); + case_rgb(rgba, RGBA, avx); + case_rgb(abgr, ABGR, avx); + case_rgb(argb, ARGB, avx); default: break; } diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c new file mode 100644 index 0000000000..dd9a2a4378 --- /dev/null +++ b/libswscale/x86/w64xmmtest.c @@ -0,0 +1,31 @@ +/* + * check XMM registers for clobbers on Win64 + * Copyright (c) 2012 Ronald S. Bultje + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/w64xmmtest.h" +#include "libswscale/swscale.h" + +wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[], + const int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *const dst[], const int dstStride[])) +{ + testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY, + srcSliceH, dst, dstStride); +} diff --git a/tests/codec-regression.sh b/tests/codec-regression.sh index 5da37b9b9a..b8d27cb7b0 100755 --- a/tests/codec-regression.sh +++ b/tests/codec-regression.sh @@ -368,7 +368,7 @@ $tiny_psnr $pcm_dst $pcm_ref 2 1924 fi if [ -n "$do_ac3_fixed" ] ; then -do_audio_encoding ac3.rm "-vn -acodec ac3_fixed" +do_audio_encoding ac3.ac3 "-vn -acodec ac3_fixed" # binaries configured with --disable-sse decode ac3 differently 
#do_audio_decoding #$tiny_psnr $pcm_dst $pcm_ref 2 1024 diff --git a/tests/ref/acodec/ac3_fixed b/tests/ref/acodec/ac3_fixed index dba2dfc5e7..0c2f9b7214 100644 --- a/tests/ref/acodec/ac3_fixed +++ b/tests/ref/acodec/ac3_fixed @@ -1,2 +1,2 @@ -e7fa185030a56d9db8663ad9e38c6c94 *./tests/data/acodec/ac3.rm -98751 ./tests/data/acodec/ac3.rm +a1d1fc116463b771abf5aef7ed37d7b1 *./tests/data/acodec/ac3.ac3 +96408 ./tests/data/acodec/ac3.ac3 diff --git a/tests/ref/fate/vc1-ism b/tests/ref/fate/vc1-ism index e2fda62d18..0e8520605f 100644 --- a/tests/ref/fate/vc1-ism +++ b/tests/ref/fate/vc1-ism @@ -117,4 +117,3 @@ 0, 438750, 37440, 0xf0fe8c1c 0, 442500, 37440, 0xc0036222 0, 446250, 37440, 0x3058385c -0, 450000, 37440, 0x68141016 diff --git a/tests/ref/seek/ac3_ac3 b/tests/ref/seek/ac3_ac3 new file mode 100644 index 0000000000..167dc8d716 --- /dev/null +++ b/tests/ref/seek/ac3_ac3 @@ -0,0 +1,49 @@ +ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 0 size: 556 +ret: 0 st:-1 flags:0 ts:-1.000000 +ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 0 size: 556 +ret: 0 st:-1 flags:1 ts: 1.894167 +ret: 0 st: 0 flags:1 dts: 1.880400 pts: 1.880400 pos: 30092 size: 558 +ret: 0 st: 0 flags:0 ts: 0.788333 +ret: 0 st: 0 flags:1 dts: 0.800911 pts: 0.800911 pos: 12818 size: 556 +ret:-1 st: 0 flags:1 ts:-0.317500 +ret: 0 st:-1 flags:0 ts: 2.576668 +ret: 0 st: 0 flags:1 dts: 2.576844 pts: 2.576844 pos: 41238 size: 558 +ret: 0 st:-1 flags:1 ts: 1.470835 +ret: 0 st: 0 flags:1 dts: 1.462533 pts: 1.462533 pos: 23406 size: 556 +ret: 0 st: 0 flags:0 ts: 0.365000 +ret: 0 st: 0 flags:1 dts: 0.383044 pts: 0.383044 pos: 6130 size: 558 +ret:-1 st: 0 flags:1 ts:-0.740833 +ret: 0 st:-1 flags:0 ts: 2.153336 +ret: 0 st: 0 flags:1 dts: 2.158978 pts: 2.158978 pos: 34552 size: 556 +ret: 0 st:-1 flags:1 ts: 1.047503 +ret: 0 st: 0 flags:1 dts: 1.044667 pts: 1.044667 pos: 16718 size: 558 +ret: 0 st: 0 flags:0 ts:-0.058333 +ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 0 size: 556 +ret: 0 st: 0 
flags:1 ts: 2.835833 +ret: 0 st: 0 flags:1 dts: 2.820600 pts: 2.820600 pos: 45140 size: 556 +ret: 0 st:-1 flags:0 ts: 1.730004 +ret: 0 st: 0 flags:1 dts: 1.741111 pts: 1.741111 pos: 27864 size: 556 +ret: 0 st:-1 flags:1 ts: 0.624171 +ret: 0 st: 0 flags:1 dts: 0.591978 pts: 0.591978 pos: 9474 size: 556 +ret: 0 st: 0 flags:0 ts:-0.481667 +ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 0 size: 556 +ret: 0 st: 0 flags:1 ts: 2.412500 +ret: 0 st: 0 flags:1 dts: 2.402733 pts: 2.402733 pos: 38452 size: 558 +ret: 0 st:-1 flags:0 ts: 1.306672 +ret: 0 st: 0 flags:1 dts: 1.323244 pts: 1.323244 pos: 21176 size: 558 +ret: 0 st:-1 flags:1 ts: 0.200839 +ret: 0 st: 0 flags:1 dts: 0.174111 pts: 0.174111 pos: 2786 size: 558 +ret: 0 st: 0 flags:0 ts:-0.904989 +ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 0 size: 556 +ret: 0 st: 0 flags:1 ts: 1.989178 +ret: 0 st: 0 flags:1 dts: 1.984867 pts: 1.984867 pos: 31764 size: 558 +ret: 0 st:-1 flags:0 ts: 0.883340 +ret: 0 st: 0 flags:1 dts: 0.905378 pts: 0.905378 pos: 14488 size: 558 +ret:-1 st:-1 flags:1 ts:-0.222493 +ret: 0 st: 0 flags:0 ts: 2.671678 +ret: 0 st: 0 flags:1 dts: 2.681311 pts: 2.681311 pos: 42910 size: 558 +ret: 0 st: 0 flags:1 ts: 1.565844 +ret: 0 st: 0 flags:1 dts: 1.532178 pts: 1.532178 pos: 24520 size: 558 +ret: 0 st:-1 flags:0 ts: 0.460008 +ret: 0 st: 0 flags:1 dts: 0.487511 pts: 0.487511 pos: 7802 size: 556 +ret:-1 st:-1 flags:1 ts:-0.645825 diff --git a/tests/ref/seek/ac3_rm b/tests/ref/seek/ac3_rm deleted file mode 100644 index ecf03c29a1..0000000000 --- a/tests/ref/seek/ac3_rm +++ /dev/null @@ -1,41 +0,0 @@ -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret: 0 st:-1 flags:0 ts:-1.000000 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret:-1 st:-1 flags:1 ts: 1.894167 -ret:-1 st: 0 flags:0 ts: 0.788000 -ret: 0 st: 0 flags:1 ts:-0.317000 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret: 0 st:-1 flags:0 ts: 2.576668 -ret: 0 st: 0 flags:1 
dts: 2.124000 pts: 2.124000 pos: 34997 size: 558 -ret:-1 st:-1 flags:1 ts: 1.470835 -ret:-1 st: 0 flags:0 ts: 0.365000 -ret: 0 st: 0 flags:1 ts:-0.741000 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret: 0 st:-1 flags:0 ts: 2.153336 -ret: 0 st: 0 flags:1 dts: 2.124000 pts: 2.124000 pos: 34997 size: 558 -ret:-1 st:-1 flags:1 ts: 1.047503 -ret: 0 st: 0 flags:0 ts:-0.058000 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret: 0 st: 0 flags:1 ts: 2.836000 -ret: 0 st: 0 flags:1 dts: 2.124000 pts: 2.124000 pos: 34997 size: 558 -ret:-1 st:-1 flags:0 ts: 1.730004 -ret:-1 st:-1 flags:1 ts: 0.624171 -ret: 0 st: 0 flags:0 ts:-0.482000 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret: 0 st: 0 flags:1 ts: 2.413000 -ret: 0 st: 0 flags:1 dts: 2.124000 pts: 2.124000 pos: 34997 size: 558 -ret: 0 st:-1 flags:0 ts: 1.306672 -ret: 0 st: 0 flags:1 dts:65.537000 pts:65.537000 pos: 87488 size: 6132 -ret:-1 st:-1 flags:1 ts: 0.200839 -ret: 0 st: 0 flags:0 ts:-0.905000 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret:-1 st: 0 flags:1 ts: 1.989000 -ret:-1 st:-1 flags:0 ts: 0.883340 -ret: 0 st:-1 flags:1 ts:-0.222493 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556 -ret:-1 st: 0 flags:0 ts: 2.672000 -ret:-1 st: 0 flags:1 ts: 1.566000 -ret: 0 st:-1 flags:0 ts: 0.460008 -ret: 0 st: 0 flags:1 dts: 1.567000 pts: 1.567000 pos: 25889 size: 556 -ret: 0 st:-1 flags:1 ts:-0.645825 -ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556