Merge remote branch 'qatar/master'

* qatar/master: (30 commits) AVOptions: make default_val a union, as proposed in AVOption2. arm/h264pred: add missing argument type. h264dsp_mmx: place bracket outside #if/#endif block. lavf/utils: fix ff_interleave_compare_dts corner case. fate: add 10-bit H264 tests. h264: do not print "too many references" warning for intra-only. Enable decoding of high bit depth h264. Adds 8-, 9- and 10-bit versions of some of the functions used by the h264 decoder. Add support for higher QP values in h264. Add the notion of pixel size in h264 related functions. Make the h264 loop filter bit depth aware. Template dsputil_template.c with respect to pixel size, etc. Template h264idct_template.c with respect to pixel size, etc. Preparatory patch for high bit depth h264 decoding support. Move some functions in dsputil.c into a new file dsputil_template.c. Move the functions in h264idct into a new file h264idct_template.c. Move the functions in h264pred.c into a new file h264pred_template.c. Preparatory patch for high bit depth h264 decoding support. Add pixel formats for 9- and 10-bit yuv420p. Choose h264 chroma dc dequant function dynamically. ... Conflicts: doc/APIchanges ffmpeg.c ffplay.c libavcodec/alpha/dsputil_alpha.c libavcodec/arm/dsputil_init_arm.c libavcodec/arm/dsputil_init_armv6.c libavcodec/arm/dsputil_init_neon.c libavcodec/arm/dsputil_iwmmxt.c libavcodec/arm/h264pred_init_arm.c libavcodec/bfin/dsputil_bfin.c libavcodec/dsputil.c libavcodec/h264.c libavcodec/h264.h libavcodec/h264_cabac.c libavcodec/h264_cavlc.c libavcodec/h264_loopfilter.c libavcodec/h264_ps.c libavcodec/h264_refs.c libavcodec/h264dsp.c libavcodec/h264idct.c libavcodec/h264pred.c libavcodec/mlib/dsputil_mlib.c libavcodec/options.c libavcodec/ppc/dsputil_altivec.c libavcodec/ppc/dsputil_ppc.c libavcodec/ppc/h264_altivec.c libavcodec/ps2/dsputil_mmi.c libavcodec/sh4/dsputil_align.c libavcodec/sh4/dsputil_sh4.c libavcodec/sparc/dsputil_vis.c libavcodec/utils.c libavcodec/version.h libavcodec/x86/dsputil_mmx.c libavformat/options.c libavformat/utils.c libavutil/pixfmt.h libswscale/swscale.c libswscale/swscale_internal.h libswscale/swscale_template.c tests/ref/seek/lavf_avi Merged-by: Michael Niedermayer <michaelni@gmx.at>
2025-07-11 14:30:22 +02:00 · 2011-05-11 05:34:17 +02:00
parent 580fa76c5c b66752790a
commit 59eb12faff
56 changed files with 3947 additions and 472 deletions
--- a/doc/APIchanges
+++ b/doc/APIchanges
@ -13,6 +13,12 @@ libavutil:   2011-04-18

 API changes, most recent first:

+2011-05-10 - xxxxxxx - lavc 53.3.0 - avcodec.h
+  Deprecate AVLPCType and the following fields in
+  AVCodecContext: lpc_coeff_precision, prediction_order_method,
+  min_partition_order, max_partition_order, lpc_type, lpc_passes.
+  Corresponding FLAC encoder options should be used instead.
+
 2011-05-07 - xxxxxxx - lavfi 2.5.0 - avcodec.h
  Add libavfilter/avcodec.h header and avfilter_copy_frame_props()
  function.
--- a/ffmpeg.c
+++ b/ffmpeg.c
@ -292,7 +292,6 @@ typedef struct AVOutputStream {
    int resample_pix_fmt;

    float frame_aspect_ratio;
-
    /* forced key frames */
    int64_t *forced_kf_pts;
    int forced_kf_count;
@ -1505,7 +1504,7 @@ static int output_packet(AVInputStream *ist, int ist_index,
    AVFormatContext *os;
    AVOutputStream *ost;
    int ret, i;
-    int got_picture;
+    int got_output;
    AVFrame picture;
    void *buffer_to_free = NULL;
    static unsigned int samples_size= 0;
@ -1537,7 +1536,7 @@ static int output_packet(AVInputStream *ist, int ist_index,
        pkt_pts = av_rescale_q(pkt->pts, ist->st->time_base, AV_TIME_BASE_Q);

    //while we have more to decode or while the decoder did output something on EOF
-    while (avpkt.size > 0 || (!pkt && ist->next_pts != ist->pts)) {
+    while (avpkt.size > 0 || (!pkt && got_output)) {
        uint8_t *data_buf, *decoded_data_buf;
        int data_size, decoded_data_size;
    handle_eof:
@ -1573,9 +1572,10 @@ static int output_packet(AVInputStream *ist, int ist_index,
                avpkt.data += ret;
                avpkt.size -= ret;
                data_size   = ret;
+                got_output  = decoded_data_size > 0;
                /* Some bug in mpeg audio decoder gives */
                /* decoded_data_size < 0, it seems they are overflows */
-                if (decoded_data_size <= 0) {
+                if (!got_output) {
                    /* no audio frame */
                    continue;
                }
@ -1592,11 +1592,11 @@ static int output_packet(AVInputStream *ist, int ist_index,
                    pkt_pts = AV_NOPTS_VALUE;

                    ret = avcodec_decode_video2(ist->st->codec,
-                                                &picture, &got_picture, &avpkt);
+                                                &picture, &got_output, &avpkt);
                    ist->st->quality= picture.quality;
                    if (ret < 0)
                        goto fail_decode;
-                    if (!got_picture) {
+                    if (!got_output) {
                        /* no picture yet */
                        goto discard_packet;
                    }
@ -1613,10 +1613,10 @@ static int output_packet(AVInputStream *ist, int ist_index,
                    break;
            case AVMEDIA_TYPE_SUBTITLE:
                ret = avcodec_decode_subtitle2(ist->st->codec,
-                                               &subtitle, &got_picture, &avpkt);
+                                               &subtitle, &got_output, &avpkt);
                if (ret < 0)
                    goto fail_decode;
-                if (!got_picture) {
+                if (!got_output) {
                    goto discard_packet;
                }
                subtitle_to_free = &subtitle;
--- a/libavcodec/alacenc.c
+++ b/libavcodec/alacenc.c
@ -146,7 +146,7 @@ static void calc_predictor_params(AlacEncodeContext *s, int ch)
                                      s->min_prediction_order,
                                      s->max_prediction_order,
                                      ALAC_MAX_LPC_PRECISION, coefs, shift,
-                                      AV_LPC_TYPE_LEVINSON, 0,
+                                      FF_LPC_TYPE_LEVINSON, 0,
                                      ORDER_METHOD_EST, ALAC_MAX_LPC_SHIFT, 1);

        s->lpc[ch].lpc_order = opt_order;
@ -457,7 +457,7 @@ static av_cold int alac_encode_init(AVCodecContext *avctx)

    s->avctx = avctx;
    ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size, s->max_prediction_order,
-                      AV_LPC_TYPE_LEVINSON);
+                      FF_LPC_TYPE_LEVINSON);

    return ret;
 }
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/dsputil_alpha.c
@ -270,9 +270,9 @@ static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,

 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
--- a/libavcodec/arm/dsputil_init_arm.c
+++ b/libavcodec/arm/dsputil_init_arm.c
@ -75,7 +75,7 @@ static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block)

 void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    ff_put_pixels_clamped = c->put_pixels_clamped;
    ff_add_pixels_clamped = c->add_pixels_clamped;
@ -97,7 +97,7 @@ void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)

    c->add_pixels_clamped = ff_add_pixels_clamped_arm;

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@ -72,7 +72,7 @@ int ff_pix_sum_armv6(uint8_t *pix, int line_size);

 void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx)
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO ||
                           avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) {
@ -82,7 +82,7 @@ void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx)
        c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
    }

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@ -173,7 +173,7 @@ void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,

 void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    if (!avctx->lowres) {
        if (avctx->idct_algo == FF_IDCT_AUTO ||
@ -192,7 +192,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
        }
    }

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->clear_block  = ff_clear_block_neon;
    c->clear_blocks = ff_clear_blocks_neon;

@ -223,7 +223,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;

    if (CONFIG_H264_DECODER) {
-        if (!h264_high_depth) {
+        if (!high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
--- a/libavcodec/arm/dsputil_iwmmxt.c
+++ b/libavcodec/arm/dsputil_iwmmxt.c
@ -155,7 +155,7 @@ static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
 {
    int mm_flags = AV_CPU_FLAG_IWMMXT; /* multimedia extension flags */
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
@ -168,7 +168,7 @@ void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)

    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->clear_blocks = clear_blocks_iwmmxt;

    c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@ -74,7 +74,7 @@ static void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int b
        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
 }

-void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, const int bit_depth)
+void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, int bit_depth)
 {
    if (HAVE_NEON)    ff_h264_pred_init_neon(h, codec_id, bit_depth);
 }
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@ -516,10 +516,11 @@ enum AVChromaLocation{
    AVCHROMA_LOC_NB           , ///< Not part of ABI
 };

+#if FF_API_FLAC_GLOBAL_OPTS
 /**
 * LPC analysis type
 */
-enum AVLPCType {
+attribute_deprecated enum AVLPCType {
    AV_LPC_TYPE_DEFAULT     = -1, ///< use the codec default LPC type
    AV_LPC_TYPE_NONE        =  0, ///< do not use LPC prediction or use all zero coefficients
    AV_LPC_TYPE_FIXED       =  1, ///< fixed LPC coefficients
@ -527,6 +528,7 @@ enum AVLPCType {
    AV_LPC_TYPE_CHOLESKY    =  3, ///< Cholesky factorization
    AV_LPC_TYPE_NB              , ///< Not part of ABI
 };
+#endif

 enum AVAudioServiceType {
    AV_AUDIO_SERVICE_TYPE_MAIN              = 0,
@ -2513,13 +2515,6 @@ typedef struct AVCodecContext {
    int compression_level;
 #define FF_COMPRESSION_DEFAULT -1

-    /**
-     * LPC coefficient precision - used by FLAC encoder
-     * - encoding: Set by user.
-     * - decoding: unused
-     */
-    int lpc_coeff_precision;
-
    /**
     * - encoding: Set by user.
     * - decoding: unused
@ -2532,24 +2527,42 @@ typedef struct AVCodecContext {
     */
    int max_prediction_order;

+#if FF_API_FLAC_GLOBAL_OPTS
+    /**
+     * @defgroup flac_opts FLAC options
+     * @deprecated Use FLAC encoder private options instead.
+     * @{
+     */
+
+    /**
+     * LPC coefficient precision - used by FLAC encoder
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    attribute_deprecated int lpc_coeff_precision;
+
    /**
     * search method for selecting prediction order
     * - encoding: Set by user.
     * - decoding: unused
     */
-    int prediction_order_method;
+    attribute_deprecated int prediction_order_method;

    /**
     * - encoding: Set by user.
     * - decoding: unused
     */
-    int min_partition_order;
+    attribute_deprecated int min_partition_order;

    /**
     * - encoding: Set by user.
     * - decoding: unused
     */
-    int max_partition_order;
+    attribute_deprecated int max_partition_order;
+    /**
+     * @}
+     */
+#endif

    /**
     * GOP timecode frame start number, in non drop frame format
@ -2767,19 +2780,21 @@ typedef struct AVCodecContext {

    int log_level_offset;

+#if FF_API_FLAC_GLOBAL_OPTS
    /**
     * Determines which LPC analysis algorithm to use.
     * - encoding: Set by user
     * - decoding: unused
     */
-    enum AVLPCType lpc_type;
+    attribute_deprecated enum AVLPCType lpc_type;

    /**
     * Number of passes to use for Cholesky factorization during LPC analysis
     * - encoding: Set by user
     * - decoding: unused
     */
-    int lpc_passes;
+    attribute_deprecated int lpc_passes;
+#endif

    /**
     * Number of slices.
--- a/libavcodec/bfin/dsputil_bfin.c
+++ b/libavcodec/bfin/dsputil_bfin.c
@ -197,14 +197,14 @@ static int bfin_pix_abs8_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_si

 void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    c->get_pixels         = ff_bfin_get_pixels;
    c->diff_pixels        = ff_bfin_diff_pixels;
    c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
    c->add_pixels_clamped = ff_bfin_add_pixels_clamped;

-    if (!h264_high_depth)
+    if (!high_bit_depth)
    c->clear_blocks       = bfin_clear_blocks;
    c->pix_sum            = ff_bfin_pix_sum;
    c->pix_norm1          = ff_bfin_pix_norm1;
@ -231,7 +231,7 @@ void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
    c->sse[1] = ff_bfin_sse8;
    c->sse[2] = ff_bfin_sse4;

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->put_pixels_tab[0][0] = bfin_put_pixels16;
    c->put_pixels_tab[0][1] = bfin_put_pixels16_x2;
    c->put_pixels_tab[0][2] = bfin_put_pixels16_y2;
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@ -43,15 +43,15 @@ uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
 uint32_t ff_squareTbl[512] = {0, };

 #define BIT_DEPTH 9
-#include "dsputil_internal.h"
+#include "dsputil_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 10
-#include "dsputil_internal.h"
+#include "dsputil_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 8
-#include "dsputil_internal.h"
+#include "dsputil_template.c"

 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
 #define pb_7f (~0UL/255 * 0x7f)
--- a/libavcodec/dsputil_template.c
+++ b/libavcodec/dsputil_template.c
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@ -21,6 +21,7 @@

 #include "libavutil/crc.h"
 #include "libavutil/md5.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "golomb.h"
@ -43,7 +44,7 @@
 typedef struct CompressionOptions {
    int compression_level;
    int block_time_ms;
-    enum AVLPCType lpc_type;
+    enum FFLPCType lpc_type;
    int lpc_passes;
    int lpc_coeff_precision;
    int min_prediction_order;
@ -80,6 +81,7 @@ typedef struct FlacFrame {
 } FlacFrame;

 typedef struct FlacEncodeContext {
+    AVClass *class;
    PutBitContext pb;
    int channels;
    int samplerate;
@ -156,16 +158,16 @@ static av_cold void dprint_compression_options(FlacEncodeContext *s)
    av_log(avctx, AV_LOG_DEBUG, " compression: %d\n", opt->compression_level);

    switch (opt->lpc_type) {
-    case AV_LPC_TYPE_NONE:
+    case FF_LPC_TYPE_NONE:
        av_log(avctx, AV_LOG_DEBUG, " lpc type: None\n");
        break;
-    case AV_LPC_TYPE_FIXED:
+    case FF_LPC_TYPE_FIXED:
        av_log(avctx, AV_LOG_DEBUG, " lpc type: Fixed pre-defined coefficients\n");
        break;
-    case AV_LPC_TYPE_LEVINSON:
+    case FF_LPC_TYPE_LEVINSON:
        av_log(avctx, AV_LOG_DEBUG, " lpc type: Levinson-Durbin recursion with Welch window\n");
        break;
-    case AV_LPC_TYPE_CHOLESKY:
+    case FF_LPC_TYPE_CHOLESKY:
        av_log(avctx, AV_LOG_DEBUG, " lpc type: Cholesky factorization, %d pass%s\n",
               opt->lpc_passes, opt->lpc_passes == 1 ? "" : "es");
        break;
@ -266,32 +268,42 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)

    s->options.block_time_ms = ((int[]){ 27, 27, 27,105,105,105,105,105,105,105,105,105,105})[level];

-    s->options.lpc_type      = ((int[]){ AV_LPC_TYPE_FIXED,    AV_LPC_TYPE_FIXED,    AV_LPC_TYPE_FIXED,
-                                         AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON,
-                                         AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON,
-                                         AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON, AV_LPC_TYPE_LEVINSON,
-                                         AV_LPC_TYPE_LEVINSON})[level];
+    if (s->options.lpc_type == FF_LPC_TYPE_DEFAULT)
+        s->options.lpc_type  = ((int[]){ FF_LPC_TYPE_FIXED,    FF_LPC_TYPE_FIXED,    FF_LPC_TYPE_FIXED,
+                                         FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON,
+                                         FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON,
+                                         FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON, FF_LPC_TYPE_LEVINSON,
+                                         FF_LPC_TYPE_LEVINSON})[level];

    s->options.min_prediction_order = ((int[]){  2,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1})[level];
    s->options.max_prediction_order = ((int[]){  3,  4,  4,  6,  8,  8,  8,  8, 12, 12, 12, 32, 32})[level];

-    s->options.prediction_order_method = ((int[]){ ORDER_METHOD_EST,    ORDER_METHOD_EST,    ORDER_METHOD_EST,
-                                                   ORDER_METHOD_EST,    ORDER_METHOD_EST,    ORDER_METHOD_EST,
-                                                   ORDER_METHOD_4LEVEL, ORDER_METHOD_LOG,    ORDER_METHOD_4LEVEL,
-                                                   ORDER_METHOD_LOG,    ORDER_METHOD_SEARCH, ORDER_METHOD_LOG,
-                                                   ORDER_METHOD_SEARCH})[level];
+    if (s->options.prediction_order_method < 0)
+        s->options.prediction_order_method = ((int[]){ ORDER_METHOD_EST,    ORDER_METHOD_EST,    ORDER_METHOD_EST,
+                                                       ORDER_METHOD_EST,    ORDER_METHOD_EST,    ORDER_METHOD_EST,
+                                                       ORDER_METHOD_4LEVEL, ORDER_METHOD_LOG,    ORDER_METHOD_4LEVEL,
+                                                       ORDER_METHOD_LOG,    ORDER_METHOD_SEARCH, ORDER_METHOD_LOG,
+                                                       ORDER_METHOD_SEARCH})[level];

-    s->options.min_partition_order = ((int[]){  2,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0})[level];
-    s->options.max_partition_order = ((int[]){  2,  2,  3,  3,  3,  8,  8,  8,  8,  8,  8,  8,  8})[level];
+    if (s->options.min_partition_order > s->options.max_partition_order) {
+        av_log(avctx, AV_LOG_ERROR, "invalid partition orders: min=%d max=%d\n",
+               s->options.min_partition_order, s->options.max_partition_order);
+        return AVERROR(EINVAL);
+    }
+    if (s->options.min_partition_order < 0)
+        s->options.min_partition_order = ((int[]){  2,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0})[level];
+    if (s->options.max_partition_order < 0)
+        s->options.max_partition_order = ((int[]){  2,  2,  3,  3,  3,  8,  8,  8,  8,  8,  8,  8,  8})[level];

    /* set compression option overrides from AVCodecContext */
-    if (avctx->lpc_type > AV_LPC_TYPE_DEFAULT) {
-        if (avctx->lpc_type > AV_LPC_TYPE_CHOLESKY) {
+#if FF_API_FLAC_GLOBAL_OPTS
+    if (avctx->lpc_type > FF_LPC_TYPE_DEFAULT) {
+        if (avctx->lpc_type > FF_LPC_TYPE_CHOLESKY) {
            av_log(avctx, AV_LOG_ERROR, "unknown lpc type: %d\n", avctx->lpc_type);
            return -1;
        }
        s->options.lpc_type = avctx->lpc_type;
-        if (s->options.lpc_type == AV_LPC_TYPE_CHOLESKY) {
+        if (s->options.lpc_type == FF_LPC_TYPE_CHOLESKY) {
            if (avctx->lpc_passes < 0) {
                // default number of passes for Cholesky
                s->options.lpc_passes = 2;
@ -304,11 +316,12 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
            }
        }
    }
+#endif

-    if (s->options.lpc_type == AV_LPC_TYPE_NONE) {
+    if (s->options.lpc_type == FF_LPC_TYPE_NONE) {
        s->options.min_prediction_order = 0;
    } else if (avctx->min_prediction_order >= 0) {
-        if (s->options.lpc_type == AV_LPC_TYPE_FIXED) {
+        if (s->options.lpc_type == FF_LPC_TYPE_FIXED) {
            if (avctx->min_prediction_order > MAX_FIXED_ORDER) {
                av_log(avctx, AV_LOG_ERROR, "invalid min prediction order: %d\n",
                       avctx->min_prediction_order);
@ -322,10 +335,10 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
        }
        s->options.min_prediction_order = avctx->min_prediction_order;
    }
-    if (s->options.lpc_type == AV_LPC_TYPE_NONE) {
+    if (s->options.lpc_type == FF_LPC_TYPE_NONE) {
        s->options.max_prediction_order = 0;
    } else if (avctx->max_prediction_order >= 0) {
-        if (s->options.lpc_type == AV_LPC_TYPE_FIXED) {
+        if (s->options.lpc_type == FF_LPC_TYPE_FIXED) {
            if (avctx->max_prediction_order > MAX_FIXED_ORDER) {
                av_log(avctx, AV_LOG_ERROR, "invalid max prediction order: %d\n",
                       avctx->max_prediction_order);
@ -345,6 +358,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
        return -1;
    }

+#if FF_API_FLAC_GLOBAL_OPTS
    if (avctx->prediction_order_method >= 0) {
        if (avctx->prediction_order_method > ORDER_METHOD_LOG) {
            av_log(avctx, AV_LOG_ERROR, "invalid prediction order method: %d\n",
@ -375,6 +389,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
               s->options.min_partition_order, s->options.max_partition_order);
        return -1;
    }
+#endif

    if (avctx->frame_size > 0) {
        if (avctx->frame_size < FLAC_MIN_BLOCKSIZE ||
@ -388,6 +403,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
    }
    s->max_blocksize = s->avctx->frame_size;

+#if FF_API_FLAC_GLOBAL_OPTS
    /* set LPC precision */
    if (avctx->lpc_coeff_precision > 0) {
        if (avctx->lpc_coeff_precision > MAX_LPC_PRECISION) {
@ -396,10 +412,8 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
            return -1;
        }
        s->options.lpc_coeff_precision = avctx->lpc_coeff_precision;
-    } else {
-        /* default LPC precision */
-        s->options.lpc_coeff_precision = 15;
    }
+#endif

    /* set maximum encoded frame size in verbatim mode */
    s->max_framesize = ff_flac_get_max_frame_size(s->avctx->frame_size,
@ -448,7 +462,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
    }

    ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size,
-                      s->options.max_prediction_order, AV_LPC_TYPE_LEVINSON);
+                      s->options.max_prediction_order, FF_LPC_TYPE_LEVINSON);

    dprint_compression_options(s);

@ -889,8 +903,8 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)

    /* FIXED */
    sub->type = FLAC_SUBFRAME_FIXED;
-    if (s->options.lpc_type == AV_LPC_TYPE_NONE  ||
-        s->options.lpc_type == AV_LPC_TYPE_FIXED || n <= max_order) {
+    if (s->options.lpc_type == FF_LPC_TYPE_NONE  ||
+        s->options.lpc_type == FF_LPC_TYPE_FIXED || n <= max_order) {
        uint32_t bits[MAX_FIXED_ORDER+1];
        if (max_order > MAX_FIXED_ORDER)
            max_order = MAX_FIXED_ORDER;
@ -1336,6 +1350,33 @@ static av_cold int flac_encode_close(AVCodecContext *avctx)
    return 0;
 }

+#define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+static const AVOption options[] = {
+{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, 15, 0, MAX_LPC_PRECISION, FLAGS },
+{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" },
+{ "none",     NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_NONE,     INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "fixed",    NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_FIXED,    INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_LEVINSON, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_CHOLESKY, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
+{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, FLAGS },
+{ "min_partition_order",  NULL, offsetof(FlacEncodeContext, options.min_partition_order),  FF_OPT_TYPE_INT, -1,      -1, MAX_PARTITION_ORDER, FLAGS },
+{ "max_partition_order",  NULL, offsetof(FlacEncodeContext, options.max_partition_order),  FF_OPT_TYPE_INT, -1,      -1, MAX_PARTITION_ORDER, FLAGS },
+{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, -1, -1, ORDER_METHOD_LOG, FLAGS, "predm" },
+{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_EST,    INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "2level",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_2LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "4level",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_4LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "8level",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_8LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "search",     NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_SEARCH, INT_MIN, INT_MAX, FLAGS, "predm" },
+{ "log",        NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_LOG,    INT_MIN, INT_MAX, FLAGS, "predm" },
+{ NULL },
+};
+
+static const AVClass flac_encoder_class = {
+    "FLAC encoder",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};

 AVCodec ff_flac_encoder = {
    "flac",
@ -1349,4 +1390,5 @@ AVCodec ff_flac_encoder = {
    .capabilities = CODEC_CAP_SMALL_LAST_FRAME | CODEC_CAP_DELAY,
    .sample_fmts = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,AV_SAMPLE_FMT_NONE},
    .long_name = NULL_IF_CONFIG_SMALL("FLAC (Free Lossless Audio Codec)"),
+    .priv_class = &flac_encoder_class,
 };
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@ -45,11 +45,11 @@
 //#undef NDEBUG
 #include <assert.h>

-static const uint8_t rem6[QP_MAX_MAX+1]={
+static const uint8_t rem6[QP_MAX_NUM+1]={
 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
 };

-static const uint8_t div6[QP_MAX_MAX+1]={
+static const uint8_t div6[QP_MAX_NUM+1]={
 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,10,10,10,10,
 };

@ -586,6 +586,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx){

    h->sps.bit_depth_luma = avctx->bits_per_raw_sample = 8;
    h->pixel_shift = 0;
+    h->sps.bit_depth_luma = avctx->bits_per_raw_sample = 8;

    h->thread_context[0] = h;
    h->outputed_poc = h->next_outputed_poc = INT_MIN;
@ -733,6 +734,7 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex
 int ff_h264_frame_start(H264Context *h){
    MpegEncContext * const s = &h->s;
    int i;
+    const int pixel_shift = h->pixel_shift;

    if(MPV_frame_start(s, s->avctx) < 0)
        return -1;
@ -749,14 +751,14 @@ int ff_h264_frame_start(H264Context *h){
    assert(s->linesize && s->uvlinesize);

    for(i=0; i<16; i++){
-        h->block_offset[i]= (4*((scan8[i] - scan8[0])&7)<<h->pixel_shift) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
-        h->block_offset[24+i]= (4*((scan8[i] - scan8[0])&7)<<h->pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
    }
    for(i=0; i<4; i++){
        h->block_offset[16+i]=
-        h->block_offset[20+i]= (4*((scan8[i] - scan8[0])&7)<<h->pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
        h->block_offset[24+16+i]=
-        h->block_offset[24+20+i]= (4*((scan8[i] - scan8[0])&7)<<h->pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
    }

    /* can't be in alloc_tables because linesize isn't known there.
@ -948,6 +950,7 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
    MpegEncContext * const s = &h->s;
    uint8_t *top_border;
    int top_idx = 1;
+    const int pixel_shift = h->pixel_shift;

    src_y  -=   linesize;
    src_cb -= uvlinesize;
@ -958,10 +961,10 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
            if(!MB_MBAFF){
                top_border = h->top_borders[0][s->mb_x];
                AV_COPY128(top_border, src_y + 15*linesize);
-                if (h->pixel_shift)
+                if (pixel_shift)
                    AV_COPY128(top_border+16, src_y+15*linesize+16);
                if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-                    if (h->pixel_shift) {
+                    if (pixel_shift) {
                        AV_COPY128(top_border+32, src_cb+7*uvlinesize);
                        AV_COPY128(top_border+48, src_cr+7*uvlinesize);
                    } else {
@ -980,11 +983,11 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
    // There are two lines saved, the line above the the top macroblock of a pair,
    // and the line above the bottom macroblock
    AV_COPY128(top_border, src_y + 16*linesize);
-    if (h->pixel_shift)
+    if (pixel_shift)
        AV_COPY128(top_border+16, src_y+16*linesize+16);

    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
-        if (h->pixel_shift) {
+        if (pixel_shift) {
            AV_COPY128(top_border+32, src_cb+8*uvlinesize);
            AV_COPY128(top_border+48, src_cr+8*uvlinesize);
        } else {
@ -994,7 +997,10 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
    }
 }

-static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple, int pixel_shift){
+static inline void xchg_mb_border(H264Context *h, uint8_t *src_y,
+                                  uint8_t *src_cb, uint8_t *src_cr,
+                                  int linesize, int uvlinesize,
+                                  int xchg, int simple, int pixel_shift){
    MpegEncContext * const s = &h->s;
    int deblock_left;
    int deblock_top;
@ -1040,38 +1046,38 @@ else      AV_COPY64(b,a);

    if(deblock_top){
        if(deblock_left){
-            XCHG(top_border_m1+(8<<pixel_shift), src_y -(7<<h->pixel_shift), 1);
+            XCHG(top_border_m1 + (8 << pixel_shift), src_y - (7 << pixel_shift), 1);
        }
-        XCHG(top_border+(0<<pixel_shift), src_y +(1<<pixel_shift), xchg);
-        XCHG(top_border+(8<<pixel_shift), src_y +(9<<pixel_shift), 1);
+        XCHG(top_border + (0 << pixel_shift), src_y + (1 << pixel_shift), xchg);
+        XCHG(top_border + (8 << pixel_shift), src_y + (9 << pixel_shift), 1);
        if(s->mb_x+1 < s->mb_width){
-            XCHG(h->top_borders[top_idx][s->mb_x+1], src_y +(17<<pixel_shift), 1);
+            XCHG(h->top_borders[top_idx][s->mb_x+1], src_y + (17 << pixel_shift), 1);
        }
    }
    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_top){
            if(deblock_left){
-                XCHG(top_border_m1+(16<<pixel_shift), src_cb -(7<<pixel_shift), 1);
-                XCHG(top_border_m1+(24<<pixel_shift), src_cr -(7<<pixel_shift), 1);
+                XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
+                XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
            }
-            XCHG(top_border+(16<<pixel_shift), src_cb+1+pixel_shift, 1);
-            XCHG(top_border+(24<<pixel_shift), src_cr+1+pixel_shift, 1);
+            XCHG(top_border + (16 << pixel_shift), src_cb+1+pixel_shift, 1);
+            XCHG(top_border + (24 << pixel_shift), src_cr+1+pixel_shift, 1);
        }
    }
 }

-static av_always_inline int dctcoef_get(H264Context *h, DCTELEM *mb, int index, int pixel_shift) {
-    if (!pixel_shift)
-        return mb[index];
-    else
-        return ((int32_t*)mb)[index];
+static av_always_inline int dctcoef_get(DCTELEM *mb, int high_bit_depth, int index) {
+    if (high_bit_depth) {
+        return AV_RN32A(((int32_t*)mb) + index);
+    } else
+        return AV_RN16A(mb + index);
 }

-static av_always_inline void dctcoef_set(H264Context *h, DCTELEM *mb, int index, int value, int pixel_shift) {
-    if (!pixel_shift)
-        mb[index] = value;
-    else
-        ((int32_t*)mb)[index] = value;
+static av_always_inline void dctcoef_set(DCTELEM *mb, int high_bit_depth, int index, int value) {
+    if (high_bit_depth) {
+        AV_WN32A(((int32_t*)mb) + index, value);
+    } else
+        AV_WN16A(mb + index, value);
 }

 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
@ -1090,12 +1096,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

-    dest_y  = s->current_picture.data[0] + ((mb_x<<pixel_shift) + mb_y * s->linesize  ) * 16;
-    dest_cb = s->current_picture.data[1] + ((mb_x<<pixel_shift) + mb_y * s->uvlinesize) * 8;
-    dest_cr = s->current_picture.data[2] + ((mb_x<<pixel_shift) + mb_y * s->uvlinesize) * 8;
+    dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
+    dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
+    dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;

-    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + (64<<pixel_shift), s->linesize, 4);
-    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + (64<<pixel_shift), dest_cr - dest_cb, 2);
+    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4);
+    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2);

    h->list_counts[mb_xy]= h->list_count;

@ -1186,16 +1192,16 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                                h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16<<pixel_shift), linesize);
+                                h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
-                                    if(nnz == 1 && dctcoef_get(h, h->mb, i*16, pixel_shift))
-                                        idct_dc_add(ptr, h->mb + (i*16<<pixel_shift), linesize);
+                                    if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
+                                        idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
                                    else
-                                        idct_add   (ptr, h->mb + (i*16<<pixel_shift), linesize);
+                                        idct_add   (ptr, h->mb + (i*16 << pixel_shift), linesize);
                                }
                            }
                        }
@ -1212,7 +1218,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
-                                h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16<<pixel_shift), linesize);
+                                h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
@ -1229,7 +1235,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                                        topright= (uint8_t*) &tr;
                                        }
                                    }else
-                                        topright= ptr + (4<<pixel_shift) - linesize;
+                                        topright= ptr + (4 << pixel_shift) - linesize;
                                }else
                                    topright= NULL;

@ -1237,8 +1243,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
-                                        if(nnz == 1 && dctcoef_get(h, h->mb, i*16, pixel_shift))
-                                            idct_dc_add(ptr, h->mb + (i*16<<pixel_shift), linesize);
+                                        if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
+                                            idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
                                        else
                                            idct_add   (ptr, h->mb + (i*16<<pixel_shift), linesize);
                                    }
@ -1261,7 +1267,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                            static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
                                                                    8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
                            for(i = 0; i < 16; i++)
-                                dctcoef_set(h, h->mb, dc_mapping[i], dctcoef_get(h, h->mb_luma_dc, i,pixel_shift),pixel_shift);
+                                dctcoef_set(h->mb, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc, pixel_shift, i));
                        }
                    }
                }
@ -1288,8 +1294,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
-                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h, h->mb, i*16,pixel_shift))
-                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16<<pixel_shift), linesize);
+                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
                            }
                        }
                    }else{
@ -1301,7 +1307,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
-                                idct_add(dest_y + block_offset[i], h->mb + (i*16<<pixel_shift), linesize);
+                                idct_add(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
                            }
                        }
                    }else{
@ -1329,21 +1335,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16<<pixel_shift), uvlinesize);
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + (20*16<<pixel_shift), uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16 << pixel_shift), uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + (20*16 << pixel_shift), uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
-                        if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h, h->mb, i*16,pixel_shift))
-                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + (i*16<<pixel_shift), uvlinesize);
+                        if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
                    }
                }
            }else{
                if(is_h264){
                    if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16<<pixel_shift)       , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16 << pixel_shift)       , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                    if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + ((16*16+4*16)<<pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + ((16*16+4*16) << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                    h->h264dsp.h264_idct_add8(dest, block_offset,
                                              h->mb, uvlinesize,
                                              h->non_zero_count_cache);
@ -1370,9 +1376,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
 /**
 * Process a macroblock; this case avoids checks for expensive uncommon cases.
 */
-static void hl_decode_mb_simple8(H264Context *h){
-    hl_decode_mb_internal(h, 1, 0);
+#define hl_decode_mb_simple(sh, bits) \
+static void hl_decode_mb_simple_ ## bits(H264Context *h){ \
+    hl_decode_mb_internal(h, 1, sh); \
 }
+hl_decode_mb_simple(0, 8);
+hl_decode_mb_simple(1, 16);

 /**
 * Process a macroblock; this handles edge cases, such as interlacing.
@ -1387,11 +1396,12 @@ void ff_h264_hl_decode_mb(H264Context *h){
    const int mb_type= s->current_picture.mb_type[mb_xy];
    int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;

-    if (is_complex || h->pixel_shift)
+    if (is_complex) {
        hl_decode_mb_complex(h);
-    else{
-        hl_decode_mb_simple8(h);
-    }
+    } else if (h->pixel_shift) {
+        hl_decode_mb_simple_16(h);
+    } else
+        hl_decode_mb_simple_8(h);
 }

 static int pred_weight_table(H264Context *h){
@ -2557,6 +2567,7 @@ static void loop_filter(H264Context *h){
    const int end_mb_y= s->mb_y + FRAME_MBAFF;
    const int old_slice_type= h->slice_type;
    const int end_mb_x  = s->mb_x;
+    const int pixel_shift = h->pixel_shift;

    if(h->deblocking_filter) {
        int start_x= s->resync_mb_y == s->mb_y ? s->resync_mb_x : 0;
@ -2573,9 +2584,9 @@ static void loop_filter(H264Context *h){

                s->mb_x= mb_x;
                s->mb_y= mb_y;
-                dest_y  = s->current_picture.data[0] + ((mb_x<<h->pixel_shift) + mb_y * s->linesize  ) * 16;
-                dest_cb = s->current_picture.data[1] + ((mb_x<<h->pixel_shift) + mb_y * s->uvlinesize) * 8;
-                dest_cr = s->current_picture.data[2] + ((mb_x<<h->pixel_shift) + mb_y * s->uvlinesize) * 8;
+                dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
+                dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
+                dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
                    //FIXME simplify above

                if (MB_FIELD) {
@ -3060,7 +3071,7 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
            if (avctx->bits_per_raw_sample != h->sps.bit_depth_luma) {
                if (h->sps.bit_depth_luma >= 8 && h->sps.bit_depth_luma <= 10) {
                    avctx->bits_per_raw_sample = h->sps.bit_depth_luma;
-                    h->pixel_shift = h->sps.bit_depth_luma/9;
+                    h->pixel_shift = h->sps.bit_depth_luma > 8;

                    ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma);
                    ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma);
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@ -108,7 +108,7 @@
 */
 #define DELAYED_PIC_REF 4

-#define QP_MAX_MAX (51 + 2*6)           // The maximum supported qp
+#define QP_MAX_NUM (51 + 2*6)           // The maximum supported qp

 /* NAL unit types */
 enum {
@ -266,7 +266,7 @@ typedef struct MMCO{
 typedef struct H264Context{
    MpegEncContext s;
    H264DSPContext h264dsp;
-    int pixel_shift;
+    int pixel_shift;    ///< 0 for 8-bit H264, 1 for high-bit-depth H264
    int chroma_qp[2]; //QPc

    int qp_thresh;      ///< QP threshold to skip loopfilter
@ -355,8 +355,8 @@ typedef struct H264Context{
     */
    PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?

-    uint32_t dequant4_buffer[6][QP_MAX_MAX+1][16]; //FIXME should these be moved down?
-    uint32_t dequant8_buffer[2][QP_MAX_MAX+1][64];
+    uint32_t dequant4_buffer[6][QP_MAX_NUM+1][16]; //FIXME should these be moved down?
+    uint32_t dequant8_buffer[2][QP_MAX_NUM+1][64];
    uint32_t (*dequant4_coeff[6])[16];
    uint32_t (*dequant8_coeff[2])[64];

@ -597,7 +597,7 @@ typedef struct H264Context{
 }H264Context;


-extern const uint8_t ff_h264_chroma_qp[3][QP_MAX_MAX+1]; ///< One chroma qp table for each supported bit depth (8, 9, 10).
+extern const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM+1]; ///< One chroma qp table for each supported bit depth (8, 9, 10).

 /**
 * Decode SEI
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@ -1103,10 +1103,11 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT


 #define STORE_BLOCK(type) \
-    do {\
-        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;\
-\
-        int j= scantable[index[--coeff_count]];\
+    do { \
+        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; \
+ \
+        int j= scantable[index[--coeff_count]]; \
+ \
        if( get_cabac( CC, ctx ) == 0 ) { \
            node_ctx = coeff_abs_level_transition[0][node_ctx]; \
            if( is_dc ) { \
@ -1141,8 +1142,8 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
            }else{ \
                ((type*)block)[j] = ((int)(get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32)) >> 6; \
            } \
-        }\
-    } while( coeff_count );
+        } \
+    } while ( coeff_count );

        if (h->pixel_shift) {
            STORE_BLOCK(int32_t)
@ -1204,6 +1205,7 @@ int ff_h264_decode_mb_cabac(H264Context *h) {
    int mb_xy;
    int mb_type, partition_count, cbp = 0;
    int dct8x8_allowed= h->pps.transform_8x8_mode;
+    const int pixel_shift = h->pixel_shift;

    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;

@ -1312,7 +1314,7 @@ decode_intra_mb:
    h->slice_table[ mb_xy ]= h->slice_num;

    if(IS_INTRA_PCM(mb_type)) {
-        const int mb_size = 384*h->sps.bit_depth_luma/8;
+        const int mb_size = (384*h->sps.bit_depth_luma) >> 3;
        const uint8_t *ptr;

        // We assume these blocks are very rare so we do not optimize it.
@ -1670,7 +1672,7 @@ decode_intra_mb:
                qmul = h->dequant4_coeff[0][s->qscale];
                for( i = 0; i < 16; i++ ) {
                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
-                    decode_cabac_residual_nondc(h, h->mb + (16*i<<h->pixel_shift), 1, i, scan + 1, qmul, 15);
+                    decode_cabac_residual_nondc(h, h->mb + (16*i << pixel_shift), 1, i, scan + 1, qmul, 15);
                }
            } else {
                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
@ -1680,7 +1682,7 @@ decode_intra_mb:
            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
                if( cbp & (1<<i8x8) ) {
                    if( IS_8x8DCT(mb_type) ) {
-                        decode_cabac_residual_nondc(h, h->mb + (64*i8x8<<h->pixel_shift), 5, 4*i8x8,
+                        decode_cabac_residual_nondc(h, h->mb + (64*i8x8 << pixel_shift), 5, 4*i8x8,
                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
                    } else {
                        qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
@ -1688,7 +1690,7 @@ decode_intra_mb:
                            const int index = 4*i8x8 + i4x4;
                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
 //START_TIMER
-                            decode_cabac_residual_nondc(h, h->mb + (16*index<<h->pixel_shift), 2, index, scan, qmul, 16);
+                            decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 2, index, scan, qmul, 16);
 //STOP_TIMER("decode_residual")
                        }
                    }
@ -1703,7 +1705,7 @@ decode_intra_mb:
            int c;
            for( c = 0; c < 2; c++ ) {
                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
-                decode_cabac_residual_dc(h, h->mb + ((256 + 16*4*c)<<h->pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
+                decode_cabac_residual_dc(h, h->mb + ((256 + 16*4*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
            }
        }

@ -1714,7 +1716,7 @@ decode_intra_mb:
                for( i = 0; i < 4; i++ ) {
                    const int index = 16 + 4 * c + i;
                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
-                    decode_cabac_residual_nondc(h, h->mb + (16*index<<h->pixel_shift), 4, index, scan + 1, qmul, 15);
+                    decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
                }
            }
        } else {
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@ -542,6 +542,7 @@ int ff_h264_decode_mb_cavlc(H264Context *h){
    int partition_count;
    unsigned int mb_type, cbp;
    int dct8x8_allowed= h->pps.transform_8x8_mode;
+    const int pixel_shift = h->pixel_shift;

    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;

@ -961,7 +962,7 @@ decode_intra_mb:
                for(i8x8=0; i8x8<4; i8x8++){
                    for(i4x4=0; i4x4<4; i4x4++){
                        const int index= i4x4 + 4*i8x8;
-                        if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index<<h->pixel_shift), index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
+                        if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift), index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                            return -1;
                        }
                    }
@ -973,7 +974,7 @@ decode_intra_mb:
            for(i8x8=0; i8x8<4; i8x8++){
                if(cbp & (1<<i8x8)){
                    if(IS_8x8DCT(mb_type)){
-                        DCTELEM *buf = &h->mb[64*i8x8<<h->pixel_shift];
+                        DCTELEM *buf = &h->mb[64*i8x8 << pixel_shift];
                        uint8_t *nnz;
                        for(i4x4=0; i4x4<4; i4x4++){
                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
@ -986,7 +987,7 @@ decode_intra_mb:
                        for(i4x4=0; i4x4<4; i4x4++){
                            const int index= i4x4 + 4*i8x8;

-                            if( decode_residual(h, gb, h->mb + (16*index<<h->pixel_shift), index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
+                            if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
                                return -1;
                            }
                        }
@ -1000,7 +1001,7 @@ decode_intra_mb:

        if(cbp&0x30){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                if( decode_residual(h, gb, h->mb + ((256 + 16*4*chroma_idx)<<h->pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
+                if( decode_residual(h, gb, h->mb + ((256 + 16*4*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
                    return -1;
                }
        }
@ -1010,7 +1011,7 @@ decode_intra_mb:
                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                for(i4x4=0; i4x4<4; i4x4++){
                    const int index= 16 + 4*chroma_idx + i4x4;
-                    if( decode_residual(h, gb, h->mb + (16*index<<h->pixel_shift), index, scan + 1, qmul, 15) < 0){
+                    if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
                        return -1;
                    }
                }
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@ -101,197 +101,92 @@ static const uint8_t tc0_table[52*3][4] = {
 };

 static void av_always_inline filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h) {
-    const int bit_depth = h->sps.bit_depth_luma;
-    const int qp_bd_offset = 6*(bit_depth-8);
+    const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8);
    const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a] << (bit_depth-8);
-    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8);
+    const int alpha = alpha_table[index_a];
+    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset];
    if (alpha ==0 || beta == 0) return;

    if( bS[0] < 4 ) {
        int8_t tc[4];
-        tc[0] = tc0_table[index_a][bS[0]] << (bit_depth-8);
-        tc[1] = tc0_table[index_a][bS[1]] << (bit_depth-8);
-        tc[2] = tc0_table[index_a][bS[2]] << (bit_depth-8);
-        tc[3] = tc0_table[index_a][bS[3]] << (bit_depth-8);
+        tc[0] = tc0_table[index_a][bS[0]];
+        tc[1] = tc0_table[index_a][bS[1]];
+        tc[2] = tc0_table[index_a][bS[2]];
+        tc[3] = tc0_table[index_a][bS[3]];
        h->h264dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
    } else {
        h->h264dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
    }
 }
 static void av_always_inline filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
-    const int bit_depth = h->sps.bit_depth_luma;
-    const int qp_bd_offset = 6*(bit_depth-8);
+    const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8);
    const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a] << (bit_depth-8);
-    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8);
+    const int alpha = alpha_table[index_a];
+    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset];
    if (alpha ==0 || beta == 0) return;

    if( bS[0] < 4 ) {
        int8_t tc[4];
-        tc[0] = (tc0_table[index_a][bS[0]] << (bit_depth-8))+1;
-        tc[1] = (tc0_table[index_a][bS[1]] << (bit_depth-8))+1;
-        tc[2] = (tc0_table[index_a][bS[2]] << (bit_depth-8))+1;
-        tc[3] = (tc0_table[index_a][bS[3]] << (bit_depth-8))+1;
+        tc[0] = tc0_table[index_a][bS[0]]+1;
+        tc[1] = tc0_table[index_a][bS[1]]+1;
+        tc[2] = tc0_table[index_a][bS[2]]+1;
+        tc[3] = tc0_table[index_a][bS[3]]+1;
        h->h264dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
    } else {
        h->h264dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
    }
 }

-static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int bsi, int qp ) {
-    int i;
-    const int bit_depth = h->sps.bit_depth_luma;
-    const int qp_bd_offset = 6*(bit_depth-8);
+static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[7], int bsi, int qp ) {
+    const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8);
    int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
-    int alpha = alpha_table[index_a] << (bit_depth-8);
-    int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8);
-    for( i = 0; i < 8; i++, pix += stride) {
-        const int bS_index = (i >> 1) * bsi;
-
-        if( bS[bS_index] == 0 ) {
-            continue;
-        }
-
-        if( bS[bS_index] < 4 ) {
-            const int tc0 = tc0_table[index_a][bS[bS_index]] << (bit_depth-8);
-            const int p0 = pix[-1];
-            const int p1 = pix[-2];
-            const int p2 = pix[-3];
-            const int q0 = pix[0];
-            const int q1 = pix[1];
-            const int q2 = pix[2];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-                int tc = tc0;
-                int i_delta;
-
-                if( FFABS( p2 - p0 ) < beta ) {
-                    if(tc0)
-                    pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
-                    tc++;
-                }
-                if( FFABS( q2 - q0 ) < beta ) {
-                    if(tc0)
-                    pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
-                    tc++;
-                }
-
-                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-                pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
-                pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
-                tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
-            }
-        }else{
-            const int p0 = pix[-1];
-            const int p1 = pix[-2];
-            const int p2 = pix[-3];
-
-            const int q0 = pix[0];
-            const int q1 = pix[1];
-            const int q2 = pix[2];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
-                    if( FFABS( p2 - p0 ) < beta)
-                    {
-                        const int p3 = pix[-4];
-                        /* p0', p1', p2' */
-                        pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
-                        pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
-                        pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
-                    } else {
-                        /* p0' */
-                        pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                    }
-                    if( FFABS( q2 - q0 ) < beta)
-                    {
-                        const int q3 = pix[3];
-                        /* q0', q1', q2' */
-                        pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
-                        pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
-                        pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
-                    } else {
-                        /* q0' */
-                        pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-                    }
-                }else{
-                    /* p0', q0' */
-                    pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
-                    pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
-                }
-                tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
-            }
-        }
-    }
-}
-static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int bsi, int qp ) {
-    int i;
-    const int bit_depth = h->sps.bit_depth_luma;
-    const int qp_bd_offset = 6*(bit_depth-8);
-    int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
-    int alpha = alpha_table[index_a] << (bit_depth-8);
-    int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8);
-    for( i = 0; i < 4; i++, pix += stride) {
-        const int bS_index = i*bsi;
-
-        if( bS[bS_index] == 0 ) {
-            continue;
-        }
-
-        if( bS[bS_index] < 4 ) {
-            const int tc = (tc0_table[index_a][bS[bS_index]] << (bit_depth-8)) + 1;
-            const int p0 = pix[-1];
-            const int p1 = pix[-2];
-            const int q0 = pix[0];
-            const int q1 = pix[1];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-                const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
-
-                pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
-                pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
-                tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
-            }
-        }else{
-            const int p0 = pix[-1];
-            const int p1 = pix[-2];
-            const int q0 = pix[0];
-            const int q1 = pix[1];
-
-            if( FFABS( p0 - q0 ) < alpha &&
-                FFABS( p1 - p0 ) < beta &&
-                FFABS( q1 - q0 ) < beta ) {
-
-                pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
-                pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
-                tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
-            }
-        }
-    }
-}
-
-static void av_always_inline filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
-    const int bit_depth = h->sps.bit_depth_luma;
-    const int qp_bd_offset = 6*(bit_depth-8);
-    const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a] << (bit_depth-8);
-    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8);
+    int alpha = alpha_table[index_a];
+    int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset];
    if (alpha ==0 || beta == 0) return;

    if( bS[0] < 4 ) {
        int8_t tc[4];
-        tc[0] = tc0_table[index_a][bS[0]] << (bit_depth-8);
-        tc[1] = tc0_table[index_a][bS[1]] << (bit_depth-8);
-        tc[2] = tc0_table[index_a][bS[2]] << (bit_depth-8);
-        tc[3] = tc0_table[index_a][bS[3]] << (bit_depth-8);
+        tc[0] = tc0_table[index_a][bS[0*bsi]];
+        tc[1] = tc0_table[index_a][bS[1*bsi]];
+        tc[2] = tc0_table[index_a][bS[2*bsi]];
+        tc[3] = tc0_table[index_a][bS[3*bsi]];
+        h->h264dsp.h264_h_loop_filter_luma_mbaff(pix, stride, alpha, beta, tc);
+    } else {
+        h->h264dsp.h264_h_loop_filter_luma_mbaff_intra(pix, stride, alpha, beta);
+    }
+}
+static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[7], int bsi, int qp ) {
+    const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8);
+    int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
+    int alpha = alpha_table[index_a];
+    int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset];
+    if (alpha ==0 || beta == 0) return;
+
+    if( bS[0] < 4 ) {
+        int8_t tc[4];
+        tc[0] = tc0_table[index_a][bS[0*bsi]] + 1;
+        tc[1] = tc0_table[index_a][bS[1*bsi]] + 1;
+        tc[2] = tc0_table[index_a][bS[2*bsi]] + 1;
+        tc[3] = tc0_table[index_a][bS[3*bsi]] + 1;
+        h->h264dsp.h264_h_loop_filter_chroma_mbaff(pix, stride, alpha, beta, tc);
+    } else {
+        h->h264dsp.h264_h_loop_filter_chroma_mbaff_intra(pix, stride, alpha, beta);
+    }
+}
+
+static void av_always_inline filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
+    const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8);
+    const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
+    const int alpha = alpha_table[index_a];
+    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset];
+    if (alpha ==0 || beta == 0) return;
+
+    if( bS[0] < 4 ) {
+        int8_t tc[4];
+        tc[0] = tc0_table[index_a][bS[0]];
+        tc[1] = tc0_table[index_a][bS[1]];
+        tc[2] = tc0_table[index_a][bS[2]];
+        tc[3] = tc0_table[index_a][bS[3]];
        h->h264dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
    } else {
        h->h264dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
@ -299,19 +194,18 @@ static void av_always_inline filter_mb_edgeh( uint8_t *pix, int stride, int16_t
 }

 static void av_always_inline filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, H264Context *h ) {
-    const int bit_depth = h->sps.bit_depth_luma;
-    const int qp_bd_offset = 6*(bit_depth-8);
+    const int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8);
    const unsigned int index_a = qp - qp_bd_offset + h->slice_alpha_c0_offset;
-    const int alpha = alpha_table[index_a] << (bit_depth-8);
-    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset] << (bit_depth-8);
+    const int alpha = alpha_table[index_a];
+    const int beta  = beta_table[qp - qp_bd_offset + h->slice_beta_offset];
    if (alpha ==0 || beta == 0) return;

    if( bS[0] < 4 ) {
        int8_t tc[4];
-        tc[0] = (tc0_table[index_a][bS[0]] << (bit_depth-8))+1;
-        tc[1] = (tc0_table[index_a][bS[1]] << (bit_depth-8))+1;
-        tc[2] = (tc0_table[index_a][bS[2]] << (bit_depth-8))+1;
-        tc[3] = (tc0_table[index_a][bS[3]] << (bit_depth-8))+1;
+        tc[0] = tc0_table[index_a][bS[0]]+1;
+        tc[1] = tc0_table[index_a][bS[1]]+1;
+        tc[2] = tc0_table[index_a][bS[2]]+1;
+        tc[3] = tc0_table[index_a][bS[3]]+1;
        h->h264dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
    } else {
        h->h264dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
@ -650,10 +544,10 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
        tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
        //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
        if( dir == 0 ) {
-            filter_mb_edgev( &img_y[4*edge<<h->pixel_shift], linesize, bS, qp, h );
+            filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, h );
            if( (edge&1) == 0 ) {
-                filter_mb_edgecv( &img_cb[2*edge<<h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
-                filter_mb_edgecv( &img_cr[2*edge<<h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
+                filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
+                filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
            }
        } else {
            filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@ -70,7 +70,7 @@ static const AVRational pixel_aspect[17]={
    QP(37,d), QP(37,d), QP(37,d), QP(38,d), QP(38,d), QP(38,d),\
    QP(39,d), QP(39,d), QP(39,d), QP(39,d)

-const uint8_t ff_h264_chroma_qp[3][QP_MAX_MAX+1] = {
+const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM+1] = {
    {
        CHROMA_QP_TABLE_END(8)
    },
--- a/libavcodec/h264_refs.c
+++ b/libavcodec/h264_refs.c
@ -629,8 +629,9 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
         * short_ref and long_ref buffers.
         */
        av_log(h->s.avctx, AV_LOG_ERROR,
-               "number of reference frames exceeds max (probably "
-               "corrupt input), discarding one long:%d short:%d max:%d\n", h->long_ref_count, h->short_ref_count, h->sps.ref_frame_count);
+               "number of reference frames (%d+%d) exceeds max (%d; probably "
+               "corrupt input), discarding one\n",
+               h->long_ref_count, h->short_ref_count, h->sps.ref_frame_count);

        if (h->long_ref_count && !h->short_ref_count) {
            for (i = 0; i < 16; ++i)
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@ -30,15 +30,15 @@
 #include "h264dsp.h"

 #define BIT_DEPTH 8
-#include "h264dsp_internal.h"
+#include "h264dsp_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 9
-#include "h264dsp_internal.h"
+#include "h264dsp_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 10
-#include "h264dsp_internal.h"
+#include "h264dsp_template.c"
 #undef BIT_DEPTH

 void ff_h264dsp_init(H264DSPContext *c, const int bit_depth)
@ -47,58 +47,62 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth)
 #define FUNC(a, depth) a ## _ ## depth ## _c

 #define H264_DSP(depth) \
-    c->h264_idct_add                   = FUNC(ff_h264_idct_add               , depth);\
-    c->h264_idct8_add                  = FUNC(ff_h264_idct8_add              , depth);\
-    c->h264_idct_dc_add                = FUNC(ff_h264_idct_dc_add            , depth);\
-    c->h264_idct8_dc_add               = FUNC(ff_h264_idct8_dc_add           , depth);\
-    c->h264_idct_add16                 = FUNC(ff_h264_idct_add16             , depth);\
-    c->h264_idct8_add4                 = FUNC(ff_h264_idct8_add4             , depth);\
-    c->h264_idct_add8                  = FUNC(ff_h264_idct_add8              , depth);\
-    c->h264_idct_add16intra            = FUNC(ff_h264_idct_add16intra        , depth);\
-    c->h264_luma_dc_dequant_idct       = FUNC(ff_h264_luma_dc_dequant_idct   , depth);\
-    c->h264_chroma_dc_dequant_idct     = FUNC(ff_h264_chroma_dc_dequant_idct , depth);\
+    c->h264_idct_add= FUNC(ff_h264_idct_add, depth);\
+    c->h264_idct8_add= FUNC(ff_h264_idct8_add, depth);\
+    c->h264_idct_dc_add= FUNC(ff_h264_idct_dc_add, depth);\
+    c->h264_idct8_dc_add= FUNC(ff_h264_idct8_dc_add, depth);\
+    c->h264_idct_add16     = FUNC(ff_h264_idct_add16, depth);\
+    c->h264_idct8_add4     = FUNC(ff_h264_idct8_add4, depth);\
+    c->h264_idct_add8      = FUNC(ff_h264_idct_add8, depth);\
+    c->h264_idct_add16intra= FUNC(ff_h264_idct_add16intra, depth);\
+    c->h264_luma_dc_dequant_idct= FUNC(ff_h264_luma_dc_dequant_idct, depth);\
+    c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma_dc_dequant_idct, depth);\
 \
-    c->weight_h264_pixels_tab[0]       = FUNC(  weight_h264_pixels16x16      , depth);\
-    c->weight_h264_pixels_tab[1]       = FUNC(  weight_h264_pixels16x8       , depth);\
-    c->weight_h264_pixels_tab[2]       = FUNC(  weight_h264_pixels8x16       , depth);\
-    c->weight_h264_pixels_tab[3]       = FUNC(  weight_h264_pixels8x8        , depth);\
-    c->weight_h264_pixels_tab[4]       = FUNC(  weight_h264_pixels8x4        , depth);\
-    c->weight_h264_pixels_tab[5]       = FUNC(  weight_h264_pixels4x8        , depth);\
-    c->weight_h264_pixels_tab[6]       = FUNC(  weight_h264_pixels4x4        , depth);\
-    c->weight_h264_pixels_tab[7]       = FUNC(  weight_h264_pixels4x2        , depth);\
-    c->weight_h264_pixels_tab[8]       = FUNC(  weight_h264_pixels2x4        , depth);\
-    c->weight_h264_pixels_tab[9]       = FUNC(  weight_h264_pixels2x2        , depth);\
-    c->biweight_h264_pixels_tab[0]     = FUNC(biweight_h264_pixels16x16      , depth);\
-    c->biweight_h264_pixels_tab[1]     = FUNC(biweight_h264_pixels16x8       , depth);\
-    c->biweight_h264_pixels_tab[2]     = FUNC(biweight_h264_pixels8x16       , depth);\
-    c->biweight_h264_pixels_tab[3]     = FUNC(biweight_h264_pixels8x8        , depth);\
-    c->biweight_h264_pixels_tab[4]     = FUNC(biweight_h264_pixels8x4        , depth);\
-    c->biweight_h264_pixels_tab[5]     = FUNC(biweight_h264_pixels4x8        , depth);\
-    c->biweight_h264_pixels_tab[6]     = FUNC(biweight_h264_pixels4x4        , depth);\
-    c->biweight_h264_pixels_tab[7]     = FUNC(biweight_h264_pixels4x2        , depth);\
-    c->biweight_h264_pixels_tab[8]     = FUNC(biweight_h264_pixels2x4        , depth);\
-    c->biweight_h264_pixels_tab[9]     = FUNC(biweight_h264_pixels2x2        , depth);\
+    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
+    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
+    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
+    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
+    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
+    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
+    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
+    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
+    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
+    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
+    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
+    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
+    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
+    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
+    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
+    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
+    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
+    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
+    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
+    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
 \
-    c->h264_v_loop_filter_luma         = FUNC(h264_v_loop_filter_luma        , depth);\
-    c->h264_h_loop_filter_luma         = FUNC(h264_h_loop_filter_luma        , depth);\
-    c->h264_v_loop_filter_luma_intra   = FUNC(h264_v_loop_filter_luma_intra  , depth);\
-    c->h264_h_loop_filter_luma_intra   = FUNC(h264_h_loop_filter_luma_intra  , depth);\
-    c->h264_v_loop_filter_chroma       = FUNC(h264_v_loop_filter_chroma      , depth);\
-    c->h264_h_loop_filter_chroma       = FUNC(h264_h_loop_filter_chroma      , depth);\
-    c->h264_v_loop_filter_chroma_intra = FUNC(h264_v_loop_filter_chroma_intra, depth);\
-    c->h264_h_loop_filter_chroma_intra = FUNC(h264_h_loop_filter_chroma_intra, depth);\
+    c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
+    c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
+    c->h264_h_loop_filter_luma_mbaff= FUNC(h264_h_loop_filter_luma_mbaff, depth);\
+    c->h264_v_loop_filter_luma_intra= FUNC(h264_v_loop_filter_luma_intra, depth);\
+    c->h264_h_loop_filter_luma_intra= FUNC(h264_h_loop_filter_luma_intra, depth);\
+    c->h264_h_loop_filter_luma_mbaff_intra= FUNC(h264_h_loop_filter_luma_mbaff_intra, depth);\
+    c->h264_v_loop_filter_chroma= FUNC(h264_v_loop_filter_chroma, depth);\
+    c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma, depth);\
+    c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma_mbaff, depth);\
+    c->h264_v_loop_filter_chroma_intra= FUNC(h264_v_loop_filter_chroma_intra, depth);\
+    c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma_intra, depth);\
+    c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma_mbaff_intra, depth);\
    c->h264_loop_filter_strength= NULL;

    switch (bit_depth) {
-        case 9:
-            H264_DSP(9);
-            break;
-        case 10:
-            H264_DSP(10);
-            break;
-        default:
-            H264_DSP(8);
-            break;
+    case 9:
+        H264_DSP(9);
+        break;
+    case 10:
+        H264_DSP(10);
+        break;
+    default:
+        H264_DSP(8);
+        break;
    }

    if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth);
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@ -45,13 +45,17 @@ typedef struct H264DSPContext{
    /* loop filter */
    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_h_loop_filter_luma_mbaff)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
    /* v/h_loop_filter_luma_intra: align 16 */
    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
+    void (*h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta);
    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
+    void (*h264_h_loop_filter_chroma_mbaff)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
+    void (*h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
    // h264_loop_filter_strength: simd only. the C version is inlined in h264.c
    void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                      int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@ -0,0 +1,313 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 DSP functions.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "high_bit_depth.h"
+
+#define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
+#define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
+#define H264_WEIGHT(W,H) \
+static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
+    int y; \
+    pixel *block = (pixel*)_block; \
+    stride /= sizeof(pixel); \
+    offset <<= (log2_denom + (BIT_DEPTH-8)); \
+    if(log2_denom) offset += 1<<(log2_denom-1); \
+    for(y=0; y<H; y++, block += stride){ \
+        op_scale1(0); \
+        op_scale1(1); \
+        if(W==2) continue; \
+        op_scale1(2); \
+        op_scale1(3); \
+        if(W==4) continue; \
+        op_scale1(4); \
+        op_scale1(5); \
+        op_scale1(6); \
+        op_scale1(7); \
+        if(W==8) continue; \
+        op_scale1(8); \
+        op_scale1(9); \
+        op_scale1(10); \
+        op_scale1(11); \
+        op_scale1(12); \
+        op_scale1(13); \
+        op_scale1(14); \
+        op_scale1(15); \
+    } \
+} \
+static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+    int y; \
+    pixel *dst = (pixel*)_dst; \
+    pixel *src = (pixel*)_src; \
+    stride /= sizeof(pixel); \
+    offset = ((offset + 1) | 1) << log2_denom; \
+    for(y=0; y<H; y++, dst += stride, src += stride){ \
+        op_scale2(0); \
+        op_scale2(1); \
+        if(W==2) continue; \
+        op_scale2(2); \
+        op_scale2(3); \
+        if(W==4) continue; \
+        op_scale2(4); \
+        op_scale2(5); \
+        op_scale2(6); \
+        op_scale2(7); \
+        if(W==8) continue; \
+        op_scale2(8); \
+        op_scale2(9); \
+        op_scale2(10); \
+        op_scale2(11); \
+        op_scale2(12); \
+        op_scale2(13); \
+        op_scale2(14); \
+        op_scale2(15); \
+    } \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16,8)
+H264_WEIGHT(8,16)
+H264_WEIGHT(8,8)
+H264_WEIGHT(8,4)
+H264_WEIGHT(4,8)
+H264_WEIGHT(4,4)
+H264_WEIGHT(4,2)
+H264_WEIGHT(2,4)
+H264_WEIGHT(2,2)
+
+#undef op_scale1
+#undef op_scale2
+#undef H264_WEIGHT
+
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+{
+    pixel *pix = (pixel*)_pix;
+    int i, d;
+    xstride /= sizeof(pixel);
+    ystride /= sizeof(pixel);
+    alpha <<= BIT_DEPTH - 8;
+    beta  <<= BIT_DEPTH - 8;
+    for( i = 0; i < 4; i++ ) {
+        const int tc_orig = tc0[i] << (BIT_DEPTH - 8);
+        if( tc_orig < 0 ) {
+            pix += inner_iters*ystride;
+            continue;
+        }
+        for( d = 0; d < inner_iters; d++ ) {
+            const int p0 = pix[-1*xstride];
+            const int p1 = pix[-2*xstride];
+            const int p2 = pix[-3*xstride];
+            const int q0 = pix[0];
+            const int q1 = pix[1*xstride];
+            const int q2 = pix[2*xstride];
+
+            if( FFABS( p0 - q0 ) < alpha &&
+                FFABS( p1 - p0 ) < beta &&
+                FFABS( q1 - q0 ) < beta ) {
+
+                int tc = tc_orig;
+                int i_delta;
+
+                if( FFABS( p2 - p0 ) < beta ) {
+                    if(tc_orig)
+                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc_orig, tc_orig );
+                    tc++;
+                }
+                if( FFABS( q2 - q0 ) < beta ) {
+                    if(tc_orig)
+                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_orig, tc_orig );
+                    tc++;
+                }
+
+                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                pix[-xstride] = av_clip_pixel( p0 + i_delta );    /* p0' */
+                pix[0]        = av_clip_pixel( q0 - i_delta );    /* q0' */
+            }
+            pix += ystride;
+        }
+    }
+}
+static void FUNCC(h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    FUNCC(h264_loop_filter_luma)(pix, stride, sizeof(pixel), 4, alpha, beta, tc0);
+}
+static void FUNCC(h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0);
+}
+static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
+}
+
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+{
+    pixel *pix = (pixel*)_pix;
+    int d;
+    xstride /= sizeof(pixel);
+    ystride /= sizeof(pixel);
+    alpha <<= BIT_DEPTH - 8;
+    beta  <<= BIT_DEPTH - 8;
+    for( d = 0; d < 4 * inner_iters; d++ ) {
+        const int p2 = pix[-3*xstride];
+        const int p1 = pix[-2*xstride];
+        const int p0 = pix[-1*xstride];
+
+        const int q0 = pix[ 0*xstride];
+        const int q1 = pix[ 1*xstride];
+        const int q2 = pix[ 2*xstride];
+
+        if( FFABS( p0 - q0 ) < alpha &&
+            FFABS( p1 - p0 ) < beta &&
+            FFABS( q1 - q0 ) < beta ) {
+
+            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
+                if( FFABS( p2 - p0 ) < beta)
+                {
+                    const int p3 = pix[-4*xstride];
+                    /* p0', p1', p2' */
+                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                } else {
+                    /* p0' */
+                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                }
+                if( FFABS( q2 - q0 ) < beta)
+                {
+                    const int q3 = pix[3*xstride];
+                    /* q0', q1', q2' */
+                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                } else {
+                    /* q0' */
+                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                }
+            }else{
+                /* p0', q0' */
+                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+            }
+        }
+        pix += ystride;
+    }
+}
+static void FUNCC(h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{
+    FUNCC(h264_loop_filter_luma_intra)(pix, stride, sizeof(pixel), 4, alpha, beta);
+}
+static void FUNCC(h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{
+    FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta);
+}
+static void FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{
+    FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
+}
+
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+{
+    pixel *pix = (pixel*)_pix;
+    int i, d;
+    xstride /= sizeof(pixel);
+    ystride /= sizeof(pixel);
+    alpha <<= BIT_DEPTH - 8;
+    beta  <<= BIT_DEPTH - 8;
+    for( i = 0; i < 4; i++ ) {
+        const int tc = ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1;
+        if( tc <= 0 ) {
+            pix += inner_iters*ystride;
+            continue;
+        }
+        for( d = 0; d < inner_iters; d++ ) {
+            const int p0 = pix[-1*xstride];
+            const int p1 = pix[-2*xstride];
+            const int q0 = pix[0];
+            const int q1 = pix[1*xstride];
+
+            if( FFABS( p0 - q0 ) < alpha &&
+                FFABS( p1 - p0 ) < beta &&
+                FFABS( q1 - q0 ) < beta ) {
+
+                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+                pix[-xstride] = av_clip_pixel( p0 + delta );    /* p0' */
+                pix[0]        = av_clip_pixel( q0 - delta );    /* q0' */
+            }
+            pix += ystride;
+        }
+    }
+}
+static void FUNCC(h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    FUNCC(h264_loop_filter_chroma)(pix, stride, sizeof(pixel), 2, alpha, beta, tc0);
+}
+static void FUNCC(h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
+}
+static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 1, alpha, beta, tc0);
+}
+
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+{
+    pixel *pix = (pixel*)_pix;
+    int d;
+    xstride /= sizeof(pixel);
+    ystride /= sizeof(pixel);
+    alpha <<= BIT_DEPTH - 8;
+    beta  <<= BIT_DEPTH - 8;
+    for( d = 0; d < 4 * inner_iters; d++ ) {
+        const int p0 = pix[-1*xstride];
+        const int p1 = pix[-2*xstride];
+        const int q0 = pix[0];
+        const int q1 = pix[1*xstride];
+
+        if( FFABS( p0 - q0 ) < alpha &&
+            FFABS( p1 - p0 ) < beta &&
+            FFABS( q1 - q0 ) < beta ) {
+
+            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+        }
+        pix += ystride;
+    }
+}
+static void FUNCC(h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{
+    FUNCC(h264_loop_filter_chroma_intra)(pix, stride, sizeof(pixel), 2, alpha, beta);
+}
+static void FUNCC(h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{
+    FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
+}
+static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{
+    FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 1, alpha, beta);
+}
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@ -26,13 +26,13 @@
 */

 #define BIT_DEPTH 8
-#include "h264idct_internal.h"
+#include "h264idct_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 9
-#include "h264idct_internal.h"
+#include "h264idct_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 10
-#include "h264idct_internal.h"
+#include "h264idct_template.c"
 #undef BIT_DEPTH
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@ -0,0 +1,291 @@
+/*
+ * H.264 IDCT
+ * Copyright (c) 2004-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 IDCT.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "high_bit_depth.h"
+
+#ifndef AVCODEC_H264IDCT_INTERNAL_H
+#define AVCODEC_H264IDCT_INTERNAL_H
+//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+};
+#endif
+
+static av_always_inline void FUNCC(idct_internal)(uint8_t *_dst, DCTELEM *_block, int stride, int block_stride, int shift, int add){
+    int i;
+    INIT_CLIP
+    pixel *dst = (pixel*)_dst;
+    dctcoef *block = (dctcoef*)_block;
+    stride /= sizeof(pixel);
+
+    block[0] += 1<<(shift-1);
+
+    for(i=0; i<4; i++){
+        const int z0=  block[i + block_stride*0]     +  block[i + block_stride*2];
+        const int z1=  block[i + block_stride*0]     -  block[i + block_stride*2];
+        const int z2= (block[i + block_stride*1]>>1) -  block[i + block_stride*3];
+        const int z3=  block[i + block_stride*1]     + (block[i + block_stride*3]>>1);
+
+        block[i + block_stride*0]= z0 + z3;
+        block[i + block_stride*1]= z1 + z2;
+        block[i + block_stride*2]= z1 - z2;
+        block[i + block_stride*3]= z0 - z3;
+    }
+
+    for(i=0; i<4; i++){
+        const int z0=  block[0 + block_stride*i]     +  block[2 + block_stride*i];
+        const int z1=  block[0 + block_stride*i]     -  block[2 + block_stride*i];
+        const int z2= (block[1 + block_stride*i]>>1) -  block[3 + block_stride*i];
+        const int z3=  block[1 + block_stride*i]     + (block[3 + block_stride*i]>>1);
+
+        dst[i + 0*stride]= CLIP(add*dst[i + 0*stride] + ((z0 + z3) >> shift));
+        dst[i + 1*stride]= CLIP(add*dst[i + 1*stride] + ((z1 + z2) >> shift));
+        dst[i + 2*stride]= CLIP(add*dst[i + 2*stride] + ((z1 - z2) >> shift));
+        dst[i + 3*stride]= CLIP(add*dst[i + 3*stride] + ((z0 - z3) >> shift));
+    }
+}
+
+void FUNCC(ff_h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride){
+    FUNCC(idct_internal)(dst, block, stride, 4, 6, 1);
+}
+
+void FUNCC(ff_h264_lowres_idct_add)(uint8_t *dst, int stride, DCTELEM *block){
+    FUNCC(idct_internal)(dst, block, stride, 8, 3, 1);
+}
+
+void FUNCC(ff_h264_lowres_idct_put)(uint8_t *dst, int stride, DCTELEM *block){
+    FUNCC(idct_internal)(dst, block, stride, 8, 3, 0);
+}
+
+void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){
+    int i;
+    INIT_CLIP
+    pixel *dst = (pixel*)_dst;
+    dctcoef *block = (dctcoef*)_block;
+    stride /= sizeof(pixel);
+
+    block[0] += 32;
+
+    for( i = 0; i < 8; i++ )
+    {
+        const int a0 =  block[i+0*8] + block[i+4*8];
+        const int a2 =  block[i+0*8] - block[i+4*8];
+        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
+        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
+        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
+        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
+        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+
+        block[i+0*8] = b0 + b7;
+        block[i+7*8] = b0 - b7;
+        block[i+1*8] = b2 + b5;
+        block[i+6*8] = b2 - b5;
+        block[i+2*8] = b4 + b3;
+        block[i+5*8] = b4 - b3;
+        block[i+3*8] = b6 + b1;
+        block[i+4*8] = b6 - b1;
+    }
+    for( i = 0; i < 8; i++ )
+    {
+        const int a0 =  block[0+i*8] + block[4+i*8];
+        const int a2 =  block[0+i*8] - block[4+i*8];
+        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
+        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
+
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+
+        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
+        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
+        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
+        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
+
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+
+        dst[i + 0*stride] = CLIP( dst[i + 0*stride] + ((b0 + b7) >> 6) );
+        dst[i + 1*stride] = CLIP( dst[i + 1*stride] + ((b2 + b5) >> 6) );
+        dst[i + 2*stride] = CLIP( dst[i + 2*stride] + ((b4 + b3) >> 6) );
+        dst[i + 3*stride] = CLIP( dst[i + 3*stride] + ((b6 + b1) >> 6) );
+        dst[i + 4*stride] = CLIP( dst[i + 4*stride] + ((b6 - b1) >> 6) );
+        dst[i + 5*stride] = CLIP( dst[i + 5*stride] + ((b4 - b3) >> 6) );
+        dst[i + 6*stride] = CLIP( dst[i + 6*stride] + ((b2 - b5) >> 6) );
+        dst[i + 7*stride] = CLIP( dst[i + 7*stride] + ((b0 - b7) >> 6) );
+    }
+}
+
+// assumes all AC coefs are 0
+void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, DCTELEM *block, int stride){
+    int i, j;
+    int dc = (((dctcoef*)block)[0] + 32) >> 6;
+    INIT_CLIP
+    pixel *dst = (pixel*)_dst;
+    stride /= sizeof(pixel);
+    for( j = 0; j < 4; j++ )
+    {
+        for( i = 0; i < 4; i++ )
+            dst[i] = CLIP( dst[i] + dc );
+        dst += stride;
+    }
+}
+
+void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, DCTELEM *block, int stride){
+    int i, j;
+    int dc = (((dctcoef*)block)[0] + 32) >> 6;
+    INIT_CLIP
+    pixel *dst = (pixel*)_dst;
+    stride /= sizeof(pixel);
+    for( j = 0; j < 8; j++ )
+    {
+        for( i = 0; i < 8; i++ )
+            dst[i] = CLIP( dst[i] + dc );
+        dst += stride;
+    }
+}
+
+void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=0; i<16; i++){
+        int nnz = nnzc[ scan8[i] ];
+        if(nnz){
+            if(nnz==1 && ((dctcoef*)block)[i*16]) FUNCC(ff_h264_idct_dc_add)(dst + block_offset[i], block + i*16*sizeof(pixel), stride);
+            else                                  FUNCC(idct_internal      )(dst + block_offset[i], block + i*16*sizeof(pixel), stride, 4, 6, 1);
+        }
+    }
+}
+
+void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=0; i<16; i++){
+        if(nnzc[ scan8[i] ])             FUNCC(idct_internal      )(dst + block_offset[i], block + i*16*sizeof(pixel), stride, 4, 6, 1);
+        else if(((dctcoef*)block)[i*16]) FUNCC(ff_h264_idct_dc_add)(dst + block_offset[i], block + i*16*sizeof(pixel), stride);
+    }
+}
+
+void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=0; i<16; i+=4){
+        int nnz = nnzc[ scan8[i] ];
+        if(nnz){
+            if(nnz==1 && ((dctcoef*)block)[i*16]) FUNCC(ff_h264_idct8_dc_add)(dst + block_offset[i], block + i*16*sizeof(pixel), stride);
+            else                                  FUNCC(ff_h264_idct8_add   )(dst + block_offset[i], block + i*16*sizeof(pixel), stride);
+        }
+    }
+}
+
+void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=16; i<16+8; i++){
+        if(nnzc[ scan8[i] ])
+            FUNCC(ff_h264_idct_add   )(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
+        else if(((dctcoef*)block)[i*16])
+            FUNCC(ff_h264_idct_dc_add)(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
+    }
+}
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qp quantization parameter
+ */
+void FUNCC(ff_h264_luma_dc_dequant_idct)(DCTELEM *_output, DCTELEM *_input, int qmul){
+#define stride 16
+    int i;
+    int temp[16];
+    static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};
+    dctcoef *input = (dctcoef*)_input;
+    dctcoef *output = (dctcoef*)_output;
+
+    for(i=0; i<4; i++){
+        const int z0= input[4*i+0] + input[4*i+1];
+        const int z1= input[4*i+0] - input[4*i+1];
+        const int z2= input[4*i+2] - input[4*i+3];
+        const int z3= input[4*i+2] + input[4*i+3];
+
+        temp[4*i+0]= z0+z3;
+        temp[4*i+1]= z0-z3;
+        temp[4*i+2]= z1-z2;
+        temp[4*i+3]= z1+z2;
+    }
+
+    for(i=0; i<4; i++){
+        const int offset= x_offset[i];
+        const int z0= temp[4*0+i] + temp[4*2+i];
+        const int z1= temp[4*0+i] - temp[4*2+i];
+        const int z2= temp[4*1+i] - temp[4*3+i];
+        const int z3= temp[4*1+i] + temp[4*3+i];
+
+        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
+        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
+        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
+        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
+    }
+#undef stride
+}
+
+void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){
+    const int stride= 16*2;
+    const int xStride= 16;
+    int a,b,c,d,e;
+    dctcoef *block = (dctcoef*)_block;
+
+    a= block[stride*0 + xStride*0];
+    b= block[stride*0 + xStride*1];
+    c= block[stride*1 + xStride*0];
+    d= block[stride*1 + xStride*1];
+
+    e= a-b;
+    a= a+b;
+    b= c-d;
+    c= c+d;
+
+    block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
+    block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
+    block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
+    block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
+}
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@ -28,17 +28,338 @@
 #include "h264pred.h"

 #define BIT_DEPTH 8
-#include "h264pred_internal.h"
+#include "h264pred_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 9
-#include "h264pred_internal.h"
+#include "h264pred_template.c"
 #undef BIT_DEPTH

 #define BIT_DEPTH 10
-#include "h264pred_internal.h"
+#include "h264pred_template.c"
 #undef BIT_DEPTH

+static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+    const int lt= src[-1-1*stride];
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+    uint32_t v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2,
+                          (t0 + 2*t1 + t2 + 2) >> 2,
+                          (t1 + 2*t2 + t3 + 2) >> 2,
+                          (t2 + 2*t3 + t4 + 2) >> 2);
+
+    AV_WN32A(src+0*stride, v);
+    AV_WN32A(src+1*stride, v);
+    AV_WN32A(src+2*stride, v);
+    AV_WN32A(src+3*stride, v);
+}
+
+static void pred4x4_horizontal_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+    const int lt= src[-1-1*stride];
+    LOAD_LEFT_EDGE
+
+    AV_WN32A(src+0*stride, ((lt + 2*l0 + l1 + 2) >> 2)*0x01010101);
+    AV_WN32A(src+1*stride, ((l0 + 2*l1 + l2 + 2) >> 2)*0x01010101);
+    AV_WN32A(src+2*stride, ((l1 + 2*l2 + l3 + 2) >> 2)*0x01010101);
+    AV_WN32A(src+3*stride, ((l2 + 2*l3 + l3 + 2) >> 2)*0x01010101);
+}
+
+static void pred4x4_down_left_svq3_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+    const av_unused int unu0= t0;
+    const av_unused int unu1= l0;
+
+    src[0+0*stride]=(l1 + t1)>>1;
+    src[1+0*stride]=
+    src[0+1*stride]=(l2 + t2)>>1;
+    src[2+0*stride]=
+    src[1+1*stride]=
+    src[0+2*stride]=
+    src[3+0*stride]=
+    src[2+1*stride]=
+    src[1+2*stride]=
+    src[0+3*stride]=
+    src[3+1*stride]=
+    src[2+2*stride]=
+    src[1+3*stride]=
+    src[3+2*stride]=
+    src[2+3*stride]=
+    src[3+3*stride]=(l3 + t3)>>1;
+}
+
+static void pred4x4_down_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+    LOAD_LEFT_EDGE
+    LOAD_DOWN_LEFT_EDGE
+
+    src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
+    src[1+0*stride]=
+    src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
+    src[2+0*stride]=
+    src[1+1*stride]=
+    src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3;
+    src[3+0*stride]=
+    src[2+1*stride]=
+    src[1+2*stride]=
+    src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3;
+    src[3+1*stride]=
+    src[2+2*stride]=
+    src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3;
+    src[3+2*stride]=
+    src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3;
+    src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
+}
+
+static void pred4x4_down_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
+    src[1+0*stride]=
+    src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
+    src[2+0*stride]=
+    src[1+1*stride]=
+    src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3;
+    src[3+0*stride]=
+    src[2+1*stride]=
+    src[1+2*stride]=
+    src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3;
+    src[3+1*stride]=
+    src[2+2*stride]=
+    src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3;
+    src[3+2*stride]=
+    src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3;
+    src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2;
+}
+
+static void pred4x4_vertical_left_rv40(uint8_t *src, const uint8_t *topright, int stride,
+                                       const int l0, const int l1, const int l2, const int l3, const int l4){
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+
+    src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3;
+    src[1+0*stride]=
+    src[0+2*stride]=(t1 + t2 + 1)>>1;
+    src[2+0*stride]=
+    src[1+2*stride]=(t2 + t3 + 1)>>1;
+    src[3+0*stride]=
+    src[2+2*stride]=(t3 + t4+ 1)>>1;
+    src[3+2*stride]=(t4 + t5+ 1)>>1;
+    src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3;
+    src[1+1*stride]=
+    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[2+1*stride]=
+    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
+    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
+}
+
+static void pred4x4_vertical_left_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_LEFT_EDGE
+    LOAD_DOWN_LEFT_EDGE
+
+    pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l4);
+}
+
+static void pred4x4_vertical_left_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_LEFT_EDGE
+
+    pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l3);
+}
+
+static void pred4x4_vertical_left_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+
+    src[0+0*stride]=(t0 + t1 + 1)>>1;
+    src[1+0*stride]=
+    src[0+2*stride]=(t1 + t2 + 1)>>1;
+    src[2+0*stride]=
+    src[1+2*stride]=(t2 + t3 + 1)>>1;
+    src[3+0*stride]=
+    src[2+2*stride]=(t3 + t4 + 1)>>1;
+    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[1+1*stride]=
+    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[2+1*stride]=
+    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
+    src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2;
+    src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2;
+}
+
+static void pred4x4_horizontal_up_rv40_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_LEFT_EDGE
+    LOAD_DOWN_LEFT_EDGE
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+
+    src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
+    src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
+    src[2+0*stride]=
+    src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
+    src[3+0*stride]=
+    src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
+    src[2+1*stride]=
+    src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
+    src[3+1*stride]=
+    src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
+    src[3+2*stride]=
+    src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2;
+    src[0+3*stride]=
+    src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2;
+    src[2+3*stride]=(l4 + l5 + 1)>>1;
+    src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2;
+}
+
+static void pred4x4_horizontal_up_rv40_nodown_c(uint8_t *src, const uint8_t *topright, int stride){
+    LOAD_LEFT_EDGE
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+
+    src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
+    src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
+    src[2+0*stride]=
+    src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
+    src[3+0*stride]=
+    src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
+    src[2+1*stride]=
+    src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
+    src[3+1*stride]=
+    src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
+    src[3+2*stride]=
+    src[1+3*stride]=l3;
+    src[0+3*stride]=
+    src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2;
+    src[2+3*stride]=
+    src[3+3*stride]=l3;
+}
+
+static void pred4x4_tm_vp8_c(uint8_t *src, const uint8_t *topright, int stride){
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
+    uint8_t *top = src-stride;
+    int y;
+
+    for (y = 0; y < 4; y++) {
+        uint8_t *cm_in = cm + src[-1];
+        src[0] = cm_in[top[0]];
+        src[1] = cm_in[top[1]];
+        src[2] = cm_in[top[2]];
+        src[3] = cm_in[top[3]];
+        src += stride;
+    }
+}
+
+static void pred16x16_plane_svq3_c(uint8_t *src, int stride){
+    pred16x16_plane_compat_8_c(src, stride, 1, 0);
+}
+
+static void pred16x16_plane_rv40_c(uint8_t *src, int stride){
+    pred16x16_plane_compat_8_c(src, stride, 0, 1);
+}
+
+static void pred16x16_tm_vp8_c(uint8_t *src, int stride){
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
+    uint8_t *top = src-stride;
+    int y;
+
+    for (y = 0; y < 16; y++) {
+        uint8_t *cm_in = cm + src[-1];
+        src[0]  = cm_in[top[0]];
+        src[1]  = cm_in[top[1]];
+        src[2]  = cm_in[top[2]];
+        src[3]  = cm_in[top[3]];
+        src[4]  = cm_in[top[4]];
+        src[5]  = cm_in[top[5]];
+        src[6]  = cm_in[top[6]];
+        src[7]  = cm_in[top[7]];
+        src[8]  = cm_in[top[8]];
+        src[9]  = cm_in[top[9]];
+        src[10] = cm_in[top[10]];
+        src[11] = cm_in[top[11]];
+        src[12] = cm_in[top[12]];
+        src[13] = cm_in[top[13]];
+        src[14] = cm_in[top[14]];
+        src[15] = cm_in[top[15]];
+        src += stride;
+    }
+}
+
+static void pred8x8_left_dc_rv40_c(uint8_t *src, int stride){
+    int i;
+    int dc0;
+
+    dc0=0;
+    for(i=0;i<8; i++)
+        dc0+= src[-1+i*stride];
+    dc0= 0x01010101*((dc0 + 4)>>3);
+
+    for(i=0; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc0;
+    }
+}
+
+static void pred8x8_top_dc_rv40_c(uint8_t *src, int stride){
+    int i;
+    int dc0;
+
+    dc0=0;
+    for(i=0;i<8; i++)
+        dc0+= src[i-stride];
+    dc0= 0x01010101*((dc0 + 4)>>3);
+
+    for(i=0; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc0;
+    }
+}
+
+static void pred8x8_dc_rv40_c(uint8_t *src, int stride){
+    int i;
+    int dc0=0;
+
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride] + src[i-stride];
+        dc0+= src[4+i-stride];
+        dc0+= src[-1+(i+4)*stride];
+    }
+    dc0= 0x01010101*((dc0 + 8)>>4);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc0;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc0;
+    }
+}
+
+static void pred8x8_tm_vp8_c(uint8_t *src, int stride){
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
+    uint8_t *top = src-stride;
+    int y;
+
+    for (y = 0; y < 8; y++) {
+        uint8_t *cm_in = cm + src[-1];
+        src[0] = cm_in[top[0]];
+        src[1] = cm_in[top[1]];
+        src[2] = cm_in[top[2]];
+        src[3] = cm_in[top[3]];
+        src[4] = cm_in[top[4]];
+        src[5] = cm_in[top[5]];
+        src[6] = cm_in[top[6]];
+        src[7] = cm_in[top[7]];
+        src += stride;
+    }
+}
+
 /**
 * Set the intra prediction function pointers.
 */
@ -49,26 +370,27 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
 #undef FUNCC
 #define FUNC(a, depth) a ## _ ## depth
 #define FUNCC(a, depth) a ## _ ## depth ## _c
+#define FUNCD(a) a ## _c

 #define H264_PRED(depth) \
    if(codec_id != CODEC_ID_RV40){\
        if(codec_id == CODEC_ID_VP8) {\
-            h->pred4x4[VERT_PRED       ]= FUNCC(pred4x4_vertical_vp8      , depth);\
-            h->pred4x4[HOR_PRED        ]= FUNCC(pred4x4_horizontal_vp8    , depth);\
+            h->pred4x4[VERT_PRED       ]= FUNCD(pred4x4_vertical_vp8);\
+            h->pred4x4[HOR_PRED        ]= FUNCD(pred4x4_horizontal_vp8);\
        } else {\
            h->pred4x4[VERT_PRED       ]= FUNCC(pred4x4_vertical          , depth);\
            h->pred4x4[HOR_PRED        ]= FUNCC(pred4x4_horizontal        , depth);\
        }\
        h->pred4x4[DC_PRED             ]= FUNCC(pred4x4_dc                , depth);\
        if(codec_id == CODEC_ID_SVQ3)\
-            h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCC(pred4x4_down_left_svq3, depth);\
+            h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCD(pred4x4_down_left_svq3);\
        else\
            h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCC(pred4x4_down_left     , depth);\
        h->pred4x4[DIAG_DOWN_RIGHT_PRED]= FUNCC(pred4x4_down_right        , depth);\
        h->pred4x4[VERT_RIGHT_PRED     ]= FUNCC(pred4x4_vertical_right    , depth);\
        h->pred4x4[HOR_DOWN_PRED       ]= FUNCC(pred4x4_horizontal_down   , depth);\
        if (codec_id == CODEC_ID_VP8) {\
-            h->pred4x4[VERT_LEFT_PRED  ]= FUNCC(pred4x4_vertical_left_vp8 , depth);\
+            h->pred4x4[VERT_LEFT_PRED  ]= FUNCD(pred4x4_vertical_left_vp8);\
        } else\
            h->pred4x4[VERT_LEFT_PRED  ]= FUNCC(pred4x4_vertical_left     , depth);\
        h->pred4x4[HOR_UP_PRED         ]= FUNCC(pred4x4_horizontal_up     , depth);\
@ -77,7 +399,7 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
            h->pred4x4[TOP_DC_PRED     ]= FUNCC(pred4x4_top_dc            , depth);\
            h->pred4x4[DC_128_PRED     ]= FUNCC(pred4x4_128_dc            , depth);\
        } else {\
-            h->pred4x4[TM_VP8_PRED     ]= FUNCC(pred4x4_tm_vp8            , depth);\
+            h->pred4x4[TM_VP8_PRED     ]= FUNCD(pred4x4_tm_vp8);\
            h->pred4x4[DC_127_PRED     ]= FUNCC(pred4x4_127_dc            , depth);\
            h->pred4x4[DC_129_PRED     ]= FUNCC(pred4x4_129_dc            , depth);\
            h->pred4x4[VERT_VP8_PRED   ]= FUNCC(pred4x4_vertical          , depth);\
@ -87,18 +409,18 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
        h->pred4x4[VERT_PRED           ]= FUNCC(pred4x4_vertical          , depth);\
        h->pred4x4[HOR_PRED            ]= FUNCC(pred4x4_horizontal        , depth);\
        h->pred4x4[DC_PRED             ]= FUNCC(pred4x4_dc                , depth);\
-        h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCC(pred4x4_down_left_rv40    , depth);\
+        h->pred4x4[DIAG_DOWN_LEFT_PRED ]= FUNCD(pred4x4_down_left_rv40);\
        h->pred4x4[DIAG_DOWN_RIGHT_PRED]= FUNCC(pred4x4_down_right        , depth);\
        h->pred4x4[VERT_RIGHT_PRED     ]= FUNCC(pred4x4_vertical_right    , depth);\
        h->pred4x4[HOR_DOWN_PRED       ]= FUNCC(pred4x4_horizontal_down   , depth);\
-        h->pred4x4[VERT_LEFT_PRED      ]= FUNCC(pred4x4_vertical_left_rv40, depth);\
-        h->pred4x4[HOR_UP_PRED         ]= FUNCC(pred4x4_horizontal_up_rv40, depth);\
+        h->pred4x4[VERT_LEFT_PRED      ]= FUNCD(pred4x4_vertical_left_rv40);\
+        h->pred4x4[HOR_UP_PRED         ]= FUNCD(pred4x4_horizontal_up_rv40);\
        h->pred4x4[LEFT_DC_PRED        ]= FUNCC(pred4x4_left_dc           , depth);\
        h->pred4x4[TOP_DC_PRED         ]= FUNCC(pred4x4_top_dc            , depth);\
        h->pred4x4[DC_128_PRED         ]= FUNCC(pred4x4_128_dc            , depth);\
-        h->pred4x4[DIAG_DOWN_LEFT_PRED_RV40_NODOWN]= FUNCC(pred4x4_down_left_rv40_nodown, depth);\
-        h->pred4x4[HOR_UP_PRED_RV40_NODOWN]= FUNCC(pred4x4_horizontal_up_rv40_nodown    , depth);\
-        h->pred4x4[VERT_LEFT_PRED_RV40_NODOWN]= FUNCC(pred4x4_vertical_left_rv40_nodown , depth);\
+        h->pred4x4[DIAG_DOWN_LEFT_PRED_RV40_NODOWN]= FUNCD(pred4x4_down_left_rv40_nodown);\
+        h->pred4x4[HOR_UP_PRED_RV40_NODOWN]= FUNCD(pred4x4_horizontal_up_rv40_nodown);\
+        h->pred4x4[VERT_LEFT_PRED_RV40_NODOWN]= FUNCD(pred4x4_vertical_left_rv40_nodown);\
    }\
 \
    h->pred8x8l[VERT_PRED           ]= FUNCC(pred8x8l_vertical            , depth);\
@ -119,7 +441,7 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
    if (codec_id != CODEC_ID_VP8) {\
        h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_plane                    , depth);\
    } else\
-        h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_tm_vp8                   , depth);\
+        h->pred8x8[PLANE_PRED8x8]= FUNCD(pred8x8_tm_vp8);\
    if(codec_id != CODEC_ID_RV40 && codec_id != CODEC_ID_VP8){\
        h->pred8x8[DC_PRED8x8     ]= FUNCC(pred8x8_dc                     , depth);\
        h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc                , depth);\
@ -129,9 +451,9 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
        h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\
        h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\
    }else{\
-        h->pred8x8[DC_PRED8x8     ]= FUNCC(pred8x8_dc_rv40                , depth);\
-        h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc_rv40           , depth);\
-        h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x8_top_dc_rv40            , depth);\
+        h->pred8x8[DC_PRED8x8     ]= FUNCD(pred8x8_dc_rv40);\
+        h->pred8x8[LEFT_DC_PRED8x8]= FUNCD(pred8x8_left_dc_rv40);\
+        h->pred8x8[TOP_DC_PRED8x8 ]= FUNCD(pred8x8_top_dc_rv40);\
        if (codec_id == CODEC_ID_VP8) {\
            h->pred8x8[DC_127_PRED8x8]= FUNCC(pred8x8_127_dc              , depth);\
            h->pred8x8[DC_129_PRED8x8]= FUNCC(pred8x8_129_dc              , depth);\
@ -144,13 +466,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
    h->pred16x16[HOR_PRED8x8    ]= FUNCC(pred16x16_horizontal             , depth);\
    switch(codec_id){\
    case CODEC_ID_SVQ3:\
-       h->pred16x16[PLANE_PRED8x8  ]= FUNCC(pred16x16_plane_svq3          , depth);\
+       h->pred16x16[PLANE_PRED8x8  ]= FUNCD(pred16x16_plane_svq3);\
       break;\
    case CODEC_ID_RV40:\
-       h->pred16x16[PLANE_PRED8x8  ]= FUNCC(pred16x16_plane_rv40          , depth);\
+       h->pred16x16[PLANE_PRED8x8  ]= FUNCD(pred16x16_plane_rv40);\
       break;\
    case CODEC_ID_VP8:\
-       h->pred16x16[PLANE_PRED8x8  ]= FUNCC(pred16x16_tm_vp8              , depth);\
+       h->pred16x16[PLANE_PRED8x8  ]= FUNCD(pred16x16_tm_vp8);\
       h->pred16x16[DC_127_PRED8x8]= FUNCC(pred16x16_127_dc               , depth);\
       h->pred16x16[DC_129_PRED8x8]= FUNCC(pred16x16_129_dc               , depth);\
       break;\
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@ -0,0 +1,975 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 prediction functions.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "mathops.h"
+#include "high_bit_depth.h"
+
+static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const pixel4 a= ((pixel4*)(src-stride))[0];
+    ((pixel4*)(src+0*stride))[0]= a;
+    ((pixel4*)(src+1*stride))[0]= a;
+    ((pixel4*)(src+2*stride))[0]= a;
+    ((pixel4*)(src+3*stride))[0]= a;
+}
+
+static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
+    ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
+    ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
+}
+
+static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
+                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
+
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
+}
+
+static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
+
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
+}
+
+static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
+
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
+}
+
+static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
+}
+
+static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
+}
+
+static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    ((pixel4*)(src+0*stride))[0]=
+    ((pixel4*)(src+1*stride))[0]=
+    ((pixel4*)(src+2*stride))[0]=
+    ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
+}
+
+
+#define LOAD_TOP_RIGHT_EDGE\
+    const int av_unused t4= topright[0];\
+    const int av_unused t5= topright[1];\
+    const int av_unused t6= topright[2];\
+    const int av_unused t7= topright[3];\
+
+#define LOAD_DOWN_LEFT_EDGE\
+    const int av_unused l4= src[-1+4*stride];\
+    const int av_unused l5= src[-1+5*stride];\
+    const int av_unused l6= src[-1+6*stride];\
+    const int av_unused l7= src[-1+7*stride];\
+
+#define LOAD_LEFT_EDGE\
+    const int av_unused l0= src[-1+0*stride];\
+    const int av_unused l1= src[-1+1*stride];\
+    const int av_unused l2= src[-1+2*stride];\
+    const int av_unused l3= src[-1+3*stride];\
+
+#define LOAD_TOP_EDGE\
+    const int av_unused t0= src[ 0-1*stride];\
+    const int av_unused t1= src[ 1-1*stride];\
+    const int av_unused t2= src[ 2-1*stride];\
+    const int av_unused t3= src[ 3-1*stride];\
+
+static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const int lt= src[-1-1*stride];
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
+    src[0+2*stride]=
+    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
+    src[0+1*stride]=
+    src[1+2*stride]=
+    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
+    src[0+0*stride]=
+    src[1+1*stride]=
+    src[2+2*stride]=
+    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[1+0*stride]=
+    src[2+1*stride]=
+    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+0*stride]=
+    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+}
+
+static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
+    pixel *src = (pixel*)_src;
+    const pixel *topright = (const pixel*)_topright;
+    int stride = _stride/sizeof(pixel);
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+//    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
+    src[1+0*stride]=
+    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
+    src[2+0*stride]=
+    src[1+1*stride]=
+    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
+    src[3+0*stride]=
+    src[2+1*stride]=
+    src[1+2*stride]=
+    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+2*stride]=
+    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
+    src[3+2*stride]=
+    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
+    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
+}
+
+static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const int lt= src[-1-1*stride];
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=
+    src[1+2*stride]=(lt + t0 + 1)>>1;
+    src[1+0*stride]=
+    src[2+2*stride]=(t0 + t1 + 1)>>1;
+    src[2+0*stride]=
+    src[3+2*stride]=(t1 + t2 + 1)>>1;
+    src[3+0*stride]=(t2 + t3 + 1)>>1;
+    src[0+1*stride]=
+    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[1+1*stride]=
+    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+1*stride]=
+    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
+    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+}
+
+static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
+    pixel *src = (pixel*)_src;
+    const pixel *topright = (const pixel*)_topright;
+    int stride = _stride/sizeof(pixel);
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+
+    src[0+0*stride]=(t0 + t1 + 1)>>1;
+    src[1+0*stride]=
+    src[0+2*stride]=(t1 + t2 + 1)>>1;
+    src[2+0*stride]=
+    src[1+2*stride]=(t2 + t3 + 1)>>1;
+    src[3+0*stride]=
+    src[2+2*stride]=(t3 + t4+ 1)>>1;
+    src[3+2*stride]=(t4 + t5+ 1)>>1;
+    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[1+1*stride]=
+    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[2+1*stride]=
+    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
+    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
+}
+
+static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(l0 + l1 + 1)>>1;
+    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[2+0*stride]=
+    src[0+1*stride]=(l1 + l2 + 1)>>1;
+    src[3+0*stride]=
+    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+    src[2+1*stride]=
+    src[0+2*stride]=(l2 + l3 + 1)>>1;
+    src[3+1*stride]=
+    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
+    src[3+2*stride]=
+    src[1+3*stride]=
+    src[0+3*stride]=
+    src[2+2*stride]=
+    src[2+3*stride]=
+    src[3+3*stride]=l3;
+}
+
+static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, const uint8_t *topright, int _stride){
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const int lt= src[-1-1*stride];
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=
+    src[2+1*stride]=(lt + l0 + 1)>>1;
+    src[1+0*stride]=
+    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[0+1*stride]=
+    src[2+2*stride]=(l0 + l1 + 1)>>1;
+    src[1+1*stride]=
+    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
+    src[0+2*stride]=
+    src[2+3*stride]=(l1 + l2+ 1)>>1;
+    src[1+2*stride]=
+    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[0+3*stride]=(l2 + l3 + 1)>>1;
+    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+}
+
+static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){
+    int i;
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const pixel4 a = ((pixel4*)(src-stride))[0];
+    const pixel4 b = ((pixel4*)(src-stride))[1];
+    const pixel4 c = ((pixel4*)(src-stride))[2];
+    const pixel4 d = ((pixel4*)(src-stride))[3];
+
+    for(i=0; i<16; i++){
+        ((pixel4*)(src+i*stride))[0] = a;
+        ((pixel4*)(src+i*stride))[1] = b;
+        ((pixel4*)(src+i*stride))[2] = c;
+        ((pixel4*)(src+i*stride))[3] = d;
+    }
+}
+
+static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){
+    int i;
+    pixel *src = (pixel*)_src;
+    stride /= sizeof(pixel);
+
+    for(i=0; i<16; i++){
+        ((pixel4*)(src+i*stride))[0] =
+        ((pixel4*)(src+i*stride))[1] =
+        ((pixel4*)(src+i*stride))[2] =
+        ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
+    }
+}
+
+#define PREDICT_16x16_DC(v)\
+    for(i=0; i<16; i++){\
+        AV_WN4P(src+ 0, v);\
+        AV_WN4P(src+ 4, v);\
+        AV_WN4P(src+ 8, v);\
+        AV_WN4P(src+12, v);\
+        src += stride;\
+    }
+
+static void FUNCC(pred16x16_dc)(uint8_t *_src, int stride){
+    int i, dc=0;
+    pixel *src = (pixel*)_src;
+    pixel4 dcsplat;
+    stride /= sizeof(pixel);
+
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride];
+    }
+
+    for(i=0;i<16; i++){
+        dc+= src[i-stride];
+    }
+
+    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
+    PREDICT_16x16_DC(dcsplat);
+}
+
+static void FUNCC(pred16x16_left_dc)(uint8_t *_src, int stride){
+    int i, dc=0;
+    pixel *src = (pixel*)_src;
+    pixel4 dcsplat;
+    stride /= sizeof(pixel);
+
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride];
+    }
+
+    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
+    PREDICT_16x16_DC(dcsplat);
+}
+
+static void FUNCC(pred16x16_top_dc)(uint8_t *_src, int stride){
+    int i, dc=0;
+    pixel *src = (pixel*)_src;
+    pixel4 dcsplat;
+    stride /= sizeof(pixel);
+
+    for(i=0;i<16; i++){
+        dc+= src[i-stride];
+    }
+
+    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
+    PREDICT_16x16_DC(dcsplat);
+}
+
+#define PRED16x16_X(n, v) \
+static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, int stride){\
+    int i;\
+    pixel *src = (pixel*)_src;\
+    stride /= sizeof(pixel);\
+    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
+}
+
+PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
+PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
+PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
+
+static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, int _stride, const int svq3, const int rv40){
+  int i, j, k;
+  int a;
+  INIT_CLIP
+  pixel *src = (pixel*)_src;
+  int stride = _stride/sizeof(pixel);
+  const pixel * const src0 = src +7-stride;
+  const pixel *       src1 = src +8*stride-1;
+  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
+  int H = src0[1] - src0[-1];
+  int V = src1[0] - src2[ 0];
+  for(k=2; k<=8; ++k) {
+    src1 += stride; src2 -= stride;
+    H += k*(src0[k] - src0[-k]);
+    V += k*(src1[0] - src2[ 0]);
+  }
+  if(svq3){
+    H = ( 5*(H/4) ) / 16;
+    V = ( 5*(V/4) ) / 16;
+
+    /* required for 100% accuracy */
+    i = H; H = V; V = i;
+  }else if(rv40){
+    H = ( H + (H>>2) ) >> 4;
+    V = ( V + (V>>2) ) >> 4;
+  }else{
+    H = ( 5*H+32 ) >> 6;
+    V = ( 5*V+32 ) >> 6;
+  }
+
+  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
+  for(j=16; j>0; --j) {
+    int b = a;
+    a += V;
+    for(i=-16; i<0; i+=4) {
+      src[16+i] = CLIP((b    ) >> 5);
+      src[17+i] = CLIP((b+  H) >> 5);
+      src[18+i] = CLIP((b+2*H) >> 5);
+      src[19+i] = CLIP((b+3*H) >> 5);
+      b += 4*H;
+    }
+    src += stride;
+  }
+}
+
+static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
+    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
+}
+
+static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){
+    int i;
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    const pixel4 a= ((pixel4*)(src-stride))[0];
+    const pixel4 b= ((pixel4*)(src-stride))[1];
+
+    for(i=0; i<8; i++){
+        ((pixel4*)(src+i*stride))[0]= a;
+        ((pixel4*)(src+i*stride))[1]= b;
+    }
+}
+
+static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
+    int i;
+    pixel *src = (pixel*)_src;
+    stride /= sizeof(pixel);
+
+    for(i=0; i<8; i++){
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
+    }
+}
+
+#define PRED8x8_X(n, v)\
+static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
+    int i;\
+    pixel *src = (pixel*)_src;\
+    stride /= sizeof(pixel);\
+    for(i=0; i<8; i++){\
+        ((pixel4*)(src+i*stride))[0]=\
+        ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
+    }\
+}
+
+PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
+PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
+PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
+
+static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
+    int i;
+    int dc0, dc2;
+    pixel4 dc0splat, dc2splat;
+    pixel *src = (pixel*)_src;
+    stride /= sizeof(pixel);
+
+    dc0=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride];
+        dc2+= src[-1+(i+4)*stride];
+    }
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
+    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= dc0splat;
+    }
+    for(i=4; i<8; i++){
+        ((pixel4*)(src+i*stride))[0]=
+        ((pixel4*)(src+i*stride))[1]= dc2splat;
+    }
+}
+
+static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
+    int i;
+    int dc0, dc1;
+    pixel4 dc0splat, dc1splat;
+    pixel *src = (pixel*)_src;
+    stride /= sizeof(pixel);
+
+    dc0=dc1=0;
+    for(i=0;i<4; i++){
+        dc0+= src[i-stride];
+        dc1+= src[4+i-stride];
+    }
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
+    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc1splat;
+    }
+    for(i=4; i<8; i++){
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc1splat;
+    }
+}
+
+static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
+    int i;
+    int dc0, dc1, dc2;
+    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
+    pixel *src = (pixel*)_src;
+    stride /= sizeof(pixel);
+
+    dc0=dc1=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride] + src[i-stride];
+        dc1+= src[4+i-stride];
+        dc2+= src[-1+(i+4)*stride];
+    }
+    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
+    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
+    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
+    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
+
+    for(i=0; i<4; i++){
+        ((pixel4*)(src+i*stride))[0]= dc0splat;
+        ((pixel4*)(src+i*stride))[1]= dc1splat;
+    }
+    for(i=4; i<8; i++){
+        ((pixel4*)(src+i*stride))[0]= dc2splat;
+        ((pixel4*)(src+i*stride))[1]= dc3splat;
+    }
+}
+
+//the following 4 function should not be optimized!
+static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
+    FUNCC(pred8x8_top_dc)(src, stride);
+    FUNCC(pred4x4_dc)(src, NULL, stride);
+}
+
+static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
+    FUNCC(pred8x8_dc)(src, stride);
+    FUNCC(pred4x4_top_dc)(src, NULL, stride);
+}
+
+static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
+    FUNCC(pred8x8_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
+}
+
+static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
+    FUNCC(pred8x8_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
+}
+
+static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
+  int j, k;
+  int a;
+  INIT_CLIP
+  pixel *src = (pixel*)_src;
+  int stride = _stride/sizeof(pixel);
+  const pixel * const src0 = src +3-stride;
+  const pixel *       src1 = src +4*stride-1;
+  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
+  int H = src0[1] - src0[-1];
+  int V = src1[0] - src2[ 0];
+  for(k=2; k<=4; ++k) {
+    src1 += stride; src2 -= stride;
+    H += k*(src0[k] - src0[-k]);
+    V += k*(src1[0] - src2[ 0]);
+  }
+  H = ( 17*H+16 ) >> 5;
+  V = ( 17*V+16 ) >> 5;
+
+  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
+  for(j=8; j>0; --j) {
+    int b = a;
+    a += V;
+    src[0] = CLIP((b    ) >> 5);
+    src[1] = CLIP((b+  H) >> 5);
+    src[2] = CLIP((b+2*H) >> 5);
+    src[3] = CLIP((b+3*H) >> 5);
+    src[4] = CLIP((b+4*H) >> 5);
+    src[5] = CLIP((b+5*H) >> 5);
+    src[6] = CLIP((b+6*H) >> 5);
+    src[7] = CLIP((b+7*H) >> 5);
+    src += stride;
+  }
+}
+
+#define SRC(x,y) src[(x)+(y)*stride]
+#define PL(y) \
+    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_LEFT \
+    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
+                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
+    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
+    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
+
+#define PT(x) \
+    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOP \
+    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
+                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
+    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
+    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
+                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
+
+#define PTR(x) \
+    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOPRIGHT \
+    int t8, t9, t10, t11, t12, t13, t14, t15; \
+    if(has_topright) { \
+        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
+        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
+    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
+
+#define PREDICT_8x8_LOAD_TOPLEFT \
+    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
+
+#define PREDICT_8x8_DC(v) \
+    int y; \
+    for( y = 0; y < 8; y++ ) { \
+        ((pixel4*)src)[0] = \
+        ((pixel4*)src)[1] = v; \
+        src += stride; \
+    }
+
+static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+
+    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
+}
+static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+
+    PREDICT_8x8_LOAD_LEFT;
+    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
+    PREDICT_8x8_DC(dc);
+}
+static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+
+    PREDICT_8x8_LOAD_TOP;
+    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
+    PREDICT_8x8_DC(dc);
+}
+static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOP;
+    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
+                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
+    PREDICT_8x8_DC(dc);
+}
+static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+
+    PREDICT_8x8_LOAD_LEFT;
+#define ROW(y) ((pixel4*)(src+y*stride))[0] =\
+               ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
+    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
+#undef ROW
+}
+static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    int y;
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+
+    PREDICT_8x8_LOAD_TOP;
+    src[0] = t0;
+    src[1] = t1;
+    src[2] = t2;
+    src[3] = t3;
+    src[4] = t4;
+    src[5] = t5;
+    src[6] = t6;
+    src[7] = t7;
+    for( y = 1; y < 8; y++ ) {
+        ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
+        ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
+    }
+}
+static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
+    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
+    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
+    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
+    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
+}
+static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
+    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+}
+static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
+    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
+    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
+    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
+    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
+    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
+    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
+    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(7,0)= (t6 + t7 + 1) >> 1;
+}
+static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,7)= (l6 + l7 + 1) >> 1;
+    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
+    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
+    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
+    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
+    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
+    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
+    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
+    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
+    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
+    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
+    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
+    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
+    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
+    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
+}
+static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + t1 + 1) >> 1;
+    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
+    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
+    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
+    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
+    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
+    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
+    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
+    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
+    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
+    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(7,6)= (t10 + t11 + 1) >> 1;
+    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
+}
+static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
+{
+    pixel *src = (pixel*)_src;
+    int stride = _stride/sizeof(pixel);
+    PREDICT_8x8_LOAD_LEFT;
+    SRC(0,0)= (l0 + l1 + 1) >> 1;
+    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
+    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
+    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
+    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
+    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
+    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
+    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
+    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
+    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
+    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
+    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
+}
+#undef PREDICT_8x8_LOAD_LEFT
+#undef PREDICT_8x8_LOAD_TOP
+#undef PREDICT_8x8_LOAD_TOPLEFT
+#undef PREDICT_8x8_LOAD_TOPRIGHT
+#undef PREDICT_8x8_DC
+#undef PTR
+#undef PT
+#undef PL
+#undef SRC
+
+static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
+    int i;
+    pixel *pix = (pixel*)_pix;
+    const dctcoef *block = (const dctcoef*)_block;
+    stride /= sizeof(pixel);
+    pix -= stride;
+    for(i=0; i<4; i++){
+        pixel v = pix[0];
+        pix[1*stride]= v += block[0];
+        pix[2*stride]= v += block[4];
+        pix[3*stride]= v += block[8];
+        pix[4*stride]= v +  block[12];
+        pix++;
+        block++;
+    }
+}
+
+static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
+    int i;
+    pixel *pix = (pixel*)_pix;
+    const dctcoef *block = (const dctcoef*)_block;
+    stride /= sizeof(pixel);
+    for(i=0; i<4; i++){
+        pixel v = pix[-1];
+        pix[0]= v += block[0];
+        pix[1]= v += block[1];
+        pix[2]= v += block[2];
+        pix[3]= v +  block[3];
+        pix+= stride;
+        block+= 4;
+    }
+}
+
+static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
+    int i;
+    pixel *pix = (pixel*)_pix;
+    const dctcoef *block = (const dctcoef*)_block;
+    stride /= sizeof(pixel);
+    pix -= stride;
+    for(i=0; i<8; i++){
+        pixel v = pix[0];
+        pix[1*stride]= v += block[0];
+        pix[2*stride]= v += block[8];
+        pix[3*stride]= v += block[16];
+        pix[4*stride]= v += block[24];
+        pix[5*stride]= v += block[32];
+        pix[6*stride]= v += block[40];
+        pix[7*stride]= v += block[48];
+        pix[8*stride]= v +  block[56];
+        pix++;
+        block++;
+    }
+}
+
+static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
+    int i;
+    pixel *pix = (pixel*)_pix;
+    const dctcoef *block = (const dctcoef*)_block;
+    stride /= sizeof(pixel);
+    for(i=0; i<8; i++){
+        pixel v = pix[-1];
+        pix[0]= v += block[0];
+        pix[1]= v += block[1];
+        pix[2]= v += block[2];
+        pix[3]= v += block[3];
+        pix[4]= v += block[4];
+        pix[5]= v += block[5];
+        pix[6]= v += block[6];
+        pix[7]= v +  block[7];
+        pix+= stride;
+        block+= 8;
+    }
+}
+
+static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<16; i++)
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+}
+
+static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<16; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+}
+
+static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<4; i++)
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+}
+
+static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<4; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+}
--- a/libavcodec/high_bit_depth.h
+++ b/libavcodec/high_bit_depth.h
@ -0,0 +1,85 @@
+#include "dsputil.h"
+
+#ifndef BIT_DEPTH
+#define BIT_DEPTH 8
+#endif
+
+#ifdef AVCODEC_H264_HIGH_DEPTH_H
+#   undef pixel
+#   undef pixel2
+#   undef pixel4
+#   undef dctcoef
+#   undef INIT_CLIP
+#   undef no_rnd_avg_pixel4
+#   undef rnd_avg_pixel4
+#   undef AV_RN2P
+#   undef AV_RN4P
+#   undef AV_WN2P
+#   undef AV_WN4P
+#   undef AV_WN4PA
+#   undef CLIP
+#   undef FUNC
+#   undef FUNCC
+#   undef av_clip_pixel
+#   undef PIXEL_SPLAT_X4
+#else
+#   define AVCODEC_H264_HIGH_DEPTH_H
+#   define CLIP_PIXEL(depth)\
+    static inline uint16_t av_clip_pixel_ ## depth (int p)\
+    {\
+        const int pixel_max = (1 << depth)-1;\
+        return (p & ~pixel_max) ? (-p)>>31 & pixel_max : p;\
+    }
+
+CLIP_PIXEL( 9)
+CLIP_PIXEL(10)
+#endif
+
+#if BIT_DEPTH > 8
+#   define pixel  uint16_t
+#   define pixel2 uint32_t
+#   define pixel4 uint64_t
+#   define dctcoef int32_t
+
+#   define INIT_CLIP
+#   define no_rnd_avg_pixel4 no_rnd_avg64
+#   define    rnd_avg_pixel4    rnd_avg64
+#   define AV_RN2P  AV_RN32
+#   define AV_RN4P  AV_RN64
+#   define AV_WN2P  AV_WN32
+#   define AV_WN4P  AV_WN64
+#   define AV_WN4PA AV_WN64A
+#   define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
+#else
+#   define pixel  uint8_t
+#   define pixel2 uint16_t
+#   define pixel4 uint32_t
+#   define dctcoef int16_t
+
+#   define INIT_CLIP uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+#   define no_rnd_avg_pixel4 no_rnd_avg32
+#   define    rnd_avg_pixel4    rnd_avg32
+#   define AV_RN2P  AV_RN16
+#   define AV_RN4P  AV_RN32
+#   define AV_WN2P  AV_WN16
+#   define AV_WN4P  AV_WN32
+#   define AV_WN4PA AV_WN32A
+#   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
+#endif
+
+#if BIT_DEPTH == 8
+#   define av_clip_pixel(a) av_clip_uint8(a)
+#   define CLIP(a) cm[a]
+#   define FUNC(a)  a ## _8
+#   define FUNCC(a) a ## _8_c
+#elif BIT_DEPTH == 9
+#   define av_clip_pixel(a) av_clip_pixel_9(a)
+#   define CLIP(a)          av_clip_pixel_9(a)
+#   define FUNC(a)  a ## _9
+#   define FUNCC(a) a ## _9_c
+#elif BIT_DEPTH == 10
+#   define av_clip_pixel(a) av_clip_pixel_10(a)
+#   define CLIP(a)          av_clip_pixel_10(a)
+#   define FUNC(a)  a ## _10
+#   define FUNCC(a) a ## _10_c
+#endif
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@ -158,7 +158,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
                      const int32_t *samples, int blocksize, int min_order,
                      int max_order, int precision,
                      int32_t coefs[][MAX_LPC_ORDER], int *shift,
-                      enum AVLPCType lpc_type, int lpc_passes,
+                      enum FFLPCType lpc_type, int lpc_passes,
                      int omethod, int max_shift, int zero_shift)
 {
    double autoc[MAX_LPC_ORDER+1];
@ -168,7 +168,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
    int opt_order;

    assert(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER &&
-           lpc_type > AV_LPC_TYPE_FIXED);
+           lpc_type > FF_LPC_TYPE_FIXED);

    /* reinit LPC context if parameters have changed */
    if (blocksize != s->blocksize || max_order != s->max_order ||
@ -177,7 +177,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
        ff_lpc_init(s, blocksize, max_order, lpc_type);
    }

-    if (lpc_type == AV_LPC_TYPE_LEVINSON) {
+    if (lpc_type == FF_LPC_TYPE_LEVINSON) {
        double *windowed_samples = s->windowed_samples + max_order;

        s->lpc_apply_welch_window(samples, blocksize, windowed_samples);
@ -188,7 +188,7 @@ int ff_lpc_calc_coefs(LPCContext *s,

        for(i=0; i<max_order; i++)
            ref[i] = fabs(lpc[i][i]);
-    } else if (lpc_type == AV_LPC_TYPE_CHOLESKY) {
+    } else if (lpc_type == FF_LPC_TYPE_CHOLESKY) {
        LLSModel m[2];
        double var[MAX_LPC_ORDER+1], av_uninit(weight);

@ -241,13 +241,13 @@ int ff_lpc_calc_coefs(LPCContext *s,
 }

 av_cold int ff_lpc_init(LPCContext *s, int blocksize, int max_order,
-                        enum AVLPCType lpc_type)
+                        enum FFLPCType lpc_type)
 {
    s->blocksize = blocksize;
    s->max_order = max_order;
    s->lpc_type  = lpc_type;

-    if (lpc_type == AV_LPC_TYPE_LEVINSON) {
+    if (lpc_type == FF_LPC_TYPE_LEVINSON) {
        s->windowed_samples = av_mallocz((blocksize + max_order + 2) *
                                         sizeof(*s->windowed_samples));
        if (!s->windowed_samples)
--- a/libavcodec/lpc.h
+++ b/libavcodec/lpc.h
@ -35,11 +35,22 @@
 #define MIN_LPC_ORDER        1
 #define MAX_LPC_ORDER       32

+/**
+ * LPC analysis type
+ */
+enum FFLPCType {
+    FF_LPC_TYPE_DEFAULT     = -1, ///< use the codec default LPC type
+    FF_LPC_TYPE_NONE        =  0, ///< do not use LPC prediction or use all zero coefficients
+    FF_LPC_TYPE_FIXED       =  1, ///< fixed LPC coefficients
+    FF_LPC_TYPE_LEVINSON    =  2, ///< Levinson-Durbin recursion
+    FF_LPC_TYPE_CHOLESKY    =  3, ///< Cholesky factorization
+    FF_LPC_TYPE_NB              , ///< Not part of ABI
+};

 typedef struct LPCContext {
    int blocksize;
    int max_order;
-    enum AVLPCType lpc_type;
+    enum FFLPCType lpc_type;
    double *windowed_samples;

    /**
@ -77,14 +88,14 @@ int ff_lpc_calc_coefs(LPCContext *s,
                      const int32_t *samples, int blocksize, int min_order,
                      int max_order, int precision,
                      int32_t coefs[][MAX_LPC_ORDER], int *shift,
-                      enum AVLPCType lpc_type, int lpc_passes,
+                      enum FFLPCType lpc_type, int lpc_passes,
                      int omethod, int max_shift, int zero_shift);

 /**
 * Initialize LPCContext.
 */
 int ff_lpc_init(LPCContext *s, int blocksize, int max_order,
-                enum AVLPCType lpc_type);
+                enum FFLPCType lpc_type);
 void ff_lpc_init_x86(LPCContext *s);

 /**
--- a/libavcodec/mlib/dsputil_mlib.c
+++ b/libavcodec/mlib/dsputil_mlib.c
@ -421,13 +421,13 @@ static void ff_fdct_mlib(DCTELEM *data)

 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx)
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    c->get_pixels  = get_pixels_mlib;
    c->diff_pixels = diff_pixels_mlib;
    c->add_pixels_clamped = add_pixels_clamped_mlib;

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->put_pixels_tab[0][0] = put_pixels16_mlib;
    c->put_pixels_tab[0][1] = put_pixels16_x2_mlib;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mlib;
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@ -103,7 +103,7 @@ static const AVOption options[]={
 {"umh", "umh motion estimation", 0, FF_OPT_TYPE_CONST, {.dbl = ME_UMH }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"iter", "iter motion estimation", 0, FF_OPT_TYPE_CONST, {.dbl = ME_ITER }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"extradata_size", NULL, OFFSET(extradata_size), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX},
-{"time_base", NULL, OFFSET(time_base), FF_OPT_TYPE_RATIONAL, {.dbl=0}, INT_MIN, INT_MAX},
+{"time_base", NULL, OFFSET(time_base), FF_OPT_TYPE_RATIONAL, {.dbl = 0}, INT_MIN, INT_MAX},
 {"g", "set the group of picture size", OFFSET(gop_size), FF_OPT_TYPE_INT, {.dbl = 12 }, INT_MIN, INT_MAX, V|E},
 {"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX},
 {"ac", "set number of audio channels", OFFSET(channels), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX},
@ -224,7 +224,7 @@ static const AVOption options[]={
 {"left", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_PRED_LEFT }, INT_MIN, INT_MAX, V|E, "pred"},
 {"plane", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_PRED_PLANE }, INT_MIN, INT_MAX, V|E, "pred"},
 {"median", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_PRED_MEDIAN }, INT_MIN, INT_MAX, V|E, "pred"},
-{"aspect", "sample aspect ratio", OFFSET(sample_aspect_ratio), FF_OPT_TYPE_RATIONAL, {.dbl=0}, 0, 10, V|E},
+{"aspect", "sample aspect ratio", OFFSET(sample_aspect_ratio), FF_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E},
 {"debug", "print specific debug info", OFFSET(debug), FF_OPT_TYPE_FLAGS, {.dbl = DEFAULT }, 0, INT_MAX, V|A|S|E|D, "debug"},
 {"pict", "picture info", 0, FF_OPT_TYPE_CONST, {.dbl = FF_DEBUG_PICT_INFO }, INT_MIN, INT_MAX, V|D, "debug"},
 {"rc", "rate control", 0, FF_OPT_TYPE_CONST, {.dbl = FF_DEBUG_RC }, INT_MIN, INT_MAX, V|E, "debug"},
@ -380,12 +380,14 @@ static const AVOption options[]={
 {"ivlc", "intra vlc table", 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_INTRA_VLC }, INT_MIN, INT_MAX, V|E, "flags2"},
 {"b_sensitivity", "adjusts sensitivity of b_frame_strategy 1", OFFSET(b_sensitivity), FF_OPT_TYPE_INT, {.dbl = 40 }, 1, INT_MAX, V|E},
 {"compression_level", NULL, OFFSET(compression_level), FF_OPT_TYPE_INT, {.dbl = FF_COMPRESSION_DEFAULT }, INT_MIN, INT_MAX, V|A|E},
-{"lpc_coeff_precision", "LPC coefficient precision (FLAC)", OFFSET(lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, 0, INT_MAX, A|E},
 {"min_prediction_order", NULL, OFFSET(min_prediction_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
 {"max_prediction_order", NULL, OFFSET(max_prediction_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
-{"prediction_order_method", "search method for selecting prediction order", OFFSET(prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
-{"min_partition_order", NULL, OFFSET(min_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
-{"max_partition_order", NULL, OFFSET(max_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
+#if FF_API_FLAC_GLOBAL_OPTS
+{"lpc_coeff_precision", "deprecated, use flac-specific options", OFFSET(lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, 0, INT_MAX, A|E},
+{"prediction_order_method", "deprecated, use flac-specific options", OFFSET(prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
+{"min_partition_order", "deprecated, use flac-specific options", OFFSET(min_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
+{"max_partition_order", "deprecated, use flac-specific options", OFFSET(max_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
+#endif
 {"timecode_frame_start", "GOP timecode frame start number, in non drop frame format", OFFSET(timecode_frame_start), FF_OPT_TYPE_INT64, {.dbl = 0 }, 0, INT64_MAX, V|E},
 {"drop_frame_timecode", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_DROP_FRAME_TIMECODE }, INT_MIN, INT_MAX, V|E, "flags2"},
 {"non_linear_q", "use non linear quantizer", 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_NON_LINEAR_QUANT }, INT_MIN, INT_MAX, V|E, "flags2"},
@ -416,12 +418,14 @@ static const AVOption options[]={
 {"intra_refresh", "use periodic insertion of intra blocks instead of keyframes", 0, FF_OPT_TYPE_CONST, {.dbl = CODEC_FLAG2_INTRA_REFRESH }, INT_MIN, INT_MAX, V|E, "flags2"},
 {"crf_max", "in crf mode, prevents vbv from lowering quality beyond this point", OFFSET(crf_max), FF_OPT_TYPE_FLOAT, {.dbl = DEFAULT }, 0, 51, V|E},
 {"log_level_offset", "set the log level offset", OFFSET(log_level_offset), FF_OPT_TYPE_INT, {.dbl = 0 }, INT_MIN, INT_MAX },
-{"lpc_type", "specify LPC algorithm", OFFSET(lpc_type), FF_OPT_TYPE_INT, {.dbl = AV_LPC_TYPE_DEFAULT }, AV_LPC_TYPE_DEFAULT, AV_LPC_TYPE_NB-1, A|E},
+#if FF_API_FLAC_GLOBAL_OPTS
+{"lpc_type", "deprecated, use flac-specific options", OFFSET(lpc_type), FF_OPT_TYPE_INT, {.dbl = AV_LPC_TYPE_DEFAULT }, AV_LPC_TYPE_DEFAULT, AV_LPC_TYPE_NB-1, A|E},
 {"none",     NULL, 0, FF_OPT_TYPE_CONST, {.dbl = AV_LPC_TYPE_NONE },     INT_MIN, INT_MAX, A|E, "lpc_type"},
 {"fixed",    NULL, 0, FF_OPT_TYPE_CONST, {.dbl = AV_LPC_TYPE_FIXED },    INT_MIN, INT_MAX, A|E, "lpc_type"},
 {"levinson", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = AV_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, A|E, "lpc_type"},
 {"cholesky", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = AV_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, A|E, "lpc_type"},
-{"lpc_passes", "number of passes to use for Cholesky factorization during LPC analysis", OFFSET(lpc_passes), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
+{"lpc_passes", "deprecated, use flac-specific options", OFFSET(lpc_passes), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, A|E},
+#endif
 {"slices", "number of slices, used in parallelized decoding", OFFSET(slices), FF_OPT_TYPE_INT, {.dbl = 0 }, 0, INT_MAX, V|E},
 {"thread_type", "select multithreading type", OFFSET(thread_type), FF_OPT_TYPE_INT, {.dbl = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|E|D, "thread_type"},
 {"slice", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_THREAD_SLICE }, INT_MIN, INT_MAX, V|E|D, "thread_type"},
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@ -1384,7 +1384,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int l

 void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
@ -1399,10 +1399,10 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
-    if (!h264_high_depth)
+    if (!high_bit_depth)
    c->clear_block = clear_block_altivec;
    c->add_bytes= add_bytes_altivec;
-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@ -153,11 +153,11 @@ static void prefetch_ppc(void *mem, int stride, int h)

 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
 {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    // Common optimizations whether AltiVec is available or not
    c->prefetch = prefetch_ppc;
-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    switch (check_dcbzl_effect()) {
        case 32:
            c->clear_blocks = clear_blocks_dcbz32_ppc;
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@ -965,10 +965,10 @@ H264_WEIGHT( 8, 8)
 H264_WEIGHT( 8, 4)

 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;

--- a/libavcodec/ps2/dsputil_mmi.c
+++ b/libavcodec/ps2/dsputil_mmi.c
@ -142,9 +142,9 @@ static void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels, int line_siz
 void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx)
 {
    const int idct_algo= avctx->idct_algo;
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->clear_blocks = clear_blocks_mmi;

    c->put_pixels_tab[1][0] = put_pixels8_mmi;
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@ -54,7 +54,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx)
    ractx->lpc_coef[1] = ractx->lpc_tables[1];
    ractx->avctx = avctx;
    ret = ff_lpc_init(&ractx->lpc_ctx, avctx->frame_size, LPC_ORDER,
-                      AV_LPC_TYPE_LEVINSON);
+                      FF_LPC_TYPE_LEVINSON);
    return ret;
 }

@ -461,7 +461,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, uint8_t *frame,
                                    32)];

    ff_lpc_calc_coefs(&ractx->lpc_ctx, lpc_data, NBLOCKS * BLOCKSIZE, LPC_ORDER,
-                      LPC_ORDER, 16, lpc_coefs, shift, AV_LPC_TYPE_LEVINSON,
+                      LPC_ORDER, 16, lpc_coefs, shift, FF_LPC_TYPE_LEVINSON,
                      0, ORDER_METHOD_EST, 12, 0);
    for (i = 0; i < LPC_ORDER; i++)
        block_coefs[NBLOCKS - 1][i] = -(lpc_coefs[LPC_ORDER - 1][i] <<
--- a/libavcodec/sh4/dsputil_align.c
+++ b/libavcodec/sh4/dsputil_align.c
@ -333,9 +333,9 @@ DEFFUNC(avg,no_rnd,xy,16,OP_XY,PACK)

 void dsputil_init_align(DSPContext* c, AVCodecContext *avctx)
 {
-        const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+        const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

-        if (!h264_high_depth) {
+        if (!high_bit_depth) {
        c->put_pixels_tab[0][0] = put_rnd_pixels16_o;
        c->put_pixels_tab[0][1] = put_rnd_pixels16_x;
        c->put_pixels_tab[0][2] = put_rnd_pixels16_y;
@ -405,7 +405,7 @@ void dsputil_init_align(DSPContext* c, AVCodecContext *avctx)
    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
@ -415,7 +415,7 @@ void dsputil_init_align(DSPContext* c, AVCodecContext *avctx)
    }

 #undef dspfunc
-    if (!h264_high_depth) {
+    if (!high_bit_depth) {
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_sh4;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_sh4;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_sh4;
--- a/libavcodec/sh4/dsputil_sh4.c
+++ b/libavcodec/sh4/dsputil_sh4.c
@ -92,10 +92,10 @@ static void idct_add(uint8_t *dest, int line_size, DCTELEM *block)
 void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
 {
        const int idct_algo= avctx->idct_algo;
-        const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+        const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
        dsputil_init_align(c,avctx);

-        if (!h264_high_depth)
+        if (!high_bit_depth)
        c->clear_blocks = clear_blocks_sh4;
        if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SH4){
                c->idct_put = idct_put;
--- a/libavcodec/sparc/dsputil_vis.c
+++ b/libavcodec/sparc/dsputil_vis.c
@ -3953,7 +3953,7 @@ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
 {
  /* VIS-specific optimizations */
  int accel = vis_level ();
-  const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+  const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

  if (accel & ACCEL_SPARC_VIS) {
      if(avctx->idct_algo==FF_IDCT_SIMPLEVIS){
@ -3963,7 +3963,7 @@ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
          c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
      }

-      if (!h264_high_depth) {
+      if (!high_bit_depth) {
      c->put_pixels_tab[0][0] = MC_put_o_16_vis;
      c->put_pixels_tab[0][1] = MC_put_x_16_vis;
      c->put_pixels_tab[0][2] = MC_put_y_16_vis;
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@ -1297,9 +1297,9 @@ int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op))
 unsigned int ff_toupper4(unsigned int x)
 {
    return     toupper( x     &0xFF)
-    + (toupper((x>>8 )&0xFF)<<8 )
-    + (toupper((x>>16)&0xFF)<<16)
-    + (toupper((x>>24)&0xFF)<<24);
+            + (toupper((x>>8 )&0xFF)<<8 )
+            + (toupper((x>>16)&0xFF)<<16)
+            + (toupper((x>>24)&0xFF)<<24);
 }

 #if !HAVE_PTHREADS
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@ -62,5 +62,8 @@
 #ifndef FF_API_OLD_FF_PICT_TYPES
 #define FF_API_OLD_FF_PICT_TYPES (LIBAVCODEC_VERSION_MAJOR < 54)
 #endif
+#ifndef FF_API_FLAC_GLOBAL_OPTS
+#define FF_API_FLAC_GLOBAL_OPTS (LIBAVCODEC_VERSION_MAJOR < 54)
+#endif

 #endif /* AVCODEC_VERSION_H */
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@ -2322,7 +2322,7 @@ float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
    int mm_flags = av_get_cpu_flags();
-    const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
+    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
@ -2404,7 +2404,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
        c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
-        if (!h264_high_depth) {
+        if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        if ((mm_flags & AV_CPU_FLAG_SSE) &&
@ -2421,7 +2421,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU

-        if (!h264_high_depth) {
+        if (!high_bit_depth) {
        SET_HPEL_FUNCS(put, 0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg, 0, 16, mmx);
@ -2436,13 +2436,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        c->gmc= gmc_mmx;
 #endif
 #if ARCH_X86_32 && HAVE_YASM
-        if (!h264_high_depth)
+        if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_mmx;
 #endif

        c->add_bytes= add_bytes_mmx;

-        if (!h264_high_depth)
+        if (!high_bit_depth)
        c->draw_edges = draw_edges_mmx;

        if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
@ -2451,7 +2451,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        }

 #if HAVE_YASM
-        if (!h264_high_depth) {
+        if (!high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
        c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
        }
@ -2463,7 +2463,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->prefetch = prefetch_mmx2;

-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

@ -2480,7 +2480,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            }

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-                if (!h264_high_depth) {
+                if (!high_bit_depth) {
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
@ -2529,7 +2529,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);

-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
@ -2547,7 +2547,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
            c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;

-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
            c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
            c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
@ -2564,7 +2564,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
            c->prefetch = prefetch_3dnow;

-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

@ -2602,7 +2602,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);

-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
@ -2617,7 +2617,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);

 #if HAVE_YASM
-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
            c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
            }
@ -2635,7 +2635,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
        if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
            // these functions are slower than mmx on AMD, but faster on Intel
-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            c->put_pixels_tab[0][0] = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
@ -2643,7 +2643,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            }
        }
        if(mm_flags & AV_CPU_FLAG_SSE2){
-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 1, sse2);
            H264_QPEL_FUNCS(0, 2, sse2);
            H264_QPEL_FUNCS(0, 3, sse2);
@ -2660,7 +2660,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        }
 #if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            H264_QPEL_FUNCS(1, 0, ssse3);
            H264_QPEL_FUNCS(1, 1, ssse3);
            H264_QPEL_FUNCS(1, 2, ssse3);
@ -2675,7 +2675,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            H264_QPEL_FUNCS(3, 3, ssse3);
            }
 #if HAVE_YASM
-            if (!h264_high_depth) {
+            if (!high_bit_depth) {
            c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
            c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
            c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
@ -2737,7 +2737,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                }
            }

-            if (!h264_high_depth)
+            if (!high_bit_depth)
            c->emulated_edge_mc = emulated_edge_mc_sse;
            c->gmc= gmc_sse;
 #endif
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@ -3048,12 +3048,12 @@ static int ff_interleave_compare_dts(AVFormatContext *s, AVPacket *next, AVPacke
 {
    AVStream *st = s->streams[ pkt ->stream_index];
    AVStream *st2= s->streams[ next->stream_index];
-    int64_t a= st2->time_base.num * (int64_t)st ->time_base.den;
-    int64_t b= st ->time_base.num * (int64_t)st2->time_base.den;
-    int64_t dts1 = av_rescale_rnd(pkt->dts, b, a, AV_ROUND_DOWN);
-    if (dts1==next->dts && dts1==av_rescale_rnd(pkt->dts, b, a, AV_ROUND_UP))
+    int comp = av_compare_ts(next->dts, st2->time_base, pkt->dts,
+                             st->time_base);
+
+    if (comp == 0)
        return pkt->stream_index < next->stream_index;
-    return dts1 < next->dts;
+    return comp > 0;
 }

 int av_interleave_packet_per_dts(AVFormatContext *s, AVPacket *out, AVPacket *pkt, int flush){
--- a/libavutil/opt.h
+++ b/libavutil/opt.h
@ -67,6 +67,9 @@ typedef struct AVOption {
    union {
        double dbl;
        const char *str;
+        /* TODO those are unused now */
+        int64_t i64;
+        AVRational q;
    } default_val;
    double min;                 ///< minimum valid value for the option
    double max;                 ///< maximum valid value for the option
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@ -136,7 +136,7 @@ enum PixelFormat {
    PIX_FMT_BGR48BE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 2-byte value for each R/G/B component is stored as big-endian
    PIX_FMT_BGR48LE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 2-byte value for each R/G/B component is stored as little-endian

-    //the following 4 formats are deprecated and should be replaced by PIX_FMT_YUV420P16* with the bpp stored seperately
+    //the following 6 formats are deprecated and should be replaced by PIX_FMT_YUV420P16* with the bpp stored seperately
    PIX_FMT_YUV420P9BE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
    PIX_FMT_YUV420P9LE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
    PIX_FMT_YUV420P10BE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@ -107,9 +107,13 @@ const char *swscale_license(void)
        || (x)==PIX_FMT_YUV440P     \
        || (x)==PIX_FMT_MONOWHITE   \
        || (x)==PIX_FMT_MONOBLACK   \
+        || (x)==PIX_FMT_YUV420P9LE    \
+        || (x)==PIX_FMT_YUV420P10LE   \
        || (x)==PIX_FMT_YUV420P16LE   \
        || (x)==PIX_FMT_YUV422P16LE   \
        || (x)==PIX_FMT_YUV444P16LE   \
+        || (x)==PIX_FMT_YUV420P9BE    \
+        || (x)==PIX_FMT_YUV420P10BE   \
        || (x)==PIX_FMT_YUV420P16BE   \
        || (x)==PIX_FMT_YUV422P16BE   \
        || (x)==PIX_FMT_YUV444P16BE   \
--- a/tests/fate/h264.mak
+++ b/tests/fate/h264.mak
@ -127,6 +127,13 @@ FATE_H264 = aud_mw_e                                                    \
            frext-hpcvflnl_bcrm_a                                       \
            frext-hpcvmolq_brcm_b                                       \
            frext-hpcvnl_brcm_a                                         \
+            frext-pph10i1_panasonic_a                                   \
+            frext-pph10i2_panasonic_a                                   \
+            frext-pph10i3_panasonic_a                                   \
+            frext-pph10i4_panasonic_a                                   \
+            frext-pph10i5_panasonic_a                                   \
+            frext-pph10i6_panasonic_a                                   \
+            frext-pph10i7_panasonic_a                                   \
            hcbp2_hhi_a                                                 \
            hcmp1_hhi_a                                                 \
            ls_sva_d                                                    \
@ -301,6 +308,13 @@ fate-h264-conformance-frext-hpcvfl_bcrm_a: CMD = framecrc  -i $(SAMPLES)/h264-co
 fate-h264-conformance-frext-hpcvflnl_bcrm_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/HPCVFLNL_BRCM_A.264 -vsync 0
 fate-h264-conformance-frext-hpcvmolq_brcm_b: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/HPCVMOLQ_BRCM_B.264
 fate-h264-conformance-frext-hpcvnl_brcm_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/HPCVNL_BRCM_A.264
+fate-h264-conformance-frext-pph10i1_panasonic_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/PPH10I1_Panasonic_A.264
+fate-h264-conformance-frext-pph10i2_panasonic_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/PPH10I2_Panasonic_A.264
+fate-h264-conformance-frext-pph10i3_panasonic_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/PPH10I3_Panasonic_A.264
+fate-h264-conformance-frext-pph10i4_panasonic_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/PPH10I4_Panasonic_A.264
+fate-h264-conformance-frext-pph10i5_panasonic_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/PPH10I5_Panasonic_A.264
+fate-h264-conformance-frext-pph10i6_panasonic_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/PPH10I6_Panasonic_A.264
+fate-h264-conformance-frext-pph10i7_panasonic_a: CMD = framecrc  -i $(SAMPLES)/h264-conformance/FRext/PPH10I7_Panasonic_A.264
 fate-h264-conformance-hcbp2_hhi_a: CMD = framecrc  -vsync 0 -strict 1 -i $(SAMPLES)/h264-conformance/HCBP2_HHI_A.264
 fate-h264-conformance-hcmp1_hhi_a: CMD = framecrc  -vsync 0 -strict 1 -i $(SAMPLES)/h264-conformance/HCMP1_HHI_A.264
 fate-h264-conformance-ls_sva_d: CMD = framecrc  -i $(SAMPLES)/h264-conformance/LS_SVA_D.264
--- a/tests/ref/fate/h264-conformance-frext-pph10i1_panasonic_a
+++ b/tests/ref/fate/h264-conformance-frext-pph10i1_panasonic_a
@ -0,0 +1,10 @@
+0, 0, 2764800, 0xcc4df07d
+0, 3600, 2764800, 0x85f9e6d4
+0, 7200, 2764800, 0x23ffe90d
+0, 10800, 2764800, 0xf0a6d453
+0, 14400, 2764800, 0x913a6392
+0, 18000, 2764800, 0xcc5f9736
+0, 21600, 2764800, 0x43f9f9ce
+0, 25200, 2764800, 0xc874b44e
+0, 28800, 2764800, 0x83b665e6
+0, 32400, 2764800, 0x5ea2e31e
--- a/tests/ref/fate/h264-conformance-frext-pph10i2_panasonic_a
+++ b/tests/ref/fate/h264-conformance-frext-pph10i2_panasonic_a
@ -0,0 +1,10 @@
+0, 0, 2764800, 0x4f710132
+0, 3600, 2764800, 0x57e5b713
+0, 7200, 2764800, 0xcca01477
+0, 10800, 2764800, 0xa19a95cd
+0, 14400, 2764800, 0x700a757d
+0, 18000, 2764800, 0xd8c6f60f
+0, 21600, 2764800, 0x95a1bbc7
+0, 25200, 2764800, 0x0582077a
+0, 28800, 2764800, 0x91595f91
+0, 32400, 2764800, 0xf5fe034a
--- a/tests/ref/fate/h264-conformance-frext-pph10i3_panasonic_a
+++ b/tests/ref/fate/h264-conformance-frext-pph10i3_panasonic_a
@ -0,0 +1,10 @@
+0, 0, 2764800, 0xda69f69e
+0, 3600, 2764800, 0x29ed832f
+0, 7200, 2764800, 0xb3244cc4
+0, 10800, 2764800, 0xe41a312c
+0, 14400, 2764800, 0xac0b344b
+0, 18000, 2764800, 0xc585aa20
+0, 21600, 2764800, 0x0952054c
+0, 25200, 2764800, 0xd1a02f87
+0, 28800, 2764800, 0xfcbfe87c
+0, 32400, 2764800, 0xe4e9b8a2
--- a/tests/ref/fate/h264-conformance-frext-pph10i4_panasonic_a
+++ b/tests/ref/fate/h264-conformance-frext-pph10i4_panasonic_a
@ -0,0 +1,19 @@
+0, 0, 6220800, 0xca2a2a5e
+0, 3600, 6220800, 0x8009a65e
+0, 7200, 6220800, 0x63e72b3b
+0, 10800, 6220800, 0x7459a1cc
+0, 14400, 6220800, 0x02191aa9
+0, 18000, 6220800, 0x88dca590
+0, 21600, 6220800, 0x56dd150a
+0, 25200, 6220800, 0x5f56a56f
+0, 28800, 6220800, 0x67ada4b7
+0, 32400, 6220800, 0x88dca590
+0, 36000, 6220800, 0xd3b09fe5
+0, 39600, 6220800, 0x2223998c
+0, 43200, 6220800, 0x5e5b2da5
+0, 46800, 6220800, 0x88dca590
+0, 50400, 6220800, 0x5e5b2da5
+0, 54000, 6220800, 0x88dca590
+0, 57600, 6220800, 0x5e5b2da5
+0, 61200, 6220800, 0x88dca590
+0, 64800, 6220800, 0x26e1ec8b
--- a/tests/ref/fate/h264-conformance-frext-pph10i5_panasonic_a
+++ b/tests/ref/fate/h264-conformance-frext-pph10i5_panasonic_a
@ -0,0 +1,10 @@
+0, 0, 6220800, 0x1df58ce9
+0, 3600, 6220800, 0x8f2859ce
+0, 7200, 6220800, 0x229cc7ff
+0, 10800, 6220800, 0x73e86984
+0, 14400, 6220800, 0xb6d4504b
+0, 18000, 6220800, 0x4e7d4883
+0, 21600, 6220800, 0xbec3f0f7
+0, 25200, 6220800, 0x1d9af065
+0, 28800, 6220800, 0x44851549
+0, 32400, 6220800, 0xfcf8728e
--- a/tests/ref/fate/h264-conformance-frext-pph10i6_panasonic_a
+++ b/tests/ref/fate/h264-conformance-frext-pph10i6_panasonic_a
@ -0,0 +1,10 @@
+0, 0, 6220800, 0x408daf70
+0, 3600, 6220800, 0x59b254a3
+0, 7200, 6220800, 0x4cf4279c
+0, 10800, 6220800, 0x5c9437ae
+0, 14400, 6220800, 0x986c3eb8
+0, 18000, 6220800, 0x23fd883e
+0, 21600, 6220800, 0x84f222fe
+0, 25200, 6220800, 0xe7f91107
+0, 28800, 6220800, 0xb544b31e
+0, 32400, 6220800, 0x1ebdde56
--- a/tests/ref/fate/h264-conformance-frext-pph10i7_panasonic_a
+++ b/tests/ref/fate/h264-conformance-frext-pph10i7_panasonic_a
@ -0,0 +1,10 @@
+0, 0, 6220800, 0xf81873fe
+0, 3600, 6220800, 0x7b96fbdc
+0, 7200, 6220800, 0x75dbafc4
+0, 10800, 6220800, 0x7524301e
+0, 14400, 6220800, 0x0f3621ab
+0, 18000, 6220800, 0xa5e25b35
+0, 21600, 6220800, 0x063a8116
+0, 25200, 6220800, 0x48ebc8ff
+0, 28800, 6220800, 0x1f635df8
+0, 32400, 6220800, 0xe282c8bd