From 4e528206bc4d968706401206cf54471739250ec7 Mon Sep 17 00:00:00 2001
From: Mark Thompson <sw@jkqxz.net>
Date: Sun, 4 Sep 2016 13:26:37 +0100
Subject: [PATCH 1/4] vp8: Add hwaccel hooks

Also adds some extra fields to the main context structure that may
be needed by a hwaccel decoder.
---
 libavcodec/vp8.c | 185 ++++++++++++++++++++++++++++++++---------------
 libavcodec/vp8.h |  32 ++++++++
 2 files changed, 157 insertions(+), 60 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 546124cdf2..ced49799bc 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -64,16 +64,30 @@ static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
         return ret;
-    if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
-        ff_thread_release_buffer(s->avctx, &f->tf);
-        return AVERROR(ENOMEM);
+    if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
+        goto fail;
+    if (s->avctx->hwaccel) {
+        const AVHWAccel *hwaccel = s->avctx->hwaccel;
+        if (hwaccel->frame_priv_data_size) {
+            f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
+            if (!f->hwaccel_priv_buf)
+                goto fail;
+            f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
+        }
     }
     return 0;
+
+fail:
+    av_buffer_unref(&f->seg_map);
+    ff_thread_release_buffer(s->avctx, &f->tf);
+    return AVERROR(ENOMEM);
 }
 
 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
 {
     av_buffer_unref(&f->seg_map);
+    av_buffer_unref(&f->hwaccel_priv_buf);
+    f->hwaccel_picture_private = NULL; // pointed into hwaccel_priv_buf
     ff_thread_release_buffer(s->avctx, &f->tf);
 }
 
@@ -91,6 +105,12 @@ static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
         vp8_release_frame(s, dst);
         return AVERROR(ENOMEM);
     }
+    if (src->hwaccel_picture_private) {
+        dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
+        if (!dst->hwaccel_priv_buf)
+            return AVERROR(ENOMEM);
+        dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
+    }
 
     return 0;
 }
@@ -132,7 +152,7 @@ static VP8Frame *vp8_find_free_buffer(VP8Context *s)
         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
         abort();
     }
-    if (frame->tf.f->data[0])
+    if (frame->tf.f->buf[0])
         vp8_release_frame(s, frame);
 
     return frame;
@@ -209,8 +229,9 @@ static void parse_segment_info(VP8Context *s)
     int i;
 
     s->segmentation.update_map = vp8_rac_get(c);
+    s->segmentation.update_feature_data = vp8_rac_get(c);
 
-    if (vp8_rac_get(c)) { // update segment feature data
+    if (s->segmentation.update_feature_data) {
         s->segmentation.absolute_vals = vp8_rac_get(c);
 
         for (i = 0; i < 4; i++)
@@ -264,11 +285,14 @@ static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
         int size = AV_RL24(sizes + 3 * i);
         if (buf_size - size < 0)
             return -1;
+        s->coeff_partition_size[i] = size;
 
         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
         buf      += size;
         buf_size -= size;
     }
+
+    s->coeff_partition_size[i] = buf_size;
     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 
     return 0;
@@ -298,28 +322,28 @@ static void get_quants(VP8Context *s)
     VP56RangeCoder *c = &s->c;
     int i, base_qi;
 
-    int yac_qi     = vp8_rac_get_uint(c, 7);
-    int ydc_delta  = vp8_rac_get_sint(c, 4);
-    int y2dc_delta = vp8_rac_get_sint(c, 4);
-    int y2ac_delta = vp8_rac_get_sint(c, 4);
-    int uvdc_delta = vp8_rac_get_sint(c, 4);
-    int uvac_delta = vp8_rac_get_sint(c, 4);
+    s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
+    s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
+    s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
+    s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
+    s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
+    s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 
     for (i = 0; i < 4; i++) {
         if (s->segmentation.enabled) {
             base_qi = s->segmentation.base_quant[i];
             if (!s->segmentation.absolute_vals)
-                base_qi += yac_qi;
+                base_qi += s->quant.yac_qi;
         } else
-            base_qi = yac_qi;
+            base_qi = s->quant.yac_qi;
 
-        s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
+        s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
-        s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
+        s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
         /* 101581>>16 is equivalent to 155/100 */
-        s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
-        s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
-        s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
+        s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
+        s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
+        s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 
         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
@@ -637,6 +661,8 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     buf      += 3;
     buf_size -= 3;
 
+    s->header_partition_size = header_size;
+
     if (s->profile > 3)
         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 
@@ -700,9 +726,11 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     s->filter.level     = vp8_rac_get_uint(c, 6);
     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 
-    if ((s->lf_delta.enabled = vp8_rac_get(c)))
-        if (vp8_rac_get(c))
+    if ((s->lf_delta.enabled = vp8_rac_get(c))) {
+        s->lf_delta.update = vp8_rac_get(c);
+        if (s->lf_delta.update)
             update_lf_deltas(s);
+    }
 
     if (setup_partitions(s, buf, buf_size)) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
@@ -741,6 +769,13 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
     }
 
+    // Record the entropy coder state here so that hwaccels can use it.
+    s->c.code_word = vp56_rac_renorm(&s->c);
+    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
+    s->coder_state_at_header_end.range     = s->c.high;
+    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
+    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
+
     return 0;
 }
 
@@ -2462,7 +2497,6 @@ static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
 }
 
-
 static av_always_inline
 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                       AVPacket *avpkt, int is_vp7)
@@ -2480,6 +2514,20 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (ret < 0)
         goto err;
 
+    if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
+        enum AVPixelFormat pix_fmts[] = {
+            AV_PIX_FMT_YUV420P,
+            AV_PIX_FMT_NONE,
+        };
+
+        s->pix_fmt = ff_get_format(s->avctx, pix_fmts);
+        if (s->pix_fmt < 0) {
+            ret = AVERROR(EINVAL);
+            goto err;
+        }
+        avctx->pix_fmt = s->pix_fmt;
+    }
+
     prev_frame = s->framep[VP56_FRAME_CURRENT];
 
     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
@@ -2555,51 +2603,67 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     ff_thread_finish_setup(avctx);
 
-    s->linesize   = curframe->tf.f->linesize[0];
-    s->uvlinesize = curframe->tf.f->linesize[1];
+    if (avctx->hwaccel) {
+        ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
+        if (ret < 0)
+            goto err;
 
-    memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
-    /* Zero macroblock structures for top/top-left prediction
-     * from outside the frame. */
-    if (!s->mb_layout)
-        memset(s->macroblocks + s->mb_height * 2 - 1, 0,
-               (s->mb_width + 1) * sizeof(*s->macroblocks));
-    if (!s->mb_layout && s->keyframe)
-        memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
+        ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
+        if (ret < 0)
+            goto err;
 
-    memset(s->ref_count, 0, sizeof(s->ref_count));
+        ret = avctx->hwaccel->end_frame(avctx);
+        if (ret < 0)
+            goto err;
 
-    if (s->mb_layout == 1) {
-        // Make sure the previous frame has read its segmentation map,
-        // if we re-use the same map.
-        if (prev_frame && s->segmentation.enabled &&
-            !s->segmentation.update_map)
-            ff_thread_await_progress(&prev_frame->tf, 1, 0);
-        if (is_vp7)
-            vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
+    } else {
+        s->linesize   = curframe->tf.f->linesize[0];
+        s->uvlinesize = curframe->tf.f->linesize[1];
+
+        memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
+        /* Zero macroblock structures for top/top-left prediction
+         * from outside the frame. */
+        if (!s->mb_layout)
+            memset(s->macroblocks + s->mb_height * 2 - 1, 0,
+                   (s->mb_width + 1) * sizeof(*s->macroblocks));
+        if (!s->mb_layout && s->keyframe)
+            memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
+
+        memset(s->ref_count, 0, sizeof(s->ref_count));
+
+        if (s->mb_layout == 1) {
+            // Make sure the previous frame has read its segmentation map,
+            // if we re-use the same map.
+            if (prev_frame && s->segmentation.enabled &&
+                !s->segmentation.update_map)
+                ff_thread_await_progress(&prev_frame->tf, 1, 0);
+            if (is_vp7)
+                vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
+            else
+                vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
+        }
+
+        if (avctx->active_thread_type == FF_THREAD_FRAME)
+            num_jobs = 1;
         else
-            vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
-    }
+            num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
+        s->num_jobs   = num_jobs;
+        s->curframe   = curframe;
+        s->prev_frame = prev_frame;
+        s->mv_min.y   = -MARGIN;
+        s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
+        for (i = 0; i < MAX_THREADS; i++) {
+            s->thread_data[i].thread_mb_pos = 0;
+            s->thread_data[i].wait_mb_pos   = INT_MAX;
+        }
 
-    if (avctx->active_thread_type == FF_THREAD_FRAME)
-        num_jobs = 1;
-    else
-        num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
-    s->num_jobs   = num_jobs;
-    s->curframe   = curframe;
-    s->prev_frame = prev_frame;
-    s->mv_min.y   = -MARGIN;
-    s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
-    for (i = 0; i < MAX_THREADS; i++) {
-        s->thread_data[i].thread_mb_pos = 0;
-        s->thread_data[i].wait_mb_pos   = INT_MAX;
+        if (is_vp7)
+            avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
+                            num_jobs);
+        else
+            avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
+                            num_jobs);
     }
-    if (is_vp7)
-        avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
-                        num_jobs);
-    else
-        avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
-                        num_jobs);
 
     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
@@ -2666,6 +2730,7 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     int ret;
 
     s->avctx = avctx;
+    s->pix_fmt = AV_PIX_FMT_NONE;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->internal->allocate_progress = 1;
 
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 65948e1d6b..1870705ad2 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -130,12 +130,17 @@ typedef struct VP8ThreadData {
 typedef struct VP8Frame {
     ThreadFrame tf;
     AVBufferRef *seg_map;
+
+    AVBufferRef *hwaccel_priv_buf; ///< hwaccel frame_priv_data_size bytes, if any
+    void *hwaccel_picture_private; ///< hwaccel_priv_buf->data, NULL when unset
 } VP8Frame;
 
 #define MAX_THREADS 8
 typedef struct VP8Context {
     VP8ThreadData *thread_data;
     AVCodecContext *avctx;
+    enum AVPixelFormat pix_fmt;
+
     VP8Frame *framep[4];
     VP8Frame *next_framep[4];
     VP8Frame *curframe;
@@ -165,6 +170,7 @@ typedef struct VP8Context {
         uint8_t enabled;
         uint8_t absolute_vals;
         uint8_t update_map;
+        uint8_t update_feature_data;
         int8_t base_quant[4];
         int8_t filter_level[4];     ///< base loop filter level
     } segmentation;
@@ -192,8 +198,19 @@ typedef struct VP8Context {
         int16_t chroma_qmul[2];
     } qmat[4];
 
+    // Raw quantisation values, which may be needed by hwaccel decode.
+    struct {
+        int yac_qi;
+        int ydc_delta;
+        int y2dc_delta;
+        int y2ac_delta;
+        int uvdc_delta;
+        int uvac_delta;
+    } quant;
+
     struct {
         uint8_t enabled;    ///< whether each mb can have a different strength based on mode/ref
+        uint8_t update;
 
         /**
          * filter strength adjustment for the following macroblock modes:
@@ -221,6 +238,20 @@ typedef struct VP8Context {
 
     VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
 
+    /* This contains the entropy coder state at the end of the header
+     * block, in the form specified by the standard.  For use by
+     * hwaccels, so that a hardware decoder has the information to
+     * start decoding at the macroblock layer.
+     */
+    struct {
+        const uint8_t *input;
+        uint32_t range;
+        uint32_t value;
+        int bit_count;
+    } coder_state_at_header_end;
+
+    int header_partition_size;
+
     /**
      * These are all of the updatable probabilities for binary decisions.
      * They are only implicitly reset on keyframes, making it quite likely
@@ -258,6 +289,7 @@ typedef struct VP8Context {
      */
     int num_coeff_partitions;
     VP56RangeCoder coeff_partition[8];
+    int coeff_partition_size[8];
     VideoDSPContext vdsp;
     VP8DSPContext vp8dsp;
     H264PredContext hpc;

From a9fb134730da1f9642eb5a2baa50943b8a4aa245 Mon Sep 17 00:00:00 2001
From: Mark Thompson <sw@jkqxz.net>
Date: Sun, 4 Sep 2016 13:28:10 +0100
Subject: [PATCH 2/4] lavc/vaapi: Add VP8 decode hwaccel

---
 configure              |   3 +
 libavcodec/Makefile    |   1 +
 libavcodec/allcodecs.c |   1 +
 libavcodec/vaapi_vp8.c | 231 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/vp8.c       |   3 +
 5 files changed, 239 insertions(+)
 create mode 100644 libavcodec/vaapi_vp8.c

diff --git a/configure b/configure
index 7ad920f4be..520f07ccff 100755
--- a/configure
+++ b/configure
@@ -2185,6 +2185,8 @@ vc1_vaapi_hwaccel_deps="vaapi"
 vc1_vaapi_hwaccel_select="vc1_decoder"
 vc1_vdpau_hwaccel_deps="vdpau"
 vc1_vdpau_hwaccel_select="vc1_decoder"
+vp8_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferVP8"
+vp8_vaapi_hwaccel_select="vp8_decoder"
 wmv3_d3d11va_hwaccel_select="vc1_d3d11va_hwaccel"
 wmv3_dxva2_hwaccel_select="vc1_dxva2_hwaccel"
 wmv3_vaapi_hwaccel_select="vc1_vaapi_hwaccel"
@@ -4544,6 +4546,7 @@ check_type "windows.h dxva.h" "DXVA_PicParams_HEVC" -DWINAPI_FAMILY=WINAPI_FAMIL
 check_type "windows.h d3d11.h" "ID3D11VideoDecoder"
 check_type "d3d9.h dxva2api.h" DXVA2_ConfigPictureDecode -D_WIN32_WINNT=0x0602
 
+check_type "va/va.h va/va_dec_vp8.h" "VAPictureParameterBufferVP8"
 check_type "va/va.h va/va_vpp.h" "VAProcPipelineParameterBuffer"
 check_type "va/va.h va/va_enc_h264.h" "VAEncPictureParameterBufferH264"
 check_type "va/va.h va/va_enc_hevc.h" "VAEncPictureParameterBufferHEVC"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 974480f06d..bec461b80c 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -634,6 +634,7 @@ OBJS-$(CONFIG_VC1_D3D11VA_HWACCEL)        += dxva2_vc1.o
 OBJS-$(CONFIG_VC1_DXVA2_HWACCEL)          += dxva2_vc1.o
 OBJS-$(CONFIG_VC1_VAAPI_HWACCEL)          += vaapi_vc1.o
 OBJS-$(CONFIG_VC1_VDPAU_HWACCEL)          += vdpau_vc1.o
+OBJS-$(CONFIG_VP8_VAAPI_HWACCEL)          += vaapi_vp8.o
 
 # libavformat dependencies
 OBJS-$(CONFIG_ISO_MEDIA)               += mpeg4audio.o mpegaudiodata.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index e259de2510..41af38eb7c 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -94,6 +94,7 @@ void avcodec_register_all(void)
     REGISTER_HWACCEL(VC1_VAAPI,         vc1_vaapi);
     REGISTER_HWACCEL(VC1_VDPAU,         vc1_vdpau);
     REGISTER_HWACCEL(VC1_MMAL,          vc1_mmal);
+    REGISTER_HWACCEL(VP8_VAAPI,         vp8_vaapi);
     REGISTER_HWACCEL(WMV3_D3D11VA,      wmv3_d3d11va);
     REGISTER_HWACCEL(WMV3_DXVA2,        wmv3_dxva2);
     REGISTER_HWACCEL(WMV3_VAAPI,        wmv3_vaapi);
diff --git a/libavcodec/vaapi_vp8.c b/libavcodec/vaapi_vp8.c
new file mode 100644
index 0000000000..a130c04e1d
--- /dev/null
+++ b/libavcodec/vaapi_vp8.c
@@ -0,0 +1,231 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vaapi_decode.h"
+#include "vp8.h"
+
+/* VA surface backing a VP8 frame; a NULL frame gives VA_INVALID_SURFACE. */
+static VASurfaceID vaapi_vp8_surface_id(VP8Frame *vf)
+{
+    if (!vf)
+        return VA_INVALID_SURFACE;
+    return ff_vaapi_get_surface_id(vf->tf.f);
+}
+
+/**
+ * Build the picture parameter, probability and IQ matrix buffers for the
+ * current frame from the parsed header state in VP8Context.
+ * @return 0 on success; on error the picture is cancelled and err returned
+ */
+static int vaapi_vp8_start_frame(AVCodecContext          *avctx,
+                                 av_unused const uint8_t *buffer,
+                                 av_unused uint32_t       size)
+{
+    const VP8Context *s = avctx->priv_data;
+    VAAPIDecodePicture *pic = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private;
+    VAPictureParameterBufferVP8 pp;
+    VAProbabilityDataBufferVP8 prob;
+    VAIQMatrixBufferVP8 quant;
+    int err, i, j, k;
+
+    pic->output_surface = vaapi_vp8_surface_id(s->framep[VP56_FRAME_CURRENT]);
+
+    pp = (VAPictureParameterBufferVP8) {
+        .frame_width                     = avctx->width,
+        .frame_height                    = avctx->height,
+
+        .last_ref_frame                  = vaapi_vp8_surface_id(s->framep[VP56_FRAME_PREVIOUS]),
+        .golden_ref_frame                = vaapi_vp8_surface_id(s->framep[VP56_FRAME_GOLDEN]),
+        .alt_ref_frame                   = vaapi_vp8_surface_id(s->framep[VP56_FRAME_GOLDEN2]),
+        .out_of_loop_frame               = VA_INVALID_SURFACE,
+
+        .pic_fields.bits = {
+            .key_frame                   = !s->keyframe,
+            .version                     = s->profile,
+
+            .segmentation_enabled        = s->segmentation.enabled,
+            .update_mb_segmentation_map  = s->segmentation.update_map,
+            .update_segment_feature_data = s->segmentation.update_feature_data,
+
+            .filter_type                 = s->filter.simple,
+            .sharpness_level             = s->filter.sharpness,
+
+            .loop_filter_adj_enable      = s->lf_delta.enabled,
+            .mode_ref_lf_delta_update    = s->lf_delta.update,
+
+            .sign_bias_golden            = s->sign_bias[VP56_FRAME_GOLDEN],
+            .sign_bias_alternate         = s->sign_bias[VP56_FRAME_GOLDEN2],
+
+            .mb_no_coeff_skip            = s->mbskip_enabled,
+            .loop_filter_disable         = s->filter.level == 0,
+        },
+
+        .prob_skip_false                 = s->prob->mbskip,
+        .prob_intra                      = s->prob->intra,
+        .prob_last                       = s->prob->last,
+        .prob_gf                         = s->prob->golden,
+    };
+
+    for (i = 0; i < 3; i++)
+        pp.mb_segment_tree_probs[i] = s->prob->segmentid[i];
+
+    /* Sum in int: a negative segment delta must clip to 0, not wrap. */
+    for (i = 0; i < 4; i++) {
+        int level = s->filter.level;
+        if (s->segmentation.enabled) {
+            level = s->segmentation.filter_level[i];
+            if (!s->segmentation.absolute_vals)
+                level += s->filter.level;
+        }
+        pp.loop_filter_level[i] = av_clip_uintp2(level, 6);
+    }
+
+    for (i = 0; i < 4; i++) {
+        pp.loop_filter_deltas_ref_frame[i] = s->lf_delta.ref[i];
+        pp.loop_filter_deltas_mode[i] = s->lf_delta.mode[i + 4];
+    }
+
+    if (s->keyframe) {
+        static const uint8_t keyframe_y_mode_probs[4] = { 145, 156, 163, 128 };
+        static const uint8_t keyframe_uv_mode_probs[3] = { 142, 114, 183 };
+        memcpy(pp.y_mode_probs,  keyframe_y_mode_probs,  4);
+        memcpy(pp.uv_mode_probs, keyframe_uv_mode_probs, 3);
+    } else {
+        for (i = 0; i < 4; i++)
+            pp.y_mode_probs[i] = s->prob->pred16x16[i];
+        for (i = 0; i < 3; i++)
+            pp.uv_mode_probs[i] = s->prob->pred8x8c[i];
+    }
+    for (i = 0; i < 2; i++)
+        for (j = 0; j < 19; j++)
+            pp.mv_probs[i][j] = s->prob->mvc[i][j];
+
+    pp.bool_coder_ctx.range = s->coder_state_at_header_end.range;
+    pp.bool_coder_ctx.value = s->coder_state_at_header_end.value;
+    pp.bool_coder_ctx.count = s->coder_state_at_header_end.bit_count;
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
+                                            VAPictureParameterBufferType,
+                                            &pp, sizeof(pp));
+    if (err < 0)
+        goto fail;
+
+    for (i = 0; i < 4; i++) {
+        static const int coeff_bands_inverse[8] = { 0, 1, 2, 3, 5, 6, 4, 15 };
+        for (j = 0; j < 8; j++) {
+            for (k = 0; k < 3; k++)
+                memcpy(prob.dct_coeff_probs[i][j][k],
+                       s->prob->token[i][coeff_bands_inverse[j]][k], 11);
+        }
+    }
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
+                                            VAProbabilityBufferType,
+                                            &prob, sizeof(prob));
+    if (err < 0)
+        goto fail;
+
+    /* As in get_quants(): per-segment values only count when enabled. */
+    for (i = 0; i < 4; i++) {
+        int base_qi = s->quant.yac_qi;
+        if (s->segmentation.enabled) {
+            base_qi = s->segmentation.base_quant[i];
+            if (!s->segmentation.absolute_vals)
+                base_qi += s->quant.yac_qi;
+        }
+
+        quant.quantization_index[i][0] = av_clip_uintp2(base_qi,                       7);
+        quant.quantization_index[i][1] = av_clip_uintp2(base_qi + s->quant.ydc_delta,  7);
+        quant.quantization_index[i][2] = av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7);
+        quant.quantization_index[i][3] = av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7);
+        quant.quantization_index[i][4] = av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7);
+        quant.quantization_index[i][5] = av_clip_uintp2(base_qi + s->quant.uvac_delta, 7);
+    }
+
+    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
+                                            VAIQMatrixBufferType,
+                                            &quant, sizeof(quant));
+    if (err < 0)
+        goto fail;
+
+    return 0;
+
+fail:
+    ff_vaapi_decode_cancel(avctx, pic);
+    return err;
+}
+
+/* Pass the current frame's VAAPI picture to ff_vaapi_decode_issue(). */
+static int vaapi_vp8_end_frame(AVCodecContext *avctx)
+{
+    const VP8Context *s = avctx->priv_data;
+    const VP8Frame *cur = s->framep[VP56_FRAME_CURRENT];
+    return ff_vaapi_decode_issue(avctx, cur->hwaccel_picture_private);
+}
+
+/* Describe the data after the frame header as a single VAAPI slice and hand
+ * it to ff_vaapi_decode_make_slice_buffer(); on error the picture is
+ * cancelled and the error code returned. */
+static int vaapi_vp8_decode_slice(AVCodecContext *avctx,
+                                  const uint8_t  *buffer,
+                                  uint32_t        size)
+{
+    const VP8Context *s = avctx->priv_data;
+    VAAPIDecodePicture *pic = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private;
+    /* Leading bytes left out of the slice data: 3, or 10 on keyframes. */
+    const unsigned int skip = 3 + 7 * s->keyframe;
+    const uint8_t *payload = buffer + skip;
+    const unsigned int payload_size = size - skip;
+    VASliceParameterBufferVP8 sp;
+    int ret, part;
+
+    sp = (VASliceParameterBufferVP8) {
+        .slice_data_size   = payload_size,
+        .slice_data_offset = 0,
+        .slice_data_flag   = VA_SLICE_DATA_FLAG_ALL,
+        .macroblock_offset = (8 * (s->coder_state_at_header_end.input - payload) -
+                              s->coder_state_at_header_end.bit_count - 8),
+        .num_of_partitions = s->num_coeff_partitions + 1,
+    };
+
+    /* Slot 0: rest of the header partition; slots 1..8: coefficient sizes. */
+    sp.partition_size[0] = s->header_partition_size - ((sp.macroblock_offset + 7) / 8);
+    for (part = 1; part <= 8; part++)
+        sp.partition_size[part] = s->coeff_partition_size[part - 1];
+
+    ret = ff_vaapi_decode_make_slice_buffer(avctx, pic, &sp, sizeof(sp),
+                                            payload, payload_size);
+    if (!ret)
+        return 0;
+    ff_vaapi_decode_cancel(avctx, pic);
+    return ret;
+}
+
+AVHWAccel ff_vp8_vaapi_hwaccel = {
+    .name                 = "vp8_vaapi",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_VP8,
+    .pix_fmt              = AV_PIX_FMT_VAAPI,
+    .init                 = ff_vaapi_decode_init,
+    .uninit               = ff_vaapi_decode_uninit,
+    .priv_data_size       = sizeof(VAAPIDecodeContext),
+    .frame_priv_data_size = sizeof(VAAPIDecodePicture), /* see vp8_alloc_frame() */
+    .start_frame          = vaapi_vp8_start_frame,      /* parameter buffers */
+    .decode_slice         = vaapi_vp8_decode_slice,     /* slice data buffer */
+    .end_frame            = vaapi_vp8_end_frame,        /* issue the picture */
+};
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index ced49799bc..bf1b03e9f7 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -2516,6 +2516,9 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
         enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_VP8_VAAPI_HWACCEL
+            AV_PIX_FMT_VAAPI,
+#endif
             AV_PIX_FMT_YUV420P,
             AV_PIX_FMT_NONE,
         };

From 11c191b52ce0768370e38a2726132f9223e701f6 Mon Sep 17 00:00:00 2001
From: Mark Thompson <sw@jkqxz.net>
Date: Sun, 4 Sep 2016 13:33:15 +0100
Subject: [PATCH 3/4] vaapi_decode: Ignore the profile when not useful

Enables VP8 decoding - the decoder places the bitstream version
in the profile field, which we want to ignore.
---
 libavcodec/vaapi_decode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c
index 51b6d47a66..ab8445afc0 100644
--- a/libavcodec/vaapi_decode.c
+++ b/libavcodec/vaapi_decode.c
@@ -320,7 +320,8 @@ static int vaapi_decode_make_config(AVCodecContext *avctx)
         int profile_match = 0;
         if (avctx->codec_id != vaapi_profile_map[i].codec_id)
             continue;
-        if (avctx->profile == vaapi_profile_map[i].codec_profile)
+        if (avctx->profile == vaapi_profile_map[i].codec_profile ||
+            vaapi_profile_map[i].codec_profile == FF_PROFILE_UNKNOWN)
             profile_match = 1;
         profile = vaapi_profile_map[i].va_profile;
         for (j = 0; j < profile_count; j++) {

From 75d642a944d5579e4ef20ff3701422a64692afcf Mon Sep 17 00:00:00 2001
From: Mark Thompson <sw@jkqxz.net>
Date: Fri, 9 Sep 2016 15:59:13 +0100
Subject: [PATCH 4/4] vaapi_vp8: Explicitly include libva vp8 decode header

With some old libva versions <va/va.h> does not automatically include
the per-codec subsidiary headers, so we need to include the right one
explicitly ourselves.
---
 libavcodec/vaapi_vp8.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libavcodec/vaapi_vp8.c b/libavcodec/vaapi_vp8.c
index a130c04e1d..70e9cec3d4 100644
--- a/libavcodec/vaapi_vp8.c
+++ b/libavcodec/vaapi_vp8.c
@@ -16,6 +16,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <va/va.h>
+#include <va/va_dec_vp8.h>
+
 #include "vaapi_decode.h"
 #include "vp8.h"