1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-03 05:10:03 +02:00
FFmpeg/libavcodec/vp9dsp.c

2190 lines
83 KiB
C
Raw Normal View History

/*
* VP9 compatible video decoder
*
* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
* Copyright (C) 2013 Clément Bœsch <u pkh me>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "rnd_avg.h"
#include "vp9.h"
// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
// back with h264pred.[ch]
static void vert_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
unsigned p4 = AV_RN32A(top);
AV_WN32A(dst + stride * 0, p4);
AV_WN32A(dst + stride * 1, p4);
AV_WN32A(dst + stride * 2, p4);
AV_WN32A(dst + stride * 3, p4);
}
static void vert_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t p8 = AV_RN64A(top);
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, p8);
dst += stride;
}
}
static void vert_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t p8a = AV_RN64A(top + 0), p8b = AV_RN64A(top + 8);
int y;
for (y = 0; y < 16; y++) {
AV_WN64A(dst + 0, p8a);
AV_WN64A(dst + 8, p8b);
dst += stride;
}
}
static void vert_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t p8a = AV_RN64A(top + 0), p8b = AV_RN64A(top + 8),
p8c = AV_RN64A(top + 16), p8d = AV_RN64A(top + 24);
int y;
for (y = 0; y < 32; y++) {
AV_WN64A(dst + 0, p8a);
AV_WN64A(dst + 8, p8b);
AV_WN64A(dst + 16, p8c);
AV_WN64A(dst + 24, p8d);
dst += stride;
}
}
static void hor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
AV_WN32A(dst + stride * 0, left[0] * 0x01010101U);
AV_WN32A(dst + stride * 1, left[1] * 0x01010101U);
AV_WN32A(dst + stride * 2, left[2] * 0x01010101U);
AV_WN32A(dst + stride * 3, left[3] * 0x01010101U);
}
static void hor_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, left[y] * 0x0101010101010101ULL);
dst += stride;
}
}
static void hor_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 16; y++) {
uint64_t p8 = left[y] * 0x0101010101010101ULL;
AV_WN64A(dst + 0, p8);
AV_WN64A(dst + 8, p8);
dst += stride;
}
}
static void hor_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 32; y++) {
uint64_t p8 = left[y] * 0x0101010101010101ULL;
AV_WN64A(dst + 0, p8);
AV_WN64A(dst + 8, p8);
AV_WN64A(dst + 16, p8);
AV_WN64A(dst + 24, p8);
dst += stride;
}
}
static void tm_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y, tl = top[-1];
for (y = 0; y < 4; y++) {
int l_m_tl = left[y] - tl;
dst[0] = av_clip_uint8(top[0] + l_m_tl);
dst[1] = av_clip_uint8(top[1] + l_m_tl);
dst[2] = av_clip_uint8(top[2] + l_m_tl);
dst[3] = av_clip_uint8(top[3] + l_m_tl);
dst += stride;
}
}
static void tm_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y, tl = top[-1];
for (y = 0; y < 8; y++) {
int l_m_tl = left[y] - tl;
dst[0] = av_clip_uint8(top[0] + l_m_tl);
dst[1] = av_clip_uint8(top[1] + l_m_tl);
dst[2] = av_clip_uint8(top[2] + l_m_tl);
dst[3] = av_clip_uint8(top[3] + l_m_tl);
dst[4] = av_clip_uint8(top[4] + l_m_tl);
dst[5] = av_clip_uint8(top[5] + l_m_tl);
dst[6] = av_clip_uint8(top[6] + l_m_tl);
dst[7] = av_clip_uint8(top[7] + l_m_tl);
dst += stride;
}
}
static void tm_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y, tl = top[-1];
for (y = 0; y < 16; y++) {
int l_m_tl = left[y] - tl;
dst[0] = av_clip_uint8(top[0] + l_m_tl);
dst[1] = av_clip_uint8(top[1] + l_m_tl);
dst[2] = av_clip_uint8(top[2] + l_m_tl);
dst[3] = av_clip_uint8(top[3] + l_m_tl);
dst[4] = av_clip_uint8(top[4] + l_m_tl);
dst[5] = av_clip_uint8(top[5] + l_m_tl);
dst[6] = av_clip_uint8(top[6] + l_m_tl);
dst[7] = av_clip_uint8(top[7] + l_m_tl);
dst[8] = av_clip_uint8(top[8] + l_m_tl);
dst[9] = av_clip_uint8(top[9] + l_m_tl);
dst[10] = av_clip_uint8(top[10] + l_m_tl);
dst[11] = av_clip_uint8(top[11] + l_m_tl);
dst[12] = av_clip_uint8(top[12] + l_m_tl);
dst[13] = av_clip_uint8(top[13] + l_m_tl);
dst[14] = av_clip_uint8(top[14] + l_m_tl);
dst[15] = av_clip_uint8(top[15] + l_m_tl);
dst += stride;
}
}
static void tm_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y, tl = top[-1];
for (y = 0; y < 32; y++) {
int l_m_tl = left[y] - tl;
dst[0] = av_clip_uint8(top[0] + l_m_tl);
dst[1] = av_clip_uint8(top[1] + l_m_tl);
dst[2] = av_clip_uint8(top[2] + l_m_tl);
dst[3] = av_clip_uint8(top[3] + l_m_tl);
dst[4] = av_clip_uint8(top[4] + l_m_tl);
dst[5] = av_clip_uint8(top[5] + l_m_tl);
dst[6] = av_clip_uint8(top[6] + l_m_tl);
dst[7] = av_clip_uint8(top[7] + l_m_tl);
dst[8] = av_clip_uint8(top[8] + l_m_tl);
dst[9] = av_clip_uint8(top[9] + l_m_tl);
dst[10] = av_clip_uint8(top[10] + l_m_tl);
dst[11] = av_clip_uint8(top[11] + l_m_tl);
dst[12] = av_clip_uint8(top[12] + l_m_tl);
dst[13] = av_clip_uint8(top[13] + l_m_tl);
dst[14] = av_clip_uint8(top[14] + l_m_tl);
dst[15] = av_clip_uint8(top[15] + l_m_tl);
dst[16] = av_clip_uint8(top[16] + l_m_tl);
dst[17] = av_clip_uint8(top[17] + l_m_tl);
dst[18] = av_clip_uint8(top[18] + l_m_tl);
dst[19] = av_clip_uint8(top[19] + l_m_tl);
dst[20] = av_clip_uint8(top[20] + l_m_tl);
dst[21] = av_clip_uint8(top[21] + l_m_tl);
dst[22] = av_clip_uint8(top[22] + l_m_tl);
dst[23] = av_clip_uint8(top[23] + l_m_tl);
dst[24] = av_clip_uint8(top[24] + l_m_tl);
dst[25] = av_clip_uint8(top[25] + l_m_tl);
dst[26] = av_clip_uint8(top[26] + l_m_tl);
dst[27] = av_clip_uint8(top[27] + l_m_tl);
dst[28] = av_clip_uint8(top[28] + l_m_tl);
dst[29] = av_clip_uint8(top[29] + l_m_tl);
dst[30] = av_clip_uint8(top[30] + l_m_tl);
dst[31] = av_clip_uint8(top[31] + l_m_tl);
dst += stride;
}
}
static void dc_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
unsigned dc = 0x01010101U *
((left[0] + left[1] + left[2] + left[3] +
top[0] + top[1] + top[2] + top[3] + 4) >> 3);
AV_WN32A(dst + stride * 0, dc);
AV_WN32A(dst + stride * 1, dc);
AV_WN32A(dst + stride * 2, dc);
AV_WN32A(dst + stride * 3, dc);
}
static void dc_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((left[0] + left[1] + left[2] + left[3] +
left[4] + left[5] + left[6] + left[7] +
top[0] + top[1] + top[2] + top[3] +
top[4] + top[5] + top[6] + top[7] + 8) >> 4);
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, dc);
dst += stride;
}
}
static void dc_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((left[0] + left[1] + left[2] + left[3] +
left[4] + left[5] + left[6] + left[7] +
left[8] + left[9] + left[10] + left[11] +
left[12] + left[13] + left[14] + left[15] +
top[0] + top[1] + top[2] + top[3] +
top[4] + top[5] + top[6] + top[7] +
top[8] + top[9] + top[10] + top[11] +
top[12] + top[13] + top[14] + top[15] + 16) >> 5);
int y;
for (y = 0; y < 16; y++) {
AV_WN64A(dst + 0, dc);
AV_WN64A(dst + 8, dc);
dst += stride;
}
}
static void dc_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((left[0] + left[1] + left[2] + left[3] +
left[4] + left[5] + left[6] + left[7] +
left[8] + left[9] + left[10] + left[11] +
left[12] + left[13] + left[14] + left[15] +
left[16] + left[17] + left[18] + left[19] +
left[20] + left[21] + left[22] + left[23] +
left[24] + left[25] + left[26] + left[27] +
left[28] + left[29] + left[30] + left[31] +
top[0] + top[1] + top[2] + top[3] +
top[4] + top[5] + top[6] + top[7] +
top[8] + top[9] + top[10] + top[11] +
top[12] + top[13] + top[14] + top[15] +
top[16] + top[17] + top[18] + top[19] +
top[20] + top[21] + top[22] + top[23] +
top[24] + top[25] + top[26] + top[27] +
top[28] + top[29] + top[30] + top[31] + 32) >> 6);
int y;
for (y = 0; y < 32; y++) {
AV_WN64A(dst + 0, dc);
AV_WN64A(dst + 8, dc);
AV_WN64A(dst + 16, dc);
AV_WN64A(dst + 24, dc);
dst += stride;
}
}
static void dc_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
unsigned dc = 0x01010101U *
((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
AV_WN32A(dst + stride * 0, dc);
AV_WN32A(dst + stride * 1, dc);
AV_WN32A(dst + stride * 2, dc);
AV_WN32A(dst + stride * 3, dc);
}
static void dc_left_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((left[0] + left[1] + left[2] + left[3] +
left[4] + left[5] + left[6] + left[7] + 4) >> 3);
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, dc);
dst += stride;
}
}
static void dc_left_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((left[0] + left[1] + left[2] + left[3] +
left[4] + left[5] + left[6] + left[7] +
left[8] + left[9] + left[10] + left[11] +
left[12] + left[13] + left[14] + left[15] + 8) >> 4);
int y;
for (y = 0; y < 16; y++) {
AV_WN64A(dst + 0, dc);
AV_WN64A(dst + 8, dc);
dst += stride;
}
}
static void dc_left_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((left[0] + left[1] + left[2] + left[3] +
left[4] + left[5] + left[6] + left[7] +
left[8] + left[9] + left[10] + left[11] +
left[12] + left[13] + left[14] + left[15] +
left[16] + left[17] + left[18] + left[19] +
left[20] + left[21] + left[22] + left[23] +
left[24] + left[25] + left[26] + left[27] +
left[28] + left[29] + left[30] + left[31] + 16) >> 5);
int y;
for (y = 0; y < 32; y++) {
AV_WN64A(dst + 0, dc);
AV_WN64A(dst + 8, dc);
AV_WN64A(dst + 16, dc);
AV_WN64A(dst + 24, dc);
dst += stride;
}
}
static void dc_top_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
unsigned dc = 0x01010101U * ((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
AV_WN32A(dst + stride * 0, dc);
AV_WN32A(dst + stride * 1, dc);
AV_WN32A(dst + stride * 2, dc);
AV_WN32A(dst + stride * 3, dc);
}
static void dc_top_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((top[0] + top[1] + top[2] + top[3] +
top[4] + top[5] + top[6] + top[7] + 4) >> 3);
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, dc);
dst += stride;
}
}
static void dc_top_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((top[0] + top[1] + top[2] + top[3] +
top[4] + top[5] + top[6] + top[7] +
top[8] + top[9] + top[10] + top[11] +
top[12] + top[13] + top[14] + top[15] + 8) >> 4);
int y;
for (y = 0; y < 16; y++) {
AV_WN64A(dst + 0, dc);
AV_WN64A(dst + 8, dc);
dst += stride;
}
}
static void dc_top_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
uint64_t dc = 0x0101010101010101ULL *
((top[0] + top[1] + top[2] + top[3] +
top[4] + top[5] + top[6] + top[7] +
top[8] + top[9] + top[10] + top[11] +
top[12] + top[13] + top[14] + top[15] +
top[16] + top[17] + top[18] + top[19] +
top[20] + top[21] + top[22] + top[23] +
top[24] + top[25] + top[26] + top[27] +
top[28] + top[29] + top[30] + top[31] + 16) >> 5);
int y;
for (y = 0; y < 32; y++) {
AV_WN64A(dst + 0, dc);
AV_WN64A(dst + 8, dc);
AV_WN64A(dst + 16, dc);
AV_WN64A(dst + 24, dc);
dst += stride;
}
}
static void dc_128_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
AV_WN32A(dst + stride * 0, 0x80808080U);
AV_WN32A(dst + stride * 1, 0x80808080U);
AV_WN32A(dst + stride * 2, 0x80808080U);
AV_WN32A(dst + stride * 3, 0x80808080U);
}
static void dc_128_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, 0x8080808080808080ULL);
dst += stride;
}
}
static void dc_128_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 16; y++) {
AV_WN64A(dst + 0, 0x8080808080808080ULL);
AV_WN64A(dst + 8, 0x8080808080808080ULL);
dst += stride;
}
}
static void dc_128_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 32; y++) {
AV_WN64A(dst + 0, 0x8080808080808080ULL);
AV_WN64A(dst + 8, 0x8080808080808080ULL);
AV_WN64A(dst + 16, 0x8080808080808080ULL);
AV_WN64A(dst + 24, 0x8080808080808080ULL);
dst += stride;
}
}
static void dc_127_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
AV_WN32A(dst + stride * 0, 0x7F7F7F7FU);
AV_WN32A(dst + stride * 1, 0x7F7F7F7FU);
AV_WN32A(dst + stride * 2, 0x7F7F7F7FU);
AV_WN32A(dst + stride * 3, 0x7F7F7F7FU);
}
static void dc_127_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, 0x7F7F7F7F7F7F7F7FULL);
dst += stride;
}
}
static void dc_127_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 16; y++) {
AV_WN64A(dst + 0, 0x7F7F7F7F7F7F7F7FULL);
AV_WN64A(dst + 8, 0x7F7F7F7F7F7F7F7FULL);
dst += stride;
}
}
static void dc_127_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 32; y++) {
AV_WN64A(dst + 0, 0x7F7F7F7F7F7F7F7FULL);
AV_WN64A(dst + 8, 0x7F7F7F7F7F7F7F7FULL);
AV_WN64A(dst + 16, 0x7F7F7F7F7F7F7F7FULL);
AV_WN64A(dst + 24, 0x7F7F7F7F7F7F7F7FULL);
dst += stride;
}
}
static void dc_129_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
AV_WN32A(dst + stride * 0, 0x81818181U);
AV_WN32A(dst + stride * 1, 0x81818181U);
AV_WN32A(dst + stride * 2, 0x81818181U);
AV_WN32A(dst + stride * 3, 0x81818181U);
}
static void dc_129_8x8_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 8; y++) {
AV_WN64A(dst, 0x8181818181818181ULL);
dst += stride;
}
}
static void dc_129_16x16_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 16; y++) {
AV_WN64A(dst + 0, 0x8181818181818181ULL);
AV_WN64A(dst + 8, 0x8181818181818181ULL);
dst += stride;
}
}
static void dc_129_32x32_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int y;
for (y = 0; y < 32; y++) {
AV_WN64A(dst + 0, 0x8181818181818181ULL);
AV_WN64A(dst + 8, 0x8181818181818181ULL);
AV_WN64A(dst + 16, 0x8181818181818181ULL);
AV_WN64A(dst + 24, 0x8181818181818181ULL);
dst += stride;
}
}
#define DST(x, y) dst[(x) + (y) * stride]
static void diag_downleft_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
DST(0, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
DST(1, 0) =
DST(0, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
DST(2, 0) =
DST(1, 1) =
DST(0, 2) = (a2 + a3 * 2 + a4 + 2) >> 2;
DST(3, 0) =
DST(2, 1) =
DST(1, 2) =
DST(0, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
DST(3, 1) =
DST(2, 2) =
DST(1, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
DST(3, 2) =
DST(2, 3) = (a5 + a6 * 2 + a7 + 2) >> 2;
DST(3, 3) = a7; // note: this is different from vp8 and such
}
#define def_diag_downleft(size) \
static void diag_downleft_ ## size ## x ## size ## _c(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *left, \
const uint8_t *top) \
{ \
int i, j; \
uint8_t v[size - 1]; \
\
for (i = 0; i < size - 2; i++) \
v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
for (j = 0; j < size; j++) { \
memcpy(dst + j * stride, v + j, size - 1 - j); \
memset(dst + j * stride + size - 1 - j, top[size - 1], j + 1); \
} \
}
def_diag_downleft(8)
def_diag_downleft(16)
def_diag_downleft(32)
static void diag_downright_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
DST(0, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
DST(0, 2) =
DST(1, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
DST(0, 1) =
DST(1, 2) =
DST(2, 3) = (tl + l0 * 2 + l1 + 2) >> 2;
DST(0, 0) =
DST(1, 1) =
DST(2, 2) =
DST(3, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
DST(1, 0) =
DST(2, 1) =
DST(3, 2) = (tl + a0 * 2 + a1 + 2) >> 2;
DST(2, 0) =
DST(3, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
DST(3, 0) = (a1 + a2 * 2 + a3 + 2) >> 2;
}
#define def_diag_downright(size) \
static void diag_downright_ ## size ## x ## size ## _c(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *left, \
const uint8_t *top) \
{ \
int i, j; \
uint8_t v[size + size - 1]; \
\
for (i = 0; i < size - 2; i++) { \
v[i] = (left[size - 1 - i] + \
left[size - 2 - i] * 2 + \
left[size - 3 - i] + 2) >> 2; \
v[size + 1 + i] = (top[i] + \
top[i + 1] * 2 + \
top[i + 2] + 2) >> 2; \
} \
v[size - 2] = (left[1] + left[0] * 2 + top[-1] + 2) >> 2; \
v[size - 1] = (left[0] + top[-1] * 2 + top[0] + 2) >> 2; \
v[size] = (top[-1] + top[0] * 2 + top[1] + 2) >> 2; \
\
for (j = 0; j < size; j++) \
memcpy(dst + j * stride, v + size - 1 - j, size); \
}
def_diag_downright(8)
def_diag_downright(16)
def_diag_downright(32)
static void vert_right_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
l0 = left[0], l1 = left[1], l2 = left[2];
DST(0, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
DST(0, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
DST(0, 0) =
DST(1, 2) = (tl + a0 + 1) >> 1;
DST(0, 1) =
DST(1, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
DST(1, 0) =
DST(2, 2) = (a0 + a1 + 1) >> 1;
DST(1, 1) =
DST(2, 3) = (tl + a0 * 2 + a1 + 2) >> 2;
DST(2, 0) =
DST(3, 2) = (a1 + a2 + 1) >> 1;
DST(2, 1) =
DST(3, 3) = (a0 + a1 * 2 + a2 + 2) >> 2;
DST(3, 0) = (a2 + a3 + 1) >> 1;
DST(3, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
}
#define def_vert_right(size) \
static void vert_right_ ## size ## x ## size ## _c(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *left, \
const uint8_t *top) \
{ \
int i, j; \
uint8_t ve[size + size / 2 - 1], vo[size + size / 2 - 1]; \
\
for (i = 0; i < size / 2 - 2; i++) { \
vo[i] = (left[size - 4 - i * 2] + \
left[size - 3 - i * 2] * 2 + \
left[size - 2 - i * 2] + 2) >> 2; \
ve[i] = (left[size - 5 - i * 2] + \
left[size - 4 - i * 2] * 2 + \
left[size - 3 - i * 2] + 2) >> 2; \
} \
vo[size / 2 - 2] = (left[0] + left[1] * 2 + left[2] + 2) >> 2; \
ve[size / 2 - 2] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2; \
\
ve[size / 2 - 1] = (top[-1] + top[0] + 1) >> 1; \
vo[size / 2 - 1] = (left[0] + top[-1] * 2 + top[0] + 2) >> 2; \
for (i = 0; i < size - 1; i++) { \
ve[size / 2 + i] = (top[i] + top[i + 1] + 1) >> 1; \
vo[size / 2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
} \
\
for (j = 0; j < size / 2; j++) { \
memcpy(dst + j * 2 * stride, ve + size / 2 - 1 - j, size); \
memcpy(dst + (j * 2 + 1) * stride, vo + size / 2 - 1 - j, size); \
} \
}
def_vert_right(8)
def_vert_right(16)
def_vert_right(32)
static void hor_down_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3],
tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
DST(2, 0) = (tl + a0 * 2 + a1 + 2) >> 2;
DST(3, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
DST(0, 0) =
DST(2, 1) = (tl + l0 + 1) >> 1;
DST(1, 0) =
DST(3, 1) = (a0 + tl * 2 + l0 + 2) >> 2;
DST(0, 1) =
DST(2, 2) = (l0 + l1 + 1) >> 1;
DST(1, 1) =
DST(3, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
DST(0, 2) =
DST(2, 3) = (l1 + l2 + 1) >> 1;
DST(1, 2) =
DST(3, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
DST(0, 3) = (l2 + l3 + 1) >> 1;
DST(1, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
}
#define def_hor_down(size) \
static void hor_down_ ## size ## x ## size ## _c(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *left, \
const uint8_t *top) \
{ \
int i, j; \
uint8_t v[size * 3 - 2]; \
\
for (i = 0; i < size - 2; i++) { \
v[i * 2] = (left[size - 2 - i] + \
left[size - 1 - i] + 1) >> 1; \
v[i * 2 + 1] = (left[size - 3 - i] + \
left[size - 2 - i] * 2 + \
left[size - 1 - i] + 2) >> 2; \
v[size * 2 + i] = (top[i - 1] + \
top[i] * 2 + \
top[i + 1] + 2) >> 2; \
} \
v[size * 2 - 2] = (top[-1] + left[0] + 1) >> 1; \
v[size * 2 - 4] = (left[0] + left[1] + 1) >> 1; \
v[size * 2 - 1] = (top[0] + top[-1] * 2 + left[0] + 2) >> 2; \
v[size * 2 - 3] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2; \
\
for (j = 0; j < size; j++) \
memcpy(dst + j * stride, v + size * 2 - 2 - j * 2, size); \
}
def_hor_down(8)
def_hor_down(16)
def_hor_down(32)
static void vert_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
a4 = top[4], a5 = top[5], a6 = top[6];
DST(0, 0) = (a0 + a1 + 1) >> 1;
DST(0, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
DST(1, 0) =
DST(0, 2) = (a1 + a2 + 1) >> 1;
DST(1, 1) =
DST(0, 3) = (a1 + a2 * 2 + a3 + 2) >> 2;
DST(2, 0) =
DST(1, 2) = (a2 + a3 + 1) >> 1;
DST(2, 1) =
DST(1, 3) = (a2 + a3 * 2 + a4 + 2) >> 2;
DST(3, 0) =
DST(2, 2) = (a3 + a4 + 1) >> 1;
DST(3, 1) =
DST(2, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
DST(3, 2) = (a4 + a5 + 1) >> 1;
DST(3, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
}
#define def_vert_left(size) \
static void vert_left_ ## size ## x ## size ## _c(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *left, \
const uint8_t *top) \
{ \
int i, j; \
uint8_t ve[size - 1], vo[size - 1]; \
\
for (i = 0; i < size - 2; i++) { \
ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
} \
ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
for (j = 0; j < size / 2; j++) { \
memcpy(dst + j * 2 * stride, ve + j, size - (j + 1)); \
memset(dst + j * 2 * stride + size - j - 1, \
top[size - 1], j + 1); \
memcpy(dst + (j * 2 + 1) * stride, vo + j, size - (j + 1)); \
memset(dst + (j * 2 + 1) * stride + size - j - 1, \
top[size - 1], j + 1); \
} \
}
def_vert_left(8)
def_vert_left(16)
def_vert_left(32)
static void hor_up_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *left, const uint8_t *top)
{
int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
DST(0, 0) = (l0 + l1 + 1) >> 1;
DST(1, 0) = (l0 + l1 * 2 + l2 + 2) >> 2;
DST(0, 1) =
DST(2, 0) = (l1 + l2 + 1) >> 1;
DST(1, 1) =
DST(3, 0) = (l1 + l2 * 2 + l3 + 2) >> 2;
DST(0, 2) =
DST(2, 1) = (l2 + l3 + 1) >> 1;
DST(1, 2) =
DST(3, 1) = (l2 + l3 * 3 + 2) >> 2;
DST(0, 3) =
DST(1, 3) =
DST(2, 2) =
DST(2, 3) =
DST(3, 2) =
DST(3, 3) = l3;
}
#define def_hor_up(size) \
static void hor_up_ ## size ## x ## size ## _c(uint8_t *dst, \
ptrdiff_t stride, \
const uint8_t *left, \
const uint8_t *top) \
{ \
int i, j; \
uint8_t v[size * 2 - 2]; \
\
for (i = 0; i < size - 2; i++) { \
v[i * 2] = (left[i] + left[i + 1] + 1) >> 1; \
v[i * 2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
} \
v[size * 2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
v[size * 2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
\
for (j = 0; j < size / 2; j++) \
memcpy(dst + j * stride, v + j * 2, size); \
for (j = size / 2; j < size; j++) { \
memcpy(dst + j * stride, v + j * 2, size * 2 - 2 - j * 2); \
memset(dst + j * stride + size * 2 - 2 - j * 2, left[size - 1], \
2 + j * 2 - size); \
} \
}
def_hor_up(8)
def_hor_up(16)
def_hor_up(32)
#undef DST
static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp)
{
#define init_intra_pred(tx, sz) \
dsp->intra_pred[tx][VERT_PRED] = vert_ ## sz ## _c; \
dsp->intra_pred[tx][HOR_PRED] = hor_ ## sz ## _c; \
dsp->intra_pred[tx][DC_PRED] = dc_ ## sz ## _c; \
dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = diag_downleft_ ## sz ## _c; \
dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_ ## sz ## _c; \
dsp->intra_pred[tx][VERT_RIGHT_PRED] = vert_right_ ## sz ## _c; \
dsp->intra_pred[tx][HOR_DOWN_PRED] = hor_down_ ## sz ## _c; \
dsp->intra_pred[tx][VERT_LEFT_PRED] = vert_left_ ## sz ## _c; \
dsp->intra_pred[tx][HOR_UP_PRED] = hor_up_ ## sz ## _c; \
dsp->intra_pred[tx][TM_VP8_PRED] = tm_ ## sz ## _c; \
dsp->intra_pred[tx][LEFT_DC_PRED] = dc_left_ ## sz ## _c; \
dsp->intra_pred[tx][TOP_DC_PRED] = dc_top_ ## sz ## _c; \
dsp->intra_pred[tx][DC_128_PRED] = dc_128_ ## sz ## _c; \
dsp->intra_pred[tx][DC_127_PRED] = dc_127_ ## sz ## _c; \
dsp->intra_pred[tx][DC_129_PRED] = dc_129_ ## sz ## _c
init_intra_pred(TX_4X4, 4x4);
init_intra_pred(TX_8X8, 8x8);
init_intra_pred(TX_16X16, 16x16);
init_intra_pred(TX_32X32, 32x32);
#undef init_intra_pred
}
#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
static void \
type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst, \
ptrdiff_t stride, \
int16_t *block, \
int eob) \
{ \
int i, j; \
int16_t tmp[sz * sz], out[sz]; \
\
if (has_dconly && eob == 1) { \
const int t = (((block[0] * 11585 + (1 << 13)) >> 14) \
* 11585 + (1 << 13)) >> 14; \
block[0] = 0; \
for (i = 0; i < sz; i++) { \
for (j = 0; j < sz; j++) \
dst[j * stride] = \
av_clip_uint8(dst[j * stride] + \
(bits ? (t + (1 << (bits - 1))) >> bits \
: t)); \
dst++; \
} \
return; \
} \
\
for (i = 0; i < sz; i++) \
type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0); \
memset(block, 0, sz * sz * sizeof(*block)); \
for (i = 0; i < sz; i++) { \
type_b ## sz ## _1d(out, tmp + i, sz, 1); \
for (j = 0; j < sz; j++) \
dst[j * stride] = \
av_clip_uint8(dst[j * stride] + \
(bits ? (out[j] + (1 << (bits - 1))) >> bits \
: out[j])); \
dst++; \
} \
}
#define itxfm_wrap(sz, bits) \
itxfm_wrapper(idct, idct, sz, bits, 1) \
itxfm_wrapper(iadst, idct, sz, bits, 0) \
itxfm_wrapper(idct, iadst, sz, bits, 0) \
itxfm_wrapper(iadst, iadst, sz, bits, 0)
#define IN(x) in[x * stride]
static av_always_inline void idct4_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0, t1, t2, t3;
t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14;
t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
t2 = (IN(1) * 6270 - IN(3) * 15137 + (1 << 13)) >> 14;
t3 = (IN(1) * 15137 + IN(3) * 6270 + (1 << 13)) >> 14;
out[0] = t0 + t3;
out[1] = t1 + t2;
out[2] = t1 - t2;
out[3] = t0 - t3;
}
static av_always_inline void iadst4_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0, t1, t2, t3;
t0 = 5283 * IN(0) + 15212 * IN(2) + 9929 * IN(3);
t1 = 9929 * IN(0) - 5283 * IN(2) - 15212 * IN(3);
t2 = 13377 * (IN(0) - IN(2) + IN(3));
t3 = 13377 * IN(1);
out[0] = (t0 + t3 + (1 << 13)) >> 14;
out[1] = (t1 + t3 + (1 << 13)) >> 14;
out[2] = (t2 + (1 << 13)) >> 14;
out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
}
itxfm_wrap(4, 4)
static av_always_inline void idct8_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
t2a = (IN(2) * 6270 - IN(6) * 15137 + (1 << 13)) >> 14;
t3a = (IN(2) * 15137 + IN(6) * 6270 + (1 << 13)) >> 14;
t4a = (IN(1) * 3196 - IN(7) * 16069 + (1 << 13)) >> 14;
t5a = (IN(5) * 13623 - IN(3) * 9102 + (1 << 13)) >> 14;
t6a = (IN(5) * 9102 + IN(3) * 13623 + (1 << 13)) >> 14;
t7a = (IN(1) * 16069 + IN(7) * 3196 + (1 << 13)) >> 14;
t0 = t0a + t3a;
t1 = t1a + t2a;
t2 = t1a - t2a;
t3 = t0a - t3a;
t4 = t4a + t5a;
t5a = t4a - t5a;
t7 = t7a + t6a;
t6a = t7a - t6a;
t5 = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
t6 = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
out[0] = t0 + t7;
out[1] = t1 + t6;
out[2] = t2 + t5;
out[3] = t3 + t4;
out[4] = t3 - t4;
out[5] = t2 - t5;
out[6] = t1 - t6;
out[7] = t0 - t7;
}
static av_always_inline void iadst8_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
t0a = 16305 * IN(7) + 1606 * IN(0);
t1a = 1606 * IN(7) - 16305 * IN(0);
t2a = 14449 * IN(5) + 7723 * IN(2);
t3a = 7723 * IN(5) - 14449 * IN(2);
t4a = 10394 * IN(3) + 12665 * IN(4);
t5a = 12665 * IN(3) - 10394 * IN(4);
t6a = 4756 * IN(1) + 15679 * IN(6);
t7a = 15679 * IN(1) - 4756 * IN(6);
t0 = (t0a + t4a + (1 << 13)) >> 14;
t1 = (t1a + t5a + (1 << 13)) >> 14;
t2 = (t2a + t6a + (1 << 13)) >> 14;
t3 = (t3a + t7a + (1 << 13)) >> 14;
t4 = (t0a - t4a + (1 << 13)) >> 14;
t5 = (t1a - t5a + (1 << 13)) >> 14;
t6 = (t2a - t6a + (1 << 13)) >> 14;
t7 = (t3a - t7a + (1 << 13)) >> 14;
t4a = 15137 * t4 + 6270 * t5;
t5a = 6270 * t4 - 15137 * t5;
t6a = 15137 * t7 - 6270 * t6;
t7a = 6270 * t7 + 15137 * t6;
out[0] = t0 + t2;
out[7] = -(t1 + t3);
t2 = t0 - t2;
t3 = t1 - t3;
out[1] = -((t4a + t6a + (1 << 13)) >> 14);
out[6] = (t5a + t7a + (1 << 13)) >> 14;
t6 = (t4a - t6a + (1 << 13)) >> 14;
t7 = (t5a - t7a + (1 << 13)) >> 14;
out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
out[4] = ((t2 - t3) * 11585 + (1 << 13)) >> 14;
out[2] = ((t6 + t7) * 11585 + (1 << 13)) >> 14;
out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
}
itxfm_wrap(8, 5)
static av_always_inline void idct16_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
t0a = ((IN(0) + IN(8)) * 11585 + (1 << 13)) >> 14;
t1a = ((IN(0) - IN(8)) * 11585 + (1 << 13)) >> 14;
t2a = (IN(4) * 6270 - IN(12) * 15137 + (1 << 13)) >> 14;
t3a = (IN(4) * 15137 + IN(12) * 6270 + (1 << 13)) >> 14;
t4a = (IN(2) * 3196 - IN(14) * 16069 + (1 << 13)) >> 14;
t7a = (IN(2) * 16069 + IN(14) * 3196 + (1 << 13)) >> 14;
t5a = (IN(10) * 13623 - IN(6) * 9102 + (1 << 13)) >> 14;
t6a = (IN(10) * 9102 + IN(6) * 13623 + (1 << 13)) >> 14;
t8a = (IN(1) * 1606 - IN(15) * 16305 + (1 << 13)) >> 14;
t15a = (IN(1) * 16305 + IN(15) * 1606 + (1 << 13)) >> 14;
t9a = (IN(9) * 12665 - IN(7) * 10394 + (1 << 13)) >> 14;
t14a = (IN(9) * 10394 + IN(7) * 12665 + (1 << 13)) >> 14;
t10a = (IN(5) * 7723 - IN(11) * 14449 + (1 << 13)) >> 14;
t13a = (IN(5) * 14449 + IN(11) * 7723 + (1 << 13)) >> 14;
t11a = (IN(13) * 15679 - IN(3) * 4756 + (1 << 13)) >> 14;
t12a = (IN(13) * 4756 + IN(3) * 15679 + (1 << 13)) >> 14;
t0 = t0a + t3a;
t1 = t1a + t2a;
t2 = t1a - t2a;
t3 = t0a - t3a;
t4 = t4a + t5a;
t5 = t4a - t5a;
t6 = t7a - t6a;
t7 = t7a + t6a;
t8 = t8a + t9a;
t9 = t8a - t9a;
t10 = t11a - t10a;
t11 = t11a + t10a;
t12 = t12a + t13a;
t13 = t12a - t13a;
t14 = t15a - t14a;
t15 = t15a + t14a;
t5a = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
t6a = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
t9a = (t14 * 6270 - t9 * 15137 + (1 << 13)) >> 14;
t14a = (t14 * 15137 + t9 * 6270 + (1 << 13)) >> 14;
t10a = (-(t13 * 15137 + t10 * 6270) + (1 << 13)) >> 14;
t13a = (t13 * 6270 - t10 * 15137 + (1 << 13)) >> 14;
t0a = t0 + t7;
t1a = t1 + t6a;
t2a = t2 + t5a;
t3a = t3 + t4;
t4 = t3 - t4;
t5 = t2 - t5a;
t6 = t1 - t6a;
t7 = t0 - t7;
t8a = t8 + t11;
t9 = t9a + t10a;
t10 = t9a - t10a;
t11a = t8 - t11;
t12a = t15 - t12;
t13 = t14a - t13a;
t14 = t14a + t13a;
t15a = t15 + t12;
t10a = ((t13 - t10) * 11585 + (1 << 13)) >> 14;
t13a = ((t13 + t10) * 11585 + (1 << 13)) >> 14;
t11 = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
t12 = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
out[0] = t0a + t15a;
out[1] = t1a + t14;
out[2] = t2a + t13a;
out[3] = t3a + t12;
out[4] = t4 + t11;
out[5] = t5 + t10a;
out[6] = t6 + t9;
out[7] = t7 + t8a;
out[8] = t7 - t8a;
out[9] = t6 - t9;
out[10] = t5 - t10a;
out[11] = t4 - t11;
out[12] = t3a - t12;
out[13] = t2a - t13a;
out[14] = t1a - t14;
out[15] = t0a - t15a;
}
static av_always_inline void iadst16_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
t0 = IN(15) * 16364 + IN(0) * 804;
t1 = IN(15) * 804 - IN(0) * 16364;
t2 = IN(13) * 15893 + IN(2) * 3981;
t3 = IN(13) * 3981 - IN(2) * 15893;
t4 = IN(11) * 14811 + IN(4) * 7005;
t5 = IN(11) * 7005 - IN(4) * 14811;
t6 = IN(9) * 13160 + IN(6) * 9760;
t7 = IN(9) * 9760 - IN(6) * 13160;
t8 = IN(7) * 11003 + IN(8) * 12140;
t9 = IN(7) * 12140 - IN(8) * 11003;
t10 = IN(5) * 8423 + IN(10) * 14053;
t11 = IN(5) * 14053 - IN(10) * 8423;
t12 = IN(3) * 5520 + IN(12) * 15426;
t13 = IN(3) * 15426 - IN(12) * 5520;
t14 = IN(1) * 2404 + IN(14) * 16207;
t15 = IN(1) * 16207 - IN(14) * 2404;
t0a = (t0 + t8 + (1 << 13)) >> 14;
t1a = (t1 + t9 + (1 << 13)) >> 14;
t2a = (t2 + t10 + (1 << 13)) >> 14;
t3a = (t3 + t11 + (1 << 13)) >> 14;
t4a = (t4 + t12 + (1 << 13)) >> 14;
t5a = (t5 + t13 + (1 << 13)) >> 14;
t6a = (t6 + t14 + (1 << 13)) >> 14;
t7a = (t7 + t15 + (1 << 13)) >> 14;
t8a = (t0 - t8 + (1 << 13)) >> 14;
t9a = (t1 - t9 + (1 << 13)) >> 14;
t10a = (t2 - t10 + (1 << 13)) >> 14;
t11a = (t3 - t11 + (1 << 13)) >> 14;
t12a = (t4 - t12 + (1 << 13)) >> 14;
t13a = (t5 - t13 + (1 << 13)) >> 14;
t14a = (t6 - t14 + (1 << 13)) >> 14;
t15a = (t7 - t15 + (1 << 13)) >> 14;
t8 = t8a * 16069 + t9a * 3196;
t9 = t8a * 3196 - t9a * 16069;
t10 = t10a * 9102 + t11a * 13623;
t11 = t10a * 13623 - t11a * 9102;
t12 = t13a * 16069 - t12a * 3196;
t13 = t13a * 3196 + t12a * 16069;
t14 = t15a * 9102 - t14a * 13623;
t15 = t15a * 13623 + t14a * 9102;
t0 = t0a + t4a;
t1 = t1a + t5a;
t2 = t2a + t6a;
t3 = t3a + t7a;
t4 = t0a - t4a;
t5 = t1a - t5a;
t6 = t2a - t6a;
t7 = t3a - t7a;
t8a = (t8 + t12 + (1 << 13)) >> 14;
t9a = (t9 + t13 + (1 << 13)) >> 14;
t10a = (t10 + t14 + (1 << 13)) >> 14;
t11a = (t11 + t15 + (1 << 13)) >> 14;
t12a = (t8 - t12 + (1 << 13)) >> 14;
t13a = (t9 - t13 + (1 << 13)) >> 14;
t14a = (t10 - t14 + (1 << 13)) >> 14;
t15a = (t11 - t15 + (1 << 13)) >> 14;
t4a = t4 * 15137 + t5 * 6270;
t5a = t4 * 6270 - t5 * 15137;
t6a = t7 * 15137 - t6 * 6270;
t7a = t7 * 6270 + t6 * 15137;
t12 = t12a * 15137 + t13a * 6270;
t13 = t12a * 6270 - t13a * 15137;
t14 = t15a * 15137 - t14a * 6270;
t15 = t15a * 6270 + t14a * 15137;
out[0] = t0 + t2;
out[15] = -(t1 + t3);
t2a = t0 - t2;
t3a = t1 - t3;
out[3] = -((t4a + t6a + (1 << 13)) >> 14);
out[12] = (t5a + t7a + (1 << 13)) >> 14;
t6 = (t4a - t6a + (1 << 13)) >> 14;
t7 = (t5a - t7a + (1 << 13)) >> 14;
out[1] = -(t8a + t10a);
out[14] = t9a + t11a;
t10 = t8a - t10a;
t11 = t9a - t11a;
out[2] = (t12 + t14 + (1 << 13)) >> 14;
out[13] = -((t13 + t15 + (1 << 13)) >> 14);
t14a = (t12 - t14 + (1 << 13)) >> 14;
t15a = (t13 - t15 + (1 << 13)) >> 14;
out[7] = ((t2a + t3a) * -11585 + (1 << 13)) >> 14;
out[8] = ((t2a - t3a) * 11585 + (1 << 13)) >> 14;
out[4] = ((t7 + t6) * 11585 + (1 << 13)) >> 14;
out[11] = ((t7 - t6) * 11585 + (1 << 13)) >> 14;
out[6] = ((t11 + t10) * 11585 + (1 << 13)) >> 14;
out[9] = ((t11 - t10) * 11585 + (1 << 13)) >> 14;
out[5] = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
out[10] = ((t14a - t15a) * 11585 + (1 << 13)) >> 14;
}
itxfm_wrap(16, 6)
static av_always_inline void idct32_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0a = ((IN(0) + IN(16)) * 11585 + (1 << 13)) >> 14;
int t1a = ((IN(0) - IN(16)) * 11585 + (1 << 13)) >> 14;
int t2a = (IN(8) * 6270 - IN(24) * 15137 + (1 << 13)) >> 14;
int t3a = (IN(8) * 15137 + IN(24) * 6270 + (1 << 13)) >> 14;
int t4a = (IN(4) * 3196 - IN(28) * 16069 + (1 << 13)) >> 14;
int t7a = (IN(4) * 16069 + IN(28) * 3196 + (1 << 13)) >> 14;
int t5a = (IN(20) * 13623 - IN(12) * 9102 + (1 << 13)) >> 14;
int t6a = (IN(20) * 9102 + IN(12) * 13623 + (1 << 13)) >> 14;
int t8a = (IN(2) * 1606 - IN(30) * 16305 + (1 << 13)) >> 14;
int t15a = (IN(2) * 16305 + IN(30) * 1606 + (1 << 13)) >> 14;
int t9a = (IN(18) * 12665 - IN(14) * 10394 + (1 << 13)) >> 14;
int t14a = (IN(18) * 10394 + IN(14) * 12665 + (1 << 13)) >> 14;
int t10a = (IN(10) * 7723 - IN(22) * 14449 + (1 << 13)) >> 14;
int t13a = (IN(10) * 14449 + IN(22) * 7723 + (1 << 13)) >> 14;
int t11a = (IN(26) * 15679 - IN(6) * 4756 + (1 << 13)) >> 14;
int t12a = (IN(26) * 4756 + IN(6) * 15679 + (1 << 13)) >> 14;
int t16a = (IN(1) * 804 - IN(31) * 16364 + (1 << 13)) >> 14;
int t31a = (IN(1) * 16364 + IN(31) * 804 + (1 << 13)) >> 14;
int t17a = (IN(17) * 12140 - IN(15) * 11003 + (1 << 13)) >> 14;
int t30a = (IN(17) * 11003 + IN(15) * 12140 + (1 << 13)) >> 14;
int t18a = (IN(9) * 7005 - IN(23) * 14811 + (1 << 13)) >> 14;
int t29a = (IN(9) * 14811 + IN(23) * 7005 + (1 << 13)) >> 14;
int t19a = (IN(25) * 15426 - IN(7) * 5520 + (1 << 13)) >> 14;
int t28a = (IN(25) * 5520 + IN(7) * 15426 + (1 << 13)) >> 14;
int t20a = (IN(5) * 3981 - IN(27) * 15893 + (1 << 13)) >> 14;
int t27a = (IN(5) * 15893 + IN(27) * 3981 + (1 << 13)) >> 14;
int t21a = (IN(21) * 14053 - IN(11) * 8423 + (1 << 13)) >> 14;
int t26a = (IN(21) * 8423 + IN(11) * 14053 + (1 << 13)) >> 14;
int t22a = (IN(13) * 9760 - IN(19) * 13160 + (1 << 13)) >> 14;
int t25a = (IN(13) * 13160 + IN(19) * 9760 + (1 << 13)) >> 14;
int t23a = (IN(29) * 16207 - IN(3) * 2404 + (1 << 13)) >> 14;
int t24a = (IN(29) * 2404 + IN(3) * 16207 + (1 << 13)) >> 14;
int t0 = t0a + t3a;
int t1 = t1a + t2a;
int t2 = t1a - t2a;
int t3 = t0a - t3a;
int t4 = t4a + t5a;
int t5 = t4a - t5a;
int t6 = t7a - t6a;
int t7 = t7a + t6a;
int t8 = t8a + t9a;
int t9 = t8a - t9a;
int t10 = t11a - t10a;
int t11 = t11a + t10a;
int t12 = t12a + t13a;
int t13 = t12a - t13a;
int t14 = t15a - t14a;
int t15 = t15a + t14a;
int t16 = t16a + t17a;
int t17 = t16a - t17a;
int t18 = t19a - t18a;
int t19 = t19a + t18a;
int t20 = t20a + t21a;
int t21 = t20a - t21a;
int t22 = t23a - t22a;
int t23 = t23a + t22a;
int t24 = t24a + t25a;
int t25 = t24a - t25a;
int t26 = t27a - t26a;
int t27 = t27a + t26a;
int t28 = t28a + t29a;
int t29 = t28a - t29a;
int t30 = t31a - t30a;
int t31 = t31a + t30a;
t5a = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
t6a = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
t9a = (t14 * 6270 - t9 * 15137 + (1 << 13)) >> 14;
t14a = (t14 * 15137 + t9 * 6270 + (1 << 13)) >> 14;
t10a = (-(t13 * 15137 + t10 * 6270) + (1 << 13)) >> 14;
t13a = (t13 * 6270 - t10 * 15137 + (1 << 13)) >> 14;
t17a = (t30 * 3196 - t17 * 16069 + (1 << 13)) >> 14;
t30a = (t30 * 16069 + t17 * 3196 + (1 << 13)) >> 14;
t18a = (-(t29 * 16069 + t18 * 3196) + (1 << 13)) >> 14;
t29a = (t29 * 3196 - t18 * 16069 + (1 << 13)) >> 14;
t21a = (t26 * 13623 - t21 * 9102 + (1 << 13)) >> 14;
t26a = (t26 * 9102 + t21 * 13623 + (1 << 13)) >> 14;
t22a = (-(t25 * 9102 + t22 * 13623) + (1 << 13)) >> 14;
t25a = (t25 * 13623 - t22 * 9102 + (1 << 13)) >> 14;
t0a = t0 + t7;
t1a = t1 + t6a;
t2a = t2 + t5a;
t3a = t3 + t4;
t4a = t3 - t4;
t5 = t2 - t5a;
t6 = t1 - t6a;
t7a = t0 - t7;
t8a = t8 + t11;
t9 = t9a + t10a;
t10 = t9a - t10a;
t11a = t8 - t11;
t12a = t15 - t12;
t13 = t14a - t13a;
t14 = t14a + t13a;
t15a = t15 + t12;
t16a = t16 + t19;
t17 = t17a + t18a;
t18 = t17a - t18a;
t19a = t16 - t19;
t20a = t23 - t20;
t21 = t22a - t21a;
t22 = t22a + t21a;
t23a = t23 + t20;
t24a = t24 + t27;
t25 = t25a + t26a;
t26 = t25a - t26a;
t27a = t24 - t27;
t28a = t31 - t28;
t29 = t30a - t29a;
t30 = t30a + t29a;
t31a = t31 + t28;
t10a = ((t13 - t10) * 11585 + (1 << 13)) >> 14;
t13a = ((t13 + t10) * 11585 + (1 << 13)) >> 14;
t11 = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
t12 = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
t18a = (t29 * 6270 - t18 * 15137 + (1 << 13)) >> 14;
t29a = (t29 * 15137 + t18 * 6270 + (1 << 13)) >> 14;
t19 = (t28a * 6270 - t19a * 15137 + (1 << 13)) >> 14;
t28 = (t28a * 15137 + t19a * 6270 + (1 << 13)) >> 14;
t20 = (-(t27a * 15137 + t20a * 6270) + (1 << 13)) >> 14;
t27 = (t27a * 6270 - t20a * 15137 + (1 << 13)) >> 14;
t21a = (-(t26 * 15137 + t21 * 6270) + (1 << 13)) >> 14;
t26a = (t26 * 6270 - t21 * 15137 + (1 << 13)) >> 14;
t0 = t0a + t15a;
t1 = t1a + t14;
t2 = t2a + t13a;
t3 = t3a + t12;
t4 = t4a + t11;
t5a = t5 + t10a;
t6a = t6 + t9;
t7 = t7a + t8a;
t8 = t7a - t8a;
t9a = t6 - t9;
t10 = t5 - t10a;
t11a = t4a - t11;
t12a = t3a - t12;
t13 = t2a - t13a;
t14a = t1a - t14;
t15 = t0a - t15a;
t16 = t16a + t23a;
t17a = t17 + t22;
t18 = t18a + t21a;
t19a = t19 + t20;
t20a = t19 - t20;
t21 = t18a - t21a;
t22a = t17 - t22;
t23 = t16a - t23a;
t24 = t31a - t24a;
t25a = t30 - t25;
t26 = t29a - t26a;
t27a = t28 - t27;
t28a = t28 + t27;
t29 = t29a + t26a;
t30a = t30 + t25;
t31 = t31a + t24a;
t20 = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
t27 = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
t21a = ((t26 - t21) * 11585 + (1 << 13)) >> 14;
t26a = ((t26 + t21) * 11585 + (1 << 13)) >> 14;
t22 = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
t25 = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
t23a = ((t24 - t23) * 11585 + (1 << 13)) >> 14;
t24a = ((t24 + t23) * 11585 + (1 << 13)) >> 14;
out[0] = t0 + t31;
out[1] = t1 + t30a;
out[2] = t2 + t29;
out[3] = t3 + t28a;
out[4] = t4 + t27;
out[5] = t5a + t26a;
out[6] = t6a + t25;
out[7] = t7 + t24a;
out[8] = t8 + t23a;
out[9] = t9a + t22;
out[10] = t10 + t21a;
out[11] = t11a + t20;
out[12] = t12a + t19a;
out[13] = t13 + t18;
out[14] = t14a + t17a;
out[15] = t15 + t16;
out[16] = t15 - t16;
out[17] = t14a - t17a;
out[18] = t13 - t18;
out[19] = t12a - t19a;
out[20] = t11a - t20;
out[21] = t10 - t21a;
out[22] = t9a - t22;
out[23] = t8 - t23a;
out[24] = t7 - t24a;
out[25] = t6a - t25;
out[26] = t5a - t26a;
out[27] = t4 - t27;
out[28] = t3 - t28a;
out[29] = t2 - t29;
out[30] = t1 - t30a;
out[31] = t0 - t31;
}
itxfm_wrapper(idct, idct, 32, 6, 1)
static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
ptrdiff_t stride, int pass)
{
int t0, t1, t2, t3, t4;
if (pass == 0) {
t0 = IN(0) >> 2;
t1 = IN(3) >> 2;
t2 = IN(1) >> 2;
t3 = IN(2) >> 2;
} else {
t0 = IN(0);
t1 = IN(3);
t2 = IN(1);
t3 = IN(2);
}
t0 += t2;
t3 -= t1;
t4 = (t0 - t3) >> 1;
t1 = t4 - t1;
t2 = t4 - t2;
t0 -= t1;
t3 += t2;
out[0] = t0;
out[1] = t1;
out[2] = t2;
out[3] = t3;
}
itxfm_wrapper(iwht, iwht, 4, 0, 0)
#undef IN
#undef itxfm_wrapper
#undef itxfm_wrap
static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
{
#define init_itxfm(tx, sz) \
dsp->itxfm_add[tx][DCT_DCT] = idct_idct_ ## sz ## _add_c; \
dsp->itxfm_add[tx][DCT_ADST] = iadst_idct_ ## sz ## _add_c; \
dsp->itxfm_add[tx][ADST_DCT] = idct_iadst_ ## sz ## _add_c; \
dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_ ## sz ## _add_c
#define init_idct(tx, nm) \
dsp->itxfm_add[tx][DCT_DCT] = \
dsp->itxfm_add[tx][ADST_DCT] = \
dsp->itxfm_add[tx][DCT_ADST] = \
dsp->itxfm_add[tx][ADST_ADST] = nm ## _add_c
init_itxfm(TX_4X4, 4x4);
init_itxfm(TX_8X8, 8x8);
init_itxfm(TX_16X16, 16x16);
init_idct(TX_32X32, idct_idct_32x32);
init_idct(4 /* lossless */, iwht_iwht_4x4);
#undef init_itxfm
#undef init_idct
}
static av_always_inline void loop_filter(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H,
ptrdiff_t stridea, ptrdiff_t strideb,
int wd)
{
int i;
for (i = 0; i < 8; i++, dst += stridea) {
int p7, p6, p5, p4;
int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
int q4, q5, q6, q7;
int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
int flat8out, flat8in;
if (!fm)
continue;
if (wd >= 16) {
p7 = dst[strideb * -8];
p6 = dst[strideb * -7];
p5 = dst[strideb * -6];
p4 = dst[strideb * -5];
q4 = dst[strideb * +4];
q5 = dst[strideb * +5];
q6 = dst[strideb * +6];
q7 = dst[strideb * +7];
flat8out = FFABS(p7 - p0) <= 1 && FFABS(p6 - p0) <= 1 &&
FFABS(p5 - p0) <= 1 && FFABS(p4 - p0) <= 1 &&
FFABS(q4 - q0) <= 1 && FFABS(q5 - q0) <= 1 &&
FFABS(q6 - q0) <= 1 && FFABS(q7 - q0) <= 1;
}
if (wd >= 8)
flat8in = FFABS(p3 - p0) <= 1 && FFABS(p2 - p0) <= 1 &&
FFABS(p1 - p0) <= 1 && FFABS(q1 - q0) <= 1 &&
FFABS(q2 - q0) <= 1 && FFABS(q3 - q0) <= 1;
if (wd >= 16 && flat8out && flat8in) {
dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
} else if (wd >= 8 && flat8in) {
dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
} else {
int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
if (hev) {
int f = av_clip_int8(3 * (q0 - p0) + av_clip_int8(p1 - q1));
int f1 = FFMIN(f + 4, 127) >> 3;
int f2 = FFMIN(f + 3, 127) >> 3;
dst[strideb * -1] = av_clip_uint8(p0 + f2);
dst[strideb * +0] = av_clip_uint8(q0 - f1);
} else {
int f = av_clip_int8(3 * (q0 - p0));
int f1 = FFMIN(f + 4, 127) >> 3;
int f2 = FFMIN(f + 3, 127) >> 3;
dst[strideb * -1] = av_clip_uint8(p0 + f2);
dst[strideb * +0] = av_clip_uint8(q0 - f1);
f = (f1 + 1) >> 1;
dst[strideb * -2] = av_clip_uint8(p1 + f);
dst[strideb * +1] = av_clip_uint8(q1 - f);
}
}
}
}
#define lf_8_fn(dir, wd, stridea, strideb) \
static void loop_filter_ ## dir ## _ ## wd ## _8_c(uint8_t *dst, \
ptrdiff_t stride, \
int E, int I, int H) \
{ \
loop_filter(dst, stride, E, I, H, stridea, strideb, wd); \
}
#define lf_8_fns(wd) \
lf_8_fn(h, wd, stride, 1) \
lf_8_fn(v, wd, 1, stride)
lf_8_fns(4)
lf_8_fns(8)
lf_8_fns(16)
#undef lf_8_fn
#undef lf_8_fns
#define lf_16_fn(dir, stridea) \
static void loop_filter_ ## dir ## _16_16_c(uint8_t *dst, \
ptrdiff_t stride, \
int E, int I, int H) \
{ \
loop_filter_ ## dir ## _16_8_c(dst, stride, E, I, H); \
loop_filter_ ## dir ## _16_8_c(dst + 8 * stridea, stride, E, I, H); \
}
lf_16_fn(h, stride)
lf_16_fn(v, 1)
#undef lf_16_fn
#define lf_mix_fn(dir, wd1, wd2, stridea) \
static void loop_filter_ ## dir ## _ ## wd1 ## wd2 ## _16_c(uint8_t *dst, \
ptrdiff_t stride, \
int E, int I, \
int H) \
{ \
loop_filter_ ## dir ## _ ## wd1 ## _8_c(dst, stride, E & 0xff, \
I & 0xff, H & 0xff); \
loop_filter_ ## dir ## _ ## wd2 ## _8_c(dst + 8 * stridea, stride, \
E >> 8, I >> 8, H >> 8); \
}
#define lf_mix_fns(wd1, wd2) \
lf_mix_fn(h, wd1, wd2, stride) \
lf_mix_fn(v, wd1, wd2, 1)
lf_mix_fns(4, 4)
lf_mix_fns(4, 8)
lf_mix_fns(8, 4)
lf_mix_fns(8, 8)
#undef lf_mix_fn
#undef lf_mix_fns
static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
{
dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
}
static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int w, int h)
{
do {
memcpy(dst, src, w);
dst += dst_stride;
src += src_stride;
} while (--h);
}
static av_always_inline void avg_c(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int w, int h)
{
do {
int x;
for (x = 0; x < w; x += 4)
AV_WN32A(&dst[x], rnd_avg32(AV_RN32A(&dst[x]), AV_RN32(&src[x])));
dst += dst_stride;
src += src_stride;
} while (--h);
}
#define fpel_fn(type, sz) \
static void type ## sz ## _c(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
type ## _c(dst, dst_stride, src, src_stride, sz, h); \
}
#define copy_avg_fn(sz) \
fpel_fn(copy, sz) \
fpel_fn(avg, sz)
copy_avg_fn(64)
copy_avg_fn(32)
copy_avg_fn(16)
copy_avg_fn(8)
copy_avg_fn(4)
#undef fpel_fn
#undef copy_avg_fn
const DECLARE_ALIGNED(8, int8_t, ff_vp9_subpel_filters)[3][15][8] = {
[FILTER_8TAP_REGULAR] = {
{ 0, 1, -5, 126, 8, -3, 1, 0 },
{ -1, 3, -10, 122, 18, -6, 2, 0 },
{ -1, 4, -13, 118, 27, -9, 3, -1 },
{ -1, 4, -16, 112, 37, -11, 4, -1 },
{ -1, 5, -18, 105, 48, -14, 4, -1 },
{ -1, 5, -19, 97, 58, -16, 5, -1 },
{ -1, 6, -19, 88, 68, -18, 5, -1 },
{ -1, 6, -19, 78, 78, -19, 6, -1 },
{ -1, 5, -18, 68, 88, -19, 6, -1 },
{ -1, 5, -16, 58, 97, -19, 5, -1 },
{ -1, 4, -14, 48, 105, -18, 5, -1 },
{ -1, 4, -11, 37, 112, -16, 4, -1 },
{ -1, 3, -9, 27, 118, -13, 4, -1 },
{ 0, 2, -6, 18, 122, -10, 3, -1 },
{ 0, 1, -3, 8, 126, -5, 1, 0 },
}, [FILTER_8TAP_SHARP] = {
{ -1, 3, -7, 127, 8, -3, 1, 0 },
{ -2, 5, -13, 125, 17, -6, 3, -1 },
{ -3, 7, -17, 121, 27, -10, 5, -2 },
{ -4, 9, -20, 115, 37, -13, 6, -2 },
{ -4, 10, -23, 108, 48, -16, 8, -3 },
{ -4, 10, -24, 100, 59, -19, 9, -3 },
{ -4, 11, -24, 90, 70, -21, 10, -4 },
{ -4, 11, -23, 80, 80, -23, 11, -4 },
{ -4, 10, -21, 70, 90, -24, 11, -4 },
{ -3, 9, -19, 59, 100, -24, 10, -4 },
{ -3, 8, -16, 48, 108, -23, 10, -4 },
{ -2, 6, -13, 37, 115, -20, 9, -4 },
{ -2, 5, -10, 27, 121, -17, 7, -3 },
{ -1, 3, -6, 17, 125, -13, 5, -2 },
{ 0, 1, -3, 8, 127, -7, 3, -1 },
}, [FILTER_8TAP_SMOOTH] = {
{ -3, -1, 32, 64, 38, 1, -3, 0 },
{ -2, -2, 29, 63, 41, 2, -3, 0 },
{ -2, -2, 26, 63, 43, 4, -4, 0 },
{ -2, -3, 24, 62, 46, 5, -4, 0 },
{ -2, -3, 21, 60, 49, 7, -4, 0 },
{ -1, -4, 18, 59, 51, 9, -4, 0 },
{ -1, -4, 16, 57, 53, 12, -4, -1 },
{ -1, -4, 14, 55, 55, 14, -4, -1 },
{ -1, -4, 12, 53, 57, 16, -4, -1 },
{ 0, -4, 9, 51, 59, 18, -4, -1 },
{ 0, -4, 7, 49, 60, 21, -3, -2 },
{ 0, -4, 5, 46, 62, 24, -3, -2 },
{ 0, -4, 4, 43, 63, 26, -2, -2 },
{ 0, -3, 2, 41, 63, 29, -2, -2 },
{ 0, -3, 1, 38, 64, 32, -1, -3 },
}
};
#define FILTER_8TAP(src, x, F, stride) \
av_clip_uint8((F[0] * src[x + -3 * stride] + \
F[1] * src[x + -2 * stride] + \
F[2] * src[x + -1 * stride] + \
F[3] * src[x + +0 * stride] + \
F[4] * src[x + +1 * stride] + \
F[5] * src[x + +2 * stride] + \
F[6] * src[x + +3 * stride] + \
F[7] * src[x + +4 * stride] + 64) >> 7)
static av_always_inline void do_8tap_1d_c(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int w, int h, ptrdiff_t ds,
const int8_t *filter, int avg)
{
do {
int x;
for (x = 0; x < w; x++)
if (avg)
dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
else
dst[x] = FILTER_8TAP(src, x, filter, ds);
dst += dst_stride;
src += src_stride;
} while (--h);
}
#define filter_8tap_1d_fn(opn, opa, dir, ds) \
static av_noinline void opn ## _8tap_1d_ ## dir ## _c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int w, int h, \
const int8_t *filter) \
{ \
do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
}
filter_8tap_1d_fn(put, 0, v, src_stride)
filter_8tap_1d_fn(put, 0, h, 1)
filter_8tap_1d_fn(avg, 1, v, src_stride)
filter_8tap_1d_fn(avg, 1, h, 1)
#undef filter_8tap_1d_fn
static av_always_inline void do_8tap_2d_c(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int w, int h, const int8_t *filterx,
const int8_t *filtery, int avg)
{
int tmp_h = h + 7;
uint8_t tmp[64 * 71], *tmp_ptr = tmp;
src -= src_stride * 3;
do {
int x;
for (x = 0; x < w; x++)
tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
tmp_ptr += 64;
src += src_stride;
} while (--tmp_h);
tmp_ptr = tmp + 64 * 3;
do {
int x;
for (x = 0; x < w; x++)
if (avg)
dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
else
dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
tmp_ptr += 64;
dst += dst_stride;
} while (--h);
}
#define filter_8tap_2d_fn(opn, opa) \
static av_noinline void opn ## _8tap_2d_hv_c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int w, int h, \
const int8_t *filterx, \
const int8_t *filtery) \
{ \
do_8tap_2d_c(dst, dst_stride, src, src_stride, \
w, h, filterx, filtery, opa); \
}
filter_8tap_2d_fn(put, 0)
filter_8tap_2d_fn(avg, 1)
#undef filter_8tap_2d_fn
#undef FILTER_8TAP
#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
static void \
avg ## _8tap_ ## type ## _ ## sz ## dir ## _c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
avg ## _8tap_1d_ ## dir ## _c(dst, dst_stride, src, src_stride, sz, h, \
ff_vp9_subpel_filters[type_idx][dir_m - 1]); \
}
#define filter_fn_2d(sz, type, type_idx, avg) \
static void avg ## _8tap_ ## type ## _ ## sz ## hv_c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
avg ## _8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
ff_vp9_subpel_filters[type_idx][mx - 1], \
ff_vp9_subpel_filters[type_idx][my - 1]); \
}
#define FILTER_BILIN(src, x, mxy, stride) \
(src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
static av_always_inline void do_bilin_1d_c(uint8_t *dst,
ptrdiff_t dst_stride,
const uint8_t *src,
ptrdiff_t src_stride,
int w, int h, ptrdiff_t ds,
int mxy, int avg)
{
do {
int x;
for (x = 0; x < w; x++)
if (avg)
dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
else
dst[x] = FILTER_BILIN(src, x, mxy, ds);
dst += dst_stride;
src += src_stride;
} while (--h);
}
#define bilin_1d_fn(opn, opa, dir, ds) \
static av_noinline void opn ## _bilin_1d_ ## dir ## _c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int w, int h, int mxy) \
{ \
do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
}
bilin_1d_fn(put, 0, v, src_stride)
bilin_1d_fn(put, 0, h, 1)
bilin_1d_fn(avg, 1, v, src_stride)
bilin_1d_fn(avg, 1, h, 1)
#undef bilin_1d_fn
static av_always_inline void do_bilin_2d_c(uint8_t *dst,
ptrdiff_t dst_stride,
const uint8_t *src,
ptrdiff_t src_stride,
int w, int h, int mx, int my,
int avg)
{
uint8_t tmp[64 * 65], *tmp_ptr = tmp;
int tmp_h = h + 1;
do {
int x;
for (x = 0; x < w; x++)
tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
tmp_ptr += 64;
src += src_stride;
} while (--tmp_h);
tmp_ptr = tmp;
do {
int x;
for (x = 0; x < w; x++)
if (avg)
dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
else
dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
tmp_ptr += 64;
dst += dst_stride;
} while (--h);
}
#define bilin_2d_fn(opn, opa) \
static av_noinline void opn ## _bilin_2d_hv_c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int w, int h, \
int mx, int my) \
{ \
do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
}
bilin_2d_fn(put, 0)
bilin_2d_fn(avg, 1)
#undef bilin_2d_fn
#undef FILTER_BILIN
#define bilinf_fn_1d(sz, dir, dir_m, avg) \
static void avg ## _bilin_ ## sz ## dir ## _c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
avg ## _bilin_1d_ ## dir ## _c(dst, dst_stride, src, src_stride, \
sz, h, dir_m); \
}
#define bilinf_fn_2d(sz, avg) \
static void avg ## _bilin_ ## sz ## hv_c(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
avg ## _bilin_2d_hv_c(dst, dst_stride, src, src_stride, \
sz, h, mx, my); \
}
#define filter_fn(sz, avg) \
filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg) \
filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg) \
filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg) \
filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg) \
filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg) \
filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg) \
filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg) \
bilinf_fn_1d(sz, h, mx, avg) \
bilinf_fn_1d(sz, v, my, avg) \
bilinf_fn_2d(sz, avg)
#define filter_fn_set(avg) \
filter_fn(64, avg) \
filter_fn(32, avg) \
filter_fn(16, avg) \
filter_fn(8, avg) \
filter_fn(4, avg)
filter_fn_set(put)
filter_fn_set(avg)
#undef filter_fn
#undef filter_fn_set
#undef filter_fn_1d
#undef filter_fn_2d
#undef bilinf_fn_1d
#undef bilinf_fn_2d
static av_cold void vp9dsp_mc_init(VP9DSPContext *dsp)
{
#define init_fpel(idx1, idx2, sz, type) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][0][0] = type ## sz ## _c; \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type ## sz ## _c; \
dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][0][0] = type ## sz ## _c; \
dsp->mc[idx1][FILTER_BILINEAR][idx2][0][0] = type ## sz ## _c
#define init_copy_avg(idx, sz) \
init_fpel(idx, 0, sz, copy); \
init_fpel(idx, 1, sz, avg)
init_copy_avg(0, 64);
init_copy_avg(1, 32);
init_copy_avg(2, 16);
init_copy_avg(3, 8);
init_copy_avg(4, 4);
#undef init_copy_avg
#undef init_fpel
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv] = type ## _8tap_smooth_ ## sz ## dir ## _c; \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _c; \
dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv] = type ## _8tap_sharp_ ## sz ## dir ## _c; \
dsp->mc[idx1][FILTER_BILINEAR][idx2][idxh][idxv] = type ## _bilin_ ## sz ## dir ## _c
#define init_subpel2(idx, idxh, idxv, dir, type) \
init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
init_subpel1(3, idx, idxh, idxv, 8, dir, type); \
init_subpel1(4, idx, idxh, idxv, 4, dir, type)
#define init_subpel3(idx, type) \
init_subpel2(idx, 1, 1, hv, type); \
init_subpel2(idx, 0, 1, v, type); \
init_subpel2(idx, 1, 0, h, type)
init_subpel3(0, put);
init_subpel3(1, avg);
#undef init_subpel1
#undef init_subpel2
#undef init_subpel3
}
av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
{
vp9dsp_intrapred_init(dsp);
vp9dsp_itxfm_init(dsp);
vp9dsp_loopfilter_init(dsp);
vp9dsp_mc_init(dsp);
aarch64: vp9: Add NEON optimizations of VP9 MC functions This work is sponsored by, and copyright, Google. These are ported from the ARM version; it is essentially a 1:1 port with no extra added features, but with some hand tuning (especially for the plain copy/avg functions). The ARM version isn't very register starved to begin with, so there's not much to be gained from having more spare registers here - we only avoid having to clobber callee-saved registers. Examples of runtimes vs the 32 bit version, on a Cortex A53: ARM AArch64 vp9_avg4_neon: 27.2 23.7 vp9_avg8_neon: 56.5 54.7 vp9_avg16_neon: 169.9 167.4 vp9_avg32_neon: 585.8 585.2 vp9_avg64_neon: 2460.3 2294.7 vp9_avg_8tap_smooth_4h_neon: 132.7 125.2 vp9_avg_8tap_smooth_4hv_neon: 478.8 442.0 vp9_avg_8tap_smooth_4v_neon: 126.0 93.7 vp9_avg_8tap_smooth_8h_neon: 241.7 234.2 vp9_avg_8tap_smooth_8hv_neon: 690.9 646.5 vp9_avg_8tap_smooth_8v_neon: 245.0 205.5 vp9_avg_8tap_smooth_64h_neon: 11273.2 11280.1 vp9_avg_8tap_smooth_64hv_neon: 22980.6 22184.1 vp9_avg_8tap_smooth_64v_neon: 11549.7 10781.1 vp9_put4_neon: 18.0 17.2 vp9_put8_neon: 40.2 37.7 vp9_put16_neon: 97.4 99.5 vp9_put32_neon/armv8: 346.0 307.4 vp9_put64_neon/armv8: 1319.0 1107.5 vp9_put_8tap_smooth_4h_neon: 126.7 118.2 vp9_put_8tap_smooth_4hv_neon: 465.7 434.0 vp9_put_8tap_smooth_4v_neon: 113.0 86.5 vp9_put_8tap_smooth_8h_neon: 229.7 221.6 vp9_put_8tap_smooth_8hv_neon: 658.9 621.3 vp9_put_8tap_smooth_8v_neon: 215.0 187.5 vp9_put_8tap_smooth_64h_neon: 10636.7 10627.8 vp9_put_8tap_smooth_64hv_neon: 21076.8 21026.9 vp9_put_8tap_smooth_64v_neon: 9635.0 9632.4 These are generally about as fast as the corresponding ARM routines on the same CPU (at least on the A53), in most cases marginally faster. The speedup vs C code is pretty much the same as for the 32 bit case; on the A53 it's around 6-13x for ther larger 8tap filters. The exact speedup varies a little, since the C versions generally don't end up exactly as slow/fast as on 32 bit. Signed-off-by: Martin Storsjö <martin@martin.st>
2016-11-10 11:06:26 +02:00
if (ARCH_AARCH64)
ff_vp9dsp_init_aarch64(dsp);
arm: vp9: Add NEON optimizations of VP9 MC functions This work is sponsored by, and copyright, Google. The filter coefficients are signed values, where the product of the multiplication with one individual filter coefficient doesn't overflow a 16 bit signed value (the largest filter coefficient is 127). But when the products are accumulated, the resulting sum can overflow the 16 bit signed range. Instead of accumulating in 32 bit, we accumulate the largest product (either index 3 or 4) last with a saturated addition. (The VP8 MC asm does something similar, but slightly simpler, by accumulating each half of the filter separately. In the VP9 MC filters, each half of the filter can also overflow though, so the largest component has to be handled individually.) Examples of relative speedup compared to the C version, from checkasm: Cortex A7 A8 A9 A53 vp9_avg4_neon: 1.71 1.15 1.42 1.49 vp9_avg8_neon: 2.51 3.63 3.14 2.58 vp9_avg16_neon: 2.95 6.76 3.01 2.84 vp9_avg32_neon: 3.29 6.64 2.85 3.00 vp9_avg64_neon: 3.47 6.67 3.14 2.80 vp9_avg_8tap_smooth_4h_neon: 3.22 4.73 2.76 4.67 vp9_avg_8tap_smooth_4hv_neon: 3.67 4.76 3.28 4.71 vp9_avg_8tap_smooth_4v_neon: 5.52 7.60 4.60 6.31 vp9_avg_8tap_smooth_8h_neon: 6.22 9.04 5.12 9.32 vp9_avg_8tap_smooth_8hv_neon: 6.38 8.21 5.72 8.17 vp9_avg_8tap_smooth_8v_neon: 9.22 12.66 8.15 11.10 vp9_avg_8tap_smooth_64h_neon: 7.02 10.23 5.54 11.58 vp9_avg_8tap_smooth_64hv_neon: 6.76 9.46 5.93 9.40 vp9_avg_8tap_smooth_64v_neon: 10.76 14.13 9.46 13.37 vp9_put4_neon: 1.11 1.47 1.00 1.21 vp9_put8_neon: 1.23 2.17 1.94 1.48 vp9_put16_neon: 1.63 4.02 1.73 1.97 vp9_put32_neon: 1.56 4.92 2.00 1.96 vp9_put64_neon: 2.10 5.28 2.03 2.35 vp9_put_8tap_smooth_4h_neon: 3.11 4.35 2.63 4.35 vp9_put_8tap_smooth_4hv_neon: 3.67 4.69 3.25 4.71 vp9_put_8tap_smooth_4v_neon: 5.45 7.27 4.49 6.52 vp9_put_8tap_smooth_8h_neon: 5.97 8.18 4.81 8.56 vp9_put_8tap_smooth_8hv_neon: 6.39 7.90 5.64 8.15 vp9_put_8tap_smooth_8v_neon: 9.03 11.84 8.07 11.51 vp9_put_8tap_smooth_64h_neon: 6.78 9.48 4.88 10.89 vp9_put_8tap_smooth_64hv_neon: 6.99 8.87 5.94 9.56 vp9_put_8tap_smooth_64v_neon: 10.69 13.30 9.43 14.34 For the larger 8tap filters, the speedup vs C code is around 5-14x. This is significantly faster than libvpx's implementation of the same functions, at least when comparing the put_8tap_smooth_64 functions (compared to vpx_convolve8_horiz_neon and vpx_convolve8_vert_neon from libvpx). Absolute runtimes from checkasm: Cortex A7 A8 A9 A53 vp9_put_8tap_smooth_64h_neon: 20150.3 14489.4 19733.6 10863.7 libvpx vpx_convolve8_horiz_neon: 52623.3 19736.4 21907.7 25027.7 vp9_put_8tap_smooth_64v_neon: 14455.0 12303.9 13746.4 9628.9 libvpx vpx_convolve8_vert_neon: 42090.0 17706.2 17659.9 16941.2 Thus, on the A9, the horizontal filter is only marginally faster than libvpx, while our version is significantly faster on the other cores, and the vertical filter is significantly faster on all cores. The difference is especially large on the A7. The libvpx implementation does the accumulation in 32 bit, which probably explains most of the differences. Signed-off-by: Martin Storsjö <martin@martin.st>
2016-11-03 09:12:08 +02:00
if (ARCH_ARM)
ff_vp9dsp_init_arm(dsp);
if (ARCH_X86)
ff_vp9dsp_init_x86(dsp);
}