From 8ad77b65b548a6b2f4707265ebd7e97f956acf0b Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Tue, 10 May 2011 07:08:24 -0700 Subject: [PATCH 01/32] Update x86 H.264 deblock asm Includes AVX versions from x264. --- libavcodec/x86/h264_deblock.asm | 391 ++++++++++++++++++-------------- libavcodec/x86/h264dsp_mmx.c | 56 +++-- libavcodec/x86/x86util.asm | 19 +- 3 files changed, 275 insertions(+), 191 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 01778a45cb..081c0e1aef 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -1,10 +1,11 @@ ;***************************************************************************** -;* MMX/SSE2-optimized H.264 deblocking code +;* MMX/SSE2/AVX-optimized H.264 deblocking code ;***************************************************************************** -;* Copyright (C) 2005-2008 x264 project +;* Copyright (C) 2005-2011 x264 project ;* ;* Authors: Loren Merritt ;* Jason Garrett-Glaser +;* Oskar Arvidsson ;* ;* This file is part of Libav. 
;* @@ -26,96 +27,135 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION .text cextern pb_0 cextern pb_1 cextern pb_3 cextern pb_A1 -SECTION .text - ; expands to [base],...,[base+7*stride] %define PASS8ROWS(base, base3, stride, stride3) \ [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 +%macro TRANSPOSE4x8_LOAD 11 + movh m0, %4 + movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 + punpckh%3 m1, m0, m4 + punpckh%3 m3, m2, m6 + punpckl%3 m0, m4 + punpckl%3 m2, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 - punpckhdq m4, m4 - punpckhdq m5, m5 - punpckhdq m6, m6 +%macro TRANSPOSE8x4B_STORE 8 + punpckhdq m4, m0, m0 + punpckhdq m5, m1, m1 + punpckhdq m6, m2, m2 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd %1, m0 - punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + movh %1, m1 punpckhdq m1, m1 - movd %4, m1 + movh %2, m1 + movh %3, 
m0 + punpckhdq m0, m0 + movh %4, m0 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 - punpcklwd m4, m6 - punpckhwd m5, m6 - movd %5, m4 - punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + movh %5, m5 punpckhdq m5, m5 - movd %8, m5 + movh %6, m5 + movh %7, m4 + punpckhdq m4, m4 + movh %8, m4 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 +%endmacro + +%macro TRANSPOSE4x8W_LOAD 8 +%if mmsize==16 + TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 +%else + SWAP 1, 4, 2, 3 + mova m0, [t5] + mova m1, [t5+r1] + mova m2, [t5+r1*2] + mova m3, [t5+t6] + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif +%endmacro + +%macro TRANSPOSE8x2W_STORE 8 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 +%if mmsize==8 + movd %3, m0 + movd %1, m1 + psrlq m1, 32 + psrlq m0, 32 + movd %2, m1 + movd %4, m0 +%else + movd %5, m0 + movd %1, m1 + psrldq m1, 4 + psrldq m0, 4 + movd %2, m1 + movd %6, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %3, m1 + movd %7, m0 + psrldq m1, 4 + psrldq m0, 4 + movd %4, m1 + movd %8, m0 +%endif %endmacro %macro SBUTTERFLY3 4 - movq %4, %2 + punpckh%1 %4, %2, %3 punpckl%1 %2, %3 - punpckh%1 %4, %3 %endmacro ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -123,30 +163,32 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY3 bw, m6, %8, m5 - SBUTTERFLY3 wd, m0, m2, m1 - SBUTTERFLY3 wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY3 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY3 wd, m7, [%9+0x10], m6 - SBUTTERFLY3 wd, m3, m5, m4 - SBUTTERFLY3 dq, m7, m3, m0 - SBUTTERFLY3 dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY3 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -154,38 +196,44 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - SBUTTERFLY3 bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY3 wd, m0, m2, m3 - SBUTTERFLY3 wd, m4, m6, m2 - SBUTTERFLY3 wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY3 wd, m2, m5, m1 - SBUTTERFLY3 dq, m0, m4, m5 - SBUTTERFLY3 dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY3 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - 
SBUTTERFLY3 dq, m3, %11, m0 - SBUTTERFLY3 dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY3 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT 5 +%if avx_enabled == 0 mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%else + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%endif por %4, %5 psubusb %4, %3 %endmacro @@ -193,32 +241,28 @@ SECTION .text ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT2 5 +%ifdef ARCH_X86_64 + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%else mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro -%macro SPLATW 1 -%ifidn m0, xmm0 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %1, 0 -%endif -%endmacro - ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 movd m4, %1 movd m5, %2 - SPLATW m4 - SPLATW m5 + SPLATW m4, m4 + SPLATW m5, m5 packuswb m4, m4 ; 16x alpha-1 packuswb m5, m5 ; 16x beta-1 %if %0>2 @@ -237,8 +281,7 @@ SECTION .text ; out: m1=p0' m2=q0' ; clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 - mova m5, m1 - pxor m5, m2 ; p0^q0 + pxor m5, m1, m2 ; p0^q0 pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 @@ -264,14 +307,12 @@ SECTION .text ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) ; clobbers: q2, tmp, tc0 %macro LUMA_Q1 6 - mova %6, m1 - pavgb %6, m2 + pavgb %6, m1, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - mova %6, %1 - psubusb %6, %5 + psubusb %6, %1, %5 paddusb %5, %1 pmaxub %2, %6 pminub %2, %5 @@ -280,10 +321,10 @@ SECTION .text %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int 
stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal x264_deblock_v_luma_sse2, 5,5,10 +%macro DEBLOCK_LUMA 1 +cglobal deblock_v_luma_%1, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -307,8 +348,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 movdqa m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - mova m7, m8 - psubb m7, m6 + psubb m7, m8, m6 pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -326,10 +366,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2, 5,7 +cglobal deblock_h_luma_%1, 5,7 movsxd r10, r1d lea r11, [r10+r10*2] lea r6, [r0-4] @@ -350,13 +390,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %ifdef WIN64 mov [rsp+0x20], r4 %endif - call x264_deblock_v_luma_sse2 + call deblock_v_luma_%1 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -365,7 +405,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -375,7 +415,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] 
movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -383,14 +423,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7 add rsp, 0x68 %endif RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA sse2 +INIT_AVX +DEBLOCK_LUMA avx %else %macro DEBLOCK_LUMA 3 ;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 +cglobal deblock_%2_luma_%1, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -419,8 +465,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 pand m4, [esp+%3] ; tc - mova m7, m4 - psubb m7, m6 + psubb m7, m4, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -441,10 +486,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 +cglobal deblock_h_luma_%1, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -467,11 +512,11 @@ cglobal x264_deblock_h_luma_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %ifidn %2, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %endif ADD esp, 20 @@ -484,7 +529,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, 
[pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -492,7 +537,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -502,22 +547,34 @@ INIT_MMX DEBLOCK_LUMA mmxext, v8, 8 INIT_XMM DEBLOCK_LUMA sse2, v, 16 +INIT_AVX +DEBLOCK_LUMA avx, v, 16 %endif ; ARCH %macro LUMA_INTRA_P012 4 ; p0..p3 in memory +%ifdef ARCH_X86_64 + pavgb t0, p2, p1 + pavgb t1, p0, q0 +%else mova t0, p2 mova t1, p0 pavgb t0, p1 pavgb t1, q0 +%endif pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 mova t5, t1 +%ifdef ARCH_X86_64 + paddb t2, p2, p1 + paddb t3, p0, q0 +%else mova t2, p2 mova t3, p0 paddb t2, p1 paddb t3, q0 +%endif paddb t2, t3 mova t3, t2 mova t4, t2 @@ -527,10 +584,15 @@ DEBLOCK_LUMA sse2, v, 16 pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; +%ifdef ARCH_X86_64 + pavgb t1, p2, q1 + psubb t2, p2, q1 +%else mova t1, p2 mova t2, p2 pavgb t1, q1 psubb t2, q1 +%endif paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 pand t2, mpb_1 @@ -543,10 +605,8 @@ DEBLOCK_LUMA sse2, v, 16 pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - mova t3, p0 - mova t2, p0 - pxor t3, q1 - pavgb t2, q1 + pxor t3, p0, q1 + pavgb t2, p0, q1 pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 @@ -560,9 +620,8 @@ DEBLOCK_LUMA sse2, v, 16 mova %1, t1 ; store p0 mova t1, %4 ; p3 - mova t2, t1 + paddb t2, t1, p2 pavgb t1, p2 - paddb t2, p2 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 @@ -624,9 +683,9 @@ DEBLOCK_LUMA sse2, v, 16 %endif ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, int stride, int 
alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 +cglobal deblock_%2_luma_intra_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -686,9 +745,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 INIT_MMX %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1, 4,7 +cglobal deblock_h_luma_intra_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] lea r6, [r0-4] @@ -704,7 +763,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 lea r0, [pix_tmp+0x40] mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 + call deblock_v_luma_intra_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r11] @@ -717,7 +776,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 add rsp, 0x88 RET %else -cglobal x264_deblock_h_luma_intra_%1, 2,4 +cglobal deblock_h_luma_intra_%1, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -736,10 +795,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 PUSH dword r2m PUSH dword 16 PUSH r0 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %ifidn %2, v8 add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %endif ADD esp, 16 @@ -760,13 +819,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 INIT_XMM DEBLOCK_LUMA_INTRA sse2, v +INIT_AVX +DEBLOCK_LUMA_INTRA avx , v %ifndef ARCH_X86_64 INIT_MMX DEBLOCK_LUMA_INTRA mmxext, v8 %endif - - INIT_MMX %macro CHROMA_V_START 0 @@ -790,23 +849,23 @@ INIT_MMX %define t6 r6 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, 
int8_t *tc0 ) +; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_mmxext, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 +cglobal deblock_h_chroma_mmxext, 5,7 %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -815,17 +874,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7 %define buf1 r2m %endif CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq m0, buf0 movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_inter_body_mmxext: +ff_chroma_inter_body_mmxext: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 @@ -850,31 +909,31 @@ x264_chroma_inter_body_mmxext: %define t6 r5 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_mmxext, 4,5 CHROMA_V_START movq 
m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_intra_body_mmxext + call ff_chroma_intra_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_mmxext, 4,6 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call x264_chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) + call ff_chroma_intra_body_mmxext + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_intra_body_mmxext: +ff_chroma_intra_body_mmxext: LOAD_MASK r2d, r3d movq m5, m1 movq m6, m2 diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 3a783a39ab..7d27c02ea2 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -219,11 +219,11 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] } #define LF_FUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta, int8_t *tc0); +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta, int8_t *tc0); #define LF_IFUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta); +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta); LF_FUNC (h, chroma, mmxext) LF_IFUNC(h, chroma_intra, mmxext) @@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext) LF_IFUNC(h, luma_intra, mmxext) #if HAVE_YASM && ARCH_X86_32 LF_FUNC (v8, luma, mmxext) -static void 
ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { if((tc0[0] & tc0[1]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); + ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); if((tc0[2] & tc0[3]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); + ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); } LF_IFUNC(v8, luma_intra, mmxext) -static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) +static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) { - ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); + ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); + ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); } #endif @@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2) LF_IFUNC(h, luma_intra, sse2) LF_FUNC (v, luma, sse2) LF_IFUNC(v, luma_intra, sse2) +LF_FUNC (h, luma, avx) +LF_IFUNC(h, luma_intra, avx) +LF_FUNC (v, luma, avx) +LF_IFUNC(v, luma_intra, avx) /***********************************/ /* weighted prediction */ @@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct_add8 = ff_h264_idct_add8_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; - c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; - c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; - c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext; + c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext; + 
c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext; #if ARCH_X86_32 - c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; - c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext; #endif c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; @@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2; #endif c->h264_idct_add16 = ff_h264_idct_add16_sse2; @@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; } + if (mm_flags&AV_CPU_FLAG_AVX) { +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx; +#endif + } } } #endif diff --git 
a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm index f731616270..28baf7a96a 100644 --- a/libavcodec/x86/x86util.asm +++ b/libavcodec/x86/x86util.asm @@ -24,16 +24,20 @@ ;****************************************************************************** %macro SBUTTERFLY 4 +%if avx_enabled == 0 mova m%4, m%2 punpckl%1 m%2, m%3 punpckh%1 m%4, m%3 +%else + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 +%endif SWAP %3, %4 %endmacro %macro SBUTTERFLY2 4 - mova m%4, m%2 - punpckh%1 m%2, m%3 - punpckl%1 m%4, m%3 + punpckl%1 m%4, m%2, m%3 + punpckh%1 m%2, m%2, m%3 SWAP %2, %4, %3 %endmacro @@ -444,3 +448,12 @@ %macro PMINUB_MMXEXT 3 ; dst, src, ignored pminub %1, %2 %endmacro + +%macro SPLATW 2-3 0 +%if mmsize == 16 + pshuflw %1, %2, (%3)*0x55 + punpcklqdq %1, %1 +%else + pshufw %1, %2, (%3)*0x55 +%endif +%endmacro From 9f3d6ca4f16e9b1f6f89424e9d946bb3a6a40d91 Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Tue, 10 May 2011 08:55:12 -0700 Subject: [PATCH 02/32] Port x86 10-bit H.264 deblock asm from x264 --- libavcodec/x86/Makefile | 1 + libavcodec/x86/dsputil_mmx.c | 1 + libavcodec/x86/h264_deblock.asm | 34 +- libavcodec/x86/h264_deblock_10bit.asm | 804 ++++++++++++++++++++++++++ libavcodec/x86/h264dsp_mmx.c | 125 ++-- libavcodec/x86/x86util.asm | 5 + 6 files changed, 907 insertions(+), 63 deletions(-) create mode 100644 libavcodec/x86/h264_deblock_10bit.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index f8d456d3ea..1cde9517a5 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -9,6 +9,7 @@ YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ + x86/h264_deblock_10bit.o \ x86/h264_weight.o \ x86/h264_idct.o \ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index a0cb11aa40..1cc6991666 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -43,6 +43,7 @@ DECLARE_ALIGNED(16, const 
uint64_t, ff_pdw_80000000)[2] = {0x8000000080000000ULL, 0x8000000080000000ULL}; DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 081c0e1aef..37866812e7 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -324,7 +324,7 @@ cextern pb_A1 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_%1, 5,5,10 +cglobal deblock_v_luma_8_%1, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -369,7 +369,7 @@ cglobal deblock_v_luma_%1, 5,5,10 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal deblock_h_luma_%1, 5,7 +cglobal deblock_h_luma_8_%1, 5,7 movsxd r10, r1d lea r11, [r10+r10*2] lea r6, [r0-4] @@ -396,7 +396,7 @@ cglobal deblock_h_luma_%1, 5,7 %ifdef WIN64 mov [rsp+0x20], r4 %endif - call deblock_v_luma_%1 + call deblock_v_luma_8_%1 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -436,7 +436,7 @@ DEBLOCK_LUMA avx ;----------------------------------------------------------------------------- ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_%2_luma_%1, 5,5 +cglobal deblock_%2_luma_8_%1, 5,5 
lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -489,7 +489,7 @@ cglobal deblock_%2_luma_%1, 5,5 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal deblock_h_luma_%1, 0,5 +cglobal deblock_h_luma_8_%1, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -512,11 +512,11 @@ cglobal deblock_h_luma_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 - call deblock_%2_luma_%1 + call deblock_%2_luma_8_%1 %ifidn %2, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call deblock_%2_luma_%1 + call deblock_%2_luma_8_%1 %endif ADD esp, 20 @@ -685,7 +685,7 @@ DEBLOCK_LUMA avx, v, 16 ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_%2_luma_intra_%1, 4,6,16 +cglobal deblock_%2_luma_intra_8_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -747,7 +747,7 @@ INIT_MMX ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_%1, 4,7 +cglobal deblock_h_luma_intra_8_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] lea r6, [r0-4] @@ -763,7 +763,7 @@ cglobal deblock_h_luma_intra_%1, 4,7 lea r0, [pix_tmp+0x40] mov r1, 0x10 - call deblock_v_luma_intra_%1 + call deblock_v_luma_intra_8_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r11] @@ -776,7 +776,7 @@ cglobal deblock_h_luma_intra_%1, 4,7 add rsp, 0x88 RET %else -cglobal deblock_h_luma_intra_%1, 2,4 +cglobal deblock_h_luma_intra_8_%1, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -795,10 +795,10 @@ cglobal deblock_h_luma_intra_%1, 2,4 PUSH dword 
r2m PUSH dword 16 PUSH r0 - call deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_8_%1 %ifidn %2, v8 add dword [rsp], 8 ; pix_tmp+8 - call deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_8_%1 %endif ADD esp, 16 @@ -851,7 +851,7 @@ INIT_MMX ;----------------------------------------------------------------------------- ; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_8_mmxext, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] @@ -865,7 +865,7 @@ cglobal deblock_v_chroma_mmxext, 5,6 ;----------------------------------------------------------------------------- ; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_mmxext, 5,7 +cglobal deblock_h_chroma_8_mmxext, 5,7 %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -911,7 +911,7 @@ ff_chroma_inter_body_mmxext: ;----------------------------------------------------------------------------- ; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_8_mmxext, 4,5 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] @@ -925,7 +925,7 @@ cglobal deblock_v_chroma_intra_mmxext, 4,5 ;----------------------------------------------------------------------------- ; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_8_mmxext, 4,6 CHROMA_H_START TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) call ff_chroma_intra_body_mmxext diff --git 
a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm new file mode 100644 index 0000000000..402ed9bfac --- /dev/null +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -0,0 +1,804 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Oskar Arvidsson +;* Loren Merritt +;* Jason Garrett-Glaser +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +pw_pixel_max: times 8 dw ((1 << 10)-1) + +SECTION .text + +cextern pw_2 +cextern pw_4 + +; out: %4 = |%1-%2|-%3 +; clobbers: %5 +%macro ABS_SUB 5 + psubusw %5, %2, %1 + psubusw %4, %1, %2 + por %4, %5 + psubw %4, %3 +%endmacro + +; out: %4 = |%1-%2|<%3 +%macro DIFF_LT 5 + psubusw %4, %2, %1 + psubusw %5, %1, %2 + por %5, %4 ; |%1-%2| + pxor %4, %4 + psubw %5, %3 ; |%1-%2|-%3 + pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 +%endmacro + +%macro LOAD_AB 4 + movd %1, %3 + movd %2, %4 + SPLATW %1, %1 + SPLATW %2, %2 +%endmacro + +; in: %2=tc reg +; out: %1=splatted tc +%macro LOAD_TC 2 + movd %1, [%2] + punpcklbw %1, %1 +%if mmsize == 8 + pshufw %1, %1, 0 +%else + pshuflw %1, %1, 01010000b + pshufd %1, %1, 01010000b +%endif + psraw %1, 6 +%endmacro + +; in: %1=p1, %2=p0, %3=q0, %4=q1 +; %5=alpha, %6=beta, %7-%9=tmp +; out: %7=mask +%macro LOAD_MASK 9 + ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha + ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta + pand %8, %9 + ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta + pxor %7, %7 + pand %8, %9 + pcmpgtw %7, %8 +%endmacro + +; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp +; out: %1=p0', m2=q0' +%macro DEBLOCK_P0_Q0 7 + psubw %3, %4 + pxor %7, %7 + paddw %3, [pw_4] + psubw %7, %5 + psubw %6, %2, %1 + psllw %6, 2 + paddw %3, %6 + psraw %3, 3 + mova %6, [pw_pixel_max] + CLIPW %3, %7, %5 + pxor %7, %7 + paddw %1, %3 + psubw %2, %3 + CLIPW %1, %7, %6 + CLIPW %2, %7, %6 +%endmacro + +; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp +%macro LUMA_Q1 6 + pavgw %6, %3, %4 ; (p0+q0+1)>>1 + paddw %1, %6 + pxor %6, %6 + psraw %1, 1 + psubw %6, %5 + psubw %1, %2 + CLIPW %1, %6, %5 +
paddw %1, %2 +%endmacro + +%macro LUMA_DEBLOCK_ONE 3 + DIFF_LT m5, %1, bm, m4, m6 + pxor m6, m6 + mova %3, m4 + pcmpgtw m6, tcm + pand m4, tcm + pandn m6, m7 + pand m4, m6 + LUMA_Q1 m5, %2, m1, m2, m4, m6 +%endmacro + +%macro LUMA_H_STORE 2 +%if mmsize == 8 + movq [r0-4], m0 + movq [r0+r1-4], m1 + movq [r0+r1*2-4], m2 + movq [r0+%2-4], m3 +%else + movq [r0-4], m0 + movhps [r0+r1-4], m0 + movq [r0+r1*2-4], m1 + movhps [%1-4], m1 + movq [%1+r1-4], m2 + movhps [%1+r1*2-4], m2 + movq [%1+%2-4], m3 + movhps [%1+r1*4-4], m3 +%endif +%endmacro + +%macro DEBLOCK_LUMA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16) + %assign pad 5*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define am [rsp+mmsize*3] + %define bm [rsp+mmsize*4] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, 32/mmsize + mov r2, r0 + sub r0, r1 + mova am, m4 + sub r0, r1 + mova bm, m5 + sub r0, r1 +.loop: + mova m0, [r0+r1] + mova m1, [r0+r1*2] + mova m2, [r2] + mova m3, [r2+r1] + + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + mova m5, [r0] + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova [r0+r1], m5 + + mova m5, [r2+r1*2] + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova [r2+r1], m5 + + pxor m5, m5 + mova m6, tcm + pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova [r0+r1*2], m1 + mova [r2], m2 + + add r0, mmsize + add r2, mmsize + add r4, mmsize/8 + dec r3 + jg .loop + ADD rsp, pad + RET + +cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16) + %assign pad 7*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define p1m [rsp+mmsize*3] + 
%define p2m [rsp+mmsize*4] + %define am [rsp+mmsize*5] + %define bm [rsp+mmsize*6] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, r1 + mova am, m4 + add r3, r1 + mov r5, 32/mmsize + mova bm, m5 + add r3, r1 +%if mmsize == 16 + mov r2, r0 + add r2, r3 +%endif +.loop: +%if mmsize == 8 + movq m2, [r0-8] ; y q2 q1 q0 + movq m7, [r0+0] + movq m5, [r0+r1-8] + movq m3, [r0+r1+0] + movq m0, [r0+r1*2-8] + movq m6, [r0+r1*2+0] + movq m1, [r0+r3-8] + TRANSPOSE4x4W 2, 5, 0, 1, 4 + SWAP 2, 7 + movq m7, [r0+r3] + TRANSPOSE4x4W 2, 3, 6, 7, 4 +%else + movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m3, [r2-8] + TRANSPOSE4x4W 5, 0, 2, 3, 6 + mova tcm, m3 + + movu m4, [r2+r1-8] + movu m1, [r2+r1*2-8] + movu m3, [r2+r3-8] + movu m7, [r2+r1*4-8] + TRANSPOSE4x4W 4, 1, 3, 7, 6 + + mova m6, tcm + punpcklqdq m6, m7 + punpckhqdq m5, m4 + SBUTTERFLY qdq, 0, 1, 7 + SBUTTERFLY qdq, 2, 3, 7 +%endif + + mova p2m, m6 + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova p1m, m5 + + mova m5, p2m + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova p2m, m5 + + pxor m5, m5 + mova m6, tcm + pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova m0, p1m + mova m3, p2m + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r2, r3 + + add r4, mmsize/8 + lea r0, [r0+r1*(mmsize/2)] + lea r2, [r2+r1*(mmsize/2)] + dec r5 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM +%ifdef ARCH_X86_64 +; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 +; m12=alpha, m13=beta +; out: m0=p1', m3=q1', m1=p0', m2=q0' +; clobbers: m4, m5, m6, m7, m10, m11, m14 +%macro DEBLOCK_LUMA_INTER_SSE2 0 + LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 + LOAD_TC m6, r4 + DIFF_LT m8, m1, m13, m10, m4 + DIFF_LT m9, m2, m13, m11, m4 + pand m6, m7 + + mova m14, m6 + pxor m4, m4 + pcmpgtw m6, m4 + pand m6, m14 + + mova m5, m10 + pand m5, m6 + LUMA_Q1 m8, 
m0, m1, m2, m5, m4 + + mova m5, m11 + pand m5, m6 + LUMA_Q1 m9, m3, m1, m2, m5, m4 + + pxor m4, m4 + psubw m6, m10 + pcmpgtw m4, m14 + pandn m4, m7 + psubw m6, m11 + pand m4, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 + + SWAP 0, 8 + SWAP 3, 9 +%endmacro + +%macro DEBLOCK_LUMA_64 1 +cglobal deblock_v_luma_10_%1, 5,5,15 + %define p2 m8 + %define p1 m0 + %define p0 m1 + %define q0 m2 + %define q1 m3 + %define q2 m9 + %define mask0 m7 + %define mask1 m10 + %define mask2 m11 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r0 + sub r0, r1 + sub r0, r1 + sub r0, r1 + mov r3, 2 +.loop: + mova p2, [r0] + mova p1, [r0+r1] + mova p0, [r0+r1*2] + mova q0, [r2] + mova q1, [r2+r1] + mova q2, [r2+r1*2] + DEBLOCK_LUMA_INTER_SSE2 + mova [r0+r1], p1 + mova [r0+r1*2], p0 + mova [r2], q0 + mova [r2+r1], q1 + add r0, mmsize + add r2, mmsize + add r4, 2 + dec r3 + jg .loop + REP_RET + +cglobal deblock_h_luma_10_%1, 5,7,15 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r1 + add r2, r1 + add r2, r1 + mov r5, r0 + add r5, r2 + mov r6, 2 +.loop: + movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m9, [r5-8] + movu m5, [r5+r1-8] + movu m1, [r5+r1*2-8] + movu m3, [r5+r2-8] + movu m7, [r5+r1*4-8] + + TRANSPOSE4x4W 8, 0, 2, 9, 10 + TRANSPOSE4x4W 5, 1, 3, 7, 10 + + punpckhqdq m8, m5 + SBUTTERFLY qdq, 0, 1, 10 + SBUTTERFLY qdq, 2, 3, 10 + punpcklqdq m9, m7 + + DEBLOCK_LUMA_INTER_SSE2 + + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r5, r2 + add r4, 2 + lea r0, [r0+r1*8] + lea r5, [r5+r1*8] + dec r6 + jg .loop + REP_RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_64 avx +%endif + +%macro SWAPMOVA 2 +%ifid %1 + SWAP %1, %2 +%else + mova %1, %2 +%endif +%endmacro + +; in: t0-t2: tmp registers +; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 +; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' +%macro LUMA_INTRA_P012 12 ; p0..p3 in memory +%ifdef ARCH_X86_64 + paddw t0, %3, %2 + mova t2, %4 + paddw t2, %3 +%else + 
mova t0, %3 + mova t2, %4 + paddw t0, %2 + paddw t2, %3 +%endif + paddw t0, %1 + paddw t2, t2 + paddw t0, %5 + paddw t2, %9 + paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) + paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) + + psrlw t2, 3 + psrlw t1, t0, 2 + psubw t2, %3 + psubw t1, %2 + pand t2, %8 + pand t1, %8 + paddw t2, %3 + paddw t1, %2 + SWAPMOVA %11, t1 + + psubw t1, t0, %3 + paddw t0, t0 + psubw t1, %5 + psubw t0, %3 + paddw t1, %6 + paddw t1, %2 + paddw t0, %6 + psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 + psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 + + pxor t0, t1 + pxor t1, %1 + pand t0, %8 + pand t1, %7 + pxor t0, t1 + pxor t0, %1 + SWAPMOVA %10, t0 + SWAPMOVA %12, t2 +%endmacro + +%macro LUMA_INTRA_INIT 1 + %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) + %define t0 m4 + %define t1 m5 + %define t2 m6 + %define t3 m7 + %assign i 4 +%rep %1 + CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] + %assign i i+1 +%endrep + SUB rsp, pad +%endmacro + +; in: %1-%3=tmp, %4=p2, %5=q2 +%macro LUMA_INTRA_INTER 5 + LOAD_AB t0, t1, r2d, r3d + mova %1, t0 + LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 +%ifdef ARCH_X86_64 + mova %2, t0 ; mask0 + psrlw t3, %1, 2 +%else + mova t3, %1 + mova %2, t0 ; mask0 + psrlw t3, 2 +%endif + paddw t3, [pw_2] ; alpha/4+2 + DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 + pand t2, %2 + mova t3, %5 ; q2 + mova %1, t2 ; mask1 + DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta + pand t2, %1 + mova t3, %4 ; p2 + mova %3, t2 ; mask1q + DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta + pand t2, %1 + mova %1, t2 ; mask1p +%endmacro + +%macro LUMA_H_INTRA_LOAD 0 +%if mmsize == 8 + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] + movu m1, [r0+r4-8] + TRANSPOSE4x4W 4, 5, 0, 1, 2 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + + movu m2, [r0] + movu m3, [r0+r1] + movu t0, [r0+r1*2] + movu t1, [r0+r4] + TRANSPOSE4x4W 2, 3, 4, 5, 6 + mova t6, t0 ; q2 + mova t7, t1 ; q3 +%else + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] 
+ movu m1, [r0+r5-8] + movu m2, [r4-8] + movu m3, [r4+r1-8] + movu t2, [r4+r1*2-8] + movu t3, [r4+r5-8] + TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + mova t6, t2 ; q2 + mova t7, t3 ; q3 +%endif +%endmacro + +; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp +%macro LUMA_H_INTRA_STORE 9 +%if mmsize == 8 + TRANSPOSE4x4W %1, %2, %3, %4, %9 + movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r4-8], m%4 + movq m%1, %8 + TRANSPOSE4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r4], m%1 +%else + TRANSPOSE2x4x4W %1, %2, %3, %4, %9 + movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r5-8], m%4 + movhps [r4-8], m%1 + movhps [r4+r1-8], m%2 + movhps [r4+r1*2-8], m%3 + movhps [r4+r5-8], m%4 +%ifnum %8 + SWAP %1, %8 +%else + mova m%1, %8 +%endif + TRANSPOSE2x4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r5], m%1 + movhps [r4], m%5 + movhps [r4+r1], m%6 + movhps [r4+r1*2], m%7 + movhps [r4+r5], m%1 +%endif +%endmacro + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +%macro DEBLOCK_LUMA_INTRA_64 1 +cglobal deblock_v_luma_intra_10_%1, 4,7,16 + %define t0 m1 + %define t1 m2 + %define t2 m4 + %define p2 m8 + %define p1 m9 + %define p0 m10 + %define q0 m11 + %define q1 m12 + %define q2 m13 + %define aa m5 + %define bb m14 + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + neg r4 + add r4, r0 ; pix-4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 + LOAD_AB aa, bb, r2d, r3d +.loop + mova p2, [r4+r1] + mova p1, [r4+2*r1] + mova p0, [r4+r5] + mova q0, [r0] + mova q1, [r0+r1] + mova q2, [r0+2*r1] + + LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 + mova t2, aa + psrlw t2, 2 + 
paddw t2, m0 ; alpha/4+2 + DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta + DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] + LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,16 + %define t0 m15 + %define t1 m14 + %define t2 m2 + %define q3 m5 + %define q2 m8 + %define q1 m9 + %define q0 m10 + %define p0 m11 + %define p1 m12 + %define p2 m13 + %define p3 m4 + %define spill [rsp] + %assign pad 24-(stack_offset&15) + SUB rsp, pad + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 +.loop + movu q3, [r0-8] + movu q2, [r0+r1-8] + movu q1, [r0+r1*2-8] + movu q0, [r0+r5-8] + movu p0, [r4-8] + movu p1, [r4+r1-8] + movu p2, [r4+r1*2-8] + movu p3, [r4+r5-8] + TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 + + LOAD_AB m1, m2, r2d, r3d + LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 + psrlw m1, 2 + paddw m1, m0 ; alpha/4+2 + DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta + DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + + mova spill, q3 + LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 + LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 + mova m7, spill + + LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 + + lea r0, [r0+r1*8] + lea r4, [r4+r1*8] + dec r6 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM
+DEBLOCK_LUMA_INTRA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_INTRA_64 avx + +%endif + +%macro DEBLOCK_LUMA_INTRA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 3 + lea r4, [r1*4] + lea r5, [r1*3] + neg r4 + add r4, r0 + mov r6, 32/mmsize + shl r2d, 2 + shl r3d, 2 +.loop: + mova m0, [r4+r1*2] ; p1 + mova m1, [r4+r5] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] + LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] + mova t3, [r0+r1*2] ; q2 + LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + ADD rsp, pad + RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 8 +%if mmsize == 8 + lea r4, [r1*3] + mov r5, 32/mmsize +%else + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 32/mmsize +%endif + shl r2d, 2 + shl r3d, 2 +.loop: + LUMA_H_INTRA_LOAD + LUMA_INTRA_INTER t8, t9, t10, t5, t6 + + LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 + mova t3, t6 ; q2 + LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 + + mova m2, t4 + mova m0, t11 + mova m1, t5 + mova m3, t8 + mova m6, t6 + + LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 + + lea r0, [r0+r1*(mmsize/2)] +%if mmsize == 8 + dec r5 +%else + lea r4, [r4+r1*(mmsize/2)] + dec r6 +%endif + jg .loop + ADD rsp, pad + RET +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX 
+DEBLOCK_LUMA mmxext +DEBLOCK_LUMA_INTRA mmxext +INIT_XMM +DEBLOCK_LUMA sse2 +DEBLOCK_LUMA_INTRA sse2 +INIT_AVX +DEBLOCK_LUMA avx +DEBLOCK_LUMA_INTRA avx +%endif diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 7d27c02ea2..42dae93f2d 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -218,45 +218,49 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] ); } -#define LF_FUNC(DIR, TYPE, OPT) \ -void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta, int8_t *tc0); -#define LF_IFUNC(DIR, TYPE, OPT) \ -void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta); +#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta, int8_t *tc0); +#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta); -LF_FUNC (h, chroma, mmxext) -LF_IFUNC(h, chroma_intra, mmxext) -LF_FUNC (v, chroma, mmxext) -LF_IFUNC(v, chroma_intra, mmxext) +#define LF_FUNCS(type, depth)\ +LF_FUNC (h, chroma, depth, mmxext)\ +LF_IFUNC(h, chroma_intra, depth, mmxext)\ +LF_FUNC (v, chroma, depth, mmxext)\ +LF_IFUNC(v, chroma_intra, depth, mmxext)\ +LF_FUNC (h, luma, depth, mmxext)\ +LF_IFUNC(h, luma_intra, depth, mmxext)\ +LF_FUNC (h, luma, depth, sse2)\ +LF_IFUNC(h, luma_intra, depth, sse2)\ +LF_FUNC (v, luma, depth, sse2)\ +LF_IFUNC(v, luma_intra, depth, sse2)\ +LF_FUNC (h, luma, depth, avx)\ +LF_IFUNC(h, luma_intra, depth, avx)\ +LF_FUNC (v, luma, depth, avx)\ +LF_IFUNC(v, luma_intra, depth, avx) -LF_FUNC (h, luma, mmxext) -LF_IFUNC(h, luma_intra, mmxext) -#if HAVE_YASM && ARCH_X86_32 -LF_FUNC (v8, luma, mmxext) -static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +LF_FUNCS( uint8_t, 8) 
+LF_FUNCS(uint16_t, 10) + +LF_FUNC (v8, luma, 8, mmxext) +static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { if((tc0[0] & tc0[1]) >= 0) - ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); + ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0); if((tc0[2] & tc0[3]) >= 0) - ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); + ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2); } -LF_IFUNC(v8, luma_intra, mmxext) -static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) +LF_IFUNC(v8, luma_intra, 8, mmxext) +static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta) { - ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta); } -#endif -LF_FUNC (h, luma, sse2) -LF_IFUNC(h, luma_intra, sse2) -LF_FUNC (v, luma, sse2) -LF_IFUNC(v, luma_intra, sse2) -LF_FUNC (h, luma, avx) -LF_IFUNC(h, luma_intra, avx) -LF_FUNC (v, luma, avx) -LF_IFUNC(v, luma_intra, avx) +LF_FUNC (v, luma, 10, mmxext) +LF_IFUNC(v, luma_intra, 10, mmxext) /***********************************/ /* weighted prediction */ @@ -318,15 +322,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct_add8 = ff_h264_idct_add8_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext; - c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext; - c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext; - c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext; + c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext; + 
c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext; #if ARCH_X86_32 - c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext; - c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; #endif c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; @@ -364,10 +368,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; #endif c->h264_idct_add16 = ff_h264_idct_add16_sse2; @@ -383,10 +387,39 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) } if (mm_flags&AV_CPU_FLAG_AVX) { #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; + 
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; +#endif + } + } + } +#endif + } else if (bit_depth == 10) { +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX2) { +#if ARCH_X86_32 + c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; +#endif + if (mm_flags&AV_CPU_FLAG_SSE2) { +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; +#endif + } + if (mm_flags&AV_CPU_FLAG_AVX) { +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; #endif } } diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm index 28baf7a96a..c963deff86 100644 --- a/libavcodec/x86/x86util.asm +++ b/libavcodec/x86/x86util.asm @@ -457,3 +457,8 @@ pshufw %1, %2, (%3)*0x55 %endif %endmacro + +%macro CLIPW 3 ;(dst, min, max) + pmaxsw %1, %2 + pminsw %1, %3 +%endmacro From 918a5409532e1218b011b5c079beb4eb5f45fdd4 Mon Sep 17 00:00:00 2001 From: Alex Converse Date: Tue, 10 May 2011 15:10:31 -0700 Subject: [PATCH 03/32] Don't allow unsupported resampling configurations. 
--- libavcodec/resample.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libavcodec/resample.c b/libavcodec/resample.c index 2185a11ad7..9f0599fb59 100644 --- a/libavcodec/resample.c +++ b/libavcodec/resample.c @@ -156,6 +156,11 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, av_log(NULL, AV_LOG_ERROR, "Resampling with input channels greater than 2 unsupported.\n"); return NULL; } + if (output_channels > 2 && !(output_channels == 6 && input_channels == 2)) { + av_log(NULL, AV_LOG_ERROR, + "Resampling output channel count must be 1 or 2 for mono input and 1, 2 or 6 for stereo input.\n"); + return NULL; + } s = av_mallocz(sizeof(ReSampleContext)); if (!s) From 3e00ababc49bc8ddd149c891199ba2d30beb3118 Mon Sep 17 00:00:00 2001 From: Alex Converse Date: Tue, 10 May 2011 14:24:05 -0700 Subject: [PATCH 04/32] Allow resampling with no channel count change for up to 8 channels. --- libavcodec/resample.c | 84 +++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/libavcodec/resample.c b/libavcodec/resample.c index 9f0599fb59..bdd32f439d 100644 --- a/libavcodec/resample.c +++ b/libavcodec/resample.c @@ -29,6 +29,8 @@ #include "libavutil/opt.h" #include "libavutil/samplefmt.h" +#define MAX_CHANNELS 8 + struct AVResampleContext; static const char *context_to_name(void *ptr) @@ -41,7 +43,7 @@ static const AVClass audioresample_context_class = { "ReSampleContext", context_ struct ReSampleContext { struct AVResampleContext *resample_context; - short *temp[2]; + short *temp[MAX_CHANNELS]; int temp_len; float ratio; /* channel convert */ @@ -104,24 +106,25 @@ static void mono_to_stereo(short *output, short *input, int n1) } } -/* XXX: should use more abstract 'N' channels system */ -static void stereo_split(short *output1, short *output2, short *input, int n) +static void deinterleave(short **output, short *input, int channels, int samples) { - int i; + int i, j; - for(i=0;i 2) + if 
(input_channels > MAX_CHANNELS) { - av_log(NULL, AV_LOG_ERROR, "Resampling with input channels greater than 2 unsupported.\n"); + av_log(NULL, AV_LOG_ERROR, + "Resampling with input channels greater than %d is unsupported.\n", + MAX_CHANNELS); return NULL; } - if (output_channels > 2 && !(output_channels == 6 && input_channels == 2)) { + if ( output_channels > 2 && + !(output_channels == 6 && input_channels == 2) && + output_channels != input_channels) { av_log(NULL, AV_LOG_ERROR, - "Resampling output channel count must be 1 or 2 for mono input and 1, 2 or 6 for stereo input.\n"); + "Resampling output channel count must be 1 or 2 for mono input; 1, 2 or 6 for stereo input; or N for N channel input.\n"); return NULL; } @@ -206,14 +213,6 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, } } -/* - * AC-3 output is the only case where filter_channels could be greater than 2. - * input channels can't be greater than 2, so resample the 2 channels and then - * expand to 6 channels after the resampling. 
- */ - if(s->filter_channels>2) - s->filter_channels = 2; - #define TAPS 16 s->resample_context= av_resample_init(output_rate, input_rate, filter_length, log2_phase_count, linear, cutoff); @@ -228,9 +227,9 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples) { int i, nb_samples1; - short *bufin[2]; - short *bufout[2]; - short *buftmp2[2], *buftmp3[2]; + short *bufin[MAX_CHANNELS]; + short *bufout[MAX_CHANNELS]; + short *buftmp2[MAX_CHANNELS], *buftmp3[MAX_CHANNELS]; short *output_bak = NULL; int lenout; @@ -291,12 +290,9 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl bufin[i]= av_malloc( (nb_samples + s->temp_len) * sizeof(short) ); memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short)); buftmp2[i] = bufin[i] + s->temp_len; + bufout[i] = av_malloc(lenout * sizeof(short)); } - /* make some zoom to avoid round pb */ - bufout[0]= av_malloc( lenout * sizeof(short) ); - bufout[1]= av_malloc( lenout * sizeof(short) ); - if (s->input_channels == 2 && s->output_channels == 1) { buftmp3[0] = output; @@ -304,10 +300,11 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl } else if (s->output_channels >= 2 && s->input_channels == 1) { buftmp3[0] = bufout[0]; memcpy(buftmp2[0], input, nb_samples*sizeof(short)); - } else if (s->output_channels >= 2) { - buftmp3[0] = bufout[0]; - buftmp3[1] = bufout[1]; - stereo_split(buftmp2[0], buftmp2[1], input, nb_samples); + } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) { + for (i = 0; i < s->input_channels; i++) { + buftmp3[i] = bufout[i]; + } + deinterleave(buftmp2, input, s->input_channels, nb_samples); } else { buftmp3[0] = output; memcpy(buftmp2[0], input, nb_samples*sizeof(short)); @@ -329,10 +326,10 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl if (s->output_channels == 2 && 
s->input_channels == 1) { mono_to_stereo(output, buftmp3[0], nb_samples1); - } else if (s->output_channels == 2) { - stereo_mux(output, buftmp3[0], buftmp3[1], nb_samples1); - } else if (s->output_channels == 6) { + } else if (s->output_channels == 6 && s->input_channels == 2) { ac3_5p1_mux(output, buftmp3[0], buftmp3[1], nb_samples1); + } else if (s->output_channels == s->input_channels && s->input_channels >= 2) { + interleave(output, buftmp3, s->output_channels, nb_samples1); } if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) { @@ -348,19 +345,20 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl } } - for(i=0; ifilter_channels; i++) + for (i = 0; i < s->filter_channels; i++) { av_free(bufin[i]); + av_free(bufout[i]); + } - av_free(bufout[0]); - av_free(bufout[1]); return nb_samples1; } void audio_resample_close(ReSampleContext *s) { + int i; av_resample_close(s->resample_context); - av_freep(&s->temp[0]); - av_freep(&s->temp[1]); + for (i = 0; i < s->filter_channels; i++) + av_freep(&s->temp[i]); av_freep(&s->buffer[0]); av_freep(&s->buffer[1]); av_audio_convert_free(s->convert_ctx[0]); From ffc437c026dd0e1b8e5d9114163b4e95999b95fd Mon Sep 17 00:00:00 2001 From: Alex Converse Date: Tue, 10 May 2011 16:58:01 -0700 Subject: [PATCH 05/32] cosmetics: Fix crazy formatting in resample. 
--- libavcodec/resample.c | 97 ++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/libavcodec/resample.c b/libavcodec/resample.c index bdd32f439d..0bebe1ab88 100644 --- a/libavcodec/resample.c +++ b/libavcodec/resample.c @@ -39,7 +39,9 @@ static const char *context_to_name(void *ptr) } static const AVOption options[] = {{NULL}}; -static const AVClass audioresample_context_class = { "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT }; +static const AVClass audioresample_context_class = { + "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT +}; struct ReSampleContext { struct AVResampleContext *resample_context; @@ -50,9 +52,9 @@ struct ReSampleContext { int input_channels, output_channels, filter_channels; AVAudioConvert *convert_ctx[2]; enum AVSampleFormat sample_fmt[2]; ///< input and output sample format - unsigned sample_size[2]; ///< size of one sample in sample_fmt - short *buffer[2]; ///< buffers used for conversion to S16 - unsigned buffer_size[2]; ///< sizes of allocated buffers + unsigned sample_size[2]; ///< size of one sample in sample_fmt + short *buffer[2]; ///< buffers used for conversion to S16 + unsigned buffer_size[2]; ///< sizes of allocated buffers }; /* n1: number of samples */ @@ -131,17 +133,17 @@ static void interleave(short *output, short **input, int channels, int samples) static void ac3_5p1_mux(short *output, short *input1, short *input2, int n) { int i; - short l,r; + short l, r; - for(i=0;i MAX_CHANNELS) - { + if (input_channels > MAX_CHANNELS) { av_log(NULL, AV_LOG_ERROR, "Resampling with input channels greater than %d is unsupported.\n", MAX_CHANNELS); return NULL; - } - if ( output_channels > 2 && + } + if (output_channels > 2 && !(output_channels == 6 && input_channels == 2) && - output_channels != input_channels) { + output_channels != input_channels) { av_log(NULL, AV_LOG_ERROR, "Resampling output channel count must be 1 or 2 for mono input; 1, 
2 or 6 for stereo input; or N for N channel input.\n"); return NULL; } s = av_mallocz(sizeof(ReSampleContext)); - if (!s) - { + if (!s) { av_log(NULL, AV_LOG_ERROR, "Can't allocate memory for resample context.\n"); return NULL; - } + } s->ratio = (float)output_rate / (float)input_rate; @@ -185,10 +185,10 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, if (s->output_channels < s->filter_channels) s->filter_channels = s->output_channels; - s->sample_fmt [0] = sample_fmt_in; - s->sample_fmt [1] = sample_fmt_out; - s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0])>>3; - s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1])>>3; + s->sample_fmt[0] = sample_fmt_in; + s->sample_fmt[1] = sample_fmt_out; + s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0]) >> 3; + s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1]) >> 3; if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) { if (!(s->convert_ctx[0] = av_audio_convert_alloc(AV_SAMPLE_FMT_S16, 1, @@ -214,8 +214,9 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, } #define TAPS 16 - s->resample_context= av_resample_init(output_rate, input_rate, - filter_length, log2_phase_count, linear, cutoff); + s->resample_context = av_resample_init(output_rate, input_rate, + filter_length, log2_phase_count, + linear, cutoff); *(const AVClass**)s->resample_context = &audioresample_context_class; @@ -244,7 +245,7 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl int ostride[1] = { 2 }; const void *ibuf[1] = { input }; void *obuf[1]; - unsigned input_size = nb_samples*s->input_channels*2; + unsigned input_size = nb_samples * s->input_channels * 2; if (!s->buffer_size[0] || s->buffer_size[0] < input_size) { av_free(s->buffer[0]); @@ -259,15 +260,16 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl obuf[0] = s->buffer[0]; if (av_audio_convert(s->convert_ctx[0], obuf, 
ostride, - ibuf, istride, nb_samples*s->input_channels) < 0) { - av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format conversion failed\n"); + ibuf, istride, nb_samples * s->input_channels) < 0) { + av_log(s->resample_context, AV_LOG_ERROR, + "Audio sample format conversion failed\n"); return 0; } - input = s->buffer[0]; + input = s->buffer[0]; } - lenout= 4*nb_samples * s->ratio + 16; + lenout = 4 * nb_samples * s->ratio + 16; if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) { output_bak = output; @@ -286,20 +288,19 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl } /* XXX: move those malloc to resample init code */ - for(i=0; ifilter_channels; i++){ - bufin[i]= av_malloc( (nb_samples + s->temp_len) * sizeof(short) ); + for (i = 0; i < s->filter_channels; i++) { + bufin[i] = av_malloc((nb_samples + s->temp_len) * sizeof(short)); memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short)); buftmp2[i] = bufin[i] + s->temp_len; bufout[i] = av_malloc(lenout * sizeof(short)); } - if (s->input_channels == 2 && - s->output_channels == 1) { + if (s->input_channels == 2 && s->output_channels == 1) { buftmp3[0] = output; stereo_to_mono(buftmp2[0], input, nb_samples); } else if (s->output_channels >= 2 && s->input_channels == 1) { buftmp3[0] = bufout[0]; - memcpy(buftmp2[0], input, nb_samples*sizeof(short)); + memcpy(buftmp2[0], input, nb_samples * sizeof(short)); } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) { for (i = 0; i < s->input_channels; i++) { buftmp3[i] = bufout[i]; @@ -307,21 +308,22 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl deinterleave(buftmp2, input, s->input_channels, nb_samples); } else { buftmp3[0] = output; - memcpy(buftmp2[0], input, nb_samples*sizeof(short)); + memcpy(buftmp2[0], input, nb_samples * sizeof(short)); } nb_samples += s->temp_len; /* resample each channel */ nb_samples1 = 0; /* avoid warning */ - for(i=0;ifilter_channels;i++) { + for 
(i = 0; i < s->filter_channels; i++) { int consumed; - int is_last= i+1 == s->filter_channels; + int is_last = i + 1 == s->filter_channels; - nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i], &consumed, nb_samples, lenout, is_last); - s->temp_len= nb_samples - consumed; - s->temp[i]= av_realloc(s->temp[i], s->temp_len*sizeof(short)); - memcpy(s->temp[i], bufin[i] + consumed, s->temp_len*sizeof(short)); + nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i], + &consumed, nb_samples, lenout, is_last); + s->temp_len = nb_samples - consumed; + s->temp[i] = av_realloc(s->temp[i], s->temp_len * sizeof(short)); + memcpy(s->temp[i], bufin[i] + consumed, s->temp_len * sizeof(short)); } if (s->output_channels == 2 && s->input_channels == 1) { @@ -339,8 +341,9 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl void *obuf[1] = { output_bak }; if (av_audio_convert(s->convert_ctx[1], obuf, ostride, - ibuf, istride, nb_samples1*s->output_channels) < 0) { - av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format convertion failed\n"); + ibuf, istride, nb_samples1 * s->output_channels) < 0) { + av_log(s->resample_context, AV_LOG_ERROR, + "Audio sample format convertion failed\n"); return 0; } } From 91199cfe55b1398b23a16b1f55df75e62e05198b Mon Sep 17 00:00:00 2001 From: Benjamin Larsson Date: Tue, 8 Mar 2011 15:29:46 +0100 Subject: [PATCH 06/32] ffplay: add a dummy option -i so that it is easy to switch between ffmpeg -i "file" and ffplay -i "file". 
Signed-off-by: Anton Khirnov --- cmdutils.c | 4 ++-- ffplay.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmdutils.c b/cmdutils.c index f95778822e..feeea694d8 100644 --- a/cmdutils.c +++ b/cmdutils.c @@ -281,8 +281,8 @@ unknown_opt: fprintf(stderr, "%s: failed to set value '%s' for option '%s'\n", argv[0], arg, opt); exit(1); } - } else { - po->u.func_arg(arg); + } else if (po->u.func_arg) { + po->u.func_arg(arg); } if(po->flags & OPT_EXIT) exit(0); diff --git a/ffplay.c b/ffplay.c index 11b307eaca..07727b667a 100644 --- a/ffplay.c +++ b/ffplay.c @@ -3019,6 +3019,7 @@ static const OptionDef options[] = { #endif { "rdftspeed", OPT_INT | HAS_ARG| OPT_AUDIO | OPT_EXPERT, {(void*)&rdftspeed}, "rdft speed", "msecs" }, { "default", OPT_FUNC2 | HAS_ARG | OPT_AUDIO | OPT_VIDEO | OPT_EXPERT, {(void*)opt_default}, "generic catch all option", "" }, + { "i", 0, {NULL}, "ffmpeg compatibility dummy option", ""}, { NULL, }, }; From b568d6d94bda607e4ebb35be68181a8c2a9f5c50 Mon Sep 17 00:00:00 2001 From: Stefano Sabatini Date: Sat, 26 Mar 2011 15:26:45 +0100 Subject: [PATCH 07/32] ffmpeg: warns the user when the selected pixel format is ignored Signed-off-by: Anton Khirnov --- ffmpeg.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ffmpeg.c b/ffmpeg.c index 74cfd095d0..612a413077 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -590,8 +590,15 @@ static void choose_pixel_fmt(AVStream *st, AVCodec *codec) if(*p == st->codec->pix_fmt) break; } - if(*p == -1) + if (*p == -1) { + if(st->codec->pix_fmt != PIX_FMT_NONE) + av_log(NULL, AV_LOG_WARNING, + "Incompatible pixel format '%s' for codec '%s', auto-selecting format '%s'\n", + av_pix_fmt_descriptors[st->codec->pix_fmt].name, + codec->name, + av_pix_fmt_descriptors[codec->pix_fmts[0]].name); st->codec->pix_fmt = codec->pix_fmts[0]; + } } } From 2ecc5b70fb53d0e2e74b51d1d598af8c842afc68 Mon Sep 17 00:00:00 2001 From: Stefano Sabatini Date: Sat, 16 Apr 2011 22:58:13 +0200 Subject: [PATCH 08/32] 
ffmpeg: improve reporting if size/pixel format changes Use av_log() rather than fprintf(stderr, ...), and show information related to the previous size/pixel format configuration. Consistent with the corresponding message issued in case of audio configuration change. Signed-off-by: Stefano Sabatini Signed-off-by: Anton Khirnov --- ffmpeg.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ffmpeg.c b/ffmpeg.c index 612a413077..89755863e4 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -1129,8 +1129,11 @@ static void do_video_out(AVFormatContext *s, if ( ost->resample_height != ist->st->codec->height || ost->resample_width != ist->st->codec->width || (ost->resample_pix_fmt!= ist->st->codec->pix_fmt) ) { - - fprintf(stderr,"Input Stream #%d.%d frame size changed to %dx%d, %s\n", ist->file_index, ist->index, ist->st->codec->width, ist->st->codec->height,avcodec_get_pix_fmt_name(ist->st->codec->pix_fmt)); + av_log(NULL, AV_LOG_INFO, + "Input stream #%d.%d frame changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s\n", + ist->file_index, ist->index, + ost->resample_width , ost->resample_height , avcodec_get_pix_fmt_name(ost->resample_pix_fmt), + ist->st->codec->width, ist->st->codec->height, avcodec_get_pix_fmt_name(ist->st->codec->pix_fmt)); if(!ost->video_resample) ffmpeg_exit(1); } From c29c2eea8fb35682fdfdcb64c4890e8a25137b2a Mon Sep 17 00:00:00 2001 From: Stefano Sabatini Date: Sun, 17 Apr 2011 01:38:09 +0200 Subject: [PATCH 09/32] ffmpeg: prefer "dec" over "ist->st->codec" in do_video_out() snippet Simplify, ease readability. 
Signed-off-by: Stefano Sabatini Signed-off-by: Anton Khirnov --- ffmpeg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ffmpeg.c b/ffmpeg.c index 89755863e4..a5d877a022 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -1132,8 +1132,8 @@ static void do_video_out(AVFormatContext *s, av_log(NULL, AV_LOG_INFO, "Input stream #%d.%d frame changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s\n", ist->file_index, ist->index, - ost->resample_width , ost->resample_height , avcodec_get_pix_fmt_name(ost->resample_pix_fmt), - ist->st->codec->width, ist->st->codec->height, avcodec_get_pix_fmt_name(ist->st->codec->pix_fmt)); + ost->resample_width, ost->resample_height, avcodec_get_pix_fmt_name(ost->resample_pix_fmt), + dec->width , dec->height , avcodec_get_pix_fmt_name(dec->pix_fmt)); if(!ost->video_resample) ffmpeg_exit(1); } From 9aa797cd2873562e85d04ea45ef7f49ad2cb07b9 Mon Sep 17 00:00:00 2001 From: Stefano Sabatini Date: Sat, 16 Apr 2011 23:11:01 +0200 Subject: [PATCH 10/32] ffmpeg: factorize resampling condition check in do_video_out() Simplify and improve readability. 
Signed-off-by: Stefano Sabatini Signed-off-by: Anton Khirnov --- ffmpeg.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ffmpeg.c b/ffmpeg.c index a5d877a022..647018c083 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -1078,7 +1078,7 @@ static void do_video_out(AVFormatContext *s, AVFrame *in_picture, int *frame_size) { - int nb_frames, i, ret; + int nb_frames, i, ret, resample_changed; AVFrame *final_picture, *formatted_picture, *resampling_dst; AVCodecContext *enc, *dec; double sync_ipts; @@ -1126,9 +1126,11 @@ static void do_video_out(AVFormatContext *s, final_picture = formatted_picture; resampling_dst = &ost->pict_tmp; - if ( ost->resample_height != ist->st->codec->height - || ost->resample_width != ist->st->codec->width - || (ost->resample_pix_fmt!= ist->st->codec->pix_fmt) ) { + resample_changed = ost->resample_width != dec->width || + ost->resample_height != dec->height || + ost->resample_pix_fmt != dec->pix_fmt; + + if (resample_changed) { av_log(NULL, AV_LOG_INFO, "Input stream #%d.%d frame changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s\n", ist->file_index, ist->index, @@ -1141,10 +1143,7 @@ static void do_video_out(AVFormatContext *s, #if !CONFIG_AVFILTER if (ost->video_resample) { final_picture = &ost->pict_tmp; - if( ost->resample_height != ist->st->codec->height - || ost->resample_width != ist->st->codec->width - || (ost->resample_pix_fmt!= ist->st->codec->pix_fmt) ) { - + if (resample_changed) { /* initialize a new scaler context */ sws_freeContext(ost->img_resample_ctx); ost->img_resample_ctx = sws_getContext( From 2b95602e93226bd269676b0edcda5322b5be8444 Mon Sep 17 00:00:00 2001 From: Stefano Sabatini Date: Sat, 16 Apr 2011 23:18:22 +0200 Subject: [PATCH 11/32] ffmpeg: reformat resample condition code in transcode() Signed-off-by: Stefano Sabatini Signed-off-by: Anton Khirnov --- ffmpeg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ffmpeg.c b/ffmpeg.c index 647018c083..738fb7b922 100644 
--- a/ffmpeg.c +++ b/ffmpeg.c @@ -2171,9 +2171,9 @@ static int transcode(AVFormatContext **output_files, fprintf(stderr, "Video pixel format is unknown, stream cannot be encoded\n"); ffmpeg_exit(1); } - ost->video_resample = (codec->width != icodec->width || - codec->height != icodec->height || - (codec->pix_fmt != icodec->pix_fmt)); + ost->video_resample = codec->width != icodec->width || + codec->height != icodec->height || + codec->pix_fmt != icodec->pix_fmt; if (ost->video_resample) { #if !CONFIG_AVFILTER avcodec_get_frame_defaults(&ost->pict_tmp); From 3fd62c6e247468d792ce8f1d3c458017d1ea9eb5 Mon Sep 17 00:00:00 2001 From: Stefano Sabatini Date: Wed, 20 Apr 2011 13:13:09 +0200 Subject: [PATCH 12/32] ffmpeg: call pre_process_video_frame() only if decoding is needed In output_packet(), move the pre_process_video_frame() call inside the if (ist->decoding_needed) { } block. This way pre_process_video_frame() is not called when stream-copy has been selected. Also simplify. Signed-off-by: Stefano Sabatini Signed-off-by: Anton Khirnov --- ffmpeg.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ffmpeg.c b/ffmpeg.c index 738fb7b922..5ab3c7a508 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -1428,7 +1428,7 @@ static int output_packet(AVInputStream *ist, int ist_index, int ret, i; int got_output; AVFrame picture; - void *buffer_to_free; + void *buffer_to_free = NULL; static unsigned int samples_size= 0; AVSubtitle subtitle, *subtitle_to_free; int64_t pkt_pts = AV_NOPTS_VALUE; @@ -1530,6 +1530,8 @@ static int output_packet(AVInputStream *ist, int ist_index, ist->st->codec->time_base.den; } avpkt.size = 0; + buffer_to_free = NULL; + pre_process_video_frame(ist, (AVPicture *)&picture, &buffer_to_free); break; case AVMEDIA_TYPE_SUBTITLE: ret = avcodec_decode_subtitle2(ist->st->codec, @@ -1564,12 +1566,6 @@ static int output_packet(AVInputStream *ist, int ist_index, avpkt.size = 0; } - buffer_to_free = NULL; - if (ist->st->codec->codec_type == 
AVMEDIA_TYPE_VIDEO) { - pre_process_video_frame(ist, (AVPicture *)&picture, - &buffer_to_free); - } - #if CONFIG_AVFILTER if (ist->st->codec->codec_type == AVMEDIA_TYPE_VIDEO && ist->input_video_filter) { AVRational sar; From 0b4949b51816bc2fd23ba4c4de183b877b58aa25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 9 May 2011 20:11:16 +0300 Subject: [PATCH 13/32] rtsp: Only do keepalive using GET_PARAMETER if the server supports it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is more like what VLC does. If the server doesn't mention supporting GET_PARAMETER in response to an OPTIONS request, VLC doesn't send any keepalive requests at all. After this patch, libavformat will still send OPTIONS keepalives if GET_PARAMETER isn't explicitly said to be supported. Some RTSP cameras don't support GET_PARAMETER, and will close the connection if this is sent as keepalive request (but support OPTIONS just fine, but probably don't need any keepalive at all). Some other cameras don't support using OPTIONS as keepalive, but require GET_PARAMETER instead. 
Signed-off-by: Martin Storsjö --- libavformat/rtsp.c | 4 ++++ libavformat/rtsp.h | 5 +++++ libavformat/rtspdec.c | 4 +++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c index 14111e69fd..2ebf7e0510 100644 --- a/libavformat/rtsp.c +++ b/libavformat/rtsp.c @@ -808,6 +808,10 @@ void ff_rtsp_parse_line(RTSPMessageHeader *reply, const char *buf, p += strspn(p, SPACE_CHARS); if (method && !strcmp(method, "PLAY")) rtsp_parse_rtp_info(rt, p); + } else if (av_stristart(p, "Public:", &p) && rt) { + if (strstr(p, "GET_PARAMETER") && + method && !strcmp(method, "OPTIONS")) + rt->get_parameter_supported = 1; } } diff --git a/libavformat/rtsp.h b/libavformat/rtsp.h index e1f1df990e..ff66502626 100644 --- a/libavformat/rtsp.h +++ b/libavformat/rtsp.h @@ -331,6 +331,11 @@ typedef struct RTSPState { * Polling array for udp */ struct pollfd *p; + + /** + * Whether the server supports the GET_PARAMETER method. + */ + int get_parameter_supported; } RTSPState; /** diff --git a/libavformat/rtspdec.c b/libavformat/rtspdec.c index 866f313d10..ccfc4d8e27 100644 --- a/libavformat/rtspdec.c +++ b/libavformat/rtspdec.c @@ -341,7 +341,9 @@ retry: /* send dummy request to keep TCP connection alive */ if ((av_gettime() - rt->last_cmd_time) / 1000000 >= rt->timeout / 2) { - if (rt->server_type != RTSP_SERVER_REAL) { + if (rt->server_type == RTSP_SERVER_WMS || + (rt->server_type != RTSP_SERVER_REAL && + rt->get_parameter_supported)) { ff_rtsp_send_cmd_async(s, "GET_PARAMETER", rt->control_uri, NULL); } else { ff_rtsp_send_cmd_async(s, "OPTIONS", "*", NULL); From f628559d9a600159576faff9735e069479c9d361 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 9 May 2011 11:45:13 +0200 Subject: [PATCH 14/32] rotozoom: K&R coding style cosmetics --- tests/rotozoom.c | 199 ++++++++++++++++++++++++----------------------- 1 file changed, 102 insertions(+), 97 deletions(-) diff --git a/tests/rotozoom.c b/tests/rotozoom.c index 505072c4f9..03e9afb86f 
100644 --- a/tests/rotozoom.c +++ b/tests/rotozoom.c @@ -24,47 +24,52 @@ #include #include -#define FIXP (1<<16) -#define MY_PI 205887 //(M_PI*FIX) +#define FIXP (1 << 16) +#define MY_PI 205887 //(M_PI * FIX) -static int64_t int_pow(int64_t a, int p){ - int64_t v= FIXP; +static int64_t int_pow(int64_t a, int p) +{ + int64_t v = FIXP; - for(; p; p--){ - v*= a; - v/= FIXP; + for (; p; p--) { + v *= a; + v /= FIXP; } return v; } -static int64_t int_sin(int64_t a){ - if(a<0) a= MY_PI-a; // 0..inf - a %= 2*MY_PI; // 0..2PI +static int64_t int_sin(int64_t a) +{ + if (a < 0) + a = MY_PI - a; // 0..inf + a %= 2 * MY_PI; // 0..2PI - if(a>=MY_PI*3/2) a -= 2*MY_PI; // -PI/2 .. 3PI/2 - if(a>=MY_PI/2 ) a = MY_PI - a; // -PI/2 .. PI/2 + if (a >= MY_PI * 3 / 2) + a -= 2 * MY_PI; // -PI / 2 .. 3PI / 2 + if (a >= MY_PI /2) + a = MY_PI - a; // -PI / 2 .. PI / 2 - return a - int_pow(a, 3)/6 + int_pow(a, 5)/120 - int_pow(a, 7)/5040; + return a - int_pow(a, 3) / 6 + int_pow(a, 5) / 120 - int_pow(a, 7) / 5040; } #define SCALEBITS 8 #define ONE_HALF (1 << (SCALEBITS - 1)) -#define FIX(x) ((int) ((x) * (1L<> SCALEBITS; - p += wrap3; + p += wrap3; lum += wrap; r = p[0]; @@ -104,14 +109,14 @@ static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - - FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; + FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; cb++; cr++; - p += -wrap3 + 2 * 3; - lum += -wrap + 2; + p += -wrap3 + 2 * 3; + lum += -wrap + 2; } - p += wrap3; + p += wrap3; lum += wrap; } } @@ -119,7 +124,7 @@ static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, /* cif format */ #define DEFAULT_WIDTH 352 #define DEFAULT_HEIGHT 288 -#define DEFAULT_NB_PICT 50 +#define DEFAULT_NB_PICT 50 static void pgmyuv_save(const char *filename, int w, int h, unsigned char *rgb_tab) @@ -130,19 
+135,19 @@ static void pgmyuv_save(const char *filename, int w, int h, unsigned char *lum_tab, *cb_tab, *cr_tab; lum_tab = malloc(w * h); - cb_tab = malloc((w * h) / 4); - cr_tab = malloc((w * h) / 4); + cb_tab = malloc((w * h) / 4); + cr_tab = malloc((w * h) / 4); rgb24_to_yuv420p(lum_tab, cb_tab, cr_tab, rgb_tab, w, h); - f = fopen(filename,"wb"); + f = fopen(filename, "wb"); fprintf(f, "P5\n%d %d\n%d\n", w, (h * 3) / 2, 255); fwrite(lum_tab, 1, w * h, f); h2 = h / 2; w2 = w / 2; cb = cb_tab; cr = cr_tab; - for(i=0;i>16; - int int_y= y>>16; - int frac_x= x&0xFFFF; - int frac_y= y&0xFFFF; - int s00= src[ ( int_x &255) + 256*( int_y &255) ]; - int s01= src[ ((int_x+1)&255) + 256*( int_y &255) ]; - int s10= src[ ( int_x &255) + 256*((int_y+1)&255) ]; - int s11= src[ ((int_x+1)&255) + 256*((int_y+1)&255) ]; - int s0= (((1<<16) - frac_x)*s00 + frac_x*s01)>>8; - int s1= (((1<<16) - frac_x)*s10 + frac_x*s11)>>8; +static int ipol(uint8_t *src, int x, int y) +{ + int int_x = x >> 16; + int int_y = y >> 16; + int frac_x = x & 0xFFFF; + int frac_y = y & 0xFFFF; + int s00 = src[( int_x & 255) + 256 * ( int_y & 255)]; + int s01 = src[((int_x + 1) & 255) + 256 * ( int_y & 255)]; + int s10 = src[( int_x & 255) + 256 * ((int_y + 1) & 255)]; + int s11 = src[((int_x + 1) & 255) + 256 * ((int_y + 1) & 255)]; + int s0 = (((1 << 16) - frac_x) * s00 + frac_x * s01) >> 8; + int s1 = (((1 << 16) - frac_x) * s10 + frac_x * s11) >> 8; - return (((1<<16) - frac_y)*s0 + frac_y*s1)>>24; + return (((1 << 16) - frac_y) * s0 + frac_y * s1) >> 24; } static void gen_image(int num, int w, int h) { - const int c = h_cos [num % 360]; - const int s = h_sin [num % 360]; + const int c = h_cos [num % 360]; + const int s = h_sin [num % 360]; - const int xi = -(w/2) * c; - const int yi = (w/2) * s; + const int xi = -(w / 2) * c; + const int yi = (w / 2) * s; - const int xj = -(h/2) * s; - const int yj = -(h/2) * c; - int i,j; + const int xj = -(h / 2) * s; + const int yj = -(h / 2) * c; + int i, j; - int 
x,y; - int xprime = xj; - int yprime = yj; + int x, y; + int xprime = xj; + int yprime = yj; + for (j = 0; j < h; j++) { + x = xprime + xi + FIXP * w / 2; + xprime += s; - for (j=0;j Date: Mon, 9 May 2011 12:23:55 +0200 Subject: [PATCH 15/32] rotozoom: Drop some unnecessary parentheses. --- tests/rotozoom.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/rotozoom.c b/tests/rotozoom.c index 03e9afb86f..209ed027d1 100644 --- a/tests/rotozoom.c +++ b/tests/rotozoom.c @@ -135,13 +135,13 @@ static void pgmyuv_save(const char *filename, int w, int h, unsigned char *lum_tab, *cb_tab, *cr_tab; lum_tab = malloc(w * h); - cb_tab = malloc((w * h) / 4); - cr_tab = malloc((w * h) / 4); + cb_tab = malloc(w * h / 4); + cr_tab = malloc(w * h / 4); rgb24_to_yuv420p(lum_tab, cb_tab, cr_tab, rgb_tab, w, h); f = fopen(filename, "wb"); - fprintf(f, "P5\n%d %d\n%d\n", w, (h * 3) / 2, 255); + fprintf(f, "P5\n%d %d\n%d\n", w, h * 3 / 2, 255); fwrite(lum_tab, 1, w * h, f); h2 = h / 2; w2 = w / 2; @@ -264,8 +264,8 @@ static void init_demo(const char *filename) for (i = 0; i < 360; i++) { radian = 2 * i * MY_PI / 360; h = 2 * FIXP + int_sin (radian); - h_cos[i] = (h * int_sin(radian + MY_PI / 2)) / 2 / FIXP; - h_sin[i] = (h * int_sin(radian) ) / 2 / FIXP; + h_cos[i] = h * int_sin(radian + MY_PI / 2) / 2 / FIXP; + h_sin[i] = h * int_sin(radian) / 2 / FIXP; } } From 5a37c12c82323c0b1f06cf4b8030bcabb554765d Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 9 May 2011 12:26:00 +0200 Subject: [PATCH 16/32] rotozoom: Drop silly UINT8 typedef. 
--- tests/rotozoom.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/rotozoom.c b/tests/rotozoom.c index 209ed027d1..642cefa56f 100644 --- a/tests/rotozoom.c +++ b/tests/rotozoom.c @@ -56,14 +56,14 @@ static int64_t int_sin(int64_t a) #define SCALEBITS 8 #define ONE_HALF (1 << (SCALEBITS - 1)) #define FIX(x) ((int) ((x) * (1L << SCALEBITS) + 0.5)) -typedef unsigned char UINT8; -static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, - UINT8 *src, int width, int height) +static void rgb24_to_yuv420p(unsigned char *lum, unsigned char *cb, + unsigned char *cr, unsigned char *src, + int width, int height) { int wrap, wrap3, x, y; int r, g, b, r1, g1, b1; - UINT8 *p; + unsigned char *p; wrap = width; wrap3 = width * 3; From 2131e8590c447575a1c23bbc9f7e0bf9592d8997 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 9 May 2011 12:33:45 +0200 Subject: [PATCH 17/32] rotozoom: Make init_demo() return int and check for errors on invocation. --- tests/rotozoom.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/rotozoom.c b/tests/rotozoom.c index 642cefa56f..aebcc8e74a 100644 --- a/tests/rotozoom.c +++ b/tests/rotozoom.c @@ -234,7 +234,7 @@ static void gen_image(int num, int w, int h) #define W 256 #define H 256 -static void init_demo(const char *filename) +static int init_demo(const char *filename) { int i, j; int h; @@ -246,7 +246,7 @@ static void init_demo(const char *filename) fichier = fopen(filename, "rb"); if (!fichier) { perror(filename); - exit(1); + return 1; } fread(line, 1, 15, fichier); @@ -267,6 +267,8 @@ static void init_demo(const char *filename) h_cos[i] = h * int_sin(radian + MY_PI / 2) / 2 / FIXP; h_sin[i] = h * int_sin(radian) / 2 / FIXP; } + + return 0; } int main(int argc, char **argv) @@ -288,7 +290,8 @@ int main(int argc, char **argv) width = w; height = h; - init_demo(argv[2]); + if (init_demo(argv[2])) + return 1; for (i = 0; i < DEFAULT_NB_PICT; i++) { snprintf(buf, 
sizeof(buf), "%s%02d.pgm", argv[1], i); From 771339ca206468636a64a6041852068be2da3dd2 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 9 May 2011 11:16:55 +0200 Subject: [PATCH 18/32] rotozoom: Return an error value instead of calling exit(). --- tests/rotozoom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rotozoom.c b/tests/rotozoom.c index aebcc8e74a..644ae8b1eb 100644 --- a/tests/rotozoom.c +++ b/tests/rotozoom.c @@ -279,7 +279,7 @@ int main(int argc, char **argv) if (argc != 3) { printf("usage: %s directory/ image.pnm\n" "generate a test video stream\n", argv[0]); - exit(1); + return 1; } w = DEFAULT_WIDTH; From cbb0930f0ebdb2655296d7ae4424ee922168c5b7 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 9 May 2011 12:42:15 +0200 Subject: [PATCH 19/32] rotozoom: Check return value of fread(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the warnings: tests/rotozoom.c:252: warning: ignoring return value of ‘fread’, declared with attribute warn_unused_result tests/rotozoom.c:254: warning: ignoring return value of ‘fread’, declared with attribute warn_unused_result --- tests/rotozoom.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/rotozoom.c b/tests/rotozoom.c index 644ae8b1eb..d61ce21c10 100644 --- a/tests/rotozoom.c +++ b/tests/rotozoom.c @@ -249,9 +249,11 @@ static int init_demo(const char *filename) return 1; } - fread(line, 1, 15, fichier); + if (fread(line, 1, 15, fichier) != 15) + return 1; for (i = 0; i < H; i++) { - fread(line, 1, 3 * W, fichier); + if (fread(line, 1, 3 * W, fichier) != 3 * W) + return 1; for (j = 0; j < W; j++) { tab_r[W * i + j] = line[3 * j ]; tab_g[W * i + j] = line[3 * j + 1]; From e1e0ca70eb0cd469b4ba757e3f72e7540ef04056 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 9 May 2011 13:00:04 +0200 Subject: [PATCH 20/32] rotozoom: Eliminate French variable name. 
--- tests/rotozoom.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/rotozoom.c b/tests/rotozoom.c index d61ce21c10..ff817acc3b 100644 --- a/tests/rotozoom.c +++ b/tests/rotozoom.c @@ -241,18 +241,18 @@ static int init_demo(const char *filename) int radian; char line[3 * W]; - FILE *fichier; + FILE *input_file; - fichier = fopen(filename, "rb"); - if (!fichier) { + input_file = fopen(filename, "rb"); + if (!input_file) { perror(filename); return 1; } - if (fread(line, 1, 15, fichier) != 15) + if (fread(line, 1, 15, input_file) != 15) return 1; for (i = 0; i < H; i++) { - if (fread(line, 1, 3 * W, fichier) != 3 * W) + if (fread(line, 1, 3 * W, input_file) != 3 * W) return 1; for (j = 0; j < W; j++) { tab_r[W * i + j] = line[3 * j ]; @@ -260,7 +260,7 @@ static int init_demo(const char *filename) tab_b[W * i + j] = line[3 * j + 2]; } } - fclose(fichier); + fclose(input_file); /* tables sin/cos */ for (i = 0; i < 360; i++) { From 2caf19e90f270abe1e80a3e85acaf0eb5c9d0aac Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 10 May 2011 20:23:08 -0400 Subject: [PATCH 21/32] h264pred: fix aliasing violations. Tested to fix Haiku H264/10bit fate failures, may also fix others. 
--- libavcodec/h264pred_template.c | 163 ++++++++++++++++++--------------- libavcodec/high_bit_depth.h | 3 + 2 files changed, 93 insertions(+), 73 deletions(-) diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c index 066e837cdf..c600133346 100644 --- a/libavcodec/h264pred_template.c +++ b/libavcodec/h264pred_template.c @@ -31,20 +31,21 @@ static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){ pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); - const pixel4 a= ((pixel4*)(src-stride))[0]; - ((pixel4*)(src+0*stride))[0]= a; - ((pixel4*)(src+1*stride))[0]= a; - ((pixel4*)(src+2*stride))[0]= a; - ((pixel4*)(src+3*stride))[0]= a; + const pixel4 a= AV_RN4PA(src-stride); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); } static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){ pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); - ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]); - ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]); - ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]); - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]); + AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride])); + AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride])); + AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride])); + AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride])); } static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ @@ -52,60 +53,69 @@ static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _strid int stride = _stride/sizeof(pixel); const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; + const pixel4 a = PIXEL_SPLAT_X4(dc); - ((pixel4*)(src+0*stride))[0]= - 
((pixel4*)(src+1*stride))[0]= - ((pixel4*)(src+2*stride))[0]= - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); } static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; + const pixel4 a = PIXEL_SPLAT_X4(dc); - ((pixel4*)(src+0*stride))[0]= - ((pixel4*)(src+1*stride))[0]= - ((pixel4*)(src+2*stride))[0]= - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); } static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; + const pixel4 a = PIXEL_SPLAT_X4(dc); - ((pixel4*)(src+0*stride))[0]= - ((pixel4*)(src+1*stride))[0]= - ((pixel4*)(src+2*stride))[0]= - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc); + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); } static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); - ((pixel4*)(src+0*stride))[0]= - ((pixel4*)(src+1*stride))[0]= - ((pixel4*)(src+2*stride))[0]= - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)); + const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); } static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); - 
((pixel4*)(src+0*stride))[0]= - ((pixel4*)(src+1*stride))[0]= - ((pixel4*)(src+2*stride))[0]= - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1); + const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); } static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){ pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); - ((pixel4*)(src+0*stride))[0]= - ((pixel4*)(src+1*stride))[0]= - ((pixel4*)(src+2*stride))[0]= - ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1); + const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); } @@ -286,16 +296,16 @@ static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){ int i; pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); - const pixel4 a = ((pixel4*)(src-stride))[0]; - const pixel4 b = ((pixel4*)(src-stride))[1]; - const pixel4 c = ((pixel4*)(src-stride))[2]; - const pixel4 d = ((pixel4*)(src-stride))[3]; + const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0); + const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1); + const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2); + const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3); for(i=0; i<16; i++){ - ((pixel4*)(src+i*stride))[0] = a; - ((pixel4*)(src+i*stride))[1] = b; - ((pixel4*)(src+i*stride))[2] = c; - ((pixel4*)(src+i*stride))[3] = d; + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, b); + AV_WN4PA(((pixel4*)(src+i*stride))+2, c); + AV_WN4PA(((pixel4*)(src+i*stride))+3, d); } } @@ -305,19 +315,21 @@ static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){ stride /= sizeof(pixel); for(i=0; i<16; i++){ - ((pixel4*)(src+i*stride))[0] = - ((pixel4*)(src+i*stride))[1] = - ((pixel4*)(src+i*stride))[2] = - 
((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]); + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); + + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, a); + AV_WN4PA(((pixel4*)(src+i*stride))+2, a); + AV_WN4PA(((pixel4*)(src+i*stride))+3, a); } } #define PREDICT_16x16_DC(v)\ for(i=0; i<16; i++){\ - AV_WN4P(src+ 0, v);\ - AV_WN4P(src+ 4, v);\ - AV_WN4P(src+ 8, v);\ - AV_WN4P(src+12, v);\ + AV_WN4PA(src+ 0, v);\ + AV_WN4PA(src+ 4, v);\ + AV_WN4PA(src+ 8, v);\ + AV_WN4PA(src+12, v);\ src += stride;\ } @@ -432,12 +444,12 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){ int i; pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); - const pixel4 a= ((pixel4*)(src-stride))[0]; - const pixel4 b= ((pixel4*)(src-stride))[1]; + const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0); + const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1); for(i=0; i<8; i++){ - ((pixel4*)(src+i*stride))[0]= a; - ((pixel4*)(src+i*stride))[1]= b; + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, b); } } @@ -447,19 +459,21 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){ stride /= sizeof(pixel); for(i=0; i<8; i++){ - ((pixel4*)(src+i*stride))[0]= - ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]); + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, a); } } #define PRED8x8_X(n, v)\ static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\ int i;\ + const pixel4 a = PIXEL_SPLAT_X4(v);\ pixel *src = (pixel*)_src;\ stride /= sizeof(pixel);\ for(i=0; i<8; i++){\ - ((pixel4*)(src+i*stride))[0]=\ - ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\ + AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\ + AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\ }\ } @@ -483,12 +497,12 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){ dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); 
for(i=0; i<4; i++){ - ((pixel4*)(src+i*stride))[0]= - ((pixel4*)(src+i*stride))[1]= dc0splat; + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat); } for(i=4; i<8; i++){ - ((pixel4*)(src+i*stride))[0]= - ((pixel4*)(src+i*stride))[1]= dc2splat; + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat); } } @@ -508,12 +522,12 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){ dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); for(i=0; i<4; i++){ - ((pixel4*)(src+i*stride))[0]= dc0splat; - ((pixel4*)(src+i*stride))[1]= dc1splat; + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); } for(i=4; i<8; i++){ - ((pixel4*)(src+i*stride))[0]= dc0splat; - ((pixel4*)(src+i*stride))[1]= dc1splat; + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); } } @@ -536,12 +550,12 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){ dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); for(i=0; i<4; i++){ - ((pixel4*)(src+i*stride))[0]= dc0splat; - ((pixel4*)(src+i*stride))[1]= dc1splat; + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); } for(i=4; i<8; i++){ - ((pixel4*)(src+i*stride))[0]= dc2splat; - ((pixel4*)(src+i*stride))[1]= dc3splat; + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat); } } @@ -636,8 +650,8 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){ #define PREDICT_8x8_DC(v) \ int y; \ for( y = 0; y < 8; y++ ) { \ - ((pixel4*)src)[0] = \ - ((pixel4*)src)[1] = v; \ + AV_WN4PA(((pixel4*)src)+0, v); \ + AV_WN4PA(((pixel4*)src)+1, v); \ src += stride; \ } @@ -693,6 +707,7 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_top int y; pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); + pixel4 a, b; PREDICT_8x8_LOAD_TOP; src[0] = t0; @@ 
-703,9 +718,11 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_top src[5] = t5; src[6] = t6; src[7] = t7; + a = AV_RN4PA(((pixel4*)src)+0); + b = AV_RN4PA(((pixel4*)src)+1); for( y = 1; y < 8; y++ ) { - ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0]; - ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1]; + AV_WN4PA(((pixel4*)(src+y*stride))+0, a); + AV_WN4PA(((pixel4*)(src+y*stride))+1, b); } } static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride) diff --git a/libavcodec/high_bit_depth.h b/libavcodec/high_bit_depth.h index 6f2b6a74f4..511cd00f3a 100644 --- a/libavcodec/high_bit_depth.h +++ b/libavcodec/high_bit_depth.h @@ -14,6 +14,7 @@ # undef rnd_avg_pixel4 # undef AV_RN2P # undef AV_RN4P +# undef AV_RN4PA # undef AV_WN2P # undef AV_WN4P # undef AV_WN4PA @@ -46,6 +47,7 @@ CLIP_PIXEL(10) # define rnd_avg_pixel4 rnd_avg64 # define AV_RN2P AV_RN32 # define AV_RN4P AV_RN64 +# define AV_RN4PA AV_RN64A # define AV_WN2P AV_WN32 # define AV_WN4P AV_WN64 # define AV_WN4PA AV_WN64A @@ -61,6 +63,7 @@ CLIP_PIXEL(10) # define rnd_avg_pixel4 rnd_avg32 # define AV_RN2P AV_RN16 # define AV_RN4P AV_RN32 +# define AV_RN4PA AV_RN32A # define AV_WN2P AV_WN16 # define AV_WN4P AV_WN32 # define AV_WN4PA AV_WN32A From bea705752d6448578482cfa022944a9795388f14 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Wed, 11 May 2011 12:22:28 +0200 Subject: [PATCH 22/32] Remove unused softfloat implementation. The softfloat functionality is unused, not installed and incomplete. On platforms without floating point units, the compiler provides a softfloat implementation so there is no point in carrying this code around locally. 
--- doc/avutil.txt | 1 - libavutil/Makefile | 2 +- libavutil/softfloat.c | 72 ------------------------ libavutil/softfloat.h | 126 ------------------------------------------ 4 files changed, 1 insertion(+), 200 deletions(-) delete mode 100644 libavutil/softfloat.c delete mode 100644 libavutil/softfloat.h diff --git a/doc/avutil.txt b/doc/avutil.txt index 210bd07264..0847683d1d 100644 --- a/doc/avutil.txt +++ b/doc/avutil.txt @@ -19,7 +19,6 @@ integer.c 128bit integer math lls.c mathematics.c greatest common divisor, integer sqrt, integer log2, ... mem.c memory allocation routines with guaranteed alignment -softfloat.c Headers: bswap.h big/little/native-endian conversion code diff --git a/libavutil/Makefile b/libavutil/Makefile index 8c3cc60dab..1386ebb190 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -75,7 +75,7 @@ OBJS-$(ARCH_ARM) += arm/cpu.o OBJS-$(ARCH_PPC) += ppc/cpu.o OBJS-$(ARCH_X86) += x86/cpu.o -TESTPROGS = adler32 aes base64 cpu crc des lls md5 pca sha softfloat tree +TESTPROGS = adler32 aes base64 cpu crc des lls md5 pca sha tree TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo DIRS = arm bfin sh4 x86 diff --git a/libavutil/softfloat.c b/libavutil/softfloat.c deleted file mode 100644 index 55969fb141..0000000000 --- a/libavutil/softfloat.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include -#include -#include "softfloat.h" -#include "common.h" -#include "log.h" - -#undef printf - -int main(void){ - SoftFloat one= av_int2sf(1, 0); - SoftFloat sf1, sf2; - double d1, d2; - int i, j; - av_log_set_level(AV_LOG_DEBUG); - - d1= 1; - for(i= 0; i<10; i++){ - d1= 1/(d1+1); - } - printf("test1 double=%d\n", (int)(d1 * (1<<24))); - - sf1= one; - for(i= 0; i<10; i++){ - sf1= av_div_sf(one, av_normalize_sf(av_add_sf(one, sf1))); - } - printf("test1 sf =%d\n", av_sf2int(sf1, 24)); - - - for(i= 0; i<100; i++){ - START_TIMER - d1= i; - d2= i/100.0; - for(j= 0; j<1000; j++){ - d1= (d1+1)*d2; - } - STOP_TIMER("float add mul") - } - printf("test2 double=%d\n", (int)(d1 * (1<<24))); - - for(i= 0; i<100; i++){ - START_TIMER - sf1= av_int2sf(i, 0); - sf2= av_div_sf(av_int2sf(i, 2), av_int2sf(200, 3)); - for(j= 0; j<1000; j++){ - sf1= av_mul_sf(av_add_sf(sf1, one),sf2); - } - STOP_TIMER("softfloat add mul") - } - printf("test2 sf =%d (%d %d)\n", av_sf2int(sf1, 24), sf1.exp, sf1.mant); - return 0; -} diff --git a/libavutil/softfloat.h b/libavutil/softfloat.h deleted file mode 100644 index 3078bd7496..0000000000 --- a/libavutil/softfloat.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2006 Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_SOFTFLOAT_H -#define AVUTIL_SOFTFLOAT_H - -#include -#include "common.h" - -#define MIN_EXP -126 -#define MAX_EXP 126 -#define ONE_BITS 29 - -typedef struct SoftFloat{ - int32_t exp; - int32_t mant; -}SoftFloat; - -static av_const SoftFloat av_normalize_sf(SoftFloat a){ - if(a.mant){ -#if 1 - while((a.mant + 0x20000000U)<0x40000000U){ - a.mant += a.mant; - a.exp -= 1; - } -#else - int s=ONE_BITS + 1 - av_log2(a.mant ^ (a.mant<<1)); - a.exp -= s; - a.mant <<= s; -#endif - if(a.exp < MIN_EXP){ - a.exp = MIN_EXP; - a.mant= 0; - } - }else{ - a.exp= MIN_EXP; - } - return a; -} - -static inline av_const SoftFloat av_normalize1_sf(SoftFloat a){ -#if 1 - if(a.mant + 0x40000000 < 0){ - a.exp++; - a.mant>>=1; - } - return a; -#elif 1 - int t= a.mant + 0x40000000 < 0; - return (SoftFloat){a.exp+t, a.mant>>t}; -#else - int t= (a.mant + 0x40000000U)>>31; - return (SoftFloat){a.exp+t, a.mant>>t}; -#endif -} - -/** - * @return Will not be more denormalized than a+b. So if either input is - * normalized, then the output will not be worse then the other input. - * If both are normalized, then the output will be normalized. - */ -static inline av_const SoftFloat av_mul_sf(SoftFloat a, SoftFloat b){ - a.exp += b.exp; - a.mant = (a.mant * (int64_t)b.mant) >> ONE_BITS; - return av_normalize1_sf(a); -} - -/** - * b has to be normalized and not zero. - * @return Will not be more denormalized than a. 
- */ -static av_const SoftFloat av_div_sf(SoftFloat a, SoftFloat b){ - a.exp -= b.exp+1; - a.mant = ((int64_t)a.mant<<(ONE_BITS+1)) / b.mant; - return av_normalize1_sf(a); -} - -static inline av_const int av_cmp_sf(SoftFloat a, SoftFloat b){ - int t= a.exp - b.exp; - if(t<0) return (a.mant >> (-t)) - b.mant ; - else return a.mant - (b.mant >> t); -} - -static inline av_const SoftFloat av_add_sf(SoftFloat a, SoftFloat b){ - int t= a.exp - b.exp; - if(t<0) return av_normalize1_sf((SoftFloat){b.exp, b.mant + (a.mant >> (-t))}); - else return av_normalize1_sf((SoftFloat){a.exp, a.mant + (b.mant >> t )}); -} - -static inline av_const SoftFloat av_sub_sf(SoftFloat a, SoftFloat b){ - return av_add_sf(a, (SoftFloat){b.exp, -b.mant}); -} - -//FIXME sqrt, log, exp, pow, sin, cos - -static inline av_const SoftFloat av_int2sf(int v, int frac_bits){ - return av_normalize_sf((SoftFloat){ONE_BITS-frac_bits, v}); -} - -/** - * Rounding is to -inf. - */ -static inline av_const int av_sf2int(SoftFloat v, int frac_bits){ - v.exp += frac_bits - ONE_BITS; - if(v.exp >= 0) return v.mant << v.exp ; - else return v.mant >>(-v.exp); -} - -#endif /* AVUTIL_SOFTFLOAT_H */ From b437f5b055497fc81b8061de593f3ef27657c1e3 Mon Sep 17 00:00:00 2001 From: Stefano Sabatini Date: Mon, 9 May 2011 21:59:20 +0200 Subject: [PATCH 23/32] tiff: add support for inverted FillOrder for uncompressed data Fix decoding of file b.tif, trac issue #168. 
Signed-off-by: Diego Biurrun --- libavcodec/tiff.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c index 3cc3a42500..1ec78a7125 100644 --- a/libavcodec/tiff.c +++ b/libavcodec/tiff.c @@ -170,7 +170,13 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uin } switch(s->compr){ case TIFF_RAW: - memcpy(dst, src, width); + if (!s->fill_order) { + memcpy(dst, src, width); + } else { + int i; + for (i = 0; i < width; i++) + dst[i] = av_reverse[src[i]]; + } src += width; break; case TIFF_PACKBITS: From 5c511ad4ce20aff96ec587de1a8be6f28aed4544 Mon Sep 17 00:00:00 2001 From: Baptiste Coudurier Date: Wed, 11 May 2011 10:00:50 +0200 Subject: [PATCH 24/32] swscale: extend YUV422p support to 10bits depth Signed-off-by: Michael Niedermayer --- libavcodec/utils.c | 2 ++ libavutil/pixdesc.c | 23 +++++++++++++++++++++++ libavutil/pixfmt.h | 3 +++ 3 files changed, 28 insertions(+) diff --git a/libavcodec/utils.c b/libavcodec/utils.c index 9a19cde9cc..9e879940a9 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -140,6 +140,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV420P10LE: case PIX_FMT_YUV420P10BE: + case PIX_FMT_YUV422P10LE: + case PIX_FMT_YUV422P10BE: w_align= 16; //FIXME check for non mpeg style codecs and use less alignment h_align= 16; if(s->codec_id == CODEC_ID_MPEG2VIDEO || s->codec_id == CODEC_ID_MJPEG || s->codec_id == CODEC_ID_AMV || s->codec_id == CODEC_ID_THP || s->codec_id == CODEC_ID_H264) diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c index a291141436..bff45e522a 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c @@ -809,6 +809,29 @@ const AVPixFmtDescriptor av_pix_fmt_descriptors[PIX_FMT_NB] = { }, .flags = PIX_FMT_BE, }, + [PIX_FMT_YUV422P10LE] = { + .name = "yuv422p10le", + .nb_components= 3, + .log2_chroma_w= 1, + .log2_chroma_h= 0, + .comp = { + {0,1,1,0,9}, /* Y 
*/ + {1,1,1,0,9}, /* U */ + {2,1,1,0,9}, /* V */ + }, + }, + [PIX_FMT_YUV422P10BE] = { + .name = "yuv422p10be", + .nb_components= 3, + .log2_chroma_w= 1, + .log2_chroma_h= 0, + .comp = { + {0,1,1,0,9}, /* Y */ + {1,1,1,0,9}, /* U */ + {2,1,1,0,9}, /* V */ + }, + .flags = PIX_FMT_BE, + }, [PIX_FMT_YUV422P16LE] = { .name = "yuv422p16le", .nb_components= 3, diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h index fafbf9be1a..533eb9f706 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h @@ -139,6 +139,8 @@ enum PixelFormat { PIX_FMT_YUV420P9LE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian PIX_FMT_YUV420P10BE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian PIX_FMT_YUV420P10LE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian + PIX_FMT_YUV422P10BE,///< planar YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian + PIX_FMT_YUV422P10LE,///< planar YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; @@ -165,6 +167,7 @@ enum PixelFormat { #define PIX_FMT_YUV420P9 PIX_FMT_NE(YUV420P9BE , YUV420P9LE) #define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE) +#define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE) #define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE) #define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE) #define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE) From 083e715f339e4546e9be8a6e265360be87cca517 Mon Sep 17 00:00:00 2001 From: Reinhard Tartler Date: Wed, 11 May 2011 13:51:11 +0200 Subject: [PATCH 25/32] aac: workaround for compilation on cygwin On cygwin, math.h needs to be included before float.h because of a bug in the system headers. Including libavutil/libm.h first works around this issue.
Longer discussion of the topic: http://thread.gmane.org/gmane.comp.video.ffmpeg.devel/128582 --- libavcodec/aaccoder.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c index 15fe430732..83d3734089 100644 --- a/libavcodec/aaccoder.c +++ b/libavcodec/aaccoder.c @@ -30,13 +30,14 @@ * add sane pulse detection ***********************************/ +#include "libavutil/libm.h" // brought forward to work around cygwin header breakage + #include #include "avcodec.h" #include "put_bits.h" #include "aac.h" #include "aacenc.h" #include "aactab.h" -#include "libavutil/libm.h" /** bits needed to code codebook run value for long windows */ static const uint8_t run_value_bits_long[64] = { From c9e81d0783345a7b3b573a03ba5fa79ea90cd8c1 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 11 May 2011 08:45:17 +0200 Subject: [PATCH 26/32] lavc: deprecate named constants for deprecated antialias_algo. --- libavcodec/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/options.c b/libavcodec/options.c index b22e53db9d..9c714fb73e 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -305,11 +305,11 @@ static const AVOption options[]={ {"error", NULL, OFFSET(error_rate), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|E}, #if FF_API_ANTIALIAS_ALGO {"antialias", "MP3 antialias algorithm", OFFSET(antialias_algo), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|D, "aa"}, -#endif {"auto", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_AUTO }, INT_MIN, INT_MAX, V|D, "aa"}, {"fastint", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FASTINT }, INT_MIN, INT_MAX, V|D, "aa"}, {"int", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_INT }, INT_MIN, INT_MAX, V|D, "aa"}, {"float", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FLOAT }, INT_MIN, INT_MAX, V|D, "aa"}, +#endif {"qns", "quantizer noise shaping", OFFSET(quantizer_noise_shaping), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, 
V|E}, {"threads", NULL, OFFSET(thread_count), FF_OPT_TYPE_INT, {.dbl = 1 }, INT_MIN, INT_MAX, V|E|D}, {"me_threshold", "motion estimaton threshold", OFFSET(me_threshold), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX}, From 880fa2183051dfe6a7e44879361f674a906f5aa7 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 11 May 2011 13:45:12 +0200 Subject: [PATCH 27/32] flacenc: use proper initializers for AVOption default values. default_val was recently changed from double to a union; the current code wasn't updated for that. --- libavcodec/flacenc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c index 7685ff6ea0..8624a6d987 100644 --- a/libavcodec/flacenc.c +++ b/libavcodec/flacenc.c @@ -1330,22 +1330,22 @@ static av_cold int flac_encode_close(AVCodecContext *avctx) #define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM static const AVOption options[] = { -{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, 15, 0, MAX_LPC_PRECISION, FLAGS }, -{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" }, -{ "none", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_NONE, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "fixed", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_FIXED, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_LEVINSON, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_CHOLESKY, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, FLAGS }, -{ "min_partition_order", NULL, offsetof(FlacEncodeContext,
options.min_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, -{ "max_partition_order", NULL, offsetof(FlacEncodeContext, options.max_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, -{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, -1, -1, ORDER_METHOD_LOG, FLAGS, "predm" }, -{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_EST, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "2level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_2LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "4level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_4LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "8level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_8LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "search", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_SEARCH, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "log", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_LOG, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = 15 }, 0, MAX_LPC_PRECISION, FLAGS }, +{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, {.dbl = FF_LPC_TYPE_DEFAULT }, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" }, +{ "none", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_NONE }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "fixed", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_FIXED }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, 
INT_MAX, FLAGS }, +{ "min_partition_order", NULL, offsetof(FlacEncodeContext, options.min_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "max_partition_order", NULL, offsetof(FlacEncodeContext, options.max_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, ORDER_METHOD_LOG, FLAGS, "predm" }, +{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_EST }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "2level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_2LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "4level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_4LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "8level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_8LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "search", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_SEARCH }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "log", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_LOG }, INT_MIN, INT_MAX, FLAGS, "predm" }, { NULL }, }; From 36dc49b713f81708e5637c18748c6f666f732caa Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 11 May 2011 13:56:42 +0200 Subject: [PATCH 28/32] doc/APIchanges: fill in missing hashes and dates. --- doc/APIchanges | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/APIchanges b/doc/APIchanges index 0995b192a4..ee96ddf258 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -13,21 +13,21 @@ libavutil: 2011-04-18 API changes, most recent first: -2011-05-10 - xxxxxxx - lavc 53.3.0 - avcodec.h +2011-05-10 - 188dea1 - lavc 53.3.0 - avcodec.h Deprecate AVLPCType and the following fields in AVCodecContext: lpc_coeff_precision, prediction_order_method, min_partition_order, max_partition_order, lpc_type, lpc_passes. 
Corresponding FLAC encoder options should be used instead. -2011-04-XX - bebe72f - lavu 51.1.0 - avutil.h +2011-04-26 - bebe72f - lavu 51.1.0 - avutil.h Add AVPictureType enum and av_get_picture_type_char(), deprecate FF_*_TYPE defines and av_get_pict_type_char() defined in libavcodec/avcodec.h. -2011-04-xx - 10d3940 - lavfi 2.3.0 - avfilter.h +2011-04-26 - 10d3940 - lavfi 2.3.0 - avfilter.h Add pict_type and key_frame fields to AVFilterBufferRefVideo. -2011-04-xx - 7a11c82 - lavfi 2.2.0 - vsrc_buffer +2011-04-26 - 7a11c82 - lavfi 2.2.0 - vsrc_buffer Add sample_aspect_ratio fields to vsrc_buffer arguments 2011-04-21 - 94f7451 - lavc 53.1.0 - avcodec.h From d2bf42895ac30d228491a8a95a5908351dc32783 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 11 May 2011 08:30:02 -0400 Subject: [PATCH 29/32] h264pred: fix one more aliasing violation. Signed-off-by: Anton Khirnov --- libavcodec/h264pred_template.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c index c600133346..1c1fe0bc31 100644 --- a/libavcodec/h264pred_template.c +++ b/libavcodec/h264pred_template.c @@ -695,10 +695,12 @@ static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_t { pixel *src = (pixel*)_src; int stride = _stride/sizeof(pixel); + pixel4 a; PREDICT_8x8_LOAD_LEFT; -#define ROW(y) ((pixel4*)(src+y*stride))[0] =\ - ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y) +#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \ + AV_WN4PA(src+y*stride, a); \ + AV_WN4PA(src+y*stride+4, a); ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); #undef ROW } From b44c8ad280c221691560ae9625421416e20c483f Mon Sep 17 00:00:00 2001 From: Anatoly Nenashev Date: Sun, 27 Mar 2011 21:41:48 +0200 Subject: [PATCH 30/32] Fix crash of interlaced MPEG2 decoding Problem description, preliminary review discussion at http://thread.gmane.org/gmane.comp.video.ffmpeg.devel/127731 --- libavcodec/mpegvideo.c | 9 
+++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index f4de8dded4..2c0525e2ad 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -998,8 +998,13 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx) s->pict_type, s->dropable);*/ if(s->codec_id != CODEC_ID_H264){ - if((s->last_picture_ptr==NULL || s->last_picture_ptr->data[0]==NULL) && s->pict_type!=AV_PICTURE_TYPE_I){ - av_log(avctx, AV_LOG_ERROR, "warning: first frame is no keyframe\n"); + if((s->last_picture_ptr==NULL || s->last_picture_ptr->data[0]==NULL) && + (s->pict_type!=AV_PICTURE_TYPE_I || s->picture_structure != PICT_FRAME)){ + if (s->pict_type != AV_PICTURE_TYPE_I) + av_log(avctx, AV_LOG_ERROR, "warning: first frame is no keyframe\n"); + else if (s->picture_structure != PICT_FRAME) + av_log(avctx, AV_LOG_INFO, "allocate dummy last picture for field based first keyframe\n"); + /* Allocate a dummy frame */ i= ff_find_unused_picture(s, 0); s->last_picture_ptr= &s->picture[i]; From 9aa91043f30cee1419555c0e299c94e655b0930a Mon Sep 17 00:00:00 2001 From: Baptiste Coudurier Date: Tue, 12 Apr 2011 15:29:09 -0700 Subject: [PATCH 31/32] Port SMPTE S302M audio decoder from FFmbc 0.3. 
--- Changelog | 1 + doc/general.texi | 1 + libavcodec/Makefile | 1 + libavcodec/allcodecs.c | 1 + libavcodec/avcodec.h | 1 + libavcodec/s302m.c | 179 +++++++++++++++++++++++++++++++++++++++++ libavformat/mpegts.c | 1 + 7 files changed, 185 insertions(+) create mode 100644 libavcodec/s302m.c diff --git a/Changelog b/Changelog index b5ceea9444..3d3fe6eb57 100644 --- a/Changelog +++ b/Changelog @@ -7,6 +7,7 @@ version : - Lots of deprecated API cruft removed - fft and imdct optimizations for AVX (Sandy Bridge) processors - DPX image encoder +- SMPTE 302M AES3 audio decoder version 0.7_beta1: diff --git a/doc/general.texi b/doc/general.texi index 598b9bc873..53482273f9 100644 --- a/doc/general.texi +++ b/doc/general.texi @@ -673,6 +673,7 @@ following image formats are supported: @item Sierra VMD audio @tab @tab X @tab Used in Sierra VMD files. @item Smacker audio @tab @tab X +@item SMPTE 302M AES3 audio @tab @tab X @item Speex @tab @tab E @tab supported through external library libspeex @item True Audio (TTA) @tab @tab X diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 996b9b4d81..9040b32f57 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -324,6 +324,7 @@ OBJS-$(CONFIG_RV30_DECODER) += rv30.o rv34.o rv30dsp.o \ mpegvideo.o error_resilience.o OBJS-$(CONFIG_RV40_DECODER) += rv40.o rv34.o rv40dsp.o \ mpegvideo.o error_resilience.o +OBJS-$(CONFIG_S302M_DECODER) += s302m.o OBJS-$(CONFIG_SGI_DECODER) += sgidec.o OBJS-$(CONFIG_SGI_ENCODER) += sgienc.o rle.o OBJS-$(CONFIG_SHORTEN_DECODER) += shorten.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 694674ce7b..3466ad94fd 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -177,6 +177,7 @@ void avcodec_register_all(void) REGISTER_ENCDEC (RV20, rv20); REGISTER_DECODER (RV30, rv30); REGISTER_DECODER (RV40, rv40); + REGISTER_DECODER (S302M, s302m); REGISTER_ENCDEC (SGI, sgi); REGISTER_DECODER (SMACKER, smacker); REGISTER_DECODER (SMC, smc); diff --git
a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 876ba8c21b..2eb218ba4f 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -232,6 +232,7 @@ enum CodecID { CODEC_ID_PCM_F64LE, CODEC_ID_PCM_BLURAY, CODEC_ID_PCM_LXF, + CODEC_ID_S302M, /* various ADPCM codecs */ CODEC_ID_ADPCM_IMA_QT= 0x11000,
diff --git a/libavcodec/s302m.c b/libavcodec/s302m.c
new file mode 100644
index 0000000000..fb1fd867d0
--- /dev/null
+++ b/libavcodec/s302m.c
@@ -0,0 +1,179 @@
+/*
+ * SMPTE 302M decoder
+ * Copyright (c) 2008 Laurent Aimar
+ * Copyright (c) 2009 Baptiste Coudurier
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+
+#define AES3_HEADER_LEN 4
+
+/**
+ * Parse the AES3 header that starts every packet and configure the output.
+ *
+ * @param avctx    codec context; bits_per_coded_sample, sample_fmt, channels,
+ *                 sample_rate and bit_rate are overwritten on success
+ * @param buf      start of the packet, i.e. of the AES3 header
+ * @param buf_size size of the whole packet in bytes, header included
+ * @return payload size announced by the header, or AVERROR_INVALIDDATA when
+ *         the packet is too short, the announced size does not match
+ *         buf_size, the sample size field is out of range, or the packet is
+ *         too small to hold one sample for every channel
+ */
+static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
+                                    int buf_size)
+{
+    uint32_t h;
+    int frame_size, channels, bits, sample_bits, samples;
+
+    if (buf_size <= AES3_HEADER_LEN) {
+        av_log(avctx, AV_LOG_ERROR, "frame is too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*
+     * AES3 header :
+     * size:            16
+     * number channels   2
+     * channel_id        8
+     * bits per samples  2
+     * alignments        4
+     */
+
+    h = AV_RB32(buf);
+    frame_size = (h >> 16) & 0xffff;
+    channels   = ((h >> 14) & 0x0003) * 2 + 2;
+    bits       = ((h >> 4) & 0x0003) * 4 + 16; /* channel_id is not used */
+
+    if (AES3_HEADER_LEN + frame_size != buf_size || bits > 24) {
+        av_log(avctx, AV_LOG_ERROR, "frame has invalid header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*
+     * Every sample travels with 4 extra bits. A packet smaller than one
+     * sample per channel would make the bit rate term below divide by zero.
+     */
+    sample_bits = channels * (bits + 4);
+    samples     = buf_size * 8 / sample_bits;
+    if (!samples) {
+        av_log(avctx, AV_LOG_ERROR, "frame is too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Set output properties */
+    avctx->bits_per_coded_sample = bits;
+    if (bits > 16)
+        avctx->sample_fmt = SAMPLE_FMT_S32;
+    else
+        avctx->sample_fmt = SAMPLE_FMT_S16;
+
+    avctx->channels    = channels;
+    avctx->sample_rate = 48000;
+    /* audio payload rate plus 32 header bits for every packet */
+    avctx->bit_rate    = 48000 * sample_bits + 32 * (48000 / samples);
+
+    return frame_size;
+}
+
+/**
+ * Decode one packet into PCM samples.
+ *
+ * Every payload byte is passed through av_reverse[] (the libavutil bit
+ * reversal table) before being assembled. 16 bit samples are written as 16
+ * bit words; 20 and 24 bit samples are written left aligned in 32 bit words.
+ *
+ * @param avctx     codec context, updated from the packet header
+ * @param data      output buffer
+ * @param data_size in: size of data in bytes, out: number of bytes written
+ * @param avpkt     input packet
+ * @return number of bytes consumed, or a negative value on error
+ */
+static int s302m_decode_frame(AVCodecContext *avctx, void *data,
+                              int *data_size, AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+
+    int frame_size = s302m_parse_frame_header(avctx, buf, buf_size);
+    if (frame_size < 0)
+        return frame_size;
+
+    buf_size -= AES3_HEADER_LEN;
+    buf      += AES3_HEADER_LEN;
+
+    if (*data_size < 4 * buf_size * 8 / (avctx->bits_per_coded_sample + 4))
+        return -1;
+
+    if (avctx->bits_per_coded_sample == 24) {
+        uint32_t *o = data;
+        /* 7 bytes hold two samples of 24 + 4 bits; cast before shifting into bit 31 */
+        for (; buf_size > 6; buf_size -= 7) {
+            *o++ = ((uint32_t)av_reverse[buf[2]] << 24) |
+                   (av_reverse[buf[1]]           << 16) |
+                   (av_reverse[buf[0]]           <<  8);
+            /* the reversed low nibble of buf[3] sits in bits 4-7, so << 4 puts it in bits 8-11 */
+            *o++ = ((uint32_t)av_reverse[buf[6] & 0xf0] << 28) |
+                   (av_reverse[buf[5]]                  << 20) |
+                   (av_reverse[buf[4]]                  << 12) |
+                   (av_reverse[buf[3] & 0x0f]           <<  4);
+            buf += 7;
+        }
+        *data_size = (uint8_t*) o - (uint8_t*) data;
+    } else if (avctx->bits_per_coded_sample == 20) {
+        uint32_t *o = data;
+        /* 6 bytes hold two samples of 20 + 4 bits */
+        for (; buf_size > 5; buf_size -= 6) {
+            *o++ = ((uint32_t)av_reverse[buf[2] & 0xf0] << 28) |
+                   (av_reverse[buf[1]]                  << 20) |
+                   (av_reverse[buf[0]]                  << 12);
+            *o++ = ((uint32_t)av_reverse[buf[5] & 0xf0] << 28) |
+                   (av_reverse[buf[4]]                  << 20) |
+                   (av_reverse[buf[3]]                  << 12);
+            buf += 6;
+        }
+        *data_size = (uint8_t*) o - (uint8_t*) data;
+    } else {
+        uint16_t *o = data;
+        /* 5 bytes hold two samples of 16 + 4 bits */
+        for (; buf_size > 4; buf_size -= 5) {
+            *o++ = (av_reverse[buf[1]] << 8) |
+                    av_reverse[buf[0]];
+            /* move the reversed low nibble of buf[2] from bits 4-7 down to bits 0-3 */
+            *o++ = (av_reverse[buf[4] & 0xf0] << 12) |
+                   (av_reverse[buf[3]]        <<  4) |
+                   (av_reverse[buf[2] & 0x0f] >>  4);
+            buf += 5;
+        }
+        *data_size = (uint8_t*) o - (uint8_t*) data;
+    }
+
+    return buf - avpkt->data;
+}
+
+
+AVCodec ff_s302m_decoder = {
+    .name           = "s302m",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = CODEC_ID_S302M,
+    .priv_data_size = 0,
+    .decode         = s302m_decode_frame,
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
+};
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c index 3130eb9ff5..cdefb84864 100644 --- a/libavformat/mpegts.c +++ b/libavformat/mpegts.c @@ -524,6 +524,7 @@ static const StreamType MISC_types[] = { static const StreamType REGD_types[] = { { MKTAG('d','r','a','c'), AVMEDIA_TYPE_VIDEO, CODEC_ID_DIRAC }, { MKTAG('A','C','-','3'), AVMEDIA_TYPE_AUDIO, CODEC_ID_AC3 }, + { MKTAG('B','S','S','D'), AVMEDIA_TYPE_AUDIO, CODEC_ID_S302M }, { 0 }, }; From 5705b02079449c685a3dd337fcc3a8b440dca4a0 Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Wed, 11 May 2011 10:11:55 -0700 Subject: [PATCH 32/32] 10-bit H.264 x86 chroma v loopfilter asm Also delete some unused deblock asm macros.
--- libavcodec/x86/h264_deblock.asm | 41 ---------- libavcodec/x86/h264_deblock_10bit.asm | 106 ++++++++++++++++++++++++++ libavcodec/x86/h264dsp_mmx.c | 16 +++- 3 files changed, 121 insertions(+), 42 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 37866812e7..0cf013f58f 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -106,47 +106,6 @@ cextern pb_A1 TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 %endmacro -%macro TRANSPOSE4x8W_LOAD 8 -%if mmsize==16 - TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 -%else - SWAP 1, 4, 2, 3 - mova m0, [t5] - mova m1, [t5+r1] - mova m2, [t5+r1*2] - mova m3, [t5+t6] - TRANSPOSE4x4W 0, 1, 2, 3, 4 -%endif -%endmacro - -%macro TRANSPOSE8x2W_STORE 8 - punpckhwd m0, m1, m2 - punpcklwd m1, m2 -%if mmsize==8 - movd %3, m0 - movd %1, m1 - psrlq m1, 32 - psrlq m0, 32 - movd %2, m1 - movd %4, m0 -%else - movd %5, m0 - movd %1, m1 - psrldq m1, 4 - psrldq m0, 4 - movd %2, m1 - movd %6, m0 - psrldq m1, 4 - psrldq m0, 4 - movd %3, m1 - movd %7, m0 - psrldq m1, 4 - psrldq m0, 4 - movd %4, m1 - movd %8, m0 -%endif -%endmacro - %macro SBUTTERFLY3 4 punpckh%1 %4, %2, %3 punpckl%1 %2, %3 diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index 402ed9bfac..c253d02954 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -34,6 +34,7 @@ pw_pixel_max: times 8 dw ((1 << 10)-1) SECTION .text cextern pw_2 +cextern pw_3 cextern pw_4 ; out: %4 = |%1-%2|-%3 @@ -802,3 +803,108 @@ INIT_AVX DEBLOCK_LUMA avx DEBLOCK_LUMA_INTRA avx %endif
+
+; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
+; out: %1=p0', %2=q0' (where the mask is set: p0' = (2*p1 + p0 + q1 + [pw_2]) >> 2, q0' = (2*q1 + q0 + p1 + [pw_2]) >> 2; unchanged elsewhere)
+%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
+    mova    %6, [pw_2]      ; start from the pw_2 constant (presumably packed words of 2, the rounding term)
+    paddw   %6, %3          ; %6 = pw_2 + p1
+    paddw   %6, %4          ; %6 = pw_2 + p1 + q1
+    paddw   %7, %6, %2      ; %7 = pw_2 + p1 + q1 + q0
+    paddw   %6, %1          ; %6 = pw_2 + p1 + q1 + p0
+    paddw   %6, %3          ; %6 = pw_2 + 2*p1 + p0 + q1
+    paddw   %7, %4          ; %7 = pw_2 + 2*q1 + q0 + p1
+    psraw   %6, 2           ; %6 = filtered p0
+    psraw   %7, 2           ; %7 = filtered q0
+    psubw   %6, %1          ; %6 = filtered p0 - p0
+    psubw   %7, %2          ; %7 = filtered q0 - q0
+    pand    %6, %5          ; keep the difference only where the mask is set
+    pand    %7, %5
+    paddw   %1, %6          ; p0' = p0 + masked difference
+    paddw   %2, %7          ; q0' = q0 + masked difference
+%endmacro
+
+%macro CHROMA_V_LOAD 1      ; %1 = address of the q0 row; r0 = address of the p1 row, r1 = row offset
+    mova        m0, [r0]    ; p1
+    mova        m1, [r0+r1] ; p0
+    mova        m2, [%1]    ; q0
+    mova        m3, [%1+r1] ; q1
+%endmacro
+
+%macro CHROMA_V_STORE 0     ; writes back only the two rows next to the edge
+    mova [r0+1*r1], m1      ; same address CHROMA_V_LOAD reads p0 from
+    mova [r0+2*r1], m2      ; equals the q0 address because both callers set r0 = pix - 2*r1
+%endmacro
+
+%macro DEBLOCK_CHROMA 1     ; %1 = suffix appended to the two function names below
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16) ; r6 is only requested when mmsize < 16, where it is the loop counter
+    mov         r5, r0      ; r5 = pix, passed to CHROMA_V_LOAD as the q0 row
+    sub         r0, r1
+    sub         r0, r1      ; r0 = pix - 2*stride, the p1 row
+    shl        r2d, 2       ; alpha *= 4 (presumably rescales 8-bit thresholds to 10-bit samples - TODO confirm)
+    shl        r3d, 2       ; beta *= 4
+%if mmsize < 16
+    mov         r6, 16/mmsize ; number of register-wide slices handled by the loop
+.loop:
+%endif
+    CHROMA_V_LOAD r5
+    LOAD_AB     m4, m5, r2, r3 ; defined outside this hunk; takes alpha (r2) and beta (r3)
+    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4 ; defined outside this hunk; m7 is used as the filter mask below
+    pxor        m4, m4      ; m4 = 0
+    LOAD_TC     m6, r4      ; defined outside this hunk; reads tc0 through r4 into m6
+    psubw       m6, [pw_3]
+    pmaxsw      m6, m4      ; clamp m6 to be non-negative
+    pand        m7, m6      ; combine the mask with the clamped tc value
+    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 ; defined outside this hunk; updates m1 (p0) and m2 (q0)
+    CHROMA_V_STORE
+%if mmsize < 16
+    add         r0, mmsize  ; advance both row pointers by one register width
+    add         r5, mmsize
+    add         r4, mmsize/8 ; advance the tc0 pointer
+    dec         r6
+    jg .loop
+    REP_RET
+%else
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16) ; r5 is only requested when mmsize < 16, where it is the loop counter
+    mov         r4, r0      ; r4 = pix, passed to CHROMA_V_LOAD as the q0 row
+    sub         r0, r1
+    sub         r0, r1      ; r0 = pix - 2*stride, the p1 row
+    shl        r2d, 2       ; alpha *= 4, same scaling as in deblock_v_chroma above
+    shl        r3d, 2       ; beta *= 4
+%if mmsize < 16
+    mov         r5, 16/mmsize ; number of register-wide slices handled by the loop
+.loop:
+%endif
+    CHROMA_V_LOAD r4
+    LOAD_AB     m4, m5, r2, r3 ; defined outside this hunk
+    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4 ; defined outside this hunk; m7 is used as the filter mask below
+    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 ; p0=m1, q0=m2, p1=m0, q1=m3, mask=m7
+    CHROMA_V_STORE
+%if mmsize < 16
+    add         r0, mmsize  ; advance both row pointers by one register width
+    add         r4, mmsize
+    dec         r5
+    jg .loop
+    REP_RET
+%else
+    RET
+%endif
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX
+DEBLOCK_CHROMA mmxext
+%endif
+INIT_XMM
+DEBLOCK_CHROMA sse2
+INIT_AVX
+DEBLOCK_CHROMA avx
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 42dae93f2d..01b11163c8 100644 ---
a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -236,10 +236,18 @@ LF_FUNC (h, luma, depth, sse2)\ LF_IFUNC(h, luma_intra, depth, sse2)\ LF_FUNC (v, luma, depth, sse2)\ LF_IFUNC(v, luma_intra, depth, sse2)\ +LF_FUNC (h, chroma, depth, sse2)\ +LF_IFUNC(h, chroma_intra, depth, sse2)\ +LF_FUNC (v, chroma, depth, sse2)\ +LF_IFUNC(v, chroma_intra, depth, sse2)\ LF_FUNC (h, luma, depth, avx)\ LF_IFUNC(h, luma_intra, depth, avx)\ LF_FUNC (v, luma, depth, avx)\ -LF_IFUNC(v, luma_intra, depth, avx) +LF_IFUNC(v, luma_intra, depth, avx)\ +LF_FUNC (h, chroma, depth, avx)\ +LF_IFUNC(h, chroma_intra, depth, avx)\ +LF_FUNC (v, chroma, depth, avx)\ +LF_IFUNC(v, chroma_intra, depth, avx) LF_FUNCS( uint8_t, 8) LF_FUNCS(uint16_t, 10) @@ -401,12 +409,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) if (mm_flags & AV_CPU_FLAG_MMX) { if (mm_flags & AV_CPU_FLAG_MMX2) { #if ARCH_X86_32 + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext; c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext; c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; #endif if (mm_flags&AV_CPU_FLAG_SSE2) { + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; #if HAVE_ALIGNED_STACK c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; @@ -415,6 +427,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) #endif } if (mm_flags&AV_CPU_FLAG_AVX) { + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx; #if HAVE_ALIGNED_STACK c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; c->h264_h_loop_filter_luma = 
ff_deblock_h_luma_10_avx;