From 57c89c50bd2f971833e92d7441ced7b94487988d Mon Sep 17 00:00:00 2001
From: Rong Yan
Date: Fri, 28 Nov 2014 06:49:50 +0000
Subject: [PATCH] avcodec/ppc/h264dsp: POWER LE support for
 h264_idct8_add_altivec() h264_idct_dc_add_internal()
 h264_loop_filter_luma_altivec() write16x4() VEC_1D_DCT()
 weight_h264_W_altivec() biweight_h264_W_altivec()
 VEC_LOAD_U8_ADD_S16_STORE_U8() ALTIVEC_STORE_SUM_CLIP() add macros
 GET_2PERM() dstv_load() vdst_load() dest_unligned_store()

Signed-off-by: Michael Niedermayer
---
 libavcodec/ppc/h264dsp.c | 95 +++++++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 30 deletions(-)

diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index 7fc7e0bc7f..da118a49b6 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -62,10 +62,17 @@
     b2 = vec_mergeh( a1, a3 ); \
     b3 = vec_mergel( a1, a3 )
 
+#if HAVE_BIGENDIAN
+#define vdst_load(d) \
+    vdst_orig = vec_ld(0, dst); \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);
+#else
+#define vdst_load(d) vdst = vec_vsx_ld(0, dst)
+#endif
+
 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
-    vdst_orig = vec_ld(0, dst); \
-    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
-    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \
+    vdst_load(); \
+    vdst_ss = (vec_s16) VEC_MERGEH(zero_u8v, vdst); \
     va = vec_add(va, vdst_ss); \
     va_u8 = vec_packsu(va, zero_s16v); \
     va_u32 = vec_splat((vec_u32)va_u8, 0); \
@@ -165,26 +172,43 @@ static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
     d7 = vec_sub(b0v, b7v); \
 }
 
-#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
-    /* unaligned load */ \
-    vec_u8 hv = vec_ld( 0, dest ); \
-    vec_u8 lv = vec_ld( 7, dest ); \
-    vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \
-    vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
-    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \
-    vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
-    vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
-    vec_u8 edgehv; \
-    /* unaligned store */ \
-    vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
-    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
+#if HAVE_BIGENDIAN
+#define GET_2PERM(ldv, stv, d) \
+    ldv = vec_lvsl(0, d); \
+    stv = vec_lvsr(8, d);
+#define dstv_load(d) \
+    vec_u8 hv = vec_ld( 0, d ); \
+    vec_u8 lv = vec_ld( 7, d); \
+    vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv );
+#define dest_unligned_store(d) \
+    vec_u8 edgehv; \
+    vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv ); \
+    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
     lv = vec_sel( lv, bodyv, edgelv ); \
-    vec_st( lv, 7, dest ); \
-    hv = vec_ld( 0, dest ); \
+    vec_st( lv, 7, d ); \
+    hv = vec_ld( 0, d ); \
     edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
     hv = vec_sel( hv, bodyv, edgehv ); \
-    vec_st( hv, 0, dest ); \
- }
+    vec_st( hv, 0, d );
+#else
+
+#define GET_2PERM(ldv, stv, d) {}
+#define dstv_load(d) vec_u8 dstv = vec_vsx_ld(0, d)
+#define dest_unligned_store(d)\
+    vec_u8 dst8 = vec_perm((vec_u8)idstsum8, dstv, vcprm(2,3,s2,s3));\
+    vec_vsx_st(dst8, 0, d)
+#endif /* HAVE_BIGENDIAN */
+
+#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
+    /* unaligned load */ \
+    dstv_load(dest); \
+    vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
+    vec_u16 dst16 = (vec_u16)VEC_MERGEH(zero_u8v, dstv); \
+    vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
+    vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
+    /* unaligned store */ \
+    dest_unligned_store(dest);\
+}
 
 static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
 {
@@ -192,8 +216,8 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
     vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 
-    vec_u8 perm_ldv = vec_lvsl(0, dst);
-    vec_u8 perm_stv = vec_lvsr(8, dst);
+    vec_u8 perm_ldv, perm_stv;
+    GET_2PERM(perm_ldv, perm_stv, dst);
 
     const vec_u16 onev = vec_splat_u16(1);
     const vec_u16 twov = vec_splat_u16(2);
@@ -236,20 +260,25 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl
 {
     vec_s16 dc16;
     vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    vec_s32 v_dc32;
     LOAD_ZERO;
     DECLARE_ALIGNED(16, int, dc);
     int i;
 
     dc = (block[0] + 32) >> 6;
     block[0] = 0;
-    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
+    v_dc32 = vec_lde(0, &dc);
+    dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);
 
     if (size == 4)
-        dc16 = vec_sld(dc16, zero_s16v, 8);
+        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
     dcplus = vec_packsu(dc16, zero_s16v);
     dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
 
     aligner = vec_lvsr(0, dst);
+#if !HAVE_BIGENDIAN
+    aligner = vec_perm(aligner, zero_u8v, vcswapc());
+#endif
     dcplus = vec_perm(dcplus, dcplus, aligner);
     dcminus = vec_perm(dcminus, dcminus, aligner);
 
@@ -633,6 +662,9 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     temp[2] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweight = vec_splat(vtemp, 3);
     voffset = vec_splat(vtemp, 5);
@@ -641,8 +673,8 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vblock);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vblock);
 
         if (w == 16 || aligned) {
             v0 = vec_mladd(v0, vweight, zero_s16v);
@@ -679,6 +711,9 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
     temp[3] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweights = vec_splat(vtemp, 3);
     vweightd = vec_splat(vtemp, 5);
@@ -690,10 +725,10 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
-        v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
-        v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vdst);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vdst);
+        v2 = (vec_s16)VEC_MERGEH(zero_u8v, vsrc);
+        v3 = (vec_s16)VEC_MERGEL(zero_u8v, vsrc);
 
         if (w == 8) {
             if (src_aligned)
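
The effect of the new macros on little-endian builds is easiest to see by expanding
ALTIVEC_STORE_SUM_CLIP() by hand. The sketch below is only an illustration, not code
from the patch: it substitutes the !HAVE_BIGENDIAN branches of dstv_load() and
dest_unligned_store() shown above, and it assumes the declarations already in scope
where the macro is used (sixv, plus zero_u8v/zero_s16v from LOAD_ZERO) and the
vcprm()/VEC_MERGEH() helpers from FFmpeg's PPC utility headers, so it is not
stand-alone code:

    /* Hand expansion of ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv,
     * perm_stv, sel) when HAVE_BIGENDIAN is 0. */
    {
        /* unaligned load: one VSX load replaces the two aligned vec_ld()
         * calls plus vec_perm() needed on big-endian */
        vec_u8 dstv = vec_vsx_ld(0, dest);
        vec_s16 idct_sh6 = vec_sra(idctv, sixv);
        vec_u16 dst16 = (vec_u16)VEC_MERGEH(zero_u8v, dstv);
        vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);
        vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);
        /* unaligned store: pick the eight packed result bytes out of
         * idstsum8, keep the other eight destination bytes from dstv,
         * and write them back with one VSX store */
        vec_u8 dst8 = vec_perm((vec_u8)idstsum8, dstv, vcprm(2,3,s2,s3));
        vec_vsx_st(dst8, 0, dest);
    }

Note that perm_ldv, perm_stv and sel are unused on this path, which is why
GET_2PERM() expands to an empty block when HAVE_BIGENDIAN is 0.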