diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index 168f8d8b6d..28bcf2caba 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -60,33 +60,33 @@ int mm_support(void) unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; /* list below must match enum in dsputil_ppc.h */ static unsigned char* perfname[] = { - "ff_fft_calc_altivec", - "gmc1_altivec", - "dct_unquantize_h263_altivec", - "fdct_altivec", - "idct_add_altivec", - "idct_put_altivec", - "put_pixels16_altivec", - "avg_pixels16_altivec", - "avg_pixels8_altivec", - "put_pixels8_xy2_altivec", - "put_no_rnd_pixels8_xy2_altivec", - "put_pixels16_xy2_altivec", - "put_no_rnd_pixels16_xy2_altivec", - "hadamard8_diff8x8_altivec", - "hadamard8_diff16_altivec", - "avg_pixels8_xy2_altivec", - "clear_blocks_dcbz32_ppc", - "clear_blocks_dcbz128_ppc", - "put_h264_chroma_mc8_altivec", - "avg_h264_chroma_mc8_altivec", - "put_h264_qpel16_h_lowpass_altivec", - "avg_h264_qpel16_h_lowpass_altivec", - "put_h264_qpel16_v_lowpass_altivec", - "avg_h264_qpel16_v_lowpass_altivec", - "put_h264_qpel16_hv_lowpass_altivec", - "avg_h264_qpel16_hv_lowpass_altivec", - "" + "ff_fft_calc_altivec", + "gmc1_altivec", + "dct_unquantize_h263_altivec", + "fdct_altivec", + "idct_add_altivec", + "idct_put_altivec", + "put_pixels16_altivec", + "avg_pixels16_altivec", + "avg_pixels8_altivec", + "put_pixels8_xy2_altivec", + "put_no_rnd_pixels8_xy2_altivec", + "put_pixels16_xy2_altivec", + "put_no_rnd_pixels16_xy2_altivec", + "hadamard8_diff8x8_altivec", + "hadamard8_diff16_altivec", + "avg_pixels8_xy2_altivec", + "clear_blocks_dcbz32_ppc", + "clear_blocks_dcbz128_ppc", + "put_h264_chroma_mc8_altivec", + "avg_h264_chroma_mc8_altivec", + "put_h264_qpel16_h_lowpass_altivec", + "avg_h264_qpel16_h_lowpass_altivec", + "put_h264_qpel16_v_lowpass_altivec", + "avg_h264_qpel16_v_lowpass_altivec", + "put_h264_qpel16_hv_lowpass_altivec", + "avg_h264_qpel16_hv_lowpass_altivec", + "" }; #include #endif @@ -94,51 +94,44 @@ static unsigned char* perfname[] = { #ifdef CONFIG_POWERPC_PERF void powerpc_display_perf_report(void) { - int i, j; - av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); - for(i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) - av_log(NULL, AV_LOG_INFO, - " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", - perfname[i], - j+1, - perfdata[j][i][powerpc_data_min], - perfdata[j][i][powerpc_data_max], - (double)perfdata[j][i][powerpc_data_sum] / - (double)perfdata[j][i][powerpc_data_num], - perfdata[j][i][powerpc_data_num]); - } - } + int i, j; + av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); + for(i = 0 ; i < powerpc_perf_total ; i++) { + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { + if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) + av_log(NULL, AV_LOG_INFO, + " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", + perfname[i], + j+1, + perfdata[j][i][powerpc_data_min], + perfdata[j][i][powerpc_data_max], + (double)perfdata[j][i][powerpc_data_sum] / + (double)perfdata[j][i][powerpc_data_num], + perfdata[j][i][powerpc_data_num]); + } + } } #endif /* CONFIG_POWERPC_PERF */ /* ***** WARNING ***** WARNING ***** WARNING ***** */ /* - clear_blocks_dcbz32_ppc will not work properly - on PowerPC processors with a cache line size - not equal to 32 bytes. - Fortunately all processor used by Apple up to - at least the 7450 (aka second generation G4) - use 32 bytes cache line. - This is due to the use of the 'dcbz' instruction. - It simply clear to zero a single cache line, - so you need to know the cache line size to use it ! - It's absurd, but it's fast... +clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a +cache line size not equal to 32 bytes. +Fortunately all processor used by Apple up to at least the 7450 (aka second +generation G4) use 32 bytes cache line. +This is due to the use of the 'dcbz' instruction. It simply clear to zero a +single cache line, so you need to know the cache line size to use it ! +It's absurd, but it's fast... - update 24/06/2003 : Apple released yesterday the G5, - with a PPC970. cache line size : 128 bytes. Oups. - The semantic of dcbz was changed, it always clear - 32 bytes. so the function below will work, but will - be slow. So I fixed check_dcbz_effect to use dcbzl, - which is defined to clear a cache line (as dcbz before). - So we still can distinguish, and use dcbz (32 bytes) - or dcbzl (one cache line) as required. +update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line +size: 128 bytes. Oups. +The semantic of dcbz was changed, it always clear 32 bytes. so the function +below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, +which is defined to clear a cache line (as dcbz before). So we still can +distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. - see - and +see +and */ void clear_blocks_dcbz32_ppc(DCTELEM *blocks) { @@ -148,21 +141,21 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); #if 1 if (misal) { - ((unsigned long*)blocks)[0] = 0L; - ((unsigned long*)blocks)[1] = 0L; - ((unsigned long*)blocks)[2] = 0L; - ((unsigned long*)blocks)[3] = 0L; - i += 16; + ((unsigned long*)blocks)[0] = 0L; + ((unsigned long*)blocks)[1] = 0L; + ((unsigned long*)blocks)[2] = 0L; + ((unsigned long*)blocks)[3] = 0L; + i += 16; } for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) { - asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); + asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); } if (misal) { - ((unsigned long*)blocks)[188] = 0L; - ((unsigned long*)blocks)[189] = 0L; - ((unsigned long*)blocks)[190] = 0L; - ((unsigned long*)blocks)[191] = 0L; - i += 16; + ((unsigned long*)blocks)[188] = 0L; + ((unsigned long*)blocks)[189] = 0L; + ((unsigned long*)blocks)[190] = 0L; + ((unsigned long*)blocks)[191] = 0L; + i += 16; } #else memset(blocks, 0, sizeof(DCTELEM)*6*64); @@ -180,16 +173,16 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); register int i = 0; POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); #if 1 - if (misal) { - // we could probably also optimize this case, - // but there's not much point as the machines - // aren't available yet (2003-06-26) - memset(blocks, 0, sizeof(DCTELEM)*6*64); + if (misal) { + // we could probably also optimize this case, + // but there's not much point as the machines + // aren't available yet (2003-06-26) + memset(blocks, 0, sizeof(DCTELEM)*6*64); } else - for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { - asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } + for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { + asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); + } #else memset(blocks, 0, sizeof(DCTELEM)*6*64); #endif @@ -198,7 +191,7 @@ POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); #else void clear_blocks_dcbz128_ppc(DCTELEM *blocks) { - memset(blocks, 0, sizeof(DCTELEM)*6*64); + memset(blocks, 0, sizeof(DCTELEM)*6*64); } #endif @@ -210,34 +203,32 @@ void clear_blocks_dcbz128_ppc(DCTELEM *blocks) knows about dcbzl ... */ long check_dcbzl_effect(void) { - register char *fakedata = av_malloc(1024); - register char *fakedata_middle; - register long zero = 0; - register long i = 0; - long count = 0; + register char *fakedata = av_malloc(1024); + register char *fakedata_middle; + register long zero = 0; + register long i = 0; + long count = 0; - if (!fakedata) - { - return 0L; - } + if (!fakedata) { + return 0L; + } - fakedata_middle = (fakedata + 512); + fakedata_middle = (fakedata + 512); - memset(fakedata, 0xFF, 1024); + memset(fakedata, 0xFF, 1024); - /* below the constraint "b" seems to mean "Address base register" - in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ - asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); + /* below the constraint "b" seems to mean "Address base register" + in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ + asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); - for (i = 0; i < 1024 ; i ++) - { - if (fakedata[i] == (char)0) - count++; - } + for (i = 0; i < 1024 ; i ++) { + if (fakedata[i] == (char)0) + count++; + } - av_free(fakedata); + av_free(fakedata); - return count; + return count; } #else long check_dcbzl_effect(void) @@ -286,36 +277,31 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) #ifdef CONFIG_ENCODERS if (avctx->dct_algo == FF_DCT_AUTO || - avctx->dct_algo == FF_DCT_ALTIVEC) - { + avctx->dct_algo == FF_DCT_ALTIVEC) { c->fdct = fdct_altivec; } #endif //CONFIG_ENCODERS - if (avctx->lowres==0) - { - if ((avctx->idct_algo == FF_IDCT_AUTO) || - (avctx->idct_algo == FF_IDCT_ALTIVEC)) - { - c->idct_put = idct_put_altivec; - c->idct_add = idct_add_altivec; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } + if (avctx->lowres==0) { + if ((avctx->idct_algo == FF_IDCT_AUTO) || + (avctx->idct_algo == FF_IDCT_ALTIVEC)) { + c->idct_put = idct_put_altivec; + c->idct_add = idct_add_altivec; + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + } } #ifdef CONFIG_POWERPC_PERF { - int i, j; - for (i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; - perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; + int i, j; + for (i = 0 ; i < powerpc_perf_total ; i++) { + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { + perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; + perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; + perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; + perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; + } } - } } #endif /* CONFIG_POWERPC_PERF */ } diff --git a/libavcodec/ppc/dsputil_ppc.h b/libavcodec/ppc/dsputil_ppc.h index d8f6b27f98..1276661b02 100644 --- a/libavcodec/ppc/dsputil_ppc.h +++ b/libavcodec/ppc/dsputil_ppc.h @@ -31,40 +31,40 @@ void powerpc_display_perf_report(void); /* if you add to the enum below, also add to the perfname array in dsputil_ppc.c */ enum powerpc_perf_index { - altivec_fft_num = 0, - altivec_gmc1_num, - altivec_dct_unquantize_h263_num, - altivec_fdct, - altivec_idct_add_num, - altivec_idct_put_num, - altivec_put_pixels16_num, - altivec_avg_pixels16_num, - altivec_avg_pixels8_num, - altivec_put_pixels8_xy2_num, - altivec_put_no_rnd_pixels8_xy2_num, - altivec_put_pixels16_xy2_num, - altivec_put_no_rnd_pixels16_xy2_num, - altivec_hadamard8_diff8x8_num, - altivec_hadamard8_diff16_num, - altivec_avg_pixels8_xy2_num, - powerpc_clear_blocks_dcbz32, - powerpc_clear_blocks_dcbz128, - altivec_put_h264_chroma_mc8_num, - altivec_avg_h264_chroma_mc8_num, - altivec_put_h264_qpel16_h_lowpass_num, - altivec_avg_h264_qpel16_h_lowpass_num, - altivec_put_h264_qpel16_v_lowpass_num, - altivec_avg_h264_qpel16_v_lowpass_num, - altivec_put_h264_qpel16_hv_lowpass_num, - altivec_avg_h264_qpel16_hv_lowpass_num, - powerpc_perf_total + altivec_fft_num = 0, + altivec_gmc1_num, + altivec_dct_unquantize_h263_num, + altivec_fdct, + altivec_idct_add_num, + altivec_idct_put_num, + altivec_put_pixels16_num, + altivec_avg_pixels16_num, + altivec_avg_pixels8_num, + altivec_put_pixels8_xy2_num, + altivec_put_no_rnd_pixels8_xy2_num, + altivec_put_pixels16_xy2_num, + altivec_put_no_rnd_pixels16_xy2_num, + altivec_hadamard8_diff8x8_num, + altivec_hadamard8_diff16_num, + altivec_avg_pixels8_xy2_num, + powerpc_clear_blocks_dcbz32, + powerpc_clear_blocks_dcbz128, + altivec_put_h264_chroma_mc8_num, + altivec_avg_h264_chroma_mc8_num, + altivec_put_h264_qpel16_h_lowpass_num, + altivec_avg_h264_qpel16_h_lowpass_num, + altivec_put_h264_qpel16_v_lowpass_num, + altivec_avg_h264_qpel16_v_lowpass_num, + altivec_put_h264_qpel16_hv_lowpass_num, + altivec_avg_h264_qpel16_hv_lowpass_num, + powerpc_perf_total }; enum powerpc_data_index { - powerpc_data_min = 0, - powerpc_data_max, - powerpc_data_sum, - powerpc_data_num, - powerpc_data_total + powerpc_data_min = 0, + powerpc_data_max, + powerpc_data_sum, + powerpc_data_num, + powerpc_data_total }; extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; @@ -105,45 +105,42 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ #define POWERPC_GET_PMC6(a) do {} while (0) #endif #endif /* HAVE_PPC64 */ -#define POWERPC_PERF_DECLARE(a, cond) \ - POWERP_PMC_DATATYPE \ - pmc_start[POWERPC_NUM_PMC_ENABLED], \ - pmc_stop[POWERPC_NUM_PMC_ENABLED], \ - pmc_loop_index; +#define POWERPC_PERF_DECLARE(a, cond) \ + POWERP_PMC_DATATYPE \ + pmc_start[POWERPC_NUM_PMC_ENABLED], \ + pmc_stop[POWERPC_NUM_PMC_ENABLED], \ + pmc_loop_index; #define POWERPC_PERF_START_COUNT(a, cond) do { \ - POWERPC_GET_PMC6(pmc_start[5]); \ - POWERPC_GET_PMC5(pmc_start[4]); \ - POWERPC_GET_PMC4(pmc_start[3]); \ - POWERPC_GET_PMC3(pmc_start[2]); \ - POWERPC_GET_PMC2(pmc_start[1]); \ - POWERPC_GET_PMC1(pmc_start[0]); \ - } while (0) + POWERPC_GET_PMC6(pmc_start[5]); \ + POWERPC_GET_PMC5(pmc_start[4]); \ + POWERPC_GET_PMC4(pmc_start[3]); \ + POWERPC_GET_PMC3(pmc_start[2]); \ + POWERPC_GET_PMC2(pmc_start[1]); \ + POWERPC_GET_PMC1(pmc_start[0]); \ + } while (0) #define POWERPC_PERF_STOP_COUNT(a, cond) do { \ - POWERPC_GET_PMC1(pmc_stop[0]); \ - POWERPC_GET_PMC2(pmc_stop[1]); \ - POWERPC_GET_PMC3(pmc_stop[2]); \ - POWERPC_GET_PMC4(pmc_stop[3]); \ - POWERPC_GET_PMC5(pmc_stop[4]); \ - POWERPC_GET_PMC6(pmc_stop[5]); \ - if (cond) \ - { \ - for(pmc_loop_index = 0; \ - pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ - pmc_loop_index++) \ - { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ - { \ - POWERP_PMC_DATATYPE diff = \ - pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ - if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ - perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ - if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ - perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ - perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ - perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ - } \ - } \ - } \ + POWERPC_GET_PMC1(pmc_stop[0]); \ + POWERPC_GET_PMC2(pmc_stop[1]); \ + POWERPC_GET_PMC3(pmc_stop[2]); \ + POWERPC_GET_PMC4(pmc_stop[3]); \ + POWERPC_GET_PMC5(pmc_stop[4]); \ + POWERPC_GET_PMC6(pmc_stop[5]); \ + if (cond) { \ + for(pmc_loop_index = 0; \ + pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ + pmc_loop_index++) { \ + if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \ + POWERP_PMC_DATATYPE diff = \ + pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ + if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ + perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ + if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ + perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ + perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ + perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ + } \ + } \ + } \ } while (0) #else /* CONFIG_POWERPC_PERF */ // those are needed to avoid empty statements. diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c index f5608556cb..ddf142b580 100644 --- a/libavcodec/ppc/fft_altivec.c +++ b/libavcodec/ppc/fft_altivec.c @@ -33,21 +33,21 @@ /* butter fly op */ #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ {\ - FFTSample ax, ay, bx, by;\ - bx=pre1;\ - by=pim1;\ - ax=qre1;\ - ay=qim1;\ - pre = (bx + ax);\ - pim = (by + ay);\ - qre = (bx - ax);\ - qim = (by - ay);\ + FFTSample ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ } #define MUL16(a,b) ((a) * (b)) #define CMUL(pre, pim, are, aim, bre, bim) \ {\ - pre = (MUL16(are, bre) - MUL16(aim, bim));\ - pim = (MUL16(are, bim) + MUL16(bre, aim));\ + pre = (MUL16(are, bre) - MUL16(aim, bim));\ + pim = (MUL16(are, bim) + MUL16(bre, aim));\ } @@ -85,14 +85,11 @@ POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); c1 = vcii(p,p,n,n); - if (s->inverse) - { - c2 = vcii(p,p,n,p); - } - else - { - c2 = vcii(p,p,p,n); - } + if (s->inverse) { + c2 = vcii(p,p,n,p); + } else { + c2 = vcii(p,p,p,n); + } j = (np >> 2); do { diff --git a/libavcodec/ppc/gmc_altivec.c b/libavcodec/ppc/gmc_altivec.c index 0113a9aa97..a69062d183 100644 --- a/libavcodec/ppc/gmc_altivec.c +++ b/libavcodec/ppc/gmc_altivec.c @@ -36,16 +36,16 @@ void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int str { POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) = - {rounder, rounder, rounder, rounder, - rounder, rounder, rounder, rounder}; + {rounder, rounder, rounder, rounder, + rounder, rounder, rounder, rounder}; const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) = - { - (16-x16)*(16-y16), /* A */ - ( x16)*(16-y16), /* B */ - (16-x16)*( y16), /* C */ - ( x16)*( y16), /* D */ - 0, 0, 0, 0 /* padding */ - }; + { + (16-x16)*(16-y16), /* A */ + ( x16)*(16-y16), /* B */ + (16-x16)*( y16), /* C */ + ( x16)*( y16), /* D */ + 0, 0, 0, 0 /* padding */ + }; register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; @@ -74,73 +74,67 @@ POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); src_1 = vec_ld(16, src); srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); - if (src_really_odd != 0x0000000F) - { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. - srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); - } - else - { - srcvB = src_1; + if (src_really_odd != 0x0000000F) { + // if src & 0xF == 0xF, then (src+1) is properly aligned + // on the second vector. + srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); + } else { + srcvB = src_1; } srcvA = vec_mergeh(vczero, srcvA); srcvB = vec_mergeh(vczero, srcvB); - for(i=0; i>15 on the result. Since FILTER_BITS is 8, and we have 15 bits of magnitude in @@ -86,13 +85,11 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, /* Do our altivec resampling on 16 pixels at once. */ while(dst_width>=16) { - /* - Read 16 (potentially unaligned) bytes from each of + /* Read 16 (potentially unaligned) bytes from each of 4 lines into 4 vectors, and split them into shorts. Interleave the multipy/accumulate for the resample filter with the loads to hide the 3 cycle latency - the vec_madds have. - */ + the vec_madds have. */ tv = (vector unsigned char *) &s[0 * wrap]; tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); @@ -121,10 +118,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); - /* - Pack the results into our destination vector, - and do an aligned write of that back to memory. - */ + /* Pack the results into our destination vector, + and do an aligned write of that back to memory. */ dstv = vec_packsu(sumhv, sumlv) ; vec_st(dstv, 0, (vector unsigned char *) dst); @@ -133,10 +128,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, dst_width-=16; } - /* - If there are any leftover pixels, resample them - with the slow scalar method. - */ + /* If there are any leftover pixels, resample them + with the slow scalar method. */ while(dst_width>0) { sum = s[0 * wrap] * filter[0] + s[1 * wrap] * filter[1] + diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c index 7a155a2c9c..8bd3936a84 100644 --- a/libavcodec/ppc/int_altivec.c +++ b/libavcodec/ppc/int_altivec.c @@ -38,7 +38,7 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, vector signed short vpix2, vdiff, vpix1l,vpix1h; union { vector signed int vscore; int32_t score[4]; - } u; + } u; u.vscore = vec_splat_s32(0); // //XXX lazy way, fix it later diff --git a/libavcodec/ppc/mathops.h b/libavcodec/ppc/mathops.h index 82abadcba8..2259f2af09 100644 --- a/libavcodec/ppc/mathops.h +++ b/libavcodec/ppc/mathops.h @@ -25,14 +25,14 @@ #if defined(ARCH_POWERPC_405) /* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); +#define MAC16(rt, ra, rb) \ + asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); /* signed 16x16 -> 32 multiply */ -# define MUL16(ra, rb) \ - ({ int __rt; \ - asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ - __rt; }) +#define MUL16(ra, rb) \ + ({ int __rt; \ + asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ + __rt; }) #endif #endif /* FFMPEG_PPC_MATHOPS_H */ diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c index 9832fb96a4..f2e4fae092 100644 --- a/libavcodec/ppc/mpegvideo_altivec.c +++ b/libavcodec/ppc/mpegvideo_altivec.c @@ -41,15 +41,15 @@ do { \ // transposes a matrix consisting of four vectors with four elements each #define TRANSPOSE4(a,b,c,d) \ do { \ - __typeof__(a) _trans_ach = vec_mergeh(a, c); \ - __typeof__(a) _trans_acl = vec_mergel(a, c); \ - __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ - __typeof__(a) _trans_bdl = vec_mergel(b, d); \ - \ - a = vec_mergeh(_trans_ach, _trans_bdh); \ - b = vec_mergel(_trans_ach, _trans_bdh); \ - c = vec_mergeh(_trans_acl, _trans_bdl); \ - d = vec_mergel(_trans_acl, _trans_bdl); \ + __typeof__(a) _trans_ach = vec_mergeh(a, c); \ + __typeof__(a) _trans_acl = vec_mergel(a, c); \ + __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ + __typeof__(a) _trans_bdl = vec_mergel(b, d); \ + \ + a = vec_mergeh(_trans_ach, _trans_bdh); \ + b = vec_mergel(_trans_ach, _trans_bdh); \ + c = vec_mergeh(_trans_acl, _trans_bdl); \ + d = vec_mergel(_trans_acl, _trans_bdl); \ } while (0) @@ -58,19 +58,19 @@ do { \ // target address is four-byte aligned (which should be always). #define LOAD4(vec, address) \ { \ - __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ - vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ - vec = vec_ld(0, _load_addr); \ - vec = vec_perm(vec, vec, _perm_vec); \ - vec = vec_splat(vec, 0); \ + __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ + vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ + vec = vec_ld(0, _load_addr); \ + vec = vec_perm(vec, vec, _perm_vec); \ + vec = vec_splat(vec, 0); \ } #define FOUROF(a) AVV(a,a,a,a) int dct_quantize_altivec(MpegEncContext* s, - DCTELEM* data, int n, - int qscale, int* overflow) + DCTELEM* data, int n, + int qscale, int* overflow) { int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; @@ -137,10 +137,8 @@ int dct_quantize_altivec(MpegEncContext* s, int whichPass, whichHalf; - for(whichPass = 1; whichPass<=2; whichPass++) - { - for(whichHalf = 1; whichHalf<=2; whichHalf++) - { + for(whichPass = 1; whichPass<=2; whichPass++) { + for(whichHalf = 1; whichHalf<=2; whichHalf++) { vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; vector float tmp10, tmp11, tmp12, tmp13; vector float z1, z2, z3, z4, z5; @@ -235,8 +233,7 @@ int dct_quantize_altivec(MpegEncContext* s, SWAP(row7, alt7); } - if (whichPass == 1) - { + if (whichPass == 1) { // transpose the data for the second pass // First, block transpose the upper right with lower left. @@ -261,8 +258,7 @@ int dct_quantize_altivec(MpegEncContext* s, const vector signed int* qmat; vector float bias, negBias; - if (s->mb_intra) - { + if (s->mb_intra) { vector signed int baseVector; // We must cache element 0 in the intra case @@ -272,9 +268,7 @@ int dct_quantize_altivec(MpegEncContext* s, qmat = (vector signed int*)s->q_intra_matrix[qscale]; biasAddr = &(s->intra_quant_bias); - } - else - { + } else { qmat = (vector signed int*)s->q_inter_matrix[qscale]; biasAddr = &(s->inter_quant_bias); } @@ -439,8 +433,7 @@ int dct_quantize_altivec(MpegEncContext* s, // and handle it using the vector unit if we can. This is the permute used // by the altivec idct, so it is common when using the altivec dct. - if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) - { + if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) { TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); } @@ -456,10 +449,8 @@ int dct_quantize_altivec(MpegEncContext* s, } // special handling of block[0] - if (s->mb_intra) - { - if (!s->h263_aic) - { + if (s->mb_intra) { + if (!s->h263_aic) { if (n < 4) oldBaseValue /= s->y_dc_scale; else @@ -474,8 +465,7 @@ int dct_quantize_altivec(MpegEncContext* s, // need to permute the "no" permutation case. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && - (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) - { + (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) { ff_block_permute(data, s->dsp.idct_permutation, s->intra_scantable.scantable, lastNonZero); } @@ -483,10 +473,8 @@ int dct_quantize_altivec(MpegEncContext* s, return lastNonZero; } -/* - AltiVec version of dct_unquantize_h263 - this code assumes `block' is 16 bytes-aligned -*/ +/* AltiVec version of dct_unquantize_h263 + this code assumes `block' is 16 bytes-aligned */ void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { @@ -517,82 +505,81 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); } { - register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); - DECLARE_ALIGNED_16(short, qmul8[]) = - { - qmul, qmul, qmul, qmul, - qmul, qmul, qmul, qmul - }; - DECLARE_ALIGNED_16(short, qadd8[]) = - { - qadd, qadd, qadd, qadd, - qadd, qadd, qadd, qadd - }; - DECLARE_ALIGNED_16(short, nqadd8[]) = - { - -qadd, -qadd, -qadd, -qadd, - -qadd, -qadd, -qadd, -qadd - }; - register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; - register vector bool short blockv_null, blockv_neg; - register short backup_0 = block[0]; - register int j = 0; + register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); + DECLARE_ALIGNED_16(short, qmul8[]) = + { + qmul, qmul, qmul, qmul, + qmul, qmul, qmul, qmul + }; + DECLARE_ALIGNED_16(short, qadd8[]) = + { + qadd, qadd, qadd, qadd, + qadd, qadd, qadd, qadd + }; + DECLARE_ALIGNED_16(short, nqadd8[]) = + { + -qadd, -qadd, -qadd, -qadd, + -qadd, -qadd, -qadd, -qadd + }; + register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; + register vector bool short blockv_null, blockv_neg; + register short backup_0 = block[0]; + register int j = 0; - qmulv = vec_ld(0, qmul8); - qaddv = vec_ld(0, qadd8); - nqaddv = vec_ld(0, nqadd8); + qmulv = vec_ld(0, qmul8); + qaddv = vec_ld(0, qadd8); + nqaddv = vec_ld(0, nqadd8); -#if 0 // block *is* 16 bytes-aligned, it seems. - // first make sure block[j] is 16 bytes-aligned - for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; +#if 0 // block *is* 16 bytes-aligned, it seems. + // first make sure block[j] is 16 bytes-aligned + for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { + level = block[j]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[j] = level; } - block[j] = level; } - } #endif - // vectorize all the 16 bytes-aligned blocks - // of 8 elements - for(; (j + 7) <= nCoeffs ; j+=8) - { - blockv = vec_ld(j << 1, block); - blockv_neg = vec_cmplt(blockv, vczero); - blockv_null = vec_cmpeq(blockv, vczero); - // choose between +qadd or -qadd as the third operand - temp1 = vec_sel(qaddv, nqaddv, blockv_neg); - // multiply & add (block{i,i+7} * qmul [+-] qadd) - temp1 = vec_mladd(blockv, qmulv, temp1); - // put 0 where block[{i,i+7} used to have 0 - blockv = vec_sel(temp1, blockv, blockv_null); - vec_st(blockv, j << 1, block); - } - - // if nCoeffs isn't a multiple of 8, finish the job - // using good old scalar units. - // (we could do it using a truncated vector, - // but I'm not sure it's worth the hassle) - for(; j <= nCoeffs ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[j] = level; + // vectorize all the 16 bytes-aligned blocks + // of 8 elements + for(; (j + 7) <= nCoeffs ; j+=8) { + blockv = vec_ld(j << 1, block); + blockv_neg = vec_cmplt(blockv, vczero); + blockv_null = vec_cmpeq(blockv, vczero); + // choose between +qadd or -qadd as the third operand + temp1 = vec_sel(qaddv, nqaddv, blockv_neg); + // multiply & add (block{i,i+7} * qmul [+-] qadd) + temp1 = vec_mladd(blockv, qmulv, temp1); + // put 0 where block[{i,i+7} used to have 0 + blockv = vec_sel(temp1, blockv, blockv_null); + vec_st(blockv, j << 1, block); } - } - if (i == 1) - { // cheat. this avoid special-casing the first iteration - block[0] = backup_0; - } + // if nCoeffs isn't a multiple of 8, finish the job + // using good old scalar units. + // (we could do it using a truncated vector, + // but I'm not sure it's worth the hassle) + for(; j <= nCoeffs ; j++) { + level = block[j]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[j] = level; + } + } + + if (i == 1) { + // cheat. this avoid special-casing the first iteration + block[0] = backup_0; + } } POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); } @@ -605,11 +592,9 @@ void MPV_common_init_altivec(MpegEncContext *s) { if ((mm_flags & MM_ALTIVEC) == 0) return; - if (s->avctx->lowres==0) - { + if (s->avctx->lowres==0) { if ((s->avctx->idct_algo == FF_IDCT_AUTO) || - (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) - { + (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) { s->dsp.idct_put = idct_put_altivec; s->dsp.idct_add = idct_add_altivec; s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; @@ -618,15 +603,13 @@ void MPV_common_init_altivec(MpegEncContext *s) // Test to make sure that the dct required alignments are met. if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || - (((long)(s->q_inter_matrix) & 0x0f) != 0)) - { + (((long)(s->q_inter_matrix) & 0x0f) != 0)) { av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); return; } - if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) - { + if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) { av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); return; @@ -634,8 +617,7 @@ void MPV_common_init_altivec(MpegEncContext *s) if ((s->avctx->dct_algo == FF_DCT_AUTO) || - (s->avctx->dct_algo == FF_DCT_ALTIVEC)) - { + (s->avctx->dct_algo == FF_DCT_ALTIVEC)) { #if 0 /* seems to cause trouble under some circumstances */ s->dct_quantize = dct_quantize_altivec; #endif diff --git a/libavcodec/ppc/snow_altivec.c b/libavcodec/ppc/snow_altivec.c index ea228b0daa..2ae32c79e8 100644 --- a/libavcodec/ppc/snow_altivec.c +++ b/libavcodec/ppc/snow_altivec.c @@ -379,8 +379,7 @@ void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, v4=(vector signed int *)b4; v5=(vector signed int *)b5; - for (i=0; i< w4;i++) - { + for (i=0; i< w4;i++) { #if 0 b4[i] -= (3*(b3[i] + b5[i])+4)>>3; @@ -782,8 +781,8 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) { #if 0 - c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; - c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; - c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; + c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; + c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; + c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; #endif }