mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
cosmetics: Reformat PPC code in libavcodec according to style guidelines.
This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
41f5c62f5c
commit
e3905ce0af
@ -60,33 +60,33 @@ int mm_support(void)
|
|||||||
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
|
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
|
||||||
/* list below must match enum in dsputil_ppc.h */
|
/* list below must match enum in dsputil_ppc.h */
|
||||||
static unsigned char* perfname[] = {
|
static unsigned char* perfname[] = {
|
||||||
"ff_fft_calc_altivec",
|
"ff_fft_calc_altivec",
|
||||||
"gmc1_altivec",
|
"gmc1_altivec",
|
||||||
"dct_unquantize_h263_altivec",
|
"dct_unquantize_h263_altivec",
|
||||||
"fdct_altivec",
|
"fdct_altivec",
|
||||||
"idct_add_altivec",
|
"idct_add_altivec",
|
||||||
"idct_put_altivec",
|
"idct_put_altivec",
|
||||||
"put_pixels16_altivec",
|
"put_pixels16_altivec",
|
||||||
"avg_pixels16_altivec",
|
"avg_pixels16_altivec",
|
||||||
"avg_pixels8_altivec",
|
"avg_pixels8_altivec",
|
||||||
"put_pixels8_xy2_altivec",
|
"put_pixels8_xy2_altivec",
|
||||||
"put_no_rnd_pixels8_xy2_altivec",
|
"put_no_rnd_pixels8_xy2_altivec",
|
||||||
"put_pixels16_xy2_altivec",
|
"put_pixels16_xy2_altivec",
|
||||||
"put_no_rnd_pixels16_xy2_altivec",
|
"put_no_rnd_pixels16_xy2_altivec",
|
||||||
"hadamard8_diff8x8_altivec",
|
"hadamard8_diff8x8_altivec",
|
||||||
"hadamard8_diff16_altivec",
|
"hadamard8_diff16_altivec",
|
||||||
"avg_pixels8_xy2_altivec",
|
"avg_pixels8_xy2_altivec",
|
||||||
"clear_blocks_dcbz32_ppc",
|
"clear_blocks_dcbz32_ppc",
|
||||||
"clear_blocks_dcbz128_ppc",
|
"clear_blocks_dcbz128_ppc",
|
||||||
"put_h264_chroma_mc8_altivec",
|
"put_h264_chroma_mc8_altivec",
|
||||||
"avg_h264_chroma_mc8_altivec",
|
"avg_h264_chroma_mc8_altivec",
|
||||||
"put_h264_qpel16_h_lowpass_altivec",
|
"put_h264_qpel16_h_lowpass_altivec",
|
||||||
"avg_h264_qpel16_h_lowpass_altivec",
|
"avg_h264_qpel16_h_lowpass_altivec",
|
||||||
"put_h264_qpel16_v_lowpass_altivec",
|
"put_h264_qpel16_v_lowpass_altivec",
|
||||||
"avg_h264_qpel16_v_lowpass_altivec",
|
"avg_h264_qpel16_v_lowpass_altivec",
|
||||||
"put_h264_qpel16_hv_lowpass_altivec",
|
"put_h264_qpel16_hv_lowpass_altivec",
|
||||||
"avg_h264_qpel16_hv_lowpass_altivec",
|
"avg_h264_qpel16_hv_lowpass_altivec",
|
||||||
""
|
""
|
||||||
};
|
};
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#endif
|
#endif
|
||||||
@ -94,51 +94,44 @@ static unsigned char* perfname[] = {
|
|||||||
#ifdef CONFIG_POWERPC_PERF
|
#ifdef CONFIG_POWERPC_PERF
|
||||||
void powerpc_display_perf_report(void)
|
void powerpc_display_perf_report(void)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
|
av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
|
||||||
for(i = 0 ; i < powerpc_perf_total ; i++)
|
for(i = 0 ; i < powerpc_perf_total ; i++) {
|
||||||
{
|
for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
|
||||||
for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
|
if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
|
||||||
{
|
av_log(NULL, AV_LOG_INFO,
|
||||||
if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
|
" Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
|
||||||
av_log(NULL, AV_LOG_INFO,
|
perfname[i],
|
||||||
" Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
|
j+1,
|
||||||
perfname[i],
|
perfdata[j][i][powerpc_data_min],
|
||||||
j+1,
|
perfdata[j][i][powerpc_data_max],
|
||||||
perfdata[j][i][powerpc_data_min],
|
(double)perfdata[j][i][powerpc_data_sum] /
|
||||||
perfdata[j][i][powerpc_data_max],
|
(double)perfdata[j][i][powerpc_data_num],
|
||||||
(double)perfdata[j][i][powerpc_data_sum] /
|
perfdata[j][i][powerpc_data_num]);
|
||||||
(double)perfdata[j][i][powerpc_data_num],
|
}
|
||||||
perfdata[j][i][powerpc_data_num]);
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif /* CONFIG_POWERPC_PERF */
|
#endif /* CONFIG_POWERPC_PERF */
|
||||||
|
|
||||||
/* ***** WARNING ***** WARNING ***** WARNING ***** */
|
/* ***** WARNING ***** WARNING ***** WARNING ***** */
|
||||||
/*
|
/*
|
||||||
clear_blocks_dcbz32_ppc will not work properly
|
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
|
||||||
on PowerPC processors with a cache line size
|
cache line size not equal to 32 bytes.
|
||||||
not equal to 32 bytes.
|
Fortunately, all processors used by Apple up to at least the 7450 (aka second
|
||||||
Fortunately, all processors used by Apple up to
|
generation G4) use a 32-byte cache line.
|
||||||
at least the 7450 (aka second generation G4)
|
This is due to the use of the 'dcbz' instruction. It simply clears to zero a
|
||||||
use a 32-byte cache line.
|
single cache line, so you need to know the cache line size to use it!
|
||||||
This is due to the use of the 'dcbz' instruction.
|
It's absurd, but it's fast...
|
||||||
It simply clears to zero a single cache line,
|
|
||||||
so you need to know the cache line size to use it!
|
|
||||||
It's absurd, but it's fast...
|
|
||||||
|
|
||||||
Update 24/06/2003: Apple released the G5 yesterday,
|
Update 24/06/2003: Apple released the G5 yesterday, with a PPC970. Cache line
|
||||||
with a PPC970. Cache line size: 128 bytes. Oops.
|
size: 128 bytes. Oops.
|
||||||
The semantics of dcbz were changed: it always clears
|
The semantics of dcbz were changed: it always clears 32 bytes, so the function
|
||||||
32 bytes, so the function below will work, but will
|
below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
|
||||||
be slow. So I fixed check_dcbz_effect to use dcbzl,
|
which is defined to clear a cache line (as dcbz did before). So we can still
|
||||||
which is defined to clear a cache line (as dcbz did before).
|
distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
|
||||||
So we can still distinguish, and use dcbz (32 bytes)
|
|
||||||
or dcbzl (one cache line) as required.
|
|
||||||
|
|
||||||
see <http://developer.apple.com/technotes/tn/tn2087.html>
|
see <http://developer.apple.com/technotes/tn/tn2087.html>
|
||||||
and <http://developer.apple.com/technotes/tn/tn2086.html>
|
and <http://developer.apple.com/technotes/tn/tn2086.html>
|
||||||
*/
|
*/
|
||||||
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
|
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
|
||||||
{
|
{
|
||||||
@ -148,21 +141,21 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
|
|||||||
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
|
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
|
||||||
#if 1
|
#if 1
|
||||||
if (misal) {
|
if (misal) {
|
||||||
((unsigned long*)blocks)[0] = 0L;
|
((unsigned long*)blocks)[0] = 0L;
|
||||||
((unsigned long*)blocks)[1] = 0L;
|
((unsigned long*)blocks)[1] = 0L;
|
||||||
((unsigned long*)blocks)[2] = 0L;
|
((unsigned long*)blocks)[2] = 0L;
|
||||||
((unsigned long*)blocks)[3] = 0L;
|
((unsigned long*)blocks)[3] = 0L;
|
||||||
i += 16;
|
i += 16;
|
||||||
}
|
}
|
||||||
for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
|
for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
|
||||||
asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
|
asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
|
||||||
}
|
}
|
||||||
if (misal) {
|
if (misal) {
|
||||||
((unsigned long*)blocks)[188] = 0L;
|
((unsigned long*)blocks)[188] = 0L;
|
||||||
((unsigned long*)blocks)[189] = 0L;
|
((unsigned long*)blocks)[189] = 0L;
|
||||||
((unsigned long*)blocks)[190] = 0L;
|
((unsigned long*)blocks)[190] = 0L;
|
||||||
((unsigned long*)blocks)[191] = 0L;
|
((unsigned long*)blocks)[191] = 0L;
|
||||||
i += 16;
|
i += 16;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
||||||
@ -180,16 +173,16 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
|
|||||||
register int i = 0;
|
register int i = 0;
|
||||||
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
|
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
|
||||||
#if 1
|
#if 1
|
||||||
if (misal) {
|
if (misal) {
|
||||||
// we could probably also optimize this case,
|
// we could probably also optimize this case,
|
||||||
// but there's not much point as the machines
|
// but there's not much point as the machines
|
||||||
// aren't available yet (2003-06-26)
|
// aren't available yet (2003-06-26)
|
||||||
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
|
for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
|
||||||
asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
|
asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
||||||
#endif
|
#endif
|
||||||
@ -198,7 +191,7 @@ POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
|
|||||||
#else
|
#else
|
||||||
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
|
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
|
||||||
{
|
{
|
||||||
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
memset(blocks, 0, sizeof(DCTELEM)*6*64);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -210,34 +203,32 @@ void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
|
|||||||
knows about dcbzl ... */
|
knows about dcbzl ... */
|
||||||
long check_dcbzl_effect(void)
|
long check_dcbzl_effect(void)
|
||||||
{
|
{
|
||||||
register char *fakedata = av_malloc(1024);
|
register char *fakedata = av_malloc(1024);
|
||||||
register char *fakedata_middle;
|
register char *fakedata_middle;
|
||||||
register long zero = 0;
|
register long zero = 0;
|
||||||
register long i = 0;
|
register long i = 0;
|
||||||
long count = 0;
|
long count = 0;
|
||||||
|
|
||||||
if (!fakedata)
|
if (!fakedata) {
|
||||||
{
|
return 0L;
|
||||||
return 0L;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
fakedata_middle = (fakedata + 512);
|
fakedata_middle = (fakedata + 512);
|
||||||
|
|
||||||
memset(fakedata, 0xFF, 1024);
|
memset(fakedata, 0xFF, 1024);
|
||||||
|
|
||||||
/* below the constraint "b" seems to mean "Address base register"
|
/* below the constraint "b" seems to mean "Address base register"
|
||||||
in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
|
in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
|
||||||
asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
|
asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
|
||||||
|
|
||||||
for (i = 0; i < 1024 ; i ++)
|
for (i = 0; i < 1024 ; i ++) {
|
||||||
{
|
if (fakedata[i] == (char)0)
|
||||||
if (fakedata[i] == (char)0)
|
count++;
|
||||||
count++;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
av_free(fakedata);
|
av_free(fakedata);
|
||||||
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
long check_dcbzl_effect(void)
|
long check_dcbzl_effect(void)
|
||||||
@ -286,36 +277,31 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
|
|||||||
|
|
||||||
#ifdef CONFIG_ENCODERS
|
#ifdef CONFIG_ENCODERS
|
||||||
if (avctx->dct_algo == FF_DCT_AUTO ||
|
if (avctx->dct_algo == FF_DCT_AUTO ||
|
||||||
avctx->dct_algo == FF_DCT_ALTIVEC)
|
avctx->dct_algo == FF_DCT_ALTIVEC) {
|
||||||
{
|
|
||||||
c->fdct = fdct_altivec;
|
c->fdct = fdct_altivec;
|
||||||
}
|
}
|
||||||
#endif //CONFIG_ENCODERS
|
#endif //CONFIG_ENCODERS
|
||||||
|
|
||||||
if (avctx->lowres==0)
|
if (avctx->lowres==0) {
|
||||||
{
|
if ((avctx->idct_algo == FF_IDCT_AUTO) ||
|
||||||
if ((avctx->idct_algo == FF_IDCT_AUTO) ||
|
(avctx->idct_algo == FF_IDCT_ALTIVEC)) {
|
||||||
(avctx->idct_algo == FF_IDCT_ALTIVEC))
|
c->idct_put = idct_put_altivec;
|
||||||
{
|
c->idct_add = idct_add_altivec;
|
||||||
c->idct_put = idct_put_altivec;
|
c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
|
||||||
c->idct_add = idct_add_altivec;
|
}
|
||||||
c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_POWERPC_PERF
|
#ifdef CONFIG_POWERPC_PERF
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
for (i = 0 ; i < powerpc_perf_total ; i++)
|
for (i = 0 ; i < powerpc_perf_total ; i++) {
|
||||||
{
|
for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
|
||||||
for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
|
perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
|
||||||
{
|
perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
|
||||||
perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
|
perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
|
||||||
perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
|
perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
|
||||||
perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
|
}
|
||||||
perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif /* CONFIG_POWERPC_PERF */
|
#endif /* CONFIG_POWERPC_PERF */
|
||||||
}
|
}
|
||||||
|
@ -31,40 +31,40 @@ void powerpc_display_perf_report(void);
|
|||||||
/* if you add to the enum below, also add to the perfname array
|
/* if you add to the enum below, also add to the perfname array
|
||||||
in dsputil_ppc.c */
|
in dsputil_ppc.c */
|
||||||
enum powerpc_perf_index {
|
enum powerpc_perf_index {
|
||||||
altivec_fft_num = 0,
|
altivec_fft_num = 0,
|
||||||
altivec_gmc1_num,
|
altivec_gmc1_num,
|
||||||
altivec_dct_unquantize_h263_num,
|
altivec_dct_unquantize_h263_num,
|
||||||
altivec_fdct,
|
altivec_fdct,
|
||||||
altivec_idct_add_num,
|
altivec_idct_add_num,
|
||||||
altivec_idct_put_num,
|
altivec_idct_put_num,
|
||||||
altivec_put_pixels16_num,
|
altivec_put_pixels16_num,
|
||||||
altivec_avg_pixels16_num,
|
altivec_avg_pixels16_num,
|
||||||
altivec_avg_pixels8_num,
|
altivec_avg_pixels8_num,
|
||||||
altivec_put_pixels8_xy2_num,
|
altivec_put_pixels8_xy2_num,
|
||||||
altivec_put_no_rnd_pixels8_xy2_num,
|
altivec_put_no_rnd_pixels8_xy2_num,
|
||||||
altivec_put_pixels16_xy2_num,
|
altivec_put_pixels16_xy2_num,
|
||||||
altivec_put_no_rnd_pixels16_xy2_num,
|
altivec_put_no_rnd_pixels16_xy2_num,
|
||||||
altivec_hadamard8_diff8x8_num,
|
altivec_hadamard8_diff8x8_num,
|
||||||
altivec_hadamard8_diff16_num,
|
altivec_hadamard8_diff16_num,
|
||||||
altivec_avg_pixels8_xy2_num,
|
altivec_avg_pixels8_xy2_num,
|
||||||
powerpc_clear_blocks_dcbz32,
|
powerpc_clear_blocks_dcbz32,
|
||||||
powerpc_clear_blocks_dcbz128,
|
powerpc_clear_blocks_dcbz128,
|
||||||
altivec_put_h264_chroma_mc8_num,
|
altivec_put_h264_chroma_mc8_num,
|
||||||
altivec_avg_h264_chroma_mc8_num,
|
altivec_avg_h264_chroma_mc8_num,
|
||||||
altivec_put_h264_qpel16_h_lowpass_num,
|
altivec_put_h264_qpel16_h_lowpass_num,
|
||||||
altivec_avg_h264_qpel16_h_lowpass_num,
|
altivec_avg_h264_qpel16_h_lowpass_num,
|
||||||
altivec_put_h264_qpel16_v_lowpass_num,
|
altivec_put_h264_qpel16_v_lowpass_num,
|
||||||
altivec_avg_h264_qpel16_v_lowpass_num,
|
altivec_avg_h264_qpel16_v_lowpass_num,
|
||||||
altivec_put_h264_qpel16_hv_lowpass_num,
|
altivec_put_h264_qpel16_hv_lowpass_num,
|
||||||
altivec_avg_h264_qpel16_hv_lowpass_num,
|
altivec_avg_h264_qpel16_hv_lowpass_num,
|
||||||
powerpc_perf_total
|
powerpc_perf_total
|
||||||
};
|
};
|
||||||
enum powerpc_data_index {
|
enum powerpc_data_index {
|
||||||
powerpc_data_min = 0,
|
powerpc_data_min = 0,
|
||||||
powerpc_data_max,
|
powerpc_data_max,
|
||||||
powerpc_data_sum,
|
powerpc_data_sum,
|
||||||
powerpc_data_num,
|
powerpc_data_num,
|
||||||
powerpc_data_total
|
powerpc_data_total
|
||||||
};
|
};
|
||||||
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
|
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
|
||||||
|
|
||||||
@ -105,45 +105,42 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
|
|||||||
#define POWERPC_GET_PMC6(a) do {} while (0)
|
#define POWERPC_GET_PMC6(a) do {} while (0)
|
||||||
#endif
|
#endif
|
||||||
#endif /* HAVE_PPC64 */
|
#endif /* HAVE_PPC64 */
|
||||||
#define POWERPC_PERF_DECLARE(a, cond) \
|
#define POWERPC_PERF_DECLARE(a, cond) \
|
||||||
POWERP_PMC_DATATYPE \
|
POWERP_PMC_DATATYPE \
|
||||||
pmc_start[POWERPC_NUM_PMC_ENABLED], \
|
pmc_start[POWERPC_NUM_PMC_ENABLED], \
|
||||||
pmc_stop[POWERPC_NUM_PMC_ENABLED], \
|
pmc_stop[POWERPC_NUM_PMC_ENABLED], \
|
||||||
pmc_loop_index;
|
pmc_loop_index;
|
||||||
#define POWERPC_PERF_START_COUNT(a, cond) do { \
|
#define POWERPC_PERF_START_COUNT(a, cond) do { \
|
||||||
POWERPC_GET_PMC6(pmc_start[5]); \
|
POWERPC_GET_PMC6(pmc_start[5]); \
|
||||||
POWERPC_GET_PMC5(pmc_start[4]); \
|
POWERPC_GET_PMC5(pmc_start[4]); \
|
||||||
POWERPC_GET_PMC4(pmc_start[3]); \
|
POWERPC_GET_PMC4(pmc_start[3]); \
|
||||||
POWERPC_GET_PMC3(pmc_start[2]); \
|
POWERPC_GET_PMC3(pmc_start[2]); \
|
||||||
POWERPC_GET_PMC2(pmc_start[1]); \
|
POWERPC_GET_PMC2(pmc_start[1]); \
|
||||||
POWERPC_GET_PMC1(pmc_start[0]); \
|
POWERPC_GET_PMC1(pmc_start[0]); \
|
||||||
} while (0)
|
} while (0)
|
||||||
#define POWERPC_PERF_STOP_COUNT(a, cond) do { \
|
#define POWERPC_PERF_STOP_COUNT(a, cond) do { \
|
||||||
POWERPC_GET_PMC1(pmc_stop[0]); \
|
POWERPC_GET_PMC1(pmc_stop[0]); \
|
||||||
POWERPC_GET_PMC2(pmc_stop[1]); \
|
POWERPC_GET_PMC2(pmc_stop[1]); \
|
||||||
POWERPC_GET_PMC3(pmc_stop[2]); \
|
POWERPC_GET_PMC3(pmc_stop[2]); \
|
||||||
POWERPC_GET_PMC4(pmc_stop[3]); \
|
POWERPC_GET_PMC4(pmc_stop[3]); \
|
||||||
POWERPC_GET_PMC5(pmc_stop[4]); \
|
POWERPC_GET_PMC5(pmc_stop[4]); \
|
||||||
POWERPC_GET_PMC6(pmc_stop[5]); \
|
POWERPC_GET_PMC6(pmc_stop[5]); \
|
||||||
if (cond) \
|
if (cond) { \
|
||||||
{ \
|
for(pmc_loop_index = 0; \
|
||||||
for(pmc_loop_index = 0; \
|
pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
|
||||||
pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
|
pmc_loop_index++) { \
|
||||||
pmc_loop_index++) \
|
if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \
|
||||||
{ \
|
POWERP_PMC_DATATYPE diff = \
|
||||||
if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
|
pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
|
||||||
{ \
|
if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
|
||||||
POWERP_PMC_DATATYPE diff = \
|
perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
|
||||||
pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
|
if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
|
||||||
if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
|
perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
|
||||||
perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
|
perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \
|
||||||
if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
|
perfdata[pmc_loop_index][a][powerpc_data_num] ++; \
|
||||||
perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
|
} \
|
||||||
perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \
|
} \
|
||||||
perfdata[pmc_loop_index][a][powerpc_data_num] ++; \
|
} \
|
||||||
} \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
} while (0)
|
} while (0)
|
||||||
#else /* CONFIG_POWERPC_PERF */
|
#else /* CONFIG_POWERPC_PERF */
|
||||||
// those are needed to avoid empty statements.
|
// those are needed to avoid empty statements.
|
||||||
|
@ -33,21 +33,21 @@
|
|||||||
/* butterfly op */
|
/* butterfly op */
|
||||||
#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
|
#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
|
||||||
{\
|
{\
|
||||||
FFTSample ax, ay, bx, by;\
|
FFTSample ax, ay, bx, by;\
|
||||||
bx=pre1;\
|
bx=pre1;\
|
||||||
by=pim1;\
|
by=pim1;\
|
||||||
ax=qre1;\
|
ax=qre1;\
|
||||||
ay=qim1;\
|
ay=qim1;\
|
||||||
pre = (bx + ax);\
|
pre = (bx + ax);\
|
||||||
pim = (by + ay);\
|
pim = (by + ay);\
|
||||||
qre = (bx - ax);\
|
qre = (bx - ax);\
|
||||||
qim = (by - ay);\
|
qim = (by - ay);\
|
||||||
}
|
}
|
||||||
#define MUL16(a,b) ((a) * (b))
|
#define MUL16(a,b) ((a) * (b))
|
||||||
#define CMUL(pre, pim, are, aim, bre, bim) \
|
#define CMUL(pre, pim, are, aim, bre, bim) \
|
||||||
{\
|
{\
|
||||||
pre = (MUL16(are, bre) - MUL16(aim, bim));\
|
pre = (MUL16(are, bre) - MUL16(aim, bim));\
|
||||||
pim = (MUL16(are, bim) + MUL16(bre, aim));\
|
pim = (MUL16(are, bim) + MUL16(bre, aim));\
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -85,14 +85,11 @@ POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
|
|||||||
|
|
||||||
c1 = vcii(p,p,n,n);
|
c1 = vcii(p,p,n,n);
|
||||||
|
|
||||||
if (s->inverse)
|
if (s->inverse) {
|
||||||
{
|
c2 = vcii(p,p,n,p);
|
||||||
c2 = vcii(p,p,n,p);
|
} else {
|
||||||
}
|
c2 = vcii(p,p,p,n);
|
||||||
else
|
}
|
||||||
{
|
|
||||||
c2 = vcii(p,p,p,n);
|
|
||||||
}
|
|
||||||
|
|
||||||
j = (np >> 2);
|
j = (np >> 2);
|
||||||
do {
|
do {
|
||||||
|
@ -36,16 +36,16 @@ void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int str
|
|||||||
{
|
{
|
||||||
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
|
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
|
||||||
const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
|
const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
|
||||||
{rounder, rounder, rounder, rounder,
|
{rounder, rounder, rounder, rounder,
|
||||||
rounder, rounder, rounder, rounder};
|
rounder, rounder, rounder, rounder};
|
||||||
const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
|
const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
|
||||||
{
|
{
|
||||||
(16-x16)*(16-y16), /* A */
|
(16-x16)*(16-y16), /* A */
|
||||||
( x16)*(16-y16), /* B */
|
( x16)*(16-y16), /* B */
|
||||||
(16-x16)*( y16), /* C */
|
(16-x16)*( y16), /* C */
|
||||||
( x16)*( y16), /* D */
|
( x16)*( y16), /* D */
|
||||||
0, 0, 0, 0 /* padding */
|
0, 0, 0, 0 /* padding */
|
||||||
};
|
};
|
||||||
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
|
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
|
||||||
register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
|
register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
|
||||||
register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
|
register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
|
||||||
@ -74,73 +74,67 @@ POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
|
|||||||
src_1 = vec_ld(16, src);
|
src_1 = vec_ld(16, src);
|
||||||
srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
|
srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
|
||||||
|
|
||||||
if (src_really_odd != 0x0000000F)
|
if (src_really_odd != 0x0000000F) {
|
||||||
{ // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
|
// if src & 0xF == 0xF, then (src+1) is properly aligned
|
||||||
srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
|
// on the second vector.
|
||||||
}
|
srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
|
||||||
else
|
} else {
|
||||||
{
|
srcvB = src_1;
|
||||||
srcvB = src_1;
|
|
||||||
}
|
}
|
||||||
srcvA = vec_mergeh(vczero, srcvA);
|
srcvA = vec_mergeh(vczero, srcvA);
|
||||||
srcvB = vec_mergeh(vczero, srcvB);
|
srcvB = vec_mergeh(vczero, srcvB);
|
||||||
|
|
||||||
for(i=0; i<h; i++)
|
for(i=0; i<h; i++) {
|
||||||
{
|
dst_odd = (unsigned long)dst & 0x0000000F;
|
||||||
dst_odd = (unsigned long)dst & 0x0000000F;
|
src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;
|
||||||
src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;
|
|
||||||
|
|
||||||
dstv = vec_ld(0, dst);
|
dstv = vec_ld(0, dst);
|
||||||
|
|
||||||
// we'll be able to pick up our 9 char elements
|
// we'll be able to pick up our 9 char elements
|
||||||
// at src + stride from those 32 bytes
|
// at src + stride from those 32 bytes
|
||||||
// then reuse the resulting 2 vectors srcvC and srcvD
|
// then reuse the resulting 2 vectors srcvC and srcvD
|
||||||
// as the next srcvA and srcvB
|
// as the next srcvA and srcvB
|
||||||
src_0 = vec_ld(stride + 0, src);
|
src_0 = vec_ld(stride + 0, src);
|
||||||
src_1 = vec_ld(stride + 16, src);
|
src_1 = vec_ld(stride + 16, src);
|
||||||
srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
|
srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
|
||||||
|
|
||||||
if (src_really_odd != 0x0000000F)
|
if (src_really_odd != 0x0000000F) {
|
||||||
{ // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
|
// if src & 0xF == 0xF, then (src+1) is properly aligned
|
||||||
srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
|
// on the second vector.
|
||||||
}
|
srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
|
||||||
else
|
} else {
|
||||||
{
|
srcvD = src_1;
|
||||||
srcvD = src_1;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
srcvC = vec_mergeh(vczero, srcvC);
|
srcvC = vec_mergeh(vczero, srcvC);
|
||||||
srcvD = vec_mergeh(vczero, srcvD);
|
srcvD = vec_mergeh(vczero, srcvD);
|
||||||
|
|
||||||
|
|
||||||
// OK, now we (finally) do the math :-)
|
// OK, now we (finally) do the math :-)
|
||||||
// these four instructions replace 32 int muls & 32 int adds.
|
// these four instructions replace 32 int muls & 32 int adds.
|
||||||
// isn't AltiVec nice ?
|
// isn't AltiVec nice ?
|
||||||
tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
|
tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
|
||||||
tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
|
tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
|
||||||
tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
|
tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
|
||||||
tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
|
tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
|
||||||
|
|
||||||
srcvA = srcvC;
|
srcvA = srcvC;
|
||||||
srcvB = srcvD;
|
srcvB = srcvD;
|
||||||
|
|
||||||
tempD = vec_sr(tempD, vcsr8);
|
tempD = vec_sr(tempD, vcsr8);
|
||||||
|
|
||||||
dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
|
dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
|
||||||
|
|
||||||
if (dst_odd)
|
if (dst_odd) {
|
||||||
{
|
dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
|
||||||
dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
|
} else {
|
||||||
}
|
dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
|
||||||
else
|
}
|
||||||
{
|
|
||||||
dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
|
|
||||||
}
|
|
||||||
|
|
||||||
vec_st(dstv2, 0, dst);
|
vec_st(dstv2, 0, dst);
|
||||||
|
|
||||||
dst += stride;
|
dst += stride;
|
||||||
src += stride;
|
src += stride;
|
||||||
}
|
}
|
||||||
|
|
||||||
POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
|
POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
|
||||||
|
@ -196,7 +196,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
|
|||||||
const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
|
const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
|
||||||
LOAD_ZERO;
|
LOAD_ZERO;
|
||||||
const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
|
const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
|
||||||
const vec_u16_t v6us = vec_splat_u16(6);
|
const vec_u16_t v6us = vec_splat_u16(6);
|
||||||
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
|
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
|
||||||
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
|
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
|
||||||
|
|
||||||
@ -392,8 +392,8 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
|
|||||||
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
|
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
H264_MC(put_, 16, altivec)
|
H264_MC(put_, 16, altivec)
|
||||||
H264_MC(avg_, 16, altivec)
|
H264_MC(avg_, 16, altivec)
|
||||||
|
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
@ -685,9 +685,9 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
|
|||||||
r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
|
r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
|
||||||
\
|
\
|
||||||
/*Third merge*/ \
|
/*Third merge*/ \
|
||||||
r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \
|
r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \
|
||||||
r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \
|
r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \
|
||||||
r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \
|
r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \
|
||||||
r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
|
r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
|
||||||
r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
|
r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
|
||||||
r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
|
r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
|
||||||
|
@ -206,489 +206,489 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
|
|||||||
|
|
||||||
/* this code assumes stride % 16 == 0 */
|
/* this code assumes stride % 16 == 0 */
|
||||||
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
|
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
|
||||||
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
|
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
|
||||||
register int i;
|
register int i;
|
||||||
|
|
||||||
LOAD_ZERO;
|
LOAD_ZERO;
|
||||||
const vec_u8_t permM2 = vec_lvsl(-2, src);
|
const vec_u8_t permM2 = vec_lvsl(-2, src);
|
||||||
const vec_u8_t permM1 = vec_lvsl(-1, src);
|
const vec_u8_t permM1 = vec_lvsl(-1, src);
|
||||||
const vec_u8_t permP0 = vec_lvsl(+0, src);
|
const vec_u8_t permP0 = vec_lvsl(+0, src);
|
||||||
const vec_u8_t permP1 = vec_lvsl(+1, src);
|
const vec_u8_t permP1 = vec_lvsl(+1, src);
|
||||||
const vec_u8_t permP2 = vec_lvsl(+2, src);
|
const vec_u8_t permP2 = vec_lvsl(+2, src);
|
||||||
const vec_u8_t permP3 = vec_lvsl(+3, src);
|
const vec_u8_t permP3 = vec_lvsl(+3, src);
|
||||||
const vec_s16_t v5ss = vec_splat_s16(5);
|
const vec_s16_t v5ss = vec_splat_s16(5);
|
||||||
const vec_u16_t v5us = vec_splat_u16(5);
|
const vec_u16_t v5us = vec_splat_u16(5);
|
||||||
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
|
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
|
||||||
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
|
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
|
||||||
|
|
||||||
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
|
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
|
||||||
|
|
||||||
register int align = ((((unsigned long)src) - 2) % 16);
|
register int align = ((((unsigned long)src) - 2) % 16);
|
||||||
|
|
||||||
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
|
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
|
||||||
srcP2A, srcP2B, srcP3A, srcP3B,
|
srcP2A, srcP2B, srcP3A, srcP3B,
|
||||||
srcM1A, srcM1B, srcM2A, srcM2B,
|
srcM1A, srcM1B, srcM2A, srcM2B,
|
||||||
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
|
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
|
||||||
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
|
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
|
||||||
psumA, psumB, sumA, sumB;
|
psumA, psumB, sumA, sumB;
|
||||||
|
|
||||||
vec_u8_t sum, vdst, fsum;
|
vec_u8_t sum, vdst, fsum;
|
||||||
|
|
||||||
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
|
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
|
||||||
|
|
||||||
for (i = 0 ; i < 16 ; i ++) {
|
for (i = 0 ; i < 16 ; i ++) {
|
||||||
vec_u8_t srcR1 = vec_ld(-2, src);
|
vec_u8_t srcR1 = vec_ld(-2, src);
|
||||||
vec_u8_t srcR2 = vec_ld(14, src);
|
vec_u8_t srcR2 = vec_ld(14, src);
|
||||||
|
|
||||||
switch (align) {
|
switch (align) {
|
||||||
default: {
|
default: {
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
||||||
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
||||||
srcP3 = vec_perm(srcR1, srcR2, permP3);
|
srcP3 = vec_perm(srcR1, srcR2, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 11: {
|
case 11: {
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
||||||
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
||||||
srcP3 = srcR2;
|
srcP3 = srcR2;
|
||||||
} break;
|
} break;
|
||||||
case 12: {
|
case 12: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
||||||
srcP2 = srcR2;
|
srcP2 = srcR2;
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 13: {
|
case 13: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = srcR2;
|
srcP1 = srcR2;
|
||||||
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 14: {
|
case 14: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = srcR2;
|
srcP0 = srcR2;
|
||||||
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
||||||
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 15: {
|
case 15: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = srcR2;
|
srcM1 = srcR2;
|
||||||
srcP0 = vec_perm(srcR2, srcR3, permP0);
|
srcP0 = vec_perm(srcR2, srcR3, permP0);
|
||||||
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
||||||
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
|
}
|
||||||
|
|
||||||
|
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
|
||||||
|
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
|
||||||
|
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
|
||||||
|
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
|
||||||
|
|
||||||
|
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
|
||||||
|
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
|
||||||
|
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
|
||||||
|
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
|
||||||
|
|
||||||
|
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
|
||||||
|
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
|
||||||
|
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
|
||||||
|
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
|
||||||
|
|
||||||
|
sum1A = vec_adds(srcP0A, srcP1A);
|
||||||
|
sum1B = vec_adds(srcP0B, srcP1B);
|
||||||
|
sum2A = vec_adds(srcM1A, srcP2A);
|
||||||
|
sum2B = vec_adds(srcM1B, srcP2B);
|
||||||
|
sum3A = vec_adds(srcM2A, srcP3A);
|
||||||
|
sum3B = vec_adds(srcM2B, srcP3B);
|
||||||
|
|
||||||
|
pp1A = vec_mladd(sum1A, v20ss, v16ss);
|
||||||
|
pp1B = vec_mladd(sum1B, v20ss, v16ss);
|
||||||
|
|
||||||
|
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
|
||||||
|
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
|
||||||
|
|
||||||
|
pp3A = vec_add(sum3A, pp1A);
|
||||||
|
pp3B = vec_add(sum3B, pp1B);
|
||||||
|
|
||||||
|
psumA = vec_sub(pp3A, pp2A);
|
||||||
|
psumB = vec_sub(pp3B, pp2B);
|
||||||
|
|
||||||
|
sumA = vec_sra(psumA, v5us);
|
||||||
|
sumB = vec_sra(psumB, v5us);
|
||||||
|
|
||||||
|
sum = vec_packsu(sumA, sumB);
|
||||||
|
|
||||||
|
ASSERT_ALIGNED(dst);
|
||||||
|
vdst = vec_ld(0, dst);
|
||||||
|
|
||||||
|
OP_U8_ALTIVEC(fsum, sum, vdst);
|
||||||
|
|
||||||
|
vec_st(fsum, 0, dst);
|
||||||
|
|
||||||
|
src += srcStride;
|
||||||
|
dst += dstStride;
|
||||||
}
|
}
|
||||||
|
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
|
||||||
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
|
|
||||||
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
|
|
||||||
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
|
|
||||||
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
|
|
||||||
|
|
||||||
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
|
|
||||||
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
|
|
||||||
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
|
|
||||||
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
|
|
||||||
|
|
||||||
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
|
|
||||||
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
|
|
||||||
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
|
|
||||||
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
|
|
||||||
|
|
||||||
sum1A = vec_adds(srcP0A, srcP1A);
|
|
||||||
sum1B = vec_adds(srcP0B, srcP1B);
|
|
||||||
sum2A = vec_adds(srcM1A, srcP2A);
|
|
||||||
sum2B = vec_adds(srcM1B, srcP2B);
|
|
||||||
sum3A = vec_adds(srcM2A, srcP3A);
|
|
||||||
sum3B = vec_adds(srcM2B, srcP3B);
|
|
||||||
|
|
||||||
pp1A = vec_mladd(sum1A, v20ss, v16ss);
|
|
||||||
pp1B = vec_mladd(sum1B, v20ss, v16ss);
|
|
||||||
|
|
||||||
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
|
|
||||||
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
|
|
||||||
|
|
||||||
pp3A = vec_add(sum3A, pp1A);
|
|
||||||
pp3B = vec_add(sum3B, pp1B);
|
|
||||||
|
|
||||||
psumA = vec_sub(pp3A, pp2A);
|
|
||||||
psumB = vec_sub(pp3B, pp2B);
|
|
||||||
|
|
||||||
sumA = vec_sra(psumA, v5us);
|
|
||||||
sumB = vec_sra(psumB, v5us);
|
|
||||||
|
|
||||||
sum = vec_packsu(sumA, sumB);
|
|
||||||
|
|
||||||
ASSERT_ALIGNED(dst);
|
|
||||||
vdst = vec_ld(0, dst);
|
|
||||||
|
|
||||||
OP_U8_ALTIVEC(fsum, sum, vdst);
|
|
||||||
|
|
||||||
vec_st(fsum, 0, dst);
|
|
||||||
|
|
||||||
src += srcStride;
|
|
||||||
dst += dstStride;
|
|
||||||
}
|
|
||||||
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* this code assume stride % 16 == 0 */
|
/* this code assume stride % 16 == 0 */
|
||||||
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
|
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
|
||||||
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
|
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
|
||||||
|
|
||||||
register int i;
|
register int i;
|
||||||
|
|
||||||
LOAD_ZERO;
|
LOAD_ZERO;
|
||||||
const vec_u8_t perm = vec_lvsl(0, src);
|
const vec_u8_t perm = vec_lvsl(0, src);
|
||||||
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
|
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
|
||||||
const vec_u16_t v5us = vec_splat_u16(5);
|
const vec_u16_t v5us = vec_splat_u16(5);
|
||||||
const vec_s16_t v5ss = vec_splat_s16(5);
|
const vec_s16_t v5ss = vec_splat_s16(5);
|
||||||
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
|
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
|
||||||
|
|
||||||
uint8_t *srcbis = src - (srcStride * 2);
|
uint8_t *srcbis = src - (srcStride * 2);
|
||||||
|
|
||||||
const vec_u8_t srcM2a = vec_ld(0, srcbis);
|
const vec_u8_t srcM2a = vec_ld(0, srcbis);
|
||||||
const vec_u8_t srcM2b = vec_ld(16, srcbis);
|
const vec_u8_t srcM2b = vec_ld(16, srcbis);
|
||||||
const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
|
const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
|
||||||
// srcbis += srcStride;
|
//srcbis += srcStride;
|
||||||
const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
|
const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
|
||||||
const vec_u8_t srcM1b = vec_ld(16, srcbis);
|
const vec_u8_t srcM1b = vec_ld(16, srcbis);
|
||||||
const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
|
const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
|
||||||
// srcbis += srcStride;
|
//srcbis += srcStride;
|
||||||
const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
|
const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
|
||||||
const vec_u8_t srcP0b = vec_ld(16, srcbis);
|
const vec_u8_t srcP0b = vec_ld(16, srcbis);
|
||||||
const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
|
const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
|
||||||
// srcbis += srcStride;
|
//srcbis += srcStride;
|
||||||
const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
|
const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
|
||||||
const vec_u8_t srcP1b = vec_ld(16, srcbis);
|
const vec_u8_t srcP1b = vec_ld(16, srcbis);
|
||||||
const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
|
const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
|
||||||
// srcbis += srcStride;
|
//srcbis += srcStride;
|
||||||
const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
|
const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
|
||||||
const vec_u8_t srcP2b = vec_ld(16, srcbis);
|
const vec_u8_t srcP2b = vec_ld(16, srcbis);
|
||||||
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
|
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
|
||||||
// srcbis += srcStride;
|
//srcbis += srcStride;
|
||||||
|
|
||||||
vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
|
vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
|
||||||
vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
|
vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
|
||||||
vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
|
vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
|
||||||
vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
|
vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
|
||||||
vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
|
vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
|
||||||
vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
|
vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
|
||||||
vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
|
vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
|
||||||
vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
|
vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
|
||||||
vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
|
vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
|
||||||
vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
|
vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
|
||||||
|
|
||||||
vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
|
vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
|
||||||
psumA, psumB, sumA, sumB,
|
psumA, psumB, sumA, sumB,
|
||||||
srcP3ssA, srcP3ssB,
|
srcP3ssA, srcP3ssB,
|
||||||
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
|
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
|
||||||
|
|
||||||
vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
|
vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
|
||||||
|
|
||||||
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
|
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
|
||||||
|
|
||||||
for (i = 0 ; i < 16 ; i++) {
|
for (i = 0 ; i < 16 ; i++) {
|
||||||
srcP3a = vec_ld(0, srcbis += srcStride);
|
srcP3a = vec_ld(0, srcbis += srcStride);
|
||||||
srcP3b = vec_ld(16, srcbis);
|
srcP3b = vec_ld(16, srcbis);
|
||||||
srcP3 = vec_perm(srcP3a, srcP3b, perm);
|
srcP3 = vec_perm(srcP3a, srcP3b, perm);
|
||||||
srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
|
srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
|
||||||
srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
|
srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
|
||||||
// srcbis += srcStride;
|
//srcbis += srcStride;
|
||||||
|
|
||||||
sum1A = vec_adds(srcP0ssA, srcP1ssA);
|
sum1A = vec_adds(srcP0ssA, srcP1ssA);
|
||||||
sum1B = vec_adds(srcP0ssB, srcP1ssB);
|
sum1B = vec_adds(srcP0ssB, srcP1ssB);
|
||||||
sum2A = vec_adds(srcM1ssA, srcP2ssA);
|
sum2A = vec_adds(srcM1ssA, srcP2ssA);
|
||||||
sum2B = vec_adds(srcM1ssB, srcP2ssB);
|
sum2B = vec_adds(srcM1ssB, srcP2ssB);
|
||||||
sum3A = vec_adds(srcM2ssA, srcP3ssA);
|
sum3A = vec_adds(srcM2ssA, srcP3ssA);
|
||||||
sum3B = vec_adds(srcM2ssB, srcP3ssB);
|
sum3B = vec_adds(srcM2ssB, srcP3ssB);
|
||||||
|
|
||||||
srcM2ssA = srcM1ssA;
|
srcM2ssA = srcM1ssA;
|
||||||
srcM2ssB = srcM1ssB;
|
srcM2ssB = srcM1ssB;
|
||||||
srcM1ssA = srcP0ssA;
|
srcM1ssA = srcP0ssA;
|
||||||
srcM1ssB = srcP0ssB;
|
srcM1ssB = srcP0ssB;
|
||||||
srcP0ssA = srcP1ssA;
|
srcP0ssA = srcP1ssA;
|
||||||
srcP0ssB = srcP1ssB;
|
srcP0ssB = srcP1ssB;
|
||||||
srcP1ssA = srcP2ssA;
|
srcP1ssA = srcP2ssA;
|
||||||
srcP1ssB = srcP2ssB;
|
srcP1ssB = srcP2ssB;
|
||||||
srcP2ssA = srcP3ssA;
|
srcP2ssA = srcP3ssA;
|
||||||
srcP2ssB = srcP3ssB;
|
srcP2ssB = srcP3ssB;
|
||||||
|
|
||||||
pp1A = vec_mladd(sum1A, v20ss, v16ss);
|
pp1A = vec_mladd(sum1A, v20ss, v16ss);
|
||||||
pp1B = vec_mladd(sum1B, v20ss, v16ss);
|
pp1B = vec_mladd(sum1B, v20ss, v16ss);
|
||||||
|
|
||||||
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
|
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
|
||||||
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
|
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
|
||||||
|
|
||||||
pp3A = vec_add(sum3A, pp1A);
|
pp3A = vec_add(sum3A, pp1A);
|
||||||
pp3B = vec_add(sum3B, pp1B);
|
pp3B = vec_add(sum3B, pp1B);
|
||||||
|
|
||||||
psumA = vec_sub(pp3A, pp2A);
|
psumA = vec_sub(pp3A, pp2A);
|
||||||
psumB = vec_sub(pp3B, pp2B);
|
psumB = vec_sub(pp3B, pp2B);
|
||||||
|
|
||||||
sumA = vec_sra(psumA, v5us);
|
sumA = vec_sra(psumA, v5us);
|
||||||
sumB = vec_sra(psumB, v5us);
|
sumB = vec_sra(psumB, v5us);
|
||||||
|
|
||||||
sum = vec_packsu(sumA, sumB);
|
sum = vec_packsu(sumA, sumB);
|
||||||
|
|
||||||
ASSERT_ALIGNED(dst);
|
ASSERT_ALIGNED(dst);
|
||||||
vdst = vec_ld(0, dst);
|
vdst = vec_ld(0, dst);
|
||||||
|
|
||||||
OP_U8_ALTIVEC(fsum, sum, vdst);
|
OP_U8_ALTIVEC(fsum, sum, vdst);
|
||||||
|
|
||||||
vec_st(fsum, 0, dst);
|
vec_st(fsum, 0, dst);
|
||||||
|
|
||||||
dst += dstStride;
|
dst += dstStride;
|
||||||
}
|
}
|
||||||
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
|
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
|
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
|
||||||
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
|
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
|
||||||
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
|
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
|
||||||
register int i;
|
register int i;
|
||||||
LOAD_ZERO;
|
LOAD_ZERO;
|
||||||
const vec_u8_t permM2 = vec_lvsl(-2, src);
|
const vec_u8_t permM2 = vec_lvsl(-2, src);
|
||||||
const vec_u8_t permM1 = vec_lvsl(-1, src);
|
const vec_u8_t permM1 = vec_lvsl(-1, src);
|
||||||
const vec_u8_t permP0 = vec_lvsl(+0, src);
|
const vec_u8_t permP0 = vec_lvsl(+0, src);
|
||||||
const vec_u8_t permP1 = vec_lvsl(+1, src);
|
const vec_u8_t permP1 = vec_lvsl(+1, src);
|
||||||
const vec_u8_t permP2 = vec_lvsl(+2, src);
|
const vec_u8_t permP2 = vec_lvsl(+2, src);
|
||||||
const vec_u8_t permP3 = vec_lvsl(+3, src);
|
const vec_u8_t permP3 = vec_lvsl(+3, src);
|
||||||
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
|
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
|
||||||
const vec_u32_t v10ui = vec_splat_u32(10);
|
const vec_u32_t v10ui = vec_splat_u32(10);
|
||||||
const vec_s16_t v5ss = vec_splat_s16(5);
|
const vec_s16_t v5ss = vec_splat_s16(5);
|
||||||
const vec_s16_t v1ss = vec_splat_s16(1);
|
const vec_s16_t v1ss = vec_splat_s16(1);
|
||||||
const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
|
const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
|
||||||
const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
|
const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
|
||||||
|
|
||||||
register int align = ((((unsigned long)src) - 2) % 16);
|
register int align = ((((unsigned long)src) - 2) % 16);
|
||||||
|
|
||||||
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
|
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
|
||||||
srcP2A, srcP2B, srcP3A, srcP3B,
|
srcP2A, srcP2B, srcP3A, srcP3B,
|
||||||
srcM1A, srcM1B, srcM2A, srcM2B,
|
srcM1A, srcM1B, srcM2A, srcM2B,
|
||||||
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
|
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
|
||||||
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
|
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
|
||||||
|
|
||||||
const vec_u8_t mperm = (const vec_u8_t)
|
const vec_u8_t mperm = (const vec_u8_t)
|
||||||
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
|
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
|
||||||
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
|
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
|
||||||
int16_t *tmpbis = tmp;
|
int16_t *tmpbis = tmp;
|
||||||
|
|
||||||
vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
|
vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
|
||||||
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
|
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
|
||||||
tmpP2ssA, tmpP2ssB;
|
tmpP2ssA, tmpP2ssB;
|
||||||
|
|
||||||
vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
|
vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
|
||||||
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
|
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
|
||||||
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
|
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
|
||||||
ssumAe, ssumAo, ssumBe, ssumBo;
|
ssumAe, ssumAo, ssumBe, ssumBo;
|
||||||
vec_u8_t fsum, sumv, sum, vdst;
|
vec_u8_t fsum, sumv, sum, vdst;
|
||||||
vec_s16_t ssume, ssumo;
|
vec_s16_t ssume, ssumo;
|
||||||
|
|
||||||
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
|
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
|
||||||
src -= (2 * srcStride);
|
src -= (2 * srcStride);
|
||||||
for (i = 0 ; i < 21 ; i ++) {
|
for (i = 0 ; i < 21 ; i ++) {
|
||||||
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
|
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
|
||||||
vec_u8_t srcR1 = vec_ld(-2, src);
|
vec_u8_t srcR1 = vec_ld(-2, src);
|
||||||
vec_u8_t srcR2 = vec_ld(14, src);
|
vec_u8_t srcR2 = vec_ld(14, src);
|
||||||
|
|
||||||
switch (align) {
|
switch (align) {
|
||||||
default: {
|
default: {
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
||||||
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
||||||
srcP3 = vec_perm(srcR1, srcR2, permP3);
|
srcP3 = vec_perm(srcR1, srcR2, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 11: {
|
case 11: {
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
||||||
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
srcP2 = vec_perm(srcR1, srcR2, permP2);
|
||||||
srcP3 = srcR2;
|
srcP3 = srcR2;
|
||||||
} break;
|
} break;
|
||||||
case 12: {
|
case 12: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
srcP1 = vec_perm(srcR1, srcR2, permP1);
|
||||||
srcP2 = srcR2;
|
srcP2 = srcR2;
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 13: {
|
case 13: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
srcP0 = vec_perm(srcR1, srcR2, permP0);
|
||||||
srcP1 = srcR2;
|
srcP1 = srcR2;
|
||||||
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 14: {
|
case 14: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
srcM1 = vec_perm(srcR1, srcR2, permM1);
|
||||||
srcP0 = srcR2;
|
srcP0 = srcR2;
|
||||||
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
||||||
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
case 15: {
|
case 15: {
|
||||||
vec_u8_t srcR3 = vec_ld(30, src);
|
vec_u8_t srcR3 = vec_ld(30, src);
|
||||||
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
srcM2 = vec_perm(srcR1, srcR2, permM2);
|
||||||
srcM1 = srcR2;
|
srcM1 = srcR2;
|
||||||
srcP0 = vec_perm(srcR2, srcR3, permP0);
|
srcP0 = vec_perm(srcR2, srcR3, permP0);
|
||||||
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
srcP1 = vec_perm(srcR2, srcR3, permP1);
|
||||||
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
srcP2 = vec_perm(srcR2, srcR3, permP2);
|
||||||
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
srcP3 = vec_perm(srcR2, srcR3, permP3);
|
||||||
} break;
|
} break;
|
||||||
|
}
|
||||||
|
|
||||||
|
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
|
||||||
|
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
|
||||||
|
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
|
||||||
|
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
|
||||||
|
|
||||||
|
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
|
||||||
|
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
|
||||||
|
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
|
||||||
|
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
|
||||||
|
|
||||||
|
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
|
||||||
|
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
|
||||||
|
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
|
||||||
|
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
|
||||||
|
|
||||||
|
sum1A = vec_adds(srcP0A, srcP1A);
|
||||||
|
sum1B = vec_adds(srcP0B, srcP1B);
|
||||||
|
sum2A = vec_adds(srcM1A, srcP2A);
|
||||||
|
sum2B = vec_adds(srcM1B, srcP2B);
|
||||||
|
sum3A = vec_adds(srcM2A, srcP3A);
|
||||||
|
sum3B = vec_adds(srcM2B, srcP3B);
|
||||||
|
|
||||||
|
pp1A = vec_mladd(sum1A, v20ss, sum3A);
|
||||||
|
pp1B = vec_mladd(sum1B, v20ss, sum3B);
|
||||||
|
|
||||||
|
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
|
||||||
|
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
|
||||||
|
|
||||||
|
psumA = vec_sub(pp1A, pp2A);
|
||||||
|
psumB = vec_sub(pp1B, pp2B);
|
||||||
|
|
||||||
|
vec_st(psumA, 0, tmp);
|
||||||
|
vec_st(psumB, 16, tmp);
|
||||||
|
|
||||||
|
src += srcStride;
|
||||||
|
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
|
||||||
}
|
}
|
||||||
|
|
||||||
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
|
tmpM2ssA = vec_ld(0, tmpbis);
|
||||||
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
|
tmpM2ssB = vec_ld(16, tmpbis);
|
||||||
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
|
tmpbis += tmpStride;
|
||||||
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
|
tmpM1ssA = vec_ld(0, tmpbis);
|
||||||
|
tmpM1ssB = vec_ld(16, tmpbis);
|
||||||
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
|
tmpbis += tmpStride;
|
||||||
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
|
tmpP0ssA = vec_ld(0, tmpbis);
|
||||||
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
|
tmpP0ssB = vec_ld(16, tmpbis);
|
||||||
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
|
tmpbis += tmpStride;
|
||||||
|
tmpP1ssA = vec_ld(0, tmpbis);
|
||||||
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
|
tmpP1ssB = vec_ld(16, tmpbis);
|
||||||
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
|
tmpbis += tmpStride;
|
||||||
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
|
tmpP2ssA = vec_ld(0, tmpbis);
|
||||||
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
|
tmpP2ssB = vec_ld(16, tmpbis);
|
||||||
|
|
||||||
sum1A = vec_adds(srcP0A, srcP1A);
|
|
||||||
sum1B = vec_adds(srcP0B, srcP1B);
|
|
||||||
sum2A = vec_adds(srcM1A, srcP2A);
|
|
||||||
sum2B = vec_adds(srcM1B, srcP2B);
|
|
||||||
sum3A = vec_adds(srcM2A, srcP3A);
|
|
||||||
sum3B = vec_adds(srcM2B, srcP3B);
|
|
||||||
|
|
||||||
pp1A = vec_mladd(sum1A, v20ss, sum3A);
|
|
||||||
pp1B = vec_mladd(sum1B, v20ss, sum3B);
|
|
||||||
|
|
||||||
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
|
|
||||||
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
|
|
||||||
|
|
||||||
psumA = vec_sub(pp1A, pp2A);
|
|
||||||
psumB = vec_sub(pp1B, pp2B);
|
|
||||||
|
|
||||||
vec_st(psumA, 0, tmp);
|
|
||||||
vec_st(psumB, 16, tmp);
|
|
||||||
|
|
||||||
src += srcStride;
|
|
||||||
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
|
|
||||||
}
|
|
||||||
|
|
||||||
tmpM2ssA = vec_ld(0, tmpbis);
|
|
||||||
tmpM2ssB = vec_ld(16, tmpbis);
|
|
||||||
tmpbis += tmpStride;
|
|
||||||
tmpM1ssA = vec_ld(0, tmpbis);
|
|
||||||
tmpM1ssB = vec_ld(16, tmpbis);
|
|
||||||
tmpbis += tmpStride;
|
|
||||||
tmpP0ssA = vec_ld(0, tmpbis);
|
|
||||||
tmpP0ssB = vec_ld(16, tmpbis);
|
|
||||||
tmpbis += tmpStride;
|
|
||||||
tmpP1ssA = vec_ld(0, tmpbis);
|
|
||||||
tmpP1ssB = vec_ld(16, tmpbis);
|
|
||||||
tmpbis += tmpStride;
|
|
||||||
tmpP2ssA = vec_ld(0, tmpbis);
|
|
||||||
tmpP2ssB = vec_ld(16, tmpbis);
|
|
||||||
tmpbis += tmpStride;
|
|
||||||
|
|
||||||
for (i = 0 ; i < 16 ; i++) {
|
|
||||||
const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
|
|
||||||
const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
|
|
||||||
|
|
||||||
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
|
|
||||||
const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
|
|
||||||
const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
|
|
||||||
const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
|
|
||||||
const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
|
|
||||||
const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
|
|
||||||
|
|
||||||
tmpbis += tmpStride;
|
tmpbis += tmpStride;
|
||||||
|
|
||||||
tmpM2ssA = tmpM1ssA;
|
for (i = 0 ; i < 16 ; i++) {
|
||||||
tmpM2ssB = tmpM1ssB;
|
const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
|
||||||
tmpM1ssA = tmpP0ssA;
|
const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
|
||||||
tmpM1ssB = tmpP0ssB;
|
|
||||||
tmpP0ssA = tmpP1ssA;
|
|
||||||
tmpP0ssB = tmpP1ssB;
|
|
||||||
tmpP1ssA = tmpP2ssA;
|
|
||||||
tmpP1ssB = tmpP2ssB;
|
|
||||||
tmpP2ssA = tmpP3ssA;
|
|
||||||
tmpP2ssB = tmpP3ssB;
|
|
||||||
|
|
||||||
pp1Ae = vec_mule(sum1A, v20ss);
|
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
|
||||||
pp1Ao = vec_mulo(sum1A, v20ss);
|
const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
|
||||||
pp1Be = vec_mule(sum1B, v20ss);
|
const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
|
||||||
pp1Bo = vec_mulo(sum1B, v20ss);
|
const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
|
||||||
|
const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
|
||||||
|
const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
|
||||||
|
|
||||||
pp2Ae = vec_mule(sum2A, v5ss);
|
tmpbis += tmpStride;
|
||||||
pp2Ao = vec_mulo(sum2A, v5ss);
|
|
||||||
pp2Be = vec_mule(sum2B, v5ss);
|
|
||||||
pp2Bo = vec_mulo(sum2B, v5ss);
|
|
||||||
|
|
||||||
pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
|
tmpM2ssA = tmpM1ssA;
|
||||||
pp3Ao = vec_mulo(sum3A, v1ss);
|
tmpM2ssB = tmpM1ssB;
|
||||||
pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
|
tmpM1ssA = tmpP0ssA;
|
||||||
pp3Bo = vec_mulo(sum3B, v1ss);
|
tmpM1ssB = tmpP0ssB;
|
||||||
|
tmpP0ssA = tmpP1ssA;
|
||||||
|
tmpP0ssB = tmpP1ssB;
|
||||||
|
tmpP1ssA = tmpP2ssA;
|
||||||
|
tmpP1ssB = tmpP2ssB;
|
||||||
|
tmpP2ssA = tmpP3ssA;
|
||||||
|
tmpP2ssB = tmpP3ssB;
|
||||||
|
|
||||||
pp1cAe = vec_add(pp1Ae, v512si);
|
pp1Ae = vec_mule(sum1A, v20ss);
|
||||||
pp1cAo = vec_add(pp1Ao, v512si);
|
pp1Ao = vec_mulo(sum1A, v20ss);
|
||||||
pp1cBe = vec_add(pp1Be, v512si);
|
pp1Be = vec_mule(sum1B, v20ss);
|
||||||
pp1cBo = vec_add(pp1Bo, v512si);
|
pp1Bo = vec_mulo(sum1B, v20ss);
|
||||||
|
|
||||||
pp32Ae = vec_sub(pp3Ae, pp2Ae);
|
pp2Ae = vec_mule(sum2A, v5ss);
|
||||||
pp32Ao = vec_sub(pp3Ao, pp2Ao);
|
pp2Ao = vec_mulo(sum2A, v5ss);
|
||||||
pp32Be = vec_sub(pp3Be, pp2Be);
|
pp2Be = vec_mule(sum2B, v5ss);
|
||||||
pp32Bo = vec_sub(pp3Bo, pp2Bo);
|
pp2Bo = vec_mulo(sum2B, v5ss);
|
||||||
|
|
||||||
sumAe = vec_add(pp1cAe, pp32Ae);
|
pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
|
||||||
sumAo = vec_add(pp1cAo, pp32Ao);
|
pp3Ao = vec_mulo(sum3A, v1ss);
|
||||||
sumBe = vec_add(pp1cBe, pp32Be);
|
pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
|
||||||
sumBo = vec_add(pp1cBo, pp32Bo);
|
pp3Bo = vec_mulo(sum3B, v1ss);
|
||||||
|
|
||||||
ssumAe = vec_sra(sumAe, v10ui);
|
pp1cAe = vec_add(pp1Ae, v512si);
|
||||||
ssumAo = vec_sra(sumAo, v10ui);
|
pp1cAo = vec_add(pp1Ao, v512si);
|
||||||
ssumBe = vec_sra(sumBe, v10ui);
|
pp1cBe = vec_add(pp1Be, v512si);
|
||||||
ssumBo = vec_sra(sumBo, v10ui);
|
pp1cBo = vec_add(pp1Bo, v512si);
|
||||||
|
|
||||||
ssume = vec_packs(ssumAe, ssumBe);
|
pp32Ae = vec_sub(pp3Ae, pp2Ae);
|
||||||
ssumo = vec_packs(ssumAo, ssumBo);
|
pp32Ao = vec_sub(pp3Ao, pp2Ao);
|
||||||
|
pp32Be = vec_sub(pp3Be, pp2Be);
|
||||||
|
pp32Bo = vec_sub(pp3Bo, pp2Bo);
|
||||||
|
|
||||||
sumv = vec_packsu(ssume, ssumo);
|
sumAe = vec_add(pp1cAe, pp32Ae);
|
||||||
sum = vec_perm(sumv, sumv, mperm);
|
sumAo = vec_add(pp1cAo, pp32Ao);
|
||||||
|
sumBe = vec_add(pp1cBe, pp32Be);
|
||||||
|
sumBo = vec_add(pp1cBo, pp32Bo);
|
||||||
|
|
||||||
ASSERT_ALIGNED(dst);
|
ssumAe = vec_sra(sumAe, v10ui);
|
||||||
vdst = vec_ld(0, dst);
|
ssumAo = vec_sra(sumAo, v10ui);
|
||||||
|
ssumBe = vec_sra(sumBe, v10ui);
|
||||||
|
ssumBo = vec_sra(sumBo, v10ui);
|
||||||
|
|
||||||
OP_U8_ALTIVEC(fsum, sum, vdst);
|
ssume = vec_packs(ssumAe, ssumBe);
|
||||||
|
ssumo = vec_packs(ssumAo, ssumBo);
|
||||||
|
|
||||||
vec_st(fsum, 0, dst);
|
sumv = vec_packsu(ssume, ssumo);
|
||||||
|
sum = vec_perm(sumv, sumv, mperm);
|
||||||
|
|
||||||
dst += dstStride;
|
ASSERT_ALIGNED(dst);
|
||||||
}
|
vdst = vec_ld(0, dst);
|
||||||
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
|
|
||||||
|
OP_U8_ALTIVEC(fsum, sum, vdst);
|
||||||
|
|
||||||
|
vec_st(fsum, 0, dst);
|
||||||
|
|
||||||
|
dst += dstStride;
|
||||||
|
}
|
||||||
|
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,6 @@
|
|||||||
* NOTE: This code is based on GPL code from the libmpeg2 project. The
|
* NOTE: This code is based on GPL code from the libmpeg2 project. The
|
||||||
* author, Michel Lespinasses, has given explicit permission to release
|
* author, Michel Lespinasses, has given explicit permission to release
|
||||||
* under LGPL as part of ffmpeg.
|
* under LGPL as part of ffmpeg.
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -46,8 +46,7 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
|
|||||||
vector signed short zeros, sumhv, sumlv;
|
vector signed short zeros, sumhv, sumlv;
|
||||||
s = src;
|
s = src;
|
||||||
|
|
||||||
for(i=0;i<4;i++)
|
for(i=0;i<4;i++) {
|
||||||
{
|
|
||||||
/*
|
/*
|
||||||
The vec_madds later on does an implicit >>15 on the result.
|
The vec_madds later on does an implicit >>15 on the result.
|
||||||
Since FILTER_BITS is 8, and we have 15 bits of magnitude in
|
Since FILTER_BITS is 8, and we have 15 bits of magnitude in
|
||||||
@ -86,13 +85,11 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
|
|||||||
|
|
||||||
/* Do our altivec resampling on 16 pixels at once. */
|
/* Do our altivec resampling on 16 pixels at once. */
|
||||||
while(dst_width>=16) {
|
while(dst_width>=16) {
|
||||||
/*
|
/* Read 16 (potentially unaligned) bytes from each of
|
||||||
Read 16 (potentially unaligned) bytes from each of
|
|
||||||
4 lines into 4 vectors, and split them into shorts.
|
4 lines into 4 vectors, and split them into shorts.
|
||||||
Interleave the multipy/accumulate for the resample
|
Interleave the multipy/accumulate for the resample
|
||||||
filter with the loads to hide the 3 cycle latency
|
filter with the loads to hide the 3 cycle latency
|
||||||
the vec_madds have.
|
the vec_madds have. */
|
||||||
*/
|
|
||||||
tv = (vector unsigned char *) &s[0 * wrap];
|
tv = (vector unsigned char *) &s[0 * wrap];
|
||||||
tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
|
tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
|
||||||
srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
|
srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
|
||||||
@ -121,10 +118,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
|
|||||||
sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
|
sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
|
||||||
sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
|
sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
|
||||||
|
|
||||||
/*
|
/* Pack the results into our destination vector,
|
||||||
Pack the results into our destination vector,
|
and do an aligned write of that back to memory. */
|
||||||
and do an aligned write of that back to memory.
|
|
||||||
*/
|
|
||||||
dstv = vec_packsu(sumhv, sumlv) ;
|
dstv = vec_packsu(sumhv, sumlv) ;
|
||||||
vec_st(dstv, 0, (vector unsigned char *) dst);
|
vec_st(dstv, 0, (vector unsigned char *) dst);
|
||||||
|
|
||||||
@ -133,10 +128,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
|
|||||||
dst_width-=16;
|
dst_width-=16;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* If there are any leftover pixels, resample them
|
||||||
If there are any leftover pixels, resample them
|
with the slow scalar method. */
|
||||||
with the slow scalar method.
|
|
||||||
*/
|
|
||||||
while(dst_width>0) {
|
while(dst_width>0) {
|
||||||
sum = s[0 * wrap] * filter[0] +
|
sum = s[0 * wrap] * filter[0] +
|
||||||
s[1 * wrap] * filter[1] +
|
s[1 * wrap] * filter[1] +
|
||||||
|
@ -38,7 +38,7 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
|
|||||||
vector signed short vpix2, vdiff, vpix1l,vpix1h;
|
vector signed short vpix2, vdiff, vpix1l,vpix1h;
|
||||||
union { vector signed int vscore;
|
union { vector signed int vscore;
|
||||||
int32_t score[4];
|
int32_t score[4];
|
||||||
} u;
|
} u;
|
||||||
u.vscore = vec_splat_s32(0);
|
u.vscore = vec_splat_s32(0);
|
||||||
//
|
//
|
||||||
//XXX lazy way, fix it later
|
//XXX lazy way, fix it later
|
||||||
|
@ -25,14 +25,14 @@
|
|||||||
|
|
||||||
#if defined(ARCH_POWERPC_405)
|
#if defined(ARCH_POWERPC_405)
|
||||||
/* signed 16x16 -> 32 multiply add accumulate */
|
/* signed 16x16 -> 32 multiply add accumulate */
|
||||||
# define MAC16(rt, ra, rb) \
|
#define MAC16(rt, ra, rb) \
|
||||||
asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
|
asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
|
||||||
|
|
||||||
/* signed 16x16 -> 32 multiply */
|
/* signed 16x16 -> 32 multiply */
|
||||||
# define MUL16(ra, rb) \
|
#define MUL16(ra, rb) \
|
||||||
({ int __rt; \
|
({ int __rt; \
|
||||||
asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
|
asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
|
||||||
__rt; })
|
__rt; })
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /* FFMPEG_PPC_MATHOPS_H */
|
#endif /* FFMPEG_PPC_MATHOPS_H */
|
||||||
|
@ -41,15 +41,15 @@ do { \
|
|||||||
// transposes a matrix consisting of four vectors with four elements each
|
// transposes a matrix consisting of four vectors with four elements each
|
||||||
#define TRANSPOSE4(a,b,c,d) \
|
#define TRANSPOSE4(a,b,c,d) \
|
||||||
do { \
|
do { \
|
||||||
__typeof__(a) _trans_ach = vec_mergeh(a, c); \
|
__typeof__(a) _trans_ach = vec_mergeh(a, c); \
|
||||||
__typeof__(a) _trans_acl = vec_mergel(a, c); \
|
__typeof__(a) _trans_acl = vec_mergel(a, c); \
|
||||||
__typeof__(a) _trans_bdh = vec_mergeh(b, d); \
|
__typeof__(a) _trans_bdh = vec_mergeh(b, d); \
|
||||||
__typeof__(a) _trans_bdl = vec_mergel(b, d); \
|
__typeof__(a) _trans_bdl = vec_mergel(b, d); \
|
||||||
\
|
\
|
||||||
a = vec_mergeh(_trans_ach, _trans_bdh); \
|
a = vec_mergeh(_trans_ach, _trans_bdh); \
|
||||||
b = vec_mergel(_trans_ach, _trans_bdh); \
|
b = vec_mergel(_trans_ach, _trans_bdh); \
|
||||||
c = vec_mergeh(_trans_acl, _trans_bdl); \
|
c = vec_mergeh(_trans_acl, _trans_bdl); \
|
||||||
d = vec_mergel(_trans_acl, _trans_bdl); \
|
d = vec_mergel(_trans_acl, _trans_bdl); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
@ -58,19 +58,19 @@ do { \
|
|||||||
// target address is four-byte aligned (which should be always).
|
// target address is four-byte aligned (which should be always).
|
||||||
#define LOAD4(vec, address) \
|
#define LOAD4(vec, address) \
|
||||||
{ \
|
{ \
|
||||||
__typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
|
__typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
|
||||||
vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
|
vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
|
||||||
vec = vec_ld(0, _load_addr); \
|
vec = vec_ld(0, _load_addr); \
|
||||||
vec = vec_perm(vec, vec, _perm_vec); \
|
vec = vec_perm(vec, vec, _perm_vec); \
|
||||||
vec = vec_splat(vec, 0); \
|
vec = vec_splat(vec, 0); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define FOUROF(a) AVV(a,a,a,a)
|
#define FOUROF(a) AVV(a,a,a,a)
|
||||||
|
|
||||||
int dct_quantize_altivec(MpegEncContext* s,
|
int dct_quantize_altivec(MpegEncContext* s,
|
||||||
DCTELEM* data, int n,
|
DCTELEM* data, int n,
|
||||||
int qscale, int* overflow)
|
int qscale, int* overflow)
|
||||||
{
|
{
|
||||||
int lastNonZero;
|
int lastNonZero;
|
||||||
vector float row0, row1, row2, row3, row4, row5, row6, row7;
|
vector float row0, row1, row2, row3, row4, row5, row6, row7;
|
||||||
@ -137,10 +137,8 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
|
|
||||||
int whichPass, whichHalf;
|
int whichPass, whichHalf;
|
||||||
|
|
||||||
for(whichPass = 1; whichPass<=2; whichPass++)
|
for(whichPass = 1; whichPass<=2; whichPass++) {
|
||||||
{
|
for(whichHalf = 1; whichHalf<=2; whichHalf++) {
|
||||||
for(whichHalf = 1; whichHalf<=2; whichHalf++)
|
|
||||||
{
|
|
||||||
vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||||
vector float tmp10, tmp11, tmp12, tmp13;
|
vector float tmp10, tmp11, tmp12, tmp13;
|
||||||
vector float z1, z2, z3, z4, z5;
|
vector float z1, z2, z3, z4, z5;
|
||||||
@ -235,8 +233,7 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
SWAP(row7, alt7);
|
SWAP(row7, alt7);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (whichPass == 1)
|
if (whichPass == 1) {
|
||||||
{
|
|
||||||
// transpose the data for the second pass
|
// transpose the data for the second pass
|
||||||
|
|
||||||
// First, block transpose the upper right with lower left.
|
// First, block transpose the upper right with lower left.
|
||||||
@ -261,8 +258,7 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
const vector signed int* qmat;
|
const vector signed int* qmat;
|
||||||
vector float bias, negBias;
|
vector float bias, negBias;
|
||||||
|
|
||||||
if (s->mb_intra)
|
if (s->mb_intra) {
|
||||||
{
|
|
||||||
vector signed int baseVector;
|
vector signed int baseVector;
|
||||||
|
|
||||||
// We must cache element 0 in the intra case
|
// We must cache element 0 in the intra case
|
||||||
@ -272,9 +268,7 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
|
|
||||||
qmat = (vector signed int*)s->q_intra_matrix[qscale];
|
qmat = (vector signed int*)s->q_intra_matrix[qscale];
|
||||||
biasAddr = &(s->intra_quant_bias);
|
biasAddr = &(s->intra_quant_bias);
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
qmat = (vector signed int*)s->q_inter_matrix[qscale];
|
qmat = (vector signed int*)s->q_inter_matrix[qscale];
|
||||||
biasAddr = &(s->inter_quant_bias);
|
biasAddr = &(s->inter_quant_bias);
|
||||||
}
|
}
|
||||||
@ -439,8 +433,7 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
// and handle it using the vector unit if we can. This is the permute used
|
// and handle it using the vector unit if we can. This is the permute used
|
||||||
// by the altivec idct, so it is common when using the altivec dct.
|
// by the altivec idct, so it is common when using the altivec dct.
|
||||||
|
|
||||||
if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM))
|
if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
|
||||||
{
|
|
||||||
TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
|
TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -456,10 +449,8 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// special handling of block[0]
|
// special handling of block[0]
|
||||||
if (s->mb_intra)
|
if (s->mb_intra) {
|
||||||
{
|
if (!s->h263_aic) {
|
||||||
if (!s->h263_aic)
|
|
||||||
{
|
|
||||||
if (n < 4)
|
if (n < 4)
|
||||||
oldBaseValue /= s->y_dc_scale;
|
oldBaseValue /= s->y_dc_scale;
|
||||||
else
|
else
|
||||||
@ -474,8 +465,7 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
// need to permute the "no" permutation case.
|
// need to permute the "no" permutation case.
|
||||||
if ((lastNonZero > 0) &&
|
if ((lastNonZero > 0) &&
|
||||||
(s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
|
(s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
|
||||||
(s->dsp.idct_permutation_type != FF_NO_IDCT_PERM))
|
(s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
|
||||||
{
|
|
||||||
ff_block_permute(data, s->dsp.idct_permutation,
|
ff_block_permute(data, s->dsp.idct_permutation,
|
||||||
s->intra_scantable.scantable, lastNonZero);
|
s->intra_scantable.scantable, lastNonZero);
|
||||||
}
|
}
|
||||||
@ -483,10 +473,8 @@ int dct_quantize_altivec(MpegEncContext* s,
|
|||||||
return lastNonZero;
|
return lastNonZero;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* AltiVec version of dct_unquantize_h263
|
||||||
AltiVec version of dct_unquantize_h263
|
this code assumes `block' is 16 bytes-aligned */
|
||||||
this code assumes `block' is 16 bytes-aligned
|
|
||||||
*/
|
|
||||||
void dct_unquantize_h263_altivec(MpegEncContext *s,
|
void dct_unquantize_h263_altivec(MpegEncContext *s,
|
||||||
DCTELEM *block, int n, int qscale)
|
DCTELEM *block, int n, int qscale)
|
||||||
{
|
{
|
||||||
@ -517,82 +505,81 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
|
register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
|
||||||
DECLARE_ALIGNED_16(short, qmul8[]) =
|
DECLARE_ALIGNED_16(short, qmul8[]) =
|
||||||
{
|
{
|
||||||
qmul, qmul, qmul, qmul,
|
qmul, qmul, qmul, qmul,
|
||||||
qmul, qmul, qmul, qmul
|
qmul, qmul, qmul, qmul
|
||||||
};
|
};
|
||||||
DECLARE_ALIGNED_16(short, qadd8[]) =
|
DECLARE_ALIGNED_16(short, qadd8[]) =
|
||||||
{
|
{
|
||||||
qadd, qadd, qadd, qadd,
|
qadd, qadd, qadd, qadd,
|
||||||
qadd, qadd, qadd, qadd
|
qadd, qadd, qadd, qadd
|
||||||
};
|
};
|
||||||
DECLARE_ALIGNED_16(short, nqadd8[]) =
|
DECLARE_ALIGNED_16(short, nqadd8[]) =
|
||||||
{
|
{
|
||||||
-qadd, -qadd, -qadd, -qadd,
|
-qadd, -qadd, -qadd, -qadd,
|
||||||
-qadd, -qadd, -qadd, -qadd
|
-qadd, -qadd, -qadd, -qadd
|
||||||
};
|
};
|
||||||
register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
|
register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
|
||||||
register vector bool short blockv_null, blockv_neg;
|
register vector bool short blockv_null, blockv_neg;
|
||||||
register short backup_0 = block[0];
|
register short backup_0 = block[0];
|
||||||
register int j = 0;
|
register int j = 0;
|
||||||
|
|
||||||
qmulv = vec_ld(0, qmul8);
|
qmulv = vec_ld(0, qmul8);
|
||||||
qaddv = vec_ld(0, qadd8);
|
qaddv = vec_ld(0, qadd8);
|
||||||
nqaddv = vec_ld(0, nqadd8);
|
nqaddv = vec_ld(0, nqadd8);
|
||||||
|
|
||||||
#if 0 // block *is* 16 bytes-aligned, it seems.
|
#if 0 // block *is* 16 bytes-aligned, it seems.
|
||||||
// first make sure block[j] is 16 bytes-aligned
|
// first make sure block[j] is 16 bytes-aligned
|
||||||
for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
|
for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
|
||||||
level = block[j];
|
level = block[j];
|
||||||
if (level) {
|
if (level) {
|
||||||
if (level < 0) {
|
if (level < 0) {
|
||||||
level = level * qmul - qadd;
|
level = level * qmul - qadd;
|
||||||
} else {
|
} else {
|
||||||
level = level * qmul + qadd;
|
level = level * qmul + qadd;
|
||||||
|
}
|
||||||
|
block[j] = level;
|
||||||
}
|
}
|
||||||
block[j] = level;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// vectorize all the 16 bytes-aligned blocks
|
// vectorize all the 16 bytes-aligned blocks
|
||||||
// of 8 elements
|
// of 8 elements
|
||||||
for(; (j + 7) <= nCoeffs ; j+=8)
|
for(; (j + 7) <= nCoeffs ; j+=8) {
|
||||||
{
|
blockv = vec_ld(j << 1, block);
|
||||||
blockv = vec_ld(j << 1, block);
|
blockv_neg = vec_cmplt(blockv, vczero);
|
||||||
blockv_neg = vec_cmplt(blockv, vczero);
|
blockv_null = vec_cmpeq(blockv, vczero);
|
||||||
blockv_null = vec_cmpeq(blockv, vczero);
|
// choose between +qadd or -qadd as the third operand
|
||||||
// choose between +qadd or -qadd as the third operand
|
temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
|
||||||
temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
|
// multiply & add (block{i,i+7} * qmul [+-] qadd)
|
||||||
// multiply & add (block{i,i+7} * qmul [+-] qadd)
|
temp1 = vec_mladd(blockv, qmulv, temp1);
|
||||||
temp1 = vec_mladd(blockv, qmulv, temp1);
|
// put 0 where block[{i,i+7} used to have 0
|
||||||
// put 0 where block[{i,i+7} used to have 0
|
blockv = vec_sel(temp1, blockv, blockv_null);
|
||||||
blockv = vec_sel(temp1, blockv, blockv_null);
|
vec_st(blockv, j << 1, block);
|
||||||
vec_st(blockv, j << 1, block);
|
|
||||||
}
|
|
||||||
|
|
||||||
// if nCoeffs isn't a multiple of 8, finish the job
|
|
||||||
// using good old scalar units.
|
|
||||||
// (we could do it using a truncated vector,
|
|
||||||
// but I'm not sure it's worth the hassle)
|
|
||||||
for(; j <= nCoeffs ; j++) {
|
|
||||||
level = block[j];
|
|
||||||
if (level) {
|
|
||||||
if (level < 0) {
|
|
||||||
level = level * qmul - qadd;
|
|
||||||
} else {
|
|
||||||
level = level * qmul + qadd;
|
|
||||||
}
|
|
||||||
block[j] = level;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (i == 1)
|
// if nCoeffs isn't a multiple of 8, finish the job
|
||||||
{ // cheat. this avoid special-casing the first iteration
|
// using good old scalar units.
|
||||||
block[0] = backup_0;
|
// (we could do it using a truncated vector,
|
||||||
}
|
// but I'm not sure it's worth the hassle)
|
||||||
|
for(; j <= nCoeffs ; j++) {
|
||||||
|
level = block[j];
|
||||||
|
if (level) {
|
||||||
|
if (level < 0) {
|
||||||
|
level = level * qmul - qadd;
|
||||||
|
} else {
|
||||||
|
level = level * qmul + qadd;
|
||||||
|
}
|
||||||
|
block[j] = level;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i == 1) {
|
||||||
|
// cheat. this avoid special-casing the first iteration
|
||||||
|
block[0] = backup_0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
|
POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
|
||||||
}
|
}
|
||||||
@ -605,11 +592,9 @@ void MPV_common_init_altivec(MpegEncContext *s)
|
|||||||
{
|
{
|
||||||
if ((mm_flags & MM_ALTIVEC) == 0) return;
|
if ((mm_flags & MM_ALTIVEC) == 0) return;
|
||||||
|
|
||||||
if (s->avctx->lowres==0)
|
if (s->avctx->lowres==0) {
|
||||||
{
|
|
||||||
if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
|
if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
|
||||||
(s->avctx->idct_algo == FF_IDCT_ALTIVEC))
|
(s->avctx->idct_algo == FF_IDCT_ALTIVEC)) {
|
||||||
{
|
|
||||||
s->dsp.idct_put = idct_put_altivec;
|
s->dsp.idct_put = idct_put_altivec;
|
||||||
s->dsp.idct_add = idct_add_altivec;
|
s->dsp.idct_add = idct_add_altivec;
|
||||||
s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
|
s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
|
||||||
@ -618,15 +603,13 @@ void MPV_common_init_altivec(MpegEncContext *s)
|
|||||||
|
|
||||||
// Test to make sure that the dct required alignments are met.
|
// Test to make sure that the dct required alignments are met.
|
||||||
if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
|
if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
|
||||||
(((long)(s->q_inter_matrix) & 0x0f) != 0))
|
(((long)(s->q_inter_matrix) & 0x0f) != 0)) {
|
||||||
{
|
|
||||||
av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
|
av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
|
||||||
"to use AltiVec DCT. Reverting to non-AltiVec version.\n");
|
"to use AltiVec DCT. Reverting to non-AltiVec version.\n");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
|
if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) {
|
||||||
{
|
|
||||||
av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
|
av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
|
||||||
"to use AltiVec DCT. Reverting to non-AltiVec version.\n");
|
"to use AltiVec DCT. Reverting to non-AltiVec version.\n");
|
||||||
return;
|
return;
|
||||||
@ -634,8 +617,7 @@ void MPV_common_init_altivec(MpegEncContext *s)
|
|||||||
|
|
||||||
|
|
||||||
if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
|
if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
|
||||||
(s->avctx->dct_algo == FF_DCT_ALTIVEC))
|
(s->avctx->dct_algo == FF_DCT_ALTIVEC)) {
|
||||||
{
|
|
||||||
#if 0 /* seems to cause trouble under some circumstances */
|
#if 0 /* seems to cause trouble under some circumstances */
|
||||||
s->dct_quantize = dct_quantize_altivec;
|
s->dct_quantize = dct_quantize_altivec;
|
||||||
#endif
|
#endif
|
||||||
|
@ -379,8 +379,7 @@ void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
|
|||||||
v4=(vector signed int *)b4;
|
v4=(vector signed int *)b4;
|
||||||
v5=(vector signed int *)b5;
|
v5=(vector signed int *)b5;
|
||||||
|
|
||||||
for (i=0; i< w4;i++)
|
for (i=0; i< w4;i++) {
|
||||||
{
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
|
b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
|
||||||
@ -782,8 +781,8 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
|
|||||||
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
|
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
#if 0
|
#if 0
|
||||||
c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
|
c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
|
||||||
c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
|
c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
|
||||||
c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
|
c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user