mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
0ea61725b1
This patch adds code to support specializations of the hscale function and adds a specialization for filterSize == 4. ff_hscale8to15_4_neon is a complete rewrite. Since the main bottleneck here is loading the data from src, this data is loaded a whole block ahead and stored back to the stack to be loaded again with ld4. This arranges the data for most efficient use of the vector instructions and removes the need for completion adds at the end. The number of iterations of the C per iteration of the assembly is increased from 4 to 8, but because of the prefetching, there must be a special section without prefetching when dstW < 16. This improves speed on Graviton 2 (Neoverse N1) dramatically in the case where previously fs=8 would have been required. before: hscale_8_to_15__fs_8_dstW_512_neon: 1962.8 after : hscale_8_to_15__fs_4_dstW_512_neon: 1220.9 Signed-off-by: Jonathan Swinney <jswinney@amazon.com> Signed-off-by: Martin Storsjö <martin@martin.st>
70 lines
2.9 KiB
C
70 lines
2.9 KiB
C
/*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "config.h"
|
|
#include "libavutil/attributes.h"
|
|
#include "libswscale/swscale.h"
|
|
#include "libswscale/swscale_internal.h"
|
|
#include "libavutil/aarch64/cpu.h"
|
|
|
|
/*
 * Expands to the prototype of one horizontal scaler. The function name is
 * built by token pasting:
 *     ff_hscale<from_bpc>to<to_bpc>_<filter_n>_<opt>
 * The expansion carries no trailing semicolon; the user of the macro
 * supplies it.
 */
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt)                           \
    void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt   \
        (SwsContext *c, int16_t *data, int dstW,                              \
         const uint8_t *src, const int16_t *filter,                           \
         const int32_t *filterPos, int filterSize)
|
|
/*
 * Declares the horizontal scaler prototypes available for one filter-size
 * specialization (filter_n) and one CPU extension (opt). Only the
 * 8 bit -> 15 bit variant is declared here.
 *
 * The expansion deliberately does NOT end in a semicolon: the user of the
 * macro terminates it. A semicolon inside the macro plus one at the use
 * site yields an empty file-scope declaration (";;"), which ISO C does not
 * permit.
 */
#define SCALE_FUNCS(filter_n, opt)                                            \
    SCALE_FUNC(filter_n, 8, 15, opt)
|
|
/*
 * Declares the scaler prototypes for every filter-size specialization of
 * the given CPU extension: the dedicated filterSize == 4 version ("4") and
 * the version selected for filter sizes that are a multiple of 8 ("X8").
 * The last entry is left unterminated so that the use site provides the
 * final semicolon.
 */
#define ALL_SCALE_FUNCS(opt)                                                  \
    SCALE_FUNCS(4,  opt);                                                     \
    SCALE_FUNCS(X8, opt)
|
|
|
|
/* Declare the NEON scalers: ff_hscale8to15_4_neon and ff_hscale8to15_X8_neon. */
ALL_SCALE_FUNCS(neon);
|
|
|
|
/*
 * NEON variant of yuv2planeX; ff_sws_init_swscale_aarch64() installs it as
 * c->yuv2planeX when the destination bit depth is 8.
 * Declared here only; the definition is not part of this file.
 */
void ff_yuv2planeX_8_neon(const int16_t *filter,
                          int filterSize,
                          const int16_t **src,
                          uint8_t *dest,
                          int dstW,
                          const uint8_t *dither,
                          int offset);
|
|
|
|
/*
 * Stores ff_hscale8to15_<filtersize>_<opt> into hscalefn when the source is
 * 8 bits per component and the destination is at most 14 bits; otherwise
 * hscalefn is left untouched.
 *
 * NOTE: the expansion reads c->srcBpc and c->dstBpc, so a variable named
 * "c" must be in scope at the point of use. filtersize is token-pasted
 * into the function name and therefore has to be a literal suffix
 * (e.g. 4 or X8), not an expression.
 */
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt)                         \
    do {                                                                      \
        if (c->srcBpc == 8 && c->dstBpc <= 14)                                \
            hscalefn = ff_hscale8to15_ ## filtersize ## _ ## opt;             \
    } while (0)
|
|
|
|
/*
 * Picks the horizontal scaler specialization that matches a run-time filter
 * size and stores it in hscalefn (through ASSIGN_SCALE_FUNC2, which also
 * applies the bit-depth check):
 *   - filtersize == 4            -> the "4" specialization
 *   - filtersize multiple of 8   -> the "X8" specialization
 *   - anything else              -> hscalefn is left unchanged
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement and can be used safely in an unbraced if/else. The
 * filtersize argument is parenthesized where it takes part in arithmetic so
 * that expression arguments keep their meaning. It may be evaluated twice,
 * so it must not have side effects.
 */
#define ASSIGN_SCALE_FUNC(hscalefn, filtersize, opt)                          \
    do {                                                                      \
        switch (filtersize) {                                                 \
        case 4:                                                               \
            ASSIGN_SCALE_FUNC2(hscalefn, 4, opt);                             \
            break;                                                            \
        default:                                                              \
            if ((filtersize) % 8 == 0)                                        \
                ASSIGN_SCALE_FUNC2(hscalefn, X8, opt);                        \
            break;                                                            \
        }                                                                     \
    } while (0)
|
|
|
|
/**
 * Install the AArch64 NEON routines into a scaler context.
 *
 * Nothing is changed unless the running CPU reports NEON support. When it
 * does, the luma (hyScale) and chroma (hcScale) horizontal scalers are
 * replaced if a specialization exists for their filter size, and
 * yuv2planeX is replaced when the destination bit depth is 8.
 *
 * @param c scaler context whose function pointers are updated in place
 */
av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    /* Without NEON the function pointers already set in c stay as they are. */
    if (!have_neon(cpu_flags))
        return;

    ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
    ASSIGN_SCALE_FUNC(c->hcScale, c->hChrFilterSize, neon);

    if (c->dstBpc == 8)
        c->yuv2planeX = ff_yuv2planeX_8_neon;
}
|