mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter
                             A55               A76
nv24_yuv420p_128_c:        4956.1            1267.0
nv24_yuv420p_128_neon:     3109.1 ( 1.59x)    640.0 ( 1.98x)
nv24_yuv420p_1920_c:      35728.4           11736.2
nv24_yuv420p_1920_neon:    8011.1 ( 4.46x)   2436.0 ( 4.82x)
nv42_yuv420p_128_c:        4956.4            1270.5
nv42_yuv420p_128_neon:     3074.6 ( 1.61x)    639.5 ( 1.99x)
nv42_yuv420p_1920_c:      35685.9           11732.5
nv42_yuv420p_1920_neon:    7995.1 ( 4.46x)   2437.2 ( 4.81x)
This commit is contained in:
parent
88a563ad18
commit
52887683e9
@ -7,4 +7,5 @@ NEON-OBJS += aarch64/hscale.o \
|
||||
aarch64/output.o \
|
||||
aarch64/range_convert_neon.o \
|
||||
aarch64/rgb2rgb_neon.o \
|
||||
aarch64/swscale_unscaled_neon.o \
|
||||
aarch64/yuv2rgb_neon.o \
|
||||
|
@ -139,6 +139,31 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
|
||||
dst[2] + srcSliceY * dstStride[2], dstStride[2]); \
|
||||
} \
|
||||
|
||||
/**
 * NEON routine (implemented in swscale_unscaled_neon.S) that splits one
 * interleaved two-component chroma plane into two separate planes while
 * halving it in both directions.
 *
 * Each output byte is the truncated average (sum >> 2, no rounding) of a
 * 2x2 group of same-component samples taken from two consecutive source rows.
 *
 * @param dst1       destination for the first component of every source pair
 * @param dstStride1 byte stride of dst1
 * @param dst2       destination for the second component of every source pair
 * @param dstStride2 byte stride of dst2
 * @param src        interleaved source plane; 4 * w bytes are read per row
 * @param srcStride  byte stride of src
 * @param w          number of output samples per row; the implementation
 *                   writes 8 per iteration, so it is expected to be a
 *                   positive multiple of 8
 * @param h          number of source rows; two are consumed per output row
 */
void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1,
|
||||
uint8_t *dst2, int dstStride2,
|
||||
const uint8_t *src, int srcStride,
|
||||
int w, int h);
|
||||
|
||||
/**
 * Unscaled conversion of one slice from a semi-planar 4:4:4 source
 * (luma plane + one interleaved chroma plane) to planar YUV420P.
 *
 * Luma is copied as-is with ff_copyPlane(); the interleaved chroma plane
 * src[1] is split and downsampled into dst[1] / dst[2] by
 * ff_nv24_to_yuv420p_chroma_neon(), starting at output row srcSliceY / 2.
 *
 * @return srcSliceH, the number of source rows consumed
 */
static int nv24_to_yuv420p_neon_wrapper(SwsContext *c, const uint8_t *src[],
                                        int srcStride[], int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
{
    /* With an NV24 source the first byte of every chroma pair belongs in
     * dst[1]; for any other source format routed here (NV42 in practice)
     * the two destination planes are exchanged. */
    const int first    = (c->srcFormat == AV_PIX_FMT_NV24) ? 1 : 2;
    const int second   = 3 - first;
    const int chroma_w = c->srcW / 2;

    uint8_t *out_first  = dst[first]  + dstStride[first]  * srcSliceY / 2;
    uint8_t *out_second = dst[second] + dstStride[second] * srcSliceY / 2;

    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
                 dst[0], dstStride[0]);

    ff_nv24_to_yuv420p_chroma_neon(out_first,  dstStride[first],
                                   out_second, dstStride[second],
                                   src[1], srcStride[1], chroma_w, srcSliceH);

    return srcSliceH;
}
|
||||
|
||||
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
|
||||
@ -177,6 +202,11 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
|
||||
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
|
||||
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
|
||||
|
||||
if (c->dstFormat == AV_PIX_FMT_YUV420P &&
|
||||
(c->srcFormat == AV_PIX_FMT_NV24 || c->srcFormat == AV_PIX_FMT_NV42) &&
|
||||
!(c->srcH & 1) && !(c->srcW & 15) && !accurate_rnd)
|
||||
c->convert_unscaled = nv24_to_yuv420p_neon_wrapper;
|
||||
}
|
||||
|
||||
void ff_get_unscaled_swscale_aarch64(SwsContext *c)
|
||||
|
70
libswscale/aarch64/swscale_unscaled_neon.S
Normal file
70
libswscale/aarch64/swscale_unscaled_neon.S
Normal file
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2024 Ramiro Polla
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
// De-interleave a two-component chroma plane and downsample it 2x2.
//
// Two source rows are walked in parallel. Each inner iteration loads
// 16 interleaved pairs (32 bytes) from each row, sums every 2x2 group of
// same-component samples, shifts the sum right by 2 (truncating, no
// rounding) and stores 8 bytes to each destination plane.
//
// w is decremented by 8 and h by 2, and both loops test after the body,
// so the body always runs at least once; callers are expected to pass a
// positive multiple of 8 for w and a positive even h.
function ff_nv24_to_yuv420p_chroma_neon, export=1
|
||||
// x0 uint8_t *dst1 (receives the first component of each pair)
|
||||
// w1 int dstStride1 (32-bit; sign-extended when applied)
|
||||
// x2 uint8_t *dst2 (receives the second component of each pair)
|
||||
// w3 int dstStride2 (32-bit; sign-extended when applied)
|
||||
// x4 const uint8_t *src (first row of each row pair)
|
||||
// w5 int srcStride (32-bit; sign-extended when applied)
|
||||
// w6 int w (output samples per row)
|
||||
// w7 int h (source rows)
|
||||
|
||||
add x9, x4, w5, sxtw // x9 = src + srcStride (second row of the pair)
|
||||
lsl w5, w5, #1 // srcStride *= 2
|
||||
sub w5, w5, w6, lsl #2 // srcPadding = (2 * srcStride) - (4 * w)
|
||||
sub w1, w1, w6 // dstPadding1 = dstStride1 - w
|
||||
sub w3, w3, w6 // dstPadding2 = dstStride2 - w
|
||||
|
||||
1:
|
||||
mov w10, w6 // w10 = w (remaining output samples in this row)
|
||||
|
||||
2:
|
||||
ld2 { v0.16b, v1.16b }, [x4], #32 // v0 = U1, v1 = V1 (de-interleaved, row 1)
|
||||
ld2 { v2.16b, v3.16b }, [x9], #32 // v2 = U2, v3 = V2 (de-interleaved, row 2)
|
||||
|
||||
uaddlp v0.8h, v0.16b // pairwise add U1 into v0 (widened to 16 bits)
|
||||
uaddlp v1.8h, v1.16b // pairwise add V1 into v1 (widened to 16 bits)
|
||||
uadalp v0.8h, v2.16b // pairwise add U2, accumulate into v0
|
||||
uadalp v1.8h, v3.16b // pairwise add V2, accumulate into v1
|
||||
|
||||
shrn v0.8b, v0.8h, #2 // divide by 4 (truncating) and narrow to bytes
|
||||
shrn v1.8b, v1.8h, #2 // divide by 4 (truncating) and narrow to bytes
|
||||
|
||||
st1 { v0.8b }, [x0], #8 // store U into dst1
|
||||
st1 { v1.8b }, [x2], #8 // store V into dst2
|
||||
|
||||
subs w10, w10, #8 // 8 output samples done
|
||||
b.gt 2b
|
||||
|
||||
// next row: pointers already advanced by the bytes consumed/produced,
// so only the padding remains to be added
|
||||
add x4, x4, w5, sxtw // src1 += srcPadding
|
||||
add x9, x9, w5, sxtw // src2 += srcPadding
|
||||
add x0, x0, w1, sxtw // dst1 += dstPadding1
|
||||
add x2, x2, w3, sxtw // dst2 += dstPadding2
|
||||
|
||||
subs w7, w7, #2 // two source rows consumed
|
||||
b.gt 1b
|
||||
|
||||
ret
|
||||
endfunc
|
Loading…
Reference in New Issue
Block a user