diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c index 4c121228ab..1b50acd7e2 100644 --- a/libswscale/arm/swscale_unscaled.c +++ b/libswscale/arm/swscale_unscaled.c @@ -63,6 +63,50 @@ static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[ } #endif +#define YUV_TO_RGB_TABLE(precision) \ + c->yuv2rgb_v2r_coeff / ((precision) == 16 ? 1 << 7 : 1), \ + c->yuv2rgb_u2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ + c->yuv2rgb_v2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ + c->yuv2rgb_u2b_coeff / ((precision) == 16 ? 1 << 7 : 1), \ + +#define DECLARE_FF_YUV420P_TO_RGBX_FUNCS(ofmt, precision) \ +int ff_yuv420p_to_##ofmt##_neon_##precision(int w, int h, \ + uint8_t *dst, int linesize, \ + const uint8_t *srcY, int linesizeY, \ + const uint8_t *srcU, int linesizeU, \ + const uint8_t *srcV, int linesizeV, \ + const int16_t *table, \ + int y_offset, \ + int y_coeff); \ + \ +static int yuv420p_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[],\ + int srcStride[], int srcSliceY, int srcSliceH, \ + uint8_t *dst[], int dstStride[]) { \ + const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) }; \ + \ + ff_yuv420p_to_##ofmt##_neon_##precision(c->srcW, srcSliceH, \ + dst[0] + srcSliceY * dstStride[0], dstStride[0], \ + src[0], srcStride[0], \ + src[1], srcStride[1], \ + src[2], srcStride[2], \ + yuv2rgb_table, \ + c->yuv2rgb_y_offset >> 9, \ + c->yuv2rgb_y_coeff / ((precision) == 16 ? 
1 << 7 : 1)); \ + \ + return 0; \ +} \ + +#define DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(precision) \ +DECLARE_FF_YUV420P_TO_RGBX_FUNCS(argb, precision) \ +DECLARE_FF_YUV420P_TO_RGBX_FUNCS(rgba, precision) \ +DECLARE_FF_YUV420P_TO_RGBX_FUNCS(abgr, precision) \ +DECLARE_FF_YUV420P_TO_RGBX_FUNCS(bgra, precision) \ + +#define DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS \ +DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(16) \ + +DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS + #define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt, precision) \ int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h, \ uint8_t *dst, int linesize, \ @@ -75,12 +119,7 @@ int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h, static int ifmt##_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[], \ int srcStride[], int srcSliceY, int srcSliceH, \ uint8_t *dst[], int dstStride[]) { \ - const int16_t yuv2rgb_table[] = { \ - c->yuv2rgb_v2r_coeff / ((precision) == 16 ? 1 << 7 : 1), \ - c->yuv2rgb_u2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ - c->yuv2rgb_v2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ - c->yuv2rgb_u2b_coeff / ((precision) == 16 ? 
1 << 7 : 1), \ - }; \ + const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) }; \ \ ff_##ifmt##_to_##ofmt##_neon_##precision(c->srcW, srcSliceH, \ dst[0] + srcSliceY * dstStride[0], dstStride[0], \ @@ -138,6 +177,7 @@ static void get_unscaled_swscale_neon(SwsContext *c) { SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd); SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd); + SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd); } void ff_get_unscaled_swscale_arm(SwsContext *c) diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S index 9f9dd2aaa1..dd00246ef3 100644 --- a/libswscale/arm/yuv2rgb_neon.S +++ b/libswscale/arm/yuv2rgb_neon.S @@ -103,7 +103,8 @@ vmovl.u8 q15, \y1 @ 8px of y vdup.16 q5, r9 @ q5 = y_offset - vdup.16 q7, r10 @ q7 = y_coeff + vmov d14, d0 @ q7 = y_coeff + vmov d15, d0 @ q7 = y_coeff vsub.s16 q14, q5 vsub.s16 q15, q5 @@ -184,7 +185,7 @@ compute_8px_32 r11, d30, \ofmt .endm -.macro load_args +.macro load_args_nvx push {r4-r12, lr} vpush {q4-q7} ldr r4, [sp, #104] @ r4 = srcY @@ -206,9 +207,42 @@ sub r7, r7, r0 @ r7 = linesizeC - width (paddingC) .endm +.macro load_args_yuv420p + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ r4 = srcY + ldr r5, [sp, #108] @ r5 = linesizeY + ldr r6, [sp, #112] @ r6 = srcU + ldr r8, [sp, #128] @ r8 = table + ldr r9, [sp, #132] @ r9 = y_offset + ldr r10,[sp, #136] @ r10 = y_coeff + vdup.16 d0, r10 @ d0 = y_coeff + vld1.16 {d1}, [r8] @ d1 = *table + add r11, r2, r3 @ r11 = dst + linesize (dst2) + add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) + lsl r3, r3, #1 + lsl r5, r5, #1 + lsl r8, r0, #2 + sub r3, r3, r8 @ r3 = linesize * 2 - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) + ldr r10,[sp, #120] @ r10 = srcV +.endm + .macro declare_func ifmt ofmt precision function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 - load_args + +.ifc \ifmt,nv12 + load_args_nvx +.endif + +.ifc \ifmt,nv21 + load_args_nvx +.endif + +.ifc 
\ifmt,yuv420p + load_args_yuv420p +.endif + 1: mov r8, r0 @ r8 = width 2: @@ -216,16 +250,30 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 pld [r4, #64*3] pld [r12, #64*3] - vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line vmov.i8 d10, #128 + .ifc \ifmt,nv12 + vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line vsubl.u8 q14, d2, d10 @ q14 = U - 128 vsubl.u8 q15, d3, d10 @ q15 = V - 128 -.else +.endif + +.ifc \ifmt,nv21 + vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line vsubl.u8 q14, d3, d10 @ q14 = U - 128 vsubl.u8 q15, d2, d10 @ q15 = V - 128 .endif +.ifc \ifmt,yuv420p + pld [r10, #64*3] + + vld1.8 d2, [r6]! @ d2: chroma blue (U) line + vld1.8 d3, [r10]! @ d3: chroma red (V) line + vsubl.u8 q14, d2, d10 @ q14 = U - 128 + vsubl.u8 q15, d3, d10 @ q15 = V - 128 +.endif + + process_16px_\precision \ofmt subs r8, r8, #16 @ width -= 16 @@ -235,7 +283,24 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 add r4, r4, r5 @ srcY += paddingY add r11, r11, r3 @ dst2 += padding add r12, r12, r5 @ srcY2 += paddingY + +.ifc \ifmt,nv12 add r6, r6, r7 @ srcC += paddingC +.endif + +.ifc \ifmt,nv21 + add r6, r6, r7 @ srcC += paddingC +.endif + +.ifc \ifmt,yuv420p + ldr r7, [sp, #116] @ r7 = linesizeU + sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) + add r6, r6, r7 @ srcU += paddingU + + ldr r7, [sp, #124] @ r7 = linesizeV + sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV) + add r10, r10, r7 @ srcV += paddingV +.endif subs r1, r1, #2 @ height -= 2 bgt 1b @@ -257,3 +322,5 @@ declare_rgb_funcs nv12, 16 declare_rgb_funcs nv21, 16 declare_rgb_funcs nv12, 32 declare_rgb_funcs nv21, 32 +declare_rgb_funcs yuv420p, 16 +declare_rgb_funcs yuv420p, 32