You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	swscale: aarch64: Add a NEON implementation of interleaveBytes
This allows speeding up format conversions from yuv420 to nv12.
                             Cortex A53      A72      A73
interleave_bytes_c:             86077.5  51433.0  66972.0
interleave_bytes_neon:          19701.7  23019.2  15859.2
interleave_bytes_aligned_c:     86603.0  52017.2  67484.2
interleave_bytes_aligned_neon:   9061.0   7623.0   6309.0
Signed-off-by: Martin Storsjö <martin@martin.st>
			
			
This commit is contained in:
		| @@ -1,6 +1,8 @@ | ||||
| OBJS        += aarch64/swscale.o                \ | ||||
| OBJS        += aarch64/rgb2rgb.o                \ | ||||
|                aarch64/swscale.o                \ | ||||
|                aarch64/swscale_unscaled.o       \ | ||||
|  | ||||
| NEON-OBJS   += aarch64/hscale.o                 \ | ||||
|                aarch64/output.o                 \ | ||||
|                aarch64/rgb2rgb_neon.o           \ | ||||
|                aarch64/yuv2rgb_neon.o           \ | ||||
|   | ||||
							
								
								
									
										41
									
								
								libswscale/aarch64/rgb2rgb.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								libswscale/aarch64/rgb2rgb.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,41 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include <stdint.h> | ||||
|  | ||||
| #include "config.h" | ||||
| #include "libavutil/attributes.h" | ||||
| #include "libavutil/aarch64/cpu.h" | ||||
| #include "libavutil/cpu.h" | ||||
| #include "libavutil/bswap.h" | ||||
| #include "libswscale/rgb2rgb.h" | ||||
| #include "libswscale/swscale.h" | ||||
| #include "libswscale/swscale_internal.h" | ||||
|  | ||||
| /* NEON byte interleaver; the implementation lives in rgb2rgb_neon.S. */ | ||||
| void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, | ||||
|                               uint8_t *dest, int width, int height, | ||||
|                               int src1Stride, int src2Stride, int dstStride); | ||||
|  | ||||
| /** | ||||
|  * Install the AArch64-specific rgb2rgb routines. | ||||
|  * | ||||
|  * Reads the runtime CPU flags and, when NEON is reported, points the | ||||
|  * global interleaveBytes function pointer at ff_interleave_bytes_neon. | ||||
|  * When NEON is not reported, nothing is modified. | ||||
|  */ | ||||
| av_cold void rgb2rgb_init_aarch64(void) | ||||
| { | ||||
|     const int flags = av_get_cpu_flags(); | ||||
|  | ||||
|     /* No NEON: leave the currently installed implementation in place. */ | ||||
|     if (!have_neon(flags)) | ||||
|         return; | ||||
|  | ||||
|     interleaveBytes = ff_interleave_bytes_neon; | ||||
| } | ||||
							
								
								
									
										79
									
								
								libswscale/aarch64/rgb2rgb_neon.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								libswscale/aarch64/rgb2rgb_neon.S
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,79 @@ | ||||
| /* | ||||
|  * Copyright (c) 2020 Martin Storsjo | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "libavutil/aarch64/asm.S" | ||||
| /* | ||||
|  * Interleave two byte planes row by row: each output row receives | ||||
|  * src1[0], src2[0], src1[1], src2[1], ... for `width` byte pairs. | ||||
|  * | ||||
|  * Register use, assuming the standard AArch64 argument mapping of the | ||||
|  * prototype below (x0 = src1, x1 = src2, x2 = dest, w3 = width, | ||||
|  * w4 = height, w5 = src1Stride, w6 = src2Stride, w7 = dstStride): | ||||
|  *   w5-w7   reduced on entry to "stride minus bytes advanced per row", | ||||
|  *           because the pointers are post-incremented while a row is copied | ||||
|  *   w8      remaining count for the 16-byte loop, later for the byte loop | ||||
|  *   w9/w10  scratch for the byte-at-a-time tail | ||||
|  *   v0/v1   vector data loaded from src1/src2 | ||||
|  * | ||||
|  * NOTE(review): the first row is processed before height is tested, and | ||||
|  * the exit test is "height - 1 == 0", so this appears to require | ||||
|  * height >= 1 -- confirm against callers. | ||||
|  */ | ||||
| // void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, | ||||
| //                               uint8_t *dest, int width, int height, | ||||
| //                               int src1Stride, int src2Stride, int dstStride); | ||||
| function ff_interleave_bytes_neon, export=1 | ||||
|         sub             w5,  w5,  w3 /* src1 row padding: stride - width */ | ||||
|         sub             w6,  w6,  w3 /* src2 row padding: stride - width */ | ||||
|         sub             w7,  w7,  w3, lsl #1 /* dest row padding: stride - 2 * width */ | ||||
| 1: /* start of one row */ | ||||
|         ands            w8,  w3,  #0xfffffff0 // & ~15 | ||||
|         b.eq            3f /* no full 16-byte chunk in this row: tail only */ | ||||
| 2: /* main loop: 16 bytes from each source, 32 bytes written */ | ||||
|         ld1             {v0.16b}, [x0], #16 | ||||
|         ld1             {v1.16b}, [x1], #16 | ||||
|         subs            w8,  w8,  #16 | ||||
|         st2             {v0.16b, v1.16b}, [x2], #32 /* st2 stores the elements of v0 and v1 alternately */ | ||||
|         b.gt            2b | ||||
| /* Row is complete if width is a multiple of 16. */ | ||||
|         tst             w3,  #15 | ||||
|         b.eq            9f | ||||
| /* Tail: the low four bits of width are handled as 8, then 4, then 0..3 byte pairs. */ | ||||
| 3: | ||||
|         tst             w3,  #8 | ||||
|         b.eq            4f | ||||
|         ld1             {v0.8b}, [x0], #8 | ||||
|         ld1             {v1.8b}, [x1], #8 | ||||
|         st2             {v0.8b, v1.8b}, [x2], #16 | ||||
| 4: | ||||
|         tst             w3,  #4 | ||||
|         b.eq            5f | ||||
| /* 4 bytes from each source go into lane 0; zip1 interleaves them into 8 output bytes. */ | ||||
|         ld1             {v0.s}[0], [x0], #4 | ||||
|         ld1             {v1.s}[0], [x1], #4 | ||||
|         zip1            v0.8b,   v0.8b,   v1.8b | ||||
|         st1             {v0.8b}, [x2], #8 | ||||
|  | ||||
| 5: | ||||
|         ands            w8,  w3,  #3 /* 0..3 byte pairs left */ | ||||
|         b.eq            9f | ||||
| 6: /* scalar loop: one byte from each source per iteration */ | ||||
|         ldrb            w9,  [x0], #1 | ||||
|         ldrb            w10, [x1], #1 | ||||
|         subs            w8,  w8,  #1 | ||||
|         bfi             w9,  w10, #8,  #8 /* w9 = src1 byte | (src2 byte << 8) */ | ||||
|         strh            w9,  [x2], #2 /* write the pair with a single halfword store */ | ||||
|         b.gt            6b | ||||
| /* End of row: return after the last row, otherwise skip the row padding on all three pointers. */ | ||||
| 9: | ||||
|         subs            w4,  w4,  #1 | ||||
|         b.eq            0f | ||||
|         add             x0,  x0,  w5, sxtw /* sxtw: the adjusted strides are signed 32-bit values */ | ||||
|         add             x1,  x1,  w6, sxtw | ||||
|         add             x2,  x2,  w7, sxtw | ||||
|         b               1b | ||||
|  | ||||
| 0: | ||||
|         ret | ||||
| endfunc | ||||
| @@ -137,6 +137,8 @@ void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||||
| av_cold void ff_sws_rgb2rgb_init(void) | ||||
| { | ||||
|     /* The generic C setup always runs first. */ | ||||
|     rgb2rgb_init_c(); | ||||
|  | ||||
|     /* | ||||
|      * Architecture hooks run afterwards, each guarded by its ARCH_* | ||||
|      * constant, so anything they install takes effect after the C setup. | ||||
|      */ | ||||
|     if (ARCH_AARCH64) { | ||||
|         rgb2rgb_init_aarch64(); | ||||
|     } | ||||
|     if (ARCH_X86) { | ||||
|         rgb2rgb_init_x86(); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -169,6 +169,7 @@ extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const u | ||||
|  | ||||
| void ff_sws_rgb2rgb_init(void); | ||||
|  | ||||
| void rgb2rgb_init_aarch64(void); | ||||
| void rgb2rgb_init_x86(void); | ||||
|  | ||||
| #endif /* SWSCALE_RGB2RGB_H */ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user