You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	Merge commit '8438b3f09f6b225d0886cc385117c38eb44ca0c1'
* commit '8438b3f09f6b225d0886cc385117c38eb44ca0c1': aarch64: h264 idct NEON assembler optimizations Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
		| @@ -1,5 +1,7 @@ | ||||
| OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o | ||||
| OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o | ||||
| OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o | ||||
| OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o | ||||
|  | ||||
| NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o | ||||
| NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264idct_neon.o | ||||
|   | ||||
							
								
								
									
										62
									
								
								libavcodec/aarch64/h264dsp_init_aarch64.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								libavcodec/aarch64/h264dsp_init_aarch64.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,62 @@ | ||||
| /* | ||||
|  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include <stdint.h> | ||||
|  | ||||
| #include "libavutil/attributes.h" | ||||
| #include "libavutil/cpu.h" | ||||
| #include "libavutil/aarch64/cpu.h" | ||||
| #include "libavcodec/h264dsp.h" | ||||
|  | ||||
| void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);  /* the NEON routines declared in this group are defined with export=1 in aarch64/h264idct_neon.S */ | ||||
| void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);  /* variant that only uses block[0] */ | ||||
| void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,  /* loops over 16 blocks of 4x4, dispatching to the two routines above */ | ||||
|                              int16_t *block, int stride, | ||||
|                              const uint8_t nnzc[6*8]);  /* NOTE(review): the scan8 table in the assembly holds indices up to 7+14*8 = 119, beyond the 6*8 written here; C does not enforce the bound - confirm the intended array size */ | ||||
| void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, | ||||
|                                   int16_t *block, int stride, | ||||
|                                   const uint8_t nnzc[6*8]); | ||||
| void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,  /* dest is an array: the assembly reads dest[0] and dest[1] */ | ||||
|                             int16_t *block, int stride, | ||||
|                             const uint8_t nnzc[6*8]); | ||||
|  | ||||
| void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);  /* 8x8 counterparts of the routines above */ | ||||
| void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride); | ||||
| void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,  /* loops over 4 blocks of 8x8 */ | ||||
|                              int16_t *block, int stride, | ||||
|                              const uint8_t nnzc[6*8]); | ||||
|  | ||||
| /** | ||||
|  * Install the AArch64 NEON IDCT routines into an H264DSPContext. | ||||
|  * | ||||
|  * Nothing is changed unless the CPU flags report NEON and bit_depth is 8. | ||||
|  * The add8 (chroma) entry is only replaced when chroma_format_idc <= 1. | ||||
|  * | ||||
|  * @param c                 DSP context whose function pointers are updated | ||||
|  * @param bit_depth         sample bit depth of the stream | ||||
|  * @param chroma_format_idc chroma format indicator of the stream | ||||
|  */ | ||||
| av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, | ||||
|                                      const int chroma_format_idc) | ||||
| { | ||||
|     const int cpu_flags = av_get_cpu_flags(); | ||||
|     const int use_neon  = have_neon(cpu_flags) && bit_depth == 8; | ||||
|  | ||||
|     if (!use_neon) | ||||
|         return; | ||||
|  | ||||
|     /* 4x4 transform entry points */ | ||||
|     c->h264_idct_add        = ff_h264_idct_add_neon; | ||||
|     c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon; | ||||
|     c->h264_idct_add16      = ff_h264_idct_add16_neon; | ||||
|     c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; | ||||
|  | ||||
|     /* 8x8 transform entry points */ | ||||
|     c->h264_idct8_add       = ff_h264_idct8_add_neon; | ||||
|     c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon; | ||||
|     c->h264_idct8_add4      = ff_h264_idct8_add4_neon; | ||||
|  | ||||
|     /* the NEON add8 routine is only installed for chroma_format_idc 0 and 1 */ | ||||
|     if (chroma_format_idc <= 1) | ||||
|         c->h264_idct_add8   = ff_h264_idct_add8_neon; | ||||
| } | ||||
							
								
								
									
										408
									
								
								libavcodec/aarch64/h264idct_neon.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										408
									
								
								libavcodec/aarch64/h264idct_neon.S
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,408 @@ | ||||
| /* | ||||
|  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | ||||
|  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "libavutil/aarch64/asm.S" | ||||
| #include "neon.S" | ||||
|  | ||||
| function ff_h264_idct_add_neon, export=1  /* x0 = dst, x1 = block, w2 = stride: transforms the 4x4 coefficient block, adds it to the 4x4 pixels at dst and zeroes the block */ | ||||
|         ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]  /* load all 16 coefficients, four per register */ | ||||
|         sxtw            x2,     w2  /* stride is a 32-bit int: sign-extend before using x2 as a 64-bit offset */ | ||||
|         movi            v30.8H, #0  /* zeros used to clear the block */ | ||||
|  | ||||
|         add             v4.4H,  v0.4H,  v2.4H  /* first 1-D pass starts here */ | ||||
|         sshr            v16.4H, v1.4H,  #1 | ||||
|         st1             {v30.8H},    [x1], #16  /* clear the first 16 bytes of block, x1 += 16 */ | ||||
|         sshr            v17.4H, v3.4H,  #1 | ||||
|         st1             {v30.8H},    [x1], #16  /* clear the remaining 16 bytes, x1 += 16 */ | ||||
|         sub             v5.4H,  v0.4H,  v2.4H | ||||
|         add             v6.4H,  v1.4H,  v17.4H | ||||
|         sub             v7.4H,  v16.4H, v3.4H | ||||
|         add             v0.4H,  v4.4H,  v6.4H | ||||
|         add             v1.4H,  v5.4H,  v7.4H | ||||
|         sub             v2.4H,  v4.4H,  v6.4H | ||||
|         sub             v3.4H,  v5.4H,  v7.4H | ||||
|  | ||||
|         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7  /* results land in v0..v3, v4..v7 are scratch */ | ||||
|  | ||||
|         add             v4.4H,  v0.4H,  v3.4H  /* second 1-D pass, interleaved with the dst loads */ | ||||
|         ld1             {v18.S}[0], [x0], x2  /* dst row 0 -> v18 lane 0 */ | ||||
|         sshr            v16.4H,  v2.4H,  #1 | ||||
|         sshr            v17.4H,  v1.4H,  #1 | ||||
|         ld1             {v19.S}[1], [x0], x2  /* dst row 1 -> v19 lane 1 */ | ||||
|         sub             v5.4H,  v0.4H,  v3.4H | ||||
|         ld1             {v18.S}[1], [x0], x2  /* dst row 2 -> v18 lane 1 */ | ||||
|         add             v6.4H,  v16.4H, v1.4H | ||||
|         ins             v4.D[1],  v5.D[0]  /* pack two 4H halves so the final add/sub run on 8 lanes */ | ||||
|         sub             v7.4H,  v2.4H,  v17.4H | ||||
|         ld1             {v19.S}[0], [x0], x2  /* dst row 3 -> v19 lane 0 */ | ||||
|         ins             v6.D[1],  v7.D[0] | ||||
|         sub             x0,  x0,  x2, lsl #2  /* rewind dst by the four rows just read */ | ||||
|         add             v0.8H,  v4.8H,  v6.8H  /* v0 is later combined with v18 (rows 0 and 2) */ | ||||
|         sub             v1.8H,  v4.8H,  v6.8H  /* v1 is later combined with v19 (rows 3 and 1) */ | ||||
|  | ||||
|         srshr           v0.8H,  v0.8H,  #6  /* rounding shift right by 6 */ | ||||
|         srshr           v1.8H,  v1.8H,  #6 | ||||
|  | ||||
|         uaddw           v0.8H,  v0.8H,  v18.8B  /* add the dst pixels, widened from 8 to 16 bits */ | ||||
|         uaddw           v1.8H,  v1.8H,  v19.8B | ||||
|  | ||||
|         sqxtun          v0.8B, v0.8H  /* narrow back to bytes with unsigned saturation */ | ||||
|         sqxtun          v1.8B, v1.8H | ||||
|  | ||||
|         st1             {v0.S}[0],  [x0], x2  /* stores use the same row/lane pairing as the loads */ | ||||
|         st1             {v1.S}[1],  [x0], x2 | ||||
|         st1             {v0.S}[1],  [x0], x2 | ||||
|         st1             {v1.S}[0],  [x0], x2 | ||||
|  | ||||
|         sub             x1,  x1,  #32  /* undo the two post-increments so x1 points at the block again */ | ||||
|         ret | ||||
| endfunc | ||||
|  | ||||
| function ff_h264_idct_dc_add_neon, export=1  /* x0 = dst, x1 = block, w2 = stride: adds the rounded block[0] to a 4x4 pixel area and clears block[0] */ | ||||
|         sxtw            x2,  w2  /* sign-extend the 32-bit stride */ | ||||
|         mov             w3,       #0 | ||||
|         ld1r            {v2.8H},  [x1]  /* replicate block[0] into all eight lanes */ | ||||
|         strh            w3,       [x1]  /* block[0] = 0 */ | ||||
|         srshr           v2.8H,  v2.8H,  #6  /* rounding shift right by 6 */ | ||||
|         ld1             {v0.S}[0],  [x0], x2  /* rows 0 and 1 -> v0 */ | ||||
|         ld1             {v0.S}[1],  [x0], x2 | ||||
|         uaddw           v3.8H,  v2.8H,  v0.8B  /* widen pixels and add the DC value */ | ||||
|         ld1             {v1.S}[0],  [x0], x2  /* rows 2 and 3 -> v1 */ | ||||
|         ld1             {v1.S}[1],  [x0], x2 | ||||
|         uaddw           v4.8H,  v2.8H,  v1.8B | ||||
|         sqxtun          v0.8B,  v3.8H  /* narrow to bytes with unsigned saturation */ | ||||
|         sqxtun          v1.8B,  v4.8H | ||||
|         sub             x0,  x0,  x2, lsl #2  /* rewind dst by four rows */ | ||||
|         st1             {v0.S}[0],  [x0], x2 | ||||
|         st1             {v0.S}[1],  [x0], x2 | ||||
|         st1             {v1.S}[0],  [x0], x2 | ||||
|         st1             {v1.S}[1],  [x0], x2 | ||||
|         ret | ||||
| endfunc | ||||
|  | ||||
| function ff_h264_idct_add16_neon, export=1  /* x0 = dst, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc: runs the 4x4 add or dc_add routine on each of 16 blocks whose nnzc entry is non-zero */ | ||||
|         mov             x12, x30  /* keep the return address: blr below overwrites x30 */ | ||||
|         mov             x6,  x0         // dest | ||||
|         mov             x5,  x1         // block_offset | ||||
|         mov             x1,  x2         // block | ||||
|         mov             w9,  w3         // stride | ||||
|         movrel          x7,  scan8 | ||||
|         mov             x10, #16  /* loop counter: 16 blocks */ | ||||
|         movrel          x13, ff_h264_idct_dc_add_neon | ||||
|         movrel          x14, ff_h264_idct_add_neon | ||||
| 1:      mov             w2,  w9  /* stride argument for the callee */ | ||||
|         ldrb            w3,  [x7], #1  /* scan8[i], advance the table pointer */ | ||||
|         ldrsw           x0,  [x5], #4  /* block_offset[i], advance */ | ||||
|         ldrb            w3,  [x4,  w3,  uxtw]  /* nnzc[scan8[i]] */ | ||||
|         subs            w3,  w3,  #1  /* flags: negative if nnzc was 0, equal if nnzc was 1 */ | ||||
|         b.lt            2f  /* nnzc == 0: skip this block */ | ||||
|         ldrsh           w3,  [x1]  /* block[0] */ | ||||
|         add             x0,  x0,  x6  /* dst + block_offset[i] */ | ||||
|         ccmp            w3,  #0,  #4,  eq  /* nnzc == 1: compare block[0] with 0; otherwise force Z set */ | ||||
|         csel            x15, x13, x14, ne  /* dc_add only when nnzc == 1 and block[0] != 0, else the full transform */ | ||||
|         blr             x15 | ||||
| 2:      subs            x10, x10, #1 | ||||
|         add             x1,  x1,  #32  /* next block: 32 bytes further (add leaves the flags from subs intact) */ | ||||
|         b.ne            1b | ||||
|         ret             x12 | ||||
| endfunc | ||||
|  | ||||
| function ff_h264_idct_add16intra_neon, export=1  /* x0 = dst, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc: like add16, but a block with nnzc == 0 still gets dc_add when block[0] != 0 */ | ||||
|         mov             x12, x30  /* keep the return address: blr below overwrites x30 */ | ||||
|         mov             x6,  x0         // dest | ||||
|         mov             x5,  x1         // block_offset | ||||
|         mov             x1,  x2         // block | ||||
|         mov             w9,  w3         // stride | ||||
|         movrel          x7,  scan8 | ||||
|         mov             x10, #16  /* loop counter: 16 blocks */ | ||||
|         movrel          x13, ff_h264_idct_dc_add_neon | ||||
|         movrel          x14, ff_h264_idct_add_neon | ||||
| 1:      mov             w2,  w9  /* stride argument for the callee */ | ||||
|         ldrb            w3,  [x7], #1  /* scan8[i], advance the table pointer */ | ||||
|         ldrsw           x0,  [x5], #4  /* block_offset[i], advance */ | ||||
|         ldrb            w3,  [x4,  w3,  uxtw]  /* nnzc[scan8[i]] */ | ||||
|         add             x0,  x0,  x6  /* dst + block_offset[i] */ | ||||
|         cmp             w3,  #0  /* flags: equal if nnzc == 0 */ | ||||
|         ldrsh           w3,  [x1]  /* block[0] (loads do not change the flags) */ | ||||
|         csel            x15, x13, x14, eq  /* nnzc == 0 -> dc_add, otherwise the full transform */ | ||||
|         ccmp            w3,  #0,  #0,  eq  /* nnzc == 0: compare block[0] with 0; otherwise force Z clear */ | ||||
|         b.eq            2f  /* nnzc == 0 and block[0] == 0: nothing to add */ | ||||
|         blr             x15 | ||||
| 2:      subs            x10, x10, #1 | ||||
|         add             x1,  x1,  #32  /* next block: 32 bytes further */ | ||||
|         b.ne            1b | ||||
|         ret             x12 | ||||
| endfunc | ||||
|  | ||||
| function ff_h264_idct_add8_neon, export=1  /* x0 = dest (array of pointers), x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc: handles entries 16..19 against dest[0] and 32..35 against dest[1] */ | ||||
|         sub             sp,  sp, #0x40  /* reserve 64 bytes of stack; only the first 16 are written below */ | ||||
|         stp             x19, x20, [sp]  /* x19/x20 are used below and restored before returning */ | ||||
|         mov             x12, x30  /* keep the return address: blr below overwrites x30 */ | ||||
|         ldp             x6,  x15, [x0]          // dest[0], dest[1] | ||||
|         add             x5,  x1,  #16*4         // block_offset | ||||
|         add             x9,  x2,  #16*32        // block | ||||
|         mov             w19, w3                 // stride | ||||
|         movrel          x13, ff_h264_idct_dc_add_neon | ||||
|         movrel          x14, ff_h264_idct_add_neon | ||||
|         movrel          x7,  scan8+16  /* table pointer biased by 16 entries, like block_offset and block above */ | ||||
|         mov             x10, #0  /* i, relative to the +16 bias */ | ||||
|         mov             x11, #16  /* value i jumps to after the first four blocks */ | ||||
| 1:      mov             w2,  w19  /* stride argument for the callee */ | ||||
|         ldrb            w3,  [x7, x10]          // scan8[i] | ||||
|         ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i] | ||||
|         ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ] | ||||
|         add             x0,  x0,  x6            // block_offset[i] + dst[j-1] | ||||
|         add             x1,  x9,  x10, lsl #5   // block + i * 16 | ||||
|         cmp             w3,  #0  /* flags: equal if nnzc == 0 */ | ||||
|         ldrsh           w3,  [x1]               // block[i*16] | ||||
|         csel            x20, x13, x14, eq  /* nnzc == 0 -> dc_add, otherwise the full transform */ | ||||
|         ccmp            w3,  #0,  #0,  eq  /* nnzc == 0: compare the first coefficient with 0; otherwise force Z clear */ | ||||
|         b.eq            2f  /* nnzc == 0 and first coefficient == 0: skip */ | ||||
|         blr             x20 | ||||
| 2:      add             x10, x10, #1 | ||||
|         cmp             x10, #4  /* after the first four blocks ... */ | ||||
|         csel            x10, x11, x10, eq     // mov x10, #16 | ||||
|         csel            x6,  x15, x6,  eq  /* ... and switch the destination to dest[1] */ | ||||
|         cmp             x10, #20 | ||||
|         b.lt            1b  /* continue until i reaches 20 */ | ||||
|         ldp             x19, x20, [sp]  /* restore the saved registers */ | ||||
|         add             sp,  sp,  #0x40 | ||||
|         ret             x12 | ||||
| endfunc | ||||
|  | ||||
| .macro  idct8x8_cols    pass  /* one 1-D pass of the 8x8 transform over eight 8H registers; pass 0 also fetches the last two rows from [x1], pass 1 works on registers only */ | ||||
|   .if \pass == 0  /* pass 0: row 6 is read into v30, row 7 into v31; v19 must be zero on entry because it is stored to clear the block */ | ||||
|         va      .req    v18 | ||||
|         vb      .req    v30 | ||||
|         sshr            v18.8H, v26.8H, #1 | ||||
|         add             v16.8H, v24.8H, v28.8H | ||||
|         ld1             {v30.8H, v31.8H}, [x1]  /* load the remaining 32 bytes of coefficients */ | ||||
|         st1             {v19.8H}, [x1],  #16  /* clear them, x1 += 16 */ | ||||
|         st1             {v19.8H}, [x1],  #16  /* x1 += 16 again; v19 is overwritten only after this point */ | ||||
|         sub             v17.8H,  v24.8H, v28.8H | ||||
|         sshr            v19.8H,  v30.8H, #1 | ||||
|         sub             v18.8H,  v18.8H,  v30.8H | ||||
|         add             v19.8H,  v19.8H,  v26.8H | ||||
|   .else  /* pass 1: same arithmetic with the roles of v18 and v30 exchanged, no memory access */ | ||||
|         va      .req    v30 | ||||
|         vb      .req    v18 | ||||
|         sshr            v30.8H, v26.8H, #1 | ||||
|         sshr            v19.8H, v18.8H, #1 | ||||
|         add             v16.8H, v24.8H, v28.8H | ||||
|         sub             v17.8H, v24.8H, v28.8H | ||||
|         sub             v30.8H, v30.8H, v18.8H | ||||
|         add             v19.8H, v19.8H, v26.8H | ||||
|   .endif | ||||
|         add             v26.8H, v17.8H, va.8H  /* part shared by both passes, written in terms of the va/vb aliases */ | ||||
|         sub             v28.8H, v17.8H, va.8H | ||||
|         add             v24.8H, v16.8H, v19.8H | ||||
|         sub             vb.8H,  v16.8H, v19.8H | ||||
|         sub             v16.8H, v29.8H, v27.8H | ||||
|         add             v17.8H, v31.8H, v25.8H | ||||
|         sub             va.8H,  v31.8H, v25.8H | ||||
|         add             v19.8H, v29.8H, v27.8H | ||||
|         sub             v16.8H, v16.8H, v31.8H | ||||
|         sub             v17.8H, v17.8H, v27.8H | ||||
|         add             va.8H,  va.8H,  v29.8H | ||||
|         add             v19.8H, v19.8H, v25.8H | ||||
|         sshr            v25.8H, v25.8H, #1 | ||||
|         sshr            v27.8H, v27.8H, #1 | ||||
|         sshr            v29.8H, v29.8H, #1 | ||||
|         sshr            v31.8H, v31.8H, #1 | ||||
|         sub             v16.8H, v16.8H, v31.8H | ||||
|         sub             v17.8H, v17.8H, v27.8H | ||||
|         add             va.8H,  va.8H,  v29.8H | ||||
|         add             v19.8H, v19.8H, v25.8H | ||||
|         sshr            v25.8H, v16.8H, #2 | ||||
|         sshr            v27.8H, v17.8H, #2 | ||||
|         sshr            v29.8H, va.8H,  #2 | ||||
|         sshr            v31.8H, v19.8H, #2 | ||||
|         sub             v19.8H, v19.8H, v25.8H | ||||
|         sub             va.8H,  v27.8H, va.8H | ||||
|         add             v17.8H, v17.8H, v29.8H | ||||
|         add             v16.8H, v16.8H, v31.8H | ||||
|   .if \pass == 0  /* pass 0 leaves its eight results in v24..v29, v18 and v31 */ | ||||
|         sub             v31.8H, v24.8H, v19.8H | ||||
|         add             v24.8H, v24.8H, v19.8H | ||||
|         add             v25.8H, v26.8H, v18.8H | ||||
|         sub             v18.8H, v26.8H, v18.8H | ||||
|         add             v26.8H, v28.8H, v17.8H | ||||
|         add             v27.8H, v30.8H, v16.8H | ||||
|         sub             v29.8H, v28.8H, v17.8H | ||||
|         sub             v28.8H, v30.8H, v16.8H | ||||
|   .else  /* pass 1 leaves its eight results in v24..v31 */ | ||||
|         sub             v31.8H, v24.8H, v19.8H | ||||
|         add             v24.8H, v24.8H, v19.8H | ||||
|         add             v25.8H, v26.8H, v30.8H | ||||
|         sub             v30.8H, v26.8H, v30.8H | ||||
|         add             v26.8H, v28.8H, v17.8H | ||||
|         sub             v29.8H, v28.8H, v17.8H | ||||
|         add             v27.8H, v18.8H, v16.8H | ||||
|         sub             v28.8H, v18.8H, v16.8H | ||||
|   .endif | ||||
|         .unreq          va  /* drop the aliases so the next expansion can redefine them */ | ||||
|         .unreq          vb | ||||
| .endm | ||||
|  | ||||
| /* | ||||
|  * ff_h264_idct8_add_neon(dst = x0, block = x1, stride = w2) | ||||
|  * | ||||
|  * Transforms the 8x8 coefficient block at x1, adds the rounded result to the | ||||
|  * 8x8 pixels at x0 and zeroes the whole 128-byte block. x1 is restored to the | ||||
|  * start of the block before returning. | ||||
|  */ | ||||
| function ff_h264_idct8_add_neon, export=1 | ||||
|         movi            v19.8H,   #0  /* zeros used to clear the block (also needed by idct8x8_cols 0) */ | ||||
|         sxtw            x2,       w2  /* stride is a 32-bit int whose upper register bits are unspecified: sign-extend before using x2 as a 64-bit post-index */ | ||||
|         ld1             {v24.8H, v25.8H}, [x1]  /* rows 0 and 1 */ | ||||
|         st1             {v19.8H},  [x1],   #16 | ||||
|         st1             {v19.8H},  [x1],   #16 | ||||
|         ld1             {v26.8H, v27.8H}, [x1]  /* rows 2 and 3 */ | ||||
|         st1             {v19.8H},  [x1],   #16 | ||||
|         st1             {v19.8H},  [x1],   #16 | ||||
|         ld1             {v28.8H, v29.8H}, [x1]  /* rows 4 and 5; rows 6 and 7 are loaded and cleared inside the macro */ | ||||
|         st1             {v19.8H},  [x1],   #16 | ||||
|         st1             {v19.8H},  [x1],   #16 | ||||
|  | ||||
|         idct8x8_cols    0 | ||||
|         transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7  /* pass 0 leaves its seventh result in v18, hence v18 in the row list; v6/v7 are scratch */ | ||||
|         idct8x8_cols    1 | ||||
|  | ||||
|         mov             x3,  x0  /* x0 walks the rows for loading, x3 for storing */ | ||||
|         srshr           v24.8H, v24.8H, #6  /* rounding shift right by 6, interleaved with the dst row loads */ | ||||
|         ld1             {v0.8B},     [x0], x2 | ||||
|         srshr           v25.8H, v25.8H, #6 | ||||
|         ld1             {v1.8B},     [x0], x2 | ||||
|         srshr           v26.8H, v26.8H, #6 | ||||
|         ld1             {v2.8B},     [x0], x2 | ||||
|         srshr           v27.8H, v27.8H, #6 | ||||
|         ld1             {v3.8B},     [x0], x2 | ||||
|         srshr           v28.8H, v28.8H, #6 | ||||
|         ld1             {v4.8B},     [x0], x2 | ||||
|         srshr           v29.8H, v29.8H, #6 | ||||
|         ld1             {v5.8B},     [x0], x2 | ||||
|         srshr           v30.8H, v30.8H, #6 | ||||
|         ld1             {v6.8B},     [x0], x2 | ||||
|         srshr           v31.8H, v31.8H, #6 | ||||
|         ld1             {v7.8B},     [x0], x2 | ||||
|         uaddw           v24.8H, v24.8H, v0.8B  /* add the widened dst pixels, then narrow with unsigned saturation and store */ | ||||
|         uaddw           v25.8H, v25.8H, v1.8B | ||||
|         uaddw           v26.8H, v26.8H, v2.8B | ||||
|         sqxtun          v0.8B,  v24.8H | ||||
|         uaddw           v27.8H, v27.8H, v3.8B | ||||
|         sqxtun          v1.8B,  v25.8H | ||||
|         uaddw           v28.8H, v28.8H, v4.8B | ||||
|         sqxtun          v2.8B,  v26.8H | ||||
|         st1             {v0.8B},     [x3], x2 | ||||
|         uaddw           v29.8H, v29.8H, v5.8B | ||||
|         sqxtun          v3.8B,  v27.8H | ||||
|         st1             {v1.8B},     [x3], x2 | ||||
|         uaddw           v30.8H, v30.8H, v6.8B | ||||
|         sqxtun          v4.8B,  v28.8H | ||||
|         st1             {v2.8B},     [x3], x2 | ||||
|         uaddw           v31.8H, v31.8H, v7.8B | ||||
|         sqxtun          v5.8B,  v29.8H | ||||
|         st1             {v3.8B},     [x3], x2 | ||||
|         sqxtun          v6.8B,  v30.8H | ||||
|         sqxtun          v7.8B,  v31.8H | ||||
|         st1             {v4.8B},     [x3], x2 | ||||
|         st1             {v5.8B},     [x3], x2 | ||||
|         st1             {v6.8B},     [x3], x2 | ||||
|         st1             {v7.8B},     [x3], x2 | ||||
|  | ||||
|         sub             x1,  x1,  #128  /* six post-increments above plus two in idct8x8_cols 0 advanced x1 by 128 bytes: undo them */ | ||||
|         ret | ||||
| endfunc | ||||
|  | ||||
| function ff_h264_idct8_dc_add_neon, export=1  /* x0 = dst, x1 = block, w2 = stride: adds the rounded block[0] to an 8x8 pixel area and clears block[0] */ | ||||
|         mov             w3,       #0 | ||||
|         sxtw            x2,       w2  /* sign-extend the 32-bit stride */ | ||||
|         ld1r            {v31.8H}, [x1]  /* replicate block[0] into all eight lanes */ | ||||
|         strh            w3,       [x1]  /* block[0] = 0 */ | ||||
|         ld1             {v0.8B},  [x0], x2  /* eight dst rows go to v0..v7, interleaved with the arithmetic */ | ||||
|         srshr           v31.8H, v31.8H, #6  /* rounding shift right by 6 */ | ||||
|         ld1             {v1.8B},     [x0], x2 | ||||
|         ld1             {v2.8B},     [x0], x2 | ||||
|         uaddw           v24.8H, v31.8H, v0.8B  /* widen pixels and add the DC value */ | ||||
|         ld1             {v3.8B},     [x0], x2 | ||||
|         uaddw           v25.8H, v31.8H, v1.8B | ||||
|         ld1             {v4.8B},     [x0], x2 | ||||
|         uaddw           v26.8H, v31.8H, v2.8B | ||||
|         ld1             {v5.8B},     [x0], x2 | ||||
|         uaddw           v27.8H, v31.8H, v3.8B | ||||
|         ld1             {v6.8B},     [x0], x2 | ||||
|         uaddw           v28.8H, v31.8H, v4.8B | ||||
|         ld1             {v7.8B},     [x0], x2 | ||||
|         uaddw           v29.8H, v31.8H, v5.8B | ||||
|         uaddw           v30.8H, v31.8H, v6.8B | ||||
|         uaddw           v31.8H, v31.8H, v7.8B  /* last use of the DC value, so v31 can be overwritten */ | ||||
|         sqxtun          v0.8B,  v24.8H  /* narrow to bytes with unsigned saturation */ | ||||
|         sqxtun          v1.8B,  v25.8H | ||||
|         sqxtun          v2.8B,  v26.8H | ||||
|         sqxtun          v3.8B,  v27.8H | ||||
|         sub             x0,  x0,  x2, lsl #3  /* rewind dst by eight rows */ | ||||
|         st1             {v0.8B},     [x0], x2 | ||||
|         sqxtun          v4.8B,  v28.8H | ||||
|         st1             {v1.8B},     [x0], x2 | ||||
|         sqxtun          v5.8B,  v29.8H | ||||
|         st1             {v2.8B},     [x0], x2 | ||||
|         sqxtun          v6.8B,  v30.8H | ||||
|         st1             {v3.8B},     [x0], x2 | ||||
|         sqxtun          v7.8B,  v31.8H | ||||
|         st1             {v4.8B},     [x0], x2 | ||||
|         st1             {v5.8B},     [x0], x2 | ||||
|         st1             {v6.8B},     [x0], x2 | ||||
|         st1             {v7.8B},     [x0], x2 | ||||
|         ret | ||||
| endfunc | ||||
|  | ||||
| /* | ||||
|  * ff_h264_idct8_add4_neon(dst = x0, block_offset = x1, block = x2, | ||||
|  *                         stride = w3, nnzc = x4) | ||||
|  * | ||||
|  * Walks four 8x8 blocks (every fourth scan8 / block_offset entry) and, for | ||||
|  * each block whose nnzc entry is non-zero, calls the 8x8 dc_add or full add | ||||
|  * routine. | ||||
|  */ | ||||
| function ff_h264_idct8_add4_neon, export=1 | ||||
|         mov             x12, x30  /* keep the return address: blr below overwrites x30 */ | ||||
|         mov             x6,  x0  /* dst */ | ||||
|         mov             x5,  x1  /* block_offset */ | ||||
|         mov             x1,  x2  /* block */ | ||||
|         sxtw            x2,  w3  /* stride: sign-extend to 64 bits, since the callee may use x2 as a 64-bit offset and a plain 32-bit move would zero-extend a negative stride */ | ||||
|         movrel          x7,  scan8 | ||||
|         mov             w10, #16  /* loop counter, decremented by 4 per 8x8 block */ | ||||
|         movrel          x13, ff_h264_idct8_dc_add_neon | ||||
|         movrel          x14, ff_h264_idct8_add_neon | ||||
| 1:      ldrb            w9,  [x7], #4  /* scan8[i], advance by four entries */ | ||||
|         ldrsw           x0,  [x5], #16  /* block_offset[i], advance by four ints */ | ||||
|         ldrb            w9,  [x4, w9, UXTW]  /* nnzc[scan8[i]] */ | ||||
|         subs            w9,  w9,  #1  /* flags: negative if nnzc was 0, equal if nnzc was 1 */ | ||||
|         b.lt            2f  /* nnzc == 0: skip this block */ | ||||
|         ldrsh           w11,  [x1]  /* block[0] */ | ||||
|         add             x0,  x6,  x0  /* dst + block_offset[i] */ | ||||
|         ccmp            w11, #0,  #4,  eq  /* nnzc == 1: compare block[0] with 0; otherwise force Z set */ | ||||
|         csel            x15, x13, x14, ne  /* dc_add only when nnzc == 1 and block[0] != 0, else the full transform */ | ||||
|         blr             x15 | ||||
| 2:      subs            w10, w10, #4 | ||||
|         add             x1,  x1,  #128  /* next 8x8 block: 128 bytes further */ | ||||
|         b.ne            1b | ||||
|         ret             x12 | ||||
| endfunc | ||||
|  | ||||
| const   scan8  /* byte table of nnzc indices, each written as x + y*8; read by the add16, add16intra, add8 and idct8_add4 loops in this file */ | ||||
|         .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8  /* entries 0..15: walked by add16/add16intra; idct8_add4 reads every fourth one */ | ||||
|         .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 | ||||
|         .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 | ||||
|         .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 | ||||
|         .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8  /* entries 16..19: first group read by add8 (scan8+16, i = 0..3) */ | ||||
|         .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 | ||||
|         .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 | ||||
|         .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 | ||||
|         .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8  /* entries 32..35: second group read by add8 (scan8+16, i = 16..19) */ | ||||
|         .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8 | ||||
|         .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8 | ||||
|         .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8  /* largest index in the table: 7+14*8 = 119 */ | ||||
| endconst | ||||
							
								
								
									
										61
									
								
								libavcodec/aarch64/neon.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								libavcodec/aarch64/neon.S
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,61 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7  /* 4x4 halfword transpose of r0..r3 with r4..r7 as scratch; the result is permuted, see the notes below */ | ||||
|         trn1            \r4\().4H,  \r0\().4H,  \r1\().4H  /* even lanes of r0/r1 interleaved */ | ||||
|         trn2            \r5\().4H,  \r0\().4H,  \r1\().4H  /* odd lanes of r0/r1 interleaved */ | ||||
|         trn1            \r7\().4H,  \r3\().4H,  \r2\().4H  /* note the operand order: r3 before r2 */ | ||||
|         trn2            \r6\().4H,  \r3\().4H,  \r2\().4H | ||||
|         trn1            \r0\().2S,  \r4\().2S,  \r7\().2S  /* r0 = column 0, lanes taken from inputs r0, r1, r3, r2 in that order */ | ||||
|         trn2            \r3\().2S,  \r4\().2S,  \r7\().2S  /* r3 = column 2 */ | ||||
|         trn1            \r1\().2S,  \r5\().2S,  \r6\().2S  /* r1 = column 1 */ | ||||
|         trn2            \r2\().2S,  \r5\().2S,  \r6\().2S  /* r2 = column 3 */ | ||||
| .endm | ||||
|  | ||||
| .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9  /* in-place 8x8 halfword transpose of r0..r7 (r0 ends up holding column 0, r7 column 7); r8 and r9 are scratch */ | ||||
|         trn1            \r8\().8H,  \r0\().8H,  \r1\().8H  /* stage 1: interleave halfwords of adjacent row pairs */ | ||||
|         trn2            \r9\().8H,  \r0\().8H,  \r1\().8H | ||||
|         trn1            \r1\().8H,  \r2\().8H,  \r3\().8H | ||||
|         trn2            \r3\().8H,  \r2\().8H,  \r3\().8H | ||||
|         trn1            \r0\().8H,  \r4\().8H,  \r5\().8H | ||||
|         trn2            \r5\().8H,  \r4\().8H,  \r5\().8H | ||||
|         trn1            \r2\().8H,  \r6\().8H,  \r7\().8H | ||||
|         trn2            \r7\().8H,  \r6\().8H,  \r7\().8H | ||||
|  | ||||
|         trn1            \r4\().4S,  \r0\().4S,  \r2\().4S  /* stage 2: interleave 32-bit pairs */ | ||||
|         trn2            \r2\().4S,  \r0\().4S,  \r2\().4S | ||||
|         trn1            \r6\().4S,  \r5\().4S,  \r7\().4S | ||||
|         trn2            \r7\().4S,  \r5\().4S,  \r7\().4S | ||||
|         trn1            \r5\().4S,  \r9\().4S,  \r3\().4S | ||||
|         trn2            \r9\().4S,  \r9\().4S,  \r3\().4S | ||||
|         trn1            \r3\().4S,  \r8\().4S,  \r1\().4S | ||||
|         trn2            \r8\().4S,  \r8\().4S,  \r1\().4S | ||||
|  | ||||
|         trn1            \r0\().2D,  \r3\().2D,  \r4\().2D  /* stage 3: combine 64-bit halves into the final columns */ | ||||
|         trn2            \r4\().2D,  \r3\().2D,  \r4\().2D | ||||
|  | ||||
|         trn1            \r1\().2D,  \r5\().2D,  \r6\().2D | ||||
|         trn2            \r5\().2D,  \r5\().2D,  \r6\().2D | ||||
|  | ||||
|         trn2            \r6\().2D,  \r8\().2D,  \r2\().2D  /* trn2 first here: the trn1 on the next line overwrites r2, which this instruction still reads */ | ||||
|         trn1            \r2\().2D,  \r8\().2D,  \r2\().2D | ||||
|  | ||||
|         trn1            \r3\().2D,  \r9\().2D,  \r7\().2D | ||||
|         trn2            \r7\().2D,  \r9\().2D,  \r7\().2D | ||||
|  | ||||
| .endm | ||||
| @@ -180,6 +180,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | ||||
|     } | ||||
|     c->h264_find_start_code_candidate = h264_find_start_code_candidate_c; | ||||
|  | ||||
|     if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc); | ||||
|     if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); | ||||
|     if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); | ||||
|     if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc); | ||||
|   | ||||
| @@ -118,6 +118,8 @@ typedef struct H264DSPContext { | ||||
|  | ||||
| void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | ||||
|                      const int chroma_format_idc); | ||||
| void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, | ||||
|                              const int chroma_format_idc); | ||||
| void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, | ||||
|                          const int chroma_format_idc); | ||||
| void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user