You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	lavc/hevcdsp_qpel_neon: using movi.16b instead of movi.2d
Building iOS platform with arm64, the compiler has a warning: "instruction movi.2d with immediate #0 may not function correctly on this CPU, converting to movi.16b" Signed-off-by: xufuji456 <839789740@qq.com> Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
		
				
					committed by
					
						 Martin Storsjö
						Martin Storsjö
					
				
			
			
				
	
			
			
			
						parent
						
							67ce690bc6
						
					
				
				
					commit
					cc86343b96
				
			| @@ -694,7 +694,7 @@ function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1 | ||||
|         trn1            v4.2s, v4.2s, v5.2s | ||||
|         trn1            v6.2s, v6.2s, v7.2s | ||||
|         trn1            v4.2d, v4.2d, v6.2d | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         usdot           v16.4s, v4.16b, v30.16b | ||||
|         xtn             v16.4h, v16.4s | ||||
|         st1             {v16.4h}, [x0], x10 | ||||
| @@ -714,8 +714,8 @@ function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1 | ||||
|         trn2            v17.2s, v4.2s, v5.2s | ||||
|         trn1            v6.2s, v6.2s, v7.2s | ||||
|         trn1            v16.2d, v16.2d, v6.2d | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v19.2d, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         movi            v19.16b, #0 | ||||
|         usdot           v18.4s, v16.16b, v30.16b | ||||
|         usdot           v19.2s, v17.8b, v30.8b | ||||
|         xtn             v18.4h, v18.4s | ||||
| @@ -736,8 +736,8 @@ function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1 | ||||
|         ext             v7.16b, v4.16b, v4.16b, #3 | ||||
|         zip1            v20.4s, v4.4s, v6.4s | ||||
|         zip1            v21.4s, v5.4s, v7.4s | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         usdot           v16.4s, v20.16b, v30.16b | ||||
|         usdot           v17.4s, v21.16b, v30.16b | ||||
|         xtn             v16.4h, v16.4s | ||||
| @@ -761,9 +761,9 @@ function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1 | ||||
|         trn1            v4.4s, v20.4s, v21.4s | ||||
|         trn2            v5.4s, v20.4s, v21.4s | ||||
|         trn1            v6.4s, v22.4s, v23.4s | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         usdot           v16.4s, v4.16b, v30.16b | ||||
|         usdot           v17.4s, v5.16b, v30.16b | ||||
|         usdot           v18.4s, v6.16b, v30.16b | ||||
| @@ -788,10 +788,10 @@ function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1 | ||||
|         zip2            v22.4s, v0.4s, v6.4s | ||||
|         zip1            v21.4s, v5.4s, v7.4s | ||||
|         zip2            v23.4s, v5.4s, v7.4s | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v19.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         movi            v19.16b, #0 | ||||
|         usdot           v16.4s, v20.16b, v30.16b | ||||
|         usdot           v17.4s, v21.16b, v30.16b | ||||
|         usdot           v18.4s, v22.16b, v30.16b | ||||
| @@ -815,14 +815,14 @@ function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1 | ||||
|         ext             v26.16b, v1.16b, v1.16b, #1 | ||||
|         ext             v27.16b, v1.16b, v1.16b, #2 | ||||
|         ext             v28.16b, v1.16b, v1.16b, #3 | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v19.2d, #0 | ||||
|         movi            v20.2d, #0 | ||||
|         movi            v21.2d, #0 | ||||
|         movi            v22.2d, #0 | ||||
|         movi            v23.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         movi            v19.16b, #0 | ||||
|         movi            v20.16b, #0 | ||||
|         movi            v21.16b, #0 | ||||
|         movi            v22.16b, #0 | ||||
|         movi            v23.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v30.16b | ||||
|         usdot           v17.4s, v5.16b, v30.16b | ||||
|         usdot           v18.4s, v6.16b, v30.16b | ||||
| @@ -861,14 +861,14 @@ function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1 | ||||
|         ext             v26.16b, v1.16b, v2.16b, #1 | ||||
|         ext             v27.16b, v1.16b, v2.16b, #2 | ||||
|         ext             v28.16b, v1.16b, v2.16b, #3 | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v19.2d, #0 | ||||
|         movi            v20.2d, #0 | ||||
|         movi            v21.2d, #0 | ||||
|         movi            v22.2d, #0 | ||||
|         movi            v23.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         movi            v19.16b, #0 | ||||
|         movi            v20.16b, #0 | ||||
|         movi            v21.16b, #0 | ||||
|         movi            v22.16b, #0 | ||||
|         movi            v23.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v30.16b | ||||
|         usdot           v17.4s, v5.16b, v30.16b | ||||
|         usdot           v18.4s, v6.16b, v30.16b | ||||
| @@ -900,18 +900,18 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1 | ||||
|         ext             v16.16b, v1.16b, v2.16b, #1 | ||||
|         ext             v17.16b, v1.16b, v2.16b, #2 | ||||
|         ext             v18.16b, v1.16b, v2.16b, #3 | ||||
|         movi            v20.2d, #0 | ||||
|         movi            v21.2d, #0 | ||||
|         movi            v22.2d, #0 | ||||
|         movi            v23.2d, #0 | ||||
|         movi            v20.16b, #0 | ||||
|         movi            v21.16b, #0 | ||||
|         movi            v22.16b, #0 | ||||
|         movi            v23.16b, #0 | ||||
|         usdot           v20.4s, v0.16b, v30.16b | ||||
|         usdot           v21.4s, v4.16b, v30.16b | ||||
|         usdot           v22.4s, v5.16b, v30.16b | ||||
|         usdot           v23.4s, v6.16b, v30.16b | ||||
|         movi            v24.2d, #0 | ||||
|         movi            v25.2d, #0 | ||||
|         movi            v26.2d, #0 | ||||
|         movi            v27.2d, #0 | ||||
|         movi            v24.16b, #0 | ||||
|         movi            v25.16b, #0 | ||||
|         movi            v26.16b, #0 | ||||
|         movi            v27.16b, #0 | ||||
|         usdot           v24.4s, v1.16b, v30.16b | ||||
|         usdot           v25.4s, v16.16b, v30.16b | ||||
|         usdot           v26.4s, v17.16b, v30.16b | ||||
| @@ -928,10 +928,10 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1 | ||||
|         ext             v4.16b, v2.16b, v3.16b, #1 | ||||
|         ext             v5.16b, v2.16b, v3.16b, #2 | ||||
|         ext             v6.16b, v2.16b, v3.16b, #3 | ||||
|         movi            v20.2d, #0 | ||||
|         movi            v21.2d, #0 | ||||
|         movi            v22.2d, #0 | ||||
|         movi            v23.2d, #0 | ||||
|         movi            v20.16b, #0 | ||||
|         movi            v21.16b, #0 | ||||
|         movi            v22.16b, #0 | ||||
|         movi            v23.16b, #0 | ||||
|         usdot           v20.4s, v2.16b, v30.16b | ||||
|         usdot           v21.4s, v4.16b, v30.16b | ||||
|         usdot           v22.4s, v5.16b, v30.16b | ||||
| @@ -957,18 +957,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1 | ||||
|         ext             v16.16b, v1.16b, v2.16b, #1 | ||||
|         ext             v17.16b, v1.16b, v2.16b, #2 | ||||
|         ext             v18.16b, v1.16b, v2.16b, #3 | ||||
|         movi            v20.2d, #0 | ||||
|         movi            v21.2d, #0 | ||||
|         movi            v22.2d, #0 | ||||
|         movi            v23.2d, #0 | ||||
|         movi            v20.16b, #0 | ||||
|         movi            v21.16b, #0 | ||||
|         movi            v22.16b, #0 | ||||
|         movi            v23.16b, #0 | ||||
|         usdot           v20.4s, v0.16b, v30.16b | ||||
|         usdot           v21.4s, v4.16b, v30.16b | ||||
|         usdot           v22.4s, v5.16b, v30.16b | ||||
|         usdot           v23.4s, v6.16b, v30.16b | ||||
|         movi            v24.2d, #0 | ||||
|         movi            v25.2d, #0 | ||||
|         movi            v26.2d, #0 | ||||
|         movi            v27.2d, #0 | ||||
|         movi            v24.16b, #0 | ||||
|         movi            v25.16b, #0 | ||||
|         movi            v26.16b, #0 | ||||
|         movi            v27.16b, #0 | ||||
|         usdot           v24.4s, v1.16b, v30.16b | ||||
|         usdot           v25.4s, v16.16b, v30.16b | ||||
|         usdot           v26.4s, v17.16b, v30.16b | ||||
| @@ -989,18 +989,18 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1 | ||||
|         ext             v16.16b, v3.16b, v7.16b, #1 | ||||
|         ext             v17.16b, v3.16b, v7.16b, #2 | ||||
|         ext             v18.16b, v3.16b, v7.16b, #3 | ||||
|         movi            v20.2d, #0 | ||||
|         movi            v21.2d, #0 | ||||
|         movi            v22.2d, #0 | ||||
|         movi            v23.2d, #0 | ||||
|         movi            v20.16b, #0 | ||||
|         movi            v21.16b, #0 | ||||
|         movi            v22.16b, #0 | ||||
|         movi            v23.16b, #0 | ||||
|         usdot           v20.4s, v2.16b, v30.16b | ||||
|         usdot           v21.4s, v4.16b, v30.16b | ||||
|         usdot           v22.4s, v5.16b, v30.16b | ||||
|         usdot           v23.4s, v6.16b, v30.16b | ||||
|         movi            v24.2d, #0 | ||||
|         movi            v25.2d, #0 | ||||
|         movi            v26.2d, #0 | ||||
|         movi            v27.2d, #0 | ||||
|         movi            v24.16b, #0 | ||||
|         movi            v25.16b, #0 | ||||
|         movi            v26.16b, #0 | ||||
|         movi            v27.16b, #0 | ||||
|         usdot           v24.4s, v3.16b, v30.16b | ||||
|         usdot           v25.4s, v16.16b, v30.16b | ||||
|         usdot           v26.4s, v17.16b, v30.16b | ||||
| @@ -1593,7 +1593,7 @@ function ff_hevc_put_hevc_epel_uni_w_h4_8_neon_i8mm, export=1 | ||||
|         trn1            v0.2s, v0.2s, v2.2s | ||||
|         trn1            v1.2s, v1.2s, v3.2s | ||||
|         zip1            v0.4s, v0.4s, v1.4s | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v28.16b | ||||
|         mul             v16.4s, v16.4s, v30.4s | ||||
|         sqrshl          v16.4s, v16.4s, v31.4s | ||||
| @@ -1620,8 +1620,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1 | ||||
|         trn2            v6.2s, v0.2s, v1.2s | ||||
|         trn1            v5.2s, v2.2s, v3.2s | ||||
|         zip1            v4.2d, v4.2d, v5.2d | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         usdot           v16.4s, v4.16b, v28.16b | ||||
|         usdot           v17.2s, v6.8b, v28.8b | ||||
|         mul             v16.4s, v16.4s, v30.4s | ||||
| @@ -1640,8 +1640,8 @@ function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1 | ||||
| endfunc | ||||
|  | ||||
| .macro  EPEL_UNI_W_H_CALC s0, s1, d0, d1 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d1\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         movi            \d1\().16b, #0 | ||||
|         usdot           \d0\().4s, \s0\().16b, v28.16b | ||||
|         usdot           \d1\().4s, \s1\().16b, v28.16b | ||||
|         mul             \d0\().4s, \d0\().4s, v30.4s | ||||
| @@ -1687,7 +1687,7 @@ function ff_hevc_put_hevc_epel_uni_w_h12_8_neon_i8mm, export=1 | ||||
|         zip2            v7.4s, v1.4s, v3.4s | ||||
|         zip1            v6.4s, v6.4s, v7.4s | ||||
|         EPEL_UNI_W_H_CALC v4, v5, v16, v17 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         usdot           v18.4s, v6.16b, v28.16b | ||||
|         mul             v18.4s, v18.4s, v30.4s | ||||
|         sqrshl          v18.4s, v18.4s, v31.4s | ||||
| @@ -2575,7 +2575,7 @@ DISABLE_I8MM | ||||
| .endm | ||||
|  | ||||
| .macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         umlsl           \d0\().8h, \s0\().8b, v0.8b | ||||
|         umlal           \d0\().8h, \s1\().8b, v1.8b | ||||
|         umlal           \d0\().8h, \s2\().8b, v2.8b | ||||
| @@ -2626,7 +2626,7 @@ function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1 | ||||
| endfunc | ||||
|  | ||||
| .macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         umlsl           \d0\().8h, \s0\().8b, v0.8b | ||||
|         umlal           \d0\().8h, \s1\().8b, v1.8b | ||||
|         umlal           \d0\().8h, \s2\().8b, v2.8b | ||||
| @@ -2720,8 +2720,8 @@ function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1 | ||||
| endfunc | ||||
|  | ||||
| .macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d1\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         movi            \d1\().16b, #0 | ||||
|         umlsl           \d0\().8h, \s0\().8b, v0.8b | ||||
|         umlsl2          \d1\().8h, \s0\().16b, v0.16b | ||||
|         umlal           \d0\().8h, \s1\().8b, v1.8b | ||||
| @@ -2793,8 +2793,8 @@ function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1 | ||||
| endfunc | ||||
|  | ||||
| .macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d1\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         movi            \d1\().16b, #0 | ||||
|         umlsl           \d0\().8h, \s0\().8b, v0.8b | ||||
|         umlsl2          \d1\().8h, \s0\().16b, v0.16b | ||||
|         umlal           \d0\().8h, \s1\().8b, v1.8b | ||||
|   | ||||
| @@ -2180,8 +2180,8 @@ function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1 | ||||
|         ext             v3.16b, v0.16b, v0.16b, #3 | ||||
|         zip1            v0.2d, v0.2d, v1.2d | ||||
|         zip1            v2.2d, v2.2d, v3.2d | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v28.16b | ||||
|         usdot           v17.4s, v2.16b, v28.16b | ||||
|         addp            v16.4s, v16.4s, v17.4s | ||||
| @@ -2210,9 +2210,9 @@ function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1 | ||||
|         zip1            v0.2d, v0.2d, v1.2d | ||||
|         zip1            v2.2d, v2.2d, v3.2d | ||||
|         zip1            v4.2d, v4.2d, v5.2d | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v28.16b | ||||
|         usdot           v17.4s, v2.16b, v28.16b | ||||
|         usdot           v18.4s, v4.16b, v28.16b | ||||
| @@ -2236,10 +2236,10 @@ endfunc | ||||
|  | ||||
|  | ||||
| .macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d1\().2d, #0 | ||||
|         movi            \d2\().2d, #0 | ||||
|         movi            \d3\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         movi            \d1\().16b, #0 | ||||
|         movi            \d2\().16b, #0 | ||||
|         movi            \d3\().16b, #0 | ||||
|         usdot           \d0\().4s, \s0\().16b, v28.16b | ||||
|         usdot           \d1\().4s, \s1\().16b, v28.16b | ||||
|         usdot           \d2\().4s, \s2\().16b, v28.16b | ||||
| @@ -2255,8 +2255,8 @@ endfunc | ||||
| .endm | ||||
|  | ||||
| .macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d1\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         movi            \d1\().16b, #0 | ||||
|         usdot           \d0\().4s, \s0\().16b, v28.16b | ||||
|         usdot           \d1\().4s, \s1\().16b, v28.16b | ||||
|         addp            \d0\().4s, \d0\().4s, \d1\().4s | ||||
| @@ -2606,8 +2606,8 @@ function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1 | ||||
|         ext             v3.16b, v0.16b, v0.16b, #3 | ||||
|         zip1            v0.2d, v0.2d, v1.2d | ||||
|         zip1            v2.2d, v2.2d, v3.2d | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v31.16b | ||||
|         usdot           v17.4s, v2.16b, v31.16b | ||||
|         addp            v16.4s, v16.4s, v17.4s | ||||
| @@ -2633,9 +2633,9 @@ function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1 | ||||
|         zip1            v0.2d, v0.2d, v1.2d | ||||
|         zip1            v2.2d, v2.2d, v3.2d | ||||
|         zip1            v4.2d, v4.2d, v5.2d | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v31.16b | ||||
|         usdot           v17.4s, v2.16b, v31.16b | ||||
|         usdot           v18.4s, v4.16b, v31.16b | ||||
| @@ -2668,10 +2668,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1 | ||||
|         zip1            v2.2d, v2.2d, v3.2d | ||||
|         zip1            v4.2d, v4.2d, v5.2d | ||||
|         zip1            v6.2d, v6.2d, v7.2d | ||||
|         movi            v16.2d, #0 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v19.2d, #0 | ||||
|         movi            v16.16b, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         movi            v19.16b, #0 | ||||
|         usdot           v16.4s, v0.16b, v31.16b | ||||
|         usdot           v17.4s, v2.16b, v31.16b | ||||
|         usdot           v18.4s, v4.16b, v31.16b | ||||
| @@ -2688,10 +2688,10 @@ function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1 | ||||
| endfunc | ||||
|  | ||||
| .macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3 | ||||
|         movi            \d0\().2d, #0 | ||||
|         movi            \d1\().2d, #0 | ||||
|         movi            \d2\().2d, #0 | ||||
|         movi            \d3\().2d, #0 | ||||
|         movi            \d0\().16b, #0 | ||||
|         movi            \d1\().16b, #0 | ||||
|         movi            \d2\().16b, #0 | ||||
|         movi            \d3\().16b, #0 | ||||
|         usdot           \d0\().4s, \s0\().16b, v31.16b | ||||
|         usdot           \d1\().4s, \s1\().16b, v31.16b | ||||
|         usdot           \d2\().4s, \s2\().16b, v31.16b | ||||
| @@ -2716,8 +2716,8 @@ function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1 | ||||
|         QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23 | ||||
|         addp            v20.4s, v20.4s, v22.4s | ||||
|         addp            v21.4s, v21.4s, v23.4s | ||||
|         movi            v24.2d, #0 | ||||
|         movi            v25.2d, #0 | ||||
|         movi            v24.16b, #0 | ||||
|         movi            v25.16b, #0 | ||||
|         usdot           v24.4s, v18.16b, v31.16b | ||||
|         usdot           v25.4s, v19.16b, v31.16b | ||||
|         addp            v24.4s, v24.4s, v25.4s | ||||
|   | ||||
| @@ -56,7 +56,7 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1 | ||||
| .ifc \type, fixed | ||||
|         ld1r            {v16.2s}, [x2]          // dither_state | ||||
|         sxtl            v16.2d, v16.2s | ||||
|         movi            v29.2d, #0 | ||||
|         movi            v29.16b, #0 | ||||
|         movi            v30.2d, #(1<<OUT_SHIFT)-1 | ||||
|         trn1            v31.2d, v29.2d, v30.2d | ||||
|         trn2            v30.2d, v30.2d, v29.2d | ||||
| @@ -74,9 +74,9 @@ function ff_mpadsp_apply_window_\type\()_neon, export=1 | ||||
|         add             x11, x6, #(32)<<2      // w  + 32 | ||||
|         add             x12, x7, #(32)<<2      // w2 + 32 | ||||
|         mov             x15, #8 | ||||
|         movi            v17.2d, #0 | ||||
|         movi            v18.2d, #0 | ||||
|         movi            v19.2d, #0 | ||||
|         movi            v17.16b, #0 | ||||
|         movi            v18.16b, #0 | ||||
|         movi            v19.16b, #0 | ||||
| 2: | ||||
|         subs            x15, x15, #1 | ||||
|         ld1             {v0.4s},  [x8],  x9 | ||||
|   | ||||
| @@ -50,10 +50,10 @@ function ff_hscale8to15_X8_neon, export=1 | ||||
|         add             x12, x16, x7                // filter1 = filter0 + filterSize*2 | ||||
|         add             x13, x12, x7                // filter2 = filter1 + filterSize*2 | ||||
|         add             x4, x13, x7                 // filter3 = filter2 + filterSize*2 | ||||
|         movi            v0.2d, #0                   // val sum part 1 (for dst[0]) | ||||
|         movi            v1.2d, #0                   // val sum part 2 (for dst[1]) | ||||
|         movi            v2.2d, #0                   // val sum part 3 (for dst[2]) | ||||
|         movi            v3.2d, #0                   // val sum part 4 (for dst[3]) | ||||
|         movi            v0.16b, #0                  // val sum part 1 (for dst[0]) | ||||
|         movi            v1.16b, #0                  // val sum part 2 (for dst[1]) | ||||
|         movi            v2.16b, #0                  // val sum part 3 (for dst[2]) | ||||
|         movi            v3.16b, #0                  // val sum part 4 (for dst[3]) | ||||
|         add             x17, x3, w8, uxtw           // srcp + filterPos[0] | ||||
|         add             x8,  x3, w0, uxtw           // srcp + filterPos[1] | ||||
|         add             x0, x3, w11, uxtw           // srcp + filterPos[2] | ||||
| @@ -108,10 +108,10 @@ function ff_hscale8to15_X4_neon, export=1 | ||||
|         ldp             w8, w9,  [x5]               // filterPos[idx + 0], [idx + 1] | ||||
|         ldp             w10, w11, [x5, #8]          // filterPos[idx + 2], [idx + 3] | ||||
|  | ||||
|         movi            v16.2d, #0                  // initialize accumulator for idx + 0 | ||||
|         movi            v17.2d, #0                  // initialize accumulator for idx + 1 | ||||
|         movi            v18.2d, #0                  // initialize accumulator for idx + 2 | ||||
|         movi            v19.2d, #0                  // initialize accumulator for idx + 3 | ||||
|         movi            v16.16b, #0                 // initialize accumulator for idx + 0 | ||||
|         movi            v17.16b, #0                 // initialize accumulator for idx + 1 | ||||
|         movi            v18.16b, #0                 // initialize accumulator for idx + 2 | ||||
|         movi            v19.16b, #0                 // initialize accumulator for idx + 3 | ||||
|  | ||||
|         mov             x12, x4                     // filter pointer for idx + 0 | ||||
|         add             x13, x4, x7                 // filter pointer for idx + 1 | ||||
| @@ -253,8 +253,8 @@ function ff_hscale8to15_4_neon, export=1 | ||||
|         ldp             w12, w13, [x5, #16]         // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration | ||||
|         ldp             w14, w15, [x5, #24]         // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration | ||||
|  | ||||
|         movi            v0.2d, #0                   // Clear madd accumulator for idx 0..3 | ||||
|         movi            v5.2d, #0                   // Clear madd accumulator for idx 4..7 | ||||
|         movi            v0.16b, #0                  // Clear madd accumulator for idx 0..3 | ||||
|         movi            v5.16b, #0                  // Clear madd accumulator for idx 4..7 | ||||
|  | ||||
|         ld4             {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 | ||||
|  | ||||
| @@ -299,8 +299,8 @@ function ff_hscale8to15_4_neon, export=1 | ||||
|         ld4             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] | ||||
|         ld4             {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 | ||||
|  | ||||
|         movi            v0.2d, #0                   // Clear madd accumulator for idx 0..3 | ||||
|         movi            v5.2d, #0                   // Clear madd accumulator for idx 4..7 | ||||
|         movi            v0.16b, #0                  // Clear madd accumulator for idx 0..3 | ||||
|         movi            v5.16b, #0                  // Clear madd accumulator for idx 4..7 | ||||
|  | ||||
|         uxtl            v16.8h, v16.8b              // unsigned extend long, covert src data to 16-bit | ||||
|         uxtl            v17.8h, v17.8b              // unsigned extend long, covert src data to 16-bit | ||||
| @@ -499,10 +499,10 @@ function ff_hscale8to19_X8_neon, export=1 | ||||
|         ldr             w11, [x5], #4               // filterPos[idx + 2] | ||||
|         add             x4, x13, x7                 // filter3 = filter2 + filterSize*2 | ||||
|         ldr             w9, [x5], #4                // filterPos[idx + 3] | ||||
|         movi            v0.2d, #0                   // val sum part 1 (for dst[0]) | ||||
|         movi            v1.2d, #0                   // val sum part 2 (for dst[1]) | ||||
|         movi            v2.2d, #0                   // val sum part 3 (for dst[2]) | ||||
|         movi            v3.2d, #0                   // val sum part 4 (for dst[3]) | ||||
|         movi            v0.16b, #0                  // val sum part 1 (for dst[0]) | ||||
|         movi            v1.16b, #0                  // val sum part 2 (for dst[1]) | ||||
|         movi            v2.16b, #0                  // val sum part 3 (for dst[2]) | ||||
|         movi            v3.16b, #0                  // val sum part 4 (for dst[3]) | ||||
|         add             x17, x3, w8, uxtw           // srcp + filterPos[0] | ||||
|         add             x8,  x3, w0, uxtw           // srcp + filterPos[1] | ||||
|         add             x0, x3, w11, uxtw           // srcp + filterPos[2] | ||||
| @@ -560,10 +560,10 @@ function ff_hscale8to19_X4_neon, export=1 | ||||
|         ldp             w8, w9, [x5] | ||||
|         ldp             w10, w11, [x5, #8] | ||||
|  | ||||
|         movi            v16.2d, #0                  // initialize accumulator for idx + 0 | ||||
|         movi            v17.2d, #0                  // initialize accumulator for idx + 1 | ||||
|         movi            v18.2d, #0                  // initialize accumulator for idx + 2 | ||||
|         movi            v19.2d, #0                  // initialize accumulator for idx + 3 | ||||
|         movi            v16.16b, #0                 // initialize accumulator for idx + 0 | ||||
|         movi            v17.16b, #0                 // initialize accumulator for idx + 1 | ||||
|         movi            v18.16b, #0                 // initialize accumulator for idx + 2 | ||||
|         movi            v19.16b, #0                 // initialize accumulator for idx + 3 | ||||
|  | ||||
|         mov             x12, x4                     // filter + 0 | ||||
|         add             x13, x4, x7                 // filter + 1 | ||||
| @@ -865,10 +865,10 @@ function ff_hscale16to15_X8_neon_asm, export=1 | ||||
|         add             x12, x16, x7                // filter1 = filter0 + filterSize*2 | ||||
|         add             x13, x12, x7                // filter2 = filter1 + filterSize*2 | ||||
|         add             x4, x13, x7                 // filter3 = filter2 + filterSize*2 | ||||
|         movi            v0.2d, #0                   // val sum part 1 (for dst[0]) | ||||
|         movi            v1.2d, #0                   // val sum part 2 (for dst[1]) | ||||
|         movi            v2.2d, #0                   // val sum part 3 (for dst[2]) | ||||
|         movi            v3.2d, #0                   // val sum part 4 (for dst[3]) | ||||
|         movi            v0.16b, #0                  // val sum part 1 (for dst[0]) | ||||
|         movi            v1.16b, #0                  // val sum part 2 (for dst[1]) | ||||
|         movi            v2.16b, #0                  // val sum part 3 (for dst[2]) | ||||
|         movi            v3.16b, #0                  // val sum part 4 (for dst[3]) | ||||
|         add             x17, x3, w8, uxtw           // srcp + filterPos[0] | ||||
|         add             x8,  x3, w10, uxtw          // srcp + filterPos[1] | ||||
|         add             x10, x3, w11, uxtw          // srcp + filterPos[2] | ||||
| @@ -945,10 +945,10 @@ function ff_hscale16to15_X4_neon_asm, export=1 | ||||
|         ldp             w8, w9, [x5] | ||||
|         ldp             w10, w11, [x5, #8] | ||||
|  | ||||
|         movi            v16.2d, #0                  // initialize accumulator for idx + 0 | ||||
|         movi            v17.2d, #0                  // initialize accumulator for idx + 1 | ||||
|         movi            v18.2d, #0                  // initialize accumulator for idx + 2 | ||||
|         movi            v19.2d, #0                  // initialize accumulator for idx + 3 | ||||
|         movi            v16.16b, #0                 // initialize accumulator for idx + 0 | ||||
|         movi            v17.16b, #0                 // initialize accumulator for idx + 1 | ||||
|         movi            v18.16b, #0                 // initialize accumulator for idx + 2 | ||||
|         movi            v19.16b, #0                 // initialize accumulator for idx + 3 | ||||
|  | ||||
|         mov             x12, x4                     // filter + 0 | ||||
|         add             x13, x4, x7                 // filter + 1 | ||||
| @@ -1270,10 +1270,10 @@ function ff_hscale16to19_X8_neon_asm, export=1 | ||||
|         add             x13, x12, x7                // filter2 = filter1 + filterSize*2 | ||||
|         lsl             w10, w10, #1 | ||||
|         add             x4, x13, x7                 // filter3 = filter2 + filterSize*2 | ||||
|         movi            v0.2d, #0                   // val sum part 1 (for dst[0]) | ||||
|         movi            v1.2d, #0                   // val sum part 2 (for dst[1]) | ||||
|         movi            v2.2d, #0                   // val sum part 3 (for dst[2]) | ||||
|         movi            v3.2d, #0                   // val sum part 4 (for dst[3]) | ||||
|         movi            v0.16b, #0                  // val sum part 1 (for dst[0]) | ||||
|         movi            v1.16b, #0                  // val sum part 2 (for dst[1]) | ||||
|         movi            v2.16b, #0                  // val sum part 3 (for dst[2]) | ||||
|         movi            v3.16b, #0                  // val sum part 4 (for dst[3]) | ||||
|         add             x17, x3, w8, uxtw           // srcp + filterPos[0] | ||||
|         add             x8,  x3, w10, uxtw          // srcp + filterPos[1] | ||||
|         add             x10, x3, w11, uxtw          // srcp + filterPos[2] | ||||
| @@ -1348,10 +1348,10 @@ function ff_hscale16to19_X4_neon_asm, export=1 | ||||
|         ldp             w8, w9, [x5] | ||||
|         ldp             w10, w11, [x5, #8] | ||||
|  | ||||
|         movi            v16.2d, #0                  // initialize accumulator for idx + 0 | ||||
|         movi            v17.2d, #0                  // initialize accumulator for idx + 1 | ||||
|         movi            v18.2d, #0                  // initialize accumulator for idx + 2 | ||||
|         movi            v19.2d, #0                  // initialize accumulator for idx + 3 | ||||
|         movi            v16.16b, #0                 // initialize accumulator for idx + 0 | ||||
|         movi            v17.16b, #0                 // initialize accumulator for idx + 1 | ||||
|         movi            v18.16b, #0                 // initialize accumulator for idx + 2 | ||||
|         movi            v19.16b, #0                 // initialize accumulator for idx + 3 | ||||
|  | ||||
|         mov             x12, x4                     // filter + 0 | ||||
|         add             x13, x4, x7                 // filter + 1 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user