You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	avfilter/vf_nlmeans: add x86 SIMD
This commit is contained in:
		| @@ -308,9 +308,9 @@ static int config_input(AVFilterLink *inlink) | ||||
|     s->ii = s->ii_orig + s->ii_lz_32 + 1; | ||||
|  | ||||
|     // allocate weighted average for every pixel | ||||
|     s->linesize = inlink->w; | ||||
|     s->total_weight = av_malloc_array(inlink->w, inlink->h * sizeof(*s->total_weight)); | ||||
|     s->sum = av_malloc_array(inlink->w, inlink->h * sizeof(*s->sum)); | ||||
|     s->linesize = inlink->w + 100; | ||||
|     s->total_weight = av_malloc_array(s->linesize, inlink->h * sizeof(*s->total_weight)); | ||||
|     s->sum = av_malloc_array(s->linesize, inlink->h * sizeof(*s->sum)); | ||||
|     if (!s->total_weight || !s->sum) | ||||
|         return AVERROR(ENOMEM); | ||||
|  | ||||
| @@ -519,6 +519,9 @@ void ff_nlmeans_init(NLMeansDSPContext *dsp) | ||||
|  | ||||
|     if (ARCH_AARCH64) | ||||
|         ff_nlmeans_init_aarch64(dsp); | ||||
|  | ||||
|     if (ARCH_X86) | ||||
|         ff_nlmeans_init_x86(dsp); | ||||
| } | ||||
|  | ||||
| static av_cold int init(AVFilterContext *ctx) | ||||
|   | ||||
| @@ -41,5 +41,6 @@ typedef struct NLMeansDSPContext { | ||||
|  | ||||
| void ff_nlmeans_init(NLMeansDSPContext *dsp); | ||||
| void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp); | ||||
| void ff_nlmeans_init_x86(NLMeansDSPContext *dsp); | ||||
|  | ||||
| #endif /* AVFILTER_NLMEANS_H */ | ||||
|   | ||||
| @@ -20,6 +20,7 @@ OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o | ||||
| OBJS-$(CONFIG_LUT3D_FILTER)                  += x86/vf_lut3d_init.o | ||||
| OBJS-$(CONFIG_MASKEDCLAMP_FILTER)            += x86/vf_maskedclamp_init.o | ||||
| OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o | ||||
| OBJS-$(CONFIG_NLMEANS_FILTER)                += x86/vf_nlmeans_init.o | ||||
| OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o | ||||
| OBJS-$(CONFIG_OVERLAY_FILTER)                += x86/vf_overlay_init.o | ||||
| OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o | ||||
| @@ -61,6 +62,7 @@ X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o | ||||
| X86ASM-OBJS-$(CONFIG_LUT3D_FILTER)           += x86/vf_lut3d.o | ||||
| X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER)     += x86/vf_maskedclamp.o | ||||
| X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o | ||||
| X86ASM-OBJS-$(CONFIG_NLMEANS_FILTER)         += x86/vf_nlmeans.o | ||||
| X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o | ||||
| X86ASM-OBJS-$(CONFIG_PP7_FILTER)             += x86/vf_pp7.o | ||||
| X86ASM-OBJS-$(CONFIG_PSNR_FILTER)            += x86/vf_psnr.o | ||||
|   | ||||
							
								
								
									
										97
									
								
								libavfilter/x86/vf_nlmeans.asm
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								libavfilter/x86/vf_nlmeans.asm
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | ||||
| ;***************************************************************************** | ||||
| ;* x86-optimized functions for nlmeans filter | ||||
| ;* | ||||
| ;* This file is part of FFmpeg. | ||||
| ;* | ||||
| ;* FFmpeg is free software; you can redistribute it and/or | ||||
| ;* modify it under the terms of the GNU Lesser General Public | ||||
| ;* License as published by the Free Software Foundation; either | ||||
| ;* version 2.1 of the License, or (at your option) any later version. | ||||
| ;* | ||||
| ;* FFmpeg is distributed in the hope that it will be useful, | ||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
| ;* Lesser General Public License for more details. | ||||
| ;* | ||||
| ;* You should have received a copy of the GNU Lesser General Public | ||||
| ;* License along with FFmpeg; if not, write to the Free Software | ||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
| ;****************************************************************************** | ||||
|  | ||||
|  | ||||
| %include "libavutil/x86/x86util.asm" | ||||
|  | ||||
| %if HAVE_AVX2_EXTERNAL && ARCH_X86_64 | ||||
|  | ||||
| SECTION_RODATA 32 | ||||
|  | ||||
| ; Tail-mask table: 9 rows of 8 dwords each (32 bytes per row). Row k holds k | ||||
| ; zero dwords followed by (8 - k) dwords of -1 (all bits set). | ||||
| ; compute_weights_line ORs row min(endx - x, 8) into its 8 index lanes, so the | ||||
| ; first k lanes (still inside [x, endx)) are left untouched and the remaining | ||||
| ; lanes are forced to 0xFFFFFFFF. Row 8 (all zeros) is the "full vector" case. | ||||
| ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\ | ||||
|                 0, -1, -1, -1, -1, -1, -1, -1,\ | ||||
|                 0,  0, -1, -1, -1, -1, -1, -1,\ | ||||
|                 0,  0,  0, -1, -1, -1, -1, -1,\ | ||||
|                 0,  0,  0,  0, -1, -1, -1, -1,\ | ||||
|                 0,  0,  0,  0,  0, -1, -1, -1,\ | ||||
|                 0,  0,  0,  0,  0,  0, -1, -1,\ | ||||
|                 0,  0,  0,  0,  0,  0,  0, -1,\ | ||||
|                 0,  0,  0,  0,  0,  0,  0,  0 | ||||
|  | ||||
| SECTION .text | ||||
|  | ||||
| ; void ff_compute_weights_line(const uint32_t *const iia, | ||||
| ;                              const uint32_t *const iib, | ||||
| ;                              const uint32_t *const iid, | ||||
| ;                              const uint32_t *const iie, | ||||
| ;                              const uint8_t *const src, | ||||
| ;                              float *total, | ||||
| ;                              float *sum, | ||||
| ;                              const float *const lut, | ||||
| ;                              int max, | ||||
| ;                              int startx, int endx); | ||||
| ; | ||||
| ; Processes 8 consecutive x positions per iteration, starting at startx: | ||||
| ;   idx      = min(iie[x] - iid[x] - iib[x] + iia[x], max)   (unsigned min) | ||||
| ;   w        = lut[idx] | ||||
| ;   total[x] += w | ||||
| ;   sum[x]   += w * src[x] | ||||
| ; In the last (partial) vector, lanes at or beyond endx have idx forced to max | ||||
| ; through ending_lut before the lookup. | ||||
| ; | ||||
| ; NOTE(review): every iteration loads and stores full 8-lane vectors, so up to | ||||
| ; 7 elements past endx are read from iia/iib/iid/iie/src and read-modified- | ||||
| ; written in total/sum. Presumably lut[max] is 0 so those lanes are unchanged, | ||||
| ; and the buffers are padded by the caller -- confirm both. | ||||
| ; NOTE(review): the loop body runs once before endx is tested; with | ||||
| ; startx > endx the ending_lut offset would be negative. Presumably callers | ||||
| ; guarantee startx < endx -- confirm. | ||||
|  | ||||
| INIT_YMM avx2 | ||||
| ; cglobal: 8 arguments loaded into registers (iia .. lut), 13 GPRs and 5 vector | ||||
| ; registers used, no stack space. x, startx, endx, mod and elut name the extra | ||||
| ; GPRs; max/startx/endx are fetched from their argument slots below. | ||||
| cglobal compute_weights_line, 8, 13, 5, 0, iia, iib, iid, iie, src, total, sum, lut, x, startx, endx, mod, elut | ||||
|     ; sign-extend the 32-bit int arguments startx/endx to 64-bit registers | ||||
|     movsxd startxq, dword startxm | ||||
|     movsxd   endxq, dword endxm | ||||
|     ; m2 = argument #8 (max) replicated into every 32-bit lane | ||||
|     VPBROADCASTD      m2, r8m | ||||
|  | ||||
|     ; x = startx (loop counter); startxq is reused as scratch inside the loop | ||||
|     mov      xq, startxq | ||||
|     ; mod = number of 32-bit lanes per vector (mmsize / 4, i.e. 8 for ymm) | ||||
|     mov    modq, mmsize / 4 | ||||
|     ; elut = base address of the tail-mask table | ||||
|     lea   elutq, [ending_lut] | ||||
|  | ||||
|     ; m4 = all ones (a register compared with itself); gather mask template | ||||
|     vpcmpeqd  m4, m4 | ||||
|  | ||||
|     .loop: | ||||
|         ; startxq = min(endx - x, 8) * 32 = byte offset of the ending_lut row | ||||
|         ; for the number of lanes still inside [x, endx) | ||||
|         mov    startxq, endxq | ||||
|         sub    startxq, xq | ||||
|         cmp    startxq, modq | ||||
|         cmovge startxq, modq | ||||
|         sal    startxq, 5 | ||||
|  | ||||
|         ; m0 = iie[x..x+7] - iid[x..x+7] - iib[x..x+7] + iia[x..x+7] | ||||
|         movu   m0, [iieq + xq * 4] | ||||
|  | ||||
|         psubd  m0, [iidq + xq * 4] | ||||
|         psubd  m0, [iibq + xq * 4] | ||||
|         paddd  m0, [iiaq + xq * 4] | ||||
|         ; force lanes past endx to 0xFFFFFFFF, then clamp every lane to max | ||||
|         ; (unsigned), so the forced lanes end up equal to max | ||||
|         por    m0, [elutq + startxq] | ||||
|         pminud m0, m2 | ||||
|         ; index * 4 = byte offset into the float table (gather uses no scale) | ||||
|         pslld  m0, 2 | ||||
|         ; reload the all-ones mask each iteration: the gather instruction | ||||
|         ; clears its mask register as elements complete | ||||
|         mova   m3, m4 | ||||
|         ; m1 = lut[idx] for each of the 8 lanes (the per-pixel weight) | ||||
|         vgatherdps m1, [lutq + m0], m3 | ||||
|  | ||||
|         ; m0 = src[x..x+7] zero-extended from bytes to dwords, then to float | ||||
|         pmovzxbd m0, [srcq + xq] | ||||
|         cvtdq2ps m0, m0 | ||||
|  | ||||
|         ; m0 = weight * pixel | ||||
|         mulps m0, m1 | ||||
|  | ||||
|         ; accumulate: m1 = total + weight, m0 = sum + weight * pixel | ||||
|         addps m1, [totalq + xq * 4] | ||||
|         addps m0, [sumq + xq * 4] | ||||
|  | ||||
|         ; unaligned stores of all 8 lanes (including lanes past endx) | ||||
|         movups [totalq + xq * 4], m1 | ||||
|         movups [sumq + xq * 4], m0 | ||||
|  | ||||
|         ; advance by one vector; continue while x < endx (signed compare) | ||||
|         add xq, mmsize / 4 | ||||
|         cmp xq, endxq | ||||
|         jl .loop | ||||
|     RET | ||||
|  | ||||
| %endif | ||||
							
								
								
									
										40
									
								
								libavfilter/x86/vf_nlmeans_init.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								libavfilter/x86/vf_nlmeans_init.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | ||||
| /* | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "libavutil/attributes.h" | ||||
| #include "libavutil/x86/cpu.h" | ||||
| #include "libavfilter/vf_nlmeans.h" | ||||
|  | ||||
| /** | ||||
|  * AVX2 implementation of the compute_weights_line DSP function | ||||
|  * (assembly in libavfilter/x86/vf_nlmeans.asm). | ||||
|  * | ||||
|  * For x starting at startx, in groups of 8 positions, it computes | ||||
|  *   idx = min(iie[x] - iid[x] - iib[x] + iia[x], max_meaningful_diff) | ||||
|  * (unsigned comparison), looks up w = weight_lut[idx], and accumulates | ||||
|  *   total_weight[x] += w;  sum[x] += w * src[x]; | ||||
|  * | ||||
|  * NOTE(review): the assembly always loads and stores whole 8-element | ||||
|  * vectors, so up to 7 elements past endx are accessed in every array; | ||||
|  * presumably the caller pads its buffers accordingly -- confirm. | ||||
|  * | ||||
|  * @param iia,iib,iid,iie      32-bit inputs combined as e - d - b + a | ||||
|  * @param src                  8-bit source samples | ||||
|  * @param total_weight         per-position weight accumulator (read and written) | ||||
|  * @param sum                  per-position weighted-sample accumulator (read and written) | ||||
|  * @param weight_lut           float table indexed by the clamped difference | ||||
|  * @param max_meaningful_diff  upper clamp applied to the table index | ||||
|  * @param startx               first x position processed | ||||
|  * @param endx                 end of the x range (exclusive) | ||||
|  */ | ||||
| void ff_compute_weights_line_avx2(const uint32_t *const iia, | ||||
|                                   const uint32_t *const iib, | ||||
|                                   const uint32_t *const iid, | ||||
|                                   const uint32_t *const iie, | ||||
|                                   const uint8_t *const src, | ||||
|                                   float *total_weight, | ||||
|                                   float *sum, | ||||
|                                   const float *const weight_lut, | ||||
|                                   int max_meaningful_diff, | ||||
|                                   int startx, int endx); | ||||
|  | ||||
| /** | ||||
|  * Install the x86 implementations into the nlmeans DSP context. | ||||
|  * | ||||
|  * Only compute_weights_line is overridden, and only when ARCH_X86_64 is set | ||||
|  * and the runtime CPU flags satisfy EXTERNAL_AVX2_FAST; otherwise dsp is left | ||||
|  * untouched. | ||||
|  */ | ||||
| av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp) | ||||
| { | ||||
|     const int flags = av_get_cpu_flags(); | ||||
|  | ||||
|     /* Nothing to install unless this is x86-64 with fast AVX2. */ | ||||
|     if (!ARCH_X86_64 || !EXTERNAL_AVX2_FAST(flags)) | ||||
|         return; | ||||
|  | ||||
|     dsp->compute_weights_line = ff_compute_weights_line_avx2; | ||||
| } | ||||
		Reference in New Issue
	
	Block a user