You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	vf_ssim: x86 simd for ssim_4x4xN and ssim_endN.
Both are 2-2.5x faster than their C counterpart. Reviewed-by: Paul B Mahol <onemda@gmail.com> Reviewed-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
		
				
					committed by
					
						 Michael Niedermayer
						Michael Niedermayer
					
				
			
			
				
	
			
			
			
						parent
						
							39a04ebcaf
						
					
				
				
					commit
					dfc58584b4
				
			
							
								
								
									
										36
									
								
								libavfilter/ssim.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								libavfilter/ssim.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | ||||
| /* | ||||
|  * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com> | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #ifndef LIBAVFILTER_SSIM_H | ||||
| #define LIBAVFILTER_SSIM_H | ||||
|  | ||||
| #include <stddef.h> | ||||
| #include <stdint.h> | ||||
|  | ||||
| typedef struct SSIMDSPContext { /* table of the per-line SSIM kernels; entries are chosen at runtime */ | ||||
|     void (*ssim_4x4_line)(const uint8_t *buf, ptrdiff_t buf_stride, /* first image: top-left pixel and row stride in bytes */ | ||||
|                           const uint8_t *ref, ptrdiff_t ref_stride, /* second image: top-left pixel and row stride in bytes */ | ||||
|                           int (*sums)[4], int w); /* writes one int[4] of sums (s1, s2, ss, s12) per 4x4 block; w counts 4x4 blocks */ | ||||
|     float (*ssim_end_line)(const int (*sum0)[4], const int (*sum1)[4], int w); /* returns the sum of w SSIM values built from two rows of block sums */ | ||||
| } SSIMDSPContext; | ||||
|  | ||||
| void ff_ssim_init_x86(SSIMDSPContext *dsp); /* overrides entries of dsp with x86 SIMD kernels when the CPU flags allow it */ | ||||
|  | ||||
| #endif /* LIBAVFILTER_SSIM_H */ | ||||
| @@ -42,6 +42,7 @@ | ||||
| #include "drawutils.h" | ||||
| #include "formats.h" | ||||
| #include "internal.h" | ||||
| #include "ssim.h" | ||||
| #include "video.h" | ||||
|  | ||||
| typedef struct SSIMContext { | ||||
| @@ -59,6 +60,7 @@ typedef struct SSIMContext { | ||||
|     int planeheight[4]; | ||||
|     int *temp; | ||||
|     int is_rgb; | ||||
|     SSIMDSPContext dsp; | ||||
| } SSIMContext; | ||||
|  | ||||
| #define OFFSET(x) offsetof(SSIMContext, x) | ||||
| @@ -85,8 +87,8 @@ static void set_meta(AVDictionary **metadata, const char *key, char comp, float | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void ssim_4x4xn(const uint8_t *main, int main_stride, | ||||
|                        const uint8_t *ref, int ref_stride, | ||||
| static void ssim_4x4xn(const uint8_t *main, ptrdiff_t main_stride, | ||||
|                        const uint8_t *ref, ptrdiff_t ref_stride, | ||||
|                        int (*sums)[4], int width) | ||||
| { | ||||
|     int x, y, z; | ||||
| @@ -132,7 +134,7 @@ static float ssim_end1(int s1, int s2, int ss, int s12) | ||||
|          / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2)); | ||||
| } | ||||
|  | ||||
| static float ssim_endn(int (*sum0)[4], int (*sum1)[4], int width) | ||||
| static float ssim_endn(const int (*sum0)[4], const int (*sum1)[4], int width) | ||||
| { | ||||
|     float ssim = 0.0; | ||||
|     int i; | ||||
| @@ -145,7 +147,8 @@ static float ssim_endn(int (*sum0)[4], int (*sum1)[4], int width) | ||||
|     return ssim; | ||||
| } | ||||
|  | ||||
| static float ssim_plane(uint8_t *main, int main_stride, | ||||
| static float ssim_plane(SSIMDSPContext *dsp, | ||||
|                         uint8_t *main, int main_stride, | ||||
|                         uint8_t *ref, int ref_stride, | ||||
|                         int width, int height, void *temp) | ||||
| { | ||||
| @@ -160,12 +163,12 @@ static float ssim_plane(uint8_t *main, int main_stride, | ||||
|     for (y = 1; y < height; y++) { | ||||
|         for (; z <= y; z++) { | ||||
|             FFSWAP(void*, sum0, sum1); | ||||
|             ssim_4x4xn(&main[4 * z * main_stride], main_stride, | ||||
|                        &ref[4 * z * ref_stride], ref_stride, | ||||
|                        sum0, width); | ||||
|             dsp->ssim_4x4_line(&main[4 * z * main_stride], main_stride, | ||||
|                                &ref[4 * z * ref_stride], ref_stride, | ||||
|                                sum0, width); | ||||
|         } | ||||
|  | ||||
|         ssim += ssim_endn(sum0, sum1, width - 1); | ||||
|         ssim += dsp->ssim_end_line(sum0, sum1, width - 1); | ||||
|     } | ||||
|  | ||||
|     return ssim / ((height - 1) * (width - 1)); | ||||
| @@ -187,7 +190,7 @@ static AVFrame *do_ssim(AVFilterContext *ctx, AVFrame *main, | ||||
|     s->nb_frames++; | ||||
|  | ||||
|     for (i = 0; i < s->nb_components; i++) { | ||||
|         c[i] = ssim_plane(main->data[i], main->linesize[i], | ||||
|         c[i] = ssim_plane(&s->dsp, main->data[i], main->linesize[i], | ||||
|                           ref->data[i], ref->linesize[i], | ||||
|                           s->planewidth[i], s->planeheight[i], s->temp); | ||||
|         ssimv += s->coefs[i] * c[i]; | ||||
| @@ -294,6 +297,11 @@ static int config_input_ref(AVFilterLink *inlink) | ||||
|     if (!s->temp) | ||||
|         return AVERROR(ENOMEM); | ||||
|  | ||||
|     s->dsp.ssim_4x4_line = ssim_4x4xn; | ||||
|     s->dsp.ssim_end_line = ssim_endn; | ||||
|     if (ARCH_X86) | ||||
|         ff_ssim_init_x86(&s->dsp); | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -8,6 +8,7 @@ OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o | ||||
| OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o | ||||
| OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o | ||||
| OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o | ||||
| OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o | ||||
| OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o | ||||
| OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o | ||||
| OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o | ||||
| @@ -19,6 +20,7 @@ YASM-OBJS-$(CONFIG_IDET_FILTER)              += x86/vf_idet.o | ||||
| YASM-OBJS-$(CONFIG_INTERLACE_FILTER)         += x86/vf_interlace.o | ||||
| YASM-OBJS-$(CONFIG_PP7_FILTER)               += x86/vf_pp7.o | ||||
| YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o | ||||
| YASM-OBJS-$(CONFIG_SSIM_FILTER)              += x86/vf_ssim.o | ||||
| YASM-OBJS-$(CONFIG_TINTERLACE_FILTER)        += x86/vf_interlace.o | ||||
| YASM-OBJS-$(CONFIG_VOLUME_FILTER)            += x86/af_volume.o | ||||
| YASM-OBJS-$(CONFIG_YADIF_FILTER)             += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o | ||||
|   | ||||
							
								
								
									
										191
									
								
								libavfilter/x86/vf_ssim.asm
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										191
									
								
								libavfilter/x86/vf_ssim.asm
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,191 @@ | ||||
| ;***************************************************************************** | ||||
| ;* x86-optimized functions for ssim filter | ||||
| ;* | ||||
| ;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com> | ||||
| ;* | ||||
| ;* This file is part of FFmpeg. | ||||
| ;* | ||||
| ;* FFmpeg is free software; you can redistribute it and/or | ||||
| ;* modify it under the terms of the GNU Lesser General Public | ||||
| ;* License as published by the Free Software Foundation; either | ||||
| ;* version 2.1 of the License, or (at your option) any later version. | ||||
| ;* | ||||
| ;* FFmpeg is distributed in the hope that it will be useful, | ||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
| ;* Lesser General Public License for more details. | ||||
| ;* | ||||
| ;* You should have received a copy of the GNU Lesser General Public | ||||
| ;* License along with FFmpeg; if not, write to the Free Software | ||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
| ;****************************************************************************** | ||||
|  | ||||
| %include "libavutil/x86/x86util.asm" | ||||
|  | ||||
| SECTION_RODATA | ||||
|  | ||||
| pw_1: times 8 dw 1 | ||||
| ssim_c1: times 4 dd 416 ;(.01*.01*255*255*64 + .5) | ||||
| ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5) | ||||
|  | ||||
| SECTION .text | ||||
|  | ||||
| %if ARCH_X86_64 | ||||
|  | ||||
| INIT_XMM ssse3 | ||||
| cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3 ; per 4x4 block of buf/ref: store s1, s2, ss, s12 into sums | ||||
|     lea     ref_stride3q, [ref_strideq*3] ; offset of the 4th ref row | ||||
|     lea     buf_stride3q, [buf_strideq*3] ; offset of the 4th buf row | ||||
|     pxor              m7, m7 ; zero, used to widen bytes to words | ||||
|     mova             m15, [pw_1] ; words of 1: pmaddwd with it adds adjacent word pairs | ||||
|  | ||||
| .loop: ; each iteration covers 8 pixels x 4 rows, i.e. two 4x4 blocks | ||||
|     movh              m0, [bufq+buf_strideq*0]  ; a1 (buf row 1, 8 pixels) | ||||
|     movh              m1, [refq+ref_strideq*0]  ; b1 (ref row 1, 8 pixels) | ||||
|     movh              m2, [bufq+buf_strideq*1]  ; a2 | ||||
|     movh              m3, [refq+ref_strideq*1]  ; b2 | ||||
|     punpcklbw         m0, m7                    ; s1 [word] | ||||
|     punpcklbw         m1, m7                    ; s2 [word] | ||||
|     punpcklbw         m2, m7                    ; s1 [word] | ||||
|     punpcklbw         m3, m7                    ; s2 [word] | ||||
|     pmaddwd           m4, m0, m0                ; a1 * a1 | ||||
|     pmaddwd           m5, m1, m1                ; b1 * b1 | ||||
|     pmaddwd           m8, m2, m2                ; a2 * a2 | ||||
|     pmaddwd           m9, m3, m3                ; b2 * b2 | ||||
|     paddd             m4, m5                    ; ss | ||||
|     paddd             m8, m9                    ; ss | ||||
|     pmaddwd           m6, m0, m1                ; a1 * b1 = s12 | ||||
|     pmaddwd           m5, m2, m3                ; a2 * b2 = s12 | ||||
|     paddw             m0, m2                    ; s1 = a1 + a2 | ||||
|     paddw             m1, m3                    ; s2 = b1 + b2 | ||||
|     paddd             m6, m5                    ; s12 | ||||
|     paddd             m4, m8                    ; ss | ||||
|  | ||||
|     movh              m2, [bufq+buf_strideq*2]  ; a3 | ||||
|     movh              m3, [refq+ref_strideq*2]  ; b3 | ||||
|     movh              m5, [bufq+buf_stride3q]   ; a4 | ||||
|     movh              m8, [refq+ref_stride3q]   ; b4 | ||||
|     punpcklbw         m2, m7                    ; s1 [word] | ||||
|     punpcklbw         m3, m7                    ; s2 [word] | ||||
|     punpcklbw         m5, m7                    ; s1 [word] | ||||
|     punpcklbw         m8, m7                    ; s2 [word] | ||||
|     pmaddwd           m9, m2, m2                ; a3 * a3 | ||||
|     pmaddwd          m10, m3, m3                ; b3 * b3 | ||||
|     pmaddwd          m12, m5, m5                ; a4 * a4 | ||||
|     pmaddwd          m13, m8, m8                ; b4 * b4 | ||||
|     pmaddwd          m11, m2, m3                ; a3 * b3 = s12 | ||||
|     pmaddwd          m14, m5, m8                ; a4 * b4 = s12 | ||||
|     paddd             m9, m10                   ; ss of row 3 | ||||
|     paddd            m12, m13                   ; ss of row 4 | ||||
|     paddw             m0, m2                    ; s1 += a3 | ||||
|     paddw             m1, m3                    ; s2 += b3 | ||||
|     paddw             m0, m5                    ; s1 += a4 | ||||
|     paddw             m1, m8                    ; s2 += b4 | ||||
|     paddd             m6, m11                   ; s12 += row 3 | ||||
|     paddd             m4, m9                    ; ss += row 3 | ||||
|     paddd             m6, m14                   ; s12 += row 4 | ||||
|     paddd             m4, m12                   ; ss += row 4 | ||||
|  | ||||
|     ; from here on a/b name the first/second 4x4 block, listed from low to high lane | ||||
|     ; m0 = [word] s1 a,a,a,a,b,b,b,b | ||||
|     ; m1 = [word] s2 a,a,a,a,b,b,b,b | ||||
|     ; m4 = [dword] ss a,a,b,b | ||||
|     ; m6 = [dword] s12 a,a,b,b | ||||
|  | ||||
|     pmaddwd           m0, m15                   ; [dword] s1 a,a,b,b | ||||
|     pmaddwd           m1, m15                   ; [dword] s2 a,a,b,b | ||||
|     phaddd            m0, m4                    ; [dword] s1 a, b, ss a, b | ||||
|     phaddd            m1, m6                    ; [dword] s2 a, b, s12 a, b | ||||
|     punpckhdq     m2, m0, m1                    ; [dword] ss a, s12 a, ss b, s12 b | ||||
|     punpckldq         m0, m1                    ; [dword] s1 a, s2 a, s1 b, s2 b | ||||
|     punpckhqdq    m1, m0, m2                    ; [dword] b s1, s2, ss, s12 (high halves = second block) | ||||
|     punpcklqdq        m0, m2                    ; [dword] a s1, s2, ss, s12 (low halves = first block) | ||||
|  | ||||
|     mova  [sumsq+     0], m0 ; sums entry of the first block | ||||
|     mova  [sumsq+mmsize], m1 ; sums entry of the second block | ||||
|  | ||||
|     add             bufq, mmsize/2 ; advance 8 pixels | ||||
|     add             refq, mmsize/2 ; advance 8 pixels | ||||
|     add            sumsq, mmsize*2 ; advance two sums entries | ||||
|     sub               wd, mmsize/8 ; two blocks done; an odd w still writes a full pair | ||||
|     jg .loop | ||||
|     RET | ||||
|  | ||||
| %endif | ||||
|  | ||||
| INIT_XMM sse4 | ||||
| cglobal ssim_end_line, 3, 3, 6, sum0, sum1, w ; returns the sum of w SSIM values computed from two rows of 4x4-block sums | ||||
|     pxor              m0, m0 ; ssim accumulator, four float lanes | ||||
| .loop: ; each iteration produces four SSIM values | ||||
|     mova              m1, [sum0q+mmsize*0] ; sum0 entries 0..3, each (s1, s2, ss, s12) | ||||
|     mova              m2, [sum0q+mmsize*1] | ||||
|     mova              m3, [sum0q+mmsize*2] | ||||
|     mova              m4, [sum0q+mmsize*3] | ||||
|     paddd             m1, [sum1q+mmsize*0] ; add the matching entries of the other row | ||||
|     paddd             m2, [sum1q+mmsize*1] | ||||
|     paddd             m3, [sum1q+mmsize*2] | ||||
|     paddd             m4, [sum1q+mmsize*3] | ||||
|     paddd             m1, m2 ; entries 0+1 | ||||
|     paddd             m2, m3 ; entries 1+2 | ||||
|     paddd             m3, m4 ; entries 2+3 | ||||
|     paddd             m4, [sum0q+mmsize*4] ; entries 3+4: reads one entry past the four outputs | ||||
|     paddd             m4, [sum1q+mmsize*4] | ||||
|     TRANSPOSE4x4D      1, 2, 3, 4, 5 ; regroup so each register holds one statistic for the four outputs | ||||
|  | ||||
|     ; m1 = fs1, m2 = fs2, m3 = fss, m4 = fs12 | ||||
|     pslld             m3, 6 ; fss * 64 | ||||
|     pslld             m4, 6 ; fs12 * 64 | ||||
|     pmulld            m5, m1, m2                ; fs1 * fs2 | ||||
|     pmulld            m1, m1                    ; fs1 * fs1 | ||||
|     pmulld            m2, m2                    ; fs2 * fs2 | ||||
|     psubd             m3, m1                    ; fss * 64 - fs1 * fs1 | ||||
|     psubd             m4, m5                    ; covariance | ||||
|     psubd             m3, m2                    ; variance | ||||
|  | ||||
|     ; m1 = fs1 * fs1, m2 = fs2 * fs2, m3 = variance, m4 = covariance, m5 = fs1 * fs2 | ||||
|     paddd             m4, m4                    ; 2 * covariance | ||||
|     paddd             m5, m5                    ; 2 * fs1 * fs2 | ||||
|     paddd             m1, m2                    ; fs1 * fs1 + fs2 * fs2 | ||||
|     paddd             m3, [ssim_c2]             ; variance + ssim_c2 | ||||
|     paddd             m4, [ssim_c2]             ; 2 * covariance + ssim_c2 | ||||
|     paddd             m5, [ssim_c1]             ; 2 * fs1 * fs2 + ssim_c1 | ||||
|     paddd             m1, [ssim_c1]             ; fs1 * fs1 + fs2 * fs2 + ssim_c1 | ||||
|  | ||||
|     ; convert to float | ||||
|     cvtdq2ps          m3, m3 | ||||
|     cvtdq2ps          m4, m4 | ||||
|     cvtdq2ps          m5, m5 | ||||
|     cvtdq2ps          m1, m1 | ||||
|     mulps             m4, m5 ; numerator | ||||
|     mulps             m3, m1 ; denominator | ||||
|     divps             m4, m3                    ; per-lane SSIM (ssim_end1 in the C code) | ||||
|     addps             m0, m4                    ; ssim | ||||
|     add            sum0q, mmsize*4 ; advance four entries | ||||
|     add            sum1q, mmsize*4 | ||||
|     sub               wd, 4 | ||||
|     jg .loop | ||||
|  | ||||
|     ; the last iteration always added four lanes; subtract the lanes beyond w again | ||||
|     test              wd, wd ; zero: w was a multiple of 4, nothing to undo | ||||
|     jz .end | ||||
|     add               wd, 4 ; wd = number of valid lanes in the last iteration (1..3) | ||||
|     test              wd, 2 | ||||
|     jz .skip2 | ||||
|     psrldq            m4, 8 ; drop two valid lanes | ||||
| .skip2: | ||||
|     test              wd, 1 | ||||
|     jz .skip1 | ||||
|     psrldq            m4, 4 ; drop one valid lane | ||||
| .skip1: | ||||
|     subps             m0, m4 ; m4 now holds only the excess lanes (zero-filled above them) | ||||
|  | ||||
| .end: ; horizontal sum of the four accumulator lanes into lane 0 | ||||
|     movhlps           m4, m0 ; m4 low half = lanes 2, 3 | ||||
|     addps             m0, m4 ; lanes 0+2 and 1+3 | ||||
|     movss             m4, m0 ; keep lane 0 | ||||
|     shufps            m0, m0, 1 ; move lane 1 into lane 0 | ||||
|     addss             m0, m4 ; total in lane 0 | ||||
| %if ARCH_X86_32 | ||||
|     movss            r0m, m0 ; store the result to memory (r0m: presumably the first argument's stack slot - confirm in x86inc) | ||||
|     fld             r0mp ; load it onto the x87 stack, presumably where 32-bit callers expect a float return | ||||
| %endif | ||||
|     RET | ||||
							
								
								
									
										38
									
								
								libavfilter/x86/vf_ssim_init.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								libavfilter/x86/vf_ssim_init.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| /* | ||||
|  * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com> | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include "libavutil/x86/cpu.h" | ||||
|  | ||||
| #include "libavfilter/ssim.h" | ||||
|  | ||||
| void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride, /* presumably the asm "cglobal ssim_4x4_line" built under INIT_XMM ssse3 */ | ||||
|                             const uint8_t *ref, ptrdiff_t ref_stride, /* same signature as SSIMDSPContext.ssim_4x4_line */ | ||||
|                             int (*sums)[4], int w); | ||||
| float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w); /* presumably the asm "cglobal ssim_end_line" built under INIT_XMM sse4 */ | ||||
|  | ||||
| void ff_ssim_init_x86(SSIMDSPContext *dsp) | ||||
| { | ||||
|     /* CPU capability mask; decides which SIMD kernels may replace the current entries of dsp. */ | ||||
|     const int flags = av_get_cpu_flags(); | ||||
|  | ||||
|     /* The SSSE3 4x4 line kernel is only used on x86-64 builds. */ | ||||
|     if (ARCH_X86_64 && EXTERNAL_SSSE3(flags)) { | ||||
|         dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3; | ||||
|     } | ||||
|  | ||||
|     /* The SSE4 end-of-line kernel has no architecture-width restriction here. */ | ||||
|     if (EXTERNAL_SSE4(flags)) { | ||||
|         dsp->ssim_end_line = ff_ssim_end_line_sse4; | ||||
|     } | ||||
| } | ||||
		Reference in New Issue
	
	Block a user