2015-10-02 17:22:42 +02:00
|
|
|
# x86 object lists, first group. Each assignment appends one object path to
# the variable whose name is "OBJS-" followed by the expansion of the filter's
# CONFIG_* variable.
# NOTE(review): presumably CONFIG_* expands to "yes" for enabled filters and
# only that list is built; the CONFIG_* values and the consumer of these lists
# are defined outside this fragment -- confirm.
# Most entries name an x86/*_init.o object; vf_eq.o and vf_noise.o do not.
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
|
2016-03-13 11:06:21 +02:00
|
|
|
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
|
2016-04-06 20:09:08 +02:00
|
|
|
OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
|
2015-01-19 00:26:48 +02:00
|
|
|
# No "_init" suffix, and no YASM-OBJS- entry for CONFIG_EQ_FILTER appears in
# this fragment.
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
|
2014-12-26 20:37:54 +02:00
|
|
|
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
|
2013-10-22 03:37:46 +03:00
|
|
|
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o
|
2013-01-22 03:39:37 +03:00
|
|
|
OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
|
2014-09-03 12:02:32 +03:00
|
|
|
OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
|
2014-11-11 20:43:42 +02:00
|
|
|
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o
|
2015-09-30 23:00:14 +02:00
|
|
|
OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
|
2014-10-17 04:24:42 +03:00
|
|
|
# No "_init" suffix, and no YASM-OBJS- entry for CONFIG_NOISE_FILTER appears
# in this fragment.
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
|
2015-01-09 21:51:13 +02:00
|
|
|
OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
|
2015-07-12 12:44:39 +02:00
|
|
|
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
|
2013-07-08 15:42:53 +03:00
|
|
|
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
|
avfilter/vf_removegrain: add x86 and x86_64 SSE2 functions
Speed of all modes increased by a factor between 7.4 and 19.8 largely depending
on whether bytes are unpacked into words. Modes 2, 3, and 4 have been sped up
by a factor of 43 (thanks quick sort!)
All modes are available on x86_64 but only modes 1, 10, 11, 12, 13, 14, 19, 20,
21, and 22 are available on x86 due to the number of SIMD registers used.
With a contribution from James Almer <jamrial@gmail.com>
2015-07-15 01:48:47 +02:00
|
|
|
# x86 object lists, second group. Each assignment appends one object path to
# the variable named "OBJS-" plus the expansion of the filter's CONFIG_*
# variable.
# The removegrain init object is appended unconditionally here, whereas the
# x86/vf_removegrain.o entry later in this fragment sits inside
# "ifdef CONFIG_GPL".
OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
|
2016-06-04 09:33:05 +02:00
|
|
|
OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o
|
2013-05-11 13:03:38 +03:00
|
|
|
# No "_init" suffix, and no YASM-OBJS- entry for CONFIG_SPP_FILTER appears in
# this fragment.
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
|
2015-07-13 01:33:06 +02:00
|
|
|
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
|
2015-10-04 11:34:03 +02:00
|
|
|
OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
|
2015-10-02 17:22:42 +02:00
|
|
|
# tblend appends the same object as CONFIG_BLEND_FILTER (x86/vf_blend_init.o).
OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o
|
2014-11-15 04:49:37 +02:00
|
|
|
OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o
|
2012-09-23 21:49:26 +03:00
|
|
|
OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
|
2015-10-07 21:03:16 +02:00
|
|
|
OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o
|
2014-01-04 15:49:38 +03:00
|
|
|
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
|
2012-08-29 20:37:14 +03:00
|
|
|
|
2015-10-02 17:22:42 +02:00
|
|
|
# Second family of object lists, first group. Each assignment appends one
# object path to the variable named "YASM-OBJS-" plus the expansion of the
# filter's CONFIG_* variable.
# NOTE(review): presumably these objects are assembled from x86 assembly
# sources by rules defined elsewhere -- confirm against the build rules.
# Object names here are the matching OBJS- names without the "_init" suffix.
YASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
|
2016-03-13 11:06:21 +02:00
|
|
|
YASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
|
2016-04-06 20:09:08 +02:00
|
|
|
YASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
|
2014-12-26 20:37:54 +02:00
|
|
|
YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
|
2013-10-22 03:37:46 +03:00
|
|
|
YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
|
2013-02-01 15:14:31 +03:00
|
|
|
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
|
2014-09-03 12:02:32 +03:00
|
|
|
YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
|
2014-11-11 20:43:42 +02:00
|
|
|
YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
|
2015-09-30 23:00:14 +02:00
|
|
|
YASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
|
2015-01-09 21:51:13 +02:00
|
|
|
YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
|
2015-07-12 12:44:39 +02:00
|
|
|
YASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
|
2013-07-08 15:42:53 +03:00
|
|
|
YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
|
avfilter/vf_removegrain: add x86 and x86_64 SSE2 functions
Speed of all modes increased by a factor between 7.4 and 19.8 largely depending
on whether bytes are unpacked into words. Modes 2, 3, and 4 have been sped up
by a factor of 43 (thanks quick sort!)
All modes are available on x86_64 but only modes 1, 10, 11, 12, 13, 14, 19, 20,
21, and 22 are available on x86 due to the number of SIMD registers used.
With a contribution from James Almer <jamrial@gmail.com>
2015-07-15 01:48:47 +02:00
|
|
|
# x86/vf_removegrain.o is appended only when CONFIG_GPL has a non-empty value
# (ifdef tests the variable without expanding it). The corresponding
# OBJS-$(CONFIG_REMOVEGRAIN_FILTER) entry for x86/vf_removegrain_init.o is not
# wrapped in this guard.
# NOTE(review): presumably the init source handles the non-GPL case itself --
# confirm.
ifdef CONFIG_GPL
|
|
|
|
YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o
|
|
|
|
endif
|
2016-06-04 09:33:05 +02:00
|
|
|
# Second group of YASM-OBJS- appends. Each assignment appends one object path
# to the variable named "YASM-OBJS-" plus the expansion of the filter's
# CONFIG_* variable.
YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
|
2015-07-13 01:33:06 +02:00
|
|
|
YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
|
2015-10-04 11:34:03 +02:00
|
|
|
YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o
|
2015-10-02 17:22:42 +02:00
|
|
|
# tblend appends the same object as CONFIG_BLEND_FILTER (x86/vf_blend.o).
YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
|
2014-11-15 04:49:37 +02:00
|
|
|
# tinterlace appends x86/vf_interlace.o, the same object as
# CONFIG_INTERLACE_FILTER, even though its OBJS- entry is
# x86/vf_tinterlace_init.o; there is no x86/vf_tinterlace.o in this fragment.
YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
|
2012-09-23 21:49:26 +03:00
|
|
|
YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
|
2015-10-07 21:03:16 +02:00
|
|
|
YASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o
|
yadif: x86 assembly for 9 to 14-bit samples
These smaller samples do not need to be unpacked to double words
allowing the code to process more pixels every iteration (still 2 in MMX
but 6 in SSE2). It also avoids emulating the missing double word
instructions on older instruction sets.
As with the previous code for 16-bit samples, this has been tested on
an Athlon64 and a Core2Quad.
Athlon64:
1809275 decicycles in C, 32718 runs, 50 skips
911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster
495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster
Core2Quad:
921363 decicycles in C, 32756 runs, 12 skips
486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster
293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster
284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2013-03-16 23:42:24 +03:00
|
|
|
# yadif is the only entry in this fragment that appends several objects in a
# single assignment: x86/vf_yadif.o, x86/yadif-16.o and x86/yadif-10.o.
# NOTE(review): presumably yadif-16.o and yadif-10.o are the 16-bit and
# 9-to-14-bit sample variants -- confirm.
YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
|