1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-29 05:57:37 +02:00
Files
FFmpeg/libswscale/x86/range_convert.asm

202 lines
5.7 KiB
NASM
Raw Normal View History

swscale/x86: add sse2 and avx2 {lum,chr}ConvertRange chrRangeFromJpeg_8_c: 22.3 chrRangeFromJpeg_8_sse2: 13.3 chrRangeFromJpeg_8_avx2: 13.3 chrRangeFromJpeg_24_c: 72.8 chrRangeFromJpeg_24_sse2: 22.3 chrRangeFromJpeg_24_avx2: 17.5 chrRangeFromJpeg_128_c: 345.5 chrRangeFromJpeg_128_sse2: 106.0 chrRangeFromJpeg_128_avx2: 57.8 chrRangeFromJpeg_144_c: 380.5 chrRangeFromJpeg_144_sse2: 118.5 chrRangeFromJpeg_144_avx2: 62.3 chrRangeFromJpeg_256_c: 646.3 chrRangeFromJpeg_256_sse2: 218.8 chrRangeFromJpeg_256_avx2: 109.0 chrRangeFromJpeg_512_c: 1461.5 chrRangeFromJpeg_512_sse2: 426.5 chrRangeFromJpeg_512_avx2: 211.5 chrRangeToJpeg_8_c: 37.8 chrRangeToJpeg_8_sse2: 10.5 chrRangeToJpeg_8_avx2: 14.0 chrRangeToJpeg_24_c: 114.3 chrRangeToJpeg_24_sse2: 23.5 chrRangeToJpeg_24_avx2: 16.3 chrRangeToJpeg_128_c: 633.5 chrRangeToJpeg_128_sse2: 107.5 chrRangeToJpeg_128_avx2: 55.0 chrRangeToJpeg_144_c: 758.3 chrRangeToJpeg_144_sse2: 132.0 chrRangeToJpeg_144_avx2: 64.5 chrRangeToJpeg_256_c: 1345.0 chrRangeToJpeg_256_sse2: 218.0 chrRangeToJpeg_256_avx2: 105.3 chrRangeToJpeg_512_c: 2524.0 chrRangeToJpeg_512_sse2: 417.0 chrRangeToJpeg_512_avx2: 218.8 lumRangeFromJpeg_8_c: 11.8 lumRangeFromJpeg_8_sse2: 11.0 lumRangeFromJpeg_8_avx2: 10.3 lumRangeFromJpeg_24_c: 38.5 lumRangeFromJpeg_24_sse2: 15.5 lumRangeFromJpeg_24_avx2: 12.5 lumRangeFromJpeg_128_c: 232.3 lumRangeFromJpeg_128_sse2: 60.0 lumRangeFromJpeg_128_avx2: 26.8 lumRangeFromJpeg_144_c: 259.5 lumRangeFromJpeg_144_sse2: 65.3 lumRangeFromJpeg_144_avx2: 29.0 lumRangeFromJpeg_256_c: 464.5 lumRangeFromJpeg_256_sse2: 107.5 lumRangeFromJpeg_256_avx2: 54.0 lumRangeFromJpeg_512_c: 897.5 lumRangeFromJpeg_512_sse2: 224.5 lumRangeFromJpeg_512_avx2: 109.8 lumRangeToJpeg_8_c: 17.8 lumRangeToJpeg_8_sse2: 11.0 lumRangeToJpeg_8_avx2: 11.8 lumRangeToJpeg_24_c: 56.3 lumRangeToJpeg_24_sse2: 11.0 lumRangeToJpeg_24_avx2: 12.5 lumRangeToJpeg_128_c: 333.8 lumRangeToJpeg_128_sse2: 53.3 lumRangeToJpeg_128_avx2: 26.5 lumRangeToJpeg_144_c: 375.5 
lumRangeToJpeg_144_sse2: 60.8 lumRangeToJpeg_144_avx2: 29.0 lumRangeToJpeg_256_c: 652.0 lumRangeToJpeg_256_sse2: 109.5 lumRangeToJpeg_256_avx2: 53.5 lumRangeToJpeg_512_c: 1284.3 lumRangeToJpeg_512_sse2: 218.0 lumRangeToJpeg_512_avx2: 108.3
2024-06-06 18:33:34 +02:00
;******************************************************************************
;* Copyright (c) 2024 Ramiro Polla
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA

; Upper clamp used by the 16-bit "To" variants below: four dwords, each
; holding (1 << 19) - 1. It is loaded with VBROADCASTI128 and applied to the
; converted samples with PMINSD.
pack19: times 4 dd (1 << 19) - 1

SECTION .text
;-----------------------------------------------------------------------------
; lumConvertRange
;
; void ff_lumRangeToJpeg{8,16}_<opt>(int16_t *dst, int width,
;                                    uint32_t coeff, int64_t offset);
; void ff_lumRangeFromJpeg{8,16}_<opt>(int16_t *dst, int width,
;                                      uint32_t coeff, int64_t offset);
;
; Rescales the samples of one plane in place.
;
; Macro parameters:
;   %1  direction, "To" or "From" (only selects the clamp in the 16 variant)
;   %2  variant, 8 or 16
;
; Variant 8  (2-byte elements):
;   dst[i] = saturate_int16((dst[i] * low word of coeff + offset) >> 14)
; Variant 16 (4-byte elements):
;   dst[i] = low dword of ((dst[i] * coeff + offset) >> 18), computed with a
;   64-bit intermediate; the "To" direction additionally clamps the result
;   to (1 << 19) - 1.
;
; Vector register roles:
;   m0, m1  working data
;   m2      coeff broadcast to every dword
;   m3      offset broadcast (dwords for variant 8, qwords for variant 16)
;   m4      zero (variant 8) or the pack19 clamp (variant 16, "To" only)
;
; The loop body runs at least once and always handles whole vectors of
; mmsize bytes, so the last iteration can read and write past width elements.
; NOTE(review): presumably the caller guarantees a padded buffer - confirm.
;-----------------------------------------------------------------------------
%macro LUMCONVERTRANGE 2
cglobal lumRange%1Jpeg%2, 4, 4, 5, dst, width, coeff, offset
    shl          widthd, %2 >> 3            ; elements -> bytes (x2 or x4)
    movd            xm2, coeffd
    VBROADCASTSS     m2, xm2
%if ARCH_X86_64
    movq            xm3, offsetq
%else
    movq            xm3, offsetm            ; 64-bit argument read from memory
%endif
%if %2 == 16
    VBROADCASTSD     m3, xm3                ; offset as 64-bit lanes
%ifidni %1,To
    VBROADCASTI128   m4, [pack19]
%endif
%elif %2 == 8
    VBROADCASTSS     m3, xm3                ; low 32 bits of offset per dword
    pxor             m4, m4
%endif ; %2 == 8/16
    ; Point dst at the end of the data and count a negative byte index up
    ; towards zero, so a single add both advances and sets the loop flags.
    add            dstq, widthq
    neg          widthq
.loop:
    movu             m0, [dstq+widthq]
%if %2 == 16
    pshufd           m1, m0, 0xb1           ; swap dword pairs: odd elements -> even slots
    pmuldq           m0, m2                 ; even elements * coeff (signed, 64-bit result)
    pmuldq           m1, m2                 ; odd elements * coeff
    paddq            m0, m3
    paddq            m1, m3
    psrlq            m0, 18
    psrlq            m1, 18
    pshufd           m0, m0, 0xd8           ; gather the low dword of each qword
    pshufd           m1, m1, 0xd8
    punpckldq        m0, m1                 ; re-interleave even/odd into original order
%ifidni %1,To
    PMINSD           m0, m4, m1             ; clamp to pack19; m1 is free and passed as 3rd operand
%endif
%elif %2 == 8
    ; Pair every 16-bit sample with a zero word so that pmaddwd yields
    ; sample * low word of coeff (the zero cancels the high word of coeff).
    punpckhwd        m1, m0, m4
    punpcklwd        m0, m4
    pmaddwd          m0, m2
    pmaddwd          m1, m2
    paddd            m0, m3
    paddd            m1, m3
    psrad            m0, 14
    psrad            m1, 14
    packssdw         m0, m1                 ; back to 16-bit with signed saturation
%endif ; %2 == 8/16
    movu  [dstq+widthq], m0
    add          widthq, mmsize
    jl .loop                                ; continue while the byte index is negative
    RET
%endmacro
;-----------------------------------------------------------------------------
; chrConvertRange
;
; void ff_chrRangeToJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
;                                    uint32_t coeff, int64_t offset);
; void ff_chrRangeFromJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
;                                      uint32_t coeff, int64_t offset);
;
; Rescales the samples of two planes (dstU and dstV) in place, applying the
; same coeff and offset to both.
;
; Macro parameters:
;   %1  direction, "To" or "From" (only selects the clamp in the 16 variant)
;   %2  variant, 8 or 16
;
; Variant 8  (2-byte elements):
;   dst[i] = saturate_int16((dst[i] * low word of coeff + offset) >> 14)
; Variant 16 (4-byte elements):
;   dst[i] = low dword of ((dst[i] * coeff + offset) >> 18), computed with a
;   64-bit intermediate; the "To" direction additionally clamps the result
;   to (1 << 19) - 1.
;
; Vector register roles:
;   m0, m1  working data for dstU
;   m2, m3  working data for dstV
;   m4      coeff broadcast to every dword
;   m5      offset broadcast (dwords for variant 8, qwords for variant 16)
;   m6      zero (variant 8) or the pack19 clamp (variant 16, "To" only)
;
; The loop body runs at least once and always handles whole vectors of
; mmsize bytes, so the last iteration can read and write past width elements.
; NOTE(review): presumably the caller guarantees padded buffers - confirm.
;-----------------------------------------------------------------------------
%macro CHRCONVERTRANGE 2
cglobal chrRange%1Jpeg%2, 5, 5, 7, dstU, dstV, width, coeff, offset
    shl          widthd, %2 >> 3            ; elements -> bytes (x2 or x4)
    movd            xm4, coeffd
    VBROADCASTSS     m4, xm4
%if ARCH_X86_64
    movq            xm5, offsetq
%else
    movq            xm5, offsetm            ; 64-bit argument read from memory
%endif
%if %2 == 16
    VBROADCASTSD     m5, xm5                ; offset as 64-bit lanes
%ifidni %1,To
    VBROADCASTI128   m6, [pack19]
%endif
%elif %2 == 8
    VBROADCASTSS     m5, xm5                ; low 32 bits of offset per dword
    pxor             m6, m6
%endif ; %2 == 8/16
    ; Point both planes at the end of their data and count a shared negative
    ; byte index up towards zero.
    add           dstUq, widthq
    add           dstVq, widthq
    neg          widthq
.loop:
    movu             m0, [dstUq+widthq]
    movu             m2, [dstVq+widthq]
%if %2 == 16
    pshufd           m1, m0, 0xb1           ; swap dword pairs: odd elements -> even slots
    pshufd           m3, m2, 0xb1
    pmuldq           m0, m4                 ; even elements * coeff (signed, 64-bit result)
    pmuldq           m1, m4                 ; odd elements * coeff
    pmuldq           m2, m4
    pmuldq           m3, m4
    paddq            m0, m5
    paddq            m1, m5
    paddq            m2, m5
    paddq            m3, m5
    psrlq            m0, 18
    psrlq            m1, 18
    psrlq            m2, 18
    psrlq            m3, 18
    pshufd           m0, m0, 0xd8           ; gather the low dword of each qword
    pshufd           m1, m1, 0xd8
    pshufd           m2, m2, 0xd8
    pshufd           m3, m3, 0xd8
    punpckldq        m0, m1                 ; re-interleave even/odd into original order
    punpckldq        m2, m3
%ifidni %1,To
    PMINSD           m0, m6, m1             ; clamp to pack19; m1/m3 are free and
    PMINSD           m2, m6, m3             ; passed as the 3rd operand
%endif
%elif %2 == 8
    ; Pair every 16-bit sample with a zero word so that pmaddwd yields
    ; sample * low word of coeff (the zero cancels the high word of coeff).
    punpckhwd        m1, m0, m6
    punpckhwd        m3, m2, m6
    punpcklwd        m0, m6
    punpcklwd        m2, m6
    pmaddwd          m0, m4
    pmaddwd          m1, m4
    pmaddwd          m2, m4
    pmaddwd          m3, m4
    paddd            m0, m5
    paddd            m1, m5
    paddd            m2, m5
    paddd            m3, m5
    psrad            m0, 14
    psrad            m1, 14
    psrad            m2, 14
    psrad            m3, 14
    packssdw         m0, m1                 ; back to 16-bit with signed saturation
    packssdw         m2, m3
%endif ; %2 == 8/16
    movu [dstUq+widthq], m0
    movu [dstVq+widthq], m2
    add          widthq, mmsize
    jl .loop                                ; continue while the byte index is negative
    RET
%endmacro
; Instantiate every direction/variant combination for each instruction set.

; 128-bit, SSE2: variant 8 only.
INIT_XMM sse2
LUMCONVERTRANGE To,    8
CHRCONVERTRANGE To,    8
LUMCONVERTRANGE From,  8
CHRCONVERTRANGE From,  8

; 128-bit, SSE4: variant 16 (its code path uses pmuldq, an SSE4.1 instruction).
INIT_XMM sse4
LUMCONVERTRANGE To,   16
CHRCONVERTRANGE To,   16
LUMCONVERTRANGE From, 16
CHRCONVERTRANGE From, 16

; 256-bit, AVX2: both variants, built only when HAVE_AVX2_EXTERNAL is set.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
LUMCONVERTRANGE To,    8
LUMCONVERTRANGE To,   16
CHRCONVERTRANGE To,    8
CHRCONVERTRANGE To,   16
LUMCONVERTRANGE From,  8
LUMCONVERTRANGE From, 16
CHRCONVERTRANGE From,  8
CHRCONVERTRANGE From, 16
%endif