You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2026-05-04 21:08:03 +02:00
avcodec/x86/qpeldsp: Add SSE2 vertical lowpass functions
Benchmarks ([4], [8] and [12] are pure vertical functions and therefore show the biggest improvements): avg_qpel_pixels_tab[0][4]_c: 844.5 ( 1.00x) avg_qpel_pixels_tab[0][4]_mmxext: 225.5 ( 3.74x) avg_qpel_pixels_tab[0][4]_sse2: 146.6 ( 5.76x) avg_qpel_pixels_tab[0][5]_c: 1915.9 ( 1.00x) avg_qpel_pixels_tab[0][5]_mmxext: 499.6 ( 3.83x) avg_qpel_pixels_tab[0][5]_sse2: 405.5 ( 4.72x) avg_qpel_pixels_tab[0][6]_c: 1775.9 ( 1.00x) avg_qpel_pixels_tab[0][6]_mmxext: 484.9 ( 3.66x) avg_qpel_pixels_tab[0][6]_sse2: 385.4 ( 4.61x) avg_qpel_pixels_tab[0][7]_c: 1937.0 ( 1.00x) avg_qpel_pixels_tab[0][7]_mmxext: 501.3 ( 3.86x) avg_qpel_pixels_tab[0][7]_sse2: 403.6 ( 4.80x) avg_qpel_pixels_tab[0][8]_c: 976.7 ( 1.00x) avg_qpel_pixels_tab[0][8]_mmxext: 216.9 ( 4.50x) avg_qpel_pixels_tab[0][8]_sse2: 113.1 ( 8.64x) avg_qpel_pixels_tab[0][9]_c: 1971.8 ( 1.00x) avg_qpel_pixels_tab[0][9]_mmxext: 494.9 ( 3.98x) avg_qpel_pixels_tab[0][9]_sse2: 388.3 ( 5.08x) avg_qpel_pixels_tab[0][10]_c: 1900.8 ( 1.00x) avg_qpel_pixels_tab[0][10]_mmxext: 476.4 ( 3.99x) avg_qpel_pixels_tab[0][10]_sse2: 362.4 ( 5.24x) avg_qpel_pixels_tab[0][11]_c: 2003.3 ( 1.00x) avg_qpel_pixels_tab[0][11]_mmxext: 496.5 ( 4.04x) avg_qpel_pixels_tab[0][11]_sse2: 385.9 ( 5.19x) avg_qpel_pixels_tab[0][12]_c: 841.8 ( 1.00x) avg_qpel_pixels_tab[0][12]_mmxext: 226.7 ( 3.71x) avg_qpel_pixels_tab[0][12]_sse2: 143.3 ( 5.87x) avg_qpel_pixels_tab[0][13]_c: 1929.0 ( 1.00x) avg_qpel_pixels_tab[0][13]_mmxext: 499.6 ( 3.86x) avg_qpel_pixels_tab[0][13]_sse2: 412.1 ( 4.68x) avg_qpel_pixels_tab[0][14]_c: 1777.9 ( 1.00x) avg_qpel_pixels_tab[0][14]_mmxext: 484.8 ( 3.67x) avg_qpel_pixels_tab[0][14]_sse2: 385.9 ( 4.61x) avg_qpel_pixels_tab[0][15]_c: 1914.8 ( 1.00x) avg_qpel_pixels_tab[0][15]_mmxext: 501.8 ( 3.82x) avg_qpel_pixels_tab[0][15]_sse2: 405.0 ( 4.73x) avg_qpel_pixels_tab[1][4]_c: 203.4 ( 1.00x) avg_qpel_pixels_tab[1][4]_mmxext: 64.7 ( 3.14x) avg_qpel_pixels_tab[1][4]_sse2: 40.3 ( 5.05x) avg_qpel_pixels_tab[1][5]_c: 488.8 ( 1.00x) 
avg_qpel_pixels_tab[1][5]_mmxext: 134.6 ( 3.63x) avg_qpel_pixels_tab[1][5]_sse2: 108.5 ( 4.50x) avg_qpel_pixels_tab[1][6]_c: 448.2 ( 1.00x) avg_qpel_pixels_tab[1][6]_mmxext: 128.8 ( 3.48x) avg_qpel_pixels_tab[1][6]_sse2: 102.5 ( 4.37x) avg_qpel_pixels_tab[1][7]_c: 489.6 ( 1.00x) avg_qpel_pixels_tab[1][7]_mmxext: 134.5 ( 3.64x) avg_qpel_pixels_tab[1][7]_sse2: 108.8 ( 4.50x) avg_qpel_pixels_tab[1][8]_c: 223.8 ( 1.00x) avg_qpel_pixels_tab[1][8]_mmxext: 57.5 ( 3.89x) avg_qpel_pixels_tab[1][8]_sse2: 36.3 ( 6.16x) avg_qpel_pixels_tab[1][9]_c: 496.6 ( 1.00x) avg_qpel_pixels_tab[1][9]_mmxext: 129.8 ( 3.82x) avg_qpel_pixels_tab[1][9]_sse2: 105.1 ( 4.72x) avg_qpel_pixels_tab[1][10]_c: 466.1 ( 1.00x) avg_qpel_pixels_tab[1][10]_mmxext: 123.2 ( 3.78x) avg_qpel_pixels_tab[1][10]_sse2: 99.1 ( 4.70x) avg_qpel_pixels_tab[1][11]_c: 497.9 ( 1.00x) avg_qpel_pixels_tab[1][11]_mmxext: 129.9 ( 3.83x) avg_qpel_pixels_tab[1][11]_sse2: 105.4 ( 4.72x) avg_qpel_pixels_tab[1][12]_c: 203.5 ( 1.00x) avg_qpel_pixels_tab[1][12]_mmxext: 63.8 ( 3.19x) avg_qpel_pixels_tab[1][12]_sse2: 38.8 ( 5.25x) avg_qpel_pixels_tab[1][13]_c: 487.9 ( 1.00x) avg_qpel_pixels_tab[1][13]_mmxext: 134.7 ( 3.62x) avg_qpel_pixels_tab[1][13]_sse2: 108.4 ( 4.50x) avg_qpel_pixels_tab[1][14]_c: 447.4 ( 1.00x) avg_qpel_pixels_tab[1][14]_mmxext: 128.2 ( 3.49x) avg_qpel_pixels_tab[1][14]_sse2: 102.4 ( 4.37x) avg_qpel_pixels_tab[1][15]_c: 487.5 ( 1.00x) avg_qpel_pixels_tab[1][15]_mmxext: 134.0 ( 3.64x) avg_qpel_pixels_tab[1][15]_sse2: 109.9 ( 4.44x) put_no_rnd_qpel_pixels_tab[0][4]_c: 825.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][4]_mmxext: 242.5 ( 3.40x) put_no_rnd_qpel_pixels_tab[0][4]_sse2: 136.0 ( 6.07x) put_no_rnd_qpel_pixels_tab[0][5]_c: 1837.4 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][5]_mmxext: 542.5 ( 3.39x) put_no_rnd_qpel_pixels_tab[0][5]_sse2: 446.5 ( 4.11x) put_no_rnd_qpel_pixels_tab[0][6]_c: 1766.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][6]_mmxext: 493.6 ( 3.58x) put_no_rnd_qpel_pixels_tab[0][6]_sse2: 394.6 ( 4.48x) 
put_no_rnd_qpel_pixels_tab[0][7]_c: 1877.4 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][7]_mmxext: 541.9 ( 3.46x) put_no_rnd_qpel_pixels_tab[0][7]_sse2: 447.6 ( 4.19x) put_no_rnd_qpel_pixels_tab[0][8]_c: 785.1 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][8]_mmxext: 206.2 ( 3.81x) put_no_rnd_qpel_pixels_tab[0][8]_sse2: 101.6 ( 7.73x) put_no_rnd_qpel_pixels_tab[0][9]_c: 1772.2 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][9]_mmxext: 489.5 ( 3.62x) put_no_rnd_qpel_pixels_tab[0][9]_sse2: 394.8 ( 4.49x) put_no_rnd_qpel_pixels_tab[0][10]_c: 1711.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][10]_mmxext: 461.2 ( 3.71x) put_no_rnd_qpel_pixels_tab[0][10]_sse2: 357.9 ( 4.78x) put_no_rnd_qpel_pixels_tab[0][11]_c: 1815.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][11]_mmxext: 490.8 ( 3.70x) put_no_rnd_qpel_pixels_tab[0][11]_sse2: 394.0 ( 4.61x) put_no_rnd_qpel_pixels_tab[0][12]_c: 824.8 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][12]_mmxext: 242.9 ( 3.40x) put_no_rnd_qpel_pixels_tab[0][12]_sse2: 135.3 ( 6.10x) put_no_rnd_qpel_pixels_tab[0][13]_c: 1843.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][13]_mmxext: 545.4 ( 3.38x) put_no_rnd_qpel_pixels_tab[0][13]_sse2: 444.9 ( 4.14x) put_no_rnd_qpel_pixels_tab[0][14]_c: 1758.1 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][14]_mmxext: 497.7 ( 3.53x) put_no_rnd_qpel_pixels_tab[0][14]_sse2: 393.5 ( 4.47x) put_no_rnd_qpel_pixels_tab[0][15]_c: 1861.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][15]_mmxext: 545.0 ( 3.42x) put_no_rnd_qpel_pixels_tab[0][15]_sse2: 445.7 ( 4.18x) put_no_rnd_qpel_pixels_tab[1][4]_c: 198.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][4]_mmxext: 64.3 ( 3.08x) put_no_rnd_qpel_pixels_tab[1][4]_sse2: 39.8 ( 4.98x) put_no_rnd_qpel_pixels_tab[1][5]_c: 460.7 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][5]_mmxext: 137.2 ( 3.36x) put_no_rnd_qpel_pixels_tab[1][5]_sse2: 113.5 ( 4.06x) put_no_rnd_qpel_pixels_tab[1][6]_c: 441.4 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][6]_mmxext: 126.7 ( 3.49x) put_no_rnd_qpel_pixels_tab[1][6]_sse2: 103.7 ( 4.26x) put_no_rnd_qpel_pixels_tab[1][7]_c: 465.9 ( 
1.00x) put_no_rnd_qpel_pixels_tab[1][7]_mmxext: 137.7 ( 3.38x) put_no_rnd_qpel_pixels_tab[1][7]_sse2: 114.0 ( 4.09x) put_no_rnd_qpel_pixels_tab[1][8]_c: 193.8 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][8]_mmxext: 52.1 ( 3.72x) put_no_rnd_qpel_pixels_tab[1][8]_sse2: 27.8 ( 6.97x) put_no_rnd_qpel_pixels_tab[1][9]_c: 450.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][9]_mmxext: 126.2 ( 3.57x) put_no_rnd_qpel_pixels_tab[1][9]_sse2: 104.3 ( 4.32x) put_no_rnd_qpel_pixels_tab[1][10]_c: 436.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][10]_mmxext: 118.1 ( 3.69x) put_no_rnd_qpel_pixels_tab[1][10]_sse2: 92.4 ( 4.73x) put_no_rnd_qpel_pixels_tab[1][11]_c: 453.6 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][11]_mmxext: 128.7 ( 3.52x) put_no_rnd_qpel_pixels_tab[1][11]_sse2: 103.6 ( 4.38x) put_no_rnd_qpel_pixels_tab[1][12]_c: 201.2 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][12]_mmxext: 64.2 ( 3.13x) put_no_rnd_qpel_pixels_tab[1][12]_sse2: 39.6 ( 5.08x) put_no_rnd_qpel_pixels_tab[1][13]_c: 461.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][13]_mmxext: 137.6 ( 3.36x) put_no_rnd_qpel_pixels_tab[1][13]_sse2: 113.4 ( 4.07x) put_no_rnd_qpel_pixels_tab[1][14]_c: 442.6 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][14]_mmxext: 127.0 ( 3.49x) put_no_rnd_qpel_pixels_tab[1][14]_sse2: 102.2 ( 4.33x) put_no_rnd_qpel_pixels_tab[1][15]_c: 462.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][15]_mmxext: 139.5 ( 3.32x) put_no_rnd_qpel_pixels_tab[1][15]_sse2: 113.3 ( 4.09x) put_qpel_pixels_tab[0][4]_c: 824.6 ( 1.00x) put_qpel_pixels_tab[0][4]_mmxext: 220.1 ( 3.75x) put_qpel_pixels_tab[0][4]_sse2: 137.8 ( 5.98x) put_qpel_pixels_tab[0][5]_c: 1892.0 ( 1.00x) put_qpel_pixels_tab[0][5]_mmxext: 508.0 ( 3.72x) put_qpel_pixels_tab[0][5]_sse2: 408.6 ( 4.63x) put_qpel_pixels_tab[0][6]_c: 1758.0 ( 1.00x) put_qpel_pixels_tab[0][6]_mmxext: 476.7 ( 3.69x) put_qpel_pixels_tab[0][6]_sse2: 381.4 ( 4.61x) put_qpel_pixels_tab[0][7]_c: 1924.3 ( 1.00x) put_qpel_pixels_tab[0][7]_mmxext: 495.1 ( 3.89x) put_qpel_pixels_tab[0][7]_sse2: 417.2 ( 4.61x) 
put_qpel_pixels_tab[0][8]_c: 772.1 ( 1.00x) put_qpel_pixels_tab[0][8]_mmxext: 197.5 ( 3.91x) put_qpel_pixels_tab[0][8]_sse2: 118.4 ( 6.52x) put_qpel_pixels_tab[0][9]_c: 1778.2 ( 1.00x) put_qpel_pixels_tab[0][9]_mmxext: 476.7 ( 3.73x) put_qpel_pixels_tab[0][9]_sse2: 379.6 ( 4.68x) put_qpel_pixels_tab[0][10]_c: 1714.6 ( 1.00x) put_qpel_pixels_tab[0][10]_mmxext: 460.7 ( 3.72x) put_qpel_pixels_tab[0][10]_sse2: 386.8 ( 4.43x) put_qpel_pixels_tab[0][11]_c: 1819.1 ( 1.00x) put_qpel_pixels_tab[0][11]_mmxext: 474.9 ( 3.83x) put_qpel_pixels_tab[0][11]_sse2: 404.5 ( 4.50x) put_qpel_pixels_tab[0][12]_c: 829.7 ( 1.00x) put_qpel_pixels_tab[0][12]_mmxext: 221.5 ( 3.75x) put_qpel_pixels_tab[0][12]_sse2: 138.7 ( 5.98x) put_qpel_pixels_tab[0][13]_c: 1892.8 ( 1.00x) put_qpel_pixels_tab[0][13]_mmxext: 494.4 ( 3.83x) put_qpel_pixels_tab[0][13]_sse2: 413.9 ( 4.57x) put_qpel_pixels_tab[0][14]_c: 1763.1 ( 1.00x) put_qpel_pixels_tab[0][14]_mmxext: 473.4 ( 3.72x) put_qpel_pixels_tab[0][14]_sse2: 377.8 ( 4.67x) put_qpel_pixels_tab[0][15]_c: 1896.4 ( 1.00x) put_qpel_pixels_tab[0][15]_mmxext: 492.5 ( 3.85x) put_qpel_pixels_tab[0][15]_sse2: 399.0 ( 4.75x) put_qpel_pixels_tab[1][4]_c: 198.6 ( 1.00x) put_qpel_pixels_tab[1][4]_mmxext: 60.9 ( 3.26x) put_qpel_pixels_tab[1][4]_sse2: 40.1 ( 4.95x) put_qpel_pixels_tab[1][5]_c: 471.4 ( 1.00x) put_qpel_pixels_tab[1][5]_mmxext: 131.8 ( 3.58x) put_qpel_pixels_tab[1][5]_sse2: 107.2 ( 4.40x) put_qpel_pixels_tab[1][6]_c: 440.3 ( 1.00x) put_qpel_pixels_tab[1][6]_mmxext: 126.3 ( 3.49x) put_qpel_pixels_tab[1][6]_sse2: 100.6 ( 4.38x) put_qpel_pixels_tab[1][7]_c: 469.2 ( 1.00x) put_qpel_pixels_tab[1][7]_mmxext: 131.7 ( 3.56x) put_qpel_pixels_tab[1][7]_sse2: 106.9 ( 4.39x) put_qpel_pixels_tab[1][8]_c: 194.2 ( 1.00x) put_qpel_pixels_tab[1][8]_mmxext: 52.9 ( 3.67x) put_qpel_pixels_tab[1][8]_sse2: 28.0 ( 6.95x) put_qpel_pixels_tab[1][9]_c: 464.6 ( 1.00x) put_qpel_pixels_tab[1][9]_mmxext: 125.1 ( 3.71x) put_qpel_pixels_tab[1][9]_sse2: 100.9 ( 4.60x) 
put_qpel_pixels_tab[1][10]_c: 433.8 ( 1.00x) put_qpel_pixels_tab[1][10]_mmxext: 118.2 ( 3.67x) put_qpel_pixels_tab[1][10]_sse2: 94.5 ( 4.59x) put_qpel_pixels_tab[1][11]_c: 463.9 ( 1.00x) put_qpel_pixels_tab[1][11]_mmxext: 125.5 ( 3.70x) put_qpel_pixels_tab[1][11]_sse2: 102.6 ( 4.52x) put_qpel_pixels_tab[1][12]_c: 199.2 ( 1.00x) put_qpel_pixels_tab[1][12]_mmxext: 63.7 ( 3.12x) put_qpel_pixels_tab[1][12]_sse2: 36.2 ( 5.50x) put_qpel_pixels_tab[1][13]_c: 475.6 ( 1.00x) put_qpel_pixels_tab[1][13]_mmxext: 139.5 ( 3.41x) put_qpel_pixels_tab[1][13]_sse2: 107.3 ( 4.43x) put_qpel_pixels_tab[1][14]_c: 441.9 ( 1.00x) put_qpel_pixels_tab[1][14]_mmxext: 126.9 ( 3.48x) put_qpel_pixels_tab[1][14]_sse2: 101.3 ( 4.36x) put_qpel_pixels_tab[1][15]_c: 475.9 ( 1.00x) put_qpel_pixels_tab[1][15]_mmxext: 131.9 ( 3.61x) put_qpel_pixels_tab[1][15]_sse2: 107.0 ( 4.45x)

The new functions (in qpeldsp.asm) occupy 8244 bytes (the MMXEXT functions which they will replace occupy only 6720 bytes).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
+92
-58
@@ -26,9 +26,9 @@
|
||||
SECTION_RODATA
|
||||
|
||||
cextern pw_3
|
||||
pw_15: times 4 dw 15
|
||||
pw_15: times 8 dw 15
|
||||
cextern pw_16
|
||||
pw_20: times 4 dw 20
|
||||
pw_20: times 8 dw 20
|
||||
|
||||
|
||||
SECTION .text
|
||||
@@ -396,68 +396,75 @@ MPEG4_QPEL8_H_LOWPASS put_no_rnd
|
||||
paddw m5, m4
|
||||
psraw m5, 5
|
||||
packuswb m5, m5
|
||||
OP_MOV %5, m5, m7
|
||||
OP_MOV %5, m5, m4
|
||||
SWAP 0,1,2,3
|
||||
%endmacro
|
||||
|
||||
%macro MPEG4_QPEL16_V_LOWPASS 1
|
||||
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
|
||||
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 7, 544
|
||||
mov r4d, 17
|
||||
mov r5, rsp
|
||||
pxor m7, m7
|
||||
pxor m4, m4
|
||||
.looph:
|
||||
mova m0, [r1]
|
||||
mova m1, [r1]
|
||||
movu m0, [r1]
|
||||
mova m1, m0
|
||||
%if mmsize == 8
|
||||
mova m2, [r1+8]
|
||||
mova m3, [r1+8]
|
||||
punpcklbw m0, m7
|
||||
punpckhbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpckhbw m3, m7
|
||||
punpcklbw m0, m4
|
||||
punpckhbw m1, m4
|
||||
punpcklbw m2, m4
|
||||
punpckhbw m3, m4
|
||||
mova [r5], m0
|
||||
mova [r5+0x88], m1
|
||||
mova [r5+0x110], m2
|
||||
mova [r5+0x198], m3
|
||||
add r5, 8
|
||||
%else
|
||||
punpcklbw m0, m4
|
||||
punpckhbw m1, m4
|
||||
mova [r5], m0
|
||||
mova [r5+0x110], m1
|
||||
%endif
|
||||
add r1, r3
|
||||
add r5, mmsize
|
||||
dec r4d
|
||||
jne .looph
|
||||
|
||||
|
||||
mov r4d, 4
|
||||
mov r4d, 16/(mmsize/2)
|
||||
mov r1, r0
|
||||
mov r5, rsp
|
||||
.loopv:
|
||||
mova m0, [r5+ 0x0]
|
||||
mova m1, [r5+ 0x8]
|
||||
mova m2, [r5+0x10]
|
||||
mova m3, [r5+0x18]
|
||||
add r1, 4
|
||||
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
|
||||
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
|
||||
mova m0, [r5+0 * mmsize]
|
||||
mova m1, [r5+1 * mmsize]
|
||||
mova m2, [r5+2 * mmsize]
|
||||
mova m3, [r5+3 * mmsize]
|
||||
add r1, mmsize/2
|
||||
QPEL_V_LOW [r5+2*mmsize], [r5+1*mmsize], [r5+0*mmsize], [r5+4*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+1*mmsize], [r5+0*mmsize], [r5+0*mmsize], [r5+5*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
|
||||
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
|
||||
QPEL_V_LOW [r5+0*mmsize], [r5+0*mmsize], [r5+1*mmsize], [r5+6*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+0*mmsize], [r5+1*mmsize], [r5+2*mmsize], [r5+7*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
|
||||
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
|
||||
QPEL_V_LOW [r5+1*mmsize], [r5+2*mmsize], [r5+3*mmsize], [r5+8*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+2*mmsize], [r5+3*mmsize], [r5+4*mmsize], [r5+9*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
|
||||
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
|
||||
QPEL_V_LOW [r5+3*mmsize], [r5+4*mmsize], [r5+5*mmsize], [r5+10*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+4*mmsize], [r5+5*mmsize], [r5+6*mmsize], [r5+11*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
|
||||
QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
|
||||
QPEL_V_LOW [r5+5*mmsize], [r5+6*mmsize], [r5+7*mmsize], [r5+12*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+6*mmsize], [r5+7*mmsize], [r5+8*mmsize], [r5+13*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
|
||||
QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
|
||||
QPEL_V_LOW [r5+7*mmsize], [r5+8*mmsize], [r5+ 9*mmsize], [r5+14*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+8*mmsize], [r5+9*mmsize], [r5+10*mmsize], [r5+15*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
|
||||
QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
|
||||
QPEL_V_LOW [r5+ 9*mmsize], [r5+10*mmsize], [r5+11*mmsize], [r5+16*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+10*mmsize], [r5+11*mmsize], [r5+12*mmsize], [r5+16*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
|
||||
QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
|
||||
QPEL_V_LOW [r5+11*mmsize], [r5+12*mmsize], [r5+13*mmsize], [r5+15*mmsize], [r0]
|
||||
QPEL_V_LOW [r5+12*mmsize], [r5+13*mmsize], [r5+14*mmsize], [r5+14*mmsize], [r0+r2]
|
||||
|
||||
add r5, 0x88
|
||||
add r5, 17*mmsize
|
||||
mov r0, r1
|
||||
dec r4d
|
||||
jne .loopv
|
||||
@@ -488,47 +495,60 @@ MPEG4_QPEL16_V_LOWPASS put_no_rnd
|
||||
|
||||
|
||||
%macro MPEG4_QPEL8_V_LOWPASS 1
|
||||
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 144
|
||||
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 7, 144
|
||||
mov r4d, 9
|
||||
mov r5, rsp
|
||||
pxor m7, m7
|
||||
pxor m2, m2
|
||||
.looph:
|
||||
mova m0, [r1]
|
||||
mova m1, [r1]
|
||||
punpcklbw m0, m7
|
||||
punpckhbw m1, m7
|
||||
movq m0, [r1]
|
||||
add r1, r3
|
||||
%if mmsize == 8
|
||||
mova m1, m0
|
||||
punpcklbw m0, m2
|
||||
punpckhbw m1, m2
|
||||
mova [r5], m0
|
||||
mova [r5+0x48], m1
|
||||
add r5, 8
|
||||
add r1, r3
|
||||
%else
|
||||
punpcklbw m0, m2
|
||||
mova [r5], m0
|
||||
%endif
|
||||
add r5, mmsize
|
||||
dec r4d
|
||||
jne .looph
|
||||
|
||||
|
||||
%if mmsize == 8
|
||||
mov r4d, 2
|
||||
mov r1, r0
|
||||
mov r5, rsp
|
||||
.loopv:
|
||||
mova m0, [r5+ 0x0]
|
||||
mova m1, [r5+ 0x8]
|
||||
mova m2, [r5+0x10]
|
||||
mova m3, [r5+0x18]
|
||||
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
|
||||
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
|
||||
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
|
||||
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
|
||||
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
|
||||
%define R5 r5
|
||||
%else
|
||||
%define R5 rsp
|
||||
%endif
|
||||
|
||||
mova m0, [R5+0 * mmsize]
|
||||
mova m1, [R5+1 * mmsize]
|
||||
mova m2, [R5+2 * mmsize]
|
||||
mova m3, [R5+3 * mmsize]
|
||||
QPEL_V_LOW [R5+2*mmsize], [R5+1*mmsize], [R5+0*mmsize], [R5+4*mmsize], [r0]
|
||||
QPEL_V_LOW [R5+1*mmsize], [R5+0*mmsize], [R5+0*mmsize], [R5+5*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [R5+0*mmsize], [R5+0*mmsize], [R5+1*mmsize], [R5+6*mmsize], [r0]
|
||||
QPEL_V_LOW [R5+0*mmsize], [R5+1*mmsize], [R5+2*mmsize], [R5+7*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [R5+1*mmsize], [R5+2*mmsize], [R5+3*mmsize], [R5+8*mmsize], [r0]
|
||||
QPEL_V_LOW [R5+2*mmsize], [R5+3*mmsize], [R5+4*mmsize], [R5+8*mmsize], [r0+r2]
|
||||
lea r0, [r0+r2*2]
|
||||
QPEL_V_LOW [R5+3*mmsize], [R5+4*mmsize], [R5+5*mmsize], [R5+7*mmsize], [r0]
|
||||
QPEL_V_LOW [R5+4*mmsize], [R5+5*mmsize], [R5+6*mmsize], [R5+6*mmsize], [r0+r2]
|
||||
|
||||
%if mmsize == 8
|
||||
add r5, 0x48
|
||||
lea r0, [r1+4]
|
||||
dec r4d
|
||||
jne .loopv
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
@@ -542,3 +562,17 @@ MPEG4_QPEL8_V_LOWPASS avg
|
||||
%define PW_ROUND pw_15
|
||||
%define OP_MOV PUT_OPH
|
||||
MPEG4_QPEL8_V_LOWPASS put_no_rnd
|
||||
|
||||
INIT_XMM sse2
|
||||
%define PW_ROUND pw_16
|
||||
%define OP_MOV PUT_OPH
|
||||
MPEG4_QPEL16_V_LOWPASS put
|
||||
MPEG4_QPEL8_V_LOWPASS put
|
||||
%define PW_ROUND pw_16
|
||||
%define OP_MOV AVG_OPH
|
||||
MPEG4_QPEL16_V_LOWPASS avg
|
||||
MPEG4_QPEL8_V_LOWPASS avg
|
||||
%define PW_ROUND pw_15
|
||||
%define OP_MOV PUT_OPH
|
||||
MPEG4_QPEL16_V_LOWPASS put_no_rnd
|
||||
MPEG4_QPEL8_V_LOWPASS put_no_rnd
|
||||
|
||||
@@ -271,6 +271,35 @@ QPEL3(QPEL_H, 16, 17, mmxext, mmxext, mmxext, mmxext)
|
||||
QPEL3(QPEL_V, 16, 17, mmxext, mmxext, mmxext, mmxext)
|
||||
QPEL3(QPEL_HV, 16, 17, mmxext, mmxext, mmxext, mmxext)
|
||||
|
||||
QPEL3(QPEL_V, 8, 9, ssse3, sse2, ssse3, mmxext)
|
||||
QPEL3(QPEL_HV, 8, 9, mmxext, sse2, sse2, mmxext)
|
||||
QPEL3(QPEL_V, 16, 17, ssse3, sse2, ssse3, mmxext)
|
||||
QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, mmxext)
|
||||
|
||||
/*
 * Install a single qpel function pointer.
 *
 * Expands to one assignment (without a trailing semicolon) through a pointer
 * named `c` that must be in scope at the point of use:
 *   - the table written is c->OP##_qpel_pixels_tab;
 *   - its first index is (SIZE == 8), i.e. 1 when SIZE is 8 and 0 otherwise;
 *   - its second index is X+4*Y;
 *   - the value stored is the identifier formed by pasting
 *     PREFIX, OP, "_qpel", SIZE, "_mc", X, Y, "_" and CPU together.
 *
 * X and Y are token-pasted into the function name as well as used in the
 * index expression, so they have to be single preprocessing tokens that can
 * be pasted (every use visible here passes plain digits). PREFIX may be
 * empty.
 */
#define SET_QPEL_FUNC(OP, X, Y, SIZE, CPU, PREFIX) \
|
||||
c->OP ## _qpel_pixels_tab[SIZE == 8][X+4*Y] = PREFIX ## OP ## _qpel ## SIZE ## _mc ## X ## Y ## _ ## CPU
|
||||
|
||||
/*
 * Install the function pointers for position (X, Y) in all three tables by
 * invoking SET_QPEL_FUNC with OP = avg, put and put_no_rnd, in that order.
 *
 * The expansion is three assignments separated by ';' with no trailing
 * semicolon and no do { } while (0) wrapper, so it must be used as a
 * statement inside a braced block, not as the body of an unbraced if/else.
 */
#define SET_QPEL_FUNCS3(X, Y, SIZE, CPU, PREFIX) \
|
||||
SET_QPEL_FUNC(avg, X, Y, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNC(put, X, Y, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNC(put_no_rnd, X, Y, SIZE, CPU, PREFIX)
|
||||
|
||||
/*
 * Install the function pointers for the positions with X = 0 and
 * Y = 1, 2, 3 by invoking SET_QPEL_FUNCS3 once per position. With the
 * X+4*Y indexing of SET_QPEL_FUNC these are table slots 4, 8 and 12.
 *
 * Like SET_QPEL_FUNCS3, this expands to several ';'-separated statements
 * without a trailing semicolon or a do { } while (0) wrapper.
 */
#define SET_V_QPEL_FUNCS(SIZE, CPU, PREFIX) \
|
||||
SET_QPEL_FUNCS3(0, 1, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(0, 2, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(0, 3, SIZE, CPU, PREFIX)
|
||||
|
||||
/*
 * Install the function pointers for the nine positions with X in {1, 2, 3}
 * and Y in {1, 2, 3} by invoking SET_QPEL_FUNCS3 once per position. With the
 * X+4*Y indexing of SET_QPEL_FUNC these are table slots 5, 6, 7, 9, 10, 11,
 * 13, 14 and 15.
 *
 * Like SET_QPEL_FUNCS3, this expands to several ';'-separated statements
 * without a trailing semicolon or a do { } while (0) wrapper.
 */
#define SET_HV_QPEL_FUNCS(SIZE, CPU, PREFIX) \
|
||||
SET_QPEL_FUNCS3(1, 1, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(1, 2, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(1, 3, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(2, 1, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(2, 2, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(2, 3, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(3, 1, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(3, 2, SIZE, CPU, PREFIX); \
|
||||
SET_QPEL_FUNCS3(3, 3, SIZE, CPU, PREFIX)
|
||||
|
||||
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
|
||||
do { \
|
||||
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
|
||||
@@ -313,6 +342,11 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
|
||||
c->put_no_rnd_qpel_pixels_tab[1][0] =
|
||||
c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_sse2;
|
||||
c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2;
|
||||
|
||||
SET_V_QPEL_FUNCS (16, sse2,);
|
||||
SET_HV_QPEL_FUNCS(16, sse2,);
|
||||
SET_V_QPEL_FUNCS (8, sse2,);
|
||||
SET_HV_QPEL_FUNCS(8, sse2,);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user