diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 1c91d62ca8..5b717b72f4 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -31,6 +31,11 @@ OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o +OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \ + arm/hpeldsp_arm.o +ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ + arm/hpeldsp_armv6.o + OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \ arm/rv40dsp_init_arm.o \ @@ -84,6 +89,9 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_neon.o \ NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ arm/synth_filter_neon.o \ +NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \ + arm/hpeldsp_neon.o + NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ diff --git a/libavcodec/arm/dsputil_arm.S b/libavcodec/arm/dsputil_arm.S index 8504032bf4..1692a58543 100644 --- a/libavcodec/arm/dsputil_arm.S +++ b/libavcodec/arm/dsputil_arm.S @@ -26,590 +26,6 @@ #define pld @ #endif -.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 - mov \Rd0, \Rn0, lsr #(\shift * 8) - mov \Rd1, \Rn1, lsr #(\shift * 8) - mov \Rd2, \Rn2, lsr #(\shift * 8) - mov \Rd3, \Rn3, lsr #(\shift * 8) - orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) - orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) - orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) - orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD shift, R0, R1, R2 - mov \R0, \R0, lsr #(\shift * 8) - orr \R0, \R0, \R1, lsl #(32 - \shift * 8) - mov \R1, \R1, lsr #(\shift * 8) - orr \R1, \R1, \R2, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 - mov \Rdst0, \Rsrc0, lsr #(\shift * 8) - mov \Rdst1, \Rsrc1, lsr #(\shift * 8) - orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) - orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) -.endm - -.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - orr \Rn0, \Rn0, \Rm0 - orr \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - sub \Rd0, \Rn0, \Rd0, lsr #1 - sub \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - and \Rn0, \Rn0, \Rm0 - and \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - add \Rd0, \Rn0, \Rd0, lsr #1 - add \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro JMP_ALIGN tmp, reg - ands \tmp, \reg, #3 - bic \reg, \reg, #3 - beq 1f - subs \tmp, \tmp, #1 - beq 2f - subs \tmp, \tmp, #1 - beq 3f - b 4f -.endm - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels16_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11, lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r7} - add r1, r1, r2 - stm r0, {r4-r7} - pld [r1] - subs r3, r3, #1 - add r0, r0, r2 - bne 1b - pop {r4-r11, pc} - .align 5 -2: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 
- stm r0, {r9-r12} - add r0, r0, r2 - bne 2b - pop {r4-r11, pc} - .align 5 -3: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 3b - pop {r4-r11, pc} - .align 5 -4: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 4b - pop {r4-r11,pc} -endfunc - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r5,lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 - subs r3, r3, #1 - pld [r1] - stm r0, {r4-r5} - add r0, r0, r2 - bne 1b - pop {r4-r5,pc} - .align 5 -2: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 1, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r5,pc} - .align 5 -3: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 2, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r5,pc} - .align 5 -4: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 3, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 4b - pop {r4-r5,pc} -endfunc - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - 
ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} -endfunc - - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, {r8-r9} - add r0, r0, r2 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, {r8-r9} - add r0, r0, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, 
r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} -endfunc - - .ltorg - -@ ---------------------------------------------------------------- -.macro RND_XY2_IT align, rnd - @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) - @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) -.if \align == 0 - ldm r1, {r6-r8} -.elseif \align == 3 - ldm r1, {r5-r7} -.else - ldm r1, {r8-r10} -.endif - add r1, r1, r2 - pld [r1] -.if \align == 0 - ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 -.elseif \align == 1 - ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 -.elseif \align == 2 - ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 -.elseif \align == 3 - ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 -.endif - ldr r14, =0x03030303 - tst r3, #1 - and r8, r4, r14 - and r9, r5, r14 - and r10, r6, r14 - and r11, r7, r14 - it eq - andeq r14, r14, r14, \rnd #1 - add r8, r8, r10 - add r9, r9, r11 - ldr r12, =0xfcfcfcfc >> 2 - itt eq - addeq r8, r8, r14 - addeq r9, r9, r14 - and r4, r12, r4, lsr #2 - and r5, r12, r5, lsr #2 - and r6, r12, r6, lsr #2 - and r7, r12, r7, lsr #2 - add r10, r4, r6 - add r11, r5, r7 - subs r3, r3, #1 -.endm - -.macro RND_XY2_EXPAND align, rnd - RND_XY2_IT \align, \rnd -6: push {r8-r11} - RND_XY2_IT \align, \rnd - pop {r4-r7} - add r4, r4, r8 - add r5, r5, r9 - ldr r14, =0x0f0f0f0f - add r6, r6, r10 - add r7, r7, r11 - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - add r4, r4, r6 - add r5, r5, r7 - stm r0, {r4-r5} - add r0, r0, r2 - bge 6b - pop {r4-r11,pc} -.endm - - .align 5 -function ff_put_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} @ R14 is also called LR - JMP_ALIGN r5, r1 -1: RND_XY2_EXPAND 0, lsl - .align 5 -2: RND_XY2_EXPAND 1, lsl - .align 5 -3: RND_XY2_EXPAND 2, lsl - .align 5 -4: RND_XY2_EXPAND 3, lsl -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - JMP_ALIGN r5, r1 -1: RND_XY2_EXPAND 0, lsr - .align 5 -2: RND_XY2_EXPAND 1, lsr - .align 5 -3: RND_XY2_EXPAND 2, lsr - .align 5 -4: RND_XY2_EXPAND 3, lsr -endfunc - .align 5 @ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride) function ff_add_pixels_clamped_arm, export=1 diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S index 61535a121f..e667a47f94 100644 --- a/libavcodec/arm/dsputil_armv6.S +++ b/libavcodec/arm/dsputil_armv6.S @@ -20,244 +20,6 @@ #include "libavutil/arm/asm.S" -.macro call_2x_pixels type, subp -function ff_\type\()_pixels16\subp\()_armv6, export=1 - push {r0-r3, lr} 
- bl ff_\type\()_pixels8\subp\()_armv6 - pop {r0-r3, lr} - add r0, r0, #8 - add r1, r1, #8 - b ff_\type\()_pixels8\subp\()_armv6 -endfunc -.endm - -call_2x_pixels avg -call_2x_pixels put, _x2 -call_2x_pixels put, _y2 -call_2x_pixels put, _x2_no_rnd -call_2x_pixels put, _y2_no_rnd - -function ff_put_pixels16_armv6, export=1 - push {r4-r11} -1: - ldr r5, [r1, #4] - ldr r6, [r1, #8] - ldr r7, [r1, #12] - ldr_post r4, r1, r2 - strd r6, r7, [r0, #8] - ldr r9, [r1, #4] - strd_post r4, r5, r0, r2 - ldr r10, [r1, #8] - ldr r11, [r1, #12] - ldr_post r8, r1, r2 - strd r10, r11, [r0, #8] - subs r3, r3, #2 - strd_post r8, r9, r0, r2 - bne 1b - - pop {r4-r11} - bx lr -endfunc - -function ff_put_pixels8_armv6, export=1 - push {r4-r7} -1: - ldr r5, [r1, #4] - ldr_post r4, r1, r2 - ldr r7, [r1, #4] - strd_post r4, r5, r0, r2 - ldr_post r6, r1, r2 - subs r3, r3, #2 - strd_post r6, r7, r0, r2 - bne 1b - - pop {r4-r7} - bx lr -endfunc - -function ff_put_pixels8_x2_armv6, export=1 - push {r4-r11, lr} - mov r12, #1 - orr r12, r12, r12, lsl #8 - orr r12, r12, r12, lsl #16 -1: - ldr r4, [r1] - subs r3, r3, #2 - ldr r5, [r1, #4] - ldr r7, [r1, #5] - lsr r6, r4, #8 - ldr_pre r8, r1, r2 - orr r6, r6, r5, lsl #24 - ldr r9, [r1, #4] - ldr r11, [r1, #5] - lsr r10, r8, #8 - add r1, r1, r2 - orr r10, r10, r9, lsl #24 - eor r14, r4, r6 - uhadd8 r4, r4, r6 - eor r6, r5, r7 - uhadd8 r5, r5, r7 - and r14, r14, r12 - and r6, r6, r12 - uadd8 r4, r4, r14 - eor r14, r8, r10 - uadd8 r5, r5, r6 - eor r6, r9, r11 - uhadd8 r8, r8, r10 - and r14, r14, r12 - uhadd8 r9, r9, r11 - and r6, r6, r12 - uadd8 r8, r8, r14 - strd_post r4, r5, r0, r2 - uadd8 r9, r9, r6 - strd_post r8, r9, r0, r2 - bne 1b - - pop {r4-r11, pc} -endfunc - -function ff_put_pixels8_y2_armv6, export=1 - push {r4-r11} - mov r12, #1 - orr r12, r12, r12, lsl #8 - orr r12, r12, r12, lsl #16 - ldr r4, [r1] - ldr r5, [r1, #4] - ldr_pre r6, r1, r2 - ldr r7, [r1, #4] -1: - subs r3, r3, #2 - uhadd8 r8, r4, r6 - eor r10, r4, r6 - uhadd8 r9, r5, r7 - eor r11, r5, r7 - and r10, r10, r12 - ldr_pre r4, r1, r2 - uadd8 r8, r8, r10 - and r11, r11, r12 - uadd8 r9, r9, r11 - ldr r5, [r1, #4] - uhadd8 r10, r4, r6 - eor r6, r4, r6 - uhadd8 r11, r5, r7 - and r6, r6, r12 - eor r7, r5, r7 - uadd8 r10, r10, r6 - and r7, r7, r12 - ldr_pre r6, r1, r2 - uadd8 r11, r11, r7 - strd_post r8, r9, r0, r2 - ldr r7, [r1, #4] - strd_post r10, r11, r0, r2 - bne 1b - - pop {r4-r11} - bx lr -endfunc - -function ff_put_pixels8_x2_no_rnd_armv6, export=1 - push {r4-r9, lr} -1: - subs r3, r3, #2 - ldr r4, [r1] - ldr r5, [r1, #4] - ldr r7, [r1, #5] - ldr_pre r8, r1, r2 - ldr r9, [r1, #4] - ldr r14, [r1, #5] - add r1, r1, r2 - lsr r6, r4, #8 - orr r6, r6, r5, lsl #24 - lsr r12, r8, #8 - orr r12, r12, r9, lsl #24 - uhadd8 r4, r4, r6 - uhadd8 r5, r5, r7 - uhadd8 r8, r8, r12 - uhadd8 r9, r9, r14 - stm r0, {r4,r5} - add r0, r0, r2 - stm r0, {r8,r9} - add r0, r0, r2 - bne 1b - - pop {r4-r9, pc} -endfunc - -function ff_put_pixels8_y2_no_rnd_armv6, export=1 - push {r4-r9, lr} - ldr r4, [r1] - ldr r5, [r1, #4] - ldr_pre r6, r1, r2 - ldr r7, [r1, #4] -1: - subs r3, r3, #2 - uhadd8 r8, r4, r6 - ldr_pre r4, r1, r2 - uhadd8 r9, r5, r7 - ldr r5, [r1, #4] - uhadd8 r12, r4, r6 - ldr_pre r6, r1, r2 - uhadd8 r14, r5, r7 - ldr r7, [r1, #4] - stm r0, {r8,r9} - add r0, r0, r2 - stm r0, {r12,r14} - add r0, r0, r2 - bne 1b - - pop {r4-r9, pc} -endfunc - -function ff_avg_pixels8_armv6, export=1 - pld [r1, r2] - push {r4-r10, lr} - mov lr, #1 - orr lr, lr, lr, lsl #8 - orr lr, lr, lr, lsl #16 - ldrd r4, r5, [r0] - ldr r10, [r1, #4] - 
ldr_post r9, r1, r2 - subs r3, r3, #2 -1: - pld [r1, r2] - eor r8, r4, r9 - uhadd8 r4, r4, r9 - eor r12, r5, r10 - ldrd_reg r6, r7, r0, r2 - uhadd8 r5, r5, r10 - and r8, r8, lr - ldr r10, [r1, #4] - and r12, r12, lr - uadd8 r4, r4, r8 - ldr_post r9, r1, r2 - eor r8, r6, r9 - uadd8 r5, r5, r12 - pld [r1, r2, lsl #1] - eor r12, r7, r10 - uhadd8 r6, r6, r9 - strd_post r4, r5, r0, r2 - uhadd8 r7, r7, r10 - beq 2f - and r8, r8, lr - ldrd_reg r4, r5, r0, r2 - uadd8 r6, r6, r8 - ldr r10, [r1, #4] - and r12, r12, lr - subs r3, r3, #2 - uadd8 r7, r7, r12 - ldr_post r9, r1, r2 - strd_post r6, r7, r0, r2 - b 1b -2: - and r8, r8, lr - and r12, r12, lr - uadd8 r6, r6, r8 - uadd8 r7, r7, r12 - strd_post r6, r7, r0, r2 - - pop {r4-r10, pc} -endfunc - function ff_add_pixels_clamped_armv6, export=1 push {r4-r8,lr} mov r3, #8 diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c index cc24c0a10c..bb68eb6003 100644 --- a/libavcodec/arm/dsputil_init_arm.c +++ b/libavcodec/arm/dsputil_init_arm.c @@ -30,24 +30,6 @@ void ff_simple_idct_arm(int16_t *data); static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); -void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8) -CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8) -CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8) - void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest, int line_size); @@ -76,7 +58,6 @@ static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block) av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx) { - const int high_bit_depth = avctx->bits_per_raw_sample > 8; int cpu_flags = av_get_cpu_flags(); ff_put_pixels_clamped = c->put_pixels_clamped; @@ -99,26 +80,6 @@ av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx) c->add_pixels_clamped = ff_add_pixels_clamped_arm; - if (!high_bit_depth) { - c->put_pixels_tab[0][0] = ff_put_pixels16_arm; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm; - c->put_pixels_tab[1][0] = ff_put_pixels8_arm; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; - 
c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm; - c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm; - c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm; - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm; - c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm; - } - if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx); if (have_armv6(cpu_flags)) ff_dsputil_init_armv6(c, avctx); if (have_neon(cpu_flags)) ff_dsputil_init_neon(c, avctx); diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c index 5e2c661039..4c8ba47ccb 100644 --- a/libavcodec/arm/dsputil_init_armv6.c +++ b/libavcodec/arm/dsputil_init_armv6.c @@ -27,24 +27,6 @@ void ff_simple_idct_armv6(int16_t *data); void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); -void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *restrict pixels, int line_size); @@ -82,29 +64,6 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx) c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; } - if (!high_bit_depth) { - c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6; -/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ - c->put_pixels_tab[1][0] = ff_put_pixels8_armv6; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6; -/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */ - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6; -/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */ - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6; -/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */ - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; - c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6; - } - if (!high_bit_depth) c->get_pixels = ff_get_pixels_armv6; c->add_pixels_clamped = ff_add_pixels_clamped_armv6; diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 
5866118740..793fab1cf8 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -32,33 +32,6 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data); void ff_clear_block_neon(int16_t *block); void ff_clear_blocks_neon(int16_t *blocks); -void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); - void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int); void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int); void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int); @@ -92,38 +65,6 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) if (!high_bit_depth) { c->clear_block = ff_clear_block_neon; c->clear_blocks = ff_clear_blocks_neon; - - c->put_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; - c->put_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; - - 
c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; - c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon; - c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon; - c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon; - c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon; - c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon; - c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon; - c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon; - - c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon; - c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon; - c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon; - c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon; } c->add_pixels_clamped = ff_add_pixels_clamped_neon; diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index f33fa33d73..81662c04ad 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -37,394 +37,6 @@ function ff_clear_blocks_neon, export=1 bx lr endfunc -.macro pixels16 rnd=1, avg=0 - .if \avg - mov r12, r0 - .endif -1: vld1.8 {q0}, [r1], r2 - vld1.8 {q1}, [r1], r2 - vld1.8 {q2}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.8 {q3}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] - .if \avg - vld1.8 {q8}, [r12,:128], r2 - vrhadd.u8 q0, q0, q8 - vld1.8 {q9}, [r12,:128], r2 - vrhadd.u8 q1, q1, q9 - vld1.8 {q10}, [r12,:128], r2 - vrhadd.u8 q2, q2, q10 - vld1.8 {q11}, [r12,:128], r2 - vrhadd.u8 q3, q3, q11 - .endif - subs r3, r3, #4 - vst1.64 {q0}, [r0,:128], r2 - vst1.64 {q1}, [r0,:128], r2 - vst1.64 {q2}, [r0,:128], r2 - vst1.64 {q3}, [r0,:128], r2 - bne 1b - bx lr -.endm - -.macro pixels16_x2 rnd=1, avg=0 -1: vld1.8 {d0-d2}, [r1], r2 - vld1.8 {d4-d6}, [r1], r2 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vext.8 q1, q0, q1, #1 - avg q0, q0, q1 - vext.8 q3, q2, q3, #1 - avg q2, q2, q3 - .if \avg - vld1.8 {q1}, [r0,:128], r2 - vld1.8 {q3}, [r0,:128] - vrhadd.u8 q0, q0, q1 - vrhadd.u8 q2, q2, q3 - sub r0, r0, r2 - .endif - vst1.8 {q0}, [r0,:128], r2 - vst1.8 {q2}, [r0,:128], r2 - bne 1b - bx lr -.endm - -.macro pixels16_y2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {q0}, [r1], r2 - vld1.8 {q1}, [r1], r2 -1: subs r3, r3, #2 - avg q2, q0, q1 - vld1.8 {q0}, [r1], r2 - avg q3, q0, q1 - vld1.8 {q1}, [r1], r2 - pld [r1] - pld [r1, r2] - .if \avg - vld1.8 {q8}, [r0,:128], r2 - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q2, q2, q8 - vrhadd.u8 q3, q3, q9 - sub r0, r0, r2 - .endif - vst1.8 {q2}, [r0,:128], r2 - vst1.8 {q3}, [r0,:128], r2 - bne 1b - - avg q2, q0, q1 - vld1.8 {q0}, [r1], r2 - avg q3, q0, q1 - .if \avg - vld1.8 {q8}, [r0,:128], r2 - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q2, q2, q8 - vrhadd.u8 q3, q3, q9 - sub r0, r0, r2 - .endif - vst1.8 {q2}, [r0,:128], r2 - vst1.8 {q3}, [r0,:128], r2 - - bx lr -.endm - -.macro pixels16_xy2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {d0-d2}, [r1], r2 - vld1.8 {d4-d6}, [r1], r2 -NRND vmov.i16 q13, #1 - pld [r1] - pld [r1, r2] - vext.8 q1, q0, q1, #1 - vext.8 q3, q2, q3, #1 - vaddl.u8 q8, d0, d2 - vaddl.u8 q10, d1, d3 - vaddl.u8 q9, d4, d6 - vaddl.u8 q11, d5, d7 -1: subs r3, r3, #2 - vld1.8 {d0-d2}, [r1], r2 - vadd.u16 q12, q8, q9 - pld [r1] -NRND vadd.u16 q12, q12, q13 - vext.8 q15, q0, q1, #1 - vadd.u16 q1 , q10, q11 - shrn d28, q12, #2 -NRND vadd.u16 q1, q1, q13 - shrn d29, q1, #2 - .if \avg - vld1.8 {q8}, [r0,:128] - vrhadd.u8 q14, q14, q8 - .endif - vaddl.u8 q8, d0, d30 - vld1.8 {d2-d4}, [r1], r2 - vaddl.u8 q10, d1, d31 - vst1.8 {q14}, [r0,:128], r2 - vadd.u16 q12, q8, q9 - pld [r1, r2] -NRND vadd.u16 q12, q12, q13 - vext.8 q2, q1, q2, #1 - vadd.u16 q0, q10, q11 
- shrn d30, q12, #2 -NRND vadd.u16 q0, q0, q13 - shrn d31, q0, #2 - .if \avg - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q15, q15, q9 - .endif - vaddl.u8 q9, d2, d4 - vaddl.u8 q11, d3, d5 - vst1.8 {q15}, [r0,:128], r2 - bgt 1b - - vld1.8 {d0-d2}, [r1], r2 - vadd.u16 q12, q8, q9 -NRND vadd.u16 q12, q12, q13 - vext.8 q15, q0, q1, #1 - vadd.u16 q1 , q10, q11 - shrn d28, q12, #2 -NRND vadd.u16 q1, q1, q13 - shrn d29, q1, #2 - .if \avg - vld1.8 {q8}, [r0,:128] - vrhadd.u8 q14, q14, q8 - .endif - vaddl.u8 q8, d0, d30 - vaddl.u8 q10, d1, d31 - vst1.8 {q14}, [r0,:128], r2 - vadd.u16 q12, q8, q9 -NRND vadd.u16 q12, q12, q13 - vadd.u16 q0, q10, q11 - shrn d30, q12, #2 -NRND vadd.u16 q0, q0, q13 - shrn d31, q0, #2 - .if \avg - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q15, q15, q9 - .endif - vst1.8 {q15}, [r0,:128], r2 - - bx lr -.endm - -.macro pixels8 rnd=1, avg=0 -1: vld1.8 {d0}, [r1], r2 - vld1.8 {d1}, [r1], r2 - vld1.8 {d2}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.8 {d3}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] - .if \avg - vld1.8 {d4}, [r0,:64], r2 - vrhadd.u8 d0, d0, d4 - vld1.8 {d5}, [r0,:64], r2 - vrhadd.u8 d1, d1, d5 - vld1.8 {d6}, [r0,:64], r2 - vrhadd.u8 d2, d2, d6 - vld1.8 {d7}, [r0,:64], r2 - vrhadd.u8 d3, d3, d7 - sub r0, r0, r2, lsl #2 - .endif - subs r3, r3, #4 - vst1.8 {d0}, [r0,:64], r2 - vst1.8 {d1}, [r0,:64], r2 - vst1.8 {d2}, [r0,:64], r2 - vst1.8 {d3}, [r0,:64], r2 - bne 1b - bx lr -.endm - -.macro pixels8_x2 rnd=1, avg=0 -1: vld1.8 {q0}, [r1], r2 - vext.8 d1, d0, d1, #1 - vld1.8 {q1}, [r1], r2 - vext.8 d3, d2, d3, #1 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vswp d1, d2 - avg q0, q0, q1 - .if \avg - vld1.8 {d4}, [r0,:64], r2 - vld1.8 {d5}, [r0,:64] - vrhadd.u8 q0, q0, q2 - sub r0, r0, r2 - .endif - vst1.8 {d0}, [r0,:64], r2 - vst1.8 {d1}, [r0,:64], r2 - bne 1b - bx lr -.endm - -.macro pixels8_y2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {d0}, [r1], r2 - vld1.8 {d1}, [r1], r2 -1: subs r3, r3, #2 - avg d4, d0, d1 - vld1.8 {d0}, [r1], r2 - avg d5, d0, d1 - vld1.8 {d1}, [r1], r2 - pld [r1] - pld [r1, r2] - .if \avg - vld1.8 {d2}, [r0,:64], r2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 q2, q2, q1 - sub r0, r0, r2 - .endif - vst1.8 {d4}, [r0,:64], r2 - vst1.8 {d5}, [r0,:64], r2 - bne 1b - - avg d4, d0, d1 - vld1.8 {d0}, [r1], r2 - avg d5, d0, d1 - .if \avg - vld1.8 {d2}, [r0,:64], r2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 q2, q2, q1 - sub r0, r0, r2 - .endif - vst1.8 {d4}, [r0,:64], r2 - vst1.8 {d5}, [r0,:64], r2 - - bx lr -.endm - -.macro pixels8_xy2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {q0}, [r1], r2 - vld1.8 {q1}, [r1], r2 -NRND vmov.i16 q11, #1 - pld [r1] - pld [r1, r2] - vext.8 d4, d0, d1, #1 - vext.8 d6, d2, d3, #1 - vaddl.u8 q8, d0, d4 - vaddl.u8 q9, d2, d6 -1: subs r3, r3, #2 - vld1.8 {q0}, [r1], r2 - pld [r1] - vadd.u16 q10, q8, q9 - vext.8 d4, d0, d1, #1 -NRND vadd.u16 q10, q10, q11 - vaddl.u8 q8, d0, d4 - shrn d5, q10, #2 - vld1.8 {q1}, [r1], r2 - vadd.u16 q10, q8, q9 - pld [r1, r2] - .if \avg - vld1.8 {d7}, [r0,:64] - vrhadd.u8 d5, d5, d7 - .endif -NRND vadd.u16 q10, q10, q11 - vst1.8 {d5}, [r0,:64], r2 - shrn d7, q10, #2 - .if \avg - vld1.8 {d5}, [r0,:64] - vrhadd.u8 d7, d7, d5 - .endif - vext.8 d6, d2, d3, #1 - vaddl.u8 q9, d2, d6 - vst1.8 {d7}, [r0,:64], r2 - bgt 1b - - vld1.8 {q0}, [r1], r2 - vadd.u16 q10, q8, q9 - vext.8 d4, d0, d1, #1 -NRND vadd.u16 q10, q10, q11 - vaddl.u8 q8, d0, d4 - shrn d5, q10, #2 - vadd.u16 q10, q8, q9 - .if \avg - vld1.8 {d7}, [r0,:64] - vrhadd.u8 d5, d5, d7 - .endif -NRND vadd.u16 q10, q10, q11 - vst1.8 {d5}, [r0,:64], r2 - shrn d7, q10, #2 - 
.if \avg - vld1.8 {d5}, [r0,:64] - vrhadd.u8 d7, d7, d5 - .endif - vst1.8 {d7}, [r0,:64], r2 - - bx lr -.endm - -.macro pixfunc pfx, name, suf, rnd=1, avg=0 - .if \rnd - .macro avg rd, rn, rm - vrhadd.u8 \rd, \rn, \rm - .endm - .macro shrn rd, rn, rm - vrshrn.u16 \rd, \rn, \rm - .endm - .macro NRND insn:vararg - .endm - .else - .macro avg rd, rn, rm - vhadd.u8 \rd, \rn, \rm - .endm - .macro shrn rd, rn, rm - vshrn.u16 \rd, \rn, \rm - .endm - .macro NRND insn:vararg - \insn - .endm - .endif -function ff_\pfx\name\suf\()_neon, export=1 - \name \rnd, \avg -endfunc - .purgem avg - .purgem shrn - .purgem NRND -.endm - -.macro pixfunc2 pfx, name, avg=0 - pixfunc \pfx, \name, rnd=1, avg=\avg - pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg -.endm - -function ff_put_h264_qpel16_mc00_neon, export=1 - mov r3, #16 -endfunc - - pixfunc put_, pixels16, avg=0 - pixfunc2 put_, pixels16_x2, avg=0 - pixfunc2 put_, pixels16_y2, avg=0 - pixfunc2 put_, pixels16_xy2, avg=0 - -function ff_avg_h264_qpel16_mc00_neon, export=1 - mov r3, #16 -endfunc - - pixfunc avg_, pixels16, avg=1 - pixfunc2 avg_, pixels16_x2, avg=1 - pixfunc2 avg_, pixels16_y2, avg=1 - pixfunc2 avg_, pixels16_xy2, avg=1 - -function ff_put_h264_qpel8_mc00_neon, export=1 - mov r3, #8 -endfunc - - pixfunc put_, pixels8, avg=0 - pixfunc2 put_, pixels8_x2, avg=0 - pixfunc2 put_, pixels8_y2, avg=0 - pixfunc2 put_, pixels8_xy2, avg=0 - -function ff_avg_h264_qpel8_mc00_neon, export=1 - mov r3, #8 -endfunc - - pixfunc avg_, pixels8, avg=1 - pixfunc avg_, pixels8_x2, avg=1 - pixfunc avg_, pixels8_y2, avg=1 - pixfunc avg_, pixels8_xy2, avg=1 - function ff_put_pixels_clamped_neon, export=1 vld1.16 {d16-d19}, [r0,:128]! vqmovun.s16 d0, q8 diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S new file mode 100644 index 0000000000..d4f97e3c13 --- /dev/null +++ b/libavcodec/arm/hpeldsp_arm.S @@ -0,0 +1,611 @@ +@ +@ ARMv4 optimized DSP utils +@ Copyright (c) 2004 AGAWA Koji +@ +@ This file is part of Libav. +@ +@ Libav is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. +@ +@ Libav is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. 
+@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with Libav; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "libavutil/arm/asm.S" + +#if !HAVE_ARMV5TE_EXTERNAL +#define pld @ +#endif + +.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 + mov \Rd0, \Rn0, lsr #(\shift * 8) + mov \Rd1, \Rn1, lsr #(\shift * 8) + mov \Rd2, \Rn2, lsr #(\shift * 8) + mov \Rd3, \Rn3, lsr #(\shift * 8) + orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) + orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) + orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) + orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) +.endm +.macro ALIGN_DWORD shift, R0, R1, R2 + mov \R0, \R0, lsr #(\shift * 8) + orr \R0, \R0, \R1, lsl #(32 - \shift * 8) + mov \R1, \R1, lsr #(\shift * 8) + orr \R1, \R1, \R2, lsl #(32 - \shift * 8) +.endm +.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 + mov \Rdst0, \Rsrc0, lsr #(\shift * 8) + mov \Rdst1, \Rsrc1, lsr #(\shift * 8) + orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) + orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) +.endm + +.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + orr \Rn0, \Rn0, \Rm0 + orr \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + sub \Rd0, \Rn0, \Rd0, lsr #1 + sub \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + and \Rn0, \Rn0, \Rm0 + and \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + add \Rd0, \Rn0, \Rd0, lsr #1 + add \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro JMP_ALIGN tmp, reg + ands \tmp, \reg, #3 + bic \reg, \reg, #3 + beq 1f + subs \tmp, \tmp, #1 + beq 2f + subs \tmp, \tmp, #1 + beq 3f + b 4f +.endm + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels16_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11, lr} + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r7} + add r1, r1, r2 + stm r0, {r4-r7} + pld [r1] + subs r3, r3, #1 + add r0, r0, r2 + bne 1b + pop {r4-r11, pc} + .align 5 +2: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 2b + pop {r4-r11, pc} + .align 5 +3: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 3b + pop {r4-r11, pc} + .align 5 +4: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 4b + pop {r4-r11,pc} +endfunc + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r5,lr} + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 + subs r3, r3, #1 + pld [r1] + stm r0, {r4-r5} + add r0, r0, r2 + bne 1b + pop {r4-r5,pc} + .align 5 +2: + ldm r1, 
{r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 1, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r5,pc} + .align 5 +3: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 2, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r5,pc} + .align 5 +4: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 3, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 4b + pop {r4-r5,pc} +endfunc + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} +endfunc + + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, 
{r8-r9} + add r0, r0, r2 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, {r8-r9} + add r0, r0, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} +endfunc + + .ltorg + +@ ---------------------------------------------------------------- +.macro RND_XY2_IT 
align, rnd + @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) + @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) +.if \align == 0 + ldm r1, {r6-r8} +.elseif \align == 3 + ldm r1, {r5-r7} +.else + ldm r1, {r8-r10} +.endif + add r1, r1, r2 + pld [r1] +.if \align == 0 + ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 +.elseif \align == 1 + ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 +.elseif \align == 2 + ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 +.elseif \align == 3 + ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 +.endif + ldr r14, =0x03030303 + tst r3, #1 + and r8, r4, r14 + and r9, r5, r14 + and r10, r6, r14 + and r11, r7, r14 + it eq + andeq r14, r14, r14, \rnd #1 + add r8, r8, r10 + add r9, r9, r11 + ldr r12, =0xfcfcfcfc >> 2 + itt eq + addeq r8, r8, r14 + addeq r9, r9, r14 + and r4, r12, r4, lsr #2 + and r5, r12, r5, lsr #2 + and r6, r12, r6, lsr #2 + and r7, r12, r7, lsr #2 + add r10, r4, r6 + add r11, r5, r7 + subs r3, r3, #1 +.endm + +.macro RND_XY2_EXPAND align, rnd + RND_XY2_IT \align, \rnd +6: push {r8-r11} + RND_XY2_IT \align, \rnd + pop {r4-r7} + add r4, r4, r8 + add r5, r5, r9 + ldr r14, =0x0f0f0f0f + add r6, r6, r10 + add r7, r7, r11 + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + add r4, r4, r6 + add r5, r5, r7 + stm r0, {r4-r5} + add r0, r0, r2 + bge 6b + pop {r4-r11,pc} +.endm + + .align 5 +function ff_put_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} @ R14 is also called LR + JMP_ALIGN r5, r1 +1: RND_XY2_EXPAND 0, lsl + .align 5 +2: RND_XY2_EXPAND 1, lsl + .align 5 +3: RND_XY2_EXPAND 2, lsl + .align 5 +4: RND_XY2_EXPAND 3, lsl +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + JMP_ALIGN r5, r1 +1: RND_XY2_EXPAND 0, lsr + .align 5 +2: RND_XY2_EXPAND 1, lsr + .align 5 +3: RND_XY2_EXPAND 2, lsr + .align 5 +4: RND_XY2_EXPAND 3, lsr +endfunc diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h new file mode 100644 index 0000000000..6b5c1ced3f --- /dev/null +++ b/libavcodec/arm/hpeldsp_arm.h @@ -0,0 +1,27 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_HPELDSP_H +#define AVCODEC_ARM_HPELDSP_H + +#include "libavcodec/hpeldsp.h" + +void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags); +void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags); + +#endif /* AVCODEC_ARM_HPELDSP_H */ diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S new file mode 100644 index 0000000000..a030d423bf --- /dev/null +++ b/libavcodec/arm/hpeldsp_armv6.S @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro call_2x_pixels type, subp +function ff_\type\()_pixels16\subp\()_armv6, export=1 + push {r0-r3, lr} + bl ff_\type\()_pixels8\subp\()_armv6 + pop {r0-r3, lr} + add r0, r0, #8 + add r1, r1, #8 + b ff_\type\()_pixels8\subp\()_armv6 +endfunc +.endm + +call_2x_pixels avg +call_2x_pixels put, _x2 +call_2x_pixels put, _y2 +call_2x_pixels put, _x2_no_rnd +call_2x_pixels put, _y2_no_rnd + +function ff_put_pixels16_armv6, export=1 + push {r4-r11} +1: + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + ldr_post r4, r1, r2 + strd r6, r7, [r0, #8] + ldr r9, [r1, #4] + strd_post r4, r5, r0, r2 + ldr r10, [r1, #8] + ldr r11, [r1, #12] + ldr_post r8, r1, r2 + strd r10, r11, [r0, #8] + subs r3, r3, #2 + strd_post r8, r9, r0, r2 + bne 1b + + pop {r4-r11} + bx lr +endfunc + +function ff_put_pixels8_armv6, export=1 + push {r4-r7} +1: + ldr r5, [r1, #4] + ldr_post r4, r1, r2 + ldr r7, [r1, #4] + strd_post r4, r5, r0, r2 + ldr_post r6, r1, r2 + subs r3, r3, #2 + strd_post r6, r7, r0, r2 + bne 1b + + pop {r4-r7} + bx lr +endfunc + +function ff_put_pixels8_x2_armv6, export=1 + push {r4-r11, lr} + mov r12, #1 + orr r12, r12, r12, lsl #8 + orr r12, r12, r12, lsl #16 +1: + ldr r4, [r1] + subs r3, r3, #2 + ldr r5, [r1, #4] + ldr r7, [r1, #5] + lsr r6, r4, #8 + ldr_pre r8, r1, r2 + orr r6, r6, r5, lsl #24 + ldr r9, [r1, #4] + ldr r11, [r1, #5] + lsr r10, r8, #8 + add r1, r1, r2 + orr r10, r10, r9, lsl #24 + eor r14, r4, r6 + uhadd8 r4, r4, r6 + eor r6, r5, r7 + uhadd8 r5, r5, r7 + and r14, r14, r12 + and r6, r6, r12 + uadd8 r4, r4, r14 + eor r14, r8, r10 + uadd8 r5, r5, r6 + eor r6, r9, r11 + uhadd8 r8, r8, r10 + and r14, r14, r12 + uhadd8 r9, r9, r11 + and r6, r6, r12 + uadd8 r8, r8, r14 + strd_post r4, r5, r0, r2 + uadd8 r9, r9, r6 + strd_post r8, r9, r0, r2 + bne 1b + + pop {r4-r11, pc} +endfunc + +function ff_put_pixels8_y2_armv6, export=1 + push {r4-r11} + mov r12, #1 + orr r12, r12, r12, lsl #8 + orr r12, r12, r12, lsl #16 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr_pre r6, r1, r2 + ldr r7, [r1, #4] 
+1: + subs r3, r3, #2 + uhadd8 r8, r4, r6 + eor r10, r4, r6 + uhadd8 r9, r5, r7 + eor r11, r5, r7 + and r10, r10, r12 + ldr_pre r4, r1, r2 + uadd8 r8, r8, r10 + and r11, r11, r12 + uadd8 r9, r9, r11 + ldr r5, [r1, #4] + uhadd8 r10, r4, r6 + eor r6, r4, r6 + uhadd8 r11, r5, r7 + and r6, r6, r12 + eor r7, r5, r7 + uadd8 r10, r10, r6 + and r7, r7, r12 + ldr_pre r6, r1, r2 + uadd8 r11, r11, r7 + strd_post r8, r9, r0, r2 + ldr r7, [r1, #4] + strd_post r10, r11, r0, r2 + bne 1b + + pop {r4-r11} + bx lr +endfunc + +function ff_put_pixels8_x2_no_rnd_armv6, export=1 + push {r4-r9, lr} +1: + subs r3, r3, #2 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r7, [r1, #5] + ldr_pre r8, r1, r2 + ldr r9, [r1, #4] + ldr r14, [r1, #5] + add r1, r1, r2 + lsr r6, r4, #8 + orr r6, r6, r5, lsl #24 + lsr r12, r8, #8 + orr r12, r12, r9, lsl #24 + uhadd8 r4, r4, r6 + uhadd8 r5, r5, r7 + uhadd8 r8, r8, r12 + uhadd8 r9, r9, r14 + stm r0, {r4,r5} + add r0, r0, r2 + stm r0, {r8,r9} + add r0, r0, r2 + bne 1b + + pop {r4-r9, pc} +endfunc + +function ff_put_pixels8_y2_no_rnd_armv6, export=1 + push {r4-r9, lr} + ldr r4, [r1] + ldr r5, [r1, #4] + ldr_pre r6, r1, r2 + ldr r7, [r1, #4] +1: + subs r3, r3, #2 + uhadd8 r8, r4, r6 + ldr_pre r4, r1, r2 + uhadd8 r9, r5, r7 + ldr r5, [r1, #4] + uhadd8 r12, r4, r6 + ldr_pre r6, r1, r2 + uhadd8 r14, r5, r7 + ldr r7, [r1, #4] + stm r0, {r8,r9} + add r0, r0, r2 + stm r0, {r12,r14} + add r0, r0, r2 + bne 1b + + pop {r4-r9, pc} +endfunc + +function ff_avg_pixels8_armv6, export=1 + pld [r1, r2] + push {r4-r10, lr} + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 + ldrd r4, r5, [r0] + ldr r10, [r1, #4] + ldr_post r9, r1, r2 + subs r3, r3, #2 +1: + pld [r1, r2] + eor r8, r4, r9 + uhadd8 r4, r4, r9 + eor r12, r5, r10 + ldrd_reg r6, r7, r0, r2 + uhadd8 r5, r5, r10 + and r8, r8, lr + ldr r10, [r1, #4] + and r12, r12, lr + uadd8 r4, r4, r8 + ldr_post r9, r1, r2 + eor r8, r6, r9 + uadd8 r5, r5, r12 + pld [r1, r2, lsl #1] + eor r12, r7, r10 + uhadd8 r6, r6, r9 + strd_post r4, r5, r0, r2 + uhadd8 r7, r7, r10 + beq 2f + and r8, r8, lr + ldrd_reg r4, r5, r0, r2 + uadd8 r6, r6, r8 + ldr r10, [r1, #4] + and r12, r12, lr + subs r3, r3, #2 + uadd8 r7, r7, r12 + ldr_post r9, r1, r2 + strd_post r6, r7, r0, r2 + b 1b +2: + and r8, r8, lr + and r12, r12, lr + uadd8 r6, r6, r8 + uadd8 r7, r7, r12 + strd_post r6, r7, r0, r2 + + pop {r4-r10, pc} +endfunc diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c new file mode 100644 index 0000000000..8176afeb3b --- /dev/null +++ b/libavcodec/arm/hpeldsp_init_arm.c @@ -0,0 +1,71 @@ +/* + * ARM optimized DSP utils + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/rnd_avg.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
+CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
+CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm, 8)
+
+av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
+    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
+
+    if (have_armv6(cpu_flags))
+        ff_hpeldsp_init_armv6(c, flags);
+    if (have_neon(cpu_flags))
+        ff_hpeldsp_init_neon(c, flags);
+}
diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c
new file mode 100644
index 0000000000..67a500d513
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_init_armv6.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
+/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
+    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
+/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
+/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
+/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
+}
diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c
new file mode 100644
index 0000000000..76d4eafceb
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_init_neon.c
@@ -0,0 +1,88 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+    c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
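+    /* Indexing follows the hpeldsp convention: the first index selects
+     * the block width (0: 16 pixels, 1: 8 pixels), the second the
+     * half-pel position (0: aligned, 1: x half-pel, 2: y half-pel,
+     * 3: xy half-pel). */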
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
+
+    c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
+    c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
+    c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
+    c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
+}
diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S
new file mode 100644
index 0000000000..90bc3cb8ae
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_neon.S
@@ -0,0 +1,410 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro pixels16 rnd=1, avg=0
+  .if \avg
+        mov r12, r0
+  .endif
+1:      vld1.8 {q0}, [r1], r2
+        vld1.8 {q1}, [r1], r2
+        vld1.8 {q2}, [r1], r2
+        pld [r1, r2, lsl #2]
+        vld1.8 {q3}, [r1], r2
+        pld [r1]
+        pld [r1, r2]
+        pld [r1, r2, lsl #1]
+  .if \avg
+        vld1.8 {q8}, [r12,:128], r2
+        vrhadd.u8 q0, q0, q8
+        vld1.8 {q9}, [r12,:128], r2
+        vrhadd.u8 q1, q1, q9
+        vld1.8 {q10}, [r12,:128], r2
+        vrhadd.u8 q2, q2, q10
+        vld1.8 {q11}, [r12,:128], r2
+        vrhadd.u8 q3, q3, q11
+  .endif
+        subs r3, r3, #4
+        vst1.64 {q0}, [r0,:128], r2
+        vst1.64 {q1}, [r0,:128], r2
+        vst1.64 {q2}, [r0,:128], r2
+        vst1.64 {q3}, [r0,:128], r2
+        bne 1b
+        bx lr
+.endm
+
+.macro pixels16_x2 rnd=1, avg=0
+1:      vld1.8 {d0-d2}, [r1], r2
+        vld1.8 {d4-d6}, [r1], r2
+        pld [r1]
+        pld [r1, r2]
+        subs r3, r3, #2
+        vext.8 q1, q0, q1, #1
+        avg q0, q0, q1
+        vext.8 q3, q2, q3, #1
+        avg q2, q2, q3
+  .if \avg
+        vld1.8 {q1}, [r0,:128], r2
+        vld1.8 {q3}, [r0,:128]
+        vrhadd.u8 q0, q0, q1
+        vrhadd.u8 q2, q2, q3
+        sub r0, r0, r2
+  .endif
+        vst1.8 {q0}, [r0,:128], r2
+        vst1.8 {q2}, [r0,:128], r2
+        bne 1b
+        bx lr
+.endm
+
+.macro pixels16_y2 rnd=1, avg=0
+        sub r3, r3, #2
+        vld1.8 {q0}, [r1], r2
+        vld1.8 {q1}, [r1], r2
+1:      subs r3, r3, #2
+        avg q2, q0, q1
+        vld1.8 {q0}, [r1], r2
+        avg q3, q0, q1
+        vld1.8 {q1}, [r1], r2
+        pld [r1]
+        pld [r1, r2]
+  .if \avg
+        vld1.8 {q8}, [r0,:128], r2
+        vld1.8 {q9}, [r0,:128]
+        vrhadd.u8 q2, q2, q8
+        vrhadd.u8 q3, q3, q9
+        sub r0, r0, r2
+  .endif
+        vst1.8 {q2}, [r0,:128], r2
+        vst1.8 {q3}, [r0,:128], r2
+        bne 1b
+
+        avg q2, q0, q1
+        vld1.8 {q0}, [r1], r2
+        avg q3, q0, q1
+  .if \avg
+        vld1.8 {q8}, [r0,:128], r2
+        vld1.8 {q9}, [r0,:128]
+        vrhadd.u8 q2, q2, q8
+        vrhadd.u8 q3, q3, q9
+        sub r0, r0, r2
+  .endif
+        vst1.8 {q2}, [r0,:128], r2
+        vst1.8 {q3}, [r0,:128], r2
+
+        bx lr
+.endm
+
+.macro pixels16_xy2 rnd=1, avg=0
+        sub r3, r3, #2
+        vld1.8 {d0-d2}, [r1], r2
+        vld1.8 {d4-d6}, [r1], r2
+NRND    vmov.i16 q13, #1
+        pld [r1]
+        pld [r1, r2]
+        vext.8 q1, q0, q1, #1
+        vext.8 q3, q2, q3, #1
+        vaddl.u8 q8, d0, d2
+        vaddl.u8 q10, d1, d3
+        vaddl.u8 q9, d4, d6
+        vaddl.u8 q11, d5, d7
+1:      subs r3, r3, #2
+        vld1.8 {d0-d2}, [r1], r2
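+        @ xy2 averages a 2x2 block of input pixels per output byte:
+        @ vaddl.u8 widens and sums each pixel with its right neighbour,
+        @ vadd.u16 adds two adjacent rows, and shrn narrows with a shift
+        @ by 2. The avg/shrn/NRND helpers are bound by the pixfunc macro
+        @ below: rounding vrshrn for put, plain vshrn plus the bias of 1
+        @ in q13 for the no_rnd variants.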
+        vadd.u16 q12, q8, q9
+        pld [r1]
+NRND    vadd.u16 q12, q12, q13
+        vext.8 q15, q0, q1, #1
+        vadd.u16 q1, q10, q11
+        shrn d28, q12, #2
+NRND    vadd.u16 q1, q1, q13
+        shrn d29, q1, #2
+  .if \avg
+        vld1.8 {q8}, [r0,:128]
+        vrhadd.u8 q14, q14, q8
+  .endif
+        vaddl.u8 q8, d0, d30
+        vld1.8 {d2-d4}, [r1], r2
+        vaddl.u8 q10, d1, d31
+        vst1.8 {q14}, [r0,:128], r2
+        vadd.u16 q12, q8, q9
+        pld [r1, r2]
+NRND    vadd.u16 q12, q12, q13
+        vext.8 q2, q1, q2, #1
+        vadd.u16 q0, q10, q11
+        shrn d30, q12, #2
+NRND    vadd.u16 q0, q0, q13
+        shrn d31, q0, #2
+  .if \avg
+        vld1.8 {q9}, [r0,:128]
+        vrhadd.u8 q15, q15, q9
+  .endif
+        vaddl.u8 q9, d2, d4
+        vaddl.u8 q11, d3, d5
+        vst1.8 {q15}, [r0,:128], r2
+        bgt 1b
+
+        vld1.8 {d0-d2}, [r1], r2
+        vadd.u16 q12, q8, q9
+NRND    vadd.u16 q12, q12, q13
+        vext.8 q15, q0, q1, #1
+        vadd.u16 q1, q10, q11
+        shrn d28, q12, #2
+NRND    vadd.u16 q1, q1, q13
+        shrn d29, q1, #2
+  .if \avg
+        vld1.8 {q8}, [r0,:128]
+        vrhadd.u8 q14, q14, q8
+  .endif
+        vaddl.u8 q8, d0, d30
+        vaddl.u8 q10, d1, d31
+        vst1.8 {q14}, [r0,:128], r2
+        vadd.u16 q12, q8, q9
+NRND    vadd.u16 q12, q12, q13
+        vadd.u16 q0, q10, q11
+        shrn d30, q12, #2
+NRND    vadd.u16 q0, q0, q13
+        shrn d31, q0, #2
+  .if \avg
+        vld1.8 {q9}, [r0,:128]
+        vrhadd.u8 q15, q15, q9
+  .endif
+        vst1.8 {q15}, [r0,:128], r2
+
+        bx lr
+.endm
+
+.macro pixels8 rnd=1, avg=0
+1:      vld1.8 {d0}, [r1], r2
+        vld1.8 {d1}, [r1], r2
+        vld1.8 {d2}, [r1], r2
+        pld [r1, r2, lsl #2]
+        vld1.8 {d3}, [r1], r2
+        pld [r1]
+        pld [r1, r2]
+        pld [r1, r2, lsl #1]
+  .if \avg
+        vld1.8 {d4}, [r0,:64], r2
+        vrhadd.u8 d0, d0, d4
+        vld1.8 {d5}, [r0,:64], r2
+        vrhadd.u8 d1, d1, d5
+        vld1.8 {d6}, [r0,:64], r2
+        vrhadd.u8 d2, d2, d6
+        vld1.8 {d7}, [r0,:64], r2
+        vrhadd.u8 d3, d3, d7
+        sub r0, r0, r2, lsl #2
+  .endif
+        subs r3, r3, #4
+        vst1.8 {d0}, [r0,:64], r2
+        vst1.8 {d1}, [r0,:64], r2
+        vst1.8 {d2}, [r0,:64], r2
+        vst1.8 {d3}, [r0,:64], r2
+        bne 1b
+        bx lr
+.endm
+
+.macro pixels8_x2 rnd=1, avg=0
+1:      vld1.8 {q0}, [r1], r2
+        vext.8 d1, d0, d1, #1
+        vld1.8 {q1}, [r1], r2
+        vext.8 d3, d2, d3, #1
+        pld [r1]
+        pld [r1, r2]
+        subs r3, r3, #2
+        vswp d1, d2
+        avg q0, q0, q1
+  .if \avg
+        vld1.8 {d4}, [r0,:64], r2
+        vld1.8 {d5}, [r0,:64]
+        vrhadd.u8 q0, q0, q2
+        sub r0, r0, r2
+  .endif
+        vst1.8 {d0}, [r0,:64], r2
+        vst1.8 {d1}, [r0,:64], r2
+        bne 1b
+        bx lr
+.endm
+
+.macro pixels8_y2 rnd=1, avg=0
+        sub r3, r3, #2
+        vld1.8 {d0}, [r1], r2
+        vld1.8 {d1}, [r1], r2
+1:      subs r3, r3, #2
+        avg d4, d0, d1
+        vld1.8 {d0}, [r1], r2
+        avg d5, d0, d1
+        vld1.8 {d1}, [r1], r2
+        pld [r1]
+        pld [r1, r2]
+  .if \avg
+        vld1.8 {d2}, [r0,:64], r2
+        vld1.8 {d3}, [r0,:64]
+        vrhadd.u8 q2, q2, q1
+        sub r0, r0, r2
+  .endif
+        vst1.8 {d4}, [r0,:64], r2
+        vst1.8 {d5}, [r0,:64], r2
+        bne 1b
+
+        avg d4, d0, d1
+        vld1.8 {d0}, [r1], r2
+        avg d5, d0, d1
+  .if \avg
+        vld1.8 {d2}, [r0,:64], r2
+        vld1.8 {d3}, [r0,:64]
+        vrhadd.u8 q2, q2, q1
+        sub r0, r0, r2
+  .endif
+        vst1.8 {d4}, [r0,:64], r2
+        vst1.8 {d5}, [r0,:64], r2
+
+        bx lr
+.endm
+
+.macro pixels8_xy2 rnd=1, avg=0
+        sub r3, r3, #2
+        vld1.8 {q0}, [r1], r2
+        vld1.8 {q1}, [r1], r2
+NRND    vmov.i16 q11, #1
+        pld [r1]
+        pld [r1, r2]
+        vext.8 d4, d0, d1, #1
+        vext.8 d6, d2, d3, #1
+        vaddl.u8 q8, d0, d4
+        vaddl.u8 q9, d2, d6
+1:      subs r3, r3, #2
+        vld1.8 {q0}, [r1], r2
+        pld [r1]
+        vadd.u16 q10, q8, q9
+        vext.8 d4, d0, d1, #1
+NRND    vadd.u16 q10, q10, q11
+        vaddl.u8 q8, d0, d4
+        shrn d5, q10, #2
+        vld1.8 {q1}, [r1], r2
+        vadd.u16 q10, q8, q9
+        pld [r1, r2]
+  .if \avg
+        vld1.8 {d7}, [r0,:64]
+        vrhadd.u8 d5, d5, d7
+  .endif
+NRND    vadd.u16 q10, q10, q11
+        vst1.8 {d5}, [r0,:64], r2
+        shrn d7, q10, #2
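+        @ avg variants reload the destination row and fold it into the
+        @ freshly interpolated result with a rounding vrhadd.u8 before
+        @ the store.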
+  .if \avg
+        vld1.8 {d5}, [r0,:64]
+        vrhadd.u8 d7, d7, d5
+  .endif
+        vext.8 d6, d2, d3, #1
+        vaddl.u8 q9, d2, d6
+        vst1.8 {d7}, [r0,:64], r2
+        bgt 1b
+
+        vld1.8 {q0}, [r1], r2
+        vadd.u16 q10, q8, q9
+        vext.8 d4, d0, d1, #1
+NRND    vadd.u16 q10, q10, q11
+        vaddl.u8 q8, d0, d4
+        shrn d5, q10, #2
+        vadd.u16 q10, q8, q9
+  .if \avg
+        vld1.8 {d7}, [r0,:64]
+        vrhadd.u8 d5, d5, d7
+  .endif
+NRND    vadd.u16 q10, q10, q11
+        vst1.8 {d5}, [r0,:64], r2
+        shrn d7, q10, #2
+  .if \avg
+        vld1.8 {d5}, [r0,:64]
+        vrhadd.u8 d7, d7, d5
+  .endif
+        vst1.8 {d7}, [r0,:64], r2
+
+        bx lr
+.endm
+
+.macro pixfunc pfx, name, suf, rnd=1, avg=0
+  .if \rnd
+    .macro avg rd, rn, rm
+        vrhadd.u8 \rd, \rn, \rm
+    .endm
+    .macro shrn rd, rn, rm
+        vrshrn.u16 \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+    .endm
+  .else
+    .macro avg rd, rn, rm
+        vhadd.u8 \rd, \rn, \rm
+    .endm
+    .macro shrn rd, rn, rm
+        vshrn.u16 \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+        \insn
+    .endm
+  .endif
+function ff_\pfx\name\suf\()_neon, export=1
+        \name \rnd, \avg
+endfunc
+  .purgem avg
+  .purgem shrn
+  .purgem NRND
+.endm
+
+.macro pixfunc2 pfx, name, avg=0
+        pixfunc \pfx, \name, rnd=1, avg=\avg
+        pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
+.endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+        mov r3, #16
+endfunc
+
+        pixfunc put_, pixels16, avg=0
+        pixfunc2 put_, pixels16_x2, avg=0
+        pixfunc2 put_, pixels16_y2, avg=0
+        pixfunc2 put_, pixels16_xy2, avg=0
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+        mov r3, #16
+endfunc
+
+        pixfunc avg_, pixels16, avg=1
+        pixfunc2 avg_, pixels16_x2, avg=1
+        pixfunc2 avg_, pixels16_y2, avg=1
+        pixfunc2 avg_, pixels16_xy2, avg=1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+        mov r3, #8
+endfunc
+
+        pixfunc put_, pixels8, avg=0
+        pixfunc2 put_, pixels8_x2, avg=0
+        pixfunc2 put_, pixels8_y2, avg=0
+        pixfunc2 put_, pixels8_xy2, avg=0
+
+function ff_avg_h264_qpel8_mc00_neon, export=1
+        mov r3, #8
+endfunc
+
+        pixfunc avg_, pixels8, avg=1
+        pixfunc avg_, pixels8_x2, avg=1
+        pixfunc avg_, pixels8_y2, avg=1
+        pixfunc avg_, pixels8_xy2, avg=1
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 09e4576774..9756645a14 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -54,6 +54,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
     hpel_funcs(avg, [3], 2);
     hpel_funcs(avg_no_rnd,, 16);
 
+    if (ARCH_ARM)
+        ff_hpeldsp_init_arm(c, flags);
     if (ARCH_PPC)
         ff_hpeldsp_init_ppc(c, flags);
     if (ARCH_X86)
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index eeb7a1c008..837a11b577 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
 
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
+void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
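
For reference, every kernel in this patch computes one of two byte-wise
averages: put uses (a + b + 1) >> 1 and put_no_rnd uses (a + b) >> 1,
with the xy2 variants applying the corresponding four-tap form shifted
by 2. A minimal scalar model of the horizontal (x2) case follows; the
function name and the no_rnd parameter are illustrative only and not
part of the patch:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of ff_put_pixels8_x2 (rounded) and its no_rnd
 * counterpart. The ARMv6 code computes the same values four bytes at a
 * time with uhadd8/uadd8; the NEON code uses vrhadd.u8/vhadd.u8 on
 * 8 or 16 bytes at a time. */
static void put_pixels8_x2_ref(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h, int no_rnd)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++) {
            int a = pixels[j];
            int b = pixels[j + 1];               /* right neighbour */
            block[j] = (a + b + (no_rnd ? 0 : 1)) >> 1;
        }
        pixels += line_size;
        block  += line_size;
    }
}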