Separate format conversion DSP functions from DSPContext.

This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae)
2025-03-03 14:32:16 +02:00 · 2011-01-30 15:06:46 +00:00 · 2011-01-30 15:06:46 +00:00 · fe2ff6d247
commit fe2ff6d247
parent a35d782d28
32 changed files with 1204 additions and 882 deletions
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@ -12,6 +12,7 @@ OBJS = allcodecs.o                                                      \
       bitstream_filter.o                                               \
       dsputil.o                                                        \
       faanidct.o                                                       \
+       fmtconvert.o                                                     \
       imgconvert.o                                                     \
       jrevdct.o                                                        \
       opt.o                                                            \
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@ -35,6 +35,7 @@
 #include "fft.h"
 #include "mpeg4audio.h"
 #include "sbr.h"
+#include "fmtconvert.h"

 #include <stdint.h>

@ -268,6 +269,7 @@ typedef struct {
    FFTContext mdct;
    FFTContext mdct_small;
    DSPContext dsp;
+    FmtConvertContext fmt_conv;
    int random_state;
    /** @} */

--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@ -85,6 +85,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"
 #include "lpc.h"

 #include "aac.h"
@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
    ff_aac_sbr_init();

    dsputil_init(&ac->dsp, avctx);
+    ff_fmt_convert_init(&ac->fmt_conv, avctx);

    ac->random_state = 0x1f2e3d4c;

@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
    *data_size = data_size_tmp;

    if (samples)
-        ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
+        ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);

    if (ac->output_configured)
        ac->output_configured = OC_LOCKED;
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
    ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
    ff_kbd_window_init(s->window, 5.0, 256);
    dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);
    av_lfg_init(&s->dith_state, 0);

    /* set scale value for float to int16 conversion */
@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
        } else {
            gain *= s->dynamic_range[0];
        }
-        s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+        s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
    }

    /* apply spectral extension to high frequency bins */
@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
            av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
            err = 1;
        }
-        s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
+        s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
        out_samples += 256 * s->out_channels;
    }
    *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@ -55,6 +55,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"

 /* override ac3.h to include coupling channel */
 #undef AC3_MAX_CHANNELS
@ -190,6 +191,7 @@ typedef struct {

 ///@defgroup opt optimization
    DSPContext dsp;                         ///< for optimization
+    FmtConvertContext fmt_conv;             ///< optimized conversion functions
    float mul_bias;                         ///< scaling for float_to_int16 conversion
 ///@}

--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
 OBJS                                   += arm/dsputil_init_arm.o        \
                                          arm/dsputil_arm.o             \
                                          arm/fft_init_arm.o            \
+                                          arm/fmtconvert_init_arm.o     \
                                          arm/jrevdct_arm.o             \
                                          arm/mpegvideo_arm.o           \
                                          arm/simple_idct_arm.o         \
@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6)                     += arm/dsputil_init_armv6.o      \
                                          arm/dsputil_armv6.o           \
                                          arm/simple_idct_armv6.o       \

+VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
+
 OBJS-$(HAVE_ARMVFP)                    += arm/dsputil_vfp.o             \
                                          arm/dsputil_init_vfp.o        \
+                                          $(VFP-OBJS-yes)

 OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
                                          arm/mpegvideo_iwmmxt.o        \
@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp56dsp_neon.o            \

 OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
                                          arm/dsputil_neon.o            \
+                                          arm/fmtconvert_neon.o         \
                                          arm/int_neon.o                \
                                          arm/mpegvideo_neon.o          \
                                          arm/simple_idct_neon.o        \
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
                              int len);
 void ff_butterflies_float_neon(float *v1, float *v2, int len);
 float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
-                                        float mul, int len);
 void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
                                 const float *src1, int len);
 void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,

 void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
                          int len);
-void ff_float_to_int16_neon(int16_t *, const float *, long);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);

 void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);

@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
    c->butterflies_float          = ff_butterflies_float_neon;
    c->scalarproduct_float        = ff_scalarproduct_float_neon;
-    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
    c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
    c->vector_fmul_add            = ff_vector_fmul_add_neon;
    c->vector_clipf               = ff_vector_clipf_neon;
@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
    c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;

-    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-        c->float_to_int16            = ff_float_to_int16_neon;
-        c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
-    }
-
    if (CONFIG_VORBIS_DECODER)
        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;

--- a/libavcodec/arm/dsputil_init_vfp.c
+++ b/libavcodec/arm/dsputil_init_vfp.c
@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
                        const float *src1, int len);
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
                                const float *src1, int len);
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);

 void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
 {
    c->vector_fmul = ff_vector_fmul_vfp;
    c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
-#if HAVE_ARMV6
-    c->float_to_int16 = ff_float_to_int16_vfp;
-#endif
 }
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
        bx              lr
 endfunc

-function ff_float_to_int16_neon, export=1
-        subs            r2,  r2,  #8
-        vld1.64         {d0-d1},  [r1,:128]!
-        vcvt.s32.f32    q8,  q0,  #16
-        vld1.64         {d2-d3},  [r1,:128]!
-        vcvt.s32.f32    q9,  q1,  #16
-        beq             3f
-        bics            ip,  r2,  #15
-        beq             2f
-1:      subs            ip,  ip,  #16
-        vshrn.s32       d4,  q8,  #16
-        vld1.64         {d0-d1},  [r1,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vshrn.s32       d5,  q9,  #16
-        vld1.64         {d2-d3},  [r1,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vshrn.s32       d6,  q0,  #16
-        vst1.64         {d4-d5},  [r0,:128]!
-        vshrn.s32       d7,  q1,  #16
-        vld1.64         {d16-d17},[r1,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r1,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.64         {d6-d7},  [r0,:128]!
-        bne             1b
-        ands            r2,  r2,  #15
-        beq             3f
-2:      vld1.64         {d0-d1},  [r1,:128]!
-        vshrn.s32       d4,  q8,  #16
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r1,:128]!
-        vshrn.s32       d5,  q9,  #16
-        vcvt.s32.f32    q1,  q1,  #16
-        vshrn.s32       d6,  q0,  #16
-        vst1.64         {d4-d5},  [r0,:128]!
-        vshrn.s32       d7,  q1,  #16
-        vst1.64         {d6-d7},  [r0,:128]!
-        bx              lr
-3:      vshrn.s32       d4,  q8,  #16
-        vshrn.s32       d5,  q9,  #16
-        vst1.64         {d4-d5},  [r0,:128]!
-        bx              lr
-endfunc
-
-function ff_float_to_int16_interleave_neon, export=1
-        cmp             r3, #2
-        ldrlt           r1, [r1]
-        blt             ff_float_to_int16_neon
-        bne             4f
-
-        ldr             r3, [r1]
-        ldr             r1, [r1, #4]
-
-        subs            r2,  r2,  #8
-        vld1.64         {d0-d1},  [r3,:128]!
-        vcvt.s32.f32    q8,  q0,  #16
-        vld1.64         {d2-d3},  [r3,:128]!
-        vcvt.s32.f32    q9,  q1,  #16
-        vld1.64         {d20-d21},[r1,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r1,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        beq             3f
-        bics            ip,  r2,  #15
-        beq             2f
-1:      subs            ip,  ip,  #16
-        vld1.64         {d0-d1},  [r3,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vsri.32         q10, q8,  #16
-        vld1.64         {d2-d3},  [r3,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vld1.64         {d24-d25},[r1,:128]!
-        vcvt.s32.f32    q12, q12, #16
-        vld1.64         {d26-d27},[r1,:128]!
-        vsri.32         q11, q9,  #16
-        vst1.64         {d20-d21},[r0,:128]!
-        vcvt.s32.f32    q13, q13, #16
-        vst1.64         {d22-d23},[r0,:128]!
-        vsri.32         q12, q0,  #16
-        vld1.64         {d16-d17},[r3,:128]!
-        vsri.32         q13, q1,  #16
-        vst1.64         {d24-d25},[r0,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r3,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vld1.64         {d20-d21},[r1,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r1,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        vst1.64         {d26-d27},[r0,:128]!
-        bne             1b
-        ands            r2,  r2,  #15
-        beq             3f
-2:      vsri.32         q10, q8,  #16
-        vld1.64         {d0-d1},  [r3,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r3,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vld1.64         {d24-d25},[r1,:128]!
-        vcvt.s32.f32    q12, q12, #16
-        vsri.32         q11, q9,  #16
-        vld1.64         {d26-d27},[r1,:128]!
-        vcvt.s32.f32    q13, q13, #16
-        vst1.64         {d20-d21},[r0,:128]!
-        vsri.32         q12, q0,  #16
-        vst1.64         {d22-d23},[r0,:128]!
-        vsri.32         q13, q1,  #16
-        vst1.64         {d24-d27},[r0,:128]!
-        bx              lr
-3:      vsri.32         q10, q8,  #16
-        vsri.32         q11, q9,  #16
-        vst1.64         {d20-d23},[r0,:128]!
-        bx              lr
-
-4:      push            {r4-r8,lr}
-        cmp             r3,  #4
-        lsl             ip,  r3,  #1
-        blt             4f
-
-        @ 4 channels
-5:      ldmia           r1!, {r4-r7}
-        mov             lr,  r2
-        mov             r8,  r0
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vld1.64         {d20-d21},[r6,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r7,:128]!
-        vcvt.s32.f32    q11, q11, #16
-6:      subs            lr,  lr,  #8
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vsri.32         q9,  q8,  #16
-        vld1.64         {d2-d3},  [r5,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vsri.32         q11, q10, #16
-        vld1.64         {d4-d5},  [r6,:128]!
-        vcvt.s32.f32    q2,  q2,  #16
-        vzip.32         d18, d22
-        vld1.64         {d6-d7},  [r7,:128]!
-        vcvt.s32.f32    q3,  q3,  #16
-        vzip.32         d19, d23
-        vst1.64         {d18},    [r8], ip
-        vsri.32         q1,  q0,  #16
-        vst1.64         {d22},    [r8], ip
-        vsri.32         q3,  q2,  #16
-        vst1.64         {d19},    [r8], ip
-        vzip.32         d2,  d6
-        vst1.64         {d23},    [r8], ip
-        vzip.32         d3,  d7
-        beq             7f
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vst1.64         {d2},     [r8], ip
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.64         {d6},     [r8], ip
-        vld1.64         {d20-d21},[r6,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vst1.64         {d3},     [r8], ip
-        vld1.64         {d22-d23},[r7,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        vst1.64         {d7},     [r8], ip
-        b               6b
-7:      vst1.64         {d2},     [r8], ip
-        vst1.64         {d6},     [r8], ip
-        vst1.64         {d3},     [r8], ip
-        vst1.64         {d7},     [r8], ip
-        subs            r3,  r3,  #4
-        popeq           {r4-r8,pc}
-        cmp             r3,  #4
-        add             r0,  r0,  #8
-        bge             5b
-
-        @ 2 channels
-4:      cmp             r3,  #2
-        blt             4f
-        ldmia           r1!, {r4-r5}
-        mov             lr,  r2
-        mov             r8,  r0
-        tst             lr,  #8
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vld1.64         {d20-d21},[r4,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vld1.64         {d22-d23},[r5,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        beq             6f
-        subs            lr,  lr,  #8
-        beq             7f
-        vsri.32         d18, d16, #16
-        vsri.32         d19, d17, #16
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vst1.32         {d18[0]}, [r8], ip
-        vsri.32         d22, d20, #16
-        vst1.32         {d18[1]}, [r8], ip
-        vsri.32         d23, d21, #16
-        vst1.32         {d19[0]}, [r8], ip
-        vst1.32         {d19[1]}, [r8], ip
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.32         {d22[0]}, [r8], ip
-        vst1.32         {d22[1]}, [r8], ip
-        vld1.64         {d20-d21},[r4,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vst1.32         {d23[0]}, [r8], ip
-        vst1.32         {d23[1]}, [r8], ip
-        vld1.64         {d22-d23},[r5,:128]!
-        vcvt.s32.f32    q11, q11, #16
-6:      subs            lr,  lr,  #16
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vsri.32         d18, d16, #16
-        vld1.64         {d2-d3},  [r5,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        vsri.32         d19, d17, #16
-        vld1.64         {d4-d5},  [r4,:128]!
-        vcvt.s32.f32    q2,  q2,  #16
-        vld1.64         {d6-d7},  [r5,:128]!
-        vcvt.s32.f32    q3,  q3,  #16
-        vst1.32         {d18[0]}, [r8], ip
-        vsri.32         d22, d20, #16
-        vst1.32         {d18[1]}, [r8], ip
-        vsri.32         d23, d21, #16
-        vst1.32         {d19[0]}, [r8], ip
-        vsri.32         d2,  d0,  #16
-        vst1.32         {d19[1]}, [r8], ip
-        vsri.32         d3,  d1,  #16
-        vst1.32         {d22[0]}, [r8], ip
-        vsri.32         d6,  d4,  #16
-        vst1.32         {d22[1]}, [r8], ip
-        vsri.32         d7,  d5,  #16
-        vst1.32         {d23[0]}, [r8], ip
-        vst1.32         {d23[1]}, [r8], ip
-        beq             6f
-        vld1.64         {d16-d17},[r4,:128]!
-        vcvt.s32.f32    q8,  q8,  #16
-        vst1.32         {d2[0]},  [r8], ip
-        vst1.32         {d2[1]},  [r8], ip
-        vld1.64         {d18-d19},[r5,:128]!
-        vcvt.s32.f32    q9,  q9,  #16
-        vst1.32         {d3[0]},  [r8], ip
-        vst1.32         {d3[1]},  [r8], ip
-        vld1.64         {d20-d21},[r4,:128]!
-        vcvt.s32.f32    q10, q10, #16
-        vst1.32         {d6[0]},  [r8], ip
-        vst1.32         {d6[1]},  [r8], ip
-        vld1.64         {d22-d23},[r5,:128]!
-        vcvt.s32.f32    q11, q11, #16
-        vst1.32         {d7[0]},  [r8], ip
-        vst1.32         {d7[1]},  [r8], ip
-        bgt             6b
-6:      vst1.32         {d2[0]},  [r8], ip
-        vst1.32         {d2[1]},  [r8], ip
-        vst1.32         {d3[0]},  [r8], ip
-        vst1.32         {d3[1]},  [r8], ip
-        vst1.32         {d6[0]},  [r8], ip
-        vst1.32         {d6[1]},  [r8], ip
-        vst1.32         {d7[0]},  [r8], ip
-        vst1.32         {d7[1]},  [r8], ip
-        b               8f
-7:      vsri.32         d18, d16, #16
-        vsri.32         d19, d17, #16
-        vst1.32         {d18[0]}, [r8], ip
-        vsri.32         d22, d20, #16
-        vst1.32         {d18[1]}, [r8], ip
-        vsri.32         d23, d21, #16
-        vst1.32         {d19[0]}, [r8], ip
-        vst1.32         {d19[1]}, [r8], ip
-        vst1.32         {d22[0]}, [r8], ip
-        vst1.32         {d22[1]}, [r8], ip
-        vst1.32         {d23[0]}, [r8], ip
-        vst1.32         {d23[1]}, [r8], ip
-8:      subs            r3,  r3,  #2
-        add             r0,  r0,  #4
-        popeq           {r4-r8,pc}
-
-        @ 1 channel
-4:      ldr             r4,  [r1],#4
-        tst             r2,  #8
-        mov             lr,  r2
-        mov             r5,  r0
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r4,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        bne             8f
-6:      subs            lr,  lr,  #16
-        vld1.64         {d4-d5},  [r4,:128]!
-        vcvt.s32.f32    q2,  q2,  #16
-        vld1.64         {d6-d7},  [r4,:128]!
-        vcvt.s32.f32    q3,  q3,  #16
-        vst1.16         {d0[1]},  [r5,:16], ip
-        vst1.16         {d0[3]},  [r5,:16], ip
-        vst1.16         {d1[1]},  [r5,:16], ip
-        vst1.16         {d1[3]},  [r5,:16], ip
-        vst1.16         {d2[1]},  [r5,:16], ip
-        vst1.16         {d2[3]},  [r5,:16], ip
-        vst1.16         {d3[1]},  [r5,:16], ip
-        vst1.16         {d3[3]},  [r5,:16], ip
-        beq             7f
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r4,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-7:      vst1.16         {d4[1]},  [r5,:16], ip
-        vst1.16         {d4[3]},  [r5,:16], ip
-        vst1.16         {d5[1]},  [r5,:16], ip
-        vst1.16         {d5[3]},  [r5,:16], ip
-        vst1.16         {d6[1]},  [r5,:16], ip
-        vst1.16         {d6[3]},  [r5,:16], ip
-        vst1.16         {d7[1]},  [r5,:16], ip
-        vst1.16         {d7[3]},  [r5,:16], ip
-        bgt             6b
-        pop             {r4-r8,pc}
-8:      subs            lr,  lr,  #8
-        vst1.16         {d0[1]},  [r5,:16], ip
-        vst1.16         {d0[3]},  [r5,:16], ip
-        vst1.16         {d1[1]},  [r5,:16], ip
-        vst1.16         {d1[3]},  [r5,:16], ip
-        vst1.16         {d2[1]},  [r5,:16], ip
-        vst1.16         {d2[3]},  [r5,:16], ip
-        vst1.16         {d3[1]},  [r5,:16], ip
-        vst1.16         {d3[3]},  [r5,:16], ip
-        popeq           {r4-r8,pc}
-        vld1.64         {d0-d1},  [r4,:128]!
-        vcvt.s32.f32    q0,  q0,  #16
-        vld1.64         {d2-d3},  [r4,:128]!
-        vcvt.s32.f32    q1,  q1,  #16
-        b               6b
-endfunc
-
 function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
@ -1050,34 +713,6 @@ NOVFP   vmov.32         r0,  d0[0]
        bx              lr
 endfunc

-function ff_int32_to_float_fmul_scalar_neon, export=1
-VFP     vdup.32         q0,  d0[0]
-VFP     len     .req    r2
-NOVFP   vdup.32         q0,  r2
-NOVFP   len     .req    r3
-
-        vld1.32         {q1},[r1,:128]!
-        vcvt.f32.s32    q3,  q1
-        vld1.32         {q2},[r1,:128]!
-        vcvt.f32.s32    q8,  q2
-1:      subs            len, len, #8
-        pld             [r1, #16]
-        vmul.f32        q9,  q3,  q0
-        vmul.f32        q10, q8,  q0
-        beq             2f
-        vld1.32         {q1},[r1,:128]!
-        vcvt.f32.s32    q3,  q1
-        vld1.32         {q2},[r1,:128]!
-        vcvt.f32.s32    q8,  q2
-        vst1.32         {q9}, [r0,:128]!
-        vst1.32         {q10},[r0,:128]!
-        b               1b
-2:      vst1.32         {q9}, [r0,:128]!
-        vst1.32         {q10},[r0,:128]!
-        bx              lr
-        .unreq  len
-endfunc
-
 function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
        vpop            {d8-d15}
        bx              lr
 endfunc
-
-#if HAVE_ARMV6
-/**
- * ARM VFP optimized float to int16 conversion.
- * Assume that len is a positive number and is multiple of 8, destination
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
- * performance), little endian byte sex
- */
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
-function ff_float_to_int16_vfp, export=1
-        push            {r4-r8,lr}
-        vpush           {d8-d11}
-        vldmia          r1!, {s16-s23}
-        vcvt.s32.f32    s0,  s16
-        vcvt.s32.f32    s1,  s17
-        vcvt.s32.f32    s2,  s18
-        vcvt.s32.f32    s3,  s19
-        vcvt.s32.f32    s4,  s20
-        vcvt.s32.f32    s5,  s21
-        vcvt.s32.f32    s6,  s22
-        vcvt.s32.f32    s7,  s23
-1:
-        subs            r2,  r2,  #8
-        vmov            r3,  r4,  s0, s1
-        vmov            r5,  r6,  s2, s3
-        vmov            r7,  r8,  s4, s5
-        vmov            ip,  lr,  s6, s7
-        vldmiagt        r1!, {s16-s23}
-        ssat            r4,  #16, r4
-        ssat            r3,  #16, r3
-        ssat            r6,  #16, r6
-        ssat            r5,  #16, r5
-        pkhbt           r3,  r3,  r4, lsl #16
-        pkhbt           r4,  r5,  r6, lsl #16
-        vcvtgt.s32.f32  s0,  s16
-        vcvtgt.s32.f32  s1,  s17
-        vcvtgt.s32.f32  s2,  s18
-        vcvtgt.s32.f32  s3,  s19
-        vcvtgt.s32.f32  s4,  s20
-        vcvtgt.s32.f32  s5,  s21
-        vcvtgt.s32.f32  s6,  s22
-        vcvtgt.s32.f32  s7,  s23
-        ssat            r8,  #16, r8
-        ssat            r7,  #16, r7
-        ssat            lr,  #16, lr
-        ssat            ip,  #16, ip
-        pkhbt           r5,  r7,  r8, lsl #16
-        pkhbt           r6,  ip,  lr, lsl #16
-        stmia           r0!, {r3-r6}
-        bgt             1b
-
-        vpop            {d8-d11}
-        pop             {r4-r8,pc}
-endfunc
-#endif
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@ -0,0 +1,48 @@
+/*
+ * ARM optimized Format Conversion Utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+                                        float mul, int len);
+
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
+{
+    if (HAVE_ARMVFP && HAVE_ARMV6) {
+        c->float_to_int16 = ff_float_to_int16_vfp;
+    }
+
+    if (HAVE_NEON) {
+        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+            c->float_to_int16            = ff_float_to_int16_neon;
+            c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+        }
+    }
+}
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@ -0,0 +1,391 @@
+/*
+ * ARM NEON optimised Format Conversion Utils
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+        preserve8
+        .text
+
+function ff_float_to_int16_neon, export=1
+        subs            r2,  r2,  #8
+        vld1.64         {d0-d1},  [r1,:128]!
+        vcvt.s32.f32    q8,  q0,  #16
+        vld1.64         {d2-d3},  [r1,:128]!
+        vcvt.s32.f32    q9,  q1,  #16
+        beq             3f
+        bics            ip,  r2,  #15
+        beq             2f
+1:      subs            ip,  ip,  #16
+        vshrn.s32       d4,  q8,  #16
+        vld1.64         {d0-d1},  [r1,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vshrn.s32       d5,  q9,  #16
+        vld1.64         {d2-d3},  [r1,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vshrn.s32       d6,  q0,  #16
+        vst1.64         {d4-d5},  [r0,:128]!
+        vshrn.s32       d7,  q1,  #16
+        vld1.64         {d16-d17},[r1,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r1,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.64         {d6-d7},  [r0,:128]!
+        bne             1b
+        ands            r2,  r2,  #15
+        beq             3f
+2:      vld1.64         {d0-d1},  [r1,:128]!
+        vshrn.s32       d4,  q8,  #16
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r1,:128]!
+        vshrn.s32       d5,  q9,  #16
+        vcvt.s32.f32    q1,  q1,  #16
+        vshrn.s32       d6,  q0,  #16
+        vst1.64         {d4-d5},  [r0,:128]!
+        vshrn.s32       d7,  q1,  #16
+        vst1.64         {d6-d7},  [r0,:128]!
+        bx              lr
+3:      vshrn.s32       d4,  q8,  #16
+        vshrn.s32       d5,  q9,  #16
+        vst1.64         {d4-d5},  [r0,:128]!
+        bx              lr
+endfunc
+
+function ff_float_to_int16_interleave_neon, export=1
+        cmp             r3, #2
+        ldrlt           r1, [r1]
+        blt             ff_float_to_int16_neon
+        bne             4f
+
+        ldr             r3, [r1]
+        ldr             r1, [r1, #4]
+
+        subs            r2,  r2,  #8
+        vld1.64         {d0-d1},  [r3,:128]!
+        vcvt.s32.f32    q8,  q0,  #16
+        vld1.64         {d2-d3},  [r3,:128]!
+        vcvt.s32.f32    q9,  q1,  #16
+        vld1.64         {d20-d21},[r1,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r1,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        beq             3f
+        bics            ip,  r2,  #15
+        beq             2f
+1:      subs            ip,  ip,  #16
+        vld1.64         {d0-d1},  [r3,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vsri.32         q10, q8,  #16
+        vld1.64         {d2-d3},  [r3,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vld1.64         {d24-d25},[r1,:128]!
+        vcvt.s32.f32    q12, q12, #16
+        vld1.64         {d26-d27},[r1,:128]!
+        vsri.32         q11, q9,  #16
+        vst1.64         {d20-d21},[r0,:128]!
+        vcvt.s32.f32    q13, q13, #16
+        vst1.64         {d22-d23},[r0,:128]!
+        vsri.32         q12, q0,  #16
+        vld1.64         {d16-d17},[r3,:128]!
+        vsri.32         q13, q1,  #16
+        vst1.64         {d24-d25},[r0,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r3,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vld1.64         {d20-d21},[r1,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r1,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        vst1.64         {d26-d27},[r0,:128]!
+        bne             1b
+        ands            r2,  r2,  #15
+        beq             3f
+2:      vsri.32         q10, q8,  #16
+        vld1.64         {d0-d1},  [r3,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r3,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vld1.64         {d24-d25},[r1,:128]!
+        vcvt.s32.f32    q12, q12, #16
+        vsri.32         q11, q9,  #16
+        vld1.64         {d26-d27},[r1,:128]!
+        vcvt.s32.f32    q13, q13, #16
+        vst1.64         {d20-d21},[r0,:128]!
+        vsri.32         q12, q0,  #16
+        vst1.64         {d22-d23},[r0,:128]!
+        vsri.32         q13, q1,  #16
+        vst1.64         {d24-d27},[r0,:128]!
+        bx              lr
+3:      vsri.32         q10, q8,  #16
+        vsri.32         q11, q9,  #16
+        vst1.64         {d20-d23},[r0,:128]!
+        bx              lr
+
+4:      push            {r4-r8,lr}
+        cmp             r3,  #4
+        lsl             ip,  r3,  #1
+        blt             4f
+
+        @ 4 channels
+5:      ldmia           r1!, {r4-r7}
+        mov             lr,  r2
+        mov             r8,  r0
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vld1.64         {d20-d21},[r6,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r7,:128]!
+        vcvt.s32.f32    q11, q11, #16
+6:      subs            lr,  lr,  #8
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vsri.32         q9,  q8,  #16
+        vld1.64         {d2-d3},  [r5,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vsri.32         q11, q10, #16
+        vld1.64         {d4-d5},  [r6,:128]!
+        vcvt.s32.f32    q2,  q2,  #16
+        vzip.32         d18, d22
+        vld1.64         {d6-d7},  [r7,:128]!
+        vcvt.s32.f32    q3,  q3,  #16
+        vzip.32         d19, d23
+        vst1.64         {d18},    [r8], ip
+        vsri.32         q1,  q0,  #16
+        vst1.64         {d22},    [r8], ip
+        vsri.32         q3,  q2,  #16
+        vst1.64         {d19},    [r8], ip
+        vzip.32         d2,  d6
+        vst1.64         {d23},    [r8], ip
+        vzip.32         d3,  d7
+        beq             7f
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vst1.64         {d2},     [r8], ip
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.64         {d6},     [r8], ip
+        vld1.64         {d20-d21},[r6,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vst1.64         {d3},     [r8], ip
+        vld1.64         {d22-d23},[r7,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        vst1.64         {d7},     [r8], ip
+        b               6b
+7:      vst1.64         {d2},     [r8], ip
+        vst1.64         {d6},     [r8], ip
+        vst1.64         {d3},     [r8], ip
+        vst1.64         {d7},     [r8], ip
+        subs            r3,  r3,  #4
+        popeq           {r4-r8,pc}
+        cmp             r3,  #4
+        add             r0,  r0,  #8
+        bge             5b
+
+        @ 2 channels
+4:      cmp             r3,  #2
+        blt             4f
+        ldmia           r1!, {r4-r5}
+        mov             lr,  r2
+        mov             r8,  r0
+        tst             lr,  #8
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vld1.64         {d20-d21},[r4,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vld1.64         {d22-d23},[r5,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        beq             6f
+        subs            lr,  lr,  #8
+        beq             7f
+        vsri.32         d18, d16, #16
+        vsri.32         d19, d17, #16
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vst1.32         {d18[0]}, [r8], ip
+        vsri.32         d22, d20, #16
+        vst1.32         {d18[1]}, [r8], ip
+        vsri.32         d23, d21, #16
+        vst1.32         {d19[0]}, [r8], ip
+        vst1.32         {d19[1]}, [r8], ip
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.32         {d22[0]}, [r8], ip
+        vst1.32         {d22[1]}, [r8], ip
+        vld1.64         {d20-d21},[r4,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vst1.32         {d23[0]}, [r8], ip
+        vst1.32         {d23[1]}, [r8], ip
+        vld1.64         {d22-d23},[r5,:128]!
+        vcvt.s32.f32    q11, q11, #16
+6:      subs            lr,  lr,  #16
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vsri.32         d18, d16, #16
+        vld1.64         {d2-d3},  [r5,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        vsri.32         d19, d17, #16
+        vld1.64         {d4-d5},  [r4,:128]!
+        vcvt.s32.f32    q2,  q2,  #16
+        vld1.64         {d6-d7},  [r5,:128]!
+        vcvt.s32.f32    q3,  q3,  #16
+        vst1.32         {d18[0]}, [r8], ip
+        vsri.32         d22, d20, #16
+        vst1.32         {d18[1]}, [r8], ip
+        vsri.32         d23, d21, #16
+        vst1.32         {d19[0]}, [r8], ip
+        vsri.32         d2,  d0,  #16
+        vst1.32         {d19[1]}, [r8], ip
+        vsri.32         d3,  d1,  #16
+        vst1.32         {d22[0]}, [r8], ip
+        vsri.32         d6,  d4,  #16
+        vst1.32         {d22[1]}, [r8], ip
+        vsri.32         d7,  d5,  #16
+        vst1.32         {d23[0]}, [r8], ip
+        vst1.32         {d23[1]}, [r8], ip
+        beq             6f
+        vld1.64         {d16-d17},[r4,:128]!
+        vcvt.s32.f32    q8,  q8,  #16
+        vst1.32         {d2[0]},  [r8], ip
+        vst1.32         {d2[1]},  [r8], ip
+        vld1.64         {d18-d19},[r5,:128]!
+        vcvt.s32.f32    q9,  q9,  #16
+        vst1.32         {d3[0]},  [r8], ip
+        vst1.32         {d3[1]},  [r8], ip
+        vld1.64         {d20-d21},[r4,:128]!
+        vcvt.s32.f32    q10, q10, #16
+        vst1.32         {d6[0]},  [r8], ip
+        vst1.32         {d6[1]},  [r8], ip
+        vld1.64         {d22-d23},[r5,:128]!
+        vcvt.s32.f32    q11, q11, #16
+        vst1.32         {d7[0]},  [r8], ip
+        vst1.32         {d7[1]},  [r8], ip
+        bgt             6b
+6:      vst1.32         {d2[0]},  [r8], ip
+        vst1.32         {d2[1]},  [r8], ip
+        vst1.32         {d3[0]},  [r8], ip
+        vst1.32         {d3[1]},  [r8], ip
+        vst1.32         {d6[0]},  [r8], ip
+        vst1.32         {d6[1]},  [r8], ip
+        vst1.32         {d7[0]},  [r8], ip
+        vst1.32         {d7[1]},  [r8], ip
+        b               8f
+7:      vsri.32         d18, d16, #16
+        vsri.32         d19, d17, #16
+        vst1.32         {d18[0]}, [r8], ip
+        vsri.32         d22, d20, #16
+        vst1.32         {d18[1]}, [r8], ip
+        vsri.32         d23, d21, #16
+        vst1.32         {d19[0]}, [r8], ip
+        vst1.32         {d19[1]}, [r8], ip
+        vst1.32         {d22[0]}, [r8], ip
+        vst1.32         {d22[1]}, [r8], ip
+        vst1.32         {d23[0]}, [r8], ip
+        vst1.32         {d23[1]}, [r8], ip
+8:      subs            r3,  r3,  #2
+        add             r0,  r0,  #4
+        popeq           {r4-r8,pc}
+
+        @ 1 channel
+4:      ldr             r4,  [r1],#4
+        tst             r2,  #8
+        mov             lr,  r2
+        mov             r5,  r0
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r4,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        bne             8f
+6:      subs            lr,  lr,  #16
+        vld1.64         {d4-d5},  [r4,:128]!
+        vcvt.s32.f32    q2,  q2,  #16
+        vld1.64         {d6-d7},  [r4,:128]!
+        vcvt.s32.f32    q3,  q3,  #16
+        vst1.16         {d0[1]},  [r5,:16], ip
+        vst1.16         {d0[3]},  [r5,:16], ip
+        vst1.16         {d1[1]},  [r5,:16], ip
+        vst1.16         {d1[3]},  [r5,:16], ip
+        vst1.16         {d2[1]},  [r5,:16], ip
+        vst1.16         {d2[3]},  [r5,:16], ip
+        vst1.16         {d3[1]},  [r5,:16], ip
+        vst1.16         {d3[3]},  [r5,:16], ip
+        beq             7f
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r4,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+7:      vst1.16         {d4[1]},  [r5,:16], ip
+        vst1.16         {d4[3]},  [r5,:16], ip
+        vst1.16         {d5[1]},  [r5,:16], ip
+        vst1.16         {d5[3]},  [r5,:16], ip
+        vst1.16         {d6[1]},  [r5,:16], ip
+        vst1.16         {d6[3]},  [r5,:16], ip
+        vst1.16         {d7[1]},  [r5,:16], ip
+        vst1.16         {d7[3]},  [r5,:16], ip
+        bgt             6b
+        pop             {r4-r8,pc}
+8:      subs            lr,  lr,  #8
+        vst1.16         {d0[1]},  [r5,:16], ip
+        vst1.16         {d0[3]},  [r5,:16], ip
+        vst1.16         {d1[1]},  [r5,:16], ip
+        vst1.16         {d1[3]},  [r5,:16], ip
+        vst1.16         {d2[1]},  [r5,:16], ip
+        vst1.16         {d2[3]},  [r5,:16], ip
+        vst1.16         {d3[1]},  [r5,:16], ip
+        vst1.16         {d3[3]},  [r5,:16], ip
+        popeq           {r4-r8,pc}
+        vld1.64         {d0-d1},  [r4,:128]!
+        vcvt.s32.f32    q0,  q0,  #16
+        vld1.64         {d2-d3},  [r4,:128]!
+        vcvt.s32.f32    q1,  q1,  #16
+        b               6b
+endfunc
+
+function ff_int32_to_float_fmul_scalar_neon, export=1
+VFP     vdup.32         q0,  d0[0]
+VFP     len     .req    r2
+NOVFP   vdup.32         q0,  r2
+NOVFP   len     .req    r3
+
+        vld1.32         {q1},[r1,:128]!
+        vcvt.f32.s32    q3,  q1
+        vld1.32         {q2},[r1,:128]!
+        vcvt.f32.s32    q8,  q2
+1:      subs            len, len, #8
+        pld             [r1, #16]
+        vmul.f32        q9,  q3,  q0
+        vmul.f32        q10, q8,  q0
+        beq             2f
+        vld1.32         {q1},[r1,:128]!
+        vcvt.f32.s32    q3,  q1
+        vld1.32         {q2},[r1,:128]!
+        vcvt.f32.s32    q8,  q2
+        vst1.32         {q9}, [r0,:128]!
+        vst1.32         {q10},[r0,:128]!
+        b               1b
+2:      vst1.32         {q9}, [r0,:128]!
+        vst1.32         {q10},[r0,:128]!
+        bx              lr
+        .unreq  len
+endfunc
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+        .syntax unified
+
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assume that len is a positive number and is multiple of 8, destination
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for
+ * performance), little endian byte sex
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1
+        push            {r4-r8,lr}
+        vpush           {d8-d11}
+        vldmia          r1!, {s16-s23}
+        vcvt.s32.f32    s0,  s16
+        vcvt.s32.f32    s1,  s17
+        vcvt.s32.f32    s2,  s18
+        vcvt.s32.f32    s3,  s19
+        vcvt.s32.f32    s4,  s20
+        vcvt.s32.f32    s5,  s21
+        vcvt.s32.f32    s6,  s22
+        vcvt.s32.f32    s7,  s23
+1:
+        subs            r2,  r2,  #8
+        vmov            r3,  r4,  s0, s1
+        vmov            r5,  r6,  s2, s3
+        vmov            r7,  r8,  s4, s5
+        vmov            ip,  lr,  s6, s7
+        vldmiagt        r1!, {s16-s23}
+        ssat            r4,  #16, r4
+        ssat            r3,  #16, r3
+        ssat            r6,  #16, r6
+        ssat            r5,  #16, r5
+        pkhbt           r3,  r3,  r4, lsl #16
+        pkhbt           r4,  r5,  r6, lsl #16
+        vcvtgt.s32.f32  s0,  s16
+        vcvtgt.s32.f32  s1,  s17
+        vcvtgt.s32.f32  s2,  s18
+        vcvtgt.s32.f32  s3,  s19
+        vcvtgt.s32.f32  s4,  s20
+        vcvtgt.s32.f32  s5,  s21
+        vcvtgt.s32.f32  s6,  s22
+        vcvtgt.s32.f32  s7,  s23
+        ssat            r8,  #16, r8
+        ssat            r7,  #16, r7
+        ssat            lr,  #16, lr
+        ssat            ip,  #16, ip
+        pkhbt           r5,  r7,  r8, lsl #16
+        pkhbt           r6,  ip,  lr, lsl #16
+        stmia           r0!, {r3-r6}
+        bgt             1b
+
+        vpop            {d8-d11}
+        pop             {r4-r8,pc}
+endfunc
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@ -33,6 +33,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"

 extern const uint16_t ff_wma_critical_freqs[25];

@ -43,6 +44,7 @@ typedef struct {
    AVCodecContext *avctx;
    GetBitContext gb;
    DSPContext dsp;
+    FmtConvertContext fmt_conv;
    int first;
    int channels;
    int frame_len;          ///< transform size (samples)
@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)

    s->avctx = avctx;
    dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);

    /* determine frame length */
    if (avctx->sample_rate < 22050) {
@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
            ff_rdft_calc(&s->trans.rdft, coeffs);
    }

-    s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
+    s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
+                                          s->frame_len, s->channels);

    if (!s->first) {
        int count = s->overlap_len * s->channels;
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@ -40,6 +40,7 @@
 #include "dca.h"
 #include "synth_filter.h"
 #include "dcadsp.h"
+#include "fmtconvert.h"

 //#define TRACE

@ -347,6 +348,7 @@ typedef struct {
    FFTContext imdct;
    SynthFilterContext synth;
    DCADSPContext dcadsp;
+    FmtConvertContext fmt_conv;
 } DCAContext;

 static const uint16_t dca_vlc_offs[] = {
@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
                        block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
                }

-                s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
+                s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
                                                  block, rscale, 8);
            }

@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
            }
        }

-        s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
+        s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
        samples += 256 * channels;
    }

@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
    ff_mdct_init(&s->imdct, 6, 1, 1.0);
    ff_synth_filter_init(&s->synth);
    ff_dcadsp_init(&s->dcadsp);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);

    for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
        s->samples_chanptr[i] = s->samples + i * 256;
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
    return p;
 }

-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = src[i] * mul;
-}
-
 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
 {
@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
    }
 }

-static av_always_inline int float_to_int16_one(const float *src){
-    return av_clip_int16(lrintf(*src));
-}
-
-static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
-    int i;
-    for(i=0; i<len; i++)
-        dst[i] = float_to_int16_one(src+i);
-}
-
-static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
-    int i,j,c;
-    if(channels==2){
-        for(i=0; i<len; i++){
-            dst[2*i]   = float_to_int16_one(src[0]+i);
-            dst[2*i+1] = float_to_int16_one(src[1]+i);
-        }
-    }else{
-        for(c=0; c<channels; c++)
-            for(i=0, j=c; i<len; i++, j+=channels)
-                dst[j] = float_to_int16_one(src[c]+i);
-    }
-}
-
 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
 {
    int res = 0;
@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
-    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->vector_clipf = vector_clipf_c;
-    c->float_to_int16 = ff_float_to_int16_c;
-    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@ -392,7 +392,6 @@ typedef struct DSPContext {
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
-    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
    /**
     * Multiply a vector of floats by a scalar float.  Source and
@ -445,10 +444,6 @@ typedef struct DSPContext {
     */
    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);

-    /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
-    void (*float_to_int16)(int16_t *dst, const float *src, long len);
-    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
-
    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@ -0,0 +1,68 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "fmtconvert.h"
+
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
+    int i;
+    for(i=0; i<len; i++)
+        dst[i] = src[i] * mul;
+}
+
+static av_always_inline int float_to_int16_one(const float *src){
+    return av_clip_int16(lrintf(*src));
+}
+
+static void float_to_int16_c(int16_t *dst, const float *src, long len)
+{
+    int i;
+    for(i=0; i<len; i++)
+        dst[i] = float_to_int16_one(src+i);
+}
+
+static void float_to_int16_interleave_c(int16_t *dst, const float **src,
+                                        long len, int channels)
+{
+    int i,j,c;
+    if(channels==2){
+        for(i=0; i<len; i++){
+            dst[2*i]   = float_to_int16_one(src[0]+i);
+            dst[2*i+1] = float_to_int16_one(src[1]+i);
+        }
+    }else{
+        for(c=0; c<channels; c++)
+            for(i=0, j=c; i<len; i++, j+=channels)
+                dst[j] = float_to_int16_one(src[c]+i);
+    }
+}
+
+av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
+{
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
+    c->float_to_int16             = float_to_int16_c;
+    c->float_to_int16_interleave  = float_to_int16_interleave_c;
+
+    if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
+    if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx);
+    if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
+}
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@ -0,0 +1,79 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FMTCONVERT_H
+#define AVCODEC_FMTCONVERT_H
+
+#include "avcodec.h"
+
+typedef struct FmtConvertContext {
+    /**
+     * Convert an array of int32_t to float and multiply by a float value.
+     * @param dst destination array of float.
+     *            constraints: 16-byte aligned
+     * @param src source array of int32_t.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 8
+     */
+    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
+
+    /**
+     * Convert an array of float to an array of int16_t.
+     *
+     * Convert floats from in the range [-32768.0,32767.0] to ints
+     * without rescaling
+     *
+     * @param dst destination array of int16_t.
+     *            constraints: 16-byte aligned
+     * @param src source array of float.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 8
+     */
+    void (*float_to_int16)(int16_t *dst, const float *src, long len);
+
+    /**
+     * Convert multiple arrays of float to an interleaved array of int16_t.
+     *
+     * Convert floats from in the range [-32768.0,32767.0] to ints
+     * without rescaling
+     *
+     * @param dst destination array of interleaved int16_t.
+     *            constraints: 16-byte aligned
+     * @param src source array of float arrays, one for each channel.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 8
+     * @param channels number of channels
+     */
+    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
+                                      long len, int channels);
+} FmtConvertContext;
+
+void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_FMTCONVERT_H */
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@ -38,6 +38,7 @@
 #include "avcodec.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"

 #define ALT_BITSTREAM_READER_LE
 #include "get_bits.h"
@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
    float           scale_bias;
    DSPContext      dsp;
    FFTContext      imdct_ctx;
+    FmtConvertContext fmt_conv;
    DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
 } NellyMoserDecodeContext;

@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
    ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);

    dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);

    s->scale_bias = 1.0/(1*8);

@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,

    for (i=0 ; i<blocks ; i++) {
        nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
-        s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
+        s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
        *data_size += NELLY_SAMPLES*sizeof(int16_t);
    }

--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT)             += ppc/fft_altivec.o             \
 OBJS-$(HAVE_ALTIVEC)                   += ppc/dsputil_altivec.o         \
                                          ppc/fdct_altivec.o            \
                                          ppc/float_altivec.o           \
+                                          ppc/fmtconvert_altivec.o      \
                                          ppc/gmc_altivec.o             \
                                          ppc/idct_altivec.o            \
                                          ppc/int_altivec.o             \
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
    }
 }

-static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
-{
-    union {
-        vector float v;
-        float s[4];
-    } mul_u;
-    int i;
-    vector float src1, src2, dst1, dst2, mul_v, zero;
-
-    zero = (vector float)vec_splat_u32(0);
-    mul_u.s[0] = mul;
-    mul_v = vec_splat(mul_u.v, 0);
-
-    for(i=0; i<len; i+=8) {
-        src1 = vec_ctf(vec_ld(0,  src+i), 0);
-        src2 = vec_ctf(vec_ld(16, src+i), 0);
-        dst1 = vec_madd(src1, mul_v, zero);
-        dst2 = vec_madd(src2, mul_v, zero);
-        vec_st(dst1,  0, dst+i);
-        vec_st(dst2, 16, dst+i);
-    }
-}
-
-
-static vector signed short
-float_to_int16_one_altivec(const float *src)
-{
-    vector float s0 = vec_ld(0, src);
-    vector float s1 = vec_ld(16, src);
-    vector signed int t0 = vec_cts(s0, 0);
-    vector signed int t1 = vec_cts(s1, 0);
-    return vec_packs(t0,t1);
-}
-
-static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
-{
-    int i;
-    vector signed short d0, d1, d;
-    vector unsigned char align;
-    if(((long)dst)&15) //FIXME
-    for(i=0; i<len-7; i+=8) {
-        d0 = vec_ld(0, dst+i);
-        d = float_to_int16_one_altivec(src+i);
-        d1 = vec_ld(15, dst+i);
-        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
-        align = vec_lvsr(0, dst+i);
-        d0 = vec_perm(d1, d, align);
-        d1 = vec_perm(d, d1, align);
-        vec_st(d0, 0, dst+i);
-        vec_st(d1,15, dst+i);
-    }
-    else
-    for(i=0; i<len-7; i+=8) {
-        d = float_to_int16_one_altivec(src+i);
-        vec_st(d, 0, dst+i);
-    }
-}
-
-static void
-float_to_int16_interleave_altivec(int16_t *dst, const float **src,
-                                  long len, int channels)
-{
-    int i;
-    vector signed short d0, d1, d2, c0, c1, t0, t1;
-    vector unsigned char align;
-    if(channels == 1)
-        float_to_int16_altivec(dst, src[0], len);
-    else
-        if (channels == 2) {
-        if(((long)dst)&15)
-        for(i=0; i<len-7; i+=8) {
-            d0 = vec_ld(0, dst + i);
-            t0 = float_to_int16_one_altivec(src[0] + i);
-            d1 = vec_ld(31, dst + i);
-            t1 = float_to_int16_one_altivec(src[1] + i);
-            c0 = vec_mergeh(t0, t1);
-            c1 = vec_mergel(t0, t1);
-            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
-            align = vec_lvsr(0, dst + i);
-            d0 = vec_perm(d2, c0, align);
-            d1 = vec_perm(c0, c1, align);
-            vec_st(d0,  0, dst + i);
-            d0 = vec_perm(c1, d2, align);
-            vec_st(d1, 15, dst + i);
-            vec_st(d0, 31, dst + i);
-            dst+=8;
-        }
-        else
-        for(i=0; i<len-7; i+=8) {
-            t0 = float_to_int16_one_altivec(src[0] + i);
-            t1 = float_to_int16_one_altivec(src[1] + i);
-            d0 = vec_mergeh(t0, t1);
-            d1 = vec_mergel(t0, t1);
-            vec_st(d0,  0, dst + i);
-            vec_st(d1, 16, dst + i);
-            dst+=8;
-        }
-    } else {
-        DECLARE_ALIGNED(16, int16_t, tmp)[len];
-        int c, j;
-        for (c = 0; c < channels; c++) {
-            float_to_int16_altivec(tmp, src[c], len);
-            for (i = 0, j = c; i < len; i++, j+=channels) {
-                dst[j] = tmp[i];
-            }
-        }
-   }
-}
-
 void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
    c->vector_fmul = vector_fmul_altivec;
    c->vector_fmul_reverse = vector_fmul_reverse_altivec;
    c->vector_fmul_add = vector_fmul_add_altivec;
-    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->vector_fmul_window = vector_fmul_window_altivec;
-        c->float_to_int16 = float_to_int16_altivec;
-        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
    }
 }
--- a/libavcodec/ppc/fmtconvert_altivec.c
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/fmtconvert.h"
+
+#include "dsputil_altivec.h"
+#include "util_altivec.h"
+
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
+{
+    union {
+        vector float v;
+        float s[4];
+    } mul_u;
+    int i;
+    vector float src1, src2, dst1, dst2, mul_v, zero;
+
+    zero = (vector float)vec_splat_u32(0);
+    mul_u.s[0] = mul;
+    mul_v = vec_splat(mul_u.v, 0);
+
+    for(i=0; i<len; i+=8) {
+        src1 = vec_ctf(vec_ld(0,  src+i), 0);
+        src2 = vec_ctf(vec_ld(16, src+i), 0);
+        dst1 = vec_madd(src1, mul_v, zero);
+        dst2 = vec_madd(src2, mul_v, zero);
+        vec_st(dst1,  0, dst+i);
+        vec_st(dst2, 16, dst+i);
+    }
+}
+
+
+static vector signed short
+float_to_int16_one_altivec(const float *src)
+{
+    vector float s0 = vec_ld(0, src);
+    vector float s1 = vec_ld(16, src);
+    vector signed int t0 = vec_cts(s0, 0);
+    vector signed int t1 = vec_cts(s1, 0);
+    return vec_packs(t0,t1);
+}
+
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
+{
+    int i;
+    vector signed short d0, d1, d;
+    vector unsigned char align;
+    if(((long)dst)&15) //FIXME
+    for(i=0; i<len-7; i+=8) {
+        d0 = vec_ld(0, dst+i);
+        d = float_to_int16_one_altivec(src+i);
+        d1 = vec_ld(15, dst+i);
+        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
+        align = vec_lvsr(0, dst+i);
+        d0 = vec_perm(d1, d, align);
+        d1 = vec_perm(d, d1, align);
+        vec_st(d0, 0, dst+i);
+        vec_st(d1,15, dst+i);
+    }
+    else
+    for(i=0; i<len-7; i+=8) {
+        d = float_to_int16_one_altivec(src+i);
+        vec_st(d, 0, dst+i);
+    }
+}
+
+static void
+float_to_int16_interleave_altivec(int16_t *dst, const float **src,
+                                  long len, int channels)
+{
+    int i;
+    vector signed short d0, d1, d2, c0, c1, t0, t1;
+    vector unsigned char align;
+    if(channels == 1)
+        float_to_int16_altivec(dst, src[0], len);
+    else
+        if (channels == 2) {
+        if(((long)dst)&15)
+        for(i=0; i<len-7; i+=8) {
+            d0 = vec_ld(0, dst + i);
+            t0 = float_to_int16_one_altivec(src[0] + i);
+            d1 = vec_ld(31, dst + i);
+            t1 = float_to_int16_one_altivec(src[1] + i);
+            c0 = vec_mergeh(t0, t1);
+            c1 = vec_mergel(t0, t1);
+            d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
+            align = vec_lvsr(0, dst + i);
+            d0 = vec_perm(d2, c0, align);
+            d1 = vec_perm(c0, c1, align);
+            vec_st(d0,  0, dst + i);
+            d0 = vec_perm(c1, d2, align);
+            vec_st(d1, 15, dst + i);
+            vec_st(d0, 31, dst + i);
+            dst+=8;
+        }
+        else
+        for(i=0; i<len-7; i+=8) {
+            t0 = float_to_int16_one_altivec(src[0] + i);
+            t1 = float_to_int16_one_altivec(src[1] + i);
+            d0 = vec_mergeh(t0, t1);
+            d1 = vec_mergel(t0, t1);
+            vec_st(d0,  0, dst + i);
+            vec_st(d1, 16, dst + i);
+            dst+=8;
+        }
+    } else {
+        DECLARE_ALIGNED(16, int16_t, tmp)[len];
+        int c, j;
+        for (c = 0; c < channels; c++) {
+            float_to_int16_altivec(tmp, src[c], len);
+            for (i = 0, j = c; i < len; i++, j+=channels) {
+                dst[j] = tmp[i];
+            }
+        }
+   }
+}
+
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
+{
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
+    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+        c->float_to_int16 = float_to_int16_altivec;
+        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
+    }
+}
--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@ -31,6 +31,7 @@
 #include "get_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"

 #include "vorbis.h"
 #include "xiph.h"
@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
    AVCodecContext *avccontext;
    GetBitContext gb;
    DSPContext dsp;
+    FmtConvertContext fmt_conv;

    FFTContext mdct[2];
    uint_fast8_t  first_frame;
@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)

    vc->avccontext = avccontext;
    dsputil_init(&vc->dsp, avccontext);
+    ff_fmt_convert_init(&vc->fmt_conv, avccontext);

    vc->scale_bias = 32768.0f;

@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
                              len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
    }

-    vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
+    vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
+                                           vc->audio_channels);
    *data_size = len * 2 * vc->audio_channels;

    return buf_size ;
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
    s->block_align = avctx->block_align;

    dsputil_init(&s->dsp, avctx);
+    ff_fmt_convert_init(&s->fmt_conv, avctx);

    if (avctx->codec->id == CODEC_ID_WMAV1) {
        s->version = 1;
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@ -26,6 +26,7 @@
 #include "put_bits.h"
 #include "dsputil.h"
 #include "fft.h"
+#include "fmtconvert.h"

 /* size of blocks */
 #define BLOCK_MIN_BITS 7
@ -134,6 +135,7 @@ typedef struct WMACodecContext {
    float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
    float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
    DSPContext dsp;
+    FmtConvertContext fmt_conv;

 #ifdef TRACE
    int frame_count;
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
    incr = s->nb_channels;
    for (ch = 0; ch < MAX_CHANNELS; ch++)
        output[ch] = s->frame_out[ch];
-    s->dsp.float_to_int16_interleave(samples, output, n, incr);
+    s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
    for (ch = 0; ch < incr; ch++) {
        /* prepare for next block */
        memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
 MMX-OBJS-$(CONFIG_VP8_DECODER)         += x86/vp8dsp-init.o
 MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
                                          x86/deinterlace.o             \
+                                          x86/fmtconvert.o              \
                                          x86/h264_chromamc.o           \
                                          $(YASM-OBJS-yes)

@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
 OBJS-$(HAVE_MMX)                       += x86/dnxhd_mmx.o               \
                                          x86/dsputil_mmx.o             \
                                          x86/fdct_mmx.o                \
+                                          x86/fmtconvert_mmx.o          \
                                          x86/idct_mmx_xvid.o           \
                                          x86/idct_sse2_xvid.o          \
                                          x86/motion_est_mmx.o          \
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
 }
 #endif /* HAVE_6REGS */

-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtpi2ps   (%2,%0), %%xmm0 \n"
-        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
-        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
-        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
-        "movlhps  %%xmm1,    %%xmm0 \n"
-        "movlhps  %%xmm3,    %%xmm2 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm2 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm2, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtdq2ps   (%2,%0), %%xmm0 \n"
-        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm1 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm1, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
                             int len)
 {
@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
    );
 }

-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    // not bit-exact: pf2id uses different rounding than C and SSE
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
-        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
-        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
-        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
-        "packssdw   %%mm1       , %%mm0     \n\t"
-        "packssdw   %%mm3       , %%mm2     \n\t"
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        "femms                              \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
-        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
-        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
-        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
-        "packssdw   %%mm1       , %%mm0     \n\t"
-        "packssdw   %%mm3       , %%mm2     \n\t"
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        "emms                               \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
-
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
-        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
-        "packssdw   %%xmm1      , %%xmm0    \n\t"
-        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
-
 void ff_vp3_idct_mmx(int16_t *input_data);
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);

-#if !HAVE_YASM
-#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
-
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
-    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
-    int i,j,c;\
-    for(c=0; c<channels; c++){\
-        float_to_int16_##cpu(tmp, src[c], len);\
-        for(i=0, j=c; i<len; i++, j+=channels)\
-            dst[j] = tmp[i];\
-    }\
-}\
-\
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
-    if(channels==1)\
-        float_to_int16_##cpu(dst, src[0], len);\
-    else if(channels==2){\
-        x86_reg reglen = len; \
-        const float *src0 = src[0];\
-        const float *src1 = src[1];\
-        __asm__ volatile(\
-            "shl $2, %0 \n"\
-            "add %0, %1 \n"\
-            "add %0, %2 \n"\
-            "add %0, %3 \n"\
-            "neg %0 \n"\
-            body\
-            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
-        );\
-    }else if(channels==6){\
-        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
-    }else\
-        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
-}
-
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
-    "1:                         \n"
-    "pf2id     (%2,%0), %%mm0   \n"
-    "pf2id    8(%2,%0), %%mm1   \n"
-    "pf2id     (%3,%0), %%mm2   \n"
-    "pf2id    8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "femms                      \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
-    "1:                         \n"
-    "cvtps2pi  (%2,%0), %%mm0   \n"
-    "cvtps2pi 8(%2,%0), %%mm1   \n"
-    "cvtps2pi  (%3,%0), %%mm2   \n"
-    "cvtps2pi 8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "emms                       \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
-    "1:                         \n"
-    "cvtps2dq  (%2,%0), %%xmm0  \n"
-    "cvtps2dq  (%3,%0), %%xmm1  \n"
-    "packssdw   %%xmm1, %%xmm0  \n"
-    "movhlps    %%xmm0, %%xmm1  \n"
-    "punpcklwd  %%xmm1, %%xmm0  \n"
-    "movdqa     %%xmm0, (%1,%0) \n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-)
-
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
-    if(channels==6)
-        ff_float_to_int16_interleave6_3dn2(dst, src, len);
-    else
-        float_to_int16_interleave_3dnow(dst, src, len, channels);
-}
-
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        if(mm_flags & AV_CPU_FLAG_3DNOW){
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
            c->vector_fmul = vector_fmul_3dnow;
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-                c->float_to_int16 = float_to_int16_3dnow;
-                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
-            }
        }
        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
 #if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_3dnow2;
 #endif
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
-            }
        }
        if(mm_flags & AV_CPU_FLAG_MMX2){
 #if HAVE_YASM
@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_6REGS
            c->vector_fmul_window = vector_fmul_window_sse;
 #endif
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->vector_clipf = vector_clipf_sse;
-            c->float_to_int16 = float_to_int16_sse;
-            c->float_to_int16_interleave = float_to_int16_interleave_sse;
 #if HAVE_YASM
            c->scalarproduct_float = ff_scalarproduct_float_sse;
 #endif
@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        if(mm_flags & AV_CPU_FLAG_3DNOW)
            c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
        if(mm_flags & AV_CPU_FLAG_SSE2){
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-            c->float_to_int16 = float_to_int16_sse2;
-            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 #if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

 section .text align=16

-%macro PSWAPD_SSE 2
-    pshufw %1, %2, 0x4e
-%endmacro
-%macro PSWAPD_3DN1 2
-    movq  %1, %2
-    psrlq %1, 32
-    punpckldq %1, %2
-%endmacro
-
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
-%ifdef ARCH_X86_64
-    %define lend r10d
-    mov     lend, r2d
-%else
-    %define lend dword r2m
-%endif
-    mov src1q, [srcq+1*gprsize]
-    mov src2q, [srcq+2*gprsize]
-    mov src3q, [srcq+3*gprsize]
-    mov src4q, [srcq+4*gprsize]
-    mov src5q, [srcq+5*gprsize]
-    mov srcq,  [srcq]
-    sub src1q, srcq
-    sub src2q, srcq
-    sub src3q, srcq
-    sub src4q, srcq
-    sub src5q, srcq
-.loop:
-    cvtps2pi   mm0, [srcq]
-    cvtps2pi   mm1, [srcq+src1q]
-    cvtps2pi   mm2, [srcq+src2q]
-    cvtps2pi   mm3, [srcq+src3q]
-    cvtps2pi   mm4, [srcq+src4q]
-    cvtps2pi   mm5, [srcq+src5q]
-    packssdw   mm0, mm3
-    packssdw   mm1, mm4
-    packssdw   mm2, mm5
-    pswapd     mm3, mm0
-    punpcklwd  mm0, mm1
-    punpckhwd  mm1, mm2
-    punpcklwd  mm2, mm3
-    pswapd     mm3, mm0
-    punpckldq  mm0, mm2
-    punpckhdq  mm2, mm1
-    punpckldq  mm1, mm3
-    movq [dstq   ], mm0
-    movq [dstq+16], mm2
-    movq [dstq+ 8], mm1
-    add srcq, 8
-    add dstq, 24
-    sub lend, 2
-    jg .loop
-    emms
-    RET
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
-
-%define pswapd PSWAPD_SSE
-FLOAT_TO_INT16_INTERLEAVE6 sse
-%define cvtps2pi pf2id
-%define pswapd PSWAPD_3DN1
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
-%undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
-%undef cvtps2pi
-
-
-
 %macro SCALARPRODUCT 1
 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@ -0,0 +1,91 @@
+;******************************************************************************
+;* x86 optimized Format Conversion Utils
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+section .text align=16
+
+%macro PSWAPD_SSE 2
+    pshufw %1, %2, 0x4e
+%endmacro
+%macro PSWAPD_3DN1 2
+    movq  %1, %2
+    psrlq %1, 32
+    punpckldq %1, %2
+%endmacro
+
+%macro FLOAT_TO_INT16_INTERLEAVE6 1
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+    %define lend r10d
+    mov     lend, r2d
+%else
+    %define lend dword r2m
+%endif
+    mov src1q, [srcq+1*gprsize]
+    mov src2q, [srcq+2*gprsize]
+    mov src3q, [srcq+3*gprsize]
+    mov src4q, [srcq+4*gprsize]
+    mov src5q, [srcq+5*gprsize]
+    mov srcq,  [srcq]
+    sub src1q, srcq
+    sub src2q, srcq
+    sub src3q, srcq
+    sub src4q, srcq
+    sub src5q, srcq
+.loop:
+    cvtps2pi   mm0, [srcq]
+    cvtps2pi   mm1, [srcq+src1q]
+    cvtps2pi   mm2, [srcq+src2q]
+    cvtps2pi   mm3, [srcq+src3q]
+    cvtps2pi   mm4, [srcq+src4q]
+    cvtps2pi   mm5, [srcq+src5q]
+    packssdw   mm0, mm3
+    packssdw   mm1, mm4
+    packssdw   mm2, mm5
+    pswapd     mm3, mm0
+    punpcklwd  mm0, mm1
+    punpckhwd  mm1, mm2
+    punpcklwd  mm2, mm3
+    pswapd     mm3, mm0
+    punpckldq  mm0, mm2
+    punpckhdq  mm2, mm1
+    punpckldq  mm1, mm3
+    movq [dstq   ], mm0
+    movq [dstq+16], mm2
+    movq [dstq+ 8], mm1
+    add srcq, 8
+    add dstq, 24
+    sub lend, 2
+    jg .loop
+    emms
+    RET
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+
+%define pswapd PSWAPD_SSE
+FLOAT_TO_INT16_INTERLEAVE6 sse
+%define cvtps2pi pf2id
+%define pswapd PSWAPD_3DN1
+FLOAT_TO_INT16_INTERLEAVE6 3dnow
+%undef pswapd
+FLOAT_TO_INT16_INTERLEAVE6 3dn2
+%undef cvtps2pi
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@ -0,0 +1,266 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/fmtconvert.h"
+
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
+{
+    x86_reg i = -4*len;
+    __asm__ volatile(
+        "movss  %3, %%xmm4 \n"
+        "shufps $0, %%xmm4, %%xmm4 \n"
+        "1: \n"
+        "cvtpi2ps   (%2,%0), %%xmm0 \n"
+        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
+        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
+        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
+        "movlhps  %%xmm1,    %%xmm0 \n"
+        "movlhps  %%xmm3,    %%xmm2 \n"
+        "mulps    %%xmm4,    %%xmm0 \n"
+        "mulps    %%xmm4,    %%xmm2 \n"
+        "movaps   %%xmm0,   (%1,%0) \n"
+        "movaps   %%xmm2, 16(%1,%0) \n"
+        "add $32, %0 \n"
+        "jl 1b \n"
+        :"+r"(i)
+        :"r"(dst+len), "r"(src+len), "m"(mul)
+    );
+}
+
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
+{
+    x86_reg i = -4*len;
+    __asm__ volatile(
+        "movss  %3, %%xmm4 \n"
+        "shufps $0, %%xmm4, %%xmm4 \n"
+        "1: \n"
+        "cvtdq2ps   (%2,%0), %%xmm0 \n"
+        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
+        "mulps    %%xmm4,    %%xmm0 \n"
+        "mulps    %%xmm4,    %%xmm1 \n"
+        "movaps   %%xmm0,   (%1,%0) \n"
+        "movaps   %%xmm1, 16(%1,%0) \n"
+        "add $32, %0 \n"
+        "jl 1b \n"
+        :"+r"(i)
+        :"r"(dst+len), "r"(src+len), "m"(mul)
+    );
+}
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
+    x86_reg reglen = len;
+    // not bit-exact: pf2id uses different rounding than C and SSE
+    __asm__ volatile(
+        "add        %0          , %0        \n\t"
+        "lea         (%2,%0,2)  , %2        \n\t"
+        "add        %0          , %1        \n\t"
+        "neg        %0                      \n\t"
+        "1:                                 \n\t"
+        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
+        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
+        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
+        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
+        "packssdw   %%mm1       , %%mm0     \n\t"
+        "packssdw   %%mm3       , %%mm2     \n\t"
+        "movq       %%mm0       ,  (%1,%0)  \n\t"
+        "movq       %%mm2       , 8(%1,%0)  \n\t"
+        "add        $16         , %0        \n\t"
+        " js 1b                             \n\t"
+        "femms                              \n\t"
+        :"+r"(reglen), "+r"(dst), "+r"(src)
+    );
+}
+
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){
+    x86_reg reglen = len;
+    __asm__ volatile(
+        "add        %0          , %0        \n\t"
+        "lea         (%2,%0,2)  , %2        \n\t"
+        "add        %0          , %1        \n\t"
+        "neg        %0                      \n\t"
+        "1:                                 \n\t"
+        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
+        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
+        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
+        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
+        "packssdw   %%mm1       , %%mm0     \n\t"
+        "packssdw   %%mm3       , %%mm2     \n\t"
+        "movq       %%mm0       ,  (%1,%0)  \n\t"
+        "movq       %%mm2       , 8(%1,%0)  \n\t"
+        "add        $16         , %0        \n\t"
+        " js 1b                             \n\t"
+        "emms                               \n\t"
+        :"+r"(reglen), "+r"(dst), "+r"(src)
+    );
+}
+
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
+    x86_reg reglen = len;
+    __asm__ volatile(
+        "add        %0          , %0        \n\t"
+        "lea         (%2,%0,2)  , %2        \n\t"
+        "add        %0          , %1        \n\t"
+        "neg        %0                      \n\t"
+        "1:                                 \n\t"
+        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
+        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
+        "packssdw   %%xmm1      , %%xmm0    \n\t"
+        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
+        "add        $16         , %0        \n\t"
+        " js 1b                             \n\t"
+        :"+r"(reglen), "+r"(dst), "+r"(src)
+    );
+}
+
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+
+#if !HAVE_YASM
+#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#endif
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
+
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
+    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
+    int i,j,c;\
+    for(c=0; c<channels; c++){\
+        float_to_int16_##cpu(tmp, src[c], len);\
+        for(i=0, j=c; i<len; i++, j+=channels)\
+            dst[j] = tmp[i];\
+    }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
+    if(channels==1)\
+        float_to_int16_##cpu(dst, src[0], len);\
+    else if(channels==2){\
+        x86_reg reglen = len; \
+        const float *src0 = src[0];\
+        const float *src1 = src[1];\
+        __asm__ volatile(\
+            "shl $2, %0 \n"\
+            "add %0, %1 \n"\
+            "add %0, %2 \n"\
+            "add %0, %3 \n"\
+            "neg %0 \n"\
+            body\
+            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
+        );\
+    }else if(channels==6){\
+        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
+    }else\
+        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
+}
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow,
+    "1:                         \n"
+    "pf2id     (%2,%0), %%mm0   \n"
+    "pf2id    8(%2,%0), %%mm1   \n"
+    "pf2id     (%3,%0), %%mm2   \n"
+    "pf2id    8(%3,%0), %%mm3   \n"
+    "packssdw    %%mm1, %%mm0   \n"
+    "packssdw    %%mm3, %%mm2   \n"
+    "movq        %%mm0, %%mm1   \n"
+    "punpcklwd   %%mm2, %%mm0   \n"
+    "punpckhwd   %%mm2, %%mm1   \n"
+    "movq        %%mm0,  (%1,%0)\n"
+    "movq        %%mm1, 8(%1,%0)\n"
+    "add $16, %0                \n"
+    "js 1b                      \n"
+    "femms                      \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse,
+    "1:                         \n"
+    "cvtps2pi  (%2,%0), %%mm0   \n"
+    "cvtps2pi 8(%2,%0), %%mm1   \n"
+    "cvtps2pi  (%3,%0), %%mm2   \n"
+    "cvtps2pi 8(%3,%0), %%mm3   \n"
+    "packssdw    %%mm1, %%mm0   \n"
+    "packssdw    %%mm3, %%mm2   \n"
+    "movq        %%mm0, %%mm1   \n"
+    "punpcklwd   %%mm2, %%mm0   \n"
+    "punpckhwd   %%mm2, %%mm1   \n"
+    "movq        %%mm0,  (%1,%0)\n"
+    "movq        %%mm1, 8(%1,%0)\n"
+    "add $16, %0                \n"
+    "js 1b                      \n"
+    "emms                       \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2,
+    "1:                         \n"
+    "cvtps2dq  (%2,%0), %%xmm0  \n"
+    "cvtps2dq  (%3,%0), %%xmm1  \n"
+    "packssdw   %%xmm1, %%xmm0  \n"
+    "movhlps    %%xmm0, %%xmm1  \n"
+    "punpcklwd  %%xmm1, %%xmm0  \n"
+    "movdqa     %%xmm0, (%1,%0) \n"
+    "add $16, %0                \n"
+    "js 1b                      \n"
+)
+
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
+    if(channels==6)
+        ff_float_to_int16_interleave6_3dn2(dst, src, len);
+    else
+        float_to_int16_interleave_3dnow(dst, src, len, channels);
+}
+
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
+{
+    int mm_flags = av_get_cpu_flags();
+
+    if (mm_flags & AV_CPU_FLAG_MMX) {
+
+        if(mm_flags & AV_CPU_FLAG_3DNOW){
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+                c->float_to_int16 = float_to_int16_3dnow;
+                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+            }
+        }
+        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
+            }
+        }
+        if(mm_flags & AV_CPU_FLAG_SSE){
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
+            c->float_to_int16 = float_to_int16_sse;
+            c->float_to_int16_interleave = float_to_int16_interleave_sse;
+        }
+        if(mm_flags & AV_CPU_FLAG_SSE2){
+            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
+            c->float_to_int16 = float_to_int16_sse2;
+            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
+        }
+    }
+}