1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-13 21:28:01 +02:00

avfilter/dctdnoiz: use 32-bit (float) operations instead of 64-bit (double) for DCTs

This makes the code about 1.5x faster without any noticeable difference
in the output.
This commit is contained in:
Clément Bœsch 2014-08-08 19:59:15 +02:00
parent eb16a6d229
commit 1ba7c6ead2

View File

@@ -102,20 +102,20 @@ static void av_always_inline fdct8_1d(float *dst, const float *src,
const float x09 = x01 + x02;
const float x0a = x00 - x03;
const float x0b = x01 - x02;
const float x0c = 1.38703984532215*x04 + 0.275899379282943*x07;
const float x0d = 1.17587560241936*x05 + 0.785694958387102*x06;
const float x0e = -0.785694958387102*x05 + 1.17587560241936*x06;
const float x0f = 0.275899379282943*x04 - 1.38703984532215*x07;
const float x10 = 0.353553390593274 * (x0c - x0d);
const float x11 = 0.353553390593274 * (x0e - x0f);
dst[0*dst_stridea] = 0.353553390593274 * (x08 + x09);
dst[1*dst_stridea] = 0.353553390593274 * (x0c + x0d);
dst[2*dst_stridea] = 0.461939766255643*x0a + 0.191341716182545*x0b;
dst[3*dst_stridea] = 0.707106781186547 * (x10 - x11);
dst[4*dst_stridea] = 0.353553390593274 * (x08 - x09);
dst[5*dst_stridea] = 0.707106781186547 * (x10 + x11);
dst[6*dst_stridea] = 0.191341716182545*x0a - 0.461939766255643*x0b;
dst[7*dst_stridea] = 0.353553390593274 * (x0e + x0f);
const float x0c = 1.38703984532215f*x04 + 0.275899379282943f*x07;
const float x0d = 1.17587560241936f*x05 + 0.785694958387102f*x06;
const float x0e = -0.785694958387102f*x05 + 1.17587560241936f*x06;
const float x0f = 0.275899379282943f*x04 - 1.38703984532215f*x07;
const float x10 = 0.353553390593274f * (x0c - x0d);
const float x11 = 0.353553390593274f * (x0e - x0f);
dst[0*dst_stridea] = 0.353553390593274f * (x08 + x09);
dst[1*dst_stridea] = 0.353553390593274f * (x0c + x0d);
dst[2*dst_stridea] = 0.461939766255643f*x0a + 0.191341716182545f*x0b;
dst[3*dst_stridea] = 0.707106781186547f * (x10 - x11);
dst[4*dst_stridea] = 0.353553390593274f * (x08 - x09);
dst[5*dst_stridea] = 0.707106781186547f * (x10 + x11);
dst[6*dst_stridea] = 0.191341716182545f*x0a - 0.461939766255643f*x0b;
dst[7*dst_stridea] = 0.353553390593274f * (x0e + x0f);
dst += dst_strideb;
src += src_strideb;
}
@@ -129,37 +129,37 @@ static void av_always_inline idct8_1d(float *dst, const float *src,
int i;
for (i = 0; i < 8; i++) {
const float x00 = 1.4142135623731*src[0*src_stridea];
const float x01 = 1.38703984532215*src[1*src_stridea] + 0.275899379282943*src[7*src_stridea];
const float x02 = 1.30656296487638*src[2*src_stridea] + 0.541196100146197*src[6*src_stridea];
const float x03 = 1.17587560241936*src[3*src_stridea] + 0.785694958387102*src[5*src_stridea];
const float x04 = 1.4142135623731*src[4*src_stridea];
const float x05 = -0.785694958387102*src[3*src_stridea] + 1.17587560241936*src[5*src_stridea];
const float x06 = 0.541196100146197*src[2*src_stridea] - 1.30656296487638*src[6*src_stridea];
const float x07 = -0.275899379282943*src[1*src_stridea] + 1.38703984532215*src[7*src_stridea];
const float x00 = 1.4142135623731f *src[0*src_stridea];
const float x01 = 1.38703984532215f *src[1*src_stridea] + 0.275899379282943f*src[7*src_stridea];
const float x02 = 1.30656296487638f *src[2*src_stridea] + 0.541196100146197f*src[6*src_stridea];
const float x03 = 1.17587560241936f *src[3*src_stridea] + 0.785694958387102f*src[5*src_stridea];
const float x04 = 1.4142135623731f *src[4*src_stridea];
const float x05 = -0.785694958387102f*src[3*src_stridea] + 1.17587560241936f*src[5*src_stridea];
const float x06 = 0.541196100146197f*src[2*src_stridea] - 1.30656296487638f*src[6*src_stridea];
const float x07 = -0.275899379282943f*src[1*src_stridea] + 1.38703984532215f*src[7*src_stridea];
const float x09 = x00 + x04;
const float x0a = x01 + x03;
const float x0b = 1.4142135623731*x02;
const float x0b = 1.4142135623731f*x02;
const float x0c = x00 - x04;
const float x0d = x01 - x03;
const float x0e = 0.353553390593274 * (x09 - x0b);
const float x0f = 0.353553390593274 * (x0c + x0d);
const float x10 = 0.353553390593274 * (x0c - x0d);
const float x11 = 1.4142135623731*x06;
const float x0e = 0.353553390593274f * (x09 - x0b);
const float x0f = 0.353553390593274f * (x0c + x0d);
const float x10 = 0.353553390593274f * (x0c - x0d);
const float x11 = 1.4142135623731f*x06;
const float x12 = x05 + x07;
const float x13 = x05 - x07;
const float x14 = 0.353553390593274 * (x11 + x12);
const float x15 = 0.353553390593274 * (x11 - x12);
const float x16 = 0.5*x13;
const float x14 = 0.353553390593274f * (x11 + x12);
const float x15 = 0.353553390593274f * (x11 - x12);
const float x16 = 0.5f*x13;
const float x08 = -x15;
dst[0*dst_stridea] = (add ? dst[ 0*dst_stridea] : 0) + 0.25 * (x09 + x0b) + 0.353553390593274*x0a;
dst[1*dst_stridea] = (add ? dst[ 1*dst_stridea] : 0) + 0.707106781186547 * (x0f - x08);
dst[2*dst_stridea] = (add ? dst[ 2*dst_stridea] : 0) + 0.707106781186547 * (x0f + x08);
dst[3*dst_stridea] = (add ? dst[ 3*dst_stridea] : 0) + 0.707106781186547 * (x0e + x16);
dst[4*dst_stridea] = (add ? dst[ 4*dst_stridea] : 0) + 0.707106781186547 * (x0e - x16);
dst[5*dst_stridea] = (add ? dst[ 5*dst_stridea] : 0) + 0.707106781186547 * (x10 - x14);
dst[6*dst_stridea] = (add ? dst[ 6*dst_stridea] : 0) + 0.707106781186547 * (x10 + x14);
dst[7*dst_stridea] = (add ? dst[ 7*dst_stridea] : 0) + 0.25 * (x09 + x0b) - 0.353553390593274*x0a;
dst[0*dst_stridea] = (add ? dst[ 0*dst_stridea] : 0) + 0.25f * (x09 + x0b) + 0.353553390593274f*x0a;
dst[1*dst_stridea] = (add ? dst[ 1*dst_stridea] : 0) + 0.707106781186547f * (x0f - x08);
dst[2*dst_stridea] = (add ? dst[ 2*dst_stridea] : 0) + 0.707106781186547f * (x0f + x08);
dst[3*dst_stridea] = (add ? dst[ 3*dst_stridea] : 0) + 0.707106781186547f * (x0e + x16);
dst[4*dst_stridea] = (add ? dst[ 4*dst_stridea] : 0) + 0.707106781186547f * (x0e - x16);
dst[5*dst_stridea] = (add ? dst[ 5*dst_stridea] : 0) + 0.707106781186547f * (x10 - x14);
dst[6*dst_stridea] = (add ? dst[ 6*dst_stridea] : 0) + 0.707106781186547f * (x10 + x14);
dst[7*dst_stridea] = (add ? dst[ 7*dst_stridea] : 0) + 0.25f * (x09 + x0b) - 0.353553390593274f*x0a;
dst += dst_strideb;
src += src_strideb;
}
@@ -201,50 +201,50 @@ static void av_always_inline fdct16_1d(float *dst, const float *src,
const float x19 = x11 + x12;
const float x1a = x10 - x13;
const float x1b = x11 - x12;
const float x1c = 1.38703984532215*x14 + 0.275899379282943*x17;
const float x1d = 1.17587560241936*x15 + 0.785694958387102*x16;
const float x1e = -0.785694958387102*x15 + 1.17587560241936*x16;
const float x1f = 0.275899379282943*x14 - 1.38703984532215*x17;
const float x20 = 0.25 * (x1c - x1d);
const float x21 = 0.25 * (x1e - x1f);
const float x22 = 1.40740373752638*x08 + 0.138617169199091*x0f;
const float x23 = 1.35331800117435*x09 + 0.410524527522357*x0e;
const float x24 = 1.24722501298667*x0a + 0.666655658477747*x0d;
const float x25 = 1.09320186700176*x0b + 0.897167586342636*x0c;
const float x26 = -0.897167586342636*x0b + 1.09320186700176*x0c;
const float x27 = 0.666655658477747*x0a - 1.24722501298667*x0d;
const float x28 = -0.410524527522357*x09 + 1.35331800117435*x0e;
const float x29 = 0.138617169199091*x08 - 1.40740373752638*x0f;
const float x1c = 1.38703984532215f*x14 + 0.275899379282943f*x17;
const float x1d = 1.17587560241936f*x15 + 0.785694958387102f*x16;
const float x1e = -0.785694958387102f*x15 + 1.17587560241936f *x16;
const float x1f = 0.275899379282943f*x14 - 1.38703984532215f *x17;
const float x20 = 0.25f * (x1c - x1d);
const float x21 = 0.25f * (x1e - x1f);
const float x22 = 1.40740373752638f *x08 + 0.138617169199091f*x0f;
const float x23 = 1.35331800117435f *x09 + 0.410524527522357f*x0e;
const float x24 = 1.24722501298667f *x0a + 0.666655658477747f*x0d;
const float x25 = 1.09320186700176f *x0b + 0.897167586342636f*x0c;
const float x26 = -0.897167586342636f*x0b + 1.09320186700176f *x0c;
const float x27 = 0.666655658477747f*x0a - 1.24722501298667f *x0d;
const float x28 = -0.410524527522357f*x09 + 1.35331800117435f *x0e;
const float x29 = 0.138617169199091f*x08 - 1.40740373752638f *x0f;
const float x2a = x22 + x25;
const float x2b = x23 + x24;
const float x2c = x22 - x25;
const float x2d = x23 - x24;
const float x2e = 0.25 * (x2a - x2b);
const float x2f = 0.326640741219094*x2c + 0.135299025036549*x2d;
const float x30 = 0.135299025036549*x2c - 0.326640741219094*x2d;
const float x2e = 0.25f * (x2a - x2b);
const float x2f = 0.326640741219094f*x2c + 0.135299025036549f*x2d;
const float x30 = 0.135299025036549f*x2c - 0.326640741219094f*x2d;
const float x31 = x26 + x29;
const float x32 = x27 + x28;
const float x33 = x26 - x29;
const float x34 = x27 - x28;
const float x35 = 0.25 * (x31 - x32);
const float x36 = 0.326640741219094*x33 + 0.135299025036549*x34;
const float x37 = 0.135299025036549*x33 - 0.326640741219094*x34;
dst[ 0*dst_stridea] = 0.25 * (x18 + x19);
dst[ 1*dst_stridea] = 0.25 * (x2a + x2b);
dst[ 2*dst_stridea] = 0.25 * (x1c + x1d);
dst[ 3*dst_stridea] = 0.707106781186547 * (x2f - x37);
dst[ 4*dst_stridea] = 0.326640741219094*x1a + 0.135299025036549*x1b;
dst[ 5*dst_stridea] = 0.707106781186547 * (x2f + x37);
dst[ 6*dst_stridea] = 0.707106781186547 * (x20 - x21);
dst[ 7*dst_stridea] = 0.707106781186547 * (x2e + x35);
dst[ 8*dst_stridea] = 0.25 * (x18 - x19);
dst[ 9*dst_stridea] = 0.707106781186547 * (x2e - x35);
dst[10*dst_stridea] = 0.707106781186547 * (x20 + x21);
dst[11*dst_stridea] = 0.707106781186547 * (x30 - x36);
dst[12*dst_stridea] = 0.135299025036549*x1a - 0.326640741219094*x1b;
dst[13*dst_stridea] = 0.707106781186547 * (x30 + x36);
dst[14*dst_stridea] = 0.25 * (x1e + x1f);
dst[15*dst_stridea] = 0.25 * (x31 + x32);
const float x35 = 0.25f * (x31 - x32);
const float x36 = 0.326640741219094f*x33 + 0.135299025036549f*x34;
const float x37 = 0.135299025036549f*x33 - 0.326640741219094f*x34;
dst[ 0*dst_stridea] = 0.25f * (x18 + x19);
dst[ 1*dst_stridea] = 0.25f * (x2a + x2b);
dst[ 2*dst_stridea] = 0.25f * (x1c + x1d);
dst[ 3*dst_stridea] = 0.707106781186547f * (x2f - x37);
dst[ 4*dst_stridea] = 0.326640741219094f*x1a + 0.135299025036549f*x1b;
dst[ 5*dst_stridea] = 0.707106781186547f * (x2f + x37);
dst[ 6*dst_stridea] = 0.707106781186547f * (x20 - x21);
dst[ 7*dst_stridea] = 0.707106781186547f * (x2e + x35);
dst[ 8*dst_stridea] = 0.25f * (x18 - x19);
dst[ 9*dst_stridea] = 0.707106781186547f * (x2e - x35);
dst[10*dst_stridea] = 0.707106781186547f * (x20 + x21);
dst[11*dst_stridea] = 0.707106781186547f * (x30 - x36);
dst[12*dst_stridea] = 0.135299025036549f*x1a - 0.326640741219094f*x1b;
dst[13*dst_stridea] = 0.707106781186547f * (x30 + x36);
dst[14*dst_stridea] = 0.25f * (x1e + x1f);
dst[15*dst_stridea] = 0.25f * (x31 + x32);
dst += dst_strideb;
src += src_strideb;
}
@@ -258,91 +258,91 @@ static void av_always_inline idct16_1d(float *dst, const float *src,
int i;
for (i = 0; i < 16; i++) {
const float x00 = 1.4142135623731 *src[ 0*src_stridea];
const float x01 = 1.40740373752638 *src[ 1*src_stridea] + 0.138617169199091*src[15*src_stridea];
const float x02 = 1.38703984532215 *src[ 2*src_stridea] + 0.275899379282943*src[14*src_stridea];
const float x03 = 1.35331800117435 *src[ 3*src_stridea] + 0.410524527522357*src[13*src_stridea];
const float x04 = 1.30656296487638 *src[ 4*src_stridea] + 0.541196100146197*src[12*src_stridea];
const float x05 = 1.24722501298667 *src[ 5*src_stridea] + 0.666655658477747*src[11*src_stridea];
const float x06 = 1.17587560241936 *src[ 6*src_stridea] + 0.785694958387102*src[10*src_stridea];
const float x07 = 1.09320186700176 *src[ 7*src_stridea] + 0.897167586342636*src[ 9*src_stridea];
const float x08 = 1.4142135623731 *src[ 8*src_stridea];
const float x09 = -0.897167586342636*src[ 7*src_stridea] + 1.09320186700176*src[ 9*src_stridea];
const float x0a = 0.785694958387102*src[ 6*src_stridea] - 1.17587560241936*src[10*src_stridea];
const float x0b = -0.666655658477747*src[ 5*src_stridea] + 1.24722501298667*src[11*src_stridea];
const float x0c = 0.541196100146197*src[ 4*src_stridea] - 1.30656296487638*src[12*src_stridea];
const float x0d = -0.410524527522357*src[ 3*src_stridea] + 1.35331800117435*src[13*src_stridea];
const float x0e = 0.275899379282943*src[ 2*src_stridea] - 1.38703984532215*src[14*src_stridea];
const float x0f = -0.138617169199091*src[ 1*src_stridea] + 1.40740373752638*src[15*src_stridea];
const float x00 = 1.4142135623731f *src[ 0*src_stridea];
const float x01 = 1.40740373752638f *src[ 1*src_stridea] + 0.138617169199091f*src[15*src_stridea];
const float x02 = 1.38703984532215f *src[ 2*src_stridea] + 0.275899379282943f*src[14*src_stridea];
const float x03 = 1.35331800117435f *src[ 3*src_stridea] + 0.410524527522357f*src[13*src_stridea];
const float x04 = 1.30656296487638f *src[ 4*src_stridea] + 0.541196100146197f*src[12*src_stridea];
const float x05 = 1.24722501298667f *src[ 5*src_stridea] + 0.666655658477747f*src[11*src_stridea];
const float x06 = 1.17587560241936f *src[ 6*src_stridea] + 0.785694958387102f*src[10*src_stridea];
const float x07 = 1.09320186700176f *src[ 7*src_stridea] + 0.897167586342636f*src[ 9*src_stridea];
const float x08 = 1.4142135623731f *src[ 8*src_stridea];
const float x09 = -0.897167586342636f*src[ 7*src_stridea] + 1.09320186700176f*src[ 9*src_stridea];
const float x0a = 0.785694958387102f*src[ 6*src_stridea] - 1.17587560241936f*src[10*src_stridea];
const float x0b = -0.666655658477747f*src[ 5*src_stridea] + 1.24722501298667f*src[11*src_stridea];
const float x0c = 0.541196100146197f*src[ 4*src_stridea] - 1.30656296487638f*src[12*src_stridea];
const float x0d = -0.410524527522357f*src[ 3*src_stridea] + 1.35331800117435f*src[13*src_stridea];
const float x0e = 0.275899379282943f*src[ 2*src_stridea] - 1.38703984532215f*src[14*src_stridea];
const float x0f = -0.138617169199091f*src[ 1*src_stridea] + 1.40740373752638f*src[15*src_stridea];
const float x12 = x00 + x08;
const float x13 = x01 + x07;
const float x14 = x02 + x06;
const float x15 = x03 + x05;
const float x16 = 1.4142135623731*x04;
const float x16 = 1.4142135623731f*x04;
const float x17 = x00 - x08;
const float x18 = x01 - x07;
const float x19 = x02 - x06;
const float x1a = x03 - x05;
const float x1d = x12 + x16;
const float x1e = x13 + x15;
const float x1f = 1.4142135623731*x14;
const float x1f = 1.4142135623731f*x14;
const float x20 = x12 - x16;
const float x21 = x13 - x15;
const float x22 = 0.25 * (x1d - x1f);
const float x23 = 0.25 * (x20 + x21);
const float x24 = 0.25 * (x20 - x21);
const float x25 = 1.4142135623731*x17;
const float x26 = 1.30656296487638*x18 + 0.541196100146197*x1a;
const float x27 = 1.4142135623731*x19;
const float x28 = -0.541196100146197*x18 + 1.30656296487638*x1a;
const float x29 = 0.176776695296637 * (x25 + x27) + 0.25*x26;
const float x2a = 0.25 * (x25 - x27);
const float x2b = 0.176776695296637 * (x25 + x27) - 0.25*x26;
const float x2c = 0.353553390593274*x28;
const float x1b = 0.707106781186547 * (x2a - x2c);
const float x1c = 0.707106781186547 * (x2a + x2c);
const float x2d = 1.4142135623731*x0c;
const float x22 = 0.25f * (x1d - x1f);
const float x23 = 0.25f * (x20 + x21);
const float x24 = 0.25f * (x20 - x21);
const float x25 = 1.4142135623731f*x17;
const float x26 = 1.30656296487638f*x18 + 0.541196100146197f*x1a;
const float x27 = 1.4142135623731f*x19;
const float x28 = -0.541196100146197f*x18 + 1.30656296487638f*x1a;
const float x29 = 0.176776695296637f * (x25 + x27) + 0.25f*x26;
const float x2a = 0.25f * (x25 - x27);
const float x2b = 0.176776695296637f * (x25 + x27) - 0.25f*x26;
const float x2c = 0.353553390593274f*x28;
const float x1b = 0.707106781186547f * (x2a - x2c);
const float x1c = 0.707106781186547f * (x2a + x2c);
const float x2d = 1.4142135623731f*x0c;
const float x2e = x0b + x0d;
const float x2f = x0a + x0e;
const float x30 = x09 + x0f;
const float x31 = x09 - x0f;
const float x32 = x0a - x0e;
const float x33 = x0b - x0d;
const float x37 = 1.4142135623731*x2d;
const float x38 = 1.30656296487638*x2e + 0.541196100146197*x30;
const float x39 = 1.4142135623731*x2f;
const float x3a = -0.541196100146197*x2e + 1.30656296487638*x30;
const float x3b = 0.176776695296637 * (x37 + x39) + 0.25*x38;
const float x3c = 0.25 * (x37 - x39);
const float x3d = 0.176776695296637 * (x37 + x39) - 0.25*x38;
const float x3e = 0.353553390593274*x3a;
const float x34 = 0.707106781186547 * (x3c - x3e);
const float x35 = 0.707106781186547 * (x3c + x3e);
const float x3f = 1.4142135623731*x32;
const float x37 = 1.4142135623731f*x2d;
const float x38 = 1.30656296487638f*x2e + 0.541196100146197f*x30;
const float x39 = 1.4142135623731f*x2f;
const float x3a = -0.541196100146197f*x2e + 1.30656296487638f*x30;
const float x3b = 0.176776695296637f * (x37 + x39) + 0.25f*x38;
const float x3c = 0.25f * (x37 - x39);
const float x3d = 0.176776695296637f * (x37 + x39) - 0.25f*x38;
const float x3e = 0.353553390593274f*x3a;
const float x34 = 0.707106781186547f * (x3c - x3e);
const float x35 = 0.707106781186547f * (x3c + x3e);
const float x3f = 1.4142135623731f*x32;
const float x40 = x31 + x33;
const float x41 = x31 - x33;
const float x42 = 0.25 * (x3f + x40);
const float x43 = 0.25 * (x3f - x40);
const float x44 = 0.353553390593274*x41;
const float x42 = 0.25f * (x3f + x40);
const float x43 = 0.25f * (x3f - x40);
const float x44 = 0.353553390593274f*x41;
const float x36 = -x43;
const float x10 = -x34;
const float x11 = -x3d;
dst[ 0*dst_stridea] = (add ? dst[ 0*dst_stridea] : 0) + 0.176776695296637 * (x1d + x1f) + 0.25*x1e;
dst[ 1*dst_stridea] = (add ? dst[ 1*dst_stridea] : 0) + 0.707106781186547 * (x29 - x11);
dst[ 2*dst_stridea] = (add ? dst[ 2*dst_stridea] : 0) + 0.707106781186547 * (x29 + x11);
dst[ 3*dst_stridea] = (add ? dst[ 3*dst_stridea] : 0) + 0.707106781186547 * (x23 + x36);
dst[ 4*dst_stridea] = (add ? dst[ 4*dst_stridea] : 0) + 0.707106781186547 * (x23 - x36);
dst[ 5*dst_stridea] = (add ? dst[ 5*dst_stridea] : 0) + 0.707106781186547 * (x1b - x35);
dst[ 6*dst_stridea] = (add ? dst[ 6*dst_stridea] : 0) + 0.707106781186547 * (x1b + x35);
dst[ 7*dst_stridea] = (add ? dst[ 7*dst_stridea] : 0) + 0.707106781186547 * (x22 + x44);
dst[ 8*dst_stridea] = (add ? dst[ 8*dst_stridea] : 0) + 0.707106781186547 * (x22 - x44);
dst[ 9*dst_stridea] = (add ? dst[ 9*dst_stridea] : 0) + 0.707106781186547 * (x1c - x10);
dst[10*dst_stridea] = (add ? dst[10*dst_stridea] : 0) + 0.707106781186547 * (x1c + x10);
dst[11*dst_stridea] = (add ? dst[11*dst_stridea] : 0) + 0.707106781186547 * (x24 + x42);
dst[12*dst_stridea] = (add ? dst[12*dst_stridea] : 0) + 0.707106781186547 * (x24 - x42);
dst[13*dst_stridea] = (add ? dst[13*dst_stridea] : 0) + 0.707106781186547 * (x2b - x3b);
dst[14*dst_stridea] = (add ? dst[14*dst_stridea] : 0) + 0.707106781186547 * (x2b + x3b);
dst[15*dst_stridea] = (add ? dst[15*dst_stridea] : 0) + 0.176776695296637 * (x1d + x1f) - 0.25*x1e;
dst[ 0*dst_stridea] = (add ? dst[ 0*dst_stridea] : 0) + 0.176776695296637f * (x1d + x1f) + 0.25f*x1e;
dst[ 1*dst_stridea] = (add ? dst[ 1*dst_stridea] : 0) + 0.707106781186547f * (x29 - x11);
dst[ 2*dst_stridea] = (add ? dst[ 2*dst_stridea] : 0) + 0.707106781186547f * (x29 + x11);
dst[ 3*dst_stridea] = (add ? dst[ 3*dst_stridea] : 0) + 0.707106781186547f * (x23 + x36);
dst[ 4*dst_stridea] = (add ? dst[ 4*dst_stridea] : 0) + 0.707106781186547f * (x23 - x36);
dst[ 5*dst_stridea] = (add ? dst[ 5*dst_stridea] : 0) + 0.707106781186547f * (x1b - x35);
dst[ 6*dst_stridea] = (add ? dst[ 6*dst_stridea] : 0) + 0.707106781186547f * (x1b + x35);
dst[ 7*dst_stridea] = (add ? dst[ 7*dst_stridea] : 0) + 0.707106781186547f * (x22 + x44);
dst[ 8*dst_stridea] = (add ? dst[ 8*dst_stridea] : 0) + 0.707106781186547f * (x22 - x44);
dst[ 9*dst_stridea] = (add ? dst[ 9*dst_stridea] : 0) + 0.707106781186547f * (x1c - x10);
dst[10*dst_stridea] = (add ? dst[10*dst_stridea] : 0) + 0.707106781186547f * (x1c + x10);
dst[11*dst_stridea] = (add ? dst[11*dst_stridea] : 0) + 0.707106781186547f * (x24 + x42);
dst[12*dst_stridea] = (add ? dst[12*dst_stridea] : 0) + 0.707106781186547f * (x24 - x42);
dst[13*dst_stridea] = (add ? dst[13*dst_stridea] : 0) + 0.707106781186547f * (x2b - x3b);
dst[14*dst_stridea] = (add ? dst[14*dst_stridea] : 0) + 0.707106781186547f * (x2b + x3b);
dst[15*dst_stridea] = (add ? dst[15*dst_stridea] : 0) + 0.176776695296637f * (x1d + x1f) - 0.25f*x1e;
dst += dst_strideb;
src += src_strideb;
}