diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
index 2fd6c74d6c..7d4457306f 100644
--- a/tests/checkasm/vc1dsp.c
+++ b/tests/checkasm/vc1dsp.c
@@ -30,12 +30,208 @@
 #include "libavutil/mem_internal.h"
 
 #define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
+#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
 
 typedef struct {
     const char *name;
     size_t offset;
+    int width;
+    int height;
 } test;
 
+typedef struct matrix {
+    size_t width;
+    size_t height;
+    float d[];
+} matrix;
+
+static const matrix T8 = { 8, 8, {
+    12,  12,  12,  12,  12,  12,  12,  12,
+    16,  15,   9,   4,  -4,  -9, -15, -16,
+    16,   6,  -6, -16, -16,  -6,   6,  16,
+    15,  -4, -16,  -9,   9,  16,   4, -15,
+    12, -12, -12,  12,  12, -12, -12,  12,
+     9, -16,   4,  15, -15,  -4,  16,  -9,
+     6, -16,  16,  -6,  -6,  16, -16,   6,
+     4,  -9,  15, -16,  16, -15,   9,  -4
+} };
+
+static const matrix T4 = { 4, 4, {
+    17,  17,  17,  17,
+    22,  10, -10, -22,
+    17, -17, -17,  17,
+    10, -22,  22, -10
+} };
+
+static const matrix T8t = { 8, 8, {
+    12,  16,  16,  15,  12,   9,   6,   4,
+    12,  15,   6,  -4, -12, -16, -16,  -9,
+    12,   9,  -6, -16, -12,   4,  16,  15,
+    12,   4, -16,  -9,  12,  15,  -6, -16,
+    12,  -4, -16,   9,  12, -15,  -6,  16,
+    12,  -9,  -6,  16, -12,  -4,  16, -15,
+    12, -15,   6,   4, -12,  16, -16,   9,
+    12, -16,  16, -15,  12,  -9,   6,  -4
+} };
+
+static const matrix T4t = { 4, 4, {
+    17,  22,  17,  10,
+    17,  10, -17, -22,
+    17, -10, -17,  22,
+    17, -22,  17, -10
+} };
+
+static matrix *new_matrix(size_t width, size_t height)
+{
+    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
+    if (out == NULL) {
+        fprintf(stderr, "Memory allocation failure\n");
+        exit(EXIT_FAILURE);
+    }
+    out->width = width;
+    out->height = height;
+    return out;
+}
+
+static matrix *multiply(const matrix *a, const matrix *b)
+{
+    matrix *out;
+    if (a->width != b->height) {
+        fprintf(stderr, "Incompatible multiplication\n");
+        exit(EXIT_FAILURE);
+    }
+    out = new_matrix(b->width, a->height);
+    for (int j = 0; j < out->height; ++j)
+        for (int i = 0; i < out->width; ++i) {
+            float sum = 0;
+            for (int k = 0; k < a->width; ++k)
+                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
+            out->d[j * out->width + i] = sum;
+        }
+    return out;
+}
+
+static void normalise(matrix *a)
+{
+    for (int j = 0; j < a->height; ++j)
+        for (int i = 0; i < a->width; ++i) {
+            float *p = a->d + j * a->width + i;
+            *p *= 64;
+            if (a->height == 4)
+                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
+            else
+                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
+            if (a->width == 4)
+                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
+            else
+                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
+        }
+}
+
+static void divide_and_round_nearest(matrix *a, float by)
+{
+    for (int j = 0; j < a->height; ++j)
+        for (int i = 0; i < a->width; ++i) {
+            float *p = a->d + j * a->width + i;
+            *p = rintf(*p / by);
+        }
+}
+
+static void tweak(matrix *a)
+{
+    for (int j = 4; j < a->height; ++j)
+        for (int i = 0; i < a->width; ++i) {
+            float *p = a->d + j * a->width + i;
+            *p += 1;
+        }
+}
+
+/* The VC-1 spec places restrictions on the values permitted at three
+ * different stages:
+ * - D: the input coefficients in frequency domain
+ * - E: the intermediate coefficients, inverse-transformed only horizontally
+ * - R: the fully inverse-transformed coefficients
+ *
+ * Catering fully for the specified ranges requires various intermediate
+ * values to be held to 17-bit precision, yet these extremes do not
+ * appear to be exercised by real-world
+ * streams. At least some assembly implementations have chosen to restrict
+ * those values to 16-bit precision, trading strict spec adherence for
+ * faster decoding of real-world streams. To avoid marking such
+ * implementations as failures, reduce the range of our random inputs.
+ */
+#define ATTENUATION 4
+
+static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
+{
+    matrix *raw, *tmp, *D, *E, *R;
+    raw = new_matrix(width, height);
+    for (int i = 0; i < width * height; ++i)
+        raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
+    tmp = multiply(height == 8 ? &T8 : &T4, raw);
+    D = multiply(tmp, width == 8 ? &T8t : &T4t);
+    normalise(D);
+    divide_and_round_nearest(D, 1);
+    for (int i = 0; i < width * height; ++i) {
+        if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
+            /* Rare, so simply try again */
+            av_free(raw);
+            av_free(tmp);
+            av_free(D);
+            return generate_inverse_quantized_transform_coefficients(width, height);
+        }
+    }
+    E = multiply(D, width == 8 ? &T8 : &T4);
+    divide_and_round_nearest(E, 8);
+    for (int i = 0; i < width * height; ++i)
+        if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
+            /* Rare, so simply try again */
+            av_free(raw);
+            av_free(tmp);
+            av_free(D);
+            av_free(E);
+            return generate_inverse_quantized_transform_coefficients(width, height);
+        }
+    R = multiply(height == 8 ? &T8t : &T4t, E);
+    tweak(R);
+    divide_and_round_nearest(R, 128);
+    for (int i = 0; i < width * height; ++i)
+        if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
+            /* Rare, so simply try again */
+            av_free(raw);
+            av_free(tmp);
+            av_free(D);
+            av_free(E);
+            av_free(R);
+            return generate_inverse_quantized_transform_coefficients(width, height);
+        }
+    av_free(raw);
+    av_free(tmp);
+    av_free(E);
+    av_free(R);
+    return D;
+}
+
+#define RANDOMIZE_BUFFER16(name, size)      \
+    do {                                    \
+        int i;                              \
+        for (i = 0; i < size; ++i) {        \
+            uint16_t r = rnd();             \
+            AV_WN16A(name##0 + i, r);       \
+            AV_WN16A(name##1 + i, r);       \
+        }                                   \
+    } while (0)
+
+#define RANDOMIZE_BUFFER8(name, size)       \
+    do {                                    \
+        int i;                              \
+        for (i = 0; i < size; ++i) {        \
+            uint8_t r = rnd();              \
+            name##0[i] = r;                 \
+            name##1[i] = r;                 \
+        }                                   \
+    } while (0)
+
 #define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
     do {                                            \
         uint8_t *p##0 = name##0, *p##1 = name##1;   \
@@ -49,6 +245,89 @@ typedef struct {
         } \
     } while (0)
 
+static void check_inv_trans_inplace(void)
+{
+    /* Inverse transform input coefficients are stored in a 16-bit buffer,
+     * with a row stride of 8 coefficients irrespective of transform size.
+     * vc1_inv_trans_8x8 differs from the others in two ways: its
+     * coefficients are stored in column-major order, and its output is
+     * written back over the input buffer, so oversize it to catch overruns.
+     */
+    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
+    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
+
+    VC1DSPContext h;
+
+    ff_vc1dsp_init(&h);
+
+    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
+        matrix *coeffs;
+        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
+        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
+        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
+        for (int j = 0; j < 8; ++j)
+            for (int i = 0; i < 8; ++i) {
+                int idx = 8 + i * 8 + j;
+                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
+            }
+        call_ref(inv_trans_in0 + 8);
+        call_new(inv_trans_in1 + 8);
+        if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
+            fail();
+        bench_new(inv_trans_in1 + 8);
+        av_free(coeffs);
+    }
+}
+
+static void check_inv_trans_adding(void)
+{
+    /* Inverse transform input coefficients are stored in a 16-bit buffer,
+     * with a row stride of 8 coefficients irrespective of transform size. */
+    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
+    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
+
+    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
+     * added with saturation to an array of unsigned 8-bit values. Oversize
+     * this by 8 samples left and right and one row above and below. */
+    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
+    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
+
+    VC1DSPContext h;
+
+    const test tests[] = {
+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
+    };
+
+    ff_vc1dsp_init(&h);
+
+    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
+        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
+            matrix *coeffs;
+            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
+            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
+            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
+            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
+            for (int j = 0; j < tests[t].height; ++j)
+                for (int i = 0; i < tests[t].width; ++i) {
+                    int idx = j * 8 + i;
+                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
+                }
+            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
+            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
+            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
+                fail();
+            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
+            av_free(coeffs);
+        }
+    }
+}
+
 static void check_loop_filter(void)
 {
     /* Deblocking filter buffers are big enough to hold a 16x16 block,
@@ -97,6 +376,10 @@ static void check_loop_filter(void)
 
 void checkasm_check_vc1dsp(void)
 {
+    check_inv_trans_inplace();
+    check_inv_trans_adding();
+    report("inv_trans");
+
     check_loop_filter();
     report("loop_filter");
 }
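
Reviewer notes (outside the diff):

divide_and_round_nearest() calls rintf(), which is declared in <math.h>, and
none of the context shown here includes that header. Unless vc1dsp.c already
pulls it in indirectly, the patch would need a preparatory hunk along these
lines; the placement is an assumption about the include block, not a quote of
the tree:

    /* Assumed addition near the top of tests/checkasm/vc1dsp.c; needed for
     * rintf() unless another header already provides it. */
    #include <math.h>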
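
With ATTENUATION at 4, the range checks in
generate_inverse_quantized_transform_coefficients() shrink the spec's D/E/R
limits of +/-2048, +/-4096 and +/-512 to +/-512, +/-1024 and +/-128: two bits
of headroom, which is what lets 16-bit implementations pass. The float-matrix
model mirrors the integer staging of the real transform: a horizontal pass
E = (D.T + 4) >> 3, then a vertical pass R = (Tt.E + 64 + C) >> 7, where C is
1 for the lower four rows of an 8-point vertical transform (the tweak()
helper) and 0 otherwise. A sketch of the 4x4 case in integer C, for
orientation only -- this is not a drop-in for ff_vc1_inv_trans_4x4_c():

    #include <stdint.h>

    static void inv_trans_4x4_sketch(const int16_t D[16], int16_t R[16])
    {
        static const int T4[4][4] = {
            { 17,  17,  17,  17 },
            { 22,  10, -10, -22 },
            { 17, -17, -17,  17 },
            { 10, -22,  22, -10 },
        };
        int E[16];

        /* Horizontal pass: E = (D . T4 + 4) >> 3 -- the "E" stage whose
         * range the generator checks against +/-4096/ATTENUATION. */
        for (int j = 0; j < 4; ++j)
            for (int i = 0; i < 4; ++i) {
                int sum = 4;
                for (int k = 0; k < 4; ++k)
                    sum += D[j * 4 + k] * T4[k][i];
                E[j * 4 + i] = sum >> 3;
            }

        /* Vertical pass: R = (T4t . E + 64) >> 7.  An 8-point vertical
         * transform would add a further +1 on its lower four rows, which
         * is what tweak() models. */
        for (int j = 0; j < 4; ++j)
            for (int i = 0; i < 4; ++i) {
                int sum = 64;
                for (int k = 0; k < 4; ++k)
                    sum += T4[k][j] * E[k * 4 + i];
                R[j * 4 + i] = sum >> 7;
            }
    }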
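
The table walk in check_inv_trans_adding() relies on every tests[] entry
naming a VC1DSPContext member with the same signature, so the offset recorded
by VC1DSP_SIZED_TEST() is enough to recover a callable pointer. Reduced to
its essentials (hypothetical Ctx/hello names; it inherits the same formally
non-portable void * to function pointer conversion the patch itself uses):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        int pad;                     /* unrelated members shift the offset */
        void (*hello)(const char *); /* every walked member shares this type */
    } Ctx;

    static void hello_impl(const char *who)
    {
        printf("hello, %s\n", who);
    }

    int main(void)
    {
        Ctx ctx = { 0, hello_impl };
        size_t off = offsetof(Ctx, hello);

        /* The same recovery trick as the tests[] loop in the patch. */
        void (*fn)(const char *) = *(void **)((intptr_t) &ctx + off);
        fn("checkasm");
        return 0;
    }

Once applied, the new tests should appear under the vc1dsp checkasm group
(typically run as tests/checkasm/checkasm --test=vc1dsp, with --bench for
timings, though the exact invocation depends on the build setup).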