/*
 * Copyright (c) 2025 Zhao Zhili
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "sao.h"

#include <wasm_simd128.h>

#include "libavcodec/defs.h"
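/* Local copy of HEVC's maximum prediction-block size (64); the edge filters
 * below use it to compute stride_src, the fixed row stride of the source
 * buffer they read from. */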
#define HEVC_MAX_PB_SIZE 64
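/*
 * SAO band filter, 8x8 blocks, 8-bit samples. Two rows are processed per
 * iteration: each pixel's band index (pixel >> 3) selects one of 32 signed
 * offsets, of which only the four signalled ones (starting at
 * sao_left_class) are non-zero.
 */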
void ff_hevc_sao_band_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride_dst,
                                           ptrdiff_t stride_src,
                                           const int16_t *sao_offset_val,
                                           int sao_left_class, int width,
                                           int height)
{
    int8_t offset_table[32] = {0};
    v128_t offset_low, offset_high;

    for (int k = 0; k < 4; k++)
        offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];

    offset_low = wasm_v128_load(offset_table);
    offset_high = wasm_v128_load(&offset_table[16]);

    for (int y = height; y > 0; y -= 2) {
        v128_t src_v, src_high;
        v128_t v0, v1;

        src_v = wasm_v128_load64_zero(src);
        src += stride_src;
        src_v = wasm_v128_load64_lane(src, src_v, 1);
        src += stride_src;
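        /* Band index = pixel >> 3; swizzle looks it up in the low and the
         * high 16 bytes of offset_table, the out-of-range lane returns 0,
         * so OR merges the two lookups. */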
        v0 = wasm_u8x16_shr(src_v, 3);
        v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
        v0 = wasm_i8x16_swizzle(offset_low, v0);
        v1 = wasm_i8x16_swizzle(offset_high, v1);
        v0 = wasm_v128_or(v0, v1);
        src_high = wasm_u16x8_extend_high_u8x16(src_v);
        v1 = wasm_i16x8_extend_high_i8x16(v0);
        src_v = wasm_u16x8_extend_low_u8x16(src_v);
        v0 = wasm_i16x8_extend_low_i8x16(v0);

        v0 = wasm_i16x8_add_sat(src_v, v0);
        v1 = wasm_i16x8_add_sat(src_high, v1);
        v0 = wasm_u8x16_narrow_i16x8(v0, v1);

        wasm_v128_store64_lane(dst, v0, 0);
        dst += stride_dst;
        wasm_v128_store64_lane(dst, v0, 1);
        dst += stride_dst;
    }
}
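/*
 * SAO band filter for wider blocks. Same banding scheme as the 8x8 version
 * above, processing one row per iteration, 16 pixels at a time.
 */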
void ff_hevc_sao_band_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride_dst,
                                             ptrdiff_t stride_src,
                                             const int16_t *sao_offset_val,
                                             int sao_left_class, int width,
                                             int height)
{
    int8_t offset_table[32] = {0};
    v128_t offset_low, offset_high;

    for (int k = 0; k < 4; k++)
        offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];

    offset_low = wasm_v128_load(offset_table);
    offset_high = wasm_v128_load(&offset_table[16]);

    for (int y = height; y > 0; y--) {
        for (int x = 0; x < width; x += 16) {
            v128_t src_v, src_high;
            v128_t v0, v1;

            src_v = wasm_v128_load(&src[x]);

            v0 = wasm_u8x16_shr(src_v, 3);
            v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
            v0 = wasm_i8x16_swizzle(offset_low, v0);
            v1 = wasm_i8x16_swizzle(offset_high, v1);
            v0 = wasm_v128_or(v0, v1);
            src_high = wasm_u16x8_extend_high_u8x16(src_v);
            v1 = wasm_i16x8_extend_high_i8x16(v0);
            src_v = wasm_u16x8_extend_low_u8x16(src_v);
            v0 = wasm_i16x8_extend_low_i8x16(v0);

            v0 = wasm_i16x8_add_sat(src_v, v0);
            v1 = wasm_i16x8_add_sat(src_high, v1);
            v0 = wasm_u8x16_narrow_i16x8(v0, v1);
            wasm_v128_store(&dst[x], v0);
        }

        dst += stride_dst;
        src += stride_src;
    }
}
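/*
 * SAO edge filter, 8x8 blocks, 8-bit samples. For each pixel the two
 * neighbours selected by the eo class (horizontal, vertical, 45 or 135
 * degree) are compared against the centre pixel; the two signs are summed,
 * remapped through edge_idx and used to select one of the five int16_t
 * entries of sao_offset_val. Two rows are processed per iteration.
 */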
void ff_hevc_sao_edge_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride_dst,
                                           const int16_t *sao_offset_val,
                                           int eo, int width, int height)
{
    static const int8_t pos[4][2][2] = {
        { { -1, 0 }, { 1, 0 } }, // horizontal
        { { 0, -1 }, { 0, 1 } }, // vertical
        { { -1, -1 }, { 1, 1 } }, // 45 degree
        { { 1, -1 }, { -1, 1 } }, // 135 degree
    };
    int a_stride, b_stride;
    ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
    const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
                                            4, 0, 0, 0,
                                            0, 0, 0, 0,
                                            0, 0, 0, 0);
    v128_t sao_offset = wasm_v128_load(sao_offset_val);
    v128_t one = wasm_i8x16_const_splat(1);
    v128_t two = wasm_i8x16_const_splat(2);

    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
    for (int y = height; y > 0; y -= 2) {
        v128_t v0, v1, v2;
        v128_t diff0, diff1;

        v0 = wasm_v128_load64_zero(src);
        v1 = wasm_v128_load64_zero(src + a_stride);
        v2 = wasm_v128_load64_zero(src + b_stride);
        src += stride_src;
        v0 = wasm_v128_load64_lane(src, v0, 1);
        v1 = wasm_v128_load64_lane(src + a_stride, v1, 1);
        v2 = wasm_v128_load64_lane(src + b_stride, v2, 1);
        src += stride_src;
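        /* sign(v0 - neighbour) built from two unsigned compares:
         * +1 where v0 > neighbour, -1 where v0 < neighbour, 0 when equal. */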
        diff0 = wasm_u8x16_gt(v0, v1);
        v1 = wasm_u8x16_lt(v0, v1);
        diff0 = wasm_i8x16_sub(v1, diff0);

        diff1 = wasm_u8x16_gt(v0, v2);
        v2 = wasm_u8x16_lt(v0, v2);
        diff1 = wasm_i8x16_sub(v2, diff1);

        v1 = wasm_i8x16_add(diff0, two);
        v1 = wasm_i8x16_add(v1, diff1);
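        /* Map the edge class (sign_a + sign_b + 2) through edge_idx, then
         * expand each class into a pair of byte indices so the matching
         * int16_t offset can be gathered from sao_offset with two swizzles. */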
        v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val
        v1 = wasm_i8x16_shl(v2, 1); // Access int16_t
        v2 = wasm_i8x16_add(v1, one); // Access upper half of int16_t
        diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
                                   20, 5, 21, 6, 22, 7, 23);
        diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
                                   12, 28, 13, 29, 14, 30, 15, 31);
        v1 = wasm_u16x8_extend_high_u8x16(v0);
        v0 = wasm_u16x8_extend_low_u8x16(v0);
        diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
        diff1 = wasm_i8x16_swizzle(sao_offset, diff1);

        v0 = wasm_i16x8_add_sat(v0, diff0);
        v1 = wasm_i16x8_add_sat(v1, diff1);
        v0 = wasm_u8x16_narrow_i16x8(v0, v1);

        wasm_v128_store64_lane(dst, v0, 0);
        dst += stride_dst;
        wasm_v128_store64_lane(dst, v0, 1);
        dst += stride_dst;
    }
}
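/*
 * SAO edge filter for wider blocks. Identical classification and offset
 * gathering to the 8x8 version above, applied one row per iteration,
 * 16 pixels at a time.
 */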
void ff_hevc_sao_edge_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride_dst,
                                             const int16_t *sao_offset_val,
                                             int eo, int width, int height)
{
    static const int8_t pos[4][2][2] = {
        { { -1, 0 }, { 1, 0 } }, // horizontal
        { { 0, -1 }, { 0, 1 } }, // vertical
        { { -1, -1 }, { 1, 1 } }, // 45 degree
        { { 1, -1 }, { -1, 1 } }, // 135 degree
    };
    int a_stride, b_stride;
    ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
    const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
                                            4, 0, 0, 0,
                                            0, 0, 0, 0,
                                            0, 0, 0, 0);
    v128_t sao_offset = wasm_v128_load(sao_offset_val);
    v128_t one = wasm_i8x16_const_splat(1);
    v128_t two = wasm_i8x16_const_splat(2);

    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
    for (int y = height; y > 0; y--) {
        for (int x = 0; x < width; x += 16) {
            v128_t v0, v1, v2;
            v128_t diff0, diff1;

            v0 = wasm_v128_load(&src[x]);
            v1 = wasm_v128_load(&src[x + a_stride]);
            v2 = wasm_v128_load(&src[x + b_stride]);

            diff0 = wasm_u8x16_gt(v0, v1);
            v1 = wasm_u8x16_lt(v0, v1);
            diff0 = wasm_i8x16_sub(v1, diff0);

            diff1 = wasm_u8x16_gt(v0, v2);
            v2 = wasm_u8x16_lt(v0, v2);
            diff1 = wasm_i8x16_sub(v2, diff1);

            v1 = wasm_i8x16_add(diff0, two);
            v1 = wasm_i8x16_add(v1, diff1);

            v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val
            v1 = wasm_i8x16_shl(v2, 1); // Access int16_t
            v2 = wasm_i8x16_add(v1, one); // Access upper half of int16_t
            diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
                                       20, 5, 21, 6, 22, 7, 23);
            diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
                                       12, 28, 13, 29, 14, 30, 15, 31);
            v1 = wasm_u16x8_extend_high_u8x16(v0);
            v0 = wasm_u16x8_extend_low_u8x16(v0);
            diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
            diff1 = wasm_i8x16_swizzle(sao_offset, diff1);

            v0 = wasm_i16x8_add_sat(v0, diff0);
            v1 = wasm_i16x8_add_sat(v1, diff1);
            v0 = wasm_u8x16_narrow_i16x8(v0, v1);
            wasm_v128_store(&dst[x], v0);
        }

        src += stride_src;
        dst += stride_dst;
    }
}