1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-08 13:22:53 +02:00

Add gradfun filter, ported from MPlayer.

Patch by Nolan L nol888 <=> gmail >=< com.

See thread:
Subject: [FFmpeg-devel] [PATCH] Port gradfun to libavfilter (GCI)
Date: Mon, 29 Nov 2010 07:18:14 -0500

Originally committed as revision 25942 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Nolan L 2010-12-12 17:59:10 +00:00 committed by Stefano Sabatini
parent 9d845ca40c
commit d5f187fd33
9 changed files with 498 additions and 2 deletions

View File

@ -64,6 +64,7 @@ version <next>:
- hqdn3d filter added - hqdn3d filter added
- RTP depacketization of QCELP - RTP depacketization of QCELP
- FLAC parser added - FLAC parser added
- gradfun filter added
version 0.6: version 0.6:

View File

@ -425,6 +425,35 @@ frei0r=perspective:0.2/0.2:0.8/0.2
For more information see: For more information see:
@url{http://piksel.org/frei0r} @url{http://piksel.org/frei0r}
@section gradfun
Fix the banding artifacts that are sometimes introduced into nearly flat
regions by truncation to 8bit colordepth.
Interpolate the gradients that should go where the bands are, and
dither them.
The filter takes two optional parameters, separated by ':':
@var{strength}:@var{radius}
@var{strength} is the maximum amount by which the filter will change
any one pixel. Also the threshold for detecting nearly flat
regions. Acceptable values range from .51 to 255, default value is
1.2, out-of-range values will be clipped to the valid range.
@var{radius} is the neighborhood to fit the gradient to. A larger
radius makes for smoother gradients, but also prevents the filter from
modifying the pixels near detailed regions. Acceptable values are
4-32 (odd values are rounded up to the next even value), default value
is 16, out-of-range values will be clipped to the valid range.
@example
# default parameters
gradfun=1.2:16
# omitting radius
gradfun=1.2
@end example
@section hflip @section hflip
Flip the input video horizontally. Flip the input video horizontally.

View File

@ -26,6 +26,7 @@ OBJS-$(CONFIG_DRAWBOX_FILTER) += vf_drawbox.o
OBJS-$(CONFIG_FIFO_FILTER) += vf_fifo.o OBJS-$(CONFIG_FIFO_FILTER) += vf_fifo.o
OBJS-$(CONFIG_FORMAT_FILTER) += vf_format.o OBJS-$(CONFIG_FORMAT_FILTER) += vf_format.o
OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o
OBJS-$(CONFIG_GRADFUN_FILTER) += vf_gradfun.o
OBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o OBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o
OBJS-$(CONFIG_HQDN3D_FILTER) += vf_hqdn3d.o OBJS-$(CONFIG_HQDN3D_FILTER) += vf_hqdn3d.o
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o

View File

@ -47,6 +47,7 @@ void avfilter_register_all(void)
REGISTER_FILTER (FIFO, fifo, vf); REGISTER_FILTER (FIFO, fifo, vf);
REGISTER_FILTER (FORMAT, format, vf); REGISTER_FILTER (FORMAT, format, vf);
REGISTER_FILTER (FREI0R, frei0r, vf); REGISTER_FILTER (FREI0R, frei0r, vf);
REGISTER_FILTER (GRADFUN, gradfun, vf);
REGISTER_FILTER (HFLIP, hflip, vf); REGISTER_FILTER (HFLIP, hflip, vf);
REGISTER_FILTER (HQDN3D, hqdn3d, vf); REGISTER_FILTER (HQDN3D, hqdn3d, vf);
REGISTER_FILTER (NOFORMAT, noformat, vf); REGISTER_FILTER (NOFORMAT, noformat, vf);

View File

@ -27,8 +27,8 @@
#include "libavcore/samplefmt.h" #include "libavcore/samplefmt.h"
#define LIBAVFILTER_VERSION_MAJOR 1 #define LIBAVFILTER_VERSION_MAJOR 1
#define LIBAVFILTER_VERSION_MINOR 68 #define LIBAVFILTER_VERSION_MINOR 69
#define LIBAVFILTER_VERSION_MICRO 1 #define LIBAVFILTER_VERSION_MICRO 0
#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \ #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
LIBAVFILTER_VERSION_MINOR, \ LIBAVFILTER_VERSION_MINOR, \

48
libavfilter/gradfun.h Normal file
View File

@ -0,0 +1,48 @@
/*
* Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVFILTER_GRADFUN_H
#define AVFILTER_GRADFUN_H
#include "avfilter.h"
/// Holds instance-specific information for gradfun.
/// Filled in by init() (thresh, radius, DSP pointers) and config_input()
/// (chroma geometry, buf); buf is released in uninit().
typedef struct {
int thresh; ///< threshold for gradient algorithm, stored as (1 << 15) / strength
int radius; ///< blur radius for the first plane, even and within 4..32
int chroma_w; ///< width of the chroma planes
int chroma_h; ///< height of the chroma planes
int chroma_r; ///< blur radius for the chroma planes
uint16_t *buf; ///< holds image data for blur algorithm passed into filter.
/// DSP functions.
/// filter_line writes one output row from a source row and the blurred dc row.
void (*filter_line) (uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
/// blur_line accumulates a pair of source rows into the running sums and updates dc.
void (*blur_line) (uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
} GradFunContext;
/// Portable C implementations, defined in vf_gradfun.c; init() installs them by default.
void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
/// x86 inline-asm implementations, defined in x86/gradfun.c; init() selects them
/// according to the CPU flags. Each takes the same arguments as its C counterpart.
void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers);
void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width);
#endif /* AVFILTER_GRADFUN_H */

253
libavfilter/vf_gradfun.c Normal file
View File

@ -0,0 +1,253 @@
/*
* Copyright (c) 2010 Nolan Lum <nol888@gmail.com>
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* gradfun debanding filter, ported from MPlayer
* libmpcodecs/vf_gradfun.c
*
* Apply a boxblur debanding algorithm (based on the gradfun2db
* Avisynth filter by prunedtree).
* For each pixel, if it's within threshold of the blurred value, make it closer.
* So now we have a smoothed and higher bitdepth version of all the shallow
* gradients, while leaving detailed areas untouched.
* Dither it back to 8bit.
*/
#include "libavcore/imgutils.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "avfilter.h"
#include "gradfun.h"
/**
 * 8x8 dither matrix.
 * filter() selects the row with (y & 7) and ff_gradfun_filter_line_c()
 * selects the column with (x & 7). The entries are added to the pixel
 * value after it has been shifted left by 7, i.e. they act on the 7
 * fractional bits that are dropped again by the final >> 7.
 * Each row is 8 * 16 bit = 16 bytes and the table is declared 16-byte aligned.
 */
DECLARE_ALIGNED(16, static const uint16_t, dither)[8][8] = {
{0x00,0x60,0x18,0x78,0x06,0x66,0x1E,0x7E},
{0x40,0x20,0x58,0x38,0x46,0x26,0x5E,0x3E},
{0x10,0x70,0x08,0x68,0x16,0x76,0x0E,0x6E},
{0x50,0x30,0x48,0x28,0x56,0x36,0x4E,0x2E},
{0x04,0x64,0x1C,0x7C,0x02,0x62,0x1A,0x7A},
{0x44,0x24,0x5C,0x3C,0x42,0x22,0x5A,0x3A},
{0x14,0x74,0x0C,0x6C,0x12,0x72,0x0A,0x6A},
{0x54,0x34,0x4C,0x2C,0x52,0x32,0x4A,0x2A},
};
/**
 * Deband one row of pixels (portable C version).
 *
 * Every pixel is widened to 7 fractional bits, pulled towards the blurred
 * value in dc when the two are close enough, dithered and truncated back
 * to 8 bits.
 *
 * @param dst     output row, width bytes are written
 * @param src     input row, width bytes are read
 * @param dc      blurred values in the same <<7 scale as the widened pixels;
 *                one entry per pair of pixels, so pixel x uses dc[x / 2]
 *                (this matches what the SIMD versions do)
 * @param width   number of pixels to process
 * @param thresh  fixed-point threshold; abs(delta) * thresh >> 16 is the
 *                amount subtracted from the weight 127
 * @param dithers eight dither offsets, indexed with x & 7
 */
void ff_gradfun_filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
{
    int x;

    for (x = 0; x < width; x++) {
        int pix   = src[x] << 7;
        /* pixels 2k and 2k+1 share dc[k]; never reads past dc[(width - 1) / 2] */
        int delta = dc[x >> 1] - pix;
        int m     = abs(delta) * thresh >> 16;

        /* weight falls off linearly with the distance and bottoms out at 0 */
        m = 127 - m;
        if (m < 0)
            m = 0;
        m = m * m * delta >> 14;

        pix += m + dithers[x & 7];
        pix >>= 7;
        /* saturate to the 8-bit output range */
        dst[x] = pix < 0 ? 0 : pix > 255 ? 255 : pix;
    }
}
/**
 * Accumulate one pair of source rows into the running sums (portable C version).
 *
 * For each output column i the 2x2 block of source pixels at columns 2i and
 * 2i+1 of the rows src and src + src_linesize is summed and added to buf1[i].
 * The result replaces buf[i], and the difference between the new and the
 * previous value of buf[i] is stored in dc[i] (truncated to 16 bits).
 *
 * @param dc           receives new sum minus previous buf value, per column
 * @param buf          row of sums that is read and then overwritten
 * @param buf1         row of sums the new values are built upon
 * @param src          upper of the two source rows
 * @param src_linesize distance in bytes between the two source rows
 * @param width        number of output columns (source pixels / 2)
 */
void ff_gradfun_blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
{
    int col;

    for (col = 0; col < width; col++) {
        const uint8_t *quad = src + 2 * col;
        int sum = buf1[col] + quad[0] + quad[1] + quad[src_linesize] + quad[1 + src_linesize];
        int previous = buf[col];

        buf[col] = sum;
        dc[col]  = sum - previous;
    }
}
/**
 * Deband one plane.
 *
 * Builds a blurred ("dc") version of the plane with ctx->blur_line plus a
 * horizontal running sum, and feeds it row by row to ctx->filter_line.
 * The caller (end_frame) only calls this when FFMIN(width, height) > 2 * r.
 *
 * @param ctx          filter context; ctx->buf is used as scratch space
 * @param dst          destination plane
 * @param src          source plane
 * @param width        plane width in pixels
 * @param height       plane height in rows
 * @param dst_linesize byte stride of dst
 * @param src_linesize byte stride of src
 * @param r            blur radius to use for this plane
 */
static void filter(GradFunContext *ctx, uint8_t *dst, uint8_t *src, int width, int height, int dst_linesize, int src_linesize, int r)
{
// Stride, in uint16_t elements, of one row of sums inside the scratch buffer.
int bstride = FFALIGN(width, 16) / 2;
int y;
// Fixed-point factor; the horizontal sums are multiplied by it and shifted right by 16.
uint32_t dc_factor = (1 << 21) / (r * r);
// dc starts 16 elements into the buffer, leaving room for the negative
// indices (down to -r / 2) that are written and read below.
uint16_t *dc = ctx->buf + 16;
// Rows of sums handed to blur_line start after the dc area.
uint16_t *buf = ctx->buf + bstride + 32;
int thresh = ctx->thresh;
memset(dc, 0, (bstride + 16) * sizeof(*buf));
// Prime the sums with r calls, each consuming the source row pair starting at row 2 * y.
// For y == 0 the "previous row" pointer is buf - bstride, which lies in the
// area just cleared by the memset above.
for (y = 0; y < r; y++)
ctx->blur_line(dc, buf + y * bstride, buf + (y - 1) * bstride, src + 2 * y * src_linesize, src_linesize, width / 2);
// Each pass of this loop filters two output rows with the same dc line.
for (;;) {
// Refresh dc as long as the row pair at y + r is still needed;
// afterwards the last dc line keeps being reused.
if (y < height - r) {
// The r rows of sums are used as a ring: buf0 is overwritten, buf1 is its predecessor.
int mod = ((y + r) / 2) % r;
uint16_t *buf0 = buf + mod * bstride;
uint16_t *buf1 = buf + (mod ? mod - 1 : r - 1) * bstride;
int x, v;
ctx->blur_line(dc, buf0, buf1, src + (y + r) * src_linesize, src_linesize, width / 2);
// Horizontal running sum over r entries of dc, written back in place r entries behind.
for (x = v = 0; x < r; x++)
v += dc[x];
for (; x < width / 2; x++) {
v += dc[x] - dc[x-r];
dc[x-r] = v * dc_factor >> 16;
}
// Right edge: repeat the last full-window value.
for (; x < (width + r + 1) / 2; x++)
dc[x-r] = v * dc_factor >> 16;
// Left edge: replicate dc[0] into the r / 2 entries before it,
// because filter_line is passed dc - r / 2.
for (x = -r / 2; x < 0; x++)
dc[x] = dc[0];
}
// The first r output rows are emitted here, using this dc line;
// this inner loop leaves y == r again.
if (y == r) {
for (y = 0; y < r; y++)
ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
}
ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
if (++y >= height) break;
ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
if (++y >= height) break;
}
}
/**
 * Parse the "strength:radius" argument string and select the DSP functions.
 *
 * Both values are optional; whatever sscanf() does not convert keeps its
 * default (1.2 and 16). The strength is clipped to 0.51..255 and stored as
 * the fixed-point threshold (1 << 15) / strength. The radius is clipped to
 * 4..32 and rounded up to an even value.
 *
 * @param ctx    filter context whose priv data is a GradFunContext
 * @param args   argument string, may be NULL
 * @param opaque unused
 * @return 0 always
 */
static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
{
    GradFunContext *gf = ctx->priv;
    float thresh = 1.2;
    int radius = 16;
    av_unused int cpu_flags = av_get_cpu_flags();

    if (args)
        sscanf(args, "%f:%d", &thresh, &radius);

    thresh = av_clipf(thresh, 0.51, 255);
    gf->thresh = (1 << 15) / thresh;
    /* Clip first, then round up to even: same result as rounding first for
     * every input, but radius + 1 can no longer overflow for huge values. */
    gf->radius = (av_clip(radius, 4, 32) + 1) & ~1;

    gf->blur_line = ff_gradfun_blur_line_c;
    gf->filter_line = ff_gradfun_filter_line_c;

    /* later checks override earlier ones, so SSSE3 wins over MMX2 */
    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX2)
        gf->filter_line = ff_gradfun_filter_line_mmx2;
    if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3)
        gf->filter_line = ff_gradfun_filter_line_ssse3;
    if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2)
        gf->blur_line = ff_gradfun_blur_line_sse2;

    av_log(ctx, AV_LOG_INFO, "threshold:%.2f radius:%d\n", thresh, gf->radius);

    return 0;
}
/** Free the scratch buffer owned by the filter instance and reset the pointer. */
static av_cold void uninit(AVFilterContext *ctx)
{
    GradFunContext *priv = ctx->priv;

    av_freep(&priv->buf);
}
/**
 * Advertise the pixel formats the filter can process.
 *
 * Only 8-bit formats with separate (planar) chroma planes are listed:
 * end_frame() and filter() treat every plane as chroma_w independent bytes
 * per row, which does not hold for the interleaved UV plane of NV12/NV21
 * (only half of each row would be processed and U/V samples would be
 * blurred together), so those two formats are not offered.
 *
 * @return 0 always
 */
static int query_formats(AVFilterContext *ctx)
{
    static const enum PixelFormat pix_fmts[] = {
        PIX_FMT_YUV410P, PIX_FMT_YUV420P,
        PIX_FMT_GRAY8,   PIX_FMT_YUV444P,
        PIX_FMT_YUV422P, PIX_FMT_YUV411P,
        PIX_FMT_NONE
    };

    avfilter_set_common_formats(ctx, avfilter_make_format_list(pix_fmts));

    return 0;
}
/**
 * Allocate the scratch buffer and derive the chroma geometry for the
 * negotiated input format.
 *
 * @param inlink input link; its w, h and format are read
 * @return 0 on success, AVERROR(ENOMEM) if the buffer cannot be allocated
 */
static int config_input(AVFilterLink *inlink)
{
    GradFunContext *gf = inlink->dst->priv;
    int hsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_w;
    int vsub = av_pix_fmt_descriptors[inlink->format].log2_chroma_h;

    /* drop any buffer from an earlier configuration so it is not leaked */
    av_freep(&gf->buf);
    gf->buf = av_mallocz((FFALIGN(inlink->w, 16) * (gf->radius + 1) / 2 + 32) * sizeof(uint16_t));
    if (!gf->buf)
        return AVERROR(ENOMEM);

    /* chroma dimensions, rounded up */
    gf->chroma_w = -((-inlink->w) >> hsub);
    gf->chroma_h = -((-inlink->h) >> vsub);
    /* average of the horizontally and vertically subsampled radius,
     * rounded up to even and clipped like the main radius */
    gf->chroma_r = av_clip(((((gf->radius >> hsub) + (gf->radius >> vsub)) / 2 ) + 1) & ~1, 4, 32);

    return 0;
}
/**
 * Choose the output buffer for the incoming frame and pass a reference to
 * it downstream.
 *
 * When the input carries AV_PERM_PRESERVE a new writable buffer is requested
 * from the output link and the frame properties are copied over; otherwise
 * the input buffer itself becomes the output buffer.
 */
static void start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
{
    AVFilterLink *outlink = inlink->dst->outputs[0];
    AVFilterBufferRef *outpicref = inpicref;

    if (inpicref->perms & AV_PERM_PRESERVE) {
        outpicref = avfilter_get_video_buffer(outlink, AV_PERM_WRITE, outlink->w, outlink->h);
        avfilter_copy_buffer_ref_props(outpicref, inpicref);
        outpicref->video->w = outlink->w;
        outpicref->video->h = outlink->h;
    }

    outlink->out_buf = outpicref;
    avfilter_start_frame(outlink, avfilter_ref_buffer(outpicref, ~0));
}
/** Slice callback that does nothing; the frame is processed as a whole in end_frame(). */
static void null_draw_slice(AVFilterLink *link, int y, int h, int slice_dir)
{
}
/**
 * Filter the complete frame plane by plane and send it downstream.
 *
 * The first plane uses the full frame size and gf->radius, every further
 * plane the chroma size and gf->chroma_r. Planes that are not larger than
 * twice the radius in both dimensions are left unfiltered; they are copied
 * when input and output are different buffers.
 */
static void end_frame(AVFilterLink *inlink)
{
    GradFunContext *gf = inlink->dst->priv;
    AVFilterBufferRef *inpic = inlink->cur_buf;
    AVFilterLink *outlink = inlink->dst->outputs[0];
    AVFilterBufferRef *outpic = outlink->out_buf;
    int p;

    for (p = 0; p < 4 && inpic->data[p]; p++) {
        int w = inlink->w;
        int h = inlink->h;
        int r = gf->radius;

        if (p) {
            w = gf->chroma_w;
            h = gf->chroma_h;
            r = gf->chroma_r;
        }

        if (FFMIN(w, h) > 2 * r)
            filter(gf, outpic->data[p], inpic->data[p], w, h, outpic->linesize[p], inpic->linesize[p], r);
        else if (outpic->data[p] != inpic->data[p])
            av_image_copy_plane(outpic->data[p], outpic->linesize[p], inpic->data[p], inpic->linesize[p], w, h);
    }

    avfilter_draw_slice(outlink, 0, inlink->h, 1);
    avfilter_end_frame(outlink);
    avfilter_unref_buffer(inpic);
    /* start_frame() reuses the input reference as output when the frame is
     * filtered in place; releasing it a second time would be a double free */
    if (outpic != inpic)
        avfilter_unref_buffer(outpic);
}
/**
 * Filter definition for "gradfun": one video input and one video output.
 * All processing happens in end_frame; the draw_slice callback of the
 * input pad is a no-op.
 */
AVFilter avfilter_vf_gradfun = {
.name = "gradfun",
.description = NULL_IF_CONFIG_SMALL("Debands video quickly using gradients."),
.priv_size = sizeof(GradFunContext),
.init = init,
.uninit = uninit,
.query_formats = query_formats,
/* pad lists are terminated by an entry whose name is NULL */
.inputs = (AVFilterPad[]) {{ .name = "default",
.type = AVMEDIA_TYPE_VIDEO,
.config_props = config_input,
.start_frame = start_frame,
.draw_slice = null_draw_slice,
.end_frame = end_frame,
.min_perms = AV_PERM_READ, },
{ .name = NULL}},
.outputs = (AVFilterPad[]) {{ .name = "default",
.type = AVMEDIA_TYPE_VIDEO, },
{ .name = NULL}},
};

View File

@ -1 +1,2 @@
MMX-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o MMX-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o
MMX-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/gradfun.o

162
libavfilter/x86/gradfun.c Normal file
View File

@ -0,0 +1,162 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavfilter/gradfun.h"
/* Eight 16-bit words of 0x7F (127): the constant subtracted from m in the filter_line asm. */
DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
/* Eight 16-bit words of 0x00FF: mask used by the blur_line asm to keep the low byte of each word. */
DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
/**
 * MMX2 version of the line filter; same arguments as ff_gradfun_filter_line_c().
 *
 * The asm loop handles four pixels per iteration. When width is not a
 * multiple of four, the trailing (width & 3) pixels are processed by the
 * C version first and width is rounded down.
 * The function body is empty unless HAVE_MMX is set.
 *
 * NOTE(review): the dither operand is loaded once with movq (four 16-bit
 * words) before the loop and the C tail call receives dithers unshifted, so
 * pixels appear to use dithers[x & 3] here versus dithers[x & 7] in the C
 * version - confirm whether that difference is intended.
 */
void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
{
#if HAVE_MMX
intptr_t x;
if (width & 3) {
x = width & ~3;
ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
width = x;
}
// x runs from -width up to 0; all pointers passed to the asm are pre-advanced
// to the end of the row so that (ptr,x) addresses the current pixel.
x = -width;
__asm__ volatile(
"movd %4, %%mm5 \n"
"pxor %%mm7, %%mm7 \n"
"pshufw $0, %%mm5, %%mm5 \n"
"movq %6, %%mm6 \n"
"movq %5, %%mm4 \n"
"1: \n"
"movd (%2,%0), %%mm0 \n"
"movd (%3,%0), %%mm1 \n"
"punpcklbw %%mm7, %%mm0 \n"
"punpcklwd %%mm1, %%mm1 \n"
"psllw $7, %%mm0 \n"
"pxor %%mm2, %%mm2 \n"
"psubw %%mm0, %%mm1 \n" // delta = dc - pix
"psubw %%mm1, %%mm2 \n"
"pmaxsw %%mm1, %%mm2 \n"
"pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
"psubw %%mm6, %%mm2 \n"
"pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
"pmullw %%mm2, %%mm2 \n"
"paddw %%mm4, %%mm0 \n" // pix += dither
"pmulhw %%mm2, %%mm1 \n"
"psllw $2, %%mm1 \n" // m = m*m*delta >> 14
"paddw %%mm1, %%mm0 \n" // pix += m
"psraw $7, %%mm0 \n"
"packuswb %%mm0, %%mm0 \n"
"movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
"add $4, %0 \n"
"jl 1b \n"
"emms \n"
:"+r"(x)
:"r"(dst+width), "r"(src+width), "r"(dc+width/2),
"rm"(thresh), "m"(*dithers), "m"(*pw_7f)
:"memory"
);
#endif
}
/**
 * SSSE3 version of the line filter; same arguments as ff_gradfun_filter_line_c().
 *
 * The asm loop handles eight pixels per iteration. When width is not a
 * multiple of eight, the trailing (width & 7) pixels are processed by the
 * C version first and width is rounded down.
 * The function body is empty unless HAVE_SSSE3 is set.
 *
 * The dither row and pw_7f are loaded with movdqa, so dithers has to be
 * 16-byte aligned (the rows of the dither table in vf_gradfun.c are).
 *
 * NOTE(review): the asm modifies xmm0-xmm2 and xmm4-xmm7 but only "memory"
 * is listed as clobbered - confirm this is safe for the targeted compilers.
 */
void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
{
#if HAVE_SSSE3
intptr_t x;
if (width & 7) {
// could be 10% faster if I somehow eliminated this
x = width & ~7;
ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
width = x;
}
// x runs from -width up to 0; all pointers passed to the asm are pre-advanced
// to the end of the row so that (ptr,x) addresses the current pixel.
x = -width;
__asm__ volatile(
"movd %4, %%xmm5 \n"
"pxor %%xmm7, %%xmm7 \n"
"pshuflw $0,%%xmm5, %%xmm5 \n"
"movdqa %6, %%xmm6 \n"
"punpcklqdq %%xmm5, %%xmm5 \n"
"movdqa %5, %%xmm4 \n"
"1: \n"
"movq (%2,%0), %%xmm0 \n"
"movq (%3,%0), %%xmm1 \n"
"punpcklbw %%xmm7, %%xmm0 \n"
"punpcklwd %%xmm1, %%xmm1 \n"
"psllw $7, %%xmm0 \n"
"psubw %%xmm0, %%xmm1 \n" // delta = dc - pix
"pabsw %%xmm1, %%xmm2 \n"
"pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
"psubw %%xmm6, %%xmm2 \n"
"pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
"pmullw %%xmm2, %%xmm2 \n"
"psllw $1, %%xmm2 \n"
"paddw %%xmm4, %%xmm0 \n" // pix += dither
"pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
"paddw %%xmm1, %%xmm0 \n" // pix += m
"psraw $7, %%xmm0 \n"
"packuswb %%xmm0, %%xmm0 \n"
"movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
"add $8, %0 \n"
"jl 1b \n"
:"+&r"(x)
:"r"(dst+width), "r"(src+width), "r"(dc+width/2),
"rm"(thresh), "m"(*dithers), "m"(*pw_7f)
:"memory"
);
#endif // HAVE_SSSE3
}
/**
 * SSE2 version of the blur step; same arguments as ff_gradfun_blur_line_c().
 *
 * Eight output columns (16 source bytes per row) are handled per loop
 * iteration. The loop counter starts at -2 * width bytes and advances by 16
 * until it is no longer negative, so for widths that are not a multiple of
 * eight the last iteration also touches entries past width.
 * NOTE(review): this presumably relies on the row padding provided by the
 * caller (filter() uses a stride of FFALIGN(width, 16) / 2) - confirm.
 *
 * buf, buf1 and dc are accessed with movdqa/paddw memory operands and
 * therefore have to be 16-byte aligned at every 16-byte step; the source
 * rows are read with movdqu when src or src_linesize is not a multiple of 16.
 * The function body is empty unless HAVE_SSE is set.
 */
void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
{
#if HAVE_SSE
/* BLURV(load): the complete loop, parameterised on the instruction used to
 * load the two source rows. Each 16-bit lane ends up holding the sum of a
 * 2x2 block of source bytes (high and low byte of each word, both rows)
 * plus the matching buf1 entry; that sum is stored to buf and the
 * difference to the old buf value is stored to dc. */
#define BLURV(load)\
intptr_t x = -2*width;\
__asm__ volatile(\
"movdqa %6, %%xmm7 \n"\
"1: \n"\
load" (%4,%0), %%xmm0 \n"\
load" (%5,%0), %%xmm1 \n"\
"movdqa %%xmm0, %%xmm2 \n"\
"movdqa %%xmm1, %%xmm3 \n"\
"psrlw $8, %%xmm0 \n"\
"psrlw $8, %%xmm1 \n"\
"pand %%xmm7, %%xmm2 \n"\
"pand %%xmm7, %%xmm3 \n"\
"paddw %%xmm1, %%xmm0 \n"\
"paddw %%xmm3, %%xmm2 \n"\
"paddw %%xmm2, %%xmm0 \n"\
"paddw (%2,%0), %%xmm0 \n"\
"movdqa (%1,%0), %%xmm1 \n"\
"movdqa %%xmm0, (%1,%0) \n"\
"psubw %%xmm1, %%xmm0 \n"\
"movdqa %%xmm0, (%3,%0) \n"\
"add $16, %0 \n"\
"jl 1b \n"\
:"+&r"(x)\
:"r"(buf+width),\
"r"(buf1+width),\
"r"(dc+width),\
"r"(src+width*2),\
"r"(src+width*2+src_linesize),\
"m"(*pw_ff)\
:"memory"\
);
// Pick unaligned loads when either source row may be misaligned.
if (((intptr_t) src | src_linesize) & 15) {
BLURV("movdqu");
} else {
BLURV("movdqa");
}
#endif // HAVE_SSE
}