mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
lavfi: add deshake_opencl filter
This commit is contained in:
parent
5b5746b1e0
commit
b29c7bcbf6
1
configure
vendored
1
configure
vendored
@ -3454,6 +3454,7 @@ delogo_filter_deps="gpl"
|
||||
denoise_vaapi_filter_deps="vaapi"
|
||||
derain_filter_select="dnn"
|
||||
deshake_filter_select="pixelutils"
|
||||
deshake_opencl_filter_deps="opencl"
|
||||
dilation_opencl_filter_deps="opencl"
|
||||
drawtext_filter_deps="libfreetype"
|
||||
drawtext_filter_suggest="libfontconfig libfribidi"
|
||||
|
@ -19795,6 +19795,75 @@ Make every semi-green pixel in the input transparent with some slight blending:
|
||||
@end example
|
||||
@end itemize
|
||||
|
||||
@section deshake_opencl
|
||||
Feature-point based video stabilization filter.
|
||||
|
||||
The filter accepts the following options:
|
||||
|
||||
@table @option
|
||||
@item tripod
|
||||
Simulates a tripod by preventing any camera movement whatsoever from the original frame. Defaults to @code{0}.
|
||||
|
||||
@item debug
|
||||
Whether or not additional debug info should be displayed, both in the processed output and in the console.
|
||||
|
||||
Note that in order to see console debug output you will also need to pass @code{-v verbose} to ffmpeg.
|
||||
|
||||
Viewing point matches in the output video is only supported for RGB input.
|
||||
|
||||
Defaults to @code{0}.
|
||||
|
||||
@item adaptive_crop
|
||||
Whether or not to do a tiny bit of cropping at the borders to cut down on the amount of mirrored pixels.
|
||||
|
||||
Defaults to @code{1}.
|
||||
|
||||
@item refine_features
|
||||
Whether or not feature points should be refined at a sub-pixel level.
|
||||
|
||||
This can be turned off for a slight performance gain at the cost of precision.
|
||||
|
||||
Defaults to @code{1}.
|
||||
|
||||
@item smooth_strength
|
||||
The strength of the smoothing applied to the camera path from @code{0.0} to @code{1.0}.
|
||||
|
||||
@code{1.0} is the maximum smoothing strength while values less than that result in less smoothing.
|
||||
|
||||
@code{0.0} causes the filter to adaptively choose a smoothing strength on a per-frame basis.
|
||||
|
||||
Defaults to @code{0.0}.
|
||||
|
||||
@item smooth_window_multiplier
|
||||
Controls the size of the smoothing window (the number of frames buffered to determine motion information from).
|
||||
|
||||
The size of the smoothing window is determined by multiplying the framerate of the video by this number.
|
||||
|
||||
Acceptable values range from @code{0.1} to @code{10.0}.
|
||||
|
||||
Larger values increase the amount of motion data available for determining how to smooth the camera path,
|
||||
potentially improving smoothness, but also increase latency and memory usage.
|
||||
|
||||
Defaults to @code{2.0}.
|
||||
|
||||
@end table
|
||||
|
||||
@subsection Examples
|
||||
|
||||
@itemize
|
||||
@item
|
||||
Stabilize a video with a fixed, medium smoothing strength:
|
||||
@example
|
||||
-i INPUT -vf "hwupload, deshake_opencl=smooth_strength=0.5, hwdownload" OUTPUT
|
||||
@end example
|
||||
|
||||
@item
|
||||
Stabilize a video with debugging (both in console and in rendered video):
|
||||
@example
|
||||
-i INPUT -filter_complex "[0:v]format=rgba, hwupload, deshake_opencl=debug=1, hwdownload, format=rgba, format=yuv420p" -v verbose OUTPUT
|
||||
@end example
|
||||
@end itemize
|
||||
|
||||
@section nlmeans_opencl
|
||||
|
||||
Non-local Means denoise filter through OpenCL, this filter accepts same options as @ref{nlmeans}.
|
||||
|
@ -211,6 +211,8 @@ OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp
|
||||
OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o
|
||||
OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o
|
||||
OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o
|
||||
OBJS-$(CONFIG_DESHAKE_OPENCL_FILTER) += vf_deshake_opencl.o opencl.o \
|
||||
opencl/deshake.o
|
||||
OBJS-$(CONFIG_DESHAKE_FILTER) += vf_deshake.o
|
||||
OBJS-$(CONFIG_DESPILL_FILTER) += vf_despill.o
|
||||
OBJS-$(CONFIG_DETELECINE_FILTER) += vf_detelecine.o
|
||||
|
@ -200,6 +200,7 @@ extern AVFilter ff_vf_delogo;
|
||||
extern AVFilter ff_vf_denoise_vaapi;
|
||||
extern AVFilter ff_vf_derain;
|
||||
extern AVFilter ff_vf_deshake;
|
||||
extern AVFilter ff_vf_deshake_opencl;
|
||||
extern AVFilter ff_vf_despill;
|
||||
extern AVFilter ff_vf_detelecine;
|
||||
extern AVFilter ff_vf_dilation;
|
||||
|
647
libavfilter/opencl/deshake.cl
Normal file
647
libavfilter/opencl/deshake.cl
Normal file
@ -0,0 +1,647 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
* Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
* Third party copyrights are property of their respective owners.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistribution's of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* * The name of the copyright holders may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* This software is provided by the copyright holders and contributors "as is" and
|
||||
* any express or implied warranties, including, but not limited to, the implied
|
||||
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
* In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
* indirect, incidental, special, exemplary, or consequential damages
|
||||
* (including, but not limited to, procurement of substitute goods or services;
|
||||
* loss of use, data, or profits; or business interruption) however caused
|
||||
* and on any theory of liability, whether in contract, strict liability,
|
||||
* or tort (including negligence or otherwise) arising in any way out of
|
||||
* the use of this software, even if advised of the possibility of such damage.
|
||||
*/
|
||||
|
||||
// Minimum Harris response value for a pixel to be considered a corner
#define HARRIS_THRESHOLD 3.0f
// Block size over which to compute harris response
//
// Note that changing this will require fiddling with the local array sizes in
// harris_response
#define HARRIS_RADIUS 2
// Maximum Hamming distance (in bits) between two BRIEF descriptors for them
// to be treated as a match in match_descriptors
#define DISTANCE_THRESHOLD 80

// Sub-pixel refinement window for feature points
#define REFINE_WIN_HALF_W 5
#define REFINE_WIN_HALF_H 5
#define REFINE_WIN_W 11 // REFINE_WIN_HALF_W * 2 + 1
#define REFINE_WIN_H 11

// Non-maximum suppression window size
#define NONMAX_WIN 30
#define NONMAX_WIN_HALF 15 // NONMAX_WIN / 2
|
||||
|
||||
// A pair of corresponding point locations across two consecutive frames.
typedef struct PointPair {
    // Previous frame
    float2 p1;
    // Current frame
    float2 p2;
} PointPair;

// A point in the current frame paired with its smoothed (stabilized) position.
typedef struct SmoothedPointPair {
    // Non-smoothed point in current frame
    int2 p1;
    // Smoothed point in current frame
    float2 p2;
} SmoothedPointPair;

// A motion vector (matched feature pair) with a validity flag.
typedef struct MotionVector {
    PointPair p;
    // Used to mark vectors as potential outliers
    int should_consider;
} MotionVector;
|
||||
|
||||
// Nearest-neighbor sampler for exact pixel reads (unnormalized coordinates).
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                          CLK_ADDRESS_CLAMP_TO_EDGE |
                          CLK_FILTER_NEAREST;

// Bilinear sampler for sub-pixel reads (unnormalized coordinates).
const sampler_t sampler_linear = CLK_NORMALIZED_COORDS_FALSE |
                                 CLK_ADDRESS_CLAMP_TO_EDGE |
                                 CLK_FILTER_LINEAR;

// Bilinear sampler with mirrored-repeat addressing; coordinates are
// normalized. Used by the transform kernel so areas that fall outside the
// source frame are filled with mirrored pixels.
const sampler_t sampler_linear_mirror = CLK_NORMALIZED_COORDS_TRUE |
                                        CLK_ADDRESS_MIRRORED_REPEAT |
                                        CLK_FILTER_LINEAR;
|
||||
|
||||
// Writes to a 1D array at loc, treating it as a 2D array with the same
// dimensions as the global work size.
static void write_to_1d_arrf(__global float *buf, int2 loc, float val) {
    buf[loc.x + loc.y * get_global_size(0)] = val;
}

// Same as write_to_1d_arrf but for 512-bit (ulong8) descriptor values.
static void write_to_1d_arrul8(__global ulong8 *buf, int2 loc, ulong8 val) {
    buf[loc.x + loc.y * get_global_size(0)] = val;
}

// Same as write_to_1d_arrf but for MotionVector values.
static void write_to_1d_arrvec(__global MotionVector *buf, int2 loc, MotionVector val) {
    buf[loc.x + loc.y * get_global_size(0)] = val;
}

// Same as write_to_1d_arrf but for float2 values.
static void write_to_1d_arrf2(__global float2 *buf, int2 loc, float2 val) {
    buf[loc.x + loc.y * get_global_size(0)] = val;
}

// Reads a ulong8 from a 1D array at loc, treating it as a 2D array with the
// same dimensions as the global work size.
static ulong8 read_from_1d_arrul8(__global const ulong8 *buf, int2 loc) {
    return buf[loc.x + loc.y * get_global_size(0)];
}

// Same as read_from_1d_arrul8 but for float2 values.
static float2 read_from_1d_arrf2(__global const float2 *buf, int2 loc) {
    return buf[loc.x + loc.y * get_global_size(0)];
}
|
||||
|
||||
// Returns the grayscale value at the given point, computed as the mean of
// the pixel's first three channels.
static float pixel_grayscale(__read_only image2d_t src, int2 loc) {
    float4 px = read_imagef(src, sampler, loc);
    float channel_sum = px.x + px.y + px.z;

    return channel_sum / 3.0f;
}
|
||||
|
||||
// Applies the given 3x3 convolution mask to the local grayscale tile at the
// given local coordinates and returns the result.
//
// The tile is laid out as a 14x14 row-major array: an 8x8 work-group block
// plus a 3-pixel apron on each side, which is why coordinates are offset by
// +3 and the row stride is 14 (see harris_response, which fills the tile).
static float convolve(
    __local const float *grayscale,
    int local_idx_x,
    int local_idx_y,
    float mask[3][3]
) {
    float ret = 0;

    // These loops touch each pixel surrounding loc as well as loc itself.
    // Note that i counts down (so the top mask row is applied to the row
    // above) while j counts up.
    for (int i = 1, i2 = 0; i >= -1; --i, ++i2) {
        for (int j = -1, j2 = 0; j <= 1; ++j, ++j2) {
            ret += mask[i2][j2] * grayscale[(local_idx_x + 3 + j) + (local_idx_y + 3 + i) * 14];
        }
    }

    return ret;
}
|
||||
|
||||
// Sums dx * dy for all pixels within HARRIS_RADIUS of the current work-item's
// location in the local grayscale tile, where dx and dy are the derivatives
// produced by convolving mask_x and mask_y respectively.
static float sum_deriv_prod(
    __local const float *grayscale,
    float mask_x[3][3],
    float mask_y[3][3]
) {
    float ret = 0;

    for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) {
        for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) {
            ret += convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_x) *
                   convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_y);
        }
    }

    return ret;
}
|
||||
|
||||
// Sums the squared derivative (determined by mask) for all pixels within
// HARRIS_RADIUS of the current work-item's location in the local grayscale
// tile.
static float sum_deriv_pow(__local const float *grayscale, float mask[3][3])
{
    float ret = 0;

    for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) {
        for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) {
            float deriv = convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask);
            ret += deriv * deriv;
        }
    }

    return ret;
}
|
||||
|
||||
// Fills a box of the given radius with the given pixel value, centered on
// loc. Coordinates are clamped so nothing is written outside image bounds.
static void draw_box(__write_only image2d_t dst, int2 loc, float4 pixel, int radius)
{
    int2 dim = get_image_dim(dst);

    for (int dx = -radius; dx <= radius; ++dx) {
        for (int dy = -radius; dy <= radius; ++dy) {
            // Clamp to avoid writing outside image bounds
            int2 p = (int2)(
                clamp(loc.x + dx, 0, dim.x - 1),
                clamp(loc.y + dy, 0, dim.y - 1)
            );

            write_imagef(dst, p, pixel);
        }
    }
}
|
||||
|
||||
// Converts the src image to grayscale, writing the result into the first
// channel of the grayscale image.
__kernel void grayscale(
    __read_only image2d_t src,
    __write_only image2d_t grayscale
) {
    int2 loc = (int2)(get_global_id(0), get_global_id(1));
    float luma = pixel_grayscale(src, loc);

    write_imagef(grayscale, loc, (float4)(luma, 0.0f, 0.0f, 1.0f));
}
|
||||
|
||||
// This kernel computes the harris response for the given grayscale src image
// within the given radius and writes it to harris_buf
__kernel void harris_response(
    __read_only image2d_t grayscale,
    __global float *harris_buf
) {
    int2 loc = (int2)(get_global_id(0), get_global_id(1));

    // NOTE(review): work-items past the image edge return before the
    // barrier below; this assumes the host launches with a global size
    // where such divergence is benign — confirm against the host code.
    if (loc.x > get_image_width(grayscale) - 1 || loc.y > get_image_height(grayscale) - 1) {
        write_to_1d_arrf(harris_buf, loc, 0);
        return;
    }

    // Normalization factor applied (to the 4th power, below) to keep the
    // response magnitude independent of the 0..255 pixel range
    float scale = 1.0f / ((1 << 2) * HARRIS_RADIUS * 255.0f);

    // Sobel derivative masks for x and y gradients
    float sobel_mask_x[3][3] = {
        {-1, 0, 1},
        {-2, 0, 2},
        {-1, 0, 1}
    };

    float sobel_mask_y[3][3] = {
        { 1, 2, 1},
        { 0, 0, 0},
        {-1, -2, -1}
    };

    // 8 x 8 local work + 3 pixels around each side (needed to accommodate for
    // the block size radius of 2), hence 14 * 14 = 196 floats
    __local float grayscale_data[196];

    // Top-left corner of this work-group's block in the image
    int idx = get_group_id(0) * get_local_size(0);
    int idy = get_group_id(1) * get_local_size(1);

    // Cooperatively fill the local tile (every work-item redundantly loads
    // the whole tile; the sampler clamps out-of-bounds reads to the edge)
    for (int i = idy - 3, it = 0; i < idy + (int)get_local_size(1) + 3; i++, it++) {
        for (int j = idx - 3, jt = 0; j < idx + (int)get_local_size(0) + 3; j++, jt++) {
            grayscale_data[jt + it * 14] = read_imagef(grayscale, sampler, (int2)(j, i)).x;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Components of the structure tensor M summed over the Harris window
    float sumdxdy = sum_deriv_prod(grayscale_data, sobel_mask_x, sobel_mask_y);
    float sumdx2 = sum_deriv_pow(grayscale_data, sobel_mask_x);
    float sumdy2 = sum_deriv_pow(grayscale_data, sobel_mask_y);

    float trace = sumdx2 + sumdy2;
    // r = det(M) - k(trace(M))^2
    // k usually between 0.04 to 0.06
    float r = (sumdx2 * sumdy2 - sumdxdy * sumdxdy) - 0.04f * (trace * trace) * pown(scale, 4);

    // Threshold the r value: responses below HARRIS_THRESHOLD are zeroed
    harris_buf[loc.x + loc.y * get_image_width(grayscale)] = r * step(HARRIS_THRESHOLD, r);
}
|
||||
|
||||
// Gets a patch centered around a float coordinate from a grayscale image
// using bilinear interpolation.
//
// The patch is size_x * size_y pixels, written row-major into buffer, with
// values scaled from [0, 1] back up to the [0, 255] range.
static void get_rect_sub_pix(
    __read_only image2d_t grayscale,
    float *buffer,
    int size_x,
    int size_y,
    float2 center
) {
    // Offset from the patch origin to its center
    float2 offset = ((float2)(size_x, size_y) - 1.0f) * 0.5f;

    for (int i = 0; i < size_y; i++) {
        for (int j = 0; j < size_x; j++) {
            buffer[i * size_x + j] = read_imagef(
                grayscale,
                sampler_linear,
                (float2)(j, i) + center - offset
            ).x * 255.0f;
        }
    }
}
|
||||
|
||||
// Refines detected features at a sub-pixel level
//
// This function is ported from OpenCV (cornerSubPix). It iteratively solves
// a 2x2 linear system built from weighted image gradients around the
// feature, moving the feature toward the gradient-derived corner position
// until it converges (movement below eps) or max_iters is reached.
//
// mask is a REFINE_WIN_H x REFINE_WIN_W Gaussian-like weight window
// (computed by the caller, refine_features).
static float2 corner_sub_pix(
    __read_only image2d_t grayscale,
    float2 feature,
    float *mask
) {
    float2 init = feature;
    int src_width = get_global_size(0);
    int src_height = get_global_size(1);

    const int max_iters = 40;
    const float eps = 0.001f * 0.001f;
    int i, j, k;

    int iter = 0;
    float err = 0;
    // Refinement window plus a 1-pixel border for central differences
    float subpix[(REFINE_WIN_W + 2) * (REFINE_WIN_H + 2)];
    const float flt_epsilon = 0x1.0p-23f;

    do {
        float2 feature_tmp;
        // a, b, c accumulate the 2x2 gradient covariance matrix;
        // bb1, bb2 accumulate the right-hand side of the system
        float a = 0, b = 0, c = 0, bb1 = 0, bb2 = 0;

        get_rect_sub_pix(grayscale, subpix, REFINE_WIN_W + 2, REFINE_WIN_H + 2, feature);
        // Skip the border row/column so central differences stay in bounds
        float *subpix_ptr = subpix;
        subpix_ptr += REFINE_WIN_W + 2 + 1;

        // process gradient
        for (i = 0, k = 0; i < REFINE_WIN_H; i++, subpix_ptr += REFINE_WIN_W + 2) {
            float py = i - REFINE_WIN_HALF_H;

            for (j = 0; j < REFINE_WIN_W; j++, k++) {
                float m = mask[k];
                // Central differences in x and y
                float tgx = subpix_ptr[j + 1] - subpix_ptr[j - 1];
                float tgy = subpix_ptr[j + REFINE_WIN_W + 2] - subpix_ptr[j - REFINE_WIN_W - 2];
                float gxx = tgx * tgx * m;
                float gxy = tgx * tgy * m;
                float gyy = tgy * tgy * m;
                float px = j - REFINE_WIN_HALF_W;

                a += gxx;
                b += gxy;
                c += gyy;

                bb1 += gxx * px + gxy * py;
                bb2 += gxy * px + gyy * py;
            }
        }

        float det = a * c - b * b;
        // Nearly singular system: stop refining
        if (fabs(det) <= flt_epsilon * flt_epsilon) {
            break;
        }

        // 2x2 matrix inversion
        float scale = 1.0f / det;
        feature_tmp.x = (float)(feature.x + (c * scale * bb1) - (b * scale * bb2));
        feature_tmp.y = (float)(feature.y - (b * scale * bb1) + (a * scale * bb2));
        // Squared distance moved this iteration
        err = dot(feature_tmp - feature, feature_tmp - feature);

        feature = feature_tmp;
        // Stop if the refined point drifted outside the frame
        if (feature.x < 0 || feature.x >= src_width || feature.y < 0 || feature.y >= src_height) {
            break;
        }
    } while (++iter < max_iters && err > eps);

    // Make sure new point isn't too far from the initial point (indicates poor convergence)
    if (fabs(feature.x - init.x) > REFINE_WIN_HALF_W || fabs(feature.y - init.y) > REFINE_WIN_HALF_H) {
        feature = init;
    }

    return feature;
}
|
||||
|
||||
// Performs non-maximum suppression on the harris response and writes the resulting
// feature locations to refined_features.
//
// Assumes that refined_features and the global work sizes are set up such that the image
// is split up into a grid of 32x32 blocks where each block has a single slot in the
// refined_features buffer. This kernel finds the best corner in each block (if the
// block has any) and writes it to the corresponding slot in the buffer.
//
// If subpixel_refine is true, the features are additionally refined at a sub-pixel
// level for increased precision. Blocks with no feature get (-1, -1) as a
// sentinel value.
__kernel void refine_features(
    __read_only image2d_t grayscale,
    __global const float *harris_buf,
    __global float2 *refined_features,
    int subpixel_refine
) {
    int2 loc = (int2)(get_global_id(0), get_global_id(1));
    // The location in the grayscale buffer rather than the compacted grid
    int2 loc_i = (int2)(loc.x * 32, loc.y * 32);

    float new_val;
    float max_val = 0;
    float2 loc_max = (float2)(-1, -1);

    // Clamp the block's extent to the image dimensions
    int end_x = min(loc_i.x + 32, (int)get_image_dim(grayscale).x - 1);
    int end_y = min(loc_i.y + 32, (int)get_image_dim(grayscale).y - 1);

    // Find the strongest harris response within this 32x32 block
    for (int i = loc_i.x; i < end_x; ++i) {
        for (int j = loc_i.y; j < end_y; ++j) {
            new_val = harris_buf[i + j * get_image_dim(grayscale).x];

            if (new_val > max_val) {
                max_val = new_val;
                loc_max = (float2)(i, j);
            }
        }
    }

    if (max_val == 0) {
        // There are no features in this part of the frame
        write_to_1d_arrf2(refined_features, loc, loc_max);
        return;
    }

    if (subpixel_refine) {
        // Build a Gaussian-like weight window for corner_sub_pix
        float mask[REFINE_WIN_H * REFINE_WIN_W];
        for (int i = 0; i < REFINE_WIN_H; i++) {
            float y = (float)(i - REFINE_WIN_HALF_H) / REFINE_WIN_HALF_H;
            float vy = exp(-y * y);

            for (int j = 0; j < REFINE_WIN_W; j++) {
                float x = (float)(j - REFINE_WIN_HALF_W) / REFINE_WIN_HALF_W;
                mask[i * REFINE_WIN_W + j] = (float)(vy * exp(-x * x));
            }
        }

        loc_max = corner_sub_pix(grayscale, loc_max, mask);
    }

    write_to_1d_arrf2(refined_features, loc, loc_max);
}
|
||||
|
||||
// Extracts BRIEF descriptors from the grayscale src image for the given features
// using the provided sampler.
//
// Each descriptor is 512 bits (a ulong8): one bit per brightness comparison
// between a pair of points from brief_pattern sampled around the feature.
// Features marked (-1, -1) (no feature found) get an all-zero descriptor.
__kernel void brief_descriptors(
    __read_only image2d_t grayscale,
    __global const float2 *refined_features,
    // for 512 bit descriptors
    __global ulong8 *desc_buf,
    __global const PointPair *brief_pattern
) {
    int2 loc = (int2)(get_global_id(0), get_global_id(1));
    float2 feature = read_from_1d_arrf2(refined_features, loc);

    // There was no feature in this part of the frame
    if (feature.x == -1) {
        write_to_1d_arrul8(desc_buf, loc, (ulong8)(0));
        return;
    }

    ulong8 desc = 0;
    // View the descriptor as 8 x 64-bit lanes for bit manipulation
    ulong *p = &desc;

    for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 64; ++j) {
            PointPair pair = brief_pattern[j * (i + 1)];
            // Compare brightness of the two pattern points around the feature
            float l1 = read_imagef(grayscale, sampler_linear, feature + pair.p1).x;
            float l2 = read_imagef(grayscale, sampler_linear, feature + pair.p2).x;

            if (l1 < l2) {
                p[i] |= 1UL << j;
            }
        }
    }

    write_to_1d_arrul8(desc_buf, loc, desc);
}
|
||||
|
||||
// Given buffers with descriptors for the current and previous frame, determines
// which ones match, writing correspondences to matches_buf.
//
// Feature and descriptor buffers are assumed to be compacted (each element sourced
// from a 32x32 block in the frame being processed).
//
// For each current-frame descriptor, nearby previous-frame descriptors (within
// search_radius grid cells) are compared by Hamming distance; the first one
// below DISTANCE_THRESHOLD is taken as the match. Cells with no feature or no
// match get a sentinel vector with should_consider = 0.
__kernel void match_descriptors(
    __global const float2 *prev_refined_features,
    __global const float2 *refined_features,
    __global const ulong8 *desc_buf,
    __global const ulong8 *prev_desc_buf,
    __global MotionVector *matches_buf
) {
    int2 loc = (int2)(get_global_id(0), get_global_id(1));
    ulong8 desc = read_from_1d_arrul8(desc_buf, loc);
    // Search radius in grid cells (each cell covers a 32x32 pixel block)
    const int search_radius = 3;

    // Sentinel written when no match is found for this cell
    MotionVector invalid_vector = (MotionVector) {
        (PointPair) {
            (float2)(-1, -1),
            (float2)(-1, -1)
        },
        0
    };

    if (desc.s0 == 0 && desc.s1 == 0) {
        // There was no feature in this part of the frame
        write_to_1d_arrvec(
            matches_buf,
            loc,
            invalid_vector
        );

        return;
    }

    // Clamp the search window to the compacted grid bounds
    int2 start = max(loc - search_radius, 0);
    int2 end = min(loc + search_radius, (int2)(get_global_size(0) - 1, get_global_size(1) - 1));

    for (int i = start.x; i < end.x; ++i) {
        for (int j = start.y; j < end.y; ++j) {
            int2 prev_point = (int2)(i, j);
            int total_dist = 0;

            ulong8 prev_desc = read_from_1d_arrul8(prev_desc_buf, prev_point);

            if (prev_desc.s0 == 0 && prev_desc.s1 == 0) {
                // No feature in that cell of the previous frame
                continue;
            }

            ulong *prev_desc_p = &prev_desc;
            ulong *desc_p = &desc;

            // Hamming distance: popcount of the XOR across all 8 lanes
            for (int i = 0; i < 8; i++) {
                total_dist += popcount(desc_p[i] ^ prev_desc_p[i]);
            }

            if (total_dist < DISTANCE_THRESHOLD) {
                write_to_1d_arrvec(
                    matches_buf,
                    loc,
                    (MotionVector) {
                        (PointPair) {
                            read_from_1d_arrf2(prev_refined_features, prev_point),
                            read_from_1d_arrf2(refined_features, loc)
                        },
                        1
                    }
                );

                return;
            }
        }
    }

    // There is no found match for this point
    write_to_1d_arrvec(
        matches_buf,
        loc,
        invalid_vector
    );
}
|
||||
|
||||
// Returns the position of the given point after the affine transform
// (a 6-element row-major 2x3 matrix) is applied.
static float2 transformed_point(float2 p, __global const float *transform) {
    float2 result = (float2)(
        p.x * transform[0] + p.y * transform[1] + transform[2],
        p.x * transform[3] + p.y * transform[4] + transform[5]
    );

    return result;
}
|
||||
|
||||
|
||||
// Performs the given transform on the src image
//
// Each destination pixel is sampled from its transformed position in src
// using the normalized, mirrored-repeat linear sampler, so regions mapped
// from outside the frame are filled with mirrored pixels.
__kernel void transform(
    __read_only image2d_t src,
    __write_only image2d_t dst,
    __global const float *transform
) {
    int2 loc = (int2)(get_global_id(0), get_global_id(1));
    // Image dimensions, used to normalize coordinates for the sampler
    float2 norm = convert_float2(get_image_dim(src));

    write_imagef(
        dst,
        loc,
        read_imagef(
            src,
            sampler_linear_mirror,
            transformed_point((float2)(loc.x, loc.y), transform) / norm
        )
    );
}
|
||||
|
||||
// Returns the new location of the given point using the given crop bounding box
// and the width and height of the original frame.
//
// p is mapped proportionally from the full frame into the crop rectangle
// defined by top_left / bottom_right.
static float2 cropped_point(
    float2 p,
    float2 top_left,
    float2 bottom_right,
    int2 orig_dim
) {
    float2 ret;

    float crop_width = bottom_right.x - top_left.x;
    float crop_height = bottom_right.y - top_left.y;

    // Position of p as a fraction of the original frame
    float width_norm = p.x / (float)orig_dim.x;
    float height_norm = p.y / (float)orig_dim.y;

    ret.x = (width_norm * crop_width) + top_left.x;
    // NOTE(review): the y offset uses (orig_dim.y - bottom_right.y) rather
    // than top_left.y, unlike the x axis — presumably intentional for this
    // coordinate convention; verify against the host-side crop computation.
    ret.y = (height_norm * crop_height) + ((float)orig_dim.y - bottom_right.y);

    return ret;
}
|
||||
|
||||
// Upscales the given cropped region to the size of the original frame
//
// Each destination pixel is bilinearly sampled from the corresponding
// position inside the crop rectangle of src.
__kernel void crop_upscale(
    __read_only image2d_t src,
    __write_only image2d_t dst,
    float2 top_left,
    float2 bottom_right
) {
    int2 loc = (int2)(get_global_id(0), get_global_id(1));

    write_imagef(
        dst,
        loc,
        read_imagef(
            src,
            sampler_linear,
            cropped_point((float2)(loc.x, loc.y), top_left, bottom_right, get_image_dim(dst))
        )
    );
}
|
||||
|
||||
// Draws boxes to represent the given point matches and uses the given transform
// and crop info to make sure their positions are accurate on the transformed frame.
//
// model_matches is an array of three points that were used by the RANSAC process
// to generate the given transform
//
// Box color legend:
//   black  - matched point RANSAC considered an outlier
//   green  - matched point RANSAC considered an inlier
//   orange - point used to calculate the model
//   blue   - the point's position in the previous frame
__kernel void draw_debug_info(
    __write_only image2d_t dst,
    __global const MotionVector *matches,
    __global const MotionVector *model_matches,
    int num_model_matches,
    __global const float *transform
) {
    int loc = get_global_id(0);
    MotionVector vec = matches[loc];
    // Black box: matched point that RANSAC considered an outlier
    float4 big_rect_color = (float4)(0.1f, 0.1f, 0.1f, 1.0f);

    if (vec.should_consider) {
        // Green box: matched point that RANSAC considered an inlier
        big_rect_color = (float4)(0.0f, 1.0f, 0.0f, 1.0f);
    }

    for (int i = 0; i < num_model_matches; i++) {
        if (vec.p.p2.x == model_matches[i].p.p2.x && vec.p.p2.y == model_matches[i].p.p2.y) {
            // Orange box: point used to calculate model
            big_rect_color = (float4)(1.0f, 0.5f, 0.0f, 1.0f);
        }
    }

    // Map both endpoints into the stabilized frame so the boxes line up
    float2 transformed_p1 = transformed_point(vec.p.p1, transform);
    float2 transformed_p2 = transformed_point(vec.p.p2, transform);

    draw_box(dst, (int2)(transformed_p2.x, transformed_p2.y), big_rect_color, 5);
    // Small light blue box: the point in the previous frame
    draw_box(dst, (int2)(transformed_p1.x, transformed_p1.y), (float4)(0.0f, 0.3f, 0.7f, 1.0f), 3);
}
|
@ -23,6 +23,7 @@ extern const char *ff_opencl_source_avgblur;
|
||||
extern const char *ff_opencl_source_colorkey;
|
||||
extern const char *ff_opencl_source_colorspace_common;
|
||||
extern const char *ff_opencl_source_convolution;
|
||||
extern const char *ff_opencl_source_deshake;
|
||||
extern const char *ff_opencl_source_neighbor;
|
||||
extern const char *ff_opencl_source_nlmeans;
|
||||
extern const char *ff_opencl_source_overlay;
|
||||
|
@ -31,7 +31,7 @@
|
||||
|
||||
#define LIBAVFILTER_VERSION_MAJOR 7
|
||||
#define LIBAVFILTER_VERSION_MINOR 58
|
||||
#define LIBAVFILTER_VERSION_MICRO 100
|
||||
#define LIBAVFILTER_VERSION_MICRO 101
|
||||
|
||||
|
||||
#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
|
||||
|
2202
libavfilter/vf_deshake_opencl.c
Normal file
2202
libavfilter/vf_deshake_opencl.c
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user