From patchwork Wed Jan 30 08:13:02 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ruiling Song X-Patchwork-Id: 11910 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id EB02944CC2A for ; Wed, 30 Jan 2019 10:15:52 +0200 (EET) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 124BD68ADB4; Wed, 30 Jan 2019 10:15:41 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mga11.intel.com (mga11.intel.com [192.55.52.93]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id DBFDE68AB9E for ; Wed, 30 Jan 2019 10:15:33 +0200 (EET) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga008.jf.intel.com ([10.7.209.65]) by fmsmga102.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 30 Jan 2019 00:15:52 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.56,540,1539673200"; d="scan'208";a="113836061" Received: from ruiling-skl2.sh.intel.com ([10.239.158.154]) by orsmga008.jf.intel.com with ESMTP; 30 Jan 2019 00:15:50 -0800 From: Ruiling Song To: ffmpeg-devel@ffmpeg.org Date: Wed, 30 Jan 2019 16:13:02 +0800 Message-Id: <1548835982-8118-1-git-send-email-ruiling.song@intel.com> X-Mailer: git-send-email 2.7.4 Subject: [FFmpeg-devel] [RFC] lavfi: add scale_opencl filter. X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Ruiling Song MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" Signed-off-by: Ruiling Song --- This patch depends on the colorspace patchset I sent before (https://patchwork.ffmpeg.org/patch/11820/) Although I am still working on some minor functionality, hope somebody could give some comments about the overall design. Ruiling configure | 1 + libavfilter/Makefile | 2 + libavfilter/allfilters.c | 1 + libavfilter/opencl/scale.cl | 252 ++++++++++++++++ libavfilter/opencl_source.h | 1 + libavfilter/vf_scale_opencl.c | 682 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 939 insertions(+) create mode 100644 libavfilter/opencl/scale.cl create mode 100644 libavfilter/vf_scale_opencl.c diff --git a/configure b/configure index ec8f70d..5640137 100755 --- a/configure +++ b/configure @@ -3450,6 +3450,7 @@ rubberband_filter_deps="librubberband" sab_filter_deps="gpl swscale" scale2ref_filter_deps="swscale" scale_filter_deps="swscale" +scale_opencl_filter_deps="opencl" scale_qsv_filter_deps="libmfx" select_filter_select="scene_sad" sharpness_vaapi_filter_deps="vaapi" diff --git a/libavfilter/Makefile b/libavfilter/Makefile index bc642ac..9de7d44 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -343,6 +343,8 @@ OBJS-$(CONFIG_SCALE_FILTER) += vf_scale.o scale.o OBJS-$(CONFIG_SCALE_CUDA_FILTER) += vf_scale_cuda.o vf_scale_cuda.ptx.o \ cuda_check.o OBJS-$(CONFIG_SCALE_NPP_FILTER) += vf_scale_npp.o scale.o cuda_check.o +OBJS-$(CONFIG_SCALE_OPENCL_FILTER) += vf_scale_opencl.o opencl.o \ + opencl/scale.o OBJS-$(CONFIG_SCALE_QSV_FILTER) += vf_scale_qsv.o OBJS-$(CONFIG_SCALE_VAAPI_FILTER) += vf_scale_vaapi.o scale.o vaapi_vpp.o OBJS-$(CONFIG_SCALE2REF_FILTER) += vf_scale.o scale.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index c51ae0f..5708d16 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -325,6 +325,7 @@ extern AVFilter ff_vf_sab; extern AVFilter ff_vf_scale; extern AVFilter ff_vf_scale_cuda; extern AVFilter ff_vf_scale_npp; +extern AVFilter ff_vf_scale_opencl; extern AVFilter ff_vf_scale_qsv; extern AVFilter ff_vf_scale_vaapi; extern AVFilter ff_vf_scale2ref; diff --git a/libavfilter/opencl/scale.cl b/libavfilter/opencl/scale.cl new file mode 100644 index 0000000..5d3deda --- /dev/null +++ b/libavfilter/opencl/scale.cl @@ -0,0 +1,252 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +extern float3 yuv2rgb(float, float, float); +extern float3 rgb2yuv(float, float, float); + +const sampler_t sampler_nearest = (CLK_NORMALIZED_COORDS_FALSE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST); + +const sampler_t sampler_linear = (CLK_NORMALIZED_COORDS_FALSE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_LINEAR); + +float4 neighbor(image2d_t img, float vscale, + float hscale, int x, int y, + __constant float *coff_x, + __constant float *coff_y, + int2 filter_size) +{ + float xi = ((float)x + 0.5f) * hscale; + float yi = ((float)y + 0.5f) * vscale; + + return read_imagef(img, sampler_nearest, (float2)(xi, yi)); +} + +float4 bilinear(image2d_t img, float vscale, + float hscale, int x, int y, + __constant float *coff_x, + __constant float *coff_y, + int2 filter_size) +{ + float xi = ((float)x + 0.5f) * hscale; + float yi = ((float)y + 0.5f) * vscale; + + return read_imagef(img, sampler_linear, (float2)(xi, yi)); +} + +float4 generic_filter(image2d_t img, float vscale, float hscale, int x, int y, + __constant float *coff_x, __constant float *coff_y, + int2 filter_size) +{ + int2 dst_pos = (int2)(x, y); + float2 src_coord = (convert_float2(dst_pos) + 0.5f) * + (float2)(hscale, vscale); + int2 src_pos = convert_int2(floor(src_coord - 0.5f)); + + float4 color = 0.0f; + for (int i = 0; i < filter_size.y; ++i) { + float4 sum = 0.0f; + for (int j = 0; j < filter_size.x; ++j) { + int x_offset = filter_size.x / 2 - j; + int y_offset = filter_size.y / 2 - i; + float4 c = read_imagef(img, sampler_nearest, + src_pos + (int2)(x_offset, y_offset)); + sum += c * coff_x[dst_pos.x * filter_size.x + j]; + } + color += sum * coff_y[dst_pos.y * filter_size.y + i]; + } + return color; +} + +__kernel void scale(__write_only image2d_t dst, + __read_only image2d_t src, + int dst_width, int dst_height, + int src_width, int src_height, + __constant float *coff_x, + __constant float *coff_y, + int2 filter_size) +{ + int x = get_global_id(0); + int y = get_global_id(1); + float vscale = (float)src_height / (float)dst_height; + float hscale = (float)src_width / (float)dst_width; + + float4 sum = algorithm(src, vscale, hscale, x, y, + coff_x, coff_y, filter_size); + + if (x < dst_width && y < dst_height) + write_imagef(dst, (int2)(x, y), sum); +} + +// read chroma value from 'img', 'color[4]' contains the results. +inline void read_chroma(float4 color[4], image2d_t img, + float scalev, float scaleh, int xchr, int ychr, + __constant float *coff_x, __constant float *coff_y, + int2 filter_size, int chroma_h, int chroma_v) +{ + color[0] = algorithm(img, scalev, scaleh, xchr, ychr, coff_x, coff_y, + filter_size); + + if (chroma_v == 2) { + color[2] = algorithm(img, scalev, scaleh, xchr, ychr + 1, coff_x, coff_y, + filter_size); + if (chroma_h == 2) { + color[1] = algorithm(img, scalev, scaleh, xchr + 1, ychr, coff_x, + coff_y, filter_size); + color[3] = algorithm(img, scalev, scaleh, xchr + 1, ychr + 1, coff_x, + coff_y, filter_size); + } else { + color[1] = color[0]; + color[3] = color[2]; + } + } else { + color[1] = color[2] = color[3] = color[0]; + } +} + +inline void write_chroma(write_only image2d_t img, float4 c[4], + int xchr, int ychr, int chroma_h, int chroma_v) +{ + write_imagef(img, (int2)(xchr, ychr), c[0]); + + if (chroma_v == 2) { + write_imagef(img, (int2)(xchr, ychr + 1), c[2]); + if (chroma_h == 2) { + write_imagef(img, (int2)(xchr + 1, ychr), c[1]); + write_imagef(img, (int2)(xchr + 1, ychr + 1), c[3]); + } + } +} + +__kernel void convert(__write_only image2d_t dst0, + __write_only image2d_t dst1, + __write_only image2d_t dst2, + __write_only image2d_t dst3, + __read_only image2d_t src0, + __read_only image2d_t src1, + __read_only image2d_t src2, + __read_only image2d_t src3, + int dst_width, int dst_height, + int src_width, int src_height, + __constant float *coff_x, + __constant float *coff_y, + int2 filter_size) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int x2 = x * 2; + int y2 = y * 2; + int xchr = SRC_CHROMA_H * x; + int ychr = SRC_CHROMA_V * y; + int i; + float4 color[4], color0[4], color1[4], color2[4], color3[4]; + + float scalev = (float)src_height / (float)dst_height; + float scaleh = (float)src_width / (float)dst_width; + color0[0] = algorithm(src0, scalev, scaleh, x2, y2, + coff_x, coff_y, filter_size); + color0[1] = algorithm(src0, scalev, scaleh, x2 + 1, y2, + coff_x, coff_y, filter_size); + color0[2] = algorithm(src0, scalev, scaleh, x2, y2 + 1, + coff_x, coff_y, filter_size); + color0[3] = algorithm(src0, scalev, scaleh, x2 + 1, y2 + 1, + coff_x, coff_y, filter_size); + #pragma unroll + for (i = 0; i < 4; i++) + color[i] = color0[i]; + + if (SRC_IMGS > 1) { + read_chroma(color1, src1, scalev, scaleh, xchr, ychr, + coff_x, coff_y, filter_size, + SRC_CHROMA_H, SRC_CHROMA_V); + #pragma unroll + for (i = 0; i < 4; i++) + color[i].yz = color1[i].xy; + } + + if (SRC_IMGS > 2) { + read_chroma(color2, src2, scalev, scaleh, xchr, ychr, + coff_x, coff_y, filter_size, + SRC_CHROMA_H, SRC_CHROMA_V); + #pragma unroll + for (i = 0; i < 4; i++) + color[i].z = color2[i].x; + } + + if (SRC_IMGS > 3) { + color3[0] = algorithm(src3, scalev, scaleh, x2, y2, + coff_x, coff_y, filter_size); + color3[1] = algorithm(src3, scalev, scaleh, x2 + 1, y2, + coff_x, coff_y, filter_size); + color3[2] = algorithm(src3, scalev, scaleh, x2, y2 + 1, + coff_x, coff_y, filter_size); + color3[3] = algorithm(src3, scalev, scaleh, x2 + 1, y2 + 1, + coff_x, coff_y, filter_size); + #pragma unroll + for (i = 0; i < 4; i++) + color[i].w = color3[i].x; + } + + // possible yuv-rgb conversion here + #ifdef YUV2RGB + #pragma unroll + for (i = 0; i < 4; i++) + color[i].xyz = yuv2rgb(color[i].x, color[i].y, color[i].z); + #endif + + #ifdef RGB2YUV + #pragma unroll + for (i = 0; i < 4; i++) + color[i].xyz = rgb2yuv(color[i].x, color[i].y, color[i].z); + + #endif + + xchr = DST_CHROMA_H * x; + ychr = DST_CHROMA_V * y; + + if (x2 < dst_width && y2 < dst_height) { + write_imagef(dst0, (int2)(x2, y2 ), color[0]); + write_imagef(dst0, (int2)(x2 + 1, y2 ), color[1]); + write_imagef(dst0, (int2)(x2, y2 + 1), color[2]); + write_imagef(dst0, (int2)(x2 + 1, y2 + 1), color[3]); + } + + if (DST_IMGS > 1 && x2 < dst_width && y2 < dst_height) { + float4 c2[4]; + #pragma unroll + for (i = 0; i < 4; i++) + c2[i] = color[i].yzyz; + write_chroma(dst1, c2, xchr, ychr, DST_CHROMA_H, DST_CHROMA_V); + + if (DST_IMGS > 2) { + #pragma unroll + for (i = 0; i < 4; i++) + c2[i] = color[i].zzzz; + write_chroma(dst2, c2, xchr, ychr, DST_CHROMA_H, DST_CHROMA_V); + } + + if (DST_IMGS > 3) { + write_imagef(dst3, (int2)(x2, y2), color[0].wwww); + write_imagef(dst3, (int2)(x2 + 1, y2), color[1].wwww); + write_imagef(dst3, (int2)(x2, y2 + 1), color[2].wwww); + write_imagef(dst3, (int2)(x2 + 1, y2 + 1), color[3].wwww); + } + } +} diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h index 4118138..3dd7634 100644 --- a/libavfilter/opencl_source.h +++ b/libavfilter/opencl_source.h @@ -24,6 +24,7 @@ extern const char *ff_opencl_source_colorspace_common; extern const char *ff_opencl_source_convolution; extern const char *ff_opencl_source_neighbor; extern const char *ff_opencl_source_overlay; +extern const char *ff_opencl_source_scale; extern const char *ff_opencl_source_tonemap; extern const char *ff_opencl_source_transpose; extern const char *ff_opencl_source_unsharp; diff --git a/libavfilter/vf_scale_opencl.c b/libavfilter/vf_scale_opencl.c new file mode 100644 index 0000000..929a5dd --- /dev/null +++ b/libavfilter/vf_scale_opencl.c @@ -0,0 +1,682 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/bprint.h" +#include "libavutil/common.h" +#include "libavutil/imgutils.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" + +#include "avfilter.h" +#include "colorspace.h" +#include "internal.h" +#include "opencl.h" +#include "opencl_source.h" +#include "scale.h" +#include "video.h" + +enum ScaleAlgorithm { + SCALE_NEIGHBOR = 0, + SCALE_BILINEAR, + SCALE_BICUBIC, +}; + +typedef struct SurfaceInfo { + int plane_id[4]; + int nb_planes; + int is_rgb; + int width; + int height; + int chroma_v; /// value of (chroma_height / half_luma_height) + int chroma_h; /// value of (chroma_width / half_luma_width) +} SurfaceInfo; + +typedef struct ScaleInfo { + SurfaceInfo src; + SurfaceInfo dst; +} ScaleInfo; + +typedef struct ScaleOpenCLContext { + OpenCLFilterContext ocf; + enum AVPixelFormat format; + enum AVColorSpace colorspace, colorspace_in, colorspace_out; + enum AVColorRange range, range_in, range_out; + enum AVChromaLocation chroma_loc; + enum ScaleAlgorithm algorithm; + + char *w_expr; + char *h_expr; + char *format_str; + int initialised; + cl_kernel kernel; + cl_command_queue command_queue; + cl_mem coff_x; + cl_mem coff_y; + cl_int2 filter_size; + ScaleInfo job; +} ScaleOpenCLContext; + +static const char *algo_str[3] = +{ + "neighbor", + "bilinear", + "generic_filter" +}; + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV444P, + AV_PIX_FMT_YUV422P, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_GBRP, + AV_PIX_FMT_RGBA, + AV_PIX_FMT_BGRA, + AV_PIX_FMT_ARGB +}; + +static void gather_surface_info(AVFilterContext *avctx, SurfaceInfo *info, + AVFrame *frame) +{ + const AVPixFmtDescriptor *desc; + AVHWFramesContext *hwctx; + enum AVPixelFormat format; + int plane_id, plane_id_last; + int i, nb_comp; + + hwctx = (AVHWFramesContext *)frame->hw_frames_ctx->data; + format = hwctx->sw_format; + desc = av_pix_fmt_desc_get(format); + info->is_rgb = desc->flags & AV_PIX_FMT_FLAG_RGB; + + nb_comp = desc->nb_components; + plane_id_last = -1; + + info->width = frame->width; + info->height = frame->height; + info->chroma_h = 2 / (1 << desc->log2_chroma_w); + info->chroma_v = 2 / (1 << desc->log2_chroma_h); + for (i = 0; i < nb_comp; i++) { + plane_id = desc->comp[i].plane; + + if (plane_id != plane_id_last) { + info->plane_id[info->nb_planes] = plane_id; + info->nb_planes++; + } + plane_id_last = plane_id; + } +} + +static int init_jobs(AVFilterContext *avctx, AVFrame *in, AVFrame *out) +{ + ScaleOpenCLContext *ctx = avctx->priv; + ScaleInfo *info = &ctx->job; + + gather_surface_info(avctx, &info->src, in); + gather_surface_info(avctx, &info->dst, out); + + av_log(avctx, AV_LOG_DEBUG, "Scale Job Info:\n"); + av_log(avctx, AV_LOG_DEBUG, "src: planes: %d width: %d height: %d, " + "plane-id (%d %d %d %d)\n", info->src.nb_planes, + info->src.width, info->src.height, + info->src.plane_id[0], info->src.plane_id[1], + info->src.plane_id[2], info->src.plane_id[3]); + + av_log(avctx, AV_LOG_DEBUG, "dst: planes: %d width: %d height: %d, " + "plane-id (%d %d %d %d)\n", info->dst.nb_planes, + info->dst.width, info->dst.height, + info->dst.plane_id[0], info->dst.plane_id[1], + info->dst.plane_id[2], info->dst.plane_id[3]); + + return 0; +} + +static const int filter_size[] = { + [SCALE_NEIGHBOR] = 1, + [SCALE_BILINEAR] = 1, + [SCALE_BICUBIC] = 4, +}; + +static float catmullrom(float x) +{ + float x2, x3; + const float B = 0.0f; + const float C = 0.5f; + x = x < 0.0f ? -x : x; + x2 = x * x; + x3 = x2 * x; + + if(x < 1.0f) { + return ((12 - 9 * B - 6 * C) * x3 + + (-18 + 12 * B + 6 * C) * x2 + + (6 - 2 * B)) / 6.0f; + } else if (x >= 1.0 && x < 2.0f) { + return (( -B - 6 * C) * x3 + + ( 6 * B + 30 * C ) * x2 + + (-( 12 * B ) - 48 * C) * x + + 8 * B + 24 * C) / 6.0f; + } else { + return 0.0f; + } +} + +static float filter(enum ScaleAlgorithm algo, float f) +{ + switch (algo) { + case SCALE_BICUBIC: + return catmullrom(f); + default: + return f; + } +} + +static void fill_filter_table(AVFilterContext *avctx, float *buf, int length, + int filter_size, float scale) +{ + ScaleOpenCLContext *ctx = avctx->priv; + int i, k; + float sum; + for (i = 0; i < length; i++) { + float dst_in_src = (i + 0.5) * scale - 0.5; + float t = dst_in_src - floor(dst_in_src); + sum = 0.0; + for (k = 0; k < filter_size; k++) { + float fpos = ((float)(filter_size / 2 - k) - t); + float ff = filter(ctx->algorithm, fpos / FFMIN(scale, 1.0)); + buf[i * filter_size + k] = ff; + sum += ff; + } + + for (k = 0; k < filter_size; k++) { + buf[i * filter_size + k] /= sum; + } + } +} +static int prepare_opencl_header(AVFilterContext *avctx, AVBPrint *header) +{ + ScaleOpenCLContext *ctx = avctx->priv; + double rgb2yuv[3][3], yuv2rgb[3][3]; + const struct LumaCoefficients *luma_src; + + luma_src = ff_get_luma_coefficients(ctx->colorspace_in); + if (!luma_src) { + av_log(avctx, AV_LOG_ERROR, "unsupported input colorspace %d\n", + ctx->colorspace_in); + return AVERROR(EINVAL); + } + // fill this value as it is required to compile opencl program successfully + // currently we only support source & destination with same color-space. + av_bprintf(header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n", + luma_src->cr, luma_src->cg, luma_src->cb); + av_bprintf(header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n", + luma_src->cr, luma_src->cg, luma_src->cb); + av_bprintf(header, "#define RGB2RGB_PASSTHROUGH\n"); + + ff_fill_rgb2yuv_table(luma_src, rgb2yuv); + ff_opencl_print_const_matrix_3x3(header, "yuv_matrix", rgb2yuv); + ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); + ff_opencl_print_const_matrix_3x3(header, "rgb_matrix", yuv2rgb); + + if (ctx->job.src.is_rgb && !ctx->job.dst.is_rgb) { + av_bprintf(header, "#define RGB2YUV\n"); + } else if (!ctx->job.src.is_rgb && ctx->job.dst.is_rgb) { + av_bprintf(header, "#define YUV2RGB\n"); + } + + if (!ctx->job.src.is_rgb && ctx->range_in == AVCOL_RANGE_JPEG) + av_bprintf(header, "#define FULL_RANGE_IN\n"); + + if (!ctx->job.dst.is_rgb && ctx->range_out == AVCOL_RANGE_JPEG) + av_bprintf(header, "#define FULL_RANGE_OUT\n"); + + av_bprintf(header, "#define chroma_loc %d\n", (int)ctx->chroma_loc); + av_bprintf(header, "#define algorithm %s\n", algo_str[ctx->algorithm]); + av_bprintf(header, "#define DST_IMGS %d\n", ctx->job.dst.nb_planes); + av_bprintf(header, "#define SRC_IMGS %d\n", ctx->job.src.nb_planes); + av_bprintf(header, "#define SRC_CHROMA_V %d\n", ctx->job.src.chroma_v); + av_bprintf(header, "#define SRC_CHROMA_H %d\n", ctx->job.src.chroma_h); + av_bprintf(header, "#define DST_CHROMA_V %d\n", ctx->job.dst.chroma_v); + av_bprintf(header, "#define DST_CHROMA_H %d\n", ctx->job.dst.chroma_h); + return 0; +} +#define OPENCL_SOURCE_NB 3 +static int scale_opencl_init(AVFilterContext *avctx) +{ + ScaleOpenCLContext *ctx = avctx->priv; + const char *opencl_sources[OPENCL_SOURCE_NB]; + float *coff_x = NULL, *coff_y = NULL; + cl_int cle; + int err; + int filter_size_x, filter_size_y, size_factor; + float vscale, hscale; + const char *k_name; + AVBPrint header; + + size_factor = filter_size[ctx->algorithm]; + vscale = ctx->job.src.height / ctx->job.dst.height; + hscale = ctx->job.src.width / ctx->job.dst.width; + + filter_size_x = ceil(size_factor * FFMAX(1, hscale)); + filter_size_y = ceil(size_factor * FFMAX(1, vscale)); + ctx->filter_size.s[0] = filter_size_x; + ctx->filter_size.s[1] = filter_size_y; + + if (ctx->algorithm != SCALE_BILINEAR && + ctx->algorithm != SCALE_NEIGHBOR) { + // we don't need such coefficients for bilinear & neighbor + coff_x = av_malloc_array(filter_size_x * ctx->job.dst.width, + sizeof(float)); + coff_y = av_malloc_array(filter_size_y * ctx->job.dst.height, + sizeof(float)); + if (!coff_x || !coff_y) { + goto fail; + } + + fill_filter_table(avctx, coff_x, ctx->job.dst.width, filter_size_x, + hscale); + fill_filter_table(avctx, coff_y, ctx->job.dst.height, filter_size_y, + vscale); + + ctx->coff_x = clCreateBuffer(ctx->ocf.hwctx->context, + CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, + ctx->job.dst.width * filter_size_x * + sizeof(cl_float), + coff_x, &cle); + + ctx->coff_y = clCreateBuffer(ctx->ocf.hwctx->context, + CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, + ctx->job.dst.height * filter_size_y * + sizeof(cl_float), + coff_y, &cle); + av_freep(&coff_x); + av_freep(&coff_y); + } + + av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC); + err = prepare_opencl_header(avctx, &header); + if (err < 0) + goto fail; + av_log(avctx, AV_LOG_DEBUG, "Generated OpenCL header:\n%s\n", header.str); + opencl_sources[0] = header.str; + opencl_sources[1] = ff_opencl_source_scale; + opencl_sources[2] = ff_opencl_source_colorspace_common; + err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB); + + av_bprint_finalize(&header, NULL); + + if (err < 0) + goto fail; + + ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context, + ctx->ocf.hwctx->device_id, + 0, &cle); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL " + "command queue %d.\n", cle); + + k_name = ctx->format != AV_PIX_FMT_NONE ? "convert" : "scale"; + ctx->kernel = clCreateKernel(ctx->ocf.program, k_name, &cle); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create kernel %d.\n", cle); + + ctx->initialised = 1; + return 0; + +fail: + if (ctx->command_queue) + clReleaseCommandQueue(ctx->command_queue); + if (ctx->kernel) + clReleaseKernel(ctx->kernel); + + if (coff_x) + av_freep(&coff_x); + if (coff_y) + av_freep(&coff_y); + + av_bprint_finalize(&header, NULL); + return err; +} + +static int is_fmt_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + + return 0; +} + +static int scale_opencl_config_output(AVFilterLink *outlink) +{ + AVFilterContext *avctx = outlink->src; + ScaleOpenCLContext *s = avctx->priv; + AVFilterLink *inlink = outlink->src->inputs[0]; + AVHWFramesContext *in_frames_ctx; + enum AVPixelFormat in_format; + int w, h, ret; + + if (!strcmp(s->format_str, "same")) { + s->format = AV_PIX_FMT_NONE; + } else { + s->format = av_get_pix_fmt(s->format_str); + if (s->format == AV_PIX_FMT_NONE) { + av_log(avctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); + return AVERROR(EINVAL); + } + } + + if ((ret = ff_scale_eval_dimensions(s, + s->w_expr, s->h_expr, + inlink, outlink, + &w, &h)) < 0) + return ret; + + s->ocf.output_width = w; + s->ocf.output_height = h; + if (s->format != AV_PIX_FMT_NONE) + s->ocf.output_format = s->format; + + ret = ff_opencl_filter_config_output(outlink); + if (ret < 0) + return ret; + + in_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + + if (!is_fmt_supported(in_format)) { + av_log(avctx, AV_LOG_ERROR, "unsupported input format %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + + if (!is_fmt_supported(s->ocf.output_format)) { + av_log(avctx, AV_LOG_ERROR, "unsupported output format %s\n", + av_get_pix_fmt_name(s->ocf.output_format)); + return AVERROR(ENOSYS); + } + + if (inlink->sample_aspect_ratio.num) + outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w, + outlink->w*inlink->h}, + inlink->sample_aspect_ratio); + else + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + return 0; +} + +static int enqueue_simple_scale(AVFilterContext *avctx, cl_command_queue queue, + cl_kernel kernel, AVFrame *input, + AVFrame *output) +{ + ScaleOpenCLContext *ctx = avctx->priv; + size_t global_work[2]; + cl_int cle; + cl_mem src, dst; + cl_int src_width, src_height, dst_width, dst_height; + int err, plane, is_chroma; + + for (plane = 0; plane < FF_ARRAY_ELEMS(output->data); plane++) { + dst = (cl_mem)output->data[plane]; + src = (cl_mem)input->data[plane]; + + if (!dst) break; + + is_chroma = (plane == 1 || plane == 2); + if (!is_chroma) { + src_width = input->width; + src_height = input->height; + dst_width = output->width; + dst_height = output->height; + } else { + src_width = input->width * ctx->job.src.chroma_h / 2; + src_height = input->height * ctx->job.src.chroma_v / 2; + dst_width = output->width * ctx->job.dst.chroma_h / 2; + dst_height = output->height * ctx->job.dst.chroma_v / 2; + } + + CL_SET_KERNEL_ARG(kernel, 0, cl_mem, &dst); + CL_SET_KERNEL_ARG(kernel, 1, cl_mem, &src); + CL_SET_KERNEL_ARG(kernel, 2, cl_int, &dst_width); + CL_SET_KERNEL_ARG(kernel, 3, cl_int, &dst_height); + CL_SET_KERNEL_ARG(kernel, 4, cl_int, &src_width); + CL_SET_KERNEL_ARG(kernel, 5, cl_int, &src_height); + CL_SET_KERNEL_ARG(kernel, 6, cl_mem, &ctx->coff_x); + CL_SET_KERNEL_ARG(kernel, 7, cl_mem, &ctx->coff_y); + CL_SET_KERNEL_ARG(kernel, 8, cl_int2, &ctx->filter_size); + + err = ff_opencl_filter_work_size_from_image(avctx, global_work, output, + plane, 8); + if (err < 0) + return err; + + cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL, + global_work, NULL, + 0, NULL, NULL); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle); + + } + return 0; +fail: + return err; +} + +static int enqueue_convert(AVFilterContext *avctx, cl_command_queue queue, + cl_kernel kernel, AVFrame *in, AVFrame *out) +{ + ScaleOpenCLContext *ctx = avctx->priv; + cl_int cle; + cl_mem mem; + int err; + size_t global_work[2]; + + global_work[0] = ctx->job.dst.width / 2; + global_work[1] = ctx->job.dst.height / 2; +#define SET_DST(idx) \ +mem = (cl_mem)out->data[ctx->job.dst.plane_id[idx]]; \ +CL_SET_KERNEL_ARG(kernel, idx, cl_mem, &mem); + +#define SET_SRC(idx) \ +mem = (cl_mem)in->data[ctx->job.src.plane_id[idx]]; \ +CL_SET_KERNEL_ARG(kernel, idx + 4, cl_mem, &mem); + + SET_DST(0) + SET_DST(1) + SET_DST(2) + SET_DST(3) + + SET_SRC(0) + SET_SRC(1) + SET_SRC(2) + SET_SRC(3) +#undef SET_DST +#undef SET_SRC + CL_SET_KERNEL_ARG(kernel, 8, cl_int, &ctx->job.dst.width); + CL_SET_KERNEL_ARG(kernel, 9, cl_int, &ctx->job.dst.height); + CL_SET_KERNEL_ARG(kernel, 10, cl_int, &ctx->job.src.width); + CL_SET_KERNEL_ARG(kernel, 11, cl_int, &ctx->job.src.height); + CL_SET_KERNEL_ARG(kernel, 12, cl_mem, &ctx->coff_x); + CL_SET_KERNEL_ARG(kernel, 13, cl_mem, &ctx->coff_y); + CL_SET_KERNEL_ARG(kernel, 14, cl_int2, &ctx->filter_size); + + cle = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, + global_work, NULL, + 0, NULL, NULL); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle); + + return 0; +fail: + return err; +} + +static int scale_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input) +{ + AVFilterContext *avctx = inlink->dst; + AVFilterLink *outlink = avctx->outputs[0]; + ScaleOpenCLContext *ctx = avctx->priv; + AVFrame *output = NULL; + cl_kernel kernel; + cl_int cle; + int err; + + av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n", + av_get_pix_fmt_name(input->format), + input->width, input->height, input->pts); + + if (!input->hw_frames_ctx) + return AVERROR(EINVAL); + + + output = ff_get_video_buffer(outlink, outlink->w, outlink->h); + if (!output) { + err = AVERROR(ENOMEM); + goto fail; + } + + err = av_frame_copy_props(output, input); + if (err < 0) + goto fail; + + if (ctx->colorspace != -1) + ctx->colorspace_in = ctx->colorspace; + else + ctx->colorspace_in = input->colorspace; + // default to bt601 + if (ctx->colorspace_in == AVCOL_SPC_UNSPECIFIED) + ctx->colorspace_in = AVCOL_SPC_BT470BG; + ctx->range_in = input->color_range; + + if (!ctx->initialised) { + err = init_jobs(avctx, input, output); + if (err < 0) + goto fail; + + err = scale_opencl_init(avctx); + if (err < 0) + goto fail; + } + + kernel = ctx->kernel; + + if (ctx->format != AV_PIX_FMT_NONE) + err = enqueue_convert(avctx, ctx->command_queue, kernel, input, output); + else + err = enqueue_simple_scale(avctx, ctx->command_queue, kernel, input, + output); + if (err < 0) + goto fail; + + cle = clFinish(ctx->command_queue); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish command queue: %d.\n", cle); + + av_frame_free(&input); + + av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n", + av_get_pix_fmt_name(output->format), + output->width, output->height, output->pts); + + return ff_filter_frame(outlink, output); + +fail: + clFinish(ctx->command_queue); + av_frame_free(&input); + av_frame_free(&output); + return err; +} + +static av_cold void scale_opencl_uninit(AVFilterContext *avctx) +{ + ScaleOpenCLContext *ctx = avctx->priv; + cl_int cle; + + if (ctx->kernel) { + cle = clReleaseKernel(ctx->kernel); + if (cle != CL_SUCCESS) + av_log(avctx, AV_LOG_ERROR, "Failed to release " + "kernel: %d.\n", cle); + } + + if (ctx->command_queue) { + cle = clReleaseCommandQueue(ctx->command_queue); + if (cle != CL_SUCCESS) + av_log(avctx, AV_LOG_ERROR, "Failed to release " + "command queue: %d.\n", cle); + } + + if (ctx->coff_x) { + cle = clReleaseMemObject(ctx->coff_x); + } + + if (ctx->coff_y) { + cle = clReleaseMemObject(ctx->coff_y); + } + ff_opencl_filter_uninit(avctx); +} + +#define OFFSET(x) offsetof(ScaleOpenCLContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) +static const AVOption scale_opencl_options[] = { + { "w", "Output video width", OFFSET(w_expr), AV_OPT_TYPE_STRING, { .str = "iw" }, .flags = FLAGS }, + { "h", "Output video height", OFFSET(h_expr), AV_OPT_TYPE_STRING, { .str = "ih" }, .flags = FLAGS }, + { "format", "Output pixel format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS }, + { "in_color_matrix", "set input YCbCr type", OFFSET(colorspace), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, FLAGS, "matrix" }, + { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, "matrix" }, + { "fcc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_FCC}, 0, 0, FLAGS, "matrix" }, + { "bt470bg", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT470BG}, 0, 0, FLAGS, "matrix" }, + { "smpte170m", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE170M}, 0, 0, FLAGS, "matrix" }, + { "smpte240m", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE240M}, 0, 0, FLAGS, "matrix" }, + { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" }, + { "algorithm", "Scaling algorithm", OFFSET(algorithm), AV_OPT_TYPE_INT, { .i64 = SCALE_BILINEAR }, INT_MIN, INT_MAX, FLAGS, "algorithm" }, + { "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SCALE_NEIGHBOR}, 0, 0, FLAGS, "algorithm" }, + { "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SCALE_BILINEAR}, 0, 0, FLAGS, "algorithm" }, + { "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SCALE_BICUBIC}, 0, 0, FLAGS, "algorithm" }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(scale_opencl); + +static const AVFilterPad scale_opencl_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = &scale_opencl_filter_frame, + .config_props = &ff_opencl_filter_config_input, + }, + { NULL } +}; + +static const AVFilterPad scale_opencl_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = &scale_opencl_config_output, + }, + { NULL } +}; + +AVFilter ff_vf_scale_opencl = { + .name = "scale_opencl", + .description = NULL_IF_CONFIG_SMALL("OpenCL Scale and Format Conversion Filter"), + .priv_size = sizeof(ScaleOpenCLContext), + .priv_class = &scale_opencl_class, + .init = &ff_opencl_filter_init, + .uninit = &scale_opencl_uninit, + .query_formats = &ff_opencl_filter_query_formats, + .inputs = scale_opencl_inputs, + .outputs = scale_opencl_outputs, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +};