[FFmpeg-devel,v2,2/5] libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()

Message ID	20210804020616.82866-2-jianhua.wu@intel.com
State	Accepted
Commit	68a2722aee2868084ad3ba1a7a5431735eab049e
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: Wu Jianhua <jianhua.wu@intel.com> To: ffmpeg-devel@ffmpeg.org Date: Wed, 4 Aug 2021 10:06:13 +0800 Message-Id: <20210804020616.82866-2-jianhua.wu@intel.com> In-Reply-To: <20210804020616.82866-1-jianhua.wu@intel.com> References: <20210804020616.82866-1-jianhua.wu@intel.com> Subject: [FFmpeg-devel] [PATCH v2 2/5] libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512() Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Wu Jianhua <jianhua.wu@intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,v2,1/5] libavfilter/x86/vf_gblur: add ff_postscale_slice_avx512() \| expand [FFmpeg-devel,v2,1/5] libavfilter/x86/vf_gblur: add ff_postscale_slice_avx512() [FFmpeg-devel,v2,2/5] libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512() [FFmpeg-devel,v2,3/5] tests/checkasm/vf_gblur.c: add check_verti_slice() for unit test [FFmpeg-devel,v2,4/5] libavfilter/x86/vf_gblur: add localbuf and ff_horiz_slice_avx2/512() [FFmpeg-devel,v2,5/5] tests/checkasm/vf_gblur.c: update check_horiz_slice for the new ff_horiz_slic…

Context	Check	Description
andriy/x86_make	success	Make finished
andriy/x86_make_fate	success	Make fate finished
andriy/PPC64_make	success	Make finished
andriy/PPC64_make_fate	success	Make fate finished

diff --git a/libavfilter/gblur.h b/libavfilter/gblur.h index dce50671f6..367575a6db 100644 --- a/libavfilter/gblur.h +++ b/libavfilter/gblur.h @@ -50,6 +50,8 @@ typedef struct GBlurContext { float nuV; int nb_planes; void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale); + void (*verti_slice)(float *buffer, int width, int height, int slice_start, int slice_end, int steps, + float nu, float bscale); void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max); } GBlurContext; diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c index 3f61275658..de7ed82d49 100644 --- a/libavfilter/vf_gblur.c +++ b/libavfilter/vf_gblur.c @@ -138,6 +138,19 @@ static void do_vertical_columns(float *buffer, int width, int height, } } +static void verti_slice_c(float *buffer, int width, int height, + int slice_start, int slice_end, int steps, + float nu, float boundaryscale) +{ + int aligned_end = slice_start + (((slice_end - slice_start) >> 3) << 3); + /* Filter vertically along columns (process 8 columns in each step) */ + do_vertical_columns(buffer, width, height, slice_start, aligned_end, + steps, nu, boundaryscale, 8); + /* Filter un-aligned columns one by one */ + do_vertical_columns(buffer, width, height, aligned_end, slice_end, + steps, nu, boundaryscale, 1); +} + static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { GBlurContext *s = ctx->priv; @@ -150,16 +163,10 @@ static int filter_vertically(AVFilterContext *ctx, void *arg, int jobnr, int nb_ const int steps = s->steps; const float nu = s->nuV; float *buffer = s->buffer; - int aligned_end; - aligned_end = slice_start + (((slice_end - slice_start) >> 3) << 3); - /* Filter vertically along columns (process 8 columns in each step) */ - do_vertical_columns(buffer, width, height, slice_start, aligned_end, - steps, nu, boundaryscale, 8); + s->verti_slice(buffer, width, height, slice_start, slice_end, + steps, nu, boundaryscale); - /* Filter un-aligned columns one by one */ - do_vertical_columns(buffer, width, height, aligned_end, slice_end, - steps, nu, boundaryscale, 1); return 0; } @@ -233,6 +240,7 @@ static int query_formats(AVFilterContext *ctx) void ff_gblur_init(GBlurContext *s) { s->horiz_slice = horiz_slice_c; + s->verti_slice = verti_slice_c; s->postscale_slice = postscale_c; if (ARCH_X86) ff_gblur_init_x86(s); diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm index 276fe347f5..ac4debba74 100644 --- a/libavfilter/x86/vf_gblur.asm +++ b/libavfilter/x86/vf_gblur.asm @@ -22,6 +22,43 @@ SECTION .text +%xdefine AVX2_MMSIZE 32 +%xdefine AVX512_MMSIZE 64 + +%macro MOVSXDIFNIDN 1-* + %rep %0 + movsxdifnidn %1q, %1d + %rotate 1 + %endrep +%endmacro + +%macro PUSH_MASK 5 +%if mmsize == AVX2_MMSIZE + %assign %%n mmsize/4 + %assign %%i 0 + %rep %%n + mov %4, %3 + and %4, 1 + neg %4 + mov dword [%5 + %%i*4], %4 + sar %3, 1 + %assign %%i %%i+1 + %endrep + movu %1, [%5] +%else + kmovd %2, %3 +%endif +%endmacro + +%macro VMASKMOVPS 4 +%if mmsize == AVX2_MMSIZE + vpmaskmovd %1, %3, %2 +%else + kmovw k7, %4 + vmovups %1{k7}, %2 +%endif +%endmacro + ; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, ; float nu, float bscale) @@ -232,3 +269,155 @@ POSTSCALE_SLICE INIT_ZMM avx512 POSTSCALE_SLICE %endif + + +;******************************************************************************* +; void ff_verti_slice(float *buffer, int width, int height, int column_begin, +; int column_end, int steps, float nu, float bscale); +;******************************************************************************* +%macro VERTI_SLICE 0 +%if UNIX64 +cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \ + steps, x, y, cwidth, step, ptr, stride +%else +cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \ + steps, nu, bscale, x, y, cwidth, step, \ + ptr, stride +%endif +%assign cols mmsize/4 +%if WIN64 + VBROADCASTSS m0, num + VBROADCASTSS m1, bscalem + DEFINE_ARGS buffer, width, height, cbegin, cend, \ + steps, x, y, cwidth, step, ptr, stride + MOVSXDIFNIDN width, height, cbegin, cend, steps +%else + VBROADCASTSS m0, xmm0 ; nu + VBROADCASTSS m1, xmm1 ; bscale +%endif + mov cwidthq, cendq + sub cwidthq, cbeginq + lea strideq, [widthq * 4] + + xor xq, xq ; x = 0 + cmp cwidthq, cols + jl .x_scalar + cmp cwidthq, 0x0 + je .end_scalar + + sub cwidthq, cols +.loop_x: + xor stepq, stepq + .loop_step: + ; ptr = buffer + x + column_begin; + lea ptrq, [xq + cbeginq] + lea ptrq, [bufferq + ptrq*4] + + ; ptr[15:0] *= bcale; + movu m2, [ptrq] + mulps m2, m1 + movu [ptrq], m2 + + ; Filter downwards + mov yq, 1 + .loop_y_down: + add ptrq, strideq ; ptrq += width + movu m3, [ptrq] + FMULADD_PS m2, m2, m0, m3, m2 + movu [ptrq], m2 + + inc yq + cmp yq, heightq + jl .loop_y_down + + mulps m2, m1 + movu [ptrq], m2 + + ; Filter upwards + dec yq + .loop_y_up: + sub ptrq, strideq + movu m3, [ptrq] + FMULADD_PS m2, m2, m0, m3, m2 + movu [ptrq], m2 + + dec yq + cmp yq, 0 + jg .loop_y_up + + inc stepq + cmp stepq, stepsq + jl .loop_step + + add xq, cols + cmp xq, cwidthq + jle .loop_x + + add cwidthq, cols + cmp xq, cwidthq + jge .end_scalar + +.x_scalar: + xor stepq, stepq + mov qword [rsp + 0x10], xq + sub cwidthq, xq + mov xq, 1 + shlx cwidthq, xq, cwidthq + sub cwidthq, 1 + PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20 + mov xq, qword [rsp + 0x10] + + .loop_step_scalar: + lea ptrq, [xq + cbeginq] + lea ptrq, [bufferq + ptrq*4] + + VMASKMOVPS m2, [ptrq], m4, k1 + mulps m2, m1 + VMASKMOVPS [ptrq], m2, m4, k1 + + ; Filter downwards + mov yq, 1 + .x_scalar_loop_y_down: + add ptrq, strideq + VMASKMOVPS m3, [ptrq], m4, k1 + FMULADD_PS m2, m2, m0, m3, m2 + VMASKMOVPS [ptrq], m2, m4, k1 + + inc yq + cmp yq, heightq + jl .x_scalar_loop_y_down + + mulps m2, m1 + VMASKMOVPS [ptrq], m2, m4, k1 + + ; Filter upwards + dec yq + .x_scalar_loop_y_up: + sub ptrq, strideq + VMASKMOVPS m3, [ptrq], m4, k1 + FMULADD_PS m2, m2, m0, m3, m2 + VMASKMOVPS [ptrq], m2, m4, k1 + + dec yq + cmp yq, 0 + jg .x_scalar_loop_y_up + + inc stepq + cmp stepq, stepsq + jl .loop_step_scalar + +.end_scalar: + RET +%endmacro + +%if ARCH_X86_64 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VERTI_SLICE +%endif + +%if HAVE_AVX512_EXTERNAL +INIT_ZMM avx512 +VERTI_SLICE +%endif +%endif diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c index 34aba4ca6e..3e173410c2 100644 --- a/libavfilter/x86/vf_gblur_init.c +++ b/libavfilter/x86/vf_gblur_init.c @@ -31,6 +31,11 @@ void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max); void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max); +void ff_verti_slice_avx2(float *buffer, int width, int height, int column_begin, int column_end, + int steps, float nu, float bscale); +void ff_verti_slice_avx512(float *buffer, int width, int height, int column_begin, int column_end, + int steps, float nu, float bscale); + av_cold void ff_gblur_init_x86(GBlurContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -47,9 +52,11 @@ av_cold void ff_gblur_init_x86(GBlurContext *s) } if (EXTERNAL_AVX2(cpu_flags)) { s->horiz_slice = ff_horiz_slice_avx2; + s->verti_slice = ff_verti_slice_avx2; } if (EXTERNAL_AVX512(cpu_flags)) { s->postscale_slice = ff_postscale_slice_avx512; + s->verti_slice = ff_verti_slice_avx512; } #endif }

[FFmpeg-devel,v2,2/5] libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()

Checks

Commit Message

Patch