Message ID | 20230226054835.14201-1-nuomi2021@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/2] vvcdec: alf, add avx2 for luma and chroma filter | expand |
Context | Check | Description |
---|---|---|
yinshiyou/configure_loongarch64 | warning | Failed to apply patch |
andriy/configure_x86 | warning | Failed to apply patch |
On Sun, Feb 26, 2023 at 1:48 PM Nuo Mi <nuomi2021@gmail.com> wrote: > got 11%~26% performance for 1080P and 4k video > > clip before after delta > RitualDance_1920x1080_60_10_420_32_LD.26 35 43 22.8% > RitualDance_1920x1080_60_10_420_37_RA.266 43 48 11.6% > Tango2_3840x2160_60_10_420_27_LD.266 7.9 10 26.5% > --- > libavcodec/vvcdsp.c | 3 + > libavcodec/x86/Makefile | 2 + > libavcodec/x86/vvc_alf.asm | 301 +++++++++++++++++++++++++++++++++++ > libavcodec/x86/vvcdsp.h | 44 +++++ > libavcodec/x86/vvcdsp_init.c | 81 ++++++++++ > 5 files changed, 431 insertions(+) > create mode 100644 libavcodec/x86/vvc_alf.asm > create mode 100644 libavcodec/x86/vvcdsp.h > create mode 100644 libavcodec/x86/vvcdsp_init.c > > diff --git a/libavcodec/vvcdsp.c b/libavcodec/vvcdsp.c > index 801bd0189d..399631503f 100644 > --- a/libavcodec/vvcdsp.c > +++ b/libavcodec/vvcdsp.c > @@ -313,4 +313,7 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int > bit_depth) > VVC_DSP(8); > break; > } > +#if ARCH_X86 > + ff_vvc_dsp_init_x86(vvcdsp, bit_depth); > +#endif > } > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index 118daca333..23b2fb42bb 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -82,6 +82,7 @@ OBJS-$(CONFIG_VP9_DECODER) += > x86/vp9dsp_init.o \ > x86/vp9dsp_init_12bpp.o \ > x86/vp9dsp_init_16bpp.o > OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o > +OBJS-$(CONFIG_VVC_DECODER) += x86/vvcdsp_init.o > > > # GCC inline assembly optimizations > @@ -202,4 +203,5 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) += > x86/vp9intrapred.o \ > x86/vp9lpf_16bpp.o \ > x86/vp9mc.o \ > x86/vp9mc_16bpp.o > +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc_alf.o > X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o > diff --git a/libavcodec/x86/vvc_alf.asm b/libavcodec/x86/vvc_alf.asm > new file mode 100644 > index 0000000000..c3e4074be7 > --- /dev/null > +++ b/libavcodec/x86/vvc_alf.asm > @@ -0,0 +1,301 @@ > > 
+;****************************************************************************** > +;* VVC Adaptive Loop Filter SIMD optimizations > +;* > +;* Copyright (c) 2023 Nuo Mi <nuomi2021@gmail.com> > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +%macro PARAM_SHUFFE 1 > +%assign i (%1 * 2) > +%assign j ((i + 1) << 8) + (i) > +param_shuffe_%+%1: > +%rep 2 > + times 4 dw j > + times 4 dw (j + 0x0808) > +%endrep > +%endmacro > + > +PARAM_SHUFFE 0 > +PARAM_SHUFFE 1 > +PARAM_SHUFFE 2 > +PARAM_SHUFFE 3 > + > +dw_64: dd 64 > + > +SECTION .text > + > +%if HAVE_AVX2_EXTERNAL > + > +;%1-%3 out > +;%4 clip or filter > +%macro LOAD_LUMA_PARAMS_W16 4 > + %ifidn clip, %4 > + movu m%1, [%4q + 0 * 32] > + movu m%2, [%4q + 1 * 32] > + movu m%3, [%4q + 2 * 32] > + %elifidn filter, %4 > + movu xm%1, [%4q + 0 * 16] > + movu xm%2, [%4q + 1 * 16] > + movu xm%3, [%4q + 2 * 16] > + pmovsxbw m%1, xm%1 > + pmovsxbw m%2, xm%2 > + pmovsxbw m%3, xm%3 > + %else > + %error "need filter or clip for the fourth param" > + %endif > +%endmacro > + > +%macro LOAD_LUMA_PARAMS_W16 6 > + LOAD_LUMA_PARAMS_W16 %1, 
%2, %3, %4 > + ;m%1 = 03 02 01 00 > + ;m%2 = 07 06 05 04 > + ;m%3 = 11 10 09 08 > + > + vshufpd m%5, m%1, m%2, 0b0011 ;06 02 05 01 > + vshufpd m%6, m%3, m%5, 0b1001 ;06 10 01 09 > + > + vshufpd m%1, m%1, m%6, 0b1100 ;06 03 09 00 > + vshufpd m%2, m%2, m%6, 0b0110 ;10 07 01 04 > + vshufpd m%3, m%3, m%5, 0b0110 ;02 11 05 08 > + > + vpermpd m%1, m%1, 0b01_11_10_00 ;09 06 03 00 > + vshufpd m%2, m%2, m%2, 0b1001 ;10 07 04 01 > + vpermpd m%3, m%3, 0b10_00_01_11 ;11 08 05 02 > +%endmacro > + > +%macro LOAD_LUMA_PARAMS_W4 6 > + %ifidn clip, %4 > + movq xm%1, [%4q + 0 * 8] > + movq xm%2, [%4q + 1 * 8] > + movq xm%3, [%4q + 2 * 8] > + %elifidn filter, %4 > + movd xm%1, [%4q + 0 * 4] > + movd xm%2, [%4q + 1 * 4] > + movd xm%3, [%4q + 2 * 4] > + pmovsxbw xm%1, xm%1 > + pmovsxbw xm%2, xm%2 > + pmovsxbw xm%3, xm%3 > + %else > + %error "need filter or clip for the fourth param" > + %endif > + vpbroadcastq m%1, xm%1 > + vpbroadcastq m%2, xm%2 > + vpbroadcastq m%3, xm%3 > +%endmacro > + > +;%1-%3 out > +;%4 clip or filter > +;%5, %6 tmp > +%macro LOAD_LUMA_PARAMS 6 > + LOAD_LUMA_PARAMS_W %+ WIDTH %1, %2, %3, %4, %5, %6 > +%endmacro > + > +%macro LOAD_CHROMA_PARAMS 4 > + ;LOAD_CHROMA_PARAMS_W %+ WIDTH %1, %2, %3, %4 > + %ifidn clip, %3 > + movq xm%1, [%3q] > + movd xm%2, [%3q + 8] > + %elifidn filter, %3 > + movd xm%1, [%3q + 0] > + pinsrw xm%2, [%3q + 4], 0 > + vpmovsxbw m%1, xm%1 > + vpmovsxbw m%2, xm%2 > + %else > + %error "need filter or clip for the third param" > + %endif > + vpbroadcastq m%1, xm%1 > + vpbroadcastq m%2, xm%2 > +%endmacro > + > +%macro LOAD_PARAMS 0 > + %if LUMA > + LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7 > + LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10 > + %else > + LOAD_CHROMA_PARAMS 3, 4, filter, 5 > + LOAD_CHROMA_PARAMS 6, 7, clip, 8 > + %endif > +%endmacro > + > +;FILTER(param_idx) > +;input: m2, m9, m10 > +;output: m0, m1 > +;m12 ~ m15: tmp > +%macro FILTER 1 > + %assign i (%1 % 4) > + %assign j (%1 / 4 + 3) > + %assign k (%1 / 4 + 6) > + %define filters m%+j > + 
%define clips m%+k > + > + movu m12, [param_shuffe_%+i] > + pshufb m14, clips, m12 ;clip > + pxor m13, m13 > + psubw m13, m14 ;-clip > + > + vpsubw m9, m2 > + CLIPW m9, m13, m14 > + > + vpsubw m10, m2 > + CLIPW m10, m13, m14 > + > + vpunpckhwd m15, m9, m10 > + vpunpcklwd m9, m9, m10 > + > + pshufb m14, filters, m12 ;filter > + vpunpcklwd m10, m14, m14 > + vpunpckhwd m14, m14, m14 > + > + vpmaddwd m9, m10 > + vpmaddwd m14, m15 > + > + paddd m0, m9 > + paddd m1, m14 > +%endmacro > + > +;FILTER(param_start, off0~off2) > +%macro FILTER 4 > + %assign %%i (%1) > + %rep 3 > + lea offsetq, [%2] > + mov topq, srcq > + mov bottomq, srcq > + sub topq, offsetq > + add bottomq, offsetq > + LOAD_PIXELS 9, topq, 11 > + LOAD_PIXELS 10, bottomq, 12 > + FILTER %%i > + %assign %%i %%i+1 > + %rotate 1 > + %endrep > +%endmacro > + > +;filter pixels for luma and chroma > +%macro FILTER 0 > + %if LUMA > + FILTER 0, src_stride3q , src_strideq * 2 + > ps, src_strideq * 2 > + FILTER 3, src_strideq * 2 - ps, src_strideq + 2 * > ps, src_strideq + ps > + FILTER 6, src_strideq, src_strideq - ps, > src_strideq + -2 * ps > + FILTER 9, src_stride0q + 3 * ps, src_stride0q + 2 * > ps, src_stride0q + ps > + %else > + FILTER 0, src_strideq * 2, src_strideq + ps, > src_strideq > + FILTER 3, src_strideq - ps, src_stride0q + 2 * > ps, src_stride0q + ps > + %endif > +%endmacro > + > +%define SHIFT 7 > + > +;LOAD_PIXELS(dest, src, tmp) > +%macro LOAD_PIXELS 3 > + %if WIDTH == 16 > + movu m%1, [%2] > + %else > + pinsrq xm%1, [%2], 0 > + pinsrq xm%1, [%2 + src_strideq], 1 > + pinsrq xm%3, [%2 + src_strideq * 2], 0 > + pinsrq xm%3, [%2 + src_stride3q], 1 > + vinsertf128 m%1, xm%3, 1 > + %endif > +%endmacro > + > +;STORE_PIXELS(dest, src, tmp) > +%macro STORE_PIXELS 3 > + %if WIDTH == 16 > + movu [%1], m%2 > + %else > + pextrq [%1], xm%2, 0 > + pextrq [%1 + src_strideq], xm%2, 1 > + vperm2f128 m%2, m%2, 1 > + pextrq [%1 + src_strideq * 2], xm%2, 0 > + pextrq [%1 + src_stride3q], xm%2, 1 > + %endif > +%endmacro 
> + > +;FILTER_LUMA(width) > +%macro ALF_FILTER_16BPP 2 > +%ifidn %1, luma > + %xdefine LUMA 1 > +%else > + %xdefine LUMA 0 > +%endif > +%xdefine WIDTH %2 > +; void vvc_alf_filter_luma_w%1_16bpp_avx2(uint8_t *dst, ptrdiff_t > dst_stride, > +; const uint8_t *src, ptrdiff_t src_stride, int height, > +; const int8_t *filter, const int16_t *clip, ptrdiff_t stride, > uint16_t pixel_max); > + > +; see c code for p0 to p6 > + > +INIT_YMM avx2 > +cglobal vvc_alf_filter_%1_w%2_16bpp, 9, 15, 15, dst, dst_stride, src, > src_stride, height, filter, clip, stride, pixel_max, \ > + top, bottom, offset, src_stride3, src_stride0 > +%define ps 2 > + lea src_stride3q, [src_strideq * 2 + src_strideq] > + mov src_stride0q, 0 > + shr heightq, 2 > + > +.loop: > + LOAD_PARAMS > + > +;we need loop 4 times for a 16x4 block, 1 time for a 4x4 block > +%define rep_num (WIDTH / 4) > +%define lines (4 / rep_num) > +%rep rep_num > + VPBROADCASTD m0, [dw_64] > + VPBROADCASTD m1, [dw_64] > + > + LOAD_PIXELS 2, srcq, 9 ;p0 > + > + FILTER > + > + vpsrad m0, SHIFT > + vpsrad m1, SHIFT > + > + vpackssdw m0, m0, m1 > + paddw m0, m2 > + > + ;clip to pixel > + pinsrw xm2, pixel_maxw, 0 > + vpbroadcastw m2, xm2 > + pxor m1, m1 > + CLIPW m0, m1, m2 > + > + STORE_PIXELS dstq, 0, 1 > + > + lea srcq, [srcq + lines * src_strideq] > + lea dstq, [dstq + lines * dst_strideq] > +%endrep > + > + lea filterq, [filterq + strideq] > + lea clipq, [clipq + 2 * strideq] > + > + dec heightq > + jg .loop > + RET > +%endmacro > + > +ALF_FILTER_16BPP luma, 16 > +ALF_FILTER_16BPP luma, 4 > +ALF_FILTER_16BPP chroma, 16 > +ALF_FILTER_16BPP chroma, 4 > + > +%endif > + > diff --git a/libavcodec/x86/vvcdsp.h b/libavcodec/x86/vvcdsp.h > new file mode 100644 > index 0000000000..8589d4ae97 > --- /dev/null > +++ b/libavcodec/x86/vvcdsp.h > @@ -0,0 +1,44 @@ > +/* > + * VVC DSP for x86 > + * > + * Copyright (C) 2022 Nuo Mi > + * > + * > + * This file is part of FFmpeg. 
> + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#ifndef AVCODEC_X86_VVCDSP_H > +#define AVCODEC_X86_VVCDSP_H > + > +void ff_vvc_alf_filter_luma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t > dst_stride, > + const uint8_t *src, ptrdiff_t src_stride, int height, > + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t > pixel_max); > + > +void ff_vvc_alf_filter_luma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t > dst_stride, > + const uint8_t *src, ptrdiff_t src_stride, int height, > + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t > pixel_max); > + > +void ff_vvc_alf_filter_chroma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t > dst_stride, > + const uint8_t *src, ptrdiff_t src_stride, int height, > + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t > pixel_max); > + > +void ff_vvc_alf_filter_chroma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t > dst_stride, > + const uint8_t *src, ptrdiff_t src_stride, int height, > + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t > pixel_max); > + > +#endif //AVCODEC_X86_VVCDSP_H > + > diff --git a/libavcodec/x86/vvcdsp_init.c b/libavcodec/x86/vvcdsp_init.c > new file mode 100644 > index 0000000000..c595ed55fa > --- /dev/null > +++ 
b/libavcodec/x86/vvcdsp_init.c > @@ -0,0 +1,81 @@ > +/* > + * VVC DSP init for x86 > + * > + * Copyright (C) 2022 Nuo Mi > + * > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "config.h" > + > +#include "libavutil/cpu.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/vvcdec.h" > +#include "libavcodec/vvcdsp.h" > +#include "libavcodec/x86/vvcdsp.h" > + > +static void alf_filter_luma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride, > const uint8_t *src, ptrdiff_t src_stride, > + int width, int height, const int8_t *filter, const int16_t *clip) > +{ > + const int ps = 1; > //pixel shift > + const int pixel_max = (1 << 10) - 1; > + const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; > + int w; > + > + for (w = 0; w + 16 <= width; w += 16) { > + const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE; > + ff_vvc_alf_filter_luma_w16_16bpp_avx2(dst + (w << ps), > dst_stride, src + (w << ps), src_stride, > + height, filter + param_offset, clip + param_offset, > param_stride, pixel_max); > + } > + for ( /* nothing */; w < width; w += 4) { > + const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE; > + ff_vvc_alf_filter_luma_w4_16bpp_avx2(dst 
+ (w << ps), dst_stride, > src + (w << ps), src_stride, > + height, filter + param_offset, clip + param_offset, > param_stride, pixel_max); > + } > +} > + > +static void alf_filter_chroma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride, > const uint8_t *src, ptrdiff_t src_stride, > + int width, int height, const int8_t *filter, const int16_t *clip) > +{ > + const int ps = 1; > //pixel shift > + const int pixel_max = (1 << 10) - 1; > + int w; > + > + for (w = 0; w + 16 <= width; w += 16) { > + ff_vvc_alf_filter_chroma_w16_16bpp_avx2(dst + (w << ps), > dst_stride, src + (w << ps), src_stride, > + height, filter, clip, 0, pixel_max); > + } > + for ( /* nothing */; w < width; w += 4) { > + ff_vvc_alf_filter_chroma_w4_16bpp_avx2(dst + (w << ps), > dst_stride, src + (w << ps), src_stride, > + height, filter, clip, 0, pixel_max); > + } > +} > + > +void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth) > +{ > + const int cpu_flags = av_get_cpu_flags(); > + > + if (bit_depth == 10) { > + if (EXTERNAL_AVX2(cpu_flags)) { > + c->alf.filter[LUMA] = alf_filter_luma_10_avx2; > + c->alf.filter[CHROMA] = alf_filter_chroma_10_avx2; > + } > + } > +} > + > -- > 2.25.1 > Hi experts, Please help review the ALF filter for VVC. ALF is the most time-consuming process in VVC: it takes about 30~60% of the decoding time in the C code, and this filter is one part of it. Please help review it and give performance-improvement suggestions for this patch. For each 4x4 pixel block, there are 12 coeff (int8_t) and 12 clip (int16_t) params. For each pixel, we need to subtract and clip 24 times and multiply 12 times. The current AVX2 code processes 16x4 or 4x4 blocks in a loop. Please check [1] for a working build and [2] for the C code. Thank you very much. [1] https://github.com/ffvvc/FFmpeg/pull/42/commits [2] https://github.com/ffvvc/FFmpeg/blob/main/libavcodec/vvc_filter_template.c#L246
diff --git a/libavcodec/vvcdsp.c b/libavcodec/vvcdsp.c index 801bd0189d..399631503f 100644 --- a/libavcodec/vvcdsp.c +++ b/libavcodec/vvcdsp.c @@ -313,4 +313,7 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) VVC_DSP(8); break; } +#if ARCH_X86 + ff_vvc_dsp_init_x86(vvcdsp, bit_depth); +#endif } diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 118daca333..23b2fb42bb 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -82,6 +82,7 @@ OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \ x86/vp9dsp_init_12bpp.o \ x86/vp9dsp_init_16bpp.o OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o +OBJS-$(CONFIG_VVC_DECODER) += x86/vvcdsp_init.o # GCC inline assembly optimizations @@ -202,4 +203,5 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ x86/vp9lpf_16bpp.o \ x86/vp9mc.o \ x86/vp9mc_16bpp.o +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc_alf.o X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/libavcodec/x86/vvc_alf.asm b/libavcodec/x86/vvc_alf.asm new file mode 100644 index 0000000000..c3e4074be7 --- /dev/null +++ b/libavcodec/x86/vvc_alf.asm @@ -0,0 +1,301 @@ +;****************************************************************************** +;* VVC Adaptive Loop Filter SIMD optimizations +;* +;* Copyright (c) 2023 Nuo Mi <nuomi2021@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +%macro PARAM_SHUFFE 1 +%assign i (%1 * 2) +%assign j ((i + 1) << 8) + (i) +param_shuffe_%+%1: +%rep 2 + times 4 dw j + times 4 dw (j + 0x0808) +%endrep +%endmacro + +PARAM_SHUFFE 0 +PARAM_SHUFFE 1 +PARAM_SHUFFE 2 +PARAM_SHUFFE 3 + +dw_64: dd 64 + +SECTION .text + +%if HAVE_AVX2_EXTERNAL + +;%1-%3 out +;%4 clip or filter +%macro LOAD_LUMA_PARAMS_W16 4 + %ifidn clip, %4 + movu m%1, [%4q + 0 * 32] + movu m%2, [%4q + 1 * 32] + movu m%3, [%4q + 2 * 32] + %elifidn filter, %4 + movu xm%1, [%4q + 0 * 16] + movu xm%2, [%4q + 1 * 16] + movu xm%3, [%4q + 2 * 16] + pmovsxbw m%1, xm%1 + pmovsxbw m%2, xm%2 + pmovsxbw m%3, xm%3 + %else + %error "need filter or clip for the fourth param" + %endif +%endmacro + +%macro LOAD_LUMA_PARAMS_W16 6 + LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4 + ;m%1 = 03 02 01 00 + ;m%2 = 07 06 05 04 + ;m%3 = 11 10 09 08 + + vshufpd m%5, m%1, m%2, 0b0011 ;06 02 05 01 + vshufpd m%6, m%3, m%5, 0b1001 ;06 10 01 09 + + vshufpd m%1, m%1, m%6, 0b1100 ;06 03 09 00 + vshufpd m%2, m%2, m%6, 0b0110 ;10 07 01 04 + vshufpd m%3, m%3, m%5, 0b0110 ;02 11 05 08 + + vpermpd m%1, m%1, 0b01_11_10_00 ;09 06 03 00 + vshufpd m%2, m%2, m%2, 0b1001 ;10 07 04 01 + vpermpd m%3, m%3, 0b10_00_01_11 ;11 08 05 02 +%endmacro + +%macro LOAD_LUMA_PARAMS_W4 6 + %ifidn clip, %4 + movq xm%1, [%4q + 0 * 8] + movq xm%2, [%4q + 1 * 8] + movq xm%3, [%4q + 2 * 8] + %elifidn filter, %4 + movd xm%1, [%4q + 0 * 4] + movd xm%2, [%4q + 1 * 4] + movd xm%3, [%4q + 2 * 4] + pmovsxbw xm%1, xm%1 + pmovsxbw xm%2, xm%2 + pmovsxbw xm%3, xm%3 + %else + %error "need filter or clip for the fourth param" + %endif + vpbroadcastq m%1, xm%1 + vpbroadcastq m%2, xm%2 + 
vpbroadcastq m%3, xm%3 +%endmacro + +;%1-%3 out +;%4 clip or filter +;%5, %6 tmp +%macro LOAD_LUMA_PARAMS 6 + LOAD_LUMA_PARAMS_W %+ WIDTH %1, %2, %3, %4, %5, %6 +%endmacro + +%macro LOAD_CHROMA_PARAMS 4 + ;LOAD_CHROMA_PARAMS_W %+ WIDTH %1, %2, %3, %4 + %ifidn clip, %3 + movq xm%1, [%3q] + movd xm%2, [%3q + 8] + %elifidn filter, %3 + movd xm%1, [%3q + 0] + pinsrw xm%2, [%3q + 4], 0 + vpmovsxbw m%1, xm%1 + vpmovsxbw m%2, xm%2 + %else + %error "need filter or clip for the third param" + %endif + vpbroadcastq m%1, xm%1 + vpbroadcastq m%2, xm%2 +%endmacro + +%macro LOAD_PARAMS 0 + %if LUMA + LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7 + LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10 + %else + LOAD_CHROMA_PARAMS 3, 4, filter, 5 + LOAD_CHROMA_PARAMS 6, 7, clip, 8 + %endif +%endmacro + +;FILTER(param_idx) +;input: m2, m9, m10 +;output: m0, m1 +;m12 ~ m15: tmp +%macro FILTER 1 + %assign i (%1 % 4) + %assign j (%1 / 4 + 3) + %assign k (%1 / 4 + 6) + %define filters m%+j + %define clips m%+k + + movu m12, [param_shuffe_%+i] + pshufb m14, clips, m12 ;clip + pxor m13, m13 + psubw m13, m14 ;-clip + + vpsubw m9, m2 + CLIPW m9, m13, m14 + + vpsubw m10, m2 + CLIPW m10, m13, m14 + + vpunpckhwd m15, m9, m10 + vpunpcklwd m9, m9, m10 + + pshufb m14, filters, m12 ;filter + vpunpcklwd m10, m14, m14 + vpunpckhwd m14, m14, m14 + + vpmaddwd m9, m10 + vpmaddwd m14, m15 + + paddd m0, m9 + paddd m1, m14 +%endmacro + +;FILTER(param_start, off0~off2) +%macro FILTER 4 + %assign %%i (%1) + %rep 3 + lea offsetq, [%2] + mov topq, srcq + mov bottomq, srcq + sub topq, offsetq + add bottomq, offsetq + LOAD_PIXELS 9, topq, 11 + LOAD_PIXELS 10, bottomq, 12 + FILTER %%i + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +;filter pixels for luma and chroma +%macro FILTER 0 + %if LUMA + FILTER 0, src_stride3q , src_strideq * 2 + ps, src_strideq * 2 + FILTER 3, src_strideq * 2 - ps, src_strideq + 2 * ps, src_strideq + ps + FILTER 6, src_strideq, src_strideq - ps, src_strideq + -2 * ps + FILTER 9, src_stride0q + 3 * 
ps, src_stride0q + 2 * ps, src_stride0q + ps + %else + FILTER 0, src_strideq * 2, src_strideq + ps, src_strideq + FILTER 3, src_strideq - ps, src_stride0q + 2 * ps, src_stride0q + ps + %endif +%endmacro + +%define SHIFT 7 + +;LOAD_PIXELS(dest, src, tmp) +%macro LOAD_PIXELS 3 + %if WIDTH == 16 + movu m%1, [%2] + %else + pinsrq xm%1, [%2], 0 + pinsrq xm%1, [%2 + src_strideq], 1 + pinsrq xm%3, [%2 + src_strideq * 2], 0 + pinsrq xm%3, [%2 + src_stride3q], 1 + vinsertf128 m%1, xm%3, 1 + %endif +%endmacro + +;STORE_PIXELS(dest, src, tmp) +%macro STORE_PIXELS 3 + %if WIDTH == 16 + movu [%1], m%2 + %else + pextrq [%1], xm%2, 0 + pextrq [%1 + src_strideq], xm%2, 1 + vperm2f128 m%2, m%2, 1 + pextrq [%1 + src_strideq * 2], xm%2, 0 + pextrq [%1 + src_stride3q], xm%2, 1 + %endif +%endmacro + +;FILTER_LUMA(width) +%macro ALF_FILTER_16BPP 2 +%ifidn %1, luma + %xdefine LUMA 1 +%else + %xdefine LUMA 0 +%endif +%xdefine WIDTH %2 +; void vvc_alf_filter_luma_w%1_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *src, ptrdiff_t src_stride, int height, +; const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max); + +; see c code for p0 to p6 + +INIT_YMM avx2 +cglobal vvc_alf_filter_%1_w%2_16bpp, 9, 15, 15, dst, dst_stride, src, src_stride, height, filter, clip, stride, pixel_max, \ + top, bottom, offset, src_stride3, src_stride0 +%define ps 2 + lea src_stride3q, [src_strideq * 2 + src_strideq] + mov src_stride0q, 0 + shr heightq, 2 + +.loop: + LOAD_PARAMS + +;we need loop 4 times for a 16x4 block, 1 time for a 4x4 block +%define rep_num (WIDTH / 4) +%define lines (4 / rep_num) +%rep rep_num + VPBROADCASTD m0, [dw_64] + VPBROADCASTD m1, [dw_64] + + LOAD_PIXELS 2, srcq, 9 ;p0 + + FILTER + + vpsrad m0, SHIFT + vpsrad m1, SHIFT + + vpackssdw m0, m0, m1 + paddw m0, m2 + + ;clip to pixel + pinsrw xm2, pixel_maxw, 0 + vpbroadcastw m2, xm2 + pxor m1, m1 + CLIPW m0, m1, m2 + + STORE_PIXELS dstq, 0, 1 + + lea srcq, [srcq + lines * src_strideq] + lea dstq, 
[dstq + lines * dst_strideq] +%endrep + + lea filterq, [filterq + strideq] + lea clipq, [clipq + 2 * strideq] + + dec heightq + jg .loop + RET +%endmacro + +ALF_FILTER_16BPP luma, 16 +ALF_FILTER_16BPP luma, 4 +ALF_FILTER_16BPP chroma, 16 +ALF_FILTER_16BPP chroma, 4 + +%endif + diff --git a/libavcodec/x86/vvcdsp.h b/libavcodec/x86/vvcdsp.h new file mode 100644 index 0000000000..8589d4ae97 --- /dev/null +++ b/libavcodec/x86/vvcdsp.h @@ -0,0 +1,44 @@ +/* + * VVC DSP for x86 + * + * Copyright (C) 2022 Nuo Mi + * + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VVCDSP_H +#define AVCODEC_X86_VVCDSP_H + +void ff_vvc_alf_filter_luma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, int height, + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max); + +void ff_vvc_alf_filter_luma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, int height, + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max); + +void ff_vvc_alf_filter_chroma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, int height, + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max); + +void ff_vvc_alf_filter_chroma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, int height, + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max); + +#endif //AVCODEC_X86_VVCDSP_H + diff --git a/libavcodec/x86/vvcdsp_init.c b/libavcodec/x86/vvcdsp_init.c new file mode 100644 index 0000000000..c595ed55fa --- /dev/null +++ b/libavcodec/x86/vvcdsp_init.c @@ -0,0 +1,81 @@ +/* + * VVC DSP init for x86 + * + * Copyright (C) 2022 Nuo Mi + * + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vvcdec.h" +#include "libavcodec/vvcdsp.h" +#include "libavcodec/x86/vvcdsp.h" + +static void alf_filter_luma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, + int width, int height, const int8_t *filter, const int16_t *clip) +{ + const int ps = 1; //pixel shift + const int pixel_max = (1 << 10) - 1; + const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; + int w; + + for (w = 0; w + 16 <= width; w += 16) { + const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE; + ff_vvc_alf_filter_luma_w16_16bpp_avx2(dst + (w << ps), dst_stride, src + (w << ps), src_stride, + height, filter + param_offset, clip + param_offset, param_stride, pixel_max); + } + for ( /* nothing */; w < width; w += 4) { + const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE; + ff_vvc_alf_filter_luma_w4_16bpp_avx2(dst + (w << ps), dst_stride, src + (w << ps), src_stride, + height, filter + param_offset, clip + param_offset, param_stride, pixel_max); + } +} + +static void alf_filter_chroma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, + int width, int height, const int8_t *filter, const int16_t *clip) +{ + const int ps = 1; //pixel shift + const int pixel_max = (1 << 10) - 1; + int w; + + for (w = 0; w + 16 <= width; w += 16) { + ff_vvc_alf_filter_chroma_w16_16bpp_avx2(dst + (w << ps), dst_stride, src + (w << ps), src_stride, + height, filter, clip, 0, pixel_max); + } + for ( /* nothing */; w < width; w += 4) { + ff_vvc_alf_filter_chroma_w4_16bpp_avx2(dst + (w << ps), 
dst_stride, src + (w << ps), src_stride, + height, filter, clip, 0, pixel_max); + } +} + +void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth) +{ + const int cpu_flags = av_get_cpu_flags(); + + if (bit_depth == 10) { + if (EXTERNAL_AVX2(cpu_flags)) { + c->alf.filter[LUMA] = alf_filter_luma_10_avx2; + c->alf.filter[CHROMA] = alf_filter_chroma_10_avx2; + } + } +} +