[FFmpeg-devel,v3,1/2,GSoC,2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

Message ID	20240514204019.11022-2-chen.stonechen@gmail.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: Stone Chen <chen.stonechen@gmail.com> To: ffmpeg-devel@ffmpeg.org Date: Tue, 14 May 2024 16:40:09 -0400 Message-ID: <20240514204019.11022-2-chen.stonechen@gmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Stone Chen <chen.stonechen@gmail.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,v3,1/2,GSoC,2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC \| expand [FFmpeg-devel,v3,1/2,GSoC,2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC [FFmpeg-devel,v3,2/2,GSoC,2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c

Context	Check	Description
andriy/make_x86	success	Make finished
andriy/make_fate_x86	success	Make fate finished

diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \
                                           x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)      += x86/vvc/vvc_alf.o \
                                           x86/vvc/vvc_mc.o \
-                                          x86/h26x/h2656_inter.o
+                                          x86/vvc/vvc_sad.o \
+                                          x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 0000000000..530142ad35
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,157 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128 ; used below as the row stride of both inputs, counted in 16-bit samples
+%define ROWS 2 ; DMVR SAD is only calculated on even rows to reduce complexity
+
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; tmp, a, b: leaves |a - b| per unsigned 16-bit word in b; tmp is clobbered, a is preserved
+    vpminuw           %1, %2, %3                    ; tmp = min(a, b), unsigned word compare
+    vpmaxuw           %3, %2, %3                    ; b   = max(a, b), unsigned word compare
+    vpsubusw          %3, %3, %1                    ; b   = max - min; NOTE(review): unsigned ops assume non-negative samples -- confirm with caller
+%endmacro
+
+%macro HORIZ_ADD 3 ; xm0, xm1, m1: %1 = scratch/result, %2 = low half of %3, %3 = 8-dword accumulator; low dword of %1 = sum of all 8
+    vextracti128      %1, %3, q0001                 ; %1 = upper 128 bits of %3 (dwords 7 6 5 4); q0001 presumably evaluates to 1 via x86inc
+    vpaddd            %1, %2                        ; xm0 (7 + 3) (6 + 2) (5 + 1) (4 + 0)
+    vpshufd           %2, %1, q0032                 ; xm1 - - (7 + 3) (6 + 2)
+    vpaddd            %1, %1, %2                    ; xm0 _ _ (5 1 7 3) (4 0 6 2)
+    vpshufd           %2, %1, q0001                 ; xm1 _ _ (4 0 6 2) (5 1 7 3): low dword receives element 1
+    vpaddd            %1, %1, %2                    ; low dword of xm0 = (01234567)
+%endmacro
+
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2: advance both pointers to their (dx, dy)-displaced start sample
+    sub               %3, 2                         ; dx -= 2
+    sub               %4, 2                         ; dy -= 2
+
+    mov               %5, 2
+    mov               %6, 2
+
+    add               %5, %4                        ; off1 = 2 + dy
+    sub               %6, %4                        ; off2 = 2 - dy (displacement is mirrored for the second source)
+
+    imul              %5, 128                       ; rows -> samples; literal 128 equals MAX_PB_SIZE (the define is not used here)
+    imul              %6, 128
+
+    add               %5, 2
+    add               %6, 2
+
+    add               %5, %3                        ; off1 = (2 + dy) * 128 + 2 + dx
+    sub               %6, %3                        ; off2 = (2 - dy) * 128 + 2 - dx
+
+    lea               %1, [%1 + %5 * 2]             ; scale by 2: offsets are in 16-bit samples, addresses in bytes
+    lea               %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+; vvc_sad(src1, src2, dx, dy, block_w, block_h) -> eax: sum of |src1 - src2| over every ROWS-th row of the displaced block
+cglobal vvc_sad, 6, 11, 14, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx, dx2, dy2
+    movsxd            dx2q, dxd                     ; sign-extend the 32-bit dx/dy so they can take part in 64-bit address math
+    movsxd            dy2q, dyd
+    INIT_OFFSET       src1q, src2q, dx2q, dy2q, off1q, off2q
+    pxor              m3, m3                        ; m3 = eight 32-bit partial sums, cleared
+    pxor              m8, m8                        ; NOTE(review): m8 is reloaded by movu before any read and unused in the wide path -- looks redundant
+
+    cmp               block_wd, 16
+    jge               vvc_sad_16_128                ; block_w >= 16 takes the generic-width path; otherwise fall through
+
+    vvc_sad_8:                                      ; block_w < 16: one 8-sample (16-byte) load per row
+    .loop_height:                                   ; each iteration covers four rows spaced ROWS apart (8 rows of block height)
+    movu              xm0, [src1q]
+    movu              xm1, [src2q]
+    MIN_MAX_SAD       xm2, xm0, xm1                 ; xm1 = per-word absolute difference
+    vpmovzxwd         m1, xm1                       ; zero-extend the eight words to dwords before accumulating
+    vpaddd            m3, m1
+
+    movu              xm5, [src1q + MAX_PB_SIZE * ROWS * 2]     ; next processed row: ROWS rows down, 2 bytes per sample
+    movu              xm6, [src2q + MAX_PB_SIZE * ROWS * 2]
+    MIN_MAX_SAD       xm7, xm5, xm6
+    vpmovzxwd         m6, xm6
+    vpaddd            m3, m6
+
+    movu              xm8, [src1q + MAX_PB_SIZE * 2 * ROWS * 2]
+    movu              xm9, [src2q + MAX_PB_SIZE * 2 * ROWS * 2]
+    MIN_MAX_SAD       xm10, xm8, xm9
+    vpmovzxwd         m9, xm9
+    vpaddd            m3, m9
+
+    movu              xm11, [src1q + MAX_PB_SIZE * 3 * ROWS * 2]
+    movu              xm12, [src2q + MAX_PB_SIZE * 3 * ROWS * 2]
+    MIN_MAX_SAD       xm13, xm11, xm12
+    vpmovzxwd         m12, xm12
+
+    vpaddd            m3, m12
+
+    add               src1q, MAX_PB_SIZE * 4 * ROWS * 2         ; step past the four processed rows (and the skipped ones between them)
+    add               src2q, MAX_PB_SIZE * 4 * ROWS * 2
+
+    sub               block_hd, 8                   ; 8 rows of block height consumed per iteration
+    jg                .loop_height
+
+    HORIZ_ADD         xm0, xm3, m3                  ; fold the eight partial sums into the low dword of xm0
+    movd              eax, xm0                      ; result is returned in eax
+    RET
+
+    vvc_sad_16_128:                                 ; block_w >= 16: walk each processed row in 16-sample groups
+    .loop_height:
+    mov               off1q, src1q                  ; remember the row start; the offset registers are reused as scratch here
+    mov               off2q, src2q
+    mov               row_idxd, block_wd
+    sar               row_idxd, 4                   ; row_idx = block_w / 16 = number of 16-sample groups in a row
+
+    .loop_width:                                    ; one iteration = 16 samples = two 8-sample (16-byte) loads
+    movu              xm0, [src1q]
+    movu              xm1, [src2q]
+    MIN_MAX_SAD       xm2, xm0, xm1
+    vpmovzxwd         m1, xm1
+    vpaddd            m3, m1
+
+    movu              xm5, [src1q + 16]             ; second half of the 16-sample group
+    movu              xm6, [src2q + 16]
+    MIN_MAX_SAD       xm7, xm5, xm6
+    vpmovzxwd         m6, xm6
+    vpaddd            m3, m6
+
+    add               src1q, 32                     ; 16 samples * 2 bytes
+    add               src2q, 32
+    dec               row_idxd
+    jg                .loop_width
+
+    lea               src1q, [off1q + ROWS * MAX_PB_SIZE * 2]   ; back to the saved row start, then ROWS rows down
+    lea               src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+
+    sub               block_hd, 2                   ; one processed row stands for 2 rows of block height
+    jg                .loop_height
+
+    HORIZ_ADD         xm0, xm3, m3                  ; fold the eight partial sums into the low dword of xm0
+    movd              eax, xm0                      ; result is returned in eax
+
+    RET
+
+%endif ; HAVE_AVX2_EXTERNAL
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 0e68971b2c..4b4a2aa937 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2)
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \
     c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \
 } while (0)
+/* AVX2 DMVR SAD kernel; presumably the symbol emitted by "cglobal vvc_sad" in vvc_sad.asm -- confirm the name prefixing. */
+int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -327,6 +330,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
+            SAD_INIT(); /* the same kernel is installed for every bit depth handled here */
         }
         break;
     case 10:
@@ -338,6 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
+            SAD_INIT();
         }
         break;
     case 12:
@@ -349,6 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
+            SAD_INIT();
         }
         break;
     default:

[FFmpeg-devel,v3,1/2,GSoC,2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

Checks

Commit Message

Comments

Patch