diff mbox series

[FFmpeg-devel,v5,1/2,GSoC,2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

Message ID 20240522000039.34913-2-chen.stonechen@gmail.com
State New
Headers show
Series [FFmpeg-devel,v5,1/2,GSoC,2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

Stone Chen May 22, 2024, midnight UTC
Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.

Additionally this changes parameters dx and dy from int to intptr_t. This allows dx & dy to be used as pointer offsets without needing to use movsxd.

Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
NovosobornayaSquare_1920x1080.bin | 197.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |

After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
NovosobornayaSquare_1920x1080.bin | 204.0|
RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
---
 libavcodec/vvc/dsp.c             |   2 +-
 libavcodec/vvc/dsp.h             |   2 +-
 libavcodec/x86/vvc/Makefile      |   3 +-
 libavcodec/x86/vvc/vvc_sad.asm   | 130 +++++++++++++++++++++++++++++++
 libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
 5 files changed, 140 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/vvc/vvc_sad.asm

Comments

Ronald S. Bultje May 22, 2024, 1:01 a.m. UTC | #1
Hi,

On Tue, May 21, 2024 at 8:01 PM Stone Chen <chen.stonechen@gmail.com> wrote:

> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128.
> To reduce complexity, SAD is only calculated on even rows. This is
> calculated for all video bitdepths, but the values passed to the function
> are always 16bit (even if the original video bitdepth is 8). The AVX2
> implementation uses min/max/sub.
>
> Additionally this changes parameters dx and dy from int to intptr_t. This
> allows dx & dy to be used as pointer offsets without needing to use movsxd.
>
> Benchmarks ( AMD 7940HS )
> Before:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
> Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
> NovosobornayaSquare_1920x1080.bin | 197.3 |
> RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |
>
> After:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
> Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
> NovosobornayaSquare_1920x1080.bin | 204.0|
> RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
> ---
>  libavcodec/vvc/dsp.c             |   2 +-
>  libavcodec/vvc/dsp.h             |   2 +-
>  libavcodec/x86/vvc/Makefile      |   3 +-
>  libavcodec/x86/vvc/vvc_sad.asm   | 130 +++++++++++++++++++++++++++++++
>  libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
>  5 files changed, 140 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
>

LGTM.

Ronald
Andreas Rheinhardt May 22, 2024, 5:02 a.m. UTC | #2
Stone Chen:
> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.
> 
> Additionally this changes parameters dx and dy from int to intptr_t. This allows dx & dy to be used as pointer offsets without needing to use movsxd.
> 
> Benchmarks ( AMD 7940HS )
> Before:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
> Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
> NovosobornayaSquare_1920x1080.bin | 197.3 |
> RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |
> 
> After:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
> Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
> NovosobornayaSquare_1920x1080.bin | 204.0|
> RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
> ---
>  libavcodec/vvc/dsp.c             |   2 +-
>  libavcodec/vvc/dsp.h             |   2 +-
>  libavcodec/x86/vvc/Makefile      |   3 +-
>  libavcodec/x86/vvc/vvc_sad.asm   | 130 +++++++++++++++++++++++++++++++
>  libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
>  5 files changed, 140 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
> 
> diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
> index 0e68971b2c..aa6c916760 100644
> --- a/libavcodec/x86/vvc/vvcdsp_init.c
> +++ b/libavcodec/x86/vvc/vvcdsp_init.c
> @@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2)
>      c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
>      c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
>  } while (0)
> +
> +int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, intptr_t dx, intptr_t dy, int block_w, int block_h);
> +#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2

You are adding an AVX2 function to an ARCH_X86_64 #if block. I expect
this to lead to linking failures if AVX2 is disabled.

>  #endif
>  
>  void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
> @@ -327,6 +330,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
>              ALF_INIT(8);
>              AVG_INIT(8, avx2);
>              MC_LINKS_AVX2(8);
> +            SAD_INIT();
>          }
>          break;
>      case 10:
> @@ -338,6 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
>              AVG_INIT(10, avx2);
>              MC_LINKS_AVX2(10);
>              MC_LINKS_16BPC_AVX2(10);
> +            SAD_INIT();
>          }
>          break;
>      case 12:
> @@ -349,6 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
>              AVG_INIT(12, avx2);
>              MC_LINKS_AVX2(12);
>              MC_LINKS_16BPC_AVX2(12);
> +            SAD_INIT();
>          }
>          break;
>      default:
James Almer May 22, 2024, 10:54 p.m. UTC | #3
On 5/22/2024 2:02 AM, Andreas Rheinhardt wrote:
> Stone Chen:
>> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.
>>
>> Additionally this changes parameters dx and dy from int to intptr_t. This allows dx & dy to be used as pointer offsets without needing to use movsxd.
>>
>> Benchmarks ( AMD 7940HS )
>> Before:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
>> Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
>> NovosobornayaSquare_1920x1080.bin | 197.3 |
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |
>>
>> After:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
>> Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
>> NovosobornayaSquare_1920x1080.bin | 204.0|
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
>> ---
>>   libavcodec/vvc/dsp.c             |   2 +-
>>   libavcodec/vvc/dsp.h             |   2 +-
>>   libavcodec/x86/vvc/Makefile      |   3 +-
>>   libavcodec/x86/vvc/vvc_sad.asm   | 130 +++++++++++++++++++++++++++++++
>>   libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
>>   5 files changed, 140 insertions(+), 3 deletions(-)
>>   create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
>>
>> diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
>> index 0e68971b2c..aa6c916760 100644
>> --- a/libavcodec/x86/vvc/vvcdsp_init.c
>> +++ b/libavcodec/x86/vvc/vvcdsp_init.c
>> @@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2)
>>       c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
>>       c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
>>   } while (0)
>> +
>> +int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, intptr_t dx, intptr_t dy, int block_w, int block_h);
>> +#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
> 
> You are adding an AVX2 function to an ARCH_X86_64 #if block. I expect
> this to lead to linking failures if AVX2 is disabled.

It's a prototype, so no linking failures. And SAD_INIT() is called on a 
block that both needs ARCH_X86_64 and EXTERNAL_AVX2_FAST to be true.
James Almer May 22, 2024, 11:38 p.m. UTC | #4
On 5/21/2024 10:01 PM, Ronald S. Bultje wrote:
> Hi,
> 
> On Tue, May 21, 2024 at 8:01 PM Stone Chen <chen.stonechen@gmail.com> wrote:
> 
>> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
>> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128.
>> To reduce complexity, SAD is only calculated on even rows. This is
>> calculated for all video bitdepths, but the values passed to the function
>> are always 16bit (even if the original video bitdepth is 8). The AVX2
>> implementation uses min/max/sub.
>>
>> Additionally this changes parameters dx and dy from int to intptr_t. This
>> allows dx & dy to be used as pointer offsets without needing to use movsxd.
>>
>> Benchmarks ( AMD 7940HS )
>> Before:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
>> Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
>> NovosobornayaSquare_1920x1080.bin | 197.3 |
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |
>>
>> After:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
>> Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
>> NovosobornayaSquare_1920x1080.bin | 204.0|
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
>> ---
>>   libavcodec/vvc/dsp.c             |   2 +-
>>   libavcodec/vvc/dsp.h             |   2 +-
>>   libavcodec/x86/vvc/Makefile      |   3 +-
>>   libavcodec/x86/vvc/vvc_sad.asm   | 130 +++++++++++++++++++++++++++++++
>>   libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
>>   5 files changed, 140 insertions(+), 3 deletions(-)
>>   create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
>>
> 
> LGTM.
> 
> Ronald

Implemented my changes and applied.
Nuo Mi May 23, 2024, 1:18 p.m. UTC | #5
On Thu, May 23, 2024 at 7:38 AM James Almer <jamrial@gmail.com> wrote:

> On 5/21/2024 10:01 PM, Ronald S. Bultje wrote:
> > Hi,
> >
> > On Tue, May 21, 2024 at 8:01 PM Stone Chen <chen.stonechen@gmail.com>
> wrote:
> >
> >> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
> >> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h >
> 128.
> >> To reduce complexity, SAD is only calculated on even rows. This is
> >> calculated for all video bitdepths, but the values passed to the
> function
> >> are always 16bit (even if the original video bitdepth is 8). The AVX2
> >> implementation uses min/max/sub.
> >>
> >> Additionally this changes parameters dx and dy from int to intptr_t.
> This
> >> allows dx & dy to be used as pointer offsets without needing to use
> movsxd.
> >>
> >> Benchmarks ( AMD 7940HS )
> >> Before:
> >> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
> >> Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
> >> NovosobornayaSquare_1920x1080.bin | 197.3 |
> >> RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |
> >>
> >> After:
> >> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
> >> Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
> >> NovosobornayaSquare_1920x1080.bin | 204.0|
> >> RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
> >> ---
> >>   libavcodec/vvc/dsp.c             |   2 +-
> >>   libavcodec/vvc/dsp.h             |   2 +-
> >>   libavcodec/x86/vvc/Makefile      |   3 +-
> >>   libavcodec/x86/vvc/vvc_sad.asm   | 130 +++++++++++++++++++++++++++++++
> >>   libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
> >>   5 files changed, 140 insertions(+), 3 deletions(-)
> >>   create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
> >>
> >
> > LGTM.
> >
> > Ronald
>
> Implemented my changes and applied.
>
Thank you, Ronald, Andreas, and James.

Hi Stone,
Congratulations on surviving your first crossfire!
Stone Chen May 23, 2024, 1:23 p.m. UTC | #6
On Thu, May 23, 2024 at 9:18 AM Nuo Mi <nuomi2021@gmail.com> wrote:

> On Thu, May 23, 2024 at 7:38 AM James Almer <jamrial@gmail.com> wrote:
>
> > On 5/21/2024 10:01 PM, Ronald S. Bultje wrote:
> > > Hi,
> > >
> > > On Tue, May 21, 2024 at 8:01 PM Stone Chen <chen.stonechen@gmail.com>
> > wrote:
> > >
> > >> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
> > >> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h >
> > 128.
> > >> To reduce complexity, SAD is only calculated on even rows. This is
> > >> calculated for all video bitdepths, but the values passed to the
> > function
> > >> are always 16bit (even if the original video bitdepth is 8). The AVX2
> > >> implementation uses min/max/sub.
> > >>
> > >> Additionally this changes parameters dx and dy from int to intptr_t.
> > This
> > >> allows dx & dy to be used as pointer offsets without needing to use
> > movsxd.
> > >>
> > >> Benchmarks ( AMD 7940HS )
> > >> Before:
> > >> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
> > >> Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
> > >> NovosobornayaSquare_1920x1080.bin | 197.3 |
> > >> RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |
> > >>
> > >> After:
> > >> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
> > >> Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
> > >> NovosobornayaSquare_1920x1080.bin | 204.0|
> > >> RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
> > >> ---
> > >>   libavcodec/vvc/dsp.c             |   2 +-
> > >>   libavcodec/vvc/dsp.h             |   2 +-
> > >>   libavcodec/x86/vvc/Makefile      |   3 +-
> > >>   libavcodec/x86/vvc/vvc_sad.asm   | 130
> +++++++++++++++++++++++++++++++
> > >>   libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
> > >>   5 files changed, 140 insertions(+), 3 deletions(-)
> > >>   create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
> > >>
> > >
> > > LGTM.
> > >
> > > Ronald
> >
> > Implemented my changes and applied.
> >
>

Hi all,


> Thank you, Ronald, Andreas, and James.
>
> Hi Stone,
> Congratulations on surviving your first crossfire!
>


Yes thank you Ronald, Andreas and James for the feedback! Also Nuo Mi and
Jian Hua as well.

-Stone


>
> _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..aded1a2f9f 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -46,7 +46,7 @@  static void av_always_inline pad_int16(int16_t *_dst, const ptrdiff_t dst_stride
     memcpy(_dst, _dst - dst_stride, padded_width * sizeof(int16_t));
 }
 
-static int vvc_sad(const int16_t *src0, const int16_t *src1, int dx, int dy,
+static int vvc_sad(const int16_t *src0, const int16_t *src1, intptr_t dx, intptr_t dy,
     const int block_w, const int block_h)
 {
     int sad = 0;
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 9810ac314c..213337358b 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -86,7 +86,7 @@  typedef struct VVCInterDSPContext {
 
     void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, int block_w, int block_h);
 
-    int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+    int (*sad)(const int16_t *src0, const int16_t *src1, intptr_t dx, intptr_t dy, int block_w, int block_h);
     void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
         intptr_t mx, intptr_t my, int width);
 } VVCInterDSPContext;
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@  OBJS-$(CONFIG_VVC_DECODER)             += x86/vvc/vvcdsp_init.o \
                                           x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)      += x86/vvc/vvc_alf.o      \
                                           x86/vvc/vvc_mc.o       \
-                                          x86/h26x/h2656_inter.o
+                                          x86/vvc/vvc_sad.o      \
+                                          x86/h26x/h2656_inter.o 
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 0000000000..9766446b11
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,130 @@ 
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+%define MAX_PB_SIZE 128
+%define ROWS 2    
+
+SECTION_RODATA
+
+pw_1: times 2 dw 1
+
+; DMVR SAD is only calculated on even rows to reduce complexity
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; arg1 = |arg1 - arg2| per unsigned 16-bit word; arg3 is overwritten as scratch
+    pminuw           %3, %2, %1     ; scratch = unsigned min of the two inputs; NOTE(review): unsigned compare, assumes samples are non-negative - confirm
+    pmaxuw           %1, %2, %1     ; arg1 = unsigned max of the two inputs
+    psubusw          %1, %1, %3     ; arg1 = max - min, i.e. the absolute difference of each word pair
+%endmacro
+
+%macro HORIZ_ADD 3  ; (xmm dst, xmm low half of src, ymm src): dword 0 of dst = sum of the 8 dwords of src; arg2 is clobbered
+    vextracti128     %1, %3, q0001  ;        3        2      1          0      (dst = upper 128 bits of src)
+    paddd            %1, %2         ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+    pshufd           %2, %1, q0032  ; xm1    -      -     (7 + 3)   (6 + 2)
+    paddd            %1, %1, %2     ; xm0    _      _     (5 1 7 3) (4 0 6 2)
+    pshufd           %2, %1, q0001  ; xm1    _      _     (5 1 7 3) (5 1 7 3)
+    paddd            %1, %1, %2     ;                               (01234567)  (total is in dword 0 only)
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+
+cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx ; 6 args, 9 GPRs, 5 vector regs; SAD is returned in eax
+    sub             dxq, 2       ; dx -= 2
+    sub             dyq, 2       ; dy -= 2
+
+    mov             off1q, 2
+    mov             off2q, 2
+
+    add             off1q, dyq   ; off1 = 2 + dy (row offset for src1)
+    sub             off2q, dyq   ; off2 = 2 - dy (src2 is displaced in the opposite direction)
+
+    shl             off1q, 7     ; rows -> elements: multiply by MAX_PB_SIZE (128)
+    shl             off2q, 7
+    
+    add             off1q, dxq   ; off1 = (2 + dy) * 128 + dx
+    sub             off2q, dxq   ; off2 = (2 - dy) * 128 - dx
+
+    lea             src1q, [src1q + off1q * 2 + 2 * 2] ; byte address: offset * 2 bytes per int16_t, plus 2 more elements
+    lea             src2q, [src2q + off2q * 2 + 2 * 2]
+
+    pxor               m3, m3       ; m3 = running sum, 8 dword accumulators
+    vpbroadcastd       m4, [pw_1]   ; m4 = 1 in every word, used by pmaddwd to add adjacent word pairs
+
+    cmp          block_wd, 16
+    jge    vvc_sad_16_128           ; block_w >= 16 takes the full-ymm path below
+
+    vvc_sad_8:                      ; block_w < 16: 8 samples per row, two sampled rows per iteration
+        .loop_height:
+        movu              xm0, [src1q]                              ; low lane: 8 words of the current row
+        vinserti128        m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1  ; high lane: 8 words from ROWS (2) rows further down
+        movu              xm1, [src2q]
+        vinserti128        m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
+
+        MIN_MAX_SAD        m1, m0, m2   ; m1 = per-word absolute difference, m2 is scratch
+        pmaddwd            m1, m4       ; multiply by 1 and add adjacent pairs: 16 words -> 8 dwords
+        paddd              m3, m1       ; accumulate
+
+        add         src1q, 2 * MAX_PB_SIZE * ROWS * 2 ; advance 4 rows in bytes (2 sampled rows, each ROWS apart)
+        add         src2q, 2 * MAX_PB_SIZE * ROWS * 2
+
+        sub      block_hd, 4            ; 4 rows of height consumed per iteration
+        jg   .loop_height
+
+        HORIZ_ADD     xm0, xm3, m3      ; reduce the 8 dword accumulators to one value in dword 0 of xm0
+        movd          eax, xm0          ; return value
+    RET
+
+    vvc_sad_16_128:                     ; block_w >= 16: each sampled row is walked in 16-word (32-byte) chunks
+        sar      block_wd, 4            ; block_w / 16 = number of chunks per row
+        .loop_height:
+        mov         off1q, src1q        ; off1/off2 are reused here to remember the start of the current row
+        mov         off2q, src2q
+        mov      row_idxd, block_wd     ; chunk counter for this row
+
+        .loop_width:
+            movu               m0, [src1q]  ; 16 words from each source
+            movu               m1, [src2q]
+            MIN_MAX_SAD        m1, m0, m2   ; m1 = per-word absolute difference, m2 is scratch
+            pmaddwd            m1, m4       ; add adjacent word pairs into dwords
+            paddd              m3, m1       ; accumulate
+
+            add             src1q, 32       ; next 16-word chunk
+            add             src2q, 32
+            dec          row_idxd
+            jg        .loop_width
+
+        lea         src1q, [off1q + ROWS * MAX_PB_SIZE * 2] ; saved row start + ROWS (2) rows in bytes: skip the row in between
+        lea         src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+
+        sub      block_hd, 2            ; 2 rows of height consumed per iteration
+        jg   .loop_height
+
+        HORIZ_ADD     xm0, xm3, m3      ; reduce the 8 dword accumulators to one value in dword 0 of xm0
+        movd          eax, xm0          ; return value
+    RET
+
+%endif
+%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 0e68971b2c..aa6c916760 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,9 @@  ALF_FUNCS(16, 12, avx2)
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
     c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
 } while (0)
+
+int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, intptr_t dx, intptr_t dy, int block_w, int block_h);
+#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -327,6 +330,7 @@  void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
+            SAD_INIT();
         }
         break;
     case 10:
@@ -338,6 +342,7 @@  void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
+            SAD_INIT();
         }
         break;
     case 12:
@@ -349,6 +354,7 @@  void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
+            SAD_INIT();
         }
         break;
     default: