@@ -4,4 +4,5 @@ clean::
OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \
x86/h26x/h2656dsp.o
X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_mc.o \
- x86/h26x/h2656_inter.o
+ x86/h26x/h2656_inter.o \
+ x86/vvc/vvc_sad.o
new file mode 100644
@@ -0,0 +1,193 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128
+%define ROWS 2 ; DMVR SAD is only calculated on even rows to reduce complexity
+
+SECTION .text
+
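+; Per-lane absolute difference of unsigned words: the 16-bit intermediates are
+; non-negative, so |a - b| = max(a, b) - min(a, b), and the saturating
+; subtract never clips.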
+%macro MIN_MAX_SAD 3 ; tmp (min), a, b -> |a - b| left in %3
+ vpminuw %1, %2, %3
+ vpmaxuw %3, %2, %3
+ vpsubusw %3, %3, %1
+%endmacro
+
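+; Sum the eight dword lanes of a ymm accumulator into the low dword of %1.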
+%macro HORIZ_ADD 3 ; dst (xmm), low half of src (xmm), src (ymm)
+    vextracti128 %1, %3, 1       ; %1 = dwords 7 6 5 4 of %3
+    vpaddd %1, %2                ; %1 = (7+3) (6+2) (5+1) (4+0)
+    vpshufd %2, %1, q0032        ; %2 = -- -- (7+3) (6+2)
+    vpaddd %1, %1, %2            ; %1 = -- -- (7+3+5+1) (6+2+4+0)
+    vpshufd %2, %1, q0001        ; low dword of %2 = (7+3+5+1)
+    vpaddd %1, %1, %2            ; low dword of %1 = sum of all eight lanes
+%endmacro
+
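+; Point src1/src2 at the two candidate positions inside the DMVR search area:
+; after biasing dx/dy down by 2, src1 advances by (2 + dy) * MAX_PB_SIZE + 2 + dx
+; words and src2 by the mirrored (2 - dy) * MAX_PB_SIZE + 2 - dx words, so the
+; candidates move in opposite directions around the centre.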
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
+ sub %3, 2
+ sub %4, 2
+
+ mov %5, 2
+ mov %6, 2
+
+ add %5, %4
+ sub %6, %4
+
+ imul %5, 128
+ imul %6, 128
+
+ add %5, 2
+ add %6, 2
+
+ add %5, %3
+ sub %6, %3
+
+ lea %1, [%1 + %5 * 2]
+ lea %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+
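+; SAD for 8-sample-wide blocks: each iteration covers four sampled rows
+; (rows 0, 2, 4 and 6 of an eight-row strip of block_h) and accumulates the
+; word absolute differences as dwords in m3; the total is returned in eax.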
+cglobal vvc_sad_8, 6, 8, 14, src1, src2, dx, dy, block_w, block_h, off1, off2
+
+ INIT_OFFSET src1q, src2q, dxq, dyq, off1q, off2q
+ pxor m3, m3
+
+ .loop_height:
+ movu xm0, [src1q]
+ movu xm1, [src2q]
+ MIN_MAX_SAD xm2, xm0, xm1
+ vpmovzxwd m1, xm1
+ vpaddd m3, m1
+
+ movu xm5, [src1q + MAX_PB_SIZE * ROWS * 2]
+ movu xm6, [src2q + MAX_PB_SIZE * ROWS * 2]
+ MIN_MAX_SAD xm7, xm5, xm6
+ vpmovzxwd m6, xm6
+ vpaddd m3, m6
+
+ movu xm8, [src1q + MAX_PB_SIZE * 2 * ROWS * 2]
+ movu xm9, [src2q + MAX_PB_SIZE * 2 * ROWS * 2]
+ MIN_MAX_SAD xm10, xm8, xm9
+ vpmovzxwd m9, xm9
+ vpaddd m3, m9
+
+ movu xm11, [src1q + MAX_PB_SIZE * 3 * ROWS * 2]
+ movu xm12, [src2q + MAX_PB_SIZE * 3 * ROWS * 2]
+ MIN_MAX_SAD xm13, xm11, xm12
+ vpmovzxwd m12, xm12
+ vpaddd m3, m12
+
+ add src1q, MAX_PB_SIZE * 4 * ROWS * 2
+ add src2q, MAX_PB_SIZE * 4 * ROWS * 2
+
+ sub block_hd, 8
+ jg .loop_height
+
+ HORIZ_ADD xm0, xm3, m3
+ movd eax, xm0
+ RET
+
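+; SAD for 16-sample-wide blocks: one sampled row (two rows of block_h) per
+; iteration, loaded as two 8-word halves and accumulated as dwords in m8.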
+cglobal vvc_sad_16, 6, 8, 13, src1, src2, dx, dy, block_w, block_h, off1, off2
+ INIT_OFFSET src1q, src2q, dxq, dyq, off1q, off2q
+ pxor m8, m8
+.load_pixels:
+ movu xm0, [src1q]
+ movu xm1, [src2q]
+ MIN_MAX_SAD xm2, xm0, xm1
+ vpmovzxwd m1, xm1
+ vpaddd m8, m1
+
+ movu xm5, [src1q + 16]
+ movu xm6, [src2q + 16]
+ MIN_MAX_SAD xm7, xm5, xm6
+ vpmovzxwd m6, xm6
+ vpaddd m8, m6
+
+ add src1q, ROWS * MAX_PB_SIZE * 2
+ add src2q, ROWS * MAX_PB_SIZE * 2
+
+ sub block_hd, 2
+ jg .load_pixels
+
+ HORIZ_ADD xm0, xm8, m8
+ movd eax, xm0
+
+ RET
+
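+; SAD for widths 32 to 128: the inner loop covers one sampled row in 32-word
+; chunks (block_w >> 5 iterations); the outer loop rewinds via off1/off2 and
+; steps down two rows at a time.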
+cglobal vvc_sad_32_128, 6, 9, 14, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+ INIT_OFFSET src1q, src2q, dxq, dyq, off1q, off2q
+ pxor m3, m3
+
+.loop_height:
+ mov off1q, src1q
+ mov off2q, src2q
+ mov row_idxd, block_wd
+ sar row_idxd, 5
+
+ .loop_width:
+ movu xm0, [src1q]
+ movu xm1, [src2q]
+ MIN_MAX_SAD xm2, xm0, xm1
+ vpmovzxwd m1, xm1
+ vpaddd m3, m1
+
+ movu xm5, [src1q + 16]
+ movu xm6, [src2q + 16]
+ MIN_MAX_SAD xm7, xm5, xm6
+ vpmovzxwd m6, xm6
+ vpaddd m3, m6
+
+ movu xm8, [src1q + 32]
+ movu xm9, [src2q + 32]
+ MIN_MAX_SAD xm10, xm8, xm9
+ vpmovzxwd m9, xm9
+ vpaddd m3, m9
+
+ movu xm11, [src1q + 48]
+ movu xm12, [src2q + 48]
+ MIN_MAX_SAD xm13, xm11, xm12
+ vpmovzxwd m12, xm12
+ vpaddd m3, m12
+
+ add src1q, 64
+ add src2q, 64
+ dec row_idxd
+ jg .loop_width
+
+ lea src1q, [off1q + ROWS * MAX_PB_SIZE * 2]
+ lea src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+
+ sub block_hq, 2
+ jg .loop_height
+
+ HORIZ_ADD xm0, xm3, m3
+ movd eax, xm0
+
+ RET
+
+%endif
+%endif
@@ -252,6 +252,18 @@ AVG_FUNCS(16, 12, avx2)
c->inter.avg = bf(ff_vvc_avg, bd, opt); \
c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
} while (0)
+
+int ff_vvc_sad_8_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+int ff_vvc_sad_16_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+int ff_vvc_sad_32_128_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+
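+/* sad[i] is filled by block width: 1 -> 8, 2 -> 16 and 3..5 -> 32..128 (one
+ * shared kernel); the index is presumably log2(block_w) - 2. */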
+#define SAD_INIT() do { \
+ c->inter.sad[1] = ff_vvc_sad_8_avx2; \
+ c->inter.sad[2] = ff_vvc_sad_16_avx2; \
+ c->inter.sad[3] = ff_vvc_sad_32_128_avx2; \
+ c->inter.sad[4] = ff_vvc_sad_32_128_avx2; \
+ c->inter.sad[5] = ff_vvc_sad_32_128_avx2; \
+} while (0)
#endif
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -265,6 +277,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
MC_LINKS_AVX2(8);
+ SAD_INIT();
}
} else if (bd == 10) {
if (EXTERNAL_SSE4(cpu_flags)) {
@@ -273,6 +286,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
+ SAD_INIT();
}
} else if (bd == 12) {
if (EXTERNAL_SSE4(cpu_flags)) {
@@ -281,6 +295,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
+ SAD_INIT();
}
}