@@ -2,5 +2,6 @@ clean::
$(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
-X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \
+X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_mc.o \
+ x86/h26x/h2656dsp.o \
x86/h26x/h2656_inter.o
new file mode 100644
@@ -0,0 +1,301 @@
+; /*
+; * Provide SIMD MC functions for VVC decoding
+; *
+; * Copyright © 2021, VideoLAN and dav1d authors
+; * Copyright © 2021, Two Orioles, LLC
+; * All rights reserved.
+; *
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128
+
+SECTION_RODATA 32
+
+pw_0 times 2 dw 0 ; one zero dword; read by cmovl in vvc_w_avg to clamp a negative term to 0
+pw_1 times 2 dw 1 ; not referenced in this file
+pw_4 times 2 dw 4 ; upper bound for (bit depth - 8) in vvc_avg
+pw_12 times 2 dw 12 ; not referenced in this file
+pw_256 times 2 dw 256 ; base pmulhrsw multiplier in vvc_avg, shifted left by the clamped (bit depth - 8)
+
+%macro AVG_JMP_TABLE 3-* ; fn, bpc, opt, widths... -- emit 32-bit offsets to the per-width .w<N> labels
+ %xdefine %1_%2_%3_table (%%table - 2*%4)
+ %xdefine %%base %1_%2_%3_table
+ %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
+ %%table: ; the public base is biased by 2*%4 bytes: for a first width of 2, index tzcnt(2) == 1 hits entry 0
+ %rep %0 - 3 ; one entry per width argument
+ dd %%prefix %+ .w%4 - %%base ; label offset relative to the biased base
+ %rotate 1
+ %endrep
+%endmacro
+
+AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128 ; defines avg_8_avx2_table
+AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128 ; defines avg_16_avx2_table
+AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128 ; defines w_avg_8_avx2_table
+AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128 ; defines w_avg_16_avx2_table
+
+SECTION .text
+
+%macro AVG_W16_FN 3 ; bpc, op, count -- apply op to two rows of count 16-sample groups
+ %assign %%i 0
+ %rep %3 ; one pass per 16-sample group
+ %define off %%i
+ AVG_LOAD_W16 0, off ; first row of the pair
+ %2
+ AVG_SAVE_W16 %1, 0, off
+
+
+ AVG_LOAD_W16 1, off ; second row of the pair
+ %2
+ AVG_SAVE_W16 %1, 1, off
+
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro AVG_FN 2 ; bpc, op -- width dispatch followed by one two-rows-per-iteration loop per width
+ jmp wq ; wq = address of the .w<N> label, computed by the caller from the jump table
+
+.w2: ; each row is one dword (2 samples); both rows share xm0 / xm1
+ movd xm0, [src0q]
+ pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
+ movd xm1, [src1q]
+ pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W2 %1
+ AVG_LOOP_END .w2
+
+.w4: ; each row is one qword (4 samples); both rows share xm0 / xm1
+ movq xm0, [src0q]
+ pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
+ movq xm1, [src1q]
+ pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W4 %1
+
+ AVG_LOOP_END .w4
+
+.w8: ; each row is 128 bits (8 samples); row 0 in the low lane, row 1 in the high lane
+ vinserti128 m0, m0, [src0q], 0
+ vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
+ vinserti128 m1, m1, [src1q], 0
+ vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W8 %1
+
+ AVG_LOOP_END .w8
+
+.w16: ; one 16-sample group per row
+ AVG_W16_FN %1, %2, 1
+
+ AVG_LOOP_END .w16
+
+.w32: ; two 16-sample groups per row
+ AVG_W16_FN %1, %2, 2
+
+ AVG_LOOP_END .w32
+
+.w64: ; four 16-sample groups per row
+ AVG_W16_FN %1, %2, 4
+
+ AVG_LOOP_END .w64
+
+.w128: ; eight 16-sample groups per row
+ AVG_W16_FN %1, %2, 8
+
+ AVG_LOOP_END .w128
+
+.ret: ; common exit reached from every AVG_LOOP_END
+ RET
+%endmacro
+
+%macro AVG 0 ; in: m0/m1 sources, m2 multiplier, m3/m4 pixel min/max (set in VVC_AVG_AVX2); out: m0
+ paddsw m0, m1 ; saturating 16-bit sum of the two sources
+ pmulhrsw m0, m2 ; rounded multiply-high by m2, i.e. a rounding right shift for a power-of-two m2
+ CLIPW m0, m3, m4 ; CLIPW comes from x86util.asm; presumably clamps words to [m3, m4] -- confirm there
+%endmacro
+
+%macro W_AVG 0 ; in: m0/m1 sources, xm2 shift, m3 (w0, w1) pairs, m4 offset, m6/m7 pixel min/max; out: m0; clobbers m5
+ punpckhwd m5, m0, m1 ; interleave the high words into (src0, src1) pairs
+ pmaddwd m5, m3 ; src0 * w0 + src1 * w1 as 32-bit sums
+ paddd m5, m4 ; + offset
+ psrad m5, xm2 ; >> shift
+
+ punpcklwd m0, m0, m1 ; same steps for the low words
+ pmaddwd m0, m3
+ paddd m0, m4
+ psrad m0, xm2
+
+ packssdw m0, m5 ; back to 16-bit with signed saturation, restoring the original word order
+ CLIPW m0, m6, m7 ; CLIPW comes from x86util.asm; presumably clamps words to [m6, m7] -- confirm there
+%endmacro
+
+%macro AVG_LOAD_W16 2 ; line, offset -- load one full register from each source into m0 / m1
+ movu m0, [src0q + %1 * AVG_SRC_STRIDE + %2 * 32] ; offset counts 32-byte groups within the row
+ movu m1, [src1q + %1 * AVG_SRC_STRIDE + %2 * 32]
+%endmacro
+
+%macro AVG_SAVE_W2 1 ; bpc -- store two rows of 2 pixels from xm0
+ %if %1 == 16
+ pextrd [dstq], xm0, 0 ; row 0: two 16-bit pixels
+ pextrd [dstq + strideq], xm0, 1 ; row 1
+ %else
+ packuswb m0, m0 ; 16-bit -> 8-bit with unsigned saturation
+ pextrw [dstq], xm0, 0 ; row 0: two 8-bit pixels
+ pextrw [dstq + strideq], xm0, 1 ; row 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W4 1 ; bpc -- store two rows of 4 pixels from xm0
+ %if %1 == 16
+ pextrq [dstq], xm0, 0 ; row 0: four 16-bit pixels
+ pextrq [dstq + strideq], xm0, 1 ; row 1
+ %else
+ packuswb m0, m0 ; 16-bit -> 8-bit with unsigned saturation
+ pextrd [dstq], xm0, 0 ; row 0: four 8-bit pixels
+ pextrd [dstq + strideq], xm0, 1 ; row 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W8 1 ; bpc -- store two rows of 8 pixels; row 0 is in the low lane of m0, row 1 in the high lane
+ %if %1 == 16
+ vextracti128 [dstq], m0, 0 ; row 0: eight 16-bit pixels
+ vextracti128 [dstq + strideq], m0, 1 ; row 1
+ %else
+ packuswb m0, m0 ; per-lane pack: each lane now holds its 8 bytes twice
+ vpermq m0, m0, 1000b ; bring qword 0 (row 0) and qword 2 (row 1) into the low 128 bits
+ pextrq [dstq], xm0, 0 ; row 0: eight 8-bit pixels
+ pextrq [dstq + strideq], xm0, 1 ; row 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W16 3 ; bpc, line, offset -- store one 16-pixel group of row `line` from m0
+ %if %1 == 16
+ movu [dstq + %2 * strideq + %3 * 32], m0 ; 16 pixels of 2 bytes: 32 bytes per group
+ %else
+ packuswb m0, m0 ; per-lane pack: each lane now holds its 8 bytes twice
+ vpermq m0, m0, 1000b ; bring qword 0 and qword 2 into the low 128 bits, in pixel order
+ vextracti128 [dstq + %2 * strideq + %3 * 16], m0, 0 ; 16 pixels of 1 byte: 16 bytes per group
+ %endif
+%endmacro
+
+%macro AVG_LOOP_END 1 ; loop label -- finish one two-row iteration
+ sub hd, 2 ; two rows are consumed per iteration
+ je .ret ; leave only when the remaining height is exactly 0
+
+ lea src0q, [src0q + 2 * AVG_SRC_STRIDE] ; advance both sources by two rows
+ lea src1q, [src1q + 2 * AVG_SRC_STRIDE]
+ lea dstq, [dstq + 2 * strideq] ; advance the destination by two rows
+ jmp %1
+%endmacro
+
+%define AVG_SRC_STRIDE MAX_PB_SIZE*2 ; byte pitch between source rows: MAX_PB_SIZE 2-byte (int16_t) samples
+
+;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
+%macro VVC_AVG_AVX2 1 ; bpc (instantiated with 8 and 16) -- emit the unweighted average function
+cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
+ movifnidn hd, hm
+
+ pxor m3, m3 ; pixel min
+ vpbroadcastw m4, bdm ; pixel max
+
+ movifnidn bdd, bdm
+ inc bdd ; pixel_max + 1; the C wrappers pass (1 << bit depth) - 1
+ tzcnt bdd, bdd ; bit depth
+
+ sub bdd, 8
+ movd xm0, bdd
+ vpbroadcastd m1, [pw_4]
+ pminuw m0, m1 ; min(bit depth - 8, 4)
+ vpbroadcastd m2, [pw_256]
+ psllw m2, xm0 ; pmulhrsw multiplier for AVG: 256 << min(bit depth - 8, 4)
+
+ lea r6, [avg_%1 %+ SUFFIX %+ _table] ; r6 (bd) is no longer needed; reuse it as the jump table base
+ tzcnt wd, wm ; table index = trailing zero count of the width
+ movsxd wq, dword [r6+wq*4] ; signed 32-bit offset of the matching .w<N> label
+ add wq, r6 ; absolute address consumed by the jmp at the top of AVG_FN
+ AVG_FN %1, AVG
+%endmacro
+
+;void ff_vvc_w_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
+; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
+%macro VVC_W_AVG_AVX2 1 ; bpc (instantiated with 8 and 16) -- emit the weighted average function
+cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
+ ; t1 is r7, so 8 GPRs are declared: x86inc then saves r7 where it is callee-saved (rdi on WIN64)
+ movifnidn hd, hm
+
+ movifnidn t0d, r8m ; w1
+ shl t0d, 16
+ mov t0w, r7m ; w0
+ movd xm3, t0d
+ vpbroadcastd m3, xm3 ; (w0, w1) word pair in every dword, multiplied by pmaddwd in W_AVG
+
+ pxor m6, m6 ; pixel min
+ vpbroadcastw m7, r11m ; pixel max
+
+ mov t1q, rcx ; cl is needed for the variable shifts below; rcx carries an argument in both x86-64 ABIs
+ mov ecx, r11m
+ inc ecx ; pixel_max + 1; the C wrappers pass (1 << bd) - 1
+ tzcnt ecx, ecx ; bd
+ sub ecx, 8 ; bd - 8
+ mov t0d, r9m ; o0
+ add t0d, r10m ; o1
+ shl t0d, cl
+ inc t0d ; ((o0 + o1) << (bd - 8)) + 1
+
+ neg ecx
+ add ecx, 4 ; 12 - bd
+ cmovl ecx, [pw_0] ; max(12 - bd, 0)
+ add ecx, 3
+ add ecx, r6m ; + denom
+ movd xm2, ecx ; shift = denom + 3 + max(12 - bd, 0)
+
+ dec ecx
+ shl t0d, cl
+ movd xm4, t0d
+ vpbroadcastd m4, xm4 ; offset = (((o0 + o1) << (bd - 8)) + 1) << (shift - 1)
+ mov rcx, t1q ; restore rcx
+
+ lea r6, [w_avg_%1 %+ SUFFIX %+ _table] ; t0 (r6) is free again; reuse it as the jump table base
+ tzcnt wd, wm ; table index = trailing zero count of the width
+ movsxd wq, dword [r6+wq*4] ; signed 32-bit offset of the matching .w<N> label
+ add wq, r6 ; absolute address consumed by the jmp at the top of AVG_FN
+ AVG_FN %1, W_AVG
+%endmacro
+
+%if ARCH_X86_64
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+VVC_AVG_AVX2 16 ; ff_vvc_avg_16bpc_avx2
+
+VVC_AVG_AVX2 8 ; ff_vvc_avg_8bpc_avx2
+
+VVC_W_AVG_AVX2 16 ; ff_vvc_w_avg_16bpc_avx2
+
+VVC_W_AVG_AVX2 8 ; ff_vvc_w_avg_8bpc_avx2
+%endif ; HAVE_AVX2_EXTERNAL
+
+%endif ; ARCH_X86_64
@@ -169,6 +169,42 @@ FW_PUT_16BPC_AVX2(12);
MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \
MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
+#define bf(fn, bd, opt) fn##_##bd##_##opt /* name of a per-bit-depth C wrapper, e.g. avg_10_avx2 */
+#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt /* name of a per-sample-size symbol, e.g. ff_vvc_avg_16bpc_avx2 */
+
+#define AVG_BPC_FUNC(bpc, opt) /* prototypes of the avg / w_avg entry points for one sample size */ \
+void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, \
+ const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
+void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, \
+ const int16_t *src1, intptr_t width, intptr_t height, intptr_t denom, \
+ intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
+
+#define AVG_FUNCS(bpc, bd, opt) /* static wrappers that append pixel_max for bit depth bd */ \
+static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, \
+ const int16_t *src1, int width, int height) \
+{ \
+ const intptr_t pixel_max = (1 << bd) - 1; \
+ BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, pixel_max); \
+} \
+static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, \
+ const int16_t *src1, int width, int height, int denom, int w0, int w1, int o0, int o1) \
+{ \
+ const intptr_t pixel_max = (1 << bd) - 1; \
+ BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, denom, w0, w1, o0, o1, pixel_max); \
+}
+
+AVG_BPC_FUNC(8, avx2) /* declares ff_vvc_avg_8bpc_avx2 and ff_vvc_w_avg_8bpc_avx2 */
+AVG_BPC_FUNC(16, avx2) /* declares ff_vvc_avg_16bpc_avx2 and ff_vvc_w_avg_16bpc_avx2 */
+
+AVG_FUNCS(8, 8, avx2) /* 8-bit wrappers call the 8bpc functions */
+AVG_FUNCS(16, 10, avx2) /* 10-bit and 12-bit wrappers both call the 16bpc functions */
+AVG_FUNCS(16, 12, avx2)
+
+#define AVG_INIT(bd, opt) do { /* point both averaging hooks of c at the wrappers for bit depth bd */ \
+ c->inter.w_avg = bf(w_avg, bd, opt); \
+ c->inter.avg = bf(avg, bd, opt); \
+} while (0)
+
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
{
const int cpu_flags = av_get_cpu_flags();
@@ -198,5 +234,21 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
MC_LINKS_16BPC_AVX2(12);
}
}
+
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ switch (bd) {
+ case 8:
+ AVG_INIT(8, avx2);
+ break;
+ case 10:
+ AVG_INIT(10, avx2);
+ break;
+ case 12:
+ AVG_INIT(12, avx2);
+ break;
+ default:
+ break;
+ }
+ }
}
}