@@ -87,6 +87,26 @@ QPEL_TABLE 12, 4, w, sse4
QPEL_TABLE 8,16, b, avx2
QPEL_TABLE 10, 8, w, avx2
+; 8-bit qpel filter tables for the AVX512ICL paths below:
+; _h variant stores byte coefficients (consumed by vpdpbusd),
+; _v variant stores dword coefficients (consumed by vpmulld).
+QPEL_TABLE 8, 1, b, avx512icl_h
+QPEL_TABLE 8, 1, d, avx512icl_v
+
+; vpermb index table used by QPEL_H_LOAD_COMPUTE: each 4-byte group selects
+; the 4 consecutive source bytes starting one pixel further along, producing
+; per-dword sliding windows for vpdpbusd. Bytes 0-31 (rows starting at 0..7)
+; pair with filter taps 0-3; bytes 32-63 (rows starting at 4..11) pair with
+; taps 4-7.
+pb_qpel_shuffle_index: db 0, 1, 2, 3
+ db 1, 2, 3, 4
+ db 2, 3, 4, 5
+ db 3, 4, 5, 6
+ db 4, 5, 6, 7
+ db 5, 6, 7, 8
+ db 6, 7, 8, 9
+ db 7, 8, 9, 10
+ db 4, 5, 6, 7
+ db 5, 6, 7, 8
+ db 6, 7, 8, 9
+ db 7, 8, 9, 10
+ db 8, 9, 10, 11
+ db 9, 10, 11, 12
+ db 10, 11, 12, 13
+ db 11, 12, 13, 14
+
SECTION .text
%define MAX_PB_SIZE 64
@@ -1670,3 +1690,120 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
%endif ;AVX2
%endif ; ARCH_X86_64
+
+; QPEL_FILTER_H width, mx-gpr, lo-mreg, hi-mreg, tmp-gpr
+; Broadcast the two dword halves (taps 0-3 and taps 4-7) of the 8-byte
+; horizontal filter selected by mx (1-based) into m%3 and m%4.
+; Clobbers %2q (rewritten to the byte offset (mx-1)*8) and, when PIC is
+; defined, uses %5q to materialize the table address.
+%macro QPEL_FILTER_H 5
+%define %%table hevc_qpel_filters_avx512icl_h_%1
+%assign %%offset 4
+ dec %2q
+ shl %2q, 3 ; (mx-1)*8: one 8-byte coefficient row per filter index
+%ifdef PIC
+ lea %5q, [%%table]
+ %define FILTER %5q
+%else
+ %define FILTER %%table
+%endif
+ vpbroadcastd m%3, [FILTER + %2q + 0*%%offset] ; taps 0-3 in every dword lane
+ vpbroadcastd m%4, [FILTER + %2q + 1*%%offset] ; taps 4-7 in every dword lane
+%endmacro
+
+; QPEL_FILTER_V width, my-gpr, dst-mreg, tap-index, table-base
+; Broadcast vertical filter tap %4 (a dword) of the my-selected filter row
+; into m%3. %2q must already hold the row byte offset ((my-1)*32, set up by
+; the caller). %1 is accepted for call-site symmetry but unused here.
+%macro QPEL_FILTER_V 5
+ vpbroadcastd m%3, [%5 + %2q + 4*%4]
+%endmacro
+
+; QPEL_LOAD_SHUF lo-mreg, hi-mreg
+; Load the two 32-byte vpermb index vectors from pb_qpel_shuffle_index:
+; m%1 = windows for taps 0-3, m%2 = windows for taps 4-7.
+%macro QPEL_LOAD_SHUF 2
+ movu m%1, [pb_qpel_shuffle_index + 0]
+ movu m%2, [pb_qpel_shuffle_index + 32]
+%endmacro
+
+; Horizontal 8-tap filter of 8 pixels into 8 dword accumulators.
+; required: m0-m5 (m0/m1 = broadcast taps 0-3 / 4-7 from QPEL_FILTER_H,
+;                  m2/m3 = shuffle indices from QPEL_LOAD_SHUF,
+;                  m4/m5 = scratch, clobbered)
+; %1: dst register index (receives the dword sums, accumulator zeroed here)
+; %2: name for src (reads 16 bytes at %2q - 3, i.e. 3 pixels of left context)
+%macro QPEL_H_LOAD_COMPUTE 2
+ pxor m%1, m%1 ; clear accumulator for vpdpbusd
+ movu xm4, [%2q - 3]
+ vpermb m5, m2, m4 ; sliding windows for taps 0-3
+ vpermb m4, m3, m4 ; sliding windows for taps 4-7
+ vpdpbusd m%1, m5, m0 ; dword += u8 pixels * s8 taps (taps 0-3)
+ vpdpbusd m%1, m4, m1 ; dword += u8 pixels * s8 taps (taps 4-7)
+%endmacro
+
+; HEVC_PUT_HEVC_QPEL_AVX512ICL width, bitdepth
+; Emits hevc_put_hevc_qpel_h<width>_<bitdepth>: horizontal-only qpel.
+; Per row: 8-tap horizontal filter (QPEL_H_LOAD_COMPUTE), then the dword
+; results are packed to int16 and stored; LOOP_END advances dst/src and
+; decrements height.
+%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
+ QPEL_FILTER_H %1, mx, 0, 1, tmp ; m0/m1 = horizontal taps
+ QPEL_LOAD_SHUF 2, 3 ; m2/m3 = vpermb indices
+.loop:
+ QPEL_H_LOAD_COMPUTE 6, src
+ vpmovdw [dstq], m6 ; truncate dwords to int16 output
+ LOOP_END dst, src, srcstride
+ RET
+%endmacro
+
+; HEVC_PUT_HEVC_QPEL_HV_AVX512ICL width, bitdepth
+; Emits hevc_put_hevc_qpel_hv<width>_<bitdepth>: horizontal then vertical
+; 8-tap qpel. Register roles:
+;   m0/m1   horizontal taps, m2/m3 shuffle indices, m4/m5 scratch
+;   m6-m13  the eight broadcast vertical taps (dwords)
+;   m14-m21 sliding window of the last 8 horizontally-filtered rows
+;   m22-m26 vertical-pass scratch/accumulators
+%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
+%assign %%shift 6
+%assign %%extra 7
+ QPEL_FILTER_H %1, mx, 0, 1, tmp
+ QPEL_LOAD_SHUF 2, 3
+ lea tmpq, [srcstrideq*3]
+ sub srcq, tmpq ; rewind 3 rows of top context for the 8-tap vertical filter
+ sub myq, 1
+ shl myq, 5 ; (my-1)*32: one row of 8 dword coefficients per filter index
+%ifdef PIC
+%define %%table hevc_qpel_filters_avx512icl_v_%1
+ lea tmpq, [%%table]
+ %define FILTER tmpq
+%else
+ %define FILTER %%table
+%endif
+%assign %%i 6
+%assign %%j 0
+%rep %1
+ QPEL_FILTER_V %1, my, %%i, %%j, FILTER ; m6..m13 = vertical taps 0..7
+ %assign %%i %%i+1
+ %assign %%j %%j+1
+%endrep
+%rep %%extra
+ QPEL_H_LOAD_COMPUTE %%i, src ; prime m14..m20 with the first 7 rows
+ add srcq, srcstrideq
+%assign %%i %%i+1
+%endrep
+.loop:
+ QPEL_H_LOAD_COMPUTE %%i, src ; m21 = newest row, completing the 8-row window
+ vpmulld m22, m14, m6
+ vpmulld m23, m15, m7
+ vpmulld m24, m16, m8
+ vpmulld m25, m17, m9
+ vpaddd m26, m22, m23
+ vpaddd m24, m25
+ vpaddd m26, m24 ; m26 = rows 0-3 * taps 0-3
+ vpmulld m22, m18, m10
+ vpmulld m23, m19, m11
+ vpmulld m24, m20, m12
+ vpmulld m25, m21, m13
+ vpaddd m22, m22, m23
+ vpaddd m24, m25
+ vpaddd m26, m24
+ vpaddd m22, m26 ; m22 = full 8-tap vertical sum
+ mova m14, m15 ; slide the row window down by one
+ mova m15, m16
+ mova m16, m17
+ mova m17, m18
+ mova m18, m19
+ mova m19, m20
+ mova m20, m21
+ vpsrad m22, %%shift ; >> 6 intermediate rounding for 8-bit hv
+ vpmovdw [dstq], m22
+ LOOP_END dst, src, srcstride
+
+ RET
+%endmacro
+
+; Instantiate the 8-bit, 8-pixel-wide h and hv functions. YMM-width
+; vectors suffice for 8 dword lanes; the icl suffix gates the VBMI/VNNI
+; instructions (vpermb, vpdpbusd) used above.
+%if ARCH_X86_64
+%if HAVE_AVX512ICL_EXTERNAL
+
+INIT_YMM avx512icl
+HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8
+HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8
+
+%endif
+%endif
@@ -233,6 +233,9 @@ WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
WEIGHTING_PROTOTYPES(12, sse4);
+// AVX512ICL 8-bit qpel, 8-pixel width: horizontal-only and horizontal+vertical.
+void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+
///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////
@@ -878,6 +878,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
}
+ // NOTE(review): presumably put_hevc_qpel[width_idx][!!my][!!mx], with
+ // index 3 being the 8-pixel width bucket — matches the h8/hv8 names;
+ // confirm against the table definition in hevcdsp.
+ if (EXTERNAL_AVX512ICL(cpu_flags) && ARCH_X86_64) {
+ c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
+ c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
+ }
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;