@@ -858,7 +858,7 @@ h264_add8x4_idct_sse2:
%else
add r0, r0m
%endif
- call h264_add8x4_idct_sse2
+ call h264_add8x4_idct_ %+ cpuname
%%skip:
%if %1 < 7
add r2, 64
@@ -1142,6 +1142,29 @@ IDCT_DC_DEQUANT 7
INIT_XMM avx
+ALIGN 16
+; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
+h264_add8x4_idct_avx:
+ movu m0, [r2 + 0] ; four 16-byte loads cover the 64 bytes at r2; note the offset order 0, 32, 16, 48
+ movu m1, [r2 + 32]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 48]
+ SBUTTERFLY qdq, 0, 1, 4 ; combine m0/m1 (bytes 0-15 and 32-47) at qword granularity; macro is defined outside this chunk, m4 presumably scratch
+ SBUTTERFLY qdq, 2, 3, 5 ; same for m2/m3 (bytes 16-31 and 48-63), m5 presumably scratch
+ IDCT4_1D w,0,1,2,3,4,5 ; first 1-D pass over m0-m3 (m4/m5 passed as extra registers)
+ TRANSPOSE2x4x4W 0,1,2,3,4 ; going by the name, transposes two 4x4 word blocks so the second pass runs along the other axis - confirm in macro
+ paddw m0, [pw_32] ; bias added before the second pass; presumably rounding for the shift by 6 applied in STORE_DIFFx2 below
+ IDCT4_1D w,0,1,2,3,4,5 ; second 1-D pass
+ pxor m7, m7 ; m7 = 0: used to clear the block and passed as the zero register to STORE_DIFFx2
+ mova [r2+ 0], m7 ; overwrite all 64 bytes of the coefficient block at r2 with zeros
+ mova [r2+16], m7
+ mova [r2+32], m7
+ mova [r2+48], m7
+ STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 ; args: add1, add2, reg1, reg2, zero, shift, source, stride -> rows at r0 and r0+r3
+ lea r0, [r0+r3*2] ; advance dst by two rows (this is why r0 is documented as clobbered)
+ STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 ; remaining two rows, taken from m2/m3
+ret ; bare label entered via "call" from add16_sse2_cycle, so a plain ret is used
+
; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
movd %3, [%7]
@@ -1199,3 +1222,18 @@ cglobal h264_idct8_dc_add_8, 3, 4, 0
lea dst_q, [dst_q + stride_q*4]
DC_ADD_MMXEXT_OP movq, dst_q, stride_q, r3
RET
+
+cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, stride_, nnzc_ ; 5 args, one extra GPR on x86-64, 8 vector regs; C side binds this as ff_h264_idct_add16_8_avx
+ movsxdifnidn stride_q, stride_d ; widen the int stride argument to pointer width where the two differ
+ %if ARCH_X86_64
+ mov r5, r0 ; keep the dst base in r5; presumably add16_sse2_cycle's 64-bit path adds it back to r0 (the 32-bit path visibly uses r0m)
+ %endif
+ add16_sse2_cycle 0, 0xc ; args: cycle index, then a per-cycle constant (presumably an offset into nnzc - confirm against the macro head)
+ add16_sse2_cycle 1, 0x14 ; the macro calls h264_add8x4_idct_<cpuname> unless it jumps to its skip label
+ add16_sse2_cycle 2, 0xe ; for indices below 7 the macro also advances r2 (block) by 64 bytes
+ add16_sse2_cycle 3, 0x16
+ add16_sse2_cycle 4, 0x1c
+ add16_sse2_cycle 5, 0x24
+ add16_sse2_cycle 6, 0x1e
+ add16_sse2_cycle 7, 0x26 ; last cycle: index 7 fails the "%1 < 7" test, so r2 is not advanced
+RET
@@ -62,6 +62,7 @@ IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16, 8, avx)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
@@ -346,6 +347,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct8_add = ff_h264_idct8_add_8_avx;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_avx;
+ c->h264_idct_add16 = ff_h264_idct_add16_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {