@@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text
-INIT_XMM sse2
+%macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2
add p1q, lengthq
@@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize]
- mova m2, m1
- mova m5, m4
- psrad m2, 1
- psrad m5, 1
+ psrad m2, m1, 1
+ psrad m5, m4, 1
psubd m0, m2
psubd m3, m5
paddd m1, m0
@@ -88,29 +86,39 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2
jl .loop
RET
+%endmacro
-INIT_XMM sse4
+INIT_XMM sse2
+TAK_DECORRELATE
+INIT_YMM avx2
+TAK_DECORRELATE
+
+%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
shl lengthd, 2
add p1q, lengthq
add p2q, lengthq
neg lengthq
- movd m2, dshiftm
- movd m3, dfactorm
- pshufd m3, m3, 0
- mova m4, [pd_128]
+ movd xm2, dshiftm ; shift count is kept in an xmm register: psrad/pslld read their variable count from an xmm operand even when the data register is ymm
+ VPBROADCASTD m3, dfactorm ; dfactor replicated into every dword lane (replaces the movd + pshufd pair)
+ VBROADCASTI128 m4, [pd_128] ; 16-byte rounding constant (4 x 128) copied into each 128-bit lane; presumably a plain load in the xmm build - confirm against the macro definition
.loop:
- mova m0, [p1q+lengthq]
mova m1, [p2q+lengthq]
- psrad m1, m2
+ psrad m1, xm2 ; m1 = p2 >> dshift (arithmetic, per dword)
pmulld m1, m3
paddd m1, m4
psrad m1, 8
- pslld m1, m2
- psubd m1, m0
+ pslld m1, xm2 ; scale the result back up by dshift
+ psubd m1, [p1q+lengthq] ; subtract p1 directly from memory, replacing the separate mova load into m0
mova [p1q+lengthq], m1
add lengthq, mmsize
jl .loop
RET
+%endmacro
+
+INIT_XMM sse4
+TAK_DECORRELATE_SF
+INIT_YMM avx2
+TAK_DECORRELATE_SF
@@ -24,9 +24,13 @@
#include "config.h"
void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_ls_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+void ff_tak_decorrelate_sf_avx2(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
@@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->decorrelate_ls = ff_tak_decorrelate_ls_avx2;
+ c->decorrelate_sr = ff_tak_decorrelate_sr_avx2;
+ c->decorrelate_sm = ff_tak_decorrelate_sm_avx2;
+ c->decorrelate_sf = ff_tak_decorrelate_sf_avx2;
+ }
#endif
}
On an Intel Core i7 12700k:

decorrelate_ls_c:     814.3
decorrelate_ls_sse2:  165.8
decorrelate_ls_avx2:  101.3
decorrelate_sf_c:    1602.6
decorrelate_sf_sse4:  640.1
decorrelate_sf_avx2:  324.6
decorrelate_sm_c:    1564.8
decorrelate_sm_sse2:  379.3
decorrelate_sm_avx2:  203.3
decorrelate_sr_c:     785.3
decorrelate_sr_sse2:  176.3
decorrelate_sr_avx2:   99.8

Signed-off-by: James Almer <jamrial@gmail.com>
---
No changes since last version

 libavcodec/x86/takdsp.asm    | 36 ++++++++++++++++++++++--------------
 libavcodec/x86/takdsp_init.c | 11 +++++++++++
 2 files changed, 33 insertions(+), 14 deletions(-)