Message ID | 20231222011549.16057-2-jamrial@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/2] checkasm/takdsp: add decorrelate_sf test | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Thu, Dec 21, 2023 at 10:15:49PM -0300, James Almer wrote: > On an Intel Core i7 12700k: > > decorrelate_ls_c: 814.3 > decorrelate_ls_sse2: 165.8 > decorrelate_ls_avx2: 101.3 > decorrelate_sf_c: 1602.6 > decorrelate_sf_sse4: 640.1 > decorrelate_sf_avx2: 324.6 > decorrelate_sm_c: 1564.8 > decorrelate_sm_sse2: 379.3 > decorrelate_sm_avx2: 203.3 > decorrelate_sr_c: 785.3 > decorrelate_sr_sse2: 176.3 > decorrelate_sr_avx2: 99.8 > > Signed-off-by: James Almer <jamrial@gmail.com> on AMD Ryzen 9 3950X 16-Core Processor Illegal instruction (core dumped) threads=1 tests/Makefile:308: recipe for target 'fate-lossless-tak' failed make: *** [fate-lossless-tak] Error 132 (gdb) disassemble $rip-32, $rip+32 Dump of assembler code from 0x55555651a580 to 0x55555651a5c0: 0x000055555651a580: or $0x17,%al 0x000055555651a582: movdqa %xmm1,(%rdi,%rdx,1) 0x000055555651a587: add $0x10,%rdx 0x000055555651a58b: jl 0x55555651a562 0x000055555651a58d: retq 0x000055555651a58e: nop 0x000055555651a58f: nop 0x000055555651a590: shl $0x2,%edx 0x000055555651a593: add %rdx,%rdi 0x000055555651a596: add %rdx,%rsi 0x000055555651a599: neg %rdx 0x000055555651a59c: vmovd %ecx,%xmm2 => 0x000055555651a5a0: vpbroadcastd %r8d,%ymm3 0x000055555651a5a6: vbroadcasti128 0x4bc751(%rip),%ymm4 # 0x5555569d6d00 0x000055555651a5af: vmovdqa (%rsi,%rdx,1),%ymm1 0x000055555651a5b4: vpsrad %xmm2,%ymm1,%ymm1 0x000055555651a5b8: vpmulld %ymm3,%ymm1,%ymm1 0x000055555651a5bd: vpaddd %ymm4,%ymm1,%ymm1 End of assembler dump. [...]
On 12/22/2023 8:08 PM, Michael Niedermayer wrote: > On Thu, Dec 21, 2023 at 10:15:49PM -0300, James Almer wrote: >> On an Intel Core i7 12700k: >> >> decorrelate_ls_c: 814.3 >> decorrelate_ls_sse2: 165.8 >> decorrelate_ls_avx2: 101.3 >> decorrelate_sf_c: 1602.6 >> decorrelate_sf_sse4: 640.1 >> decorrelate_sf_avx2: 324.6 >> decorrelate_sm_c: 1564.8 >> decorrelate_sm_sse2: 379.3 >> decorrelate_sm_avx2: 203.3 >> decorrelate_sr_c: 785.3 >> decorrelate_sr_sse2: 176.3 >> decorrelate_sr_avx2: 99.8 >> >> Signed-off-by: James Almer <jamrial@gmail.com> > > on AMD Ryzen 9 3950X 16-Core Processor > > Illegal instruction (core dumped) > threads=1 > tests/Makefile:308: recipe for target 'fate-lossless-tak' failed > make: *** [fate-lossless-tak] Error 132 > > (gdb) disassemble $rip-32, $rip+32 > Dump of assembler code from 0x55555651a580 to 0x55555651a5c0: > 0x000055555651a580: or $0x17,%al > 0x000055555651a582: movdqa %xmm1,(%rdi,%rdx,1) > 0x000055555651a587: add $0x10,%rdx > 0x000055555651a58b: jl 0x55555651a562 > 0x000055555651a58d: retq > 0x000055555651a58e: nop > 0x000055555651a58f: nop > 0x000055555651a590: shl $0x2,%edx > 0x000055555651a593: add %rdx,%rdi > 0x000055555651a596: add %rdx,%rsi > 0x000055555651a599: neg %rdx > 0x000055555651a59c: vmovd %ecx,%xmm2 > => 0x000055555651a5a0: vpbroadcastd %r8d,%ymm3 Right, on Linux the fifth argument is passed in a GPR, and vpbroadcastd with a GPR source is an AVX-512 instruction. Will fix and resend. > 0x000055555651a5a6: vbroadcasti128 0x4bc751(%rip),%ymm4 # 0x5555569d6d00 > 0x000055555651a5af: vmovdqa (%rsi,%rdx,1),%ymm1 > 0x000055555651a5b4: vpsrad %xmm2,%ymm1,%ymm1 > 0x000055555651a5b8: vpmulld %ymm3,%ymm1,%ymm1 > 0x000055555651a5bd: vpaddd %ymm4,%ymm1,%ymm1 > End of assembler dump. > > > [...] 
> > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm index be8e1ab553..a5501cc285 100644 --- a/libavcodec/x86/takdsp.asm +++ b/libavcodec/x86/takdsp.asm @@ -28,7 +28,7 @@ pd_128: times 4 dd 128 SECTION .text -INIT_XMM sse2 +%macro TAK_DECORRELATE 0 cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length shl lengthd, 2 add p1q, lengthq @@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length mova m1, [p2q+lengthq] mova m3, [p1q+lengthq+mmsize] mova m4, [p2q+lengthq+mmsize] - mova m2, m1 - mova m5, m4 - psrad m2, 1 - psrad m5, 1 + psrad m2, m1, 1 + psrad m5, m4, 1 psubd m0, m2 psubd m3, m5 paddd m1, m0 @@ -88,29 +86,39 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length add lengthq, mmsize*2 jl .loop RET +%endmacro -INIT_XMM sse4 +INIT_XMM sse2 +TAK_DECORRELATE +INIT_YMM avx2 +TAK_DECORRELATE + +%macro TAK_DECORRELATE_SF 0 cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor shl lengthd, 2 add p1q, lengthq add p2q, lengthq neg lengthq - movd m2, dshiftm - movd m3, dfactorm - pshufd m3, m3, 0 - mova m4, [pd_128] + movd xm2, dshiftm + VPBROADCASTD m3, dfactorm + VBROADCASTI128 m4, [pd_128] .loop: - mova m0, [p1q+lengthq] mova m1, [p2q+lengthq] - psrad m1, m2 + psrad m1, xm2 pmulld m1, m3 paddd m1, m4 psrad m1, 8 - pslld m1, m2 - psubd m1, m0 + pslld m1, xm2 + psubd m1, [p1q+lengthq] mova [p1q+lengthq], m1 add lengthq, mmsize jl .loop RET +%endmacro + +INIT_XMM sse4 +TAK_DECORRELATE_SF +INIT_YMM avx2 +TAK_DECORRELATE_SF diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c index b2e6e639ee..c99a057b24 100644 --- a/libavcodec/x86/takdsp_init.c +++ b/libavcodec/x86/takdsp_init.c @@ -24,9 +24,13 @@ #include "config.h" void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length); +void ff_tak_decorrelate_ls_avx2(int32_t *p1, int32_t *p2, int length); void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length); +void ff_tak_decorrelate_sr_avx2(int32_t *p1, int32_t *p2, int length); void 
ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length); +void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length); void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor); +void ff_tak_decorrelate_sf_avx2(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor); av_cold void ff_takdsp_init_x86(TAKDSPContext *c) { @@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c) if (EXTERNAL_SSE4(cpu_flags)) { c->decorrelate_sf = ff_tak_decorrelate_sf_sse4; } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + c->decorrelate_ls = ff_tak_decorrelate_ls_avx2; + c->decorrelate_sr = ff_tak_decorrelate_sr_avx2; + c->decorrelate_sm = ff_tak_decorrelate_sm_avx2; + c->decorrelate_sf = ff_tak_decorrelate_sf_avx2; + } #endif }
On an Intel Core i7 12700k: decorrelate_ls_c: 814.3 decorrelate_ls_sse2: 165.8 decorrelate_ls_avx2: 101.3 decorrelate_sf_c: 1602.6 decorrelate_sf_sse4: 640.1 decorrelate_sf_avx2: 324.6 decorrelate_sm_c: 1564.8 decorrelate_sm_sse2: 379.3 decorrelate_sm_avx2: 203.3 decorrelate_sr_c: 785.3 decorrelate_sr_sse2: 176.3 decorrelate_sr_avx2: 99.8 Signed-off-by: James Almer <jamrial@gmail.com> --- libavcodec/x86/takdsp.asm | 36 ++++++++++++++++++++++-------------- libavcodec/x86/takdsp_init.c | 11 +++++++++++ 2 files changed, 33 insertions(+), 14 deletions(-)