Message ID | 20230220195703.1297421-3-jdarnley@obe.tv |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/3] avfilter: move bwdif's filter_line init into a dedicated function | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
Hi James,

Am Mo., 20. Feb. 2023 um 20:59 Uhr schrieb James Darnley <jdarnley@obe.tv>:

> 2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3
> ---
> libavfilter/x86/vf_bwdif.asm | 29 ++++++++++++++++++++++++-----
> libavfilter/x86/vf_bwdif_init.c | 12 ++++++++++++
> 2 files changed, 36 insertions(+), 5 deletions(-)
>
> diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
> index 0b453da53b..5cc61435fd 100644
> --- a/libavfilter/x86/vf_bwdif.asm
> +++ b/libavfilter/x86/vf_bwdif.asm
> @@ -26,18 +26,22 @@
>
> %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> +SECTION_RODATA 32
>
> -pw_coefhf: times 4 dw 1016, 5570
> -pw_coefhf1: times 8 dw -3801
> -pw_coefsp: times 4 dw 5077, -981
> -pw_splfdif: times 4 dw -768, 768
> +pw_coefhf: times 8 dw 1016, 5570
> +pw_coefhf1: times 16 dw -3801
> +pw_coefsp: times 8 dw 5077, -981
> +pw_splfdif: times 8 dw -768, 768
>
> SECTION .text
>
> %macro LOAD8 2
> + %if mmsize == 32
> + pmovzxbw %1, %2
> + %else
> movh %1, %2
> punpcklbw %1, m7
> + %endif
> %endmacro
>
> %macro LOAD12 2
> @@ -45,8 +49,14 @@ SECTION .text
> %endmacro
>
> %macro DISP8 0
> + %if mmsize == 32
> + vextracti128 xm1, m2, 1
> + packuswb xm2, xm1
> + movu [dstq], xm2
> + %else
> packuswb m2, m2
> movh [dstq], m2
> + %endif
> %endmacro
>
> %macro DISP12 0
> @@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst,
> prev, cur, next, w, \
> prefs, mrefs, prefs2,
> mrefs2, \
> prefs3, mrefs3, prefs4, \
> mrefs4, parity, clip_max
> + %if mmsize == 32
> + vpbroadcastd m12, DWORD clip_maxm
>

I get a green pattern at bit depths > 8.
Looks good with:
vpbroadcastw m12, WORD clip_maxm

+ %else
> movd m12, DWORD clip_maxm
> SPLATW m12, m12, 0
> + %endif
> %else
> cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
> prefs, mrefs, prefs2,
> mrefs2, \
> @@ -264,3 +278,8 @@ INIT_XMM ssse3
> BWDIF
> INIT_XMM sse2
> BWDIF
> +
> +%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
> +INIT_YMM avx2
> +BWDIF
> +%endif
> diff --git a/libavfilter/x86/vf_bwdif_init.c
> b/libavfilter/x86/vf_bwdif_init.c
> index ba7bc40c3d..f833318c10 100644
> --- a/libavfilter/x86/vf_bwdif_init.c
> +++ b/libavfilter/x86/vf_bwdif_init.c
> @@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev,
> void *cur, void *next,
> int w, int prefs, int mrefs, int prefs2,
> int mrefs2, int prefs3, int mrefs3, int
> prefs4,
> int mrefs4, int parity, int clip_max);
> +void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void
> *next,
> + int w, int prefs, int mrefs, int prefs2,
> + int mrefs2, int prefs3, int mrefs3, int
> prefs4,
> + int mrefs4, int parity, int clip_max);
>
> void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur,
> void *next,
> int w, int prefs, int mrefs, int
> prefs2,
> @@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void
> *prev, void *cur, void *ne
> int w, int prefs, int mrefs, int
> prefs2,
> int mrefs2, int prefs3, int mrefs3,
> int prefs4,
> int mrefs4, int parity, int
> clip_max);
> +void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur,
> void *next,
> + int w, int prefs, int mrefs, int
> prefs2,
> + int mrefs2, int prefs3, int mrefs3,
> int prefs4,
> + int mrefs4, int parity, int
> clip_max);
>
> av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
> {
> @@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif,
> int bit_depth)
> bwdif->filter_line = ff_bwdif_filter_line_sse2;
> if (EXTERNAL_SSSE3(cpu_flags))
> bwdif->filter_line = ff_bwdif_filter_line_ssse3;
> + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
> + bwdif->filter_line = ff_bwdif_filter_line_avx2;
> } else if (bit_depth <= 12) {
> if (EXTERNAL_SSE2(cpu_flags))
> bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
> if (EXTERNAL_SSSE3(cpu_flags))
> bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
> + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
> + bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
> }
> }
> --
> 2.39.1
On 3/11/23 17:14, Thomas Mundt wrote:
>> + %if mmsize == 32
>> + vpbroadcastd m12, DWORD clip_maxm
>>
>
> I get a green pattern at bit depths > 8.
> Looks good with:
> vpbroadcastw m12, WORD clip_maxm
>
> + %else
>> movd m12, DWORD clip_maxm
>> SPLATW m12, m12, 0
>> + %endif

Of course it should be a word broadcast! But why doesn't my checkasm test catch it?

>> bwdif->filter_line = ff_bwdif_filter_line_sse2;
>> if (EXTERNAL_SSSE3(cpu_flags))
>> bwdif->filter_line = ff_bwdif_filter_line_ssse3;
>> + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
>> + bwdif->filter_line = ff_bwdif_filter_line_avx2;
>> } else if (bit_depth <= 12) {
>> if (EXTERNAL_SSE2(cpu_flags))
>> bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
>> if (EXTERNAL_SSSE3(cpu_flags))
>> bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
>> + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
>> + bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
>> }
>> }

I was intending to only modify/write the 8-bit function so this is a mistake. Thanks. I'll be back with a version 2.

[re-sending to list]
diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm index 0b453da53b..5cc61435fd 100644 --- a/libavfilter/x86/vf_bwdif.asm +++ b/libavfilter/x86/vf_bwdif.asm @@ -26,18 +26,22 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_coefhf: times 4 dw 1016, 5570 -pw_coefhf1: times 8 dw -3801 -pw_coefsp: times 4 dw 5077, -981 -pw_splfdif: times 4 dw -768, 768 +pw_coefhf: times 8 dw 1016, 5570 +pw_coefhf1: times 16 dw -3801 +pw_coefsp: times 8 dw 5077, -981 +pw_splfdif: times 8 dw -768, 768 SECTION .text %macro LOAD8 2 + %if mmsize == 32 + pmovzxbw %1, %2 + %else movh %1, %2 punpcklbw %1, m7 + %endif %endmacro %macro LOAD12 2 @@ -45,8 +49,14 @@ SECTION .text %endmacro %macro DISP8 0 + %if mmsize == 32 + vextracti128 xm1, m2, 1 + packuswb xm2, xm1 + movu [dstq], xm2 + %else packuswb m2, m2 movh [dstq], m2 + %endif %endmacro %macro DISP12 0 @@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \ prefs, mrefs, prefs2, mrefs2, \ prefs3, mrefs3, prefs4, \ mrefs4, parity, clip_max + %if mmsize == 32 + vpbroadcastd m12, DWORD clip_maxm + %else movd m12, DWORD clip_maxm SPLATW m12, m12, 0 + %endif %else cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \ prefs, mrefs, prefs2, mrefs2, \ @@ -264,3 +278,8 @@ INIT_XMM ssse3 BWDIF INIT_XMM sse2 BWDIF + +%if HAVE_AVX2_EXTERNAL && ARCH_X86_64 +INIT_YMM avx2 +BWDIF +%endif diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c index ba7bc40c3d..f833318c10 100644 --- a/libavfilter/x86/vf_bwdif_init.c +++ b/libavfilter/x86/vf_bwdif_init.c @@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int 
mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int prefs2, @@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne int w, int prefs, int mrefs, int prefs2, int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs4, int parity, int clip_max); +void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) { @@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) bwdif->filter_line = ff_bwdif_filter_line_sse2; if (EXTERNAL_SSSE3(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_ssse3; + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_avx2; } else if (bit_depth <= 12) { if (EXTERNAL_SSE2(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2; if (EXTERNAL_SSSE3(cpu_flags)) bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3; + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags)) + bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2; } }