Message ID | 20240605202853.3135-2-jamrial@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/2] swscale/x86/input: add AVX2 optimized RGB32 to YUV functions | expand |
Context | Check | Description |
---|---|---|
andriy/configure_x86 | warning | Failed to apply patch |
James Almer: > uyvytoyuv422_c: 23991.8 > uyvytoyuv422_sse2: 2817.8 > uyvytoyuv422_avx: 2819.3 Why don't you nuke the avx version in a follow-up patch? > uyvytoyuv422_avx2: 1972.3 > > Signed-off-by: James Almer <jamrial@gmail.com> > --- > libswscale/x86/rgb2rgb.c | 6 ++++++ > libswscale/x86/rgb_2_rgb.asm | 32 ++++++++++++++++++++++++-------- > 2 files changed, 30 insertions(+), 8 deletions(-) > > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c > index b325e5dbd5..21ccfafe51 100644 > --- a/libswscale/x86/rgb2rgb.c > +++ b/libswscale/x86/rgb2rgb.c > @@ -136,6 +136,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > const uint8_t *src, int width, int height, > int lumStride, int chromStride, int srcStride); > +void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > + const uint8_t *src, int width, int height, > + int lumStride, int chromStride, int srcStride); > #endif > > av_cold void rgb2rgb_init_x86(void) > @@ -177,5 +180,8 @@ av_cold void rgb2rgb_init_x86(void) > if (EXTERNAL_AVX(cpu_flags)) { > uyvytoyuv422 = ff_uyvytoyuv422_avx; > } > + if (EXTERNAL_AVX2_FAST(cpu_flags)) { > + uyvytoyuv422 = ff_uyvytoyuv422_avx2; > + } > #endif > } > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm > index 76ca1eec03..0bf1278718 100644 > --- a/libswscale/x86/rgb_2_rgb.asm > +++ b/libswscale/x86/rgb_2_rgb.asm > @@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 > > SECTION .text > > -%macro RSHIFT_COPY 3 > +%macro RSHIFT_COPY 5 > ; %1 dst ; %2 src ; %3 shift > -%if cpuflag(avx) > - psrldq %1, %2, %3 > +%if mmsize == 32 > + vperm2i128 %1, %2, %3, %5 > + RSHIFT %1, %4 > +%elif cpuflag(avx) > + psrldq %1, %2, %4 > %else > mova %1, %2 > - RSHIFT %1, %3 > + RSHIFT %1, %4 > %endif > %endmacro > > @@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, 
chrom_s > jge .end_line > > .loop_simd: > +%if mmsize == 32 > + movu xm2, [srcq + wtwoq ] > + movu xm3, [srcq + wtwoq + 16 ] > + movu xm4, [srcq + wtwoq + 16 * 2] > + movu xm5, [srcq + wtwoq + 16 * 3] > + vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1 > + vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1 > + vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1 > + vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1 > +%else > movu m2, [srcq + wtwoq ] > movu m3, [srcq + wtwoq + mmsize ] > movu m4, [srcq + wtwoq + mmsize * 2] > movu m5, [srcq + wtwoq + mmsize * 3] > +%endif > > ; extract y part 1 > - RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY... > + RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY... > pand m6, m1; YxYx YxYx... > > - RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY... > + RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY... > pand m7, m1 ; YxYx YxYx... > > packuswb m6, m7 ; YYYY YYYY... > movu [ydstq + wq], m6 > > ; extract y part 2 > - RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY... > + RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY... > pand m6, m1; YxYx YxYx... > > - RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY... > + RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY... > pand m7, m1 ; YxYx YxYx... > > packuswb m6, m7 ; YYYY YYYY... > @@ -309,4 +323,6 @@ UYVY_TO_YUV422 > > INIT_XMM avx > UYVY_TO_YUV422 > +INIT_YMM avx2 > +UYVY_TO_YUV422 > %endif
Le 6 juin 2024 00:00:57 GMT+03:00, Andreas Rheinhardt <andreas.rheinhardt@outlook.com> a écrit : >James Almer: >> uyvytoyuv422_c: 23991.8 >> uyvytoyuv422_sse2: 2817.8 >> uyvytoyuv422_avx: 2819.3 > >Why don't you nuke the avx version in a follow-up patch? Same problem with the RGBA stuff as well. Are the AVX functions expected to be faster than SSE2 on processors *without* AVX2?
Le jeu. 6 juin 2024 à 08:11, Rémi Denis-Courmont <remi@remlab.net> a écrit : > >James Almer: > >> uyvytoyuv422_c: 23991.8 > >> uyvytoyuv422_sse2: 2817.8 > >> uyvytoyuv422_avx: 2819.3 > > > >Why don't you nuke the avx version in a follow-up patch? > > Same problem with the RGBA stuff as well. Are the AVX functions expected to be faster than SSE2 on processors *without* AVX2? Something frequent in this type of question is that people are using numbers from a CPU that has had 10 years of arch improvements (and probably a doubling in throughput for any instruction set) over one that supported at most AVX. The presence of an AVX function (whose benefit is only 3-operand instructions, so admittedly small) would ideally only be benchmarked on that kind of CPU. Case in point, at that time, even x264 introduced avx versions, so there was a time and CPU generations where yes, it was faster: https://code.videolan.org/search?search=INIT_XMM%20avx&nav_source=navbar&project_id=536&group_id=9&search_code=true&repository_ref=master https://code.videolan.org/videolan/x264/-/commit/abc2283e9abc6254744bf6dd148ac25433cdf80e But I understand the point is that any type of maintenance for a minor improvement to a few CPUs, which are maybe 1% of a userbase, is not appealing.
Le 6 juin 2024 10:01:24 GMT+03:00, Christophe Gisquet <christophe.gisquet@gmail.com> a écrit : >Le jeu. 6 juin 2024 à 08:11, Rémi Denis-Courmont <remi@remlab.net> a écrit : >> >James Almer: >> >> uyvytoyuv422_c: 23991.8 >> >> uyvytoyuv422_sse2: 2817.8 >> >> uyvytoyuv422_avx: 2819.3 >> > >> >Why don't you nuke the avx version in a follow-up patch? >> >> Same problem with the RGBA stuff as well. Are the AVX functions expected to be faster than SSE2 on processors *without* AVX2? > >Something frequent in this type of questions is that people are using >numbers from a CPU that has had 10 years of arch improvements (and >probably a doubling in throughput for any instruction set) over one >that supported at most AVX. The presence of an AVX function (whose >benefit is only 3-operand instructions, so admittedly small) would >ideally only be benchmarked on that kind of CPUs. It feels a bit dense for someone not intimate with x86 innards such as I. Intuitively, 3 operands instructions are certainly helpful in avoiding vector copies around destructive operations. But it should be clear if, for any given function, this does or does not help. That being said, I have no objections as such to run-time optimisations for middle-aged processors. If anything, I think we should auto-trim the useless C code at least via DCE if we want to save space. We can assume at least SSE2 on x86-64, no? Of course this will break checkasm and some command line flags.
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index b325e5dbd5..21ccfafe51 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -136,6 +136,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); +void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); #endif av_cold void rgb2rgb_init_x86(void) @@ -177,5 +180,8 @@ av_cold void rgb2rgb_init_x86(void) if (EXTERNAL_AVX(cpu_flags)) { uyvytoyuv422 = ff_uyvytoyuv422_avx; } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + uyvytoyuv422 = ff_uyvytoyuv422_avx2; + } #endif } diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index 76ca1eec03..0bf1278718 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 SECTION .text -%macro RSHIFT_COPY 3 +%macro RSHIFT_COPY 5 ; %1 dst ; %2 src ; %3 shift -%if cpuflag(avx) - psrldq %1, %2, %3 +%if mmsize == 32 + vperm2i128 %1, %2, %3, %5 + RSHIFT %1, %4 +%elif cpuflag(avx) + psrldq %1, %2, %4 %else mova %1, %2 - RSHIFT %1, %3 + RSHIFT %1, %4 %endif %endmacro @@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s jge .end_line .loop_simd: +%if mmsize == 32 + movu xm2, [srcq + wtwoq ] + movu xm3, [srcq + wtwoq + 16 ] + movu xm4, [srcq + wtwoq + 16 * 2] + movu xm5, [srcq + wtwoq + 16 * 3] + vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1 + vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1 + vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1 + vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1 +%else movu m2, [srcq + wtwoq ] movu m3, [srcq + wtwoq + mmsize ] movu m4, [srcq + wtwoq + mmsize * 2] movu m5, [srcq 
+ wtwoq + mmsize * 3] +%endif ; extract y part 1 - RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... - RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY... pand m7, m1 ; YxYx YxYx... packuswb m6, m7 ; YYYY YYYY... movu [ydstq + wq], m6 ; extract y part 2 - RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... - RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY... + RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY... pand m7, m1 ; YxYx YxYx... packuswb m6, m7 ; YYYY YYYY... @@ -309,4 +323,6 @@ UYVY_TO_YUV422 INIT_XMM avx UYVY_TO_YUV422 +INIT_YMM avx2 +UYVY_TO_YUV422 %endif
uyvytoyuv422_c: 23991.8 uyvytoyuv422_sse2: 2817.8 uyvytoyuv422_avx: 2819.3 uyvytoyuv422_avx2: 1972.3 Signed-off-by: James Almer <jamrial@gmail.com> --- libswscale/x86/rgb2rgb.c | 6 ++++++ libswscale/x86/rgb_2_rgb.asm | 32 ++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-)