diff mbox series

[FFmpeg-devel,v1] scale: Bring back the old yuv2yuvX, use it when disable-x86asm.

Message ID 20240426122142.60282-1-heng.hu.1989@gmail.com
State New
Headers show
Series [FFmpeg-devel,v1] scale: Bring back the old yuv2yuvX, use it when disable-x86asm. | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

hu heng April 26, 2024, 12:21 p.m. UTC
From: huheng <heng.hu.1989@gmail.com>

Rename the old inline yuv2yuvX to yuv2yuv_X to avoid conflicts with
the names of standalone asm functions. When ffmpeg is compiled with
--disable-x86asm, using the scale function will cause the video to
be blurred. The reason is that when --disable-x86asm is used, INLINE_MMXEXT
is 1 and use_mmx_vfilter is 1, but c->yuv2planeX uses the c language
version, which causes a problem of mismatch with the vfilter. This
problem has persisted from version 4.4 to the present. Fix it by using
the inline yuv2yuv_X_mmxext, which can maintain the consistency of
use_mmx_vfilter.

reproduce the issue:
1. ./configure --disable-x86asm --enable-gpl --enable-libx264
2. ./ffmpeg -i input.mp4 -vf "scale=1280x720" -c:v libx264 output.mp4
The output.mp4 is abnormal.

Signed-off-by: huheng <heng.hu.1989@gmail.com>
---
 libswscale/x86/swscale.c          |  6 +++-
 libswscale/x86/swscale_template.c | 53 +++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

Comments

hu heng May 2, 2024, 8:07 p.m. UTC | #1
<heng.hu.1989@gmail.com> 于2024年4月26日周五 20:21写道:
>
> From: huheng <heng.hu.1989@gmail.com>
>
> rename old inline yuv2yuvX to yuv2yuv_X, to avoid conflicts with
> the names of standalone asm functions. When ffmpeg is compiled with
> --disable-x86asm, using the scale function will cause the video to
> be blurred. The reason is that when disable-x86asm, INLINE_MMXEXT
> is 1 and use_mmx_vfilter is 1, but c->yuv2planeX uses the c language
> version, which causes a problem of mismatch with the vfilter. This
> problem has persisted from version 4.4 to the present. Fix it by using
> inline yuv2yuv_X_mmxext, that can maintain the consistency of
> use_mmx_vfilter.
>
> reproduce the issue:
> 1. ./configure --disable-x86asm --enable-gpl --enable-libx264
> 2. ./ffmpeg -i input.mp4 -vf "scale=1280x720" -c:v libx264 output.mp4
> the output.mp4 is abnormal
>
> Signed-off-by: huheng <heng.hu.1989@gmail.com>
> ---
>  libswscale/x86/swscale.c          |  6 +++-
>  libswscale/x86/swscale_template.c | 53 +++++++++++++++++++++++++++++++
>  2 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index ff16398988..1bb9d1d51a 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -452,8 +452,12 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
>      int cpu_flags = av_get_cpu_flags();
>
>  #if HAVE_MMXEXT_INLINE
> -    if (INLINE_MMXEXT(cpu_flags))
> +    if (INLINE_MMXEXT(cpu_flags)) {
>          sws_init_swscale_mmxext(c);
> +        if (c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
> +            c->yuv2planeX = yuv2yuv_X_mmxext;
> +        }
> +    }
>  #endif
>      if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
>  #if HAVE_MMXEXT_EXTERNAL
> diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
> index 6190fcb4fe..1b8794480d 100644
> --- a/libswscale/x86/swscale_template.c
> +++ b/libswscale/x86/swscale_template.c
> @@ -33,6 +33,59 @@
>  #define MOVNTQ2 "movntq "
>  #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
>
> +static void RENAME(yuv2yuv_X)(const int16_t *filter, int filterSize,
> +                           const int16_t **src, uint8_t *dest, int dstW,
> +                           const uint8_t *dither, int offset)
> +{
> +    filterSize--;
> +    __asm__ volatile(
> +        "movd %0, %%mm1\n\t"
> +        "punpcklwd %%mm1, %%mm1\n\t"
> +        "punpckldq %%mm1, %%mm1\n\t"
> +        "psllw        $3, %%mm1\n\t"
> +        "paddw     %%mm1, %%mm3\n\t"
> +        "paddw     %%mm1, %%mm4\n\t"
> +        "psraw        $4, %%mm3\n\t"
> +        "psraw        $4, %%mm4\n\t"
> +        ::"m"(filterSize)
> +     );
> +
> +    __asm__ volatile(\
> +        "movq    %%mm3, %%mm6\n\t"
> +        "movq    %%mm4, %%mm7\n\t"
> +        "movl %3, %%ecx\n\t"
> +        "mov                                 %0, %%"FF_REG_d"       \n\t"\
> +        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
> +        ".p2align                             4                     \n\t" /* FIXME Unroll? */\
> +        "1:                                                         \n\t"\
> +        "movq                      8(%%"FF_REG_d"), %%mm0           \n\t" /* filterCoeff */\
> +        "movq                (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
> +        "movq               8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
> +        "add                                $16, %%"FF_REG_d"       \n\t"\
> +        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
> +        "test                         %%"FF_REG_S", %%"FF_REG_S"    \n\t"\
> +        "pmulhw                           %%mm0, %%mm2      \n\t"\
> +        "pmulhw                           %%mm0, %%mm5      \n\t"\
> +        "paddw                            %%mm2, %%mm3      \n\t"\
> +        "paddw                            %%mm5, %%mm4      \n\t"\
> +        " jnz                                1b             \n\t"\
> +        "psraw                               $3, %%mm3      \n\t"\
> +        "psraw                               $3, %%mm4      \n\t"\
> +        "packuswb                         %%mm4, %%mm3      \n\t"
> +        MOVNTQ2 "                         %%mm3, (%1, %%"FF_REG_c")\n\t"
> +        "add                          $8, %%"FF_REG_c"      \n\t"\
> +        "cmp                          %2, %%"FF_REG_c"      \n\t"\
> +        "movq    %%mm6, %%mm3\n\t"
> +        "movq    %%mm7, %%mm4\n\t"
> +        "mov                                 %0, %%"FF_REG_d"     \n\t"\
> +        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
> +        "jb                                  1b                   \n\t"\
> +        :: "g" (filter),
> +           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
> +        : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> +    );
> +}
> +
>  #define YSCALEYUV2PACKEDX_UV \
>      __asm__ volatile(\
>          "xor                %%"FF_REG_a", %%"FF_REG_a"  \n\t"\
> --
> 2.20.1
>
Request for review and some additional information:
I found that when compiling ffmpeg with the '--disable-x86asm' option,
there is an issue with the scale function causing video distortion.
This issue is very easily reproducible. I used bisect to identify the
problem and found that it was introduced after commit 554c2bc7086f49.
This commit moved the inline assembly function yuv2yuvX_sse3 to
standalone assembly, which broke the scale function under the
'disable-x86asm' option. My patch restores the original inline
assembly code under the 'disable-x86asm' option, fixing this issue.
Michael Niedermayer May 4, 2024, 12:21 a.m. UTC | #2
On Fri, May 03, 2024 at 04:07:36AM +0800, hu heng wrote:
> <heng.hu.1989@gmail.com> 于2024年4月26日周五 20:21写道:
> >
> > From: huheng <heng.hu.1989@gmail.com>
> >
> > rename old inline yuv2yuvX to yuv2yuv_X, to avoid conflicts with
> > the names of standalone asm functions. When ffmpeg is compiled with
> > --disable-x86asm, using the scale function will cause the video to
> > be blurred. The reason is that when disable-x86asm, INLINE_MMXEXT
> > is 1 and use_mmx_vfilter is 1, but c->yuv2planeX uses the c language
> > version, which causes a problem of mismatch with the vfilter. This
> > problem has persisted from version 4.4 to the present. Fix it by using
> > inline yuv2yuv_X_mmxext, that can maintain the consistency of
> > use_mmx_vfilter.
> >
> > reproduce the issue:
> > 1. ./configure --disable-x86asm --enable-gpl --enable-libx264
> > 2. ./ffmpeg -i input.mp4 -vf "scale=1280x720" -c:v libx264 output.mp4
> > the output.mp4 is abnormal
> >
> > Signed-off-by: huheng <heng.hu.1989@gmail.com>
> > ---
> >  libswscale/x86/swscale.c          |  6 +++-
> >  libswscale/x86/swscale_template.c | 53 +++++++++++++++++++++++++++++++
> >  2 files changed, 58 insertions(+), 1 deletion(-)
> >
> > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> > index ff16398988..1bb9d1d51a 100644
> > --- a/libswscale/x86/swscale.c
> > +++ b/libswscale/x86/swscale.c
> > @@ -452,8 +452,12 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
> >      int cpu_flags = av_get_cpu_flags();
> >
> >  #if HAVE_MMXEXT_INLINE
> > -    if (INLINE_MMXEXT(cpu_flags))
> > +    if (INLINE_MMXEXT(cpu_flags)) {
> >          sws_init_swscale_mmxext(c);
> > +        if (c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
> > +            c->yuv2planeX = yuv2yuv_X_mmxext;
> > +        }
> > +    }
> >  #endif
> >      if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
> >  #if HAVE_MMXEXT_EXTERNAL
> > diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
> > index 6190fcb4fe..1b8794480d 100644
> > --- a/libswscale/x86/swscale_template.c
> > +++ b/libswscale/x86/swscale_template.c
> > @@ -33,6 +33,59 @@
> >  #define MOVNTQ2 "movntq "
> >  #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
> >
> > +static void RENAME(yuv2yuv_X)(const int16_t *filter, int filterSize,
> > +                           const int16_t **src, uint8_t *dest, int dstW,
> > +                           const uint8_t *dither, int offset)
> > +{
> > +    filterSize--;
> > +    __asm__ volatile(
> > +        "movd %0, %%mm1\n\t"
> > +        "punpcklwd %%mm1, %%mm1\n\t"
> > +        "punpckldq %%mm1, %%mm1\n\t"
> > +        "psllw        $3, %%mm1\n\t"
> > +        "paddw     %%mm1, %%mm3\n\t"
> > +        "paddw     %%mm1, %%mm4\n\t"
> > +        "psraw        $4, %%mm3\n\t"
> > +        "psraw        $4, %%mm4\n\t"
> > +        ::"m"(filterSize)
> > +     );
> > +
> > +    __asm__ volatile(\
> > +        "movq    %%mm3, %%mm6\n\t"
> > +        "movq    %%mm4, %%mm7\n\t"
> > +        "movl %3, %%ecx\n\t"
> > +        "mov                                 %0, %%"FF_REG_d"       \n\t"\
> > +        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
> > +        ".p2align                             4                     \n\t" /* FIXME Unroll? */\
> > +        "1:                                                         \n\t"\
> > +        "movq                      8(%%"FF_REG_d"), %%mm0           \n\t" /* filterCoeff */\
> > +        "movq                (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
> > +        "movq               8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
> > +        "add                                $16, %%"FF_REG_d"       \n\t"\
> > +        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
> > +        "test                         %%"FF_REG_S", %%"FF_REG_S"    \n\t"\
> > +        "pmulhw                           %%mm0, %%mm2      \n\t"\
> > +        "pmulhw                           %%mm0, %%mm5      \n\t"\
> > +        "paddw                            %%mm2, %%mm3      \n\t"\
> > +        "paddw                            %%mm5, %%mm4      \n\t"\
> > +        " jnz                                1b             \n\t"\
> > +        "psraw                               $3, %%mm3      \n\t"\
> > +        "psraw                               $3, %%mm4      \n\t"\
> > +        "packuswb                         %%mm4, %%mm3      \n\t"
> > +        MOVNTQ2 "                         %%mm3, (%1, %%"FF_REG_c")\n\t"
> > +        "add                          $8, %%"FF_REG_c"      \n\t"\
> > +        "cmp                          %2, %%"FF_REG_c"      \n\t"\
> > +        "movq    %%mm6, %%mm3\n\t"
> > +        "movq    %%mm7, %%mm4\n\t"
> > +        "mov                                 %0, %%"FF_REG_d"     \n\t"\
> > +        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
> > +        "jb                                  1b                   \n\t"\
> > +        :: "g" (filter),
> > +           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
> > +        : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> > +    );
> > +}
> > +
> >  #define YSCALEYUV2PACKEDX_UV \
> >      __asm__ volatile(\
> >          "xor                %%"FF_REG_a", %%"FF_REG_a"  \n\t"\
> > --
> > 2.20.1
> >
> Request for review and some additional information:
> I found that when compiling ffmpeg with the '-disable-x86asm' option,
> there is an issue with the scale function causing video distortion.
> This issue is very easily reproducible. I used bisect to identify the
> problem and found that it was introduced after commit 554c2bc7086f49.
> This commit moved the inline assembly function yuv2yuvX_sse3 to
> standalone assembly, which broke the scale function under the
> 'disable-x86asm' option. My patch restores the original inline
> assembly code under the 'disable-x86asm' option, fixing this issue.

Having an inline asm AND an external asm implementation of the same
thing is not ideal.

I think moving all asm to yasm/nasm fits more into FFmpeg's direction.
That would likely make things consistent and make this sort of bug disappear.

thx

[...]
diff mbox series

Patch

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index ff16398988..1bb9d1d51a 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -452,8 +452,12 @@  av_cold void ff_sws_init_swscale_x86(SwsContext *c)
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_MMXEXT_INLINE
-    if (INLINE_MMXEXT(cpu_flags))
+    if (INLINE_MMXEXT(cpu_flags)) {
         sws_init_swscale_mmxext(c);
+        if (c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
+            c->yuv2planeX = yuv2yuv_X_mmxext;
+        }
+    }
 #endif
     if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
 #if HAVE_MMXEXT_EXTERNAL
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 6190fcb4fe..1b8794480d 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -33,6 +33,59 @@ 
 #define MOVNTQ2 "movntq "
 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
 
+static void RENAME(yuv2yuv_X)(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    filterSize--;
+    __asm__ volatile(
+        "movd %0, %%mm1\n\t"
+        "punpcklwd %%mm1, %%mm1\n\t"
+        "punpckldq %%mm1, %%mm1\n\t"
+        "psllw        $3, %%mm1\n\t"
+        "paddw     %%mm1, %%mm3\n\t"
+        "paddw     %%mm1, %%mm4\n\t"
+        "psraw        $4, %%mm3\n\t"
+        "psraw        $4, %%mm4\n\t"
+        ::"m"(filterSize)
+     );
+
+    __asm__ volatile(\
+        "movq    %%mm3, %%mm6\n\t"
+        "movq    %%mm4, %%mm7\n\t"
+        "movl %3, %%ecx\n\t"
+        "mov                                 %0, %%"FF_REG_d"       \n\t"\
+        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
+        ".p2align                             4                     \n\t" /* FIXME Unroll? */\
+        "1:                                                         \n\t"\
+        "movq                      8(%%"FF_REG_d"), %%mm0           \n\t" /* filterCoeff */\
+        "movq                (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
+        "movq               8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
+        "add                                $16, %%"FF_REG_d"       \n\t"\
+        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
+        "test                         %%"FF_REG_S", %%"FF_REG_S"    \n\t"\
+        "pmulhw                           %%mm0, %%mm2      \n\t"\
+        "pmulhw                           %%mm0, %%mm5      \n\t"\
+        "paddw                            %%mm2, %%mm3      \n\t"\
+        "paddw                            %%mm5, %%mm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%mm3      \n\t"\
+        "psraw                               $3, %%mm4      \n\t"\
+        "packuswb                         %%mm4, %%mm3      \n\t"
+        MOVNTQ2 "                         %%mm3, (%1, %%"FF_REG_c")\n\t"
+        "add                          $8, %%"FF_REG_c"      \n\t"\
+        "cmp                          %2, %%"FF_REG_c"      \n\t"\
+        "movq    %%mm6, %%mm3\n\t"
+        "movq    %%mm7, %%mm4\n\t"
+        "mov                                 %0, %%"FF_REG_d"     \n\t"\
+        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
+        "jb                                  1b                   \n\t"\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
+    );
+}
+
 #define YSCALEYUV2PACKEDX_UV \
     __asm__ volatile(\
         "xor                %%"FF_REG_a", %%"FF_REG_a"  \n\t"\