diff mbox series

[FFmpeg-devel,3/3] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.

Message ID 20230714100847.475017-1-alankelly@google.com
State New
Headers show
Series [FFmpeg-devel,1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Alan Kelly July 14, 2023, 10:08 a.m. UTC
---
 libswscale/x86/swscale.c    | 11 ++++-------
 libswscale/x86/yuv2yuvX.asm | 12 ++++++++++--
 2 files changed, 14 insertions(+), 9 deletions(-)

Comments

Michael Niedermayer July 15, 2023, 8:39 p.m. UTC | #1
On Fri, Jul 14, 2023 at 12:08:46PM +0200, Alan Kelly wrote:
> ---
>  libswscale/x86/swscale.c    | 11 ++++-------
>  libswscale/x86/yuv2yuvX.asm | 12 ++++++++++--
>  2 files changed, 14 insertions(+), 9 deletions(-)

seems to segfault with

./ffmpeg_g -i mm-short.mpg -an -vcodec snow -t 0.2 -bitexact -pix_fmt yuv410p -s 199x199 -vstrict -2 -y  snow3914-199-410.avi

Thread 79 "ffmpeg_g" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fffaffef700 (LWP 23533)]
0x000055555658a0f6 in ff_yuv2yuvX_sse3 ()
(gdb) bt
#0  0x000055555658a0f6 in ff_yuv2yuvX_sse3 ()
#1  0x0000555556585bc6 in chr_planar_vscale ()
#2  0x00005555565817d1 in scale_internal ()
#3  0x00005555565827d9 in ff_sws_slice_worker ()
#4  0x000055555662b06e in thread_worker ()
#5  0x00007ffff75fc6db in start_thread (arg=0x7fffaffef700) at pthread_create.c:463
#6  0x00007fffed12861f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
(gdb) disassemble $rip-32,$rip+32
Dump of assembler code from 0x55555658a0d6 to 0x55555658a116:
   0x000055555658a0d6 <ff_yuv2yuvX_sse3+86>:	std
   0x000055555658a0d7 <ff_yuv2yuvX_sse3+87>:	fldenv 0xf(%rsi)
   0x000055555658a0da <ff_yuv2yuvX_sse3+90>:	outsl  %ds:(%rsi),(%dx)
   0x000055555658a0db <ff_yuv2yuvX_sse3+91>:	sti
   0x000055555658a0dc <ff_yuv2yuvX_sse3+92>:	psraw  $0x4,%xmm7
   0x000055555658a0e1 <ff_yuv2yuvX_sse3+97>:	movdqa %xmm7,%xmm4
   0x000055555658a0e5 <ff_yuv2yuvX_sse3+101>:	movdqa %xmm7,%xmm3
   0x000055555658a0e9 <ff_yuv2yuvX_sse3+105>:	movdqa %xmm7,%xmm6
   0x000055555658a0ed <ff_yuv2yuvX_sse3+109>:	movdqa %xmm7,%xmm1
   0x000055555658a0f1 <ff_yuv2yuvX_sse3+113>:	movddup 0x8(%rsi),%xmm0
=> 0x000055555658a0f6 <ff_yuv2yuvX_sse3+118>:	movdqa (%rdx,%rax,2),%xmm2
   0x000055555658a0fb <ff_yuv2yuvX_sse3+123>:	pmulhw %xmm0,%xmm2
   0x000055555658a0ff <ff_yuv2yuvX_sse3+127>:	movdqa 0x10(%rdx,%rax,2),%xmm5
   0x000055555658a105 <ff_yuv2yuvX_sse3+133>:	pmulhw %xmm0,%xmm5
   0x000055555658a109 <ff_yuv2yuvX_sse3+137>:	paddw  %xmm2,%xmm3
   0x000055555658a10d <ff_yuv2yuvX_sse3+141>:	paddw  %xmm5,%xmm4
   0x000055555658a111 <ff_yuv2yuvX_sse3+145>:	movdqa 0x20(%rdx,%rax,2),%xmm2
End of assembler dump.
(gdb) info all-registers
rax            0x12	18
rbx            0x32	50
rcx            0x555557915480	93825029723264
rdx            0x555557687680	93825027044992
rsi            0x555557666658	93825026909784
rdi            0x555557666658	93825026909784
rbp            0x55555765b880	0x55555765b880
rsp            0x7fffaffee7a8	0x7fffaffee7a8
r8             0x20	32
r9             0x32	50
r10            0x555556589860	93825009227872
r11            0x5555576f9dc0	93825027513792
r12            0x55555763b280	93825026732672
r13            0x555557666658	93825026909784
r14            0x5555577b5800	93825028282368
r15            0x555557622640	93825026631232
rip            0x55555658a0f6	0x55555658a0f6 <ff_yuv2yuvX_sse3+118>
eflags         0x10297	[ CF PF AF SF IF RF ]
cs             0x33	51
ss             0x2b	43
ds             0x0	0
es             0x0	0
fs             0x0	0
gs             0x0	0
st0            0	(raw 0x00000000000000000000)
st1            0	(raw 0x00000000000000000000)
st2            0	(raw 0x00000000000000000000)
st3            0	(raw 0x00000000000000000000)
st4            0	(raw 0x00000000000000000000)
st5            0	(raw 0x00000000000000000000)
st6            0	(raw 0x00000000000000000000)
st7            0	(raw 0x00000000000000000000)
fctrl          0xffff	65535
fstat          0xffff	65535
ftag           0xaaaa	43690
fiseg          0x1	1
fioff          0x0	0
foseg          0x5646	22086
fooff          0xa	10
fop            0x7ff	2047
mxcsr          0x1fa8	[ OE PE IM DM ZM OM UM PM ]


> 
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 52423a1199..71434f58d3 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \
>                             const int16_t **src, uint8_t *dest, int dstW, \
>                             const uint8_t *dither, int offset) \
>  { \
> -    int remainder = (dstW % step); \
> -    int pixelsProcessed = dstW - remainder; \
>      if(((uintptr_t)dest) & 15){ \
>          yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
>          return; \
>      } \
> -    if(pixelsProcessed > 0) \
> -        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \
> -    if(remainder > 0){ \
> -      yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \
> -    } \
> +    if (dstW >= step) \
> +        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \
> +    else \
> +        yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \
>      return; \
>  }
>  
> diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
> index 57bfa09d66..ad0e8bd448 100644
> --- a/libswscale/x86/yuv2yuvX.asm
> +++ b/libswscale/x86/yuv2yuvX.asm
> @@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
>  %else
>      movq                 xm3, [ditherq]
>  %endif ; avx2
> +    mov                  ditherq, dstWq
> +    sub                  dstWq, mmsize * unroll
>  
>  %if cpuflag(avx512)
>      mova                 m15, [permutation]
> @@ -131,8 +133,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
>      add                  offsetq, mmsize * unroll
>      mov                  filterSizeq, filterq
>      cmp                  offsetq, dstWq
> -    jb                  .outerloop
> -    RET
> +    jb                   .outerloop
> +
> +    mov                  dstWq, offsetq
> +    mov                  offsetq, ditherq
> +    sub                  offsetq, mmsize * unroll
> +    cmp                  dstWq, ditherq
> +    jb                   .outerloop
> +    REP_RET
>  %endmacro
>  
>  INIT_MMX mmxext
> -- 
> 2.41.0.255.g8b1d071c50-goog
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Alan Kelly July 17, 2023, 9:29 a.m. UTC | #2
On Sat, Jul 15, 2023 at 10:40 PM Michael Niedermayer <michael@niedermayer.cc>
wrote:

> On Fri, Jul 14, 2023 at 12:08:46PM +0200, Alan Kelly wrote:
> > ---
> >  libswscale/x86/swscale.c    | 11 ++++-------
> >  libswscale/x86/yuv2yuvX.asm | 12 ++++++++++--
> >  2 files changed, 14 insertions(+), 9 deletions(-)
>
> seems to segfault with
>
> ./ffmpeg_g -i mm-short.mpg -an -vcodec snow -t 0.2 -bitexact -pix_fmt
> yuv410p -s 199x199 -vstrict -2 -y  snow3914-199-410.avi
>
> Thread 79 "ffmpeg_g" received signal SIGSEGV, Segmentation fault.
> [Switching to Thread 0x7fffaffef700 (LWP 23533)]
> 0x000055555658a0f6 in ff_yuv2yuvX_sse3 ()
> (gdb) bt
> #0  0x000055555658a0f6 in ff_yuv2yuvX_sse3 ()
> #1  0x0000555556585bc6 in chr_planar_vscale ()
> #2  0x00005555565817d1 in scale_internal ()
> #3  0x00005555565827d9 in ff_sws_slice_worker ()
> #4  0x000055555662b06e in thread_worker ()
> #5  0x00007ffff75fc6db in start_thread (arg=0x7fffaffef700) at
> pthread_create.c:463
> #6  0x00007fffed12861f in clone () at
> ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
> (gdb) disassemble $rip-32,$rip+32
> Dump of assembler code from 0x55555658a0d6 to 0x55555658a116:
>    0x000055555658a0d6 <ff_yuv2yuvX_sse3+86>:    std
>    0x000055555658a0d7 <ff_yuv2yuvX_sse3+87>:    fldenv 0xf(%rsi)
>    0x000055555658a0da <ff_yuv2yuvX_sse3+90>:    outsl  %ds:(%rsi),(%dx)
>    0x000055555658a0db <ff_yuv2yuvX_sse3+91>:    sti
>    0x000055555658a0dc <ff_yuv2yuvX_sse3+92>:    psraw  $0x4,%xmm7
>    0x000055555658a0e1 <ff_yuv2yuvX_sse3+97>:    movdqa %xmm7,%xmm4
>    0x000055555658a0e5 <ff_yuv2yuvX_sse3+101>:   movdqa %xmm7,%xmm3
>    0x000055555658a0e9 <ff_yuv2yuvX_sse3+105>:   movdqa %xmm7,%xmm6
>    0x000055555658a0ed <ff_yuv2yuvX_sse3+109>:   movdqa %xmm7,%xmm1
>    0x000055555658a0f1 <ff_yuv2yuvX_sse3+113>:   movddup 0x8(%rsi),%xmm0
> => 0x000055555658a0f6 <ff_yuv2yuvX_sse3+118>:   movdqa (%rdx,%rax,2),%xmm2
>    0x000055555658a0fb <ff_yuv2yuvX_sse3+123>:   pmulhw %xmm0,%xmm2
>    0x000055555658a0ff <ff_yuv2yuvX_sse3+127>:   movdqa
> 0x10(%rdx,%rax,2),%xmm5
>    0x000055555658a105 <ff_yuv2yuvX_sse3+133>:   pmulhw %xmm0,%xmm5
>    0x000055555658a109 <ff_yuv2yuvX_sse3+137>:   paddw  %xmm2,%xmm3
>    0x000055555658a10d <ff_yuv2yuvX_sse3+141>:   paddw  %xmm5,%xmm4
>    0x000055555658a111 <ff_yuv2yuvX_sse3+145>:   movdqa
> 0x20(%rdx,%rax,2),%xmm2
> End of assembler dump.
> (gdb) info all-registers
> rax            0x12     18
> rbx            0x32     50
> rcx            0x555557915480   93825029723264
> rdx            0x555557687680   93825027044992
> rsi            0x555557666658   93825026909784
> rdi            0x555557666658   93825026909784
> rbp            0x55555765b880   0x55555765b880
> rsp            0x7fffaffee7a8   0x7fffaffee7a8
> r8             0x20     32
> r9             0x32     50
> r10            0x555556589860   93825009227872
> r11            0x5555576f9dc0   93825027513792
> r12            0x55555763b280   93825026732672
> r13            0x555557666658   93825026909784
> r14            0x5555577b5800   93825028282368
> r15            0x555557622640   93825026631232
> rip            0x55555658a0f6   0x55555658a0f6 <ff_yuv2yuvX_sse3+118>
> eflags         0x10297  [ CF PF AF SF IF RF ]
> cs             0x33     51
> ss             0x2b     43
> ds             0x0      0
> es             0x0      0
> fs             0x0      0
> gs             0x0      0
> st0            0        (raw 0x00000000000000000000)
> st1            0        (raw 0x00000000000000000000)
> st2            0        (raw 0x00000000000000000000)
> st3            0        (raw 0x00000000000000000000)
> st4            0        (raw 0x00000000000000000000)
> st5            0        (raw 0x00000000000000000000)
> st6            0        (raw 0x00000000000000000000)
> st7            0        (raw 0x00000000000000000000)
> fctrl          0xffff   65535
> fstat          0xffff   65535
> ftag           0xaaaa   43690
> fiseg          0x1      1
> fioff          0x0      0
> foseg          0x5646   22086
> fooff          0xa      10
> fop            0x7ff    2047
> mxcsr          0x1fa8   [ OE PE IM DM ZM OM UM PM ]
>
>
> >
> > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> > index 52423a1199..71434f58d3 100644
> > --- a/libswscale/x86/swscale.c
> > +++ b/libswscale/x86/swscale.c
> > @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter,
> int filterSize, \
> >                             const int16_t **src, uint8_t *dest, int
> dstW, \
> >                             const uint8_t *dither, int offset) \
> >  { \
> > -    int remainder = (dstW % step); \
> > -    int pixelsProcessed = dstW - remainder; \
> >      if(((uintptr_t)dest) & 15){ \
> >          yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither,
> offset); \
> >          return; \
> >      } \
> > -    if(pixelsProcessed > 0) \
> > -        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset,
> pixelsProcessed + offset, dither, offset); \
> > -    if(remainder > 0){ \
> > -      yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither,
> offset); \
> > -    } \
> > +    if (dstW >= step) \
> > +        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset,
> dstW + offset, dither, offset); \
> > +    else \
> > +        yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither,
> offset); \
> >      return; \
> >  }
> >
> > diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
> > index 57bfa09d66..ad0e8bd448 100644
> > --- a/libswscale/x86/yuv2yuvX.asm
> > +++ b/libswscale/x86/yuv2yuvX.asm
> > @@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src,
> dest, dstW, dither, offset
> >  %else
> >      movq                 xm3, [ditherq]
> >  %endif ; avx2
> > +    mov                  ditherq, dstWq
> > +    sub                  dstWq, mmsize * unroll
> >
> >  %if cpuflag(avx512)
> >      mova                 m15, [permutation]
> > @@ -131,8 +133,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src,
> dest, dstW, dither, offset
> >      add                  offsetq, mmsize * unroll
> >      mov                  filterSizeq, filterq
> >      cmp                  offsetq, dstWq
> > -    jb                  .outerloop
> > -    RET
> > +    jb                   .outerloop
> > +
> > +    mov                  dstWq, offsetq
> > +    mov                  offsetq, ditherq
> > +    sub                  offsetq, mmsize * unroll
> > +    cmp                  dstWq, ditherq
> > +    jb                   .outerloop
> > +    REP_RET
> >  %endmacro
> >
> >  INIT_MMX mmxext
> > --
> > 2.41.0.255.g8b1d071c50-goog
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >
>
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Into a blind darkness they enter who follow after the Ignorance,
> they as if into a greater darkness enter who devote themselves
> to the Knowledge alone. -- Isha Upanishad
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

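Thanks, Michael, for looking at this. The crash is caused by an aligned load
from an unaligned address when jumping back into the inner loop to process
the tail: in the register dump above the tail offset is 18 (rax), so
movdqa (%rdx,%rax,2) reads from rdx + 36 bytes, which is not 16-byte aligned.

Changing this to an unaligned load fixes the issue. There doesn't seem to
be a performance hit from the unaligned load on a Skylake desktop.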

Benchmark Unaligned Aligned Ratio (Unaligned/Aligned)
yuv2yuvX_2_0_512_approximate_sse3 188.2 188.7 0.9973502915
yuv2yuvX_2_0_512_approximate_avx2 135.7 136.2 0.996328928
yuv2yuvX_2_0_512_approximate_avx512 97.7 98.2 0.9949083503
yuv2yuvX_2_16_512_approximate_sse3 192.2 187.7 1.023974427
yuv2yuvX_2_16_512_approximate_avx2 134.7 142.2 0.947257384
yuv2yuvX_2_16_512_approximate_avx512 97.2 100.7 0.9652432969
yuv2yuvX_2_32_512_approximate_sse3 182.7 182.7 1
yuv2yuvX_2_32_512_approximate_avx2 136.7 135.7 1.007369197
yuv2yuvX_2_32_512_approximate_avx512 95.7 98.2 0.9745417515
yuv2yuvX_2_48_512_approximate_sse3 181.2 178.7 1.013989927
yuv2yuvX_2_48_512_approximate_avx2 133.7 137.2 0.9744897959
yuv2yuvX_2_48_512_approximate_avx512 99.2 100.7 0.9851042701
yuv2yuvX_4_0_512_approximate_sse3 300.2 301.7 0.9950281737
yuv2yuvX_4_0_512_approximate_avx2 204.2 205.7 0.9927078269
yuv2yuvX_4_0_512_approximate_avx512 146.2 144.7 1.010366275
yuv2yuvX_4_16_512_approximate_sse3 300.2 308.2 0.9740428293
yuv2yuvX_4_16_512_approximate_avx2 206.7 206.7 1
yuv2yuvX_4_16_512_approximate_avx512 146.2 142.7 1.02452698
yuv2yuvX_4_32_512_approximate_sse3 277.2 285.7 0.9702485124
yuv2yuvX_4_32_512_approximate_avx2 210.7 207.2 1.016891892
yuv2yuvX_4_32_512_approximate_avx512 154.7 146.7 1.054533061
yuv2yuvX_4_48_512_approximate_sse3 283.7 284.7 0.9964875307
yuv2yuvX_4_48_512_approximate_avx2 209.2 215.7 0.969865554
yuv2yuvX_4_48_512_approximate_avx512 144.2 143.2 1.00698324
yuv2yuvX_8_0_512_approximate_sse3 535.2 537.7 0.9953505672
yuv2yuvX_8_0_512_approximate_avx2 361.7 355.2 1.01829955
yuv2yuvX_8_0_512_approximate_avx512 237.7 240.2 0.9895920067
yuv2yuvX_8_16_512_approximate_sse3 548.2 539.7 1.01574949
yuv2yuvX_8_16_512_approximate_avx2 358.7 358.2 1.001395868
yuv2yuvX_8_16_512_approximate_avx512 234.7 235.7 0.9957573186
yuv2yuvX_8_32_512_approximate_sse3 503.7 503.7 1
yuv2yuvX_8_32_512_approximate_avx2 362.2 354.2 1.02258611
yuv2yuvX_8_32_512_approximate_avx512 233.2 235.7 0.9893932966
yuv2yuvX_8_48_512_approximate_sse3 504.7 496.7 1.016106302
yuv2yuvX_8_48_512_approximate_avx2 356.2 366.2 0.9726925177
yuv2yuvX_8_48_512_approximate_avx512 236.7 239.2 0.989548495
yuv2yuvX_16_0_512_approximate_sse3 975.2 989.7 0.9853490957
yuv2yuvX_16_0_512_approximate_avx2 689.2 671.2 1.02681764
yuv2yuvX_16_0_512_approximate_avx512 427.7 425.2 1.005879586
yuv2yuvX_16_16_512_approximate_sse3 947.7 970.7 0.9763057587
yuv2yuvX_16_16_512_approximate_avx2 682.7 670.2 1.018651149
yuv2yuvX_16_16_512_approximate_avx512 439.2 427.7 1.026888006
yuv2yuvX_16_32_512_approximate_sse3 897.2 928.2 0.9666020254
yuv2yuvX_16_32_512_approximate_avx2 702.7 668.7 1.050844923
yuv2yuvX_16_32_512_approximate_avx512 452.2 434.2 1.04145555
yuv2yuvX_16_48_512_approximate_sse3 926.7 936.2 0.9898525956
yuv2yuvX_16_48_512_approximate_avx2 682.7 669.7 1.019411677
yuv2yuvX_16_48_512_approximate_avx512 435.7 457.7 0.9519335809
diff mbox series

Patch

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 52423a1199..71434f58d3 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -202,17 +202,14 @@  static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \
                            const int16_t **src, uint8_t *dest, int dstW, \
                            const uint8_t *dither, int offset) \
 { \
-    int remainder = (dstW % step); \
-    int pixelsProcessed = dstW - remainder; \
     if(((uintptr_t)dest) & 15){ \
         yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
         return; \
     } \
-    if(pixelsProcessed > 0) \
-        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \
-    if(remainder > 0){ \
-      yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \
-    } \
+    if (dstW >= step) \
+        ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \
+    else \
+        yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \
     return; \
 }
 
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 57bfa09d66..ad0e8bd448 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -54,6 +54,8 @@  cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 %else
     movq                 xm3, [ditherq]
 %endif ; avx2
+    mov                  ditherq, dstWq
+    sub                  dstWq, mmsize * unroll
 
 %if cpuflag(avx512)
     mova                 m15, [permutation]
@@ -131,8 +133,14 @@  cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
     add                  offsetq, mmsize * unroll
     mov                  filterSizeq, filterq
     cmp                  offsetq, dstWq
-    jb                  .outerloop
-    RET
+    jb                   .outerloop
+
+    mov                  dstWq, offsetq
+    mov                  offsetq, ditherq
+    sub                  offsetq, mmsize * unroll
+    cmp                  dstWq, ditherq
+    jb                   .outerloop
+    REP_RET
 %endmacro
 
 INIT_MMX mmxext