Message ID | 20230714100847.475017-1-alankelly@google.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Fri, Jul 14, 2023 at 12:08:46PM +0200, Alan Kelly wrote: > --- > libswscale/x86/swscale.c | 11 ++++------- > libswscale/x86/yuv2yuvX.asm | 12 ++++++++++-- > 2 files changed, 14 insertions(+), 9 deletions(-) seems to segfault with ./ffmpeg_g -i mm-short.mpg -an -vcodec snow -t 0.2 -bitexact -pix_fmt yuv410p -s 199x199 -vstrict -2 -y snow3914-199-410.avi Thread 79 "ffmpeg_g" received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7fffaffef700 (LWP 23533)] 0x000055555658a0f6 in ff_yuv2yuvX_sse3 () (gdb) bt #0 0x000055555658a0f6 in ff_yuv2yuvX_sse3 () #1 0x0000555556585bc6 in chr_planar_vscale () #2 0x00005555565817d1 in scale_internal () #3 0x00005555565827d9 in ff_sws_slice_worker () #4 0x000055555662b06e in thread_worker () #5 0x00007ffff75fc6db in start_thread (arg=0x7fffaffef700) at pthread_create.c:463 #6 0x00007fffed12861f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 (gdb) disassemble $rip-32,$rip+32 Dump of assembler code from 0x55555658a0d6 to 0x55555658a116: 0x000055555658a0d6 <ff_yuv2yuvX_sse3+86>: std 0x000055555658a0d7 <ff_yuv2yuvX_sse3+87>: fldenv 0xf(%rsi) 0x000055555658a0da <ff_yuv2yuvX_sse3+90>: outsl %ds:(%rsi),(%dx) 0x000055555658a0db <ff_yuv2yuvX_sse3+91>: sti 0x000055555658a0dc <ff_yuv2yuvX_sse3+92>: psraw $0x4,%xmm7 0x000055555658a0e1 <ff_yuv2yuvX_sse3+97>: movdqa %xmm7,%xmm4 0x000055555658a0e5 <ff_yuv2yuvX_sse3+101>: movdqa %xmm7,%xmm3 0x000055555658a0e9 <ff_yuv2yuvX_sse3+105>: movdqa %xmm7,%xmm6 0x000055555658a0ed <ff_yuv2yuvX_sse3+109>: movdqa %xmm7,%xmm1 0x000055555658a0f1 <ff_yuv2yuvX_sse3+113>: movddup 0x8(%rsi),%xmm0 => 0x000055555658a0f6 <ff_yuv2yuvX_sse3+118>: movdqa (%rdx,%rax,2),%xmm2 0x000055555658a0fb <ff_yuv2yuvX_sse3+123>: pmulhw %xmm0,%xmm2 0x000055555658a0ff <ff_yuv2yuvX_sse3+127>: movdqa 0x10(%rdx,%rax,2),%xmm5 0x000055555658a105 <ff_yuv2yuvX_sse3+133>: pmulhw %xmm0,%xmm5 0x000055555658a109 <ff_yuv2yuvX_sse3+137>: paddw %xmm2,%xmm3 0x000055555658a10d <ff_yuv2yuvX_sse3+141>: paddw %xmm5,%xmm4 
0x000055555658a111 <ff_yuv2yuvX_sse3+145>: movdqa 0x20(%rdx,%rax,2),%xmm2 End of assembler dump. (gdb) info all-registers rax 0x12 18 rbx 0x32 50 rcx 0x555557915480 93825029723264 rdx 0x555557687680 93825027044992 rsi 0x555557666658 93825026909784 rdi 0x555557666658 93825026909784 rbp 0x55555765b880 0x55555765b880 rsp 0x7fffaffee7a8 0x7fffaffee7a8 r8 0x20 32 r9 0x32 50 r10 0x555556589860 93825009227872 r11 0x5555576f9dc0 93825027513792 r12 0x55555763b280 93825026732672 r13 0x555557666658 93825026909784 r14 0x5555577b5800 93825028282368 r15 0x555557622640 93825026631232 rip 0x55555658a0f6 0x55555658a0f6 <ff_yuv2yuvX_sse3+118> eflags 0x10297 [ CF PF AF SF IF RF ] cs 0x33 51 ss 0x2b 43 ds 0x0 0 es 0x0 0 fs 0x0 0 gs 0x0 0 st0 0 (raw 0x00000000000000000000) st1 0 (raw 0x00000000000000000000) st2 0 (raw 0x00000000000000000000) st3 0 (raw 0x00000000000000000000) st4 0 (raw 0x00000000000000000000) st5 0 (raw 0x00000000000000000000) st6 0 (raw 0x00000000000000000000) st7 0 (raw 0x00000000000000000000) fctrl 0xffff 65535 fstat 0xffff 65535 ftag 0xaaaa 43690 fiseg 0x1 1 fioff 0x0 0 foseg 0x5646 22086 fooff 0xa 10 fop 0x7ff 2047 mxcsr 0x1fa8 [ OE PE IM DM ZM OM UM PM ] > > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 52423a1199..71434f58d3 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ > const int16_t **src, uint8_t *dest, int dstW, \ > const uint8_t *dither, int offset) \ > { \ > - int remainder = (dstW % step); \ > - int pixelsProcessed = dstW - remainder; \ > if(((uintptr_t)dest) & 15){ \ > yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \ > return; \ > } \ > - if(pixelsProcessed > 0) \ > - ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ > - if(remainder > 0){ \ > - yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ > - } \ > + 
if (dstW >= step) \ > + ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ > + else \ > + yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ > return; \ > } > > diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm > index 57bfa09d66..ad0e8bd448 100644 > --- a/libswscale/x86/yuv2yuvX.asm > +++ b/libswscale/x86/yuv2yuvX.asm > @@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset > %else > movq xm3, [ditherq] > %endif ; avx2 > + mov ditherq, dstWq > + sub dstWq, mmsize * unroll > > %if cpuflag(avx512) > mova m15, [permutation] > @@ -131,8 +133,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset > add offsetq, mmsize * unroll > mov filterSizeq, filterq > cmp offsetq, dstWq > - jb .outerloop > - RET > + jb .outerloop > + > + mov dstWq, offsetq > + mov offsetq, ditherq > + sub offsetq, mmsize * unroll > + cmp dstWq, ditherq > + jb .outerloop > + REP_RET > %endmacro > > INIT_MMX mmxext > -- > 2.41.0.255.g8b1d071c50-goog > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
On Sat, Jul 15, 2023 at 10:40 PM Michael Niedermayer <michael@niedermayer.cc> wrote: > On Fri, Jul 14, 2023 at 12:08:46PM +0200, Alan Kelly wrote: > > --- > > libswscale/x86/swscale.c | 11 ++++------- > > libswscale/x86/yuv2yuvX.asm | 12 ++++++++++-- > > 2 files changed, 14 insertions(+), 9 deletions(-) > > seems to segfault with > > ./ffmpeg_g -i mm-short.mpg -an -vcodec snow -t 0.2 -bitexact -pix_fmt > yuv410p -s 199x199 -vstrict -2 -y snow3914-199-410.avi > > Thread 79 "ffmpeg_g" received signal SIGSEGV, Segmentation fault. > [Switching to Thread 0x7fffaffef700 (LWP 23533)] > 0x000055555658a0f6 in ff_yuv2yuvX_sse3 () > (gdb) bt > #0 0x000055555658a0f6 in ff_yuv2yuvX_sse3 () > #1 0x0000555556585bc6 in chr_planar_vscale () > #2 0x00005555565817d1 in scale_internal () > #3 0x00005555565827d9 in ff_sws_slice_worker () > #4 0x000055555662b06e in thread_worker () > #5 0x00007ffff75fc6db in start_thread (arg=0x7fffaffef700) at > pthread_create.c:463 > #6 0x00007fffed12861f in clone () at > ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 > (gdb) disassemble $rip-32,$rip+32 > Dump of assembler code from 0x55555658a0d6 to 0x55555658a116: > 0x000055555658a0d6 <ff_yuv2yuvX_sse3+86>: std > 0x000055555658a0d7 <ff_yuv2yuvX_sse3+87>: fldenv 0xf(%rsi) > 0x000055555658a0da <ff_yuv2yuvX_sse3+90>: outsl %ds:(%rsi),(%dx) > 0x000055555658a0db <ff_yuv2yuvX_sse3+91>: sti > 0x000055555658a0dc <ff_yuv2yuvX_sse3+92>: psraw $0x4,%xmm7 > 0x000055555658a0e1 <ff_yuv2yuvX_sse3+97>: movdqa %xmm7,%xmm4 > 0x000055555658a0e5 <ff_yuv2yuvX_sse3+101>: movdqa %xmm7,%xmm3 > 0x000055555658a0e9 <ff_yuv2yuvX_sse3+105>: movdqa %xmm7,%xmm6 > 0x000055555658a0ed <ff_yuv2yuvX_sse3+109>: movdqa %xmm7,%xmm1 > 0x000055555658a0f1 <ff_yuv2yuvX_sse3+113>: movddup 0x8(%rsi),%xmm0 > => 0x000055555658a0f6 <ff_yuv2yuvX_sse3+118>: movdqa (%rdx,%rax,2),%xmm2 > 0x000055555658a0fb <ff_yuv2yuvX_sse3+123>: pmulhw %xmm0,%xmm2 > 0x000055555658a0ff <ff_yuv2yuvX_sse3+127>: movdqa > 0x10(%rdx,%rax,2),%xmm5 > 0x000055555658a105 
<ff_yuv2yuvX_sse3+133>: pmulhw %xmm0,%xmm5 > 0x000055555658a109 <ff_yuv2yuvX_sse3+137>: paddw %xmm2,%xmm3 > 0x000055555658a10d <ff_yuv2yuvX_sse3+141>: paddw %xmm5,%xmm4 > 0x000055555658a111 <ff_yuv2yuvX_sse3+145>: movdqa > 0x20(%rdx,%rax,2),%xmm2 > End of assembler dump. > (gdb) info all-registers > rax 0x12 18 > rbx 0x32 50 > rcx 0x555557915480 93825029723264 > rdx 0x555557687680 93825027044992 > rsi 0x555557666658 93825026909784 > rdi 0x555557666658 93825026909784 > rbp 0x55555765b880 0x55555765b880 > rsp 0x7fffaffee7a8 0x7fffaffee7a8 > r8 0x20 32 > r9 0x32 50 > r10 0x555556589860 93825009227872 > r11 0x5555576f9dc0 93825027513792 > r12 0x55555763b280 93825026732672 > r13 0x555557666658 93825026909784 > r14 0x5555577b5800 93825028282368 > r15 0x555557622640 93825026631232 > rip 0x55555658a0f6 0x55555658a0f6 <ff_yuv2yuvX_sse3+118> > eflags 0x10297 [ CF PF AF SF IF RF ] > cs 0x33 51 > ss 0x2b 43 > ds 0x0 0 > es 0x0 0 > fs 0x0 0 > gs 0x0 0 > st0 0 (raw 0x00000000000000000000) > st1 0 (raw 0x00000000000000000000) > st2 0 (raw 0x00000000000000000000) > st3 0 (raw 0x00000000000000000000) > st4 0 (raw 0x00000000000000000000) > st5 0 (raw 0x00000000000000000000) > st6 0 (raw 0x00000000000000000000) > st7 0 (raw 0x00000000000000000000) > fctrl 0xffff 65535 > fstat 0xffff 65535 > ftag 0xaaaa 43690 > fiseg 0x1 1 > fioff 0x0 0 > foseg 0x5646 22086 > fooff 0xa 10 > fop 0x7ff 2047 > mxcsr 0x1fa8 [ OE PE IM DM ZM OM UM PM ] > > > > > > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > > index 52423a1199..71434f58d3 100644 > > --- a/libswscale/x86/swscale.c > > +++ b/libswscale/x86/swscale.c > > @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, > int filterSize, \ > > const int16_t **src, uint8_t *dest, int > dstW, \ > > const uint8_t *dither, int offset) \ > > { \ > > - int remainder = (dstW % step); \ > > - int pixelsProcessed = dstW - remainder; \ > > if(((uintptr_t)dest) & 15){ \ > > yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, 
dither, > offset); \ > > return; \ > > } \ > > - if(pixelsProcessed > 0) \ > > - ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, > pixelsProcessed + offset, dither, offset); \ > > - if(remainder > 0){ \ > > - yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, > offset); \ > > - } \ > > + if (dstW >= step) \ > > + ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, > dstW + offset, dither, offset); \ > > + else \ > > + yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, > offset); \ > > return; \ > > } > > > > diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm > > index 57bfa09d66..ad0e8bd448 100644 > > --- a/libswscale/x86/yuv2yuvX.asm > > +++ b/libswscale/x86/yuv2yuvX.asm > > @@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, > dest, dstW, dither, offset > > %else > > movq xm3, [ditherq] > > %endif ; avx2 > > + mov ditherq, dstWq > > + sub dstWq, mmsize * unroll > > > > %if cpuflag(avx512) > > mova m15, [permutation] > > @@ -131,8 +133,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, > dest, dstW, dither, offset > > add offsetq, mmsize * unroll > > mov filterSizeq, filterq > > cmp offsetq, dstWq > > - jb .outerloop > > - RET > > + jb .outerloop > > + > > + mov dstWq, offsetq > > + mov offsetq, ditherq > > + sub offsetq, mmsize * unroll > > + cmp dstWq, ditherq > > + jb .outerloop > > + REP_RET > > %endmacro > > > > INIT_MMX mmxext > > -- > > 2.41.0.255.g8b1d071c50-goog > > > > _______________________________________________ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > To unsubscribe, visit link above, or email > > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". 
> > > > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > Into a blind darkness they enter who follow after the Ignorance, > they as if into a greater darkness enter who devote themselves > to the Knowledge alone. -- Isha Upanishad > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > Thanks, Michael, for looking at this. The crash appears to be caused by an aligned load (the movdqa shown in the disassembly) from an unaligned address when jumping back into the inner loop to process the tail. Changing this to an unaligned load fixes the issue. There doesn't seem to be a performance hit from the unaligned load on a Skylake desktop. Benchmark results (columns: unaligned load, aligned load, unaligned/aligned ratio): yuv2yuvX_2_0_512_approximate_sse3 188.2 188.7 0.9973502915 yuv2yuvX_2_0_512_approximate_avx2 135.7 136.2 0.996328928 yuv2yuvX_2_0_512_approximate_avx512 97.7 98.2 0.9949083503 yuv2yuvX_2_16_512_approximate_sse3 192.2 187.7 1.023974427 yuv2yuvX_2_16_512_approximate_avx2 134.7 142.2 0.947257384 yuv2yuvX_2_16_512_approximate_avx512 97.2 100.7 0.9652432969 yuv2yuvX_2_32_512_approximate_sse3 182.7 182.7 1 yuv2yuvX_2_32_512_approximate_avx2 136.7 135.7 1.007369197 yuv2yuvX_2_32_512_approximate_avx512 95.7 98.2 0.9745417515 yuv2yuvX_2_48_512_approximate_sse3 181.2 178.7 1.013989927 yuv2yuvX_2_48_512_approximate_avx2 133.7 137.2 0.9744897959 yuv2yuvX_2_48_512_approximate_avx512 99.2 100.7 0.9851042701 yuv2yuvX_4_0_512_approximate_sse3 300.2 301.7 0.9950281737 yuv2yuvX_4_0_512_approximate_avx2 204.2 205.7 0.9927078269 yuv2yuvX_4_0_512_approximate_avx512 146.2 144.7 1.010366275 yuv2yuvX_4_16_512_approximate_sse3 300.2 308.2 0.9740428293 yuv2yuvX_4_16_512_approximate_avx2 206.7 206.7 1 yuv2yuvX_4_16_512_approximate_avx512 146.2 142.7 1.02452698 yuv2yuvX_4_32_512_approximate_sse3 277.2 285.7 0.9702485124 yuv2yuvX_4_32_512_approximate_avx2 210.7 207.2
1.016891892 yuv2yuvX_4_32_512_approximate_avx512 154.7 146.7 1.054533061 yuv2yuvX_4_48_512_approximate_sse3 283.7 284.7 0.9964875307 yuv2yuvX_4_48_512_approximate_avx2 209.2 215.7 0.969865554 yuv2yuvX_4_48_512_approximate_avx512 144.2 143.2 1.00698324 yuv2yuvX_8_0_512_approximate_sse3 535.2 537.7 0.9953505672 yuv2yuvX_8_0_512_approximate_avx2 361.7 355.2 1.01829955 yuv2yuvX_8_0_512_approximate_avx512 237.7 240.2 0.9895920067 yuv2yuvX_8_16_512_approximate_sse3 548.2 539.7 1.01574949 yuv2yuvX_8_16_512_approximate_avx2 358.7 358.2 1.001395868 yuv2yuvX_8_16_512_approximate_avx512 234.7 235.7 0.9957573186 yuv2yuvX_8_32_512_approximate_sse3 503.7 503.7 1 yuv2yuvX_8_32_512_approximate_avx2 362.2 354.2 1.02258611 yuv2yuvX_8_32_512_approximate_avx512 233.2 235.7 0.9893932966 yuv2yuvX_8_48_512_approximate_sse3 504.7 496.7 1.016106302 yuv2yuvX_8_48_512_approximate_avx2 356.2 366.2 0.9726925177 yuv2yuvX_8_48_512_approximate_avx512 236.7 239.2 0.989548495 yuv2yuvX_16_0_512_approximate_sse3 975.2 989.7 0.9853490957 yuv2yuvX_16_0_512_approximate_avx2 689.2 671.2 1.02681764 yuv2yuvX_16_0_512_approximate_avx512 427.7 425.2 1.005879586 yuv2yuvX_16_16_512_approximate_sse3 947.7 970.7 0.9763057587 yuv2yuvX_16_16_512_approximate_avx2 682.7 670.2 1.018651149 yuv2yuvX_16_16_512_approximate_avx512 439.2 427.7 1.026888006 yuv2yuvX_16_32_512_approximate_sse3 897.2 928.2 0.9666020254 yuv2yuvX_16_32_512_approximate_avx2 702.7 668.7 1.050844923 yuv2yuvX_16_32_512_approximate_avx512 452.2 434.2 1.04145555 yuv2yuvX_16_48_512_approximate_sse3 926.7 936.2 0.9898525956 yuv2yuvX_16_48_512_approximate_avx2 682.7 669.7 1.019411677 yuv2yuvX_16_48_512_approximate_avx512 435.7 457.7 0.9519335809
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 52423a1199..71434f58d3 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ const uint8_t *dither, int offset) \ { \ - int remainder = (dstW % step); \ - int pixelsProcessed = dstW - remainder; \ if(((uintptr_t)dest) & 15){ \ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ - if(pixelsProcessed > 0) \ - ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ - if(remainder > 0){ \ - yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ - } \ + if (dstW >= step) \ + ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ + else \ + yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 57bfa09d66..ad0e8bd448 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %else movq xm3, [ditherq] %endif ; avx2 + mov ditherq, dstWq + sub dstWq, mmsize * unroll %if cpuflag(avx512) mova m15, [permutation] @@ -131,8 +133,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq - jb .outerloop - RET + jb .outerloop + + mov dstWq, offsetq + mov offsetq, ditherq + sub offsetq, mmsize * unroll + cmp dstWq, ditherq + jb .outerloop + REP_RET %endmacro INIT_MMX mmxext