diff mbox series

[FFmpeg-devel,1/3] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

Message ID 20210401100017.2863838-1-alankelly@google.com
State Accepted
Headers show
Series [FFmpeg-devel,1/3] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Alan Kelly April 1, 2021, 10 a.m. UTC
---
 This restores support for inputs of size 8, which the original
 implementation handled. With the loop unrolled for mmx and mmxext, a
 bug was found for inputs whose size is not divisible by 16.
 libswscale/x86/yuv2yuvX.asm | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

Comments

Michael Niedermayer April 1, 2021, 6:12 p.m. UTC | #1
On Thu, Apr 01, 2021 at 12:00:15PM +0200, Alan Kelly wrote:
> ---
>  This is so that inputs of size 8 are supported, as was the case with
>  the original implementation. A bug was found with inputs not divisible
>  by 16.
>  libswscale/x86/yuv2yuvX.asm | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)

will apply patchset

thx

[...]
diff mbox series

Patch

diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 521880dabe..b6294cb919 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -37,8 +37,10 @@  SECTION .text
 cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 %if notcpuflag(sse3)
 %define movr mova
+%define unroll 1
 %else
 %define movr movdqu
+%define unroll 2
 %endif
     movsxdifnidn         dstWq, dstWd
     movsxdifnidn         offsetq, offsetd
@@ -70,8 +72,10 @@  cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 .outerloop:
     mova                 m4, m7
     mova                 m3, m7
+%if cpuflag(sse3)
     mova                 m6, m7
     mova                 m1, m7
+%endif
 .loop:
 %if cpuflag(avx2)
     vpbroadcastq         m0, [filterSizeq + 8]
@@ -84,28 +88,36 @@  cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
     pmulhw               m5, m0, [srcq + offsetq * 2 + mmsize]
     paddw                m3, m3, m2
     paddw                m4, m4, m5
+%if cpuflag(sse3)
     pmulhw               m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
     pmulhw               m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
     paddw                m6, m6, m2
     paddw                m1, m1, m5
+%endif
     add                  filterSizeq, $10
     mov                  srcq, [filterSizeq]
     test                 srcq, srcq
     jnz                  .loop
     psraw                m3, m3, 3
     psraw                m4, m4, 3
+%if cpuflag(sse3)
     psraw                m6, m6, 3
     psraw                m1, m1, 3
+%endif
     packuswb             m3, m3, m4
+%if cpuflag(sse3)
     packuswb             m6, m6, m1
+%endif
     mov                  srcq, [filterq]
 %if cpuflag(avx2)
     vpermq               m3, m3, 216
     vpermq               m6, m6, 216
 %endif
     movr                 [destq + offsetq], m3
+%if cpuflag(sse3)
     movr                 [destq + offsetq + mmsize], m6
-    add                  offsetq, mmsize * 2
+%endif
+    add                  offsetq, mmsize * unroll
     mov                  filterSizeq, filterq
     cmp                  offsetq, dstWq
     jb                  .outerloop