diff mbox series

[FFmpeg-devel] Unrolls main loop of yuv2yuvX_sse3 and general code tidying for ~20% speedup

Message ID 20200915161158.2051301-1-alankelly@google.com
State New
Headers show
Series [FFmpeg-devel] Unrolls main loop of yuv2yuvX_sse3 and general code tidying for ~20% speedup
Related show

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Alan Kelly Sept. 15, 2020, 4:11 p.m. UTC
---
 libswscale/x86/swscale.c | 138 ++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 66 deletions(-)

Comments

Michael Niedermayer Sept. 15, 2020, 10:39 p.m. UTC | #1
On Tue, Sep 15, 2020 at 06:11:58PM +0200, Alan Kelly wrote:
> ---
>  libswscale/x86/swscale.c | 138 ++++++++++++++++++++-------------------
>  1 file changed, 72 insertions(+), 66 deletions(-)
> 
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 3160fedf04..e47fee2bbd 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
>                             const int16_t **src, uint8_t *dest, int dstW,
>                             const uint8_t *dither, int offset)
>  {
> -    if(((uintptr_t)dest) & 15){
> +    if(((uintptr_t)dest) & 31){
>          yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
>          return;
>      }
>      filterSize--;
> -#define MAIN_FUNCTION \
> -        "pxor       %%xmm0, %%xmm0 \n\t" \
> -        "punpcklbw  %%xmm0, %%xmm3 \n\t" \
> -        "movd           %4, %%xmm1 \n\t" \
> -        "punpcklwd  %%xmm1, %%xmm1 \n\t" \
> -        "punpckldq  %%xmm1, %%xmm1 \n\t" \
> -        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
> -        "psllw          $3, %%xmm1 \n\t" \
> -        "paddw      %%xmm1, %%xmm3 \n\t" \
> -        "psraw          $4, %%xmm3 \n\t" \
> -        "movdqa     %%xmm3, %%xmm4 \n\t" \
> -        "movdqa     %%xmm3, %%xmm7 \n\t" \
> -        "movl           %3, %%ecx  \n\t" \
> -        "mov                                 %0, %%"FF_REG_d"        \n\t"\
> -        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
> -        ".p2align                             4             \n\t" /* FIXME Unroll? */\
> -        "1:                                                 \n\t"\
> -        "movddup                  8(%%"FF_REG_d"), %%xmm0   \n\t" /* filterCoeff */\
> -        "movdqa              (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
> -        "movdqa            16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
> -        "add                                $16, %%"FF_REG_d"        \n\t"\
> -        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
> -        "test                         %%"FF_REG_S", %%"FF_REG_S"     \n\t"\
> -        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
> -        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
> -        "paddw                            %%xmm2, %%xmm3      \n\t"\
> -        "paddw                            %%xmm5, %%xmm4      \n\t"\
> -        " jnz                                1b             \n\t"\
> -        "psraw                               $3, %%xmm3      \n\t"\
> -        "psraw                               $3, %%xmm4      \n\t"\
> -        "packuswb                         %%xmm4, %%xmm3      \n\t"\
> -        "movntdq                          %%xmm3, (%1, %%"FF_REG_c") \n\t"\
> -        "add                         $16, %%"FF_REG_c"        \n\t"\
> -        "cmp                          %2, %%"FF_REG_c"        \n\t"\
> -        "movdqa                   %%xmm7, %%xmm3            \n\t" \
> -        "movdqa                   %%xmm7, %%xmm4            \n\t" \
> -        "mov                                 %0, %%"FF_REG_d"        \n\t"\
> -        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
> -        "jb                                  1b             \n\t"
> -
> -    if (offset) {
> -        __asm__ volatile(
> -            "movq          %5, %%xmm3  \n\t"
> -            "movdqa    %%xmm3, %%xmm4  \n\t"
> -            "psrlq        $24, %%xmm3  \n\t"
> -            "psllq        $40, %%xmm4  \n\t"
> -            "por       %%xmm4, %%xmm3  \n\t"
> -            MAIN_FUNCTION
> -              :: "g" (filter),
> -              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> -              "m"(filterSize), "m"(((uint64_t *) dither)[0])
> -              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
> -                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> -              );
> -    } else {
> -        __asm__ volatile(
> -            "movq          %5, %%xmm3   \n\t"
> -            MAIN_FUNCTION
> -              :: "g" (filter),
> -              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> -              "m"(filterSize), "m"(((uint64_t *) dither)[0])
> -              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
> -                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> -              );
> -    }
> +    __asm__ volatile(
> +        "vmovq                    %5, %%xmm3            \n\t"
> +        "cmpl                     $0, %3                \n\t"
> +        "jz                       2f                    \n\t"
> +
> +        "# offset != 0 path.                            \n\t"
> +        "vpsrlq                  $24, %%xmm3, %%xmm5    \n\t"
> +        "vpsllq                  $40, %%xmm3, %%xmm3    \n\t"
> +        "vpor                 %%xmm3, %%xmm5, %%xmm3    \n\t"
> +
> +        "2:                                             \n\t"
> +        "vpxor                %%xmm0, %%xmm0, %%xmm0    \n\t"
> +        "mov                    (%0), %%"FF_REG_S"      \n\t"
> +        "vpunpcklbw           %%xmm0, %%xmm3, %%xmm3    \n\t"
> +        "vpbroadcastw             %4, %%xmm1            \n\t"
> +        "vpsllw                   $3, %%xmm1, %%xmm1    \n\t"
> +        "mov                      %0, %%"FF_REG_d"      \n\t"
> +        "vpaddw               %%xmm1, %%xmm3, %%xmm3    \n\t"
> +        "vpsraw                   $4, %%xmm3, %%xmm3    \n\t"
> +        "vmovdqa              %%xmm3, %%xmm4            \n\t"
> +        "vmovdqa              %%xmm3, %%xmm7            \n\t"
> +        "vmovdqa              %%xmm3, %%xmm9            \n\t"
> +        "vmovdqa              %%xmm3, %%xmm10            \n\t"
> +        "movl                     %3, %%ecx             \n\t"
> +
> +        ".p2align                  4                    \n\t"
> +        "1:                                             \n\t"
> +        "vpbroadcastq 8(%%"FF_REG_d"), %%xmm0           \n\t" /* filterCoeff */
> +        "vmovdqa       (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */
> +        "vmovdqa     16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */
> +        "vmovdqa     32(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm11 \n\t" /* srcData */
> +        "vmovdqa     48(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm12 \n\t" /* srcData */
> +        "add                     $16, %%"FF_REG_d"      \n\t"
> +        "mov          (%%"FF_REG_d"), %%"FF_REG_S"      \n\t"
> +        "vpmulhw              %%xmm0, %%xmm2, %%xmm2    \n\t"
> +        "vpmulhw              %%xmm0, %%xmm5, %%xmm5    \n\t"
> +        "vpmulhw              %%xmm0, %%xmm11, %%xmm11    \n\t"
> +        "vpmulhw              %%xmm0, %%xmm12, %%xmm12    \n\t"
> +        "vpaddw               %%xmm2, %%xmm3, %%xmm3    \n\t"
> +        "vpaddw               %%xmm5, %%xmm4, %%xmm4    \n\t"
> +        "vpaddw               %%xmm11, %%xmm9, %%xmm9    \n\t"
> +        "vpaddw               %%xmm12, %%xmm10, %%xmm10    \n\t"
> +        "test           %%"FF_REG_S", %%"FF_REG_S"      \n\t"
> +        "jnz                      1b                    \n\t"
> +
> +        "vpsraw                   $3, %%xmm3, %%xmm3    \n\t"
> +        "vpsraw                   $3, %%xmm4, %%xmm4    \n\t"
> +        "vpsraw                   $3, %%xmm9, %%xmm9    \n\t"
> +        "vpsraw                   $3, %%xmm10, %%xmm10    \n\t"
> +        "vpackuswb            %%xmm4, %%xmm3, %%xmm3    \n\t"
> +        "vpackuswb            %%xmm10, %%xmm9, %%xmm9    \n\t"
> +        "mov                    (%0), %%"FF_REG_S"      \n\t"
> +        "vmovntdq              %%xmm3, (%1, %%"FF_REG_c")\n\t"
> +        "vmovntdq              %%xmm9, 16(%1, %%"FF_REG_c")\n\t"
> +        "add                     $32, %%"FF_REG_c"      \n\t"
> +        "vmovdqa              %%xmm7, %%xmm3            \n\t"
> +        "vmovdqa              %%xmm7, %%xmm4            \n\t"
> +        "vmovdqa              %%xmm7, %%xmm9            \n\t"
> +        "vmovdqa              %%xmm7, %%xmm10            \n\t"
> +        "mov                      %0, %%"FF_REG_d"      \n\t"
> +        "cmp                      %2, %%"FF_REG_c"      \n\t"
> +        "jb                       1b                    \n\t"
> +
> +        :
> +        : "r" (filter),
> +          "r" (dest-offset), "r" ((int64_t)(dstW+offset)), "m" (offset),
> +          "m"(filterSize), "m"(((uint64_t *) dither)[0])
> +        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" ,
> +                       "%xmm5" , "%xmm7" , "%xmm9", "%xmm10" , "%xmm11" , "%xmm12" ,)
> +          "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> +    );
>  }
>  #endif
>  

This breaks the build on x86-32. Also, new asm should probably be written for nasm/yasm rather than as GCC inline asm.

thx

[...]
diff mbox series

Patch

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..e47fee2bbd 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -201,76 +201,82 @@  static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
                            const int16_t **src, uint8_t *dest, int dstW,
                            const uint8_t *dither, int offset)
 {
-    if(((uintptr_t)dest) & 15){
+    if(((uintptr_t)dest) & 31){
         yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
         return;
     }
     filterSize--;
-#define MAIN_FUNCTION \
-        "pxor       %%xmm0, %%xmm0 \n\t" \
-        "punpcklbw  %%xmm0, %%xmm3 \n\t" \
-        "movd           %4, %%xmm1 \n\t" \
-        "punpcklwd  %%xmm1, %%xmm1 \n\t" \
-        "punpckldq  %%xmm1, %%xmm1 \n\t" \
-        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
-        "psllw          $3, %%xmm1 \n\t" \
-        "paddw      %%xmm1, %%xmm3 \n\t" \
-        "psraw          $4, %%xmm3 \n\t" \
-        "movdqa     %%xmm3, %%xmm4 \n\t" \
-        "movdqa     %%xmm3, %%xmm7 \n\t" \
-        "movl           %3, %%ecx  \n\t" \
-        "mov                                 %0, %%"FF_REG_d"        \n\t"\
-        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
-        ".p2align                             4             \n\t" /* FIXME Unroll? */\
-        "1:                                                 \n\t"\
-        "movddup                  8(%%"FF_REG_d"), %%xmm0   \n\t" /* filterCoeff */\
-        "movdqa              (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
-        "movdqa            16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
-        "add                                $16, %%"FF_REG_d"        \n\t"\
-        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
-        "test                         %%"FF_REG_S", %%"FF_REG_S"     \n\t"\
-        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
-        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
-        "paddw                            %%xmm2, %%xmm3      \n\t"\
-        "paddw                            %%xmm5, %%xmm4      \n\t"\
-        " jnz                                1b             \n\t"\
-        "psraw                               $3, %%xmm3      \n\t"\
-        "psraw                               $3, %%xmm4      \n\t"\
-        "packuswb                         %%xmm4, %%xmm3      \n\t"\
-        "movntdq                          %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-        "add                         $16, %%"FF_REG_c"        \n\t"\
-        "cmp                          %2, %%"FF_REG_c"        \n\t"\
-        "movdqa                   %%xmm7, %%xmm3            \n\t" \
-        "movdqa                   %%xmm7, %%xmm4            \n\t" \
-        "mov                                 %0, %%"FF_REG_d"        \n\t"\
-        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
-        "jb                                  1b             \n\t"
-
-    if (offset) {
-        __asm__ volatile(
-            "movq          %5, %%xmm3  \n\t"
-            "movdqa    %%xmm3, %%xmm4  \n\t"
-            "psrlq        $24, %%xmm3  \n\t"
-            "psllq        $40, %%xmm4  \n\t"
-            "por       %%xmm4, %%xmm3  \n\t"
-            MAIN_FUNCTION
-              :: "g" (filter),
-              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-              "m"(filterSize), "m"(((uint64_t *) dither)[0])
-              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
-                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-              );
-    } else {
-        __asm__ volatile(
-            "movq          %5, %%xmm3   \n\t"
-            MAIN_FUNCTION
-              :: "g" (filter),
-              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-              "m"(filterSize), "m"(((uint64_t *) dither)[0])
-              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
-                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-              );
-    }
+    __asm__ volatile(
+        "vmovq                    %5, %%xmm3            \n\t"
+        "cmpl                     $0, %3                \n\t"
+        "jz                       2f                    \n\t"
+
+        "# offset != 0 path.                            \n\t"
+        "vpsrlq                  $24, %%xmm3, %%xmm5    \n\t"
+        "vpsllq                  $40, %%xmm3, %%xmm3    \n\t"
+        "vpor                 %%xmm3, %%xmm5, %%xmm3    \n\t"
+
+        "2:                                             \n\t"
+        "vpxor                %%xmm0, %%xmm0, %%xmm0    \n\t"
+        "mov                    (%0), %%"FF_REG_S"      \n\t"
+        "vpunpcklbw           %%xmm0, %%xmm3, %%xmm3    \n\t"
+        "vpbroadcastw             %4, %%xmm1            \n\t"
+        "vpsllw                   $3, %%xmm1, %%xmm1    \n\t"
+        "mov                      %0, %%"FF_REG_d"      \n\t"
+        "vpaddw               %%xmm1, %%xmm3, %%xmm3    \n\t"
+        "vpsraw                   $4, %%xmm3, %%xmm3    \n\t"
+        "vmovdqa              %%xmm3, %%xmm4            \n\t"
+        "vmovdqa              %%xmm3, %%xmm7            \n\t"
+        "vmovdqa              %%xmm3, %%xmm9            \n\t"
+        "vmovdqa              %%xmm3, %%xmm10            \n\t"
+        "movl                     %3, %%ecx             \n\t"
+
+        ".p2align                  4                    \n\t"
+        "1:                                             \n\t"
+        "vpbroadcastq 8(%%"FF_REG_d"), %%xmm0           \n\t" /* filterCoeff */
+        "vmovdqa       (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */
+        "vmovdqa     16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */
+        "vmovdqa     32(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm11 \n\t" /* srcData */
+        "vmovdqa     48(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm12 \n\t" /* srcData */
+        "add                     $16, %%"FF_REG_d"      \n\t"
+        "mov          (%%"FF_REG_d"), %%"FF_REG_S"      \n\t"
+        "vpmulhw              %%xmm0, %%xmm2, %%xmm2    \n\t"
+        "vpmulhw              %%xmm0, %%xmm5, %%xmm5    \n\t"
+        "vpmulhw              %%xmm0, %%xmm11, %%xmm11    \n\t"
+        "vpmulhw              %%xmm0, %%xmm12, %%xmm12    \n\t"
+        "vpaddw               %%xmm2, %%xmm3, %%xmm3    \n\t"
+        "vpaddw               %%xmm5, %%xmm4, %%xmm4    \n\t"
+        "vpaddw               %%xmm11, %%xmm9, %%xmm9    \n\t"
+        "vpaddw               %%xmm12, %%xmm10, %%xmm10    \n\t"
+        "test           %%"FF_REG_S", %%"FF_REG_S"      \n\t"
+        "jnz                      1b                    \n\t"
+
+        "vpsraw                   $3, %%xmm3, %%xmm3    \n\t"
+        "vpsraw                   $3, %%xmm4, %%xmm4    \n\t"
+        "vpsraw                   $3, %%xmm9, %%xmm9    \n\t"
+        "vpsraw                   $3, %%xmm10, %%xmm10    \n\t"
+        "vpackuswb            %%xmm4, %%xmm3, %%xmm3    \n\t"
+        "vpackuswb            %%xmm10, %%xmm9, %%xmm9    \n\t"
+        "mov                    (%0), %%"FF_REG_S"      \n\t"
+        "vmovntdq              %%xmm3, (%1, %%"FF_REG_c")\n\t"
+        "vmovntdq              %%xmm9, 16(%1, %%"FF_REG_c")\n\t"
+        "add                     $32, %%"FF_REG_c"      \n\t"
+        "vmovdqa              %%xmm7, %%xmm3            \n\t"
+        "vmovdqa              %%xmm7, %%xmm4            \n\t"
+        "vmovdqa              %%xmm7, %%xmm9            \n\t"
+        "vmovdqa              %%xmm7, %%xmm10            \n\t"
+        "mov                      %0, %%"FF_REG_d"      \n\t"
+        "cmp                      %2, %%"FF_REG_c"      \n\t"
+        "jb                       1b                    \n\t"
+
+        :
+        : "r" (filter),
+          "r" (dest-offset), "r" ((int64_t)(dstW+offset)), "m" (offset),
+          "m"(filterSize), "m"(((uint64_t *) dither)[0])
+        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" ,
+                       "%xmm5" , "%xmm7" , "%xmm9", "%xmm10" , "%xmm11" , "%xmm12" ,)
+          "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
+    );
 }
 #endif