diff mbox series

[FFmpeg-devel] avcodec/vp9: ipred_vl_16x16_16 avx2 implementation

Message ID 20220504125705.2387-1-sinonim147@gmail.com
State Accepted
Commit e71d5156c8fec67a7198a0032262036ae7d46bcd
Headers show
Series [FFmpeg-devel] avcodec/vp9: ipred_vl_16x16_16 avx2 implementation | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

Sam Blackriver May 4, 2022, 12:57 p.m. UTC
From: Semen Belozerov <sinonim147@gmail.com>

---
 libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 51 +++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

Comments

Sam Blackriver May 11, 2022, 4:18 a.m. UTC | #1
Ср, 4 мая 2022 г. в 7:57 PM, FacelessLake <blackriver741@gmail.com>:

> From: Semen Belozerov <sinonim147@gmail.com>
>
> ---
>  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
>  libavcodec/x86/vp9intrapred_16bpp.asm | 51 +++++++++++++++++++++++++++
>  2 files changed, 53 insertions(+)
>
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c
> b/libavcodec/x86/vp9dsp_init_16bpp.c
> index 27e746aea1..b17826326f 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -54,6 +54,7 @@ decl_ipred_fn(dl,       16,     16, avx2);
>  decl_ipred_fn(dl,       32,     16, avx2);
>  decl_ipred_fn(dr,       16,     16, avx2);
>  decl_ipred_fn(dr,       32,     16, avx2);
> +decl_ipred_fn(vl,       16,     16, avx2);
>
>  #define decl_ipred_dir_funcs(type) \
>  decl_ipred_fns(type, 16, sse2,  sse2); \
> @@ -139,6 +140,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext
> *dsp)
>          init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
>          init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
>          init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
> +        init_ipred_func(vl, VERT_LEFT, 16, 16, avx2);
>  #if ARCH_X86_64
>          init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
>  #endif
> diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm
> b/libavcodec/x86/vp9intrapred_16bpp.asm
> index 32b698243a..0dad91ac5c 100644
> --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> @@ -1222,6 +1222,57 @@ cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst,
> stride, l, a
>      mova     [dst3q+strideq*4], m5                     ; 7
>      RET
>
> +cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a
> +    movifnidn               aq, amp
> +    mova                    m0, [aq]                   ; abcdefghijklmnop
> +    vpbroadcastw           xm1, [aq+30]                ; pppppppp
> +    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
> +    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
> +    vperm2i128              m4, m3, m1, q0201          ; jklmnopppppppppp
> +    vpalignr                m5, m2, m0, 4              ; cdefghijklmnoppp
> +    vperm2i128              m6, m5, m1, q0201          ; klmnoppppppppppp
> +    LOWPASS                  5,  3,  0                 ; BCDEFGHIJKLMNOPP
> +    LOWPASS                  6,  4,  2                 ; JKLMNOPPPPPPPPPP
> +    pavgw                   m3, m0                     ; abcdefghijklmnop
> +    pavgw                   m4, m2                     ; ijklmnoppppppppp
> +    DEFINE_ARGS dst, stride, stride3, stride5, dst4
> +    lea                  dst4q, [dstq+strideq*4]
> +    lea               stride3q, [strideq*3]
> +    lea               stride5q, [stride3q+strideq*2]
> +
> +    mova      [dstq+strideq*0], m3                     ; 0
> abcdefghijklmnop
> +    mova      [dstq+strideq*1], m5                     ; 1
> BCDEFGHIJKLMNOPP
> +    vpalignr                m0, m4, m3, 2
> +    vpalignr                m1, m6, m5, 2
> +    mova     [dstq+strideq*2 ], m0                     ; 2
> bcdefghijklmnopp
> +    mova     [dstq+stride3q*1], m1                     ; 3
> CDEFGHIJKLMNOPPP
> +    vpalignr                m0, m4, m3, 4
> +    vpalignr                m1, m6, m5, 4
> +    mova     [dst4q+strideq*0], m0                     ; 4
> cdefghijklmnoppp
> +    mova     [dstq+stride5q*1], m1                     ; 5
> DEFGHIJKLMNOPPPP
> +    vpalignr                m0, m4, m3, 6
> +    vpalignr                m1, m6, m5, 6
> +    mova    [ dstq+stride3q*2], m0                     ; 6
> defghijklmnopppp
> +    mova    [dst4q+stride3q*1], m1                     ; 7
> EFGHIJKLMNOPPPPP
> +    vpalignr                m0, m4, m3, 8
> +    vpalignr                m1, m6, m5, 8
> +    mova    [  dstq+strideq*8], m0                     ; 8
> efghijklmnoppppp
> +    mova    [dst4q+stride5q*1], m1                     ; 9
> FGHIJKLMNOPPPPPP
> +    vpalignr                m0, m4, m3, 10
> +    mova     [dstq+stride5q*2], m0                     ; 10
> fghijklmnopppppp
> +    vpalignr                m0, m4, m3, 12
> +    mova     [dst4q+strideq*8], m0                     ; 12
> ghijklmnoppppppp
> +    vpalignr                m0, m4, m3, 14
> +    mova    [dst4q+stride5q*2], m0                     ; 14
> hijklmnopppppppp
> +    sub                  dst4q, strideq
> +    vpalignr                m1, m6, m5, 10
> +    mova     [dst4q+strideq*8], m1                     ; 11
> GHIJKLMNOPPPPPPP
> +    vpalignr                m1, m6, m5, 12
> +    mova    [dst4q+stride5q*2], m1                     ; 13
> HIJKLMNOPPPPPPPP
> +    vpalignr                m1, m6, m5, 14
> +    mova    [dst4q+stride3q*4], m1                     ; 15
> IJKLMNOPPPPPPPPP
> +    RET
> +
>  %if ARCH_X86_64
>  cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
>      mova                    m0, [lq+mmsize*0+0]        ; l[0-15]
> --
> 2.35.1
>
>
Ronald S. Bultje May 12, 2022, 6:51 p.m. UTC | #2
Hi,

On Wed, May 4, 2022 at 8:57 AM FacelessLake <blackriver741@gmail.com> wrote:

> From: Semen Belozerov <sinonim147@gmail.com>
>
> ---
>  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
>  libavcodec/x86/vp9intrapred_16bpp.asm | 51 +++++++++++++++++++++++++++
>  2 files changed, 53 insertions(+)
>

Merged.

Ronald
diff mbox series

Patch

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 27e746aea1..b17826326f 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -54,6 +54,7 @@  decl_ipred_fn(dl,       16,     16, avx2);
 decl_ipred_fn(dl,       32,     16, avx2);
 decl_ipred_fn(dr,       16,     16, avx2);
 decl_ipred_fn(dr,       32,     16, avx2);
+decl_ipred_fn(vl,       16,     16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -139,6 +140,7 @@  av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
         init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
         init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
+        init_ipred_func(vl, VERT_LEFT, 16, 16, avx2);
 #if ARCH_X86_64
         init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
 #endif
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 32b698243a..0dad91ac5c 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1222,6 +1222,57 @@  cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
     mova     [dst3q+strideq*4], m5                     ; 7
     RET
 
+cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a ; VERT_LEFT intra predictor, 16x16 block, 16-bit pixels (AVX2); uses m0-m6
+    movifnidn               aq, amp
+    mova                    m0, [aq]                   ; abcdefghijklmnop (the 16 words read from a)
+    vpbroadcastw           xm1, [aq+30]                ; pppppppp (word at a+30, i.e. the last of the 16, replicated)
+    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp (upper half of m0 followed by the p fill)
+    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp (input advanced by 1 word)
+    vperm2i128              m4, m3, m1, q0201          ; jklmnopppppppppp
+    vpalignr                m5, m2, m0, 4              ; cdefghijklmnoppp (input advanced by 2 words)
+    vperm2i128              m6, m5, m1, q0201          ; klmnoppppppppppp
+    LOWPASS                  5,  3,  0                 ; BCDEFGHIJKLMNOPP (uppercase = LOWPASS output, written to m5)
+    LOWPASS                  6,  4,  2                 ; JKLMNOPPPPPPPPPP (continuation of m5, written to m6)
+    pavgw                   m3, m0                     ; abcdefghijklmnop (from here on lowercase = pavgw of neighbouring inputs)
+    pavgw                   m4, m2                     ; ijklmnoppppppppp (continuation of m3)
+    DEFINE_ARGS dst, stride, stride3, stride5, dst4    ; l and a are not referenced below this point
+    lea                  dst4q, [dstq+strideq*4]       ; dst4    = dst + 4*stride
+    lea               stride3q, [strideq*3]            ; stride3 = 3*stride
+    lea               stride5q, [stride3q+strideq*2]   ; stride5 = 5*stride
+
+    mova      [dstq+strideq*0], m3                     ; 0  abcdefghijklmnop
+    mova      [dstq+strideq*1], m5                     ; 1  BCDEFGHIJKLMNOPP
+    vpalignr                m0, m4, m3, 2              ; even rows: m3 advanced by row/2 words, refilled from m4
+    vpalignr                m1, m6, m5, 2              ; odd rows:  m5 advanced by row/2 words, refilled from m6
+    mova     [dstq+strideq*2 ], m0                     ; 2  bcdefghijklmnopp
+    mova     [dstq+stride3q*1], m1                     ; 3  CDEFGHIJKLMNOPPP
+    vpalignr                m0, m4, m3, 4
+    vpalignr                m1, m6, m5, 4
+    mova     [dst4q+strideq*0], m0                     ; 4  cdefghijklmnoppp
+    mova     [dstq+stride5q*1], m1                     ; 5  DEFGHIJKLMNOPPPP
+    vpalignr                m0, m4, m3, 6
+    vpalignr                m1, m6, m5, 6
+    mova    [ dstq+stride3q*2], m0                     ; 6  defghijklmnopppp
+    mova    [dst4q+stride3q*1], m1                     ; 7  EFGHIJKLMNOPPPPP (dst4 + 3*stride)
+    vpalignr                m0, m4, m3, 8
+    vpalignr                m1, m6, m5, 8
+    mova    [  dstq+strideq*8], m0                     ; 8  efghijklmnoppppp
+    mova    [dst4q+stride5q*1], m1                     ; 9  FGHIJKLMNOPPPPPP (dst4 + 5*stride)
+    vpalignr                m0, m4, m3, 10
+    mova     [dstq+stride5q*2], m0                     ; 10 fghijklmnopppppp
+    vpalignr                m0, m4, m3, 12
+    mova     [dst4q+strideq*8], m0                     ; 12 ghijklmnoppppppp (dst4 + 8*stride)
+    vpalignr                m0, m4, m3, 14
+    mova    [dst4q+stride5q*2], m0                     ; 14 hijklmnopppppppp (dst4 + 10*stride)
+    sub                  dst4q, strideq                ; dst4 = dst + 3*stride, so rows 11/13/15 are reachable below
+    vpalignr                m1, m6, m5, 10
+    mova     [dst4q+strideq*8], m1                     ; 11 GHIJKLMNOPPPPPPP (3 + 8)
+    vpalignr                m1, m6, m5, 12
+    mova    [dst4q+stride5q*2], m1                     ; 13 HIJKLMNOPPPPPPPP (3 + 10)
+    vpalignr                m1, m6, m5, 14
+    mova    [dst4q+stride3q*4], m1                     ; 15 IJKLMNOPPPPPPPPP (3 + 12)
+    RET
+
 %if ARCH_X86_64
 cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
     mova                    m0, [lq+mmsize*0+0]        ; l[0-15]