[FFmpeg-devel,4/4] avfilter/vf_v360: x86 SIMD for interpolations

Submitted by Paul B Mahol on Sept. 4, 2019, 7:28 p.m.

Details

Message ID 20190904192844.9866-4-onemda@gmail.com
State New
Headers show

Commit Message

Paul B Mahol Sept. 4, 2019, 7:28 p.m.
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/v360.h             | 113 ++++++++++++++++
 libavfilter/vf_v360.c          | 236 ++++++++++++---------------------
 libavfilter/x86/Makefile       |   2 +
 libavfilter/x86/vf_v360.asm    | 104 +++++++++++++++
 libavfilter/x86/vf_v360_init.c |  43 ++++++
 5 files changed, 349 insertions(+), 149 deletions(-)
 create mode 100644 libavfilter/v360.h
 create mode 100644 libavfilter/x86/vf_v360.asm
 create mode 100644 libavfilter/x86/vf_v360_init.c

Comments

James Almer Sept. 4, 2019, 8:01 p.m.
On 9/4/2019 4:28 PM, Paul B Mahol wrote:
> diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm
> new file mode 100644
> index 0000000000..46142a3bad
> --- /dev/null
> +++ b/libavfilter/x86/vf_v360.asm
> @@ -0,0 +1,104 @@
> +;*****************************************************************************
> +;* x86-optimized functions for v360 filter
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +ALIGN 32
> +
> +pb_mask: db 0,4,8,12,5,5,5,5,5,5,5,5,5,5,5,5
> +pd_255: times 4 dd 255
> +pb_255: times 16 db 255

No need for this one. See below.

> +
> +SECTION .text
> +
> +; void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
> +;                               const uint16_t *u, const uint16_t *v, const int16_t *ker);
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +cglobal remap1_8bit_line, 7, 7, 7, dst, width, src, in_linesize, u, v, x

You're loading seven args into regs when this function only uses six (the
seventh, ker, is unused here). x doesn't need to be loaded.

> +    movsxdifnidn widthq, widthd
> +    movsxdifnidn in_linesizeq, in_linesized

Unneeded since in_linesize is ptrdiff_t.

> +    xor             xq, xq
> +    movd           xm0, in_linesized
> +    VBROADCASTI128  m4, [pb_255]

pcmpeqw m4, m4

> +    VBROADCASTI128  m6, [pb_mask]
> +    vpbroadcastd    m0, xm0
> +
> +    .loop:
> +        vpmovsxwd   m1, [vq + xq * 2]
> +        vpmovsxwd   m2, [uq + xq * 2]

No need to use the v prefix in all these pre-avx instructions. INIT_YMM
takes care of it.

> +
> +        vpmulld          m3, m1, m0
> +        vpaddd           m1, m3, m2

pmulld m1, m0
paddd  m1, m2

> +        mova             m2, m4

Pointless mova. Just use m4 in the vpgatherdd below.

> +        vpgatherdd       m5, [srcq + m1], m2
> +        vextracti128    xm3, m5, 1
> +        vpshufb          m1, m5, m6
> +        vpshufb          m2, m3, m6

You could make these two pshufb use xmm regs, since you don't care
what's in the upper 128 bits.

> +        movd      [dstq+xq], xm1
> +        movd    [dstq+xq+4], xm2
> +
> +        add   xq, mmsize / 4
> +        cmp   xq, widthq
> +        jl .loop
> +    RET
> +
> +INIT_YMM avx2
> +cglobal remap2_8bit_line, 7, 9, 9, dst, width, src, in_linesize, u, v, ker, x, temp
> +    movsxdifnidn widthq, widthd
> +    movsxdifnidn in_linesizeq, in_linesized

Same as above.

> +    xor             xq, xq
> +    movd           xm0, in_linesized
> +    VBROADCASTI128  m7, [pb_255]

Also pcmpeqw m7, m7

> +    vpbroadcastd    m0, xm0
> +    movd           xm6, [pd_255]
> +    vpbroadcastd    m6, xm6

VBROADCASTI128 m6, [pd_255]

> +
> +    .loop:
> +        vpmovsxwd  m1, [kerq + xq * 8]
> +        vpmovsxwd  m2, [vq + xq * 8]
> +        vpmovsxwd  m3, [uq + xq * 8]
> +
> +        vpmulld         m4, m2, m0
> +        vpaddd          m4, m3
> +        mova            m3, m7

Also pointless mova. Use m7 below.

> +        vpgatherdd      m5, [srcq + m4], m3
> +        vpand           m5, m6
> +        vpmulld         m5, m1
> +        vphaddd         m2, m5, m5
> +        vphaddd         m5, m2, m2
> +        vpsrld          m5, m5, 0xd
> +        vextracti128   xm3, m5, 1
> +        vpshufb         m5, m5, [pb_mask]
> +        vpshufb         m3, m3, [pb_mask]

Why aren't you loading the mask at the beginning into some register?

> +
> +        movd          tempd, xm5
> +        mov       [dstq+xq], tempb
> +        movd          tempd, xm3
> +        mov     [dstq+xq+1], tempb

did you try pextrb [mem], xm, 0?

> +
> +        add   xq, mmsize / 16
> +        cmp   xq, widthq
> +        jl .loop
> +    RET
> +%endif
> diff --git a/libavfilter/x86/vf_v360_init.c b/libavfilter/x86/vf_v360_init.c
> new file mode 100644
> index 0000000000..e48ee307b3
> --- /dev/null
> +++ b/libavfilter/x86/vf_v360_init.c
> @@ -0,0 +1,43 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/v360.h"
> +
> +void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
> +                              const uint16_t *u, const uint16_t *v, const int16_t *ker);
> +
> +void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
> +                              const uint16_t *u, const uint16_t *v, const int16_t *ker);
> +
> +av_cold void ff_v360_init_x86(V360Context *s, int depth)
> +{
> +#if ARCH_X86_64
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_AVX2(cpu_flags) && s->interp == NEAREST && depth <= 8)
> +        s->remap_line = ff_remap1_8bit_line_avx2;
> +
> +    if (EXTERNAL_AVX2(cpu_flags) && s->interp == BILINEAR && depth <= 8)
> +        s->remap_line = ff_remap2_8bit_line_avx2;

EXTERNAL_AVX2_FAST() for both.

> +#endif
> +}
Henrik Gramner Sept. 4, 2019, 8:47 p.m.
On Wed, Sep 4, 2019 at 10:01 PM James Almer <jamrial@gmail.com> wrote:
> On 9/4/2019 4:28 PM, Paul B Mahol wrote:
> > +        vpmulld          m3, m1, m0
> > +        vpaddd           m1, m3, m2
>
> pmulld m1, m0
> paddd  m1, m2

Could use pmaddwd instead as well, it's faster than pmulld on pretty
much every CPU.

> > +        mova             m2, m4
>
> Pointless mova. Just use m4 in the vpgatherdd below.

No, it's required. Gathers overwrite the mask register.

> > +        vpgatherdd       m5, [srcq + m1], m2
> > +        vextracti128    xm3, m5, 1
> > +        vpshufb          m1, m5, m6
> > +        vpshufb          m2, m3, m6
>
> You could make these two pshufb use xmm regs, since you don't care
> what's in the upper 128 bits.

Or a single ymm pshufb before the vextracti128.
James Almer Sept. 4, 2019, 8:56 p.m.
On 9/4/2019 5:47 PM, Henrik Gramner wrote:
> On Wed, Sep 4, 2019 at 10:01 PM James Almer <jamrial@gmail.com> wrote:
>> On 9/4/2019 4:28 PM, Paul B Mahol wrote:
>>> +        vpmulld          m3, m1, m0
>>> +        vpaddd           m1, m3, m2
>>
>> pmulld m1, m0
>> paddd  m1, m2
> 
> Could use pmaddwd instead as well, it's faster than pmulld on pretty
> much every CPU.
> 
>>> +        mova             m2, m4
>>
>> Pointless mova. Just use m4 in the vpgatherdd below.
> 
> No, it's required. Gathers overwrite the mask register.

Ah, my bad.

> 
>>> +        vpgatherdd       m5, [srcq + m1], m2
>>> +        vextracti128    xm3, m5, 1
>>> +        vpshufb          m1, m5, m6
>>> +        vpshufb          m2, m3, m6
>>
>> You could make these two pshufb use xmm regs, since you don't care
>> what's in the upper 128 bits.
> 
> Or a single ymm pshufb before the vextracti128.
Henrik Gramner Sept. 5, 2019, 2:44 p.m.
On Wed, Sep 4, 2019 at 9:29 PM Paul B Mahol <onemda@gmail.com> wrote:
> +    movd           xm6, [pd_255]
> +    vpbroadcastd    m6, xm6

vpbroadcastd    m6, [pd_255]
James Almer Sept. 5, 2019, 3:22 p.m.
On 9/5/2019 11:44 AM, Henrik Gramner wrote:
> On Wed, Sep 4, 2019 at 9:29 PM Paul B Mahol <onemda@gmail.com> wrote:
>> +    movd           xm6, [pd_255]
>> +    vpbroadcastd    m6, xm6
> 
> vpbroadcastd    m6, [pd_255]

I suggested VBROADCASTI128 m6, [pd_255] instead. Is vpbroadcastd faster?
Paul B Mahol Sept. 5, 2019, 3:31 p.m.
On 9/5/19, James Almer <jamrial@gmail.com> wrote:
> On 9/5/2019 11:44 AM, Henrik Gramner wrote:
>> On Wed, Sep 4, 2019 at 9:29 PM Paul B Mahol <onemda@gmail.com> wrote:
>>> +    movd           xm6, [pd_255]
>>> +    vpbroadcastd    m6, xm6
>>
>> vpbroadcastd    m6, [pd_255]
>
> I suggested VBROADCASTI128 m6, [pd_255] instead. Is vpbroadcastd faster?

This is only called once per output line.

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Patch hide | download patch | download mbox

diff --git a/libavfilter/v360.h b/libavfilter/v360.h
new file mode 100644
index 0000000000..a0eefdec16
--- /dev/null
+++ b/libavfilter/v360.h
@@ -0,0 +1,113 @@ 
+/*
+ * Copyright (c) 2019 Eugene Lyapustin
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_V360_H
+#define AVFILTER_V360_H
+#include "avfilter.h"
+
+enum Projections {
+    EQUIRECTANGULAR,
+    CUBEMAP_3_2,
+    CUBEMAP_6_1,
+    EQUIANGULAR,
+    FLAT,
+    DUAL_FISHEYE,
+    BARREL,
+    CUBEMAP_1_6,
+    NB_PROJECTIONS,
+};
+
+enum InterpMethod {
+    NEAREST,
+    BILINEAR,
+    BICUBIC,
+    LANCZOS,
+    NB_INTERP_METHODS,
+};
+
+enum Faces {
+    TOP_LEFT,
+    TOP_MIDDLE,
+    TOP_RIGHT,
+    BOTTOM_LEFT,
+    BOTTOM_MIDDLE,
+    BOTTOM_RIGHT,
+    NB_FACES,
+};
+
+enum Direction {
+    RIGHT,  ///< Axis +X
+    LEFT,   ///< Axis -X
+    UP,     ///< Axis +Y
+    DOWN,   ///< Axis -Y
+    FRONT,  ///< Axis -Z
+    BACK,   ///< Axis +Z
+    NB_DIRECTIONS,
+};
+
+enum Rotation {
+    ROT_0,
+    ROT_90,
+    ROT_180,
+    ROT_270,
+    NB_ROTATIONS,
+};
+
+typedef struct V360Context {
+    const AVClass *class;
+    int in, out;
+    int interp;
+    int width, height;
+    char* in_forder;
+    char* out_forder;
+    char* in_frot;
+    char* out_frot;
+
+    int in_cubemap_face_order[6];
+    int out_cubemap_direction_order[6];
+    int in_cubemap_face_rotation[6];
+    int out_cubemap_face_rotation[6];
+
+    float in_pad, out_pad;
+
+    float yaw, pitch, roll;
+
+    int h_flip, v_flip, d_flip;
+
+    float h_fov, v_fov;
+    float flat_range[3];
+
+    int planewidth[4], planeheight[4];
+    int inplanewidth[4], inplaneheight[4];
+    int nb_planes;
+
+    uint16_t *u[4], *v[4];
+    int16_t *ker[4];
+
+    int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
+
+    void (*remap_line)(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+                       const uint16_t *u, const uint16_t *v, const int16_t *ker);
+} V360Context;
+
+void ff_v360_init(V360Context *s, int depth);
+void ff_v360_init_x86(V360Context *s, int depth);
+
+#endif /* AVFILTER_V360_H */
diff --git a/libavfilter/vf_v360.c b/libavfilter/vf_v360.c
index fc120097d9..e69aa7e8c5 100644
--- a/libavfilter/vf_v360.c
+++ b/libavfilter/vf_v360.c
@@ -41,88 +41,7 @@ 
 #include "formats.h"
 #include "internal.h"
 #include "video.h"
-
-enum Projections {
-    EQUIRECTANGULAR,
-    CUBEMAP_3_2,
-    CUBEMAP_6_1,
-    EQUIANGULAR,
-    FLAT,
-    DUAL_FISHEYE,
-    BARREL,
-    CUBEMAP_1_6,
-    NB_PROJECTIONS,
-};
-
-enum InterpMethod {
-    NEAREST,
-    BILINEAR,
-    BICUBIC,
-    LANCZOS,
-    NB_INTERP_METHODS,
-};
-
-enum Faces {
-    TOP_LEFT,
-    TOP_MIDDLE,
-    TOP_RIGHT,
-    BOTTOM_LEFT,
-    BOTTOM_MIDDLE,
-    BOTTOM_RIGHT,
-    NB_FACES,
-};
-
-enum Direction {
-    RIGHT,  ///< Axis +X
-    LEFT,   ///< Axis -X
-    UP,     ///< Axis +Y
-    DOWN,   ///< Axis -Y
-    FRONT,  ///< Axis -Z
-    BACK,   ///< Axis +Z
-    NB_DIRECTIONS,
-};
-
-enum Rotation {
-    ROT_0,
-    ROT_90,
-    ROT_180,
-    ROT_270,
-    NB_ROTATIONS,
-};
-
-typedef struct V360Context {
-    const AVClass *class;
-    int in, out;
-    int interp;
-    int width, height;
-    char* in_forder;
-    char* out_forder;
-    char* in_frot;
-    char* out_frot;
-
-    int in_cubemap_face_order[6];
-    int out_cubemap_direction_order[6];
-    int in_cubemap_face_rotation[6];
-    int out_cubemap_face_rotation[6];
-
-    float in_pad, out_pad;
-
-    float yaw, pitch, roll;
-
-    int h_flip, v_flip, d_flip;
-
-    float h_fov, v_fov;
-    float flat_range[3];
-
-    int planewidth[4], planeheight[4];
-    int inplanewidth[4], inplaneheight[4];
-    int nb_planes;
-
-    uint16_t *u[4], *v[4];
-    int16_t *ker[4];
-
-    int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-} V360Context;
+#include "v360.h"
 
 typedef struct ThreadData {
     AVFrame *in;
@@ -251,47 +170,26 @@  static int query_formats(AVFilterContext *ctx)
     return ff_set_common_formats(ctx, fmts_list);
 }
 
-/**
- * Generate no-interpolation remapping function with a given pixel depth.
- *
- * @param bits number of bits per pixel
- * @param div number of bytes per pixel
- */
-#define DEFINE_REMAP1(bits, div)                                                             \
-static int remap1_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
-{                                                                                            \
-    ThreadData *td = (ThreadData*)arg;                                                       \
-    const V360Context *s = ctx->priv;                                                        \
-    const AVFrame *in = td->in;                                                              \
-    AVFrame *out = td->out;                                                                  \
-                                                                                             \
-    int plane, x, y;                                                                         \
-                                                                                             \
-    for (plane = 0; plane < s->nb_planes; plane++) {                                         \
-        const int in_linesize  = in->linesize[plane]  / div;                                 \
-        const int out_linesize = out->linesize[plane] / div;                                 \
-        const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane];                 \
-        uint##bits##_t *dst = (uint##bits##_t *)out->data[plane];                            \
-        const int width = s->planewidth[plane];                                              \
-        const int height = s->planeheight[plane];                                            \
-                                                                                             \
-        const int slice_start = (height *  jobnr     ) / nb_jobs;                            \
-        const int slice_end   = (height * (jobnr + 1)) / nb_jobs;                            \
-                                                                                             \
-        for (y = slice_start; y < slice_end; y++) {                                          \
-            const uint16_t *u = s->u[plane] + y * width;                                     \
-            const uint16_t *v = s->v[plane] + y * width;                                     \
-            uint##bits##_t *d = dst + y * out_linesize;                                      \
-            for (x = 0; x < width; x++)                                                      \
-                *d++ = src[v[x] * in_linesize + u[x]];                                       \
-        }                                                                                    \
-    }                                                                                        \
-                                                                                             \
-    return 0;                                                                                \
+#define DEFINE_REMAP1_LINE(bits, div)                                                                \
+static void remap1_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src,              \
+                                      ptrdiff_t in_linesize,                                    \
+                                      const uint16_t *u, const uint16_t *v, const int16_t *ker) \
+{                                                                                                    \
+    const uint##bits##_t *s = (const uint##bits##_t *)src;                                           \
+    uint##bits##_t *d = (uint##bits##_t *)dst;                                                       \
+                                                                                                     \
+    in_linesize /= div;                                                                              \
+                                                                                                     \
+    for (int x = 0; x < width; x++) {                                                                \
+        const uint16_t *uu = u + x;                                                                  \
+        const uint16_t *vv = v + x;                                                                  \
+                                                                                                     \
+        d[x] = s[vv[0] * in_linesize + uu[0]];                                                       \
+    }                                                                                                \
 }
 
-DEFINE_REMAP1( 8, 1)
-DEFINE_REMAP1(16, 2)
+DEFINE_REMAP1_LINE( 8, 1)
+DEFINE_REMAP1_LINE(16, 2)
 
 typedef struct XYRemap {
     uint16_t u[4][4];
@@ -304,9 +202,8 @@  typedef struct XYRemap {
  *
  * @param ws size of interpolation window
  * @param bits number of bits per pixel
- * @param div number of bytes per pixel
  */
-#define DEFINE_REMAP(ws, bits, div)                                                                        \
+#define DEFINE_REMAP(ws, bits)                                                                             \
 static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)          \
 {                                                                                                          \
     ThreadData *td = (ThreadData*)arg;                                                                     \
@@ -314,48 +211,87 @@  static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jo
     const AVFrame *in = td->in;                                                                            \
     AVFrame *out = td->out;                                                                                \
                                                                                                            \
-    int plane, x, y, i, j;                                                                                 \
-                                                                                                           \
-    for (plane = 0; plane < s->nb_planes; plane++) {                                                       \
-        const int in_linesize  = in->linesize[plane]  / div;                                               \
-        const int out_linesize = out->linesize[plane] / div;                                               \
-        const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane];                               \
-        uint##bits##_t *dst = (uint##bits##_t *)out->data[plane];                                          \
+    for (int plane = 0; plane < s->nb_planes; plane++) {                                                   \
+        const int in_linesize  = in->linesize[plane];                                                      \
+        const int out_linesize = out->linesize[plane];                                                     \
+        const uint8_t *src = in->data[plane];                                                              \
+        uint8_t *dst = out->data[plane];                                                                   \
         const int width = s->planewidth[plane];                                                            \
         const int height = s->planeheight[plane];                                                          \
                                                                                                            \
         const int slice_start = (height *  jobnr     ) / nb_jobs;                                          \
         const int slice_end   = (height * (jobnr + 1)) / nb_jobs;                                          \
                                                                                                            \
-        for (y = slice_start; y < slice_end; y++) {                                                        \
-            uint##bits##_t *d = dst + y * out_linesize;                                                    \
+        for (int y = slice_start; y < slice_end; y++) {                                                    \
             const uint16_t *u = s->u[plane] + y * width * ws * ws;                                         \
             const uint16_t *v = s->v[plane] + y * width * ws * ws;                                         \
             const int16_t *ker = s->ker[plane] + y * width * ws * ws;                                      \
-            for (x = 0; x < width; x++) {                                                                  \
-                const uint16_t *uu = u + x * ws * ws;                                                      \
-                const uint16_t *vv = v + x * ws * ws;                                                      \
-                const int16_t *kker = ker + x * ws * ws;                                                   \
-                int tmp = 0;                                                                               \
-                                                                                                           \
-                for (i = 0; i < ws; i++) {                                                                 \
-                    for (j = 0; j < ws; j++) {                                                             \
-                        tmp += kker[i * ws + j] * src[vv[i * ws + j] * in_linesize + uu[i * ws + j]];      \
-                    }                                                                                      \
-                }                                                                                          \
                                                                                                            \
-                *d++ = av_clip_uint##bits(tmp >> (15 - ws));                                               \
-            }                                                                                              \
+            s->remap_line(dst + y * out_linesize, width, src, in_linesize, u, v, ker);                     \
         }                                                                                                  \
     }                                                                                                      \
                                                                                                            \
     return 0;                                                                                              \
 }
 
-DEFINE_REMAP(2,  8, 1)
-DEFINE_REMAP(4,  8, 1)
-DEFINE_REMAP(2, 16, 2)
-DEFINE_REMAP(4, 16, 2)
+DEFINE_REMAP(1,  8)
+DEFINE_REMAP(2,  8)
+DEFINE_REMAP(4,  8)
+DEFINE_REMAP(1, 16)
+DEFINE_REMAP(2, 16)
+DEFINE_REMAP(4, 16)
+
+#define DEFINE_REMAP_LINE(ws, bits, div)                                                                   \
+static void remap##ws##_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src,                    \
+                                           ptrdiff_t in_linesize,                                          \
+                                           const uint16_t *u, const uint16_t *v, const int16_t *ker)       \
+{                                                                                                          \
+    const uint##bits##_t *s = (const uint##bits##_t *)src;                                                 \
+    uint##bits##_t *d = (uint##bits##_t *)dst;                                                             \
+                                                                                                           \
+    in_linesize /= div;                                                                                    \
+                                                                                                           \
+    for (int x = 0; x < width; x++) {                                                                      \
+        const uint16_t *uu = u + x * ws * ws;                                                              \
+        const uint16_t *vv = v + x * ws * ws;                                                              \
+        const int16_t *kker = ker + x * ws * ws;                                                           \
+        int tmp = 0;                                                                                       \
+                                                                                                           \
+        for (int i = 0; i < ws; i++) {                                                                     \
+            for (int j = 0; j < ws; j++) {                                                                 \
+                tmp += kker[i * ws + j] * s[vv[i * ws + j] * in_linesize + uu[i * ws + j]];                \
+            }                                                                                              \
+        }                                                                                                  \
+                                                                                                           \
+        d[x] = av_clip_uint##bits(tmp >> (15 - ws));                                                       \
+    }                                                                                                      \
+}
+
+DEFINE_REMAP_LINE(2,  8, 1)
+DEFINE_REMAP_LINE(4,  8, 1)
+DEFINE_REMAP_LINE(2, 16, 2)
+DEFINE_REMAP_LINE(4, 16, 2)
+
+void ff_v360_init(V360Context *s, int depth)
+{
+    switch (s->interp) {
+    case NEAREST:
+        s->remap_line = depth <= 8 ? remap1_8bit_line_c : remap1_16bit_line_c;
+        break;
+    case BILINEAR:
+        s->remap_line = depth <= 8 ? remap2_8bit_line_c : remap2_16bit_line_c;
+        break;
+    case BICUBIC:
+        s->remap_line = depth <= 8 ? remap4_8bit_line_c : remap4_16bit_line_c;
+        break;
+    case LANCZOS:
+        s->remap_line = depth <= 8 ? remap4_8bit_line_c : remap4_16bit_line_c;
+        break;
+    }
+
+    if (ARCH_X86_64)
+        ff_v360_init_x86(s, depth);
+}
 
 /**
  * Save nearest pixel coordinates for remapping.
@@ -2038,6 +1974,8 @@  static int config_output(AVFilterLink *outlink)
         av_assert0(0);
     }
 
+    ff_v360_init(s, depth);
+
     switch (s->in) {
     case EQUIRECTANGULAR:
         in_transform = xyz_to_equirect;
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 8dc0b0e6d4..f12993e606 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -31,6 +31,7 @@  OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
 OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
 OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
+OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
 OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 
@@ -66,5 +67,6 @@  X86ASM-OBJS-$(CONFIG_TBLEND_FILTER)          += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER)       += x86/vf_threshold.o
 X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += x86/vf_interlace.o
 X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
+X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
 X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
 X86ASM-OBJS-$(CONFIG_YADIF_FILTER)           += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm
new file mode 100644
index 0000000000..46142a3bad
--- /dev/null
+++ b/libavfilter/x86/vf_v360.asm
@@ -0,0 +1,104 @@ 
+;*****************************************************************************
+;* x86-optimized functions for v360 filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ALIGN 32
+
+pb_mask: db 0,4,8,12,5,5,5,5,5,5,5,5,5,5,5,5 ; vpshufb control: low byte of each dword -> bytes 0..3; the remaining output bytes are never stored
+pd_255: times 4 dd 255 ; per-dword AND mask that keeps only the low byte
+pb_255: times 16 db 255 ; all-ones pattern, copied into the vpgatherdd mask register before every gather
+
+SECTION .text
+
+; void ff_remap{1,2}_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+;                                   const uint16_t *u, const uint16_t *v, const int16_t *ker);
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal remap1_8bit_line, 6, 7, 7, dst, width, src, in_linesize, u, v, x
+    movsxdifnidn widthq, widthd             ; width is an int; the loop compares it as a 64-bit value
+    ; ker (7th argument) is unused and not loaded; in_linesize is a ptrdiff_t and needs no extension
+    xor             xd, xd                  ; x = 0 (a 32-bit write clears the whole register)
+    movd           xm0, in_linesized        ; only the low dword of in_linesize is used
+    VBROADCASTI128  m4, [pb_255]            ; all-ones template for the gather mask
+    VBROADCASTI128  m6, [pb_mask]           ; shuffle control: low byte of each dword -> bytes 0..3
+    vpbroadcastd    m0, xm0                 ; m0 = in_linesize in every dword
+
+    .loop:
+        vpmovzxwd   m1, [vq + xq * 2]       ; u/v are uint16_t: zero-extend so coordinates >= 32768 stay positive
+        vpmovzxwd   m2, [uq + xq * 2]
+
+        vpmulld          m3, m1, m0         ; v * in_linesize
+        vpaddd           m1, m3, m2         ; + u -> byte offset into src
+        mova             m2, m4             ; vpgatherdd clears its mask operand, so refresh it each pass
+        vpgatherdd       m5, [srcq + m1], m2 ; loads one dword per offset; the wanted pixel is its low byte
+        vextracti128    xm3, m5, 1          ; xm3 = dwords of pixels 4..7
+        vpshufb          m1, m5, m6         ; pack pixels 0..3 into the low 4 bytes
+        vpshufb          m2, m3, m6         ; pack pixels 4..7 into the low 4 bytes
+        movd      [dstq+xq], xm1
+        movd    [dstq+xq+4], xm2
+
+        add   xq, mmsize / 4                ; 8 pixels per iteration
+        cmp   xq, widthq
+        jl .loop                            ; NOTE(review): whole 8-pixel steps only; assumes u/v/dst tolerate a padded tail - confirm
+    RET
+
+INIT_YMM avx2
+cglobal remap2_8bit_line, 7, 9, 8, dst, width, src, in_linesize, u, v, ker, x, temp
+    movsxdifnidn widthq, widthd             ; width is an int; the loop compares it as a 64-bit value
+    ; 4 taps per pixel, 2 pixels per pass; only m0-m7 are used; in_linesize (ptrdiff_t) needs no extension
+    xor             xd, xd                  ; x = 0 (a 32-bit write clears the whole register)
+    movd           xm0, in_linesized        ; only the low dword of in_linesize is used
+    VBROADCASTI128  m7, [pb_255]            ; all-ones template for the gather mask
+    vpbroadcastd    m0, xm0                 ; m0 = in_linesize in every dword
+    movd           xm6, [pd_255]
+    vpbroadcastd    m6, xm6                 ; m6 = 0x000000ff in every dword
+
+    .loop:
+        vpmovsxwd  m1, [kerq + xq * 8]      ; ker is int16_t: sign-extend the 2 x 4 weights
+        vpmovzxwd  m2, [vq + xq * 8]        ; u/v are uint16_t: zero-extend so coordinates >= 32768 stay positive
+        vpmovzxwd  m3, [uq + xq * 8]
+
+        vpmulld         m4, m2, m0          ; v * in_linesize
+        vpaddd          m4, m3              ; + u -> byte offset into src
+        mova            m3, m7              ; vpgatherdd clears its mask operand, so refresh it each pass
+        vpgatherdd      m5, [srcq + m4], m3 ; loads one dword per tap
+        vpand           m5, m6              ; keep only the addressed byte of each dword
+        vpmulld         m5, m1              ; tap * weight
+        vphaddd         m2, m5, m5
+        vphaddd         m5, m2, m2          ; each 128-bit lane now holds the sum of its 4 products
+        vpsrld          m5, m5, 0xd         ; NOTE(review): >> 13 with no rounding term - confirm against the C reference
+        vextracti128   xm3, m5, 1           ; xm3 = result for the second pixel
+        vpshufb         m5, m5, [pb_mask]   ; control byte 0 is 0, so byte 0 (the only one stored) stays in place
+        vpshufb         m3, m3, [pb_mask]
+
+        movd          tempd, xm5
+        mov       [dstq+xq], tempb          ; pixel x
+        movd          tempd, xm3
+        mov     [dstq+xq+1], tempb          ; pixel x + 1
+
+        add   xq, mmsize / 16               ; 2 pixels per iteration
+        cmp   xq, widthq
+        jl .loop                            ; NOTE(review): whole 2-pixel steps only; assumes buffers tolerate a padded tail - confirm
+    RET
+%endif
diff --git a/libavfilter/x86/vf_v360_init.c b/libavfilter/x86/vf_v360_init.c
new file mode 100644
index 0000000000..e48ee307b3
--- /dev/null
+++ b/libavfilter/x86/vf_v360_init.c
@@ -0,0 +1,43 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/v360.h"
+
+void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+                              const uint16_t *u, const uint16_t *v, const int16_t *ker); /* installed for NEAREST when depth <= 8 */
+
+void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+                              const uint16_t *u, const uint16_t *v, const int16_t *ker); /* installed for BILINEAR when depth <= 8 */
+
+av_cold void ff_v360_init_x86(V360Context *s, int depth)
+{
+#if ARCH_X86_64
+    /* Only 8-bit AVX2 line remappers exist; in every other case s->remap_line is left untouched. */
+    const int cpu_flags = av_get_cpu_flags();
+    const int use_avx2  = EXTERNAL_AVX2(cpu_flags) && depth <= 8;
+    if (use_avx2 && s->interp == NEAREST)
+        s->remap_line = ff_remap1_8bit_line_avx2;
+    else if (use_avx2 && s->interp == BILINEAR)
+        s->remap_line = ff_remap2_8bit_line_avx2;
+#endif
+}