diff mbox

[FFmpeg-devel,4/5] avfilter/vf_v360: x86 SIMD for interpolations

Message ID 20190905085255.24699-4-onemda@gmail.com
State New
Headers show

Commit Message

Paul B Mahol Sept. 5, 2019, 8:52 a.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/v360.h             | 113 ++++++++++++++++
 libavfilter/vf_v360.c          | 236 ++++++++++++---------------------
 libavfilter/x86/Makefile       |   2 +
 libavfilter/x86/vf_v360.asm    |  98 ++++++++++++++
 libavfilter/x86/vf_v360_init.c |  43 ++++++
 5 files changed, 343 insertions(+), 149 deletions(-)
 create mode 100644 libavfilter/v360.h
 create mode 100644 libavfilter/x86/vf_v360.asm
 create mode 100644 libavfilter/x86/vf_v360_init.c

Comments

James Almer Sept. 5, 2019, 4:03 p.m. UTC | #1
On 9/5/2019 5:52 AM, Paul B Mahol wrote:
> diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm
> new file mode 100644
> index 0000000000..e1efe2e3a3
> --- /dev/null
> +++ b/libavfilter/x86/vf_v360.asm
> @@ -0,0 +1,98 @@
> +;*****************************************************************************
> +;* x86-optimized functions for v360 filter
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +ALIGN 32

No need for 32 byte alignment. And if there was, you should do
SECTION_RODATA 32 instead.

> +
> +pb_mask: db 0,4,8,12,5,5,5,5,5,5,5,5,5,5,5,5
> +pd_255: times 4 dd 255
> +
> +SECTION .text
> +
> +; void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
> +;                               const uint16_t *u, const uint16_t *v, const int16_t *ker);
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +cglobal remap1_8bit_line, 6, 7, 7, dst, width, src, in_linesize, u, v, x
> +    movsxdifnidn widthq, widthd
> +    xor             xq, xq
> +    movd           xm0, in_linesized
> +    pcmpeqw         m4, m4
> +    VBROADCASTI128  m6, [pb_mask]
> +    vpbroadcastd    m0, xm0
> +
> +    .loop:
> +        pmovsxwd   m1, [vq + xq * 2]
> +        pmovsxwd   m2, [uq + xq * 2]
> +
> +        pmulld           m1, m0

Use pmaddwd as Hendrik suggested.

> +        paddd            m1, m2
> +        mova             m2, m4
> +        vpgatherdd       m5, [srcq + m1], m2
> +        pshufb           m1, m5, m6
> +        vextracti128    xm2, m1, 1
> +        movd      [dstq+xq], xm1
> +        movd    [dstq+xq+4], xm2
> +
> +        add   xq, mmsize / 4
> +        cmp   xq, widthq
> +        jl .loop
> +    RET
> +
> +INIT_YMM avx2
> +cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x
> +    movsxdifnidn widthq, widthd
> +    xor             xq, xq
> +    movd           xm0, in_linesized

    movd           xm0, in_linesized
%if ARCH_X86_32
DEFINE_ARGS dst, width, src, x, u, v, ker
%endif
    xor             xq, xq

That way this function will work on x86_32. Otherwise, seeing you didn't
add a preprocessor wrapper, assembly will fail on those targets.

> +    pcmpeqw         m7, m7
> +    vpbroadcastd    m0, xm0
> +    movd           xm6, [pd_255]
> +    vpbroadcastd    m6, xm6
> +    VBROADCASTI128  m5, [pb_mask]
> +
> +    .loop:
> +        pmovsxwd   m1, [kerq + xq * 8]
> +        pmovsxwd   m2, [vq + xq * 8]
> +        pmovsxwd   m3, [uq + xq * 8]
> +
> +        pmulld          m4, m2, m0
> +        paddd           m4, m3
> +        mova            m3, m7
> +        vpgatherdd      m2, [srcq + m4], m3
> +        pand            m2, m6
> +        pmulld          m2, m1

Same for these two pmulld.

> +        phaddd          m2, m2
> +        phaddd          m1, m2, m2
> +        psrld           m1, m1, 0xd
> +        pshufb          m1, m1, m5
> +        vextracti128   xm2, m1, 1
> +
> +        pextrb    [dstq+xq], xm1, 0
> +        pextrb  [dstq+xq+1], xm2, 0
> +
> +        add   xq, mmsize / 16
> +        cmp   xq, widthq
> +        jl .loop
> +    RET
> +%endif
> diff --git a/libavfilter/x86/vf_v360_init.c b/libavfilter/x86/vf_v360_init.c
> new file mode 100644
> index 0000000000..b781fb13d1
> --- /dev/null
> +++ b/libavfilter/x86/vf_v360_init.c
> @@ -0,0 +1,43 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/v360.h"
> +
> +void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
> +                              const uint16_t *u, const uint16_t *v, const int16_t *ker);
> +
> +void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
> +                              const uint16_t *u, const uint16_t *v, const int16_t *ker);
> +
> +av_cold void ff_v360_init_x86(V360Context *s, int depth)
> +{
> +#if ARCH_X86_64

You can remove this after adding the above.

> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == NEAREST && depth <= 8)
> +        s->remap_line = ff_remap1_8bit_line_avx2;
> +
> +    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == BILINEAR && depth <= 8)
> +        s->remap_line = ff_remap2_8bit_line_avx2;
> +#endif
> +}
Paul B Mahol Sept. 5, 2019, 4:57 p.m. UTC | #2
On 9/5/19, James Almer <jamrial@gmail.com> wrote:
> On 9/5/2019 5:52 AM, Paul B Mahol wrote:
>> diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm
>> new file mode 100644
>> index 0000000000..e1efe2e3a3
>> --- /dev/null
>> +++ b/libavfilter/x86/vf_v360.asm
>> @@ -0,0 +1,98 @@
>> +;*****************************************************************************
>> +;* x86-optimized functions for v360 filter
>> +;*
>> +;* This file is part of FFmpeg.
>> +;*
>> +;* FFmpeg is free software; you can redistribute it and/or
>> +;* modify it under the terms of the GNU Lesser General Public
>> +;* License as published by the Free Software Foundation; either
>> +;* version 2.1 of the License, or (at your option) any later version.
>> +;*
>> +;* FFmpeg is distributed in the hope that it will be useful,
>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +;* Lesser General Public License for more details.
>> +;*
>> +;* You should have received a copy of the GNU Lesser General Public
>> +;* License along with FFmpeg; if not, write to the Free Software
>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> +;******************************************************************************
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +SECTION_RODATA
>> +
>> +ALIGN 32
>
> No need for 32 byte alignment. And if there was, you should do
> SECTION_RODATA 32 instead.
Removed.

>
>> +
>> +pb_mask: db 0,4,8,12,5,5,5,5,5,5,5,5,5,5,5,5
>> +pd_255: times 4 dd 255
>> +
>> +SECTION .text
>> +
>> +; void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t
>> *src, ptrdiff_t in_linesize,
>> +;                               const uint16_t *u, const uint16_t *v,
>> const int16_t *ker);
>> +
>> +%if HAVE_AVX2_EXTERNAL
>> +INIT_YMM avx2
>> +cglobal remap1_8bit_line, 6, 7, 7, dst, width, src, in_linesize, u, v, x
>> +    movsxdifnidn widthq, widthd
>> +    xor             xq, xq
>> +    movd           xm0, in_linesized
>> +    pcmpeqw         m4, m4
>> +    VBROADCASTI128  m6, [pb_mask]
>> +    vpbroadcastd    m0, xm0
>> +
>> +    .loop:
>> +        pmovsxwd   m1, [vq + xq * 2]
>> +        pmovsxwd   m2, [uq + xq * 2]
>> +
>> +        pmulld           m1, m0
>
> Use pmaddwd as Hendrik suggested.

I cannot use it; linesize is an int.

>
>> +        paddd            m1, m2
>> +        mova             m2, m4
>> +        vpgatherdd       m5, [srcq + m1], m2
>> +        pshufb           m1, m5, m6
>> +        vextracti128    xm2, m1, 1
>> +        movd      [dstq+xq], xm1
>> +        movd    [dstq+xq+4], xm2
>> +
>> +        add   xq, mmsize / 4
>> +        cmp   xq, widthq
>> +        jl .loop
>> +    RET
>> +
>> +INIT_YMM avx2
>> +cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v,
>> ker, x
>> +    movsxdifnidn widthq, widthd
>> +    xor             xq, xq
>> +    movd           xm0, in_linesized
>
>     movd           xm0, in_linesized
> %if ARCH_X86_32
> DEFINE_ARGS dst, width, src, x, u, v, ker
> %endif
>     xor             xq, xq
>
> That way this function will work on x86_32. Otherwise, seeing you didn't
> add a preprocessor wrapper, assembly will fail on those targets.

I will leave x86_32 to those who care.

>
>> +    pcmpeqw         m7, m7
>> +    vpbroadcastd    m0, xm0
>> +    movd           xm6, [pd_255]
>> +    vpbroadcastd    m6, xm6
>> +    VBROADCASTI128  m5, [pb_mask]
>> +
>> +    .loop:
>> +        pmovsxwd   m1, [kerq + xq * 8]
>> +        pmovsxwd   m2, [vq + xq * 8]
>> +        pmovsxwd   m3, [uq + xq * 8]
>> +
>> +        pmulld          m4, m2, m0
>> +        paddd           m4, m3
>> +        mova            m3, m7
>> +        vpgatherdd      m2, [srcq + m4], m3
>> +        pand            m2, m6
>> +        pmulld          m2, m1
>
> Same for these two pmulld.

Already explained.

>
>> +        phaddd          m2, m2
>> +        phaddd          m1, m2, m2
>> +        psrld           m1, m1, 0xd
>> +        pshufb          m1, m1, m5
>> +        vextracti128   xm2, m1, 1
>> +
>> +        pextrb    [dstq+xq], xm1, 0
>> +        pextrb  [dstq+xq+1], xm2, 0
>> +
>> +        add   xq, mmsize / 16
>> +        cmp   xq, widthq
>> +        jl .loop
>> +    RET
>> +%endif
>> diff --git a/libavfilter/x86/vf_v360_init.c
>> b/libavfilter/x86/vf_v360_init.c
>> new file mode 100644
>> index 0000000000..b781fb13d1
>> --- /dev/null
>> +++ b/libavfilter/x86/vf_v360_init.c
>> @@ -0,0 +1,43 @@
>> +/*
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> + */
>> +
>> +#include "config.h"
>> +
>> +#include "libavutil/attributes.h"
>> +#include "libavutil/cpu.h"
>> +#include "libavutil/x86/cpu.h"
>> +#include "libavfilter/v360.h"
>> +
>> +void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t
>> *src, ptrdiff_t in_linesize,
>> +                              const uint16_t *u, const uint16_t *v, const
>> int16_t *ker);
>> +
>> +void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t
>> *src, ptrdiff_t in_linesize,
>> +                              const uint16_t *u, const uint16_t *v, const
>> int16_t *ker);
>> +
>> +av_cold void ff_v360_init_x86(V360Context *s, int depth)
>> +{
>> +#if ARCH_X86_64
>
> You can remove this after adding the above.

Already explained.

>
>> +    int cpu_flags = av_get_cpu_flags();
>> +
>> +    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == NEAREST && depth <=
>> 8)
>> +        s->remap_line = ff_remap1_8bit_line_avx2;
>> +
>> +    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == BILINEAR && depth
>> <= 8)
>> +        s->remap_line = ff_remap2_8bit_line_avx2;
>> +#endif
>> +}
>

I plan to apply this soon.

>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
James Almer Sept. 5, 2019, 5:17 p.m. UTC | #3
On 9/5/2019 1:57 PM, Paul B Mahol wrote:
> On 9/5/19, James Almer <jamrial@gmail.com> wrote:
>> On 9/5/2019 5:52 AM, Paul B Mahol wrote:
>>> diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm
>>> new file mode 100644
>>> index 0000000000..e1efe2e3a3
>>> --- /dev/null
>>> +++ b/libavfilter/x86/vf_v360.asm
>>> @@ -0,0 +1,98 @@
>>> +;*****************************************************************************
>>> +;* x86-optimized functions for v360 filter
>>> +;*
>>> +;* This file is part of FFmpeg.
>>> +;*
>>> +;* FFmpeg is free software; you can redistribute it and/or
>>> +;* modify it under the terms of the GNU Lesser General Public
>>> +;* License as published by the Free Software Foundation; either
>>> +;* version 2.1 of the License, or (at your option) any later version.
>>> +;*
>>> +;* FFmpeg is distributed in the hope that it will be useful,
>>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +;* Lesser General Public License for more details.
>>> +;*
>>> +;* You should have received a copy of the GNU Lesser General Public
>>> +;* License along with FFmpeg; if not, write to the Free Software
>>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>>> 02110-1301 USA
>>> +;******************************************************************************
>>> +
>>> +%include "libavutil/x86/x86util.asm"
>>> +
>>> +SECTION_RODATA
>>> +
>>> +ALIGN 32
>>
>> No need for 32 byte alignment. And if there was, you should do
>> SECTION_RODATA 32 instead.
> Removed.
> 
>>
>>> +
>>> +pb_mask: db 0,4,8,12,5,5,5,5,5,5,5,5,5,5,5,5
>>> +pd_255: times 4 dd 255
>>> +
>>> +SECTION .text
>>> +
>>> +; void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t
>>> *src, ptrdiff_t in_linesize,
>>> +;                               const uint16_t *u, const uint16_t *v,
>>> const int16_t *ker);
>>> +
>>> +%if HAVE_AVX2_EXTERNAL
>>> +INIT_YMM avx2
>>> +cglobal remap1_8bit_line, 6, 7, 7, dst, width, src, in_linesize, u, v, x
>>> +    movsxdifnidn widthq, widthd
>>> +    xor             xq, xq
>>> +    movd           xm0, in_linesized
>>> +    pcmpeqw         m4, m4
>>> +    VBROADCASTI128  m6, [pb_mask]
>>> +    vpbroadcastd    m0, xm0
>>> +
>>> +    .loop:
>>> +        pmovsxwd   m1, [vq + xq * 2]
>>> +        pmovsxwd   m2, [uq + xq * 2]
>>> +
>>> +        pmulld           m1, m0
>>
>> Use pmaddwd as Hendrik suggested.
> 
> I cannot use it; linesize is an int.

It is an int, but as long as its value fits in a signed word, it should
work with no extra changes.

> 
>>
>>> +        paddd            m1, m2
>>> +        mova             m2, m4
>>> +        vpgatherdd       m5, [srcq + m1], m2
>>> +        pshufb           m1, m5, m6
>>> +        vextracti128    xm2, m1, 1
>>> +        movd      [dstq+xq], xm1
>>> +        movd    [dstq+xq+4], xm2
>>> +
>>> +        add   xq, mmsize / 4
>>> +        cmp   xq, widthq
>>> +        jl .loop
>>> +    RET
>>> +
>>> +INIT_YMM avx2
>>> +cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v,
>>> ker, x
>>> +    movsxdifnidn widthq, widthd
>>> +    xor             xq, xq
>>> +    movd           xm0, in_linesized
>>
>>     movd           xm0, in_linesized
>> %if ARCH_X86_32
>> DEFINE_ARGS dst, width, src, x, u, v, ker
>> %endif
>>     xor             xq, xq
>>
>> That way this function will work on x86_32. Otherwise, seeing you didn't
>> add a preprocessor wrapper, assembly will fail on those targets.
> 
> I will leave x86_32 to those who care.

I literally just gave you the code to copy and paste. And if you don't
want to paste those lines, you'll need to wrap this entire file in an
x86_64 check to prevent breaking x86_32 builds.

> 
>>
>>> +    pcmpeqw         m7, m7
>>> +    vpbroadcastd    m0, xm0
>>> +    movd           xm6, [pd_255]
>>> +    vpbroadcastd    m6, xm6
>>> +    VBROADCASTI128  m5, [pb_mask]
>>> +
>>> +    .loop:
>>> +        pmovsxwd   m1, [kerq + xq * 8]
>>> +        pmovsxwd   m2, [vq + xq * 8]
>>> +        pmovsxwd   m3, [uq + xq * 8]
>>> +
>>> +        pmulld          m4, m2, m0
>>> +        paddd           m4, m3
>>> +        mova            m3, m7
>>> +        vpgatherdd      m2, [srcq + m4], m3
>>> +        pand            m2, m6
>>> +        pmulld          m2, m1
>>
>> Same for these two pmulld.
> 
> Already explained.
> 
>>
>>> +        phaddd          m2, m2
>>> +        phaddd          m1, m2, m2
>>> +        psrld           m1, m1, 0xd
>>> +        pshufb          m1, m1, m5
>>> +        vextracti128   xm2, m1, 1
>>> +
>>> +        pextrb    [dstq+xq], xm1, 0
>>> +        pextrb  [dstq+xq+1], xm2, 0
>>> +
>>> +        add   xq, mmsize / 16
>>> +        cmp   xq, widthq
>>> +        jl .loop
>>> +    RET
>>> +%endif
>>> diff --git a/libavfilter/x86/vf_v360_init.c
>>> b/libavfilter/x86/vf_v360_init.c
>>> new file mode 100644
>>> index 0000000000..b781fb13d1
>>> --- /dev/null
>>> +++ b/libavfilter/x86/vf_v360_init.c
>>> @@ -0,0 +1,43 @@
>>> +/*
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>>> 02110-1301 USA
>>> + */
>>> +
>>> +#include "config.h"
>>> +
>>> +#include "libavutil/attributes.h"
>>> +#include "libavutil/cpu.h"
>>> +#include "libavutil/x86/cpu.h"
>>> +#include "libavfilter/v360.h"
>>> +
>>> +void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t
>>> *src, ptrdiff_t in_linesize,
>>> +                              const uint16_t *u, const uint16_t *v, const
>>> int16_t *ker);
>>> +
>>> +void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t
>>> *src, ptrdiff_t in_linesize,
>>> +                              const uint16_t *u, const uint16_t *v, const
>>> int16_t *ker);
>>> +
>>> +av_cold void ff_v360_init_x86(V360Context *s, int depth)
>>> +{
>>> +#if ARCH_X86_64
>>
>> You can remove this after adding the above.
> 
> Already explained.
> 
>>
>>> +    int cpu_flags = av_get_cpu_flags();
>>> +
>>> +    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == NEAREST && depth <=
>>> 8)
>>> +        s->remap_line = ff_remap1_8bit_line_avx2;
>>> +
>>> +    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == BILINEAR && depth
>>> <= 8)
>>> +        s->remap_line = ff_remap2_8bit_line_avx2;
>>> +#endif
>>> +}
>>
> 
> I plan to apply this soon.
> 
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox

Patch

diff --git a/libavfilter/v360.h b/libavfilter/v360.h
new file mode 100644
index 0000000000..a0eefdec16
--- /dev/null
+++ b/libavfilter/v360.h
@@ -0,0 +1,113 @@ 
+/*
+ * Copyright (c) 2019 Eugene Lyapustin
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_V360_H
+#define AVFILTER_V360_H
+#include "avfilter.h"
+
+enum Projections {
+    EQUIRECTANGULAR,
+    CUBEMAP_3_2,
+    CUBEMAP_6_1,
+    EQUIANGULAR,
+    FLAT,
+    DUAL_FISHEYE,
+    BARREL,
+    CUBEMAP_1_6,
+    NB_PROJECTIONS,
+};
+
+enum InterpMethod {
+    NEAREST,
+    BILINEAR,
+    BICUBIC,
+    LANCZOS,
+    NB_INTERP_METHODS,
+};
+
+enum Faces {
+    TOP_LEFT,
+    TOP_MIDDLE,
+    TOP_RIGHT,
+    BOTTOM_LEFT,
+    BOTTOM_MIDDLE,
+    BOTTOM_RIGHT,
+    NB_FACES,
+};
+
+enum Direction {
+    RIGHT,  ///< Axis +X
+    LEFT,   ///< Axis -X
+    UP,     ///< Axis +Y
+    DOWN,   ///< Axis -Y
+    FRONT,  ///< Axis -Z
+    BACK,   ///< Axis +Z
+    NB_DIRECTIONS,
+};
+
+enum Rotation {
+    ROT_0,
+    ROT_90,
+    ROT_180,
+    ROT_270,
+    NB_ROTATIONS,
+};
+
+typedef struct V360Context {
+    const AVClass *class;
+    int in, out;
+    int interp;
+    int width, height;
+    char* in_forder;
+    char* out_forder;
+    char* in_frot;
+    char* out_frot;
+
+    int in_cubemap_face_order[6];
+    int out_cubemap_direction_order[6];
+    int in_cubemap_face_rotation[6];
+    int out_cubemap_face_rotation[6];
+
+    float in_pad, out_pad;
+
+    float yaw, pitch, roll;
+
+    int h_flip, v_flip, d_flip;
+
+    float h_fov, v_fov;
+    float flat_range[3];
+
+    int planewidth[4], planeheight[4];
+    int inplanewidth[4], inplaneheight[4];
+    int nb_planes;
+
+    uint16_t *u[4], *v[4];
+    int16_t *ker[4];
+
+    int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
+
+    void (*remap_line)(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+                       const uint16_t *u, const uint16_t *v, const int16_t *ker);
+} V360Context;
+
+void ff_v360_init(V360Context *s, int depth);
+void ff_v360_init_x86(V360Context *s, int depth);
+
+#endif /* AVFILTER_V360_H */
diff --git a/libavfilter/vf_v360.c b/libavfilter/vf_v360.c
index fc120097d9..e69aa7e8c5 100644
--- a/libavfilter/vf_v360.c
+++ b/libavfilter/vf_v360.c
@@ -41,88 +41,7 @@ 
 #include "formats.h"
 #include "internal.h"
 #include "video.h"
-
-enum Projections {
-    EQUIRECTANGULAR,
-    CUBEMAP_3_2,
-    CUBEMAP_6_1,
-    EQUIANGULAR,
-    FLAT,
-    DUAL_FISHEYE,
-    BARREL,
-    CUBEMAP_1_6,
-    NB_PROJECTIONS,
-};
-
-enum InterpMethod {
-    NEAREST,
-    BILINEAR,
-    BICUBIC,
-    LANCZOS,
-    NB_INTERP_METHODS,
-};
-
-enum Faces {
-    TOP_LEFT,
-    TOP_MIDDLE,
-    TOP_RIGHT,
-    BOTTOM_LEFT,
-    BOTTOM_MIDDLE,
-    BOTTOM_RIGHT,
-    NB_FACES,
-};
-
-enum Direction {
-    RIGHT,  ///< Axis +X
-    LEFT,   ///< Axis -X
-    UP,     ///< Axis +Y
-    DOWN,   ///< Axis -Y
-    FRONT,  ///< Axis -Z
-    BACK,   ///< Axis +Z
-    NB_DIRECTIONS,
-};
-
-enum Rotation {
-    ROT_0,
-    ROT_90,
-    ROT_180,
-    ROT_270,
-    NB_ROTATIONS,
-};
-
-typedef struct V360Context {
-    const AVClass *class;
-    int in, out;
-    int interp;
-    int width, height;
-    char* in_forder;
-    char* out_forder;
-    char* in_frot;
-    char* out_frot;
-
-    int in_cubemap_face_order[6];
-    int out_cubemap_direction_order[6];
-    int in_cubemap_face_rotation[6];
-    int out_cubemap_face_rotation[6];
-
-    float in_pad, out_pad;
-
-    float yaw, pitch, roll;
-
-    int h_flip, v_flip, d_flip;
-
-    float h_fov, v_fov;
-    float flat_range[3];
-
-    int planewidth[4], planeheight[4];
-    int inplanewidth[4], inplaneheight[4];
-    int nb_planes;
-
-    uint16_t *u[4], *v[4];
-    int16_t *ker[4];
-
-    int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-} V360Context;
+#include "v360.h"
 
 typedef struct ThreadData {
     AVFrame *in;
@@ -251,47 +170,26 @@  static int query_formats(AVFilterContext *ctx)
     return ff_set_common_formats(ctx, fmts_list);
 }
 
-/**
- * Generate no-interpolation remapping function with a given pixel depth.
- *
- * @param bits number of bits per pixel
- * @param div number of bytes per pixel
- */
-#define DEFINE_REMAP1(bits, div)                                                             \
-static int remap1_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
-{                                                                                            \
-    ThreadData *td = (ThreadData*)arg;                                                       \
-    const V360Context *s = ctx->priv;                                                        \
-    const AVFrame *in = td->in;                                                              \
-    AVFrame *out = td->out;                                                                  \
-                                                                                             \
-    int plane, x, y;                                                                         \
-                                                                                             \
-    for (plane = 0; plane < s->nb_planes; plane++) {                                         \
-        const int in_linesize  = in->linesize[plane]  / div;                                 \
-        const int out_linesize = out->linesize[plane] / div;                                 \
-        const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane];                 \
-        uint##bits##_t *dst = (uint##bits##_t *)out->data[plane];                            \
-        const int width = s->planewidth[plane];                                              \
-        const int height = s->planeheight[plane];                                            \
-                                                                                             \
-        const int slice_start = (height *  jobnr     ) / nb_jobs;                            \
-        const int slice_end   = (height * (jobnr + 1)) / nb_jobs;                            \
-                                                                                             \
-        for (y = slice_start; y < slice_end; y++) {                                          \
-            const uint16_t *u = s->u[plane] + y * width;                                     \
-            const uint16_t *v = s->v[plane] + y * width;                                     \
-            uint##bits##_t *d = dst + y * out_linesize;                                      \
-            for (x = 0; x < width; x++)                                                      \
-                *d++ = src[v[x] * in_linesize + u[x]];                                       \
-        }                                                                                    \
-    }                                                                                        \
-                                                                                             \
-    return 0;                                                                                \
+#define DEFINE_REMAP1_LINE(bits, div)                                                                \
+static void remap1_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src,              \
+                                      ptrdiff_t in_linesize,                                    \
+                                      const uint16_t *u, const uint16_t *v, const int16_t *ker) \
+{                                                                                                    \
+    const uint##bits##_t *s = (const uint##bits##_t *)src;                                           \
+    uint##bits##_t *d = (uint##bits##_t *)dst;                                                       \
+                                                                                                     \
+    in_linesize /= div;                                                                              \
+                                                                                                     \
+    for (int x = 0; x < width; x++) {                                                                \
+        const uint16_t *uu = u + x;                                                                  \
+        const uint16_t *vv = v + x;                                                                  \
+                                                                                                     \
+        d[x] = s[vv[0] * in_linesize + uu[0]];                                                       \
+    }                                                                                                \
 }
 
-DEFINE_REMAP1( 8, 1)
-DEFINE_REMAP1(16, 2)
+DEFINE_REMAP1_LINE( 8, 1)
+DEFINE_REMAP1_LINE(16, 2)
 
 typedef struct XYRemap {
     uint16_t u[4][4];
@@ -304,9 +202,8 @@  typedef struct XYRemap {
  *
  * @param ws size of interpolation window
  * @param bits number of bits per pixel
- * @param div number of bytes per pixel
  */
-#define DEFINE_REMAP(ws, bits, div)                                                                        \
+#define DEFINE_REMAP(ws, bits)                                                                             \
 static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)          \
 {                                                                                                          \
     ThreadData *td = (ThreadData*)arg;                                                                     \
@@ -314,48 +211,87 @@  static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jo
     const AVFrame *in = td->in;                                                                            \
     AVFrame *out = td->out;                                                                                \
                                                                                                            \
-    int plane, x, y, i, j;                                                                                 \
-                                                                                                           \
-    for (plane = 0; plane < s->nb_planes; plane++) {                                                       \
-        const int in_linesize  = in->linesize[plane]  / div;                                               \
-        const int out_linesize = out->linesize[plane] / div;                                               \
-        const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane];                               \
-        uint##bits##_t *dst = (uint##bits##_t *)out->data[plane];                                          \
+    for (int plane = 0; plane < s->nb_planes; plane++) {                                                   \
+        const int in_linesize  = in->linesize[plane];                                                      \
+        const int out_linesize = out->linesize[plane];                                                     \
+        const uint8_t *src = in->data[plane];                                                              \
+        uint8_t *dst = out->data[plane];                                                                   \
         const int width = s->planewidth[plane];                                                            \
         const int height = s->planeheight[plane];                                                          \
                                                                                                            \
         const int slice_start = (height *  jobnr     ) / nb_jobs;                                          \
         const int slice_end   = (height * (jobnr + 1)) / nb_jobs;                                          \
                                                                                                            \
-        for (y = slice_start; y < slice_end; y++) {                                                        \
-            uint##bits##_t *d = dst + y * out_linesize;                                                    \
+        for (int y = slice_start; y < slice_end; y++) {                                                    \
             const uint16_t *u = s->u[plane] + y * width * ws * ws;                                         \
             const uint16_t *v = s->v[plane] + y * width * ws * ws;                                         \
             const int16_t *ker = s->ker[plane] + y * width * ws * ws;                                      \
-            for (x = 0; x < width; x++) {                                                                  \
-                const uint16_t *uu = u + x * ws * ws;                                                      \
-                const uint16_t *vv = v + x * ws * ws;                                                      \
-                const int16_t *kker = ker + x * ws * ws;                                                   \
-                int tmp = 0;                                                                               \
-                                                                                                           \
-                for (i = 0; i < ws; i++) {                                                                 \
-                    for (j = 0; j < ws; j++) {                                                             \
-                        tmp += kker[i * ws + j] * src[vv[i * ws + j] * in_linesize + uu[i * ws + j]];      \
-                    }                                                                                      \
-                }                                                                                          \
                                                                                                            \
-                *d++ = av_clip_uint##bits(tmp >> (15 - ws));                                               \
-            }                                                                                              \
+            s->remap_line(dst + y * out_linesize, width, src, in_linesize, u, v, ker);                     \
         }                                                                                                  \
     }                                                                                                      \
                                                                                                            \
     return 0;                                                                                              \
 }
 
-DEFINE_REMAP(2,  8, 1)
-DEFINE_REMAP(4,  8, 1)
-DEFINE_REMAP(2, 16, 2)
-DEFINE_REMAP(4, 16, 2)
+DEFINE_REMAP(1,  8)
+DEFINE_REMAP(2,  8)
+DEFINE_REMAP(4,  8)
+DEFINE_REMAP(1, 16)
+DEFINE_REMAP(2, 16)
+DEFINE_REMAP(4, 16)
+
+/**
+ * Generate the C routine that remaps one output row using a ws x ws window.
+ *
+ * For every output pixel x the generated function sums
+ * ker[k] * src[v[k] * stride + u[k]] over the ws * ws entries stored for x,
+ * shifts the sum right by (15 - ws) and clips it to the range of an
+ * unsigned bits-wide integer. stride is in_linesize divided by div.
+ *
+ * @param ws size of interpolation window
+ * @param bits number of bits per pixel
+ * @param div number of bytes per pixel
+ */
+#define DEFINE_REMAP_LINE(ws, bits, div)                                                                   \
+static void remap##ws##_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src,                    \
+                                           ptrdiff_t in_linesize,                                          \
+                                           const uint16_t *u, const uint16_t *v, const int16_t *ker)       \
+{                                                                                                          \
+    const uint##bits##_t *in = (const uint##bits##_t *)src;                                                \
+    uint##bits##_t *out = (uint##bits##_t *)dst;                                                           \
+    const ptrdiff_t stride = in_linesize / div;                                                            \
+    const int taps = ws * ws;                                                                              \
+                                                                                                           \
+    for (int x = 0; x < width; x++) {                                                                      \
+        const int base = x * taps;                                                                         \
+        int acc = 0;                                                                                       \
+                                                                                                           \
+        for (int k = 0; k < taps; k++)                                                                     \
+            acc += ker[base + k] * in[v[base + k] * stride + u[base + k]];                                 \
+                                                                                                           \
+        out[x] = av_clip_uint##bits(acc >> (15 - ws));                                                     \
+    }                                                                                                      \
+}
+
+DEFINE_REMAP_LINE(2,  8, 1)
+DEFINE_REMAP_LINE(4,  8, 1)
+DEFINE_REMAP_LINE(2, 16, 2)
+DEFINE_REMAP_LINE(4, 16, 2)
+
+/**
+ * Select the per-line remap routine for the configured interpolation.
+ *
+ * The C implementation matching s->interp and the bit depth is stored in
+ * s->remap_line; ff_v360_init_x86() is then given the chance to replace it.
+ * An interp value not listed in the switch leaves s->remap_line untouched.
+ *
+ * @param s     filter context whose remap_line pointer is set
+ * @param depth bit depth; values up to 8 select the 8-bit routines,
+ *              anything larger the 16-bit ones
+ */
+void ff_v360_init(V360Context *s, int depth)
+{
+    const int is_8bit = depth <= 8;
+
+    switch (s->interp) {
+    case NEAREST:
+        s->remap_line = is_8bit ? remap1_8bit_line_c : remap1_16bit_line_c;
+        break;
+    case BILINEAR:
+        s->remap_line = is_8bit ? remap2_8bit_line_c : remap2_16bit_line_c;
+        break;
+    case BICUBIC:
+    case LANCZOS:
+        /* both are served by the 4x4 window routines */
+        s->remap_line = is_8bit ? remap4_8bit_line_c : remap4_16bit_line_c;
+        break;
+    }
+
+    if (ARCH_X86_64)
+        ff_v360_init_x86(s, depth);
+}
 
 /**
  * Save nearest pixel coordinates for remapping.
@@ -2038,6 +1974,8 @@  static int config_output(AVFilterLink *outlink)
         av_assert0(0);
     }
 
+    ff_v360_init(s, depth);
+
     switch (s->in) {
     case EQUIRECTANGULAR:
         in_transform = xyz_to_equirect;
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 8dc0b0e6d4..f12993e606 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -31,6 +31,7 @@  OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
 OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
 OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
+OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
 OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 
@@ -66,5 +67,6 @@  X86ASM-OBJS-$(CONFIG_TBLEND_FILTER)          += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER)       += x86/vf_threshold.o
 X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += x86/vf_interlace.o
 X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
+X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
 X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
 X86ASM-OBJS-$(CONFIG_YADIF_FILTER)           += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm
new file mode 100644
index 0000000000..e1efe2e3a3
--- /dev/null
+++ b/libavfilter/x86/vf_v360.asm
@@ -0,0 +1,98 @@ 
+;*****************************************************************************
+;* x86-optimized functions for v360 filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ALIGN 32
+
+pb_mask: db 0,4,8,12,5,5,5,5,5,5,5,5,5,5,5,5
+pd_255: times 4 dd 255
+
+SECTION .text
+
+; Both kernels below share this prototype (N = 1 for nearest, N = 2 for bilinear):
+; void ff_remapN_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+;                               const uint16_t *u, const uint16_t *v, const int16_t *ker);
+
+%if HAVE_AVX2_EXTERNAL
+;------------------------------------------------------------------------------
+; remap1_8bit_line: remap one row of 8-bit pixels, one source sample per pixel.
+;   for each x:  dst[x] = src[v[x] * in_linesize + u[x]]
+; Eight pixels are produced per loop iteration; ker is not used.
+;
+; u and v hold unsigned 16-bit coordinates, so they are widened with
+; zero-extension; sign-extension would turn values >= 32768 into negative
+; gather offsets.
+;
+; NOTE(review): the loop always advances by 8, so when width is not a multiple
+; of 8 it reads u/v and writes dst beyond `width`. Each gather lane also loads
+; 4 bytes from src although only the lowest byte is kept. Presumably the
+; caller's buffers are padded accordingly -- confirm.
+;------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal remap1_8bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x
+    movsxdifnidn widthq, widthd
+    xor             xq, xq
+    movd           xm0, in_linesized
+    pcmpeqw         m4, m4                  ; all-ones template for the gather mask
+    VBROADCASTI128  m3, [pb_mask]           ; moves byte 0 of every dword to bytes 0..3
+    vpbroadcastd    m0, xm0                 ; m0 = in_linesize in every dword
+
+    .loop:
+        pmovzxwd   m1, [vq + xq * 2]        ; 8 x uint16 v -> 8 x dword
+        pmovzxwd   m2, [uq + xq * 2]        ; 8 x uint16 u -> 8 x dword
+
+        pmulld           m1, m0
+        paddd            m1, m2             ; m1 = v * in_linesize + u
+        mova             m2, m4             ; the gather clears its mask, refresh it
+        vpgatherdd       m5, [srcq + m1], m2
+        pshufb           m1, m5, m3         ; pack the low bytes within each 128-bit lane
+        vextracti128    xm2, m1, 1
+        movd      [dstq+xq], xm1            ; pixels x   .. x+3
+        movd    [dstq+xq+4], xm2            ; pixels x+4 .. x+7
+
+        add   xq, mmsize / 4
+        cmp   xq, widthq
+        jl .loop
+    RET
+
+;------------------------------------------------------------------------------
+; remap2_8bit_line: remap one row of 8-bit pixels with a 2x2 window.
+;   for each x:  dst[x] = (sum over k = 0..3 of
+;                          ker[4*x+k] * src[v[4*x+k] * in_linesize + u[4*x+k]]) >> 13
+; Two pixels are produced per loop iteration, one per 128-bit lane.
+;
+; u and v hold unsigned 16-bit coordinates and are zero-extended; ker is
+; signed and is sign-extended.
+;
+; The function needs 8 general purpose registers, which x86inc only provides
+; on x86-64, hence the ARCH_X86_64 guard.
+;
+; NOTE(review): only the low byte of the shifted sum is stored, there is no
+; clipping as in the C version. The loop advances by 2, so an odd width reads
+; and writes one pixel beyond `width`, and each gather lane loads 4 bytes from
+; src. Presumably weights and buffer padding make this safe -- confirm.
+;------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x
+    movsxdifnidn widthq, widthd
+    xor             xq, xq
+    movd           xm0, in_linesized
+    pcmpeqw         m7, m7                  ; all-ones template for the gather mask
+    vpbroadcastd    m0, xm0                 ; m0 = in_linesize in every dword
+    vpbroadcastd    m6, [pd_255]            ; keeps the low byte of each gathered dword
+    VBROADCASTI128  m5, [pb_mask]
+
+    .loop:
+        pmovsxwd   m1, [kerq + xq * 8]      ; 8 x int16 weights (2 pixels x 4 taps)
+        pmovzxwd   m2, [vq + xq * 8]        ; 8 x uint16 v
+        pmovzxwd   m3, [uq + xq * 8]        ; 8 x uint16 u
+
+        pmulld          m4, m2, m0
+        paddd           m4, m3              ; m4 = v * in_linesize + u
+        mova            m3, m7              ; the gather clears its mask, refresh it
+        vpgatherdd      m2, [srcq + m4], m3
+        pand            m2, m6              ; isolate the 8-bit sample
+        pmulld          m2, m1              ; sample * weight
+        phaddd          m2, m2
+        phaddd          m1, m2, m2          ; every dword of a lane = sum of its 4 taps
+        psrld           m1, m1, 0xd         ; >> 13, i.e. 15 - ws with ws = 2
+        pshufb          m1, m1, m5
+        vextracti128   xm2, m1, 1
+
+        pextrb    [dstq+xq], xm1, 0         ; pixel x
+        pextrb  [dstq+xq+1], xm2, 0         ; pixel x+1
+
+        add   xq, mmsize / 16
+        cmp   xq, widthq
+        jl .loop
+    RET
+%endif
+%endif
diff --git a/libavfilter/x86/vf_v360_init.c b/libavfilter/x86/vf_v360_init.c
new file mode 100644
index 0000000000..b781fb13d1
--- /dev/null
+++ b/libavfilter/x86/vf_v360_init.c
@@ -0,0 +1,43 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/v360.h"
+
+void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+                              const uint16_t *u, const uint16_t *v, const int16_t *ker);
+
+void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+                              const uint16_t *u, const uint16_t *v, const int16_t *ker);
+
+/**
+ * Install x86 SIMD replacements for the per-line remap routine.
+ *
+ * On x86-64 builds, when EXTERNAL_AVX2_FAST() accepts the CPU flags and the
+ * bit depth is at most 8, s->remap_line is replaced for NEAREST and BILINEAR
+ * interpolation. In every other case s->remap_line is left unchanged.
+ *
+ * @param s     filter context whose remap_line pointer may be replaced
+ * @param depth bit depth of the processed format
+ */
+av_cold void ff_v360_init_x86(V360Context *s, int depth)
+{
+#if ARCH_X86_64
+    const int cpu_flags = av_get_cpu_flags();
+
+    if (!EXTERNAL_AVX2_FAST(cpu_flags) || depth > 8)
+        return;
+
+    if (s->interp == NEAREST)
+        s->remap_line = ff_remap1_8bit_line_avx2;
+    else if (s->interp == BILINEAR)
+        s->remap_line = ff_remap2_8bit_line_avx2;
+#endif
+}