diff mbox series

[FFmpeg-devel,2/3] swscale/x86/output: add AVX2 version of yuv2nv12cX

Message ID 1587697999-84025-3-git-send-email-negomez@linux.microsoft.com
State Superseded
Headers show
Series swscale: add AVX2 version of yuv2nv12cX | expand

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Nelson Gomez April 24, 2020, 3:13 a.m. UTC
From: Nelson Gomez <nelson.gomez@microsoft.com>

256 bits is just wide enough to fit all the operands needed to vectorize
the software implementation, but AVX2 is needed for some instructions
like 16-to-32 bit vector sign extension.

Output is bit-for-bit identical to C.

Signed-off-by: Nelson Gomez <nelson.gomez@microsoft.com>
---
 libswscale/x86/output.asm | 140 +++++++++++++++++++++++++++++++++++++-
 libswscale/x86/swscale.c  |  24 +++++++
 2 files changed, 163 insertions(+), 1 deletion(-)

Comments

James Almer April 24, 2020, 3:56 a.m. UTC | #1
On 4/24/2020 12:13 AM, Nelson Gomez wrote:
> From: Nelson Gomez <nelson.gomez@microsoft.com>
> 
> 256 bits is just wide enough to fit all the operands needed to vectorize
> the software implementation, but AVX2 is needed to for some instructions
> like 16-to-32 bit vector sign extension.
> 
> Output is bit-for-bit identical to C.
> 
> Signed-off-by: Nelson Gomez <nelson.gomez@microsoft.com>
> ---
>  libswscale/x86/output.asm | 140 +++++++++++++++++++++++++++++++++++++-
>  libswscale/x86/swscale.c  |  24 +++++++
>  2 files changed, 163 insertions(+), 1 deletion(-)
> 
> diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
> index db3e9934f8..7947163cac 100644
> --- a/libswscale/x86/output.asm
> +++ b/libswscale/x86/output.asm
> @@ -2,6 +2,7 @@
>  ;* x86-optimized vertical line scaling functions
>  ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
>  ;*                    Kieran Kunhya <kieran@kunhya.com>
> +;*           (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
>  ;*
>  ;* This file is part of FFmpeg.
>  ;*
> @@ -22,7 +23,7 @@
>  
>  %include "libavutil/x86/x86util.asm"
>  
> -SECTION_RODATA
> +SECTION_RODATA 32
>  
>  minshort:      times 8 dw 0x8000
>  yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
> @@ -37,6 +38,18 @@ pw_32:         times 8 dw 32
>  pw_512:        times 8 dw 512
>  pw_1024:       times 8 dw 1024
>  
> +uint8_min_ymm: times 8 dd 0

Clear the register you need this for using pxor instead.

> +uint8_max_ymm: times 8 dd 255

Call it pd_255, following the same naming scheme as the constants above.

> +yuv2nv12_shuffle_mask: times 2 db 0,  4,  8, 12, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1
> +yuv2nv21_shuffle_mask: times 2 db 4,  0, 12,  8, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1
> +yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7
> +
>  SECTION .text
>  
>  ;-----------------------------------------------------------------------------
> @@ -423,3 +436,128 @@ yuv2plane1_fn  9, 5, 3
>  yuv2plane1_fn 10, 5, 3
>  yuv2plane1_fn 16, 5, 3
>  %endif
> +
> +%undef movsx
> +
> +;-----------------------------------------------------------------------------
> +; AVX2 yuv2nv12cX implementation
> +;
> +; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
> +;                         const int16_t *filter, int filterSize,
> +;                         const int16_t **u, const int16_t **v,
> +;                         uint8_t *dst, int dstWidth)
> +;
> +; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
> +;                         const int16_t *filter, int filterSize,
> +;                         const int16_t **u, const int16_t **v,
> +;                         uint8_t *dst, int dstWidth)
> +;-----------------------------------------------------------------------------
> +
> +%macro yuv2nv12cX_avx2_fn 1
> +%if %1
> +cglobal yuv2nv21cX, 8, 11, 13, 64, \
> +                    format, dither, filter, filterSize, u, v, dst, dstWidth
> +%else
> +cglobal yuv2nv12cX, 8, 11, 13, 64, \
> +                    format, dither, filter, filterSize, u, v, dst, dstWidth
> +%endif

You can pass yuv2nv21 and yuv2nv12 as arguments to the
yuv2nv12cX_avx2_fn macro instead of 0 and 1, and simplify this as

cglobal %1cX, 8, 11, 13, 64, ...

> +
> +    %assign i 0
> +    %rep 8
> +        movzx r8d, byte [ditherq + i]
> +        shl r8d, 12
> +        mov [rsp + i * 8], r8d
> +
> +        movzx r9d, byte [ditherq + ((i + 3) % 8)]
> +        shl r9d, 12
> +        mov [rsp + (i * 8) + 4], r9d
> +
> +        %assign i i+1
> +    %endrep
> +
> +    mova ym0, [rsp]                         ; ditherLo

Use m# instead of ym#. By initializing these functions with INIT_YMM,
m0-m15 become aliases of ym0-ym15.

> +    mova ym1, [rsp + 32]                    ; ditherHi
> +    mova ym9, [uint8_min_ymm]               ; uint8_min dwords

As I said above, pxor xm9, xm9 (which also implicitly clears the higher
16 bytes).

> +    mova ym10, [uint8_max_ymm]              ; uint8_max dwords
> +    mova ym12, [yuv2nv12_permute_mask]      ; permute mask
> +%if %1
> +    mova ym11, [yuv2nv21_shuffle_mask]      ; shuffle_mask (NV21)
> +%else
> +    mova ym11, [yuv2nv12_shuffle_mask]      ; shuffle_mask (NV12)
> +%endif

Can also be simplified as

mova m11, [%1_shuffle_mask]

> +
> +    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth
> +
> +    xor r8q, r8q
> +
> +nv12_outer_%1:
> +    mova ym2, ym0                           ; resultLo
> +    mova ym3, ym1                           ; resultHi
> +    xor r9q, r9q
> +
> +nv12_inner_%1:
> +    movsx r10d, word [filterq + (2 * r9q)]
> +    movd xm4, r10d
> +    vpbroadcastd ym4, xm4                   ; filter
> +
> +    mov tmp1q, [uq + (gprsize * r9q)]
> +    mova xm7, oword [tmp1q + 2 * r8q]
> +
> +    mov tmp2q, [vq + (gprsize * r9q)]
> +    mova xm8, oword [tmp2q + 2 * r8q]
> +
> +    vpunpcklwd xm5, xm7, xm8

Don't add the v prefix to pre-avx instructions. The x86inc magic will
ensure they are expanded to the corresponding VEX encoded version.
You only need it for AVX or newer instructions, like vperm and vpbroadcastd.

> +    vpmovsxwd ym5, xm5                      ; multiplicandsLo
> +    vpunpckhwd xm6, xm7, xm8
> +    vpmovsxwd ym6, xm6                      ; multiplicandsHi
> +
> +    vpmulld ym7, ym5, ym4                   ; mulResultLo
> +    vpmulld ym8, ym6, ym4                   ; mulResultHi
> +    vpaddd ym2, ym2, ym7                    ; resultLo += mulResultLo
> +    vpaddd ym3, ym3, ym8                    ; resultHi += mulResultHi
> +
> +    inc r9d
> +    cmp r9d, filterSized
> +    jl nv12_inner_%1
> +    ; end of inner loop
> +
> +    vpsrad ym2, ym2, 19
> +    vpsrad ym3, ym3, 19
> +
> +    ; Vectorized av_clip_uint8
> +    vpmaxsd ym2, ym2, ym9
> +    vpmaxsd ym3, ym3, ym9
> +    vpminsd ym2, ym2, ym10
> +    vpminsd ym3, ym3, ym10
> +
> +    ; At this point we have clamped uint8s arranged in this order:
> +    ;     ym2: u1  0  0  0  v1  0  0  0  [...]
> +    ;     ym3: u5  0  0  0  v5  0  0  0  [...]
> +    ;
> +    ; First, we shuffle the bytes to make the bytes semi-contiguous.
> +    ; AVX-2 doesn't have cross-lane shuffling, so we'll end up with:
> +    ;     ym2: u1  v1  u2  v2  0  0  0  0  0  0  0  0  u3  v3  u4  v4
> +    ;     ym3: u5  v5  u6  v6  0  0  0  0  0  0  0  0  u7  v7  u8  v8
> +    vpshufb ym2, ym2, ym11
> +    vpshufb ym3, ym3, ym11
> +
> +    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
> +    ; permutation to combine the two segments
> +    vpermd ym2, ym12, ym2
> +    vpermd ym3, ym12, ym3
> +
> +    ; Now we have the final results in the lower 8 bytes of each register
> +    movq [dstq], xm2
> +    movq [dstq + 8], xm3
> +
> +    add r8d, 8
> +    add dstq, 16
> +
> +    cmp r8d, dstWidthd
> +    jl nv12_outer_%1
> +    RET
> +%endmacro
> +
> +INIT_YMM avx2
> +yuv2nv12cX_avx2_fn 0
> +yuv2nv12cX_avx2_fn 1
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 61110839ee..ad4a09df8d 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -380,6 +380,15 @@ INPUT_FUNCS(sse2);
>  INPUT_FUNCS(ssse3);
>  INPUT_FUNCS(avx);
>  
> +#define YUV2NV_DECL(fmt, opt) \
> +void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format, const uint8_t *dither, \
> +                                  const int16_t *filter, int filterSize, \
> +                                  const int16_t **u, const int16_t **v, \
> +                                  uint8_t *dst, int dstWidth)
> +
> +YUV2NV_DECL(nv12, avx2);
> +YUV2NV_DECL(nv21, avx2);
> +
>  av_cold void ff_sws_init_swscale_x86(SwsContext *c)
>  {
>      int cpu_flags = av_get_cpu_flags();
> @@ -580,4 +589,19 @@ switch(c->dstBpc){ \
>              break;
>          }
>      }
> +
> +    if (EXTERNAL_AVX2(cpu_flags)) {

EXTERNAL_AVX2_FAST(cpu_flags). Otherwise these will be used on AMD
Excavator cpus, which are very slow with ymm instructions.

> +        switch (c->dstFormat) {
> +        case AV_PIX_FMT_NV12:
> +        case AV_PIX_FMT_NV24:
> +            c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
> +            break;
> +        case AV_PIX_FMT_NV21:
> +        case AV_PIX_FMT_NV42:
> +            c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
> +            break;
> +        default:
> +            break;
> +        }
> +    }
>  }
>
Michael Niedermayer April 24, 2020, 4:06 p.m. UTC | #2
On Thu, Apr 23, 2020 at 08:13:18PM -0700, Nelson Gomez wrote:
> From: Nelson Gomez <nelson.gomez@microsoft.com>
> 
> 256 bits is just wide enough to fit all the operands needed to vectorize
> the software implementation, but AVX2 is needed to for some instructions
> like 16-to-32 bit vector sign extension.
> 
> Output is bit-for-bit identical to C.
> 
> Signed-off-by: Nelson Gomez <nelson.gomez@microsoft.com>
> ---
>  libswscale/x86/output.asm | 140 +++++++++++++++++++++++++++++++++++++-
>  libswscale/x86/swscale.c  |  24 +++++++
>  2 files changed, 163 insertions(+), 1 deletion(-)

Fails to build on x86_32

X86ASM	libswscale/x86/output.o
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:497: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:498: ... from macro `yuv2nv12cX_avx2_fn' defined here
src//libavutil/x86/x86inc.asm:1381: ... from macro `movd' defined here
src//libavutil/x86/x86inc.asm:1249: ... from macro `RUN_AVX_INSTR' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:517: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:518: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:551: ... from macro `yuv2nv12cX_avx2_fn' defined here
src//libavutil/x86/x86inc.asm:1125: ... from macro `add' defined here
src/libswscale/x86/output.asm:562: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:554: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:474: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:497: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:498: ... from macro `yuv2nv12cX_avx2_fn' defined here
src//libavutil/x86/x86inc.asm:1381: ... from macro `movd' defined here
src//libavutil/x86/x86inc.asm:1249: ... from macro `RUN_AVX_INSTR' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:517: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:518: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:551: ... from macro `yuv2nv12cX_avx2_fn' defined here
src//libavutil/x86/x86inc.asm:1125: ... from macro `add' defined here
src/libswscale/x86/output.asm:563: error: invalid operands in non-64-bit mode
src/libswscale/x86/output.asm:554: ... from macro `yuv2nv12cX_avx2_fn' defined here
src/ffbuild/common.mak:81: recipe for target 'libswscale/x86/output.o' failed
make: *** [libswscale/x86/output.o] Error 1

[...]
diff mbox series

Patch

diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index db3e9934f8..7947163cac 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -2,6 +2,7 @@
 ;* x86-optimized vertical line scaling functions
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
 ;*                    Kieran Kunhya <kieran@kunhya.com>
+;*           (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -22,7 +23,7 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
 minshort:      times 8 dw 0x8000
 yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
@@ -37,6 +38,22 @@  pw_32:         times 8 dw 32
 pw_512:        times 8 dw 512
 pw_1024:       times 8 dw 1024
 
+pd_255:        times 8 dd 255
+; pshufb masks, one 16-byte pattern per 128-bit lane: pack the low byte of each
+; dword into the first four bytes of the lane; -1 entries produce zero bytes.
+; The nv21 mask additionally swaps the two bytes of every U/V pair.
+yuv2nv12_shuffle_mask: times 2 db 0,  4,  8, 12, \
+                                 -1, -1, -1, -1, \
+                                 -1, -1, -1, -1, \
+                                 -1, -1, -1, -1
+yuv2nv21_shuffle_mask: times 2 db 4,  0, 12,  8, \
+                                 -1, -1, -1, -1, \
+                                 -1, -1, -1, -1, \
+                                 -1, -1, -1, -1
+; vpermd indices: bring dword 4 next to dword 0 so that the packed bytes of
+; both lanes become contiguous in the low 8 bytes of the register.
+yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -423,3 +440,140 @@  yuv2plane1_fn  9, 5, 3
 yuv2plane1_fn 10, 5, 3
 yuv2plane1_fn 16, 5, 3
 %endif
+
+%if ARCH_X86_64
+%undef movsx
+
+;-----------------------------------------------------------------------------
+; AVX2 yuv2nv12cX implementation
+;
+; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
+;                         const int16_t *filter, int filterSize,
+;                         const int16_t **u, const int16_t **v,
+;                         uint8_t *dst, int dstWidth)
+;
+; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
+;                         const int16_t *filter, int filterSize,
+;                         const int16_t **u, const int16_t **v,
+;                         uint8_t *dst, int dstWidth)
+;
+; For every output position i this computes
+;     u = (dither[i & 7]       << 12) + sum over j of u[j][i] * filter[j]
+;     v = (dither[(i + 3) & 7] << 12) + sum over j of v[j][i] * filter[j]
+; and stores both, shifted right by 19 and clamped to 0..255, as adjacent
+; bytes in dst: U then V for yuv2nv12, V then U for yuv2nv21. The format
+; argument is not read; the byte order is fixed by the function called.
+;
+; Eight positions (16 output bytes) are produced per outer iteration, and the
+; u[j]/v[j] rows are read with aligned 16-byte loads.
+;
+; Built for x86-64 only: the code uses r8-r10 and vector registers 8-12,
+; which are not available in 32-bit mode.
+;
+; Macro argument: yuv2nv12 or yuv2nv21. It forms the function name, selects
+; the matching shuffle mask constant and keeps the loop labels unique.
+;-----------------------------------------------------------------------------
+
+%macro yuv2nv12cX_avx2_fn 1
+cglobal %1cX, 8, 11, 13, 64, \
+              format, dither, filter, filterSize, u, v, dst, dstWidth
+
+    ; Build the 16 dither bias dwords on the stack in U/V interleaved order:
+    ; slot 2i = dither[i] << 12 (U), slot 2i+1 = dither[(i + 3) mod 8] << 12 (V)
+    %assign i 0
+    %rep 8
+        movzx r8d, byte [ditherq + i]
+        shl r8d, 12
+        mov [rsp + i * 8], r8d
+
+        movzx r9d, byte [ditherq + ((i + 3) % 8)]
+        shl r9d, 12
+        mov [rsp + (i * 8) + 4], r9d
+
+        %assign i i+1
+    %endrep
+
+    mova m0, [rsp]                          ; ditherLo
+    mova m1, [rsp + 32]                     ; ditherHi
+    pxor m9, m9                             ; uint8_min dwords
+    mova m10, [pd_255]                      ; uint8_max dwords
+    mova m11, [%1_shuffle_mask]             ; shuffle mask
+    mova m12, [yuv2nv12_permute_mask]       ; permute mask
+
+    ; format and dither are not needed any more; reuse their registers as scratch
+    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth
+
+    xor r8q, r8q                            ; r8 = output position
+
+nv12_outer_%1:
+    mova m2, m0                             ; resultLo
+    mova m3, m1                             ; resultHi
+    xor r9q, r9q                            ; r9 = filter tap index
+
+nv12_inner_%1:
+    movsx r10d, word [filterq + (2 * r9q)]
+    movd xm4, r10d
+    vpbroadcastd m4, xm4                    ; filter
+
+    mov tmp1q, [uq + (gprsize * r9q)]
+    mova xm7, oword [tmp1q + 2 * r8q]
+
+    mov tmp2q, [vq + (gprsize * r9q)]
+    mova xm8, oword [tmp2q + 2 * r8q]
+
+    punpcklwd xm5, xm7, xm8
+    pmovsxwd m5, xm5                        ; multiplicandsLo
+    punpckhwd xm6, xm7, xm8
+    pmovsxwd m6, xm6                        ; multiplicandsHi
+
+    pmulld m7, m5, m4                       ; mulResultLo
+    pmulld m8, m6, m4                       ; mulResultHi
+    paddd m2, m2, m7                        ; resultLo += mulResultLo
+    paddd m3, m3, m8                        ; resultHi += mulResultHi
+
+    inc r9d
+    cmp r9d, filterSized
+    jl nv12_inner_%1
+    ; end of inner loop
+
+    psrad m2, m2, 19
+    psrad m3, m3, 19
+
+    ; Vectorized av_clip_uint8
+    pmaxsd m2, m2, m9
+    pmaxsd m3, m3, m9
+    pminsd m2, m2, m10
+    pminsd m3, m3, m10
+
+    ; At this point we have clamped uint8s arranged in this order:
+    ;     m2: u1  0  0  0  v1  0  0  0  [...]
+    ;     m3: u5  0  0  0  v5  0  0  0  [...]
+    ;
+    ; First, we shuffle the bytes to make the bytes semi-contiguous.
+    ; AVX-2 doesn't have cross-lane shuffling, so we'll end up with:
+    ;     m2: u1  v1  u2  v2  [12 x 0]  |  u3  v3  u4  v4  [12 x 0]
+    ;     m3: u5  v5  u6  v6  [12 x 0]  |  u7  v7  u8  v8  [12 x 0]
+    pshufb m2, m2, m11
+    pshufb m3, m3, m11
+
+    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
+    ; permutation to combine the two segments
+    vpermd m2, m12, m2
+    vpermd m3, m12, m3
+
+    ; Now we have the final results in the lower 8 bytes of each register
+    movq [dstq], xm2
+    movq [dstq + 8], xm3
+
+    add r8d, 8
+    add dstq, 16
+
+    cmp r8d, dstWidthd
+    jl nv12_outer_%1
+    RET
+%endmacro
+
+INIT_YMM avx2
+yuv2nv12cX_avx2_fn yuv2nv12
+yuv2nv12cX_avx2_fn yuv2nv21
+%endif ; ARCH_X86_64
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 61110839ee..ad4a09df8d 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -380,6 +380,16 @@  INPUT_FUNCS(sse2);
 INPUT_FUNCS(ssse3);
 INPUT_FUNCS(avx);
 
+/* Prototypes of the AVX2 chroma output functions from output.asm. */
+#define YUV2NV_DECL(fmt, opt) \
+void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format, const uint8_t *dither, \
+                                  const int16_t *filter, int filterSize, \
+                                  const int16_t **u, const int16_t **v, \
+                                  uint8_t *dst, int dstWidth)
+
+YUV2NV_DECL(nv12, avx2);
+YUV2NV_DECL(nv21, avx2);
+
 av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -580,4 +590,24 @@  switch(c->dstBpc){ \
             break;
         }
     }
+
+#if ARCH_X86_64
+    /* The AVX2 chroma writers are only assembled on x86-64. Use the _FAST
+     * check so they are not selected on CPUs where 256-bit operations are
+     * slow (e.g. AMD Excavator). */
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_NV12:
+        case AV_PIX_FMT_NV24:
+            c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
+            break;
+        case AV_PIX_FMT_NV21:
+        case AV_PIX_FMT_NV42:
+            c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
+            break;
+        default:
+            break;
+        }
+    }
+#endif
 }