[FFmpeg-devel,1/3] diracdec: add 10-bit Haar SIMD functions

Message ID	20180726112808.11792-2-jdarnley@obe.tv
State	New
Headers	show Delivered-To: ffmpegpatchwork@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: James Darnley <jdarnley@obe.tv> To: ffmpeg-devel@ffmpeg.org Date: Thu, 26 Jul 2018 13:28:06 +0200 Message-Id: <20180726112808.11792-2-jdarnley@obe.tv> In-Reply-To: <20180726112808.11792-1-jdarnley@obe.tv> References: <20180726112808.11792-1-jdarnley@obe.tv> Subject: [FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

James Darnley July 26, 2018, 11:28 a.m. UTC

Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C:    119fps
SSE2: 204fps
AVX:  206fps
AVX2: 221fps

timer measurements, haar horizontal compose:
    sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C
    avx:  3.68x faster (45143 vs. 12275 decicycles) compared with C
    avx2: 5.16x faster (45143 vs.  8742 decicycles) compared with C
haar vertical compose:
    sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C
    avx:  1.58x faster (31792 vs. 20090 decicycles) compared with C
    avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C
---
 libavcodec/dirac_dwt.c                |   7 +-
 libavcodec/dirac_dwt.h                |   1 +
 libavcodec/x86/Makefile               |   6 +-
 libavcodec/x86/dirac_dwt_10bit.asm    | 160 ++++++++++++++++++++++++++
 libavcodec/x86/dirac_dwt_init_10bit.c |  76 ++++++++++++
 5 files changed, 247 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
 create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c

Rostislav Pehlivanov July 26, 2018, 3:29 p.m. UTC | #1

On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote:

> Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
> relevant transform.
> C:    119fps
> SSE2: 204fps
> AVX:  206fps
> AVX2: 221fps
>
> timer measurements, haar horizontal compose:
>     sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C
>     avx:  3.68x faster (45143 vs. 12275 decicycles) compared with C
>     avx2: 5.16x faster (45143 vs.  8742 decicycles) compared with C
> haar vertical compose:
>     sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C
>     avx:  1.58x faster (31792 vs. 20090 decicycles) compared with C
>     avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C
> ---
>  libavcodec/dirac_dwt.c                |   7 +-
>  libavcodec/dirac_dwt.h                |   1 +
>  libavcodec/x86/Makefile               |   6 +-
>  libavcodec/x86/dirac_dwt_10bit.asm    | 160 ++++++++++++++++++++++++++
>  libavcodec/x86/dirac_dwt_init_10bit.c |  76 ++++++++++++
>  5 files changed, 247 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
>  create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c
>
> diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
> index cc08f8865a..86bee5bb9b 100644
> --- a/libavcodec/dirac_dwt.c
> +++ b/libavcodec/dirac_dwt.c
> @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p,
> enum dwt_type type,
>          return AVERROR_INVALIDDATA;
>      }
>
> -    if (ARCH_X86 && bit_depth == 8)
> +#if ARCH_X86
> +    if (bit_depth == 8)
>          ff_spatial_idwt_init_x86(d, type);
> +    else if (bit_depth == 10)
> +        ff_spatial_idwt_init_10bit_x86(d, type);
> +#endif
> +
>      return 0;
>  }
>
> diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
> index 994dc21d70..1ad7b9a821 100644
> --- a/libavcodec/dirac_dwt.h
> +++ b/libavcodec/dirac_dwt.h
> @@ -88,6 +88,7 @@ enum dwt_type {
>  int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
>                           int decomposition_count, int bit_depth);
>  void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
> +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type);
>
>  void ff_spatial_idwt_slice2(DWTContext *d, int y);
>
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 2350c8bbee..590d83c167 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP)                +=
> x86/blockdsp_init.o
>  OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
>  OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
>  OBJS-$(CONFIG_DIRAC_DECODER)           += x86/diracdsp_init.o           \
> -                                          x86/dirac_dwt_init.o
> +                                          x86/dirac_dwt_init.o \
> +                                          x86/dirac_dwt_init_10bit.o
>  OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
>  OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
>  OBJS-$(CONFIG_FLACDSP)                 += x86/flacdsp_init.o
> @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
>  X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
>  X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
>  X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
> -                                          x86/dirac_dwt.o
> +                                          x86/dirac_dwt.o \
> +                                          x86/dirac_dwt_10bit.o
>  X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)    += x86/dnxhdenc.o
>  X86ASM-OBJS-$(CONFIG_EXR_DECODER)      += x86/exrdsp.o
>  X86ASM-OBJS-$(CONFIG_FLAC_DECODER)     += x86/flacdsp.o
> diff --git a/libavcodec/x86/dirac_dwt_10bit.asm
> b/libavcodec/x86/dirac_dwt_10bit.asm
> new file mode 100644
> index 0000000000..baea91329e
> --- /dev/null
> +++ b/libavcodec/x86/dirac_dwt_10bit.asm
> @@ -0,0 +1,160 @@
> +;**********************************************************
> ********************
> +;* x86 optimized discrete 10-bit wavelet trasnform
> +;* Copyright (c) 2018 James Darnley
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> +;**********************************************************
> ********************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +cextern pd_1
> +
> +SECTION .text
> +
> +%macro HAAR_VERTICAL 0
>
+
>

See below.


+cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
> +    DECLARE_REG_TMP 4,5
> +
> +    mova  m2, [pd_1]
> +    mov  r3d, wd
> +    and   wd, ~(mmsize/4 - 1)
> +    shl   wd, 2
> +    add  b0q, wq
> +    add  b1q, wq
> +    neg   wq
> +
> +    ALIGN 16
> +    .loop_simd:
> +        mova m0, [b0q + wq]
> +        mova m1, [b1q + wq]
> +        paddd m3, m1, m2
> +        psrad m3, 1
> +        psubd m0, m3
> +        paddd m1, m0
> +        mova [b0q + wq], m0
> +        mova [b1q + wq], m1
> +        add wq, mmsize
> +    jl .loop_simd
> +
> +    and  r3d, mmsize/4 - 1
> +    jz .end
> +    .loop_scalar:
> +        mov t0d, [b0q]
> +        mov t1d, [b1q]
> +        mov r2d, t1d
> +        add r2d, 1
> +        sar r2d, 1
> +        sub t0d, r2d
> +        add t1d, t0d
> +        mov [b0q], t0d
> +        mov [b1q], t1d
> +
> +        add b0q, 4
> +        add b1q, 4
> +        sub r3d, 1
> +    jg .loop_scalar
> +
> +    .end:
> +RET
> +
> +%endmacro
> +
> +%macro HAAR_HORIZONTAL 0
>
+
>

Could you remove this newline from every patch? All asm I've written and
seen keep them without a newline. It made me think there's something in the
asm which checked the value of the macro, not that the entire function is
macro'd.


+cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
> x, b2
> +    DECLARE_REG_TMP 2,5
> +    %if ARCH_X86_64
> +        %define tail r6d
> +    %else
> +        %define tail dword wm
> +    %endif
> +
> +    mova m2, [pd_1]
> +    xor xd, xd
> +    shr wd, 1
> +    mov tail, wd
> +    lea b2q, [bq + 4*wq]
> +
> +    ALIGN 16
> +    .loop_lo:
> +        mova m0, [bq  + 4*xq]
> +        movu m1, [b2q + 4*xq]
> +        paddd m1, m2
> +        psrad m1, 1
> +        psubd m0, m1
> +        mova [temp_q + 4*xq], m0
> +        add xd, mmsize/4
> +        cmp xd, wd
> +    jl .loop_lo
> +
> +    xor xd, xd
> +    and wd, ~(mmsize/4 - 1)
> +
> +    ALIGN 16
> +    .loop_hi:
> +        mova m0, [temp_q + 4*xq]
> +        movu m1, [b2q    + 4*xq]
> +        paddd m1, m0
> +        paddd m0, m2
> +        paddd m1, m2
> +        psrad m0, 1
> +        psrad m1, 1
> +        SBUTTERFLY dq, 0,1,3
> +        %if cpuflag(avx2)
> +            SBUTTERFLY dqqq, 0,1,3
> +        %endif
> +        mova [bq + 8*xq], m0
> +        mova [bq + 8*xq + mmsize], m1
> +        add xd, mmsize/4
> +        cmp xd, wd
> +    jl .loop_hi
> +
> +    and tail, mmsize/4 - 1
> +    jz .end
> +    .loop_scalar:
> +        mov t0d, [temp_q + 4*xq]
> +        mov t1d, [b2q    + 4*xq]
> +        add t1d, t0d
> +        add t0d, 1
> +        add t1d, 1
> +        sar t0d, 1
> +        sar t1d, 1
> +        mov [bq + 8*xq], t0d
> +        mov [bq + 8*xq + 4], t1d
> +        add  xq, 1
> +        sub tail, 1
> +    jg .loop_scalar
> +
> +    .end:
> +REP_RET
> +
> +%endmacro
> +
> +INIT_XMM sse2
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
> +
> +INIT_XMM avx
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
>

You're not using any avx functions in that version, not unless a macro'd
instruction inserts one for you. I think you should remove the avx version
then.
Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
version you can just make a single macro to do both versions at the same
time.


+
> +INIT_YMM avx2
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
> b/libavcodec/x86/dirac_dwt_init_10bit.c
> new file mode 100644
> index 0000000000..289862d728
> --- /dev/null
> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
> @@ -0,0 +1,76 @@
> +/*
> + * x86 optimized discrete wavelet transform
> + * Copyright (c) 2018 James Darnley
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/dirac_dwt.h"
> +
> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> width_align);
> +
> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> width_align);
> +
> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type
> type)
> +{
> +#if HAVE_X86ASM
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_SSE2(cpu_flags)) {
> +        switch (type) {
> +            case DWT_DIRAC_HAAR0:
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_sse2;
> +                break;
> +            case DWT_DIRAC_HAAR1:
> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_sse2;
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_sse2;
> +                break;
> +        }
> +    }
> +
> +    if (EXTERNAL_AVX(cpu_flags)) {
> +        switch (type) {
> +            case DWT_DIRAC_HAAR0:
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx;
> +                break;
> +            case DWT_DIRAC_HAAR1:
> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_avx;
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx;
> +                break;
> +        }
>

We keep switches and cases on the same line.


+    }
> +
> +    if (EXTERNAL_AVX2(cpu_flags)) {
> +        switch (type) {
> +            case DWT_DIRAC_HAAR0:
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx2;
> +                break;
> +            case DWT_DIRAC_HAAR1:
> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_avx2;
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx2;
> +                break;
> +        }


Same.

Rostislav Pehlivanov July 26, 2018, 3:31 p.m. UTC | #2

On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote:

> +
> +%macro HAAR_HORIZONTAL 0
> +
> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
> x, b2
> +    DECLARE_REG_TMP 2,5
> +    %if ARCH_X86_64
> +        %define tail r6d
> +    %else
> +        %define tail dword wm
> +    %endif
> +
>

You can remove this whole bit, the init function only gets called if
ARCH_X86_64 is true.

James Darnley July 27, 2018, 11:47 a.m. UTC | #3

On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
> On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote:
> +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
>> +    DECLARE_REG_TMP 4,5
>> +
>> +    mova  m2, [pd_1]
>> +    mov  r3d, wd
>> +    and   wd, ~(mmsize/4 - 1)
>> +    shl   wd, 2
>> +    add  b0q, wq
>> +    add  b1q, wq
>> +    neg   wq
>> +
>> +    ALIGN 16
>> +    .loop_simd:
>> +        mova m0, [b0q + wq]
>> +        mova m1, [b1q + wq]
>> +        paddd m3, m1, m2
>> +        psrad m3, 1
>> +        psubd m0, m3
>> +        paddd m1, m0
>> +        mova [b0q + wq], m0
>> +        mova [b1q + wq], m1
>> +        add wq, mmsize
>> +    jl .loop_simd
>> +
>> +    and  r3d, mmsize/4 - 1
>> +    jz .end
>> +    .loop_scalar:
>> +        mov t0d, [b0q]
>> +        mov t1d, [b1q]
>> +        mov r2d, t1d
>> +        add r2d, 1
>> +        sar r2d, 1
>> +        sub t0d, r2d
>> +        add t1d, t0d
>> +        mov [b0q], t0d
>> +        mov [b1q], t1d
>> +
>> +        add b0q, 4
>> +        add b1q, 4
>> +        sub r3d, 1
>> +    jg .loop_scalar
>> +
>> +    .end:
>> +RET
>> +
>> +%endmacro
>> +
>> +%macro HAAR_HORIZONTAL 0
>>
> +
>>
> 
> Could you remove this newline from every patch? All asm I've written and
> seen keep them without a newline. It made me think there's something in the
> asm which checked the value of the macro, not that the entire function is
> macro'd.

What?  I don't understand what you mean.  Do you think I have too many
blank lines between things?

> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>> x, b2
>> +    DECLARE_REG_TMP 2,5
>> +    %if ARCH_X86_64
>> +        %define tail r6d
>> +    %else
>> +        %define tail dword wm
>> +    %endif
>> +
>> +    mova m2, [pd_1]
>> +    xor xd, xd
>> +    shr wd, 1
>> +    mov tail, wd
>> +    lea b2q, [bq + 4*wq]
>> +
>> +    ALIGN 16
>> +    .loop_lo:
>> +        mova m0, [bq  + 4*xq]
>> +        movu m1, [b2q + 4*xq]
>> +        paddd m1, m2
>> +        psrad m1, 1
>> +        psubd m0, m1
>> +        mova [temp_q + 4*xq], m0
>> +        add xd, mmsize/4
>> +        cmp xd, wd
>> +    jl .loop_lo
>> +
>> +    xor xd, xd
>> +    and wd, ~(mmsize/4 - 1)
>> +
>> +    ALIGN 16
>> +    .loop_hi:
>> +        mova m0, [temp_q + 4*xq]
>> +        movu m1, [b2q    + 4*xq]
>> +        paddd m1, m0
>> +        paddd m0, m2
>> +        paddd m1, m2
>> +        psrad m0, 1
>> +        psrad m1, 1
>> +        SBUTTERFLY dq, 0,1,3
>> +        %if cpuflag(avx2)
>> +            SBUTTERFLY dqqq, 0,1,3
>> +        %endif
>> +        mova [bq + 8*xq], m0
>> +        mova [bq + 8*xq + mmsize], m1
>> +        add xd, mmsize/4
>> +        cmp xd, wd
>> +    jl .loop_hi
>> +
>> +    and tail, mmsize/4 - 1
>> +    jz .end
>> +    .loop_scalar:
>> +        mov t0d, [temp_q + 4*xq]
>> +        mov t1d, [b2q    + 4*xq]
>> +        add t1d, t0d
>> +        add t0d, 1
>> +        add t1d, 1
>> +        sar t0d, 1
>> +        sar t1d, 1
>> +        mov [bq + 8*xq], t0d
>> +        mov [bq + 8*xq + 4], t1d
>> +        add  xq, 1
>> +        sub tail, 1
>> +    jg .loop_scalar
>> +
>> +    .end:
>> +REP_RET
>> +
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> +
>> +INIT_XMM avx
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>>
> 
> You're not using any avx functions in that version, not unless a macro'd
> instruction inserts one for you. I think you should remove the avx version
> then.
> Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
> version you can just make a single macro to do both versions at the same
> time.

Now that I think about it there will be only one 3-operand instruction
in the SBUTTERFLY and the vertical function also only has 1.  I will
remove it.

I can merge the two macros but I will look back at what I've done
previously.  I think it is usually 1 macro per function.

> +
>> +INIT_YMM avx2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
>> b/libavcodec/x86/dirac_dwt_init_10bit.c
>> new file mode 100644
>> index 0000000000..289862d728
>> --- /dev/null
>> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
>> @@ -0,0 +1,76 @@
>> +/*
>> + * x86 optimized discrete wavelet transform
>> + * Copyright (c) 2018 James Darnley
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/x86/asm.h"
>> +#include "libavutil/x86/cpu.h"
>> +#include "libavcodec/dirac_dwt.h"
>> +
>> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +
>> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +
>> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type
>> type)
>> +{
>> +#if HAVE_X86ASM
>> +    int cpu_flags = av_get_cpu_flags();
>> +
>> +    if (EXTERNAL_SSE2(cpu_flags)) {
>> +        switch (type) {
>> +            case DWT_DIRAC_HAAR0:
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_sse2;
>> +                break;
>> +            case DWT_DIRAC_HAAR1:
>> +                d->horizontal_compose = (void*)ff_horizontal_compose_
>> haar_10bit_sse2;
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_sse2;
>> +                break;
>> +        }
>> +    }
>> +
>> +    if (EXTERNAL_AVX(cpu_flags)) {
>> +        switch (type) {
>> +            case DWT_DIRAC_HAAR0:
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_avx;
>> +                break;
>> +            case DWT_DIRAC_HAAR1:
>> +                d->horizontal_compose = (void*)ff_horizontal_compose_
>> haar_10bit_avx;
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_avx;
>> +                break;
>> +        }
>>
> 
> We keep switches and cases on the same line.

I had a look and it is the most common style even though not everywhere
holds to it.  Also `indent` does it that way.  So I will change it and
keep it in mind for the future.

>> +
>> +%macro HAAR_HORIZONTAL 0
>> +
>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>> x, b2
>> +    DECLARE_REG_TMP 2,5
>> +    %if ARCH_X86_64
>> +        %define tail r6d
>> +    %else
>> +        %define tail dword wm
>> +    %endif
>> +
>>
> 
> You can remove this whole bit, the init function only gets called if
> ARCH_X86_64 is true.

Where did you get that from?  I don't require 64-bit for this.

Henrik Gramner July 27, 2018, 1:05 p.m. UTC | #4

On Fri, Jul 27, 2018 at 1:47 PM, James Darnley <jdarnley@obe.tv> wrote:
> On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
>>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>>> x, b2
>>> +    DECLARE_REG_TMP 2,5
>>> +    %if ARCH_X86_64
>>> +        %define tail r6d
>>> +    %else
>>> +        %define tail dword wm
>>> +    %endif
>>> +
>>>
>>
>> You can remove this whole bit, the init function only gets called if
>> ARCH_X86_64 is true.
>
> Where did you get that from?  I don't require 64-bit for this.

Can't you just use 7 GPR:s on x86-32 as well?

James Darnley July 27, 2018, 2:03 p.m. UTC | #5

On 2018-07-27 15:05, Henrik Gramner wrote:
> On Fri, Jul 27, 2018 at 1:47 PM, James Darnley <jdarnley@obe.tv> wrote:
>> On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
>>>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>>>> x, b2
>>>> +    DECLARE_REG_TMP 2,5
>>>> +    %if ARCH_X86_64
>>>> +        %define tail r6d
>>>> +    %else
>>>> +        %define tail dword wm
>>>> +    %endif
>>>> +
>>>>
>>>
>>> You can remove this whole bit, the init function only gets called if
>>> ARCH_X86_64 is true.
>>
>> Where did you get that from?  I don't require 64-bit for this.
> 
> Can't you just use 7 GPR:s on x86-32 as well?

I'm sure I've done that in the past and at least 1 platform has always
complained due to PIE or stack alignment or whatever, I think.  I went
looking for an old email but couldn't find it.

If you want me to try it I can.

Henrik Gramner July 27, 2018, 2:22 p.m. UTC | #6

On Fri, Jul 27, 2018 at 4:03 PM, James Darnley <jdarnley@obe.tv> wrote:
> On 2018-07-27 15:05, Henrik Gramner wrote:
>> Can't you just use 7 GPR:s on x86-32 as well?
>
> I'm sure I've done that in the past and at least 1 platform has always
> complained due to PIE or stack alignment or whatever, I think.  I went
> looking for an old email but couldn't find it.
>
> If you want me to try it I can.

If you're allocating stack space with a positive number there has to
be a register available to store the old stack pointer in case the
guaranteed stack alignment is smaller than the required stack
alignment. This can be avoided by using a negative value for the stack
space in case of high register pressure which will allocate space for
the old stack pointer and store it on the stack after
allocation/alignment. In functions that doesn't use the stack this is
a non-issue.

So it should be fine to use 7 registers here.

Rostislav Pehlivanov July 27, 2018, 2:22 p.m. UTC | #7

On 27 July 2018 at 12:47, James Darnley <jdarnley@obe.tv> wrote:

> On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
> > On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote:
> > +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
> >> +    DECLARE_REG_TMP 4,5
> >> +
> >> +    mova  m2, [pd_1]
> >> +    mov  r3d, wd
> >> +    and   wd, ~(mmsize/4 - 1)
> >> +    shl   wd, 2
> >> +    add  b0q, wq
> >> +    add  b1q, wq
> >> +    neg   wq
> >> +
> >> +    ALIGN 16
> >> +    .loop_simd:
> >> +        mova m0, [b0q + wq]
> >> +        mova m1, [b1q + wq]
> >> +        paddd m3, m1, m2
> >> +        psrad m3, 1
> >> +        psubd m0, m3
> >> +        paddd m1, m0
> >> +        mova [b0q + wq], m0
> >> +        mova [b1q + wq], m1
> >> +        add wq, mmsize
> >> +    jl .loop_simd
> >> +
> >> +    and  r3d, mmsize/4 - 1
> >> +    jz .end
> >> +    .loop_scalar:
> >> +        mov t0d, [b0q]
> >> +        mov t1d, [b1q]
> >> +        mov r2d, t1d
> >> +        add r2d, 1
> >> +        sar r2d, 1
> >> +        sub t0d, r2d
> >> +        add t1d, t0d
> >> +        mov [b0q], t0d
> >> +        mov [b1q], t1d
> >> +
> >> +        add b0q, 4
> >> +        add b1q, 4
> >> +        sub r3d, 1
> >> +    jg .loop_scalar
> >> +
> >> +    .end:
> >> +RET
> >> +
> >> +%endmacro
> >> +
> >> +%macro HAAR_HORIZONTAL 0
> >>
> > +
> >>
> >
> > Could you remove this newline from every patch? All asm I've written and
> > seen keep them without a newline. It made me think there's something in
> the
> > asm which checked the value of the macro, not that the entire function is
> > macro'd.
>
> What?  I don't understand what you mean.  Do you think I have too many
> blank lines between things?
>

Just remove the newline between the macro definition and the cglobal.


> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
> >> x, b2
> >> +    DECLARE_REG_TMP 2,5
> >> +    %if ARCH_X86_64
> >> +        %define tail r6d
> >> +    %else
> >> +        %define tail dword wm
> >> +    %endif
> >> +
> >> +    mova m2, [pd_1]
> >> +    xor xd, xd
> >> +    shr wd, 1
> >> +    mov tail, wd
> >> +    lea b2q, [bq + 4*wq]
> >> +
> >> +    ALIGN 16
> >> +    .loop_lo:
> >> +        mova m0, [bq  + 4*xq]
> >> +        movu m1, [b2q + 4*xq]
> >> +        paddd m1, m2
> >> +        psrad m1, 1
> >> +        psubd m0, m1
> >> +        mova [temp_q + 4*xq], m0
> >> +        add xd, mmsize/4
> >> +        cmp xd, wd
> >> +    jl .loop_lo
> >> +
> >> +    xor xd, xd
> >> +    and wd, ~(mmsize/4 - 1)
> >> +
> >> +    ALIGN 16
> >> +    .loop_hi:
> >> +        mova m0, [temp_q + 4*xq]
> >> +        movu m1, [b2q    + 4*xq]
> >> +        paddd m1, m0
> >> +        paddd m0, m2
> >> +        paddd m1, m2
> >> +        psrad m0, 1
> >> +        psrad m1, 1
> >> +        SBUTTERFLY dq, 0,1,3
> >> +        %if cpuflag(avx2)
> >> +            SBUTTERFLY dqqq, 0,1,3
> >> +        %endif
> >> +        mova [bq + 8*xq], m0
> >> +        mova [bq + 8*xq + mmsize], m1
> >> +        add xd, mmsize/4
> >> +        cmp xd, wd
> >> +    jl .loop_hi
> >> +
> >> +    and tail, mmsize/4 - 1
> >> +    jz .end
> >> +    .loop_scalar:
> >> +        mov t0d, [temp_q + 4*xq]
> >> +        mov t1d, [b2q    + 4*xq]
> >> +        add t1d, t0d
> >> +        add t0d, 1
> >> +        add t1d, 1
> >> +        sar t0d, 1
> >> +        sar t1d, 1
> >> +        mov [bq + 8*xq], t0d
> >> +        mov [bq + 8*xq + 4], t1d
> >> +        add  xq, 1
> >> +        sub tail, 1
> >> +    jg .loop_scalar
> >> +
> >> +    .end:
> >> +REP_RET
> >> +
> >> +%endmacro
> >> +
> >> +INIT_XMM sse2
> >> +HAAR_HORIZONTAL
> >> +HAAR_VERTICAL
> >> +
> >> +INIT_XMM avx
> >> +HAAR_HORIZONTAL
> >> +HAAR_VERTICAL
> >>
> >
> > You're not using any avx functions in that version, not unless a macro'd
> > instruction inserts one for you. I think you should remove the avx
> version
> > then.
> > Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
> > version you can just make a single macro to do both versions at the same
> > time.
>
> Now that I think about it there will be only one 3-operand instruction
> in the SBUTTERFLY and the vertical function also only has 1.  I will
> remove it.
>
> I can merge the two macros but I will look back at what I've done
> previously.  I think it is usually 1 macro per function.
>
> > +
> >> +INIT_YMM avx2
> >> +HAAR_HORIZONTAL
> >> +HAAR_VERTICAL
> >> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
> >> b/libavcodec/x86/dirac_dwt_init_10bit.c
> >> new file mode 100644
> >> index 0000000000..289862d728
> >> --- /dev/null
> >> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
> >> @@ -0,0 +1,76 @@
> >> +/*
> >> + * x86 optimized discrete wavelet transform
> >> + * Copyright (c) 2018 James Darnley
> >> + *
> >> + * This file is part of FFmpeg.
> >> + *
> >> + * FFmpeg is free software; you can redistribute it and/or
> >> + * modify it under the terms of the GNU Lesser General Public
> >> + * License as published by the Free Software Foundation; either
> >> + * version 2.1 of the License, or (at your option) any later version.
> >> + *
> >> + * FFmpeg is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> + * Lesser General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU Lesser General Public
> >> + * License along with FFmpeg; if not, write to the Free Software
> >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> >> 02110-1301 USA
> >> + */
> >> +
> >> +#include "libavutil/x86/asm.h"
> >> +#include "libavutil/x86/cpu.h"
> >> +#include "libavcodec/dirac_dwt.h"
> >> +
> >> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1,
> int
> >> width_align);
> >> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1,
> int
> >> width_align);
> >> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1,
> int
> >> width_align);
> >> +
> >> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> >> width_align);
> >> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> >> width_align);
> >> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> >> width_align);
> >> +
> >> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum
> dwt_type
> >> type)
> >> +{
> >> +#if HAVE_X86ASM
> >> +    int cpu_flags = av_get_cpu_flags();
> >> +
> >> +    if (EXTERNAL_SSE2(cpu_flags)) {
> >> +        switch (type) {
> >> +            case DWT_DIRAC_HAAR0:
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_sse2;
> >> +                break;
> >> +            case DWT_DIRAC_HAAR1:
> >> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> >> haar_10bit_sse2;
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_sse2;
> >> +                break;
> >> +        }
> >> +    }
> >> +
> >> +    if (EXTERNAL_AVX(cpu_flags)) {
> >> +        switch (type) {
> >> +            case DWT_DIRAC_HAAR0:
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_avx;
> >> +                break;
> >> +            case DWT_DIRAC_HAAR1:
> >> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> >> haar_10bit_avx;
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_avx;
> >> +                break;
> >> +        }
> >>
> >
> > We keep switches and cases on the same line.
>
> I had a look and it is the most common style even though not everywhere
> holds to it.  Also `indent` does it that way.  So I will change it and
> keep it in mind for the future.
>
> >> +
> >> +%macro HAAR_HORIZONTAL 0
> >> +
> >> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_,
> w,
> >> x, b2
> >> +    DECLARE_REG_TMP 2,5
> >> +    %if ARCH_X86_64
> >> +        %define tail r6d
> >> +    %else
> >> +        %define tail dword wm
> >> +    %endif
> >> +
> >>
> >
> > You can remove this whole bit, the init function only gets called if
> > ARCH_X86_64 is true.
>
> Where did you get that from?  I don't require 64-bit for this.
>

Oh, right, I misread

> -    if (ARCH_X86 && bit_depth == 8)
> +#if ARCH_X86
> +    if (bit_depth == 8)
>          ff_spatial_idwt_init_x86(d, type);
> +    else if (bit_depth == 10)
> +        ff_spatial_idwt_init_10bit_x86(d, type);
> +#endif
> +

as ARCH_X86_64, nevermind.

[FFmpeg-devel,1/3] diracdec: add 10-bit Haar SIMD functions

Commit Message

Comments

Patch