Message ID | 20180726112808.11792-2-jdarnley@obe.tv |
---|---|
State | New |
Headers | show |
On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote: > Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the > relevant transform. > C: 119fps > SSE2: 204fps > AVX: 206fps > AVX2: 221fps > > timer measurements, haar horizontal compose: > sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C > avx: 3.68x faster (45143 vs. 12275 decicycles) compared with C > avx2: 5.16x faster (45143 vs. 8742 decicycles) compared with C > haar vertical compose: > sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C > avx: 1.58x faster (31792 vs. 20090 decicycles) compared with C > avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C > --- > libavcodec/dirac_dwt.c | 7 +- > libavcodec/dirac_dwt.h | 1 + > libavcodec/x86/Makefile | 6 +- > libavcodec/x86/dirac_dwt_10bit.asm | 160 ++++++++++++++++++++++++++ > libavcodec/x86/dirac_dwt_init_10bit.c | 76 ++++++++++++ > 5 files changed, 247 insertions(+), 3 deletions(-) > create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm > create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c > > diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c > index cc08f8865a..86bee5bb9b 100644 > --- a/libavcodec/dirac_dwt.c > +++ b/libavcodec/dirac_dwt.c > @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, > enum dwt_type type, > return AVERROR_INVALIDDATA; > } > > - if (ARCH_X86 && bit_depth == 8) > +#if ARCH_X86 > + if (bit_depth == 8) > ff_spatial_idwt_init_x86(d, type); > + else if (bit_depth == 10) > + ff_spatial_idwt_init_10bit_x86(d, type); > +#endif > + > return 0; > } > > diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h > index 994dc21d70..1ad7b9a821 100644 > --- a/libavcodec/dirac_dwt.h > +++ b/libavcodec/dirac_dwt.h > @@ -88,6 +88,7 @@ enum dwt_type { > int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, > int decomposition_count, int bit_depth); > void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type); > +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type); > > void ff_spatial_idwt_slice2(DWTContext *d, int y); > > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index 2350c8bbee..590d83c167 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP) += > x86/blockdsp_init.o > OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o > OBJS-$(CONFIG_DCT) += x86/dct_init.o > OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ > - x86/dirac_dwt_init.o > + x86/dirac_dwt_init.o \ > + x86/dirac_dwt_init_10bit.o > OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o > OBJS-$(CONFIG_FFT) += x86/fft_init.o > OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o > @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o > X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o > X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o > X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ > - x86/dirac_dwt.o > + x86/dirac_dwt.o \ > + x86/dirac_dwt_10bit.o > X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o > X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o > X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o > diff --git a/libavcodec/x86/dirac_dwt_10bit.asm > b/libavcodec/x86/dirac_dwt_10bit.asm > new file mode 100644 > index 0000000000..baea91329e > --- /dev/null > +++ b/libavcodec/x86/dirac_dwt_10bit.asm > @@ -0,0 +1,160 @@ > +;********************************************************** > ******************** > +;* x86 optimized discrete 10-bit wavelet trasnform > +;* Copyright (c) 2018 James Darnley > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > +;********************************************************** > ******************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +cextern pd_1 > + > +SECTION .text > + > +%macro HAAR_VERTICAL 0 > + > See below. +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w > + DECLARE_REG_TMP 4,5 > + > + mova m2, [pd_1] > + mov r3d, wd > + and wd, ~(mmsize/4 - 1) > + shl wd, 2 > + add b0q, wq > + add b1q, wq > + neg wq > + > + ALIGN 16 > + .loop_simd: > + mova m0, [b0q + wq] > + mova m1, [b1q + wq] > + paddd m3, m1, m2 > + psrad m3, 1 > + psubd m0, m3 > + paddd m1, m0 > + mova [b0q + wq], m0 > + mova [b1q + wq], m1 > + add wq, mmsize > + jl .loop_simd > + > + and r3d, mmsize/4 - 1 > + jz .end > + .loop_scalar: > + mov t0d, [b0q] > + mov t1d, [b1q] > + mov r2d, t1d > + add r2d, 1 > + sar r2d, 1 > + sub t0d, r2d > + add t1d, t0d > + mov [b0q], t0d > + mov [b1q], t1d > + > + add b0q, 4 > + add b1q, 4 > + sub r3d, 1 > + jg .loop_scalar > + > + .end: > +RET > + > +%endmacro > + > +%macro HAAR_HORIZONTAL 0 > + > Could you remove this newline from every patch? All asm I've written and seen keep them without a newline. It made me think there's something in the asm which checked the value of the macro, not that the entire function is macro'd. +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, > x, b2 > + DECLARE_REG_TMP 2,5 > + %if ARCH_X86_64 > + %define tail r6d > + %else > + %define tail dword wm > + %endif > + > + mova m2, [pd_1] > + xor xd, xd > + shr wd, 1 > + mov tail, wd > + lea b2q, [bq + 4*wq] > + > + ALIGN 16 > + .loop_lo: > + mova m0, [bq + 4*xq] > + movu m1, [b2q + 4*xq] > + paddd m1, m2 > + psrad m1, 1 > + psubd m0, m1 > + mova [temp_q + 4*xq], m0 > + add xd, mmsize/4 > + cmp xd, wd > + jl .loop_lo > + > + xor xd, xd > + and wd, ~(mmsize/4 - 1) > + > + ALIGN 16 > + .loop_hi: > + mova m0, [temp_q + 4*xq] > + movu m1, [b2q + 4*xq] > + paddd m1, m0 > + paddd m0, m2 > + paddd m1, m2 > + psrad m0, 1 > + psrad m1, 1 > + SBUTTERFLY dq, 0,1,3 > + %if cpuflag(avx2) > + SBUTTERFLY dqqq, 0,1,3 > + %endif > + mova [bq + 8*xq], m0 > + mova [bq + 8*xq + mmsize], m1 > + add xd, mmsize/4 > + cmp xd, wd > + jl .loop_hi > + > + and tail, mmsize/4 - 1 > + jz .end > + .loop_scalar: > + mov t0d, [temp_q + 4*xq] > + mov t1d, [b2q + 4*xq] > + add t1d, t0d > + add t0d, 1 > + add t1d, 1 > + sar t0d, 1 > + sar t1d, 1 > + mov [bq + 8*xq], t0d > + mov [bq + 8*xq + 4], t1d > + add xq, 1 > + sub tail, 1 > + jg .loop_scalar > + > + .end: > +REP_RET > + > +%endmacro > + > +INIT_XMM sse2 > +HAAR_HORIZONTAL > +HAAR_VERTICAL > + > +INIT_XMM avx > +HAAR_HORIZONTAL > +HAAR_VERTICAL > You're not using any avx functions in that version, not unless a macro'd instruction inserts one for you. I think you should remove the avx version then. Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per version you can just make a single macro to do both versions at the same time. + > +INIT_YMM avx2 > +HAAR_HORIZONTAL > +HAAR_VERTICAL > diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c > b/libavcodec/x86/dirac_dwt_init_10bit.c > new file mode 100644 > index 0000000000..289862d728 > --- /dev/null > +++ b/libavcodec/x86/dirac_dwt_init_10bit.c > @@ -0,0 +1,76 @@ > +/* > + * x86 optimized discrete wavelet transform > + * Copyright (c) 2018 James Darnley > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/dirac_dwt.h" > + > +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int > width_align); > +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int > width_align); > +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int > width_align); > + > +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int > width_align); > +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int > width_align); > +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int > width_align); > + > +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type > type) > +{ > +#if HAVE_X86ASM > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_SSE2(cpu_flags)) { > + switch (type) { > + case DWT_DIRAC_HAAR0: > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_sse2; > + break; > + case DWT_DIRAC_HAAR1: > + d->horizontal_compose = (void*)ff_horizontal_compose_ > haar_10bit_sse2; > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_sse2; > + break; > + } > + } > + > + if (EXTERNAL_AVX(cpu_flags)) { > + switch (type) { > + case DWT_DIRAC_HAAR0: > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx; > + break; > + case DWT_DIRAC_HAAR1: > + d->horizontal_compose = (void*)ff_horizontal_compose_ > haar_10bit_avx; > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx; > + break; > + } > We keep switches and cases on the same line. + } > + > + if (EXTERNAL_AVX2(cpu_flags)) { > + switch (type) { > + case DWT_DIRAC_HAAR0: > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx2; > + break; > + case DWT_DIRAC_HAAR1: > + d->horizontal_compose = (void*)ff_horizontal_compose_ > haar_10bit_avx2; > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx2; > + break; > + } Same.
On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote: > + > +%macro HAAR_HORIZONTAL 0 > + > +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, > x, b2 > + DECLARE_REG_TMP 2,5 > + %if ARCH_X86_64 > + %define tail r6d > + %else > + %define tail dword wm > + %endif > + > You can remove this whole bit, the init function only gets called if ARCH_X86_64 is true.
On 2018-07-26 17:29, Rostislav Pehlivanov wrote: > On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote: > +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w >> + DECLARE_REG_TMP 4,5 >> + >> + mova m2, [pd_1] >> + mov r3d, wd >> + and wd, ~(mmsize/4 - 1) >> + shl wd, 2 >> + add b0q, wq >> + add b1q, wq >> + neg wq >> + >> + ALIGN 16 >> + .loop_simd: >> + mova m0, [b0q + wq] >> + mova m1, [b1q + wq] >> + paddd m3, m1, m2 >> + psrad m3, 1 >> + psubd m0, m3 >> + paddd m1, m0 >> + mova [b0q + wq], m0 >> + mova [b1q + wq], m1 >> + add wq, mmsize >> + jl .loop_simd >> + >> + and r3d, mmsize/4 - 1 >> + jz .end >> + .loop_scalar: >> + mov t0d, [b0q] >> + mov t1d, [b1q] >> + mov r2d, t1d >> + add r2d, 1 >> + sar r2d, 1 >> + sub t0d, r2d >> + add t1d, t0d >> + mov [b0q], t0d >> + mov [b1q], t1d >> + >> + add b0q, 4 >> + add b1q, 4 >> + sub r3d, 1 >> + jg .loop_scalar >> + >> + .end: >> +RET >> + >> +%endmacro >> + >> +%macro HAAR_HORIZONTAL 0 >> > + >> > > Could you remove this newline from every patch? All asm I've written and > seen keep them without a newline. It made me think there's something in the > asm which checked the value of the macro, not that the entire function is > macro'd. What? I don't understand what you mean. Do you think I have too many blank lines between things? > +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, >> x, b2 >> + DECLARE_REG_TMP 2,5 >> + %if ARCH_X86_64 >> + %define tail r6d >> + %else >> + %define tail dword wm >> + %endif >> + >> + mova m2, [pd_1] >> + xor xd, xd >> + shr wd, 1 >> + mov tail, wd >> + lea b2q, [bq + 4*wq] >> + >> + ALIGN 16 >> + .loop_lo: >> + mova m0, [bq + 4*xq] >> + movu m1, [b2q + 4*xq] >> + paddd m1, m2 >> + psrad m1, 1 >> + psubd m0, m1 >> + mova [temp_q + 4*xq], m0 >> + add xd, mmsize/4 >> + cmp xd, wd >> + jl .loop_lo >> + >> + xor xd, xd >> + and wd, ~(mmsize/4 - 1) >> + >> + ALIGN 16 >> + .loop_hi: >> + mova m0, [temp_q + 4*xq] >> + movu m1, [b2q + 4*xq] >> + paddd m1, m0 >> + paddd m0, m2 >> + paddd m1, m2 >> + psrad m0, 1 >> + psrad m1, 1 >> + SBUTTERFLY dq, 0,1,3 >> + %if cpuflag(avx2) >> + SBUTTERFLY dqqq, 0,1,3 >> + %endif >> + mova [bq + 8*xq], m0 >> + mova [bq + 8*xq + mmsize], m1 >> + add xd, mmsize/4 >> + cmp xd, wd >> + jl .loop_hi >> + >> + and tail, mmsize/4 - 1 >> + jz .end >> + .loop_scalar: >> + mov t0d, [temp_q + 4*xq] >> + mov t1d, [b2q + 4*xq] >> + add t1d, t0d >> + add t0d, 1 >> + add t1d, 1 >> + sar t0d, 1 >> + sar t1d, 1 >> + mov [bq + 8*xq], t0d >> + mov [bq + 8*xq + 4], t1d >> + add xq, 1 >> + sub tail, 1 >> + jg .loop_scalar >> + >> + .end: >> +REP_RET >> + >> +%endmacro >> + >> +INIT_XMM sse2 >> +HAAR_HORIZONTAL >> +HAAR_VERTICAL >> + >> +INIT_XMM avx >> +HAAR_HORIZONTAL >> +HAAR_VERTICAL >> > > You're not using any avx functions in that version, not unless a macro'd > instruction inserts one for you. I think you should remove the avx version > then. > Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per > version you can just make a single macro to do both versions at the same > time. Now that I think about it there will be only one 3-operand instruction in the SBUTTERFLY and the vertical function also only has 1. I will remove it. I can merge the two macros but I will look back at what I've done previously. I think it is usually 1 macro per function. > + >> +INIT_YMM avx2 >> +HAAR_HORIZONTAL >> +HAAR_VERTICAL >> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c >> b/libavcodec/x86/dirac_dwt_init_10bit.c >> new file mode 100644 >> index 0000000000..289862d728 >> --- /dev/null >> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c >> @@ -0,0 +1,76 @@ >> +/* >> + * x86 optimized discrete wavelet transform >> + * Copyright (c) 2018 James Darnley >> + * >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. >> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> + */ >> + >> +#include "libavutil/x86/asm.h" >> +#include "libavutil/x86/cpu.h" >> +#include "libavcodec/dirac_dwt.h" >> + >> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int >> width_align); >> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int >> width_align); >> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int >> width_align); >> + >> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int >> width_align); >> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int >> width_align); >> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int >> width_align); >> + >> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type >> type) >> +{ >> +#if HAVE_X86ASM >> + int cpu_flags = av_get_cpu_flags(); >> + >> + if (EXTERNAL_SSE2(cpu_flags)) { >> + switch (type) { >> + case DWT_DIRAC_HAAR0: >> + d->vertical_compose = (void*)ff_vertical_compose_ >> haar_10bit_sse2; >> + break; >> + case DWT_DIRAC_HAAR1: >> + d->horizontal_compose = (void*)ff_horizontal_compose_ >> haar_10bit_sse2; >> + d->vertical_compose = (void*)ff_vertical_compose_ >> haar_10bit_sse2; >> + break; >> + } >> + } >> + >> + if (EXTERNAL_AVX(cpu_flags)) { >> + switch (type) { >> + case DWT_DIRAC_HAAR0: >> + d->vertical_compose = (void*)ff_vertical_compose_ >> haar_10bit_avx; >> + break; >> + case DWT_DIRAC_HAAR1: >> + d->horizontal_compose = (void*)ff_horizontal_compose_ >> haar_10bit_avx; >> + d->vertical_compose = (void*)ff_vertical_compose_ >> haar_10bit_avx; >> + break; >> + } >> > > We keep switches and cases on the same line. I had a look and it is the most common style even though not everywhere holds to it. Also `indent` does it that way. So I will change it and keep it in mind for the future. >> + >> +%macro HAAR_HORIZONTAL 0 >> + >> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, >> x, b2 >> + DECLARE_REG_TMP 2,5 >> + %if ARCH_X86_64 >> + %define tail r6d >> + %else >> + %define tail dword wm >> + %endif >> + >> > > You can remove this whole bit, the init function only gets called if > ARCH_X86_64 is true. Where did you get that from? I don't require 64-bit for this.
On Fri, Jul 27, 2018 at 1:47 PM, James Darnley <jdarnley@obe.tv> wrote: > On 2018-07-26 17:29, Rostislav Pehlivanov wrote: >>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, >>> x, b2 >>> + DECLARE_REG_TMP 2,5 >>> + %if ARCH_X86_64 >>> + %define tail r6d >>> + %else >>> + %define tail dword wm >>> + %endif >>> + >>> >> >> You can remove this whole bit, the init function only gets called if >> ARCH_X86_64 is true. > > Where did you get that from? I don't require 64-bit for this. Can't you just use 7 GPR:s on x86-32 as well?
On 2018-07-27 15:05, Henrik Gramner wrote: > On Fri, Jul 27, 2018 at 1:47 PM, James Darnley <jdarnley@obe.tv> wrote: >> On 2018-07-26 17:29, Rostislav Pehlivanov wrote: >>>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, >>>> x, b2 >>>> + DECLARE_REG_TMP 2,5 >>>> + %if ARCH_X86_64 >>>> + %define tail r6d >>>> + %else >>>> + %define tail dword wm >>>> + %endif >>>> + >>>> >>> >>> You can remove this whole bit, the init function only gets called if >>> ARCH_X86_64 is true. >> >> Where did you get that from? I don't require 64-bit for this. > > Can't you just use 7 GPR:s on x86-32 as well? I'm sure I've done that in the past and at least 1 platform has always complained due to PIE or stack alignment or whatever, I think. I went looking for an old email but couldn't find it. If you want me to try it I can.
On Fri, Jul 27, 2018 at 4:03 PM, James Darnley <jdarnley@obe.tv> wrote: > On 2018-07-27 15:05, Henrik Gramner wrote: >> Can't you just use 7 GPR:s on x86-32 as well? > > I'm sure I've done that in the past and at least 1 platform has always > complained due to PIE or stack alignment or whatever, I think. I went > looking for an old email but couldn't find it. > > If you want me to try it I can. If you're allocating stack space with a positive number there has to be a register available to store the old stack pointer in case the guaranteed stack alignment is smaller than the required stack alignment. This can be avoided by using a negative value for the stack space in case of high register pressure which will allocate space for the old stack pointer and store it on the stack after allocation/alignment. In functions that doesn't use the stack this is a non-issue. So it should be fine to use 7 registers here.
On 27 July 2018 at 12:47, James Darnley <jdarnley@obe.tv> wrote: > On 2018-07-26 17:29, Rostislav Pehlivanov wrote: > > On 26 July 2018 at 12:28, James Darnley <jdarnley@obe.tv> wrote: > > +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w > >> + DECLARE_REG_TMP 4,5 > >> + > >> + mova m2, [pd_1] > >> + mov r3d, wd > >> + and wd, ~(mmsize/4 - 1) > >> + shl wd, 2 > >> + add b0q, wq > >> + add b1q, wq > >> + neg wq > >> + > >> + ALIGN 16 > >> + .loop_simd: > >> + mova m0, [b0q + wq] > >> + mova m1, [b1q + wq] > >> + paddd m3, m1, m2 > >> + psrad m3, 1 > >> + psubd m0, m3 > >> + paddd m1, m0 > >> + mova [b0q + wq], m0 > >> + mova [b1q + wq], m1 > >> + add wq, mmsize > >> + jl .loop_simd > >> + > >> + and r3d, mmsize/4 - 1 > >> + jz .end > >> + .loop_scalar: > >> + mov t0d, [b0q] > >> + mov t1d, [b1q] > >> + mov r2d, t1d > >> + add r2d, 1 > >> + sar r2d, 1 > >> + sub t0d, r2d > >> + add t1d, t0d > >> + mov [b0q], t0d > >> + mov [b1q], t1d > >> + > >> + add b0q, 4 > >> + add b1q, 4 > >> + sub r3d, 1 > >> + jg .loop_scalar > >> + > >> + .end: > >> +RET > >> + > >> +%endmacro > >> + > >> +%macro HAAR_HORIZONTAL 0 > >> > > + > >> > > > > Could you remove this newline from every patch? All asm I've written and > > seen keep them without a newline. It made me think there's something in > the > > asm which checked the value of the macro, not that the entire function is > > macro'd. > > What? I don't understand what you mean. Do you think I have too many > blank lines between things? > Just remove the newline between the macro definition and the cglobal. > +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, > >> x, b2 > >> + DECLARE_REG_TMP 2,5 > >> + %if ARCH_X86_64 > >> + %define tail r6d > >> + %else > >> + %define tail dword wm > >> + %endif > >> + > >> + mova m2, [pd_1] > >> + xor xd, xd > >> + shr wd, 1 > >> + mov tail, wd > >> + lea b2q, [bq + 4*wq] > >> + > >> + ALIGN 16 > >> + .loop_lo: > >> + mova m0, [bq + 4*xq] > >> + movu m1, [b2q + 4*xq] > >> + paddd m1, m2 > >> + psrad m1, 1 > >> + psubd m0, m1 > >> + mova [temp_q + 4*xq], m0 > >> + add xd, mmsize/4 > >> + cmp xd, wd > >> + jl .loop_lo > >> + > >> + xor xd, xd > >> + and wd, ~(mmsize/4 - 1) > >> + > >> + ALIGN 16 > >> + .loop_hi: > >> + mova m0, [temp_q + 4*xq] > >> + movu m1, [b2q + 4*xq] > >> + paddd m1, m0 > >> + paddd m0, m2 > >> + paddd m1, m2 > >> + psrad m0, 1 > >> + psrad m1, 1 > >> + SBUTTERFLY dq, 0,1,3 > >> + %if cpuflag(avx2) > >> + SBUTTERFLY dqqq, 0,1,3 > >> + %endif > >> + mova [bq + 8*xq], m0 > >> + mova [bq + 8*xq + mmsize], m1 > >> + add xd, mmsize/4 > >> + cmp xd, wd > >> + jl .loop_hi > >> + > >> + and tail, mmsize/4 - 1 > >> + jz .end > >> + .loop_scalar: > >> + mov t0d, [temp_q + 4*xq] > >> + mov t1d, [b2q + 4*xq] > >> + add t1d, t0d > >> + add t0d, 1 > >> + add t1d, 1 > >> + sar t0d, 1 > >> + sar t1d, 1 > >> + mov [bq + 8*xq], t0d > >> + mov [bq + 8*xq + 4], t1d > >> + add xq, 1 > >> + sub tail, 1 > >> + jg .loop_scalar > >> + > >> + .end: > >> +REP_RET > >> + > >> +%endmacro > >> + > >> +INIT_XMM sse2 > >> +HAAR_HORIZONTAL > >> +HAAR_VERTICAL > >> + > >> +INIT_XMM avx > >> +HAAR_HORIZONTAL > >> +HAAR_VERTICAL > >> > > > > You're not using any avx functions in that version, not unless a macro'd > > instruction inserts one for you. I think you should remove the avx > version > > then. > > Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per > > version you can just make a single macro to do both versions at the same > > time. > > Now that I think about it there will be only one 3-operand instruction > in the SBUTTERFLY and the vertical function also only has 1. I will > remove it. > > I can merge the two macros but I will look back at what I've done > previously. I think it is usually 1 macro per function. > > > + > >> +INIT_YMM avx2 > >> +HAAR_HORIZONTAL > >> +HAAR_VERTICAL > >> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c > >> b/libavcodec/x86/dirac_dwt_init_10bit.c > >> new file mode 100644 > >> index 0000000000..289862d728 > >> --- /dev/null > >> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c > >> @@ -0,0 +1,76 @@ > >> +/* > >> + * x86 optimized discrete wavelet transform > >> + * Copyright (c) 2018 James Darnley > >> + * > >> + * This file is part of FFmpeg. > >> + * > >> + * FFmpeg is free software; you can redistribute it and/or > >> + * modify it under the terms of the GNU Lesser General Public > >> + * License as published by the Free Software Foundation; either > >> + * version 2.1 of the License, or (at your option) any later version. > >> + * > >> + * FFmpeg is distributed in the hope that it will be useful, > >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of > >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> + * Lesser General Public License for more details. > >> + * > >> + * You should have received a copy of the GNU Lesser General Public > >> + * License along with FFmpeg; if not, write to the Free Software > >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > >> 02110-1301 USA > >> + */ > >> + > >> +#include "libavutil/x86/asm.h" > >> +#include "libavutil/x86/cpu.h" > >> +#include "libavcodec/dirac_dwt.h" > >> + > >> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, > int > >> width_align); > >> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, > int > >> width_align); > >> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, > int > >> width_align); > >> + > >> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int > >> width_align); > >> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int > >> width_align); > >> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int > >> width_align); > >> + > >> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum > dwt_type > >> type) > >> +{ > >> +#if HAVE_X86ASM > >> + int cpu_flags = av_get_cpu_flags(); > >> + > >> + if (EXTERNAL_SSE2(cpu_flags)) { > >> + switch (type) { > >> + case DWT_DIRAC_HAAR0: > >> + d->vertical_compose = (void*)ff_vertical_compose_ > >> haar_10bit_sse2; > >> + break; > >> + case DWT_DIRAC_HAAR1: > >> + d->horizontal_compose = (void*)ff_horizontal_compose_ > >> haar_10bit_sse2; > >> + d->vertical_compose = (void*)ff_vertical_compose_ > >> haar_10bit_sse2; > >> + break; > >> + } > >> + } > >> + > >> + if (EXTERNAL_AVX(cpu_flags)) { > >> + switch (type) { > >> + case DWT_DIRAC_HAAR0: > >> + d->vertical_compose = (void*)ff_vertical_compose_ > >> haar_10bit_avx; > >> + break; > >> + case DWT_DIRAC_HAAR1: > >> + d->horizontal_compose = (void*)ff_horizontal_compose_ > >> haar_10bit_avx; > >> + d->vertical_compose = (void*)ff_vertical_compose_ > >> haar_10bit_avx; > >> + break; > >> + } > >> > > > > We keep switches and cases on the same line. > > I had a look and it is the most common style even though not everywhere > holds to it. Also `indent` does it that way. So I will change it and > keep it in mind for the future. > > >> + > >> +%macro HAAR_HORIZONTAL 0 > >> + > >> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, > w, > >> x, b2 > >> + DECLARE_REG_TMP 2,5 > >> + %if ARCH_X86_64 > >> + %define tail r6d > >> + %else > >> + %define tail dword wm > >> + %endif > >> + > >> > > > > You can remove this whole bit, the init function only gets called if > > ARCH_X86_64 is true. > > Where did you get that from? I don't require 64-bit for this. > Oh, right, I misread > - if (ARCH_X86 && bit_depth == 8) > +#if ARCH_X86 > + if (bit_depth == 8) > ff_spatial_idwt_init_x86(d, type); > + else if (bit_depth == 10) > + ff_spatial_idwt_init_10bit_x86(d, type); > +#endif > + as ARCH_X86_64, nevermind.
diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c index cc08f8865a..86bee5bb9b 100644 --- a/libavcodec/dirac_dwt.c +++ b/libavcodec/dirac_dwt.c @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, return AVERROR_INVALIDDATA; } - if (ARCH_X86 && bit_depth == 8) +#if ARCH_X86 + if (bit_depth == 8) ff_spatial_idwt_init_x86(d, type); + else if (bit_depth == 10) + ff_spatial_idwt_init_10bit_x86(d, type); +#endif + return 0; } diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h index 994dc21d70..1ad7b9a821 100644 --- a/libavcodec/dirac_dwt.h +++ b/libavcodec/dirac_dwt.h @@ -88,6 +88,7 @@ enum dwt_type { int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, int decomposition_count, int bit_depth); void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type); +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type); void ff_spatial_idwt_slice2(DWTContext *d, int y); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 2350c8bbee..590d83c167 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o OBJS-$(CONFIG_DCT) += x86/dct_init.o OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ - x86/dirac_dwt_init.o + x86/dirac_dwt_init.o \ + x86/dirac_dwt_init_10bit.o OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o OBJS-$(CONFIG_FFT) += x86/fft_init.o OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ - x86/dirac_dwt.o + x86/dirac_dwt.o \ + x86/dirac_dwt_10bit.o X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm new file mode 100644 index 0000000000..baea91329e --- /dev/null +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -0,0 +1,160 @@ +;****************************************************************************** +;* x86 optimized discrete 10-bit wavelet trasnform +;* Copyright (c) 2018 James Darnley +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pd_1 + +SECTION .text + +%macro HAAR_VERTICAL 0 + +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w + DECLARE_REG_TMP 4,5 + + mova m2, [pd_1] + mov r3d, wd + and wd, ~(mmsize/4 - 1) + shl wd, 2 + add b0q, wq + add b1q, wq + neg wq + + ALIGN 16 + .loop_simd: + mova m0, [b0q + wq] + mova m1, [b1q + wq] + paddd m3, m1, m2 + psrad m3, 1 + psubd m0, m3 + paddd m1, m0 + mova [b0q + wq], m0 + mova [b1q + wq], m1 + add wq, mmsize + jl .loop_simd + + and r3d, mmsize/4 - 1 + jz .end + .loop_scalar: + mov t0d, [b0q] + mov t1d, [b1q] + mov r2d, t1d + add r2d, 1 + sar r2d, 1 + sub t0d, r2d + add t1d, t0d + mov [b0q], t0d + mov [b1q], t1d + + add b0q, 4 + add b1q, 4 + sub r3d, 1 + jg .loop_scalar + + .end: +RET + +%endmacro + +%macro HAAR_HORIZONTAL 0 + +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, x, b2 + DECLARE_REG_TMP 2,5 + %if ARCH_X86_64 + %define tail r6d + %else + %define tail dword wm + %endif + + mova m2, [pd_1] + xor xd, xd + shr wd, 1 + mov tail, wd + lea b2q, [bq + 4*wq] + + ALIGN 16 + .loop_lo: + mova m0, [bq + 4*xq] + movu m1, [b2q + 4*xq] + paddd m1, m2 + psrad m1, 1 + psubd m0, m1 + mova [temp_q + 4*xq], m0 + add xd, mmsize/4 + cmp xd, wd + jl .loop_lo + + xor xd, xd + and wd, ~(mmsize/4 - 1) + + ALIGN 16 + .loop_hi: + mova m0, [temp_q + 4*xq] + movu m1, [b2q + 4*xq] + paddd m1, m0 + paddd m0, m2 + paddd m1, m2 + psrad m0, 1 + psrad m1, 1 + SBUTTERFLY dq, 0,1,3 + %if cpuflag(avx2) + SBUTTERFLY dqqq, 0,1,3 + %endif + mova [bq + 8*xq], m0 + mova [bq + 8*xq + mmsize], m1 + add xd, mmsize/4 + cmp xd, wd + jl .loop_hi + + and tail, mmsize/4 - 1 + jz .end + .loop_scalar: + mov t0d, [temp_q + 4*xq] + mov t1d, [b2q + 4*xq] + add t1d, t0d + add t0d, 1 + add t1d, 1 + sar t0d, 1 + sar t1d, 1 + mov [bq + 8*xq], t0d + mov [bq + 8*xq + 4], t1d + add xq, 1 + sub tail, 1 + jg .loop_scalar + + .end: +REP_RET + +%endmacro + +INIT_XMM sse2 +HAAR_HORIZONTAL +HAAR_VERTICAL + +INIT_XMM avx +HAAR_HORIZONTAL +HAAR_VERTICAL + +INIT_YMM avx2 +HAAR_HORIZONTAL +HAAR_VERTICAL diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c new file mode 100644 index 0000000000..289862d728 --- /dev/null +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -0,0 +1,76 @@ +/* + * x86 optimized discrete wavelet transform + * Copyright (c) 2018 James Darnley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/dirac_dwt.h" + +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); + +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); + +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (type) { + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2; + break; + case DWT_DIRAC_HAAR1: + d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_sse2; + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2; + break; + } + } + + if (EXTERNAL_AVX(cpu_flags)) { + switch (type) { + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx; + break; + case DWT_DIRAC_HAAR1: + d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_avx; + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx; + break; + } + } + + if (EXTERNAL_AVX2(cpu_flags)) { + switch (type) { + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2; + break; + case DWT_DIRAC_HAAR1: + d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_avx2; + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2; + break; + } + } + +#endif // HAVE_X86ASM +}