Message ID | 20180719145252.30613-4-jdarnley@obe.tv |
---|---|
State | Superseded |
Headers | show |
On 19 July 2018 at 15:52, James Darnley <jdarnley@obe.tv> wrote: > Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the > relevant transform. > C: 84fps > SSE2: 111fps > AVX2: 115fps > --- > libavcodec/x86/dirac_dwt_10bit.asm | 38 +++++++++++++++++++++++++++ > libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++++++++++ > 2 files changed, 54 insertions(+) > > diff --git a/libavcodec/x86/dirac_dwt_10bit.asm > b/libavcodec/x86/dirac_dwt_10bit.asm > index c00de32bfe..681de5e1df 100644 > --- a/libavcodec/x86/dirac_dwt_10bit.asm > +++ b/libavcodec/x86/dirac_dwt_10bit.asm > @@ -25,6 +25,7 @@ SECTION_RODATA > > cextern pd_1 > pd_2: times 4 dd 2 > +pd_8: times 4 dd 8 > > SECTION .text > > @@ -153,7 +154,44 @@ RET > > %endmacro > > +%macro DD97_VERTICAL_HI 0 > + > +cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w > + mova m7, [pd_8] > + shl wd, 2 > + add b0q, wq > + add b1q, wq > + add b2q, wq > + add b3q, wq > + add b4q, wq > + neg wq > + > + ALIGN 16 > + .loop: > + mova m0, [b0q + wq] > + mova m1, [b1q + wq] > + mova m2, [b2q + wq] > + mova m3, [b3q + wq] > + mova m4, [b4q + wq] > + pslld m5, m1, 3 > + pslld m6, m3, 3 > + paddd m5, m1 > + paddd m6, m3 > + psubd m5, m0 > + psubd m6, m4 > + paddd m5, m7 > + paddd m5, m6 > + psrad m5, 4 > + paddd m2, m5 > + mova [b2q + wq], m2 > + add wq, mmsize > + jl .loop > +RET > + > +%endmacro > + > INIT_XMM sse2 > +DD97_VERTICAL_HI > HAAR_HORIZONTAL > HAAR_VERTICAL > LEGALL53_VERTICAL_HI > diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c > b/libavcodec/x86/dirac_dwt_init_10bit.c > index 88cf267d14..e7e7534050 100644 > --- a/libavcodec/x86/dirac_dwt_init_10bit.c > +++ b/libavcodec/x86/dirac_dwt_init_10bit.c > @@ -23,6 +23,8 @@ > #include "libavutil/x86/cpu.h" > #include "libavcodec/dirac_dwt.h" > > +void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > int32_t *b3, int32_t *b4, int width); > + > void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > int width); > void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > int width); > > @@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0, > int32_t *b1, int32_t *b2, int > b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); > } > > +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, > + int32_t *b3, int32_t *b4, int width) > +{ > + int i = width & ~3; > + ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); > + for(; i<width; i++) > + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); > + > +} > This, along with the rest of the patchset: what's up with the hybrid implementations? Couldn't you put the second part in the asm code as well? Now there are 2 function calls instead of 1.
On 2018-07-19 17:26, Rostislav Pehlivanov wrote: > On 19 July 2018 at 15:52, James Darnley <jdarnley@obe.tv> wrote: > >> int32_t *b1, int32_t *b2, int >> b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); >> } >> >> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, >> + int32_t *b3, int32_t *b4, int width) >> +{ >> + int i = width & ~3; >> + ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); >> + for(; i<width; i++) >> + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); >> + >> +} >> > > > This, along with the rest of the patchset: what's up with the hybrid > implementations? Couldn't you put the second part in the asm code as well? > Now there are 2 function calls instead of 1. The 8-bit code does this and I just followed it lead. I believe this is done because we cannot write junk data beyond what we think is the end of the line because this might be one of the higher depths and the coeffs for the next level sit beyond the end of the line. But now it has just occurred to me that maybe you meant "why didn't you do the scalar operations in SIMD?", is that what you meant? Answer is because it didn't occur to me at the time. Aside from that I always write do-while loops in assembly because I can usually guarantee 1 run of the block. I can certainly look at making that change.
On 19 July 2018 at 16:52, James Darnley <jdarnley@obe.tv> wrote: > On 2018-07-19 17:26, Rostislav Pehlivanov wrote: > > On 19 July 2018 at 15:52, James Darnley <jdarnley@obe.tv> wrote: > > > >> int32_t *b1, int32_t *b2, int > >> b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); > >> } > >> > >> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t > *b2, > >> + int32_t *b3, int32_t *b4, int width) > >> +{ > >> + int i = width & ~3; > >> + ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); > >> + for(; i<width; i++) > >> + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); > >> + > >> +} > >> > > > > > > This, along with the rest of the patchset: what's up with the hybrid > > implementations? Couldn't you put the second part in the asm code as > well? > > Now there are 2 function calls instead of 1. > > The 8-bit code does this and I just followed it lead. I believe this is > done because we cannot write junk data beyond what we think is the end > of the line because this might be one of the higher depths and the > coeffs for the next level sit beyond the end of the line. > > But now it has just occurred to me that maybe you meant "why didn't you > do the scalar operations in SIMD?", is that what you meant? Answer is > because it didn't occur to me at the time. Aside from that I always > write do-while loops in assembly because I can usually guarantee 1 run > of the block. > > I can certainly look at making that change. > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > Yep, I think you ought to put the scalar code in the asm.
diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm index c00de32bfe..681de5e1df 100644 --- a/libavcodec/x86/dirac_dwt_10bit.asm +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -25,6 +25,7 @@ SECTION_RODATA cextern pd_1 pd_2: times 4 dd 2 +pd_8: times 4 dd 8 SECTION .text @@ -153,7 +154,44 @@ RET %endmacro +%macro DD97_VERTICAL_HI 0 + +cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w + mova m7, [pd_8] + shl wd, 2 + add b0q, wq + add b1q, wq + add b2q, wq + add b3q, wq + add b4q, wq + neg wq + + ALIGN 16 + .loop: + mova m0, [b0q + wq] + mova m1, [b1q + wq] + mova m2, [b2q + wq] + mova m3, [b3q + wq] + mova m4, [b4q + wq] + pslld m5, m1, 3 + pslld m6, m3, 3 + paddd m5, m1 + paddd m6, m3 + psubd m5, m0 + psubd m6, m4 + paddd m5, m7 + paddd m5, m6 + psrad m5, 4 + paddd m2, m5 + mova [b2q + wq], m2 + add wq, mmsize + jl .loop +RET + +%endmacro + INIT_XMM sse2 +DD97_VERTICAL_HI HAAR_HORIZONTAL HAAR_VERTICAL LEGALL53_VERTICAL_HI diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c index 88cf267d14..e7e7534050 100644 --- a/libavcodec/x86/dirac_dwt_init_10bit.c +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -23,6 +23,8 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/dirac_dwt.h" +void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width); + void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); @@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); } +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, + int32_t *b3, int32_t *b4, int width) +{ + int i = width & ~3; + ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i); + for(; i<width; i++) + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); + +} + av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) { #if HAVE_X86ASM @@ -117,6 +129,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) if (EXTERNAL_SSE2(cpu_flags)) { switch (type) { + case DWT_DIRAC_DD9_7: + d->vertical_compose_h0 = (void*)dd97_vertical_hi_sse2; + d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2; + break; case DWT_DIRAC_LEGALL5_3: d->vertical_compose_h0 = (void*)legall53_vertical_hi_sse2; d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;