diff mbox

[FFmpeg-devel,3/6] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function

Message ID 20180719145252.30613-4-jdarnley@obe.tv
State Superseded
Headers show

Commit Message

James Darnley July 19, 2018, 2:52 p.m. UTC
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C:     84fps
SSE2: 111fps
AVX2: 115fps
---
 libavcodec/x86/dirac_dwt_10bit.asm    | 38 +++++++++++++++++++++++++++
 libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++++++++++
 2 files changed, 54 insertions(+)

Comments

Rostislav Pehlivanov July 19, 2018, 3:26 p.m. UTC | #1
On 19 July 2018 at 15:52, James Darnley <jdarnley@obe.tv> wrote:

> Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
> relevant transform.
> C:     84fps
> SSE2: 111fps
> AVX2: 115fps
> ---
>  libavcodec/x86/dirac_dwt_10bit.asm    | 38 +++++++++++++++++++++++++++
>  libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++++++++++
>  2 files changed, 54 insertions(+)
>
> diff --git a/libavcodec/x86/dirac_dwt_10bit.asm
> b/libavcodec/x86/dirac_dwt_10bit.asm
> index c00de32bfe..681de5e1df 100644
> --- a/libavcodec/x86/dirac_dwt_10bit.asm
> +++ b/libavcodec/x86/dirac_dwt_10bit.asm
> @@ -25,6 +25,7 @@ SECTION_RODATA
>
>  cextern pd_1
>  pd_2: times 4 dd 2
> +pd_8: times 4 dd 8
>
>  SECTION .text
>
> @@ -153,7 +154,44 @@ RET
>
>  %endmacro
>
> +%macro DD97_VERTICAL_HI 0
> +
> +cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w
> +    mova m7, [pd_8]
> +    shl wd, 2
> +    add b0q, wq
> +    add b1q, wq
> +    add b2q, wq
> +    add b3q, wq
> +    add b4q, wq
> +    neg wq
> +
> +    ALIGN 16
> +    .loop:
> +        mova m0, [b0q + wq]
> +        mova m1, [b1q + wq]
> +        mova m2, [b2q + wq]
> +        mova m3, [b3q + wq]
> +        mova m4, [b4q + wq]
> +        pslld m5, m1, 3
> +        pslld m6, m3, 3
> +        paddd m5, m1
> +        paddd m6, m3
> +        psubd m5, m0
> +        psubd m6, m4
> +        paddd m5, m7
> +        paddd m5, m6
> +        psrad m5, 4
> +        paddd m2, m5
> +        mova [b2q + wq], m2
> +        add wq, mmsize
> +    jl .loop
> +RET
> +
> +%endmacro
> +
>  INIT_XMM sse2
> +DD97_VERTICAL_HI
>  HAAR_HORIZONTAL
>  HAAR_VERTICAL
>  LEGALL53_VERTICAL_HI
> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
> b/libavcodec/x86/dirac_dwt_init_10bit.c
> index 88cf267d14..e7e7534050 100644
> --- a/libavcodec/x86/dirac_dwt_init_10bit.c
> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
> @@ -23,6 +23,8 @@
>  #include "libavutil/x86/cpu.h"
>  #include "libavcodec/dirac_dwt.h"
>
> +void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
> int32_t *b3, int32_t *b4, int width);
> +
>  void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
> int width);
>  void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
> int width);
>
> @@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0,
> int32_t *b1, int32_t *b2, int
>          b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
>  }
>
> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
> +                                  int32_t *b3, int32_t *b4, int width)
> +{
> +    int i = width & ~3;
> +    ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
> +    for(; i<width; i++)
> +        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
> +
> +}
>


This, along with the rest of the patchset: what's up with the hybrid
implementations? Couldn't you put the second part in the asm code as well?
Now there are 2 function calls instead of 1.
James Darnley July 19, 2018, 3:52 p.m. UTC | #2
On 2018-07-19 17:26, Rostislav Pehlivanov wrote:
> On 19 July 2018 at 15:52, James Darnley <jdarnley@obe.tv> wrote:
> 
>> int32_t *b1, int32_t *b2, int
>>          b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
>>  }
>>
>> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
>> +                                  int32_t *b3, int32_t *b4, int width)
>> +{
>> +    int i = width & ~3;
>> +    ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
>> +    for(; i<width; i++)
>> +        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
>> +
>> +}
>>
> 
> 
> This, along with the rest of the patchset: what's up with the hybrid
> implementations? Couldn't you put the second part in the asm code as well?
> Now there are 2 function calls instead of 1.

The 8-bit code does this and I just followed it lead.  I believe this is
done because we cannot write junk data beyond what we think is the end
of the line because this might be one of the higher depths and the
coeffs for the next level sit beyond the end of the line.

But now it has just occurred to me that maybe you meant "why didn't you
do the scalar operations in SIMD?", is that what you meant?  Answer is
because it didn't occur to me at the time.  Aside from that I always
write do-while loops in assembly because I can usually guarantee 1 run
of the block.

I can certainly look at making that change.
Rostislav Pehlivanov July 19, 2018, 4:09 p.m. UTC | #3
On 19 July 2018 at 16:52, James Darnley <jdarnley@obe.tv> wrote:

> On 2018-07-19 17:26, Rostislav Pehlivanov wrote:
> > On 19 July 2018 at 15:52, James Darnley <jdarnley@obe.tv> wrote:
> >
> >> int32_t *b1, int32_t *b2, int
> >>          b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
> >>  }
> >>
> >> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t
> *b2,
> >> +                                  int32_t *b3, int32_t *b4, int width)
> >> +{
> >> +    int i = width & ~3;
> >> +    ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
> >> +    for(; i<width; i++)
> >> +        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
> >> +
> >> +}
> >>
> >
> >
> > This, along with the rest of the patchset: what's up with the hybrid
> > implementations? Couldn't you put the second part in the asm code as
> well?
> > Now there are 2 function calls instead of 1.
>
> The 8-bit code does this and I just followed it lead.  I believe this is
> done because we cannot write junk data beyond what we think is the end
> of the line because this might be one of the higher depths and the
> coeffs for the next level sit beyond the end of the line.
>
> But now it has just occurred to me that maybe you meant "why didn't you
> do the scalar operations in SIMD?", is that what you meant?  Answer is
> because it didn't occur to me at the time.  Aside from that I always
> write do-while loops in assembly because I can usually guarantee 1 run
> of the block.
>
> I can certainly look at making that change.
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

Yep, I think you ought to put the scalar code in the asm.
diff mbox

Patch

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
index c00de32bfe..681de5e1df 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -25,6 +25,7 @@  SECTION_RODATA
 
 cextern pd_1
 pd_2: times 4 dd 2
+pd_8: times 4 dd 8
 
 SECTION .text
 
@@ -153,7 +154,44 @@  RET
 
 %endmacro
 
+%macro DD97_VERTICAL_HI 0
+
+cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w
+    mova m7, [pd_8]
+    shl wd, 2
+    add b0q, wq
+    add b1q, wq
+    add b2q, wq
+    add b3q, wq
+    add b4q, wq
+    neg wq
+
+    ALIGN 16
+    .loop:
+        mova m0, [b0q + wq]
+        mova m1, [b1q + wq]
+        mova m2, [b2q + wq]
+        mova m3, [b3q + wq]
+        mova m4, [b4q + wq]
+        pslld m5, m1, 3
+        pslld m6, m3, 3
+        paddd m5, m1
+        paddd m6, m3
+        psubd m5, m0
+        psubd m6, m4
+        paddd m5, m7
+        paddd m5, m6
+        psrad m5, 4
+        paddd m2, m5
+        mova [b2q + wq], m2
+        add wq, mmsize
+    jl .loop
+RET
+
+%endmacro
+
 INIT_XMM sse2
+DD97_VERTICAL_HI
 HAAR_HORIZONTAL
 HAAR_VERTICAL
 LEGALL53_VERTICAL_HI
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
index 88cf267d14..e7e7534050 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,8 @@ 
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width);
+
 void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
 void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
 
@@ -110,6 +112,16 @@  static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int
         b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
 }
 
+static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
+                                  int32_t *b3, int32_t *b4, int width)
+{
+    int i = width & ~3;
+    ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
+    for(; i<width; i++)
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+
+}
+
 av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 {
 #if HAVE_X86ASM
@@ -117,6 +129,10 @@  av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         switch (type) {
+            case DWT_DIRAC_DD9_7:
+                d->vertical_compose_h0 = (void*)dd97_vertical_hi_sse2;
+                d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
+                break;
             case DWT_DIRAC_LEGALL5_3:
                 d->vertical_compose_h0 = (void*)legall53_vertical_hi_sse2;
                 d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;