[FFmpeg-devel,2/3] diracdec: add 10-bit Legall 5, 3 (5_3) SIMD functions

Submitted by James Darnley on July 26, 2018, 11:28 a.m.

Details

Message ID 20180726112808.11792-3-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley July 26, 2018, 11:28 a.m.
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C:     94fps
SSE2: 118fps
AVX2: 121fps

legall vertical hi
    sse2: 3.86x faster (20201 vs. 5231 decicycles) compared with C
    avx2: 6.70x faster (20201 vs. 3014 decicycles) compared with C
legall vertical lo
    sse2: 1.50x faster (28345 vs. 18908 decicycles) compared with C
    avx2: 1.63x faster (28345 vs. 17361 decicycles) compared with C
---
 libavcodec/x86/dirac_dwt_10bit.asm    | 105 +++++++++++++++++++++++++-
 libavcodec/x86/dirac_dwt_init_10bit.c |  13 ++++
 2 files changed, 117 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
index baea91329e..0295e6f554 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -21,9 +21,10 @@ 
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
 cextern pd_1
+pd_2: times 8 dd 2 ; eight dwords of 2 (32 bytes): rounding bias loaded by legall53_vertical_lo before its >> 2
 
 SECTION .text
 
@@ -147,9 +148,109 @@  REP_RET
 
 %endmacro
 
+%macro LEGALL53_VERTICAL_LO 0
+; void legall53_vertical_lo(int32_t *b0, int32_t *b1, int32_t *b2, int w): b1[i] -= (b0[i] + b2[i] + 2) >> 2, 0 <= i < w
+cglobal legall53_vertical_lo, 4, 6, 4, b0, b1, b2, w
+    DECLARE_REG_TMP 3,4,5               ; t0 is r3, the same register as w: only written once w is dead
+
+    mova  m3, [pd_2]                    ; m3 = rounding bias, 2 in every dword
+    mov  t2d, wd                        ; keep the full width for the scalar tail
+    and   wd, ~(mmsize/4 - 1)           ; w = number of elements held in whole vectors
+    shl   wd, 2                         ; elements -> bytes (4-byte samples)
+    add  b0q, wq                        ; advance all rows past the vector part;
+    add  b1q, wq                        ; the loop indexes them with a negative
+    add  b2q, wq                        ; offset that counts up to zero
+    neg   wq                            ; ZF=1 when there is no whole vector
+    jz .tail                            ; .loop tests at the bottom, so it must not be entered with a zero offset
+    ALIGN 16
+    .loop:
+        mova m0, [b0q + wq]             ; NOTE(review): mova presumably needs mmsize-aligned rows - confirm with caller
+        mova m1, [b1q + wq]
+        mova m2, [b2q + wq]
+        paddd m0, m2                    ; b0 + b2
+        paddd m0, m3                    ; + 2
+        psrad m0, 2                     ; arithmetic >> 2
+        psubd m1, m0                    ; b1 - ((b0 + b2 + 2) >> 2)
+        mova [b1q + wq], m1
+        add wq, mmsize
+    jl .loop                            ; until the offset reaches zero
+    .tail:
+    and  t2d, mmsize/4 - 1              ; t2 = elements left over after the whole vectors
+    jz .end
+    .loop_scalar:                       ; same formula, one sample per iteration
+        mov t0d, [b0q]
+        mov t1d, [b1q]
+        add t0d, [b2q]
+        add t0d, 2
+        sar t0d, 2
+        sub t1d, t0d
+        mov [b1q], t1d
+
+        add b0q, 4
+        add b1q, 4
+        add b2q, 4
+        sub t2d, 1
+    jg .loop_scalar
+
+    .end:
+RET
+
+%endmacro
+
+%macro LEGALL53_VERTICAL_HI 0
+; void legall53_vertical_hi(int32_t *b0, int32_t *b1, int32_t *b2, int w): b1[i] += (b0[i] + b2[i] + 1) >> 1, 0 <= i < w
+cglobal legall53_vertical_hi, 4, 6, 4, b0, b1, b2, w
+    DECLARE_REG_TMP 3,4,5               ; t0 is r3, the same register as w: only written once w is dead
+
+    mova  m3, [pd_1]                    ; m3 = rounding bias loaded from the external pd_1 constant
+    mov  t2d, wd                        ; keep the full width for the scalar tail
+    and   wd, ~(mmsize/4 - 1)           ; w = number of elements held in whole vectors
+    shl   wd, 2                         ; elements -> bytes (4-byte samples)
+    add  b0q, wq                        ; advance all rows past the vector part;
+    add  b1q, wq                        ; the loop indexes them with a negative
+    add  b2q, wq                        ; offset that counts up to zero
+    neg   wq                            ; ZF=1 when there is no whole vector
+    jz .tail                            ; .loop tests at the bottom, so it must not be entered with a zero offset
+    ALIGN 16
+    .loop:
+        mova m0, [b0q + wq]             ; NOTE(review): mova presumably needs mmsize-aligned rows - confirm with caller
+        mova m1, [b1q + wq]
+        mova m2, [b2q + wq]
+        paddd m0, m2                    ; b0 + b2
+        paddd m0, m3                    ; + rounding bias
+        psrad m0, 1                     ; arithmetic >> 1
+        paddd m1, m0                    ; b1 + ((b0 + b2 + bias) >> 1)
+        mova [b1q + wq], m1
+        add wq, mmsize
+    jl .loop                            ; until the offset reaches zero
+    .tail:
+    and  t2d, mmsize/4 - 1              ; t2 = elements left over after the whole vectors
+    jz .end
+    .loop_scalar:                       ; scalar version of the loop above, with an explicit bias of 1
+        mov t0d, [b0q]
+        mov t1d, [b1q]
+        add t0d, [b2q]
+        add t0d, 1
+        sar t0d, 1
+        add t1d, t0d
+        mov [b1q], t1d
+
+        add b0q, 4
+        add b1q, 4
+        add b2q, 4
+        sub t2d, 1
+    jg .loop_scalar
+
+    .end:
+RET
+
+%endmacro
+
 INIT_XMM sse2
 HAAR_HORIZONTAL
 HAAR_VERTICAL
+LEGALL53_VERTICAL_HI ; expanded under INIT_XMM sse2; the C side declares it as ff_legall53_vertical_hi_sse2
+LEGALL53_VERTICAL_LO ; expanded under INIT_XMM sse2; the C side declares it as ff_legall53_vertical_lo_sse2
 
 INIT_XMM avx
 HAAR_HORIZONTAL
@@ -158,3 +259,5 @@  HAAR_VERTICAL
 INIT_YMM avx2
 HAAR_HORIZONTAL
 HAAR_VERTICAL
+LEGALL53_VERTICAL_HI ; expanded under INIT_YMM avx2; the C side declares it as ff_legall53_vertical_hi_avx2
+LEGALL53_VERTICAL_LO ; expanded under INIT_YMM avx2; the C side declares it as ff_legall53_vertical_lo_avx2
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
index 289862d728..d1234efac5 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,11 @@ 
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); /* in place on b1: b1[i] += (b0[i] + b2[i] + 1) >> 1 */
+void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); /* in place on b1: b1[i] -= (b0[i] + b2[i] + 2) >> 2 */
+void ff_legall53_vertical_hi_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); /* AVX2 build of the _hi routine above */
+void ff_legall53_vertical_lo_avx2(int32_t *b0, int32_t *b1, int32_t *b2, int width); /* AVX2 build of the _lo routine above */
+
 void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
 void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
 void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
@@ -38,6 +43,10 @@  av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         switch (type) {
+            case DWT_DIRAC_LEGALL5_3:
+                d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_sse2;
+                d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_sse2;
+                break;
             case DWT_DIRAC_HAAR0:
                 d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2;
                 break;
@@ -62,6 +71,10 @@  av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 
     if (EXTERNAL_AVX2(cpu_flags)) {
         switch (type) {
+            case DWT_DIRAC_LEGALL5_3:
+                d->vertical_compose_h0 = (void*)ff_legall53_vertical_hi_avx2;
+                d->vertical_compose_l0 = (void*)ff_legall53_vertical_lo_avx2;
+                break;
             case DWT_DIRAC_HAAR0:
                 d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2;
                 break;