[FFmpeg-devel,5/6] avcodec/x86: add x86-64 8-bit simple_idct add function

Submitted by James Darnley on June 12, 2017, 1:36 p.m.

Details

Message ID 20170612133609.24172-6-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley June 12, 2017, 1:36 p.m.
---
 libavcodec/x86/idctdsp_init.c    |  2 ++
 libavcodec/x86/simple_idct.h     |  3 ++
 libavcodec/x86/simple_idct10.asm | 61 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)

Patch hide | download patch | download mbox

diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 1826d01e0e..9da60d1a1e 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -103,6 +103,7 @@  av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct      = ff_simple_idct8_sse2;
                 c->idct_put  = ff_simple_idct8_put_sse2;
+                c->idct_add  = ff_simple_idct8_add_sse2;
                 c->perm_type = FF_IDCT_PERM_TRANSPOSE;
         }
     }
@@ -115,6 +116,7 @@  av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct      = ff_simple_idct8_avx;
                 c->idct_put  = ff_simple_idct8_put_avx;
+                c->idct_add  = ff_simple_idct8_add_avx;
                 c->perm_type = FF_IDCT_PERM_TRANSPOSE;
         }
 
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index b559f8527c..9b64cfe9bc 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -35,6 +35,9 @@  void ff_simple_idct8_avx(int16_t *block);
 void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
+void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index f31fb5cfa5..29e18fe6a6 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -82,6 +82,31 @@  SECTION .text
     movhps %8, %12
 %endmacro
 
+%macro LOAD_ZXBW_8 16 ; 8 dst regs (%1-%8), 8 src operands (%9-%16)
+    pmovzxbw %1, %9 ; each dst = bytes of the matching src zero-extended to words
+    pmovzxbw %2, %10
+    pmovzxbw %3, %11
+    pmovzxbw %4, %12
+    pmovzxbw %5, %13
+    pmovzxbw %6, %14
+    pmovzxbw %7, %15
+    pmovzxbw %8, %16 ; pmovzxbw is SSE4.1: callers must guard the use (see cpuflag(sse4) at the call site)
+%endmacro
+
+%macro LOAD_ZXBW_4 9 ; 4 dst regs (%1-%4), 4 src operands (%5-%8), 1 reg the caller has zeroed (%9)
+    movh %1, %5 ; movh is defined outside this chunk; presumably the x86inc half-register load
+    movh %2, %6
+    movh %3, %7
+    movh %4, %8
+    punpcklbw %1, %9 ; interleave low bytes with %9: yields zero-extended words only if %9 is all zero
+    punpcklbw %2, %9
+    punpcklbw %3, %9
+    punpcklbw %4, %9 ; %9 is read only, never written, by this macro
+%endmacro
+
+%define PASS4ROWS(base, stride, stride3) \
+    [base], [base + stride], [base + 2*stride], [base + stride3] ; 4 memory operands; consecutive rows when stride3 = 3*stride
+
 %macro idct_fn 0
 cglobal simple_idct8, 1, 1, 16, block
     IDCT_FN    "", 11, pw_round_20_div_w4, 20, "store"
@@ -99,6 +124,42 @@  cglobal simple_idct8_put, 3, 4, 16, pixels, lsize, block
     STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9
 RET
 
+; Add the IDCT_FN result for block to the 8 rows of 8 bytes at pixels (row stride lsize), saturating to 0..255.
+; TODO: optimise by not writing the final data to the block.
+cglobal simple_idct8_add, 3, 4, 16, pixels, lsize, block
+    IDCT_FN    "", 11, pw_round_20_div_w4, 20
+    lea r2, [3*lsizeq]                  ; r2 = 3*lsize (r2 was the block argument; assumed dead after IDCT_FN)
+    lea r3, [pixelsq + r2]              ; r3 = address of row 3, second base for PASS8ROWS
+    %if cpuflag(sse4) ; pmovzxbw available: one zero-extending load per row
+        LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
+        paddsw m8, m3                   ; rows 0-7 are added to m8, m0, m1, m2, m4, m11, m9, m10 in that order
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        paddsw m4, m12
+        paddsw m11, m13
+        paddsw m9, m14
+        paddsw m10, m15
+    %else ; no pmovzxbw: unpack against zero, 4 rows at a time; pixelsq must stay on row 0 for the store below
+        pxor m12, m12
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
+        paddsw m8, m3                   ; rows 0-3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        LOAD_ZXBW_4 m3, m5, m6, m7, [r3 + lsizeq], [r3 + 2*lsizeq], [r3 + r2], [r3 + 4*lsizeq], m12
+        paddsw m4, m3                   ; rows 4-7, addressed from r3 (row 3) instead of advancing pixelsq
+        paddsw m11, m5
+        paddsw m9, m6
+        paddsw m10, m7
+    %endif
+    packuswb  m8, m0                    ; saturate words to unsigned bytes, two rows per register
+    packuswb  m1, m2
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9
+RET
+
 cglobal simple_idct10, 1, 1, 16, block
     IDCT_FN    "", 12, "", 19, "store"
     RET