[FFmpeg-devel,3/6] avcodec/x86: add x86-64 8-bit simple_idct function

Submitted by James Darnley on June 12, 2017, 1:36 p.m.

Details

Message ID 20170612133609.24172-4-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley June 12, 2017, 1:36 p.m.
Rounding contributed by Ronald S. Bultje
---
 libavcodec/tests/x86/dct.c       |  2 ++
 libavcodec/x86/idctdsp_init.c    | 19 +++++++++++++++++++
 libavcodec/x86/simple_idct.h     |  3 +++
 libavcodec/x86/simple_idct10.asm |  8 ++++++++
 4 files changed, 32 insertions(+)

Comments

Michael Niedermayer June 12, 2017, 4:57 p.m.
On Mon, Jun 12, 2017 at 03:36:06PM +0200, James Darnley wrote:
> Rounding contributed by Ronald S. Bultje
> ---
>  libavcodec/tests/x86/dct.c       |  2 ++
>  libavcodec/x86/idctdsp_init.c    | 19 +++++++++++++++++++
>  libavcodec/x86/simple_idct.h     |  3 +++
>  libavcodec/x86/simple_idct10.asm |  8 ++++++++
>  4 files changed, 32 insertions(+)

this (3) and the patches 1 and 2 break the idct

./ffplay ~/videos/matrixbench_mpeg2.mpg
looks pretty bad

[...]

Patch hide | download patch | download mbox

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 34f5b8767b..317d973f9f 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,10 +88,12 @@  static const struct algo idct_tab_arch[] = {
 #if HAVE_YASM
 #if ARCH_X86_64
 #if HAVE_SSE2_EXTERNAL
+    { "SIMPLE8-SSE2",   ff_simple_idct8_sse2,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
     { "SIMPLE10-SSE2",  ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
     { "SIMPLE12-SSE2",  ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
 #endif
 #if HAVE_AVX_EXTERNAL
+    { "SIMPLE8-AVX",    ff_simple_idct8_avx,   FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
     { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
     { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX,  1 },
 #endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index f1c915aa00..4b2145e478 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -94,9 +94,28 @@  av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 c->idct_add  = ff_simple_idct_add_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
+
+        if (ARCH_X86_64 &&
+            !high_bit_depth &&
+            avctx->lowres == 0 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct8_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
     }
 
     if (ARCH_X86_64 && avctx->lowres == 0) {
+        if (EXTERNAL_AVX(cpu_flags) &&
+            !high_bit_depth &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct8_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+        }
+
         if (avctx->bits_per_raw_sample == 10 &&
             (avctx->idct_algo == FF_IDCT_AUTO ||
              avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d17ef6a462..d17a855312 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -29,6 +29,9 @@  void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
+void ff_simple_idct8_sse2(int16_t *block);
+void ff_simple_idct8_avx(int16_t *block);
+
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index 1a5a2eae9b..168b6a08e0 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -33,9 +33,11 @@  cextern pw_2
 cextern pw_16
 cextern pw_1023
 cextern pw_4095
+pd_round_11: times 4 dd 1<<(11-1)
 pd_round_12: times 4 dd 1<<(12-1)
 pd_round_15: times 4 dd 1<<(15-1)
 pd_round_19: times 4 dd 1<<(19-1)
+pd_round_20: times 4 dd 1<<(20-1)
 
 %macro CONST_DEC  3
 const %1
@@ -50,6 +52,8 @@  times 4 dw %2, %3
 %define W6sh2  8867 ; W6 = 35468 =  8867<<2
 %define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
 
+pw_round_20_div_w4: times 8 dw ((1 << (20 - 1)) / W4sh2)
+
 CONST_DEC  w4_plus_w2,   W4sh2, +W2sh2
 CONST_DEC  w4_min_w2,    W4sh2, -W2sh2
 CONST_DEC  w4_plus_w6,   W4sh2, +W6sh2
@@ -68,6 +72,10 @@  CONST_DEC  w7_min_w5,    W7sh2, -W5sh2
 SECTION .text
 
 %macro idct_fn 0
+cglobal simple_idct8, 1, 1, 16, block
+    IDCT_FN    "", 11, pw_round_20_div_w4, 20, "store"
+RET
+
 cglobal simple_idct10, 1, 1, 16, block
     IDCT_FN    "", 12, "", 19, "store"
     RET