[FFmpeg-devel,4/5] avcodec/h264: add avx 8-bit h264_idct_add

Message ID 20170405015328.5476-5-jdarnley@obe.tv
State Superseded

Commit Message

James Darnley April 5, 2017, 1:53 a.m. UTC
Haswell:
 - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext

Skylake-U:
 - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 33 ++++++++++++++++++++++++++++++++-
 libavcodec/x86/h264dsp_init.c |  3 +++
 2 files changed, 35 insertions(+), 1 deletion(-)

Comments

James Almer April 5, 2017, 3:44 a.m. UTC | #1
On 4/4/2017 10:53 PM, James Darnley wrote:
> Haswell:
>  - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext
> 
> Skylake-U:
>  - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext

Again, you should add an SSE2 version first, then an AVX one if it's
measurably faster than the SSE2 one.
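With x86inc that should be nearly free; a minimal sketch of the SSE2 twin,
assuming the IDCT4_ADD body (and the STORE_DIFFx2 override) from the patch
below assembles unchanged when instanced as sse2:

; Hypothetical SSE2 variant: x86inc emits non-VEX encodings under
; INIT_XMM sse2, so the function body can match the avx one verbatim.
INIT_XMM sse2
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD    dst_q, block_q, stride_q
RET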

> ---
>  libavcodec/x86/h264_idct.asm  | 33 ++++++++++++++++++++++++++++++++-
>  libavcodec/x86/h264dsp_init.c |  3 +++
>  2 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
> index bc4dce4..24fb4d2 100644
> --- a/libavcodec/x86/h264_idct.asm
> +++ b/libavcodec/x86/h264_idct.asm
> @@ -65,7 +65,15 @@ SECTION .text
>  
>      IDCT4_1D      w, 0, 1, 2, 3, 4, 5
>      mova         m6, [pw_32]
> -    TRANSPOSE4x4W 0, 1, 2, 3, 4
> +    %if mmsize == 8
> +        TRANSPOSE4x4W 0, 1, 2, 3, 4
> +    %else
> +        punpcklwd m0, m1
> +        punpcklwd m2, m3
> +        SBUTTERFLY dq, 0, 2, 4
> +        MOVHL m1, m0
> +        MOVHL m3, m2
> +    %endif
>      paddw        m0, m6
>      IDCT4_1D      w, 0, 1, 2, 3, 4, 5
>      pxor         m7, m7
> @@ -1131,3 +1139,26 @@ INIT_MMX mmx
>  IDCT_DC_DEQUANT 0
>  INIT_MMX sse2
>  IDCT_DC_DEQUANT 7
> +
> +INIT_XMM avx
> +
> +; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
> +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
> +    movd       %3, [%7]
> +    movd       %4, [%7+%8]
> +    psraw      %1, %6
> +    psraw      %2, %6
> +    punpcklbw  %3, %5
> +    punpcklbw  %4, %5
> +    paddw      %3, %1
> +    paddw      %4, %2
> +    packuswb   %3, %5
> +    packuswb   %4, %5
> +    movd     [%7], %3
> +    movd  [%7+%8], %4
> +%endmacro
> +
> +cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
> +    movsxdifnidn stride_q, stride_d
> +    IDCT4_ADD    dst_q, block_q, stride_q
> +RET
> diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
> index 0643b37..8ba085f 100644
> --- a/libavcodec/x86/h264dsp_init.c
> +++ b/libavcodec/x86/h264dsp_init.c
> @@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
>                                                         int stride);
>  
>  IDCT_ADD_FUNC(, 8, mmx)
> +IDCT_ADD_FUNC(, 8, avx)
>  IDCT_ADD_FUNC(, 10, sse2)
>  IDCT_ADD_FUNC(_dc, 8, mmxext)
>  IDCT_ADD_FUNC(_dc, 10, mmxext)
> @@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
>                  c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
>                  c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
>              }
> +
> +            c->h264_idct_add        = ff_h264_idct_add_8_avx;
>          }
>      } else if (bit_depth == 10) {
>          if (EXTERNAL_MMXEXT(cpu_flags)) {
>
James Darnley April 6, 2017, 3:34 p.m. UTC | #2
On 2017-04-05 05:44, James Almer wrote:
> On 4/4/2017 10:53 PM, James Darnley wrote:
>> Haswell:
>>  - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext
>>
>> Skylake-U:
>>  - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
> 
> Again, you should add an SSE2 version first, then an AVX one if it's
> measurably faster than the SSE2 one.

On a Yorkfield, sse2 is barely faster: 1.02x faster (728±2.1 vs. 710±3.9
decicycles), so only 1 or 2 cycles.

On a Skylake-U, sse2 provides most of the speedup: 1.15x faster (661±2.2 vs.
573±1.9 decicycles).  Then avx gains a mere 3 cycles: 547±0.5.

On a Haswell, sse2 provides only half the speedup:
 - sse2: 1.06x faster (525±2.5 vs 497±1.0 decicycles)
 - avx:  1.06x faster (497±1.0 vs 468±1.2 decicycles)

(All on 64-bit Linux)

On a Nehalem under 64-bit Windows, sse2 is slower: 0.92x (597±3.0 vs.
650±9.3 decicycles).

And on that note I should probably recheck the deblock patches I pushed
a little while ago.

So...  SSE2 for this function, yay or nay?
James Almer April 6, 2017, 4:06 p.m. UTC | #3
On 4/6/2017 12:34 PM, James Darnley wrote:
> On 2017-04-05 05:44, James Almer wrote:
>> On 4/4/2017 10:53 PM, James Darnley wrote:
>>> Haswell:
>>>  - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext
>>>
>>> Skylake-U:
>>>  - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
>>
>> Again, you should add an SSE2 version first, then an AVX one if it's
>> measurably faster than the SSE2 one.
> 
> On a Yorkfield sse2 is barely faster: 1.02x faster (728±2.1 vs. 710±3.9
> decicycles).  So 1 or 2 cycles
> 
> On a Skylake-U sse2 is most of the speedup: 1.15x faster (661±2.2 vs
> 573±1.9).  Then avx gains a mere 3 cycles: 547±0.5
> 
> On a Haswell sse2 provides only half the speedup:
>  - sse2: 1.06x faster (525±2.5 vs 497±1.0 decicycles)
>  - avx:  1.06x faster (497±1.0 vs 468±1.2 decicycles)
> 
> (All on 64-bit Linux)
> 
> On Nehalem and 64-bit Windows sse2 is slower:  0.92x faster (597±3.0 vs.
> 650±9.3 decicycles)

Slower than what? MMX or AVX?

Your numbers are really confusing. Could you post the actual numbers for
each function instead of doing comparisons?

ff_h264_idct_add_8_mmx  = ??? cycles
ff_h264_idct_add_8_sse2 = ??? cycles
ff_h264_idct_add_8_avx  = ??? cycles

Does checkasm support these functions? Maybe you could just run that and
paste the results you get, which would be easier and faster.
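For reference, building and running it would be something along these lines
(assuming the h264dsp test covers the idct_add functions):

make checkasm
./tests/checkasm/checkasm --test=h264dsp --bench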

> 
> And on that note I should probably recheck the deblock patches I pushed
> a little while ago.
> 
> So...  SSE2 for this function, yay or nay?

By default, always add SSE2 if it's measurably faster than MMX, especially
when you take advantage of the wider XMM regs, as you're doing here.
What you need to check is whether adding AVX is worth it on top of SSE2 when
you're not taking advantage of the wider YMM regs, as happens here.
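A minimal sketch of that layering in ff_h264dsp_init_x86(), assuming an
ff_h264_idct_add_8_sse2 is added first (this patch only provides the avx
function):

/* Inside the bit_depth == 8 path; cpu_flags as in the surrounding init code. */
if (EXTERNAL_SSE2(cpu_flags))
    c->h264_idct_add = ff_h264_idct_add_8_sse2; /* hypothetical, not in this patch */
if (EXTERNAL_AVX(cpu_flags))
    c->h264_idct_add = ff_h264_idct_add_8_avx;  /* overrides sse2 only where avx wins */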
James Darnley April 14, 2017, 11:26 a.m. UTC | #4
On 2017-04-06 18:06, James Almer wrote:
> Your numbers are really confusing. Could you post the actual numbers for
> each function instead of doing comparisons?

These figures are the actual numbers; they are just in decicycles (tenths of a cycle)!

Using the figures from Haswell above:
> ff_h264_idct_add_8_mmx  = 52 cycles
> ff_h264_idct_add_8_sse2 = 49 cycles
> ff_h264_idct_add_8_avx  = 46 cycles

Coming back to this draft I had saved, I removed a fair bit of ranting and
cut it down to the essential point.

Also, I forgot about the Kaby Lake Pentium I tested previous patches on,
where I added SSE2.  From that commit message:
> Kaby Lake Pentium:
>  - ff_h264_idct_add_8_sse2:    ~1.18x faster than mmxext
>  - ff_h264_idct_dc_add_8_sse2: ~1.07x faster than mmxext

Patch

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index bc4dce4..24fb4d2 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -65,7 +65,15 @@ SECTION .text
 
     IDCT4_1D      w, 0, 1, 2, 3, 4, 5
     mova         m6, [pw_32]
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %if mmsize == 8
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %else
+        punpcklwd m0, m1
+        punpcklwd m2, m3
+        SBUTTERFLY dq, 0, 2, 4
+        MOVHL m1, m0
+        MOVHL m3, m2
+    %endif
     paddw        m0, m6
     IDCT4_1D      w, 0, 1, 2, 3, 4, 5
     pxor         m7, m7
@@ -1131,3 +1139,26 @@ INIT_MMX mmx
 IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
+
+INIT_XMM avx
+
+; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+    movd       %3, [%7]
+    movd       %4, [%7+%8]
+    psraw      %1, %6
+    psraw      %2, %6
+    punpcklbw  %3, %5
+    punpcklbw  %4, %5
+    paddw      %3, %1
+    paddw      %4, %2
+    packuswb   %3, %5
+    packuswb   %4, %5
+    movd     [%7], %3
+    movd  [%7+%8], %4
+%endmacro
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
+    movsxdifnidn stride_q, stride_d
+    IDCT4_ADD    dst_q, block_q, stride_q
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 0643b37..8ba085f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                        int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
@@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                 c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
             }
+
+            c->h264_idct_add        = ff_h264_idct_add_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {