diff mbox

[FFmpeg-devel,5/5] avcodec/h264: add avx 8-bit h264_idct_dc_add

Message ID 20170405015328.5476-6-jdarnley@obe.tv
State Superseded
Headers show

Commit Message

James Darnley April 5, 2017, 1:53 a.m. UTC
Haswell:
 - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext

Skylake-U:
 - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 20 ++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 22 insertions(+)

Comments

James Almer April 5, 2017, 4:05 a.m. UTC | #1
On 4/4/2017 10:53 PM, James Darnley wrote:
> Haswell:
>  - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext
> 
> Skylake-U:
>  - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
> ---
>  libavcodec/x86/h264_idct.asm  | 20 ++++++++++++++++++++
>  libavcodec/x86/h264dsp_init.c |  2 ++
>  2 files changed, 22 insertions(+)
> 
> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
> index 24fb4d2..7fd57d3 100644
> --- a/libavcodec/x86/h264_idct.asm
> +++ b/libavcodec/x86/h264_idct.asm
> @@ -1158,7 +1158,27 @@ INIT_XMM avx
>      movd  [%7+%8], %4
>  %endmacro
>  
> +%macro DC_ADD_INIT 1
> +    add      %1d, 32
> +    sar      %1d, 6
> +    movd     m0, %1d
> +    SPLATW   m0, m0, 0

Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be
enough. This macro calls two instructions to fill the entire XMM register,
and there's no need for that.
You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining
said dwords with punpk* into fewer registers to reduce the amount of padd*
and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm

And again, SSE2 first, AVX only if measurably faster. But since you're not
making use of the wider XMM regs here at all, the only chips that will see
any real speed up are those slow in mmx (like Skylake seems to be).

> +    lea      %1, [3*stride_q]
> +    pxor     m1, m1
> +    psubw    m1, m0
> +    packuswb m0, m0
> +    packuswb m1, m1
> +%endmacro
> +
>  cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
>      movsxdifnidn stride_q, stride_d
>      IDCT4_ADD    dst_q, block_q, stride_q
>  RET
> +
> +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_
> +    movsxdifnidn stride_q, stride_d
> +    movsx             r3d, word [block_q]
> +    mov   dword [block_q], 0
> +    DC_ADD_INIT r3
> +    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
> +RET
> diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
> index 8ba085f..bf74937 100644
> --- a/libavcodec/x86/h264dsp_init.c
> +++ b/libavcodec/x86/h264dsp_init.c
> @@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx)
>  IDCT_ADD_FUNC(, 8, avx)
>  IDCT_ADD_FUNC(, 10, sse2)
>  IDCT_ADD_FUNC(_dc, 8, mmxext)
> +IDCT_ADD_FUNC(_dc, 8, avx)
>  IDCT_ADD_FUNC(_dc, 10, mmxext)
>  IDCT_ADD_FUNC(8_dc, 8, mmxext)
>  IDCT_ADD_FUNC(8_dc, 10, sse2)
> @@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
>              }
>  
>              c->h264_idct_add        = ff_h264_idct_add_8_avx;
> +            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
>          }
>      } else if (bit_depth == 10) {
>          if (EXTERNAL_MMXEXT(cpu_flags)) {
>
James Darnley April 6, 2017, 4:01 p.m. UTC | #2
On 2017-04-05 06:05, James Almer wrote:
> On 4/4/2017 10:53 PM, James Darnley wrote:
>> Haswell:
>>  - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext
>>
>> Skylake-U:
>>  - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
>> ---
>>  libavcodec/x86/h264_idct.asm  | 20 ++++++++++++++++++++
>>  libavcodec/x86/h264dsp_init.c |  2 ++
>>  2 files changed, 22 insertions(+)
>>
>> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
>> index 24fb4d2..7fd57d3 100644
>> --- a/libavcodec/x86/h264_idct.asm
>> +++ b/libavcodec/x86/h264_idct.asm
>> @@ -1158,7 +1158,27 @@ INIT_XMM avx
>>      movd  [%7+%8], %4
>>  %endmacro
>>  
>> +%macro DC_ADD_INIT 1
>> +    add      %1d, 32
>> +    sar      %1d, 6
>> +    movd     m0, %1d
>> +    SPLATW   m0, m0, 0
> 
> Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be
> enough. This macro calls two instructions to fill the entire XMM register,
> and there's no need for that.

Noted, I made that change but it doesn't seem to change much in terms of
performance.

> You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining
> said dwords with punpk* into fewer registers to reduce the amount of padd*
> and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm

Noted.  Maybe in the future.

> And again, SSE2 first, AVX only if measurably faster. But since you're not
> making use of the wider XMM regs here at all, the only chips that will see
> any real speed up are those slow in mmx (like Skylake seems to be).

Yorkfield gets no benefit from sse2 (575±0.4 vs. 574±0.3 decicycles).
Haswell gets most of its benefit from sse2 (404±0.6 vs. 390±0.3 vs.
388±0.3).
Skylake-U gets all of its speedup from sse2 (533±3.0 vs 488±2.0 vs 497±1.4).

Nehalem and 64-bit also get no benefit from sse2.

Again: SSE2 yay or nay?  Maybe I should just drop this; I'm not sure 5
cycles is worth it.

(I will now go and modify my script to divide the recorded decicycle
count by 10.)

>> +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_
                                      ^
Fixed this bug.
James Almer April 6, 2017, 4:36 p.m. UTC | #3
On 4/6/2017 1:01 PM, James Darnley wrote:
> On 2017-04-05 06:05, James Almer wrote:
>> On 4/4/2017 10:53 PM, James Darnley wrote:
>>> Haswell:
>>>  - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext
>>>
>>> Skylake-U:
>>>  - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
>>> ---
>>>  libavcodec/x86/h264_idct.asm  | 20 ++++++++++++++++++++
>>>  libavcodec/x86/h264dsp_init.c |  2 ++
>>>  2 files changed, 22 insertions(+)
>>>
>>> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
>>> index 24fb4d2..7fd57d3 100644
>>> --- a/libavcodec/x86/h264_idct.asm
>>> +++ b/libavcodec/x86/h264_idct.asm
>>> @@ -1158,7 +1158,27 @@ INIT_XMM avx
>>>      movd  [%7+%8], %4
>>>  %endmacro
>>>  
>>> +%macro DC_ADD_INIT 1
>>> +    add      %1d, 32
>>> +    sar      %1d, 6
>>> +    movd     m0, %1d
>>> +    SPLATW   m0, m0, 0
>>
>> Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be
>> enough. This macro calls two instructions to fill the entire XMM register,
>> and there's no need for that.
> 
> Noted, I made that change but it doesn't seem to change much in terms of
> performance.
> 
>> You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining
>> said dwords with punpk* into fewer registers to reduce the amount of padd*
>> and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm
> 
> Noted.  Maybe in the future.
> 
>> And again, SSE2 first, AVX only if measurably faster. But since you're not
>> making use of the wider XMM regs here at all, the only chips that will see
>> any real speed up are those slow in mmx (like Skylake seems to be).
> 
> Yorkfield gets no benefit from sse2 (575±0.4 vs. 574±0.3 decicycles).
> Haswell gets most of its benefit from sse2 (404±0.6 vs. 390±0.3 vs.
> 388±0.3).
> Skylake-U gets all of its speedup from sse2 (533±3.0 vs 488±2.0 vs 497±1.4).
> 
> Nehalem and 64-bit also gets no benefit from sse2.
> 
> Again: SSE2 yay or nay?  Maybe I should just drop this; I'm not sure 5
> cycles is worth it.

Yes, add SSE2 as it's the source of pretty much any speedup.
AVX seems to not make a difference on top of SSE2 if i'm reading
your numbers right (as i asked in the previous reply, please post
actual numbers for each version), so you can just skip it.

> 
> (I will now go and modify my script to divide the recorded decicycle
> count by 10.)
> 
>>> +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_
>                                       ^
> Fixed this bug.
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
diff mbox

Patch

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 24fb4d2..7fd57d3 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1158,7 +1158,27 @@  INIT_XMM avx
     movd  [%7+%8], %4
 %endmacro
 
+%macro DC_ADD_INIT 1
+    add      %1d, 32
+    sar      %1d, 6
+    movd     m0, %1d
+    SPLATW   m0, m0, 0
+    lea      %1, [3*stride_q]
+    pxor     m1, m1
+    psubw    m1, m0
+    packuswb m0, m0
+    packuswb m1, m1
+%endmacro
+
 cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
     movsxdifnidn stride_q, stride_d
     IDCT4_ADD    dst_q, block_q, stride_q
 RET
+
+cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_
+    movsxdifnidn stride_q, stride_d
+    movsx             r3d, word [block_q]
+    mov   dword [block_q], 0
+    DC_ADD_INIT r3
+    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 8ba085f..bf74937 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -35,6 +35,7 @@  IDCT_ADD_FUNC(, 8, mmx)
 IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, avx)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
 IDCT_ADD_FUNC(8_dc, 8, mmxext)
 IDCT_ADD_FUNC(8_dc, 10, sse2)
@@ -340,6 +341,7 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             }
 
             c->h264_idct_add        = ff_h264_idct_add_8_avx;
+            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {