diff mbox

[FFmpeg-devel,4/4] avcodec/h264: sse2, avx h luma mbaff deblock/loop filter

Message ID 20170213124417.25808-4-jdarnley@obe.tv
State Superseded
Headers show

Commit Message

James Darnley Feb. 13, 2017, 12:44 p.m. UTC
x86-64 only

Yorkfield:
- sse2: 2.16x (434 vs. 201 cycles)

Skylake:
- sse2: 3.04x (378 vs. 124 cycles)
- avx:  3.29x (378 vs. 115 cycles)
---
 libavcodec/x86/h264_deblock.asm | 119 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c   |  10 ++++
 2 files changed, 129 insertions(+)

Comments

James Almer Feb. 15, 2017, 4:55 p.m. UTC | #1
On 2/13/2017 9:44 AM, James Darnley wrote:
> x86-64 only
> 
> Yorkfield:
> - sse2: 2.16x (434 vs. 201 cycles)
> 
> Skylake:
> - sse2: 3.04x (378 vs. 124 cycles)
> - avx:  3.29x (378 vs. 115 cycles)
> ---
>  libavcodec/x86/h264_deblock.asm | 119 ++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/h264dsp_init.c   |  10 ++++
>  2 files changed, 129 insertions(+)
> 
> diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
> index 509a0dbe0c..f47a199e8f 100644
> --- a/libavcodec/x86/h264_deblock.asm
> +++ b/libavcodec/x86/h264_deblock.asm
> @@ -377,10 +377,129 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
>      RET
>  %endmacro
>  
> +; TODO: use macro arguments
> +%macro TRANSPOSE_8X8B_XMM 8

Why not put this in x86util? And using arguments, of course.
Also, just call it TRANSPOSE_8X8B.

> +    punpcklbw m0, m1
> +    punpcklbw m2, m3
> +    punpcklbw m4, m5
> +    punpcklbw m6, m7
> +
> +    punpckhwd m1, m0, m2
> +    punpcklwd m0, m2

Use SBUTTERFLY here and below.

> +
> +    punpckhwd m5, m4, m6
> +    punpcklwd m4, m6
> +
> +    punpckhdq m2, m0, m4
> +    punpckldq m0, m4
> +
> +    punpckhdq m6, m1, m5
> +    punpckldq m1, m5
> +
> +    MOVHL     m4, m0
> +    MOVHL     m3, m2
> +    MOVHL     m7, m6
> +    MOVHL     m5, m1
> +    SWAP 1, 4
> +%endmacro
> +
> +%macro TRANSPOSE_8X8B_XMM 0
> +    TRANSPOSE_8X8B_XMM 0, 1, 2, 3, 4, 5, 6, 7

This seems wrong, or at least superfluous.

> +%endmacro
> +
> +%macro DEBLOCK_H_LUMA_MBAFF 0
> +
> +cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_

Why the underscores?

> +    movsxd stride_q,  stride_d
> +    dec    alpha_d
> +    dec    beta_d
> +    mov    r5,        pix_q
> +    lea    r6,       [3*stride_q]

Call r6 stride3.
James Darnley Feb. 15, 2017, 6:05 p.m. UTC | #2
On 2017-02-15 17:55, James Almer wrote:
> On 2/13/2017 9:44 AM, James Darnley wrote:
>> x86-64 only
>>
>> Yorkfield:
>> - sse2: 2.16x (434 vs. 201 cycles)
>>
>> Skylake:
>> - sse2: 3.04x (378 vs. 124 cycles)
>> - avx:  3.29x (378 vs. 115 cycles)
>> ---
>>  libavcodec/x86/h264_deblock.asm | 119 ++++++++++++++++++++++++++++++++++++++++
>>  libavcodec/x86/h264dsp_init.c   |  10 ++++
>>  2 files changed, 129 insertions(+)
>>
>> diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
>> index 509a0dbe0c..f47a199e8f 100644
>> --- a/libavcodec/x86/h264_deblock.asm
>> +++ b/libavcodec/x86/h264_deblock.asm
>> @@ -377,10 +377,129 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
>>      RET
>>  %endmacro
>>  
>> +; TODO: use macro arguments
>> +%macro TRANSPOSE_8X8B_XMM 8
> 
> Why not put this in x86util? And using arguments, of course.
> Also, just call it TRANSPOSE_8X8B.

I could call it that but people might be tempted to think it would work
on mmx registers.  Perhaps I can add a check for mmsize and error in
that case.

>> +    punpcklbw m0, m1
>> +    punpcklbw m2, m3
>> +    punpcklbw m4, m5
>> +    punpcklbw m6, m7
>> +
>> +    punpckhwd m1, m0, m2
>> +    punpcklwd m0, m2
> 
> Use SBUTTERFLY here and below.

Will do.

>> +
>> +    punpckhwd m5, m4, m6
>> +    punpcklwd m4, m6
>> +
>> +    punpckhdq m2, m0, m4
>> +    punpckldq m0, m4
>> +
>> +    punpckhdq m6, m1, m5
>> +    punpckldq m1, m5
>> +
>> +    MOVHL     m4, m0
>> +    MOVHL     m3, m2
>> +    MOVHL     m7, m6
>> +    MOVHL     m5, m1
>> +    SWAP 1, 4
>> +%endmacro
>> +
>> +%macro TRANSPOSE_8X8B_XMM 0
>> +    TRANSPOSE_8X8B_XMM 0, 1, 2, 3, 4, 5, 6, 7
> 
> This seems wrong, or at least superfluous.

I had the idea of letting it be a default so I (or other users) wouldn't
have to enter 0-7 every time.  Perhaps I could define the macro to take
zero to seven parameters but that is slightly suboptimal because it
would allow one to six parameters to be given

>> +%endmacro
>> +
>> +%macro DEBLOCK_H_LUMA_MBAFF 0
>> +
>> +cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_
> 
> Why the underscores?

Because I grew sick of seeing "named", "nameq", etc.  So I decided I
wanted a separator.  I could change x264asm but that seems rather involved.

(Ideally I would change nasm/yasm to support unicode so I could use
subscripted numbers but that is even harder.  Plus I would struggle to
enter the characters using my keyboard.  Heheheh)

>> +    movsxd stride_q,  stride_d
>> +    dec    alpha_d
>> +    dec    beta_d
>> +    mov    r5,        pix_q
>> +    lea    r6,       [3*stride_q]
> 
> Call r6 stride3.

I'll consider it.

Thanks for the comments.
diff mbox

Patch

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 509a0dbe0c..f47a199e8f 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -377,10 +377,129 @@  cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     RET
 %endmacro
 
+; TODO: use macro arguments
+%macro TRANSPOSE_8X8B_XMM 8
+    punpcklbw m0, m1
+    punpcklbw m2, m3
+    punpcklbw m4, m5
+    punpcklbw m6, m7
+
+    punpckhwd m1, m0, m2
+    punpcklwd m0, m2
+
+    punpckhwd m5, m4, m6
+    punpcklwd m4, m6
+
+    punpckhdq m2, m0, m4
+    punpckldq m0, m4
+
+    punpckhdq m6, m1, m5
+    punpckldq m1, m5
+
+    MOVHL     m4, m0
+    MOVHL     m3, m2
+    MOVHL     m7, m6
+    MOVHL     m5, m1
+    SWAP 1, 4
+%endmacro
+
+%macro TRANSPOSE_8X8B_XMM 0
+    TRANSPOSE_8X8B_XMM 0, 1, 2, 3, 4, 5, 6, 7
+%endmacro
+
+%macro DEBLOCK_H_LUMA_MBAFF 0
+
+cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_
+    movsxd stride_q,  stride_d
+    dec    alpha_d
+    dec    beta_d
+    mov    r5,        pix_q
+    lea    r6,       [3*stride_q]
+    add    r5,        r6
+
+    movq m0, [pix_q - 4]
+    movq m1, [pix_q + stride_q - 4]
+    movq m2, [pix_q + 2*stride_q - 4]
+    movq m3, [r5 - 4]
+    movq m4, [r5 + stride_q - 4]
+    movq m5, [r5 + 2*stride_q - 4]
+    movq m6, [r5 + r6 - 4]
+    movq m7, [r5 +4*stride_q - 4]
+
+    TRANSPOSE_8X8B_XMM
+
+    %assign i 0
+    %rep 8
+        movq [rsp + 16*i], m %+ i
+        %assign i i+1
+    %endrep
+
+    ; p2 = m1 [rsp + 16]
+    ; p1 = m2 [rsp + 32]
+    ; p0 = m3 [rsp + 48]
+    ; q0 = m4 [rsp + 64]
+    ; q1 = m5 [rsp + 80]
+    ; q2 = m6 [rsp + 96]
+
+    SWAP 0, 2
+    SWAP 1, 3
+    SWAP 2, 4
+    SWAP 3, 5
+
+    LOAD_MASK alpha_d, beta_d
+    movd m8, [tc0_q]
+    punpcklbw m8, m8
+    pcmpeqb m9, m9
+    pcmpeqb m9, m8
+    pandn   m9, m7
+    pand    m8, m9
+
+    movdqa  m3, [rsp + 16] ; p2
+    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+    pand    m6, m9
+    psubb   m7, m8, m6
+    pand    m6, m8
+    LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4
+
+    movdqa  m4, [rsp + 96] ; q2
+    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+    pand    m6, m9
+    pand    m8, m6
+    psubb   m7, m6
+    mova    m3, [rsp + 80]
+    LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6
+
+    DEBLOCK_P0_Q0
+    SWAP 1, 3
+    SWAP 2, 4
+    movq m0, [rsp]
+    movq m1, [rsp + 16]
+    movq m2, [rsp + 32]
+    movq m5, [rsp + 80]
+    movq m6, [rsp + 96]
+    movq m7, [rsp + 112]
+
+    TRANSPOSE_8X8B_XMM
+    movq [pix_q - 4], m0
+    movq [pix_q + stride_q - 4], m1
+    movq [pix_q + 2*stride_q - 4], m2
+    movq [r5 - 4], m3
+    movq [r5 + stride_q - 4], m4
+    movq [r5 + 2*stride_q - 4], m5
+    movq [r5 + r6 - 4], m6
+    movq [r5 +4*stride_q - 4], m7
+
+RET
+
+%endmacro
+
 INIT_XMM sse2
+DEBLOCK_H_LUMA_MBAFF
 DEBLOCK_LUMA
+
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
+DEBLOCK_H_LUMA_MBAFF
 DEBLOCK_LUMA
 %endif
 
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 7b3d17f971..10f19401ef 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -137,6 +137,9 @@  LF_IFUNC(h, chroma422_intra, depth, avx)        \
 LF_FUNC(v,  chroma,          depth, avx)        \
 LF_IFUNC(v, chroma_intra,    depth, avx)
 
+LF_FUNC(h, luma_mbaff, 8, sse2)
+LF_FUNC(h, luma_mbaff, 8, avx)
+
 LF_FUNCS(uint8_t,   8)
 LF_FUNCS(uint16_t, 10)
 
@@ -297,6 +300,10 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
+
+#if ARCH_X86_64
+            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
+#endif
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
@@ -307,6 +314,9 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#if ARCH_X86_64
+            c->h264_h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
+#endif
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {