diff mbox

[FFmpeg-devel,1/6] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter

Message ID 20161201165749.10368-2-jdarnley@obe.tv
State Superseded
Headers show

Commit Message

James Darnley Dec. 1, 2016, 4:57 p.m. UTC
Yorkfield:
 - mmx2: 2.44x faster (278 vs. 114 cycles)
 - sse2: 3.35x faster (278 vs.  83 cycles)

Skylake:
 - mmx2: 1.69x faster (169 vs. 100 cycles)
 - sse2: 2.34x faster (169 vs.  72 cycles)
 - avx:  2.32x faster (169 vs.  73 cycles)
---
 libavcodec/x86/h264_deblock_10bit.asm | 118 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c         |   9 +++
 2 files changed, 127 insertions(+)

Comments

Michael Niedermayer Dec. 1, 2016, 10:16 p.m. UTC | #1
On Thu, Dec 01, 2016 at 05:57:44PM +0100, James Darnley wrote:
> Yorkfield:
>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>  - sse2: 3.35x faster (278 vs.  83 cycles)
> 
> Skylake:
>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>  - sse2: 2.34x faster (169 vs.  72 cycles)
>  - avx:  2.32x faster (169 vs.  73 cycles)
> ---
>  libavcodec/x86/h264_deblock_10bit.asm | 118 ++++++++++++++++++++++++++++++++++
>  libavcodec/x86/h264dsp_init.c         |   9 +++
>  2 files changed, 127 insertions(+)

breaks build on linux x86-32

YASM    libavcodec/x86/h264_deblock_10bit.o
src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 64-bit mode
src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: undefined symbol `bpl' (first use)
src/libavcodec/x86/h264_deblock_10bit.asm:1039: error:  (Each undefined symbol is reported only once.)
src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 64-bit mode

[...]
Carl Eugen Hoyos Dec. 1, 2016, 11:31 p.m. UTC | #2
2016-12-01 17:57 GMT+01:00 James Darnley <jdarnley@obe.tv>:
> Yorkfield:
>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>  - sse2: 3.35x faster (278 vs.  83 cycles)
>
> Skylake:
>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>  - sse2: 2.34x faster (169 vs.  72 cycles)

Is it expected (or possible) that the speed impact is so
different for different Intel hardware?

>  - avx:  2.32x faster (169 vs.  73 cycles)

Don't you agree that if this is true (I don't know if it is)
the patch should not be applied as is?

Carl Eugen
James Darnley Dec. 1, 2016, 11:49 p.m. UTC | #3
On 2016-12-01 23:16, Michael Niedermayer wrote:
> On Thu, Dec 01, 2016 at 05:57:44PM +0100, James Darnley wrote:
>> Yorkfield:
>>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>>  - sse2: 3.35x faster (278 vs.  83 cycles)
>>
>> Skylake:
>>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>>  - sse2: 2.34x faster (169 vs.  72 cycles)
>>  - avx:  2.32x faster (169 vs.  73 cycles)
>> ---
>>  libavcodec/x86/h264_deblock_10bit.asm | 118 ++++++++++++++++++++++++++++++++++
>>  libavcodec/x86/h264dsp_init.c         |   9 +++
>>  2 files changed, 127 insertions(+)
> 
> breaks build on linux x86-32
> 
> YASM    libavcodec/x86/h264_deblock_10bit.o
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 64-bit mode
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: error: undefined symbol `bpl' (first use)
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: error:  (Each undefined symbol is reported only once.)
> src/libavcodec/x86/h264_deblock_10bit.asm:1039: warning: `bpl' is a register in 64-bit mode

Ah.  I shouldn't do clever things like trying to use the byte-sized
registers.  It isn't needed and causes problems like this.  Changed
locally.  Also changed in the 4:2:0 chroma intra patch.
James Darnley Dec. 1, 2016, 11:56 p.m. UTC | #4
On 2016-12-02 00:31, Carl Eugen Hoyos wrote:
> 2016-12-01 17:57 GMT+01:00 James Darnley <jdarnley@obe.tv>:
>> Yorkfield:
>>  - mmx2: 2.44x faster (278 vs. 114 cycles)
>>  - sse2: 3.35x faster (278 vs.  83 cycles)
>>
>> Skylake:
>>  - mmx2: 1.69x faster (169 vs. 100 cycles)
>>  - sse2: 2.34x faster (169 vs.  72 cycles)
> 
> Is it expected (or possible) that the speed impact is so
> different for different Intel hardware?

Yes.  Intel's Core branded processors introduced a much better
micro-architecture (the generation after the Yorkfield) which will cause
the scalar C code to be quite a bit faster.  The SIMD on the other hand
was already so quick it didn't gain much.

(At least I think I remember this being the story.)

>>  - avx:  2.32x faster (169 vs.  73 cycles)
> 
> Don't you agree that if this is true (I don't know if it is)
> the patch should not be applied as is?

I do agree and I wouldn't (deliberately) apply anything that made the
decoder slower, or not as fast as it could be.
diff mbox

Patch

diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index ebf8a3f..e91b1c6 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -843,6 +843,88 @@  DEBLOCK_LUMA_INTRA
     mova [r0+2*r1], m2
 %endmacro
 
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+    [base], [base+stride], [base+stride*2], [base3], \
+    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 words in %4..%11
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8W_LOAD 8
+    movq             m0, %1
+    movq             m2, %2
+    movq             m1, %3
+    movq             m3, %4
+
+    punpcklwd        m0, m2
+    punpcklwd        m1, m3
+    punpckhdq        m2, m0, m1
+    punpckldq        m0, m1
+
+    movq             m4, %5
+    movq             m6, %6
+    movq             m5, %7
+    movq             m3, %8
+
+    punpcklwd        m4, m6
+    punpcklwd        m5, m3
+    punpckhdq        m6, m4, m5
+    punpckldq        m4, m5
+
+    punpckhqdq       m1, m0, m4
+    punpcklqdq       m0, m4
+    punpckhqdq       m3, m2, m6
+    punpcklqdq       m2, m6
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 words in %1..%8
+%macro TRANSPOSE8x4W_STORE 8
+    TRANSPOSE4x4W     0, 1, 2, 3, 4
+    movq             %1, m0
+    movhps           %2, m0
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to store p1 and q1 values
+%macro CHROMA_H_LOAD 4
+    %if mmsize == 8
+        movq m0, [pix_q - 4]
+        movq m1, [pix_q +   stride_q - 4]
+        movq m2, [pix_q + 2*stride_q - 4]
+        movq m3, [%1 - 4]
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %else
+        TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+    %endif
+    mova %3, m0
+    mova %4, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to load p1 and q1 values
+%macro CHROMA_H_STORE 4
+    mova m0, %3
+    mova m3, %4
+    %if mmsize == 8
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+        movq [pix_q - 4],              m0
+        movq [pix_q +   stride_q - 4], m1
+        movq [pix_q + 2*stride_q - 4], m2
+        movq [%1 - 4],                 m3
+    %else
+        TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+    %endif
+%endmacro
+
 %macro CHROMA_V_LOAD_TC 2
     movd        %1, [%2]
     punpcklbw   %1, %1
@@ -914,6 +996,42 @@  cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
 %else
     RET
 %endif
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
+;                             int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, tc0_
+    shl alpha_d,  2
+    shl beta_d,   2
+    mov r5,       pix_q
+    lea r6,      [3*stride_q]
+    add r5,       r6
+%if mmsize == 8
+    mov r6b,      2
+    .loop:
+%endif
+
+        CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
+        LOAD_AB          m4,  m5, alpha_d, beta_d
+        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
+        pxor             m4,  m4
+        CHROMA_V_LOAD_TC m6,  tc0_q
+        psubw            m6, [pw_3]
+        pmaxsw           m6,  m4
+        pand             m7,  m6
+        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
+        CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
+
+%if mmsize == 8
+        lea pix_q, [pix_q + 4*stride_q]
+        lea r5,    [r5 + 4*stride_q]
+        add tc0_q,  2
+        dec r6b
+    jg .loop
+%endif
+RET
+
 %endmacro
 
 %if ARCH_X86_64 == 0
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index c6c643a..c568762 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -310,6 +310,9 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #if ARCH_X86_32
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+            }
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
             c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
             c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
@@ -343,6 +346,9 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+            }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
@@ -378,6 +384,9 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+            }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
             c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;