
[FFmpeg-devel,3/4] avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter

Message ID 20161205183224.24627-4-jdarnley@obe.tv
State Accepted
Commit add21d0bb3f5fb25fd5d1437eb746b91c2570a8f

Commit Message

James Darnley Dec. 5, 2016, 6:32 p.m. UTC
Yorkfield:
 - mmx2: 2.45x (279 vs. 114 cycles)
 - sse2: 3.36x (279 vs.  83 cycles)

Nehalem:
 - mmx2: 2.10x (192 vs.  92 cycles)
 - sse2: 2.84x (192 vs.  68 cycles)

Skylake:
 - mmx2: 1.75x (170 vs.  97 cycles)
 - sse2: 2.47x (170 vs.  69 cycles)
 - avx:  2.47x (170 vs.  69 cycles)
---
 libavcodec/x86/h264_deblock_10bit.asm | 118 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c         |   9 +++
 2 files changed, 127 insertions(+)

Comments

Carl Eugen Hoyos Dec. 7, 2016, 10:07 a.m. UTC | #1
2016-12-05 19:32 GMT+01:00 James Darnley <jdarnley@obe.tv>:

>  - sse2: 2.47x (170 vs.  69 cycles)
>  - avx:  2.47x (170 vs.  69 cycles)

Please elaborate on why this was committed.

Carl Eugen
James Darnley Dec. 7, 2016, 1:07 p.m. UTC | #2
On 2016-12-07 11:07, Carl Eugen Hoyos wrote:
> 2016-12-05 19:32 GMT+01:00 James Darnley <jdarnley@obe.tv>:
> 
>>  - sse2: 2.47x (170 vs.  69 cycles)
>>  - avx:  2.47x (170 vs.  69 cycles)
> 
> Please elaborate on why this was committed.

Because writing it cost almost zero time.  All it needed was writing the
dsp pointer assignment.  Preventing the function from being created
(with more %ifs) would have required sending another patch set through
review.

Because a few instructions using the 3-operand form should be quicker.  The
fact that it doesn't show is no doubt down to out-of-order execution
managing to do the moves earlier than written.

Because it is future-proof.  Someone may write a better AVX version, or a
version of these macros for a newer instruction set.  A CPU may appear
which deprecates all SIMD without the VEX prefix.  FFmpeg may allow
disabling old instruction sets without disabling newer ones.

These last three reasons are each more unlikely than the previous.

And now for some more detailed stats, collected for 50 runs, each with
500k calls to the function in question:
> sse2: min: 687, max: 774, mean: 690.041, stddev: 12.155
> avx:  min: 681, max: 721, mean: 685.469, stddev: 9.083
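
A sketch of the "almost zero time" point above: the new cglobal sits inside
the existing per-instruction-set chroma macro in h264_deblock_10bit.asm
(DEBLOCK_CHROMA; the name is inferred, the hunk below only shows its closing
%endmacro), and x86inc/x86util instantiate that macro body once per
instruction set, roughly like this:

    %if ARCH_X86_64 == 0
    INIT_MMX mmxext
    DEBLOCK_CHROMA        ; emits the _mmxext entry points
    %endif
    INIT_XMM sse2
    DEBLOCK_CHROMA        ; emits the _sse2 entry points
    INIT_XMM avx
    DEBLOCK_CHROMA        ; emits the _avx entry points, VEX-encoded

Under INIT_XMM avx, x86inc maps the same mnemonics to their VEX forms, so the
AVX entry point comes essentially for free once the SSE2 body exists.
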
Henrik Gramner Dec. 7, 2016, 10:17 p.m. UTC | #3
On Wed, Dec 7, 2016 at 2:07 PM, James Darnley <jdarnley@obe.tv> wrote:
> Because a few instructions using 3 operand form should be quicker.  The
> fact that it doesn't show is no doubt down to the out of order execution
> managing to do the moves earlier than written.

Register-register moves are handled in the register renaming stage
without consuming any execution resources and are often essentially
free (on modern Intel CPUs, e.g. Haswell and later at least; I don't
remember if this applies to Sandy Bridge as well).  The advantage of
the 3-operand form is reduced decoding pressure and slightly denser
encoding, which can save some cache if you have a lot of
register-register moves.
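
To make the register-move point concrete, a minimal illustration (not from
the original thread; registers chosen arbitrarily):

    ; SSE2: destructive 2-operand form; an extra copy is needed when the
    ; source value must survive the operation
    movdqa  xmm4, xmm0
    paddw   xmm4, xmm1        ; xmm4 = xmm0 + xmm1

    ; AVX: VEX-encoded 3-operand form; no copy required
    vpaddw  xmm4, xmm0, xmm1  ; xmm4 = xmm0 + xmm1

On Skylake, the only AVX-capable CPU in the benchmark above, such moves are
eliminated at rename, so the 3-operand form mainly saves code size and decode
bandwidth rather than cycles, which matches the identical sse2/avx timings in
the commit message.
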
Carl Eugen Hoyos Dec. 8, 2016, 1:44 p.m. UTC | #4
2016-12-07 14:07 GMT+01:00 James Darnley <jdarnley@obe.tv>:
> On 2016-12-07 11:07, Carl Eugen Hoyos wrote:
>> 2016-12-05 19:32 GMT+01:00 James Darnley <jdarnley@obe.tv>:
>>
>>>  - sse2: 2.47x (170 vs.  69 cycles)
>>>  - avx:  2.47x (170 vs.  69 cycles)
>>
>> Please elaborate on why this was committed.
>
> Because writing it cost almost zero time.  All it needed was writing the
> dsp pointer assignment.

> Preventing the function from being created
> (with more %ifs) would have required another patch set being sent
> through review.

Thank you!

Carl Eugen

Patch

diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index ebf8a3f..3536e41 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -843,6 +843,88 @@  DEBLOCK_LUMA_INTRA
     mova [r0+2*r1], m2
 %endmacro
 
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+    [base], [base+stride], [base+stride*2], [base3], \
+    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 words in %1..%8
+; out: 4 rows of 8 words in m0..m3
+%macro TRANSPOSE4x8W_LOAD 8
+    movq             m0, %1
+    movq             m2, %2
+    movq             m1, %3
+    movq             m3, %4
+
+    punpcklwd        m0, m2
+    punpcklwd        m1, m3
+    punpckhdq        m2, m0, m1
+    punpckldq        m0, m1
+
+    movq             m4, %5
+    movq             m6, %6
+    movq             m5, %7
+    movq             m3, %8
+
+    punpcklwd        m4, m6
+    punpcklwd        m5, m3
+    punpckhdq        m6, m4, m5
+    punpckldq        m4, m5
+
+    punpckhqdq       m1, m0, m4
+    punpcklqdq       m0, m4
+    punpckhqdq       m3, m2, m6
+    punpcklqdq       m2, m6
+%endmacro
+
+; in: 4 rows of 8 words in m0..m3
+; out: 8 rows of 4 words in %1..%8
+%macro TRANSPOSE8x4W_STORE 8
+    TRANSPOSE4x4W     0, 1, 2, 3, 4
+    movq             %1, m0
+    movhps           %2, m0
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to store p1 and q1 values
+%macro CHROMA_H_LOAD 4
+    %if mmsize == 8
+        movq m0, [pix_q - 4]
+        movq m1, [pix_q +   stride_q - 4]
+        movq m2, [pix_q + 2*stride_q - 4]
+        movq m3, [%1 - 4]
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %else
+        TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+    %endif
+    mova %3, m0
+    mova %4, m3
+%endmacro
+
+; %1 = base + 3*stride
+; %2 = 3*stride (unused on mmx)
+; %3, %4 = place to load p1 and q1 values
+%macro CHROMA_H_STORE 4
+    mova m0, %3
+    mova m3, %4
+    %if mmsize == 8
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+        movq [pix_q - 4],              m0
+        movq [pix_q +   stride_q - 4], m1
+        movq [pix_q + 2*stride_q - 4], m2
+        movq [%1 - 4],                 m3
+    %else
+        TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
+    %endif
+%endmacro
+
 %macro CHROMA_V_LOAD_TC 2
     movd        %1, [%2]
     punpcklbw   %1, %1
@@ -914,6 +996,42 @@  cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
 %else
     RET
 %endif
+
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
+;                             int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, tc0_
+    shl alpha_d,  2
+    shl beta_d,   2
+    mov r5,       pix_q
+    lea r6,      [3*stride_q]
+    add r5,       r6
+%if mmsize == 8
+    mov r6d,      2
+    .loop:
+%endif
+
+        CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
+        LOAD_AB          m4,  m5, alpha_d, beta_d
+        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
+        pxor             m4,  m4
+        CHROMA_V_LOAD_TC m6,  tc0_q
+        psubw            m6, [pw_3]
+        pmaxsw           m6,  m4
+        pand             m7,  m6
+        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
+        CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
+
+%if mmsize == 8
+        lea pix_q, [pix_q + 4*stride_q]
+        lea r5,    [r5 + 4*stride_q]
+        add tc0_q,  2
+        dec r6d
+    jg .loop
+%endif
+RET
+
 %endmacro
 
 %if ARCH_X86_64 == 0
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 7e16dca..ab270da 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -313,6 +313,9 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #if ARCH_X86_32
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+            }
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
             c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
             c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
@@ -346,6 +349,9 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+            }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
@@ -381,6 +387,9 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
+            if (chroma_format_idc <= 1) {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+            }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
             c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;