diff mbox

[FFmpeg-devel,4/4] avcodec/h264: mmx2, sse2, avx 10-bit 4:2:2 h chroma deblock/loop filter

Message ID 20161205183224.24627-5-jdarnley@obe.tv
State Accepted
Headers show

Commit Message

James Darnley Dec. 5, 2016, 6:32 p.m. UTC
Yorkfield:
 - mmx2: 2.53x (504 vs. 199 cycles)
 - sse2: 3.83x (504 vs. 131 cycles)

Nehalem:
 - mmx2: 2.42x (365 vs. 151 cycles)
 - sse2: 3.56x (365 vs. 103 cycles)

Skylake:
 - mmx2: 1.81x (308 vs. 170 cycles)
 - sse2: 2.84x (308 vs. 108 cycles)
 - avx:  2.93x (308 vs. 105 cycles)
---
 libavcodec/x86/h264_deblock_10bit.asm | 39 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c         |  6 ++++++
 2 files changed, 45 insertions(+)
diff mbox

Patch

diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index 3536e41..56cf4d6 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -1032,6 +1032,45 @@  cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, tc
 %endif
 RET
 
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
+;                                int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma422_10, 5, 7, 8, 3*mmsize, pix_, stride_, alpha_, beta_, tc0_ ; 5 args, 7 GPRs, 8 vector regs, 3*mmsize bytes of stack
+    shl alpha_d,  2 ; alpha *= 4 (presumably rescales the 8-bit threshold to 10-bit range -- confirm)
+    shl beta_d,   2 ; beta  *= 4 (same scaling as alpha)
+
+    movd m0, [tc0_q] ; load the four tc0 bytes
+    punpcklbw m0, m0 ; duplicate each byte into both halves of a 16-bit word
+    psraw m0, 6 ; arithmetic shift keeps the sign: small non-negative tc0 -> tc0*4, negative stays negative
+    movq [rsp], m0 ; park the four tc words at [rsp]; the loop reads them back one slice at a time
+
+    mov r5,       pix_q ; r5 = pix ...
+    lea r6,      [3*stride_q] ; r6 = 3*stride
+    add r5,       r6 ; ... + 3*stride (second row pointer handed to the load/store macros)
+
+    mov r4, -8 ; r4 = negative byte offset into the 8-byte tc buffer; loop stops when it reaches 0
+    .loop: ; each pass handles mmsize/2 rows
+
+        CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize] ; macro defined elsewhere; the two stack slots above the tc buffer are passed as scratch
+        LOAD_AB          m4,  m5, alpha_d, beta_d ; macro defined elsewhere; presumably broadcasts alpha/beta into m4/m5 -- confirm
+        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4 ; macro defined elsewhere; presumably leaves the filter mask in m7 -- confirm
+        pxor             m4,  m4 ; m4 = 0, used as the lower clamp below
+        movd             m6, [rsp + r4 + 8] ; fetch the tc word(s) for this pass (offset 0 on the first pass)
+        punpcklwd        m6,  m6 ; two unpacks replicate each loaded tc word ...
+        punpcklwd        m6,  m6 ; ... across four adjacent word lanes
+        psubw            m6, [pw_3] ; subtract pw_3 from every lane (constant defined elsewhere; presumably words of 3)
+        pmaxsw           m6,  m4 ; clamp negative results to 0
+        pand             m7,  m6 ; m7 &= clamped tc
+        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6 ; macro defined elsewhere; presumably filters p0/q0 (m1/m2) -- confirm
+        CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize] ; macro defined elsewhere; same operands as the load
+
+        lea pix_q, [pix_q + (mmsize/2)*stride_q] ; advance pix by mmsize/2 rows
+        lea r5,    [r5 +    (mmsize/2)*stride_q] ; keep r5 in step with pix
+        add r4, (mmsize/4) ; step past the tc bytes consumed by this pass; sets flags for jl
+    jl .loop ; repeat while r4 is still negative
+RET
+
 %endmacro
 
 %if ARCH_X86_64 == 0
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index ab270da..7b3d17f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -315,6 +315,8 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
             if (chroma_format_idc <= 1) {
                 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
             }
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
             c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
@@ -351,6 +353,8 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
             if (chroma_format_idc <= 1) {
                 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
             }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
@@ -389,6 +393,8 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
             if (chroma_format_idc <= 1) {
                 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
             }
 #if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;