diff mbox series

[FFmpeg-devel,2/3] avcodec: [loongarch] optimize get_cabac.

Message ID 20211109070910.4447-3-yinshiyou@loongson.cn
State New
Headers show
Series [FFmpeg-devel,1/3] configure: Add support for loongarch.
Related show

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Shiyou Yin Nov. 9, 2021, 7:09 a.m. UTC
Decoding 1080P H264 on 2.5Ghz 3A5000: 165fps==>168fps.
Testing command: ffmpeg -i ***.mp4 -f rawvideo -y /dev/null -an
---
 libavcodec/cabac_functions.h |   3 +
 libavcodec/loongarch/cabac.h | 254 +++++++++++++++++++++++++++++++++++
 2 files changed, 257 insertions(+)
 create mode 100644 libavcodec/loongarch/cabac.h
diff mbox series

Patch

diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index 46af921822..2f2d48a8f8 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -49,6 +49,9 @@ 
 #if ARCH_MIPS
 #   include "mips/cabac.h"
 #endif
+#if ARCH_LOONGARCH64
+#   include "loongarch/cabac.h"
+#endif
 
 static const uint8_t * const ff_h264_norm_shift = ff_h264_cabac_tables + H264_NORM_SHIFT_OFFSET;
 static const uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET;
diff --git a/libavcodec/loongarch/cabac.h b/libavcodec/loongarch/cabac.h
new file mode 100644
index 0000000000..71e8ba3be7
--- /dev/null
+++ b/libavcodec/loongarch/cabac.h
@@ -0,0 +1,254 @@ 
+/*
+ * Loongson  optimized cabac
+ *
+ * Copyright (c) 2020 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Gu Xiwei(guxiwei-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_CABAC_H
+#define AVCODEC_LOONGARCH_CABAC_H
+
+#include "libavcodec/cabac.h"
+#include "config.h"
+
+#define GET_CABAC_LOONGARCH_UNCBSR                                      \
+    "ld.bu        %[bit],        %[state],       0x0           \n\t"    \
+    "andi         %[tmp0],       %[c_range],     0xC0          \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[tables]     \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[bit]        \n\t"    \
+    /* tmp1: RangeLPS */                                                \
+    "ld.bu        %[tmp1],       %[tmp0],        %[lps_off]    \n\t"    \
+                                                                        \
+    "sub.d        %[c_range],    %[c_range],     %[tmp1]       \n\t"    \
+    "slli.d       %[tmp0],       %[c_range],     0x11          \n\t"    \
+    "bge          %[tmp0],       %[c_low],       1f            \n\t"    \
+    "move         %[c_range],    %[tmp1]                       \n\t"    \
+    "nor          %[bit],        %[bit],         %[bit]        \n\t"    \
+    "sub.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+                                                                        \
+    "1:                                                        \n\t"    \
+    /* tmp1: *state */                                                  \
+    "add.d        %[tmp0],       %[tables],      %[bit]        \n\t"    \
+    "ld.bu        %[tmp1],       %[tmp0],        %[mlps_off]   \n\t"    \
+    /* tmp2: lps_mask */                                                \
+    "add.d        %[tmp0],       %[tables],      %[c_range]    \n\t"    \
+    "ld.bu        %[tmp2],       %[tmp0],        %[norm_off]   \n\t"    \
+                                                                        \
+    "andi         %[bit],        %[bit],         0x01          \n\t"    \
+    "st.b         %[tmp1],       %[state],       0x0           \n\t"    \
+    "sll.d        %[c_range],    %[c_range],     %[tmp2]       \n\t"    \
+    "sll.d        %[c_low],      %[c_low],       %[tmp2]       \n\t"    \
+                                                                        \
+    "and          %[tmp1],       %[c_low],       %[cabac_mask] \n\t"    \
+    "bnez         %[tmp1],       1f                            \n\t"    \
+    "ld.hu        %[tmp1],       %[c_bytestream], 0x0          \n\t"    \
+    "addi.d       %[tmp0],       %[c_low],       -0X01         \n\t"    \
+    "xor          %[tmp0],       %[c_low],       %[tmp0]       \n\t"    \
+    "srai.d       %[tmp0],       %[tmp0],        0x0f          \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[tables]     \n\t"    \
+    /* tmp2: ff_h264_norm_shift[x >> (CABAC_BITS - 1)] */               \
+    "ld.bu        %[tmp2],       %[tmp0],        %[norm_off]   \n\t"    \
+                                                                        \
+    "revb.2h      %[tmp0],       %[tmp1]                       \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "sub.d        %[tmp0],       %[tmp0],        %[cabac_mask] \n\t"    \
+                                                                        \
+    "li.d         %[tmp1],       0x07                          \n\t"    \
+    "sub.d        %[tmp1],       %[tmp1],        %[tmp2]       \n\t"    \
+    "sll.d        %[tmp0],       %[tmp0],        %[tmp1]       \n\t"    \
+    "add.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+                                                                        \
+    "addi.d       %[c_bytestream], %[c_bytestream],     0x02   \n\t"    \
+    "1:                                                        \n\t"    \
+
+#define GET_CABAC_LOONGARCH                                             \
+    "ld.bu        %[bit],        %[state],       0x0           \n\t"    \
+    "andi         %[tmp0],       %[c_range],     0xC0          \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[tables]     \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[bit]        \n\t"    \
+    /* tmp1: RangeLPS */                                                \
+    "ld.bu        %[tmp1],       %[tmp0],        %[lps_off]    \n\t"    \
+                                                                        \
+    "sub.d        %[c_range],    %[c_range],     %[tmp1]       \n\t"    \
+    "slli.d       %[tmp0],       %[c_range],     0x11          \n\t"    \
+    "bge          %[tmp0],       %[c_low],       1f            \n\t"    \
+    "move         %[c_range],    %[tmp1]                       \n\t"    \
+    "nor          %[bit],        %[bit],         %[bit]        \n\t"    \
+    "sub.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+                                                                        \
+    "1:                                                        \n\t"    \
+    /* tmp1: *state */                                                  \
+    "add.d        %[tmp0],       %[tables],      %[bit]        \n\t"    \
+    "ld.bu        %[tmp1],       %[tmp0],        %[mlps_off]   \n\t"    \
+    /* tmp2: lps_mask */                                                \
+    "add.d        %[tmp0],       %[tables],      %[c_range]    \n\t"    \
+    "ld.bu        %[tmp2],       %[tmp0],        %[norm_off]   \n\t"    \
+                                                                        \
+    "andi         %[bit],        %[bit],         0x01          \n\t"    \
+    "st.b         %[tmp1],       %[state],       0x0           \n\t"    \
+    "sll.d        %[c_range],    %[c_range],     %[tmp2]       \n\t"    \
+    "sll.d        %[c_low],      %[c_low],       %[tmp2]       \n\t"    \
+                                                                        \
+    "and          %[tmp1],       %[c_low],       %[cabac_mask] \n\t"    \
+    "bnez         %[tmp1],       1f                            \n\t"    \
+    "ld.hu        %[tmp1],       %[c_bytestream], 0x0          \n\t"    \
+    "addi.d       %[tmp0],       %[c_low],       -0X01         \n\t"    \
+    "xor          %[tmp0],       %[c_low],       %[tmp0]       \n\t"    \
+    "srai.d       %[tmp0],       %[tmp0],        0x0f          \n\t"    \
+    "add.d        %[tmp0],       %[tmp0],        %[tables]     \n\t"    \
+    /* tmp2: ff_h264_norm_shift[x >> (CABAC_BITS - 1)] */               \
+    "ld.bu        %[tmp2],       %[tmp0],        %[norm_off]   \n\t"    \
+                                                                        \
+    "revb.2h      %[tmp0],       %[tmp1]                       \n\t"    \
+    "slli.d       %[tmp0],       %[tmp0],        0x01          \n\t"    \
+    "sub.d        %[tmp0],       %[tmp0],        %[cabac_mask] \n\t"    \
+                                                                        \
+    "li.d         %[tmp1],       0x07                          \n\t"    \
+    "sub.d        %[tmp1],       %[tmp1],        %[tmp2]       \n\t"    \
+    "sll.d        %[tmp0],       %[tmp0],        %[tmp1]       \n\t"    \
+    "add.d        %[c_low],      %[c_low],       %[tmp0]       \n\t"    \
+                                                                        \
+    "slt      %[tmp0],  %[c_bytestream],  %[c_bytestream_end]  \n\t"    \
+    "add.d    %[c_bytestream], %[c_bytestream],     %[tmp0]    \n\t"    \
+    "add.d    %[c_bytestream], %[c_bytestream],     %[tmp0]    \n\t"    \
+    "1:                                                        \n\t"    \
+
+#define get_cabac_inline get_cabac_inline_loongarch
+static av_always_inline
+int get_cabac_inline_loongarch(CABACContext *c, uint8_t * const state)
+{
+    int64_t tmp0, tmp1, tmp2, bit;
+
+    __asm__ volatile (
+#if UNCHECKED_BITSTREAM_READER
+        GET_CABAC_LOONGARCH_UNCBSR
+#else
+        GET_CABAC_LOONGARCH
+#endif
+    : [bit]"=&r"(bit), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2),
+      [c_range]"+&r"(c->range), [c_low]"+&r"(c->low),
+      [c_bytestream]"+&r"(c->bytestream)
+    : [state]"r"(state), [tables]"r"(ff_h264_cabac_tables),
+#if !UNCHECKED_BITSTREAM_READER
+      [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+      [lps_off]"i"(H264_LPS_RANGE_OFFSET),
+      [mlps_off]"i"(H264_MLPS_STATE_OFFSET + 128),
+      [norm_off]"i"(H264_NORM_SHIFT_OFFSET),
+      [cabac_mask]"r"(CABAC_MASK)
+    : "memory"
+    );
+
+    return bit;
+}
+
+#define get_cabac_bypass get_cabac_bypass_loongarch
+static av_always_inline int get_cabac_bypass_loongarch(CABACContext *c)
+{
+    int64_t tmp0, tmp1, tmp2;
+    int res = 0;
+    __asm__ volatile(
+        "slli.d     %[c_low],        %[c_low],        0x01                \n\t"
+        "and        %[tmp0],         %[c_low],        %[cabac_mask]       \n\t"
+        "bnez       %[tmp0],         1f                                   \n\t"
+        "ld.hu      %[tmp1],         %[c_bytestream], 0x0                 \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "addi.d     %[c_bytestream], %[c_bytestream], 0x02                \n\t"
+#else
+        "slt        %[tmp0],         %[c_bytestream], %[c_bytestream_end] \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+#endif
+        "revb.2h    %[tmp1],         %[tmp1]                              \n\t"
+        "slli.d     %[tmp1],         %[tmp1],         0x01                \n\t"
+        "sub.d      %[tmp1],         %[tmp1],         %[cabac_mask]       \n\t"
+        "add.d      %[c_low],        %[c_low],        %[tmp1]             \n\t"
+        "1:                                                               \n\t"
+        "slli.d     %[tmp1],         %[c_range],      0x11                \n\t"
+        "slt        %[tmp0],         %[c_low],        %[tmp1]             \n\t"
+        "sub.d      %[tmp1],         %[c_low],        %[tmp1]             \n\t"
+        "masknez    %[tmp2],         %[one],          %[tmp0]             \n\t"
+        "maskeqz    %[res],          %[res],          %[tmp0]             \n\t"
+        "or         %[res],          %[res],          %[tmp2]             \n\t"
+        "masknez    %[tmp2],         %[tmp1],         %[tmp0]             \n\t"
+        "maskeqz    %[c_low],        %[c_low],        %[tmp0]             \n\t"
+        "or         %[c_low],        %[c_low],        %[tmp2]             \n\t"
+        : [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2),
+          [c_range]"+&r"(c->range), [c_low]"+&r"(c->low),
+          [c_bytestream]"+&r"(c->bytestream), [res]"+&r"(res)
+        : [cabac_mask]"r"(CABAC_MASK),
+#if !UNCHECKED_BITSTREAM_READER
+          [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+          [one]"r"(0x01)
+        : "memory"
+    );
+    return res;
+}
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_loongarch
+static av_always_inline
+int get_cabac_bypass_sign_loongarch(CABACContext *c, int val)
+{
+    int64_t tmp0, tmp1;
+    int res = val;
+    __asm__ volatile(
+        "slli.d     %[c_low],        %[c_low],        0x01                \n\t"
+        "and        %[tmp0],         %[c_low],        %[cabac_mask]       \n\t"
+        "bnez       %[tmp0],         1f                                   \n\t"
+        "ld.hu      %[tmp1],         %[c_bytestream], 0x0                 \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "addi.d     %[c_bytestream], %[c_bytestream], 0x02                \n\t"
+#else
+        "slt        %[tmp0],         %[c_bytestream], %[c_bytestream_end] \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+        "add.d      %[c_bytestream], %[c_bytestream], %[tmp0]             \n\t"
+#endif
+        "revb.2h    %[tmp1],         %[tmp1]                              \n\t"
+        "slli.d     %[tmp1],         %[tmp1],         0x01                \n\t"
+        "sub.d      %[tmp1],         %[tmp1],         %[cabac_mask]       \n\t"
+        "add.d      %[c_low],        %[c_low],        %[tmp1]             \n\t"
+        "1:                                                               \n\t"
+        "slli.d     %[tmp1],         %[c_range],      0x11                \n\t"
+        "slt        %[tmp0],         %[c_low],        %[tmp1]             \n\t"
+        "sub.d      %[tmp1],         %[c_low],        %[tmp1]             \n\t"
+        "masknez    %[tmp1],         %[tmp1],         %[tmp0]             \n\t"
+        "maskeqz    %[c_low],        %[c_low],        %[tmp0]             \n\t"
+        "or         %[c_low],        %[c_low],        %[tmp1]             \n\t"
+        "sub.d      %[tmp1],         %[zero],         %[res]              \n\t"
+        "maskeqz    %[tmp1],         %[tmp1],         %[tmp0]             \n\t"
+        "masknez    %[res],          %[res],          %[tmp0]             \n\t"
+        "or         %[res],          %[res],          %[tmp1]             \n\t"
+        : [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [res]"+&r"(res),
+          [c_range]"+&r"(c->range), [c_low]"+&r"(c->low),
+          [c_bytestream]"+&r"(c->bytestream)
+        : [cabac_mask]"r"(CABAC_MASK),
+#if !UNCHECKED_BITSTREAM_READER
+          [c_bytestream_end]"r"(c->bytestream_end),
+#endif
+          [zero]"r"(0x0)
+        : "memory"
+    );
+
+    return res;
+}
+#endif /* AVCODEC_LOONGARCH_CABAC_H */