diff mbox series

[FFmpeg-devel,16/21] aarch64: hevc: Deduplicate the hevc_put_hevc_qpel_uni_w_hv*_8_end_neon functions

Message ID 20240325150243.59058-17-martin@martin.st
State Accepted
Commit 4f71e4ebf23694ef8aa8d749d5e9aa219d165ce6
Headers show
Series aarch64: hevc: Add missing hevc_pel NEON functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Martin Storsjö March 25, 2024, 3:02 p.m. UTC
The hv32 and hv64 functions were identical - both loop and
process 16 pixels at a time.

The hv16 function was nearly identical, except that it lacked the outer
loop (and used sp directly instead of a separate register).

Given the size of these functions, the extra cost of the outer
loop is negligible, so use the same function for hv16 as well.

This removes over 200 lines of duplicated assembly, and reduces the
binary size by over 4 KB.
---
 libavcodec/aarch64/hevcdsp_qpel_neon.S | 220 +------------------------
 1 file changed, 3 insertions(+), 217 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index c04e8dbea8..06832603d9 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -4381,231 +4381,17 @@  function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm, export=1
         b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
 endfunc
 
-function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
-1:
-        ldp             q23, q31, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.hi            1b
-
-2:
-        QPEL_UNI_W_HV_END
-        ret
-endfunc
-
-
 function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         QPEL_UNI_W_HV_HEADER 32
-        b               hevc_put_hevc_qpel_uni_w_hv32_8_end_neon
-endfunc
-
-function hevc_put_hevc_qpel_uni_w_hv32_8_end_neon
-        mov             x11, sp
-        mov             w12, w22
-        mov             x13, x20
-        mov             x14, sp
-3:
-        ldp             q16, q1, [x11]
-        add             x11, x11, x10
-        ldp             q17, q2, [x11]
-        add             x11, x11, x10
-        ldp             q18, q3, [x11]
-        add             x11, x11, x10
-        ldp             q19, q4, [x11]
-        add             x11, x11, x10
-        ldp             q20, q5, [x11]
-        add             x11, x11, x10
-        ldp             q21, q6, [x11]
-        add             x11, x11, x10
-        ldp             q22, q7, [x11]
-        add             x11, x11, x10
-1:
-        ldp             q23, q31, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q16, q1, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q17, q2, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q18, q3, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q19, q4, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q20, q5, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q21, q6, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q22, q7, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.hi            1b
-2:
-        subs            w27, w27, #16
-        add             x11, x14, #32
-        add             x20, x13, #16
-        mov             w22, w12
-        mov             x14, x11
-        mov             x13, x20
-        b.hi            3b
-        QPEL_UNI_W_HV_END
-        ret
+        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
 endfunc
 
 function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         QPEL_UNI_W_HV_HEADER 64
-        b               hevc_put_hevc_qpel_uni_w_hv64_8_end_neon
+        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
 endfunc
 
-function hevc_put_hevc_qpel_uni_w_hv64_8_end_neon
+function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
         mov             x11, sp
         mov             w12, w22
         mov             x13, x20