From patchwork Sat Aug 26 08:49:38 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Logan.Lyu" X-Patchwork-Id: 43323 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a05:6a20:7194:b0:149:dfde:5c0a with SMTP id s20csp355883pzb; Sat, 26 Aug 2023 01:49:58 -0700 (PDT) X-Google-Smtp-Source: AGHT+IHcsc96St+GKMB0C7bNwzI4GyWXO0krH85dJIRWZkHBAQEetA0gGLcatjHzFmyFK3AnQnxk X-Received: by 2002:a05:6402:648:b0:52a:8bb:4068 with SMTP id u8-20020a056402064800b0052a08bb4068mr15724831edx.29.1693039797939; Sat, 26 Aug 2023 01:49:57 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1693039797; cv=none; d=google.com; s=arc-20160816; b=XGvHfqBJVzmSwkyj1Frniw2umNNrnEVs0/exM54HBAA/NqM09kxQL3AVvDRR71HlG6 lniXE5qZxENEjCw9ok1E9iW9QYwOarXQkPEEdHJ7xY7HBYv9mxov+kuxNAX3BPFF2e22 Wy7G7sLNhFofswrIO455TuxHaIyGcW8MiDo9prlbFMWaKV67Emr0rEBpSGjHNoXGesu4 43fMjqIRO3CbnJPDE+VFTkkZ4Ay7l85jZ+3UXhWtvAinqm5YzkRoLT4krKg/hDaqFyoV 4jBvYs5CXD9+A4V+OSyG43ExjCj4RWNWI+pVWPxtxSY6djG/hZzNB/3WKHirlCU129JZ BLxQ== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:cc:reply-to :list-subscribe:list-help:list-post:list-archive:list-unsubscribe :list-id:precedence:subject:organization:to:from:user-agent :mime-version:date:message-id:delivered-to; bh=uJeRL7Dd1uGftpx3pOq4O5TtLRNDg50Vyjb9ogeE/Bg=; fh=+fqFbc7xIMnG/cSZsGs3FRxJ55tKLvNlnjRSUGhXbk0=; b=RAJWxwz8O8Iy9kAwCg7K0T+KepRMiINFa/ypiFLxPAfhJTlLRqZrC9Yv+uyVzTQJBk aBIyVu3YhfAqRUh/qjZx/Yh/3gqdE7NqSMsKnoBXr5liWT3ONgHzMYhFz7rSjdUTNhaO Q2zmNuqh7PTJiA1bgxIkF9eIi5PH2s92DYex2rjyXsC/nQ5F7Hxl96GfaEXYhRxCtu1d QJ5bohlZqf5XMXz2xpiKEDGwemnCGjT5NCkmzYLxLsJ8xllm33lH4L1YHWljgMQVplqF Hx1mQY6BVEjjowh+7QGpG3FEBOHINYz10FF+qp0QUopTUadf5Gc5r3Mw7qq+RN2UuJOY bIYQ== ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) 
smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id j22-20020aa7c0d6000000b0052a08b8f4fbsi18489edp.269.2023.08.26.01.49.57; Sat, 26 Aug 2023 01:49:57 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 5E6AC68C5A1; Sat, 26 Aug 2023 11:49:53 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from smtp-my3-01p7.yunyou.top (smtp-my3-01p7.yunyou.top [60.247.169.7]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 2DFE568AAB3 for ; Sat, 26 Aug 2023 11:49:46 +0300 (EEST) Received: from [192.168.15.105] (unknown [183.158.247.103]) by smtp-my-01.yunyou.top (WestCloudMail) with ESMTPA id BF335FE635; Sat, 26 Aug 2023 16:49:38 +0800 (CST) Message-ID: <33af9c88-c31d-e11e-58a3-7f9a05718c8f@myais.com.cn> Date: Sat, 26 Aug 2023 16:49:38 +0800 MIME-Version: 1.0 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Thunderbird/102.14.0 From: "Logan.Lyu" To: ffmpeg-devel@ffmpeg.org Organization: myais Subject: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_v X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: jb@videolan.org, jdek@itanimul.li Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: YwFGGsRf3Q4E checkasm bench: 
put_hevc_epel_uni_hv64_8_i8mm: 6568.7
put_hevc_epel_uni_v4_8_c: 88.7
put_hevc_epel_uni_v4_8_neon: 32.7
put_hevc_epel_uni_v6_8_c: 185.4
put_hevc_epel_uni_v6_8_neon: 44.9
put_hevc_epel_uni_v8_8_c: 333.9
put_hevc_epel_uni_v8_8_neon: 44.4
put_hevc_epel_uni_v12_8_c: 728.7
put_hevc_epel_uni_v12_8_neon: 119.7
put_hevc_epel_uni_v16_8_c: 1224.2
put_hevc_epel_uni_v16_8_neon: 139.7
put_hevc_epel_uni_v24_8_c: 2531.2
put_hevc_epel_uni_v24_8_neon: 329.9
put_hevc_epel_uni_v32_8_c: 4739.9
put_hevc_epel_uni_v32_8_neon: 562.7
put_hevc_epel_uni_v48_8_c: 10618.7
put_hevc_epel_uni_v48_8_neon: 1256.2
put_hevc_epel_uni_v64_8_c: 19169.9
put_hevc_epel_uni_v64_8_neon: 2179.2
Co-Authored-By: J. Dekker
Signed-off-by: Logon Lyu
---
 libavcodec/aarch64/hevcdsp_epel_neon.S | 320 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
 2 files changed, 325 insertions(+)
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index a8d694639b..7ce7eec829 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -32,6 +32,326 @@ const epel_filters, align=4
 .byte -2, 10, 58, -2
 endconst
+.macro load_epel_filterb freg, xreg // load the 4 filter bytes selected by \freg and broadcast them into v0-v3
+        movrel          \xreg, epel_filters // \xreg = address of the epel_filters table
+        add             \xreg, \xreg, \freg, lsl #2 // skip \freg entries of 4 bytes each
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter: byte i of the entry is replicated into every lane of v<i>
+        neg             v0.16b, v0.16b // negate taps 0 and 3 so that calc_epelb can apply them with umlsl
+        neg             v3.16b, v3.16b // NOTE(review): relies on taps 0 and 3 never being positive in any table row - confirm
+.endm
+
+.macro calc_epelb dst, src0, src1, src2, src3 // 4-tap sum over the low 8 bytes of each source, accumulated into the 16-bit lanes of \dst
+        umlsl           \dst\().8h, \src0\().8b, v0.8b // \dst -= \src0 * v0
+        umlal           \dst\().8h, \src1\().8b, v1.8b // \dst += \src1 * v1
+        umlal           \dst\().8h, \src2\().8b, v2.8b // \dst += \src2 * v2
+        umlsl           \dst\().8h, \src3\().8b, v3.8b // \dst -= \src3 * v3
+.endm
+
+.macro calc_epelb2 dst, src0, src1, src2, src3 // same 4-tap sum as calc_epelb, but over the upper 8 bytes of each 16-byte source
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlal2          \dst\().8h, \src1\().16b, v1.16b
+        umlal2          \dst\().8h, \src2\().16b, v2.16b
+        umlsl2          \dst\().8h, \src3\().16b, v3.16b
+.endm
+
+.macro calc_all4 // 4x unrolled row loop for one register per row; needs a `calc` macro plus labels 1: (loop) and 2: (exit) at the use site
+        calc v16, v17, v18, v19 // in the calc macros of this patch the last argument receives the newly loaded row
+        b.eq            2f // exit once the flags left by calc (its subs on w4) report zero
+        calc v17, v18, v19, v16 // rotate the row registers by one instead of moving data
+        b.eq            2f
+        calc v18, v19, v16, v17
+        b.eq            2f
+        calc v19, v16, v17, v18
+        b.ne            1b // rows remain: restart the rotation from the top
+.endm
+
+.macro calc_all8 // same scheme with two registers per row (rotation step of 2 over v16-v23)
+        calc v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc v22, v23, v16, v17, v18, v19, v20, v21
+        b.ne            1b
+.endm
+
+.macro calc_all12 // same scheme with three registers per row (rotation step of 3 over v16-v27)
+        calc v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        b.eq            2f
+        calc v19, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17, v18
+        b.eq            2f
+        calc v22, v23, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v24
+        b.ne            1b
+.endm
+
+.macro calc_all16 // same scheme with four registers per row (rotation step of 4 over v16-v31)
+        calc v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        b.eq            2f
+        calc v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19
+        b.eq            2f
+        calc v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        b.ne            1b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1 // 4 output bytes per row; registers assume the AAPCS64 mapping of the epel_uni_v prototype: x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height, x6 my
+        load_epel_filterb x6, x5 // filter index is taken from x6; x5 is overwritten as the scratch address register
+        sxtw            x3, w3 // NOTE(review): the prototype in this patch declares both strides as ptrdiff_t, yet sxtw keeps only their low 32 bits - confirm this is intended
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        ld1             {v16.s}[0], [x2], x3 // preload three rows of 4 bytes each
+        ld1             {v17.s}[0], [x2], x3
+        ld1             {v18.s}[0], [x2], x3
+.macro calc src0, src1, src2, src3 // one output row: load row \src3, filter \src0..\src3, store, count down
+        ld1             {\src3\().s}[0], [x2], x3 // fetch the fourth row of the window
+        movi            v4.8h, #0 // clear the 16-bit accumulator
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6 // rounding shift right by 6, saturating to unsigned 8 bit
+        subs            w4, w4, #1 // one row done; sets the flags that calc_all4 branches on
+        st1             {v4.s}[0], [x0], x1 // write 4 bytes and advance dst by x1
+.endm
+1:      calc_all4
+.purgem calc // drop calc so that the next function can define its own variant
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1 // 6 output bytes per row, written as 4 + 2
+        load_epel_filterb x6, x5
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function: strides are ptrdiff_t in the prototype
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        sub             x1, x1, #4 // compensate for the #4 post-increment of the first store in calc
+        ld1             {v16.8b}, [x2], x3 // preload three rows; 8 bytes are read per row although only 6 are stored
+        ld1             {v17.8b}, [x2], x3
+        ld1             {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        st1             {v4.s}[0], [x0], #4 // bytes 0-3
+        subs            w4, w4, #1 // flags survive the following store and are tested in calc_all4
+        st1             {v4.h}[2], [x0], x1 // bytes 4-5, then advance to the next row
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1 // 8 output bytes per row
+        load_epel_filterb x6, x5
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        ld1             {v16.8b}, [x2], x3 // preload three rows of 8 bytes each
+        ld1             {v17.8b}, [x2], x3
+        ld1             {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b}, [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1 // 12 output bytes per row, written as 8 + 4
+        load_epel_filterb x6, x5
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        sub             x1, x1, #8 // compensate for the #8 post-increment of the first store in calc
+        ld1             {v16.16b}, [x2], x3 // preload three rows; 16 bytes are read per row although only 12 are stored
+        ld1             {v17.16b}, [x2], x3
+        ld1             {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3 // low 8 columns
+        calc_epelb2     v5, \src0, \src1, \src2, \src3 // high 8 columns
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6 // narrow the high columns into the upper half of v4
+        subs            w4, w4, #1
+        st1             {v4.8b}, [x0], #8 // bytes 0-7
+        st1             {v4.s}[2], [x0], x1 // bytes 8-11, then advance to the next row
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1 // 16 output bytes per row
+        load_epel_filterb x6, x5
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        ld1             {v16.16b}, [x2], x3 // preload three rows of 16 bytes each
+        ld1             {v17.16b}, [x2], x3
+        ld1             {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b}, [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1 // 24 output bytes per row, held as three 8-byte registers
+        load_epel_filterb x6, x5
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        ld1             {v16.8b, v17.8b, v18.8b}, [x2], x3 // preload three rows, three registers per row
+        ld1             {v19.8b, v20.8b, v21.8b}, [x2], x3
+        ld1             {v22.8b, v23.8b, v24.8b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11 // src0-2, src3-5, src6-8 are the three held rows; src9-11 receive the new row
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9 // columns 0-7: same column register taken from each of the four rows
+        calc_epelb      v5, \src1, \src4, \src7, \src10 // columns 8-15
+        calc_epelb      v6, \src2, \src5, \src8, \src11 // columns 16-23
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b-v6.8b}, [x0], x1
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1 // 32 output bytes per row, held as two 16-byte registers
+        load_epel_filterb x6, x5
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        ld1             {v16.16b, v17.16b}, [x2], x3 // preload three rows, two registers per row
+        ld1             {v18.16b, v19.16b}, [x2], x3
+        ld1             {v20.16b, v21.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7 // src0-1, src2-3, src4-5 are the three held rows; src6-7 receive the new row
+        ld1             {\src6\().16b, \src7\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, \src0, \src2, \src4, \src6 // columns 0-7
+        calc_epelb2     v5, \src0, \src2, \src4, \src6 // columns 8-15
+        calc_epelb      v6, \src1, \src3, \src5, \src7 // columns 16-23
+        calc_epelb2     v7, \src1, \src3, \src5, \src7 // columns 24-31
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6 // v5 is consumed here, so it can be reused as an output register next
+        sqrshrun        v5.8b, v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b}, [x0], x1
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1 // 48 output bytes per row, held as three 16-byte registers
+        load_epel_filterb x6, x5
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function
+        sxtw            x1, w1
+        sub             x2, x2, x3 // step back one source row
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3 // preload three rows, three registers per row
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11 // src9-11 receive the new row
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0 // v28/v29 serve as extra accumulators; calc_all12 only rotates v16-v27
+        movi            v29.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9
+        calc_epelb2     v5, \src0, \src3, \src6, \src9
+        calc_epelb      v6, \src1, \src4, \src7, \src10
+        calc_epelb2     v7, \src1, \src4, \src7, \src10
+        calc_epelb      v28, \src2, \src5, \src8, \src11
+        calc_epelb2     v29, \src2, \src5, \src8, \src11
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b, v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        sqrshrun        v6.8b, v28.8h, #6
+        sqrshrun2       v6.16b, v29.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1 // 64 output bytes per row, held as four 16-byte registers
+        load_epel_filterb x6, x5
+        sub             sp, sp, #32 // reserve 32 bytes of stack for the spill below
+        sxtw            x3, w3 // same NOTE(review) as in the v4 function
+        sxtw            x1, w1
+        st1             {v8.8b-v11.8b}, [sp] // save the low 64 bits of v8-v11 (the part AAPCS64 makes callee-saved); calc uses them as accumulators
+        sub             x2, x2, x3 // step back one source row
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3 // preload three rows, four registers per row
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15 // src12-15 receive the new row
+        ld1             {\src12\().16b, \src13\().16b, \src14\().16b, \src15\().16b}, [x2], x3
+        movi            v4.8h, #0 // eight accumulators v4-v11, since v16-v31 all hold source rows
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, \src3, \src7, \src11, \src15 // columns 48-63
+        calc_epelb2     v11, \src3, \src7, \src11, \src15
+        calc_epelb      v4, \src0, \src4, \src8, \src12 // columns 0-15
+        calc_epelb2     v5, \src0, \src4, \src8, \src12
+        calc_epelb      v6, \src1, \src5, \src9, \src13 // columns 16-31
+        calc_epelb2     v7, \src1, \src5, \src9, \src13
+        calc_epelb      v8, \src2, \src6, \src10, \src14 // columns 32-47
+        calc_epelb2     v9, \src2, \src6, \src10, \src14
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b, v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        sqrshrun        v6.8b, v8.8h, #6
+        sqrshrun2       v6.16b, v9.8h, #6
+        sqrshrun        v7.8b, v10.8h, #6
+        sqrshrun2       v7.16b, v11.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+.endm
+1:      calc_all16
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp] // restore the saved low halves of v8-v11
+        add             sp, sp, #32 // release the spill area
+        ret
+endfunc
+
 #if HAVE_I8MM
 .macro EPEL_H_HEADER
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e125b0cfb2..f1e167c50b 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -161,6 +161,10 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
 int height, int denom, int wx, int ox,
 intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(epel_uni_v, (uint8_t *dst, ptrdiff_t dststride, // prototype for the new epel_uni_v assembly; presumably one declaration per block width - confirm against NEON8_FNPROTO
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
 const uint8_t *_src, ptrdiff_t _srcstride,
 int height, int denom, int wx, int ox,
@@ -285,6 +289,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+    NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,); // hook up the new functions; NOTE(review): indices 1, 0 presumably select the vertical-only case - confirm against NEON8_FNASSIGN
 NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
 NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);