From patchwork Mon Jul 15 19:11:21 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50548 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a05:612c:2298:b0:482:c625:d099 with SMTP id fp24csp2457382vqb; Mon, 15 Jul 2024 12:11:38 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCXnume3PK+5rm36rfjNv0uVPirhdmobuRZsTtmmUMGnXDO80IamwXLd8WwDlGtVEVbhDuHNDcFrE3gmX8DYCqsqHQXE9Gu+gW/3cg== X-Google-Smtp-Source: AGHT+IG6MJ9ecgpVPTp47GXup+k3QvaBfx1viWWkptSWgTbw1FZ69aCASl+Ee8EtZCD1Vic0+HJi X-Received: by 2002:a17:906:480c:b0:a77:c364:c4f2 with SMTP id a640c23a62f3a-a79e6af9cf7mr42943266b.52.1721070698012; Mon, 15 Jul 2024 12:11:38 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1721070697; cv=none; d=google.com; s=arc-20160816; b=kITlm/m7Jw/TDX25ZGFAO8KcMS1aho0hCqmHQStzIiXa3YOp0ESP8AhJVlyKJrv79A BCMvvELodOsm8rmVWqHjQxa1MRiIBLpqtffY9bmhjudcNFreolnyey7Ag/QuK8zJrFHi q5keO4rA0pRe2lQSfDzn+N+fbnoMNeJfI7Vkw60t+tJXOkfr175BCYjoRR+Ptz2BkpTg 2+HbdUh60LjXVPzcUyZ41ww1oc8keWZ4C1AW6ZwzDpCqEW0hpgiv5w0xeVrFGTfAYJdb LphKnTpKNI96WXMu0bSsOyJHWLi++50pEKddzqUTO6WX0joCrMiFmsqV5wLGe3q9qeOG vGhQ== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:message-id:date:to:from :delivered-to; bh=vTYH3uE9hsPBzLkf+ZKIx87PBPITcTsZ3ZO/MSxlax4=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=vJm0FqFn/QNVuPDLGVry/DwLan1z/OYP3gremLFw3jjuxiVVySPpZq0b+D0eNDZdIO 58YRF0Cci75LrAQdEwEBgGeTFaltp39NZUc83LYKuwdrIgnjTpNvXamf9EcJTE9zBBPv lvzVHZ//fzd6w9PSLxQSauoUj0yyeFOxN2vIMqgU9+tQxEaByx80p+oZn2ESZpg/MpPG +foxnZDFoIGEAm2sWthpzH95QDbzfFK6WZj8wwmMwtmdjWyjFe9uIcRVm33slnS3d7o8 cQI/NndUvDePCaKmm2MgXlkdFrHKHa4XWrYo+vsrldvBQYxtMsSwDEBwMzaQUVLxsZGD y3vQ==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id a640c23a62f3a-a79bc827c27si265984966b.981.2024.07.15.12.11.33; Mon, 15 Jul 2024 12:11:37 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 1401F68D240; Mon, 15 Jul 2024 22:11:30 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id B85A468D861 for ; Mon, 15 Jul 2024 22:11:22 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 2D18EC0140 for ; Mon, 15 Jul 2024 22:11:22 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Mon, 15 Jul 2024 22:11:21 +0300 Message-ID: <20240715191121.14217-1-remi@remlab.net> X-Mailer: git-send-email 2.45.2 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] lavc/h264dsp: R-V V high-depth idct_add{, intra}16, idct8_add4 X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: YiC41BkyUDpr As with 8-bit, this tends to be faster, but results are all over the place due to the variable distribution of non-zero coefficients. --- libavcodec/riscv/h264dsp_init.c | 77 +++++++++-------- libavcodec/riscv/h264idct_rvv.S | 147 +++++++++++++++++++++++++------- 2 files changed, 154 insertions(+), 70 deletions(-) diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index 4fc695f158..14eea29892 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -40,26 +40,25 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); -void ff_h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct8_add_8_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct_add16_8_rvv(uint8_t *dst, const int *blockoffset, - int16_t *block, int stride, - const uint8_t nnzc[5 * 8]); -void ff_h264_idct_add16intra_8_rvv(uint8_t *dst, const int *blockoffset, - int16_t *block, int stride, - const uint8_t nnzc[5 * 8]); -void ff_h264_idct8_add4_8_rvv(uint8_t *dst, const int *blockoffset, - int16_t *block, int stride, - const uint8_t nnzc[5 * 8]); - -void ff_h264_idct_add_9_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct8_add_9_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct_add_10_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct8_add_10_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct_add_12_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct8_add_12_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct_add_14_rvv(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct8_add_14_rvv(uint8_t *dst, int16_t *block, int stride); +#define IDCT_DEPTH(depth) \ +void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \ +void ff_h264_idct8_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \ +void ff_h264_idct_add16_##depth##_rvv(uint8_t *d, const int *soffset, \ + int16_t *s, int stride, \ + const uint8_t nnzc[5 * 8]); \ +void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \ + int16_t *s, int stride, \ + const uint8_t nnzc[5 * 8]); \ +void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \ + int16_t *s, int stride, \ + const uint8_t nnzc[5 * 8]); + +IDCT_DEPTH(8) +IDCT_DEPTH(9) +IDCT_DEPTH(10) +IDCT_DEPTH(12) +IDCT_DEPTH(14) +#undef IDCT_DEPTH void ff_h264_add_pixels8_8_rvv(uint8_t *dst, int16_t *block, int stride); void ff_h264_add_pixels4_8_rvv(uint8_t *dst, int16_t *block, int stride); @@ -106,26 +105,26 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv; } - if (bit_depth == 9) { - if (zvl128b) - dsp->h264_idct_add = ff_h264_idct_add_9_rvv; - dsp->h264_idct8_add = ff_h264_idct8_add_9_rvv; - } - if (bit_depth == 10) { - if (zvl128b) - dsp->h264_idct_add = ff_h264_idct_add_10_rvv; - dsp->h264_idct8_add = ff_h264_idct8_add_10_rvv; - } - if (bit_depth == 12) { - if (zvl128b) - dsp->h264_idct_add = ff_h264_idct_add_12_rvv; - dsp->h264_idct8_add = ff_h264_idct8_add_12_rvv; - } - if (bit_depth == 14) { - if (zvl128b) - dsp->h264_idct_add = ff_h264_idct_add_14_rvv; - dsp->h264_idct8_add = ff_h264_idct8_add_14_rvv; +#define IDCT_DEPTH(depth) \ + if (bit_depth == depth) { \ + if (zvl128b) \ + dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \ + if (flags & AV_CPU_FLAG_RVB_ADDR) \ + dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \ + if (__riscv_xlen == 64 && zvl128b) { \ + dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \ + dsp->h264_idct_add16intra = \ + ff_h264_idct_add16intra_##depth##_rvv; \ + } \ + if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB_ADDR)) \ + dsp->h264_idct8_add4 = ff_h264_idct8_add4_##depth##_rvv; \ } + + IDCT_DEPTH(9) + IDCT_DEPTH(10) + IDCT_DEPTH(12) + IDCT_DEPTH(14) + if (bit_depth > 8 && zvl128b) { dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_16_rvv; if (flags & AV_CPU_FLAG_RVV_I64) diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index 7dd0a524fe..48de65ec0b 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -107,6 +107,7 @@ endfunc func ff_h264_idct_add_16_rvv, zve32x csrwi vxrm, 0 +.Lidct_add4_16_rvv: vsetivli zero, 4, e32, m1, ta, ma addi t1, a1, 1 * 4 * 4 vle32.v v0, (a1) @@ -147,7 +148,7 @@ func ff_h264_idct_add_16_rvv, zve32x vmax.vx v\n, v\n, zero .endr .irp n,0,1,2,3 - vmin.vx v\n, v\n, a3 + vmin.vx v\n, v\n, a5 .endr vsetvli zero, zero, e16, mf2, ta, ma vncvt.x.x.w v4, v0 @@ -295,9 +296,10 @@ func ff_h264_idct8_add_8_rvv, zve32x endfunc func ff_h264_idct8_add_16_rvv, zve32x - li a4, 8 csrwi vxrm, 0 - vsetivli a5, 8, e32, m1, ta, ma +.Lidct8_add_16_rvv: + li a4, 8 + vsetivli a3, 8, e32, m1, ta, ma 1: addi t1, a1, 1 * 8 * 4 vle32.v v0, (a1) @@ -313,11 +315,11 @@ func ff_h264_idct8_add_16_rvv, zve32x vle32.v v5, (t5) addi a7, a1, 7 * 8 * 4 vle32.v v6, (t6) - sub a4, a4, a5 + sub a4, a4, a3 vle32.v v7, (a7) jal t0, ff_h264_idct8_rvv vse32.v v0, (a1) - sh2add a1, a5, a1 + sh2add a1, a3, a1 vse32.v v1, (t1) vse32.v v2, (t2) vse32.v v3, (t3) @@ -329,7 +331,7 @@ func ff_h264_idct8_add_16_rvv, zve32x addi a1, a1, -8 * 4 li a4, 8 - slli a6, a5, 3 + 2 + slli a6, a3, 3 + 2 2: vsetvli zero, zero, e32, m1, ta, ma vlseg8e32.v v0, (a1) @@ -348,7 +350,7 @@ func ff_h264_idct8_add_16_rvv, zve32x vle16.v v21, (t5) add a7, t6, a2 vle16.v v22, (t6) - sub a4, a4, a5 + sub a4, a4, a3 vle16.v v23, (a7) .irp n,0,1,2,3,4,5,6,7 vssra.vi v\n, v\n, 6 @@ -368,7 +370,7 @@ func ff_h264_idct8_add_16_rvv, zve32x vmax.vx v\n, v\n, zero .endr .irp n,0,1,2,3,4,5,6,7 - vmin.vx v\n, v\n, a3 + vmin.vx v\n, v\n, a5 .endr vsetvli zero, zero, e16, mf2, ta, ma vncvt.x.x.w v16, v0 @@ -380,7 +382,7 @@ func ff_h264_idct8_add_16_rvv, zve32x vncvt.x.x.w v22, v6 vncvt.x.x.w v23, v7 vse16.v v16, (a0) - sh1add a0, a5, a0 + sh1add a0, a3, a0 vse16.v v17, (t1) vse16.v v18, (t2) vse16.v v19, (t3) @@ -400,12 +402,12 @@ endfunc .irp depth, 9, 10, 12, 14 func ff_h264_idct_add_\depth\()_rvv, zve32x - li a3, (1 << \depth) - 1 + li a5, (1 << \depth) - 1 j ff_h264_idct_add_16_rvv endfunc func ff_h264_idct8_add_\depth\()_rvv, zve32x - li a3, (1 << \depth) - 1 + li a5, (1 << \depth) - 1 j ff_h264_idct8_add_16_rvv endfunc .endr @@ -416,13 +418,13 @@ const ff_h264_scan8 endconst #if (__riscv_xlen == 64) -.irp depth, 8 +.irp depth, 8, 16 func ff_h264_idct_add16_\depth\()_rvv, zve32x csrwi vxrm, 0 - addi sp, sp, -80 + addi sp, sp, -96 lla t0, ff_h264_scan8 sd s0, (sp) - li t1, 32 << (\depth > 8) + li t1, 32 * (\depth / 8) mv s0, sp sd ra, 8(sp) sd s1, 16(sp) @@ -432,9 +434,19 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x sd s5, 48(sp) sd s6, 56(sp) sd s7, 64(sp) +.if \depth > 8 + sd s8, 72(sp) + sd s9, 80(sp) + mv s8, a5 + mv s9, a6 +.endif vsetivli zero, 16, e8, m1, ta, ma vle8.v v8, (t0) +.if \depth == 8 vlse16.v v16, (a2), t1 +.else + vlse32.v v16, (a2), t1 +.endif vluxei8.v v12, (a4), v8 .if \depth == 8 vsetvli zero, zero, e16, m2, ta, ma @@ -464,17 +476,28 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x mv a1, s6 mv a2, s7 add a0, s4, t2 - beqz t1, 2f # if (nnz == 1 && block[i * 16]) - call ff_h264_idct_dc_add_\depth\()_c +.if \depth > 8 + mv a5, s8 +.endif + bnez t1, 2f # if (nnz == 1 && block[i * 16]) + jal .Lidct_add4_\depth\()_rvv j 3f 2: - call .Lidct_add4_\depth\()_rvv +.if \depth == 8 + call ff_h264_idct_dc_add_\depth\()_c +.else + jalr s9 +.endif 3: srli s3, s3, 1 addi s5, s5, 4 - addi s6, s6, 16 * 2 << (\depth > 8) + addi s6, s6, 16 * 2 * (\depth / 8) bnez s1, 1b +.if \depth > 8 + ld s9, 80(sp) + ld s8, 72(sp) +.endif ld s7, 64(sp) ld s6, 56(sp) ld s5, 48(sp) @@ -484,16 +507,16 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x ld s1, 16(sp) ld ra, 8(sp) ld s0, 0(sp) - addi sp, sp, 80 + addi sp, sp, 96 ret endfunc func ff_h264_idct_add16intra_\depth\()_rvv, zve32x csrwi vxrm, 0 - addi sp, sp, -80 + addi sp, sp, -96 lla t0, ff_h264_scan8 sd s0, (sp) - li t1, 32 << (\depth > 8) + li t1, 32 * (\depth / 8) mv s0, sp sd ra, 8(sp) sd s1, 16(sp) @@ -503,9 +526,19 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x sd s5, 48(sp) sd s6, 56(sp) sd s7, 64(sp) +.if \depth > 8 + sd s8, 72(sp) + sd s9, 80(sp) + mv s8, a5 + mv s9, a6 +.endif vsetivli zero, 16, e8, m1, ta, ma vle8.v v8, (t0) +.if \depth == 8 vlse16.v v16, (a2), t1 +.else + vlse32.v v16, (a2), t1 +.endif vluxei8.v v12, (a4), v8 .if \depth == 8 vsetvli zero, zero, e16, m2, ta, ma @@ -532,18 +565,29 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x mv a1, s6 mv a2, s7 add a0, s4, t2 +.if \depth > 8 + mv a5, s8 +.endif beqz t0, 2f # if (nnzc[scan8[i]]) - call .Lidct_add4_\depth\()_rvv + jal .Lidct_add4_\depth\()_rvv j 3f 2: beqz t1, 3f # if (block[i * 16]) +.if \depth == 8 call ff_h264_idct_dc_add_\depth\()_c +.else + jalr s9 +.endif 3: srli s3, s3, 1 addi s5, s5, 4 - addi s6, s6, 16 * 2 << (\depth > 8) + addi s6, s6, 16 * 2 * (\depth / 8) bnez s1, 1b +.if \depth > 8 + ld s9, 80(sp) + ld s8, 72(sp) +.endif ld s7, 64(sp) ld s6, 56(sp) ld s5, 48(sp) @@ -553,16 +597,16 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x ld s1, 16(sp) ld ra, 8(sp) ld s0, 0(sp) - addi sp, sp, 80 + addi sp, sp, 96 ret endfunc func ff_h264_idct8_add4_\depth\()_rvv, zve32x csrwi vxrm, 0 - addi sp, sp, -80 + addi sp, sp, -96 lla t0, ff_h264_scan8 sd s0, (sp) - li t1, 4 * 32 << (\depth > 8) + li t1, 4 * 32 * (\depth / 8) mv s0, sp li t2, 4 sd ra, 8(sp) @@ -573,9 +617,19 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x sd s5, 48(sp) sd s6, 56(sp) sd s7, 64(sp) +.if \depth > 8 + sd s8, 72(sp) + sd s9, 80(sp) + mv s8, a5 + mv s9, a6 +.endif vsetivli zero, 4, e8, mf4, ta, ma vlse8.v v8, (t0), t2 +.if \depth == 8 vlse16.v v16, (a2), t1 +.else + vlse32.v v16, (a2), t1 +.endif vluxei8.v v12, (a4), v8 .if \depth == 8 vsetvli zero, zero, e16, mf2, ta, ma @@ -604,17 +658,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x mv a1, s6 mv a2, s7 add a0, s4, t2 - beqz t1, 2f # if (nnz == 1 && block[i * 16]) - call ff_h264_idct8_dc_add_\depth\()_c +.if \depth > 8 + mv a5, s8 +.endif + bnez t1, 2f # if (nnz == 1 && block[i * 16]) + jal .Lidct8_add_\depth\()_rvv j 3f 2: - call .Lidct8_add_\depth\()_rvv +.if \depth == 8 + call ff_h264_idct8_dc_add_\depth\()_c +.else + jalr s9 +.endif 3: srli s3, s3, 1 addi s5, s5, 4 * 4 - addi s6, s6, 4 * 16 * 2 << (\depth > 8) + addi s6, s6, 4 * 16 * 2 * (\depth / 8) bnez s1, 1b +.if \depth > 8 + ld s9, 80(sp) + ld s8, 72(sp) +.endif ld s7, 64(sp) ld s6, 56(sp) ld s5, 48(sp) @@ -624,8 +689,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x ld s1, 16(sp) ld ra, 8(sp) ld s0, 0(sp) - addi sp, sp, 80 + addi sp, sp, 96 ret endfunc .endr + +.irp depth, 9, 10, 12, 14 +func ff_h264_idct_add16_\depth\()_rvv, zve32x + li a5, (1 << \depth) - 1 + lla a6, ff_h264_idct_dc_add_\depth\()_c + j ff_h264_idct_add16_16_rvv +endfunc + +func ff_h264_idct_add16intra_\depth\()_rvv, zve32x + li a5, (1 << \depth) - 1 + lla a6, ff_h264_idct_dc_add_\depth\()_c + j ff_h264_idct_add16intra_16_rvv +endfunc + +func ff_h264_idct8_add4_\depth\()_rvv, zve32x + li a5, (1 << \depth) - 1 + lla a6, ff_h264_idct8_dc_add_\depth\()_c + j ff_h264_idct8_add4_16_rvv +endfunc +.endr #endif