From patchwork Wed Jul 31 19:06:50 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50833 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:12d6:0:b0:489:2eb3:e4c4 with SMTP id 205csp690751vqs; Wed, 31 Jul 2024 12:07:02 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCVnF1YuAhAHspukyP9HBhnJBgXAKNX7kYv594niMNVJFAZiKpa3pjGJG1L3Epo2xOXh0dfZdDzHfM8yoFm5rNtPyutof+9aR0obtA== X-Google-Smtp-Source: AGHT+IF442HBzi9/EVXcK1wvWwFHygN63O71jawtaN76Z7vzoAJdnXdMPxuSfiiD9wevDq44cKDW X-Received: by 2002:a17:906:730f:b0:a7a:adac:57d5 with SMTP id a640c23a62f3a-a7daf54aefcmr5691166b.18.1722452822516; Wed, 31 Jul 2024 12:07:02 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1722452822; cv=none; d=google.com; s=arc-20160816; b=mSQWymz4mo9pAXrVc11R4eUooa6OP3oilkMCkkysG/KJJTsc944Ex+mD7HWGrUspK2 nHs4mFcIIazrytcNZi21XUJhTu36zgfadrVwEWwek38PwTFDDG1ac18ASFVt4hWIT+C2 dJLWqsnJ6b1h6pIdt3BAl2w8FDmV1WS0lIwfSxROch1g+okafHnWKm5CLIKZ6jXIB9wg IKM741aqsTnScfQbRiA6R/oarxikHDSX4EXwARD/W971L/8vXmwgHrfIu18bq8RHtX+3 slIbGupPkeMU1ScMJN+s9jw8GTsvg3joL+NTGrJnk2HWR9LsFRtTwQwWtgT77Det7+Yv XWVg== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:message-id:date:to:from :delivered-to; bh=Mgc7PpUrH9EqT3H2Eri6pIU11VahO1UWKbb6UsT6u1g=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=hTGonHT4UOFVZdwuLdqxumt8ndm2P0Js0kZ3P4HsmU0p5s/QxrlpmzwVBk8oYQmaCn y68aQACPhEoJskyhuzNLpyt792KsVlq7ACMVSk+xs6MnsRR3nEoPl3wwNstaGz1AFkKs Qb2OB0OCpZHkn2XnPTsoObWz4JVvcqkh6Psins03s5oglpGqzrQdxUhUCUrOf32Hm835 xzb6i5mlK0RH+zhgM7/RaQN3pSCw+P7xqJv1s3t9fNQFTbsTpPupIiJfIOS4Dd9gJF++ eloq4eM80Co/SvnPfrNmP2Wu1601F+OeOV/T26q9TewgZxC+IATSmJOXCG7pLSwWZ1hS 0auQ==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; 
spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id a640c23a62f3a-a7aca9f903dsi938190566b.244.2024.07.31.12.07.01; Wed, 31 Jul 2024 12:07:02 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id C95AB68D7A0; Wed, 31 Jul 2024 22:06:57 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 0366F68D65B for ; Wed, 31 Jul 2024 22:06:50 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 8FEC3C0143 for ; Wed, 31 Jul 2024 22:06:50 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Wed, 31 Jul 2024 22:06:50 +0300 Message-ID: <20240731190650.636970-1-remi@remlab.net> X-Mailer: git-send-email 2.45.2 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] lavc/h264dsp: R-V V idct4_add8 (all depth) X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: xGiFzUEx1OpC This is really just a wrapper for idct4_add16intra, which is in turn mostly a 
wrapper for idct4_add and idct4_dc_add. For benchmarks refer to the latter two. --- libavcodec/riscv/h264dsp_init.c | 14 ++++++-- libavcodec/riscv/h264idct_rvv.S | 59 +++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index e892c335a6..6b81587003 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -53,6 +53,9 @@ void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \ int16_t *s, int stride, \ const uint8_t nnzc[5 * 8]); \ void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \ + int16_t *s, int stride, \ + const uint8_t nnzc[5 * 8]); \ +void ff_h264_idct4_add8_##depth##_rvv(uint8_t **d, const int *soffset, \ int16_t *s, int stride, \ const uint8_t nnzc[5 * 8]); @@ -104,6 +107,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv; # if __riscv_xlen == 64 dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv; + if (chroma_format_idc <= 1) + dsp->h264_idct_add8 = ff_h264_idct4_add8_8_rvv; # endif } if (flags & AV_CPU_FLAG_RVV_I64) { @@ -123,10 +128,13 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \ dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_##depth##_rvv; \ dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \ + dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \ + dsp->h264_idct_add16intra = \ + ff_h264_idct_add16intra_##depth##_rvv; \ if (__riscv_xlen == 64) { \ - dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \ - dsp->h264_idct_add16intra = \ - ff_h264_idct_add16intra_##depth##_rvv; \ + if (chroma_format_idc <= 1) \ + dsp->h264_idct_add8 = \ + ff_h264_idct4_add8_##depth##_rvv; \ } \ } \ if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB)) \ diff --git a/libavcodec/riscv/h264idct_rvv.S 
b/libavcodec/riscv/h264idct_rvv.S index f823346c8d..70b7cfac4d 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -57,7 +57,7 @@ endfunc func ff_h264_idct_add_8_rvv, zve32x lpad 0 csrwi vxrm, 0 -.Lidct_add4_8_rvv: +.Lidct4_add_8_rvv: vsetivli zero, 4, e16, mf2, ta, ma addi t1, a1, 1 * 4 * 2 vle16.v v0, (a1) @@ -111,7 +111,7 @@ endfunc func ff_h264_idct_add_16_rvv, zve32x csrwi vxrm, 0 -.Lidct_add4_16_rvv: +.Lidct4_add_16_rvv: vsetivli zero, 4, e32, m1, ta, ma addi t1, a1, 1 * 4 * 4 vle32.v v0, (a1) @@ -543,8 +543,12 @@ endfunc .endr const ff_h264_scan8 - .byte 014, 015, 024, 025, 016, 017, 026, 027 - .byte 034, 035, 044, 045, 036, 037, 046, 047 + .byte 014, 015, 024, 025, 016, 017, 026, 027 + .byte 034, 035, 044, 045, 036, 037, 046, 047 + .byte 064, 065, 074, 075, 066, 067, 076, 077 + .byte 0104, 0105, 0114, 0115, 0106, 0107, 0116, 0117 + .byte 0134, 0135, 0144, 0145, 0136, 0137, 0146, 0147 + .byte 0154, 0155, 0164, 0165, 0156, 0157, 0166, 0167 endconst .macro idct4_adds type, depth @@ -554,8 +558,11 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b .endif csrwi vxrm, 0 lla t0, ff_h264_scan8 - li t1, 32 * (\depth / 8) vsetivli zero, 16, e8, m1, ta, ma +.ifc \type, 16intra +.Lidct4_add4_\depth\()_rvv: +.endif + li t1, 32 * (\depth / 8) vle8.v v8, (t0) .if \depth == 8 vlse16.v v16, (a2), t1 @@ -587,7 +594,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b mv t5, a1 mv a1, a2 mv a2, a3 - li a3, 16 + csrr a3, vl mv a7, ra 1: andi t0, a4, 1 @@ -603,7 +610,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b .else beqz t0, 2f # if (nnzc[scan8[i]]) .endif - jal .Lidct_add4_\depth\()_rvv + jal .Lidct4_add_\depth\()_rvv j 3f 2: .ifnc \type, 16 @@ -702,6 +709,38 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x, b addi sp, sp, 48 ret endfunc + +func ff_h264_idct4_add8_\depth\()_rvv, zve32x +.if \depth == 8 + lpad 0 +.endif + csrwi vxrm, 0 + addi sp, sp, -32 + addi a2, a2, 16 * 16 * 2 * (\depth / 8) # &block[16 * 16] + 
lla t0, ff_h264_scan8 + 16 + sd s0, 0(sp) + sd ra, 8(sp) + mv s0, sp + sd a0, 16(sp) + sd a4, 24(sp) + ld a0, 0(a0) # dest[0] + addi a1, a1, 16 * 4 # &block_offset[16] + vsetivli zero, 4, e8, mf4, ta, ma + jal .Lidct4_add4_\depth\()_rvv + + ld a4, 24(sp) # nnzc + ld a0, 16(sp) + mv a3, a2 # stride + addi a2, a1, (16 - 4) * 16 * 2 * (\depth / 8) # &block[32 * 16] + addi a1, t5, (16 - 4)* 4 # &block_offset[32] + ld a0, 8(a0) # dest[1] + lla t0, ff_h264_scan8 + 32 + ld ra, 8(sp) + ld s0, 0(sp) + addi sp, sp, 32 + vsetivli zero, 4, e8, mf4, ta, ma + j .Lidct4_add4_\depth\()_rvv +endfunc #endif .endr @@ -724,5 +763,11 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x li a5, (1 << \depth) - 1 j ff_h264_idct8_add4_16_rvv endfunc + +func ff_h264_idct4_add8_\depth\()_rvv, zve32x + lpad 0 + li a5, (1 << \depth) - 1 + j ff_h264_idct4_add8_16_rvv +endfunc #endif .endr