From patchwork Wed Jul 31 20:05:17 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= X-Patchwork-Id: 50837 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:12d6:0:b0:489:2eb3:e4c4 with SMTP id 205csp718324vqs; Wed, 31 Jul 2024 13:05:28 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCXF9Ijf80yFWOSY+EfMYDapojSNvKgxEQoVSJWvNIvwpIeWDNTf14CDqWk/5XI3g6CXR2OkC2GbnWOb2ic1HeGIs11agCzdEE7Hnw== X-Google-Smtp-Source: AGHT+IEOiDjRZwgyAX+3jsPXDzEru2iM0d+V1SeWlN4PlFAmolRFN7TR9FhOL0PKWGqywPgAbvY1 X-Received: by 2002:a05:6512:3c91:b0:52f:89aa:c344 with SMTP id 2adb3069b0e04-530b61af829mr89613e87.16.1722456327718; Wed, 31 Jul 2024 13:05:27 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1722456327; cv=none; d=google.com; s=arc-20160816; b=vJnvrODJ8/IbKx+Cn8eYvGI62niMO27toRae8ITbPosAsqSewSFpxIE3GwaZDuBaQ/ SJG7DlFA0TGZULf2FPe5Uuq8OEhu18S6KyU3UJGqebzEaGwWUHPqyPwUhTg6AgaJjC68 qJ7FFW2SYdXvzX87yjULPOZvVYmocrw5kBopsSPpXgA21gE/8ldLKZ0P6fXUHsASDrXr bbe6b2v5goMFdDPmIh0XfJtxEWRh/m6qRVatdHJb5oAbmMvmgtwfKe1paddNliN/BBr6 O6CjKPQLKOScgIXdP51h0yNAzBpI1xQH9X6/8Vmdk0ajTKJyJVJNEIOkZTsVo1MhNb2F 5VZw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:reply-to:list-subscribe :list-help:list-post:list-archive:list-unsubscribe:list-id :precedence:subject:mime-version:references:in-reply-to:message-id :date:to:from:delivered-to; bh=XDgZ3hklbYLITxgIZ9sYxlwrKw/YY1rUEFNuundJXL0=; fh=YOA8vD9MJZuwZ71F/05pj6KdCjf6jQRmzLS+CATXUQk=; b=xaCPnAeTegse7KJdbJxGde1TTbfuzsI20nwn+zinBPB094RLJpPEraRt6luG7YiVD7 H/cpKVxh2GfXZJEc5THEpE744eVzJLSH0VP8FeqAR2VP+t+iPKdzByqHGQ/gLfwstdbx 3B2dDbYHqdoLvYSOwobWn06R7rfzVrNYNv6tBkHDsj5GiYc3ofaqN4YBa4FqS4Rq4Jp1 nk367RUyz84cyY78OLXGzmVqI9xOOUNWPKBfovJ3eULqsZRrt/zxCiYjxn94lGYCBZJC 8rJh7h0j1y6P7rcRLXBypPBse/A6ueNcH3wo6P4RvRC89AKQF0X5zaUuOUjF2d9JFygt duFQ==; dara=google.com 
ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id a640c23a62f3a-a7acac632f1si939707166b.581.2024.07.31.13.05.27; Wed, 31 Jul 2024 13:05:27 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id BD37568D8EC; Wed, 31 Jul 2024 23:05:24 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from ursule.remlab.net (vps-a2bccee9.vps.ovh.net [51.75.19.47]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id E825C68CCF9 for ; Wed, 31 Jul 2024 23:05:17 +0300 (EEST) Received: from basile.remlab.net (localhost [IPv6:::1]) by ursule.remlab.net (Postfix) with ESMTP id 8C44CC0143 for ; Wed, 31 Jul 2024 23:05:17 +0300 (EEST) From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= To: ffmpeg-devel@ffmpeg.org Date: Wed, 31 Jul 2024 23:05:17 +0300 Message-ID: <20240731200517.701331-1-remi@remlab.net> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240731190650.636970-1-remi@remlab.net> References: <20240731190650.636970-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCHv2] lavc/h264dsp: R-V V idct4_add8 (all depths) X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches 
Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: r3/mTgfRw9s9 These are really just wrappers for idct4_add16intra functions, which are in turn mostly wrappers for idct4_add and idct4_dc_add functions. For benchmarks refer to the latter two sets. --- libavcodec/riscv/h264dsp_init.c | 24 ++++++-- libavcodec/riscv/h264idct_rvv.S | 97 +++++++++++++++++++++++++++++---- 2 files changed, 107 insertions(+), 14 deletions(-) diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index 671330d664..7f787d8f57 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -54,7 +54,13 @@ void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \ const uint8_t nnzc[5 * 8]); \ void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \ int16_t *s, int stride, \ - const uint8_t nnzc[5 * 8]); + const uint8_t nnzc[5 * 8]); \ +void ff_h264_idct4_add8_##depth##_rvv(uint8_t **d, const int *soffset, \ + int16_t *s, int stride, \ + const uint8_t nnzc[5 * 8]); \ +void ff_h264_idct4_add8_422_##depth##_rvv(uint8_t **d, const int *soffset, \ + int16_t *s, int stride, \ + const uint8_t nnzc[5 * 8]); IDCT_DEPTH(8) IDCT_DEPTH(9) @@ -104,6 +110,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv; # if __riscv_xlen == 64 dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv; + if (chroma_format_idc <= 1) + dsp->h264_idct_add8 = ff_h264_idct4_add8_8_rvv; + else + dsp->h264_idct_add8 = ff_h264_idct4_add8_422_8_rvv; # endif } if (flags & AV_CPU_FLAG_RVV_I64) { @@ -123,10 +133,16 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \ dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_##depth##_rvv; \ dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \ + dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \ + dsp->h264_idct_add16intra = \ + 
ff_h264_idct_add16intra_##depth##_rvv; \ if (__riscv_xlen == 64) { \ - dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \ - dsp->h264_idct_add16intra = \ - ff_h264_idct_add16intra_##depth##_rvv; \ + if (chroma_format_idc <= 1) \ + dsp->h264_idct_add8 = \ + ff_h264_idct4_add8_##depth##_rvv; \ + else \ + dsp->h264_idct_add8 = \ + ff_h264_idct4_add8_422_##depth##_rvv; \ } \ } \ if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB)) \ diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S index f823346c8d..d2f77a5b47 100644 --- a/libavcodec/riscv/h264idct_rvv.S +++ b/libavcodec/riscv/h264idct_rvv.S @@ -57,7 +57,7 @@ endfunc func ff_h264_idct_add_8_rvv, zve32x lpad 0 csrwi vxrm, 0 -.Lidct_add4_8_rvv: +.Lidct4_add_8_rvv: vsetivli zero, 4, e16, mf2, ta, ma addi t1, a1, 1 * 4 * 2 vle16.v v0, (a1) @@ -111,7 +111,7 @@ endfunc func ff_h264_idct_add_16_rvv, zve32x csrwi vxrm, 0 -.Lidct_add4_16_rvv: +.Lidct4_add_16_rvv: vsetivli zero, 4, e32, m1, ta, ma addi t1, a1, 1 * 4 * 4 vle32.v v0, (a1) @@ -543,19 +543,26 @@ endfunc .endr const ff_h264_scan8 - .byte 014, 015, 024, 025, 016, 017, 026, 027 - .byte 034, 035, 044, 045, 036, 037, 046, 047 + .byte 014, 015, 024, 025, 016, 017, 026, 027 + .byte 034, 035, 044, 045, 036, 037, 046, 047 + .byte 064, 065, 074, 075, 066, 067, 076, 077 + .byte 0104, 0105, 0114, 0115, 0106, 0107, 0116, 0117 + .byte 0134, 0135, 0144, 0145, 0136, 0137, 0146, 0147 + .byte 0154, 0155, 0164, 0165, 0156, 0157, 0166, 0167 endconst -.macro idct4_adds type, depth +.macro idct4_add16 type, depth func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b .if \depth == 8 lpad 0 .endif csrwi vxrm, 0 lla t0, ff_h264_scan8 - li t1, 32 * (\depth / 8) vsetivli zero, 16, e8, m1, ta, ma +.ifc \type, 16intra +.Lidct4_add4_\depth\()_rvv: +.endif + li t1, 32 * (\depth / 8) vle8.v v8, (t0) .if \depth == 8 vlse16.v v16, (a2), t1 @@ -587,7 +594,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b mv t5, a1 mv a1, a2 mv a2, a3 - li a3, 16 + csrr 
a3, vl mv a7, ra 1: andi t0, a4, 1 @@ -603,7 +610,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b .else beqz t0, 2f # if (nnzc[scan8[i]]) .endif - jal .Lidct_add4_\depth\()_rvv + jal .Lidct4_add_\depth\()_rvv j 3f 2: .ifnc \type, 16 @@ -621,9 +628,67 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b endfunc .endm +.macro idct4_add8 type, depth +func ff_h264_idct4_add\type\()_\depth\()_rvv, zve32x +.if \depth == 8 + lpad 0 +.endif + csrwi vxrm, 0 + addi sp, sp, -32 + addi a2, a2, 16 * 16 * 2 * (\depth / 8) # &block[16 * 16] + lla t0, ff_h264_scan8 + 16 + sd s0, 0(sp) + sd ra, 8(sp) + mv s0, sp + sd a0, 16(sp) + sd a4, 24(sp) + ld a0, 0(a0) # dest[0] + addi a1, a1, 16 * 4 # &block_offset[16] + vsetivli zero, 4, e8, mf4, ta, ma + jal .Lidct4_add4_\depth\()_rvv + + ld a4, 24(sp) # nnzc + ld a0, 16(sp) + mv a3, a2 # stride + addi a2, a1, (16 - 4) * 16 * 2 * (\depth / 8) # &block[32 * 16] + addi a1, t5, (16 - 4) * 4 # &block_offset[32] + ld a0, 8(a0) # dest[1] + lla t0, ff_h264_scan8 + 32 +.ifc \type, 8_422 + vsetivli zero, 4, e8, mf4, ta, ma + jal .Lidct4_add4_\depth\()_rvv + + ld a4, 24(sp) # nnzc + ld a0, 16(sp) + mv a3, a2 # stride + addi a2, a1, (-12- 4) * 16 * 2 * (\depth / 8) # &block[20 * 16] + addi a1, t5, (-8 - 4) * 4 # &block_offset[24] + ld a0, 0(a0) # dest[0] + lla t0, ff_h264_scan8 + 24 + vsetivli zero, 4, e8, mf4, ta, ma + jal .Lidct4_add4_\depth\()_rvv + + ld a4, 24(sp) # nnzc + ld a0, 16(sp) + mv a3, a2 # stride + addi a2, a1, (16 - 4) * 16 * 2 * (\depth / 8) # &block[36 * 16] + addi a1, t5, (16 - 4) * 4 # &block_offset[40] + ld a0, 8(a0) # dest[1] + lla t0, ff_h264_scan8 + 40 +.endif + ld ra, 8(sp) + ld s0, 0(sp) + addi sp, sp, 32 + vsetivli zero, 4, e8, mf4, ta, ma + j .Lidct4_add4_\depth\()_rvv +endfunc +.endm + .irp depth, 8, 16 -idct4_adds 16, \depth -idct4_adds 16intra, \depth +idct4_add16 16, \depth +idct4_add16 16intra, \depth +idct4_add8 8, \depth +idct4_add8 8_422, \depth #if (__riscv_xlen == 64) func 
ff_h264_idct8_add4_\depth\()_rvv, zve32x, b @@ -724,5 +789,17 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x li a5, (1 << \depth) - 1 j ff_h264_idct8_add4_16_rvv endfunc + +func ff_h264_idct4_add8_\depth\()_rvv, zve32x + lpad 0 + li a5, (1 << \depth) - 1 + j ff_h264_idct4_add8_16_rvv +endfunc + +func ff_h264_idct4_add8_422_\depth\()_rvv, zve32x + lpad 0 + li a5, (1 << \depth) - 1 + j ff_h264_idct4_add8_422_16_rvv +endfunc #endif .endr