diff mbox series

[FFmpeg-devel,2/3] lavc/vp8dsp: R-V V put_epel v

Message ID CAEa-L+ucAaY=sPBkoDQBMfzcys=ny7SBuYvw2tEeD-FVwRBbog@mail.gmail.com
State New
Headers show
Series [FFmpeg-devel,1/3] lavc/vp8dsp: R-V V put_epel h | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch
yinshiyou/configure_loongarch64 warning Failed to apply patch

Commit Message

flow gg March 22, 2024, 6:01 a.m. UTC

Comments

Rémi Denis-Courmont March 27, 2024, 3:36 p.m. UTC | #1
Le perjantaina 22. maaliskuuta 2024, 8.01.21 EET flow gg a écrit :
> 

IMO, you could just as well share the code and avoid most if's. Not like one 
additional `li a3, 1` per function call is going to matter in the grand scheme 
of things. It might even help by reducing I-cache pressure.
flow gg March 28, 2024, 2:16 a.m. UTC | #2
Okay, changed in the reply and on GitHub. (Another reason I did not do this
initially was that I thought there were not enough registers available, and
that other changes would be needed that could cause side effects. It turns
out that the registers are sufficient for vp8; it is only vp9 that does not
have enough.)

Rémi Denis-Courmont <remi@remlab.net> 于2024年3月27日周三 23:36写道:

> Le perjantaina 22. maaliskuuta 2024, 8.01.21 EET flow gg a écrit :
> >
>
> IMO, you could just as well share the code and avoid most if's. Not like
> one
> additional `li a3, 1` per function call is going to matter in the grand
> scheme
> of things. It might even help by reducing I-cache pressure.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

From a59509c554a319f8271ad4175da40788445f7a56 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Thu, 21 Mar 2024 17:49:54 +0800
Subject: [PATCH 2/3] lavc/vp8dsp: R-V V put_epel v

C908:
vp8_put_epel4_v4_c: 11.0
vp8_put_epel4_v4_rvv_i32: 5.0
vp8_put_epel4_v6_c: 16.5
vp8_put_epel4_v6_rvv_i32: 6.2
vp8_put_epel8_v4_c: 43.7
vp8_put_epel8_v4_rvv_i32: 11.2
vp8_put_epel8_v6_c: 68.7
vp8_put_epel8_v6_rvv_i32: 13.2
vp8_put_epel16_v4_c: 92.5
vp8_put_epel16_v4_rvv_i32: 13.7
vp8_put_epel16_v6_c: 135.7
vp8_put_epel16_v6_rvv_i32: 16.5
---
 libavcodec/riscv/vp8dsp_init.c |  7 ++++++
 libavcodec/riscv/vp8dsp_rvv.S  | 44 +++++++++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 6614d661f7..2f123b67fe 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -85,6 +85,13 @@  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
         c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
         c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
+        /* Vertical variants: first index 0/1/2 = epel16/8/4, second index 2/1 = v6/v4. */
+        c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index a0dd46e3a8..134154acfc 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -233,9 +233,13 @@  subpel_filters:
         .byte 1,  -8,  36, 108, -11, 2
         .byte 0,  -1,  12, 123,  -6, 0
 
-.macro epel_filter size
+.macro epel_filter size type
         lla             t2, subpel_filters
+.ifc \type,v
+        addi            t0, a6, -1      /* v: filter row index comes from a6 (h uses a5) */
+.else
         addi            t0, a5, -1
+.endif
         li              t1, 6
         mul             t0, t0, t1
         add             t0, t0, t2
@@ -248,19 +252,33 @@  subpel_filters:
 .endif
 .endm
 
-.macro epel_load dst len size
+.macro epel_load dst len size type
+.ifc \type,v
+        sub             t6, a2, a3      /* v: neighbours are a2 - a3 ... */
+        add             a7, a2, a3      /* ... and a2 + a3 (h uses a2 -/+ 1) */
+.else
         addi            t6, a2, -1
         addi            a7, a2, 1
+.endif
         vle8.v          v24, (a2)
         vle8.v          v22, (t6)
         vle8.v          v26, (a7)
+.ifc \type,v
+        add             a7, a7, a3      /* v: a2 + 2*a3 */
+.else
         addi            a7, a7, 1
+.endif
         vle8.v          v28, (a7)
         vwmulu.vx       v16, v24, t2
         vwmulu.vx       v20, v26, t3
 .ifc \size,6
+.ifc \type,v
+        sub             t6, t6, a3      /* v, size 6 only: a2 - 2*a3 */
+        add             a7, a7, a3      /* v, size 6 only: a2 + 3*a3 */
+.else
         addi            t6, t6, -1
         addi            a7, a7, 1
+.endif
         vle8.v          v24, (t6)
         vle8.v          v26, (a7)
         vwmaccu.vx      v16, t0, v24
@@ -292,13 +310,13 @@  subpel_filters:
         vnclipu.wi      \dst, v24, 0
 .endm
 
-.macro epel_load_inc dst len size
-        epel_load       \dst \len \size
+.macro epel_load_inc dst len size type
+        epel_load       \dst \len \size \type   /* \type (h or v) is forwarded unchanged */
         add             a2, a2, a3
 .endm
 
-.macro epel len size
-        epel_filter     \size
+.macro epel len size type
+        epel_filter     \size \type             /* \type selects a5 (h) or a6 (v) as filter index */
 
 .ifc \len,4
         vsetivli        zero, 4, e8, mf4, ta, ma
@@ -310,7 +328,7 @@  subpel_filters:
 
 1:
         addi            a4, a4, -1
-        epel_load_inc   v30 \len \size
+        epel_load_inc   v30 \len \size \type    /* result goes to v30, then a2 += a3 */
         vse8.v          v30, (a0)
         add             a0, a0, a1
         bnez            a4, 1b
@@ -320,10 +338,18 @@  subpel_filters:
 
 .irp len 16,8,4
 func ff_put_vp8_epel\len\()_h6_rvv, zve32x
-        epel \len 6
+        epel \len 6 h           /* h: neighbouring taps are 1 byte apart */
 endfunc
 
 func ff_put_vp8_epel\len\()_h4_rvv, zve32x
-        epel \len 4
+        epel \len 4 h
+endfunc
+
+func ff_put_vp8_epel\len\()_v6_rvv, zve32x
+        epel \len 6 v           /* v: neighbouring taps are a3 bytes apart */
+endfunc
+
+func ff_put_vp8_epel\len\()_v4_rvv, zve32x
+        epel \len 4 v
 endfunc
 .endr
-- 
2.44.0