diff mbox series

[FFmpeg-devel,5/5] lavc/vp8dsp: factor R-V V EPEL functions for all lengths

Message ID 20240525153840.78147-5-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,1/5] lavc/vp8dsp: avoid one multiplication on RISC-V | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont May 25, 2024, 3:38 p.m. UTC
---
 libavcodec/riscv/vp8dsp_rvv.S | 56 ++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 24 deletions(-)

Comments

flow gg May 25, 2024, 6:16 p.m. UTC | #1
Would it be better to replace the two vsetvlstatic8 and vsetvlstatic16 with
two vsetvl? This would require the previous patch and this one to work
together, increasing the number of lines of code and making the code a bit
harder to read.
Additionally, I have a question about patch 4 'save one R-V GPR' and patch
5. Should they be submitted as a single patch? Because patch 4 looks
similar to what I initially submitted, and you suggested changing it to
save lines of code. If it is only for patch 5, shouldn't they be combined
together?

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月25日周六 23:39写道:

> ---
>  libavcodec/riscv/vp8dsp_rvv.S | 56 ++++++++++++++++++++---------------
>  1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index a4fcd158a5..002e7f3174 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -32,16 +32,6 @@
>  .endif
>  .endm
>
> -.macro vsetvlstatic16 len
> -.if \len <= 4
> -        vsetivli        zero, \len, e16, mf2, ta, ma
> -.elseif \len <= 8
> -        vsetivli        zero, \len, e16, m1, ta, ma
> -.elseif \len <= 16
> -        vsetivli        zero, \len, e16, m2, ta, ma
> -.endif
> -.endm
> -
>  .macro vp8_idct_dc_add
>          vlse32.v      v0, (a0), a2
>          lh            a5, 0(a1)
> @@ -181,13 +171,8 @@ const subpel_filters
>          .byte 0,  -1,  12, 123,  -6, 0
>  endconst
>
> -.macro epel len size type
> -func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> -.ifc \type,v
> -        addi            t0, a6, -1
> -.else
> -        addi            t0, a5, -1
> -.endif
> +.macro epel_common size, type
> +func ff_put_vp8_epel_\type\()\size\().rvv, zve32x
>          lla             t2, subpel_filters
>          sh1add          t0, t0, t0
>          sh1add          t0, t0, t2
> @@ -198,7 +183,6 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>          lb              t5, 5(t0)
>          lb              t0, (t0)
>  .endif
> -        vsetvlstatic8   \len
>  1:
>          addi            a4, a4, -1
>  .ifc \type,v
> @@ -236,11 +220,11 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>          vwmaccsu.vx     v16, t1, v22
>          vwmaccsu.vx     v16, t4, v28
>          vwadd.wx        v16, v16, t6
> -        vsetvlstatic16  \len
> +        vsetvl          zero, zero, a6 # e16
>          vwadd.vv        v24, v16, v20
>          vnsra.wi        v24, v24, 7
>          vmax.vx         v24, v24, zero
> -        vsetvlstatic8   \len
> +        vsetvl          zero, zero, a5 # e8
>          vnclipu.wi      v30, v24, 0
>          add             a2, a2, a3
>          vse8.v          v30, (a0)
> @@ -251,9 +235,33 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>  endfunc
>  .endm
>
> +.macro epel len, size, type
> +func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> +.ifc \type,v
> +        addi    t0, a6, -1
> +.else
> +        addi    t0, a5, -1
> +.endif
> +.if \len <= 4
> +        li      a5, 0306 # e8, mf4, ta, ma
> +        li      a6, 0317 # e16, mf2, ta, ma
> +.elseif \len <= 8
> +        li      a5, 0307 # e8, mf2, ta, ma
> +        li      a6, 0310 # e16, m1, ta, ma
> +.else # if len <= 16
> +        li      a5, 0300 # e8, m1, ta, ma
> +        li      a6, 0311 # e16, m2, ta, ma
> +.endif
> +        vsetvlstatic8 \len
> +        j       ff_put_vp8_epel_\type\()\size\().rvv
> +endfunc
> +.endm
> +
> +.irp type,h,v
> +.irp size,4,6
> +epel_common \size, \type
>  .irp len,16,8,4
> -epel \len 6 h
> -epel \len 4 h
> -epel \len 6 v
> -epel \len 4 v
> +epel \len, \size, \type
> +.endr
> +.endr
>  .endr
> --
> 2.45.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 25, 2024, 6:29 p.m. UTC | #2
Le lauantaina 25. toukokuuta 2024, 21.16.22 EEST flow gg a écrit :
> Would it be better to replace the two vsetvlstatic8 and vsetvlstatic16 with
> two vsetvl?

The other option is to hard-code the most pessimistic multiplier. That would 
be easier to read and save two instructions in the head, it would most likely 
end up slower overall, due to increased latency from the vector unit in the 
main loop.

On the other hand, with vsetvl, we have the option to adjust the multiplier at 
run-time depending on hardware vector size. That will not be possible with 
vsetvli unless we patch the code live (yikes).

> This would require the previous patch and this one to work
> together,

Yes, patch order matters.

> increasing the number of lines of code

This is reducing code size by over 2 kib of code, or several hundreds of 
instructions.

> Additionally, I have a question about patch 4 'save one R-V GPR' and patch
> 5. Should they be submitted as a single patch? Because patch 4 looks
> similar to what I initially submitted, and you suggested changing it to
> save lines of code. If it is only for patch 5, shouldn't they be combined
> together?

I think people here like to have as small and many patches as possible, as is 
generally considered the right way to use Git. Since patch 4 is a very minor 
but still independent (from patch 5) improvement, it should be separate, as 
far as I understand FFmpeg's practices.
flow gg May 25, 2024, 6:56 p.m. UTC | #3
Well, I'm mainly considering that we have added some vset related lines,
but they haven't played a new role for the time being. If it's for future
modifications, it does make sense.

> This is reducing code size by over 2 kib of code, or several hundreds of
instructions.

The reduction in code size seems to be due to switching to using j labels,
doesn't seem to be about vset, but another issue. j labels are indeed
better. I will make similar modifications.

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月26日周日 02:29写道:

> Le lauantaina 25. toukokuuta 2024, 21.16.22 EEST flow gg a écrit :
> > Would it be better to replace the two vsetvlstatic8 and vsetvlstatic16
> with
> > two vsetvl?
>
> The other option is to hard-code the most pessimistic multiplier. That
> would
> be easier to read and save two instructions in the head, it would most
> likely
> end up slower overall, due to increased latency from the vector unit in
> the
> main loop.
>
> On the other hand, with vsetvl, we have the option to adjust the
> multiplier at
> run-time depending on hardware vector size. That will not be possible with
> vsetvli unless we patch the code live (yikes).
>
> > This would require the previous patch and this one to work
> > together,
>
> Yes, patch order matters.
>
> > increasing the number of lines of code
>
> This is reducing code size by over 2 kib of code, or several hundreds of
> instructions.
>
> > Additionally, I have a question about patch 4 'save one R-V GPR' and
> patch
> > 5. Should they be submitted as a single patch? Because patch 4 looks
> > similar to what I initially submitted, and you suggested changing it to
> > save lines of code. If it is only for patch 5, shouldn't they be combined
> > together?
>
> I think people here like to have as small and many patches as possible, as
> is
> generally considered the right way to use Git. Since patch 4 is a very
> minor
> but still independent (from patch 5) improvement, it should be separate,
> as
> far as I understand FFmpeg's practices.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 25, 2024, 7:15 p.m. UTC | #4
Le lauantaina 25. toukokuuta 2024, 21.56.58 EEST flow gg a écrit :
> The reduction in code size seems to be due to switching to using j labels,
> doesn't seem to be about vset, but another issue. j labels are indeed
> better. I will make similar modifications.

If we don't use vsetvl, then we have to hard-code the worst-case multipler for 
all lengths, which should be slower. That or we can't share the code with 
jumps at all.
diff mbox series

Patch

diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index a4fcd158a5..002e7f3174 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -32,16 +32,6 @@ 
 .endif
 .endm
 
-.macro vsetvlstatic16 len
-.if \len <= 4
-        vsetivli        zero, \len, e16, mf2, ta, ma
-.elseif \len <= 8
-        vsetivli        zero, \len, e16, m1, ta, ma
-.elseif \len <= 16
-        vsetivli        zero, \len, e16, m2, ta, ma
-.endif
-.endm
-
 .macro vp8_idct_dc_add
         vlse32.v      v0, (a0), a2
         lh            a5, 0(a1)
@@ -181,13 +171,8 @@  const subpel_filters
         .byte 0,  -1,  12, 123,  -6, 0
 endconst
 
-.macro epel len size type
-func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
-.ifc \type,v
-        addi            t0, a6, -1
-.else
-        addi            t0, a5, -1
-.endif
+.macro epel_common size, type
+func ff_put_vp8_epel_\type\()\size\().rvv, zve32x
         lla             t2, subpel_filters
         sh1add          t0, t0, t0
         sh1add          t0, t0, t2
@@ -198,7 +183,6 @@  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
         lb              t5, 5(t0)
         lb              t0, (t0)
 .endif
-        vsetvlstatic8   \len
 1:
         addi            a4, a4, -1
 .ifc \type,v
@@ -236,11 +220,11 @@  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
         vwmaccsu.vx     v16, t1, v22
         vwmaccsu.vx     v16, t4, v28
         vwadd.wx        v16, v16, t6
-        vsetvlstatic16  \len
+        vsetvl          zero, zero, a6 # e16
         vwadd.vv        v24, v16, v20
         vnsra.wi        v24, v24, 7
         vmax.vx         v24, v24, zero
-        vsetvlstatic8   \len
+        vsetvl          zero, zero, a5 # e8
         vnclipu.wi      v30, v24, 0
         add             a2, a2, a3
         vse8.v          v30, (a0)
@@ -251,9 +235,33 @@  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
 endfunc
 .endm
 
+.macro epel len, size, type
+func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
+.ifc \type,v
+        addi    t0, a6, -1
+.else
+        addi    t0, a5, -1
+.endif
+.if \len <= 4
+        li      a5, 0306 # e8, mf4, ta, ma
+        li      a6, 0317 # e16, mf2, ta, ma
+.elseif \len <= 8
+        li      a5, 0307 # e8, mf2, ta, ma
+        li      a6, 0310 # e16, m1, ta, ma
+.else # if len <= 16
+        li      a5, 0300 # e8, m1, ta, ma
+        li      a6, 0311 # e16, m2, ta, ma
+.endif
+        vsetvlstatic8 \len
+        j       ff_put_vp8_epel_\type\()\size\().rvv
+endfunc
+.endm
+
+.irp type,h,v
+.irp size,4,6
+epel_common \size, \type
 .irp len,16,8,4
-epel \len 6 h
-epel \len 4 h
-epel \len 6 v
-epel \len 4 v
+epel \len, \size, \type
+.endr
+.endr
 .endr