diff mbox series

[FFmpeg-devel] lavc/rv34dsp: optimise R-V V idct_dc_add

Message ID 20240522202854.15461-1-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel] lavc/rv34dsp: optimise R-V V idct_dc_add | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont May 22, 2024, 8:28 p.m. UTC
This removes one stray LI and reworks the vector arithmetic to avoid
changing the vector configuration. On K230, this takes the cycle
count down from 46.5 to 43.5.
---
 libavcodec/riscv/rv34dsp_rvv.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

Comments

Rémi Denis-Courmont May 22, 2024, 8:34 p.m. UTC | #1
Le 22 mai 2024 23:28:54 GMT+03:00, "Rémi Denis-Courmont" <remi@remlab.net> a écrit :
>This removes one stray LI and reworks the vector arithmetic to avoid
>changing the vector configuration. On K230, this takes the cycle
>count down from 46.5 to 43.5.
>---
> libavcodec/riscv/rv34dsp_rvv.S | 13 ++++++-------
> 1 file changed, 6 insertions(+), 7 deletions(-)
>
>diff --git a/libavcodec/riscv/rv34dsp_rvv.S b/libavcodec/riscv/rv34dsp_rvv.S
>index f1f6345012..e8aff7e570 100644
>--- a/libavcodec/riscv/rv34dsp_rvv.S
>+++ b/libavcodec/riscv/rv34dsp_rvv.S
>@@ -36,16 +36,15 @@ func ff_rv34_idct_dc_add_rvv, zve32x
>         vsetivli      zero, 4, e8, mf4, ta, ma
>         vlse32.v      v0, (a0), a1
>         li            t1, 169
>+        li            t2, 128
>         mul           t1, t1, a2
>-        li            a2, 255
>+        vsetivli      zero, 4*4, e8, m1, ta, ma
>+        vwsubu.vx     v2, v0, t2
>         addi          t1, t1, 512
>         srai          t1, t1, 10
>-        vsetivli      zero, 4*4, e16, m2, ta, ma
>-        vzext.vf2     v2, v0
>-        vadd.vx       v2, v2, t1
>-        vmax.vx       v2, v2, zero
>-        vsetvli       zero, zero, e8, m1, ta, ma
>-        vnclipu.wi    v0, v2, 0
>+        vwadd.wx      v2, v2, t1

Hmm, this should not work: with SEW=8, vwadd.wx only uses the low 8 bits of the scalar operand (sign-extended), and t1 has more than 8 bits. Maybe checkasm is sloppy here.

>+        vnclip.wi     v0, v2, 0
>+        vxor.vx       v0, v0, t2
>         vsetivli      zero, 4, e8, mf4, ta, ma
>         vsse32.v      v0, (a0), a1
>
flow gg May 23, 2024, 3:31 a.m. UTC | #2
Unfortunately I only test to obtain benchmarks and basic correctness. I
always feel the need for a professional to write the tests.

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月23日周四 04:35写道:

>
>
> Le 22 mai 2024 23:28:54 GMT+03:00, "Rémi Denis-Courmont" <remi@remlab.net>
> a écrit :
> >This removes one stray LI and reworks the vector arithmetic to avoid
> >changing the vector configuration. On K230, this takes the cycle
> >count down from 46.5 to 43.5.
> >---
> > libavcodec/riscv/rv34dsp_rvv.S | 13 ++++++-------
> > 1 file changed, 6 insertions(+), 7 deletions(-)
> >
> >diff --git a/libavcodec/riscv/rv34dsp_rvv.S
> b/libavcodec/riscv/rv34dsp_rvv.S
> >index f1f6345012..e8aff7e570 100644
> >--- a/libavcodec/riscv/rv34dsp_rvv.S
> >+++ b/libavcodec/riscv/rv34dsp_rvv.S
> >@@ -36,16 +36,15 @@ func ff_rv34_idct_dc_add_rvv, zve32x
> >         vsetivli      zero, 4, e8, mf4, ta, ma
> >         vlse32.v      v0, (a0), a1
> >         li            t1, 169
> >+        li            t2, 128
> >         mul           t1, t1, a2
> >-        li            a2, 255
> >+        vsetivli      zero, 4*4, e8, m1, ta, ma
> >+        vwsubu.vx     v2, v0, t2
> >         addi          t1, t1, 512
> >         srai          t1, t1, 10
> >-        vsetivli      zero, 4*4, e16, m2, ta, ma
> >-        vzext.vf2     v2, v0
> >-        vadd.vx       v2, v2, t1
> >-        vmax.vx       v2, v2, zero
> >-        vsetvli       zero, zero, e8, m1, ta, ma
> >-        vnclipu.wi    v0, v2, 0
> >+        vwadd.wx      v2, v2, t1
>
> Hmm, this should not work, as t1 has more than 8 bits. Maybe checkasm is
> sloppy here.
>
> >+        vnclip.wi     v0, v2, 0
> >+        vxor.vx       v0, v0, t2
> >         vsetivli      zero, 4, e8, mf4, ta, ma
> >         vsse32.v      v0, (a0), a1
> >
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 23, 2024, 8:33 a.m. UTC | #3
Le 23 mai 2024 06:31:52 GMT+03:00, flow gg <hlefthleft@gmail.com> a écrit :
>Unfortunately I only test to obtain benchmarks and basic correctness. I
>always feel the need for a professional to write the tests.

Everybody agrees that tests should be written alongside the corresponding DSP C reference functions, but we can't go back in time and make it happen for existing code.

In my experience, nobody funds audio and old video codec optimisations, so the chances that checkasm tests would be written professionally are unfortunately slim. Likewise getting rid of MMX legacy optimisations.

For RVV, I am aware of funding for H.264 only. I hope that H.265 could get funded afterwards but that's just my hope. H.266 and VP9 won't be funded since you're already working on them, and everything else is probably too old to get attention. So colour me pessimistic for getting funding there :-(

</Rant>

FWIW, beware that people will tend to assume that you are a professional if you assign copyright to a legal entity, even if there is another nonobvious reason why you do that.

>Rémi Denis-Courmont <remi@remlab.net> 于2024年5月23日周四 04:35写道:
>
>>
>>
>> Le 22 mai 2024 23:28:54 GMT+03:00, "Rémi Denis-Courmont" <remi@remlab.net>
>> a écrit :
>> >This removes one stray LI and reworks the vector arithmetic to avoid
>> >changing the vector configuration. On K230, this takes the cycle
>> >count down from 46.5 to 43.5.
>> >---
>> > libavcodec/riscv/rv34dsp_rvv.S | 13 ++++++-------
>> > 1 file changed, 6 insertions(+), 7 deletions(-)
>> >
>> >diff --git a/libavcodec/riscv/rv34dsp_rvv.S
>> b/libavcodec/riscv/rv34dsp_rvv.S
>> >index f1f6345012..e8aff7e570 100644
>> >--- a/libavcodec/riscv/rv34dsp_rvv.S
>> >+++ b/libavcodec/riscv/rv34dsp_rvv.S
>> >@@ -36,16 +36,15 @@ func ff_rv34_idct_dc_add_rvv, zve32x
>> >         vsetivli      zero, 4, e8, mf4, ta, ma
>> >         vlse32.v      v0, (a0), a1
>> >         li            t1, 169
>> >+        li            t2, 128
>> >         mul           t1, t1, a2
>> >-        li            a2, 255
>> >+        vsetivli      zero, 4*4, e8, m1, ta, ma
>> >+        vwsubu.vx     v2, v0, t2
>> >         addi          t1, t1, 512
>> >         srai          t1, t1, 10
>> >-        vsetivli      zero, 4*4, e16, m2, ta, ma
>> >-        vzext.vf2     v2, v0
>> >-        vadd.vx       v2, v2, t1
>> >-        vmax.vx       v2, v2, zero
>> >-        vsetvli       zero, zero, e8, m1, ta, ma
>> >-        vnclipu.wi    v0, v2, 0
>> >+        vwadd.wx      v2, v2, t1
>>
>> Hmm, this should not work, as t1 has more than 8 bits. Maybe checkasm is
>> sloppy here.
>>
>> >+        vnclip.wi     v0, v2, 0
>> >+        vxor.vx       v0, v0, t2
>> >         vsetivli      zero, 4, e8, mf4, ta, ma
>> >         vsse32.v      v0, (a0), a1
>> >
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libavcodec/riscv/rv34dsp_rvv.S b/libavcodec/riscv/rv34dsp_rvv.S
index f1f6345012..e8aff7e570 100644
--- a/libavcodec/riscv/rv34dsp_rvv.S
+++ b/libavcodec/riscv/rv34dsp_rvv.S
@@ -36,16 +36,15 @@  func ff_rv34_idct_dc_add_rvv, zve32x
         vsetivli      zero, 4, e8, mf4, ta, ma
         vlse32.v      v0, (a0), a1
         li            t1, 169
+        li            t2, 128
         mul           t1, t1, a2
-        li            a2, 255
+        vsetivli      zero, 4*4, e8, m1, ta, ma
+        vwsubu.vx     v2, v0, t2
         addi          t1, t1, 512
         srai          t1, t1, 10
-        vsetivli      zero, 4*4, e16, m2, ta, ma
-        vzext.vf2     v2, v0
-        vadd.vx       v2, v2, t1
-        vmax.vx       v2, v2, zero
-        vsetvli       zero, zero, e8, m1, ta, ma
-        vnclipu.wi    v0, v2, 0
+        vwadd.wx      v2, v2, t1
+        vnclip.wi     v0, v2, 0
+        vxor.vx       v0, v0, t2
         vsetivli      zero, 4, e8, mf4, ta, ma
         vsse32.v      v0, (a0), a1