Message ID | 20240513164328.21569-1-remi@remlab.net |
---|---|
State | Accepted |
Commit | 7591eb4055dfd2db925198cf61ec6a103b222725 |
Headers | show |
Series | [FFmpeg-devel] Revert "lavc/sbrdsp: R-V V neg_odd_64" | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On 13/05/2024 18:43, Rémi Denis-Courmont wrote: > While this function can easily be written with vectors, it just fails to > get any performance improvement. > > For reference, this is a simpler loop-free implementation that does get > better performance than the current one depending on hardware, but still > more or less the same metrics as the C code: > > func ff_sbr_neg_odd_64_rvv, zve64x > li a1, 32 > addi a0, a0, 7 > li t0, 8 > vsetvli zero, a1, e8, m2, ta, ma > li t1, 0x80 > vlse8.v v8, (a0), t0 > vxor.vx v8, v8, t1 > vsse8.v v8, (a0), t0 > ret > endfunc > > This reverts commit d06fd18f8f4c6a81ef94cbb600620d83ad51269d. > --- > libavcodec/riscv/sbrdsp_init.c | 5 ----- > libavcodec/riscv/sbrdsp_rvv.S | 17 ----------------- > 2 files changed, 22 deletions(-) > > diff --git a/libavcodec/riscv/sbrdsp_init.c b/libavcodec/riscv/sbrdsp_init.c > index f937c47e22..d3bafa961e 100644 > --- a/libavcodec/riscv/sbrdsp_init.c > +++ b/libavcodec/riscv/sbrdsp_init.c > @@ -26,7 +26,6 @@ > > void ff_sbr_sum64x5_rvv(float *z); > float ff_sbr_sum_square_rvv(float (*x)[2], int n); > -void ff_sbr_neg_odd_64_rvv(float *x); > void ff_sbr_autocorrelate_rvv(const float x[40][2], float phi[3][2][2]); > void ff_sbr_hf_gen_rvv(float (*X_high)[2], const float (*X_low)[2], > const float alpha0[2], const float alpha1[2], > @@ -64,9 +63,5 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c) > } > c->autocorrelate = ff_sbr_autocorrelate_rvv; > } > -#if __riscv_xlen >= 64 > - if ((flags & AV_CPU_FLAG_RVV_I64) && (flags & AV_CPU_FLAG_RVB_ADDR)) > - c->neg_odd_64 = ff_sbr_neg_odd_64_rvv; > -#endif > #endif > } > diff --git a/libavcodec/riscv/sbrdsp_rvv.S b/libavcodec/riscv/sbrdsp_rvv.S > index 918c37882f..aba9a28108 100644 > --- a/libavcodec/riscv/sbrdsp_rvv.S > +++ b/libavcodec/riscv/sbrdsp_rvv.S > @@ -68,23 +68,6 @@ NOHWF fmv.x.w a0, fa0 > ret > endfunc > > -#if __riscv_xlen >= 64 > -func ff_sbr_neg_odd_64_rvv, zve64x > - li a1, 32 > - li t1, 1 << 63 > -1: > - vsetvli t0, a1, e64, m8, ta, ma > 
- vle64.v v8, (a0) > - sub a1, a1, t0 > - vxor.vx v8, v8, t1 > - vse64.v v8, (a0) > - sh3add a0, t0, a0 > - bnez t0, 1b > - > - ret > -endfunc > -#endif > - > func ff_sbr_autocorrelate_rvv, zve32f > vsetvli t0, zero, e32, m4, ta, ma > vmv.v.x v0, zero Do you think a 256-bit implementation would be able to overcome the overhead and end up being faster?
Le 13 mai 2024 20:06:34 GMT+03:00, Lynne via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> a écrit : >On 13/05/2024 18:43, Rémi Denis-Courmont wrote: >> While this function can easily be written with vectors, it just fails to >> get any performance improvement. >> >> For reference, this is a simpler loop-free implementation that does get >> better performance than the current one depending on hardware, but still >> more or less the same metrics as the C code: >> >> func ff_sbr_neg_odd_64_rvv, zve64x >> li a1, 32 >> addi a0, a0, 7 >> li t0, 8 >> vsetvli zero, a1, e8, m2, ta, ma >> li t1, 0x80 >> vlse8.v v8, (a0), t0 >> vxor.vx v8, v8, t1 >> vsse8.v v8, (a0), t0 >> ret >> endfunc >> >> This reverts commit d06fd18f8f4c6a81ef94cbb600620d83ad51269d. >> --- >> libavcodec/riscv/sbrdsp_init.c | 5 ----- >> libavcodec/riscv/sbrdsp_rvv.S | 17 ----------------- >> 2 files changed, 22 deletions(-) >> >> diff --git a/libavcodec/riscv/sbrdsp_init.c b/libavcodec/riscv/sbrdsp_init.c >> index f937c47e22..d3bafa961e 100644 >> --- a/libavcodec/riscv/sbrdsp_init.c >> +++ b/libavcodec/riscv/sbrdsp_init.c >> @@ -26,7 +26,6 @@ >> void ff_sbr_sum64x5_rvv(float *z); >> float ff_sbr_sum_square_rvv(float (*x)[2], int n); >> -void ff_sbr_neg_odd_64_rvv(float *x); >> void ff_sbr_autocorrelate_rvv(const float x[40][2], float phi[3][2][2]); >> void ff_sbr_hf_gen_rvv(float (*X_high)[2], const float (*X_low)[2], >> const float alpha0[2], const float alpha1[2], >> @@ -64,9 +63,5 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c) >> } >> c->autocorrelate = ff_sbr_autocorrelate_rvv; >> } >> -#if __riscv_xlen >= 64 >> - if ((flags & AV_CPU_FLAG_RVV_I64) && (flags & AV_CPU_FLAG_RVB_ADDR)) >> - c->neg_odd_64 = ff_sbr_neg_odd_64_rvv; >> -#endif >> #endif >> } >> diff --git a/libavcodec/riscv/sbrdsp_rvv.S b/libavcodec/riscv/sbrdsp_rvv.S >> index 918c37882f..aba9a28108 100644 >> --- a/libavcodec/riscv/sbrdsp_rvv.S >> +++ b/libavcodec/riscv/sbrdsp_rvv.S >> @@ -68,23 +68,6 @@ NOHWF fmv.x.w a0, fa0 >> ret >> 
endfunc >> -#if __riscv_xlen >= 64 >> -func ff_sbr_neg_odd_64_rvv, zve64x >> - li a1, 32 >> - li t1, 1 << 63 >> -1: >> - vsetvli t0, a1, e64, m8, ta, ma >> - vle64.v v8, (a0) >> - sub a1, a1, t0 >> - vxor.vx v8, v8, t1 >> - vse64.v v8, (a0) >> - sh3add a0, t0, a0 >> - bnez t0, 1b >> - >> - ret >> -endfunc >> -#endif >> - >> func ff_sbr_autocorrelate_rvv, zve32f >> vsetvli t0, zero, e32, m4, ta, ma >> vmv.v.x v0, zero > >Do you think a 256-bit implementation would be able to overcome the overhead and end up being faster? Based on Sunyuechi's tests, it gets worse with larger vectors. I guess the C code is just bound by memory bandwidth, since the "calculations" are so trivial, and thus we can't really beat C here.
diff --git a/libavcodec/riscv/sbrdsp_init.c b/libavcodec/riscv/sbrdsp_init.c index f937c47e22..d3bafa961e 100644 --- a/libavcodec/riscv/sbrdsp_init.c +++ b/libavcodec/riscv/sbrdsp_init.c @@ -26,7 +26,6 @@ void ff_sbr_sum64x5_rvv(float *z); float ff_sbr_sum_square_rvv(float (*x)[2], int n); -void ff_sbr_neg_odd_64_rvv(float *x); void ff_sbr_autocorrelate_rvv(const float x[40][2], float phi[3][2][2]); void ff_sbr_hf_gen_rvv(float (*X_high)[2], const float (*X_low)[2], const float alpha0[2], const float alpha1[2], @@ -64,9 +63,5 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c) } c->autocorrelate = ff_sbr_autocorrelate_rvv; } -#if __riscv_xlen >= 64 - if ((flags & AV_CPU_FLAG_RVV_I64) && (flags & AV_CPU_FLAG_RVB_ADDR)) - c->neg_odd_64 = ff_sbr_neg_odd_64_rvv; -#endif #endif } diff --git a/libavcodec/riscv/sbrdsp_rvv.S b/libavcodec/riscv/sbrdsp_rvv.S index 918c37882f..aba9a28108 100644 --- a/libavcodec/riscv/sbrdsp_rvv.S +++ b/libavcodec/riscv/sbrdsp_rvv.S @@ -68,23 +68,6 @@ NOHWF fmv.x.w a0, fa0 ret endfunc -#if __riscv_xlen >= 64 -func ff_sbr_neg_odd_64_rvv, zve64x - li a1, 32 - li t1, 1 << 63 -1: - vsetvli t0, a1, e64, m8, ta, ma - vle64.v v8, (a0) - sub a1, a1, t0 - vxor.vx v8, v8, t1 - vse64.v v8, (a0) - sh3add a0, t0, a0 - bnez t0, 1b - - ret -endfunc -#endif - func ff_sbr_autocorrelate_rvv, zve32f vsetvli t0, zero, e32, m4, ta, ma vmv.v.x v0, zero