Message ID | 20240612044723.175502-1-remi@remlab.net |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,PATCHv5,1/4] lavc/h263dsp: add DCT dequantisation functions | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote: > Note that optimised implementations of these functions will be taken > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra} > are *not* overloaded by existing optimisations. > > --- > This adds the plus ones back, saving two branch instructions in C and > one in assembler (at the cost of two unconditional adds). See my reply in the previous version. Not sure if it will help with this. > > --- > libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++ > libavcodec/h263dsp.h | 4 ++++ > 2 files changed, 30 insertions(+) > > diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c > index 6a13353499..f4523a68c1 100644 > --- a/libavcodec/h263dsp.c > +++ b/libavcodec/h263dsp.c > @@ -19,10 +19,34 @@ > #include <stdint.h> > > #include "libavutil/attributes.h" > +#include "libavutil/avassert.h" > #include "libavutil/common.h" > #include "config.h" > #include "h263dsp.h" > > +static void h263_dct_unquantize_inter_c(int16_t *block, size_t len, > + int qmul, int qadd) > +{ > + for (size_t i = 0; i < len; i++) { > + int level = block[i]; > + > + if (level) { > + if (level < 0) > + level = level * qmul - qadd; > + else > + level = level * qmul + qadd; > + block[i] = level; > + } > + } > +} > + > +static void h263_dct_unquantize_intra_c(int16_t *block, size_t len, > + int qmul, int qadd) > +{ > + av_assert1(len >= 1); > + h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd); > +} > + > const uint8_t ff_h263_loop_filter_strength[32] = { > 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7, > 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12 > @@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale) > > av_cold void ff_h263dsp_init(H263DSPContext *ctx) > { > + ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c; > + ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c; > ctx->h263_h_loop_filter = h263_h_loop_filter_c; > ctx->h263_v_loop_filter = h263_v_loop_filter_c; > > diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h > index 2dccd23392..0ecbe83314 100644 > --- a/libavcodec/h263dsp.h > +++ b/libavcodec/h263dsp.h > @@ -24,6 +24,10 @@ > extern const uint8_t ff_h263_loop_filter_strength[32]; > > typedef struct H263DSPContext { > + void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */, > + size_t len, int mul, int add); > + void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */, > + size_t len, int mul, int add); > void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale); > void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale); > } H263DSPContext;
Le keskiviikkona 12. kesäkuuta 2024, 20.40.37 EEST James Almer a écrit : > On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote: > > Note that optimised implementations of these functions will be taken > > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra} > > are *not* overloaded by existing optimisations. > > > > --- > > This adds the plus ones back, saving two branch instructions in C and > > one in assembler (at the cost of two unconditional adds). > > See my reply in the previous version. Not sure if it will help with this. We can of course avoid the branches - this version avoids the branches, as did the initial versions. In C (and in RVV), we can't avoid incrementing the pointer and a counter variable. If you change the loop like yuo suggest: for (size_t i = 1; i <= nCoeffs; i++) { int level = block[i]; if (level) { if (level < 0) level = level * qmul - qadd; else level = level * qmul + qadd; block[i] = level; } } ... at best, an optimising compiler will reinterpret it to: if (nCoeffs >= 1) { block++; end = block + nCoeffs; loop: level = *block; if (level) { tmp = level * qmul; if (level < 0) tmp -= qadd; else tmp += qadd; *(block++) = tmp; } if (block <= end) goto loop; } Or perhaps the compiler will keep an explicit counter, which is even worse. This does not save branches, nor increments. It just looks like it because of the syntactic sugar that is the for() loop. In reality, this only duplicates code (as we can no longer share between inter/intra).
Rémi Denis-Courmont: > Note that optimised implementations of these functions will be taken > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra} > are *not* overloaded by existing optimisations. > > --- > This adds the plus ones back, saving two branch instructions in C and > one in assembler (at the cost of two unconditional adds). > I don't see how this saves branch instructions: len - 1 is allowed to be 0 in h263_dct_unquantize_intra and therefore the initial check can't be avoided (presuming the compiler produces an initial check+do-loop). It seems to me that your intra assembly simply ignores this. If you were to check for the case of nCoeffs == 0 in dct_unquantize_h263_intra_c (before the call!), you could write assembly with this restriction in mind. It would also allow the compiler to avoid the branch in case it is known that nCoeffs == 63. > --- > libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++ > libavcodec/h263dsp.h | 4 ++++ > 2 files changed, 30 insertions(+) > > diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c > index 6a13353499..f4523a68c1 100644 > --- a/libavcodec/h263dsp.c > +++ b/libavcodec/h263dsp.c > @@ -19,10 +19,34 @@ > #include <stdint.h> > > #include "libavutil/attributes.h" > +#include "libavutil/avassert.h" > #include "libavutil/common.h" > #include "config.h" > #include "h263dsp.h" > > +static void h263_dct_unquantize_inter_c(int16_t *block, size_t len, > + int qmul, int qadd) > +{ > + for (size_t i = 0; i < len; i++) { > + int level = block[i]; > + > + if (level) { > + if (level < 0) > + level = level * qmul - qadd; > + else > + level = level * qmul + qadd; > + block[i] = level; > + } > + } > +} > + > +static void h263_dct_unquantize_intra_c(int16_t *block, size_t len, > + int qmul, int qadd) > +{ > + av_assert1(len >= 1); > + h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd); > +} > + > const uint8_t ff_h263_loop_filter_strength[32] = { > 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7, > 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12 > @@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale) > > av_cold void ff_h263dsp_init(H263DSPContext *ctx) > { > + ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c; > + ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c; > ctx->h263_h_loop_filter = h263_h_loop_filter_c; > ctx->h263_v_loop_filter = h263_v_loop_filter_c; > > diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h > index 2dccd23392..0ecbe83314 100644 > --- a/libavcodec/h263dsp.h > +++ b/libavcodec/h263dsp.h > @@ -24,6 +24,10 @@ > extern const uint8_t ff_h263_loop_filter_strength[32]; > > typedef struct H263DSPContext { > + void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */, > + size_t len, int mul, int add); > + void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */, > + size_t len, int mul, int add); > void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale); > void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale); > } H263DSPContext;
Le torstaina 13. kesäkuuta 2024, 5.23.12 EEST Andreas Rheinhardt a écrit : > Rémi Denis-Courmont: > > Note that optimised implementations of these functions will be taken > > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra} > > are *not* overloaded by existing optimisations. > > > > --- > > This adds the plus ones back, saving two branch instructions in C and > > one in assembler (at the cost of two unconditional adds). > > I don't see how this saves branch instructions: You can compare versions 4 and 5. The C had the extra if() that you complained about and the assembler had an explicit extra branch. That being said, if someone wants to microoptimise the C version of DSP function, there are much greater and much less debatable savings to be had elsewhere - for instance adding missing restrict keywords (I don't mean in this particular case). The point of this patchset is not to optimise the C code, rather to enable checkasm and avoid copying the same scalar prologues in RVV and SSE2. > It seems to me that your intra assembly simply ignores [that] len -1 is > allowed to be 0. > If you were to check for the case of nCoeffs == 0 in > dct_unquantize_h263_intra_c (before the call!), you could write assembly > with this restriction in mind. If it really is a common case worth optimising for, then indeed it could be checked in the common C calling code. But that is not a call that I can credibly make, and is outside the scope of this MR. If someone has data to back that optimisation, they are welcome to submit it as a separate patch. The assembler works fine with 0 length. The only assumption is that the length is the actual and unsigned length. > It would also allow the compiler to avoid > the branch in case it is known that nCoeffs == 63.
diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c index 6a13353499..f4523a68c1 100644 --- a/libavcodec/h263dsp.c +++ b/libavcodec/h263dsp.c @@ -19,10 +19,34 @@ #include <stdint.h> #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/common.h" #include "config.h" #include "h263dsp.h" +static void h263_dct_unquantize_inter_c(int16_t *block, size_t len, + int qmul, int qadd) +{ + for (size_t i = 0; i < len; i++) { + int level = block[i]; + + if (level) { + if (level < 0) + level = level * qmul - qadd; + else + level = level * qmul + qadd; + block[i] = level; + } + } +} + +static void h263_dct_unquantize_intra_c(int16_t *block, size_t len, + int qmul, int qadd) +{ + av_assert1(len >= 1); + h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd); +} + const uint8_t ff_h263_loop_filter_strength[32] = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12 @@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale) av_cold void ff_h263dsp_init(H263DSPContext *ctx) { + ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c; + ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c; ctx->h263_h_loop_filter = h263_h_loop_filter_c; ctx->h263_v_loop_filter = h263_v_loop_filter_c; diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h index 2dccd23392..0ecbe83314 100644 --- a/libavcodec/h263dsp.h +++ b/libavcodec/h263dsp.h @@ -24,6 +24,10 @@ extern const uint8_t ff_h263_loop_filter_strength[32]; typedef struct H263DSPContext { + void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */, + size_t len, int mul, int add); + void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */, + size_t len, int mul, int add); void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale); void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale); } H263DSPContext;