diff mbox series

[FFmpeg-devel,PATCHv5,1/4] lavc/h263dsp: add DCT dequantisation functions

Message ID 20240612044723.175502-1-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,PATCHv5,1/4] lavc/h263dsp: add DCT dequantisation functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont June 12, 2024, 4:47 a.m. UTC
Note that optimised implementations of these functions will be taken
into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
are *not* overloaded by existing optimisations.

---
This adds the plus ones back, saving two branch instructions in C and
one in assembler (at the cost of two unconditional adds).

---
 libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++
 libavcodec/h263dsp.h |  4 ++++
 2 files changed, 30 insertions(+)

Comments

James Almer June 12, 2024, 5:40 p.m. UTC | #1
On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
> Note that optimised implementations of these functions will be taken
> into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> are *not* overloaded by existing optimisations.
> 
> ---
> This adds the plus ones back, saving two branch instructions in C and
> one in assembler (at the cost of two unconditional adds).

See my reply in the previous version. Not sure if it will help with this.

> 
> ---
>   libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++
>   libavcodec/h263dsp.h |  4 ++++
>   2 files changed, 30 insertions(+)
> 
> diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
> index 6a13353499..f4523a68c1 100644
> --- a/libavcodec/h263dsp.c
> +++ b/libavcodec/h263dsp.c
> @@ -19,10 +19,34 @@
>   #include <stdint.h>
>   
>   #include "libavutil/attributes.h"
> +#include "libavutil/avassert.h"
>   #include "libavutil/common.h"
>   #include "config.h"
>   #include "h263dsp.h"
>   
> +static void h263_dct_unquantize_inter_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    for (size_t i = 0; i < len; i++) {
> +        int level = block[i];
> +
> +        if (level) {
> +            if (level < 0)
> +                level = level * qmul - qadd;
> +            else
> +                level = level * qmul + qadd;
> +            block[i] = level;
> +        }
> +    }
> +}
> +
> +static void h263_dct_unquantize_intra_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    av_assert1(len >= 1);
> +    h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd);
> +}
> +
>   const uint8_t ff_h263_loop_filter_strength[32] = {
>       0, 1, 1, 2, 2, 3, 3,  4,  4,  4,  5,  5,  6,  6,  7, 7,
>       7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
> @@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
>   
>   av_cold void ff_h263dsp_init(H263DSPContext *ctx)
>   {
> +    ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c;
> +    ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c;
>       ctx->h263_h_loop_filter = h263_h_loop_filter_c;
>       ctx->h263_v_loop_filter = h263_v_loop_filter_c;
>   
> diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
> index 2dccd23392..0ecbe83314 100644
> --- a/libavcodec/h263dsp.h
> +++ b/libavcodec/h263dsp.h
> @@ -24,6 +24,10 @@
>   extern const uint8_t ff_h263_loop_filter_strength[32];
>   
>   typedef struct H263DSPContext {
> +    void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
> +    void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
>       void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
>       void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
>   } H263DSPContext;
Rémi Denis-Courmont June 12, 2024, 6:10 p.m. UTC | #2
Le keskiviikkona 12. kesäkuuta 2024, 20.40.37 EEST James Almer a écrit :
> On 6/12/2024 1:47 AM, Rémi Denis-Courmont wrote:
> > Note that optimised implementations of these functions will be taken
> > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> > are *not* overloaded by existing optimisations.
> > 
> > ---
> > This adds the plus ones back, saving two branch instructions in C and
> > one in assembler (at the cost of two unconditional adds).
> 
> See my reply in the previous version. Not sure if it will help with this.

We can of course avoid the branches - this version avoids the branches, as did 
the initial versions. In C (and in RVV), we can't avoid incrementing the 
pointer and a counter variable.

If you change the loop like yuo suggest:
     for (size_t i = 1; i <= nCoeffs; i++) { 
         int level = block[i];

         if (level) {
             if (level < 0)
                 level = level * qmul - qadd;
             else
                 level = level * qmul + qadd;
             block[i] = level;
         }
    }

... at best, an optimising compiler will reinterpret it to:

     if (nCoeffs >= 1) {
         block++;
         end = block + nCoeffs;

      loop:
            level = *block;
            if (level) {
                 tmp = level * qmul;
                 if (level < 0)
                     tmp -= qadd;
                 else
                     tmp += qadd;
                 *(block++) = tmp;
             }
            if (block <= end)
                goto loop;
    }

Or perhaps the compiler will keep an explicit counter, which is even worse. 
This does not save branches, nor increments. It just looks like it because of 
the syntactic sugar that is the for() loop. In reality, this only duplicates 
code (as we can no longer share between inter/intra).
Andreas Rheinhardt June 13, 2024, 2:23 a.m. UTC | #3
Rémi Denis-Courmont:
> Note that optimised implementations of these functions will be taken
> into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> are *not* overloaded by existing optimisations.
> 
> ---
> This adds the plus ones back, saving two branch instructions in C and
> one in assembler (at the cost of two unconditional adds).
> 

I don't see how this saves branch instructions: len - 1 is allowed to be
0 in h263_dct_unquantize_intra and therefore the initial check can't be
avoided (presuming the compiler produces an initial check+do-loop). It
seems to me that your intra assembly simply ignores this.

If you were to check for the case of nCoeffs == 0 in
dct_unquantize_h263_intra_c (before the call!), you could write assembly
with this restriction in mind. It would also allow the compiler to avoid
the branch in case it is known that nCoeffs == 63.

> ---
>  libavcodec/h263dsp.c | 26 ++++++++++++++++++++++++++
>  libavcodec/h263dsp.h |  4 ++++
>  2 files changed, 30 insertions(+)
> 
> diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
> index 6a13353499..f4523a68c1 100644
> --- a/libavcodec/h263dsp.c
> +++ b/libavcodec/h263dsp.c
> @@ -19,10 +19,34 @@
>  #include <stdint.h>
>  
>  #include "libavutil/attributes.h"
> +#include "libavutil/avassert.h"
>  #include "libavutil/common.h"
>  #include "config.h"
>  #include "h263dsp.h"
>  
> +static void h263_dct_unquantize_inter_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    for (size_t i = 0; i < len; i++) {
> +        int level = block[i];
> +
> +        if (level) {
> +            if (level < 0)
> +                level = level * qmul - qadd;
> +            else
> +                level = level * qmul + qadd;
> +            block[i] = level;
> +        }
> +    }
> +}
> +
> +static void h263_dct_unquantize_intra_c(int16_t *block, size_t len,
> +                                        int qmul, int qadd)
> +{
> +    av_assert1(len >= 1);
> +    h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd);
> +}
> +
>  const uint8_t ff_h263_loop_filter_strength[32] = {
>      0, 1, 1, 2, 2, 3, 3,  4,  4,  4,  5,  5,  6,  6,  7, 7,
>      7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
> @@ -116,6 +140,8 @@ static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
>  
>  av_cold void ff_h263dsp_init(H263DSPContext *ctx)
>  {
> +    ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c;
> +    ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c;
>      ctx->h263_h_loop_filter = h263_h_loop_filter_c;
>      ctx->h263_v_loop_filter = h263_v_loop_filter_c;
>  
> diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
> index 2dccd23392..0ecbe83314 100644
> --- a/libavcodec/h263dsp.h
> +++ b/libavcodec/h263dsp.h
> @@ -24,6 +24,10 @@
>  extern const uint8_t ff_h263_loop_filter_strength[32];
>  
>  typedef struct H263DSPContext {
> +    void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
> +    void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */,
> +                                      size_t len, int mul, int add);
>      void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
>      void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
>  } H263DSPContext;
Rémi Denis-Courmont June 13, 2024, 7:22 p.m. UTC | #4
Le torstaina 13. kesäkuuta 2024, 5.23.12 EEST Andreas Rheinhardt a écrit :
> Rémi Denis-Courmont:
> > Note that optimised implementations of these functions will be taken
> > into actual use only if MpegEncContext.dct_unquantize_h263_{inter,intra}
> > are *not* overloaded by existing optimisations.
> > 
> > ---
> > This adds the plus ones back, saving two branch instructions in C and
> > one in assembler (at the cost of two unconditional adds).
> 
> I don't see how this saves branch instructions:

You can compare versions 4 and 5. The C had the extra if() that you complained 
about and the assembler had an explicit extra branch.

That being said, if someone wants to microoptimise the C version of DSP 
function, there are much greater and much less debatable savings to be had 
elsewhere - for instance adding missing restrict keywords (I don't mean in 
this particular case).

The point of this patchset is not to optimise the C code, rather to enable 
checkasm and avoid copying the same scalar prologues in RVV and SSE2.

> It seems to me that your intra assembly simply ignores [that] len -1 is
> allowed to be 0.

> If you were to check for the case of nCoeffs == 0 in
> dct_unquantize_h263_intra_c (before the call!), you could write assembly
> with this restriction in mind.

If it really is a common case worth optimising for, then indeed it could be 
checked in the common C calling code. But that is not a call that I can 
credibly make, and is outside the scope of this MR. If someone has data to 
back that optimisation, they are welcome to submit it as a separate patch.

The assembler works fine with 0 length. The only assumption is that the length 
is the actual and unsigned length.

> It would also allow the compiler to avoid
> the branch in case it is known that nCoeffs == 63.
diff mbox series

Patch

diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
index 6a13353499..f4523a68c1 100644
--- a/libavcodec/h263dsp.c
+++ b/libavcodec/h263dsp.c
@@ -19,10 +19,34 @@ 
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "config.h"
 #include "h263dsp.h"
 
+static void h263_dct_unquantize_inter_c(int16_t *block, size_t len,
+                                        int qmul, int qadd)
+{
+    for (size_t i = 0; i < len; i++) {
+        int level = block[i];
+
+        if (level) {
+            if (level < 0)
+                level = level * qmul - qadd;
+            else
+                level = level * qmul + qadd;
+            block[i] = level;
+        }
+    }
+}
+
+static void h263_dct_unquantize_intra_c(int16_t *block, size_t len,
+                                        int qmul, int qadd)
+{
+    av_assert1(len >= 1);
+    h263_dct_unquantize_inter_c(block + 1, len - 1, qmul, qadd);
+}
+
 const uint8_t ff_h263_loop_filter_strength[32] = {
     0, 1, 1, 2, 2, 3, 3,  4,  4,  4,  5,  5,  6,  6,  7, 7,
     7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
@@ -116,6 +140,8 @@  static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
 
 av_cold void ff_h263dsp_init(H263DSPContext *ctx)
 {
+    ctx->h263_dct_unquantize_intra = h263_dct_unquantize_intra_c;
+    ctx->h263_dct_unquantize_inter = h263_dct_unquantize_inter_c;
     ctx->h263_h_loop_filter = h263_h_loop_filter_c;
     ctx->h263_v_loop_filter = h263_v_loop_filter_c;
 
diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
index 2dccd23392..0ecbe83314 100644
--- a/libavcodec/h263dsp.h
+++ b/libavcodec/h263dsp.h
@@ -24,6 +24,10 @@ 
 extern const uint8_t ff_h263_loop_filter_strength[32];
 
 typedef struct H263DSPContext {
+    void (*h263_dct_unquantize_intra)(int16_t *block /* align 16 */,
+                                      size_t len, int mul, int add);
+    void (*h263_dct_unquantize_inter)(int16_t *block /* align 16 */,
+                                      size_t len, int mul, int add);
     void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
     void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
 } H263DSPContext;