diff mbox series

[FFmpeg-devel,v2,4/4] vvcdec: reuse h26x/h2656_deblock_template.c

Message ID TYSPR06MB64330000902A81F27C1F7537AA652@TYSPR06MB6433.apcprd06.prod.outlook.com
State Accepted
Commit 8d0dda8260e67996efbc96d3148ce7238e42df60
Headers show
Series [FFmpeg-devel,v2,1/4] hevcdec: move sao template to h26x/h2656_sao_template.c | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Nuo Mi Jan. 6, 2024, 10:51 a.m. UTC
---
 libavcodec/vvc/vvc_filter_template.c | 82 +---------------------------
 1 file changed, 3 insertions(+), 79 deletions(-)

Comments

Nuo Mi Jan. 9, 2024, 1:02 a.m. UTC | #1
On Sat, Jan 6, 2024 at 6:52 PM Nuo Mi <nuomi2021@gmail.com> wrote:

> ---
>  libavcodec/vvc/vvc_filter_template.c | 82 +---------------------------
>  1 file changed, 3 insertions(+), 79 deletions(-)
>
> diff --git a/libavcodec/vvc/vvc_filter_template.c
> b/libavcodec/vvc/vvc_filter_template.c
> index 9418980c33..671ed7de4e 100644
> --- a/libavcodec/vvc/vvc_filter_template.c
> +++ b/libavcodec/vvc/vvc_filter_template.c
> @@ -461,6 +461,8 @@ static void FUNC(alf_recon_coeff_and_clip)(int16_t
> *coeff, int16_t *clip,
>  #define FQ2 pix[2  * xstride + 1 * ystride]
>  #define FQ3 pix[3  * xstride + 1 * ystride]
>
> +#include "libavcodec/h26x/h2656_deblock_template.c"
>
Will merge this in two days if there are no objections to the file and
directory names.
Thank you
Ronald S. Bultje Jan. 9, 2024, 1:22 a.m. UTC | #2
Hi,

On Mon, Jan 8, 2024 at 8:03 PM Nuo Mi <nuomi2021@gmail.com> wrote:

> On Sat, Jan 6, 2024 at 6:52 PM Nuo Mi <nuomi2021@gmail.com> wrote:
>
> > ---
> >  libavcodec/vvc/vvc_filter_template.c | 82 +---------------------------
> >  1 file changed, 3 insertions(+), 79 deletions(-)
> >
> > diff --git a/libavcodec/vvc/vvc_filter_template.c
> > b/libavcodec/vvc/vvc_filter_template.c
> > index 9418980c33..671ed7de4e 100644
> > --- a/libavcodec/vvc/vvc_filter_template.c
> > +++ b/libavcodec/vvc/vvc_filter_template.c
> > @@ -461,6 +461,8 @@ static void FUNC(alf_recon_coeff_and_clip)(int16_t
> > *coeff, int16_t *clip,
> >  #define FQ2 pix[2  * xstride + 1 * ystride]
> >  #define FQ3 pix[3  * xstride + 1 * ystride]
> >
> > +#include "libavcodec/h26x/h2656_deblock_template.c"
> >
> Will merge this in two days if there are no objections to the file and
> directory names.
>

Are there options to share the actual generated binary code? The C code
admittedly is not so important, but it would be great if there were some way
to ensure that optimizations written for HEVC in some instruction set also
work for VVC - or vice versa.

Ronald
Nuo Mi Jan. 9, 2024, 3:05 a.m. UTC | #3
On Tue, Jan 9, 2024 at 9:23 AM Ronald S. Bultje <rsbultje@gmail.com> wrote:

> Hi,
>
> On Mon, Jan 8, 2024 at 8:03 PM Nuo Mi <nuomi2021@gmail.com> wrote:
>
> > On Sat, Jan 6, 2024 at 6:52 PM Nuo Mi <nuomi2021@gmail.com> wrote:
> >
> > > ---
> > >  libavcodec/vvc/vvc_filter_template.c | 82 +---------------------------
> > >  1 file changed, 3 insertions(+), 79 deletions(-)
> > >
> > > diff --git a/libavcodec/vvc/vvc_filter_template.c
> > > b/libavcodec/vvc/vvc_filter_template.c
> > > index 9418980c33..671ed7de4e 100644
> > > --- a/libavcodec/vvc/vvc_filter_template.c
> > > +++ b/libavcodec/vvc/vvc_filter_template.c
> > > @@ -461,6 +461,8 @@ static void FUNC(alf_recon_coeff_and_clip)(int16_t
> > > *coeff, int16_t *clip,
> > >  #define FQ2 pix[2  * xstride + 1 * ystride]
> > >  #define FQ3 pix[3  * xstride + 1 * ystride]
> > >
> > > +#include "libavcodec/h26x/h2656_deblock_template.c"
> > >
> > Will merge this in two days if there are no objections to the file and
> > directory names.
> >
>
> Are there options to share the actual generated binary code? The C code
> admittedly is not so important, but it would be great if there was some way
> to ensure that optimizations written for HEVC in some instruction set, work
> for VVC also - or vice versa.
>
Yes. After we merge this, we will send out the mc x86 asm code for review.
It will share the same binary with HEVC.
For SAO/Deblock, we will follow a similar approach, but it will need to come
a little later than mc.
For C code, we can share the binary as well, but it involves some interface
changes, so it is better to do it after all the asm is ready.

>
> Ronald
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Ronald S. Bultje Jan. 9, 2024, 12:50 p.m. UTC | #4
Hi,

On Mon, Jan 8, 2024 at 10:05 PM Nuo Mi <nuomi2021@gmail.com> wrote:

> On Tue, Jan 9, 2024 at 9:23 AM Ronald S. Bultje <rsbultje@gmail.com>
> wrote:
>
> > Hi,
> >
> > On Mon, Jan 8, 2024 at 8:03 PM Nuo Mi <nuomi2021@gmail.com> wrote:
> >
> > > On Sat, Jan 6, 2024 at 6:52 PM Nuo Mi <nuomi2021@gmail.com> wrote:
> > >
> > > > ---
> > > >  libavcodec/vvc/vvc_filter_template.c | 82
> +---------------------------
> > > >  1 file changed, 3 insertions(+), 79 deletions(-)
> > > >
> > > > diff --git a/libavcodec/vvc/vvc_filter_template.c
> > > > b/libavcodec/vvc/vvc_filter_template.c
> > > > index 9418980c33..671ed7de4e 100644
> > > > --- a/libavcodec/vvc/vvc_filter_template.c
> > > > +++ b/libavcodec/vvc/vvc_filter_template.c
> > > > @@ -461,6 +461,8 @@ static void
> FUNC(alf_recon_coeff_and_clip)(int16_t
> > > > *coeff, int16_t *clip,
> > > >  #define FQ2 pix[2  * xstride + 1 * ystride]
> > > >  #define FQ3 pix[3  * xstride + 1 * ystride]
> > > >
> > > > +#include "libavcodec/h26x/h2656_deblock_template.c"
> > > >
> > > Will merge this in two days if there are no objections to the file and
> > > directory names.
> > >
> >
> > Are there options to share the actual generated binary code? The C code
> > admittedly is not so important, but it would be great if there was some
> way
> > to ensure that optimizations written for HEVC in some instruction set,
> work
> > for VVC also - or vice versa.
> >
> Yes. After we merge this, we will send out the mc x86 asm code for review.
> It will share the same binary with HEVC.
> For SAO/Deblock, we will follow a similar approach, but it needs to be a
> little later than mc.
> For C code, we can share the binary as well, but it involves some interface
> changes, better to do it after all asm is ready.
>

OK, that sounds reasonable to me.

Ronald
Nuo Mi Jan. 11, 2024, 3:05 p.m. UTC | #5
>
>
>
> > Yes. After we merge this, we will send out the mc x86 asm code for
> review.
> > It will share the same binary with HEVC.
> > For SAO/Deblock, we will follow a similar approach, but it needs to be a
> > little later than mc.
> > For C code, we can share the binary as well, but it involves some
> interface
> > changes, better to do it after all asm is ready.
> >
>
> OK, that sounds reasonable to me.
>
Thank you, Ronald.
Pushed.

>
> Ronald
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavcodec/vvc/vvc_filter_template.c b/libavcodec/vvc/vvc_filter_template.c
index 9418980c33..671ed7de4e 100644
--- a/libavcodec/vvc/vvc_filter_template.c
+++ b/libavcodec/vvc/vvc_filter_template.c
@@ -461,6 +461,8 @@  static void FUNC(alf_recon_coeff_and_clip)(int16_t *coeff, int16_t *clip,
 #define FQ2 pix[2  * xstride + 1 * ystride]
 #define FQ3 pix[3  * xstride + 1 * ystride]
 
+#include "libavcodec/h26x/h2656_deblock_template.c"
+
 static void FUNC(loop_filter_luma_large)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride, const int32_t tc,
     const uint8_t no_p, const uint8_t no_q, const uint8_t max_len_p, const uint8_t max_len_q)
 {
@@ -541,66 +543,6 @@  static void FUNC(loop_filter_luma_large)(pixel *pix, const ptrdiff_t xstride, co
     }
 }
 
-static void FUNC(loop_filter_luma_strong)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride, const int32_t tc,
-    const uint8_t no_p, const uint8_t no_q)
-{
-    const int tc2 = tc << 1;
-    const int tc3 = tc * 3;
-    for (int d = 0; d < 4; d++) {
-        const int p3 = P3;
-        const int p2 = P2;
-        const int p1 = P1;
-        const int p0 = P0;
-        const int q0 = Q0;
-        const int q1 = Q1;
-        const int q2 = Q2;
-        const int q3 = Q3;
-        if (!no_p) {
-            P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc3, tc3);
-            P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
-            P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc, tc);
-        }
-        if (!no_q) {
-            Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc3, tc3);
-            Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
-            Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc, tc);
-        }
-        pix += ystride;
-    }
-}
-
-static void FUNC(loop_filter_luma_weak)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride,
-    const int32_t tc, const int32_t beta, const uint8_t no_p, const uint8_t no_q, const int nd_p, const int nd_q)
-{
-    const int tc_2 = tc >> 1;
-    for (int d = 0; d < 4; d++) {
-        const int p2 = P2;
-        const int p1 = P1;
-        const int p0 = P0;
-        const int q0 = Q0;
-        const int q1 = Q1;
-        const int q2 = Q2;
-        int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
-        if (abs(delta0) < 10 * tc) {
-            delta0 = av_clip(delta0, -tc, tc);
-            if (!no_p)
-                P0 = av_clip_pixel(p0 + delta0);
-            if (!no_q)
-                Q0 = av_clip_pixel(q0 - delta0);
-            if (!no_p && nd_p > 1) {
-                const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
-                P1 = av_clip_pixel(p1 + deltap1);
-            }
-            if (!no_q && nd_q > 1) {
-                const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
-                Q1 = av_clip_pixel(q1 + deltaq1);
-            }
-        }
-        pix += ystride;
-    }
-
-}
-
 static void FUNC(vvc_loop_filter_luma)(uint8_t* _pix, ptrdiff_t _xstride, ptrdiff_t _ystride,
     const int32_t *_beta, const int32_t *_tc, const uint8_t *_no_p, const uint8_t *_no_q,
     const uint8_t *_max_len_p, const uint8_t *_max_len_q, int hor_ctu_edge)
@@ -673,7 +615,7 @@  static void FUNC(vvc_loop_filter_luma)(uint8_t* _pix, ptrdiff_t _xstride, ptrdif
                 abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
                 (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
-                FUNC(loop_filter_luma_strong)(pix, xstride, ystride, tc, no_p, no_q);
+                FUNC(loop_filter_luma_strong)(pix, xstride, ystride, tc, tc << 1, tc * 3, no_p, no_q);
             } else { // weak filtering
                 int nd_p = 1;
                 int nd_q = 1;
@@ -737,24 +679,6 @@  static void FUNC(loop_filter_chroma_strong_one_side)(pixel *pix, const ptrdiff_t
     }
 }
 
-static void FUNC(loop_filter_chroma_weak)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride,
-    const int size, const int32_t tc, const uint8_t no_p, const uint8_t no_q)
-{
-    for (int d = 0; d < size; d++) {
-        int delta0;
-        const int p1 = P1;
-        const int p0 = P0;
-        const int q0 = Q0;
-        const int q1 = Q1;
-        delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
-        if (!no_p)
-            P0 = av_clip_pixel(p0 + delta0);
-        if (!no_q)
-            Q0 = av_clip_pixel(q0 - delta0);
-        pix += ystride;
-    }
-}
-
 static void FUNC(vvc_loop_filter_chroma)(uint8_t *_pix, const ptrdiff_t  _xstride, const ptrdiff_t _ystride,
     const int32_t *_beta, const int32_t *_tc, const uint8_t *_no_p, const uint8_t *_no_q,
     const uint8_t *_max_len_p, const uint8_t *_max_len_q, const int shift)