diff mbox series

[FFmpeg-devel,v6,10/14] vvcdec: add LMCS, Deblocking, SAO, and ALF filters

Message ID TYSPR06MB643317A870CDEFE44A677B30AA85A@TYSPR06MB6433.apcprd06.prod.outlook.com
State Superseded
Headers show
Series [FFmpeg-devel,v6,01/14] vvcdec: add vvc decoder stub | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Nuo Mi Dec. 5, 2023, 2:45 p.m. UTC
---
 libavcodec/vvc/Makefile              |    1 +
 libavcodec/vvc/vvc_ctu.h             |    1 +
 libavcodec/vvc/vvc_filter.c          | 1348 ++++++++++++++++++++++++++
 libavcodec/vvc/vvc_filter.h          |   71 ++
 libavcodec/vvc/vvc_filter_template.c | 1135 ++++++++++++++++++++++
 5 files changed, 2556 insertions(+)
 create mode 100644 libavcodec/vvc/vvc_filter.c
 create mode 100644 libavcodec/vvc/vvc_filter.h
 create mode 100644 libavcodec/vvc/vvc_filter_template.c
diff mbox series

Patch

diff --git a/libavcodec/vvc/Makefile b/libavcodec/vvc/Makefile
index 3b1ac72029..9e7fef7d38 100644
--- a/libavcodec/vvc/Makefile
+++ b/libavcodec/vvc/Makefile
@@ -5,6 +5,7 @@  OBJS-$(CONFIG_VVC_DECODER)          +=  vvc/vvcdec.o            \
                                         vvc/vvc_cabac.o         \
                                         vvc/vvc_ctu.o           \
                                         vvc/vvc_data.o          \
+                                        vvc/vvc_filter.o        \
                                         vvc/vvc_inter.o         \
                                         vvc/vvc_intra.o         \
                                         vvc/vvc_itx_1d.o        \
diff --git a/libavcodec/vvc/vvc_ctu.h b/libavcodec/vvc/vvc_ctu.h
index 577136b2e2..e9b157795c 100644
--- a/libavcodec/vvc/vvc_ctu.h
+++ b/libavcodec/vvc/vvc_ctu.h
@@ -462,6 +462,7 @@  typedef struct ALFParams {
 void ff_vvc_set_neighbour_available(VVCLocalContext *lc, int x0, int y0, int w, int h);
 void ff_vvc_decode_neighbour(VVCLocalContext *lc, int x_ctb, int y_ctb, int rx, int ry, int rs);
 void ff_vvc_ctu_free_cus(CTU *ctu);
+int ff_vvc_get_qPy(const VVCFrameContext *fc, int xc, int yc);
 void ff_vvc_ep_init_stat_coeff(EntryPoint *ep, int bit_depth, int persistent_rice_adaptation_enabled_flag);
 
 #endif // AVCODEC_VVC_VVC_CTU_H
diff --git a/libavcodec/vvc/vvc_filter.c b/libavcodec/vvc/vvc_filter.c
new file mode 100644
index 0000000000..19dcdbe3c8
--- /dev/null
+++ b/libavcodec/vvc/vvc_filter.c
@@ -0,0 +1,1348 @@ 
+/*
+ * VVC filters
+ *
+ * Copyright (C) 2021 Nuo Mi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/frame.h"
+
+#include "vvc_ctu.h"
+#include "vvc_data.h"
+#include "vvc_filter.h"
+#include "vvc_refs.h"
+
+#define LEFT        0   // edges[] slot: CTB is in the first CTB column
+#define TOP         1   // edges[] slot: CTB is in the first CTB row
+#define RIGHT       2   // edges[] slot: CTB is in the last CTB column
+#define BOTTOM      3   // edges[] slot: CTB is in the last CTB row
+#define MAX_EDGES   4   // presumably the number of edge slots above; not referenced in the visible code
+
+#define DEFAULT_INTRA_TC_OFFSET 2   // NOTE(review): not referenced in the visible code; meaning must be confirmed at its use site
+
+/*
+ * tc' values of Table 43, "Derivation of threshold variables beta' and tc'
+ * from input Q".
+ * 66 entries: entries 0..17 are 0, the last entry is 395. The larger values
+ * do not fit in 8 bits, hence uint16_t.
+ */
+static const uint16_t tctable[66] = {
+      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+      0,   0,   3,   4,   4,   4,   4,   5,   5,   5,   5,   7,   7,   8,   9,  10,
+     10,  11,  13,  14,  15,  17,  19,  21,  24,  25,  29,  33,  36,  41,  45,  51,
+     57,  64,  71,  80,  89, 100, 112, 125, 141, 157, 177, 198, 222, 250, 280, 314,
+    352, 395,
+
+};
+
+/*
+ * beta' values of Table 43, "Derivation of threshold variables beta' and tc'
+ * from input Q".
+ * 64 entries: entries 0..15 are 0, the last entry is 88, so uint8_t suffices.
+ */
+static const uint8_t betatable[64] = {
+      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+      6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  20,  22,  24,
+     26,  28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,
+     58,  60,  62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,
+};
+
+
+/**
+ * Read the QP recorded for the minimum-TU cell that contains (x0, y0).
+ *
+ * @param fc      frame context holding the qp tables
+ * @param x0      horizontal position, reduced to a cell column by >> MIN_TU_LOG2
+ * @param y0      vertical position, reduced to a cell row by >> MIN_TU_LOG2
+ * @param chroma  selects which of the fc->tab.qp[] tables is read
+ * @return the table entry for that cell
+ */
+static int get_qPc(const VVCFrameContext *fc, const int x0, const int y0, const int chroma)
+{
+    const int tu_col = x0 >> MIN_TU_LOG2;
+    const int tu_row = y0 >> MIN_TU_LOG2;
+    const int tu_idx = tu_col + tu_row * fc->ps.pps->min_tu_width;
+
+    return fc->tab.qp[chroma][tu_idx];
+}
+
+/**
+ * Copy a rectangle of bytes row by row between two strided buffers.
+ *
+ * @param dst         first destination row
+ * @param src         first source row
+ * @param width       number of bytes copied per row
+ * @param height      number of rows
+ * @param dst_stride  byte distance between destination rows
+ * @param src_stride  byte distance between source rows
+ */
+static void copy_ctb(uint8_t *dst, const uint8_t *src, const int width, const int height,
+    const ptrdiff_t dst_stride, const ptrdiff_t src_stride)
+{
+    int rows_left = height;
+
+    while (rows_left-- > 0) {
+        memcpy(dst, src, width);
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
+
+/**
+ * Copy a single pixel.
+ *
+ * @param dst          destination pixel
+ * @param src          source pixel
+ * @param pixel_shift  0 copies one byte; any non-zero value copies two bytes
+ *
+ * The two-byte case goes through memcpy() instead of dereferencing a casted
+ * uint16_t pointer, so it neither drops the const qualifier of src nor
+ * depends on the buffers being 2-byte aligned.
+ */
+static void copy_pixel(uint8_t *dst, const uint8_t *src, const int pixel_shift)
+{
+    if (pixel_shift)
+        memcpy(dst, src, sizeof(uint16_t));
+    else
+        *dst = *src;
+}
+
+/**
+ * Copy one pixel per step along two independently strided buffers.
+ *
+ * @param dst          first destination pixel
+ * @param src          first source pixel
+ * @param pixel_shift  0 for one-byte pixels; any other value copies two bytes
+ * @param height       number of pixels to copy
+ * @param dst_stride   byte distance between consecutive destination pixels
+ * @param src_stride   byte distance between consecutive source pixels
+ *
+ * The two-byte path uses memcpy() rather than a casted uint16_t access, so it
+ * keeps src const and does not rely on 2-byte alignment. The two loops stay
+ * separate so each one copies a compile-time constant size.
+ */
+static void copy_vert(uint8_t *dst, const uint8_t *src, const int pixel_shift, const int height,
+    const ptrdiff_t dst_stride, const ptrdiff_t src_stride)
+{
+    if (pixel_shift == 0) {
+        for (int i = 0; i < height; i++) {
+            *dst = *src;
+            dst += dst_stride;
+            src += src_stride;
+        }
+    } else {
+        for (int i = 0; i < height; i++) {
+            memcpy(dst, src, sizeof(uint16_t));
+            dst += dst_stride;
+            src += src_stride;
+        }
+    }
+}
+
+/**
+ * Save border pixels of one CTB plane into the SAO line buffers.
+ *
+ * fc->tab.sao_pixel_buffer_h[c_idx] holds two rows per CTB row (slot
+ * 2 * y_ctb for the top row, 2 * y_ctb + 1 for the bottom row), each row being
+ * the plane width long. fc->tab.sao_pixel_buffer_v[c_idx] holds two columns per
+ * CTB column in the same way, each column being the plane height long.
+ *
+ * @param fc          frame context owning the buffers
+ * @param src         top-left pixel of the CTB in the plane
+ * @param src_stride  byte distance between plane rows
+ * @param x, y        position of the CTB inside the plane, in pixels
+ * @param width       CTB width in pixels
+ * @param height      CTB height in pixels
+ * @param c_idx       plane index
+ * @param x_ctb       CTB column
+ * @param y_ctb       CTB row
+ * @param top         non-zero: store only the top row;
+ *                    zero: store the bottom row and both outer columns
+ */
+static void copy_ctb_to_hv(VVCFrameContext *fc, const uint8_t *src,
+    const ptrdiff_t src_stride, const int x, const int y, const int width, const int height,
+    const int c_idx, const int x_ctb, const int y_ctb, const int top)
+{
+    const int ps         = fc->ps.sps->pixel_shift;
+    const int plane_w    = fc->ps.sps->width >> fc->ps.sps->hshift[c_idx];
+    const int plane_h    = fc->ps.sps->height >> fc->ps.sps->vshift[c_idx];
+    const int row_bytes  = width << ps;
+    const int pixel_step = 1 << ps;
+
+    if (top) {
+        const int top_off = ((2 * y_ctb) * plane_w + x) << ps;
+
+        memcpy(fc->tab.sao_pixel_buffer_h[c_idx] + top_off, src, row_bytes);
+        return;
+    }
+
+    {
+        const int bottom_off = ((2 * y_ctb + 1) * plane_w + x) << ps;
+        const int left_off   = ((2 * x_ctb) * plane_h + y) << ps;
+        const int right_off  = ((2 * x_ctb + 1) * plane_h + y) << ps;
+
+        /* last row of the CTB */
+        memcpy(fc->tab.sao_pixel_buffer_h[c_idx] + bottom_off,
+            src + src_stride * (height - 1), row_bytes);
+
+        /* first and last column of the CTB, stored contiguously */
+        copy_vert(fc->tab.sao_pixel_buffer_v[c_idx] + left_off,
+            src, ps, height, pixel_step, src_stride);
+        copy_vert(fc->tab.sao_pixel_buffer_v[c_idx] + right_off,
+            src + ((width - 1) << ps), ps, height, pixel_step, src_stride);
+    }
+}
+
+/**
+ * Save the SAO border pixels of CTB (rx, ry) for every coded plane.
+ *
+ * Only plane 0 is handled when sps_chroma_format_idc is 0, otherwise planes
+ * 0..2. The CTB rectangle is clipped to the plane dimensions before being
+ * handed to copy_ctb_to_hv().
+ *
+ * @param lc   local context (only lc->fc is used)
+ * @param rx   CTB column
+ * @param ry   CTB row
+ * @param top  forwarded to copy_ctb_to_hv()
+ */
+static void sao_copy_ctb_to_hv(VVCLocalContext *lc, const int rx, const int ry, const int top)
+{
+    VVCFrameContext *fc   = lc->fc;
+    const int ctb_size    = fc->ps.sps->ctb_size_y;
+    const int luma_x      = rx << fc->ps.sps->ctb_log2_size_y;
+    const int luma_y      = ry << fc->ps.sps->ctb_log2_size_y;
+    const int nb_planes   = fc->ps.sps->r->sps_chroma_format_idc ? 3 : 1;
+
+    for (int plane = 0; plane < nb_planes; plane++) {
+        const int hs           = fc->ps.sps->hshift[plane];
+        const int vs           = fc->ps.sps->vshift[plane];
+        const int px           = luma_x >> hs;
+        const int py           = luma_y >> vs;
+        const ptrdiff_t stride = fc->frame->linesize[plane];
+        const int cols         = FFMIN(ctb_size >> hs, (fc->ps.sps->width  >> hs) - px);
+        const int rows         = FFMIN(ctb_size >> vs, (fc->ps.sps->height >> vs) - py);
+        uint8_t *plane_src     = &fc->frame->data[plane][py * stride + (px << fc->ps.sps->pixel_shift)];
+
+        copy_ctb_to_hv(fc, plane_src, stride, px, py, cols, rows, plane, rx, ry, top);
+    }
+}
+
+/**
+ * Save SAO border pixels around CTB (rx, ry).
+ *
+ * In order: the bottom row and side columns of the CTB above (when ry is not
+ * 0), the top row of this CTB, and, when last_row is set, the bottom row and
+ * side columns of this CTB as well.
+ *
+ * @param lc        local context
+ * @param rx        CTB column
+ * @param ry        CTB row
+ * @param last_row  non-zero to also save this CTB's bottom row and columns
+ */
+void ff_vvc_sao_copy_ctb_to_hv(VVCLocalContext *lc, const int rx, const int ry, const int last_row)
+{
+    const int has_row_above = ry != 0;
+
+    if (has_row_above) {
+        sao_copy_ctb_to_hv(lc, rx, ry - 1, 0);
+    }
+
+    sao_copy_ctb_to_hv(lc, rx, ry, 1);
+
+    if (last_row) {
+        sao_copy_ctb_to_hv(lc, rx, ry, 0);
+    }
+}
+/**
+ * Run sample adaptive offset on one CTB, one colour plane at a time.
+ *
+ * For SAO_BAND the dsp band filter is called with the frame plane as both
+ * pointer arguments. For SAO_EDGE the CTB is first copied into
+ * lc->sao_buffer together with a one-pixel border taken from
+ * fc->tab.sao_pixel_buffer_h / sao_pixel_buffer_v, then the dsp edge filter
+ * and the edge_restore[restore] variant are called. Any other type_idx
+ * leaves the plane untouched.
+ *
+ * @param lc  local context; provides lc->fc and the scratch lc->sao_buffer
+ * @param x   horizontal position; >> ctb_log2_size_y gives the CTB column and
+ *            >> hshift[c_idx] the per-plane offset (presumably the CTB's
+ *            top-left luma sample -- confirm against the caller)
+ * @param y   vertical position, used the same way with vshift[c_idx]
+ */
+void ff_vvc_sao_filter(VVCLocalContext *lc, int x, int y)
+{
+    VVCFrameContext *fc  = lc->fc;
+    const int ctb_size_y = fc->ps.sps->ctb_size_y;
+    static const uint8_t sao_tab[16] = { 0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8 };   // indexed by (width rounded up to 8) / 8 - 1; yields the dsp function slot
+    int c_idx;
+    int edges[4];  // 0 left 1 top 2 right 3 bottom
+    int x_ctb                = x >> fc->ps.sps->ctb_log2_size_y;
+    int y_ctb                = y >> fc->ps.sps->ctb_log2_size_y;
+    SAOParams *sao           = &CTB(fc->tab.sao, x_ctb, y_ctb);
+    // flags indicating unfilterable edges
+    uint8_t vert_edge[]      = { 0, 0 };
+    uint8_t horiz_edge[]     = { 0, 0 };
+    uint8_t diag_edge[]      = { 0, 0, 0, 0 };
+    uint8_t lfase            = fc->ps.pps->r->pps_loop_filter_across_slices_enabled_flag;
+    uint8_t no_tile_filter   = fc->ps.pps->r->num_tiles_in_pic > 1 &&
+                               !fc->ps.pps->r->pps_loop_filter_across_tiles_enabled_flag;
+    uint8_t restore          = no_tile_filter || !lfase;   // selects the edge_restore[] variant below
+    uint8_t left_tile_edge   = 0;
+    uint8_t right_tile_edge  = 0;
+    uint8_t up_tile_edge     = 0;
+    uint8_t bottom_tile_edge = 0;
+    // picture-border flags: set when the CTB is in the first/last CTB column or row
+    edges[LEFT]   = x_ctb == 0;
+    edges[TOP]    = y_ctb == 0;
+    edges[RIGHT]  = x_ctb == fc->ps.pps->ctb_width  - 1;
+    edges[BOTTOM] = y_ctb == fc->ps.pps->ctb_height - 1;
+    // with cross-slice or cross-tile filtering disabled, flag every side and corner whose neighbour CTB has a different slice_idx or lies across a ctb_to_col_bd / ctb_to_row_bd change
+    if (restore) {
+        if (!edges[LEFT]) {
+            left_tile_edge  = no_tile_filter && fc->ps.pps->ctb_to_col_bd[x_ctb] == x_ctb;
+            vert_edge[0]    = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb - 1, y_ctb)) || left_tile_edge;
+        }
+        if (!edges[RIGHT]) {
+            right_tile_edge = no_tile_filter && fc->ps.pps->ctb_to_col_bd[x_ctb] != fc->ps.pps->ctb_to_col_bd[x_ctb + 1];
+            vert_edge[1]    = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb + 1, y_ctb)) || right_tile_edge;
+        }
+        if (!edges[TOP]) {
+            up_tile_edge     = no_tile_filter && fc->ps.pps->ctb_to_row_bd[y_ctb] == y_ctb;
+            horiz_edge[0]    = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb, y_ctb - 1)) || up_tile_edge;
+        }
+        if (!edges[BOTTOM]) {
+            bottom_tile_edge = no_tile_filter && fc->ps.pps->ctb_to_row_bd[y_ctb] != fc->ps.pps->ctb_to_row_bd[y_ctb + 1];
+            horiz_edge[1]    = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb, y_ctb + 1)) || bottom_tile_edge;
+        }
+        if (!edges[LEFT] && !edges[TOP]) {
+            diag_edge[0] = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
+        }
+        if (!edges[TOP] && !edges[RIGHT]) {
+            diag_edge[1] = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
+        }
+        if (!edges[RIGHT] && !edges[BOTTOM]) {
+            diag_edge[2] = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
+        }
+        if (!edges[LEFT] && !edges[BOTTOM]) {
+            diag_edge[3] = (!lfase && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
+        }
+    }
+    // plane 0 only when sps_chroma_format_idc is 0, otherwise planes 0..2
+    for (c_idx = 0; c_idx < (fc->ps.sps->r->sps_chroma_format_idc ? 3 : 1); c_idx++) {
+        int x0       = x >> fc->ps.sps->hshift[c_idx];
+        int y0       = y >> fc->ps.sps->vshift[c_idx];
+        ptrdiff_t src_stride = fc->frame->linesize[c_idx];
+        int ctb_size_h = ctb_size_y >> fc->ps.sps->hshift[c_idx];
+        int ctb_size_v = ctb_size_y >> fc->ps.sps->vshift[c_idx];
+        int width    = FFMIN(ctb_size_h, (fc->ps.sps->width  >> fc->ps.sps->hshift[c_idx]) - x0);   // clipped to the plane width
+        int height   = FFMIN(ctb_size_v, (fc->ps.sps->height >> fc->ps.sps->vshift[c_idx]) - y0);   // clipped to the plane height
+        int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
+        uint8_t *src = &fc->frame->data[c_idx][y0 * src_stride + (x0 << fc->ps.sps->pixel_shift)];
+        ptrdiff_t dst_stride;
+        uint8_t *dst;
+
+        switch (sao->type_idx[c_idx]) {
+        case SAO_BAND:
+            fc->vvcdsp.sao.band_filter[tab](src, src, src_stride, src_stride,
+                sao->offset_val[c_idx], sao->band_position[c_idx], width, height);
+            break;
+        case SAO_EDGE:
+        {
+            int w = fc->ps.sps->width >> fc->ps.sps->hshift[c_idx];
+            int h = fc->ps.sps->height >> fc->ps.sps->vshift[c_idx];
+            int sh = fc->ps.sps->pixel_shift;
+            // scratch layout: dst starts one row plus AV_INPUT_BUFFER_PADDING_SIZE bytes into lc->sao_buffer, leaving room for the border row above and the border pixel to the left
+            dst_stride = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
+            dst = lc->sao_buffer + dst_stride + AV_INPUT_BUFFER_PADDING_SIZE;
+            // border row above: slot 2 * y_ctb - 1 of sao_pixel_buffer_h, i.e. the bottom row saved for the CTB row above, plus one corner pixel per side that has a neighbour
+            if (!edges[TOP]) {
+                int left = 1 - edges[LEFT];
+                int right = 1 - edges[RIGHT];
+                const uint8_t *src1;
+                uint8_t *dst1;
+                int pos;
+
+                dst1 = dst - dst_stride - (left << sh);
+                src1 = fc->tab.sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    copy_pixel(dst1, src1, sh);
+                    pos += (1 << sh);
+                }
+                memcpy(dst1 + pos, src1 + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    copy_pixel(dst1 + pos, src1 + pos, sh);
+                }
+            }
+            if (!edges[BOTTOM]) {   // border row below: slot 2 * y_ctb + 2, i.e. the top row saved for the CTB row below
+                int left = 1 - edges[LEFT];
+                int right = 1 - edges[RIGHT];
+                const uint8_t *src1;
+                uint8_t *dst1;
+                int pos;
+
+                dst1 = dst + height * dst_stride - (left << sh);
+                src1 = fc->tab.sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    copy_pixel(dst1, src1, sh);
+                    pos += (1 << sh);
+                }
+                memcpy(dst1 + pos, src1 + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    copy_pixel(dst1 + pos, src1 + pos, sh);
+                }
+            }
+            if (!edges[LEFT]) {   // border column on the left: slot 2 * x_ctb - 1, the right column saved for the CTB to the left
+                copy_vert(dst - (1 << sh),
+                    fc->tab.sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+                    sh, height, dst_stride, 1 << sh);
+            }
+            if (!edges[RIGHT]) {   // border column on the right: slot 2 * x_ctb + 2, the left column saved for the CTB to the right
+                copy_vert(dst + (width << sh),
+                    fc->tab.sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+                    sh, height, dst_stride, 1 << sh);
+            }
+            // copy the CTB itself into the scratch buffer, then filter; NOTE(review): edge_filter presumably reads the scratch copy and writes the frame -- confirm against the dsp prototype
+            copy_ctb(dst, src,  width << sh, height, dst_stride, src_stride);
+            fc->vvcdsp.sao.edge_filter[tab](src, dst, src_stride, sao->offset_val[c_idx],
+                                            sao->eo_class[c_idx], width, height);
+            fc->vvcdsp.sao.edge_restore[restore](src, dst,
+                                                src_stride, dst_stride,
+                                                sao,
+                                                edges, width,
+                                                height, c_idx,
+                                                vert_edge,
+                                                horiz_edge,
+                                                diag_edge);
+            break;
+        }
+        }
+    }
+}
+/*
+ * Accessors for the per-edge deblocking tables. Both macros expand to the
+ * same expression: one entry per 4x4 unit, rows fc->tab.bs_width entries
+ * apart. They reference a variable named fc at the expansion site.
+ */
+#define TAB_BS(t, x, y)     (t)[((y) >> 2) * (fc->tab.bs_width) + ((x) >> 2)]
+#define TAB_MAX_LEN(t, x, y)  (t)[((y) >> 2) * (fc->tab.bs_width) + ((x) >> 2)]
+
+// 8 samples at a time
+#define DEBLOCK_STEP            8
+#define LUMA_GRID               4   // NOTE(review): not referenced in the visible code; the luma edge tests there use (x0 & 3) / (y0 & 3) directly
+#define CHROMA_GRID             8   // chroma edges are considered only where x0 is a multiple of CHROMA_GRID << hshift[1]
+/**
+ * Motion-based boundary strength between two neighbouring blocks.
+ *
+ * @param lc         local context; lc->sc->rpl is the reference list pair used
+ *                   for curr
+ * @param curr       motion field on the current side of the edge
+ * @param neigh      motion field on the other side of the edge
+ * @param neigh_rpl  reference list pair used for neigh
+ * @return 0 when both sides use the same reference list entries and every
+ *         compared motion vector component differs by less than 8 (in the
+ *         units mv is stored in), 1 otherwise
+ */
+static int boundary_strength(const VVCLocalContext *lc, MvField *curr, MvField *neigh,
+    const RefPicList *neigh_rpl)
+{
+    RefPicList *rpl = lc->sc->rpl;
+    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+        // same L0 and L1
+        if (rpl[0].list[curr->ref_idx[0]] == neigh_rpl[0].list[neigh->ref_idx[0]]  &&
+            rpl[0].list[curr->ref_idx[0]] == rpl[1].list[curr->ref_idx[1]] &&
+            neigh_rpl[0].list[neigh->ref_idx[0]] == neigh_rpl[1].list[neigh->ref_idx[1]]) {
+            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 8 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 8 ||   // straight pairing (0-0, 1-1) differs ...
+                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 8 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 8) &&
+                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 8 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 8 ||   // ... and so does the crossed pairing (1-0, 0-1)
+                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 8 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 8))
+                return 1;
+            else
+                return 0;
+        } else if (neigh_rpl[0].list[neigh->ref_idx[0]] == rpl[0].list[curr->ref_idx[0]] &&   // references match list for list
+                   neigh_rpl[1].list[neigh->ref_idx[1]] == rpl[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 8 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 8 ||
+                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 8 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 8)
+                return 1;
+            else
+                return 0;
+        } else if (neigh_rpl[1].list[neigh->ref_idx[1]] == rpl[0].list[curr->ref_idx[0]] &&   // references match with the lists swapped
+                   neigh_rpl[0].list[neigh->ref_idx[0]] == rpl[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 8 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 8 ||
+                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 8 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 8)
+                return 1;
+            else
+                return 0;
+        } else {
+            return 1;   // different references
+        }
+    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+        Mv A, B;
+        int ref_A, ref_B;
+
+        if (curr->pred_flag & 1) {   // bit 0 of pred_flag selects list 0, otherwise list 1 is used
+            A     = curr->mv[0];
+            ref_A = rpl[0].list[curr->ref_idx[0]];
+        } else {
+            A     = curr->mv[1];
+            ref_A = rpl[1].list[curr->ref_idx[1]];
+        }
+
+        if (neigh->pred_flag & 1) {
+            B     = neigh->mv[0];
+            ref_B = neigh_rpl[0].list[neigh->ref_idx[0]];
+        } else {
+            B     = neigh->mv[1];
+            ref_B = neigh_rpl[1].list[neigh->ref_idx[1]];
+        }
+
+        if (ref_A == ref_B) {
+            if (FFABS(A.x - B.x) >= 8 || FFABS(A.y - B.y) >= 8)
+                return 1;
+            else
+                return 0;
+        } else
+            return 1;
+    }
+    // reached only when exactly one side is PF_BI
+    return 1;
+}
+
+/**
+ * Part of 8.8.3.3 "Derivation process of transform block boundary": pick the
+ * maximum luma filter length on each side of an edge.
+ *
+ * Q is the sample at (qx, qy); P is its neighbour across the edge (to the
+ * left for a vertical edge, above for a horizontal one). The transform block
+ * dimension perpendicular to the edge decides the base length: 1 if either
+ * side is <= 4, otherwise 3, raised to 7 on a side whose dimension is >= 32.
+ * The Q length is capped at 5 when has_subblock is set; the P length is
+ * capped at 5 when msf or iaf is set for the coding block containing P.
+ *
+ * @param fc            frame context
+ * @param qx, qy        position of the Q sample
+ * @param is_intra      accepted but not used
+ * @param has_subblock  non-zero to cap the Q side at 5
+ * @param vertical      non-zero for a vertical edge
+ * @param max_len_p     receives the P-side length
+ * @param max_len_q     receives the Q-side length
+ */
+static void derive_max_filter_length_luma(const VVCFrameContext *fc, const int qx, const int qy,
+                                          const int is_intra, const int has_subblock, const int vertical, uint8_t *max_len_p, uint8_t *max_len_q)
+{
+    const int tu_stride   = fc->ps.pps->min_tu_width;
+    const int cb_shift    = fc->ps.sps->min_cb_log2_size_y;
+    const int px          = qx - (vertical ? 1 : 0);
+    const int py          = qy - (vertical ? 0 : 1);
+    const uint8_t *tb_dim = vertical ? fc->tab.tb_width[LUMA] : fc->tab.tb_height[LUMA];
+    const int dim_p       = tb_dim[(py >> MIN_TU_LOG2) * tu_stride + (px >> MIN_TU_LOG2)];
+    const int dim_q       = tb_dim[(qy >> MIN_TU_LOG2) * tu_stride + (qx >> MIN_TU_LOG2)];
+    const int cb_idx_p    = (py >> cb_shift) * fc->ps.pps->min_cb_width + (px >> cb_shift);
+    int len_p, len_q;
+
+    if (dim_p <= 4 || dim_q <= 4) {
+        len_p = 1;
+        len_q = 1;
+    } else {
+        len_p = dim_p >= 32 ? 7 : 3;
+        len_q = dim_q >= 32 ? 7 : 3;
+    }
+
+    if (has_subblock)
+        len_q = FFMIN(5, len_q);
+    if (fc->tab.msf[cb_idx_p] || fc->tab.iaf[cb_idx_p])
+        len_p = FFMIN(5, len_p);
+
+    *max_len_q = len_q;
+    *max_len_p = len_p;
+}
+
+/**
+ * Fill boundary strength and filter lengths for the vertical subblock edges
+ * inside the block at (x0, y0).
+ *
+ * Edges are visited every 8 samples horizontally, starting at the first
+ * offset past x0 that is a multiple of 8 relative to cb_x, and every 4
+ * samples vertically. The filter length is 1 for an edge 4 samples from
+ * either side of the block, 2 for one 8 samples away, and 3 otherwise; P and
+ * Q always get the same value.
+ *
+ * @param lc      local context
+ * @param cb_x    x position of the enclosing coding block
+ * @param cb_y    unused
+ * @param x0, y0  top-left position of the block
+ * @param width   block width
+ * @param height  block height
+ */
+static void vvc_deblock_subblock_bs_vertical(const VVCLocalContext *lc,
+    const int cb_x, const int cb_y, const int x0, const int y0, const int width, const int height)
+{
+    const VVCFrameContext *fc = lc->fc;
+    MvField *mvf_tab          = fc->tab.mvf;
+    RefPicList *cur_rpl       = lc->sc->rpl;
+    const int pu_stride       = fc->ps.pps->min_pu_width;
+    const int first_edge      = 8 - ((x0 - cb_x) % 8);
+
+    // bs for TU internal vertical PU boundaries
+    for (int dy = 0; dy < height; dy += 4) {
+        const int y      = y0 + dy;
+        const int pu_row = (y >> MIN_PU_LOG2) * pu_stride;
+
+        for (int dx = first_edge; dx < width; dx += 8) {
+            const int x    = x0 + dx;
+            MvField *mvf_p = &mvf_tab[pu_row + ((x - 1) >> MIN_PU_LOG2)];
+            MvField *mvf_q = &mvf_tab[pu_row + (x >> MIN_PU_LOG2)];
+            uint8_t max_len;
+
+            TAB_BS(fc->tab.vertical_bs[LUMA], x, y) = boundary_strength(lc, mvf_q, mvf_p, cur_rpl);
+
+            if (dx == 4 || dx == width - 4)
+                max_len = 1;
+            else if (dx == 8 || dx == width - 8)
+                max_len = 2;
+            else
+                max_len = 3;
+
+            TAB_MAX_LEN(fc->tab.vertical_p, x, y) = max_len;
+            TAB_MAX_LEN(fc->tab.vertical_q, x, y) = max_len;
+        }
+    }
+}
+
+/**
+ * Fill boundary strength and filter lengths for the horizontal subblock edges
+ * inside the block at (x0, y0).
+ *
+ * Edges are visited every 8 samples vertically, starting at the first offset
+ * past y0 that is a multiple of 8 relative to cb_y, and every 4 samples
+ * horizontally. The filter length is 1 for an edge 4 samples from the top or
+ * bottom of the block, 2 for one 8 samples away, and 3 otherwise; P and Q
+ * always get the same value.
+ *
+ * @param lc      local context
+ * @param cb_x    unused
+ * @param cb_y    y position of the enclosing coding block
+ * @param x0, y0  top-left position of the block
+ * @param width   block width
+ * @param height  block height
+ */
+static void vvc_deblock_subblock_bs_horizontal(const VVCLocalContext *lc,
+    const int cb_x, const int cb_y, const int x0, const int y0, const int width, const int height)
+{
+    const VVCFrameContext *fc = lc->fc;
+    MvField *mvf_tab          = fc->tab.mvf;
+    RefPicList *cur_rpl       = lc->sc->rpl;
+    const int pu_stride       = fc->ps.pps->min_pu_width;
+    const int first_edge      = 8 - ((y0 - cb_y) % 8);
+
+    // bs for TU internal horizontal PU boundaries
+    for (int dy = first_edge; dy < height; dy += 8) {
+        const int y        = y0 + dy;
+        const int pu_row_p = ((y - 1) >> MIN_PU_LOG2) * pu_stride;
+        const int pu_row_q = (y >> MIN_PU_LOG2) * pu_stride;
+        uint8_t max_len;
+
+        //fixme:
+        //edgeTbFlags[ x − sbW ][ y ] is equal to 1
+        //edgeTbFlags[ x + sbW ][ y ] is equal to 1
+        if (dy == 4 || dy == height - 4)
+            max_len = 1;
+        else if (dy == 8 || dy == height - 8)
+            max_len = 2;
+        else
+            max_len = 3;
+
+        for (int dx = 0; dx < width; dx += 4) {
+            const int x      = x0 + dx;
+            const int pu_col = x >> MIN_PU_LOG2;
+            MvField *mvf_p   = &mvf_tab[pu_row_p + pu_col];
+            MvField *mvf_q   = &mvf_tab[pu_row_q + pu_col];
+
+            TAB_BS(fc->tab.horizontal_bs[LUMA], x, y) = boundary_strength(lc, mvf_q, mvf_p, cur_rpl);
+            TAB_MAX_LEN(fc->tab.horizontal_p, x, y)   = max_len;
+            TAB_MAX_LEN(fc->tab.horizontal_q, x, y)   = max_len;
+        }
+    }
+}
+/**
+ * Derive luma boundary strengths and maximum filter lengths for the vertical
+ * edges of the block at (x0, y0).
+ *
+ * The block's left edge is processed in steps of 4 rows, and only when x0 is
+ * a non-zero multiple of 4 that is not excluded by the slice/tile test below.
+ * Results are stored in fc->tab.vertical_bs[LUMA], fc->tab.vertical_p and
+ * fc->tab.vertical_q. For a non-intra block whose coding block has msf or iaf
+ * set, the internal subblock edges are then filled in by
+ * vvc_deblock_subblock_bs_vertical().
+ *
+ * @param lc      local context (frame context, boundary flags, slice rpl)
+ * @param x0, y0  top-left position of the block
+ * @param width   block width
+ * @param height  block height
+ */
+static void vvc_deblock_bs_luma_vertical(const VVCLocalContext *lc,
+    const int x0, const int y0, const int width, const int height)
+{
+    const VVCFrameContext *fc  = lc->fc;
+    MvField *tab_mvf           = fc->tab.mvf;
+    const int log2_min_pu_size = MIN_PU_LOG2;
+    const int log2_min_tu_size = MIN_TU_LOG2;
+    const int min_pu_width     = fc->ps.pps->min_pu_width;
+    const int min_tu_width     = fc->ps.pps->min_tu_width;
+    const int min_cb_log2      = fc->ps.sps->min_cb_log2_size_y;
+    const int min_cb_width     = fc->ps.pps->min_cb_width;
+    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
+                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
+    int boundary_left;
+    int i, bs, has_vertical_sb = 0;
+    uint8_t max_len_p, max_len_q;
+    // index of (x0, y0) in the min-CB granular tables: coding block position/size and the msf/iaf flags
+    const int off_q            = (y0 >> min_cb_log2) * min_cb_width + (x0 >> min_cb_log2);
+    const int cb_x             = fc->tab.cb_pos_x[LUMA][off_q];
+    const int cb_y             = fc->tab.cb_pos_y[LUMA][off_q];
+    const int cb_width         = fc->tab.cb_width[LUMA][off_q];
+
+    if (!is_intra) {
+        if (fc->tab.msf[off_q] || fc->tab.iaf[off_q])   // NOTE(review): presumably merge-subblock / inter-affine flags -- confirm
+            has_vertical_sb   = cb_width  > 8;
+    }
+
+    // bs for vertical TU boundaries
+    boundary_left = x0 > 0 && !(x0 & 3);   // on the 4-sample grid and not at x == 0
+    if (boundary_left &&
+        ((!fc->ps.pps->r->pps_loop_filter_across_slices_enabled_flag &&
+            lc->boundary_flags & BOUNDARY_LEFT_SLICE &&
+            (x0 % (1 << fc->ps.sps->ctb_log2_size_y)) == 0) ||
+            (!fc->ps.pps->r->pps_loop_filter_across_tiles_enabled_flag &&
+            lc->boundary_flags & BOUNDARY_LEFT_TILE &&
+            (x0 % (1 << fc->ps.sps->ctb_log2_size_y)) == 0)))
+        boundary_left = 0;   // CTB-aligned edge flagged as a slice/tile boundary while filtering across it is disabled
+
+    if (boundary_left) {
+        const RefPicList *rpl_left =
+            (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ? ff_vvc_get_ref_list(fc, fc->ref, x0 - 1, y0) : lc->sc->rpl;   // look up the list at (x0 - 1, y0) when the left-slice flag is set
+        int xp_pu = (x0 - 1) >> log2_min_pu_size;
+        int xq_pu =  x0      >> log2_min_pu_size;
+        int xp_tu = (x0 - 1) >> log2_min_tu_size;
+        int xq_tu =  x0      >> log2_min_tu_size;
+
+        for (i = 0; i < height; i += 4) {
+            const int off_x = cb_x - x0;   // zero when this edge is the coding block's own left edge
+            int y_pu      = (y0 + i) >> log2_min_pu_size;
+            int y_tu      = (y0 + i) >> log2_min_tu_size;
+            MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+            MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+            uint8_t left_cbf_luma = fc->tab.tu_coded_flag[LUMA][y_tu * min_tu_width + xp_tu];
+            uint8_t curr_cbf_luma = fc->tab.tu_coded_flag[LUMA][y_tu * min_tu_width + xq_tu];
+            uint8_t pcmf          = fc->tab.pcmf[LUMA][y_tu * min_tu_width + xp_tu] &&
+                fc->tab.pcmf[LUMA][y_tu * min_tu_width + xq_tu];
+
+            if (pcmf)
+                bs = 0;
+            else if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA || curr->ciip_flag || left->ciip_flag)
+                bs = 2;
+            else if (curr_cbf_luma || left_cbf_luma)
+                bs = 1;
+            else if (off_x && ((off_x % 8) || !has_vertical_sb))
+                bs = 0;                                     // inside a cu, not aligned to 8 or with no subblocks
+            else
+                bs = boundary_strength(lc, curr, left, rpl_left);
+
+            TAB_BS(fc->tab.vertical_bs[LUMA], x0, (y0 + i)) = bs;
+
+            derive_max_filter_length_luma(fc, x0, y0 + i, is_intra, has_vertical_sb, 1, &max_len_p, &max_len_q);
+            TAB_MAX_LEN(fc->tab.vertical_p, x0, y0 + i) = max_len_p;
+            TAB_MAX_LEN(fc->tab.vertical_q, x0, y0 + i) = max_len_q;
+        }
+    }
+    // subblock edges inside the block
+    if (!is_intra) {
+        if (fc->tab.msf[off_q] || fc->tab.iaf[off_q])
+            vvc_deblock_subblock_bs_vertical(lc, cb_x, cb_y, x0, y0, width, height);
+    }
+
+}
+static void vvc_deblock_bs_luma_horizontal(const VVCLocalContext *lc,
+    const int x0, const int y0, const int width, const int height)
+{   /*
+     * Derive luma boundary strengths and maximum filter lengths for the
+     * horizontal edges of the block at (x0, y0).
+     *
+     * The block's top edge is processed in steps of 4 columns, and only when
+     * y0 is a non-zero multiple of 4 that is not excluded by the slice/tile
+     * test below. Results are stored in fc->tab.horizontal_bs[LUMA],
+     * fc->tab.horizontal_p and fc->tab.horizontal_q. For a non-intra block
+     * whose coding block has msf or iaf set, the internal subblock edges are
+     * then filled in by vvc_deblock_subblock_bs_horizontal().
+     */
+    const VVCFrameContext *fc  = lc->fc;
+    MvField *tab_mvf           = fc->tab.mvf;
+    const int log2_min_pu_size = MIN_PU_LOG2;
+    const int log2_min_tu_size = MIN_TU_LOG2;
+    const int min_pu_width     = fc->ps.pps->min_pu_width;
+    const int min_tu_width     = fc->ps.pps->min_tu_width;
+    const int min_cb_log2      = fc->ps.sps->min_cb_log2_size_y;
+    const int min_cb_width     = fc->ps.pps->min_cb_width;
+    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
+                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
+    int boundary_upper;
+    int i, bs, has_horizontal_sb = 0;
+    uint8_t max_len_p, max_len_q;
+    // index of (x0, y0) in the min-CB granular tables: coding block position/size and the msf/iaf flags
+    const int off_q            = (y0 >> min_cb_log2) * min_cb_width + (x0 >> min_cb_log2);
+    const int cb_x             = fc->tab.cb_pos_x[LUMA][off_q];
+    const int cb_y             = fc->tab.cb_pos_y[LUMA][off_q];
+    const int cb_height        = fc->tab.cb_height[LUMA][off_q];
+
+    if (!is_intra) {
+        if (fc->tab.msf[off_q] || fc->tab.iaf[off_q])   // NOTE(review): presumably merge-subblock / inter-affine flags -- confirm
+            has_horizontal_sb = cb_height > 8;
+    }
+
+    boundary_upper = y0 > 0 && !(y0 & 3);   // on the 4-sample grid and not at y == 0
+    if (boundary_upper &&
+        ((!fc->ps.pps->r->pps_loop_filter_across_slices_enabled_flag &&
+            lc->boundary_flags & BOUNDARY_UPPER_SLICE &&
+            (y0 % (1 << fc->ps.sps->ctb_log2_size_y)) == 0) ||
+            (!fc->ps.pps->r->pps_loop_filter_across_tiles_enabled_flag &&
+            lc->boundary_flags & BOUNDARY_UPPER_TILE &&
+            (y0 % (1 << fc->ps.sps->ctb_log2_size_y)) == 0)))
+        boundary_upper = 0;   // CTB-aligned edge flagged as a slice/tile boundary while filtering across it is disabled
+
+    if (boundary_upper) {
+        const RefPicList *rpl_top =
+            (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ? ff_vvc_get_ref_list(fc, fc->ref, x0, y0 - 1) : lc->sc->rpl;   // look up the list at (x0, y0 - 1) when the upper-slice flag is set
+        int yp_pu = (y0 - 1) >> log2_min_pu_size;
+        int yq_pu =  y0      >> log2_min_pu_size;
+        int yp_tu = (y0 - 1) >> log2_min_tu_size;
+        int yq_tu =  y0      >> log2_min_tu_size;
+
+        for (i = 0; i < width; i += 4) {
+            const int off_y = y0 - cb_y;   // zero when this edge is the coding block's own top edge
+            int x_pu = (x0 + i) >> log2_min_pu_size;
+            int x_tu = (x0 + i) >> log2_min_tu_size;
+            MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+            MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+            uint8_t top_cbf_luma  = fc->tab.tu_coded_flag[LUMA][yp_tu * min_tu_width + x_tu];
+            uint8_t curr_cbf_luma = fc->tab.tu_coded_flag[LUMA][yq_tu * min_tu_width + x_tu];
+            const uint8_t pcmf    = fc->tab.pcmf[LUMA][yp_tu * min_tu_width + x_tu] &&
+                fc->tab.pcmf[LUMA][yq_tu * min_tu_width + x_tu];
+
+            if (pcmf)
+                bs = 0;
+            else if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA || curr->ciip_flag || top->ciip_flag)
+                bs = 2;
+            else if (curr_cbf_luma || top_cbf_luma)
+                bs = 1;
+            else if (off_y && ((off_y % 8) || !has_horizontal_sb))
+                bs = 0;                                     //inside a cu, not aligned to 8 or with no subblocks
+            else
+                bs = boundary_strength(lc, curr, top, rpl_top);
+
+            TAB_BS(fc->tab.horizontal_bs[LUMA], x0 + i, y0) = bs;
+
+            derive_max_filter_length_luma(fc, x0 + i, y0, is_intra, has_horizontal_sb, 0, &max_len_p, &max_len_q);
+            TAB_MAX_LEN(fc->tab.horizontal_p, x0 + i, y0) = max_len_p;
+            TAB_MAX_LEN(fc->tab.horizontal_q, x0 + i, y0) = max_len_q;
+        }
+    }
+    // subblock edges inside the block
+    if (!is_intra) {
+        if (fc->tab.msf[off_q] || fc->tab.iaf[off_q])
+            vvc_deblock_subblock_bs_horizontal(lc, cb_x, cb_y, x0, y0, width, height);
+    }
+}
+
+/**
+ * Derive the boundary strength (bs) of the left edge of a chroma transform
+ * block and store it in fc->tab.vertical_bs[] for both chroma planes.
+ *
+ * Nothing is written unless x0 is positive, lies on the chroma deblocking grid
+ * (CHROMA_GRID scaled by the horizontal chroma shift) and is not a CTB-aligned
+ * slice/tile boundary across which loop filtering is disabled.
+ *
+ * For every second row of the edge the strength is:
+ *   0 when both sides are flagged in the pcmf table,
+ *   2 when either side is intra or CIIP predicted,
+ *   1 when either side has a coded residual for the plane or a joint CbCr residual,
+ *   0 otherwise.
+ *
+ * @param lc     local context for the CTU
+ * @param x0     x position of the block, as passed by vvc_deblock_bs()
+ * @param y0     y position of the block, as passed by vvc_deblock_bs()
+ * @param width  block width (unused here, kept for the deblock_bs_fn signature)
+ * @param height block height, as passed by vvc_deblock_bs()
+ */
+static void vvc_deblock_bs_chroma_vertical(const VVCLocalContext *lc,
+    const int x0, const int y0, const int width, const int height)
+{
+    const VVCFrameContext *fc  = lc->fc;
+    const VVCSPS *sps          = fc->ps.sps;
+    const VVCPPS *pps          = fc->ps.pps;
+    const MvField *tab_mvf     = fc->tab.mvf;
+    const int log2_min_pu_size = MIN_PU_LOG2;
+    // the TU tables are strided by min_tu_width, so index them in MIN_TU_LOG2 units as vvc_deblock_bs() does
+    const int log2_min_tu_size = MIN_TU_LOG2;
+    const int min_pu_width     = pps->min_pu_width;
+    const int min_tu_width     = pps->min_tu_width;
+    const int on_ctb_boundary  = (x0 % (1 << sps->ctb_log2_size_y)) == 0;
+    int boundary_left;
+
+    // bs for vertical TU boundaries
+    boundary_left = x0 > 0 && !(x0 & ((CHROMA_GRID << sps->hshift[1]) - 1));
+    if (boundary_left && on_ctb_boundary &&
+        ((!pps->r->pps_loop_filter_across_slices_enabled_flag &&
+          (lc->boundary_flags & BOUNDARY_LEFT_SLICE)) ||
+         (!pps->r->pps_loop_filter_across_tiles_enabled_flag &&
+          (lc->boundary_flags & BOUNDARY_LEFT_TILE))))
+        boundary_left = 0;
+
+    if (!boundary_left)
+        return;
+
+    {
+        // p is the column left of the edge, q the first column of this block
+        const int xp_pu = (x0 - 1) >> log2_min_pu_size;
+        const int xq_pu =  x0      >> log2_min_pu_size;
+        const int xp_tu = (x0 - 1) >> log2_min_tu_size;
+        const int xq_tu =  x0      >> log2_min_tu_size;
+
+        for (int i = 0; i < height; i += 2) {
+            const int y_pu        = (y0 + i) >> log2_min_pu_size;
+            const int y_tu        = (y0 + i) >> log2_min_tu_size;
+            const MvField *left   = &tab_mvf[y_pu * min_pu_width + xp_pu];
+            const MvField *curr   = &tab_mvf[y_pu * min_pu_width + xq_pu];
+            const int left_tu     = y_tu * min_tu_width + xp_tu;
+            const int curr_tu     = y_tu * min_tu_width + xq_tu;
+            const uint8_t pcmf    = fc->tab.pcmf[CHROMA][left_tu] && fc->tab.pcmf[CHROMA][curr_tu];
+            const int intra_like  = curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA ||
+                                    curr->ciip_flag || left->ciip_flag;
+
+            for (int c = CB; c <= CR; c++) {
+                const uint8_t cbf = fc->tab.tu_coded_flag[c][left_tu] |
+                    fc->tab.tu_coded_flag[c][curr_tu] |
+                    fc->tab.tu_joint_cbcr_residual_flag[left_tu] |
+                    fc->tab.tu_joint_cbcr_residual_flag[curr_tu];
+                int bs = 0;
+
+                if (!pcmf) {
+                    if (intra_like)
+                        bs = 2;
+                    else if (cbf)
+                        bs = 1;
+                }
+                TAB_BS(fc->tab.vertical_bs[c], x0, (y0 + i)) = bs;
+            }
+        }
+    }
+}
+
+/**
+ * Derive the boundary strength (bs) of the top edge of a chroma transform
+ * block and store it in fc->tab.horizontal_bs[] for both chroma planes.
+ *
+ * Nothing is written unless y0 is positive, lies on the chroma deblocking grid
+ * (CHROMA_GRID scaled by the vertical chroma shift) and is not a CTB-aligned
+ * slice/tile boundary across which loop filtering is disabled.
+ *
+ * For every second column of the edge the strength is:
+ *   0 when both sides are flagged in the pcmf table,
+ *   2 when either side is intra or CIIP predicted,
+ *   1 when either side has a coded residual for the plane or a joint CbCr residual,
+ *   0 otherwise.
+ *
+ * @param lc     local context for the CTU
+ * @param x0     x position of the block, as passed by vvc_deblock_bs()
+ * @param y0     y position of the block, as passed by vvc_deblock_bs()
+ * @param width  block width, as passed by vvc_deblock_bs()
+ * @param height block height (unused here, kept for the deblock_bs_fn signature)
+ */
+static void vvc_deblock_bs_chroma_horizontal(const VVCLocalContext *lc,
+    const int x0, const int y0, const int width, const int height)
+{
+    const VVCFrameContext *fc  = lc->fc;
+    const VVCSPS *sps          = fc->ps.sps;
+    const VVCPPS *pps          = fc->ps.pps;
+    const MvField *tab_mvf     = fc->tab.mvf;
+    const int log2_min_pu_size = MIN_PU_LOG2;
+    // the TU tables are strided by min_tu_width, so index them in MIN_TU_LOG2 units as vvc_deblock_bs() does
+    const int log2_min_tu_size = MIN_TU_LOG2;
+    const int min_pu_width     = pps->min_pu_width;
+    const int min_tu_width     = pps->min_tu_width;
+    const int on_ctb_boundary  = (y0 % (1 << sps->ctb_log2_size_y)) == 0;
+    int boundary_upper;
+
+    // bs for horizontal TU boundaries
+    boundary_upper = y0 > 0 && !(y0 & ((CHROMA_GRID << sps->vshift[1]) - 1));
+    if (boundary_upper && on_ctb_boundary &&
+        ((!pps->r->pps_loop_filter_across_slices_enabled_flag &&
+          (lc->boundary_flags & BOUNDARY_UPPER_SLICE)) ||
+         (!pps->r->pps_loop_filter_across_tiles_enabled_flag &&
+          (lc->boundary_flags & BOUNDARY_UPPER_TILE))))
+        boundary_upper = 0;
+
+    if (!boundary_upper)
+        return;
+
+    {
+        // p is the row above the edge, q the first row of this block
+        const int yp_pu = (y0 - 1) >> log2_min_pu_size;
+        const int yq_pu =  y0      >> log2_min_pu_size;
+        const int yp_tu = (y0 - 1) >> log2_min_tu_size;
+        const int yq_tu =  y0      >> log2_min_tu_size;
+
+        for (int i = 0; i < width; i += 2) {
+            const int x_pu        = (x0 + i) >> log2_min_pu_size;
+            const int x_tu        = (x0 + i) >> log2_min_tu_size;
+            const MvField *top    = &tab_mvf[yp_pu * min_pu_width + x_pu];
+            const MvField *curr   = &tab_mvf[yq_pu * min_pu_width + x_pu];
+            const int top_tu      = yp_tu * min_tu_width + x_tu;
+            const int curr_tu     = yq_tu * min_tu_width + x_tu;
+            const uint8_t pcmf    = fc->tab.pcmf[CHROMA][top_tu] && fc->tab.pcmf[CHROMA][curr_tu];
+            const int intra_like  = curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA ||
+                                    curr->ciip_flag || top->ciip_flag;
+
+            for (int c = CB; c <= CR; c++) {
+                const uint8_t cbf = fc->tab.tu_coded_flag[c][top_tu] |
+                    fc->tab.tu_coded_flag[c][curr_tu] |
+                    fc->tab.tu_joint_cbcr_residual_flag[top_tu] |
+                    fc->tab.tu_joint_cbcr_residual_flag[curr_tu];
+                int bs = 0;
+
+                if (!pcmf) {
+                    if (intra_like)
+                        bs = 2;
+                    else if (cbf)
+                        bs = 1;
+                }
+                TAB_BS(fc->tab.horizontal_bs[c], x0 + i, y0) = bs;
+            }
+        }
+    }
+}
+
+// common signature of the per-plane, per-direction boundary strength helpers
+typedef void (*deblock_bs_fn)(const VVCLocalContext *lc,
+    const int x0, const int y0, const int width, const int height);
+
+/**
+ * Compute the boundary strengths for every transform block that starts inside
+ * the CTU at (x0, y0), clipped to the picture size.
+ *
+ * The CTU is scanned in MIN_TU_LOG2 units; a position is handed to the
+ * direction/plane specific helper only when the tb_pos_x0/tb_pos_y0 tables say
+ * a transform block begins exactly there. Chroma is visited only when the
+ * stream has chroma planes, matching the planes the deblock entry points filter.
+ *
+ * @param lc       local context for the CTU
+ * @param x0       x position of the CTU
+ * @param y0       y position of the CTU
+ * @param vertical 1 to derive strengths for vertical edges, 0 for horizontal edges
+ */
+static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0, const int vertical)
+{
+    const VVCFrameContext *fc = lc->fc;
+    const VVCSPS *sps    = fc->ps.sps;
+    const VVCPPS *pps    = fc->ps.pps;
+    const int ctb_size   = sps->ctb_size_y;
+    const int x_end      = FFMIN(x0 + ctb_size, pps->width) >> MIN_TU_LOG2;
+    const int y_end      = FFMIN(y0 + ctb_size, pps->height) >> MIN_TU_LOG2;
+    // sps_chroma_format_idc == 0 means luma only, so there are no chroma edges to rate
+    const int has_chroma = !!sps->r->sps_chroma_format_idc;
+    // indexed as [vertical][is_chroma]
+    static const deblock_bs_fn deblock_bs[2][2] = {
+        { vvc_deblock_bs_luma_horizontal, vvc_deblock_bs_chroma_horizontal },
+        { vvc_deblock_bs_luma_vertical,   vvc_deblock_bs_chroma_vertical   }
+    };
+
+    for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) {
+        const int hs = sps->hshift[is_chroma];
+        const int vs = sps->vshift[is_chroma];
+        for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) {
+            for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) {
+                const int off = y * pps->min_tu_width + x;
+                // only the top-left unit of a transform block triggers the derivation
+                if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) {
+                    deblock_bs[vertical][is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2,
+                        fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs);
+                }
+            }
+        }
+    }
+}
+
+//part of 8.8.3.3 Derivation process of transform block boundary
+//reads back the luma filter lengths stored for the edge at (qx, qy)
+static void max_filter_length_luma(const VVCFrameContext *fc, const int qx, const int qy,
+                                   const int vertical, uint8_t *max_len_p, uint8_t *max_len_q)
+{
+    const uint8_t *len_p;
+    const uint8_t *len_q;
+
+    if (vertical) {
+        len_p = fc->tab.vertical_p;
+        len_q = fc->tab.vertical_q;
+    } else {
+        len_p = fc->tab.horizontal_p;
+        len_q = fc->tab.horizontal_q;
+    }
+
+    *max_len_p = TAB_MAX_LEN(len_p, qx, qy);
+    *max_len_q = TAB_MAX_LEN(len_q, qx, qy);
+}
+
+//part of 8.8.3.3 Derivation process of transform block boundary
+//chroma filter lengths depend on the transform block size on each side of the edge:
+//the width for vertical edges, the height for horizontal ones
+static void max_filter_length_chroma(const VVCFrameContext *fc, const int qx, const int qy,
+                                     const int vertical, const int horizontal_ctu_edge, const int bs, uint8_t *max_len_p, uint8_t *max_len_q)
+{
+    const int min_tu_width = fc->ps.pps->min_tu_width;
+    const uint8_t *tb_size;
+    int px = qx;
+    int py = qy;
+    int size_p, size_q;
+
+    // the p side is the sample just left of (vertical) or above (horizontal) q
+    if (vertical) {
+        px--;
+        tb_size = fc->tab.tb_width[CHROMA];
+    } else {
+        py--;
+        tb_size = fc->tab.tb_height[CHROMA];
+    }
+
+    size_p = tb_size[(py >> MIN_TU_LOG2) * min_tu_width + (px >> MIN_TU_LOG2)];
+    size_q = tb_size[(qy >> MIN_TU_LOG2) * min_tu_width + (qx >> MIN_TU_LOG2)];
+
+    if (size_p < 8 || size_q < 8) {
+        //part of 8.8.3.6.4 Decision process for chroma block edges
+        *max_len_p = *max_len_q = (bs == 2);
+        return;
+    }
+
+    // both blocks are at least 8 wide/high; the p side is cut to 1 on a horizontal CTU edge
+    *max_len_q = 3;
+    *max_len_p = horizontal_ctu_edge ? 1 : 3;
+}
+
+//selects the luma (c_idx == 0) or chroma derivation of the maximum filter lengths
+static void max_filter_length(const VVCFrameContext *fc, const int qx, const int qy,
+    const int c_idx, const int vertical, const int horizontal_ctu_edge, const int bs, uint8_t *max_len_p, uint8_t *max_len_q)
+{
+    if (c_idx) {
+        max_filter_length_chroma(fc, qx, qy, vertical, horizontal_ctu_edge, bs, max_len_p, max_len_q);
+        return;
+    }
+    max_filter_length_luma(fc, qx, qy, vertical, max_len_p, max_len_q);
+}
+
+// tc table lookup: the index is qp, raised by DEFAULT_INTRA_TC_OFFSET per step of bs above 1,
+// plus tc_offset with its lowest bit cleared, clipped to the table range.
+// Relies on `tctable` and a variable named `tc_offset` being visible where it is expanded.
+#define TC_CALC(qp, bs)                                                     \
+    tctable[av_clip((qp) + DEFAULT_INTRA_TC_OFFSET * ((bs) - 1) +           \
+                    (tc_offset & -2),                                       \
+                    0,                                                      \
+                    MAX_QP + DEFAULT_INTRA_TC_OFFSET)]
+
+// part of 8.8.3.6.2 Decision process for luma block edges
+// Rounded average of the luma QP on both sides of the edge; when LADF is enabled
+// an offset picked from the luma level measured at src is added.
+static int get_qp_y(const VVCFrameContext *fc, const uint8_t *src, const int x, const int y, const int vertical)
+{
+    const VVCSPS *sps   = fc->ps.sps;
+    // the p side is one sample to the left (vertical edge) or above (horizontal edge)
+    const int qp        = (ff_vvc_get_qPy(fc, x - vertical, y - !vertical) + ff_vvc_get_qPy(fc, x, y) + 1) >> 1;
+
+    if (sps->r->sps_ladf_enabled_flag) {
+        const int level = fc->vvcdsp.lf.ladf_level[vertical](src, fc->frame->linesize[LUMA]);
+        int qp_offset   = sps->r->sps_ladf_lowest_interval_qp_offset;
+        int i           = 0;
+
+        // keep the offset of the last interval whose lower bound lies below the level
+        while (i < sps->num_ladf_intervals - 1 && level > sps->ladf_interval_lower_bound[i + 1]) {
+            qp_offset = sps->r->sps_ladf_qp_offset[i];
+            i++;
+        }
+        return qp + qp_offset;
+    }
+
+    return qp;
+}
+
+// part of 8.8.3.6.4 Decision process for chroma block edges
+// Rounded average of the chroma QP on both sides of the edge, with qp_bd_offset removed from each side.
+static int get_qp_c(const VVCFrameContext *fc, const int x, const int y, const int c_idx, const int vertical)
+{
+    const int qp_bd_offset = fc->ps.sps->qp_bd_offset;
+    const int sum          = get_qPc(fc, x - vertical, y - !vertical, c_idx) + get_qPc(fc, x, y, c_idx);
+
+    return (sum - 2 * qp_bd_offset + 1) >> 1;
+}
+
+// edge QP for plane c_idx: luma (c_idx == 0) may also inspect the samples at src
+static int get_qp(const VVCFrameContext *fc, const uint8_t *src, const int x, const int y, const int c_idx, const int vertical)
+{
+    return c_idx ? get_qp_c(fc, x, y, c_idx, vertical)
+                 : get_qp_y(fc, src, x, y, vertical);
+}
+
+/*
+ * Deblock the vertical edges of the CTU at (x0, y0).
+ *
+ * Boundary strengths are derived first; then every plane is walked on its
+ * deblocking grid. For each edge position, strength, beta, tc and maximum
+ * filter lengths are gathered per 4-sample segment, and the DSP filter runs
+ * only when at least one segment has a non-zero strength.
+ */
+void ff_vvc_deblock_vertical(const VVCLocalContext *lc, int x0, int y0)
+{
+    VVCFrameContext *fc     = lc->fc;
+    const VVCSPS *sps       = fc->ps.sps;
+    const int log2_ctb      = sps->ctb_log2_size_y;
+    const int ctb_size      = 1 << log2_ctb;
+    const int rs            = (x0 >> log2_ctb) + (y0 >> log2_ctb) * fc->ps.pps->ctb_width;
+    const DBParams *params  = fc->tab.deblock + rs;
+    const int planes        = sps->r->sps_chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1;
+    const int x_end         = FFMIN(x0 + ctb_size, sps->width);
+    const int y_end         = FFMIN(y0 + ctb_size, sps->height);
+
+    // not used yet, may be needed by plt.
+    const uint8_t no_p[4]   = { 0 };
+    const uint8_t no_q[4]   = { 0 };
+
+    vvc_deblock_bs(lc, x0, y0, 1);
+
+    for (int c_idx = 0; c_idx < planes; c_idx++) {
+        const int hs            = sps->hshift[c_idx];
+        const int vs            = sps->vshift[c_idx];
+        const int grid          = c_idx ? (CHROMA_GRID << hs) : LUMA_GRID;
+        const int segments      = DEBLOCK_STEP >> (2 - vs);
+        const int tc_offset     = params->tc_offset[c_idx];     // consumed by TC_CALC()
+        const int beta_offset   = params->beta_offset[c_idx];
+        const int stride        = fc->frame->linesize[c_idx];
+        uint8_t *plane          = fc->frame->data[c_idx];
+
+        for (int y = y0; y < y_end; y += (DEBLOCK_STEP << vs)) {
+            // x == 0 is the picture border, so the first CTU column starts one grid step in
+            for (int x = x0 ? x0 : grid; x < x_end; x += grid) {
+                const int x_bytes = (x >> hs) << sps->pixel_shift;
+                int32_t bs[4], beta[4], tc[4];
+                uint8_t max_len_p[4], max_len_q[4];
+                int any_bs = 0;
+
+                for (int i = 0; i < segments; i++) {
+                    const int ys = y + (i << 2);
+                    int qp;
+
+                    bs[i] = (ys < y_end) ? TAB_BS(fc->tab.vertical_bs[c_idx], x, ys) : 0;
+                    if (!bs[i]) {
+                        tc[i] = 0;
+                        continue;
+                    }
+
+                    qp = get_qp(fc, &plane[(ys >> vs) * stride + x_bytes], x, ys, c_idx, 1);
+                    beta[i] = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
+                    max_filter_length(fc, x, ys, c_idx, 1, 0, bs[i], &max_len_p[i], &max_len_q[i]);
+                    tc[i] = TC_CALC(qp, bs[i]);
+                    any_bs = 1;
+                }
+
+                if (!any_bs)
+                    continue;
+
+                if (c_idx) {
+                    fc->vvcdsp.lf.filter_chroma[1](&plane[(y >> vs) * stride + x_bytes], stride,
+                        beta, tc, no_p, no_q, max_len_p, max_len_q, vs);
+                } else {
+                    fc->vvcdsp.lf.filter_luma[1](&plane[(y >> vs) * stride + x_bytes], stride,
+                        beta, tc, no_p, no_q, max_len_p, max_len_q, 0);
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Deblock the horizontal edges of the CTU at (x0, y0).
+ *
+ * Boundary strengths are derived first; then every plane is walked on its
+ * deblocking grid, skipping the picture's top row. For each edge position,
+ * strength, beta, tc and maximum filter lengths are gathered per 4-sample
+ * segment, and the DSP filter runs only when at least one segment has a
+ * non-zero strength.
+ */
+void ff_vvc_deblock_horizontal(const VVCLocalContext *lc, int x0, int y0)
+{
+    VVCFrameContext *fc     = lc->fc;
+    const VVCSPS *sps       = fc->ps.sps;
+    const int log2_ctb      = sps->ctb_log2_size_y;
+    const int ctb_size      = 1 << log2_ctb;
+    const int rs            = (x0 >> log2_ctb) + (y0 >> log2_ctb) * fc->ps.pps->ctb_width;
+    const int planes        = sps->r->sps_chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1;
+    const int x_end         = FFMIN(x0 + ctb_size, sps->width);
+    const int y_end         = FFMIN(y0 + ctb_size, sps->height);
+
+    // not used yet, may be needed by plt.
+    const uint8_t no_p[4]   = { 0 };
+    const uint8_t no_q[4]   = { 0 };
+
+    vvc_deblock_bs(lc, x0, y0, 0);
+
+    for (int c_idx = 0; c_idx < planes; c_idx++) {
+        const int hs            = sps->hshift[c_idx];
+        const int vs            = sps->vshift[c_idx];
+        const int grid          = c_idx ? (CHROMA_GRID << vs) : LUMA_GRID;
+        const int segments      = DEBLOCK_STEP >> (2 - hs);
+        const int stride        = fc->frame->linesize[c_idx];
+        uint8_t *plane          = fc->frame->data[c_idx];
+
+        for (int y = y0; y < y_end; y += grid) {
+            const uint8_t horizontal_ctu_edge = !(y % sps->ctb_size_y);
+
+            // y == 0 is the picture border, nothing to filter there
+            if (!y)
+                continue;
+
+            for (int x = x0; x < x_end; x += (DEBLOCK_STEP << hs)) {
+                int32_t bs[4], beta[4], tc[4];
+                uint8_t max_len_p[4], max_len_q[4];
+                int any_bs = 0;
+
+                for (int i = 0; i < segments; i++) {
+                    const int xs = x + (i << 2);
+                    // xs never drops below x0 with the loop bounds above, so this is the current CTB's entry
+                    const DBParams *params = fc->tab.deblock + rs - (xs < x0);
+                    const int beta_offset  = params->beta_offset[c_idx];
+                    const int tc_offset    = params->tc_offset[c_idx];      // consumed by TC_CALC()
+                    int qp;
+
+                    bs[i] = (xs < x_end) ? TAB_BS(fc->tab.horizontal_bs[c_idx], xs, y) : 0;
+                    if (!bs[i]) {
+                        tc[i] = 0;
+                        continue;
+                    }
+
+                    qp = get_qp(fc, &plane[(y >> vs) * stride + ((xs >> hs) << sps->pixel_shift)], xs, y, c_idx, 0);
+                    beta[i] = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
+                    max_filter_length(fc, xs, y, c_idx, 0, horizontal_ctu_edge, bs[i], &max_len_p[i], &max_len_q[i]);
+                    tc[i] = TC_CALC(qp, bs[i]);
+                    any_bs = 1;
+                }
+
+                if (!any_bs)
+                    continue;
+
+                if (c_idx) {
+                    fc->vvcdsp.lf.filter_chroma[0](&plane[(y >> vs) * stride + ((x >> hs) << sps->pixel_shift)], stride,
+                        beta, tc, no_p, no_q, max_len_p, max_len_q, hs);
+                } else {
+                    fc->vvcdsp.lf.filter_luma[0](&plane[(y >> vs) * stride + ((x >> hs) << sps->pixel_shift)], stride,
+                        beta, tc, no_p, no_q, max_len_p, max_len_q, horizontal_ctu_edge);
+                }
+            }
+        }
+    }
+}
+
+// copy a width x height block of pixels (1 << pixel_shift bytes each) between two buffers with independent strides
+static void alf_copy_border(uint8_t *dst, const uint8_t *src,
+    const int pixel_shift, int width, const int height, const ptrdiff_t dst_stride, const ptrdiff_t src_stride)
+{
+    const int row_bytes = width << pixel_shift;
+
+    for (int row = 0; row < height; row++, dst += dst_stride, src += src_stride)
+        memcpy(dst, src, row_bytes);
+}
+
+// for each of height rows, fill width pixels of the destination with the first pixel of the
+// matching source row; source and destination share the same stride (given in bytes)
+static void alf_extend_vert(uint8_t *_dst, const uint8_t *_src,
+    const int pixel_shift, const int width, const int height, ptrdiff_t stride)
+{
+    if (pixel_shift) {
+        // 16-bit samples: step in elements rather than bytes
+        const ptrdiff_t step    = stride >> pixel_shift;
+        const uint16_t *src     = (const uint16_t *)_src;
+        uint16_t *dst           = (uint16_t *)_dst;
+
+        for (int row = 0; row < height; row++, src += step, dst += step) {
+            for (int col = 0; col < width; col++)
+                dst[col] = *src;
+        }
+        return;
+    }
+
+    for (int row = 0; row < height; row++, _src += stride, _dst += stride)
+        memset(_dst, *_src, width);
+}
+
+// replicate one source row of width pixels into height consecutive destination rows
+static void alf_extend_horz(uint8_t *dst, const uint8_t *src,
+    const int pixel_shift, int width, const int height, const ptrdiff_t stride)
+{
+    const int row_bytes = width << pixel_shift;
+    uint8_t *row        = dst;
+
+    for (int n = height; n > 0; n--) {
+        memcpy(row, src, row_bytes);
+        row += stride;
+    }
+}
+
+// Save the outermost border_pixels rows and columns of a CTB plane into the frame-level
+// ALF line buffers: alf_pixel_buffer_h[c_idx][0/1] receive the top/bottom rows and
+// alf_pixel_buffer_v[c_idx][0/1] the left/right columns.
+static void alf_copy_ctb_to_hv(VVCFrameContext *fc, const uint8_t *src, const ptrdiff_t src_stride,
+    const int x, const int y, const int width, const int height, const int x_ctb, const int y_ctb, const int c_idx)
+{
+    const VVCSPS *sps       = fc->ps.sps;
+    const int ps            = sps->pixel_shift;
+    const int w             = sps->width  >> sps->hshift[c_idx];
+    const int h             = sps->height >> sps->vshift[c_idx];
+    const int border_pixels = (c_idx == 0) ? ALF_BORDER_LUMA : ALF_BORDER_CHROMA;
+    // the horizontal buffers keep border_pixels picture-wide rows per CTB row
+    const int h_offset      = (border_pixels * y_ctb * w + x) << ps;
+    // the vertical buffers keep one picture-high strip, border_pixels wide, per CTB column
+    const int v_offset      = (h * x_ctb + y) * (border_pixels << ps);
+
+    /* copy horizontal edges */
+    for (int i = 0; i < 2; i++) {
+        const int first_row = i ? height - border_pixels : 0;
+
+        alf_copy_border(fc->tab.alf_pixel_buffer_h[c_idx][i] + h_offset,
+            src + first_row * src_stride, ps, width, border_pixels, w << ps, src_stride);
+    }
+    /* copy vertical edges */
+    for (int i = 0; i < 2; i++) {
+        const int first_col = i ? width - border_pixels : 0;
+
+        alf_copy_border(fc->tab.alf_pixel_buffer_v[c_idx][i] + v_offset,
+            src + (first_col << ps), ps, border_pixels, height, border_pixels << ps, src_stride);
+    }
+}
+
+// fill border_pixels rows above or below the CTB: replicate the row at `border` when `edge`
+// is set, otherwise copy the rows found at `src`
+static void alf_fill_border_h(uint8_t *dst, const ptrdiff_t dst_stride, const uint8_t *src, const ptrdiff_t src_stride,
+    const uint8_t *border, const int width, const int border_pixels, const int ps, const int edge)
+{
+    if (!edge) {
+        alf_copy_border(dst, src, ps, width, border_pixels, dst_stride, src_stride);
+        return;
+    }
+    alf_extend_horz(dst, border, ps, width, border_pixels, dst_stride);
+}
+
+// Fill the border_pixels wide strip left or right of the CTB, corners included.
+// dst points at the strip's topmost row (border_pixels rows above the CTB).
+// When `edge` is set, each row replicates the pixel found at `border`; otherwise the strip is
+// copied from `src` (rows of border_pixels pixels) and the corner rows are replicated from the
+// nearest copied row wherever edges[TOP] / edges[BOTTOM] is set.
+static void alf_fill_border_v(uint8_t *dst, const ptrdiff_t dst_stride, const uint8_t *src,
+    const uint8_t *border, const int border_pixels, const int height, const int pixel_shift, const int *edges, const int edge)
+{
+    const ptrdiff_t src_stride  = (border_pixels << pixel_shift);
+    // rows skipped at the top when the top corner cannot be copied
+    const int skipped_rows      = border_pixels * edges[TOP];
+    // the CTB rows plus each corner that is available for copying
+    const int copied_rows       = height + (!edges[TOP] + !edges[BOTTOM]) * border_pixels;
+
+    if (edge) {
+        alf_extend_vert(dst, border, pixel_shift, border_pixels, height + 2 * border_pixels, dst_stride);
+        return;
+    }
+
+    //left/right
+    alf_copy_border(dst + dst_stride * skipped_rows, src + src_stride * skipped_rows,
+        pixel_shift, border_pixels, copied_rows, dst_stride, src_stride);
+
+    //top left/right
+    if (edges[TOP])
+        alf_extend_horz(dst, dst + dst_stride * border_pixels, pixel_shift, border_pixels, border_pixels, dst_stride);
+
+    //bottom left/right
+    if (edges[BOTTOM]) {
+        uint8_t *bottom = dst + dst_stride * (border_pixels + height);
+
+        alf_extend_horz(bottom, bottom - dst_stride, pixel_shift, border_pixels, border_pixels, dst_stride);
+    }
+}
+
+// Build the padded ALF input for one CTB plane: copy the CTB to _dst, then surround it with
+// border_pixels of context on every side. Context comes from the frame-level line buffers
+// (alf_pixel_buffer_h / alf_pixel_buffer_v) of the neighbouring CTB row/column, or is
+// replicated from the CTB's own outermost samples when the matching edges[] entry is set.
+static void alf_prepare_buffer(VVCFrameContext *fc, uint8_t *_dst, const uint8_t *_src, const int x, const int y,
+    const int x_ctb, const int y_ctb, const int width, const int height, const ptrdiff_t dst_stride, const ptrdiff_t src_stride,
+    const int c_idx, const int *edges)
+{
+    const VVCSPS *sps       = fc->ps.sps;
+    const int ps            = sps->pixel_shift;
+    const int w             = sps->width  >> sps->hshift[c_idx];
+    const int h             = sps->height >> sps->vshift[c_idx];
+    const int border_pixels = c_idx == 0 ? ALF_BORDER_LUMA : ALF_BORDER_CHROMA;
+    const int border_bytes  = border_pixels << ps;
+    // first padded row, border_pixels rows above the CTB
+    uint8_t *const above    = _dst - border_pixels * dst_stride;
+    const uint8_t *saved;
+    uint8_t *dst;
+
+    copy_ctb(_dst, _src, width << ps, height, dst_stride, src_stride);
+
+    //top: bottom rows saved for the CTB row above
+    saved = fc->tab.alf_pixel_buffer_h[c_idx][1] + (((border_pixels * (y_ctb - 1)) * w + x) << ps);
+    alf_fill_border_h(above, dst_stride, saved, w  << ps, _dst, width, border_pixels, ps, edges[TOP]);
+
+    //bottom: top rows saved for the CTB row below
+    saved = fc->tab.alf_pixel_buffer_h[c_idx][0] + ((border_pixels * (y_ctb + 1) * w + x) << ps);
+    alf_fill_border_h(_dst + height * dst_stride, dst_stride, saved, w  << ps,
+        _dst + (height - 1) * dst_stride, width, border_pixels, ps, edges[BOTTOM]);
+
+    //left: right columns saved for the CTB column to the left, starting border_pixels rows above
+    saved = fc->tab.alf_pixel_buffer_v[c_idx][1] + (h * (x_ctb - 1) + y - border_pixels) * border_bytes;
+    dst   = above - border_bytes;
+    alf_fill_border_v(dst, dst_stride, saved,  dst + border_bytes, border_pixels, height, ps, edges, edges[LEFT]);
+
+    //right: left columns saved for the CTB column to the right, starting border_pixels rows above
+    saved = fc->tab.alf_pixel_buffer_v[c_idx][0] + (h * (x_ctb + 1) + y - border_pixels) * border_bytes;
+    dst   = above + (width << ps);
+    alf_fill_border_v(dst, dst_stride, saved,  dst - (1 << ps), border_pixels, height, ps, edges, edges[RIGHT]);
+}
+
+// number of ALF_BLOCK_SIZE x ALF_BLOCK_SIZE blocks in a MAX_CTU_SIZE x MAX_CTU_SIZE CTU
+#define ALF_MAX_BLOCKS_IN_CTU \
+    (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE)
+// number of entries needed for ALF_NUM_COEFF_LUMA values per block of such a CTU
+#define ALF_MAX_FILTER_SIZE \
+    (ALF_MAX_BLOCKS_IN_CTU * ALF_NUM_COEFF_LUMA)
+
+/**
+ * Produce the per-block luma ALF coefficients and clipping values for a CTB.
+ *
+ * The filter set comes from alf->ctb_filt_set_idx_y: values below 16 select the
+ * fixed coefficient table with all-zero clip indices, larger values select the
+ * APS referenced by sh_alf_aps_id_luma[ctb_filt_set_idx_y - 16]. The blocks are
+ * then classified and the chosen set is expanded into coeff/clip.
+ *
+ * @param lc         local context for the CTU
+ * @param coeff      output coefficients
+ * @param clip       output clipping values
+ * @param src        input samples handed to the classifier
+ * @param src_stride stride of src
+ * @param width      CTB width
+ * @param height     CTB height
+ * @param vb_pos     virtual boundary position handed to the classifier
+ * @param alf        ALF parameters of the CTB
+ */
+static void alf_get_coeff_and_clip(VVCLocalContext *lc, int16_t *coeff, int16_t *clip,
+    const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, ALFParams *alf)
+{
+    // clip indices paired with the fixed filter sets: all zero and never written,
+    // so a single static copy replaces zeroing a stack table on every call
+    static const uint8_t fixed_clip_set[ALF_NUM_FILTERS_LUMA][ALF_NUM_COEFF_LUMA] = { 0 };
+    const VVCFrameContext *fc       = lc->fc;
+    const H266RawSliceHeader *rsh   = lc->sc->sh.r;
+    const int16_t *coeff_set;
+    const uint8_t *clip_idx_set;
+    const uint8_t *class_to_filt;
+    const int size = width * height / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE;
+    int class_idx[ALF_MAX_BLOCKS_IN_CTU];
+    int transpose_idx[ALF_MAX_BLOCKS_IN_CTU];
+
+    if (alf->ctb_filt_set_idx_y < 16) {
+        coeff_set           = &ff_vvc_alf_fix_filt_coeff[0][0];
+        clip_idx_set        = &fixed_clip_set[0][0];
+        class_to_filt       = ff_vvc_alf_class_to_filt_map[alf->ctb_filt_set_idx_y];
+    } else {
+        const int id        = rsh->sh_alf_aps_id_luma[alf->ctb_filt_set_idx_y - 16];
+        // NOTE(review): the APS is dereferenced without a NULL check, unlike alf_filter_cc() - confirm it is validated earlier
+        const VVCALF *aps   = fc->ps.alf_list[id];
+        coeff_set           = &aps->luma_coeff[0][0];
+        clip_idx_set        = &aps->luma_clip_idx[0][0];
+        class_to_filt       = ff_vvc_alf_aps_class_to_filt_map;
+    }
+    fc->vvcdsp.alf.classify(class_idx, transpose_idx, src, src_stride, width, height,
+        vb_pos, lc->alf_gradient_tmp);
+    fc->vvcdsp.alf.recon_coeff_and_clip(coeff, clip, class_idx, transpose_idx, size,
+        coeff_set, clip_idx_set, class_to_filt);
+}
+
+/**
+ * Apply the luma ALF to one CTB.
+ *
+ * Coefficients and clipping values are built in lc->tmp and lc->tmp1, then
+ * the luma DSP filter is run from src into dst.
+ *
+ * @param lc         local context for the CTU
+ * @param dst        output samples
+ * @param src        input samples
+ * @param dst_stride stride of dst
+ * @param src_stride stride of src
+ * @param x0         x position of the CTB (unused here)
+ * @param y0         y position of the CTB, subtracted from _vb_pos
+ * @param width      CTB width
+ * @param height     CTB height
+ * @param _vb_pos    virtual boundary position in the same coordinates as y0
+ * @param alf        ALF parameters of the CTB
+ */
+static void alf_filter_luma(VVCLocalContext *lc, uint8_t *dst, const uint8_t *src,
+    const ptrdiff_t dst_stride, const ptrdiff_t src_stride, const int x0, const int y0,
+    const int width, const int height, const int _vb_pos, ALFParams *alf)
+{
+    const VVCFrameContext *fc   = lc->fc;
+    int vb_pos                  = _vb_pos - y0;
+    int16_t *coeff              = (int16_t *)lc->tmp;
+    int16_t *clip               = (int16_t *)lc->tmp1;
+
+    // both scratch buffers hold ALF_MAX_FILTER_SIZE int16_t entries, so compare byte sizes
+    av_assert0(ALF_MAX_FILTER_SIZE * sizeof(*coeff) <= sizeof(lc->tmp));
+    av_assert0(ALF_MAX_FILTER_SIZE * sizeof(*clip)  <= sizeof(lc->tmp1));
+
+    alf_get_coeff_and_clip(lc, coeff, clip, src, src_stride, width, height, vb_pos, alf);
+    fc->vvcdsp.alf.filter[LUMA](dst, dst_stride, src, src_stride, width, height, coeff, clip, vb_pos);
+}
+
+// map a clip index (0..3) to a clipping value: 1 << (bit_depth - {0, 3, 5, 7}[idx])
+static int alf_clip_from_idx(const VVCFrameContext *fc, const int idx)
+{
+    static const int bit_reduction[] = { 0, 3, 5, 7 };
+    const int bit_depth = fc->ps.sps->bit_depth;
+
+    return 1 << (bit_depth - bit_reduction[idx]);
+}
+
+// Apply the chroma ALF to one CTB plane using the APS named by sh_alf_aps_id_chroma and the
+// alternative filter chosen for this CTB; clip indices are turned into values first.
+static void alf_filter_chroma(VVCLocalContext *lc, uint8_t *dst, const uint8_t *src,
+    const ptrdiff_t dst_stride, const ptrdiff_t src_stride, const int c_idx,
+    const int width, const int height, const int vb_pos, ALFParams *alf)
+{
+    VVCFrameContext *fc     = lc->fc;
+    const VVCALF *aps       = fc->ps.alf_list[lc->sc->sh.r->sh_alf_aps_id_chroma];
+    const int alt           = alf->alf_ctb_filter_alt_idx[c_idx - 1];
+    int16_t clip[ALF_NUM_COEFF_CHROMA];
+    int i = 0;
+
+    while (i < ALF_NUM_COEFF_CHROMA) {
+        clip[i] = alf_clip_from_idx(fc, aps->chroma_clip_idx[alt][i]);
+        i++;
+    }
+
+    fc->vvcdsp.alf.filter[CHROMA](dst, dst_stride, src, src_stride, width, height,
+        aps->chroma_coeff[alt], clip, vb_pos);
+}
+
+// Apply the cross-component ALF to one chroma CTB plane, reading the luma samples at `luma`.
+// The APS id comes from the slice header (Cb or Cr variant); nothing is done when that APS is absent.
+static void alf_filter_cc(VVCLocalContext *lc, uint8_t *dst, const uint8_t *luma,
+    const ptrdiff_t dst_stride, const ptrdiff_t luma_stride, const int c_idx,
+    const int width, const int height, const int hs, const int vs, const int vb_pos, ALFParams *alf)
+{
+    VVCFrameContext *fc             = lc->fc;
+    const H266RawSliceHeader *rsh   = lc->sc->sh.r;
+    const int plane                 = c_idx - 1;
+    const VVCALF *aps;
+    const int16_t *coeff;
+
+    if (c_idx == CB)
+        aps = fc->ps.alf_list[rsh->sh_alf_cc_cb_aps_id];
+    else
+        aps = fc->ps.alf_list[rsh->sh_alf_cc_cr_aps_id];
+
+    if (!aps)
+        return;
+
+    // ctb_cc_idc is 1-based: the caller only gets here with a non-zero value
+    coeff = aps->cc_coeff[plane][alf->ctb_cc_idc[plane] - 1];
+    fc->vvcdsp.alf.filter_cc(dst, dst_stride, luma, luma_stride, width, height, hs, vs, coeff, vb_pos);
+}
+
+// save the border rows and columns of every plane of the CTU at (x0, y0) into the ALF line buffers
+void ff_vvc_alf_copy_ctu_to_hv(VVCLocalContext* lc, const int x0, const int y0)
+{
+    VVCFrameContext *fc     = lc->fc;
+    const VVCSPS *sps       = fc->ps.sps;
+    const int ctb_size_y    = sps->ctb_size_y;
+    const int ps            = sps->pixel_shift;
+    const int x_ctb         = x0 >> sps->ctb_log2_size_y;
+    const int y_ctb         = y0 >> sps->ctb_log2_size_y;
+    const int planes        = sps->r->sps_chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1;
+    // CTU size in luma samples, clipped to the picture
+    const int luma_width    = FFMIN(sps->width - x0, ctb_size_y);
+    const int luma_height   = FFMIN(sps->height - y0, ctb_size_y);
+
+    for (int c_idx = 0; c_idx < planes; c_idx++) {
+        const int hs            = sps->hshift[c_idx];
+        const int vs            = sps->vshift[c_idx];
+        const int x             = x0 >> hs;
+        const int y             = y0 >> vs;
+        const int src_stride    = fc->frame->linesize[c_idx];
+        uint8_t *src            = &fc->frame->data[c_idx][y * src_stride + (x << ps)];
+
+        alf_copy_ctb_to_hv(fc, src, src_stride, x, y, luma_width >> hs, luma_height >> vs, x_ctb, y_ctb, c_idx);
+    }
+}
+
+/*
+ * Run the adaptive loop filter on the CTU at (x0, y0).
+ *
+ * edges[] marks the sides where neighbouring samples must not be used: picture
+ * borders, plus tile and slice boundaries when filtering across them is disabled.
+ * For every plane the padded input is prepared when needed, then the plane's own
+ * ALF and (for chroma) the cross-component ALF are applied in place on the frame.
+ */
+void ff_vvc_alf_filter(VVCLocalContext *lc, const int x0, const int y0)
+{
+    VVCFrameContext *fc     = lc->fc;
+    const VVCSPS *sps       = fc->ps.sps;
+    const VVCPPS *pps       = fc->ps.pps;
+    const int x_ctb         = x0 >> sps->ctb_log2_size_y;
+    const int y_ctb         = y0 >> sps->ctb_log2_size_y;
+    const int ctb_size_y    = sps->ctb_size_y;
+    const int ps            = sps->pixel_shift;
+    const int padded_stride = EDGE_EMU_BUFFER_STRIDE << ps;
+    const int padded_offset = padded_stride * ALF_PADDING_SIZE + (ALF_PADDING_SIZE << ps);
+    const int planes        = sps->r->sps_chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1;
+    ALFParams *alf          = &CTB(fc->tab.alf, x_ctb, y_ctb);
+    int edges[MAX_EDGES]    = { x_ctb == 0, y_ctb == 0, x_ctb == pps->ctb_width - 1, y_ctb == pps->ctb_height - 1 };
+
+    // each neighbour lookup below is skipped once the side is already marked (e.g. at the picture border)
+    if (!pps->r->pps_loop_filter_across_tiles_enabled_flag) {
+        if (!edges[LEFT] && (lc->boundary_flags & BOUNDARY_LEFT_TILE))
+            edges[LEFT] = 1;
+        if (!edges[TOP] && (lc->boundary_flags & BOUNDARY_UPPER_TILE))
+            edges[TOP] = 1;
+        if (!edges[RIGHT] && pps->ctb_to_col_bd[x_ctb] != pps->ctb_to_col_bd[x_ctb + 1])
+            edges[RIGHT] = 1;
+        if (!edges[BOTTOM] && pps->ctb_to_row_bd[y_ctb] != pps->ctb_to_row_bd[y_ctb + 1])
+            edges[BOTTOM] = 1;
+    }
+
+    if (!pps->r->pps_loop_filter_across_slices_enabled_flag) {
+        if (!edges[LEFT] && (lc->boundary_flags & BOUNDARY_LEFT_SLICE))
+            edges[LEFT] = 1;
+        if (!edges[TOP] && (lc->boundary_flags & BOUNDARY_UPPER_SLICE))
+            edges[TOP] = 1;
+        if (!edges[RIGHT] && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb + 1, y_ctb))
+            edges[RIGHT] = 1;
+        if (!edges[BOTTOM] && CTB(fc->tab.slice_idx, x_ctb, y_ctb) != CTB(fc->tab.slice_idx, x_ctb, y_ctb + 1))
+            edges[BOTTOM] = 1;
+    }
+
+    for (int c_idx = 0; c_idx < planes; c_idx++) {
+        const int hs            = sps->hshift[c_idx];
+        const int vs            = sps->vshift[c_idx];
+        const int ctb_size_h    = ctb_size_y >> hs;
+        const int ctb_size_v    = ctb_size_y >> vs;
+        const int x             = x0 >> hs;
+        const int y             = y0 >> vs;
+        const int width         = FFMIN((sps->width  >> hs) - x, ctb_size_h);
+        const int height        = FFMIN((sps->height >> vs) - y, ctb_size_v);
+        const int src_stride    = fc->frame->linesize[c_idx];
+        uint8_t *src            = &fc->frame->data[c_idx][y * src_stride + (x << ps)];
+        uint8_t *padded         = (c_idx ? lc->alf_buffer_chroma : lc->alf_buffer_luma) + padded_offset;
+        const int filter_plane  = alf->ctb_flag[c_idx];
+        // the luma copy is also the input of the cross-component filters
+        const int luma_for_cc   = !c_idx && (alf->ctb_cc_idc[0] || alf->ctb_cc_idc[1]);
+
+        if (filter_plane || luma_for_cc) {
+            alf_prepare_buffer(fc, padded, src, x, y, x_ctb, y_ctb, width, height,
+                padded_stride, src_stride, c_idx, edges);
+        }
+
+        if (filter_plane) {
+            if (c_idx) {
+                alf_filter_chroma(lc, src, padded, src_stride, padded_stride, c_idx,
+                    width, height, ctb_size_v - ALF_VB_POS_ABOVE_CHROMA, alf);
+            } else {
+                alf_filter_luma(lc, src, padded, src_stride, padded_stride, x, y,
+                    width, height, y + ctb_size_v - ALF_VB_POS_ABOVE_LUMA, alf);
+            }
+        }
+
+        if (c_idx && alf->ctb_cc_idc[c_idx - 1]) {
+            alf_filter_cc(lc, src, lc->alf_buffer_luma + padded_offset, src_stride, padded_stride, c_idx,
+                width, height, hs, vs, (ctb_size_v << vs) - ALF_VB_POS_ABOVE_LUMA, alf);
+        }
+
+        alf->applied[c_idx] = 1;
+    }
+}
+
+
+// run the LMCS DSP filter with the inverse LUT over the luma samples of the CTU at (x, y);
+// does nothing unless the slice has sh_lmcs_used_flag set
+void ff_vvc_lmcs_filter(const VVCLocalContext *lc, const int x, const int y)
+{
+    const VVCFrameContext *fc = lc->fc;
+
+    if (!lc->sc->sh.r->sh_lmcs_used_flag)
+        return;
+
+    {
+        const int ctb_size      = fc->ps.sps->ctb_size_y;
+        const ptrdiff_t stride  = fc->frame->linesize[LUMA];
+        // CTU size clipped to the picture
+        const int width         = FFMIN(fc->ps.pps->width  - x, ctb_size);
+        const int height        = FFMIN(fc->ps.pps->height - y, ctb_size);
+        uint8_t *data           = fc->frame->data[LUMA] + y * fc->frame->linesize[LUMA] + (x << fc->ps.sps->pixel_shift);
+
+        fc->vvcdsp.lmcs.filter(data, stride, width, height, fc->ps.lmcs.inv_lut);
+    }
+}
diff --git a/libavcodec/vvc/vvc_filter.h b/libavcodec/vvc/vvc_filter.h
new file mode 100644
index 0000000000..2ae4c33e2d
--- /dev/null
+++ b/libavcodec/vvc/vvc_filter.h
@@ -0,0 +1,71 @@ 
+/*
+ * VVC filters
+ *
+ * Copyright (C) 2022 Nuo Mi
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_VVC_VVC_FILTER_H
+#define AVCODEC_VVC_VVC_FILTER_H
+
+#include "vvcdec.h"
+
+/**
+ * lmcs filter for the CTU
+ * @param lc local context for CTU
+ * @param x0 x position for the CTU
+ * @param y0 y position for the CTU
+ */
+void ff_vvc_lmcs_filter(const VVCLocalContext *lc, const int x0, const int y0);
+
+/**
+ * vertical deblock filter for the CTU
+ * @param lc local context for CTU
+ * @param x0 x position for the CTU
+ * @param y0 y position for the CTU
+ */
+void ff_vvc_deblock_vertical(const VVCLocalContext *lc, int x0, int y0);
+
+/**
+ * horizontal deblock filter for the CTU
+ * @param lc local context for CTU
+ * @param x0 x position for the CTU
+ * @param y0 y position for the CTU
+ */
+void ff_vvc_deblock_horizontal(const VVCLocalContext *lc, int x0, int y0);
+
+/**
+ * sao filter for the CTU
+ * @param lc local context for CTU
+ * @param x0 x position for the CTU
+ * @param y0 y position for the CTU
+ */
+void ff_vvc_sao_filter(VVCLocalContext *lc, const int x0, const int y0);
+
+void ff_vvc_sao_copy_ctb_to_hv(VVCLocalContext* lc, int rx, int ry, int last_row);
+void ff_vvc_alf_copy_ctu_to_hv(VVCLocalContext* lc, int x0, int y0);
+
+/**
+ * alf filter for the CTU
+ * @param lc local context for CTU
+ * @param x0 x position for the CTU
+ * @param y0 y position for the CTU
+ */
+void ff_vvc_alf_filter(VVCLocalContext *lc, const int x0, const int y0);
+
+#endif // AVCODEC_VVC_VVC_FILTER_H
diff --git a/libavcodec/vvc/vvc_filter_template.c b/libavcodec/vvc/vvc_filter_template.c
new file mode 100644
index 0000000000..a4f1792ec4
--- /dev/null
+++ b/libavcodec/vvc/vvc_filter_template.c
@@ -0,0 +1,1135 @@ 
+/*
+ * VVC filters DSP
+ *
+ * Copyright (C) 2022 Nuo Mi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * Replace every sample of a width x height block, in place, by its entry in
+ * the lookup table _lut (the sample value is the table index).
+ * dst_stride is given in bytes.
+ */
+static void FUNC(lmcs_filter_luma)(uint8_t *_dst, ptrdiff_t dst_stride, const int width, const int height, const uint8_t *_lut)
+{
+    const pixel *table = (const pixel *)_lut;
+    pixel *row         = (pixel*)_dst;
+
+    dst_stride /= sizeof(pixel);
+
+    for (int i = 0; i < height; i++, row += dst_stride) {
+        for (int j = 0; j < width; j++)
+            row[j] = table[row[j]];
+    }
+}
+
+/*
+ * SAO band offset filter.
+ * A sample selects one of 32 table entries through (sample >> (BIT_DEPTH - 5));
+ * the four consecutive entries starting at sao_left_class (wrapping modulo 32)
+ * carry sao_offset_val[1..4], all other entries are zero.
+ * The result is written to _dst after clipping; strides are given in bytes.
+ */
+static void FUNC(sao_band_filter)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
+    const int16_t *sao_offset_val, const int sao_left_class, const int width, const int height)
+{
+    const int band_shift = BIT_DEPTH - 5;
+    int band_offset[32]  = { 0 };
+    pixel *out           = (pixel *)_dst;
+    const pixel *in      = (pixel *)_src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+
+    for (int band = 0; band < 4; band++)
+        band_offset[(band + sao_left_class) & 31] = sao_offset_val[band + 1];
+
+    for (int row = 0; row < height; row++, out += dst_stride, in += src_stride) {
+        for (int col = 0; col < width; col++) {
+            const int sample = in[col];
+            out[col] = av_clip_pixel(sample + band_offset[sample >> band_shift]);
+        }
+    }
+}
+
+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
+
+/*
+ * SAO edge offset filter.
+ * Each sample is compared with its two neighbours along the direction chosen
+ * by eo; the two comparison signs pick, through edge_idx[], the entry of
+ * sao_offset_val[] that is added before clipping.
+ * The source is read with a fixed stride of
+ * (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) bytes; dst_stride is in bytes.
+ */
+static void FUNC(sao_edge_filter)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t dst_stride,
+    const int16_t *sao_offset_val, const int eo, const int width, const int height)
+{
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    // { dx, dy } of the two neighbours used for each value of eo
+    static const int8_t pos[4][2][2] = {
+        { { -1,  0 }, {  1, 0 } }, // horizontal
+        { {  0, -1 }, {  0, 1 } }, // vertical
+        { { -1, -1 }, {  1, 1 } }, // 45 degree
+        { {  1, -1 }, { -1, 1 } }, // 135 degree
+    };
+    const ptrdiff_t src_stride = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+    const int nb_a             = pos[eo][0][0] + pos[eo][0][1] * src_stride;
+    const int nb_b             = pos[eo][1][0] + pos[eo][1][1] * src_stride;
+    const pixel *in            = (pixel *)_src;
+    pixel *out                 = (pixel *)_dst;
+
+    dst_stride /= sizeof(pixel);
+
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            const int cur  = in[col];
+            const int sign = CMP(cur, in[col + nb_a]) + CMP(cur, in[col + nb_b]);
+
+            out[col] = av_clip_pixel(cur + sao_offset_val[edge_idx[2 + sign]]);
+        }
+        in  += src_stride;
+        out += dst_stride;
+    }
+}
+
+/*
+ * Rewrite the flagged outer rows/columns of a block as
+ * clip(src + offset_val[c_idx][0]).
+ *
+ * borders[0] / borders[2] select the first / last column (skipped when the
+ * edge class is SAO_EO_VERT); borders[1] / borders[3] select the first / last
+ * row (skipped when the edge class is SAO_EO_HORIZ). Columns already written
+ * are excluded from the row passes.
+ * Strides are given in bytes. vert_edge, horiz_edge and diag_edge are not used
+ * by this variant.
+ */
+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, const uint8_t *_src,
+    ptrdiff_t dst_stride, ptrdiff_t src_stride, const SAOParams *sao,
+    const int *borders, const int _width, const int _height, const int c_idx,
+    const uint8_t *vert_edge, const uint8_t *horiz_edge, const uint8_t *diag_edge)
+{
+    pixel *dst             = (pixel *)_dst;
+    const pixel *src       = (pixel *)_src;
+    const int offset_val   = sao->offset_val[c_idx][0];
+    const int sao_eo_class = sao->eo_class[c_idx];
+    int init_x = 0, width = _width;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+
+    if (sao_eo_class != SAO_EO_VERT) {
+        if (borders[0]) {
+            // first column
+            for (int y = 0; y < _height; y++)
+                dst[y * dst_stride] = av_clip_pixel(src[y * src_stride] + offset_val);
+            init_x = 1;
+        }
+        if (borders[2]) {
+            // last column
+            const int offset = width - 1;
+            for (int y = 0; y < _height; y++)
+                dst[y * dst_stride + offset] = av_clip_pixel(src[y * src_stride + offset] + offset_val);
+            width--;
+        }
+    }
+    if (sao_eo_class != SAO_EO_HORIZ) {
+        if (borders[1]) {
+            // first row
+            for (int x = init_x; x < width; x++)
+                dst[x] = av_clip_pixel(src[x] + offset_val);
+        }
+        if (borders[3]) {
+            // last row
+            const ptrdiff_t y_dst_stride = dst_stride * (_height - 1);
+            const ptrdiff_t y_src_stride = src_stride * (_height - 1);
+            for (int x = init_x; x < width; x++)
+                dst[x + y_dst_stride] = av_clip_pixel(src[x + y_src_stride] + offset_val);
+        }
+    }
+}
+
+/*
+ * Two-stage fix-up of the outer rows/columns of a block.
+ *
+ * Stage 1: rows/columns flagged in borders[] are rewritten as
+ * clip(src + offset_val[c_idx][0]) and removed from the working rectangle
+ * (first column/row via the start indices, last column/row by shrinking
+ * the width/height).
+ * Stage 2: on the resulting rectangle, samples on edges flagged by vert_edge,
+ * horiz_edge and diag_edge are copied back unmodified from src, depending on
+ * the edge class.
+ * Strides are given in bytes.
+ */
+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, const uint8_t *_src,
+    ptrdiff_t dst_stride, ptrdiff_t src_stride, const SAOParams *sao,
+    const int *borders, const int _width, const int _height, const int c_idx,
+    const uint8_t *vert_edge, const uint8_t *horiz_edge, const uint8_t *diag_edge)
+{
+    pixel *dst             = (pixel *)_dst;
+    const pixel *src       = (pixel *)_src;
+    const int16_t *offsets = sao->offset_val[c_idx];
+    const int eo           = sao->eo_class[c_idx];
+    int first_x = 0, first_y = 0, w = _width, h = _height;
+    int keep_ul, keep_ur, keep_lr, keep_ll;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+
+    if (eo != SAO_EO_VERT) {
+        if (borders[0]) {
+            const int offset_val = offsets[0];
+            for (int y = 0; y < h; y++)
+                dst[y * dst_stride] = av_clip_pixel(src[y * src_stride] + offset_val);
+            first_x = 1;
+        }
+        if (borders[2]) {
+            const int offset_val = offsets[0];
+            const int last       = w - 1;
+            for (int y = 0; y < h; y++)
+                dst[y * dst_stride + last] = av_clip_pixel(src[y * src_stride + last] + offset_val);
+            w--;
+        }
+    }
+    if (eo != SAO_EO_HORIZ) {
+        if (borders[1]) {
+            const int offset_val = offsets[0];
+            for (int x = first_x; x < w; x++)
+                dst[x] = av_clip_pixel(src[x] + offset_val);
+            first_y = 1;
+        }
+        if (borders[3]) {
+            const int offset_val       = offsets[0];
+            const ptrdiff_t dst_bottom = dst_stride * (h - 1);
+            const ptrdiff_t src_bottom = src_stride * (h - 1);
+            for (int x = first_x; x < w; x++)
+                dst[x + dst_bottom] = av_clip_pixel(src[x + src_bottom] + offset_val);
+            h--;
+        }
+    }
+
+    // corners that must keep their filtered value even though an adjacent edge is restored
+    keep_ul = !diag_edge[0] && eo == SAO_EO_135D && !borders[0] && !borders[1];
+    keep_ur = !diag_edge[1] && eo == SAO_EO_45D  && !borders[1] && !borders[2];
+    keep_lr = !diag_edge[2] && eo == SAO_EO_135D && !borders[2] && !borders[3];
+    keep_ll = !diag_edge[3] && eo == SAO_EO_45D  && !borders[0] && !borders[3];
+
+    // Restore pixels that can't be modified
+    if (vert_edge[0] && eo != SAO_EO_VERT) {
+        for (int y = first_y + keep_ul; y < h - keep_ll; y++)
+            dst[y * dst_stride] = src[y * src_stride];
+    }
+    if (vert_edge[1] && eo != SAO_EO_VERT) {
+        for (int y = first_y + keep_ur; y < h - keep_lr; y++)
+            dst[y * dst_stride + w - 1] = src[y * src_stride + w - 1];
+    }
+
+    if (horiz_edge[0] && eo != SAO_EO_HORIZ) {
+        for (int x = first_x + keep_ul; x < w - keep_ur; x++)
+            dst[x] = src[x];
+    }
+    if (horiz_edge[1] && eo != SAO_EO_HORIZ) {
+        for (int x = first_x + keep_ll; x < w - keep_lr; x++)
+            dst[(h - 1) * dst_stride + x] = src[(h - 1) * src_stride + x];
+    }
+
+    if (diag_edge[0] && eo == SAO_EO_135D)
+        dst[0] = src[0];
+    if (diag_edge[1] && eo == SAO_EO_45D)
+        dst[w - 1] = src[w - 1];
+    if (diag_edge[2] && eo == SAO_EO_135D)
+        dst[dst_stride * (h - 1) + w - 1] = src[src_stride * (h - 1) + w - 1];
+    if (diag_edge[3] && eo == SAO_EO_45D)
+        dst[dst_stride * (h - 1)] = src[src_stride * (h - 1)];
+}
+
+#undef CMP
+
+// Sum of (v0 - curr) and (v1 - curr), each clamped to [-clip, clip] before being added.
+static av_always_inline int16_t FUNC(alf_clip)(pixel curr, pixel v0, pixel v1, int16_t clip)
+{
+    const int d0 = av_clip(v0 - curr, -clip, clip);
+    const int d1 = av_clip(v1 - curr, -clip, clip);
+
+    return d0 + d1;
+}
+
+/*
+ * ALF luma filter, processed in ALF_BLOCK_SIZE x ALF_BLOCK_SIZE blocks.
+ *
+ * Every block consumes the next ALF_NUM_COEFF_LUMA entries of filter[] and
+ * clip[]. Each of the 12 coefficients weights a pair of clipped differences
+ * taken from taps up to three rows/columns away from the current sample.
+ * For rows in [vb_pos - 4, vb_pos + 3] the rows used for the vertical taps are
+ * pulled towards the current row, symmetrically above and below; rows
+ * vb_pos - 1 and vb_pos additionally use a rounding shift three bits larger.
+ * Strides are given in bytes.
+ */
+static void FUNC(alf_filter_luma)(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride,
+    const int width, const int height, const int16_t *filter, const int16_t *clip, const int vb_pos)
+{
+    const pixel *src    = (pixel *)_src;
+    const int shift     = 7;
+    const int vb_above  = vb_pos - 4;
+    const int vb_below  = vb_pos + 3;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+
+    for (int y = 0; y < height; y += ALF_BLOCK_SIZE) {
+        for (int x = 0; x < width; x += ALF_BLOCK_SIZE) {
+            for (int i = 0; i < ALF_BLOCK_SIZE; i++) {
+                const int row       = y + i;
+                const int near_vb   = row == vb_pos - 1 || row == vb_pos;
+                const int row_shift = near_vb ? shift + 3 : shift;
+                const int rounding  = 1 << (row_shift - 1);
+                pixel *dst          = (pixel *)_dst + row * dst_stride + x;
+
+                // p0 is the current row; odd indices go down, even indices go up
+                const pixel *p0 = src + row * src_stride + x;
+                const pixel *p1 = p0 + src_stride;
+                const pixel *p2 = p0 - src_stride;
+                const pixel *p3 = p1 + src_stride;
+                const pixel *p4 = p2 - src_stride;
+                const pixel *p5 = p3 + src_stride;
+                const pixel *p6 = p4 - src_stride;
+
+                if (row < vb_pos && row >= vb_above) {
+                    p1 = (row == vb_pos - 1) ? p0 : p1;
+                    p3 = (row >= vb_pos - 2) ? p1 : p3;
+                    p5 = (row >= vb_pos - 3) ? p3 : p5;
+
+                    p2 = (row == vb_pos - 1) ? p0 : p2;
+                    p4 = (row >= vb_pos - 2) ? p2 : p4;
+                    p6 = (row >= vb_pos - 3) ? p4 : p6;
+                } else if (row >= vb_pos && row <= vb_below) {
+                    p2 = (row == vb_pos    ) ? p0 : p2;
+                    p4 = (row <= vb_pos + 1) ? p2 : p4;
+                    p6 = (row <= vb_pos + 2) ? p4 : p6;
+
+                    p1 = (row == vb_pos    ) ? p0 : p1;
+                    p3 = (row <= vb_pos + 1) ? p1 : p3;
+                    p5 = (row <= vb_pos + 2) ? p3 : p5;
+                }
+
+                for (int j = 0; j < ALF_BLOCK_SIZE; j++) {
+                    const pixel curr = p0[j];
+                    int sum = 0;
+
+                    sum += filter[0]  * FUNC(alf_clip)(curr, p5[j    ], p6[j    ], clip[0]);
+                    sum += filter[1]  * FUNC(alf_clip)(curr, p3[j + 1], p4[j - 1], clip[1]);
+                    sum += filter[2]  * FUNC(alf_clip)(curr, p3[j    ], p4[j    ], clip[2]);
+                    sum += filter[3]  * FUNC(alf_clip)(curr, p3[j - 1], p4[j + 1], clip[3]);
+                    sum += filter[4]  * FUNC(alf_clip)(curr, p1[j + 2], p2[j - 2], clip[4]);
+                    sum += filter[5]  * FUNC(alf_clip)(curr, p1[j + 1], p2[j - 1], clip[5]);
+                    sum += filter[6]  * FUNC(alf_clip)(curr, p1[j    ], p2[j    ], clip[6]);
+                    sum += filter[7]  * FUNC(alf_clip)(curr, p1[j - 1], p2[j + 1], clip[7]);
+                    sum += filter[8]  * FUNC(alf_clip)(curr, p1[j - 2], p2[j + 2], clip[8]);
+                    sum += filter[9]  * FUNC(alf_clip)(curr, p0[j + 3], p0[j - 3], clip[9]);
+                    sum += filter[10] * FUNC(alf_clip)(curr, p0[j + 2], p0[j - 2], clip[10]);
+                    sum += filter[11] * FUNC(alf_clip)(curr, p0[j + 1], p0[j - 1], clip[11]);
+
+                    sum  = (sum + rounding) >> row_shift;
+                    sum += curr;
+                    dst[j] = CLIP(sum);
+                }
+            }
+            // next block uses the next set of coefficients and clip values
+            filter += ALF_NUM_COEFF_LUMA;
+            clip += ALF_NUM_COEFF_LUMA;
+        }
+    }
+}
+
+/*
+ * ALF chroma filter, processed in ALF_BLOCK_SIZE x ALF_BLOCK_SIZE blocks.
+ *
+ * The same six coefficients and clip values are used for every block. Each
+ * coefficient weights a pair of clipped differences taken from taps up to two
+ * rows/columns away from the current sample.
+ * For rows in [vb_pos - 2, vb_pos + 1] the outer vertical taps are replaced by
+ * the inner ones; on rows vb_pos - 1 and vb_pos all vertical taps collapse onto
+ * the current row and the rounding shift is three bits larger.
+ * Strides are given in bytes.
+ */
+static void FUNC(alf_filter_chroma)(uint8_t* _dst, ptrdiff_t dst_stride, const uint8_t* _src, ptrdiff_t src_stride,
+    const int width, const int height, const int16_t* filter, const int16_t* clip, const int vb_pos)
+{
+    const pixel *src = (pixel *)_src;
+    const int shift  = 7;
+    const int vb_above  = vb_pos - 2;
+    const int vb_below  = vb_pos + 1;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+
+    for (int y = 0; y < height; y += ALF_BLOCK_SIZE) {
+        for (int x = 0; x < width; x += ALF_BLOCK_SIZE) {
+            for (int i = 0; i < ALF_BLOCK_SIZE; i++) {
+                const int row       = y + i;
+                const int near_vb   = row == vb_pos - 1 || row == vb_pos;
+                const int row_shift = near_vb ? shift + 3 : shift;
+                const int rounding  = 1 << (row_shift - 1);
+                pixel *dst          = (pixel *)_dst + row * dst_stride + x;
+
+                // p0 is the current row; p1/p3 lie below it, p2/p4 above it
+                const pixel *p0 = src + row * src_stride + x;
+                const pixel *p1 = p0 + src_stride;
+                const pixel *p2 = p0 - src_stride;
+                const pixel *p3 = p1 + src_stride;
+                const pixel *p4 = p2 - src_stride;
+
+                if (row >= vb_above && row <= vb_below) {
+                    if (near_vb)
+                        p1 = p2 = p0;
+                    // inside this window the two-row taps always follow the one-row taps
+                    p3 = p1;
+                    p4 = p2;
+                }
+
+                for (int j = 0; j < ALF_BLOCK_SIZE; j++) {
+                    const pixel curr = p0[j];
+                    int sum = 0;
+
+                    sum += filter[0]  * FUNC(alf_clip)(curr, p3[j    ], p4[j    ], clip[0]);
+                    sum += filter[1]  * FUNC(alf_clip)(curr, p1[j + 1], p2[j - 1], clip[1]);
+                    sum += filter[2]  * FUNC(alf_clip)(curr, p1[j    ], p2[j    ], clip[2]);
+                    sum += filter[3]  * FUNC(alf_clip)(curr, p1[j - 1], p2[j + 1], clip[3]);
+                    sum += filter[4]  * FUNC(alf_clip)(curr, p0[j + 2], p0[j - 2], clip[4]);
+                    sum += filter[5]  * FUNC(alf_clip)(curr, p0[j + 1], p0[j - 1], clip[5]);
+
+                    sum  = (sum + rounding) >> row_shift;
+                    sum += curr;
+                    dst[j] = CLIP(sum);
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Cross-component ALF: add to each sample of _dst a correction computed from
+ * seven weighted differences between luma samples surrounding the co-located
+ * luma position ((x << hs), (y << vs)) and the luma sample at that position.
+ *
+ * The correction is rounded (>> 7), clipped to the signed BIT_DEPTH range and
+ * the final sample is clipped to the pixel range.
+ * Near vb_pos the luma rows used are folded towards the centre row; when vs is
+ * zero, luma rows vb_pos and vb_pos + 1 are left untouched.
+ * Strides are given in bytes.
+ */
+static void FUNC(alf_filter_cc)(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_luma, const ptrdiff_t luma_stride,
+    const int width, const int height, const int hs, const int vs, const int16_t *filter, const int vb_pos)
+{
+    const ptrdiff_t stride = luma_stride / sizeof(pixel);
+
+    dst_stride /= sizeof(pixel);
+
+    for (int y = 0; y < height; y++) {
+        const int pos         = y << vs;
+        pixel *dst_row        = (pixel *)_dst + y * dst_stride;
+        const pixel *luma_row = (const pixel *)_luma + pos * stride;
+
+        // the whole row is skipped, so test once per row
+        if (!vs && (pos == vb_pos || pos == vb_pos + 1))
+            continue;
+
+        for (int x = 0; x < width; x++) {
+            const pixel *s1  = luma_row + (x << hs);
+            const pixel *s0  = s1 - stride;
+            const pixel *s2  = s1 + stride;
+            const pixel *s3  = s2 + stride;
+            const int centre = *s1;
+            int sum = 0;
+
+            if (pos == (vb_pos - 2) || pos == (vb_pos + 1))
+                s3 = s2;
+            else if (pos == (vb_pos - 1) || pos == vb_pos)
+                s3 = s2 = s0 = s1;
+
+            sum += filter[0] * (*s0 - centre);
+            sum += filter[1] * (s1[-1] - centre);
+            sum += filter[2] * (s1[+1] - centre);
+            sum += filter[3] * (s2[-1] - centre);
+            sum += filter[4] * (*s2 - centre);
+            sum += filter[5] * (s2[+1] - centre);
+            sum += filter[6] * (*s3 - centre);
+            sum = av_clip((sum + 64) >> 7, -(1 << (BIT_DEPTH - 1)), (1 << (BIT_DEPTH - 1)) - 1);
+            sum += dst_row[x];
+            dst_row[x] = av_clip_pixel(sum);
+        }
+    }
+}
+
+#define ALF_DIR_VERT        0
+#define ALF_DIR_HORZ        1
+#define ALF_DIR_DIGA0       2
+#define ALF_DIR_DIGA1       3
+
+/*
+ * Derive the class index and transpose index of one block from its four
+ * summed directional gradients (sum[]) and the activity scale ac.
+ *
+ * The class index starts from arg_var[] indexed by the scaled and clipped
+ * vertical + horizontal sum; 5, 10, 15 or 20 is added when the max/min ratio
+ * of the dominant direction pair exceeds 2 (or 4.5 for the larger step).
+ * The transpose index packs the two "first <= second" comparisons.
+ */
+static void FUNC(alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
+{
+    static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
+
+    const int vert   = sum[ALF_DIR_VERT];
+    const int horz   = sum[ALF_DIR_HORZ];
+    const int diag0  = sum[ALF_DIR_DIGA0];
+    const int diag1  = sum[ALF_DIR_DIGA1];
+
+    const int dir_hv = vert <= horz;
+    const int hv1    = FFMAX(vert, horz);
+    const int hv0    = FFMIN(vert, horz);
+
+    const int dir_d  = diag0 <= diag1;
+    const int d1     = FFMAX(diag0, diag1);
+    const int d0     = FFMIN(diag0, diag1);
+
+    // compare d1 / d0 with hv1 / hv0 by cross-multiplying in 64 bits to avoid overflow
+    const int dir1   = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
+    const int hvd1   = dir1 ? hv1 : d1;
+    const int hvd0   = dir1 ? hv0 : d0;
+
+    const int sum_hv = horz + vert;
+    int idx          = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
+
+    if (hvd1 * 2 > 9 * hvd0)
+        idx += ((dir1 << 1) + 2) * 5;
+    else if (hvd1 > 2 * hvd0)
+        idx += ((dir1 << 1) + 1) * 5;
+
+    *class_idx     = idx;
+    *transpose_idx = dir_d * 2 + dir_hv;
+}
+
+/*
+ * Classify every ALF_BLOCK_SIZE x ALF_BLOCK_SIZE block of a width x height area.
+ *
+ * Pass 1 fills gradient_tmp with four directional gradients (ALF_NUM_DIR
+ * values) per ALF_GRADIENT_STEP x ALF_GRADIENT_STEP cell, covering the area
+ * extended by ALF_GRADIENT_BORDER on every side; two diagonal sample positions
+ * contribute to each cell.
+ * Pass 2 sums the gradients over each block plus its border and hands the sums
+ * to alf_get_idx, writing one class_idx / transpose_idx entry per block in
+ * raster order. Blocks adjacent to vb_pos drop the gradient rows on the other
+ * side of vb_pos and use ac = 3 instead of 2.
+ * _src_stride is given in bytes.
+ */
+static void FUNC(alf_classify)(int *class_idx, int *transpose_idx,
+    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
+    const int vb_pos, int *gradient_tmp)
+{
+    const int ext_h   = height + ALF_GRADIENT_BORDER * 2;
+    const int ext_w   = width  + ALF_GRADIENT_BORDER * 2;
+    const int size    = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
+    const int gstride = (ext_w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
+
+    const ptrdiff_t src_stride = _src_stride / sizeof(pixel);
+    // start one row above the extended area: each cell reads rows s0..s3
+    const pixel *src = (const pixel *)_src - ((ALF_GRADIENT_BORDER + 1) * src_stride + ALF_GRADIENT_BORDER);
+    int *out         = gradient_tmp;
+
+    for (int y = 0; y < ext_h; y += ALF_GRADIENT_STEP) {
+        const pixel *s0  = src + y * src_stride;
+        const pixel *s1  = s0 + src_stride;
+        const pixel *s2  = s1 + src_stride;
+        const pixel *s3  = s2 + src_stride;
+
+        if (y == vb_pos)          //above
+            s3 = s2;
+        else if (y == vb_pos + ALF_GRADIENT_BORDER)
+            s0 = s1;
+
+        for (int x = 0; x < ext_w; x += ALF_GRADIENT_STEP, out += ALF_NUM_DIR) {
+            //two points a time
+            const pixel *a0  = s0 + x;
+            const pixel *p0  = s1 + x;
+            const pixel *b0  = s2 + x;
+            const int val0   = (*p0) << 1;
+
+            const pixel *a1  = s1 + x + 1;
+            const pixel *p1  = s2 + x + 1;
+            const pixel *b1  = s3 + x + 1;
+            const int val1   = (*p1) << 1;
+
+            out[ALF_DIR_VERT]  = FFABS(val0 - a0[0]  - b0[0])  + FFABS(val1 - a1[0]  - b1[0]);
+            out[ALF_DIR_HORZ]  = FFABS(val0 - p0[-1] - p0[+1]) + FFABS(val1 - p1[-1] - p1[+1]);
+            out[ALF_DIR_DIGA0] = FFABS(val0 - a0[-1] - b0[+1]) + FFABS(val1 - a1[-1] - b1[+1]);
+            out[ALF_DIR_DIGA1] = FFABS(val0 - a0[+1] - b0[-1]) + FFABS(val1 - a1[+1] - b1[-1]);
+        }
+    }
+
+    for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) {
+        int first = 0;
+        int last  = size;
+        int ac    = 2;
+
+        if (y + ALF_BLOCK_SIZE == vb_pos) {
+            last -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
+            ac = 3;
+        } else if (y == vb_pos) {
+            first += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
+            ac = 3;
+        }
+        for (int x = 0; x < width; x += ALF_BLOCK_SIZE) {
+            const int xg = x / ALF_GRADIENT_STEP;
+            const int yg = y / ALF_GRADIENT_STEP;
+            const int *g = gradient_tmp + (yg + first) * gstride + xg * ALF_NUM_DIR;
+            int sum[ALF_NUM_DIR] = { 0 };
+
+            //todo: optimize this loop
+            for (int i = first; i < last; i++, g += gstride) {
+                for (int j = 0; j < size; j++) {
+                    const int *cell = g + j * ALF_NUM_DIR;
+
+                    sum[ALF_DIR_VERT]  += cell[ALF_DIR_VERT];
+                    sum[ALF_DIR_HORZ]  += cell[ALF_DIR_HORZ];
+                    sum[ALF_DIR_DIGA0] += cell[ALF_DIR_DIGA0];
+                    sum[ALF_DIR_DIGA1] += cell[ALF_DIR_DIGA1];
+                }
+            }
+            FUNC(alf_get_idx)(class_idx, transpose_idx, sum, ac);
+
+            class_idx++;
+            transpose_idx++;
+        }
+    }
+}
+
+/*
+ * Expand per-block filter data for `size` blocks.
+ *
+ * For block i, the coefficient set is selected through
+ * class_to_filt[class_idx[i]] and the clip-index set through class_idx[i];
+ * both are reordered with the permutation index[transpose_idx[i]] and
+ * ALF_NUM_COEFF_LUMA coefficients and clip values are appended to coeff[] and
+ * clip[]. Clip indices are translated to values via clip_set[].
+ */
+static void FUNC(alf_recon_coeff_and_clip)(int16_t *coeff, int16_t *clip,
+    const int *class_idx, const int *transpose_idx, const int size,
+    const int16_t *coeff_set, const uint8_t *clip_idx_set, const uint8_t *class_to_filt)
+{
+    // one coefficient permutation per transpose index; storage class comes first in the declaration
+    static const int index[][ALF_NUM_COEFF_LUMA] = {
+        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+        { 9, 4, 10, 8, 1, 5, 11, 7, 3, 0, 2, 6 },
+        { 0, 3, 2, 1, 8, 7, 6, 5, 4, 9, 10, 11 },
+        { 9, 8, 10, 4, 3, 7, 11, 5, 1, 0, 2, 6 },
+    };
+
+    const int16_t clip_set[] = {
+        1 << BIT_DEPTH, 1 << (BIT_DEPTH - 3), 1 << (BIT_DEPTH - 5), 1 << (BIT_DEPTH - 7)
+    };
+
+    for (int i = 0; i < size; i++) {
+        const int16_t  *src_coeff = coeff_set + class_to_filt[class_idx[i]] * ALF_NUM_COEFF_LUMA;
+        const uint8_t *clip_idx  = clip_idx_set + class_idx[i] * ALF_NUM_COEFF_LUMA;
+
+        for (int j = 0; j < ALF_NUM_COEFF_LUMA; j++) {
+            const int idx = index[transpose_idx[i]][j];
+            *coeff++ = src_coeff[idx];
+            *clip++  = clip_set[clip_idx[idx]];
+        }
+    }
+}
+
+#undef ALF_DIR_HORZ
+#undef ALF_DIR_VERT
+#undef ALF_DIR_DIGA0
+#undef ALF_DIR_DIGA1
+
+// line zero
+#define P7 pix[-8 * xstride]
+#define P6 pix[-7 * xstride]
+#define P5 pix[-6 * xstride]
+#define P4 pix[-5 * xstride]
+#define P3 pix[-4 * xstride]
+#define P2 pix[-3 * xstride]
+#define P1 pix[-2 * xstride]
+#define P0 pix[-1 * xstride]
+#define Q0 pix[0 * xstride]
+#define Q1 pix[1 * xstride]
+#define Q2 pix[2 * xstride]
+#define Q3 pix[3 * xstride]
+#define Q4 pix[4 * xstride]
+#define Q5 pix[5 * xstride]
+#define Q6 pix[6 * xstride]
+#define Q7 pix[7 * xstride]
+#define P(x) pix[(-(x)-1) * xstride]
+#define Q(x) pix[(x)      * xstride]
+
+// line three. used only for deblocking decision
+#define TP7 pix[-8 * xstride + 3 * ystride]
+#define TP6 pix[-7 * xstride + 3 * ystride]
+#define TP5 pix[-6 * xstride + 3 * ystride]
+#define TP4 pix[-5 * xstride + 3 * ystride]
+#define TP3 pix[-4 * xstride + 3 * ystride]
+#define TP2 pix[-3 * xstride + 3 * ystride]
+#define TP1 pix[-2 * xstride + 3 * ystride]
+#define TP0 pix[-1 * xstride + 3 * ystride]
+#define TQ0 pix[0  * xstride + 3 * ystride]
+#define TQ1 pix[1  * xstride + 3 * ystride]
+#define TQ2 pix[2  * xstride + 3 * ystride]
+#define TQ3 pix[3  * xstride + 3 * ystride]
+#define TQ4 pix[4  * xstride + 3 * ystride]
+#define TQ5 pix[5  * xstride + 3 * ystride]
+#define TQ6 pix[6  * xstride + 3 * ystride]
+#define TQ7 pix[7  * xstride + 3 * ystride]
+#define TP(x) pix[(-(x)-1) * xstride + 3 * ystride]
+#define TQ(x) pix[(x)      * xstride + 3 * ystride]
+
+#define FP3 pix[-4 * xstride + 1 * ystride]
+#define FP2 pix[-3 * xstride + 1 * ystride]
+#define FP1 pix[-2 * xstride + 1 * ystride]
+#define FP0 pix[-1 * xstride + 1 * ystride]
+#define FQ0 pix[0  * xstride + 1 * ystride]
+#define FQ1 pix[1  * xstride + 1 * ystride]
+#define FQ2 pix[2  * xstride + 1 * ystride]
+#define FQ3 pix[3  * xstride + 1 * ystride]
+
+static void FUNC(loop_filter_luma_large)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride, const int32_t tc,
+    const uint8_t no_p, const uint8_t no_q, const uint8_t max_len_p, const uint8_t max_len_q)
+{
+    for (int d = 0; d < 4; d++) {
+        const int p6 = P6;
+        const int p5 = P5;
+        const int p4 = P4;
+        const int p3 = P3;
+        const int p2 = P2;
+        const int p1 = P1;
+        const int p0 = P0;
+        const int q0 = Q0;
+        const int q1 = Q1;
+        const int q2 = Q2;
+        const int q3 = Q3;
+        const int q4 = Q4;
+        const int q5 = Q5;
+        const int q6 = Q6;
+        int m;
+        if (max_len_p == 5 && max_len_q == 5)
+            m = (p4 + p3 + 2 * (p2 + p1 + p0 + q0 + q1 + q2) + q3 + q4 + 8) >> 4;
+        else if (max_len_p == max_len_q)
+            m = (p6 + p5 + p4 + p3 + p2 + p1 + 2 * (p0 + q0) + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+        else if (max_len_p + max_len_q == 12)
+            m = (p5 + p4 + p3 + p2 + 2 * (p1 + p0 + q0 + q1) + q2 + q3 + q4 + q5 + 8) >> 4;
+        else if (max_len_p + max_len_q == 8)
+            m = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 + 4) >> 3;
+        else if (max_len_q == 7)
+            m = (2 * (p2 + p1 + p0 + q0) + p0 + p1 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+        else
+            m = (p6 + p5 + p4 + p3 + p2 + p1 + 2 * (q2 + q1 + q0 + p0) + q0 + q1 + 8) >> 4;
+        if (!no_p) {
+            const int refp = (P(max_len_p) + P(max_len_p - 1) + 1) >> 1;
+            if (max_len_p == 3) {
+                P0 = p0 + av_clip(((m * 53 + refp * 11 + 32) >> 6) - p0, -(tc * 6 >> 1), (tc * 6 >> 1));
+                P1 = p1 + av_clip(((m * 32 + refp * 32 + 32) >> 6) - p1, -(tc * 4 >> 1), (tc * 4 >> 1));
+                P2 = p2 + av_clip(((m * 11 + refp * 53 + 32) >> 6) - p2, -(tc * 2 >> 1), (tc * 2 >> 1));
+            } else if (max_len_p == 5) {
+                P0 = p0 + av_clip(((m * 58 + refp *  6 + 32) >> 6) - p0, -(tc * 6 >> 1), (tc * 6 >> 1));
+                P1 = p1 + av_clip(((m * 45 + refp * 19 + 32) >> 6) - p1, -(tc * 5 >> 1), (tc * 5 >> 1));
+                P2 = p2 + av_clip(((m * 32 + refp * 32 + 32) >> 6) - p2, -(tc * 4 >> 1), (tc * 4 >> 1));
+                P3 = p3 + av_clip(((m * 19 + refp * 45 + 32) >> 6) - p3, -(tc * 3 >> 1), (tc * 3 >> 1));
+                P4 = p4 + av_clip(((m *  6 + refp * 58 + 32) >> 6) - p4, -(tc * 2 >> 1), (tc * 2 >> 1));
+            } else {
+                P0 = p0 + av_clip(((m * 59 + refp *  5 + 32) >> 6) - p0, -(tc * 6 >> 1), (tc * 6 >> 1));
+                P1 = p1 + av_clip(((m * 50 + refp * 14 + 32) >> 6) - p1, -(tc * 5 >> 1), (tc * 5 >> 1));
+                P2 = p2 + av_clip(((m * 41 + refp * 23 + 32) >> 6) - p2, -(tc * 4 >> 1), (tc * 4 >> 1));
+                P3 = p3 + av_clip(((m * 32 + refp * 32 + 32) >> 6) - p3, -(tc * 3 >> 1), (tc * 3 >> 1));
+                P4 = p4 + av_clip(((m * 23 + refp * 41 + 32) >> 6) - p4, -(tc * 2 >> 1), (tc * 2 >> 1));
+                P5 = p5 + av_clip(((m * 14 + refp * 50 + 32) >> 6) - p5, -(tc * 1 >> 1), (tc * 1 >> 1));
+                P6 = p6 + av_clip(((m *  5 + refp * 59 + 32) >> 6) - p6, -(tc * 1 >> 1), (tc * 1 >> 1));
+            }
+        }
+        if (!no_q) {
+            const int refq = (Q(max_len_q) + Q(max_len_q - 1) + 1) >> 1;
+            if (max_len_q == 3) {
+                Q0 = q0 + av_clip(((m * 53 + refq * 11 + 32) >> 6) - q0,  -(tc * 6 >> 1), (tc * 6 >> 1));
+                Q1 = q1 + av_clip(((m * 32 + refq * 32 + 32) >> 6) - q1,  -(tc * 4 >> 1), (tc * 4 >> 1));
+                Q2 = q2 + av_clip(((m * 11 + refq * 53 + 32) >> 6) - q2,  -(tc * 2 >> 1), (tc * 2 >> 1));
+            } else if (max_len_q == 5) {
+                Q0 = q0 + av_clip(((m * 58 + refq *  6 + 32) >> 6) - q0, -(tc * 6 >> 1), (tc * 6 >> 1));
+                Q1 = q1 + av_clip(((m * 45 + refq * 19 + 32) >> 6) - q1, -(tc * 5 >> 1), (tc * 5 >> 1));
+                Q2 = q2 + av_clip(((m * 32 + refq * 32 + 32) >> 6) - q2, -(tc * 4 >> 1), (tc * 4 >> 1));
+                Q3 = q3 + av_clip(((m * 19 + refq * 45 + 32) >> 6) - q3, -(tc * 3 >> 1), (tc * 3 >> 1));
+                Q4 = q4 + av_clip(((m *  6 + refq * 58 + 32) >> 6) - q4, -(tc * 2 >> 1), (tc * 2 >> 1));
+            } else {
+                Q0 = q0 + av_clip(((m * 59 + refq *  5 + 32) >> 6) - q0, -(tc * 6 >> 1), (tc * 6 >> 1));
+                Q1 = q1 + av_clip(((m * 50 + refq * 14 + 32) >> 6) - q1, -(tc * 5 >> 1), (tc * 5 >> 1));
+                Q2 = q2 + av_clip(((m * 41 + refq * 23 + 32) >> 6) - q2, -(tc * 4 >> 1), (tc * 4 >> 1));
+                Q3 = q3 + av_clip(((m * 32 + refq * 32 + 32) >> 6) - q3, -(tc * 3 >> 1), (tc * 3 >> 1));
+                Q4 = q4 + av_clip(((m * 23 + refq * 41 + 32) >> 6) - q4, -(tc * 2 >> 1), (tc * 2 >> 1));
+                Q5 = q5 + av_clip(((m * 14 + refq * 50 + 32) >> 6) - q5, -(tc * 1 >> 1), (tc * 1 >> 1));
+                Q6 = q6 + av_clip(((m *  5 + refq * 59 + 32) >> 6) - q6, -(tc * 1 >> 1), (tc * 1 >> 1));
+            }
+
+        }
+        pix += ystride;
+    }
+}
+
+/*
+ * Strong luma deblocking filter for one segment of four lines.
+ *
+ * On every line the eight samples p3..p0 / q0..q3 are read before anything is
+ * written back, so all outputs are derived from unfiltered input.  The three
+ * samples nearest the edge on each side are replaced by a low-pass value whose
+ * distance from the original sample is limited to 3 * tc, 2 * tc and tc.
+ *
+ * pix     base pointer used by the Pn / Qn accessors; advanced by ystride per line
+ * xstride consumed by the Pn / Qn accessor macros (defined outside this function)
+ * ystride step, in pixels, from one filtered line to the next
+ * tc      clipping threshold
+ * no_p    non-zero leaves the P side untouched
+ * no_q    non-zero leaves the Q side untouched
+ */
+static void FUNC(loop_filter_luma_strong)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride, const int32_t tc,
+    const uint8_t no_p, const uint8_t no_q)
+{
+    const int lim0 = tc * 3;    // clip range for P0 / Q0
+    const int lim1 = tc * 2;    // clip range for P1 / Q1
+    const int lim2 = tc;        // clip range for P2 / Q2
+
+    for (int line = 0; line < 4; line++, pix += ystride) {
+        const int p3 = P3, p2 = P2, p1 = P1, p0 = P0;
+        const int q0 = Q0, q1 = Q1, q2 = Q2, q3 = Q3;
+
+        if (!no_p) {
+            const int f0 = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+            const int f1 = (p2 + p1 + p0 + q0 + 2) >> 2;
+            const int f2 = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3;
+
+            P0 = p0 + av_clip(f0 - p0, -lim0, lim0);
+            P1 = p1 + av_clip(f1 - p1, -lim1, lim1);
+            P2 = p2 + av_clip(f2 - p2, -lim2, lim2);
+        }
+        if (!no_q) {
+            const int f0 = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+            const int f1 = (p0 + q0 + q1 + q2 + 2) >> 2;
+            const int f2 = (2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3;
+
+            Q0 = q0 + av_clip(f0 - q0, -lim0, lim0);
+            Q1 = q1 + av_clip(f1 - q1, -lim1, lim1);
+            Q2 = q2 + av_clip(f2 - q2, -lim2, lim2);
+        }
+    }
+}
+
+/*
+ * Weak (normal) luma deblocking filter for one segment of four lines.
+ *
+ * A correction delta is derived from p1, p0, q0 and q1.  A line is left
+ * untouched when |delta| >= 10 * tc; otherwise delta is clipped to [-tc, tc]
+ * and applied with opposite signs to P0 and Q0.  P1 / Q1 are additionally
+ * corrected, limited to tc >> 1, when nd_p / nd_q is greater than 1.
+ *
+ * pix      base pointer used by the Pn / Qn accessors; advanced by ystride per line
+ * xstride  consumed by the Pn / Qn accessor macros (defined outside this function)
+ * ystride  step, in pixels, from one filtered line to the next
+ * tc       clipping threshold
+ * beta     accepted for the caller's convenience; not used here
+ * no_p     non-zero leaves the P side untouched
+ * no_q     non-zero leaves the Q side untouched
+ * nd_p     P1 is modified only when this is greater than 1
+ * nd_q     Q1 is modified only when this is greater than 1
+ */
+static void FUNC(loop_filter_luma_weak)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride,
+    const int32_t tc, const int32_t beta, const uint8_t no_p, const uint8_t no_q, const int nd_p, const int nd_q)
+{
+    const int half_tc = tc >> 1;
+
+    for (int line = 0; line < 4; line++, pix += ystride) {
+        const int p2 = P2, p1 = P1, p0 = P0;
+        const int q0 = Q0, q1 = Q1, q2 = Q2;
+        int delta = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+
+        // step too large: leave this line untouched
+        if (abs(delta) >= 10 * tc)
+            continue;
+
+        delta = av_clip(delta, -tc, tc);
+
+        if (!no_p) {
+            P0 = av_clip_pixel(p0 + delta);
+            if (nd_p > 1) {
+                const int delta_p1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1, -half_tc, half_tc);
+                P1 = av_clip_pixel(p1 + delta_p1);
+            }
+        }
+        if (!no_q) {
+            Q0 = av_clip_pixel(q0 - delta);
+            if (nd_q > 1) {
+                const int delta_q1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta) >> 1, -half_tc, half_tc);
+                Q1 = av_clip_pixel(q1 + delta_q1);
+            }
+        }
+    }
+}
+
+/*
+ * Luma deblocking of one edge, processed as two independent 4-line segments.
+ *
+ * For each segment i the function picks one of three filters:
+ *   - the long-tap filter (loop_filter_luma_large) when either side allows more
+ *     than three modified samples and the activity / flatness tests pass,
+ *   - the strong filter when both sides allow more than two modified samples
+ *     and the short-tap tests pass,
+ *   - the weak filter otherwise,
+ * or leaves the segment untouched when tc is zero or the activity d0 + d3
+ * is not below beta.
+ *
+ * Pn / Qn read the current line; TPn / TQn are accessor macros defined outside
+ * this chunk (presumably addressing the last line of the 4-line segment).
+ *
+ * _pix          first sample of the edge
+ * _xstride      stride in bytes consumed by the sample accessors
+ * _ystride      stride in bytes between consecutive filtered lines
+ * _beta, _tc    per-segment thresholds; both are rescaled to BIT_DEPTH here
+ * _no_p, _no_q  per-segment flags, non-zero disables writes to that side
+ * _max_len_p/q  per-segment maximum number of samples that may be modified
+ * hor_ctu_edge  non-zero prevents the P side from being treated as a large block
+ */
+static void FUNC(vvc_loop_filter_luma)(uint8_t *_pix, ptrdiff_t _xstride, ptrdiff_t _ystride,
+    const int32_t *_beta, const int32_t *_tc, const uint8_t *_no_p, const uint8_t *_no_q,
+    const uint8_t *_max_len_p, const uint8_t *_max_len_q, int hor_ctu_edge)
+{
+    // divide as signed values: sizeof() is unsigned and would otherwise force
+    // an unsigned division of the stride
+    const ptrdiff_t xstride = _xstride / (ptrdiff_t)sizeof(pixel);
+    const ptrdiff_t ystride = _ystride / (ptrdiff_t)sizeof(pixel);
+
+    for (int i = 0; i < 2; i++) {
+        pixel *pix      = (pixel *)_pix + i * 4 * ystride;
+        // second-difference activity on each side, sampled on two lines
+        const int dp0   = abs(P2 - 2 * P1 + P0);
+        const int dq0   = abs(Q2 - 2 * Q1 + Q0);
+        const int dp3   = abs(TP2 - 2 * TP1 + TP0);
+        const int dq3   = abs(TQ2 - 2 * TQ1 + TQ0);
+        const int d0    = dp0 + dq0;
+        const int d3    = dp3 + dq3;
+        // rescale tc to BIT_DEPTH: rounded right shift below 10 bits, left shift otherwise
+#if BIT_DEPTH < 10
+        const int tc    = (_tc[i] + (1 << (9 - BIT_DEPTH))) >> (10 - BIT_DEPTH);
+#else
+        const int tc    = _tc[i] << (BIT_DEPTH - 10);
+#endif
+        const int tc25  = ((tc * 5 + 1) >> 1);
+
+        const int no_p  = _no_p[i];
+        const int no_q  = _no_q[i];
+
+        int max_len_p   = _max_len_p[i];
+        int max_len_q   = _max_len_q[i];
+
+        const int large_p = (max_len_p > 3 && !hor_ctu_edge);
+        const int large_q = max_len_q > 3;
+        const int beta = _beta[i] << (BIT_DEPTH - 8);
+
+        const int beta_3 = beta >> 3;
+        const int beta_2 = beta >> 2;
+
+        if (!tc)
+            continue;
+
+        if (large_p || large_q) {
+            // on a large side the activity is averaged with that of samples 3..5
+            const int dp0l = large_p ? ((dp0 + abs(P5 - 2 * P4 + P3) + 1) >> 1) : dp0;
+            const int dq0l = large_q ? ((dq0 + abs(Q5 - 2 * Q4 + Q3) + 1) >> 1) : dq0;
+            const int dp3l = large_p ? ((dp3 + abs(TP5 - 2 * TP4 + TP3) + 1) >> 1) : dp3;
+            const int dq3l = large_q ? ((dq3 + abs(TQ5 - 2 * TQ4 + TQ3) + 1) >> 1) : dq3;
+            const int d0l = dp0l + dq0l;
+            const int d3l = dp3l + dq3l;
+            const int beta53 = beta * 3 >> 5;
+            const int beta_4 = beta >> 4;
+            // a side that is not large is limited to three samples from here on
+            max_len_p = large_p ? max_len_p : 3;
+            max_len_q = large_q ? max_len_q : 3;
+
+            if (d0l + d3l < beta) {
+                const int sp0l = abs(P3 - P0) + (max_len_p == 7 ? abs(P7 - P6 - P5 + P4) : 0);
+                const int sq0l = abs(Q0 - Q3) + (max_len_q == 7 ? abs(Q4 - Q5 - Q6 + Q7) : 0);
+                const int sp3l = abs(TP3 - TP0) + (max_len_p == 7 ? abs(TP7 - TP6 - TP5 + TP4) : 0);
+                const int sq3l = abs(TQ0 - TQ3) + (max_len_q == 7 ? abs(TQ4 - TQ5 - TQ6 + TQ7) : 0);
+                const int sp0 = large_p ? ((sp0l + abs(P3 -   P(max_len_p)) + 1) >> 1) : sp0l;
+                const int sp3 = large_p ? ((sp3l + abs(TP3 - TP(max_len_p)) + 1) >> 1) : sp3l;
+                const int sq0 = large_q ? ((sq0l + abs(Q3 -   Q(max_len_q)) + 1) >> 1) : sq0l;
+                const int sq3 = large_q ? ((sq3l + abs(TQ3 - TQ(max_len_q)) + 1) >> 1) : sq3l;
+                // both sampled lines must pass before the long-tap filter is used
+                if (sp0 + sq0 < beta53 && abs(P0 - Q0) < tc25 &&
+                    sp3 + sq3 < beta53 && abs(TP0 - TQ0) < tc25 &&
+                    (d0l << 1) < beta_4 && (d3l << 1) < beta_4) {
+                    FUNC(loop_filter_luma_large)(pix, xstride, ystride, tc, no_p, no_q, max_len_p, max_len_q);
+                    continue;
+                }
+            }
+        }
+        if (d0 + d3 < beta) {
+            if (max_len_p > 2 && max_len_q > 2 &&
+                abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
+                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+                (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
+                FUNC(loop_filter_luma_strong)(pix, xstride, ystride, tc, no_p, no_q);
+            } else { // weak filtering
+                // nd_p / nd_q == 2 lets the weak filter modify P1 / Q1 as well
+                int nd_p = 1;
+                int nd_q = 1;
+                if (max_len_p > 1 && max_len_q > 1) {
+                    if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+                        nd_p = 2;
+                    if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+                        nd_q = 2;
+                }
+                FUNC(loop_filter_luma_weak)(pix, xstride, ystride, tc, beta, no_p, no_q, nd_p, nd_q);
+            }
+        }
+    }
+}
+
+/*
+ * Strong chroma deblocking filter modifying three samples on both sides.
+ *
+ * Runs over `size` lines.  Each output is an 8-weight average of the unfiltered
+ * neighbourhood p3..q3, kept within +/- tc of the sample it replaces.
+ *
+ * pix     base pointer used by the Pn / Qn accessors; advanced by ystride per line
+ * xstride consumed by the Pn / Qn accessor macros (defined outside this function)
+ * ystride step, in pixels, from one filtered line to the next
+ * size    number of lines to filter
+ * tc      clipping threshold
+ * no_p    non-zero leaves the P side untouched
+ * no_q    non-zero leaves the Q side untouched
+ */
+static void FUNC(loop_filter_chroma_strong)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride,
+    const int size, const int32_t tc, const uint8_t no_p, const uint8_t no_q)
+{
+    for (int line = 0; line < size; line++, pix += ystride) {
+        const int p3 = P3, p2 = P2, p1 = P1, p0 = P0;
+        const int q0 = Q0, q1 = Q1, q2 = Q2, q3 = Q3;
+
+        if (!no_p) {
+            const int f0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            const int f1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            const int f2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+
+            P0 = av_clip(f0, p0 - tc, p0 + tc);
+            P1 = av_clip(f1, p1 - tc, p1 + tc);
+            P2 = av_clip(f2, p2 - tc, p2 + tc);
+        }
+        if (!no_q) {
+            const int f0 = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            const int f1 = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3;
+            const int f2 = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3;
+
+            Q0 = av_clip(f0, q0 - tc, q0 + tc);
+            Q1 = av_clip(f1, q1 - tc, q1 + tc);
+            Q2 = av_clip(f2, q2 - tc, q2 + tc);
+        }
+    }
+}
+
+/*
+ * Asymmetric strong chroma deblocking filter.
+ *
+ * Only P0 is modified on the P side, while Q0..Q2 are modified on the Q side.
+ * No sample beyond P1 is read on the P side.  Each output stays within
+ * +/- tc of the sample it replaces.
+ *
+ * pix     base pointer used by the Pn / Qn accessors; advanced by ystride per line
+ * xstride consumed by the Pn / Qn accessor macros (defined outside this function)
+ * ystride step, in pixels, from one filtered line to the next
+ * size    number of lines to filter
+ * tc      clipping threshold
+ * no_p    non-zero leaves the P side untouched
+ * no_q    non-zero leaves the Q side untouched
+ */
+static void FUNC(loop_filter_chroma_strong_one_side)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride,
+    const int size, const int32_t tc, const uint8_t no_p, const uint8_t no_q)
+{
+    for (int line = 0; line < size; line++, pix += ystride) {
+        const int p1 = P1, p0 = P0;
+        const int q0 = Q0, q1 = Q1, q2 = Q2, q3 = Q3;
+
+        if (!no_p) {
+            const int f0 = (3 * p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+
+            P0 = av_clip(f0, p0 - tc, p0 + tc);
+        }
+        if (!no_q) {
+            const int f0 = (2 * p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            const int f1 = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3;
+            const int f2 = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3;
+
+            Q0 = av_clip(f0, q0 - tc, q0 + tc);
+            Q1 = av_clip(f1, q1 - tc, q1 + tc);
+            Q2 = av_clip(f2, q2 - tc, q2 + tc);
+        }
+    }
+}
+
+/*
+ * Weak chroma deblocking filter: one sample per side.
+ *
+ * For each of `size` lines a correction is computed from p1, p0, q0 and q1,
+ * clipped to [-tc, tc], then added to P0 and subtracted from Q0.  Results are
+ * clamped with av_clip_pixel().
+ *
+ * pix     base pointer used by the Pn / Qn accessors; advanced by ystride per line
+ * xstride consumed by the Pn / Qn accessor macros (defined outside this function)
+ * ystride step, in pixels, from one filtered line to the next
+ * size    number of lines to filter
+ * tc      clipping threshold
+ * no_p    non-zero leaves the P side untouched
+ * no_q    non-zero leaves the Q side untouched
+ */
+static void FUNC(loop_filter_chroma_weak)(pixel *pix, const ptrdiff_t xstride, const ptrdiff_t ystride,
+    const int size, const int32_t tc, const uint8_t no_p, const uint8_t no_q)
+{
+    for (int line = 0; line < size; line++, pix += ystride) {
+        const int p1 = P1, p0 = P0;
+        const int q0 = Q0, q1 = Q1;
+        const int delta = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+
+        if (!no_p)
+            P0 = av_clip_pixel(p0 + delta);
+        if (!no_q)
+            Q0 = av_clip_pixel(q0 - delta);
+    }
+}
+
+/*
+ * Chroma deblocking of eight lines along one edge.
+ *
+ * The edge is split into segments of `size` lines (2 when shift is non-zero,
+ * 4 otherwise), i.e. 4 or 2 segments.  For each segment i:
+ *   - nothing is done when tc, _max_len_p[i] or _max_len_q[i] is zero,
+ *   - when _max_len_q[i] is 3 the strong-filter decision is evaluated on two
+ *     lines; if it fails both lengths fall back to 1,
+ *   - the strong, one-sided strong or weak filter is then run according to the
+ *     resulting lengths.
+ *
+ * Pn / Qn read the first line of the segment.  FPn / FQn / TPn / TQn are
+ * accessor macros defined outside this chunk (presumably addressing the last
+ * line of a 2-line or a 4-line segment respectively).
+ *
+ * _pix          first sample of the edge
+ * _xstride      stride in bytes consumed by the sample accessors
+ * _ystride      stride in bytes between consecutive filtered lines
+ * _beta, _tc    per-segment thresholds; both are rescaled to BIT_DEPTH here
+ * _no_p, _no_q  per-segment flags, non-zero disables writes to that side
+ * _max_len_p/q  per-segment maximum number of samples that may be modified
+ * shift         non-zero selects 2-line segments, zero selects 4-line segments
+ */
+static void FUNC(vvc_loop_filter_chroma)(uint8_t *_pix, const ptrdiff_t _xstride, const ptrdiff_t _ystride,
+    const int32_t *_beta, const int32_t *_tc, const uint8_t *_no_p, const uint8_t *_no_q,
+    const uint8_t *_max_len_p, const uint8_t *_max_len_q, const int shift)
+{
+    // divide as signed values: sizeof() is unsigned and would otherwise force
+    // an unsigned division of the stride
+    const ptrdiff_t xstride = _xstride / (ptrdiff_t)sizeof(pixel);
+    const ptrdiff_t ystride = _ystride / (ptrdiff_t)sizeof(pixel);
+    const int size          = shift ? 2 : 4;
+    const int end           = 8 / size;         // 8 samples a loop
+
+    for (int i = 0; i < end; i++) {
+        pixel *pix          = (pixel *)_pix + i * size * ystride;
+        const uint8_t no_p  = _no_p[i];
+        const uint8_t no_q  = _no_q[i];
+        const int beta      = _beta[i] << (BIT_DEPTH - 8);
+        const int beta_3    = beta >> 3;
+        const int beta_2    = beta >> 2;
+
+        // rescale tc to BIT_DEPTH: rounded right shift below 10 bits, left shift otherwise
+#if BIT_DEPTH < 10
+        const int tc = (_tc[i] + (1 << (9 - BIT_DEPTH))) >> (10 - BIT_DEPTH);
+#else
+        const int tc = _tc[i] << (BIT_DEPTH - 10);
+#endif
+        const int tc25      = ((tc * 5 + 1) >> 1);
+
+        uint8_t max_len_p   = _max_len_p[i];
+        uint8_t max_len_q   = _max_len_q[i];
+
+        if (!max_len_p || !max_len_q || !tc)
+            continue;
+
+        if (max_len_q == 3) {
+            // when the P side is limited to one sample, p1 stands in for p2 and
+            // p3 so that nothing beyond P1 is read on that side
+            const int p1n  = shift ? FP1 : TP1;
+            const int p2n = max_len_p == 1 ? p1n : (shift ? FP2 : TP2);
+            const int p0n  = shift ? FP0 : TP0;
+            const int q0n  = shift ? FQ0 : TQ0;
+            const int q1n  = shift ? FQ1 : TQ1;
+            const int q2n  = shift ? FQ2 : TQ2;
+            const int p3   = max_len_p == 1 ? P1 : P3;
+            const int p2   = max_len_p == 1 ? P1 : P2;
+            const int p1   = P1;
+            const int p0   = P0;
+            const int dp0  = abs(p2 - 2 * p1 + p0);
+            const int dq0  = abs(Q2 - 2 * Q1 + Q0);
+
+            const int dp1 = abs(p2n - 2 * p1n + p0n);
+            const int dq1 = abs(q2n - 2 * q1n + q0n);
+            const int d0  = dp0 + dq0;
+            const int d1  = dp1 + dq1;
+
+            if (d0 + d1 < beta) {
+                const int p3n = max_len_p == 1 ? p1n : (shift ? FP3 : TP3);
+                const int q3n = shift ? FQ3 : TQ3;
+                const int dsam0 = (d0 << 1) < beta_2 && (abs(p3 - p0) + abs(Q0 - Q3)     < beta_3) &&
+                    abs(p0 - Q0)   < tc25;
+                const int dsam1 = (d1 << 1) < beta_2 && (abs(p3n - p0n) + abs(q0n - q3n) < beta_3) &&
+                    abs(p0n - q0n) < tc25;
+                // both sampled lines must pass, otherwise fall back to the weak filter
+                if (!dsam0 || !dsam1)
+                    max_len_p = max_len_q = 1;
+            } else {
+                max_len_p = max_len_q = 1;
+            }
+        }
+
+        if (max_len_p == 3 && max_len_q == 3)
+            FUNC(loop_filter_chroma_strong)(pix, xstride, ystride, size, tc, no_p, no_q);
+        else if (max_len_q == 3)
+            FUNC(loop_filter_chroma_strong_one_side)(pix, xstride, ystride, size, tc, no_p, no_q);
+        else
+            FUNC(loop_filter_chroma_weak)(pix, xstride, ystride, size, tc, no_p, no_q);
+    }
+}
+
+/*
+ * "h" entry point of the chroma deblocking filter.
+ *
+ * Forwards to vvc_loop_filter_chroma() with `stride` as the x stride and one
+ * pixel (sizeof(pixel) bytes) as the y stride, so consecutive filtered lines
+ * are adjacent samples in memory.  All other arguments are passed through.
+ */
+static void FUNC(vvc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+    const int32_t *beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q,
+    const uint8_t *max_len_p, const uint8_t *max_len_q, int shift)
+{
+    const ptrdiff_t x_step = stride;
+    const ptrdiff_t y_step = sizeof(pixel);
+
+    FUNC(vvc_loop_filter_chroma)(pix, x_step, y_step, beta, tc,
+        no_p, no_q, max_len_p, max_len_q, shift);
+}
+
+/*
+ * "v" entry point of the chroma deblocking filter.
+ *
+ * Forwards to vvc_loop_filter_chroma() with one pixel (sizeof(pixel) bytes) as
+ * the x stride and `stride` as the y stride, so consecutive filtered lines are
+ * `stride` bytes apart.  All other arguments are passed through.
+ */
+static void FUNC(vvc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+    const int32_t *beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q,
+    const uint8_t *max_len_p, const uint8_t *max_len_q, int shift)
+{
+    const ptrdiff_t x_step = sizeof(pixel);
+    const ptrdiff_t y_step = stride;
+
+    FUNC(vvc_loop_filter_chroma)(pix, x_step, y_step, beta, tc,
+        no_p, no_q, max_len_p, max_len_q, shift);
+}
+
+/*
+ * "h" entry point of the luma deblocking filter.
+ *
+ * Forwards to vvc_loop_filter_luma() with `stride` as the x stride and one
+ * pixel (sizeof(pixel) bytes) as the y stride, so consecutive filtered lines
+ * are adjacent samples in memory.  All other arguments are passed through.
+ */
+static void FUNC(vvc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+    const int32_t *beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q,
+    const uint8_t *max_len_p, const uint8_t *max_len_q, int hor_ctu_edge)
+{
+    const ptrdiff_t x_step = stride;
+    const ptrdiff_t y_step = sizeof(pixel);
+
+    FUNC(vvc_loop_filter_luma)(pix, x_step, y_step, beta, tc,
+        no_p, no_q, max_len_p, max_len_q, hor_ctu_edge);
+}
+
+/*
+ * "v" entry point of the luma deblocking filter.
+ *
+ * Forwards to vvc_loop_filter_luma() with one pixel (sizeof(pixel) bytes) as
+ * the x stride and `stride` as the y stride, so consecutive filtered lines are
+ * `stride` bytes apart.  All other arguments are passed through.
+ */
+static void FUNC(vvc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+    const int32_t *beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q,
+    const uint8_t *max_len_p, const uint8_t *max_len_q, int hor_ctu_edge)
+{
+    const ptrdiff_t x_step = sizeof(pixel);
+    const ptrdiff_t y_step = stride;
+
+    FUNC(vvc_loop_filter_luma)(pix, x_step, y_step, beta, tc,
+        no_p, no_q, max_len_p, max_len_q, hor_ctu_edge);
+}
+
+/*
+ * Returns the truncated average of four samples next to an edge: P0 and Q0 on
+ * the current line plus TP0 and TQ0 (accessor macros defined outside this
+ * chunk; presumably the last line of a 4-line segment).
+ *
+ * _pix      first sample of the edge; only read
+ * _xstride  stride in bytes consumed by the sample accessors
+ * _ystride  stride in bytes between consecutive lines
+ */
+static int FUNC(vvc_loop_ladf_level)(const uint8_t *_pix, const ptrdiff_t _xstride, const ptrdiff_t _ystride)
+{
+    // keep the const qualifier through the cast; the samples are read-only here
+    const pixel *pix        = (const pixel *)_pix;
+    // divide as signed values: sizeof() is unsigned and would otherwise force
+    // an unsigned division of the stride
+    const ptrdiff_t xstride = _xstride / (ptrdiff_t)sizeof(pixel);
+    const ptrdiff_t ystride = _ystride / (ptrdiff_t)sizeof(pixel);
+    return (P0 + TP0 + Q0 + TQ0) >> 2;
+}
+
+/*
+ * "h" entry point of the ladf level computation: calls vvc_loop_ladf_level()
+ * with `stride` as the x stride and sizeof(pixel) as the y stride.
+ */
+static int FUNC(vvc_h_loop_ladf_level)(const uint8_t *pix, ptrdiff_t stride)
+{
+    const ptrdiff_t x_step = stride;
+    const ptrdiff_t y_step = sizeof(pixel);
+
+    return FUNC(vvc_loop_ladf_level)(pix, x_step, y_step);
+}
+
+/*
+ * "v" entry point of the ladf level computation: calls vvc_loop_ladf_level()
+ * with sizeof(pixel) as the x stride and `stride` as the y stride.
+ */
+static int FUNC(vvc_v_loop_ladf_level)(const uint8_t *pix, ptrdiff_t stride)
+{
+    const ptrdiff_t x_step = sizeof(pixel);
+    const ptrdiff_t y_step = stride;
+
+    return FUNC(vvc_loop_ladf_level)(pix, x_step, y_step);
+}
+
+#undef P7
+#undef P6
+#undef P5
+#undef P4
+#undef P3
+#undef P2
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+#undef Q4
+#undef Q5
+#undef Q6
+#undef Q7
+
+#undef TP7
+#undef TP6
+#undef TP5
+#undef TP4
+#undef TP3
+#undef TP2
+#undef TP1
+#undef TP0
+#undef TQ0
+#undef TQ1
+#undef TQ2
+#undef TQ3
+#undef TQ4
+#undef TQ5
+#undef TQ6
+#undef TQ7
+
+/*
+ * Fills the LMCS DSP context with the implementation generated for the
+ * current template instantiation (FUNC() decorates the function names).
+ */
+static void FUNC(ff_vvc_lmcs_dsp_init)(VVCLMCSDSPContext *const lmcs)
+{
+    lmcs->filter = FUNC(lmcs_filter_luma);
+}
+
+/*
+ * Fills the loop-filter DSP context with the implementations generated for the
+ * current template instantiation.  In every table, slot 0 receives the "h"
+ * variant and slot 1 the "v" variant.
+ */
+static void FUNC(ff_vvc_lf_dsp_init)(VVCLFDSPContext *const lf)
+{
+    // slot 0: "h" variants
+    lf->ladf_level[0]    = FUNC(vvc_h_loop_ladf_level);
+    lf->filter_luma[0]   = FUNC(vvc_h_loop_filter_luma);
+    lf->filter_chroma[0] = FUNC(vvc_h_loop_filter_chroma);
+
+    // slot 1: "v" variants
+    lf->ladf_level[1]    = FUNC(vvc_v_loop_ladf_level);
+    lf->filter_luma[1]   = FUNC(vvc_v_loop_filter_luma);
+    lf->filter_chroma[1] = FUNC(vvc_v_loop_filter_chroma);
+}
+
+/*
+ * Fills the SAO DSP context with the implementations generated for the current
+ * template instantiation.  Every slot of band_filter[] and edge_filter[] gets
+ * the same function; edge_restore[] gets one function per slot.
+ */
+static void FUNC(ff_vvc_sao_dsp_init)(VVCSAODSPContext *const sao)
+{
+    sao->edge_restore[0] = FUNC(sao_edge_restore_0);
+    sao->edge_restore[1] = FUNC(sao_edge_restore_1);
+
+    for (int slot = 0; slot < FF_ARRAY_ELEMS(sao->band_filter); slot++)
+        sao->band_filter[slot] = FUNC(sao_band_filter);
+
+    for (int slot = 0; slot < FF_ARRAY_ELEMS(sao->edge_filter); slot++)
+        sao->edge_filter[slot] = FUNC(sao_edge_filter);
+}
+
+/*
+ * Fills the ALF DSP context with the implementations generated for the current
+ * template instantiation: classification, coefficient/clip reconstruction,
+ * the luma and chroma filters and the cross-component filter.
+ */
+static void FUNC(ff_vvc_alf_dsp_init)(VVCALFDSPContext *const alf)
+{
+    alf->classify             = FUNC(alf_classify);
+    alf->recon_coeff_and_clip = FUNC(alf_recon_coeff_and_clip);
+
+    alf->filter[LUMA]         = FUNC(alf_filter_luma);
+    alf->filter[CHROMA]       = FUNC(alf_filter_chroma);
+    alf->filter_cc            = FUNC(alf_filter_cc);
+}