diff mbox series

[FFmpeg-devel,09/14] vvcdec: add intra prediction

Message ID 20230521130319.13813-10-nuomi2021@gmail.com
State Superseded
Headers show
Series add vvc decoder c code | expand

Checks

Context Check Description
andriy/make_x86 fail Make failed

Commit Message

Nuo Mi May 21, 2023, 1:03 p.m. UTC
---
 libavcodec/vvc/Makefile             |    3 +-
 libavcodec/vvc/vvc_ctu.c            |   40 ++
 libavcodec/vvc/vvc_ctu.h            |    2 +
 libavcodec/vvc/vvc_intra.c          |  763 ++++++++++++++++++++
 libavcodec/vvc/vvc_intra.h          |   49 ++
 libavcodec/vvc/vvc_intra_template.c | 1018 +++++++++++++++++++++++++++
 6 files changed, 1874 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/vvc/vvc_intra.c
 create mode 100644 libavcodec/vvc/vvc_intra.h
 create mode 100644 libavcodec/vvc/vvc_intra_template.c
diff mbox series

Patch

diff --git a/libavcodec/vvc/Makefile b/libavcodec/vvc/Makefile
index d536a6b1ea..9d95a15424 100644
--- a/libavcodec/vvc/Makefile
+++ b/libavcodec/vvc/Makefile
@@ -10,4 +10,5 @@  OBJS-$(CONFIG_VVC_DECODER)          +=  vvc/vvcdec.o            \
                                         vvc/vvc_mvs.o           \
                                         vvc/vvc_ctu.o           \
                                         vvc/vvc_inter.o         \
-                                        vvc/vvc_itx_1d.o
+                                        vvc/vvc_itx_1d.o        \
+                                        vvc/vvc_intra.o
diff --git a/libavcodec/vvc/vvc_ctu.c b/libavcodec/vvc/vvc_ctu.c
index 4e6e582718..a5fb788f6e 100644
--- a/libavcodec/vvc/vvc_ctu.c
+++ b/libavcodec/vvc/vvc_ctu.c
@@ -25,6 +25,35 @@ 
 #include "vvc_data.h"
 #include "vvc_mvs.h"
 
+void ff_vvc_decode_neighbour(VVCLocalContext *lc, const int x_ctb, const int y_ctb,
+    const int rx, const int ry, const int rs) // fills lc's tile ends, boundary flags and CTB neighbour flags for CTB (rx, ry)
+{
+    VVCFrameContext *fc = lc->fc;
+    const int ctb_size         = fc->ps.sps->ctb_size_y;
+
+    lc->end_of_tiles_x = fc->ps.sps->width;  // start from the picture size, then cut at this CTB's end
+    lc->end_of_tiles_y = fc->ps.sps->height; // when the next CTB column/row has a different boundary entry
+    if (fc->ps.pps->ctb_to_col_bd[rx] != fc->ps.pps->ctb_to_col_bd[rx + 1])
+        lc->end_of_tiles_x = FFMIN(x_ctb + ctb_size, lc->end_of_tiles_x);
+    if (fc->ps.pps->ctb_to_row_bd[ry] != fc->ps.pps->ctb_to_row_bd[ry + 1])
+        lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, lc->end_of_tiles_y);
+
+    lc->boundary_flags = 0; // rebuilt from scratch for every CTB
+    if (rx > 0 && fc->ps.pps->ctb_to_col_bd[rx] != fc->ps.pps->ctb_to_col_bd[rx - 1])
+        lc->boundary_flags |= BOUNDARY_LEFT_TILE;
+    if (rx > 0 && fc->tab.slice_idx[rs] != fc->tab.slice_idx[rs - 1])
+        lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
+    if (ry > 0 && fc->ps.pps->ctb_to_row_bd[ry] != fc->ps.pps->ctb_to_row_bd[ry - 1])
+        lc->boundary_flags |= BOUNDARY_UPPER_TILE;
+    if (ry > 0 && fc->tab.slice_idx[rs] != fc->tab.slice_idx[rs - fc->ps.pps->ctb_width]) // rs - ctb_width: the CTB one row up
+        lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
+    lc->ctb_left_flag = rx > 0 && !(lc->boundary_flags & BOUNDARY_LEFT_TILE); // BOUNDARY_LEFT_SLICE is not consulted here
+    lc->ctb_up_flag   = ry > 0 && !(lc->boundary_flags & BOUNDARY_UPPER_TILE) && !(lc->boundary_flags & BOUNDARY_UPPER_SLICE);
+    lc->ctb_up_right_flag = lc->ctb_up_flag && (fc->ps.pps->ctb_to_col_bd[rx] == fc->ps.pps->ctb_to_col_bd[rx + 1]) &&
+        (fc->ps.pps->ctb_to_row_bd[ry] == fc->ps.pps->ctb_to_row_bd[ry - 1]); // [ry - 1] is only read when ctb_up_flag is set, i.e. ry > 0
+    lc->ctb_up_left_flag = lc->ctb_left_flag && lc->ctb_up_flag;
+}
+
 void ff_vvc_set_neighbour_available(VVCLocalContext *lc,
     const int x0, const int y0, const int w, const int h)
 {
@@ -39,3 +68,14 @@  void ff_vvc_set_neighbour_available(VVCLocalContext *lc,
             (x0b + w == 1 << log2_ctb_size) ? lc->ctb_up_right_flag && !y0b : lc->na.cand_up;
     lc->na.cand_up_right = lc->na.cand_up_right_sap && (x0 + w) < lc->end_of_tiles_x;
 }
+
+void ff_vvc_ctu_free_cus(CTU *ctu) // unreferences the buffer of every CodingUnit on ctu->cus; ctu->cus is NULL afterwards
+{
+    while (ctu->cus) {
+        CodingUnit *cu      = ctu->cus;
+        AVBufferRef *buf    = cu->buf;
+
+        ctu->cus = ctu->cus->next; // advance before the unref; presumably cu itself is stored in buf - TODO confirm
+        av_buffer_unref(&buf);
+    }
+}
diff --git a/libavcodec/vvc/vvc_ctu.h b/libavcodec/vvc/vvc_ctu.h
index 02c757559e..92a4fcd539 100644
--- a/libavcodec/vvc/vvc_ctu.h
+++ b/libavcodec/vvc/vvc_ctu.h
@@ -388,4 +388,6 @@  struct ALFParams {
 
 //utils
 void ff_vvc_set_neighbour_available(VVCLocalContext *lc, int x0, int y0, int w, int h);
+void ff_vvc_decode_neighbour(VVCLocalContext *lc, int x_ctb, int y_ctb, int rx, int ry, int rs);
+void ff_vvc_ctu_free_cus(CTU *ctu);
 #endif // AVCODEC_VVC_CTU_H
diff --git a/libavcodec/vvc/vvc_intra.c b/libavcodec/vvc/vvc_intra.c
new file mode 100644
index 0000000000..ec5be8fef5
--- /dev/null
+++ b/libavcodec/vvc/vvc_intra.c
@@ -0,0 +1,763 @@ 
+/*
+ * VVC intra prediction
+ *
+ * Copyright (C) 2021 Nuo Mi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vvc_data.h"
+#include "vvc_inter.h"
+#include "vvc_intra.h"
+#include "vvc_itx_1d.h"
+
+static int is_cclm(enum IntraPredMode mode) // nonzero when mode is INTRA_LT_CCLM, INTRA_L_CCLM or INTRA_T_CCLM
+{
+    return mode == INTRA_LT_CCLM || mode == INTRA_L_CCLM || mode == INTRA_T_CCLM;
+}
+
+static int derive_ilfnst_pred_mode_intra(const VVCLocalContext *lc, const TransformBlock *tb) // intra mode that ilfnst_transform() passes to the inverse LFNST
+{
+    const VVCFrameContext *fc = lc->fc;
+    const VVCSPS *sps = fc->ps.sps;
+    const CodingUnit *cu          = lc->cu;
+    const int x_tb                = tb->x0 >> fc->ps.sps->min_cb_log2_size_y; // tb origin in min-CB units, used to index the fc->tab maps
+    const int y_tb                = tb->y0 >> fc->ps.sps->min_cb_log2_size_y;
+    const int x_c                 = (tb->x0 + (tb->tb_width << sps->hshift[1] >> 1) ) >> fc->ps.sps->min_cb_log2_size_y; // block centre, tb size scaled up by the chroma shifts
+    const int y_c                 = (tb->y0 + (tb->tb_height << sps->vshift[1] >> 1)) >> fc->ps.sps->min_cb_log2_size_y;
+    const int min_cb_width        = fc->ps.pps->min_cb_width; // not referenced directly; presumably consumed by SAMPLE_CTB - TODO confirm
+    const int intra_mip_flag      = SAMPLE_CTB(fc->tab.imf, x_tb, y_tb);
+    int pred_mode_intra = tb->c_idx == 0 ? cu->intra_pred_mode_y : cu->intra_pred_mode_c;
+    if (intra_mip_flag && !tb->c_idx) { // luma block flagged in the imf map: use planar
+        pred_mode_intra = INTRA_PLANAR;
+    } else if (is_cclm(pred_mode_intra)) { // CCLM: take the mode from what is stored at the centre position
+        int intra_mip_flag_c = SAMPLE_CTB(fc->tab.imf, x_c, y_c);
+        int cu_pred_mode = SAMPLE_CTB(fc->tab.cpm[0], x_c, y_c);
+        if (intra_mip_flag_c) {
+            pred_mode_intra = INTRA_PLANAR;
+        } else if (cu_pred_mode == MODE_IBC || cu_pred_mode == MODE_PLT) {
+            pred_mode_intra = INTRA_DC;
+        } else {
+            pred_mode_intra = SAMPLE_CTB(fc->tab.ipm, x_c, y_c);
+        }
+    }
+    pred_mode_intra = ff_vvc_wide_angle_mode_mapping(cu, tb->tb_width, tb->tb_height, tb->c_idx, pred_mode_intra); // final step: wide-angle remapping
+
+    return pred_mode_intra;
+}
+
+//8.7.4 Transformation process for scaled transform coefficients: inverse LFNST applied in place on tb->coeffs
+static void ilfnst_transform(const VVCLocalContext *lc, TransformBlock *tb)
+{
+    const CodingUnit *cu        = lc->cu;
+    const int w                 = tb->tb_width;
+    const int h                 = tb->tb_height;
+    const int n_lfnst_out_size  = (w >= 8 && h >= 8) ? 48 : 16;                         ///< nLfnstOutSize
+    const int log2_lfnst_size   = (w >= 8 && h >= 8) ? 3 : 2;                           ///< log2LfnstSize
+    const int n_lfnst_size      = 1 << log2_lfnst_size;                                 ///< nLfnstSize
+    const int non_zero_size     = ((w == 8 && h == 8) || (w == 4 && h == 4)) ? 8 : 16;  ///< nonZeroSize
+    const int pred_mode_intra   = derive_ilfnst_pred_mode_intra(lc, tb);
+    const int transpose         = pred_mode_intra > 34; // modes above 34 write the output transposed
+    int u[16], v[48]; // u: gathered input (at most 16 values), v: transform output (at most 48 values)
+
+    for (int x = 0; x < non_zero_size; x++) { // gather the first non_zero_size coefficients via the diagonal scan tables
+        int xc = ff_vvc_diag_scan_x[2][2][x];
+        int yc = ff_vvc_diag_scan_y[2][2][x];
+        u[x] = tb->coeffs[w * yc + xc];
+    }
+    ff_vvc_inv_lfnst_1d(v, u, non_zero_size, n_lfnst_out_size, pred_mode_intra, cu->lfnst_idx);
+    if (transpose) {
+        int *dst = tb->coeffs;
+        const int *src = v;
+        if (n_lfnst_size == 4) { // 16 outputs: v is read column-wise into a 4x4 corner
+            for (int y = 0; y < 4; y++) {
+                dst[0] = src[0];
+                dst[1] = src[4];
+                dst[2] = src[8];
+                dst[3] = src[12];
+                src++;
+                dst += w;
+            }
+        } else { // 48 outputs: 8 rows, of which only the first 4 receive columns 4..7
+            for (int y = 0; y < 8; y++) {
+                dst[0] = src[0];
+                dst[1] = src[8];
+                dst[2] = src[16];
+                dst[3] = src[24];
+                if (y < 4) {
+                    dst[4] = src[32];
+                    dst[5] = src[36];
+                    dst[6] = src[40];
+                    dst[7] = src[44];
+                }
+                src++;
+                dst += w;
+            }
+        }
+
+    } else {
+        int *dst = tb->coeffs;
+        const int *src = v;
+        for (int y = 0; y < n_lfnst_size; y++) { // straight copy, row by row
+            int size = (y < 4) ? n_lfnst_size : 4; // rows 4 and below carry only 4 values
+            memcpy(dst, src, size * sizeof(int));
+            src += size;
+            dst += w;
+        }
+    }
+    tb->max_scan_x = n_lfnst_size - 1; // the scan window now covers the whole LFNST corner
+    tb->max_scan_y = n_lfnst_size - 1;
+}
+
+//part of 8.7.4 Transformation process for scaled transform coefficients: selects the horizontal (*trh) and vertical (*trv) kernel
+static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalContext *lc, const TransformBlock *tb, enum TxType *trh, enum TxType *trv)
+{
+    const CodingUnit *cu = lc->cu;
+    static const enum TxType mts_to_trh[] = {DCT2, DST7, DCT8, DST7, DCT8}; // indexed by cu->mts_idx
+    static const enum TxType mts_to_trv[] = {DCT2, DST7, DST7, DCT8, DCT8}; // indexed by cu->mts_idx
+    const VVCSPS *sps       = fc->ps.sps;
+    int implicit_mts_enabled = 0;
+    if (tb->c_idx || (cu->isp_split_type != ISP_NO_SPLIT && cu->lfnst_idx)) { // chroma, or ISP combined with LFNST: DCT2 both ways
+        *trh = *trv = DCT2;
+        return;
+    }
+
+    if (sps->mts_enabled_flag) {
+        if (cu->isp_split_type != ISP_NO_SPLIT ||
+            (cu->sbt_flag && FFMAX(tb->tb_width, tb->tb_height) <= 32) ||
+            (!sps->explicit_mts_intra_enabled_flag && cu->pred_mode == MODE_INTRA &&
+            !cu->lfnst_idx && !cu->intra_mip_flag)) {
+            implicit_mts_enabled = 1;
+        }
+    }
+    if (implicit_mts_enabled) {
+        const int w = tb->tb_width;
+        const int h = tb->tb_height;
+        if (cu->sbt_flag) { // SBT: kernels follow the split direction and position flags
+            *trh = (cu->sbt_horizontal_flag  || cu->sbt_pos_flag) ? DST7 : DCT8;
+            *trv = (!cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? DST7 : DCT8;
+        } else { // otherwise DST7 for sides of 4..16 samples, DCT2 for the rest
+            *trh = (w >= 4 && w <= 16) ? DST7 : DCT2;
+            *trv = (h >= 4 && h <= 16) ? DST7 : DCT2;
+        }
+        return;
+    }
+    *trh = mts_to_trh[cu->mts_idx]; // no implicit selection: table lookup on cu->mts_idx
+    *trv = mts_to_trv[cu->mts_idx];
+}
+
+static void add_residual_for_joint_coding_chroma(VVCLocalContext *lc,
+    const TransformUnit *tu, TransformBlock *tb, const int chroma_scale) // adds tb's residual, sign/shift adjusted, to plane 1 + tu->coded_flag[1]
+{
+    const VVCFrameContext *fc  = lc->fc;
+    const CodingUnit *cu = lc->cu;
+    const int c_sign = 1 - 2 * fc->ps.ph->joint_cbcr_sign_flag; // +1 or -1
+    const int shift  = tu->coded_flag[1] ^ tu->coded_flag[2];   // 1 when exactly one of the two flags is set
+    const int c_idx  = 1 + tu->coded_flag[1];                   // target plane: 2 if coded_flag[1] is set, else 1
+    const ptrdiff_t stride = fc->frame->linesize[c_idx];
+    const int hs = fc->ps.sps->hshift[c_idx];
+    const int vs = fc->ps.sps->vshift[c_idx];
+    uint8_t *dst = &fc->frame->data[c_idx][(tb->y0 >> vs) * stride +
+                                          ((tb->x0 >> hs) << fc->ps.sps->pixel_shift)];
+    if (chroma_scale) { // derive the joint residual in place, scale it, then add it
+        fc->vvcdsp.itx.pred_residual_joint(tb->coeffs, tb->tb_width, tb->tb_height, c_sign, shift);
+        fc->vvcdsp.intra.lmcs_scale_chroma(lc, tb->coeffs, tb->coeffs, tb->tb_width, tb->tb_height, cu->x0, cu->y0);
+        fc->vvcdsp.itx.add_residual(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride);
+    } else { // no scaling: a single DSP call does the derivation and the add
+        fc->vvcdsp.itx.add_residual_joint(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride, c_sign, shift);
+    }
+}
+
+static int add_reconstructed_area(VVCLocalContext *lc, const int ch_type, const int x0, const int y0, const int w, const int h) // appends a rectangle to lc->ras[ch_type]
+{
+    const VVCSPS *sps       = lc->fc->ps.sps;
+    const int hs = sps->hshift[ch_type];
+    const int vs = sps->vshift[ch_type];
+    ReconstructedArea *a;
+
+    if (lc->num_ras[ch_type] >= FF_ARRAY_ELEMS(lc->ras[ch_type])) // list full: report an error instead of writing past the array
+        return AVERROR_INVALIDDATA;
+
+    a = &lc->ras[ch_type][lc->num_ras[ch_type]];
+    a->x = x0 >> hs; // stored after applying the hshift/vshift of ch_type
+    a->y = y0 >> vs;
+    a->w = w >> hs;
+    a->h = h >> vs;
+    lc->num_ras[ch_type]++;
+
+    return 0; // 0 on success, AVERROR_INVALIDDATA when the list is full
+}
+
+static void add_tu_area(const TransformUnit *tu, int *x0, int *y0, int *w, int *h) // copies tu's origin and size into the four outputs
+{
+    *x0 = tu->x0;
+    *y0 = tu->y0;
+    *w = tu->width;
+    *h = tu->height;
+}
+
+#define MIN_ISP_PRED_WIDTH 4 // narrowest width predicted at once for vertically split ISP blocks
+static int get_luma_predict_unit(const CodingUnit *cu, const TransformUnit *tu, const int idx, int *x0, int *y0, int *w, int *h)
+{ // writes the luma prediction rectangle for tu number idx; returns nonzero when this tu triggers a prediction
+    int has_luma = 1;
+    add_tu_area(tu, x0, y0, w, h);
+    if (cu->isp_split_type == ISP_VER_SPLIT && tu->width < MIN_ISP_PRED_WIDTH) {
+        *w = MIN_ISP_PRED_WIDTH; // widen to 4 samples so several narrow TUs are predicted together
+        has_luma = !(idx % (MIN_ISP_PRED_WIDTH / tu->width)); // only the first TU of each group predicts
+    }
+    return has_luma;
+}
+
+static int get_chroma_predict_unit(const CodingUnit *cu, const TransformUnit *tu, const int idx, int *x0, int *y0, int *w, int *h)
+{ // writes the chroma prediction rectangle; returns nonzero when tu number idx triggers a prediction
+    if (cu->isp_split_type == ISP_NO_SPLIT) { // no ISP: predict per TU
+        add_tu_area(tu, x0, y0, w, h);
+        return 1;
+    }
+    if (idx == cu->num_intra_subpartitions - 1) { // ISP: one prediction over the whole CU, issued with the last sub-partition
+        *x0 = cu->x0;
+        *y0 = cu->y0;
+        *w = cu->cb_width;
+        *h = cu->cb_height;
+        return 1;
+    }
+    return 0; // outputs are left untouched in this case
+}
+
+//8.4.5.1 General decoding process for intra blocks: predicts tu number idx for target_ch_type (0 luma, nonzero chroma)
+static void predict_intra(VVCLocalContext *lc, const TransformUnit *tu, const int idx, const int target_ch_type)
+{
+    const VVCFrameContext *fc         = lc->fc;
+    const CodingUnit *cu        = lc->cu;
+    const VVCTreeType tree_type = cu->tree_type;
+    int x0, y0, w, h;
+    if (cu->pred_mode != MODE_INTRA) { // non-intra CU: only record the TU area as reconstructed
+        add_reconstructed_area(lc, target_ch_type, tu->x0, tu->y0, tu->width, tu->height); // NOTE(review): return value is ignored
+        return;
+    }
+    if (!target_ch_type && tree_type != DUAL_TREE_CHROMA) {
+        if (get_luma_predict_unit(cu, tu, idx, &x0, &y0, &w, &h)) {
+            ff_vvc_set_neighbour_available(lc, x0, y0, w, h);
+            fc->vvcdsp.intra.intra_pred(lc, x0, y0, w, h, 0);
+            add_reconstructed_area(lc, 0, x0, y0, w, h);
+        }
+    }
+    if (target_ch_type && tree_type != DUAL_TREE_LUMA) {
+        if (get_chroma_predict_unit(cu, tu, idx, &x0, &y0, &w, &h)){
+            ff_vvc_set_neighbour_available(lc, x0, y0, w, h);
+            if (is_cclm(cu->intra_pred_mode_c)) { // CCLM: a single call, no component index argument
+                fc->vvcdsp.intra.intra_cclm_pred(lc, x0, y0, w, h);
+            } else { // otherwise predict component 1 and component 2 separately
+                fc->vvcdsp.intra.intra_pred(lc, x0, y0, w, h, 1);
+                fc->vvcdsp.intra.intra_pred(lc, x0, y0, w, h, 2);
+            }
+            add_reconstructed_area(lc, 1, x0, y0, w, h);
+        }
+    }
+}
+
+static void scale_clip(int *coeff, const int nzw, const int w, const int h, const int shift) // in place: round-shift and clip columns [0, nzw), zero the rest of each row
+{
+    const int add = 1 << (shift - 1); // rounding offset: half of 1 << shift
+    for (int y = 0; y < h; y++) {
+        int *p = coeff + y * w;
+        for (int x = 0; x < nzw; x++) {
+            *p = av_clip_int16((*p + add) >> shift);
+            p++;
+        }
+        memset(p, 0, sizeof(*p) * (w - nzw)); // p now points at column nzw
+    }
+}
+
+static void scale(int *out, const int *in, const int w, const int h, const int shift) // out[i] = (in[i] + rounding) >> shift over a w x h block; no clipping
+{
+    const int add = 1 << (shift - 1); // rounding offset: half of 1 << shift
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            int *o = out + y * w + x;
+            const int *i = in + y * w + x; // same index as o, so out == in works (itx_2d calls it that way)
+            *o = (*i + add) >> shift;
+        }
+    }
+}
+
+// part of 8.7.3 Scaling process for transform coefficients: fills tb->qp, rect_non_ts_flag, bd_shift and bd_offset
+static void derive_qp(const VVCLocalContext *lc, const TransformUnit *tu, TransformBlock *tb)
+{
+    const VVCSPS *sps           = lc->fc->ps.sps;
+    const VVCSH *sh             = &lc->sc->sh;
+    const CodingUnit *cu        = lc->cu;
+    int qp, qp_act_offset;
+
+    if (tb->c_idx == 0) {
+        //fix me
+        qp = cu->qp[LUMA] + sps->qp_bd_offset; // only the luma path adds qp_bd_offset here
+        qp_act_offset = cu->act_enabled_flag ? -5 : 0;
+    } else {
+        const int is_jcbcr = tu->joint_cbcr_residual_flag && tu->coded_flag[CB] && tu->coded_flag[CR];
+        const int idx = is_jcbcr ? JCBCR : tb->c_idx; // joint residual with both chroma flags set uses the JCBCR entry
+        qp = cu->qp[idx];
+        qp_act_offset = cu->act_enabled_flag ? 1 : 0;
+    }
+    if (tb->ts) { // transform skip: raised lower bound and a fixed shift
+        const int qp_prime_ts_min = 4 + 6 * sps->min_qp_prime_ts;
+
+        tb->qp = av_clip(qp + qp_act_offset, qp_prime_ts_min, 63 + sps->qp_bd_offset);
+        tb->rect_non_ts_flag = 0;
+        tb->bd_shift = 10;
+    } else {
+        const int log_sum = tb->log2_tb_width + tb->log2_tb_height;
+        const int rect_non_ts_flag = log_sum & 1; // odd log2 area
+
+        tb->qp = av_clip(qp + qp_act_offset, 0, 63 + sps->qp_bd_offset);
+        tb->rect_non_ts_flag = rect_non_ts_flag;
+        tb->bd_shift = sps->bit_depth + rect_non_ts_flag + (log_sum / 2) - 5 + sh->dep_quant_used_flag;
+    }
+    tb->bd_offset = (1 << tb->bd_shift) >> 1; // rounding offset: half of 1 << bd_shift
+}
+
+//8.7.3 Scaling process for transform coefficients
+/*
+ * Derive the level scale for a transform block from tb->qp:
+ *     level_scale[rect_non_ts_flag][qp % 6] << (qp / 6)
+ * where qp is tb->qp, incremented by one when dependent quantization is
+ * used on a block that is not transform-skipped.
+ *
+ * derive_qp() clips tb->qp to at most 63 + qp_bd_offset, so with the +1
+ * above qp can reach 64 + qp_bd_offset. The remainder and quotient are
+ * therefore computed arithmetically rather than through fixed-size lookup
+ * tables, which keeps the derivation in bounds for every bit depth.
+ *
+ * @return the integer scale that dequant() hands to scale_coeff()
+ */
+static av_always_inline int derive_scale(const TransformBlock *tb, const int sh_dep_quant_used_flag)
+{
+    static const int level_scale[2][6] = {
+        { 40, 45, 51, 57, 64, 72 },
+        { 57, 64, 72, 80, 90, 102 }
+    };
+    const int addin = sh_dep_quant_used_flag && !tb->ts;
+    const int qp    = tb->qp + addin;
+
+    // derive_qp() never produces a negative qp, so % and / give the plain remainder and quotient
+    return level_scale[tb->rect_non_ts_flag][qp % 6] << (qp / 6);
+}
+
+//8.7.3 Scaling process for transform coefficients: returns the per-coefficient scaling factors for tb's scan window
+static const uint8_t* derive_scale_m(const VVCLocalContext *lc, const TransformBlock *tb, uint8_t *scale_m)
+{
+    //Table 38 – Specification of the scaling matrix identifier variable id according to predMode, cIdx, nTbW, and nTbH
+    static const int ids[2][3][6] = { // [pred_mode != MODE_INTRA][c_idx][max(log2 w, log2 h) - 1]; static: built once, not per call
+        {
+            {  0,  2,  8, 14, 20, 26 },
+            {  0,  3,  9, 15, 21, 21 },
+            {  0,  4, 10, 16, 22, 22 }
+        },
+        {
+            {  0,  5, 11, 17, 23, 27 },
+            {  0,  6, 12, 18, 24, 24 },
+            {  1,  7, 13, 19, 25, 25 },
+        }
+    };
+    const VVCFrameParamSets *ps = &lc->fc->ps;
+    const VVCSPS *sps           = ps->sps;
+    const VVCSH *sh             = &lc->sc->sh;
+    const CodingUnit *cu        = lc->cu;
+    const AVBufferRef *ref;
+    const VVCScalingList *sl;
+    const int id = ids[cu->pred_mode != MODE_INTRA][tb->c_idx][FFMAX(tb->log2_tb_height, tb->log2_tb_width) - 1];
+    const int log2_matrix_size = (id < 2) ? 1 : (id < 8) ? 2 : 3; // 2x2, 4x4 or 8x8 stored matrix
+    uint8_t *p = scale_m;
+
+    av_assert0(!sps->scaling_matrix_for_alternative_colour_space_disabled_flag); // this case is not handled below
+
+    if (!sh->explicit_scaling_list_used_flag || tb->ts ||
+        (sps->scaling_matrix_for_lfnst_disabled_flag && cu->apply_lfnst_flag[tb->c_idx])) // explicit grouping, same precedence as before
+        return ff_vvc_default_scale_m;
+
+    ref = ps->scaling_list[ps->ph->scaling_list_aps_id];
+    if (!ref || !ref->data) { // referenced list is missing: warn and fall back to the default factors
+        av_log(lc->fc->avctx, AV_LOG_WARNING, "bug: no scaling list aps, id = %d\n", ps->ph->scaling_list_aps_id);
+        return ff_vvc_default_scale_m;
+    }
+
+    sl = (const VVCScalingList *)ref->data;
+    for (int y = tb->min_scan_y; y <= tb->max_scan_y; y++) { // fill scale_m row by row over the scan window only
+        const int off = y << log2_matrix_size >> tb->log2_tb_height << log2_matrix_size; // row of the stored matrix, times its width
+        const uint8_t *m = &sl->scaling_matrix_rec[id][off];
+
+        for (int x = tb->min_scan_x; x <= tb->max_scan_x; x++)
+            *p++ = m[x << log2_matrix_size >> tb->log2_tb_width]; // column mapped from block size to matrix size
+    }
+    if (id >= SL_START_16x16 && !tb->min_scan_x && !tb->min_scan_y) // ids from SL_START_16x16 up have a separate DC entry
+        *scale_m = sl->scaling_matrix_dc_rec[id - SL_START_16x16];
+
+    return scale_m; // either this caller-provided buffer or ff_vvc_default_scale_m (early returns above)
+}
+
+//8.7.3 Scaling process for transform coefficients: (coeff * scale * scale_m + bd_offset) >> bd_shift, clipped to 16 bits
+static av_always_inline int scale_coeff(const TransformBlock *tb, int coeff, const int scale, const int scale_m)
+{
+    // widen to 64 bits first: the triple product can exceed INT_MAX and signed int overflow is undefined
+    const int64_t scaled = ((int64_t)coeff * scale * scale_m + tb->bd_offset) >> tb->bd_shift;
+    return av_clip64(scaled, -(1 << 15), (1 << 15) - 1);
+}
+
+static void dequant(const VVCLocalContext *lc, const TransformUnit *tu, TransformBlock *tb) // scales the nonzero coefficients inside tb's scan window in place
+{
+    uint8_t tmp[MAX_TB_SIZE * MAX_TB_SIZE]; // scratch that derive_scale_m() may fill and return
+    const VVCSH *sh         = &lc->sc->sh;
+    const uint8_t *scale_m  = derive_scale_m(lc, tb, tmp);
+    int scale;
+
+    derive_qp(lc, tu, tb); // must run before derive_scale(), which reads tb->qp
+    scale = derive_scale(tb, sh->dep_quant_used_flag);
+
+    for (int y = tb->min_scan_y; y <= tb->max_scan_y; y++) {
+        for (int x = tb->min_scan_x; x <= tb->max_scan_x; x++) {
+            int *coeff = tb->coeffs + y * tb->tb_width + x;
+
+            if (*coeff)
+                *coeff = scale_coeff(tb, *coeff, scale, *scale_m);
+            scale_m++; // advances for zero coefficients too: one factor per window position
+        }
+    }
+}
+
+static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv, int *temp) // two-pass inverse transform, result left in tb->coeffs
+{
+    const VVCSPS *sps   = fc->ps.sps;
+    const int w         = tb->tb_width;
+    const int h         = tb->tb_height;
+    const int nzw       = tb->max_scan_x + 1; // only columns up to max_scan_x are transformed in the first pass
+
+    for (int x = 0; x < nzw; x++) // first pass: trv kernel down each column (stride w), tb->coeffs -> temp
+        fc->vvcdsp.itx.itx[trv][tb->log2_tb_height - 1](temp + x, w, tb->coeffs + x, w);
+    scale_clip(temp, nzw, w, h, 7); // also zeroes the columns the first pass skipped
+
+    for (int y = 0; y < h; y++) // second pass: trh kernel along each row (stride 1), temp -> tb->coeffs
+        fc->vvcdsp.itx.itx[trh][tb->log2_tb_width - 1](tb->coeffs + y * w, 1, temp + y * w, 1);
+    scale(tb->coeffs, tb->coeffs, w, h, 20 - sps->bit_depth);
+}
+
+static void itx_1d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv, int  *temp) // single-pass transform; itransform() calls it when w or h is 1
+{
+    const VVCSPS *sps   = fc->ps.sps;
+    const int w         = tb->tb_width;
+    const int h         = tb->tb_height;
+
+    if (w > 1) // one row: horizontal kernel
+        fc->vvcdsp.itx.itx[trh][tb->log2_tb_width - 1](temp, 1, tb->coeffs, 1);
+    else       // one column: vertical kernel
+        fc->vvcdsp.itx.itx[trv][tb->log2_tb_height - 1](temp, 1, tb->coeffs, 1);
+    scale(tb->coeffs, temp, w, h, 21 - sps->bit_depth); // rounds temp back into tb->coeffs
+}
+
+static void transform_bdpcm(TransformBlock *tb, const VVCLocalContext *lc, const CodingUnit *cu) // runs the BDPCM DSP routine on tb->coeffs and widens the scan window
+{
+    const IntraPredMode mode = tb->c_idx ? cu->intra_pred_mode_c : cu->intra_pred_mode_y;
+    const int vertical       = mode == INTRA_VERT; // any other mode is handled as the non-vertical direction
+    lc->fc->vvcdsp.itx.transform_bdpcm(tb->coeffs, tb->tb_width, tb->tb_height, vertical, 15);
+    if (vertical)
+        tb->max_scan_y = tb->tb_height - 1; // extend the window to the full height
+    else
+        tb->max_scan_x = tb->tb_width - 1;  // extend the window to the full width
+}
+
+static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, const int target_ch_type) // dequantises, inverse-transforms and adds the residual of tu's blocks of one channel type
+{
+    const VVCFrameContext *fc   = lc->fc;
+    const VVCSPS *sps           = fc->ps.sps;
+    const VVCSH *sh             = &lc->sc->sh;
+    const CodingUnit *cu        = lc->cu;
+    const int ps                = fc->ps.sps->pixel_shift;
+    DECLARE_ALIGNED(32, int, temp)[MAX_TB_SIZE * MAX_TB_SIZE]; // scratch for the transforms and the scaled chroma residual
+
+    for (int i = 0; i < tu->nb_tbs; i++) { // tu_idx is not used in this function
+        TransformBlock *tb  = &tu->tbs[i];
+        const int c_idx     = tb->c_idx;
+        const int ch_type   = c_idx > 0; // 0 luma, 1 chroma
+
+        if (ch_type == target_ch_type && tb->has_coeffs) {
+            const int w             = tb->tb_width;
+            const int h             = tb->tb_height;
+            const int chroma_scale  = ch_type && sh->lmcs_used_flag && fc->ps.ph->chroma_residual_scale_flag && (w * h > 4);
+            const ptrdiff_t stride  = fc->frame->linesize[c_idx];
+            const int hs            = sps->hshift[c_idx];
+            const int vs            = sps->vshift[c_idx];
+            uint8_t *dst            = &fc->frame->data[c_idx][(tb->y0 >> vs) * stride + ((tb->x0 >> hs) << ps)];
+
+            if (cu->bdpcm_flag[tb->c_idx])
+                transform_bdpcm(tb, lc, cu);
+            dequant(lc, tu, tb);
+            if (!tb->ts) { // transform-skip blocks bypass LFNST and the inverse transform
+                enum TxType trh, trv;
+
+                if (cu->apply_lfnst_flag[c_idx])
+                    ilfnst_transform(lc, tb);
+                derive_transform_type(fc, lc, tb, &trh, &trv);
+                if (w > 1 && h > 1)
+                    itx_2d(fc, tb, trh, trv, temp);
+                else
+                    itx_1d(fc, tb, trh, trv, temp);
+            }
+
+            if (chroma_scale) // scaled residual goes to temp, tb->coeffs stays unscaled
+                fc->vvcdsp.intra.lmcs_scale_chroma(lc, temp, tb->coeffs, w, h, cu->x0, cu->y0);
+            fc->vvcdsp.itx.add_residual(dst, chroma_scale ? temp : tb->coeffs, w, h, stride);
+
+            if (tu->joint_cbcr_residual_flag && tb->c_idx) // joint coding: the same residual also feeds the other chroma plane
+                add_residual_for_joint_coding_chroma(lc, tu, tb, chroma_scale);
+        }
+    }
+}
+
+static int reconstruct(VVCLocalContext *lc) // predicts and adds residuals for every TU of lc->cu, one channel type at a time
+{
+    VVCFrameContext *fc = lc->fc;
+    CodingUnit *cu      = lc->cu;
+    const int start     = cu->tree_type == DUAL_TREE_CHROMA; // chroma-only tree: start at channel type 1
+    const int end       = fc->ps.sps->chroma_format_idc && (cu->tree_type != DUAL_TREE_LUMA); // 1 when chroma exists and this tree carries it
+
+    for (int ch_type = start; ch_type <= end; ch_type++) {
+        for (int i = 0; i < cu->num_tus; i++) {
+            TransformUnit *tu = &cu->tus[i];
+
+            predict_intra(lc, tu, i, ch_type); // prediction first, then the residual on top
+            itransform(lc, tu, i, ch_type);
+        }
+    }
+    return 0; // no failure path: always 0
+}
+
+int ff_vvc_reconstruct(VVCLocalContext *lc, const int rs, const int rx, const int ry) // reconstructs every CU of CTU rs, then frees the CTU's CU list
+{
+    const VVCFrameContext *fc   = lc->fc;
+    const VVCSPS *sps           = fc->ps.sps;
+    const int x_ctb             = rx << sps->ctb_log2_size_y; // CTB origin, from its column/row index
+    const int y_ctb             = ry << sps->ctb_log2_size_y;
+    CTU *ctu                    = fc->tab.ctus + rs;
+    CodingUnit *cu              = ctu->cus;
+    int ret                     = 0;
+
+    lc->num_ras[0] = lc->num_ras[1] = 0; // reconstructed-area lists restart for every CTU
+    lc->lmcs.x_vpdu = -1;
+    lc->lmcs.y_vpdu = -1;
+    ff_vvc_decode_neighbour(lc, x_ctb, y_ctb, rx, ry, rs);
+    while (cu) {
+        lc->cu = cu;
+
+        if (cu->ciip_flag)
+            ff_vvc_predict_ciip(lc);
+        if (cu->coded_flag) {
+            ret = reconstruct(lc); // NOTE(review): overwritten by each coded CU; reconstruct() currently always returns 0
+        } else { // nothing coded: only mark the CU's luma and chroma areas as reconstructed
+            add_reconstructed_area(lc, LUMA, cu->x0, cu->y0, cu->cb_width, cu->cb_height);
+            add_reconstructed_area(lc, CHROMA, cu->x0, cu->y0, cu->cb_width, cu->cb_height);
+        }
+        cu = cu->next;
+    }
+    ff_vvc_ctu_free_cus(ctu);
+    return ret;
+}
+
+int ff_vvc_get_mip_size_id(const int w, const int h) // maps a block size to one of three size ids: 0, 1 or 2
+{
+    if (w == 4 && h == 4) // 4x4
+        return 0;
+    if ((w == 4 || h == 4) || (w == 8 && h == 8)) // one side of 4 (4x4 already handled above), or 8x8
+        return 1;
+    return 2; // everything else
+}
+
+int ff_vvc_nscale_derive(const int w, const int h, const int mode) // derives the nscale value that ff_vvc_need_pdpc() compares against 0
+{
+    int nscale;
+    // CCLM modes and modes strictly between INTRA_HORZ and INTRA_VERT must not reach this function
+    av_assert0(mode < INTRA_LT_CCLM && !(mode > INTRA_HORZ && mode < INTRA_VERT));
+    if (mode == INTRA_PLANAR || mode == INTRA_DC ||
+        mode == INTRA_HORZ || mode == INTRA_VERT) {
+        nscale = (av_log2(w) + av_log2(h) - 2) >> 2;
+    } else {
+        const int intra_pred_angle = ff_vvc_intra_pred_angle_derive(mode);
+        const int inv_angle        = ff_vvc_intra_inv_angle_derive(intra_pred_angle);
+        // the assert leaves only mode > INTRA_VERT (use h) or mode < INTRA_HORZ (use w) here;
+        // one conditional initialiser means side_size can never be read uninitialised
+        const int side_size        = mode >= INTRA_VERT ? h : w;
+        nscale = FFMIN(2, av_log2(side_size) - av_log2(3 * inv_angle - 2) + 8);
+    }
+    return nscale;
+}
+
+int ff_vvc_need_pdpc(const int w, const int h, const uint8_t bdpcm_flag, const int mode, const int ref_idx) // returns 1 when PDPC applies to the block, else 0
+{
+    av_assert0(mode < INTRA_LT_CCLM); // CCLM modes must not reach this function
+    if ((w >= 4 && h >= 4) && !ref_idx && !bdpcm_flag) { // needs both sides >= 4, ref_idx 0 and no BDPCM
+        int nscale;
+        if (mode == INTRA_PLANAR || mode == INTRA_DC ||
+            mode == INTRA_HORZ || mode == INTRA_VERT)
+            return 1;
+        if (mode > INTRA_HORZ && mode < INTRA_VERT) // also the range ff_vvc_nscale_derive() asserts against
+            return 0;
+        nscale = ff_vvc_nscale_derive(w, h, mode);
+        return nscale >= 0;
+
+    }
+    return 0;
+}
+
+static const ReconstructedArea* get_reconstructed_area(const VVCLocalContext *lc, const int x, const int y, const int c_idx) // recorded area containing (x, y), or NULL
+{
+    const int ch_type = c_idx > 0;
+    for (int i = lc->num_ras[ch_type] - 1; i >= 0; i--) { // newest entry first
+        const ReconstructedArea* a = &lc->ras[ch_type][i];
+        const int r = (a->x + a->w); // exclusive right edge
+        const int b = (a->y + a->h); // exclusive bottom edge
+        if (a->x <= x && x < r && a->y <= y && y < b)
+            return a;
+
+        // (x, y) is at or past both the right and the bottom edge of this area: stop, older entries are assumed to be farther away still
+        if (x >= r && y >= b)
+            break;
+    }
+    return NULL;
+}
+
+int ff_vvc_get_top_available(const VVCLocalContext *lc, const int x, const int y, int target_size, const int c_idx) // how many of target_size samples in the row above (x, y) are usable
+{
+    const VVCFrameContext *fc = lc->fc;
+    const VVCSPS *sps = fc->ps.sps;
+    const int hs = sps->hshift[c_idx];
+    const int vs = sps->vshift[c_idx];
+    const int log2_ctb_size_v   = sps->ctb_log2_size_y - vs;
+    const int end_of_ctb_x      = ((lc->cu->x0 >> sps->ctb_log2_size_y) + 1) << sps->ctb_log2_size_y; // x just past the CTB holding the current CU
+    const int y0b               = av_mod_uintp2(y, log2_ctb_size_v); // y offset inside the CTB; 0 on its top row
+    const int max_x             = FFMIN(fc->ps.pps->width, end_of_ctb_x) >> hs;
+    const ReconstructedArea *a;
+    int px = x;
+
+    if (!y0b) { // the row above belongs to the CTB row above: decided by CTB-level flags and limits
+        if (!lc->ctb_up_flag)
+            return 0;
+        target_size = FFMIN(target_size, (lc->end_of_tiles_x >> hs) - x);
+        if (sps->entropy_coding_sync_enabled_flag) // additionally stop at the end of this CTB
+            target_size = FFMIN(target_size, (end_of_ctb_x >> hs) - x);
+        return target_size;
+    }
+
+    target_size = FFMAX(0, FFMIN(target_size, max_x - x));
+    while (target_size > 0 && (a = get_reconstructed_area(lc, px, y - 1, c_idx))) { // walk right through recorded areas until a gap
+        const int sz = FFMIN(target_size, a->x + a->w - px);
+        px += sz;
+        target_size -= sz;
+    }
+    return px - x; // length of the contiguous covered run
+}
+
+int ff_vvc_get_left_available(const VVCLocalContext *lc, const int x, const int y, int target_size, const int c_idx) // how many of target_size samples in the column left of (x, y) are usable
+{
+    const VVCFrameContext *fc = lc->fc;
+    const VVCSPS *sps = fc->ps.sps;
+    const int hs = sps->hshift[c_idx];
+    const int vs = sps->vshift[c_idx];
+    const int log2_ctb_size_h   =  sps->ctb_log2_size_y - hs;
+    const int x0b               = av_mod_uintp2(x, log2_ctb_size_h); // x offset inside the CTB; 0 on its left column
+    const int end_of_ctb_y      = ((lc->cu->y0 >> sps->ctb_log2_size_y) + 1) << sps->ctb_log2_size_y; // y just past the CTB holding the current CU
+    const int max_y             = FFMIN(fc->ps.pps->height, end_of_ctb_y) >> vs;
+    const ReconstructedArea *a;
+    int  py = y;
+
+    if (!x0b && !lc->ctb_left_flag) // on the CTB's left column and the left CTB is flagged unusable
+        return 0;
+
+    target_size = FFMAX(0, FFMIN(target_size, max_y - y));
+    if (!x0b) // left CTB usable: everything up to max_y counts
+        return target_size;
+
+    while (target_size > 0 && (a = get_reconstructed_area(lc, x - 1, py, c_idx))) { // walk down through recorded areas until a gap
+        const int sz = FFMIN(target_size, a->y + a->h - py);
+        py += sz;
+        target_size -= sz;
+    }
+    return py - y; // length of the contiguous covered run
+}
+
+static int less(const void *a, const void *b) // int comparator passed to bsearch() in ff_vvc_ref_filter_flag_derive()
+{
+    return *(const int*)a - *(const int*)b; // negative, zero or positive; relies on the difference not overflowing
+}
+
+int ff_vvc_ref_filter_flag_derive(const int mode) // returns 1 when mode appears in the fixed list below, else 0
+{
+    static const int modes[] = { -14, -12, -10, -6, INTRA_PLANAR, 2, 34, 66, 72, 76, 78, 80}; // bsearch() needs ascending order; assumes INTRA_PLANAR lies between -6 and 2 - TODO confirm
+    return bsearch(&mode, modes, FF_ARRAY_ELEMS(modes), sizeof(int), less) != NULL;
+}
+
+int ff_vvc_intra_pred_angle_derive(const int pred_mode) // maps an angular prediction mode to its signed angle value from the table below
+{
+    static const int angles[] = { // magnitudes, indexed by |idx| as computed below
+          0,   1,   2,   3,   4,   6,   8,  10,  12,  14,  16,  18,  20,  23,  26, 29,
+         32,  35,  39,  45,  51,  57,  64,  73,  86, 102, 128, 171, 256, 341, 512
+    };
+    int sign = 1, idx, intra_pred_angle;
+    if (pred_mode > INTRA_DIAG) { // measured from INTRA_VERT
+        idx = pred_mode - INTRA_VERT;
+    } else if (pred_mode > 0) {   // measured from INTRA_HORZ
+        idx = INTRA_HORZ - pred_mode;
+    } else {                      // pred_mode <= 0: same as above with an extra offset of 2
+        idx = INTRA_HORZ - 2 - pred_mode;
+    }
+    if (idx < 0) { // negative distance: look up the magnitude and negate the result
+        idx = -idx;
+        sign = -1;
+    }
+    intra_pred_angle = sign * angles[idx];
+    return intra_pred_angle;
+}
+
+#define ROUND(f) (int)((f) < 0 ? -(-(f) + 0.5) : ((f) + 0.5)) // rounds half away from zero; argument parenthesised so expressions expand safely
+int ff_vvc_intra_inv_angle_derive(const int intra_pred_angle) // returns 32 * 512 / intra_pred_angle rounded to the nearest integer
+{
+    float inv_angle;
+    av_assert0(intra_pred_angle); // a zero angle would divide by zero below
+    inv_angle = 32 * 512.0 / intra_pred_angle;
+    return ROUND(inv_angle);
+}
+
+//8.4.5.2.7 Wide angle intra prediction mode mapping process
+int ff_vvc_wide_angle_mode_mapping(const CodingUnit *cu,
+    const int tb_width, const int tb_height, const int c_idx, int pred_mode_intra) // returns pred_mode_intra, remapped when the block shape calls for it
+{
+    int nw, nh, wh_ratio, min, max;
+
+    if (cu->isp_split_type == ISP_NO_SPLIT || c_idx) { // no ISP, or chroma: use the transform block size
+        nw = tb_width;
+        nh = tb_height;
+    } else { // luma with ISP: use the whole coding block size
+        nw = cu->cb_width;
+        nh = cu->cb_height;
+    }
+    wh_ratio    = FFABS(ff_log2(nw) - ff_log2(nh));
+    max         = (wh_ratio > 1) ? (8  + 2 * wh_ratio) : 8;
+    min         = (wh_ratio > 1) ? (60 - 2 * wh_ratio) : 60;
+
+    if (nw > nh && pred_mode_intra >=2 && pred_mode_intra < max) // wide block, low mode: shifted above 66
+        pred_mode_intra += 65;
+    else if (nh > nw && pred_mode_intra <= 66 && pred_mode_intra > min) // tall block, high mode: shifted below 0
+        pred_mode_intra -= 67;
+    return pred_mode_intra;
+}
diff --git a/libavcodec/vvc/vvc_intra.h b/libavcodec/vvc/vvc_intra.h
new file mode 100644
index 0000000000..12d0dae801
--- /dev/null
+++ b/libavcodec/vvc/vvc_intra.h
@@ -0,0 +1,49 @@ 
+/*
+ * VVC intra prediction
+ *
+ * Copyright (C) 2021 Nuo Mi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_VVC_INTRA_H
+#define AVCODEC_VVC_INTRA_H
+
+#include "vvc_ctu.h"
+
+/**
+ * reconstruct a CTU
+ * @param lc local context for CTU
+ * @param rs raster order for the CTU.
+ * @param rx raster order x for the CTU.
+ * @param ry raster order y for the CTU.
+ * @return AVERROR
+ */
+int ff_vvc_reconstruct(VVCLocalContext *lc, const int rs, const int rx, const int ry);
+
+//utils for vvc_intra_template
+int ff_vvc_get_top_available(const VVCLocalContext *lc, int x0, int y0, int target_size, int c_idx);
+int ff_vvc_get_left_available(const VVCLocalContext *lc, int x0, int y0, int target_size, int c_idx);
+int ff_vvc_get_mip_size_id(int w, int h);
+int ff_vvc_need_pdpc(int w, int h, uint8_t bdpcm_flag, int mode, int ref_idx);
+int ff_vvc_nscale_derive(int w, int h, int mode);
+int ff_vvc_ref_filter_flag_derive(int mode);
+// map an intra prediction mode to its signed prediction angle
+int ff_vvc_intra_pred_angle_derive(int pred_mode);
+// the argument is a prediction angle (not a mode) and must not be 0
+int ff_vvc_intra_inv_angle_derive(int intra_pred_angle);
+// 8.4.5.2.7 Wide angle intra prediction mode mapping process
+int ff_vvc_wide_angle_mode_mapping(const CodingUnit *cu,
+    int tb_width, int tb_height, int c_idx, int pred_mode_intra);
+
+#endif // AVCODEC_VVC_INTRA_H
diff --git a/libavcodec/vvc/vvc_intra_template.c b/libavcodec/vvc/vvc_intra_template.c
new file mode 100644
index 0000000000..f4e42bf799
--- /dev/null
+++ b/libavcodec/vvc/vvc_intra_template.c
@@ -0,0 +1,1018 @@ 
+/*
+ * VVC intra prediction DSP
+ *
+ * Copyright (C) 2021-2023 Nuomi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+
+#include "vvc_data.h"
+#include "vvc_intra.h"
+
+#define POS(x, y) src[(x) + stride * (y)]
+
+// Apply the linear model ((luma * a) >> k) + b to both chroma planes.
+// pdsy holds w * h luma values stored row by row; a, b and k hold one entry
+// per chroma plane.
+static av_always_inline void FUNC(cclm_linear_pred)(VVCFrameContext *fc, const int x0, const int y0,
+    const int w, const int h, const pixel* pdsy, const int *a, const int *b, const int *k)
+{
+    const VVCSPS *sps = fc->ps.sps;
+
+    for (int c_idx = 1; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        const int plane        = c_idx - 1;
+        const int xc           = x0 >> sps->hshift[c_idx];
+        const int yc           = y0 >> sps->vshift[c_idx];
+        const ptrdiff_t stride = fc->frame->linesize[c_idx] / sizeof(pixel);
+        pixel *row             = (pixel*)fc->frame->data[c_idx] + xc + yc * stride;
+        const pixel *luma      = pdsy;
+
+        for (int r = 0; r < h; r++) {
+            for (int c = 0; c < w; c++) {
+                const int pred = ((luma[c] * a[plane]) >> k[plane]) + b[plane];
+                row[c] = CLIP(pred);
+            }
+            luma += w;
+            row  += stride;
+        }
+    }
+}
+
+#define MAX_PICK_POS 4
+#define TOP  0
+#define LEFT 1
+
+// Fallback model for both chroma planes: zero slope and shift, mid-range offset.
+static av_always_inline void FUNC(cclm_get_params_default)(int *a, int *b, int *k)
+{
+    const int mid = 1 << (BIT_DEPTH - 1);
+
+    a[0] = a[1] = 0;
+    k[0] = k[1] = 0;
+    b[0] = b[1] = mid;
+}
+
+// Work out how many reference samples to pick on the top and left side and
+// at which positions. Returns 0 when neither side provides samples, else 1.
+static av_always_inline int FUNC(cclm_get_select_pos)(const VVCLocalContext *lc,
+    const int x, const int y, const int w, const int h, const int avail_t, const int avail_l,
+    int cnt[2], int pos[2][MAX_PICK_POS])
+{
+    const enum IntraPredMode mode = lc->cu->intra_pred_mode_c;
+    // 0 only when both sides are available and the mode is INTRA_LT_CCLM
+    const int num_is4 = !(avail_t && avail_l && mode == INTRA_LT_CCLM);
+    int num_samp[2];
+
+    if (mode == INTRA_LT_CCLM) {
+        num_samp[TOP]  = avail_t ? w : 0;
+        num_samp[LEFT] = avail_l ? h : 0;
+    } else {
+        num_samp[TOP]  = 0;
+        num_samp[LEFT] = 0;
+        if (avail_t && mode == INTRA_T_CCLM)
+            num_samp[TOP] = ff_vvc_get_top_available(lc,  x, y, w + FFMIN(w, h), 1);
+        if (avail_l && mode == INTRA_L_CCLM)
+            num_samp[LEFT] = ff_vvc_get_left_available(lc, x, y, h + FFMIN(w, h), 1);
+    }
+
+    if (!(num_samp[TOP] || num_samp[LEFT]))
+        return 0;
+
+    for (int side = TOP; side <= LEFT; side++) {
+        const int n     = num_samp[side];
+        const int first = n >> (2 + num_is4);
+        const int step  = FFMAX(1, n >> (1 + num_is4));
+
+        cnt[side] = FFMIN(n, (1 + num_is4) << 1);
+        for (int c = 0; c < cnt[side]; c++)
+            pos[side][c] = first + c * step;
+    }
+    return 1;
+}
+
+// Copy cnt samples from src, the i-th one taken at offset pos[i] * step.
+static av_always_inline void FUNC(cclm_select_luma_444)(const pixel *src, const int step,
+    const int cnt, const int pos[MAX_PICK_POS],  pixel *sel_luma)
+{
+    const int *p = pos;
+    pixel *out   = sel_luma;
+
+    for (int left = cnt; left > 0; left--)
+        *out++ = src[*p++ * step];
+}
+
+// Collect the luma reference samples that belong to the picked chroma
+// positions. The cnt[TOP] samples taken from above the block are written to
+// sel_luma first, followed by the cnt[LEFT] samples taken from the left of it.
+// Without chroma subsampling the luma samples are copied directly, otherwise
+// each picked position is low-pass filtered from its luma neighbourhood.
+// Note that POS() reads through the local variables src and stride.
+static av_always_inline void FUNC(cclm_select_luma)(const VVCFrameContext *fc,
+    const int x0, const int y0, const int avail_t, const int avail_l, const int cnt[2], const int pos[2][MAX_PICK_POS],
+    pixel *sel_luma)
+{
+    const VVCSPS *sps = fc->ps.sps;
+
+    // set when y0 is a multiple of the CTB size
+    const int b_ctu_boundary = !av_mod_uintp2(y0, sps->ctb_log2_size_y);
+    const int hs = sps->hshift[1];
+    const int vs = sps->vshift[1];
+    const ptrdiff_t stride = fc->frame->linesize[0] / sizeof(pixel);
+
+    if (!hs && !vs) {
+        // no subsampling: pick from the row above / the column to the left
+        // (the offset collapses to 0 when that side is not available)
+        const pixel* src = (pixel*)fc->frame->data[0] + x0 + y0 * stride;
+        FUNC(cclm_select_luma_444)(src - avail_t * stride, 1, cnt[TOP], pos[TOP], sel_luma);
+        FUNC(cclm_select_luma_444)(src - avail_l, stride, cnt[LEFT], pos[LEFT], sel_luma + cnt[TOP]);
+    } else {
+        // top
+        if (vs && !b_ctu_boundary) {
+            // vertically subsampled and not on a CTB row boundary:
+            // filter around luma row y0 - 2, also reading rows y0 - 3 and y0 - 1
+            const pixel *source = (pixel *)fc->frame->data[0] + x0 + (y0 - 2) * stride;
+            for (int i = 0; i < cnt[TOP]; i++) {
+                const int x = pos[TOP][i] << hs;
+                const pixel *src = source + x;
+                // at the first column without a left neighbour the centre sample is reused
+                const int has_left = x || avail_l;
+                const pixel l = has_left ? POS(-1, 0) : POS(0, 0);
+                if (sps->chroma_vertical_collocated_flag) {
+                    // cross shaped filter, centre weight 4
+                    sel_luma[i] = (POS(0, -1) + l + 4 * POS(0, 0) + POS(1, 0) + POS(0, 1) + 4) >> 3;
+                } else {
+                    // 3x2 filter over the current and the next row
+                    const pixel l1 = has_left ? POS(-1, 1) : POS(0, 1);
+                    sel_luma[i] = (l + l1 + 2 * (POS(0, 0) + POS(0, 1)) + POS(1, 0) + POS(1, 1) + 4) >> 3;
+                }
+            }
+        } else {
+            // only luma row y0 - 1 is read: horizontal [1 2 1] filter
+            const pixel *source = (pixel*)fc->frame->data[0] + x0 + (y0 - 1) * stride;
+            for (int i = 0; i < cnt[TOP]; i++) {
+                const int x = pos[TOP][i] << hs;
+                const pixel *src = source + x;
+                const int has_left = x || avail_l;
+                const pixel l = has_left ? POS(-1, 0) : POS(0, 0);
+                sel_luma[i] = (l + 2 * POS(0, 0) + POS(1, 0) + 2) >> 2;
+            }
+        }
+
+        // left
+        {
+            const pixel *left;
+            // source is 1 + hs luma columns to the left of the block when the
+            // left side is available, left is one more column to the left
+            const pixel *source = (pixel *)fc->frame->data[0] + x0 + y0 * stride - (1 + hs) * avail_l;
+            left = source - avail_l;
+
+            for (int i = 0; i < cnt[LEFT]; i++) {
+                const int y = pos[LEFT][i] << vs;
+                const int offset = y * stride;
+                const pixel *l   = left + offset;
+                const pixel *src = source + offset;
+                pixel pred;
+                if (!vs) {
+                    // no vertical subsampling: horizontal [1 2 1] filter
+                    pred = (*l + 2 * POS(0, 0) + POS(1, 0) + 2) >> 2;
+                } else {
+                    if (sps->chroma_vertical_collocated_flag) {
+                        // at the first row without a top neighbour the centre sample is reused
+                        const int has_top = y || avail_t;
+                        const pixel t = has_top ? POS(0, -1) : POS(0, 0);
+                        pred = (*l + t + 4 * POS(0, 0) + POS(1, 0) + POS(0, 1) + 4) >> 3;
+                    } else {
+                        pred = (*l + *(l + stride) + 2 * POS(0, 0) + 2 * POS(0, 1) + POS(1, 0) + POS(1, 1) + 4) >> 3;
+                    }
+                }
+                // left samples are stored after the top samples
+                sel_luma[i + cnt[TOP]] = pred;
+            }
+        }
+    }
+}
+
+// Pick the chroma neighbours of both chroma planes: cnt[TOP] samples from the
+// row above the block, then cnt[LEFT] samples from the column left of it.
+static av_always_inline void FUNC(cclm_select_chroma)(const VVCFrameContext *fc,
+    const int x, const int y, const int cnt[2], const int pos[2][MAX_PICK_POS],
+    pixel sel[][MAX_PICK_POS * 2])
+{
+    for (int c_idx = 1; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        const ptrdiff_t stride = fc->frame->linesize[c_idx] / sizeof(pixel);
+        const pixel *above     = (pixel*)fc->frame->data[c_idx] + x + (y - 1)* stride;
+        const pixel *beside    = (pixel*)fc->frame->data[c_idx] + x - 1 + y * stride;
+        pixel *out_top         = sel[c_idx];
+        pixel *out_left        = sel[c_idx] + cnt[TOP];
+
+        for (int i = 0; i < cnt[TOP]; i++)
+            out_top[i] = above[pos[TOP][i]];
+
+        for (int i = 0; i < cnt[LEFT]; i++)
+            out_left[i] = beside[pos[LEFT][i] * stride];
+    }
+}
+
+// Gather the luma and chroma reference samples used to fit the linear model.
+// Returns 0 when no reference sample can be picked, 1 otherwise.
+static av_always_inline int FUNC(cclm_select_samples)(const VVCLocalContext *lc,
+    const int x0, const int y0, const int w, const int h, const int avail_t, const int avail_l,
+    pixel sel[][MAX_PICK_POS * 2])
+{
+    const VVCFrameContext *fc = lc->fc;
+    const VVCSPS *sps         = fc->ps.sps;
+    const int xc              = x0 >> sps->hshift[1];
+    const int yc              = y0 >> sps->vshift[1];
+    int cnt[2], pos[2][MAX_PICK_POS];
+
+    if (!FUNC(cclm_get_select_pos)(lc, xc, yc, w, h, avail_t, avail_l, cnt, pos))
+        return 0;
+
+    FUNC(cclm_select_luma)(fc, x0, y0, avail_t, avail_l, cnt, pos, sel[LUMA]);
+    FUNC(cclm_select_chroma)(fc, xc, yc, cnt, pos, sel);
+
+    // with only two samples, expand them to the four entry layout
+    // { second, first, second, first }
+    if (cnt[TOP] + cnt[LEFT] == 2) {
+        for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+            const pixel first  = sel[c_idx][0];
+            const pixel second = sel[c_idx][1];
+
+            sel[c_idx][0] = second;
+            sel[c_idx][1] = first;
+            sel[c_idx][2] = second;
+            sel[c_idx][3] = first;
+        }
+    }
+    return 1;
+}
+
+// Split the four selected samples into a pair with the smaller and a pair
+// with the larger luma values, then output the rounded average of each pair
+// for every plane.
+static av_always_inline void FUNC(cclm_get_min_max)(
+    const pixel sel[][MAX_PICK_POS * 2], int *min, int *max)
+{
+    const pixel *luma = sel[LUMA];
+    int lo[] = { 0, 2 };
+    int hi[] = { 1, 3 };
+
+    // order each pair by luma value
+    if (luma[lo[0]] > luma[lo[1]])
+        FFSWAP(int, lo[0], lo[1]);
+    if (luma[hi[0]] > luma[hi[1]])
+        FFSWAP(int, hi[0], hi[1]);
+    // exchange the pairs as a whole when lo lies entirely above hi
+    if (luma[lo[0]] > luma[hi[1]]) {
+        FFSWAP(int, lo[0], hi[0]);
+        FFSWAP(int, lo[1], hi[1]);
+    }
+    // fix up the remaining overlap between the two pairs
+    if (luma[lo[1]] > luma[hi[0]])
+        FFSWAP(int, lo[1], hi[0]);
+
+    for (int c_idx = 0; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        const pixel *s = sel[c_idx];
+
+        max[c_idx] = (s[hi[0]] + s[hi[1]] + 1) >> 1;
+        min[c_idx] = (s[lo[0]] + s[lo[1]] + 1) >> 1;
+    }
+}
+
+// Derive the linear model parameters a (slope), b (offset) and k (shift) for
+// both chroma planes from the selected neighbouring samples.
+// Falls back to the default parameters when no sample can be selected, and to
+// a constant prediction (a = k = 0, b = chroma minimum) when the luma minimum
+// and maximum are equal.
+static av_always_inline void FUNC(cclm_get_params)(const VVCLocalContext *lc,
+    const int x0, const int y0, const int w, const int h, const int avail_t, const int avail_l,
+    int *a, int *b, int *k)
+{
+    static const int div_sig_table[] = {0, 7, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 1, 1, 0};
+    pixel sel[VVC_MAX_SAMPLE_ARRAYS][MAX_PICK_POS * 2];
+    int max[VVC_MAX_SAMPLE_ARRAYS], min[VVC_MAX_SAMPLE_ARRAYS];
+    int diff, x, norm_diff, v;
+
+    if (!FUNC(cclm_select_samples)(lc, x0, y0, w, h, avail_t, avail_l, sel)) {
+        FUNC(cclm_get_params_default)(a, b, k);
+        return;
+    }
+
+    FUNC(cclm_get_min_max)(sel, min, max);
+
+    diff = max[LUMA] - min[LUMA];
+    if (diff == 0) {
+        for (int i = 0; i < 2; i++) {
+            a[i] = k[i] = 0;
+            b[i] = min[i + 1];
+        }
+        return;
+    }
+
+    // these terms depend on the luma difference only, so they are computed
+    // once instead of once per chroma plane
+    x         = av_log2(diff);
+    norm_diff = ((diff << 4) >> x) & 15;
+    x        += (norm_diff) ? 1 : 0;
+    v         = div_sig_table[norm_diff] | 8;
+
+    for (int i = 0; i < 2; i++) {
+        const int diffc = max[i + 1] - min[i + 1];
+        const int y     = abs(diffc) > 0 ? av_log2(abs(diffc)) + 1 : 0;
+        const int add   = (1 << y >> 1);          // rounding offset, 0 when y is 0
+        const int shift = 3 + x - y;
+
+        a[i] = (diffc * v + add) >> y;
+        k[i] = FFMAX(1, shift);
+        if (shift < 1) {
+            // the shift was clamped to 1: use a slope of magnitude 15 (or 0)
+            const int sign = a[i] < 0 ? -1 : (a[i] > 0);
+            a[i] = sign * 15;
+        }
+        b[i] = min[i + 1] - ((a[i] * min[LUMA]) >> k[i]);
+    }
+
+}
+
+#undef TOP
+#undef LEFT
+
+// Fill pdsy (w * h values, stored row by row) with the reconstructed luma of
+// the block at (x0, y0), reduced to the chroma block size.
+// Without chroma subsampling the luma rows are copied, otherwise every output
+// value is a low-pass filtered combination of neighbouring luma samples.
+// Note that POS() reads through the local variables src and stride.
+static av_always_inline void FUNC(cclm_get_luma_rec_pixels)(const VVCFrameContext *fc,
+    const int x0, const int y0, const int w, const int h, const int avail_t, const int avail_l,
+    pixel *pdsy)
+{
+    const int hs            = fc->ps.sps->hshift[1];
+    const int vs            = fc->ps.sps->vshift[1];
+    const ptrdiff_t stride  = fc->frame->linesize[0] / sizeof(pixel);
+    const pixel *source     = (pixel*)fc->frame->data[0] + x0 + y0 * stride;
+    // left / top fall back to the block's own first column / row when the
+    // neighbour is not available (avail_l / avail_t == 0)
+    const pixel *left       = source - avail_l;
+    const pixel *top        = source - avail_t * stride;
+
+    const VVCSPS *sps = fc->ps.sps;
+    if (!hs && !vs) {
+        // no subsampling: plain row copy
+        for (int i = 0; i < h; i++)
+            memcpy(pdsy + i * w, source + i * stride, w * sizeof(pixel));
+        return;
+    }
+    for (int i = 0; i < h; i++) {
+        const pixel *src  = source;
+        const pixel *l = left;
+        const pixel *t = top;
+        if (!vs) {
+            // no vertical subsampling: horizontal [1 2 1] filter,
+            // advancing two luma samples per output value
+            for (int j = 0; j < w; j++) {
+                pixel pred  = (*l + 2 * POS(0, 0) + POS(1, 0) + 2) >> 2;
+                pdsy[i * w + j] = pred;
+                src += 2;
+                // after the first column the left neighbour is always inside the block
+                l = src - 1;
+            }
+
+        } else {
+            if (sps->chroma_vertical_collocated_flag)  {
+                // cross shaped filter, centre weight 4
+                for (int j = 0; j < w; j++) {
+                    pixel pred  = (*l + *t + 4 * POS(0, 0) + POS(1, 0) + POS(0, 1) + 4) >> 3;
+                    pdsy[i * w + j] = pred;
+                    src += 2;
+                    t += 2;
+                    l = src - 1;
+                }
+            } else {
+                // 3x2 filter over the current and the next luma row
+                for (int j = 0; j < w; j++) {
+                    pixel pred  = (*l + *(l + stride) + 2 * POS(0, 0) + 2 * POS(0, 1) + POS(1, 0) + POS(1, 1) + 4) >> 3;
+
+                    pdsy[i * w + j] = pred;
+                    src += 2;
+                    l = src - 1;
+                }
+            }
+        }
+        // advance by one chroma row, i.e. 1 << vs luma rows
+        source += (stride << vs);
+        left   += (stride << vs);
+        // from the second row on, the row above lies inside the block
+        top    = source - stride;
+    }
+}
+
+// Fill the w x h block at (x, y) of both chroma planes with the mid-range
+// value. avail_t and avail_l are not used.
+static av_always_inline void FUNC(cclm_pred_default)(VVCFrameContext *fc,
+    const int x, const int y, const int w, const int h, const int avail_t, const int avail_l)
+{
+    const pixel mid = 1 << (BIT_DEPTH - 1);
+
+    for (int c_idx = 1; c_idx < VVC_MAX_SAMPLE_ARRAYS; c_idx++) {
+        const ptrdiff_t stride = fc->frame->linesize[c_idx] / sizeof(pixel);
+        pixel *row = (pixel*)fc->frame->data[c_idx] + x + y * stride;
+
+        for (int rows = h; rows > 0; rows--) {
+            for (int col = 0; col < w; col++)
+                row[col] = mid;
+            row += stride;
+        }
+    }
+}
+
+//8.4.5.2.14 Specification of INTRA_LT_CCLM, INTRA_L_CCLM and INTRA_T_CCLM intra prediction mode
+// x0, y0, width and height are given in luma samples and are converted to
+// chroma units with the chroma shifts.
+static void FUNC(intra_cclm_pred)(const VVCLocalContext *lc, const int x0, const int y0,
+    const int width, const int height)
+{
+    VVCFrameContext *fc = lc->fc;
+    const VVCSPS *sps   = fc->ps.sps;
+    const int hs        = sps->hshift[1];
+    const int vs        = sps->vshift[1];
+    const int w         = width  >> hs;
+    const int h         = height >> vs;
+    const int avail_t   = ff_vvc_get_top_available(lc, x0, y0, 1, 0);
+    const int avail_l   = ff_vvc_get_left_available(lc, x0, y0, 1, 0);
+
+    if (avail_t || avail_l) {
+        pixel dsy[MAX_TB_SIZE * MAX_TB_SIZE];
+        int a[2], b[2], k[2];
+
+        // reduce the luma block, fit the model on the neighbours, apply it
+        FUNC(cclm_get_luma_rec_pixels)(fc, x0, y0, w, h, avail_t, avail_l, dsy);
+        FUNC(cclm_get_params) (lc, x0, y0, w, h, avail_t, avail_l, a, b, k);
+        FUNC(cclm_linear_pred)(fc, x0, y0, w, h, dsy, a, b, k);
+    } else {
+        // no neighbour on either side: nothing to derive a model from
+        FUNC(cclm_pred_default)(fc, x0 >> hs, y0 >> vs, w, h, avail_t, avail_l);
+    }
+}
+
+// Sum target_size samples starting at start and spaced stride apart. Only
+// FFMIN(avail, target_size) samples are actually read; the last sample read
+// stands in for the remaining ones.
+static int FUNC(lmcs_sum_samples)(const pixel *start, ptrdiff_t stride, const int avail, const int target_size)
+{
+    const int size = FFMIN(avail, target_size);
+    const pixel *p = start;
+    int sum = 0;
+
+    for (int left = size; left > 0; left--) {
+        sum += *p;
+        p   += stride;
+    }
+    // p is one step past the last sample read
+    return sum + *(p - stride) * (target_size - size);
+}
+
+// 8.7.5.3 Picture reconstruction with luma dependent chroma residual scaling process for chroma samples
+// Returns the chroma scale of the size_y aligned unit containing (x0, y0).
+// The value is cached in lc->lmcs and only recomputed when the unit changes.
+static int FUNC(lmcs_derive_chroma_scale)(VVCLocalContext *lc, const int x0, const int y0)
+{
+    VVCFrameContext *fc = lc->fc;
+    const VVCPH *ph     = fc->ps.ph;
+    const int size_y    = FFMIN(fc->ps.sps->ctb_size_y, 64);
+    const int x         = x0 & ~(size_y - 1);
+    const int y         = y0 & ~(size_y - 1);
+
+    if (lc->lmcs.x_vpdu == x && lc->lmcs.y_vpdu == y)
+        return lc->lmcs.chroma_scale;
+
+    {
+        const pixel *src = (const pixel *)(fc->frame->data[LUMA] + y * fc->frame->linesize[LUMA] + (x << fc->ps.sps->pixel_shift));
+        const ptrdiff_t stride = fc->frame->linesize[LUMA] / sizeof(pixel);
+        const int avail_t = ff_vvc_get_top_available (lc, x, y, 1, 0);
+        const int avail_l = ff_vvc_get_left_available(lc, x, y, 1, 0);
+        int total = 0, samples = 0, avg, idx;
+
+        // accumulate the neighbouring luma column and row, as far as available
+        if (avail_l) {
+            total  += FUNC(lmcs_sum_samples)(src - 1, stride, fc->ps.pps->height - y, size_y);
+            samples = size_y;
+        }
+        if (avail_t) {
+            total   += FUNC(lmcs_sum_samples)(src - stride, 1, fc->ps.pps->width - x, size_y);
+            samples += size_y;
+        }
+
+        // rounded average, or mid-range when there is no neighbour at all
+        if (samples)
+            avg = (total + (samples >> 1)) >> av_log2(samples);
+        else
+            avg = 1 << (BIT_DEPTH - 1);
+
+        // first bin whose upper pivot lies above the average
+        idx = ph->lmcs_min_bin_idx;
+        while (idx <= ph->lmcs_max_bin_idx && avg >= ph->lmcs_pivot[idx + 1])
+            idx++;
+        idx = FFMIN(idx, LMCS_MAX_BIN_SIZE - 1);
+
+        lc->lmcs.chroma_scale = ph->lmcs_chroma_scale_coeff[idx];
+        lc->lmcs.x_vpdu = x;
+        lc->lmcs.y_vpdu = y;
+    }
+    return lc->lmcs.chroma_scale;
+}
+
+// 8.7.5.3 Picture reconstruction with luma dependent chroma residual scaling process for chroma samples
+// Scale width * height contiguous coefficients from coeff into dst. The
+// magnitude is scaled and rounded, the sign of the input is kept.
+static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int *dst, const int *coeff,
+    const int width, const int height, const int x0_cu, const int y0_cu)
+{
+    const int chroma_scale = FUNC(lmcs_derive_chroma_scale)(lc, x0_cu, y0_cu);
+    const int *in = coeff;
+    int *out      = dst;
+
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            const int c   = av_clip_intp2(*in++, BIT_DEPTH);
+            const int mag = ((c > 0 ? c : -c) * chroma_scale + (1 << 10)) >> 11;
+
+            *out++ = c > 0 ? mag : -mag;
+        }
+    }
+}
+
+// Smooth the left and top reference samples with a [1 2 1] filter.
+// Index -1 of both outputs receives the filtered corner sample. When
+// unfilter_last_one is set, the last sample of each side is copied unfiltered.
+static av_always_inline void FUNC(ref_filter)(const pixel *left, const pixel *top,
+    pixel *filtered_left, pixel *filtered_top, const int left_size, const int top_size,
+    const int unfilter_last_one)
+{
+    const int n_left   = left_size - unfilter_last_one;
+    const int n_top    = top_size  - unfilter_last_one;
+    const pixel corner = (left[0] +  2 * left[-1] + top[0] + 2 ) >> 2;
+
+    filtered_top[-1]  = corner;
+    filtered_left[-1] = corner;
+
+    for (int i = 0; i < n_left; i++)
+        filtered_left[i] = (left[i - 1] + 2 * left[i] + left[i + 1] + 2) >> 2;
+
+    for (int i = 0; i < n_top; i++)
+        filtered_top[i] = (top[i - 1] + 2 * top[i] + top[i + 1] + 2) >> 2;
+
+    if (!unfilter_last_one)
+        return;
+
+    filtered_top[top_size - 1]   = top[top_size - 1];
+    filtered_left[left_size - 1] = left[left_size - 1];
+}
+
+// Build the left and top reference sample arrays for one intra predicted
+// block and publish them through edge->left / edge->top.
+// Steps: decide how many samples each mode needs, copy the available
+// neighbours from the picture, substitute the ones that are missing,
+// optionally smooth them, and for angular modes extend the main reference
+// and derive edge->filter_flag.
+// src points at the top-left sample of the block; POS() reads through src
+// and stride.
+static av_always_inline void FUNC(prepare_intra_edge_params)(const VVCLocalContext *lc,
+    IntraEdgeParams* edge, const pixel *src, const ptrdiff_t stride,
+    const int x, int y, int w, int h, int c_idx, const int is_intra_mip,
+    const int mode, const int ref_idx, const int need_pdpc)
+{
+// store val in len consecutive entries starting at ptr;
+// uses the function-local variable i as loop counter
+#define EXTEND(ptr, val, len)         \
+do {                                  \
+    for (i = 0; i < (len); i++)       \
+        *(ptr + i) = val;             \
+} while (0)
+    const CodingUnit *cu = lc->cu;
+    const int ref_filter_flag = is_intra_mip ? 0 : ff_vvc_ref_filter_flag_derive(mode);
+    // same condition as the one guarding the FUNC(ref_filter) call below
+    const int filter_flag = !ref_idx && w * h > 32 && !c_idx &&
+        cu->isp_split_type == ISP_NO_SPLIT && ref_filter_flag;
+    int cand_up_left      = lc->na.cand_up_left;
+    // the working pointers start MAX_TB_SIZE + 3 entries into the backing
+    // arrays, so that negative indices can be written as well
+    pixel  *left          = (pixel*)edge->left_array + MAX_TB_SIZE + 3;
+    pixel  *top           = (pixel*)edge->top_array  + MAX_TB_SIZE + 3;
+    pixel  *filtered_left = (pixel*)edge->filtered_left_array + MAX_TB_SIZE + 3;
+    pixel  *filtered_top  = (pixel*)edge->filtered_top_array  + MAX_TB_SIZE + 3;
+    // offset of the reference line relative to the block;
+    // both arms evaluate to -1 - ref_idx
+    const int ref_line = ref_idx == 3 ? -4 : (-1 - ref_idx);
+    int left_size, top_size, unfilter_left_size, unfilter_top_size;
+    int left_available, top_available;
+    // refw, refh, intra_pred_angle and inv_angle are only assigned (and only
+    // read) for angular modes other than INTRA_VERT / INTRA_HORZ
+    int refw, refh;
+    int intra_pred_angle, inv_angle;
+    int i;
+
+    if (is_intra_mip || mode == INTRA_PLANAR) {
+        left_size = h + 1;
+        top_size  = w + 1;
+        // one more unfiltered sample is fetched when smoothing is applied
+        unfilter_left_size = left_size + filter_flag;
+        unfilter_top_size  = top_size  + filter_flag;
+    } else if (mode == INTRA_DC) {
+        unfilter_left_size = left_size = h;
+        unfilter_top_size = top_size  = w;
+    } else if (mode == INTRA_VERT) {
+        //we may need 1 pixel to predict the top left.
+        unfilter_left_size = left_size = need_pdpc ? h : 1;
+        unfilter_top_size = top_size  = w;
+    } else if (mode == INTRA_HORZ) {
+        unfilter_left_size = left_size = h;
+        //even need_pdpc == 0, we may need 1 pixel to predict the top left.
+        unfilter_top_size = top_size = need_pdpc ? w : 1;
+    } else {
+        // remaining angular modes
+        if (cu->isp_split_type == ISP_NO_SPLIT || c_idx) {
+            refw = w * 2;
+            refh = h * 2;
+        } else {
+            refw = cu->cb_width + w;
+            refh = cu->cb_height + h;
+        }
+        intra_pred_angle = ff_vvc_intra_pred_angle_derive(mode);
+        inv_angle = ff_vvc_intra_inv_angle_derive(intra_pred_angle);
+        unfilter_top_size = top_size  = refw;
+        unfilter_left_size = left_size = refh;
+    }
+
+    // copy the available part of the left reference column
+    left_available = ff_vvc_get_left_available(lc, x, y, unfilter_left_size, c_idx);
+    for (i = 0; i < left_available; i++)
+        left[i] = POS(ref_line, i);
+
+    // copy the available part of the top reference row
+    top_available = ff_vvc_get_top_available(lc, x, y, unfilter_top_size, c_idx);
+    memcpy(top, src + ref_line * stride, top_available * sizeof(pixel));
+
+    // fill indices -1 .. ref_line: from the picture when the up-left
+    // neighbour is usable, else from the first left sample, else from the
+    // first top sample, else with the mid-range value
+    for (int i = -1; i >= ref_line; i--) {
+        if (cand_up_left) {
+            left[i] = POS(ref_line, i);
+            top[i]  = POS(i, ref_line);
+        } else if (left_available) {
+            left[i] = top[i] = left[0];
+        } else if (top_available) {
+            left[i] = top[i] = top[0];
+        } else {
+            left[i] = top[i] = 1 << (BIT_DEPTH - 1);
+        }
+    }
+
+    // repeat the last available sample over the unavailable tail; with
+    // nothing available this reads index -1, which the loop above has set
+    EXTEND(top + top_available, top[top_available-1], unfilter_top_size - top_available);
+    EXTEND(left + left_available, left[left_available-1], unfilter_left_size - left_available);
+
+    if (ref_filter_flag) {
+        if (!ref_idx && w * h > 32 && !c_idx && cu->isp_split_type == ISP_NO_SPLIT ) {
+            // when no extra sample was fetched, the last sample stays unfiltered
+            const int unfilter_last_one = left_size == unfilter_left_size;
+            FUNC(ref_filter)(left, top, filtered_left, filtered_top, unfilter_left_size, unfilter_top_size, unfilter_last_one);
+            left = filtered_left;
+            top  = filtered_top;
+        }
+    }
+    if (!is_intra_mip && mode != INTRA_PLANAR && mode != INTRA_DC) {
+        if (ref_filter_flag || ref_idx || cu->isp_split_type != ISP_NO_SPLIT) {
+            edge->filter_flag = 0;
+        } else {
+            // compare the mode's distance from the nearer of modes 50 and 18
+            // with a threshold selected by the block size
+            const int min_dist_ver_hor = FFMIN(abs(mode - 50), abs(mode - 18));
+            const int intra_hor_ver_dist_thres[] = {24, 14, 2, 0, 0};
+            const int ntbs = (av_log2(w) + av_log2(h)) >> 1;
+            // NOTE(review): the index assumes 2 <= ntbs <= 6 - confirm the
+            // smallest and largest block sizes that can reach this point
+            edge->filter_flag = min_dist_ver_hor > intra_hor_ver_dist_thres[ntbs - 2];
+        }
+
+        if (mode != INTRA_VERT && mode != INTRA_HORZ) {
+            if (mode >= INTRA_DIAG) {
+                // the top row is the main reference
+                if (intra_pred_angle < 0) {
+                    // negative angle: fill the entries before the top row
+                    // with samples of the left column, mapped via inv_angle
+                    pixel *p = top - (ref_idx + 1);
+                    for (int x = -h; x < 0; x++) {
+                        const int idx = -1 - ref_idx + FFMIN((x*inv_angle + 256) >> 9, h);
+                        p[x] = left[idx];
+                    }
+                } else {
+                    // otherwise: repeat the last top sample past refw
+                    for (int i = refw; i <= refw + FFMAX(1, w/h) * ref_idx + 1; i++)
+                        top[i] = top[refw - 1];
+                }
+            } else {
+                // the left column is the main reference, mirror of the case above
+                if (intra_pred_angle < 0) {
+                    pixel *p = left - (ref_idx + 1);
+                    for (int x = -w; x < 0; x++) {
+                        const int idx = -1 - ref_idx + FFMIN((x*inv_angle + 256) >> 9, w);
+                        p[x] = top[idx];
+                    }
+                } else {
+                    for (int i = refh; i <= refh + FFMAX(1, h/w) * ref_idx + 1; i++)
+                        left[i] = left[refh - 1];
+                }
+            }
+        }
+    }
+    // publish the (possibly filtered) reference arrays
+    edge->left = (uint8_t*)left;
+    edge->top  = (uint8_t*)top;
+}
+
+//8.4.1 General decoding process for coding units coded in intra prediction mode
+// Predict one block of component c_idx directly into the frame buffer.
+// x0, y0, width and height are given in luma samples and are converted to
+// the component's units with its chroma shifts. The reference samples are
+// prepared first, then the predictor matching the mode is called, and finally
+// the position dependent filter is applied for planar, DC, vertical and
+// horizontal modes when need_pdpc is set.
+static void FUNC(intra_pred)(const VVCLocalContext *lc, int x0, int y0,
+    const int width, const int height, int c_idx)
+{
+    VVCFrameContext *fc     = lc->fc;
+    const VVCSPS *sps = fc->ps.sps;
+    const VVCPPS *pps = fc->ps.pps;
+    const CodingUnit *cu = lc->cu;
+    const int log2_min_cb_size    = sps->min_cb_log2_size_y;
+    const int min_cb_width        = pps->min_cb_width;
+    // position in units of the minimum coding block size
+    const int x_cb                = x0 >> log2_min_cb_size;
+    const int y_cb                = y0 >> log2_min_cb_size;
+
+    const int hshift = fc->ps.sps->hshift[c_idx];
+    const int vshift = fc->ps.sps->vshift[c_idx];
+    const int x = x0 >> hshift;
+    const int y = y0 >> vshift;
+    const int w = width >> hshift;
+    const int h = height >> vshift;
+    const ptrdiff_t stride = fc->frame->linesize[c_idx] / sizeof(pixel);
+
+    const int pred_mode = c_idx ? cu->intra_pred_mode_c : cu->intra_pred_mode_y;
+    // non-square blocks may remap the mode to a wide angle mode
+    const int mode = ff_vvc_wide_angle_mode_mapping(cu, w, h, c_idx, pred_mode);
+
+    const int intra_mip_flag  = SAMPLE_CTB(fc->tab.imf, x_cb, y_cb);
+    // for chroma, MIP is only used when mip_chroma_direct_flag is set
+    const int is_intra_mip    = intra_mip_flag && (!c_idx || cu->mip_chroma_direct_flag);
+    // chroma always uses reference line 0
+    const int ref_idx = c_idx ? 0 : cu->intra_luma_ref_idx;
+    const int need_pdpc = ff_vvc_need_pdpc(w, h, cu->bdpcm_flag[c_idx], mode, ref_idx);
+
+
+    pixel *src = (pixel*)fc->frame->data[c_idx] + x + y * stride;
+    IntraEdgeParams edge;
+
+    FUNC(prepare_intra_edge_params)(lc, &edge, src, stride, x, y, w, h, c_idx, is_intra_mip, mode, ref_idx, need_pdpc);
+
+    // call the predictor that matches the mode
+    if (is_intra_mip) {
+        int intra_mip_transposed_flag = SAMPLE_CTB(fc->tab.imtf, x_cb, y_cb);
+        int intra_mip_mode = SAMPLE_CTB(fc->tab.imm, x_cb, y_cb);
+
+        fc->vvcdsp.intra.pred_mip((uint8_t *)src, edge.top, edge.left,
+                        w, h, stride, intra_mip_mode, intra_mip_transposed_flag);
+    } else if (mode == INTRA_PLANAR) {
+        fc->vvcdsp.intra.pred_planar((uint8_t *)src, edge.top, edge.left, w, h, stride);
+    } else if (mode == INTRA_DC) {
+        fc->vvcdsp.intra.pred_dc((uint8_t *)src, edge.top, edge.left, w, h, stride);
+    } else if (mode == INTRA_VERT) {
+        fc->vvcdsp.intra.pred_v((uint8_t *)src, edge.top, w, h, stride);
+    } else if (mode == INTRA_HORZ) {
+        fc->vvcdsp.intra.pred_h((uint8_t *)src, edge.left, w, h, stride);
+    } else {
+        // angular modes: modes from INTRA_DIAG upwards use the vertical variant
+        if (mode >= INTRA_DIAG) {
+            fc->vvcdsp.intra.pred_angular_v((uint8_t *)src, edge.top, edge.left,
+                                  w, h, stride, c_idx, mode, ref_idx,
+                                  edge.filter_flag, need_pdpc);
+        } else {
+            fc->vvcdsp.intra.pred_angular_h((uint8_t *)src, edge.top, edge.left,
+                                  w, h, stride, c_idx, mode, ref_idx,
+                                  edge.filter_flag, need_pdpc);
+        }
+    }
+    if (need_pdpc) {
+        //8.4.5.2.15 Position-dependent intra prediction sample filtering process
+        // only handled here for planar, DC, vertical and horizontal modes;
+        // the angular predictors receive need_pdpc as an argument instead
+        if (!is_intra_mip && (mode == INTRA_PLANAR || mode == INTRA_DC ||
+            mode == INTRA_VERT || mode == INTRA_HORZ)) {
+            const int scale = (av_log2(w) + av_log2(h) - 2) >> 2;
+            const pixel *left = (pixel*)edge.left;
+            const pixel *top  = (pixel*)edge.top;
+            // x and y below shadow the block position declared above
+            for (int y = 0; y < h; y++) {
+                for (int x = 0; x < w; x++) {
+                    int l, t, wl, wt, pred;
+                    pixel val;
+                    if (mode == INTRA_PLANAR || mode == INTRA_DC) {
+                        l  = left[y];
+                        t = top[x];
+                        // the weights shrink as x / y grow
+                        wl = 32 >> FFMIN((x << 1) >> scale, 31);
+                        wt = 32 >> FFMIN((y << 1) >> scale, 31);
+                    } else {
+                        // gradient relative to the corner sample at index -1
+                        l  = left[y] - left[-1] + POS(x,y);
+                        t = top[x] - top[-1] + POS(x,y);
+                        // vertical mode only blends the left term,
+                        // horizontal mode only the top term
+                        wl = (mode == INTRA_VERT) ?  (32 >> FFMIN((x << 1) >> scale, 31)) : 0;
+                        wt = (mode == INTRA_HORZ) ?  (32 >> FFMIN((y << 1) >> scale, 31)) : 0;
+                    }
+                    val = POS(x, y);
+                    pred  = val + ((wl * (l - val) + wt * (t - val) + 32) >> 6);
+                    POS(x, y) = CLIP(pred);
+                }
+            }
+        }
+    }
+}
+
+//8.4.5.2.11 Specification of INTRA_PLANAR intra prediction mode
+// Each sample is the rounded sum of a vertical interpolation between top[x]
+// and left[h] and a horizontal interpolation between left[y] and top[w].
+static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
+    const uint8_t *_left, const int w, const int h, const ptrdiff_t stride)
+{
+    const pixel *top  = (const pixel *)_top;
+    const pixel *left = (const pixel *)_left;
+    pixel *row        = (pixel *)_src;
+    const int logw    = av_log2(w);
+    const int logh    = av_log2(h);
+    const int round   = w * h;
+    const int shift   = logw + logh + 1;
+
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            const int pred_v = ((h - 1 - y) * top[x]  + (y + 1) * left[h]) << logw;
+            const int pred_h = ((w - 1 - x) * left[y] + (x + 1) * top[w]) << logh;
+
+            row[x] = (pred_v + pred_h + round) >> shift;
+        }
+        row += stride;
+    }
+}
+
+//8.4.5.2.3 MIP boundary sample downsampling process
+// Reduce n_tb_s boundary samples to boundary_size values by averaging groups
+// of n_tb_s / boundary_size consecutive samples (with rounding). When both
+// sizes are equal the samples are copied unchanged.
+static av_always_inline void FUNC(mip_downsampling)(int *reduced, const int boundary_size,
+    const pixel *ref, const int n_tb_s)
+{
+    const int group = n_tb_s / boundary_size;
+    const int shift = av_log2(group);
+    const pixel *in = ref;
+
+    if (boundary_size == n_tb_s) {
+        for (int i = 0; i < n_tb_s; i++)
+            reduced[i] = ref[i];
+        return;
+    }
+
+    for (int i = 0; i < boundary_size; i++) {
+        int sum = *in++;
+
+        for (int j = 1; j < group; j++)
+            sum += *in++;
+        reduced[i] = (sum + (1 << (shift - 1))) >> shift;
+    }
+}
+
+// Compute the pred_size x pred_size reduced prediction. Each value is the dot
+// product of the reduced boundary with one matrix row, offset, shifted and
+// clipped to the sample range. The values are written on a grid with spacing
+// up_hor / up_ver that starts at (up_hor - 1, up_ver - 1), with x and y
+// exchanged when is_transposed is set.
+static av_always_inline void FUNC(mip_reduced_pred)(pixel *src, const ptrdiff_t stride,
+    const int up_hor, const int up_ver, const int pred_size, const int *reduced, const int reduced_size,
+    const int ow, const int temp0, const uint8_t *matrix, int is_transposed)
+{
+    pixel *base          = src + (up_hor - 1) + stride * (up_ver - 1);
+    const uint8_t *coefs = matrix;
+
+    for (int y = 0; y < pred_size; y++) {
+        for (int x = 0; x < pred_size; x++) {
+            const int col = is_transposed ? y : x;
+            const int row = is_transposed ? x : y;
+            int acc = 0;
+
+            for (int i = 0; i < reduced_size; i++)
+                acc += reduced[i] * coefs[i];
+            coefs += reduced_size;
+
+            acc = ((acc + ow) >> 6) + temp0;
+            base[(col * up_hor) + stride * (row * up_ver)] = av_clip(acc, 0, (1<<BIT_DEPTH) - 1);
+        }
+    }
+}
+
+// Linear interpolation along one direction: on each of dst_height lines, fills the
+// factor - 1 gaps in front of each of pred_size existing samples. The boundary
+// sample acts as the preceding neighbour of the first gap.
+static av_always_inline void FUNC(mip_upsampling_1d)(pixel *dst, const int dst_step, const int dst_stride, const int dst_height, const int factor,
+    const pixel *boundary, const int boundary_step,  const int pred_size)
+{
+    for (int line = 0; line < dst_height; line++, boundary += boundary_step, dst += dst_stride) {
+        const pixel *prev = boundary;
+        const pixel *next = dst - dst_step;
+        pixel *out = dst;
+        for (int n = 0; n < pred_size; n++) {
+            // next now points at the existing sample that closes the current gap
+            next += dst_step * factor;
+            for (int k = 1; k < factor; k++, out += dst_step) {
+                const int blend = (factor - k) * (*prev) + k * (*next);
+                *out = (blend + factor / 2) / factor;
+            }
+            prev = next;
+            out += dst_step;   // step over the sample that is read, not written
+        }
+    }
+}
+
+//8.4.5.2.2 Matrix-based intra sample prediction
+// Steps: downsample the top/left boundaries into reduced[], rewrite the entries
+// relative to the first value, run the matrix product on a reduced grid, then
+// interpolate up to the full w x h block.
+static av_always_inline void FUNC(pred_mip)(uint8_t *_src, const uint8_t *_top,
+    const uint8_t *_left, const int w, const int h, const ptrdiff_t stride,
+    int mode_id, int is_transposed)
+{
+    static const int boundary_sizes[] = {2, 4, 4};
+    static const int pred_sizes[] = {4, 4, 8};
+
+    pixel *src        = (pixel *)_src;
+    const pixel *top  = (const pixel *)_top;
+    const pixel *left = (const pixel *)_left;
+
+    const int size_id       = ff_vvc_get_mip_size_id(w, h);
+    const int boundary_size = boundary_sizes[size_id];
+    const int pred_size     = pred_sizes[size_id];
+    const int in_size       = 2 * boundary_size - (size_id == 2);
+    const uint8_t *matrix   = ff_vvc_get_mip_matrix(size_id, mode_id);
+    const int up_hor        = w / pred_size;
+    const int up_ver        = h / pred_size;
+    // for size_id 2 every entry is taken from one slot further on (reduced[i + 1])
+    const int off           = size_id == 2;
+
+    int reduced[16];
+    // transposed modes place the left boundary first in reduced[]
+    int *red_t = is_transposed ? reduced + boundary_size : reduced;
+    int *red_l = is_transposed ? reduced : reduced + boundary_size;
+    int ow, temp0;
+
+    FUNC(mip_downsampling)(red_t, boundary_size, top, w);
+    FUNC(mip_downsampling)(red_l, boundary_size, left, h);
+
+    temp0 = reduced[0];
+    if (off)
+        ow = reduced[1] - temp0;
+    else
+        ow = (1 << (BIT_DEPTH - 1)) - temp0;
+    reduced[0] = ow;
+    for (int i = 1; i < in_size; i++) {
+        reduced[i] = reduced[i + off] - temp0;
+        ow += reduced[i];
+    }
+    ow = 32 - 32 * ow;   // 32 minus 32 times the sum of the rewritten entries
+
+    FUNC(mip_reduced_pred)(src, stride, up_hor, up_ver, pred_size, reduced, in_size, ow, temp0, matrix, is_transposed);
+    // fill the samples between the reduced-grid outputs, horizontally then vertically
+    if (up_hor > 1)
+        FUNC(mip_upsampling_1d)(&POS(0, up_ver - 1), 1, up_ver * stride, pred_size, up_hor, left + up_ver - 1, up_ver, pred_size);
+    if (up_ver > 1)
+        FUNC(mip_upsampling_1d)(src, stride, 1, w, up_ver, top, 1, pred_size);
+}
+
+// DC value: rounded mean of the top row when w > h, of the left column when
+// w < h, and of both when the block is square.
+static av_always_inline pixel FUNC(pred_dc_val)(const pixel *top, const pixel *left,
+    const int w, const int h)
+{
+    // number of samples that end up in the sum
+    const unsigned int count = (w == h) ? (w << 1) : FFMAX(w, h);
+    const int shift          = av_log2(count);
+    const unsigned int round = count >> 1;
+    int sum = 0;
+
+    if (w >= h)
+        for (int i = 0; i < w; i++)
+            sum += top[i];
+    if (w <= h)
+        for (int i = 0; i < h; i++)
+            sum += left[i];
+    return (sum + round) >> shift;
+}
+
+//8.4.5.2.12 Specification of INTRA_DC intra prediction mode
+// Fills the block with the single value returned by pred_dc_val().
+// Each row is written four pixels per store, stepping x by 4 until it
+// reaches w.
+static av_always_inline void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+    const uint8_t *_left, const int w, const int h, const ptrdiff_t stride)
+{
+    pixel *row          = (pixel *)_src;
+    const pixel *top    = (const pixel *)_top;
+    const pixel *left   = (const pixel *)_left;
+    const pixel dc      = FUNC(pred_dc_val)(top, left, w, h);
+    const pixel4 fill   = PIXEL_SPLAT_X4(dc);
+
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x += 4)
+            AV_WN4P(row + x, fill);
+        row += stride;
+    }
+}
+
+// Vertical prediction: every one of the h rows is a copy of the w pixels at top.
+static av_always_inline void FUNC(pred_v)(uint8_t *_src, const uint8_t *_top,
+    const int w, const int h, const ptrdiff_t stride)
+{
+    const pixel *top    = (const pixel *)_top;
+    pixel *row          = (pixel *)_src;
+    const size_t bytes  = sizeof(pixel)  * w;
+    for (int y = 0; y < h; y++, row += stride)
+        memcpy(row, top, bytes);
+}
+
+// Horizontal prediction: row y is filled with left[y], four pixels per store.
+static void FUNC(pred_h)(uint8_t *_src, const uint8_t *_left, const int w, const int h,
+    const ptrdiff_t stride)
+{
+    pixel *src          = (pixel *)_src;
+    const pixel *left   = (const pixel *)_left;
+    for (int y = 0; y < h; y++) {
+        const pixel4 fill = PIXEL_SPLAT_X4(left[y]);
+        for (pixel *p = &POS(0, y), *end = p + w; p < end; p += 4)
+            AV_WN4P(p, fill);
+    }
+}
+
+//8.4.5.2.13 Specification of INTRA_ANGULAR2..INTRA_ANGULAR66 intra prediction modes
+// Variant that reads the top reference row. Row y uses one offset (pos, in 1/32
+// sample units) for all of its samples:
+//   - whole-sample offset and no smoothing requested: plain copy,
+//   - c_idx == 0: 4-tap filter from ff_vvc_filter_g (filter_flag) or ff_vvc_filter_c,
+//   - otherwise: 2-tap linear blend.
+// With need_pdpc set, the first FFMIN(w, 3 << nscale) samples of every row are
+// then blended with a sample from the left reference.
+static void FUNC(pred_angular_v)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+    const int w, const int h, const ptrdiff_t stride, const int c_idx, const int mode,
+    const int ref_idx, const int filter_flag, const int need_pdpc)
+{
+    pixel *src          = (pixel *)_src;
+    const pixel *left   = (const pixel *)_left;
+    const pixel *top    = (const pixel *)_top - (1 + ref_idx);
+    const int angle     = ff_vvc_intra_pred_angle_derive(mode);
+    const int is_luma   = !c_idx;
+    int nscale, inv_angle;   // only set and only read when need_pdpc is non-zero
+
+    if (need_pdpc) {
+        inv_angle = ff_vvc_intra_inv_angle_derive(angle);
+        nscale = ff_vvc_nscale_derive(w, h, mode);
+    }
+
+    for (int y = 0, pos = (1 + ref_idx) * angle; y < h; y++, pos += angle) {
+        const int idx  = (pos >> 5) + ref_idx;   // whole-sample part of the offset
+        const int fact = pos & 31;               // fractional part, 0..31
+        if (!fact && (!is_luma || !filter_flag)) {
+            // nothing to interpolate: copy the reference sample
+            for (int x = 0; x < w; x++)
+                POS(x, y) = top[x + idx + 1];
+        } else if (is_luma) {
+            // 4-tap interpolation, result passed through av_clip_pixel()
+            const int8_t *taps = filter_flag ? ff_vvc_filter_g[fact] : ff_vvc_filter_c[fact];
+            for (int x = 0; x < w; x++) {
+                const pixel *p = top + x + idx;
+                const int sum  = p[0] * taps[0] + p[1] * taps[1] + p[2] * taps[2] + p[3] * taps[3];
+                POS(x, y) = av_clip_pixel((sum + 32) >> 6);
+            }
+        } else {
+            // 2-tap blend between two neighbouring reference samples
+            for (int x = 0; x < w; x++) {
+                const pixel *p = top + x + idx + 1;
+                POS(x, y) = ((32 - fact) * p[0] + fact * p[1] + 16) >> 5;
+            }
+        }
+        if (need_pdpc) {
+            // wl is 32 at x == 0 and is shifted down further as x grows
+            const int pdpc_w  = FFMIN(w, 3 << nscale);
+            int inv_angle_sum = 256 + inv_angle;
+            for (int x = 0; x < pdpc_w; x++, inv_angle_sum += inv_angle) {
+                const pixel l   = left[y + (inv_angle_sum >> 9)];
+                const pixel val = POS(x, y);
+                const int wl    = 32 >> ((x << 1) >> nscale);
+                POS(x, y) = CLIP(val + (((l - val) * wl + 32) >> 6));
+            }
+        }
+    }
+}
+
+//8.4.5.2.13 Specification of INTRA_ANGULAR2..INTRA_ANGULAR66 intra prediction modes
+// Variant that reads the left reference column; the offset pos (1/32 sample
+// units) advances with x. c_idx == 0 uses a 4-tap filter, other components a
+// 2-tap blend. With need_pdpc set, rows y < (3 << nscale) are also blended
+// with a sample from the top reference using weight wt.
+static void FUNC(pred_angular_h)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+    const int w, const int h, const ptrdiff_t stride, const int c_idx, const int mode,
+    const int ref_idx, const int filter_flag, const int need_pdpc)
+{
+    pixel *src          = (pixel *)_src;
+    const pixel *left   = (const pixel *)_left - (1 + ref_idx);
+    const pixel *top    = (const pixel *)_top;
+    const int is_luma = !c_idx;
+    const int intra_pred_angle = ff_vvc_intra_pred_angle_derive(mode);
+    int nscale = 0, inv_angle = 0, inv_angle_sum = 0;
+
+    if (need_pdpc) {
+        inv_angle = ff_vvc_intra_inv_angle_derive(intra_pred_angle);
+        inv_angle_sum = 256 + inv_angle;
+        nscale = ff_vvc_nscale_derive(w, h, mode);
+    }
+
+    for (int y = 0; y < h; y++) {
+        // PDPC only touches rows y < (3 << nscale). Deriving the weight just for
+        // those rows keeps the shift count at 5 or less; evaluating it for every
+        // row is undefined once (y * 2) >> nscale reaches the width of int.
+        const int pdpc_row = need_pdpc && y < (3 << nscale);
+        const int wt = pdpc_row ? 32 >> ((y * 2) >> nscale) : 0;
+        int pos = (1 + ref_idx) * intra_pred_angle;
+        for (int x = 0; x < w; x++) {
+            const int idx  = (pos >> 5) + ref_idx;   // whole-sample part of the offset
+            const int fact = pos & 31;               // fractional part, 0..31
+            const pixel *p = left + y + idx;
+            int pred;
+            if (!fact && (!is_luma || !filter_flag)) {
+                pred = p[1];
+            } else if (is_luma) {
+                const int8_t *f = filter_flag ? ff_vvc_filter_g[fact] : ff_vvc_filter_c[fact];
+                pred = (p[0] * f[0] + p[1] * f[1] + p[2] * f[2] + p[3] * f[3] + 32) >> 6;
+                pred = CLIP(pred);
+            } else {
+                pred = ((32 - fact) * p[1] + fact * p[2] + 16) >> 5;
+            }
+            if (pdpc_row) {
+                const pixel t = top[x + (inv_angle_sum >> 9)];
+                pred = CLIP(pred + (((t - pred) * wt + 32) >> 6));
+            }
+            POS(x, y) = pred;
+            pos += intra_pred_angle;
+        }
+        if (need_pdpc)
+            inv_angle_sum += inv_angle;
+    }
+}
+// Stores this template's FUNC()-named intra routines into the function pointers of *intra.
+static void FUNC(ff_vvc_intra_dsp_init)(VVCIntraDSPContext *const intra)
+{
+    intra->pred_planar       = FUNC(pred_planar);
+    intra->pred_dc           = FUNC(pred_dc);
+    intra->pred_v            = FUNC(pred_v);
+    intra->pred_h            = FUNC(pred_h);
+    intra->pred_angular_v    = FUNC(pred_angular_v);
+    intra->pred_angular_h    = FUNC(pred_angular_h);
+    intra->pred_mip          = FUNC(pred_mip);
+    intra->intra_pred        = FUNC(intra_pred);
+    intra->intra_cclm_pred   = FUNC(intra_cclm_pred);
+    intra->lmcs_scale_chroma = FUNC(lmcs_scale_chroma);
+}