Message ID | tencent_90527C02AEB23975C365B5E46A1203B5680A@qq.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v2] avcodec/vvc: Don't use large array on stack | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
> On Sep 20, 2024, at 11:43, Zhao Zhili <quinkblack@foxmail.com> wrote: > > From: Zhao Zhili <zhilizhao@tencent.com> > > tmp_array in dmvr_hv takes 33024 bytes on stack, which can be > dangerous. I prefer version 3 personally https://ffmpeg.org/pipermail/ffmpeg-devel/2024-September/333709.html > --- > libavcodec/vvc/ctu.h | 1 + > libavcodec/vvc/dsp.h | 2 +- > libavcodec/vvc/inter.c | 2 +- > libavcodec/vvc/inter_template.c | 12 +++++++----- > libavcodec/x86/vvc/vvcdsp_init.c | 8 ++++---- > tests/checkasm/vvc_mc.c | 17 +++++++++++++---- > 6 files changed, 27 insertions(+), 15 deletions(-) > > diff --git a/libavcodec/vvc/ctu.h b/libavcodec/vvc/ctu.h > index eab4612561..eb3e51c7e5 100644 > --- a/libavcodec/vvc/ctu.h > +++ b/libavcodec/vvc/ctu.h > @@ -385,6 +385,7 @@ typedef struct VVCLocalContext { > DECLARE_ALIGNED(32, uint8_t, alf_buffer_luma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2]; > DECLARE_ALIGNED(32, uint8_t, alf_buffer_chroma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2]; > DECLARE_ALIGNED(32, int32_t, alf_gradient_tmp)[ALF_GRADIENT_SIZE * ALF_GRADIENT_SIZE * ALF_NUM_DIR]; > + DECLARE_ALIGNED(32, int16_t, dmvr_tmp)[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE]; > > struct { > int sbt_num_fourths_tb0; ///< SbtNumFourthsTb0 > diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h > index 635ebcafed..3594dfc5f5 100644 > --- a/libavcodec/vvc/dsp.h > +++ b/libavcodec/vvc/dsp.h > @@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext { > > int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); > void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height, > - intptr_t mx, intptr_t my, int width); > + intptr_t mx, intptr_t my, int width, int16_t *tmp); > } VVCInterDSPContext; > > struct VVCLocalContext; > diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c > index 64a9dd1e46..48b633d580 100644 > --- a/libavcodec/vvc/inter.c > +++ b/libavcodec/vvc/inter.c > @@ -806,7 +806,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv, > const int wrap_enabled = fc->ps.pps->r->pps_ref_wraparound_enabled_flag; > > MC_EMULATED_EDGE_BILINEAR(lc->edge_emu_buffer, &src, &src_stride, ox, oy); > - fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w); > + fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w, lc->dmvr_tmp); > } > > min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h); > diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c > index c073a73e76..fad1ba801f 100644 > --- a/libavcodec/vvc/inter_template.c > +++ b/libavcodec/vvc/inter_template.c > @@ -474,7 +474,8 @@ static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride, const i > > //8.5.3.2.2 Luma sample bilinear interpolation process > static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, > - const int height, const intptr_t mx, const intptr_t my, const int width) > + const int height, const intptr_t mx, const intptr_t my, const int width, > + int16_t *tmp) > { > #if BIT_DEPTH != 10 > const pixel *src = (const pixel *)_src; > @@ -502,7 +503,8 @@ static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_s > > //8.5.3.2.2 Luma sample bilinear interpolation process > static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, > - const int height, const intptr_t mx, const intptr_t my, const int width) > + const int height, const intptr_t mx, const intptr_t my, const int width, > + int16_t *tmp) > { > const pixel *src = (const pixel*)_src; > const ptrdiff_t src_stride = _src_stride / sizeof(pixel); > @@ -520,7 +522,8 @@ static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src > > //8.5.3.2.2 Luma sample bilinear interpolation process > static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, > - const int height, const intptr_t mx, const intptr_t my, const int width) > + const int height, const intptr_t mx, const intptr_t my, const int width, > + int16_t *tmp) > { > const pixel *src = (pixel*)_src; > const ptrdiff_t src_stride = _src_stride / sizeof(pixel); > @@ -539,9 +542,8 @@ static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src > > //8.5.3.2.2 Luma sample bilinear interpolation process > static void FUNC(dmvr_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, > - const int height, const intptr_t mx, const intptr_t my, const int width) > + const int height, const intptr_t mx, const intptr_t my, const int width, int16_t *tmp_array) > { > - int16_t tmp_array[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE]; > int16_t *tmp = tmp_array; > const pixel *src = (const pixel*)_src; > const ptrdiff_t src_stride = _src_stride / sizeof(pixel); > diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c > index f3e2e3a27b..7ff3e2bdff 100644 > --- a/libavcodec/x86/vvc/vvcdsp_init.c > +++ b/libavcodec/x86/vvc/vvcdsp_init.c > @@ -90,13 +90,13 @@ AVG_PROTOTYPES(12, avx2) > > #define DMVR_PROTOTYPES(bd, opt) \ > void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ > - int height, intptr_t mx, intptr_t my, int width); \ > + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ > void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ > - int height, intptr_t mx, intptr_t my, int width); \ > + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ > void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ > - int height, intptr_t mx, intptr_t my, int width); \ > + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ > void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ > - int height, intptr_t mx, intptr_t my, int width); \ > + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ > > DMVR_PROTOTYPES( 8, avx2) > DMVR_PROTOTYPES(10, avx2) > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c > index 754cf19065..591557dfa6 100644 > --- a/tests/checkasm/vvc_mc.c > +++ b/tests/checkasm/vvc_mc.c > @@ -333,6 +333,7 @@ static void check_avg(void) > } > > #define SR_RANGE 2 > +#define DMVR_TMP_BUF_SIZE ((MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE * sizeof(int16_t)) > static void check_dmvr(void) > { > LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]); > @@ -340,14 +341,20 @@ static void check_dmvr(void) > LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); > LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); > const int dst_stride = MAX_PB_SIZE * sizeof(int16_t); > + int16_t *tmp0 = av_mallocz(DMVR_TMP_BUF_SIZE); > + int16_t *tmp1 = av_mallocz(DMVR_TMP_BUF_SIZE); > > VVCDSPContext c; > declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height, > - intptr_t mx, intptr_t my, int width); > + intptr_t mx, intptr_t my, int width, int16_t *tmp); > + > + if (!tmp0 || !tmp1) > + fail(); > > for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) { > ff_vvc_dsp_init(&c, bit_depth); > randomize_pixels(src0, src1, SRC_BUF_SIZE); > + randomize_buffers(tmp0, tmp1, DMVR_TMP_BUF_SIZE / 2, UINT32_MAX); > for (int i = 0; i < 2; i++) { > for (int j = 0; j < 2; j++) { > for (int h = 8; h <= 16; h *= 2) { > @@ -371,8 +378,8 @@ static void check_dmvr(void) > if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) { > memset(dst0, 0, DST_BUF_SIZE); > memset(dst1, 0, DST_BUF_SIZE); > - call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w); > - call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w); > + call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp0); > + call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1); > for (int k = 0; k < pred_h; k++) { > if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) { > fail(); > @@ -380,13 +387,15 @@ static void check_dmvr(void) > } > } > > - bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w); > + bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1); > } > } > } > } > } > } > + av_free(tmp0); > + av_free(tmp1); > report("dmvr"); > } > > -- > 2.39.3 >
diff --git a/libavcodec/vvc/ctu.h b/libavcodec/vvc/ctu.h index eab4612561..eb3e51c7e5 100644 --- a/libavcodec/vvc/ctu.h +++ b/libavcodec/vvc/ctu.h @@ -385,6 +385,7 @@ typedef struct VVCLocalContext { DECLARE_ALIGNED(32, uint8_t, alf_buffer_luma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2]; DECLARE_ALIGNED(32, uint8_t, alf_buffer_chroma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2]; DECLARE_ALIGNED(32, int32_t, alf_gradient_tmp)[ALF_GRADIENT_SIZE * ALF_GRADIENT_SIZE * ALF_NUM_DIR]; + DECLARE_ALIGNED(32, int16_t, dmvr_tmp)[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE]; struct { int sbt_num_fourths_tb0; ///< SbtNumFourthsTb0 diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h index 635ebcafed..3594dfc5f5 100644 --- a/libavcodec/vvc/dsp.h +++ b/libavcodec/vvc/dsp.h @@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext { int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height, - intptr_t mx, intptr_t my, int width); + intptr_t mx, intptr_t my, int width, int16_t *tmp); } VVCInterDSPContext; struct VVCLocalContext; diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c index 64a9dd1e46..48b633d580 100644 --- a/libavcodec/vvc/inter.c +++ b/libavcodec/vvc/inter.c @@ -806,7 +806,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv, const int wrap_enabled = fc->ps.pps->r->pps_ref_wraparound_enabled_flag; MC_EMULATED_EDGE_BILINEAR(lc->edge_emu_buffer, &src, &src_stride, ox, oy); - fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w); + fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w, lc->dmvr_tmp); } min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h); diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c index c073a73e76..fad1ba801f 100644 --- a/libavcodec/vvc/inter_template.c +++ b/libavcodec/vvc/inter_template.c @@ -474,7 +474,8 @@ static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride, const i //8.5.3.2.2 Luma sample bilinear interpolation process static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, - const int height, const intptr_t mx, const intptr_t my, const int width) + const int height, const intptr_t mx, const intptr_t my, const int width, + int16_t *tmp) { #if BIT_DEPTH != 10 const pixel *src = (const pixel *)_src; @@ -502,7 +503,8 @@ static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_s //8.5.3.2.2 Luma sample bilinear interpolation process static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, - const int height, const intptr_t mx, const intptr_t my, const int width) + const int height, const intptr_t mx, const intptr_t my, const int width, + int16_t *tmp) { const pixel *src = (const pixel*)_src; const ptrdiff_t src_stride = _src_stride / sizeof(pixel); @@ -520,7 +522,8 @@ static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src //8.5.3.2.2 Luma sample bilinear interpolation process static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, - const int height, const intptr_t mx, const intptr_t my, const int width) + const int height, const intptr_t mx, const intptr_t my, const int width, + int16_t *tmp) { const pixel *src = (pixel*)_src; const ptrdiff_t src_stride = _src_stride / sizeof(pixel); @@ -539,9 +542,8 @@ static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src //8.5.3.2.2 Luma sample bilinear interpolation process static void FUNC(dmvr_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, - const int height, const intptr_t mx, const intptr_t my, const int width) + const int height, const intptr_t mx, const intptr_t my, const int width, int16_t *tmp_array) { - int16_t tmp_array[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE]; int16_t *tmp = tmp_array; const pixel *src = (const pixel*)_src; const ptrdiff_t src_stride = _src_stride / sizeof(pixel); diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index f3e2e3a27b..7ff3e2bdff 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -90,13 +90,13 @@ AVG_PROTOTYPES(12, avx2) #define DMVR_PROTOTYPES(bd, opt) \ void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ - int height, intptr_t mx, intptr_t my, int width); \ + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ - int height, intptr_t mx, intptr_t my, int width); \ + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ - int height, intptr_t mx, intptr_t my, int width); \ + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ - int height, intptr_t mx, intptr_t my, int width); \ + int height, intptr_t mx, intptr_t my, int width, int16_t *unused); \ DMVR_PROTOTYPES( 8, avx2) DMVR_PROTOTYPES(10, avx2) diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c index 754cf19065..591557dfa6 100644 --- a/tests/checkasm/vvc_mc.c +++ b/tests/checkasm/vvc_mc.c @@ -333,6 +333,7 @@ static void check_avg(void) } #define SR_RANGE 2 +#define DMVR_TMP_BUF_SIZE ((MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE * sizeof(int16_t)) static void check_dmvr(void) { LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]); @@ -340,14 +341,20 @@ static void check_dmvr(void) LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); const int dst_stride = MAX_PB_SIZE * sizeof(int16_t); + int16_t *tmp0 = av_mallocz(DMVR_TMP_BUF_SIZE); + int16_t *tmp1 = av_mallocz(DMVR_TMP_BUF_SIZE); VVCDSPContext c; declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height, - intptr_t mx, intptr_t my, int width); + intptr_t mx, intptr_t my, int width, int16_t *tmp); + + if (!tmp0 || !tmp1) + fail(); for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) { ff_vvc_dsp_init(&c, bit_depth); randomize_pixels(src0, src1, SRC_BUF_SIZE); + randomize_buffers(tmp0, tmp1, DMVR_TMP_BUF_SIZE / 2, UINT32_MAX); for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { for (int h = 8; h <= 16; h *= 2) { @@ -371,8 +378,8 @@ static void check_dmvr(void) if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) { memset(dst0, 0, DST_BUF_SIZE); memset(dst1, 0, DST_BUF_SIZE); - call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w); - call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w); + call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp0); + call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1); for (int k = 0; k < pred_h; k++) { if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) { fail(); @@ -380,13 +387,15 @@ static void check_dmvr(void) } } - bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w); + bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1); } } } } } } + av_free(tmp0); + av_free(tmp1); report("dmvr"); }
From: Zhao Zhili <zhilizhao@tencent.com> tmp_array in dmvr_hv takes 33024 bytes on stack, which can be dangerous. --- libavcodec/vvc/ctu.h | 1 + libavcodec/vvc/dsp.h | 2 +- libavcodec/vvc/inter.c | 2 +- libavcodec/vvc/inter_template.c | 12 +++++++----- libavcodec/x86/vvc/vvcdsp_init.c | 8 ++++---- tests/checkasm/vvc_mc.c | 17 +++++++++++++---- 6 files changed, 27 insertions(+), 15 deletions(-)