diff mbox series

[FFmpeg-devel,v2] avcodec/vvc: Don't use large array on stack

Message ID tencent_90527C02AEB23975C365B5E46A1203B5680A@qq.com
State New
Headers show
Series [FFmpeg-devel,v2] avcodec/vvc: Don't use large array on stack | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Zhao Zhili Sept. 20, 2024, 3:43 a.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

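tmp_array in dmvr_hv takes 33024 bytes on the stack, which can be
dangerous. Move the buffer into VVCLocalContext (dmvr_tmp) and pass it
to the dmvr functions through a new tmp argument.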
---
 libavcodec/vvc/ctu.h             |  1 +
 libavcodec/vvc/dsp.h             |  2 +-
 libavcodec/vvc/inter.c           |  2 +-
 libavcodec/vvc/inter_template.c  | 12 +++++++-----
 libavcodec/x86/vvc/vvcdsp_init.c |  8 ++++----
 tests/checkasm/vvc_mc.c          | 17 +++++++++++++----
 6 files changed, 27 insertions(+), 15 deletions(-)

Comments

Zhao Zhili Sept. 20, 2024, 1:44 p.m. UTC | #1
> On Sep 20, 2024, at 11:43, Zhao Zhili <quinkblack@foxmail.com> wrote:
> 
> From: Zhao Zhili <zhilizhao@tencent.com>
> 
> tmp_array in dmvr_hv takes 33024 bytes on stack, which can be
> dangerous.

Personally, I prefer version 3:

https://ffmpeg.org/pipermail/ffmpeg-devel/2024-September/333709.html

> ---
> libavcodec/vvc/ctu.h             |  1 +
> libavcodec/vvc/dsp.h             |  2 +-
> libavcodec/vvc/inter.c           |  2 +-
> libavcodec/vvc/inter_template.c  | 12 +++++++-----
> libavcodec/x86/vvc/vvcdsp_init.c |  8 ++++----
> tests/checkasm/vvc_mc.c          | 17 +++++++++++++----
> 6 files changed, 27 insertions(+), 15 deletions(-)
> 
> diff --git a/libavcodec/vvc/ctu.h b/libavcodec/vvc/ctu.h
> index eab4612561..eb3e51c7e5 100644
> --- a/libavcodec/vvc/ctu.h
> +++ b/libavcodec/vvc/ctu.h
> @@ -385,6 +385,7 @@ typedef struct VVCLocalContext {
>     DECLARE_ALIGNED(32, uint8_t, alf_buffer_luma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
>     DECLARE_ALIGNED(32, uint8_t, alf_buffer_chroma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
>     DECLARE_ALIGNED(32, int32_t, alf_gradient_tmp)[ALF_GRADIENT_SIZE * ALF_GRADIENT_SIZE * ALF_NUM_DIR];
> +    DECLARE_ALIGNED(32, int16_t, dmvr_tmp)[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE];
> 
>     struct {
>         int sbt_num_fourths_tb0;                ///< SbtNumFourthsTb0
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 635ebcafed..3594dfc5f5 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext {
> 
>     int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
>     void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
> -        intptr_t mx, intptr_t my, int width);
> +        intptr_t mx, intptr_t my, int width, int16_t *tmp);
> } VVCInterDSPContext;
> 
> struct VVCLocalContext;
> diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c
> index 64a9dd1e46..48b633d580 100644
> --- a/libavcodec/vvc/inter.c
> +++ b/libavcodec/vvc/inter.c
> @@ -806,7 +806,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
>         const int wrap_enabled  = fc->ps.pps->r->pps_ref_wraparound_enabled_flag;
> 
>         MC_EMULATED_EDGE_BILINEAR(lc->edge_emu_buffer, &src, &src_stride, ox, oy);
> -        fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w);
> +        fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w, lc->dmvr_tmp);
>     }
> 
>     min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h);
> diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c
> index c073a73e76..fad1ba801f 100644
> --- a/libavcodec/vvc/inter_template.c
> +++ b/libavcodec/vvc/inter_template.c
> @@ -474,7 +474,8 @@ static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride, const i
> 
> //8.5.3.2.2 Luma sample bilinear interpolation process
> static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
> -    const int height, const intptr_t mx, const intptr_t my, const int width)
> +    const int height, const intptr_t mx, const intptr_t my, const int width,
> +    int16_t *tmp)
> {
> #if BIT_DEPTH != 10
>     const pixel *src            = (const pixel *)_src;
> @@ -502,7 +503,8 @@ static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_s
> 
> //8.5.3.2.2 Luma sample bilinear interpolation process
> static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
> -    const int height, const intptr_t mx, const intptr_t my, const int width)
> +    const int height, const intptr_t mx, const intptr_t my, const int width,
> +    int16_t *tmp)
> {
>     const pixel *src            = (const pixel*)_src;
>     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
> @@ -520,7 +522,8 @@ static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src
> 
> //8.5.3.2.2 Luma sample bilinear interpolation process
> static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
> -    const int height, const intptr_t mx, const intptr_t my, const int width)
> +    const int height, const intptr_t mx, const intptr_t my, const int width,
> +    int16_t *tmp)
> {
>     const pixel *src            = (pixel*)_src;
>     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
> @@ -539,9 +542,8 @@ static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src
> 
> //8.5.3.2.2 Luma sample bilinear interpolation process
> static void FUNC(dmvr_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
> -    const int height, const intptr_t mx, const intptr_t my, const int width)
> +    const int height, const intptr_t mx, const intptr_t my, const int width, int16_t *tmp_array)
> {
> -    int16_t tmp_array[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE];
>     int16_t *tmp                = tmp_array;
>     const pixel *src            = (const pixel*)_src;
>     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
> diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
> index f3e2e3a27b..7ff3e2bdff 100644
> --- a/libavcodec/x86/vvc/vvcdsp_init.c
> +++ b/libavcodec/x86/vvc/vvcdsp_init.c
> @@ -90,13 +90,13 @@ AVG_PROTOTYPES(12, avx2)
> 
> #define DMVR_PROTOTYPES(bd, opt)                                                                    \
> void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,               \
> -     int height, intptr_t mx, intptr_t my, int width);                                              \
> +     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
> void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
> -     int height, intptr_t mx, intptr_t my, int width);                                              \
> +     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
> void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
> -     int height, intptr_t mx, intptr_t my, int width);                                              \
> +     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
> void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,            \
> -     int height, intptr_t mx, intptr_t my, int width);                                              \
> +     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
> 
> DMVR_PROTOTYPES( 8, avx2)
> DMVR_PROTOTYPES(10, avx2)
> diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
> index 754cf19065..591557dfa6 100644
> --- a/tests/checkasm/vvc_mc.c
> +++ b/tests/checkasm/vvc_mc.c
> @@ -333,6 +333,7 @@ static void check_avg(void)
> }
> 
> #define SR_RANGE 2
> +#define DMVR_TMP_BUF_SIZE ((MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE * sizeof(int16_t))
> static void check_dmvr(void)
> {
>     LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
> @@ -340,14 +341,20 @@ static void check_dmvr(void)
>     LOCAL_ALIGNED_32(uint8_t,  src0, [SRC_BUF_SIZE]);
>     LOCAL_ALIGNED_32(uint8_t,  src1, [SRC_BUF_SIZE]);
>     const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
> +    int16_t *tmp0 = av_mallocz(DMVR_TMP_BUF_SIZE);
> +    int16_t *tmp1 = av_mallocz(DMVR_TMP_BUF_SIZE);
> 
>     VVCDSPContext c;
>     declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
> -        intptr_t mx, intptr_t my, int width);
> +        intptr_t mx, intptr_t my, int width, int16_t *tmp);
> +
> +    if (!tmp0 || !tmp1)
> +        fail();
> 
>     for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
>         ff_vvc_dsp_init(&c, bit_depth);
>         randomize_pixels(src0, src1, SRC_BUF_SIZE);
> +        randomize_buffers(tmp0, tmp1, DMVR_TMP_BUF_SIZE / 2, UINT32_MAX);
>         for (int i = 0; i < 2; i++) {
>             for (int j = 0; j < 2; j++) {
>                 for (int h = 8; h <= 16; h *= 2) {
> @@ -371,8 +378,8 @@ static void check_dmvr(void)
>                         if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
>                             memset(dst0, 0, DST_BUF_SIZE);
>                             memset(dst1, 0, DST_BUF_SIZE);
> -                            call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> -                            call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> +                            call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp0);
> +                            call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1);
>                             for (int k = 0; k < pred_h; k++) {
>                                 if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
>                                     fail();
> @@ -380,13 +387,15 @@ static void check_dmvr(void)
>                                 }
>                             }
> 
> -                            bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> +                            bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1);
>                         }
>                     }
>                 }
>             }
>         }
>     }
> +    av_free(tmp0);
> +    av_free(tmp1);
>     report("dmvr");
> }
> 
> -- 
> 2.39.3
>
diff mbox series

Patch

diff --git a/libavcodec/vvc/ctu.h b/libavcodec/vvc/ctu.h
index eab4612561..eb3e51c7e5 100644
--- a/libavcodec/vvc/ctu.h
+++ b/libavcodec/vvc/ctu.h
@@ -385,6 +385,7 @@  typedef struct VVCLocalContext {
     DECLARE_ALIGNED(32, uint8_t, alf_buffer_luma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
     DECLARE_ALIGNED(32, uint8_t, alf_buffer_chroma)[(MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) * EDGE_EMU_BUFFER_STRIDE * 2];
     DECLARE_ALIGNED(32, int32_t, alf_gradient_tmp)[ALF_GRADIENT_SIZE * ALF_GRADIENT_SIZE * ALF_NUM_DIR];
+    DECLARE_ALIGNED(32, int16_t, dmvr_tmp)[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE];
 
     struct {
         int sbt_num_fourths_tb0;                ///< SbtNumFourthsTb0
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 635ebcafed..3594dfc5f5 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -99,7 +99,7 @@  typedef struct VVCInterDSPContext {
 
     int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
     void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
-        intptr_t mx, intptr_t my, int width);
+        intptr_t mx, intptr_t my, int width, int16_t *tmp);
 } VVCInterDSPContext;
 
 struct VVCLocalContext;
diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c
index 64a9dd1e46..48b633d580 100644
--- a/libavcodec/vvc/inter.c
+++ b/libavcodec/vvc/inter.c
@@ -806,7 +806,7 @@  static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
         const int wrap_enabled  = fc->ps.pps->r->pps_ref_wraparound_enabled_flag;
 
         MC_EMULATED_EDGE_BILINEAR(lc->edge_emu_buffer, &src, &src_stride, ox, oy);
-        fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w);
+        fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w, lc->dmvr_tmp);
     }
 
     min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h);
diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c
index c073a73e76..fad1ba801f 100644
--- a/libavcodec/vvc/inter_template.c
+++ b/libavcodec/vvc/inter_template.c
@@ -474,7 +474,8 @@  static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride, const i
 
 //8.5.3.2.2 Luma sample bilinear interpolation process
 static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
-    const int height, const intptr_t mx, const intptr_t my, const int width)
+    const int height, const intptr_t mx, const intptr_t my, const int width,
+    int16_t *tmp)
 {
 #if BIT_DEPTH != 10
     const pixel *src            = (const pixel *)_src;
@@ -502,7 +503,8 @@  static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_s
 
 //8.5.3.2.2 Luma sample bilinear interpolation process
 static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
-    const int height, const intptr_t mx, const intptr_t my, const int width)
+    const int height, const intptr_t mx, const intptr_t my, const int width,
+    int16_t *tmp)
 {
     const pixel *src            = (const pixel*)_src;
     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
@@ -520,7 +522,8 @@  static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src
 
 //8.5.3.2.2 Luma sample bilinear interpolation process
 static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
-    const int height, const intptr_t mx, const intptr_t my, const int width)
+    const int height, const intptr_t mx, const intptr_t my, const int width,
+    int16_t *tmp)
 {
     const pixel *src            = (pixel*)_src;
     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
@@ -539,9 +542,8 @@  static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src
 
 //8.5.3.2.2 Luma sample bilinear interpolation process
 static void FUNC(dmvr_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
-    const int height, const intptr_t mx, const intptr_t my, const int width)
+    const int height, const intptr_t mx, const intptr_t my, const int width, int16_t *tmp_array)
 {
-    int16_t tmp_array[(MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE];
     int16_t *tmp                = tmp_array;
     const pixel *src            = (const pixel*)_src;
     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index f3e2e3a27b..7ff3e2bdff 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -90,13 +90,13 @@  AVG_PROTOTYPES(12, avx2)
 
 #define DMVR_PROTOTYPES(bd, opt)                                                                    \
 void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,               \
-     int height, intptr_t mx, intptr_t my, int width);                                              \
+     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
 void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
-     int height, intptr_t mx, intptr_t my, int width);                                              \
+     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
 void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
-     int height, intptr_t mx, intptr_t my, int width);                                              \
+     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
 void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,            \
-     int height, intptr_t mx, intptr_t my, int width);                                              \
+     int height, intptr_t mx, intptr_t my, int width, int16_t *unused);                             \
 
 DMVR_PROTOTYPES( 8, avx2)
 DMVR_PROTOTYPES(10, avx2)
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 754cf19065..591557dfa6 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -333,6 +333,7 @@  static void check_avg(void)
 }
 
 #define SR_RANGE 2
+#define DMVR_TMP_BUF_SIZE ((MAX_PB_SIZE + BILINEAR_EXTRA) * MAX_PB_SIZE * sizeof(int16_t))
 static void check_dmvr(void)
 {
     LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
@@ -340,14 +341,20 @@  static void check_dmvr(void)
     LOCAL_ALIGNED_32(uint8_t,  src0, [SRC_BUF_SIZE]);
     LOCAL_ALIGNED_32(uint8_t,  src1, [SRC_BUF_SIZE]);
     const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
+    int16_t *tmp0 = av_mallocz(DMVR_TMP_BUF_SIZE);
+    int16_t *tmp1 = av_mallocz(DMVR_TMP_BUF_SIZE);
 
     VVCDSPContext c;
     declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
-        intptr_t mx, intptr_t my, int width);
+        intptr_t mx, intptr_t my, int width, int16_t *tmp);
+
+    if (!tmp0 || !tmp1)
+        fail();
 
     for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
         ff_vvc_dsp_init(&c, bit_depth);
         randomize_pixels(src0, src1, SRC_BUF_SIZE);
+        randomize_buffers(tmp0, tmp1, DMVR_TMP_BUF_SIZE / 2, UINT32_MAX);
         for (int i = 0; i < 2; i++) {
             for (int j = 0; j < 2; j++) {
                 for (int h = 8; h <= 16; h *= 2) {
@@ -371,8 +378,8 @@  static void check_dmvr(void)
                         if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
                             memset(dst0, 0, DST_BUF_SIZE);
                             memset(dst1, 0, DST_BUF_SIZE);
-                            call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
-                            call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                            call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp0);
+                            call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1);
                             for (int k = 0; k < pred_h; k++) {
                                 if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
                                     fail();
@@ -380,13 +387,15 @@  static void check_dmvr(void)
                                 }
                             }
 
-                            bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                            bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w, tmp1);
                         }
                     }
                 }
             }
         }
     }
+    av_free(tmp0);
+    av_free(tmp1);
     report("dmvr");
 }