diff mbox series

[FFmpeg-devel] codec/aarch64/hevc:add transform_luma_neon checkasm

Message ID tencent_399030B280A76677E6352C4B09CD64B3C60A@qq.com
State New
Headers show
Series [FFmpeg-devel] codec/aarch64/hevc:add transform_luma_neon checkasm | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch
yinshiyou/configure_loongarch64 warning Failed to apply patch

Commit Message

徐福隆 April 9, 2023, 4:09 a.m. UTC
got 56% speed up (run_count=1000, CPU=Cortex A53)
transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 51 ++++++++++++++++++++++-
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
 tests/checkasm/hevc_idct.c                | 28 +++++++++++++
 3 files changed, 80 insertions(+), 1 deletion(-)

Comments

Martin Storsjö April 12, 2023, 12:53 p.m. UTC | #1
On Sun, 9 Apr 2023, xufuji456 wrote:

> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 51 ++++++++++++++++++++++-
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> tests/checkasm/hevc_idct.c                | 28 +++++++++++++
> 3 files changed, 80 insertions(+), 1 deletion(-)

When sending updates to your patches, please mention somewhere (outside of 
the main git commit message) what has changed. When I got back to looking 
at your patches now, I see a handful of revisions of the same patch with 
no explanation of what has changed.

>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 74a96957bf..fc683a6396 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -6,6 +6,7 @@
>  * Ported from arm/hevcdsp_idct_neon.S by
>  * Copyright (c) 2020 Reimar Döffinger
>  * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
> + * Copyright (c) 2023 xu fulong <839789740@qq.com>
>  *
>  * This file is part of FFmpeg.
>  *
> @@ -656,4 +657,52 @@ idct_dc 16, 8
> idct_dc 16, 10
> 
> idct_dc 32, 8
> -idct_dc 32, 10
> \ No newline at end of file

The patch still does not apply cleanly on git master due to this. Please 
rebase your patch on top of the actual public git master.

> +idct_dc 32, 10
> +
> +.macro tr4_luma_shift r0, r1, r2, r3, shift
> +        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
> +        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
> +        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
> +        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
> +
> +        saddl       v7.4s, \r0, \r3         // src0 + src3
> +        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
> +        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)
> +
> +        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
> +        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
> +        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
> +        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3
> +
> +        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
> +        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
> +        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
> +        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3
> +
> +        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
> +        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
> +        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
> +        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3
> +
> +        sqrshrn     \r0, v5.4s, \shift
> +        sqrshrn     \r1, v6.4s, \shift
> +        sqrshrn     \r2, v7.4s, \shift
> +        sqrshrn     \r3, v0.4s, \shift
> +.endm
> +
> +function ff_hevc_transform_luma_4x4_neon_8, export=1
> +        ld1            {v28.4h-v31.4h}, [x0]
> +        movi           v18.4s, #74
> +        movi           v19.4s, #29
> +        movi           v20.4s, #55
> +        movi           v21.4h, #74
> +
> +        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
> +        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> +
> +        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
> +        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> +
> +        st1            {v28.4h-v31.4h}, [x0]
> +        ret
> +endfunc
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index a923bae35c..6605a39973 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -75,6 +75,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
> +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
> void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
>                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
>                                   const int16_t *sao_offset_val, int sao_left_class,
> @@ -142,6 +143,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
>         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
>         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
> +        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
>         c->sao_band_filter[0]          =
>         c->sao_band_filter[1]          =
>         c->sao_band_filter[2]          =
> diff --git a/tests/checkasm/hevc_idct.c b/tests/checkasm/hevc_idct.c
> index 338b8a23e4..1c2b08d0f8 100644
> --- a/tests/checkasm/hevc_idct.c
> +++ b/tests/checkasm/hevc_idct.c
> @@ -84,6 +84,27 @@ static void check_idct_dc(HEVCDSPContext h, int bit_depth)
>     }
> }
> 
> +static void check_transform_luma(HEVCDSPContext h)

Unrelated to your patch, I see that this test file already does this 
elsewhere, but it is uncommon to pass such a context by value here; 
normally we'd pass it as a pointer to the subfunctions. But that's a 
preexisting issue, so stick with the current convention and we can change 
them all in a separate patch.

> +{
> +    LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]);
> +    LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]);
> +
> +    int block_size = 4;
> +    int size = block_size * block_size;
> +    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, int16_t *coeffs);
> +
> +    randomize_buffers(coeffs0, size);
> +    memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
> +
> +    if (check_func(h.transform_4x4_luma, "hevc_transform_4x4_luma")) {
> +        call_ref(coeffs0);
> +        call_new(coeffs1);
> +        if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
> +            fail();
> +        bench_new(coeffs1);
> +    }
> +}
> +
> void checkasm_check_hevc_idct(void)
> {
>     int bit_depth;
> @@ -103,4 +124,11 @@ void checkasm_check_hevc_idct(void)
>         check_idct(h, bit_depth);
>     }
>     report("idct");
> +
> +    bit_depth = 8;
> +    HEVCDSPContext h;

This causes warnings:

src/tests/checkasm/hevc_idct.c: In function ‘checkasm_check_hevc_idct’:
src/tests/checkasm/hevc_idct.c:129:5: warning: ISO C90 forbids mixed
declarations and code [-Wdeclaration-after-statement]
      HEVCDSPContext h;
      ^~~~~~~~~~~~~~

And why not test all bitdepths like the other tests? Even if you might not 
be adding assembly for other bitdepths right now, it's best to make the 
test cover them all from the start.

// Martin
Fahad Mustafa April 12, 2023, 12:56 p.m. UTC | #2
Please don't send emails again.

On Wed, Apr 12, 2023, 5:54 PM Martin Storsjö <martin@martin.st> wrote:

> On Sun, 9 Apr 2023, xufuji456 wrote:
>
> > got 56% speed up (run_count=1000, CPU=Cortex A53)
> > transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
> > ---
> > libavcodec/aarch64/hevcdsp_idct_neon.S    | 51 ++++++++++++++++++++++-
> > libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> > tests/checkasm/hevc_idct.c                | 28 +++++++++++++
> > 3 files changed, 80 insertions(+), 1 deletion(-)
>
> When sending updates to your patches, please mention somewhere (outside of
> the main git commit message) what has changed. When I got back to looking
> at your patches now, I see a handful of revisions of the same patch with
> no explanation of what has changed.
>
> >
> > diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S
> b/libavcodec/aarch64/hevcdsp_idct_neon.S
> > index 74a96957bf..fc683a6396 100644
> > --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> > +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> > @@ -6,6 +6,7 @@
> >  * Ported from arm/hevcdsp_idct_neon.S by
> >  * Copyright (c) 2020 Reimar Döffinger
> >  * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
> > + * Copyright (c) 2023 xu fulong <839789740@qq.com>
> >  *
> >  * This file is part of FFmpeg.
> >  *
> > @@ -656,4 +657,52 @@ idct_dc 16, 8
> > idct_dc 16, 10
> >
> > idct_dc 32, 8
> > -idct_dc 32, 10
> > \ No newline at end of file
>
> The patch still does not apply cleanly on git master due to this. Please
> rebase your patch on top of the actual public git master.
>
> > +idct_dc 32, 10
> > +
> > +.macro tr4_luma_shift r0, r1, r2, r3, shift
> > +        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
> > +        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
> > +        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
> > +        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
> > +
> > +        saddl       v7.4s, \r0, \r3         // src0 + src3
> > +        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
> > +        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2
> + src3)
> > +
> > +        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
> > +        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
> > +        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
> > +        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1
> + c3
> > +
> > +        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
> > +        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
> > +        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
> > +        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1
> + c3
> > +
> > +        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
> > +        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
> > +        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
> > +        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2
> - c3
> > +
> > +        sqrshrn     \r0, v5.4s, \shift
> > +        sqrshrn     \r1, v6.4s, \shift
> > +        sqrshrn     \r2, v7.4s, \shift
> > +        sqrshrn     \r3, v0.4s, \shift
> > +.endm
> > +
> > +function ff_hevc_transform_luma_4x4_neon_8, export=1
> > +        ld1            {v28.4h-v31.4h}, [x0]
> > +        movi           v18.4s, #74
> > +        movi           v19.4s, #29
> > +        movi           v20.4s, #55
> > +        movi           v21.4h, #74
> > +
> > +        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
> > +        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> > +
> > +        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
> > +        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> > +
> > +        st1            {v28.4h-v31.4h}, [x0]
> > +        ret
> > +endfunc
> > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > index a923bae35c..6605a39973 100644
> > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > @@ -75,6 +75,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
> > +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
> > void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t
> *_src,
> >                                   ptrdiff_t stride_dst, ptrdiff_t
> stride_src,
> >                                   const int16_t *sao_offset_val, int
> sao_left_class,
> > @@ -142,6 +143,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
> *c, const int bit_depth)
> >         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
> >         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
> >         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
> > +        c->transform_4x4_luma          =
> ff_hevc_transform_luma_4x4_neon_8;
> >         c->sao_band_filter[0]          =
> >         c->sao_band_filter[1]          =
> >         c->sao_band_filter[2]          =
> > diff --git a/tests/checkasm/hevc_idct.c b/tests/checkasm/hevc_idct.c
> > index 338b8a23e4..1c2b08d0f8 100644
> > --- a/tests/checkasm/hevc_idct.c
> > +++ b/tests/checkasm/hevc_idct.c
> > @@ -84,6 +84,27 @@ static void check_idct_dc(HEVCDSPContext h, int
> bit_depth)
> >     }
> > }
> >
> > +static void check_transform_luma(HEVCDSPContext h)
>
> Unrelated to your patch, I see that this test file already does this
> elsewhere, but it is uncommon to pass such a context by value here;
> normally we'd pass it as a pointer to the subfunctions. But that's a
> preexisting issue, so stick with the current convention and we can change
> them all in a separate patch.
>
> > +{
> > +    LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]);
> > +    LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]);
> > +
> > +    int block_size = 4;
> > +    int size = block_size * block_size;
> > +    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, int16_t *coeffs);
> > +
> > +    randomize_buffers(coeffs0, size);
> > +    memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
> > +
> > +    if (check_func(h.transform_4x4_luma, "hevc_transform_4x4_luma")) {
> > +        call_ref(coeffs0);
> > +        call_new(coeffs1);
> > +        if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
> > +            fail();
> > +        bench_new(coeffs1);
> > +    }
> > +}
> > +
> > void checkasm_check_hevc_idct(void)
> > {
> >     int bit_depth;
> > @@ -103,4 +124,11 @@ void checkasm_check_hevc_idct(void)
> >         check_idct(h, bit_depth);
> >     }
> >     report("idct");
> > +
> > +    bit_depth = 8;
> > +    HEVCDSPContext h;
>
> This causes warnings:
>
> src/tests/checkasm/hevc_idct.c: In function ‘checkasm_check_hevc_idct’:
> src/tests/checkasm/hevc_idct.c:129:5: warning: ISO C90 forbids mixed
> declarations and code [-Wdeclaration-after-statement]
>       HEVCDSPContext h;
>       ^~~~~~~~~~~~~~
>
> And why not test all bitdepths like the other tests? Even if you might not
> be adding assembly for other bitdepths right now, it's best to make the
> test cover them all from the start.
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 74a96957bf..fc683a6396 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -6,6 +6,7 @@ 
  * Ported from arm/hevcdsp_idct_neon.S by
  * Copyright (c) 2020 Reimar Döffinger
  * Copyright (c) 2023 J. Dekker <jdek@itanimul.li>
+ * Copyright (c) 2023 xu fulong <839789740@qq.com>
  *
  * This file is part of FFmpeg.
  *
@@ -656,4 +657,52 @@  idct_dc 16, 8
 idct_dc 16, 10
 
 idct_dc 32, 8
-idct_dc 32, 10
\ No newline at end of file
+idct_dc 32, 10
+
+.macro tr4_luma_shift r0, r1, r2, r3, shift  // one 1-D pass of the 4x4 luma transform, lane by lane, in place on r0-r3; expects 74 in v18.4s and v21.4h, 29 in v19.4s, 55 in v20.4s; clobbers v0-v3 and v5-v7
+        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2 (widened to 32 bits)
+        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3 (widened to 32 bits)
+        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3 (widened to 32 bits)
+        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1 (16x16 -> 32-bit multiply)
+
+        saddl       v7.4s, \r0, \r3         // src0 + src3
+        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
+        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)
+
+        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
+        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
+        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
+        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3
+
+        mul         v1.4s, v1.4s, v19.4s    // 29 * c1 (c1 is not needed after this)
+        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
+        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
+        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3
+
+        mul         v0.4s, v0.4s, v20.4s    // 55 * c0 (c0 is not needed after this)
+        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
+        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
+        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3
+
+        sqrshrn     \r0, v5.4s, \shift      // dst0: rounding right shift, saturate and narrow to 16 bits
+        sqrshrn     \r1, v6.4s, \shift      // dst1: same rounding/saturating narrow
+        sqrshrn     \r2, v7.4s, \shift      // dst2: same rounding/saturating narrow
+        sqrshrn     \r3, v0.4s, \shift      // dst3: same rounding/saturating narrow
+.endm
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1   // in-place 4x4 luma transform of the 16 int16 coefficients at x0 (prototype: void f(int16_t *coeffs))
+        ld1            {v28.4h-v31.4h}, [x0]           // load the 4x4 block, four 16-bit values per register
+        movi           v18.4s, #74                     // 32-bit 74, multiplier for dst2 in tr4_luma_shift
+        movi           v19.4s, #29                     // 32-bit 29
+        movi           v20.4s, #55                     // 32-bit 55
+        movi           v21.4h, #74                     // 16-bit 74, operand of the widening smull (c3)
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7   // first 1-D pass, rounding shift by 7
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25   // transpose so the second pass runs along the other dimension; v22-v25 presumably scratch (macro defined outside this hunk)
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12  // second 1-D pass, rounding shift by 12
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25   // transpose back to the original orientation
+
+        st1            {v28.4h-v31.4h}, [x0]           // write the result over the input coefficients
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index a923bae35c..6605a39973 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -75,6 +75,7 @@  void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
 void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                   const int16_t *sao_offset_val, int sao_left_class,
@@ -142,6 +143,7 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
+        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
         c->sao_band_filter[0]          =
         c->sao_band_filter[1]          =
         c->sao_band_filter[2]          =
diff --git a/tests/checkasm/hevc_idct.c b/tests/checkasm/hevc_idct.c
index 338b8a23e4..1c2b08d0f8 100644
--- a/tests/checkasm/hevc_idct.c
+++ b/tests/checkasm/hevc_idct.c
@@ -84,6 +84,27 @@  static void check_idct_dc(HEVCDSPContext h, int bit_depth)
     }
 }
 
+static void check_transform_luma(HEVCDSPContext h) /* checkasm test: run h.transform_4x4_luma and the reference on the same random 4x4 block and require identical output */
+{
+    LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]); /* buffer for the reference run; only the first `size` entries are used here */
+    LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]); /* buffer for the implementation under test */
+
+    int block_size = 4;
+    int size = block_size * block_size; /* number of coefficients in one 4x4 block */
+    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, int16_t *coeffs); /* declares the tested function's signature for the checkasm call macros */
+
+    randomize_buffers(coeffs0, size); /* helper defined elsewhere in this file; presumably fills the first `size` entries with random data */
+    memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size); /* both runs start from identical input */
+
+    if (check_func(h.transform_4x4_luma, "hevc_transform_4x4_luma")) {
+        call_ref(coeffs0); /* the transform overwrites its input buffer */
+        call_new(coeffs1);
+        if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size)) /* outputs must match bit-exactly */
+            fail();
+        bench_new(coeffs1); /* benchmark run; coeffs1 already holds transformed data at this point */
+    }
+}
+
 void checkasm_check_hevc_idct(void)
 {
     int bit_depth;
@@ -103,4 +124,11 @@  void checkasm_check_hevc_idct(void)
         check_idct(h, bit_depth);
     }
     report("idct");
+
+    bit_depth = 8;
+    HEVCDSPContext h;
+
+    ff_hevc_dsp_init(&h, bit_depth);
+    check_transform_luma(h);
+    report("transform_luma");
 }