
[FFmpeg-devel,1/2] lavc/aarch64: new 8-bit hevc 16x16 idct

Message ID 20220623122311.20097-1-jdek@itanimul.li
State New
Series [FFmpeg-devel,1/2] lavc/aarch64: new 8-bit hevc 16x16 idct

Checks

Context Check Description
yinshiyou/make_fate_loongarch64 success Make fate finished
yinshiyou/make_loongarch64 warning New warnings during build
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

J. Dekker June 23, 2022, 12:23 p.m. UTC
old:
hevc_idct_16x16_8_c: 5366.2
hevc_idct_16x16_8_neon: 1493.2

new:
hevc_idct_16x16_8_c: 5363.2
hevc_idct_16x16_8_neon: 943.5

Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 666 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   3 +-
 2 files changed, 668 insertions(+), 1 deletion(-)

 This idct is significantly faster than the one we currently have, I
 suspect it's for a couple of reasons: 1) it's only written for 8-bit,
 2) it's unrolled significantly more. It comes at a hefty cost of
 roughly 2.25x the object size. I'm wondering if this idct is
 salvageable, or whether the one we have should just be improved
 instead.

Comments

Martin Storsjö Aug. 9, 2022, 12:15 p.m. UTC | #1
On Thu, 23 Jun 2022, J. Dekker wrote:

> old:
> hevc_idct_16x16_8_c: 5366.2
> hevc_idct_16x16_8_neon: 1493.2
>
> new:
> hevc_idct_16x16_8_c: 5363.2
> hevc_idct_16x16_8_neon: 943.5
>
> Co-developed-by: Rafal Dabrowa <fatwildcat@gmail.com>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 666 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |   3 +-
> 2 files changed, 668 insertions(+), 1 deletion(-)

Throughout the new code, you have e.g. "add x5, x5, x4, lsl 2", where the 
"lsl 2" breaks assembling with MS armasm64 - it's missing the '#' on the 
constant 2.

Also, for loads/stores, it seems to be missing the same '#' for 
postincrement, e.g. "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64", 
and likewise "mov x4, 64". Apparently armasm64 doesn't have a problem 
with those, but it would still be good to have it consistent with the 
rest.
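
For reference, the forms that should assemble everywhere would be:

        add             x5, x5, x4, lsl #2
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], #64
        mov             x4, #64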

> This idct is significantly faster than the one we currently have, I
> suspect it's for a couple of reasons: 1) it's only written for 8-bit

I don't see how that would change anything? Isn't the only thing that 
differs between 8 and 10/12 bit in the existing implementation how much 
to scale down at the end? All other intermediate values are the same 
size?
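
(For reference - if I read the spec right, the first pass shifts by 7 
for all bit depths and only the final narrowing differs, i.e. roughly:

        sqrshrn         v4.4h, v4.4s, #12    // 8 bit:  20 - 8
        sqrshrn         v4.4h, v4.4s, #10    // 10 bit: 20 - 10

so being 8-bit-only shouldn't buy much on its own.)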

> 2) it's unrolled significantly more. It comes at a hefty cost of
> roughly 2.25x the object size.

If by that, you mean that the existing code works on 4 elements at a time 
(i.e. mostly operating on .4h vectors), while this one operates on .8h 
vectors, then yes, that's most probably the biggest source of the speedup 
(even if a lot of the intermediate stuff happens in .4s vectors). The 
existing code was ported from the 32 bit arm version (which probably had 
to stick to 4 elements at a time due to register availability there), 
while it probably could have been made double width when it was ported to 
64 bit.
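
Roughly, where the existing code does one multiply per coefficient on 
.4h halves:

        smull           v0.4s, v18.4h, v15.h[0]
        smlal           v0.4s, v22.4h, v15.h[1]

the double width version just pairs each with its smull2/smlal2 
counterpart on the high half:

        smull           v0.4s, v18.4h, v15.h[0]
        smull2          v1.4s, v18.8h, v15.h[0]
        smlal           v0.4s, v22.4h, v15.h[1]
        smlal2          v1.4s, v22.8h, v15.h[1]

which is what the new code does throughout, at the cost of higher 
register pressure.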

> I'm wondering if this idct is salvageable, or whether the one we have
> should just be improved instead.

Well, my honest opinion is:

- I don't quite understand the current code (I've worked on the 
vp8/vp9/av1 IDCTs a fair amount, but the HEVC one seems to be different 
enough that I don't recognize all the concepts here).

- The current implementation would need to be reformatted if kept.

- The current implementation does have some rather clear high level 
structure though, e.g. when looking at the idct_16x16 macro.

- The new implementation seems to be just one huuuuge function. If you 
know it by heart, it's probably good, but it's really hard to get an 
overview of if you're not familiar with the HEVC IDCTs.

As for steps forward:
- Is it possible to widen the existing implementation to operate on 8 
elements instead of 4? I think that would bring it up to par with this 
one.
- Can you get some high level structure into the new implementation so 
that it becomes understandable? Either lots more comments explaining 
what's happening and why, or splitting it up into smaller macros.
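
For the latter, e.g. the eight near-identical smlal ladders in the 
first pass (the blocks at labels 7 through 14) look like they could be 
collapsed into a single macro, something like this (untested, name made 
up):

        .macro  smlal_o16 d0, d1
        smlal2          \d1\().4s, v31.8h, v15.h[7]
        smlal           \d0\().4s, v31.4h, v15.h[7]
        smlal2          \d1\().4s, v29.8h, v15.h[6]
        smlal           \d0\().4s, v29.4h, v15.h[6]
        smlal2          \d1\().4s, v27.8h, v15.h[5]
        smlal           \d0\().4s, v27.4h, v15.h[5]
        smlal2          \d1\().4s, v25.8h, v15.h[4]
        smlal           \d0\().4s, v25.4h, v15.h[4]
        smlal2          \d1\().4s, v23.8h, v15.h[3]
        smlal           \d0\().4s, v23.4h, v15.h[3]
        smlal2          \d1\().4s, v21.8h, v15.h[2]
        smlal           \d0\().4s, v21.4h, v15.h[2]
        smlal2          \d1\().4s, v19.8h, v15.h[1]
        smlal           \d0\().4s, v19.4h, v15.h[1]
        smlal2          \d1\().4s, v17.8h, v15.h[0]
        smlal           \d0\().4s, v17.4h, v15.h[0]
        .endm

invoked as e.g. "7: smlal_o16 v8, v9". Since the macro expands inline, 
the computed br targets into the middle of the ladder should keep 
working with the same offsets.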

Some more comments on the code itself below:

> +// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
> +function ff_hevc_idct_16x16_8_neon_new, export=1
> +        sub             sp, sp, 64
> +        st1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
> +        sub             sp, sp, 32
> +        st1             {v14.16b, v15.16b}, [sp]
> +        mov             x3, 0
> +        mov             x2, x0
> +1:      mov             x4, x2
> +        mov             x5, 32
> +        ld1             {v16.8h}, [x4], x5
> +        ld1             {v17.8h}, [x4], x5
> +        ld1             {v18.8h}, [x4], x5
> +        ld1             {v19.8h}, [x4], x5
> +        ld1             {v20.8h}, [x4], x5
> +        ld1             {v21.8h}, [x4], x5
> +        ld1             {v22.8h}, [x4], x5
> +        ld1             {v23.8h}, [x4], x5
> +        ld1             {v24.8h}, [x4], x5
> +        ld1             {v25.8h}, [x4], x5
> +        ld1             {v26.8h}, [x4], x5
> +        ld1             {v27.8h}, [x4], x5
> +        ld1             {v28.8h}, [x4], x5
> +        ld1             {v29.8h}, [x4], x5
> +        ld1             {v30.8h}, [x4], x5
> +        ld1             {v31.8h}, [x4], x5
> +        cmp             x1, 12
> +        b.hs            5f
> +        // limit2 below 16
> +        bic             x4, x1, 1
> +        adr             x5, .LimitMask
> +        cbnz            x3, 3f
> +        // columns 0 .. 7 - cleanup of indexes 5 .. 7
> +        ld1             {v0.8h}, [x5]
> +        adr             x5, 2f
> +        add             x5, x5, x4, lsl 2
> +        add             x5, x5, x4, lsl 1
> +        br              x5
> +2:      and             v17.16b, v17.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
> +        and             v19.16b, v19.16b, v0.16b
> +        b               5f

I don't really know what these jump tables do and how they correspond 
to things in the existing implementation - but I guess that can be one 
part of what makes things faster too.

The existing implementation does a 16x16 transform by first doing 4x 
transforms on a 4x16 piece of data, transposing that, then doing 
another 4x 4x16 for the second pass. How does the new implementation 
do it?

If I understand correctly, the old implementation didn't take col_limit 
into account at all. Can that be one part of what makes things faster - or 
is that only something that makes a difference in real use but not in 
checkasm benchmarks?

// Martin

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..784bae33b3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -618,3 +618,669 @@  idct_dc 16, 10
 
 idct_dc 32, 8
 idct_dc 32, 10
+
+// WIP
+
+.Lo0_coeff:     .hword  83, 36, 0, 0, 0, 0, 0, 0
+.Lo8transform0: .hword  89,  75,  50,  18               // transform[4,12,20,28][0]
+.Lo8transform1: .hword  75, -18, -89, -50
+.Lo8transform2: .hword  50, -89,  18,  75
+.Lo8transform3: .hword  18, -50,  75, -89
+
+.LimitMask:
+        .hword          0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0
+        .hword          0xffff,      0,      0,      0,      0, 0, 0, 0
+
+.Leo_coeff:
+        .hword          64,  64,  64,  64,  83,  36, -36, -83
+        .hword          64, -64, -64,  64,  36, -83,  83, -36
+        .hword          89,  75,  50,  18,  75, -18, -89, -50   // transform[4,12][0-3]
+        .hword          50, -89,  18,  75,  18, -50,  75, -89   // transform[20,28][0-3]
+.Lo16transform0: .hword 90,  87,  80,  70,  57,  43,  25,   9   // transform[2][0-7], also transform[2,6,10..][0]
+.Lo16transform1: .hword 87,  57,   9, -43, -80, -90, -70, -25   // transform[6][0-7]
+.Lo16transform2: .hword 80,   9, -70, -87, -25,  57,  90,  43   // transform[10][0-7]
+.Lo16transform3: .hword 70, -43, -87,   9,  90,  25, -80, -57   // transform[14][0-7]
+.Lo16transform4: .hword 57, -80, -25,  90,  -9, -87,  43,  70   // transform[18][0-7]
+.Lo16transform5: .hword 43, -90,  57,  25, -87,  70,   9, -80   // transform[22][0-7]
+.Lo16transform6: .hword 25, -70,  90, -80,  43,   9, -57,  87   // transform[26][0-7]
+.Lo16transform7: .hword  9, -25,  43, -57,  70, -80,  87, -90   // transform[30][0-7]
+
+// void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
+function ff_hevc_idct_16x16_8_neon_new, export=1
+        sub             sp, sp, 64
+        st1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        sub             sp, sp, 32
+        st1             {v14.16b, v15.16b}, [sp]
+        mov             x3, 0
+        mov             x2, x0
+1:      mov             x4, x2
+        mov             x5, 32
+        ld1             {v16.8h}, [x4], x5
+        ld1             {v17.8h}, [x4], x5
+        ld1             {v18.8h}, [x4], x5
+        ld1             {v19.8h}, [x4], x5
+        ld1             {v20.8h}, [x4], x5
+        ld1             {v21.8h}, [x4], x5
+        ld1             {v22.8h}, [x4], x5
+        ld1             {v23.8h}, [x4], x5
+        ld1             {v24.8h}, [x4], x5
+        ld1             {v25.8h}, [x4], x5
+        ld1             {v26.8h}, [x4], x5
+        ld1             {v27.8h}, [x4], x5
+        ld1             {v28.8h}, [x4], x5
+        ld1             {v29.8h}, [x4], x5
+        ld1             {v30.8h}, [x4], x5
+        ld1             {v31.8h}, [x4], x5
+        cmp             x1, 12
+        b.hs            5f
+        // limit2 below 16
+        bic             x4, x1, 1
+        adr             x5, .LimitMask
+        cbnz            x3, 3f
+        // columns 0 .. 7 - cleanup of indexes 5 .. 7
+        ld1             {v0.8h}, [x5]
+        adr             x5, 2f
+        add             x5, x5, x4, lsl 2
+        add             x5, x5, x4, lsl 1
+        br              x5
+2:      and             v17.16b, v17.16b, v0.16b    // col_limit 0..1 -> limit2 == 4..5
+        and             v19.16b, v19.16b, v0.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 2..3 -> limit2 == 6..7
+        and             v21.16b, v21.16b, v0.16b
+        b               5f
+        and             v21.16b, v21.16b, v0.16b    // col_limit 4..5 -> limit2 == 8..9
+        and             v23.16b, v23.16b, v0.16b
+        b               5f
+        and             v23.16b, v23.16b, v0.16b    // col_limit 6..7 -> limit2 == 10..11
+        and             v25.16b, v25.16b, v0.16b
+        b               5f
+        and             v25.16b, v25.16b, v0.16b    // col_limit 8..9 -> limit2 == 12..13
+        and             v27.16b, v27.16b, v0.16b
+        b               5f
+        and             v27.16b, v27.16b, v0.16b    // col_limit 10..11 -> limit2 == 14..15
+        and             v29.16b, v29.16b, v0.16b
+        b               5f
+        // columns 8 .. 15
+3:      subs            x4, x4, 2
+        b.lo            5f
+        ld1             {v0.8h, v1.8h}, [x5]
+        adr             x5, 4f
+        add             x5, x5, x4, lsl 3
+        add             x5, x5, x4, lsl 1
+        br              x5
+4:      and             v17.16b, v17.16b, v1.16b    // col_limit 2..3 -> limit2 == 2..3
+        b               5f
+        nop
+        nop
+        nop
+        and             v17.16b, v17.16b, v1.16b    // col_limit 4..5 -> limit2 == 4..5
+        and             v19.16b, v19.16b, v1.16b
+        b               5f
+        nop
+        nop
+        and             v17.16b, v17.16b, v0.16b    // col_limit 6..7 -> limit2 == 6..7
+        and             v19.16b, v19.16b, v1.16b
+        and             v21.16b, v21.16b, v1.16b
+        b               5f
+        nop
+        and             v17.16b, v17.16b, v0.16b    // col_limit 8..9 -> limit2 == 8..9
+        and             v19.16b, v19.16b, v0.16b
+        and             v21.16b, v21.16b, v1.16b
+        and             v23.16b, v23.16b, v1.16b
+        b               5f
+        and             v19.16b, v19.16b, v0.16b    // col_limit 10..11 -> limit2 == 10..11
+        and             v21.16b, v21.16b, v0.16b
+        and             v23.16b, v23.16b, v1.16b
+        and             v25.16b, v25.16b, v1.16b
+        b               5f
+5:      adr             x4, .Lo0_coeff
+        ld1             {v14.8h}, [x4]
+
+        // v0,v1 = e0
+        sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        add             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        add             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o0
+        smull           v2.4s, v20.4h, v14.h[0]
+        smlal           v2.4s, v28.4h, v14.h[1]
+        smull2          v3.4s, v20.8h, v14.h[0]
+        smlal2          v3.4s, v28.8h, v14.h[1]
+
+        // v4,v5 = e_8[0]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[3]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+
+        // v0,v1 = o_8[0]
+        adr             x4, .Lo8transform0
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[0]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[0]
+        adr             x4, .Lo16transform0
+        ld1             {v15.8h}, [x4]
+
+        mov             x5, 16
+        cmp             x1, 12
+        b.hs            6f
+        add             x5, x1, 4
+        bic             x5, x5, 1
+        cbz             x3, 6f
+        orr             x5, x1, 1
+        subs            x5, x5, 2
+        csel            x5, x5, xzr, hs
+6:      mov             x4, 64
+        sub             x6, x4, x5, lsl 2
+        adr             x5, 7f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+7:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[0 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        st1             {v10.8h}, [x2]
+
+        // tmp[15 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 15 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[7]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[7]
+        adr             x4, .Lo16transform7
+        ld1             {v15.8h}, [x4]
+        adr             x5, 8f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+8:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[7 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 7 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[8 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 8 * 32
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[3]
+        adr             x4, .Lo8transform3
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[3]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[3]
+        adr             x4, .Lo16transform3
+        ld1             {v15.8h}, [x4]
+        adr             x5, 9f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+9:      smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6] // 13
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5] // 11
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4] // 9
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3] // 7
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2] // 5
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1] // 3
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0] // 1
+
+        // tmp[3 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 3 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[12 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 12 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[4]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[4]
+        adr             x4, .Lo16transform4
+        ld1             {v15.8h}, [x4]
+        adr             x5, 10f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+10:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[4 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 4 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[11 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 11 * 32
+        st1             {v10.8h}, [x4]
+
+
+        // v0,v1 = e1
+        sshll           v0.4s, v16.4h, 6
+        sshll           v1.4s, v24.4h, 6
+        sub             v0.4s, v0.4s, v1.4s
+        sshll2          v1.4s, v16.8h, 6
+        sshll2          v2.4s, v24.8h, 6
+        sub             v1.4s, v1.4s, v2.4s
+
+        // v2,v3 = o1
+        smull           v2.4s, v20.4h, v14.h[1]
+        smlsl           v2.4s, v28.4h, v14.h[0]
+        smull2          v3.4s, v20.8h, v14.h[1]
+        smlsl2          v3.4s, v28.8h, v14.h[0]
+
+        // v4,v5 = e_8[1]
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+
+        // v6,v7 = e_8[2]
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+
+        // v0,v1 = o_8[1]
+        adr             x4, .Lo8transform1
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[1]
+        add             v2.4s, v4.4s, v0.4s
+        add             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[1]
+        adr             x4, .Lo16transform1
+        ld1             {v15.8h}, [x4]
+        adr             x5, 11f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+11:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[1 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 1 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[14 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 14 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[6]
+        sub             v2.4s, v4.4s, v0.4s
+        sub             v3.4s, v5.4s, v1.4s
+
+        // v8,v9 = o_16[6]
+        adr             x4, .Lo16transform6
+        ld1             {v15.8h}, [x4]
+        adr             x5, 12f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+12:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[6 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 6 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[9 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 9 * 32
+        st1             {v10.8h}, [x4]
+
+        // v0,v1 = o_8[2]
+        adr             x4, .Lo8transform2
+        ld1             {v15.4h}, [x4]
+        smull           v0.4s, v18.4h, v15.h[0]
+        smlal           v0.4s, v22.4h, v15.h[1]
+        smlal           v0.4s, v26.4h, v15.h[2]
+        smlal           v0.4s, v30.4h, v15.h[3]
+        smull2          v1.4s, v18.8h, v15.h[0]
+        smlal2          v1.4s, v22.8h, v15.h[1]
+        smlal2          v1.4s, v26.8h, v15.h[2]
+        smlal2          v1.4s, v30.8h, v15.h[3]
+
+        // v2,v3 = e_16[2]
+        add             v2.4s, v6.4s, v0.4s
+        add             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[2]
+        adr             x4, .Lo16transform2
+        ld1             {v15.8h}, [x4]
+        adr             x5, 13f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+13:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[2 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 2 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[13 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 13 * 32
+        st1             {v10.8h}, [x4]
+
+        // v2,v3 = e_16[5]
+        sub             v2.4s, v6.4s, v0.4s
+        sub             v3.4s, v7.4s, v1.4s
+
+        // v8,v9 = o_16[5]
+        adr             x4, .Lo16transform5
+        ld1             {v15.8h}, [x4]
+        adr             x5, 14f
+        add             x5, x5, x6
+        movi            v8.4s, 0
+        movi            v9.4s, 0
+        br              x5
+14:     smlal2          v9.4s, v31.8h, v15.h[7]
+        smlal           v8.4s, v31.4h, v15.h[7]
+        smlal2          v9.4s, v29.8h, v15.h[6]
+        smlal           v8.4s, v29.4h, v15.h[6]
+        smlal2          v9.4s, v27.8h, v15.h[5]
+        smlal           v8.4s, v27.4h, v15.h[5]
+        smlal2          v9.4s, v25.8h, v15.h[4]
+        smlal           v8.4s, v25.4h, v15.h[4]
+        smlal2          v9.4s, v23.8h, v15.h[3]
+        smlal           v8.4s, v23.4h, v15.h[3]
+        smlal2          v9.4s, v21.8h, v15.h[2]
+        smlal           v8.4s, v21.4h, v15.h[2]
+        smlal2          v9.4s, v19.8h, v15.h[1]
+        smlal           v8.4s, v19.4h, v15.h[1]
+        smlal2          v9.4s, v17.8h, v15.h[0]
+        smlal           v8.4s, v17.4h, v15.h[0]
+
+        // tmp[5 * 16]
+        add             v10.4s, v2.4s, v8.4s
+        add             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 5 * 32
+        st1             {v10.8h}, [x4]
+
+        // tmp[10 * 16]
+        sub             v10.4s, v2.4s, v8.4s
+        sub             v11.4s, v3.4s, v9.4s
+        sqrshrn         v10.4h, v10.4s, 7
+        sqrshrn2        v10.8h, v11.4s, 7
+        add             x4, x2, 10 * 32
+        st1             {v10.8h}, [x4]
+
+        add             x2, x2, 16
+        add             x3, x3, 1
+        cmp             x3, 2
+        b.lo            1b
+
+
+        // horizontal transform
+        adr             x4, .Leo_coeff
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64
+        // o_16 jump address
+        mov             x4, 64
+        bic             x5, x1, 1
+        subs            x4, x4, x5, lsl 2
+        csel            x4, x4, xzr, hs
+        adr             x5, 15f
+        add             x5, x5, x4
+
+        mov             x3, 16
+14:     ld1             {v6.8h, v7.8h}, [x0]
+
+        // v2 = e_8
+        smull           v2.4s, v16.4h, v6.h[0]
+        smlal2          v2.4s, v16.8h, v6.h[4]
+        smlal           v2.4s, v17.4h, v7.h[0]
+        smlal2          v2.4s, v17.8h, v7.h[4]
+
+        // v3 = o_8
+        smull           v3.4s, v18.4h, v6.h[2]
+        smlal2          v3.4s, v18.8h, v6.h[6]
+        smlal           v3.4s, v19.4h, v7.h[2]
+        smlal2          v3.4s, v19.8h, v7.h[6]
+
+        // v0,v1 = e_16
+        add             v0.4s, v2.4s, v3.4s
+        sub             v2.4s, v2.4s, v3.4s
+        mov             v1.d[0], v2.d[1]
+        mov             v1.d[1], v2.d[0]
+        rev64           v1.4s, v1.4s
+
+        // v2,v3 = o_16
+        movi            v2.4s, 0
+        movi            v3.4s, 0
+        br              x5
+15:     smlal           v2.4s, v27.4h, v7.h[7]
+        smlal2          v3.4s, v27.8h, v7.h[7]
+        smlal           v2.4s, v26.4h, v7.h[5]
+        smlal2          v3.4s, v26.8h, v7.h[5]
+        smlal           v2.4s, v25.4h, v7.h[3]
+        smlal2          v3.4s, v25.8h, v7.h[3]
+        smlal           v2.4s, v24.4h, v7.h[1]
+        smlal2          v3.4s, v24.8h, v7.h[1]
+        smlal           v2.4s, v23.4h, v6.h[7]
+        smlal2          v3.4s, v23.8h, v6.h[7]
+        smlal           v2.4s, v22.4h, v6.h[5]
+        smlal2          v3.4s, v22.8h, v6.h[5]
+        smlal           v2.4s, v21.4h, v6.h[3]
+        smlal2          v3.4s, v21.8h, v6.h[3]
+        smlal           v2.4s, v20.4h, v6.h[1]
+        smlal2          v3.4s, v20.8h, v6.h[1]
+
+        // coeff
+        add             v4.4s, v0.4s, v2.4s
+        add             v5.4s, v1.4s, v3.4s
+        sub             v6.4s, v0.4s, v2.4s
+        sub             v7.4s, v1.4s, v3.4s
+        sqrshrn         v4.4h, v4.4s, 12
+        sqrshrn2        v4.8h, v5.4s, 12
+        sqrshrn         v6.4h, v6.4s, 12
+        sqrshrn2        v6.8h, v7.4s, 12
+        mov             v5.d[0], v6.d[1]
+        mov             v5.d[1], v6.d[0]
+        rev64           v5.8h, v5.8h
+        st1             {v4.8h, v5.8h}, [x0], 32
+        subs            x3, x3, 1
+        b.ne            14b
+
+        ld1             {v14.16b, v15.16b}, [sp], 32
+        ld1             {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..612ebb9541 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -45,6 +45,7 @@  void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -72,7 +73,7 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
-        c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
+        c->idct[2]                     = ff_hevc_idct_16x16_8_neon_new;
         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;