Message ID | 20230604041756.5196-5-Logan.Lyu@myais.com.cn
---|---
State | New
Series | [FFmpeg-devel,1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels

Context | Check | Description
---|---|---
andriy/configure_x86 | warning | Failed to apply patch
yinshiyou/configure_loongarch64 | warning | Failed to apply patch
On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote: > From: Logan Lyu <Logan.Lyu@myais.com.cn> > > Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn> > --- > libavcodec/aarch64/hevcdsp_epel_neon.S | 703 ++++++++++++++++++++++ > libavcodec/aarch64/hevcdsp_init_aarch64.c | 7 + > 2 files changed, 710 insertions(+) > > diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S > index 32f052a7b1..24a74d2c7d 100644 > --- a/libavcodec/aarch64/hevcdsp_epel_neon.S > +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S > @@ -718,6 +718,709 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1 > ret > endfunc > > +.macro epel_uni_w_hv_start > + mov x15, x5 //denom > + mov x16, x6 //wx > + mov x17, x7 //ox > + add w15, w15, #6 //shift = denom+6 > + > + > + ldp x5, x6, [sp] > + ldp x7, xzr, [sp, #16] Why ldp into xzr, that seems pointless? > + > + sub sp, sp, #128 > + stp q12, q13, [sp] This could be "stp q12, q13, [sp, #-128]!" > + stp q14, q15, [sp, #32] > + stp q8, q9, [sp, #64] > + stp q10, q11, [sp, #96] > + > + dup v13.8h, w16 //wx > + dup v14.4s, w17 //ox > + > + mov w17, #1 > + lsl w17, w17, w15 > + lsr w17, w17, #1 > + dup v15.4s, w17 > + > + neg w15, w15 // -shift > + dup v12.4s, w15 //shift > +.endm > + > +.macro epel_uni_w_hv_end > + smull v28.4s, v4.4h, v13.4h > + smull2 v29.4s, v4.8h, v13.8h > + add v28.4s, v28.4s, v15.4s > + add v29.4s, v29.4s, v15.4s > + sshl v28.4s, v28.4s, v12.4s > + sshl v29.4s, v29.4s, v12.4s > + add v28.4s, v28.4s, v14.4s > + add v29.4s, v29.4s, v14.4s > + sqxtn v4.4h, v28.4s > + sqxtn2 v4.8h, v29.4s > +.endm > + > +.macro epel_uni_w_hv_end2 > + smull v28.4s, v4.4h, v13.4h > + smull2 v29.4s, v4.8h, v13.8h > + smull v30.4s, v5.4h, v13.4h > + smull2 v31.4s, v5.8h, v13.8h > + add v28.4s, v28.4s, v15.4s > + add v29.4s, v29.4s, v15.4s > + add v30.4s, v30.4s, v15.4s > + add v31.4s, v31.4s, v15.4s > + > + sshl v28.4s, v28.4s, v12.4s > + sshl v29.4s, v29.4s, v12.4s > + sshl v30.4s, v30.4s, v12.4s > + sshl v31.4s, v31.4s, v12.4s > + > + add v28.4s, v28.4s, v14.4s > + add v29.4s, v29.4s, v14.4s > + add v30.4s, v30.4s, v14.4s > + add v31.4s, v31.4s, v14.4s > + > + sqxtn v4.4h, v28.4s > + sqxtn2 v4.8h, v29.4s > + sqxtn v5.4h, v30.4s > + sqxtn2 v5.8h, v31.4s > +.endm > + > +.macro epel_uni_w_hv_end3 > + smull v1.4s, v4.4h, v13.4h > + smull2 v2.4s, v4.8h, v13.8h > + smull v28.4s, v5.4h, v13.4h > + smull2 v29.4s, v5.8h, v13.8h > + smull v30.4s, v6.4h, v13.4h > + smull2 v31.4s, v6.8h, v13.8h > + add v1.4s, v1.4s, v15.4s > + add v2.4s, v2.4s, v15.4s > + add v28.4s, v28.4s, v15.4s > + add v29.4s, v29.4s, v15.4s > + add v30.4s, v30.4s, v15.4s > + add v31.4s, v31.4s, v15.4s > + > + sshl v1.4s, v1.4s, v12.4s > + sshl v2.4s, v2.4s, v12.4s > + sshl v28.4s, v28.4s, v12.4s > + sshl v29.4s, v29.4s, v12.4s > + sshl v30.4s, v30.4s, v12.4s > + sshl v31.4s, v31.4s, v12.4s > + add v1.4s, v1.4s, v14.4s > + add v2.4s, v2.4s, v14.4s > + add v28.4s, v28.4s, v14.4s > + add v29.4s, v29.4s, v14.4s > + add v30.4s, v30.4s, v14.4s > + add v31.4s, v31.4s, v14.4s > + > + sqxtn v4.4h, v1.4s > + sqxtn2 v4.8h, v2.4s > + sqxtn v5.4h, v28.4s > + sqxtn2 v5.8h, v29.4s > + sqxtn v6.4h, v30.4s > + sqxtn2 v6.8h, v31.4s > +.endm > + > +.macro calc_epelh dst, src0, src1, src2, src3 > + smull \dst\().4s, \src0\().4h, v0.h[0] > + smlal \dst\().4s, \src1\().4h, v0.h[1] > + smlal \dst\().4s, \src2\().4h, v0.h[2] > + smlal \dst\().4s, \src3\().4h, v0.h[3] > + sqshrn \dst\().4h, \dst\().4s, #6 > +.endm > + > +.macro calc_epelh2 dst, tmp, src0, src1, src2, src3 > + smull2 \tmp\().4s, 
\src0\().8h, v0.h[0] > + smlal2 \tmp\().4s, \src1\().8h, v0.h[1] > + smlal2 \tmp\().4s, \src2\().8h, v0.h[2] > + smlal2 \tmp\().4s, \src3\().8h, v0.h[3] > + sqshrn2 \dst\().8h, \tmp\().4s, #6 > +.endm > + > +.macro load_epel_filterh freg, xreg > + movrel \xreg, epel_filters > + add \xreg, \xreg, \freg, lsl #2 > + ld1 {v0.8b}, [\xreg] > + sxtl v0.8h, v0.8b > +.endm > + > +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1 > + epel_uni_w_hv_start > + and x4, x4, 0xffffffff What does this "and" do here? Is it a case where the argument is "int", while the upper bits of the register is undefined? In those cases, you're best off by just using "w4", possibly "w4, uxtw" (or sxtw) instead of manually doing such an "and" here. > + > + add x10, x4, #3 > + lsl x10, x10, #7 > + sub sp, sp, x10 // tmp_array > + stp x0, x1, [sp, #-16]! > + stp x4, x6, [sp, #-16]! > + stp xzr, x30, [sp, #-16]! Don't do consecutive decrements like this, but do one "stp ..., [sp, #-48]!" followed by "stp ..., [sp, #16]" etc. > + add x0, sp, #48 > + sub x1, x2, x3 > + mov x2, x3 > + add x3, x4, #3 > + mov x4, x5 > + bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm) > + ldp xzr, x30, [sp], #16 > + ldp x4, x6, [sp], #16 > + ldp x0, x1, [sp], #16 > + load_epel_filterh x6, x5 > + mov x10, #(MAX_PB_SIZE * 2) > + ld1 {v16.4h}, [sp], x10 > + ld1 {v17.4h}, [sp], x10 > + ld1 {v18.4h}, [sp], x10 > +1: ld1 {v19.4h}, [sp], x10 > + calc_epelh v4, v16, v17, v18, v19 > + epel_uni_w_hv_end > + sqxtun v4.8b, v4.8h > + str s4, [x0] > + add x0, x0, x1 > + subs x4, x4, #1 > + b.eq 2f > + > + ld1 {v16.4h}, [sp], x10 > + calc_epelh v4, v17, v18, v19, v16 > + epel_uni_w_hv_end > + sqxtun v4.8b, v4.8h > + str s4, [x0] > + add x0, x0, x1 > + subs x4, x4, #1 > + b.eq 2f > + > + ld1 {v17.4h}, [sp], x10 > + calc_epelh v4, v18, v19, v16, v17 > + epel_uni_w_hv_end > + sqxtun v4.8b, v4.8h > + str s4, [x0] > + add x0, x0, x1 > + subs x4, x4, #1 > + b.eq 2f > + > + ld1 {v18.4h}, [sp], x10 > + calc_epelh v4, v19, v16, v17, v18 > + epel_uni_w_hv_end > + sqxtun v4.8b, v4.8h > + str s4, [x0] > + add x0, x0, x1 > + subs x4, x4, #1 > + b.ne 1b > +2: > + ldp q12, q13, [sp] > + ldp q14, q15, [sp, #32] > + ldp q8, q9, [sp, #64] > + ldp q10, q11, [sp, #96] > + add sp, sp, #128 Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128". The same thing applies to all other functions in this patch too. > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c > index 348497bbbe..fbbc4e6071 100644 > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c > @@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride, > int height, int denom, int wx, int ox, > intptr_t mx, intptr_t my, int width), _i8mm); > > +NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, > + const uint8_t *_src, ptrdiff_t _srcstride, > + int height, int denom, int wx, int ox, > + intptr_t mx, intptr_t my, int width), _i8mm); > + > NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, > const uint8_t *_src, ptrdiff_t _srcstride, > int height, int denom, int wx, int ox, > @@ -286,11 +291,13 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) > NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,); > NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,); > > + > if (have_i8mm(cpu_flags)) { Stray whitespace change. // Martin
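Taken together, the stack-handling requests above amount to a prologue/epilogue of the following shape (a minimal sketch reusing this patch's register choices; the committed code may differ):

    // One pre-indexed store both decrements sp and saves the first pair,
    // replacing the separate "sub sp, sp, #128" + "stp q12, q13, [sp]":
    stp q12, q13, [sp, #-128]!
    stp q14, q15, [sp, #32]
    stp q8, q9, [sp, #64]
    stp q10, q11, [sp, #96]

    // Likewise, merge the three consecutive 16-byte decrements into one:
    stp xzr, x30, [sp, #-48]!
    stp x4, x6, [sp, #16]
    stp x0, x1, [sp, #32]

    // On the way out, fold the stack increment into the last load pair:
    ldp q14, q15, [sp, #32]
    ldp q8, q9, [sp, #64]
    ldp q10, q11, [sp, #96]
    ldp q12, q13, [sp], #128

    // And for the 32-bit "height" argument, sign-extend once instead of masking:
    sxtw x4, w4    // rather than: and x4, x4, 0xffffffff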
Hi, Martin, I modified it according to your comments. Please review again. And here are the checkasm benchmark results of the related functions: put_hevc_epel_uni_w_hv4_8_c: 254.6 put_hevc_epel_uni_w_hv4_8_i8mm: 102.9 put_hevc_epel_uni_w_hv6_8_c: 411.6 put_hevc_epel_uni_w_hv6_8_i8mm: 221.6 put_hevc_epel_uni_w_hv8_8_c: 669.4 put_hevc_epel_uni_w_hv8_8_i8mm: 214.9 put_hevc_epel_uni_w_hv12_8_c: 1412.6 put_hevc_epel_uni_w_hv12_8_i8mm: 481.4 put_hevc_epel_uni_w_hv16_8_c: 2425.4 put_hevc_epel_uni_w_hv16_8_i8mm: 647.4 put_hevc_epel_uni_w_hv24_8_c: 5384.1 put_hevc_epel_uni_w_hv24_8_i8mm: 1450.6 put_hevc_epel_uni_w_hv32_8_c: 9470.9 put_hevc_epel_uni_w_hv32_8_i8mm: 2497.1 put_hevc_epel_uni_w_hv48_8_c: 20930.1 put_hevc_epel_uni_w_hv48_8_i8mm: 5635.9 put_hevc_epel_uni_w_hv64_8_c: 36682.9 put_hevc_epel_uni_w_hv64_8_i8mm: 9712.6 在 2023/6/12 16:19, Martin Storsjö 写道: > On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote: > >> From: Logan Lyu <Logan.Lyu@myais.com.cn> >> >> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn> >> --- >> libavcodec/aarch64/hevcdsp_epel_neon.S | 703 ++++++++++++++++++++++ >> libavcodec/aarch64/hevcdsp_init_aarch64.c | 7 + >> 2 files changed, 710 insertions(+) >> >> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S >> b/libavcodec/aarch64/hevcdsp_epel_neon.S >> index 32f052a7b1..24a74d2c7d 100644 >> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S >> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S >> @@ -718,6 +718,709 @@ function >> ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1 >> ret >> endfunc >> >> +.macro epel_uni_w_hv_start >> + mov x15, x5 //denom >> + mov x16, x6 //wx >> + mov x17, x7 //ox >> + add w15, w15, #6 //shift = denom+6 >> + >> + >> + ldp x5, x6, [sp] >> + ldp x7, xzr, [sp, #16] > > Why ldp into xzr, that seems pointless? > >> + >> + sub sp, sp, #128 >> + stp q12, q13, [sp] > > This could be "stp q12, q13, [sp, #-128]!" 
> >> + stp q14, q15, [sp, #32] >> + stp q8, q9, [sp, #64] >> + stp q10, q11, [sp, #96] >> + >> + dup v13.8h, w16 //wx >> + dup v14.4s, w17 //ox >> + >> + mov w17, #1 >> + lsl w17, w17, w15 >> + lsr w17, w17, #1 >> + dup v15.4s, w17 >> + >> + neg w15, w15 // -shift >> + dup v12.4s, w15 //shift >> +.endm >> + >> +.macro epel_uni_w_hv_end >> + smull v28.4s, v4.4h, v13.4h >> + smull2 v29.4s, v4.8h, v13.8h >> + add v28.4s, v28.4s, v15.4s >> + add v29.4s, v29.4s, v15.4s >> + sshl v28.4s, v28.4s, v12.4s >> + sshl v29.4s, v29.4s, v12.4s >> + add v28.4s, v28.4s, v14.4s >> + add v29.4s, v29.4s, v14.4s >> + sqxtn v4.4h, v28.4s >> + sqxtn2 v4.8h, v29.4s >> +.endm >> + >> +.macro epel_uni_w_hv_end2 >> + smull v28.4s, v4.4h, v13.4h >> + smull2 v29.4s, v4.8h, v13.8h >> + smull v30.4s, v5.4h, v13.4h >> + smull2 v31.4s, v5.8h, v13.8h >> + add v28.4s, v28.4s, v15.4s >> + add v29.4s, v29.4s, v15.4s >> + add v30.4s, v30.4s, v15.4s >> + add v31.4s, v31.4s, v15.4s >> + >> + sshl v28.4s, v28.4s, v12.4s >> + sshl v29.4s, v29.4s, v12.4s >> + sshl v30.4s, v30.4s, v12.4s >> + sshl v31.4s, v31.4s, v12.4s >> + >> + add v28.4s, v28.4s, v14.4s >> + add v29.4s, v29.4s, v14.4s >> + add v30.4s, v30.4s, v14.4s >> + add v31.4s, v31.4s, v14.4s >> + >> + sqxtn v4.4h, v28.4s >> + sqxtn2 v4.8h, v29.4s >> + sqxtn v5.4h, v30.4s >> + sqxtn2 v5.8h, v31.4s >> +.endm >> + >> +.macro epel_uni_w_hv_end3 >> + smull v1.4s, v4.4h, v13.4h >> + smull2 v2.4s, v4.8h, v13.8h >> + smull v28.4s, v5.4h, v13.4h >> + smull2 v29.4s, v5.8h, v13.8h >> + smull v30.4s, v6.4h, v13.4h >> + smull2 v31.4s, v6.8h, v13.8h >> + add v1.4s, v1.4s, v15.4s >> + add v2.4s, v2.4s, v15.4s >> + add v28.4s, v28.4s, v15.4s >> + add v29.4s, v29.4s, v15.4s >> + add v30.4s, v30.4s, v15.4s >> + add v31.4s, v31.4s, v15.4s >> + >> + sshl v1.4s, v1.4s, v12.4s >> + sshl v2.4s, v2.4s, v12.4s >> + sshl v28.4s, v28.4s, v12.4s >> + sshl v29.4s, v29.4s, v12.4s >> + sshl v30.4s, v30.4s, v12.4s >> + sshl v31.4s, v31.4s, v12.4s >> + add v1.4s, v1.4s, v14.4s >> + add v2.4s, v2.4s, v14.4s >> + add v28.4s, v28.4s, v14.4s >> + add v29.4s, v29.4s, v14.4s >> + add v30.4s, v30.4s, v14.4s >> + add v31.4s, v31.4s, v14.4s >> + >> + sqxtn v4.4h, v1.4s >> + sqxtn2 v4.8h, v2.4s >> + sqxtn v5.4h, v28.4s >> + sqxtn2 v5.8h, v29.4s >> + sqxtn v6.4h, v30.4s >> + sqxtn2 v6.8h, v31.4s >> +.endm >> + >> +.macro calc_epelh dst, src0, src1, src2, src3 >> + smull \dst\().4s, \src0\().4h, v0.h[0] >> + smlal \dst\().4s, \src1\().4h, v0.h[1] >> + smlal \dst\().4s, \src2\().4h, v0.h[2] >> + smlal \dst\().4s, \src3\().4h, v0.h[3] >> + sqshrn \dst\().4h, \dst\().4s, #6 >> +.endm >> + >> +.macro calc_epelh2 dst, tmp, src0, src1, src2, src3 >> + smull2 \tmp\().4s, \src0\().8h, v0.h[0] >> + smlal2 \tmp\().4s, \src1\().8h, v0.h[1] >> + smlal2 \tmp\().4s, \src2\().8h, v0.h[2] >> + smlal2 \tmp\().4s, \src3\().8h, v0.h[3] >> + sqshrn2 \dst\().8h, \tmp\().4s, #6 >> +.endm >> + >> +.macro load_epel_filterh freg, xreg >> + movrel \xreg, epel_filters >> + add \xreg, \xreg, \freg, lsl #2 >> + ld1 {v0.8b}, [\xreg] >> + sxtl v0.8h, v0.8b >> +.endm >> + >> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1 >> + epel_uni_w_hv_start >> + and x4, x4, 0xffffffff > > What does this "and" do here? Is it a case where the argument is > "int", while the upper bits of the register is undefined? In those > cases, you're best off by just using "w4", possibly "w4, uxtw" (or > sxtw) instead of manually doing such an "and" here. 
> >> + >> + add x10, x4, #3 >> + lsl x10, x10, #7 >> + sub sp, sp, x10 // tmp_array >> + stp x0, x1, [sp, #-16]! >> + stp x4, x6, [sp, #-16]! >> + stp xzr, x30, [sp, #-16]! > > Don't do consecutive decrements like this, but do one "stp ..., [sp, > #-48]!" followed by "stp ..., [sp, #16]" etc. > >> + add x0, sp, #48 >> + sub x1, x2, x3 >> + mov x2, x3 >> + add x3, x4, #3 >> + mov x4, x5 >> + bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm) >> + ldp xzr, x30, [sp], #16 >> + ldp x4, x6, [sp], #16 >> + ldp x0, x1, [sp], #16 >> + load_epel_filterh x6, x5 >> + mov x10, #(MAX_PB_SIZE * 2) >> + ld1 {v16.4h}, [sp], x10 >> + ld1 {v17.4h}, [sp], x10 >> + ld1 {v18.4h}, [sp], x10 >> +1: ld1 {v19.4h}, [sp], x10 >> + calc_epelh v4, v16, v17, v18, v19 >> + epel_uni_w_hv_end >> + sqxtun v4.8b, v4.8h >> + str s4, [x0] >> + add x0, x0, x1 >> + subs x4, x4, #1 >> + b.eq 2f >> + >> + ld1 {v16.4h}, [sp], x10 >> + calc_epelh v4, v17, v18, v19, v16 >> + epel_uni_w_hv_end >> + sqxtun v4.8b, v4.8h >> + str s4, [x0] >> + add x0, x0, x1 >> + subs x4, x4, #1 >> + b.eq 2f >> + >> + ld1 {v17.4h}, [sp], x10 >> + calc_epelh v4, v18, v19, v16, v17 >> + epel_uni_w_hv_end >> + sqxtun v4.8b, v4.8h >> + str s4, [x0] >> + add x0, x0, x1 >> + subs x4, x4, #1 >> + b.eq 2f >> + >> + ld1 {v18.4h}, [sp], x10 >> + calc_epelh v4, v19, v16, v17, v18 >> + epel_uni_w_hv_end >> + sqxtun v4.8b, v4.8h >> + str s4, [x0] >> + add x0, x0, x1 >> + subs x4, x4, #1 >> + b.ne 1b >> +2: >> + ldp q12, q13, [sp] >> + ldp q14, q15, [sp, #32] >> + ldp q8, q9, [sp, #64] >> + ldp q10, q11, [sp, #96] >> + add sp, sp, #128 > > Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128". > > The same thing applies to all other functions in this patch too. > >> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c >> b/libavcodec/aarch64/hevcdsp_init_aarch64.c >> index 348497bbbe..fbbc4e6071 100644 >> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c >> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c >> @@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, >> ptrdiff_t _dststride, >> int height, int denom, int wx, int ox, >> intptr_t mx, intptr_t my, int width), _i8mm); >> >> +NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, >> + const uint8_t *_src, ptrdiff_t _srcstride, >> + int height, int denom, int wx, int ox, >> + intptr_t mx, intptr_t my, int width), _i8mm); >> + >> NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t >> _dststride, >> const uint8_t *_src, ptrdiff_t _srcstride, >> int height, int denom, int wx, int ox, >> @@ -286,11 +291,13 @@ av_cold void >> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) >> NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,); >> NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, >> qpel_uni_w_v,); >> >> + >> if (have_i8mm(cpu_flags)) { > > Stray whitespace change. 
> > // Martin > From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001 From: Logan Lyu <Logan.Lyu@myais.com.cn> Date: Sun, 28 May 2023 10:35:43 +0800 Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn> --- libavcodec/aarch64/hevcdsp_epel_neon.S | 694 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 + 2 files changed, 700 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S index 8b6f396a0b..355679af29 100644 --- a/libavcodec/aarch64/hevcdsp_epel_neon.S +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S @@ -717,6 +717,700 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1 ret endfunc +.macro epel_uni_w_hv_start + mov x15, x5 //denom + mov x16, x6 //wx + mov x17, x7 //ox + add w15, w15, #6 //shift = denom+6 + + + ldp x5, x6, [sp] + ldr x7, [sp, #16] + + stp q12, q13, [sp, #-128]! + stp q14, q15, [sp, #32] + stp q8, q9, [sp, #64] + stp q10, q11, [sp, #96] + + dup v13.8h, w16 //wx + dup v14.4s, w17 //ox + + mov w17, #1 + lsl w17, w17, w15 + lsr w17, w17, #1 + dup v15.4s, w17 + + neg w15, w15 // -shift + dup v12.4s, w15 //shift +.endm + +.macro epel_uni_w_hv_end + smull v28.4s, v4.4h, v13.4h + smull2 v29.4s, v4.8h, v13.8h + add v28.4s, v28.4s, v15.4s + add v29.4s, v29.4s, v15.4s + sshl v28.4s, v28.4s, v12.4s + sshl v29.4s, v29.4s, v12.4s + add v28.4s, v28.4s, v14.4s + add v29.4s, v29.4s, v14.4s + sqxtn v4.4h, v28.4s + sqxtn2 v4.8h, v29.4s +.endm + +.macro epel_uni_w_hv_end2 + smull v28.4s, v4.4h, v13.4h + smull2 v29.4s, v4.8h, v13.8h + smull v30.4s, v5.4h, v13.4h + smull2 v31.4s, v5.8h, v13.8h + add v28.4s, v28.4s, v15.4s + add v29.4s, v29.4s, v15.4s + add v30.4s, v30.4s, v15.4s + add v31.4s, v31.4s, v15.4s + + sshl v28.4s, v28.4s, v12.4s + sshl v29.4s, v29.4s, v12.4s + sshl v30.4s, v30.4s, v12.4s + sshl v31.4s, v31.4s, v12.4s + + add v28.4s, v28.4s, v14.4s + add v29.4s, v29.4s, v14.4s + add v30.4s, v30.4s, v14.4s + add v31.4s, v31.4s, v14.4s + + sqxtn v4.4h, v28.4s + sqxtn2 v4.8h, v29.4s + sqxtn v5.4h, v30.4s + sqxtn2 v5.8h, v31.4s +.endm + +.macro epel_uni_w_hv_end3 + smull v1.4s, v4.4h, v13.4h + smull2 v2.4s, v4.8h, v13.8h + smull v28.4s, v5.4h, v13.4h + smull2 v29.4s, v5.8h, v13.8h + smull v30.4s, v6.4h, v13.4h + smull2 v31.4s, v6.8h, v13.8h + add v1.4s, v1.4s, v15.4s + add v2.4s, v2.4s, v15.4s + add v28.4s, v28.4s, v15.4s + add v29.4s, v29.4s, v15.4s + add v30.4s, v30.4s, v15.4s + add v31.4s, v31.4s, v15.4s + + sshl v1.4s, v1.4s, v12.4s + sshl v2.4s, v2.4s, v12.4s + sshl v28.4s, v28.4s, v12.4s + sshl v29.4s, v29.4s, v12.4s + sshl v30.4s, v30.4s, v12.4s + sshl v31.4s, v31.4s, v12.4s + add v1.4s, v1.4s, v14.4s + add v2.4s, v2.4s, v14.4s + add v28.4s, v28.4s, v14.4s + add v29.4s, v29.4s, v14.4s + add v30.4s, v30.4s, v14.4s + add v31.4s, v31.4s, v14.4s + + sqxtn v4.4h, v1.4s + sqxtn2 v4.8h, v2.4s + sqxtn v5.4h, v28.4s + sqxtn2 v5.8h, v29.4s + sqxtn v6.4h, v30.4s + sqxtn2 v6.8h, v31.4s +.endm + +.macro calc_epelh dst, src0, src1, src2, src3 + smull \dst\().4s, \src0\().4h, v0.h[0] + smlal \dst\().4s, \src1\().4h, v0.h[1] + smlal \dst\().4s, \src2\().4h, v0.h[2] + smlal \dst\().4s, \src3\().4h, v0.h[3] + sqshrn \dst\().4h, \dst\().4s, #6 +.endm + +.macro calc_epelh2 dst, tmp, src0, src1, src2, src3 + smull2 \tmp\().4s, \src0\().8h, v0.h[0] + smlal2 \tmp\().4s, \src1\().8h, v0.h[1] + smlal2 \tmp\().4s, \src2\().8h, v0.h[2] + smlal2 \tmp\().4s, \src3\().8h, v0.h[3] + sqshrn2 \dst\().8h, \tmp\().4s, #6 +.endm + +.macro 
load_epel_filterh freg, xreg + movrel \xreg, epel_filters + add \xreg, \xreg, \freg, lsl #2 + ld1 {v0.8b}, [\xreg] + sxtl v0.8h, v0.8b +.endm + +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1 + epel_uni_w_hv_start + sxtw x4, w4 + + add x10, x4, #3 + lsl x10, x10, #7 + sub sp, sp, x10 // tmp_array + stp xzr, x30, [sp, #-48]! + stp x4, x6, [sp, #16] + stp x0, x1, [sp, #32] + add x0, sp, #48 + sub x1, x2, x3 + mov x2, x3 + add x3, x4, #3 + mov x4, x5 + bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm) + ldp x4, x6, [sp, #16] + ldp x0, x1, [sp, #32] + ldp xzr, x30, [sp], #48 + load_epel_filterh x6, x5 + mov x10, #(MAX_PB_SIZE * 2) + ld1 {v16.4h}, [sp], x10 + ld1 {v17.4h}, [sp], x10 + ld1 {v18.4h}, [sp], x10 +1: ld1 {v19.4h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v16, v17, v18, v19 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + str s4, [x0] + add x0, x0, x1 + b.eq 2f + + ld1 {v16.4h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v17, v18, v19, v16 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + str s4, [x0] + add x0, x0, x1 + b.eq 2f + + ld1 {v17.4h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v18, v19, v16, v17 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + str s4, [x0] + add x0, x0, x1 + b.eq 2f + + ld1 {v18.4h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v19, v16, v17, v18 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + str s4, [x0] + add x0, x0, x1 + b.ne 1b +2: + ldp q14, q15, [sp, #32] + ldp q8, q9, [sp, #64] + ldp q10, q11, [sp, #96] + ldp q12, q13, [sp], #128 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1 + epel_uni_w_hv_start + sxtw x4, w4 + + add x10, x4, #3 + lsl x10, x10, #7 + sub sp, sp, x10 // tmp_array + stp xzr, x30, [sp, #-48]! + stp x4, x6, [sp, #16] + stp x0, x1, [sp, #32] + add x0, sp, #48 + sub x1, x2, x3 + mov x2, x3 + add x3, x4, #3 + mov x4, x5 + bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm) + ldp x4, x6, [sp, #16] + ldp x0, x1, [sp, #32] + ldp xzr, x30, [sp], #48 + load_epel_filterh x6, x5 + sub x1, x1, #4 + mov x10, #(MAX_PB_SIZE * 2) + ld1 {v16.8h}, [sp], x10 + ld1 {v17.8h}, [sp], x10 + ld1 {v18.8h}, [sp], x10 +1: ld1 {v19.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v16, v17, v18, v19 + calc_epelh2 v4, v5, v16, v17, v18, v19 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.s}[0], [x0], #4 + st1 {v4.h}[2], [x0], x1 + b.eq 2f + + ld1 {v16.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v17, v18, v19, v16 + calc_epelh2 v4, v5, v17, v18, v19, v16 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.s}[0], [x0], #4 + st1 {v4.h}[2], [x0], x1 + b.eq 2f + + ld1 {v17.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v18, v19, v16, v17 + calc_epelh2 v4, v5, v18, v19, v16, v17 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.s}[0], [x0], #4 + st1 {v4.h}[2], [x0], x1 + b.eq 2f + + ld1 {v18.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v19, v16, v17, v18 + calc_epelh2 v4, v5, v19, v16, v17, v18 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.s}[0], [x0], #4 + st1 {v4.h}[2], [x0], x1 + b.ne 1b +2: + ldp q14, q15, [sp, #32] + ldp q8, q9, [sp, #64] + ldp q10, q11, [sp, #96] + ldp q12, q13, [sp], #128 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1 + epel_uni_w_hv_start + sxtw x4, w4 + + add x10, x4, #3 + lsl x10, x10, #7 + sub sp, sp, x10 // tmp_array + stp xzr, x30, [sp, #-48]! 
+ stp x4, x6, [sp, #16] + stp x0, x1, [sp, #32] + add x0, sp, #48 + sub x1, x2, x3 + mov x2, x3 + add x3, x4, #3 + mov x4, x5 + bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm) + ldp x4, x6, [sp, #16] + ldp x0, x1, [sp, #32] + ldp xzr, x30, [sp], #48 + load_epel_filterh x6, x5 + mov x10, #(MAX_PB_SIZE * 2) + ld1 {v16.8h}, [sp], x10 + ld1 {v17.8h}, [sp], x10 + ld1 {v18.8h}, [sp], x10 +1: ld1 {v19.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v16, v17, v18, v19 + calc_epelh2 v4, v5, v16, v17, v18, v19 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.8b}, [x0], x1 + b.eq 2f + + ld1 {v16.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v17, v18, v19, v16 + calc_epelh2 v4, v5, v17, v18, v19, v16 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.8b}, [x0], x1 + b.eq 2f + + ld1 {v17.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v18, v19, v16, v17 + calc_epelh2 v4, v5, v18, v19, v16, v17 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.8b}, [x0], x1 + b.eq 2f + + ld1 {v18.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v19, v16, v17, v18 + calc_epelh2 v4, v5, v19, v16, v17, v18 + epel_uni_w_hv_end + sqxtun v4.8b, v4.8h + st1 {v4.8b}, [x0], x1 + b.ne 1b +2: + ldp q14, q15, [sp, #32] + ldp q8, q9, [sp, #64] + ldp q10, q11, [sp, #96] + ldp q12, q13, [sp], #128 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1 + epel_uni_w_hv_start + sxtw x4, w4 + + add x10, x4, #3 + lsl x10, x10, #7 + sub sp, sp, x10 // tmp_array + stp xzr, x30, [sp, #-48]! + stp x4, x6, [sp, #16] + stp x0, x1, [sp, #32] + add x0, sp, #48 + sub x1, x2, x3 + mov x2, x3 + add x3, x4, #3 + mov x4, x5 + bl X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm) + ldp x4, x6, [sp, #16] + ldp x0, x1, [sp, #32] + ldp xzr, x30, [sp], #48 + load_epel_filterh x6, x5 + sub x1, x1, #8 + mov x10, #(MAX_PB_SIZE * 2) + ld1 {v16.8h, v17.8h}, [sp], x10 + ld1 {v18.8h, v19.8h}, [sp], x10 + ld1 {v20.8h, v21.8h}, [sp], x10 +1: ld1 {v22.8h, v23.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v16, v18, v20, v22 + calc_epelh2 v4, v5, v16, v18, v20, v22 + calc_epelh v5, v17, v19, v21, v23 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.8b}, [x0], #8 + st1 {v4.s}[2], [x0], x1 + b.eq 2f + + ld1 {v16.8h, v17.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v18, v20, v22, v16 + calc_epelh2 v4, v5, v18, v20, v22, v16 + calc_epelh v5, v19, v21, v23, v17 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.8b}, [x0], #8 + st1 {v4.s}[2], [x0], x1 + b.eq 2f + + ld1 {v18.8h, v19.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v20, v22, v16, v18 + calc_epelh2 v4, v5, v20, v22, v16, v18 + calc_epelh v5, v21, v23, v17, v19 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.8b}, [x0], #8 + st1 {v4.s}[2], [x0], x1 + b.eq 2f + + ld1 {v20.8h, v21.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v22, v16, v18, v20 + calc_epelh2 v4, v5, v22, v16, v18, v20 + calc_epelh v5, v23, v17, v19, v21 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.8b}, [x0], #8 + st1 {v4.s}[2], [x0], x1 + b.ne 1b +2: + ldp q14, q15, [sp, #32] + ldp q8, q9, [sp, #64] + ldp q10, q11, [sp, #96] + ldp q12, q13, [sp], #128 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1 + epel_uni_w_hv_start + sxtw x4, w4 + + add x10, x4, #3 + lsl x10, x10, #7 + sub sp, sp, x10 // tmp_array + stp xzr, x30, [sp, #-48]! 
+ stp x4, x6, [sp, #16] + stp x0, x1, [sp, #32] + add x0, sp, #48 + sub x1, x2, x3 + mov x2, x3 + add x3, x4, #3 + mov x4, x5 + bl X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm) + ldp x4, x6, [sp, #16] + ldp x0, x1, [sp, #32] + ldp xzr, x30, [sp], #48 + load_epel_filterh x6, x5 + mov x10, #(MAX_PB_SIZE * 2) + ld1 {v16.8h, v17.8h}, [sp], x10 + ld1 {v18.8h, v19.8h}, [sp], x10 + ld1 {v20.8h, v21.8h}, [sp], x10 +1: ld1 {v22.8h, v23.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v16, v18, v20, v22 + calc_epelh2 v4, v5, v16, v18, v20, v22 + calc_epelh v5, v17, v19, v21, v23 + calc_epelh2 v5, v6, v17, v19, v21, v23 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.16b}, [x0], x1 + b.eq 2f + + ld1 {v16.8h, v17.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v18, v20, v22, v16 + calc_epelh2 v4, v5, v18, v20, v22, v16 + calc_epelh v5, v19, v21, v23, v17 + calc_epelh2 v5, v6, v19, v21, v23, v17 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.16b}, [x0], x1 + b.eq 2f + + ld1 {v18.8h, v19.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v20, v22, v16, v18 + calc_epelh2 v4, v5, v20, v22, v16, v18 + calc_epelh v5, v21, v23, v17, v19 + calc_epelh2 v5, v6, v21, v23, v17, v19 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.16b}, [x0], x1 + b.eq 2f + + ld1 {v20.8h, v21.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v22, v16, v18, v20 + calc_epelh2 v4, v5, v22, v16, v18, v20 + calc_epelh v5, v23, v17, v19, v21 + calc_epelh2 v5, v6, v23, v17, v19, v21 + epel_uni_w_hv_end2 + sqxtun v4.8b, v4.8h + sqxtun2 v4.16b, v5.8h + st1 {v4.16b}, [x0], x1 + b.ne 1b +2: + ldp q14, q15, [sp, #32] + ldp q8, q9, [sp, #64] + ldp q10, q11, [sp, #96] + ldp q12, q13, [sp], #128 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1 + epel_uni_w_hv_start + sxtw x4, w4 + + add x10, x4, #3 + lsl x10, x10, #7 + sub sp, sp, x10 // tmp_array + stp xzr, x30, [sp, #-48]! 
+ stp x4, x6, [sp, #16] + stp x0, x1, [sp, #32] + add x0, sp, #48 + sub x1, x2, x3 + mov x2, x3 + add x3, x4, #3 + mov x4, x5 + bl X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm) + ldp x4, x6, [sp, #16] + ldp x0, x1, [sp, #32] + ldp xzr, x30, [sp], #48 + load_epel_filterh x6, x5 + mov x10, #(MAX_PB_SIZE * 2) + ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 + ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 + ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 +1: ld1 {v25.8h, v26.8h, v27.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v16, v19, v22, v25 + calc_epelh2 v4, v5, v16, v19, v22, v25 + calc_epelh v5, v17, v20, v23, v26 + calc_epelh2 v5, v6, v17, v20, v23, v26 + calc_epelh v6, v18, v21, v24, v27 + calc_epelh2 v6, v7, v18, v21, v24, v27 + + epel_uni_w_hv_end3 + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + sqxtun v6.8b, v6.8h + st1 {v4.8b, v5.8b, v6.8b}, [x0], x1 + b.eq 2f + + ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v19, v22, v25, v16 + calc_epelh2 v4, v5, v19, v22, v25, v16 + calc_epelh v5, v20, v23, v26, v17 + calc_epelh2 v5, v6, v20, v23, v26, v17 + calc_epelh v6, v21, v24, v27, v18 + calc_epelh2 v6, v7, v21, v24, v27, v18 + epel_uni_w_hv_end3 + + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + sqxtun v6.8b, v6.8h + st1 {v4.8b, v5.8b, v6.8b}, [x0], x1 + b.eq 2f + + ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v22, v25, v16, v19 + calc_epelh2 v4, v5, v22, v25, v16, v19 + calc_epelh v5, v23, v26, v17, v20 + calc_epelh2 v5, v6, v23, v26, v17, v20 + calc_epelh v6, v24, v27, v18, v21 + calc_epelh2 v6, v7, v24, v27, v18, v21 + epel_uni_w_hv_end3 + + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + sqxtun v6.8b, v6.8h + st1 {v4.8b, v5.8b, v6.8b}, [x0], x1 + b.eq 2f + + ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 + subs x4, x4, #1 + calc_epelh v4, v25, v16, v19, v22 + calc_epelh2 v4, v5, v25, v16, v19, v22 + calc_epelh v5, v26, v17, v20, v23 + calc_epelh2 v5, v6, v26, v17, v20, v23 + calc_epelh v6, v27, v18, v21, v24 + calc_epelh2 v6, v7, v27, v18, v21, v24 + epel_uni_w_hv_end3 + + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + sqxtun v6.8b, v6.8h + st1 {v4.8b, v5.8b, v6.8b}, [x0], x1 + b.ne 1b +2: + ldp q14, q15, [sp, #32] + ldp q8, q9, [sp, #64] + ldp q10, q11, [sp, #96] + ldp q12, q13, [sp], #128 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1 + ldp x15, x16, [sp] + stp x0, x30, [sp, #-16]! + stp x1, x2, [sp, #-16]! + stp x3, x4, [sp, #-16]! + stp x5, x6, [sp, #-16]! + mov x17, #16 + stp x17, x7, [sp, #-16]! + stp x15, x16, [sp, #-16]! + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, x7, [sp], #16 + ldp x5, x6, [sp], #16 + ldp x3, x4, [sp], #16 + ldp x1, x2, [sp], #16 + ldr x0, [sp] + add x0, x0, #16 + add x2, x2, #16 + mov x17, #16 + stp x17, xzr, [sp, #-16]! + stp x15, x16, [sp, #-16]! + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, xzr, [sp], #16 + ldp xzr, x30, [sp], #16 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1 + ldp x15, x16, [sp] + stp x0, x30, [sp, #-16]! + stp x1, x2, [sp, #-16]! + stp x3, x4, [sp, #-16]! + stp x5, x6, [sp, #-16]! + mov x17, #24 + stp x17, x7, [sp, #-16]! + stp x15, x16, [sp, #-16]! + bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, x7, [sp], #16 + ldp x5, x6, [sp], #16 + ldp x3, x4, [sp], #16 + ldp x1, x2, [sp], #16 + ldr x0, [sp] + add x0, x0, #24 + add x2, x2, #24 + mov x17, #24 + stp x17, xzr, [sp, #-16]! + stp x15, x16, [sp, #-16]! 
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, xzr, [sp], #16 + ldp xzr, x30, [sp], #16 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1 + ldp x15, x16, [sp] + stp x0, x30, [sp, #-16]! + stp x1, x2, [sp, #-16]! + stp x3, x4, [sp, #-16]! + stp x5, x6, [sp, #-16]! + mov x17, #16 + stp x17, x7, [sp, #-16]! + stp x15, x16, [sp, #-16]! + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, x7, [sp], #16 + ldp x5, x6, [sp] + ldp x3, x4, [sp, #16] + ldp x1, x2, [sp, #32] + ldr x0, [sp, #48] + add x0, x0, #16 + add x2, x2, #16 + mov x17, #16 + stp x17, x7, [sp, #-16]! + stp x15, x16, [sp, #-16]! + + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, x7, [sp], #16 + ldp x5, x6, [sp] + ldp x3, x4, [sp, #16] + ldp x1, x2, [sp, #32] + ldr x0, [sp, #48] + add x0, x0, #32 + add x2, x2, #32 + mov x17, #16 + stp x17, x7, [sp, #-16]! + stp x15, x16, [sp, #-16]! + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, x7, [sp], #16 + ldp x5, x6, [sp], #16 + ldp x3, x4, [sp], #16 + ldp x1, x2, [sp], #16 + ldr x0, [sp] + add x0, x0, #48 + add x2, x2, #48 + mov x17, #16 + stp x17, xzr, [sp, #-16]! + stp x15, x16, [sp, #-16]! + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) + ldp x15, x16, [sp], #16 + ldp x17, xzr, [sp], #16 + ldp xzr, x30, [sp], #16 + ret +endfunc + + #endif diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index b448d755b9..e125b0cfb2 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width), _i8mm); +NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width), _i8mm); + NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, @@ -291,6 +296,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm); NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm); + NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm); NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm); }
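For readers tracing the arithmetic: the epel_uni_w_hv_start/epel_uni_w_hv_end macros above implement the per-pixel weighted step, roughly dst = clip_u8(((tmp * wx + (1 << (shift - 1))) >> shift) + ox) with shift = denom + 6 (my reading of the macros, not a restatement of the HEVC spec). The constant setup in the start macro, annotated:

    add w15, w15, #6     // shift = denom + 6
    mov w17, #1
    lsl w17, w17, w15    // 1 << shift
    lsr w17, w17, #1     // rounding term: 1 << (shift - 1)
    dup v15.4s, w17      // splat the rounding term
    neg w15, w15
    dup v12.4s, w15      // splat -shift: sshl by a negative count is a right shift

The *_end macros then multiply by wx (v13) with smull, add the rounding term (v15), sshl by -shift (v12), add ox (v14), and saturating-narrow; the final sqxtun at each store clips to unsigned 8 bits.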
On Sun, 18 Jun 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> I modified it according to your comments. Please review again.

> From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
> From: Logan Lyu <Logan.Lyu@myais.com.cn>
> Date: Sun, 28 May 2023 10:35:43 +0800
> Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit
> hevc_epel_uni_w_hv
>
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_epel_neon.S | 694 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 +
> 2 files changed, 700 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index 8b6f396a0b..355679af29 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -717,6 +717,700 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
> ret
> endfunc
>
> +.macro epel_uni_w_hv_start
> + mov x15, x5 //denom
> + mov x16, x6 //wx
> + mov x17, x7 //ox
> + add w15, w15, #6 //shift = denom+6
> +
> +
> + ldp x5, x6, [sp]
> + ldr x7, [sp, #16]
> +
> + stp q12, q13, [sp, #-128]!
> + stp q14, q15, [sp, #32]
> + stp q8, q9, [sp, #64]
> + stp q10, q11, [sp, #96]

Only need to back up 64 bytes, by backing up d8-d15. Also, the order
is quite weird here, why not keep them in e.g. linear order?

> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
> + epel_uni_w_hv_start
> + sxtw x4, w4
> +
> + add x10, x4, #3
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp xzr, x30, [sp, #-48]!

As mentioned already in the previous review - why do you back up and
restore xzr here? That's not necessary. Yes, you should keep the stack
16 byte aligned, but you can just leave an empty slot, and just do
"str x30, [sp, #-48]!" here, and vice versa with "ldr" instead of ldp
when restoring.

The same goes in all functions here.

> +2:
> + ldp q14, q15, [sp, #32]
> + ldp q8, q9, [sp, #64]
> + ldp q10, q11, [sp, #96]
> + ldp q12, q13, [sp], #128

Only need d8-d15, and weird register order here, and elsewhere.

> +function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
> + epel_uni_w_hv_start
> + sxtw x4, w4

FWIW, it's unusual to need an explicit sxtw instruction, but I guess
if you use it in the form "add x10, x4, #3" it might be needed.

> +function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
> + ldp x15, x16, [sp]
> + stp x0, x30, [sp, #-16]!
> + stp x1, x2, [sp, #-16]!
> + stp x3, x4, [sp, #-16]!
> + stp x5, x6, [sp, #-16]!

Don't do consecutive stack pointer updates like this, but merge it
into one large stack decrement followed by positive offsets, like in
all the other cases of stp/ldp.

> + mov x17, #16
> + stp x17, x7, [sp, #-16]!
> + stp x15, x16, [sp, #-16]!
> + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
> + ldp x15, x16, [sp], #16
> + ldp x17, x7, [sp], #16
> + ldp x5, x6, [sp], #16
> + ldp x3, x4, [sp], #16
> + ldp x1, x2, [sp], #16
> + ldr x0, [sp]
> + add x0, x0, #16
> + add x2, x2, #16
> + mov x17, #16
> + stp x17, xzr, [sp, #-16]!
> + stp x15, x16, [sp, #-16]!

Don't do multiple stack decrements, don't needlessly store xzr here.

The same goes for all the other functions in this patch.

// Martin
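The callee-save remark comes from AAPCS64: across a call, only the low 64 bits of v8-v15 must be preserved, so saving d8-d15 (64 bytes, in ascending order) suffices. A sketch of the shape the follow-up version adopts:

    stp d8, d9, [sp, #-64]!
    stp d10, d11, [sp, #16]
    stp d12, d13, [sp, #32]
    stp d14, d15, [sp, #48]
    ...
    ldp d10, d11, [sp, #16]
    ldp d12, d13, [sp, #32]
    ldp d14, d15, [sp, #48]
    ldp d8, d9, [sp], #64

    // x30 alone, with one unused slot keeping sp 16-byte aligned:
    str x30, [sp, #-48]!
    ...
    ldr x30, [sp], #48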
Hi, Martin, Thanks for your comments. I have now amended the unreasonable parts of ldp/stp that I have seen. And I updated patch 3 and patch 5. (Although I have attached all 5 patches) In addition, I thought that q8-q15 was required to be saved according to the calling convention before, but later I confirmed that it is the lower 64bit, thank you for reminding. Please take a look. If there are some small mistakes, please correct them directly. If there are still many problems, please remind me again, thank you! 在 2023/7/2 5:28, Martin Storsjö 写道: > On Sun, 18 Jun 2023, Logan.Lyu wrote: > >> Hi, Martin, >> >> I modified it according to your comments. Please review again. > >> From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001 >> From: Logan Lyu <Logan.Lyu@myais.com.cn> >> Date: Sun, 28 May 2023 10:35:43 +0800 >> Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit >> hevc_epel_uni_w_hv >> >> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn> >> --- >> libavcodec/aarch64/hevcdsp_epel_neon.S | 694 ++++++++++++++++++++++ >> libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 + >> 2 files changed, 700 insertions(+) >> >> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S >> b/libavcodec/aarch64/hevcdsp_epel_neon.S >> index 8b6f396a0b..355679af29 100644 >> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S >> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S >> @@ -717,6 +717,700 @@ function >> ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1 >> ret >> endfunc >> >> +.macro epel_uni_w_hv_start >> + mov x15, x5 //denom >> + mov x16, x6 //wx >> + mov x17, x7 //ox >> + add w15, w15, #6 //shift = denom+6 >> + >> + >> + ldp x5, x6, [sp] >> + ldr x7, [sp, #16] >> + >> + stp q12, q13, [sp, #-128]! >> + stp q14, q15, [sp, #32] >> + stp q8, q9, [sp, #64] >> + stp q10, q11, [sp, #96] > > Only need to back up 64 bytes, by backing up d8-d15. Also, the order > is quite weird here, why not keep them in e.g. linear order? > >> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1 >> + epel_uni_w_hv_start >> + sxtw x4, w4 >> + >> + add x10, x4, #3 >> + lsl x10, x10, #7 >> + sub sp, sp, x10 // tmp_array >> + stp xzr, x30, [sp, #-48]! > > As mentioned already in the previous review - why do you back up and > restore xzr here? That's not necessary. Yes, you should keep the stack > 16 byte aligned, but you can just leave an empty slot, and just do > "str x30, [sp, #-48]!" here, and vice versa with "ldr" instead of ldp > when restoring. > > The same goes in all functions here. > >> +2: >> + ldp q14, q15, [sp, #32] >> + ldp q8, q9, [sp, #64] >> + ldp q10, q11, [sp, #96] >> + ldp q12, q13, [sp], #128 > > Only need d8-d15, and weird register order here, and elsewhere. > >> +function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1 >> + epel_uni_w_hv_start >> + sxtw x4, w4 > > FWIW, it's unusual to need an explicit sxtw instruction, but I guess > if you use it in the form "add x10, x4, #3" it might be needed. > >> +function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1 >> + ldp x15, x16, [sp] >> + stp x0, x30, [sp, #-16]! >> + stp x1, x2, [sp, #-16]! >> + stp x3, x4, [sp, #-16]! >> + stp x5, x6, [sp, #-16]! > > Don't do consecutive stack pointer updates like this, but merge it > into one large stack decrement followed by positive offsets, like in > all the other cases of stp/ldp. > >> + mov x17, #16 >> + stp x17, x7, [sp, #-16]! >> + stp x15, x16, [sp, #-16]! 
>> + bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) >> + ldp x15, x16, [sp], #16 >> + ldp x17, x7, [sp], #16 >> + ldp x5, x6, [sp], #16 >> + ldp x3, x4, [sp], #16 >> + ldp x1, x2, [sp], #16 >> + ldr x0, [sp] >> + add x0, x0, #16 >> + add x2, x2, #16 >> + mov x17, #16 >> + stp x17, xzr, [sp, #-16]! >> + stp x15, x16, [sp, #-16]! > > Don't do multiple stack decrements, don't needlessly store xzr here. > > The same goes for all the other functions in this patch. > > // Martin > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". From c7959c64da41d2e6a14cbd3afa019fa1792d9767 Mon Sep 17 00:00:00 2001 From: Logan Lyu <Logan.Lyu@myais.com.cn> Date: Sat, 27 May 2023 09:42:07 +0800 Subject: [PATCH v1 3/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_v --- libavcodec/aarch64/hevcdsp_epel_neon.S | 503 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 + 2 files changed, 509 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S index 0411de9864..0e3bf74953 100644 --- a/libavcodec/aarch64/hevcdsp_epel_neon.S +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S @@ -375,3 +375,506 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1 endfunc #endif + + +.macro EPEL_UNI_W_V_HEADER + ldr x12, [sp, #8] + movrel x9, epel_filters + add x9, x9, x12, lsl #2 + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x9] // filter + neg v0.16b, v0.16b + neg v3.16b, v3.16b + mov w10, #-6 + sub w10, w10, w5 + dup v30.8h, w6 + dup v31.4s, w10 + dup v29.4s, w7 + sub x2, x2, x3 +.endm + +.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3 + movi \d0\().2d, #0 + umlsl \d0\().8h, \s0\().8b, v0.8b + umlal \d0\().8h, \s1\().8b, v1.8b + umlal \d0\().8h, \s2\().8b, v2.8b + umlsl \d0\().8h, \s3\().8b, v3.8b + smull \d0\().4s, \d0\().4h, v30.4h + sqrshl \d0\().4s, \d0\().4s, v31.4s + sqadd \d0\().4s, \d0\().4s, v29.4s + sqxtn \d0\().4h, \d0\().4s + sqxtun \d0\().8b, \d0\().8h +.endm + +function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1 + EPEL_UNI_W_V_HEADER + + ldr s4, [x2] + ldr s5, [x2, x3] + add x2, x2, x3, lsl #1 + ldr s6, [x2] +1: + ldr s7, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V4_CALC v16, v4, v5, v6, v7 + str s16, [x0] + b.eq 2f + add x0, x0, x1 + ldr s4, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V4_CALC v17, v5, v6, v7, v4 + str s17, [x0] + add x0, x0, x1 + b.eq 2f + ldr s5, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V4_CALC v18, v6, v7, v4, v5 + str s18, [x0] + add x0, x0, x1 + b.eq 2f + ldr s6, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V4_CALC v19, v7, v4, v5, v6 + str s19, [x0] + add x0, x0, x1 + b.hi 1b +2: + ret +endfunc + +.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1 + movi \d0\().2d, #0 + umlsl \d0\().8h, \s0\().8b, v0.8b + umlal \d0\().8h, \s1\().8b, v1.8b + umlal \d0\().8h, \s2\().8b, v2.8b + umlsl \d0\().8h, \s3\().8b, v3.8b + smull \t0\().4s, \d0\().4h, v30.4h + smull2 \t1\().4s, \d0\().8h, v30.8h + sqrshl \t0\().4s, \t0\().4s, v31.4s + sqrshl \t1\().4s, \t1\().4s, v31.4s + sqadd \t0\().4s, \t0\().4s, v29.4s + sqadd \t1\().4s, \t1\().4s, v29.4s + sqxtn \d0\().4h, \t0\().4s + sqxtn2 \d0\().8h, \t1\().4s + sqxtun \d0\().8b, \d0\().8h +.endm + +function ff_hevc_put_hevc_epel_uni_w_v6_8_neon, export=1 + EPEL_UNI_W_V_HEADER + + sub x1, x1, #4 + ldr d4, [x2] + ldr d5, [x2, x3] + add x2, 
x2, x3, lsl #1 + ldr d6, [x2] +1: + ldr d7, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21 + str s16, [x0], #4 + st1 {v16.h}[2], [x0], x1 + b.eq 2f + ldr d4, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21 + str s17, [x0], #4 + st1 {v17.h}[2], [x0], x1 + b.eq 2f + ldr d5, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21 + str s18, [x0], #4 + st1 {v18.h}[2], [x0], x1 + b.eq 2f + ldr d6, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21 + str s19, [x0], #4 + st1 {v19.h}[2], [x0], x1 + b.hi 1b +2: + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1 + EPEL_UNI_W_V_HEADER + + ldr d4, [x2] + ldr d5, [x2, x3] + add x2, x2, x3, lsl #1 + ldr d6, [x2] +1: + ldr d7, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21 + str d16, [x0] + add x0, x0, x1 + b.eq 2f + ldr d4, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21 + str d17, [x0] + add x0, x0, x1 + b.eq 2f + ldr d5, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21 + str d18, [x0] + add x0, x0, x1 + b.eq 2f + ldr d6, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21 + str d19, [x0] + add x0, x0, x1 + b.hi 1b +2: + ret +endfunc + +.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3 + movi \d0\().2d, #0 + movi \d1\().2d, #0 + umlsl \d0\().8h, \s0\().8b, v0.8b + umlsl2 \d1\().8h, \s0\().16b, v0.16b + umlal \d0\().8h, \s1\().8b, v1.8b + umlal2 \d1\().8h, \s1\().16b, v1.16b + umlal \d0\().8h, \s2\().8b, v2.8b + umlal2 \d1\().8h, \s2\().16b, v2.16b + umlsl \d0\().8h, \s3\().8b, v3.8b + umlsl2 \d1\().8h, \s3\().16b, v3.16b + + smull \t0\().4s, \d0\().4h, v30.4h + smull2 \t1\().4s, \d0\().8h, v30.8h + smull \t2\().4s, \d1\().4h, v30.4h + + sqrshl \t0\().4s, \t0\().4s, v31.4s + sqrshl \t1\().4s, \t1\().4s, v31.4s + sqrshl \t2\().4s, \t2\().4s, v31.4s + sqadd \t0\().4s, \t0\().4s, v29.4s + sqadd \t1\().4s, \t1\().4s, v29.4s + sqadd \t2\().4s, \t2\().4s, v29.4s + + sqxtn \d0\().4h, \t0\().4s + sqxtn2 \d0\().8h, \t1\().4s + sqxtn \d1\().4h, \t2\().4s + sqxtun \d0\().8b, \d0\().8h + sqxtun2 \d0\().16b, \d1\().8h +.endm + +function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1 + EPEL_UNI_W_V_HEADER + + ldr q4, [x2] + ldr q5, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q6, [x2] + sub x1, x1, #8 +1: + ldr q7, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V12_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27 + str d16, [x0], #8 + st1 {v16.s}[2], [x0] + add x0, x0, x1 + b.eq 2f + ldr q4, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V12_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27 + str d18, [x0], #8 + st1 {v18.s}[2], [x0] + add x0, x0, x1 + b.eq 2f + ldr q5, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V12_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27 + str d20, [x0], #8 + st1 {v20.s}[2], [x0] + add x0, x0, x1 + b.eq 2f + ldr q6, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V12_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27 + str d22, [x0], #8 + st1 {v22.s}[2], [x0] + add x0, x0, x1 + b.hi 1b +2: + ret +endfunc + +.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3 + movi \d0\().2d, #0 + movi \d1\().2d, #0 + umlsl \d0\().8h, \s0\().8b, v0.8b + umlsl2 \d1\().8h, \s0\().16b, v0.16b + umlal \d0\().8h, \s1\().8b, v1.8b + umlal2 \d1\().8h, \s1\().16b, v1.16b + umlal \d0\().8h, 
\s2\().8b, v2.8b + umlal2 \d1\().8h, \s2\().16b, v2.16b + umlsl \d0\().8h, \s3\().8b, v3.8b + umlsl2 \d1\().8h, \s3\().16b, v3.16b + + smull \t0\().4s, \d0\().4h, v30.4h + smull2 \t1\().4s, \d0\().8h, v30.8h + smull \t2\().4s, \d1\().4h, v30.4h + smull2 \t3\().4s, \d1\().8h, v30.8h + + sqrshl \t0\().4s, \t0\().4s, v31.4s + sqrshl \t1\().4s, \t1\().4s, v31.4s + sqrshl \t2\().4s, \t2\().4s, v31.4s + sqrshl \t3\().4s, \t3\().4s, v31.4s + sqadd \t0\().4s, \t0\().4s, v29.4s + sqadd \t1\().4s, \t1\().4s, v29.4s + sqadd \t2\().4s, \t2\().4s, v29.4s + sqadd \t3\().4s, \t3\().4s, v29.4s + + sqxtn \d0\().4h, \t0\().4s + sqxtn2 \d0\().8h, \t1\().4s + sqxtn \d1\().4h, \t2\().4s + sqxtn2 \d1\().8h, \t3\().4s + sqxtun \d0\().8b, \d0\().8h + sqxtun2 \d0\().16b, \d1\().8h +.endm + + +function ff_hevc_put_hevc_epel_uni_w_v16_8_neon, export=1 + EPEL_UNI_W_V_HEADER + + ldr q4, [x2] + ldr q5, [x2, x3] + add x2, x2, x3, lsl #1 + ldr q6, [x2] +1: + ldr q7, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V16_CALC v16, v17 v4, v5, v6, v7, v24, v25, v26, v27 + str q16, [x0] + add x0, x0, x1 + b.eq 2f + ldr q4, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27 + str q18, [x0] + add x0, x0, x1 + b.eq 2f + ldr q5, [x2, x3] + subs w4, w4, #1 + add x2, x2, x3, lsl #1 + EPEL_UNI_W_V16_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27 + str q20, [x0] + add x0, x0, x1 + b.eq 2f + ldr q6, [x2] + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27 + str q22, [x0] + add x0, x0, x1 + b.hi 1b +2: + ret +endfunc + + + +function ff_hevc_put_hevc_epel_uni_w_v24_8_neon, export=1 + EPEL_UNI_W_V_HEADER + + ldp q16, q17, [x2] + add x2, x2, x3 + ldp q18, q19, [x2] + add x2, x2, x3 + ldp q20, q21, [x2] + add x2, x2, x3 +1: + ldp q22, q23, [x2] + subs w4, w4, #1 + add x2, x2, x3 + EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27 + EPEL_UNI_W_V8_CALC v6, v17, v19, v21, v23, v24, v25 + str q4, [x0] + str d6, [x0, #16] + add x0, x0, x1 + b.eq 2f + ldp q16, q17, [x2] + subs w4, w4, #1 + add x2, x2, x3 + EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27 + EPEL_UNI_W_V8_CALC v6, v19, v21, v23, v17, v24, v25 + str q4, [x0] + str d6, [x0, #16] + add x0, x0, x1 + b.eq 2f + ldp q18, q19, [x2] + subs w4, w4, #1 + add x2, x2, x3 + EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18, v24, v25, v26, v27 + EPEL_UNI_W_V8_CALC v6, v21, v23, v17, v19, v24, v25 + str q4, [x0] + str d6, [x0, #16] + add x0, x0, x1 + b.eq 2f + ldp q20, q21, [x2] + subs w4, w4, #1 + add x2, x2, x3 + EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27 + EPEL_UNI_W_V8_CALC v6, v23, v17, v19, v21, v24, v25 + str q4, [x0] + str d6, [x0, #16] + add x0, x0, x1 + b.hi 1b +2: + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_v32_8_neon, export=1 + EPEL_UNI_W_V_HEADER + + ldp q16, q17, [x2] + add x2, x2, x3 + ldp q18, q19, [x2] + add x2, x2, x3 + ldp q20, q21, [x2] + add x2, x2, x3 +1: + ldp q22, q23, [x2] + subs w4, w4, #1 + add x2, x2, x3 + EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27 + EPEL_UNI_W_V16_CALC v6, v7, v17, v19, v21, v23, v24, v25, v26, v27 + str q4, [x0] + str q6, [x0, #16] + add x0, x0, x1 + b.eq 2f + ldp q16, q17, [x2] + subs w4, w4, #1 + add x2, x2, x3 + EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27 + EPEL_UNI_W_V16_CALC v6, v7, v19, v21, v23, v17, v24, v25, v26, v27 + str q4, [x0] + str q6, [x0, #16] + add x0, x0, x1 + b.eq 2f + ldp q18, q19, [x2] + subs w4, w4, #1 + add x2, x2, x3 + 
EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18, v24, v25, v26, v27 + EPEL_UNI_W_V16_CALC v6, v7, v21, v23, v17, v19, v24, v25, v26, v27 + str q4, [x0] + str q6, [x0, #16] + add x0, x0, x1 + b.eq 2f + ldp q20, q21, [x2] + subs w4, w4, #1 + add x2, x2, x3 + EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27 + EPEL_UNI_W_V16_CALC v6, v7, v23, v17, v19, v21, v24, v25, v26, v27 + str q4, [x0] + str q6, [x0, #16] + add x0, x0, x1 + b.hi 1b +2: + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_v48_8_neon, export=1 + EPEL_UNI_W_V_HEADER + stp d8, d9, [sp, #-32]! + stp d10, d11, [sp, #16] + + ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3 + ld1 {v19.16b, v20.16b, v21.16b}, [x2], x3 + ld1 {v22.16b, v23.16b, v24.16b}, [x2], x3 +1: + ld1 {v25.16b, v26.16b, v27.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v16, v19, v22, v25, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v17, v20, v23, v26, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v18, v21, v24, v27, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b}, [x0], x1 + b.eq 2f + ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v19, v22, v25, v16, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v20, v23, v26, v17, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v21, v24, v27, v18, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b}, [x0], x1 + b.eq 2f + ld1 {v19.16b, v20.16b, v21.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v22, v25, v16, v19, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v23, v26, v17, v20, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v24, v27, v18, v21, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b}, [x0], x1 + b.eq 2f + ld1 {v22.16b, v23.16b, v24.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v25, v16, v19, v22, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v26, v17, v20, v23, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v27, v18, v21, v24, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b}, [x0], x1 + b.hi 1b +2: + ldp d10, d11, [sp, #16] + ldp d8, d9, [sp], #32 + ret +endfunc + +function ff_hevc_put_hevc_epel_uni_w_v64_8_neon, export=1 + EPEL_UNI_W_V_HEADER + stp d8, d9, [sp, #-64]! 
+ stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3 +1: + ld1 {v12.16b, v13.16b, v14.16b, v15.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v16, v20, v24, v12, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v17, v21, v25, v13, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v18, v22, v26, v14, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v7,v28, v19, v23, v27, v15, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1 + b.eq 2f + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v20, v24, v12, v16, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v21, v25, v13, v17, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v22, v26, v14, v18, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v7,v28, v23, v27, v15, v19, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1 + b.eq 2f + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v24, v12, v16, v20, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v25, v13, v17, v21, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v26, v14, v18, v22, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v7,v28, v27, v15, v19, v23, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1 + b.eq 2f + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3 + subs w4, w4, #1 + EPEL_UNI_W_V16_CALC v4, v6, v12, v16, v20, v24, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v5, v7, v13, v17, v21, v25, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v6, v7, v14, v18, v22, v26, v8, v9, v10, v11 + EPEL_UNI_W_V16_CALC v7,v28, v15, v19, v23, v27, v8, v9, v10, v11 + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1 + b.hi 1b +2: + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + ldp d8, d9, [sp], #64 + ret +endfunc diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 8af0a2b4b9..4a260e1d9a 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -161,6 +161,11 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width),); +NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width),); + NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, @@ -274,6 +279,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,); NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,); + NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,); NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,); if (have_i8mm(cpu_flags)) {
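A note on the filter trick in patch 3's EPEL_UNI_W_V*_CALC macros: the 4-tap epel coefficient sets have negative outer taps (e.g. {-2, 58, 10, -2}), so the header negates taps 0 and 3 once, letting each row stay in unsigned 8-bit multiply-accumulates that subtract where the original tap was negative (annotated excerpt from the patch, assuming that tap layout):

    ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x9]    // splat the four taps
    neg v0.16b, v0.16b                             // |c0|
    neg v3.16b, v3.16b                             // |c3|
    ...
    movi v16.2d, #0
    umlsl v16.8h, v4.8b, v0.8b    // acc -= src[y-1] * |c0|
    umlal v16.8h, v5.8b, v1.8b    // acc += src[y  ] *  c1
    umlal v16.8h, v6.8b, v2.8b    // acc += src[y+1] *  c2
    umlsl v16.8h, v7.8b, v3.8b    // acc -= src[y+2] * |c3|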
On Thu, 13 Jul 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> Thanks for your comments.
>
> I have now amended the unreasonable parts of ldp/stp that I have seen. And I
> updated patch 3 and patch 5. (Although I have attached all 5 patches)
> In addition, I thought that q8-q15 was required to be saved according to the
> calling convention before, but later I confirmed that it is the lower 64bit,
> thank you for reminding.
>
> Please take a look. If there are some small mistakes, please correct them
> directly. If there are still many problems, please remind me again, thank
> you!

Thanks, this looks mostly good to me!

In patch 3, there was still one case of a missing comma between macro
arguments, that I fixed. I also included the checkasm benchmark numbers
in the commit messages - please remember to add them for future patches.

I'll push these patches later after a bit more testing, if that testing
doesn't show any further issues. Thanks!

// Martin
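For reference, the missing comma Martin mentions appears to be this EPEL_UNI_W_V16_CALC invocation in ff_hevc_put_hevc_epel_uni_w_v16_8_neon in the quoted patch 3 (my reading of the text above; the committed fix itself is not shown here):

    EPEL_UNI_W_V16_CALC v16, v17 v4, v5, v6, v7, v24, v25, v26, v27     // as posted
    EPEL_UNI_W_V16_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27    // with the comma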
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 32f052a7b1..24a74d2c7d 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -718,6 +718,709 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+.macro epel_uni_w_hv_start
+        mov             x15, x5 //denom
+        mov             x16, x6 //wx
+        mov             x17, x7 //ox
+        add             w15, w15, #6 //shift = denom+6
+
+
+        ldp             x5, x6, [sp]
+        ldp             x7, xzr, [sp, #16]
+
+        sub             sp, sp, #128
+        stp             q12, q13, [sp]
+        stp             q14, q15, [sp, #32]
+        stp             q8, q9, [sp, #64]
+        stp             q10, q11, [sp, #96]
+
+        dup             v13.8h, w16 //wx
+        dup             v14.4s, w17 //ox
+
+        mov             w17, #1
+        lsl             w17, w17, w15
+        lsr             w17, w17, #1
+        dup             v15.4s, w17
+
+        neg             w15, w15 // -shift
+        dup             v12.4s, w15 //shift
+.endm
+
+.macro epel_uni_w_hv_end
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        smull           v30.4s, v5.4h, v13.4h
+        smull2          v31.4s, v5.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+        sqxtn           v5.4h, v30.4s
+        sqxtn2          v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+        smull           v1.4s, v4.4h, v13.4h
+        smull2          v2.4s, v4.8h, v13.8h
+        smull           v28.4s, v5.4h, v13.4h
+        smull2          v29.4s, v5.8h, v13.8h
+        smull           v30.4s, v6.4h, v13.4h
+        smull2          v31.4s, v6.8h, v13.8h
+        add             v1.4s, v1.4s, v15.4s
+        add             v2.4s, v2.4s, v15.4s
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v1.4s, v1.4s, v12.4s
+        sshl            v2.4s, v2.4s, v12.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+        add             v1.4s, v1.4s, v14.4s
+        add             v2.4s, v2.4s, v14.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v1.4s
+        sqxtn2          v4.8h, v2.4s
+        sqxtn           v5.4h, v28.4s
+        sqxtn2          v5.8h, v29.4s
+        sqxtn           v6.4h, v30.4s
+        sqxtn2          v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10
+        ld1             {v17.4h}, [sp], x10
+        ld1             {v18.4h}, [sp], x10
+1:      ld1             {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9, [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9, [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9, [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9, [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9, [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1             {v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+
+        epel_uni_w_hv_end3
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9, [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
+
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #24
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #24
+        add             x2, x2, #24
+        mov             x17, #24
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #32
+        add             x2, x2, #32
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #48
+        add             x2, x2, #48
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
 #endif
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 348497bbbe..fbbc4e6071 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -286,11 +291,13 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
+
         if (have_i8mm(cpu_flags)) {
             NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
 }
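For reference, the per-sample arithmetic set up by epel_uni_w_hv_start and
applied by the epel_uni_w_hv_end* macros corresponds roughly to the scalar
sketch below. The helper name is ours, not the patch's, and 'filtered' stands
for the 16-bit output of the horizontal+vertical filter pair (the sqshrn #6
results from calc_epelh/calc_epelh2):

    #include "libavutil/common.h"   /* av_clip_uint8() */

    /* Sketch only: the uni_w weighted-prediction scaling mirrored from
     * the epel_uni_w_hv_* macros. */
    static inline uint8_t uni_w_scale(int filtered, int denom, int wx, int ox)
    {
        int shift  = denom + 6;         /* add  w15, w15, #6            */
        int offset = 1 << (shift - 1);  /* mov/lsl/lsr into v15         */
        int v = ((filtered * wx + offset) >> shift) + ox;
                                        /* smull, add, sshl, add        */
        return av_clip_uint8(v);        /* sqxtn followed by sqxtun     */
    }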