@@ -19,130 +19,130 @@
#include "libavutil/aarch64/asm.S"
 function ff_ps_add_squares_neon, export=1
-1: ld1 {v0.4s,v1.4s}, [x1], #32
- fmul v0.4s, v0.4s, v0.4s
- fmul v1.4s, v1.4s, v1.4s
- faddp v2.4s, v0.4s, v1.4s
- ld1 {v3.4s}, [x0]
- fadd v3.4s, v3.4s, v2.4s
- st1 {v3.4s}, [x0], #16
- subs w2, w2, #4
- b.gt 1b
+1: ld1 {v0.4s,v1.4s}, [x1], #32 // load 8 floats from x1, x1 += 32 (no entry check: the body runs at least once)
+ fmul v0.4s, v0.4s, v0.4s // square every lane
+ fmul v1.4s, v1.4s, v1.4s
+ faddp v2.4s, v0.4s, v1.4s // add adjacent squares: 8 values -> 4 pair sums
+ ld1 {v3.4s}, [x0] // load the 4 floats currently at x0
+ fadd v3.4s, v3.4s, v2.4s // accumulate the pair sums into them
+ st1 {v3.4s}, [x0], #16 // store back, x0 += 16
+ subs w2, w2, #4 // w2 -= 4: four results produced per iteration
+ b.gt 1b // repeat while w2 > 0
 ret
 endfunc
 function ff_ps_mul_pair_single_neon, export=1
-1: ld1 {v0.4s,v1.4s}, [x1], #32
- ld1 {v2.4s}, [x2], #16
- zip1 v3.4s, v2.4s, v2.4s
- zip2 v4.4s, v2.4s, v2.4s
- fmul v0.4s, v0.4s, v3.4s
- fmul v1.4s, v1.4s, v4.4s
- st1 {v0.4s,v1.4s}, [x0], #32
- subs w3, w3, #4
- b.gt 1b
+1: ld1 {v0.4s,v1.4s}, [x1], #32 // load 8 floats (4 pairs) from x1, x1 += 32
+ ld1 {v2.4s}, [x2], #16 // load 4 scalar factors m0..m3 from x2, x2 += 16
+ zip1 v3.4s, v2.4s, v2.4s // v3 = {m0, m0, m1, m1}: one factor per pair
+ zip2 v4.4s, v2.4s, v2.4s // v4 = {m2, m2, m3, m3}
+ fmul v0.4s, v0.4s, v3.4s // scale pairs 0 and 1
+ fmul v1.4s, v1.4s, v4.4s // scale pairs 2 and 3
+ st1 {v0.4s,v1.4s}, [x0], #32 // store 8 floats to x0, x0 += 32
+ subs w3, w3, #4 // w3 -= 4: four pairs handled per iteration
+ b.gt 1b // repeat while w3 > 0
 ret
 endfunc
 function ff_ps_stereo_interpolate_neon, export=1
- ld1 {v0.4s}, [x2]
- ld1 {v1.4s}, [x3]
- zip1 v4.4s, v0.4s, v0.4s
- zip2 v5.4s, v0.4s, v0.4s
- zip1 v6.4s, v1.4s, v1.4s
- zip2 v7.4s, v1.4s, v1.4s
-1: ld1 {v2.2s}, [x0]
- ld1 {v3.2s}, [x1]
- fadd v4.4s, v4.4s, v6.4s
- fadd v5.4s, v5.4s, v7.4s
- mov v2.d[1], v2.d[0]
- mov v3.d[1], v3.d[0]
- fmul v2.4s, v2.4s, v4.4s
- fmla v2.4s, v3.4s, v5.4s
- st1 {v2.d}[0], [x0], #8
- st1 {v2.d}[1], [x1], #8
- subs w4, w4, #1
- b.gt 1b
+ ld1 {v0.4s}, [x2] // v0 = 4 coefficients h0..h3 read from x2
+ ld1 {v1.4s}, [x3] // v1 = 4 per-iteration increments s0..s3 read from x3
+ zip1 v4.4s, v0.4s, v0.4s // v4 = {h0, h0, h1, h1}
+ zip2 v5.4s, v0.4s, v0.4s // v5 = {h2, h2, h3, h3}
+ zip1 v6.4s, v1.4s, v1.4s // v6 = {s0, s0, s1, s1}
+ zip2 v7.4s, v1.4s, v1.4s // v7 = {s2, s2, s3, s3}
+1: ld1 {v2.2s}, [x0] // a = 2 floats at x0
+ ld1 {v3.2s}, [x1] // b = 2 floats at x1
+ fadd v4.4s, v4.4s, v6.4s // step the coefficients before they are used
+ fadd v5.4s, v5.4s, v7.4s
+ mov v2.d[1], v2.d[0] // v2 = {a0, a1, a0, a1}
+ mov v3.d[1], v3.d[0] // v3 = {b0, b1, b0, b1}
+ fmul v2.4s, v2.4s, v4.4s // v2 = {a*h0, a*h1}
+ fmla v2.4s, v3.4s, v5.4s // v2 += {b*h2, b*h3}
+ st1 {v2.d}[0], [x0], #8 // low half (a*h0 + b*h2) overwrites a, x0 += 8
+ st1 {v2.d}[1], [x1], #8 // high half (a*h1 + b*h3) overwrites b, x1 += 8
+ subs w4, w4, #1 // w4 -= 1: one pair from each buffer per iteration
+ b.gt 1b // repeat while w4 > 0
 ret
 endfunc
 function ff_ps_stereo_interpolate_ipdopd_neon, export=1
- ld1 {v0.4s,v1.4s}, [x2]
- ld1 {v6.4s,v7.4s}, [x3]
- fneg v2.4s, v1.4s
- fneg v3.4s, v7.4s
- zip1 v16.4s, v0.4s, v0.4s
- zip2 v17.4s, v0.4s, v0.4s
- zip1 v18.4s, v2.4s, v1.4s
- zip2 v19.4s, v2.4s, v1.4s
- zip1 v20.4s, v6.4s, v6.4s
- zip2 v21.4s, v6.4s, v6.4s
- zip1 v22.4s, v3.4s, v7.4s
- zip2 v23.4s, v3.4s, v7.4s
-1: ld1 {v2.2s}, [x0]
- ld1 {v3.2s}, [x1]
- fadd v16.4s, v16.4s, v20.4s
- fadd v17.4s, v17.4s, v21.4s
- mov v2.d[1], v2.d[0]
- mov v3.d[1], v3.d[0]
- fmul v4.4s, v2.4s, v16.4s
- fmla v4.4s, v3.4s, v17.4s
- fadd v18.4s, v18.4s, v22.4s
- fadd v19.4s, v19.4s, v23.4s
- ext v2.16b, v2.16b, v2.16b, #4
- ext v3.16b, v3.16b, v3.16b, #4
- fmla v4.4s, v2.4s, v18.4s
- fmla v4.4s, v3.4s, v19.4s
- st1 {v4.d}[0], [x0], #8
- st1 {v4.d}[1], [x1], #8
- subs w4, w4, #1
- b.gt 1b
+ ld1 {v0.4s,v1.4s}, [x2] // 8 coefficients from x2: v0 = p0..p3, v1 = q0..q3
+ ld1 {v6.4s,v7.4s}, [x3] // 8 matching per-iteration increments from x3
+ fneg v2.4s, v1.4s // v2 = -q
+ fneg v3.4s, v7.4s // v3 = -(increment of q)
+ zip1 v16.4s, v0.4s, v0.4s // v16 = {p0, p0, p1, p1}
+ zip2 v17.4s, v0.4s, v0.4s // v17 = {p2, p2, p3, p3}
+ zip1 v18.4s, v2.4s, v1.4s // v18 = {-q0, q0, -q1, q1}
+ zip2 v19.4s, v2.4s, v1.4s // v19 = {-q2, q2, -q3, q3}
+ zip1 v20.4s, v6.4s, v6.4s // v20..v23: increments laid out like v16..v19
+ zip2 v21.4s, v6.4s, v6.4s
+ zip1 v22.4s, v3.4s, v7.4s
+ zip2 v23.4s, v3.4s, v7.4s
+1: ld1 {v2.2s}, [x0] // a = 2 floats at x0
+ ld1 {v3.2s}, [x1] // b = 2 floats at x1
+ fadd v16.4s, v16.4s, v20.4s // step the p coefficients before they are used
+ fadd v17.4s, v17.4s, v21.4s
+ mov v2.d[1], v2.d[0] // v2 = {a0, a1, a0, a1}
+ mov v3.d[1], v3.d[0] // v3 = {b0, b1, b0, b1}
+ fmul v4.4s, v2.4s, v16.4s // v4 = {a*p0, a*p1}
+ fmla v4.4s, v3.4s, v17.4s // v4 += {b*p2, b*p3}
+ fadd v18.4s, v18.4s, v22.4s // step the signed q coefficients
+ fadd v19.4s, v19.4s, v23.4s
+ ext v2.16b, v2.16b, v2.16b, #4 // rotate by one float: v2 = {a1, a0, a1, a0}
+ ext v3.16b, v3.16b, v3.16b, #4 // v3 = {b1, b0, b1, b0}
+ fmla v4.4s, v2.4s, v18.4s // v4 += swapped a * {-q0, q0, -q1, q1}
+ fmla v4.4s, v3.4s, v19.4s // v4 += swapped b * {-q2, q2, -q3, q3}
+ st1 {v4.d}[0], [x0], #8 // low half overwrites a, x0 += 8
+ st1 {v4.d}[1], [x1], #8 // high half overwrites b, x1 += 8
+ subs w4, w4, #1 // w4 -= 1: one pair from each buffer per iteration
+ b.gt 1b // repeat while w4 > 0
 ret
 endfunc
 function ff_ps_hybrid_analysis_neon, export=1
- lsl x3, x3, #3
- ld2 {v0.4s,v1.4s}, [x1], #32
- ld2 {v2.2s,v3.2s}, [x1], #16
- ld1 {v24.2s}, [x1], #8
- ld2 {v4.2s,v5.2s}, [x1], #16
- ld2 {v6.4s,v7.4s}, [x1]
- rev64 v6.4s, v6.4s
- rev64 v7.4s, v7.4s
- ext v6.16b, v6.16b, v6.16b, #8
- ext v7.16b, v7.16b, v7.16b, #8
- rev64 v4.2s, v4.2s
- rev64 v5.2s, v5.2s
- mov v2.d[1], v3.d[0]
- mov v4.d[1], v5.d[0]
- mov v5.d[1], v2.d[0]
- mov v3.d[1], v4.d[0]
- fadd v16.4s, v0.4s, v6.4s
- fadd v17.4s, v1.4s, v7.4s
- fsub v18.4s, v1.4s, v7.4s
- fsub v19.4s, v0.4s, v6.4s
- fadd v22.4s, v2.4s, v4.4s
- fsub v23.4s, v5.4s, v3.4s
- trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
- trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
-1: ld2 {v2.4s,v3.4s}, [x2], #32
- ld2 {v4.2s,v5.2s}, [x2], #16
- ld1 {v6.2s}, [x2], #8
- add x2, x2, #8
- mov v4.d[1], v5.d[0]
- mov v6.s[1], v6.s[0]
- fmul v6.2s, v6.2s, v24.2s
- fmul v0.4s, v2.4s, v16.4s
- fmul v1.4s, v2.4s, v17.4s
- fmls v0.4s, v3.4s, v18.4s
- fmla v1.4s, v3.4s, v19.4s
- fmla v0.4s, v4.4s, v20.4s
- fmla v1.4s, v4.4s, v21.4s
- faddp v0.4s, v0.4s, v1.4s
- faddp v0.4s, v0.4s, v0.4s
- fadd v0.2s, v0.2s, v6.2s
- st1 {v0.2s}, [x0], x3
- subs w4, w4, #1
- b.gt 1b
+ lsl x3, x3, #3 // x3 *= 8: byte stride applied to x0 by the st1 at the end of the loop
+ ld2 {v0.4s,v1.4s}, [x1], #32 // first 8 input floats, de-interleaved: v0 = even (re0..3), v1 = odd (im0..3)
+ ld2 {v2.2s,v3.2s}, [x1], #16 // next 4 floats, de-interleaved (re4..5 / im4..5)
+ ld1 {v24.2s}, [x1], #8 // next 2 floats: the centre sample {re6, im6}
+ ld2 {v4.2s,v5.2s}, [x1], #16 // next 4 floats, de-interleaved (re7..8 / im7..8)
+ ld2 {v6.4s,v7.4s}, [x1] // last 8 floats, de-interleaved (re9..12 / im9..12)
+ rev64 v6.4s, v6.4s // rev64 followed by ext #8 reverses all four lanes
+ rev64 v7.4s, v7.4s
+ ext v6.16b, v6.16b, v6.16b, #8 // v6 = {re12, re11, re10, re9}
+ ext v7.16b, v7.16b, v7.16b, #8 // v7 = {im12, im11, im10, im9}
+ rev64 v4.2s, v4.2s // swap the two lanes: v4 = {re8, re7}
+ rev64 v5.2s, v5.2s // v5 = {im8, im7}
+ mov v2.d[1], v3.d[0] // v2 = {re4, re5, im4, im5}
+ mov v4.d[1], v5.d[0] // v4 = {re8, re7, im8, im7}
+ mov v5.d[1], v2.d[0] // v5 = {im8, im7, re4, re5}
+ mov v3.d[1], v4.d[0] // v3 = {im4, im5, re8, re7}
+ fadd v16.4s, v0.4s, v6.4s // re[j] + re[12-j], j = 0..3
+ fadd v17.4s, v1.4s, v7.4s // im[j] + im[12-j]
+ fsub v18.4s, v1.4s, v7.4s // im[j] - im[12-j]
+ fsub v19.4s, v0.4s, v6.4s // re[j] - re[12-j]
+ fadd v22.4s, v2.4s, v4.4s // {re4+re8, re5+re7, im4+im8, im5+im7}
+ fsub v23.4s, v5.4s, v3.4s // {im8-im4, im7-im5, re4-re8, re5-re7}
+ trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
+ trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
+1: ld2 {v2.4s,v3.4s}, [x2], #32 // 8 filter floats, de-interleaved: v2 = even, v3 = odd
+ ld2 {v4.2s,v5.2s}, [x2], #16 // 4 more filter floats, de-interleaved
+ ld1 {v6.2s}, [x2], #8 // 2 more; only lane 0 is used below
+ add x2, x2, #8 // skip 8 bytes: x2 advances 64 bytes per iteration in total
+ mov v4.d[1], v5.d[0] // v4 = {v4.lo, v5.lo}, matching the layout of v20/v21
+ mov v6.s[1], v6.s[0] // broadcast lane 0 across v6.2s
+ fmul v6.2s, v6.2s, v24.2s // centre term: that factor times {re6, im6}
+ fmul v0.4s, v2.4s, v16.4s // v0 / v1 hold four partial sums each for the two output floats
+ fmul v1.4s, v2.4s, v17.4s
+ fmls v0.4s, v3.4s, v18.4s
+ fmla v1.4s, v3.4s, v19.4s
+ fmla v0.4s, v4.4s, v20.4s
+ fmla v1.4s, v4.4s, v21.4s
+ faddp v0.4s, v0.4s, v1.4s // pairwise sums of v0 (lanes 0..1) and v1 (lanes 2..3)
+ faddp v0.4s, v0.4s, v0.4s // lane 0 = sum of old v0, lane 1 = sum of old v1
+ fadd v0.2s, v0.2s, v6.2s // add the centre term
+ st1 {v0.2s}, [x0], x3 // store 2 floats, then x0 += x3 bytes
+ subs w4, w4, #1 // w4 -= 1: one output pair per iteration
+ b.gt 1b // repeat while w4 > 0
 ret
 endfunc
@@ -33,81 +33,81 @@ const tab_x2, align=4
endconst
 function ff_opus_deemphasis_neon, export=1
- movrel x4, tab_st
- ld1 {v4.4s}, [x4]
- movrel x4, tab_x0
- ld1 {v5.4s}, [x4]
- movrel x4, tab_x1
- ld1 {v6.4s}, [x4]
- movrel x4, tab_x2
- ld1 {v7.4s}, [x4]
+ movrel x4, tab_st // x4 is scratch: it holds each table address in turn
+ ld1 {v4.4s}, [x4] // v4 = tab_st (4 floats)
+ movrel x4, tab_x0
+ ld1 {v5.4s}, [x4] // v5 = tab_x0
+ movrel x4, tab_x1
+ ld1 {v6.4s}, [x4] // v6 = tab_x1
+ movrel x4, tab_x2
+ ld1 {v7.4s}, [x4] // v7 = tab_x2
- fmul v0.4s, v4.4s, v0.s[0]
+ fmul v0.4s, v4.4s, v0.s[0] // v0 = tab_st * incoming s0 (presumably the carried-in filter state; confirm against the C prototype)
-1: ld1 {v1.4s, v2.4s}, [x1], #32
+1: ld1 {v1.4s, v2.4s}, [x1], #32 // load 8 input floats in[0..7] from x1, x1 += 32
- fmla v0.4s, v5.4s, v1.s[0]
- fmul v3.4s, v7.4s, v2.s[2]
+ fmla v0.4s, v5.4s, v1.s[0] // v0 += tab_x0 * in[0]
+ fmul v3.4s, v7.4s, v2.s[2] // v3 = tab_x2 * in[6] (second-half chain, interleaved with the first)
- fmla v0.4s, v6.4s, v1.s[1]
- fmla v3.4s, v6.4s, v2.s[1]
+ fmla v0.4s, v6.4s, v1.s[1] // v0 += tab_x1 * in[1]
+ fmla v3.4s, v6.4s, v2.s[1] // v3 += tab_x1 * in[5]
- fmla v0.4s, v7.4s, v1.s[2]
- fmla v3.4s, v5.4s, v2.s[0]
+ fmla v0.4s, v7.4s, v1.s[2] // v0 += tab_x2 * in[2]
+ fmla v3.4s, v5.4s, v2.s[0] // v3 += tab_x0 * in[4]
- fadd v1.4s, v1.4s, v0.4s
- fadd v2.4s, v2.4s, v3.4s
+ fadd v1.4s, v1.4s, v0.4s // out[0..3] = in[0..3] + v0
+ fadd v2.4s, v2.4s, v3.4s // in[4..7] + v3, still missing the out[3] term
- fmla v2.4s, v4.4s, v1.s[3]
+ fmla v2.4s, v4.4s, v1.s[3] // out[4..7] += tab_st * out[3]
- st1 {v1.4s, v2.4s}, [x0], #32
- fmul v0.4s, v4.4s, v2.s[3]
+ st1 {v1.4s, v2.4s}, [x0], #32 // store 8 output floats to x0, x0 += 32
+ fmul v0.4s, v4.4s, v2.s[3] // seed the next iteration: v0 = tab_st * out[7]
- subs w2, w2, #8
- b.gt 1b
+ subs w2, w2, #8 // w2 -= 8: eight samples per iteration
+ b.gt 1b // repeat while w2 > 0
- mov s0, v2.s[3]
+ mov s0, v2.s[3] // leave the last output sample in s0 as the float result
 ret
 endfunc
 function ff_opus_postfilter_neon, export=1
- ld1 {v0.4s}, [x2]
- dup v1.4s, v0.s[1]
- dup v2.4s, v0.s[2]
- dup v0.4s, v0.s[0]
+ ld1 {v0.4s}, [x2] // load 4 floats from x2; lanes 0..2 are used as gains g0, g1, g2
+ dup v1.4s, v0.s[1] // v1 = g1 in every lane
+ dup v2.4s, v0.s[2] // v2 = g2 in every lane
+ dup v0.4s, v0.s[0] // v0 = g0 in every lane (done last: it overwrites the source)
- add w1, w1, #2
- sub x1, x0, x1, lsl #2
+ add w1, w1, #2 // w1 += 2
+ sub x1, x0, x1, lsl #2 // x1 = x0 - 4 * w1: (incoming w1 + 2) floats before x0
- ld1 {v3.4s}, [x1]
- fmul v3.4s, v3.4s, v2.4s
+ ld1 {v3.4s}, [x1] // tap window t0 = 4 floats at x1
+ fmul v3.4s, v3.4s, v2.4s // v3 = g2 * t0
-1: add x1, x1, #4
- ld1 {v4.4s}, [x1]
- add x1, x1, #4
- ld1 {v5.4s}, [x1]
- add x1, x1, #4
- ld1 {v6.4s}, [x1]
- add x1, x1, #4
- ld1 {v7.4s}, [x1]
+1: add x1, x1, #4 // slide the window forward by one float
+ ld1 {v4.4s}, [x1] // t1
+ add x1, x1, #4
+ ld1 {v5.4s}, [x1] // t2 (centre tap)
+ add x1, x1, #4
+ ld1 {v6.4s}, [x1] // t3
+ add x1, x1, #4 // x1 has advanced 16 bytes in total, in step with x0
+ ld1 {v7.4s}, [x1] // t4
- fmla v3.4s, v7.4s, v2.4s
- fadd v6.4s, v6.4s, v4.4s
+ fmla v3.4s, v7.4s, v2.4s // v3 = g2 * (t0 + t4)
+ fadd v6.4s, v6.4s, v4.4s // v6 = t1 + t3
- ld1 {v4.4s}, [x0]
- fmla v4.4s, v5.4s, v0.4s
+ ld1 {v4.4s}, [x0] // current 4 samples at x0 (v4 is reused)
+ fmla v4.4s, v5.4s, v0.4s // += g0 * t2
- fmul v6.4s, v6.4s, v1.4s
- fadd v6.4s, v6.4s, v3.4s
+ fmul v6.4s, v6.4s, v1.4s // v6 = g1 * (t1 + t3)
+ fadd v6.4s, v6.4s, v3.4s // v6 += g2 * (t0 + t4)
- fadd v4.4s, v4.4s, v6.4s
- fmul v3.4s, v7.4s, v2.4s
+ fadd v4.4s, v4.4s, v6.4s // samples + all five weighted taps
+ fmul v3.4s, v7.4s, v2.4s // this t4 is the next iteration's t0: start its g2 term now
- st1 {v4.4s}, [x0], #16
+ st1 {v4.4s}, [x0], #16 // write the 4 filtered samples back in place, x0 += 16
- subs w3, w3, #4
- b.gt 1b
+ subs w3, w3, #4 // w3 -= 4: four samples per iteration
+ b.gt 1b // repeat while w3 > 0
 ret
 endfunc
@@ -21,57 +21,57 @@
#include "libavutil/aarch64/asm.S"
 function ff_resample_common_apply_filter_x4_float_neon, export=1
- movi v0.4s, #0 // accumulator
-1: ld1 {v1.4s}, [x1], #16 // src[0..3]
- ld1 {v2.4s}, [x2], #16 // filter[0..3]
- fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
- subs w3, w3, #4 // filter_length -= 4
- b.gt 1b // loop until filter_length
- faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- st1 {v0.s}[0], [x0], #4 // write accumulator
+ movi v0.4s, #0 // accumulator: four float partial sums, cleared
+1: ld1 {v1.4s}, [x1], #16 // src[0..3], src += 4 floats
+ ld1 {v2.4s}, [x2], #16 // filter[0..3], filter += 4 floats
+ fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3] (lane-wise)
+ subs w3, w3, #4 // filter_length -= 4
+ b.gt 1b // loop while filter_length > 0 (the body always runs at least once)
+ faddp v0.4s, v0.4s, v0.4s // horizontal add, step 1: 4 partial sums -> 2
+ faddp v0.4s, v0.4s, v0.4s // horizontal add, step 2: 2 -> 1, total now in lane 0
+ st1 {v0.s}[0], [x0], #4 // write the single float result to dst
 ret
 endfunc
 function ff_resample_common_apply_filter_x8_float_neon, export=1
- movi v0.4s, #0 // accumulator
-1: ld1 {v1.4s}, [x1], #16 // src[0..3]
- ld1 {v2.4s}, [x2], #16 // filter[0..3]
- ld1 {v3.4s}, [x1], #16 // src[4..7]
- ld1 {v4.4s}, [x2], #16 // filter[4..7]
- fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
- fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7]
- subs w3, w3, #8 // filter_length -= 8
- b.gt 1b // loop until filter_length
- faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- st1 {v0.s}[0], [x0], #4 // write accumulator
+ movi v0.4s, #0 // accumulator: four float partial sums, cleared
+1: ld1 {v1.4s}, [x1], #16 // src[0..3]
+ ld1 {v2.4s}, [x2], #16 // filter[0..3]
+ ld1 {v3.4s}, [x1], #16 // src[4..7]
+ ld1 {v4.4s}, [x2], #16 // filter[4..7]
+ fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
+ fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7] (same accumulator, so the two fmla are dependent)
+ subs w3, w3, #8 // filter_length -= 8
+ b.gt 1b // loop while filter_length > 0 (the body always runs at least once)
+ faddp v0.4s, v0.4s, v0.4s // horizontal add, step 1: 4 partial sums -> 2
+ faddp v0.4s, v0.4s, v0.4s // horizontal add, step 2: 2 -> 1, total now in lane 0
+ st1 {v0.s}[0], [x0], #4 // write the single float result to dst
 ret
 endfunc
 function ff_resample_common_apply_filter_x4_s16_neon, export=1
- movi v0.4s, #0 // accumulator
-1: ld1 {v1.4h}, [x1], #8 // src[0..3]
- ld1 {v2.4h}, [x2], #8 // filter[0..3]
- smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
- subs w3, w3, #4 // filter_length -= 4
- b.gt 1b // loop until filter_length
- addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- st1 {v0.s}[0], [x0], #4 // write accumulator
+ movi v0.4s, #0 // accumulator: four 32-bit partial sums, cleared
+1: ld1 {v1.4h}, [x1], #8 // src[0..3] (16-bit), src += 4 samples
+ ld1 {v2.4h}, [x2], #8 // filter[0..3] (16-bit), filter += 4 coefficients
+ smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3], signed 16x16 -> 32-bit products
+ subs w3, w3, #4 // filter_length -= 4
+ b.gt 1b // loop while filter_length > 0 (the body always runs at least once)
+ addp v0.4s, v0.4s, v0.4s // horizontal add, step 1: 4 partial sums -> 2
+ addp v0.4s, v0.4s, v0.4s // horizontal add, step 2: 2 -> 1, total now in lane 0
+ st1 {v0.s}[0], [x0], #4 // write the unscaled 32-bit sum to dst
 ret
 endfunc
 function ff_resample_common_apply_filter_x8_s16_neon, export=1
- movi v0.4s, #0 // accumulator
-1: ld1 {v1.8h}, [x1], #16 // src[0..7]
- ld1 {v2.8h}, [x2], #16 // filter[0..7]
- smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
- smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7]
- subs w3, w3, #8 // filter_length -= 8
- b.gt 1b // loop until filter_length
- addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
- st1 {v0.s}[0], [x0], #4 // write accumulator
+ movi v0.4s, #0 // accumulator: four 32-bit partial sums, cleared
+1: ld1 {v1.8h}, [x1], #16 // src[0..7] (16-bit)
+ ld1 {v2.8h}, [x2], #16 // filter[0..7] (16-bit)
+ smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3] (low halves, widened to 32-bit)
+ smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7] (high halves)
+ subs w3, w3, #8 // filter_length -= 8
+ b.gt 1b // loop while filter_length > 0 (the body always runs at least once)
+ addp v0.4s, v0.4s, v0.4s // horizontal add, step 1: 4 partial sums -> 2
+ addp v0.4s, v0.4s, v0.4s // horizontal add, step 2: 2 -> 1, total now in lane 0
+ st1 {v0.s}[0], [x0], #4 // write the unscaled 32-bit sum to dst
 ret
 endfunc
@@ -41,53 +41,53 @@
;----------------------------------------------------------------------------- */
 function ff_hscale8to15_X8_neon, export=1
- sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
-1: ldr w8, [x5], #4 // filterPos[idx]
- ldr w0, [x5], #4 // filterPos[idx + 1]
- ldr w11, [x5], #4 // filterPos[idx + 2]
- ldr w9, [x5], #4 // filterPos[idx + 3]
- mov x16, x4 // filter0 = filter
- add x12, x16, x7 // filter1 = filter0 + filterSize*2
- add x13, x12, x7 // filter2 = filter1 + filterSize*2
- add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
- add x17, x3, w8, uxtw // srcp + filterPos[0]
- add x8, x3, w0, uxtw // srcp + filterPos[1]
- add x0, x3, w11, uxtw // srcp + filterPos[2]
- add x11, x3, w9, uxtw // srcp + filterPos[3]
- mov w15, w6 // filterSize counter
-2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
- ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
- ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
- uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
- smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
- smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
- ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
- ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
- smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
- smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
- smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
- subs w15, w15, #8 // j -= 8: processed 8/filterSize
- uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
- smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
- smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
- b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
- addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
- addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
- subs w2, w2, #4 // dstW -= 4
- sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
- st1 {v0.4h}, [x1], #8 // write to destination part0123
- b.gt 1b // loop until end of line
+ sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16), sign-extended from w6 to 64 bits
+1: ldr w8, [x5], #4 // filterPos[idx]
+ ldr w0, [x5], #4 // filterPos[idx + 1]
+ ldr w11, [x5], #4 // filterPos[idx + 2]
+ ldr w9, [x5], #4 // filterPos[idx + 3]
+ mov x16, x4 // filter0 = filter
+ add x12, x16, x7 // filter1 = filter0 + filterSize*2
+ add x13, x12, x7 // filter2 = filter1 + filterSize*2
+ add x4, x13, x7 // filter3 = filter2 + filterSize*2; x4 itself is the filter3 cursor, so the inner loop leaves it at the next group of four filters
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
+ add x17, x3, w8, uxtw // srcp + filterPos[0]
+ add x8, x3, w0, uxtw // srcp + filterPos[1]
+ add x0, x3, w11, uxtw // srcp + filterPos[2]
+ add x11, x3, w9, uxtw // srcp + filterPos[3]
+ mov w15, w6 // filterSize counter
+2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
+ smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
+ smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
+ ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
+ smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
+ smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
+ smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+ subs w15, w15, #8 // j -= 8: processed 8/filterSize (flags stay live until the b.gt below; nothing in between sets them)
+ uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
+ smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+ smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ b.gt 2b // inner loop if filterSize not consumed completely
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding: lane i = full sum for dst[i]
+ subs w2, w2, #4 // dstW -= 4 (flags stay live until the b.gt below)
+ sqshrn v0.4h, v0.4s, #7 // shift right by 7 and saturate the 4 sums to 4x16-bit final values
+ st1 {v0.4h}, [x1], #8 // write to destination part0123
+ b.gt 1b // loop until end of line
 ret
 endfunc
@@ -103,98 +103,98 @@ function ff_hscale8to15_X4_neon, export=1
// This function for filter sizes that are 4 mod 8. In other words, anything that's 0 mod 4 but not
// 0 mod 8. It also assumes that dstW is 0 mod 4.
- lsl w7, w6, #1 // w7 = filterSize * 2
+ lsl w7, w6, #1 // w7 = filterSize * 2
1:
- ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
- ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
+ ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
+ ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
+ movi v16.2d, #0 // initialize accumulator for idx + 0
+ movi v17.2d, #0 // initialize accumulator for idx + 1
+ movi v18.2d, #0 // initialize accumulator for idx + 2
+ movi v19.2d, #0 // initialize accumulator for idx + 3
- mov x12, x4 // filter pointer for idx + 0
- add x13, x4, x7 // filter pointer for idx + 1
- add x8, x3, w8, uxtw // srcp + filterPos[idx + 0]
- add x9, x3, w9, uxtw // srcp + filterPos[idx + 1]
+ mov x12, x4 // filter pointer for idx + 0
+ add x13, x4, x7 // filter pointer for idx + 1
+ add x8, x3, w8, uxtw // srcp + filterPos[idx + 0]
+ add x9, x3, w9, uxtw // srcp + filterPos[idx + 1]
- add x14, x13, x7 // filter pointer for idx + 2
- add x10, x3, w10, uxtw // srcp + filterPos[idx + 2]
- add x11, x3, w11, uxtw // srcp + filterPos[idx + 3]
+ add x14, x13, x7 // filter pointer for idx + 2
+ add x10, x3, w10, uxtw // srcp + filterPos[idx + 2]
+ add x11, x3, w11, uxtw // srcp + filterPos[idx + 3]
- mov w0, w6 // copy filterSize to a temp register, w0
- add x5, x5, #16 // advance the filterPos pointer
- add x15, x14, x7 // filter pointer for idx + 3
- mov x16, xzr // temp register for offsetting filter pointers
+ mov w0, w6 // copy filterSize to a temp register, w0
+ add x5, x5, #16 // advance the filterPos pointer
+ add x15, x14, x7 // filter pointer for idx + 3
+ mov x16, xzr // temp register for offsetting filter pointers
2:
// This section loops over 8-wide chunks of filter size
- ldr d4, [x8], #8 // load 8 bytes from srcp for idx + 0
- ldr q0, [x12, x16] // load 8 values, 16 bytes from filter for idx + 0
+ ldr d4, [x8], #8 // load 8 bytes from srcp for idx + 0
+ ldr q0, [x12, x16] // load 8 values, 16 bytes from filter for idx + 0
- ldr d5, [x9], #8 // load 8 bytes from srcp for idx + 1
- ldr q1, [x13, x16] // load 8 values, 16 bytes from filter for idx + 1
+ ldr d5, [x9], #8 // load 8 bytes from srcp for idx + 1
+ ldr q1, [x13, x16] // load 8 values, 16 bytes from filter for idx + 1
- uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
- uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
+ uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
+ uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
- ldr d6, [x10], #8 // load 8 bytes from srcp for idx + 2
- ldr q2, [x14, x16] // load 8 values, 16 bytes from filter for idx + 2
+ ldr d6, [x10], #8 // load 8 bytes from srcp for idx + 2
+ ldr q2, [x14, x16] // load 8 values, 16 bytes from filter for idx + 2
- smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
- smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
+ smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
+ smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
- ldr d7, [x11], #8 // load 8 bytes from srcp for idx + 3
- ldr q3, [x15, x16] // load 8 values, 16 bytes from filter for idx + 3
+ ldr d7, [x11], #8 // load 8 bytes from srcp for idx + 3
+ ldr q3, [x15, x16] // load 8 values, 16 bytes from filter for idx + 3
- sub w0, w0, #8 // decrement the remaining filterSize counter
- smlal2 v16.4s, v0.8h, v4.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0
- smlal2 v17.4s, v1.8h, v5.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1
- uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
- uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
- smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
- smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
+ sub w0, w0, #8 // decrement the remaining filterSize counter
+ smlal2 v16.4s, v0.8h, v4.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0
+ smlal2 v17.4s, v1.8h, v5.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1
+ uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
+ uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
+ smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
+ smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
- cmp w0, #8 // are there at least 8 more elements in filter to consume?
- add x16, x16, #16 // advance the offsetting register for filter values
+ cmp w0, #8 // are there at least 8 more elements in filter to consume?
+ add x16, x16, #16 // advance the offsetting register for filter values
- smlal2 v18.4s, v2.8h, v6.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2
- smlal2 v19.4s, v3.8h, v7.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3
+ smlal2 v18.4s, v2.8h, v6.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2
+ smlal2 v19.4s, v3.8h, v7.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3
- b.ge 2b // branch back to inner loop
+ b.ge 2b // branch back to inner loop
// complete the remaining 4 filter elements
- sub x17, x7, #8 // calculate the offset of the filter pointer for the remaining 4 elements
-
- ldr s4, [x8] // load 4 bytes from srcp for idx + 0
- ldr d0, [x12, x17] // load 4 values, 8 bytes from filter for idx + 0
- ldr s5, [x9] // load 4 bytes from srcp for idx + 1
- ldr d1, [x13, x17] // load 4 values, 8 bytes from filter for idx + 1
-
- uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
- uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
-
- ldr s6, [x10] // load 4 bytes from srcp for idx + 2
- ldr d2, [x14, x17] // load 4 values, 8 bytes from filter for idx + 2
- smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
- smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
- ldr s7, [x11] // load 4 bytes from srcp for idx + 3
- ldr d3, [x15, x17] // load 4 values, 8 bytes from filter for idx + 3
-
- uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
- uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
- addp v16.4s, v16.4s, v17.4s // horizontal pair adding for idx 0,1
- smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
- smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
-
- addp v18.4s, v18.4s, v19.4s // horizontal pair adding for idx 2,3
- addp v16.4s, v16.4s, v18.4s // final horizontal pair adding producing one vector with results for idx = 0..3
-
- subs w2, w2, #4 // dstW -= 4
- sqshrn v0.4h, v16.4s, #7 // shift and clip the 2x16-bit final values
- st1 {v0.4h}, [x1], #8 // write to destination idx 0..3
- add x4, x4, x7, lsl #2 // filter += (filterSize*2) * 4
- b.gt 1b // loop until end of line
+ sub x17, x7, #8 // calculate the offset of the filter pointer for the remaining 4 elements
+
+ ldr s4, [x8] // load 4 bytes from srcp for idx + 0
+ ldr d0, [x12, x17] // load 4 values, 8 bytes from filter for idx + 0
+ ldr s5, [x9] // load 4 bytes from srcp for idx + 1
+ ldr d1, [x13, x17] // load 4 values, 8 bytes from filter for idx + 1
+
+ uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
+ uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
+
+ ldr s6, [x10] // load 4 bytes from srcp for idx + 2
+ ldr d2, [x14, x17] // load 4 values, 8 bytes from filter for idx + 2
+ smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
+ smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
+ ldr s7, [x11] // load 4 bytes from srcp for idx + 3
+ ldr d3, [x15, x17] // load 4 values, 8 bytes from filter for idx + 3
+
+ uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
+ uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
+ addp v16.4s, v16.4s, v17.4s // horizontal pair adding for idx 0,1
+ smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
+ smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
+
+ addp v18.4s, v18.4s, v19.4s // horizontal pair adding for idx 2,3
+ addp v16.4s, v16.4s, v18.4s // final horizontal pair adding producing one vector with results for idx = 0..3
+
+ subs w2, w2, #4 // dstW -= 4
+ sqshrn v0.4h, v16.4s, #7 // shift and clip the 2x16-bit final values
+ st1 {v0.4h}, [x1], #8 // write to destination idx 0..3
+ add x4, x4, x7, lsl #2 // filter += (filterSize*2) * 4
+ b.gt 1b // loop until end of line
ret
endfunc
@@ -219,132 +219,132 @@ function ff_hscale8to15_4_neon, export=1
// 3. Complete madd
// 4. Complete remaining iterations when dstW % 8 != 0
- sub sp, sp, #32 // allocate 32 bytes on the stack
- cmp w2, #16 // if dstW <16, skip to the last block used for wrapping up
- b.lt 2f
+ sub sp, sp, #32 // allocate 32 bytes on the stack
+ cmp w2, #16 // if dstW <16, skip to the last block used for wrapping up
+ b.lt 2f
// load 8 values from filterPos to be used as offsets into src
- ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
- ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
- ldp w12, w13, [x5, #16] // filterPos[idx + 4], [idx + 5]
- ldp w14, w15, [x5, #24] // filterPos[idx + 6], [idx + 7]
- add x5, x5, #32 // advance filterPos
+ ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
+ ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
+ ldp w12, w13, [x5, #16] // filterPos[idx + 4], [idx + 5]
+ ldp w14, w15, [x5, #24] // filterPos[idx + 6], [idx + 7]
+ add x5, x5, #32 // advance filterPos
// gather random access data from src into contiguous memory
- ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]][0..3]
- ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]][0..3]
- ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]][0..3]
- ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]][0..3]
- ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]][0..3]
- ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]][0..3]
- ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]][0..3]
- ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]][0..3]
- stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
- stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
- stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
- stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
+ ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]][0..3]
+ ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]][0..3]
+ ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]][0..3]
+ ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]][0..3]
+ ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]][0..3]
+ ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]][0..3]
+ ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]][0..3]
+ ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]][0..3]
+ stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
+ stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
+ stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
+ stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
1:
- ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers
+ ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers
// load 8 values from filterPos to be used as offsets into src
- ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration
- ldp w10, w11, [x5, #8] // filterPos[idx + 2][0..3], [idx + 3][0..3], next iteration
- ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
- ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
+ ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration
+ ldp w10, w11, [x5, #8] // filterPos[idx + 2][0..3], [idx + 3][0..3], next iteration
+ ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
+ ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
- movi v0.2d, #0 // Clear madd accumulator for idx 0..3
- movi v5.2d, #0 // Clear madd accumulator for idx 4..7
+ movi v0.2d, #0 // Clear madd accumulator for idx 0..3
+ movi v5.2d, #0 // Clear madd accumulator for idx 4..7
- ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
+ ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
- add x5, x5, #32 // advance filterPos
+ add x5, x5, #32 // advance filterPos
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
- uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
- uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
- ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]], next iteration
- ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]], next iteration
- uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit
- uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit
- ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]], next iteration
- ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]], next iteration
-
- smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
- smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
- ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]], next iteration
- ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]], next iteration
- smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
- smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
- ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]], next iteration
- ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]], next iteration
-
- smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
- smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
- stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
- stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
- smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
- smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
- stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
- stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
-
- sub w2, w2, #8 // dstW -= 8
- sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
- sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
- st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
- cmp w2, #16 // continue on main loop if there are at least 16 iterations left
- b.ge 1b
+ uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
+ ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]], next iteration
+ ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]], next iteration
+ uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
+ ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]], next iteration
+ ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]], next iteration
+
+ smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
+ smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
+ ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]], next iteration
+ ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]], next iteration
+ smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
+ smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
+ ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]], next iteration
+ ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]], next iteration
+
+ smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
+ smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
+ stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
+ stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
+ smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
+ smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
+ stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
+ stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
+
+ sub w2, w2, #8 // dstW -= 8
+ sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
+ sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
+ st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
+ cmp w2, #16 // continue on main loop if there are at least 16 iterations left
+ b.ge 1b
// last full iteration
- ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
- ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
+ ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
+ ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
- movi v0.2d, #0 // Clear madd accumulator for idx 0..3
- movi v5.2d, #0 // Clear madd accumulator for idx 4..7
+ movi v0.2d, #0 // Clear madd accumulator for idx 0..3
+ movi v5.2d, #0 // Clear madd accumulator for idx 4..7
- uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit
- uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit
- uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit
- uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit
+ uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
+ uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
- smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
- smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
- smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
- smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
+ smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
+ smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
+ smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
+ smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
- smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
- smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
- smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
- smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
+ smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
+ smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
+ smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
+ smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
- subs w2, w2, #8 // dstW -= 8
- sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
- sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
- st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
+ subs w2, w2, #8 // dstW -= 8
+ sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
+ sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
+ st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
- cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
+ cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
- add sp, sp, #32 // clean up stack
+ add sp, sp, #32 // clean up stack
ret
// finish up when dstW % 8 != 0 or dstW < 16
2:
// load src
- ldr w8, [x5], #4 // filterPos[i]
- add x9, x3, w8, uxtw // calculate the address for src load
- ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
+ ldr w8, [x5], #4 // filterPos[i]
+ add x9, x3, w8, uxtw // calculate the address for src load
+ ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
// load filter
- ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
+ ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
- uxtl v5.8h, v5.8b // unsigned exten long, convert src data to 16-bit
- smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...]
- addv s0, v0.4s // add up products of src and filter values
- sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value
- st1 {v0.h}[0], [x1], #2 // dst[i] = ...
- sub w2, w2, #1 // dstW--
- cbnz w2, 2b
+ uxtl v5.8h, v5.8b // unsigned extend long, convert src data to 16-bit
+ smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...]
+ addv s0, v0.4s // add up products of src and filter values
+ sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value
+ st1 {v0.h}[0], [x1], #2 // dst[i] = ...
+ sub w2, w2, #1 // dstW--
+ cbnz w2, 2b
- add sp, sp, #32 // clean up stack
+ add sp, sp, #32 // clean up stack
ret
endfunc
@@ -357,187 +357,187 @@ function ff_hscale8to19_4_neon, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- movi v18.4s, #1
- movi v17.4s, #1
- shl v18.4s, v18.4s, #19
- sub v18.4s, v18.4s, v17.4s // max allowed value
+ movi v18.4s, #1
+ movi v17.4s, #1
+ shl v18.4s, v18.4s, #19
+ sub v18.4s, v18.4s, v17.4s // max allowed value
- cmp w2, #16
- b.lt 2f // move to last block
+ cmp w2, #16
+ b.lt 2f // move to last block
- ldp w8, w9, [x5] // filterPos[0], filterPos[1]
- ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
- ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
- ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
- add x5, x5, #32
+ ldp w8, w9, [x5] // filterPos[0], filterPos[1]
+ ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
+ ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
+ ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
+ add x5, x5, #32
// load data from
- ldr w8, [x3, w8, uxtw]
- ldr w9, [x3, w9, uxtw]
- ldr w10, [x3, w10, uxtw]
- ldr w11, [x3, w11, uxtw]
- ldr w12, [x3, w12, uxtw]
- ldr w13, [x3, w13, uxtw]
- ldr w14, [x3, w14, uxtw]
- ldr w15, [x3, w15, uxtw]
-
- sub sp, sp, #32
-
- stp w8, w9, [sp]
- stp w10, w11, [sp, #8]
- stp w12, w13, [sp, #16]
- stp w14, w15, [sp, #24]
+ ldr w8, [x3, w8, uxtw]
+ ldr w9, [x3, w9, uxtw]
+ ldr w10, [x3, w10, uxtw]
+ ldr w11, [x3, w11, uxtw]
+ ldr w12, [x3, w12, uxtw]
+ ldr w13, [x3, w13, uxtw]
+ ldr w14, [x3, w14, uxtw]
+ ldr w15, [x3, w15, uxtw]
+
+ sub sp, sp, #32
+
+ stp w8, w9, [sp]
+ stp w10, w11, [sp, #8]
+ stp w12, w13, [sp, #16]
+ stp w14, w15, [sp, #24]
1:
- ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
- ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
+ ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
+ ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
// load filterPositions into registers for next iteration
- ldp w8, w9, [x5] // filterPos[0], filterPos[1]
- ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
- ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
- ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
- add x5, x5, #32
- uxtl v0.8h, v0.8b
- ldr w8, [x3, w8, uxtw]
- smull v5.4s, v0.4h, v28.4h // multiply first column of src
- ldr w9, [x3, w9, uxtw]
- smull2 v6.4s, v0.8h, v28.8h
- stp w8, w9, [sp]
-
- uxtl v1.8h, v1.8b
- ldr w10, [x3, w10, uxtw]
- smlal v5.4s, v1.4h, v29.4h // multiply second column of src
- ldr w11, [x3, w11, uxtw]
- smlal2 v6.4s, v1.8h, v29.8h
- stp w10, w11, [sp, #8]
-
- uxtl v2.8h, v2.8b
- ldr w12, [x3, w12, uxtw]
- smlal v5.4s, v2.4h, v30.4h // multiply third column of src
- ldr w13, [x3, w13, uxtw]
- smlal2 v6.4s, v2.8h, v30.8h
- stp w12, w13, [sp, #16]
-
- uxtl v3.8h, v3.8b
- ldr w14, [x3, w14, uxtw]
- smlal v5.4s, v3.4h, v31.4h // multiply fourth column of src
- ldr w15, [x3, w15, uxtw]
- smlal2 v6.4s, v3.8h, v31.8h
- stp w14, w15, [sp, #24]
-
- sub w2, w2, #8
- sshr v5.4s, v5.4s, #3
- sshr v6.4s, v6.4s, #3
- smin v5.4s, v5.4s, v18.4s
- smin v6.4s, v6.4s, v18.4s
-
- st1 {v5.4s, v6.4s}, [x1], #32
- cmp w2, #16
- b.ge 1b
+ ldp w8, w9, [x5] // filterPos[0], filterPos[1]
+ ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
+ ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
+ ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
+ add x5, x5, #32
+ uxtl v0.8h, v0.8b
+ ldr w8, [x3, w8, uxtw]
+ smull v5.4s, v0.4h, v28.4h // multiply first column of src
+ ldr w9, [x3, w9, uxtw]
+ smull2 v6.4s, v0.8h, v28.8h
+ stp w8, w9, [sp]
+
+ uxtl v1.8h, v1.8b
+ ldr w10, [x3, w10, uxtw]
+ smlal v5.4s, v1.4h, v29.4h // multiply second column of src
+ ldr w11, [x3, w11, uxtw]
+ smlal2 v6.4s, v1.8h, v29.8h
+ stp w10, w11, [sp, #8]
+
+ uxtl v2.8h, v2.8b
+ ldr w12, [x3, w12, uxtw]
+ smlal v5.4s, v2.4h, v30.4h // multiply third column of src
+ ldr w13, [x3, w13, uxtw]
+ smlal2 v6.4s, v2.8h, v30.8h
+ stp w12, w13, [sp, #16]
+
+ uxtl v3.8h, v3.8b
+ ldr w14, [x3, w14, uxtw]
+ smlal v5.4s, v3.4h, v31.4h // multiply fourth column of src
+ ldr w15, [x3, w15, uxtw]
+ smlal2 v6.4s, v3.8h, v31.8h
+ stp w14, w15, [sp, #24]
+
+ sub w2, w2, #8
+ sshr v5.4s, v5.4s, #3
+ sshr v6.4s, v6.4s, #3
+ smin v5.4s, v5.4s, v18.4s
+ smin v6.4s, v6.4s, v18.4s
+
+ st1 {v5.4s, v6.4s}, [x1], #32
+ cmp w2, #16
+ b.ge 1b
// here we make last iteration, without updating the registers
- ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
- ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
-
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- smull v5.4s, v0.4h, v28.4h
- smull2 v6.4s, v0.8h, v28.8h
- uxtl v2.8h, v2.8b
- smlal v5.4s, v1.4h, v29.4h
- smlal2 v6.4s, v1.8h, v29.8h
- uxtl v3.8h, v3.8b
- smlal v5.4s, v2.4h, v30.4h
- smlal2 v6.4s, v2.8h, v30.8h
- smlal v5.4s, v3.4h, v31.4h
- smlal2 v6.4s, v3.8h, v31.8h
-
- sshr v5.4s, v5.4s, #3
- sshr v6.4s, v6.4s, #3
-
- smin v5.4s, v5.4s, v18.4s
- smin v6.4s, v6.4s, v18.4s
-
- sub w2, w2, #8
- st1 {v5.4s, v6.4s}, [x1], #32
- add sp, sp, #32 // restore stack
- cbnz w2, 2f
+ ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
+ ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
+
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ smull v5.4s, v0.4h, v28.4h
+ smull2 v6.4s, v0.8h, v28.8h
+ uxtl v2.8h, v2.8b
+ smlal v5.4s, v1.4h, v29.4h
+ smlal2 v6.4s, v1.8h, v29.8h
+ uxtl v3.8h, v3.8b
+ smlal v5.4s, v2.4h, v30.4h
+ smlal2 v6.4s, v2.8h, v30.8h
+ smlal v5.4s, v3.4h, v31.4h
+ smlal2 v6.4s, v3.8h, v31.8h
+
+ sshr v5.4s, v5.4s, #3
+ sshr v6.4s, v6.4s, #3
+
+ smin v5.4s, v5.4s, v18.4s
+ smin v6.4s, v6.4s, v18.4s
+
+ sub w2, w2, #8
+ st1 {v5.4s, v6.4s}, [x1], #32
+ add sp, sp, #32 // restore stack
+ cbnz w2, 2f
ret
2:
- ldr w8, [x5], #4 // load filterPos
- add x9, x3, w8, uxtw // src + filterPos
- ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single
- ld1 {v31.4h}, [x4], #8
- uxtl v0.8h, v0.8b
- smull v5.4s, v0.4h, v31.4h
- saddlv d0, v5.4s
- sqshrn s0, d0, #3
- smin v0.4s, v0.4s, v18.4s
- st1 {v0.s}[0], [x1], #4
- sub w2, w2, #1
- cbnz w2, 2b // if iterations remain jump to beginning
+ ldr w8, [x5], #4 // load filterPos
+ add x9, x3, w8, uxtw // src + filterPos
+ ld1 {v0.s}[0], [x9] // load 4 x uint8_t from src + filterPos into a single 32-bit lane
+ ld1 {v31.4h}, [x4], #8
+ uxtl v0.8h, v0.8b
+ smull v5.4s, v0.4h, v31.4h
+ saddlv d0, v5.4s
+ sqshrn s0, d0, #3
+ smin v0.4s, v0.4s, v18.4s
+ st1 {v0.s}[0], [x1], #4
+ sub w2, w2, #1
+ cbnz w2, 2b // if iterations remain jump to beginning
ret
endfunc
function ff_hscale8to19_X8_neon, export=1
- movi v20.4s, #1
- movi v17.4s, #1
- shl v20.4s, v20.4s, #19
- sub v20.4s, v20.4s, v17.4s
+ movi v20.4s, #1 // start building the clamp constant
+ movi v17.4s, #1 // constant 1, subtracted below
+ shl v20.4s, v20.4s, #19 // 1 << 19
+ sub v20.4s, v20.4s, v17.4s // v20 = (1 << 19) - 1, max allowed value
- sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
+ sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
1:
- mov x16, x4 // filter0 = filter
- ldr w8, [x5], #4 // filterPos[idx]
- add x12, x16, x7 // filter1 = filter0 + filterSize*2
- ldr w0, [x5], #4 // filterPos[idx + 1]
- add x13, x12, x7 // filter2 = filter1 + filterSize*2
- ldr w11, [x5], #4 // filterPos[idx + 2]
- add x4, x13, x7 // filter3 = filter2 + filterSize*2
- ldr w9, [x5], #4 // filterPos[idx + 3]
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
- add x17, x3, w8, uxtw // srcp + filterPos[0]
- add x8, x3, w0, uxtw // srcp + filterPos[1]
- add x0, x3, w11, uxtw // srcp + filterPos[2]
- add x11, x3, w9, uxtw // srcp + filterPos[3]
- mov w15, w6 // filterSize counter
-2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
- uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
- smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
- ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
- smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
- ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
- ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
- uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
- ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
- smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
- smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
- smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
- smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
- subs w15, w15, #8 // j -= 8: processed 8/filterSize
- smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
- b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
- addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
- addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
- subs w2, w2, #4 // dstW -= 4
- sshr v0.4s, v0.4s, #3 // shift and clip the 2x16-bit final values
- smin v0.4s, v0.4s, v20.4s
- st1 {v0.4s}, [x1], #16 // write to destination part0123
- b.gt 1b // loop until end of line
+ mov x16, x4 // filter0 = filter
+ ldr w8, [x5], #4 // filterPos[idx]
+ add x12, x16, x7 // filter1 = filter0 + filterSize*2
+ ldr w0, [x5], #4 // filterPos[idx + 1]
+ add x13, x12, x7 // filter2 = filter1 + filterSize*2
+ ldr w11, [x5], #4 // filterPos[idx + 2]
+ add x4, x13, x7 // filter3 = filter2 + filterSize*2
+ ldr w9, [x5], #4 // filterPos[idx + 3]
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
+ add x17, x3, w8, uxtw // srcp + filterPos[0]
+ add x8, x3, w0, uxtw // srcp + filterPos[1]
+ add x0, x3, w11, uxtw // srcp + filterPos[2]
+ add x11, x3, w9, uxtw // srcp + filterPos[3]
+ mov w15, w6 // j = filterSize, inner-loop counter
+2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
+ smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
+ ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
+ smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
+ uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
+ smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
+ smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+ smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
+ smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+ subs w15, w15, #8 // j -= 8: processed 8/filterSize
+ smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ b.gt 2b // inner loop if filterSize not consumed completely
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
+ subs w2, w2, #4 // dstW -= 4
+ sshr v0.4s, v0.4s, #3 // shift the four 32-bit sums right by 3
+ smin v0.4s, v0.4s, v20.4s // clip to the (1 << 19) - 1 maximum held in v20
+ st1 {v0.4s}, [x1], #16 // write to destination part0123
+ b.gt 1b // loop until end of line
ret
endfunc
@@ -550,91 +550,91 @@ function ff_hscale8to19_X4_neon, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- movi v20.4s, #1
- movi v17.4s, #1
- shl v20.4s, v20.4s, #19
- sub v20.4s, v20.4s, v17.4s
+ movi v20.4s, #1
+ movi v17.4s, #1
+ shl v20.4s, v20.4s, #19
+ sub v20.4s, v20.4s, v17.4s
- lsl w7, w6, #1
+ lsl w7, w6, #1
1:
- ldp w8, w9, [x5]
- ldp w10, w11, [x5, #8]
-
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
-
- mov x12, x4 // filter + 0
- add x13, x4, x7 // filter + 1
- add x8, x3, w8, uxtw // srcp + filterPos 0
- add x14, x13, x7 // filter + 2
- add x9, x3, w9, uxtw // srcp + filterPos 1
- add x15, x14, x7 // filter + 3
- add x10, x3, w10, uxtw // srcp + filterPos 2
- mov w0, w6 // save the filterSize to temporary variable
- add x11, x3, w11, uxtw // srcp + filterPos 3
- add x5, x5, #16 // advance filter position
- mov x16, xzr // clear the register x16 used for offsetting the filter values
+ ldp w8, w9, [x5]
+ ldp w10, w11, [x5, #8]
+
+ movi v16.2d, #0 // initialize accumulator for idx + 0
+ movi v17.2d, #0 // initialize accumulator for idx + 1
+ movi v18.2d, #0 // initialize accumulator for idx + 2
+ movi v19.2d, #0 // initialize accumulator for idx + 3
+
+ mov x12, x4 // filter + 0
+ add x13, x4, x7 // filter + 1
+ add x8, x3, w8, uxtw // srcp + filterPos 0
+ add x14, x13, x7 // filter + 2
+ add x9, x3, w9, uxtw // srcp + filterPos 1
+ add x15, x14, x7 // filter + 3
+ add x10, x3, w10, uxtw // srcp + filterPos 2
+ mov w0, w6 // save the filterSize to temporary variable
+ add x11, x3, w11, uxtw // srcp + filterPos 3
+ add x5, x5, #16 // advance filter position
+ mov x16, xzr // clear the register x16 used for offsetting the filter values
2:
- ldr d4, [x8], #8 // load src values for idx 0
- ldr q31, [x12, x16] // load filter values for idx 0
- uxtl v4.8h, v4.8b // extend type to match the filter' size
- ldr d5, [x9], #8 // load src values for idx 1
- smlal v16.4s, v4.4h, v31.4h // multiplication of lower half for idx 0
- uxtl v5.8h, v5.8b // extend type to match the filter' size
- ldr q30, [x13, x16] // load filter values for idx 1
- smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0
- ldr d6, [x10], #8 // load src values for idx 2
- ldr q29, [x14, x16] // load filter values for idx 2
- smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1
- ldr d7, [x11], #8 // load src values for idx 3
- smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1
- uxtl v6.8h, v6.8b // extend tpye to matchi the filter's size
- ldr q28, [x15, x16] // load filter values for idx 3
- smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2
- uxtl v7.8h, v7.8b
- smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2
- sub w0, w0, #8
- smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3
- cmp w0, #8
- smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3
- add x16, x16, #16 // advance filter values indexing
-
- b.ge 2b
+ ldr d4, [x8], #8 // load src values for idx 0
+ ldr q31, [x12, x16] // load filter values for idx 0
+ uxtl v4.8h, v4.8b // extend type to match the filter's size
+ ldr d5, [x9], #8 // load src values for idx 1
+ smlal v16.4s, v4.4h, v31.4h // multiplication of lower half for idx 0
+ uxtl v5.8h, v5.8b // extend type to match the filter's size
+ ldr q30, [x13, x16] // load filter values for idx 1
+ smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0
+ ldr d6, [x10], #8 // load src values for idx 2
+ ldr q29, [x14, x16] // load filter values for idx 2
+ smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1
+ ldr d7, [x11], #8 // load src values for idx 3
+ smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1
+ uxtl v6.8h, v6.8b // extend type to match the filter's size
+ ldr q28, [x15, x16] // load filter values for idx 3
+ smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2
+ uxtl v7.8h, v7.8b
+ smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2
+ sub w0, w0, #8
+ smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3
+ cmp w0, #8
+ smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3
+ add x16, x16, #16 // advance filter values indexing
+
+ b.ge 2b
// 4 iterations left
- sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements
-
- ldr s4, [x8] // load src values for idx 0
- ldr d31, [x12, x17] // load filter values for idx 0
- uxtl v4.8h, v4.8b // extend type to match the filter' size
- ldr s5, [x9] // load src values for idx 1
- smlal v16.4s, v4.4h, v31.4h
- ldr d30, [x13, x17] // load filter values for idx 1
- uxtl v5.8h, v5.8b // extend type to match the filter' size
- ldr s6, [x10] // load src values for idx 2
- smlal v17.4s, v5.4h, v30.4h
- uxtl v6.8h, v6.8b // extend type to match the filter's size
- ldr d29, [x14, x17] // load filter values for idx 2
- ldr s7, [x11] // load src values for idx 3
- addp v16.4s, v16.4s, v17.4s
- uxtl v7.8h, v7.8b
- ldr d28, [x15, x17] // load filter values for idx 3
- smlal v18.4s, v6.4h, v29.4h
- smlal v19.4s, v7.4h, v28.4h
- subs w2, w2, #4
- addp v18.4s, v18.4s, v19.4s
- addp v16.4s, v16.4s, v18.4s
- sshr v16.4s, v16.4s, #3
- smin v16.4s, v16.4s, v20.4s
-
- st1 {v16.4s}, [x1], #16
- add x4, x4, x7, lsl #2
- b.gt 1b
+ sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements
+
+ ldr s4, [x8] // load src values for idx 0
+ ldr d31, [x12, x17] // load filter values for idx 0
+ uxtl v4.8h, v4.8b // extend type to match the filter's size
+ ldr s5, [x9] // load src values for idx 1
+ smlal v16.4s, v4.4h, v31.4h
+ ldr d30, [x13, x17] // load filter values for idx 1
+ uxtl v5.8h, v5.8b // extend type to match the filter's size
+ ldr s6, [x10] // load src values for idx 2
+ smlal v17.4s, v5.4h, v30.4h
+ uxtl v6.8h, v6.8b // extend type to match the filter's size
+ ldr d29, [x14, x17] // load filter values for idx 2
+ ldr s7, [x11] // load src values for idx 3
+ addp v16.4s, v16.4s, v17.4s
+ uxtl v7.8h, v7.8b
+ ldr d28, [x15, x17] // load filter values for idx 3
+ smlal v18.4s, v6.4h, v29.4h
+ smlal v19.4s, v7.4h, v28.4h
+ subs w2, w2, #4
+ addp v18.4s, v18.4s, v19.4s
+ addp v16.4s, v16.4s, v18.4s
+ sshr v16.4s, v16.4s, #3
+ smin v16.4s, v16.4s, v20.4s
+
+ st1 {v16.4s}, [x1], #16
+ add x4, x4, x7, lsl #2
+ b.gt 1b
ret
endfunc
@@ -647,191 +647,191 @@ function ff_hscale16to15_4_neon_asm, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- movi v18.4s, #1
- movi v17.4s, #1
- shl v18.4s, v18.4s, #15
- sub v18.4s, v18.4s, v17.4s // max allowed value
- dup v17.4s, w0 // read shift
- neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
+ movi v18.4s, #1
+ movi v17.4s, #1
+ shl v18.4s, v18.4s, #15
+ sub v18.4s, v18.4s, v17.4s // max allowed value
+ dup v17.4s, w0 // read shift
+ neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
- cmp w2, #16
- b.lt 2f // move to last block
+ cmp w2, #16
+ b.lt 2f // move to last block
- ldp w8, w9, [x5] // filterPos[0], filterPos[1]
- ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
- ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
- ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
- add x5, x5, #32
+ ldp w8, w9, [x5] // filterPos[0], filterPos[1]
+ ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
+ ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
+ ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
+ add x5, x5, #32
// shift all filterPos left by one, as uint16_t will be read
- lsl x8, x8, #1
- lsl x9, x9, #1
- lsl x10, x10, #1
- lsl x11, x11, #1
- lsl x12, x12, #1
- lsl x13, x13, #1
- lsl x14, x14, #1
- lsl x15, x15, #1
+ lsl x8, x8, #1
+ lsl x9, x9, #1
+ lsl x10, x10, #1
+ lsl x11, x11, #1
+ lsl x12, x12, #1
+ lsl x13, x13, #1
+ lsl x14, x14, #1
+ lsl x15, x15, #1
// load src with given offset
- ldr x8, [x3, w8, uxtw]
- ldr x9, [x3, w9, uxtw]
- ldr x10, [x3, w10, uxtw]
- ldr x11, [x3, w11, uxtw]
- ldr x12, [x3, w12, uxtw]
- ldr x13, [x3, w13, uxtw]
- ldr x14, [x3, w14, uxtw]
- ldr x15, [x3, w15, uxtw]
-
- sub sp, sp, #64
+ ldr x8, [x3, w8, uxtw]
+ ldr x9, [x3, w9, uxtw]
+ ldr x10, [x3, w10, uxtw]
+ ldr x11, [x3, w11, uxtw]
+ ldr x12, [x3, w12, uxtw]
+ ldr x13, [x3, w13, uxtw]
+ ldr x14, [x3, w14, uxtw]
+ ldr x15, [x3, w15, uxtw]
+
+ sub sp, sp, #64
// push src on stack so it can be loaded into vectors later
- stp x8, x9, [sp]
- stp x10, x11, [sp, #16]
- stp x12, x13, [sp, #32]
- stp x14, x15, [sp, #48]
+ stp x8, x9, [sp]
+ stp x10, x11, [sp, #16]
+ stp x12, x13, [sp, #32]
+ stp x14, x15, [sp, #48]
1:
- ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
- ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
+ ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
+ ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
// Each of blocks does the following:
// Extend src and filter to 32 bits with uxtl and sxtl
// multiply or multiply and accumulate results
// Extending to 32 bits is necessary, as unit16_t values can't
// be represented as int16_t without type promotion.
- uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4h
- uxtl2 v0.4s, v0.8h
- mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8h
- uxtl v26.4s, v1.4h
- mul v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v29.4h
- uxtl2 v0.4s, v1.8h
- mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v29.8h
- uxtl v26.4s, v2.4h
- mla v6.4s, v28.4s, v0.4s
-
- sxtl v27.4s, v30.4h
- uxtl2 v0.4s, v2.8h
- mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v30.8h
- uxtl v26.4s, v3.4h
- mla v6.4s, v28.4s, v0.4s
-
- sxtl v27.4s, v31.4h
- uxtl2 v0.4s, v3.8h
- mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v31.8h
- sub w2, w2, #8
- mla v6.4s, v28.4s, v0.4s
-
- sshl v5.4s, v5.4s, v17.4s
- sshl v6.4s, v6.4s, v17.4s
- smin v5.4s, v5.4s, v18.4s
- smin v6.4s, v6.4s, v18.4s
- xtn v5.4h, v5.4s
- xtn2 v5.8h, v6.4s
-
- st1 {v5.8h}, [x1], #16
- cmp w2, #16
+ uxtl v26.4s, v0.4h
+ sxtl v27.4s, v28.4h
+ uxtl2 v0.4s, v0.8h
+ mul v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v28.8h
+ uxtl v26.4s, v1.4h
+ mul v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v29.4h
+ uxtl2 v0.4s, v1.8h
+ mla v5.4s, v27.4s, v26.4s
+ sxtl2 v28.4s, v29.8h
+ uxtl v26.4s, v2.4h
+ mla v6.4s, v28.4s, v0.4s
+
+ sxtl v27.4s, v30.4h
+ uxtl2 v0.4s, v2.8h
+ mla v5.4s, v27.4s, v26.4s
+ sxtl2 v28.4s, v30.8h
+ uxtl v26.4s, v3.4h
+ mla v6.4s, v28.4s, v0.4s
+
+ sxtl v27.4s, v31.4h
+ uxtl2 v0.4s, v3.8h
+ mla v5.4s, v27.4s, v26.4s
+ sxtl2 v28.4s, v31.8h
+ sub w2, w2, #8
+ mla v6.4s, v28.4s, v0.4s
+
+ sshl v5.4s, v5.4s, v17.4s
+ sshl v6.4s, v6.4s, v17.4s
+ smin v5.4s, v5.4s, v18.4s
+ smin v6.4s, v6.4s, v18.4s
+ xtn v5.4h, v5.4s
+ xtn2 v5.8h, v6.4s
+
+ st1 {v5.8h}, [x1], #16
+ cmp w2, #16
// load filterPositions into registers for next iteration
- ldp w8, w9, [x5] // filterPos[0], filterPos[1]
- ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
- ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
- ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
- add x5, x5, #32
-
- lsl x8, x8, #1
- lsl x9, x9, #1
- lsl x10, x10, #1
- lsl x11, x11, #1
- lsl x12, x12, #1
- lsl x13, x13, #1
- lsl x14, x14, #1
- lsl x15, x15, #1
-
- ldr x8, [x3, w8, uxtw]
- ldr x9, [x3, w9, uxtw]
- ldr x10, [x3, w10, uxtw]
- ldr x11, [x3, w11, uxtw]
- ldr x12, [x3, w12, uxtw]
- ldr x13, [x3, w13, uxtw]
- ldr x14, [x3, w14, uxtw]
- ldr x15, [x3, w15, uxtw]
-
- stp x8, x9, [sp]
- stp x10, x11, [sp, #16]
- stp x12, x13, [sp, #32]
- stp x14, x15, [sp, #48]
-
- b.ge 1b
+ ldp w8, w9, [x5] // filterPos[0], filterPos[1]
+ ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
+ ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
+ ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
+ add x5, x5, #32
+
+ lsl x8, x8, #1
+ lsl x9, x9, #1
+ lsl x10, x10, #1
+ lsl x11, x11, #1
+ lsl x12, x12, #1
+ lsl x13, x13, #1
+ lsl x14, x14, #1
+ lsl x15, x15, #1
+
+ ldr x8, [x3, w8, uxtw]
+ ldr x9, [x3, w9, uxtw]
+ ldr x10, [x3, w10, uxtw]
+ ldr x11, [x3, w11, uxtw]
+ ldr x12, [x3, w12, uxtw]
+ ldr x13, [x3, w13, uxtw]
+ ldr x14, [x3, w14, uxtw]
+ ldr x15, [x3, w15, uxtw]
+
+ stp x8, x9, [sp]
+ stp x10, x11, [sp, #16]
+ stp x12, x13, [sp, #32]
+ stp x14, x15, [sp, #48]
+
+ b.ge 1b
// here we make last iteration, without updating the registers
- ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
- ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
-
- uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4h
- uxtl2 v0.4s, v0.8h
- mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8h
- uxtl v26.4s, v1.4h
- mul v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v29.4h
- uxtl2 v0.4s, v1.8h
- mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v29.8h
- uxtl v26.4s, v2.4h
- mla v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v30.4h
- uxtl2 v0.4s, v2.8h
- mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v30.8h
- uxtl v26.4s, v3.4h
- mla v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v31.4h
- uxtl2 v0.4s, v3.8h
- mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v31.8h
- subs w2, w2, #8
- mla v6.4s, v0.4s, v28.4s
-
- sshl v5.4s, v5.4s, v17.4s
- sshl v6.4s, v6.4s, v17.4s
- smin v5.4s, v5.4s, v18.4s
- smin v6.4s, v6.4s, v18.4s
- xtn v5.4h, v5.4s
- xtn2 v5.8h, v6.4s
-
- st1 {v5.8h}, [x1], #16
- add sp, sp, #64 // restore stack
- cbnz w2, 2f
+ ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
+ ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+
+ uxtl v26.4s, v0.4h
+ sxtl v27.4s, v28.4h
+ uxtl2 v0.4s, v0.8h
+ mul v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v28.8h
+ uxtl v26.4s, v1.4h
+ mul v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v29.4h
+ uxtl2 v0.4s, v1.8h
+ mla v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v29.8h
+ uxtl v26.4s, v2.4h
+ mla v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v30.4h
+ uxtl2 v0.4s, v2.8h
+ mla v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v30.8h
+ uxtl v26.4s, v3.4h
+ mla v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v31.4h
+ uxtl2 v0.4s, v3.8h
+ mla v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v31.8h
+ subs w2, w2, #8
+ mla v6.4s, v0.4s, v28.4s
+
+ sshl v5.4s, v5.4s, v17.4s
+ sshl v6.4s, v6.4s, v17.4s
+ smin v5.4s, v5.4s, v18.4s
+ smin v6.4s, v6.4s, v18.4s
+ xtn v5.4h, v5.4s
+ xtn2 v5.8h, v6.4s
+
+ st1 {v5.8h}, [x1], #16
+ add sp, sp, #64 // restore stack
+ cbnz w2, 2f
ret
2:
- ldr w8, [x5], #4 // load filterPos
- lsl w8, w8, #1
- add x9, x3, w8, uxtw // src + filterPos
- ld1 {v0.4h}, [x9] // load 4 * uint16_t
- ld1 {v31.4h}, [x4], #8
-
- uxtl v0.4s, v0.4h
- sxtl v31.4s, v31.4h
- mul v5.4s, v0.4s, v31.4s
- addv s0, v5.4s
- sshl v0.4s, v0.4s, v17.4s
- smin v0.4s, v0.4s, v18.4s
- st1 {v0.h}[0], [x1], #2
- sub w2, w2, #1
- cbnz w2, 2b // if iterations remain jump to beginning
+ ldr w8, [x5], #4 // load filterPos
+ lsl w8, w8, #1
+ add x9, x3, w8, uxtw // src + filterPos
+ ld1 {v0.4h}, [x9] // load 4 * uint16_t
+ ld1 {v31.4h}, [x4], #8
+
+ uxtl v0.4s, v0.4h
+ sxtl v31.4s, v31.4h
+ mul v5.4s, v0.4s, v31.4s
+ addv s0, v5.4s
+ sshl v0.4s, v0.4s, v17.4s
+ smin v0.4s, v0.4s, v18.4s
+ st1 {v0.h}[0], [x1], #2
+ sub w2, w2, #1
+ cbnz w2, 2b // if iterations remain jump to beginning
ret
endfunc
@@ -845,79 +845,79 @@ function ff_hscale16to15_X8_neon_asm, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- movi v20.4s, #1
- movi v21.4s, #1
- shl v20.4s, v20.4s, #15
- sub v20.4s, v20.4s, v21.4s
- dup v21.4s, w0
- neg v21.4s, v21.4s
-
- sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
-1: ldr w8, [x5], #4 // filterPos[idx]
- lsl w8, w8, #1
- ldr w10, [x5], #4 // filterPos[idx + 1]
- lsl w10, w10, #1
- ldr w11, [x5], #4 // filterPos[idx + 2]
- lsl w11, w11, #1
- ldr w9, [x5], #4 // filterPos[idx + 3]
- lsl w9, w9, #1
- mov x16, x4 // filter0 = filter
- add x12, x16, x7 // filter1 = filter0 + filterSize*2
- add x13, x12, x7 // filter2 = filter1 + filterSize*2
- add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
- add x17, x3, w8, uxtw // srcp + filterPos[0]
- add x8, x3, w10, uxtw // srcp + filterPos[1]
- add x10, x3, w11, uxtw // srcp + filterPos[2]
- add x11, x3, w9, uxtw // srcp + filterPos[3]
- mov w15, w6 // filterSize counter
-2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
- ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
- ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
- uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
- sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
- uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
- mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
- sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
- uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
- mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
- sxtl v27.4s, v7.4h // exted filter lower half
- uxtl2 v6.4s, v6.8h // extend srcp upper half
- sxtl2 v7.4s, v7.8h // extend filter upper half
- ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
- mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v22.4s, v16.4h // extend srcp lower half
- sxtl v23.4s, v17.4h // extend filter lower half
- uxtl2 v16.4s, v16.8h // extend srcp upper half
- sxtl2 v17.4s, v17.8h // extend filter upper half
- mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
- mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
- subs w15, w15, #8 // j -= 8: processed 8/filterSize
- uxtl v28.4s, v18.4h // extend srcp lower half
- sxtl v29.4s, v19.4h // extend filter lower half
- uxtl2 v18.4s, v18.8h // extend srcp upper half
- sxtl2 v19.4s, v19.8h // extend filter upper half
- mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
- mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
- b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
- addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
- addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
- subs w2, w2, #4 // dstW -= 4
- sshl v0.4s, v0.4s, v21.4s // shift right (effectively rigth, as shift is negative); overflow expected
- smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
- xtn v0.4h, v0.4s // narrow down to 16 bits
-
- st1 {v0.4h}, [x1], #8 // write to destination part0123
- b.gt 1b // loop until end of line
+ movi v20.4s, #1
+ movi v21.4s, #1
+ shl v20.4s, v20.4s, #15
+ sub v20.4s, v20.4s, v21.4s
+ dup v21.4s, w0
+ neg v21.4s, v21.4s
+
+ sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
+1: ldr w8, [x5], #4 // filterPos[idx]
+ lsl w8, w8, #1
+ ldr w10, [x5], #4 // filterPos[idx + 1]
+ lsl w10, w10, #1
+ ldr w11, [x5], #4 // filterPos[idx + 2]
+ lsl w11, w11, #1
+ ldr w9, [x5], #4 // filterPos[idx + 3]
+ lsl w9, w9, #1
+ mov x16, x4 // filter0 = filter
+ add x12, x16, x7 // filter1 = filter0 + filterSize*2
+ add x13, x12, x7 // filter2 = filter1 + filterSize*2
+ add x4, x13, x7 // filter3 = filter2 + filterSize*2
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
+ add x17, x3, w8, uxtw // srcp + filterPos[0]
+ add x8, x3, w10, uxtw // srcp + filterPos[1]
+ add x10, x3, w11, uxtw // srcp + filterPos[2]
+ add x11, x3, w9, uxtw // srcp + filterPos[3]
+ mov w15, w6 // filterSize counter
+2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
+ sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
+ uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
+ mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
+ sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
+ uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
+ mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
+ sxtl v27.4s, v7.4h // extend filter lower half
+ uxtl2 v6.4s, v6.8h // extend srcp upper half
+ sxtl2 v7.4s, v7.8h // extend filter upper half
+ ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
+ mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v22.4s, v16.4h // extend srcp lower half
+ sxtl v23.4s, v17.4h // extend filter lower half
+ uxtl2 v16.4s, v16.8h // extend srcp upper half
+ sxtl2 v17.4s, v17.8h // extend filter upper half
+ mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
+ mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+ subs w15, w15, #8 // j -= 8: processed 8/filterSize
+ uxtl v28.4s, v18.4h // extend srcp lower half
+ sxtl v29.4s, v19.4h // extend filter lower half
+ uxtl2 v18.4s, v18.8h // extend srcp upper half
+ sxtl2 v19.4s, v19.8h // extend filter upper half
+ mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+ mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ b.gt 2b // inner loop if filterSize not consumed completely
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
+ subs w2, w2, #4 // dstW -= 4
+ sshl v0.4s, v0.4s, v21.4s // shift right (effectively right, as shift is negative); overflow expected
+ smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
+ xtn v0.4h, v0.4s // narrow down to 16 bits
+
+ st1 {v0.4h}, [x1], #8 // write to destination part0123
+ b.gt 1b // loop until end of line
ret
endfunc
@@ -930,118 +930,118 @@ function ff_hscale16to15_X4_neon_asm, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- stp d8, d9, [sp, #-0x20]!
- stp d10, d11, [sp, #0x10]
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
- movi v18.4s, #1
- movi v17.4s, #1
- shl v18.4s, v18.4s, #15
- sub v21.4s, v18.4s, v17.4s // max allowed value
- dup v17.4s, w0 // read shift
- neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
+ movi v18.4s, #1
+ movi v17.4s, #1
+ shl v18.4s, v18.4s, #15
+ sub v21.4s, v18.4s, v17.4s // max allowed value
+ dup v17.4s, w0 // read shift
+ neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
- lsl w7, w6, #1
+ lsl w7, w6, #1
1:
- ldp w8, w9, [x5]
- ldp w10, w11, [x5, #8]
-
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
-
- mov x12, x4 // filter + 0
- add x13, x4, x7 // filter + 1
- add x8, x3, x8, lsl #1 // srcp + filterPos 0
- add x14, x13, x7 // filter + 2
- add x9, x3, x9, lsl #1 // srcp + filterPos 1
- add x15, x14, x7 // filter + 3
- add x10, x3, x10, lsl #1 // srcp + filterPos 2
- mov w0, w6 // save the filterSize to temporary variable
- add x11, x3, x11, lsl #1 // srcp + filterPos 3
- add x5, x5, #16 // advance filter position
- mov x16, xzr // clear the register x16 used for offsetting the filter values
+ ldp w8, w9, [x5]
+ ldp w10, w11, [x5, #8]
+
+ movi v16.2d, #0 // initialize accumulator for idx + 0
+ movi v17.2d, #0 // initialize accumulator for idx + 1
+ movi v18.2d, #0 // initialize accumulator for idx + 2
+ movi v19.2d, #0 // initialize accumulator for idx + 3
+
+ mov x12, x4 // filter + 0
+ add x13, x4, x7 // filter + 1
+ add x8, x3, x8, lsl #1 // srcp + filterPos 0
+ add x14, x13, x7 // filter + 2
+ add x9, x3, x9, lsl #1 // srcp + filterPos 1
+ add x15, x14, x7 // filter + 3
+ add x10, x3, x10, lsl #1 // srcp + filterPos 2
+ mov w0, w6 // save the filterSize to temporary variable
+ add x11, x3, x11, lsl #1 // srcp + filterPos 3
+ add x5, x5, #16 // advance filter position
+ mov x16, xzr // clear the register x16 used for offsetting the filter values
2:
- ldr q4, [x8], #16 // load src values for idx 0
- ldr q5, [x9], #16 // load src values for idx 1
- uxtl v26.4s, v4.4h
- uxtl2 v4.4s, v4.8h
- ldr q31, [x12, x16] // load filter values for idx 0
- ldr q6, [x10], #16 // load src values for idx 2
- sxtl v22.4s, v31.4h
- sxtl2 v31.4s, v31.8h
- mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0
- uxtl v25.4s, v5.4h
- uxtl2 v5.4s, v5.8h
- ldr q30, [x13, x16] // load filter values for idx 1
- ldr q7, [x11], #16 // load src values for idx 3
- mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0
- uxtl v24.4s, v6.4h
- sxtl v8.4s, v30.4h
- sxtl2 v30.4s, v30.8h
- mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1
- ldr q29, [x14, x16] // load filter values for idx 2
- uxtl2 v6.4s, v6.8h
- sxtl v9.4s, v29.4h
- sxtl2 v29.4s, v29.8h
- mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1
- mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2
- ldr q28, [x15, x16] // load filter values for idx 3
- uxtl v23.4s, v7.4h
- sxtl v10.4s, v28.4h
- mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2
- uxtl2 v7.4s, v7.8h
- sxtl2 v28.4s, v28.8h
- mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3
- sub w0, w0, #8
- cmp w0, #8
- mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3
-
- add x16, x16, #16 // advance filter values indexing
-
- b.ge 2b
+ ldr q4, [x8], #16 // load src values for idx 0
+ ldr q5, [x9], #16 // load src values for idx 1
+ uxtl v26.4s, v4.4h
+ uxtl2 v4.4s, v4.8h
+ ldr q31, [x12, x16] // load filter values for idx 0
+ ldr q6, [x10], #16 // load src values for idx 2
+ sxtl v22.4s, v31.4h
+ sxtl2 v31.4s, v31.8h
+ mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0
+ uxtl v25.4s, v5.4h
+ uxtl2 v5.4s, v5.8h
+ ldr q30, [x13, x16] // load filter values for idx 1
+ ldr q7, [x11], #16 // load src values for idx 3
+ mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0
+ uxtl v24.4s, v6.4h
+ sxtl v8.4s, v30.4h
+ sxtl2 v30.4s, v30.8h
+ mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1
+ ldr q29, [x14, x16] // load filter values for idx 2
+ uxtl2 v6.4s, v6.8h
+ sxtl v9.4s, v29.4h
+ sxtl2 v29.4s, v29.8h
+ mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1
+ mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2
+ ldr q28, [x15, x16] // load filter values for idx 3
+ uxtl v23.4s, v7.4h
+ sxtl v10.4s, v28.4h
+ mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2
+ uxtl2 v7.4s, v7.8h
+ sxtl2 v28.4s, v28.8h
+ mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3
+ sub w0, w0, #8
+ cmp w0, #8
+ mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3
+
+ add x16, x16, #16 // advance filter values indexing
+
+ b.ge 2b
// 4 iterations left
- sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements
-
- ldr d4, [x8] // load src values for idx 0
- ldr d31, [x12, x17] // load filter values for idx 0
- uxtl v4.4s, v4.4h
- sxtl v31.4s, v31.4h
- ldr d5, [x9] // load src values for idx 1
- mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0
- ldr d30, [x13, x17] // load filter values for idx 1
- uxtl v5.4s, v5.4h
- sxtl v30.4s, v30.4h
- ldr d6, [x10] // load src values for idx 2
- mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1
- ldr d29, [x14, x17] // load filter values for idx 2
- uxtl v6.4s, v6.4h
- sxtl v29.4s, v29.4h
- ldr d7, [x11] // load src values for idx 3
- ldr d28, [x15, x17] // load filter values for idx 3
- mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2
- uxtl v7.4s, v7.4h
- sxtl v28.4s, v28.4h
- addp v16.4s, v16.4s, v17.4s
- mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3
- subs w2, w2, #4
- addp v18.4s, v18.4s, v19.4s
- addp v16.4s, v16.4s, v18.4s
- sshl v16.4s, v16.4s, v20.4s
- smin v16.4s, v16.4s, v21.4s
- xtn v16.4h, v16.4s
-
- st1 {v16.4h}, [x1], #8
- add x4, x4, x7, lsl #2
- b.gt 1b
-
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
-
- add sp, sp, #0x20
+ sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements
+
+ ldr d4, [x8] // load src values for idx 0
+ ldr d31, [x12, x17] // load filter values for idx 0
+ uxtl v4.4s, v4.4h
+ sxtl v31.4s, v31.4h
+ ldr d5, [x9] // load src values for idx 1
+ mla v16.4s, v4.4s, v31.4s // multiply-accumulate of the last 4 elements for idx 0
+ ldr d30, [x13, x17] // load filter values for idx 1
+ uxtl v5.4s, v5.4h
+ sxtl v30.4s, v30.4h
+ ldr d6, [x10] // load src values for idx 2
+ mla v17.4s, v5.4s, v30.4s // multiply-accumulate of the last 4 elements for idx 1
+ ldr d29, [x14, x17] // load filter values for idx 2
+ uxtl v6.4s, v6.4h
+ sxtl v29.4s, v29.4h
+ ldr d7, [x11] // load src values for idx 3
+ ldr d28, [x15, x17] // load filter values for idx 3
+ mla v18.4s, v6.4s, v29.4s // multiply-accumulate of the last 4 elements for idx 2
+ uxtl v7.4s, v7.4h
+ sxtl v28.4s, v28.4h
+ addp v16.4s, v16.4s, v17.4s
+ mla v19.4s, v7.4s, v28.4s // multiply-accumulate of the last 4 elements for idx 3
+ subs w2, w2, #4
+ addp v18.4s, v18.4s, v19.4s
+ addp v16.4s, v16.4s, v18.4s
+ sshl v16.4s, v16.4s, v20.4s
+ smin v16.4s, v16.4s, v21.4s
+ xtn v16.4h, v16.4s
+
+ st1 {v16.4h}, [x1], #8
+ add x4, x4, x7, lsl #2
+ b.gt 1b
+
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #0x10]
+
+ add sp, sp, #0x20
ret
endfunc
@@ -1055,188 +1055,188 @@ function ff_hscale16to19_4_neon_asm, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- movi v18.4s, #1
- movi v17.4s, #1
- shl v18.4s, v18.4s, #19
- sub v18.4s, v18.4s, v17.4s // max allowed value
- dup v17.4s, w0 // read shift
- neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
+ movi v18.4s, #1
+ movi v17.4s, #1
+ shl v18.4s, v18.4s, #19
+ sub v18.4s, v18.4s, v17.4s // max allowed value
+ dup v17.4s, w0 // read shift
+ neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
- cmp w2, #16
- b.lt 2f // move to last block
+ cmp w2, #16
+ b.lt 2f // move to last block
- ldp w8, w9, [x5] // filterPos[0], filterPos[1]
- ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
- ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
- ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
- add x5, x5, #32
+ ldp w8, w9, [x5] // filterPos[0], filterPos[1]
+ ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
+ ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
+ ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
+ add x5, x5, #32
// shift all filterPos left by one, as uint16_t will be read
- lsl x8, x8, #1
- lsl x9, x9, #1
- lsl x10, x10, #1
- lsl x11, x11, #1
- lsl x12, x12, #1
- lsl x13, x13, #1
- lsl x14, x14, #1
- lsl x15, x15, #1
+ lsl x8, x8, #1
+ lsl x9, x9, #1
+ lsl x10, x10, #1
+ lsl x11, x11, #1
+ lsl x12, x12, #1
+ lsl x13, x13, #1
+ lsl x14, x14, #1
+ lsl x15, x15, #1
// load src with given offset
- ldr x8, [x3, w8, uxtw]
- ldr x9, [x3, w9, uxtw]
- ldr x10, [x3, w10, uxtw]
- ldr x11, [x3, w11, uxtw]
- ldr x12, [x3, w12, uxtw]
- ldr x13, [x3, w13, uxtw]
- ldr x14, [x3, w14, uxtw]
- ldr x15, [x3, w15, uxtw]
-
- sub sp, sp, #64
+ ldr x8, [x3, w8, uxtw]
+ ldr x9, [x3, w9, uxtw]
+ ldr x10, [x3, w10, uxtw]
+ ldr x11, [x3, w11, uxtw]
+ ldr x12, [x3, w12, uxtw]
+ ldr x13, [x3, w13, uxtw]
+ ldr x14, [x3, w14, uxtw]
+ ldr x15, [x3, w15, uxtw]
+
+ sub sp, sp, #64
// push src on stack so it can be loaded into vectors later
- stp x8, x9, [sp]
- stp x10, x11, [sp, #16]
- stp x12, x13, [sp, #32]
- stp x14, x15, [sp, #48]
+ stp x8, x9, [sp]
+ stp x10, x11, [sp, #16]
+ stp x12, x13, [sp, #32]
+ stp x14, x15, [sp, #48]
1:
- ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
- ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
+ ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
+ ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
// Each of blocks does the following:
// Extend src and filter to 32 bits with uxtl and sxtl
// multiply or multiply and accumulate results
// Extending to 32 bits is necessary, as unit16_t values can't
// be represented as int16_t without type promotion.
- uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4h
- uxtl2 v0.4s, v0.8h
- mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8h
- uxtl v26.4s, v1.4h
- mul v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v29.4h
- uxtl2 v0.4s, v1.8h
- mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v29.8h
- uxtl v26.4s, v2.4h
- mla v6.4s, v28.4s, v0.4s
-
- sxtl v27.4s, v30.4h
- uxtl2 v0.4s, v2.8h
- mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v30.8h
- uxtl v26.4s, v3.4h
- mla v6.4s, v28.4s, v0.4s
-
- sxtl v27.4s, v31.4h
- uxtl2 v0.4s, v3.8h
- mla v5.4s, v27.4s, v26.4s
- sxtl2 v28.4s, v31.8h
- sub w2, w2, #8
- mla v6.4s, v28.4s, v0.4s
-
- sshl v5.4s, v5.4s, v17.4s
- sshl v6.4s, v6.4s, v17.4s
- smin v5.4s, v5.4s, v18.4s
- smin v6.4s, v6.4s, v18.4s
-
- st1 {v5.4s, v6.4s}, [x1], #32
- cmp w2, #16
+ uxtl v26.4s, v0.4h
+ sxtl v27.4s, v28.4h
+ uxtl2 v0.4s, v0.8h
+ mul v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v28.8h
+ uxtl v26.4s, v1.4h
+ mul v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v29.4h
+ uxtl2 v0.4s, v1.8h
+ mla v5.4s, v27.4s, v26.4s
+ sxtl2 v28.4s, v29.8h
+ uxtl v26.4s, v2.4h
+ mla v6.4s, v28.4s, v0.4s
+
+ sxtl v27.4s, v30.4h
+ uxtl2 v0.4s, v2.8h
+ mla v5.4s, v27.4s, v26.4s
+ sxtl2 v28.4s, v30.8h
+ uxtl v26.4s, v3.4h
+ mla v6.4s, v28.4s, v0.4s
+
+ sxtl v27.4s, v31.4h
+ uxtl2 v0.4s, v3.8h
+ mla v5.4s, v27.4s, v26.4s
+ sxtl2 v28.4s, v31.8h
+ sub w2, w2, #8
+ mla v6.4s, v28.4s, v0.4s
+
+ sshl v5.4s, v5.4s, v17.4s
+ sshl v6.4s, v6.4s, v17.4s
+ smin v5.4s, v5.4s, v18.4s
+ smin v6.4s, v6.4s, v18.4s
+
+ st1 {v5.4s, v6.4s}, [x1], #32
+ cmp w2, #16
// load filterPositions into registers for next iteration
- ldp w8, w9, [x5] // filterPos[0], filterPos[1]
- ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
- ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
- ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
- add x5, x5, #32
-
- lsl x8, x8, #1
- lsl x9, x9, #1
- lsl x10, x10, #1
- lsl x11, x11, #1
- lsl x12, x12, #1
- lsl x13, x13, #1
- lsl x14, x14, #1
- lsl x15, x15, #1
-
- ldr x8, [x3, w8, uxtw]
- ldr x9, [x3, w9, uxtw]
- ldr x10, [x3, w10, uxtw]
- ldr x11, [x3, w11, uxtw]
- ldr x12, [x3, w12, uxtw]
- ldr x13, [x3, w13, uxtw]
- ldr x14, [x3, w14, uxtw]
- ldr x15, [x3, w15, uxtw]
-
- stp x8, x9, [sp]
- stp x10, x11, [sp, #16]
- stp x12, x13, [sp, #32]
- stp x14, x15, [sp, #48]
-
- b.ge 1b
+ ldp w8, w9, [x5] // filterPos[0], filterPos[1]
+ ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
+ ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
+ ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
+ add x5, x5, #32
+
+ lsl x8, x8, #1
+ lsl x9, x9, #1
+ lsl x10, x10, #1
+ lsl x11, x11, #1
+ lsl x12, x12, #1
+ lsl x13, x13, #1
+ lsl x14, x14, #1
+ lsl x15, x15, #1
+
+ ldr x8, [x3, w8, uxtw]
+ ldr x9, [x3, w9, uxtw]
+ ldr x10, [x3, w10, uxtw]
+ ldr x11, [x3, w11, uxtw]
+ ldr x12, [x3, w12, uxtw]
+ ldr x13, [x3, w13, uxtw]
+ ldr x14, [x3, w14, uxtw]
+ ldr x15, [x3, w15, uxtw]
+
+ stp x8, x9, [sp]
+ stp x10, x11, [sp, #16]
+ stp x12, x13, [sp, #32]
+ stp x14, x15, [sp, #48]
+
+ b.ge 1b
// here we make last iteration, without updating the registers
- ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
- ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
-
- uxtl v26.4s, v0.4h
- sxtl v27.4s, v28.4h
- uxtl2 v0.4s, v0.8h
- mul v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v28.8h
- uxtl v26.4s, v1.4h
- mul v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v29.4h
- uxtl2 v0.4s, v1.8h
- mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v29.8h
- uxtl v26.4s, v2.4h
- mla v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v30.4h
- uxtl2 v0.4s, v2.8h
- mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v30.8h
- uxtl v26.4s, v3.4h
- mla v6.4s, v0.4s, v28.4s
-
- sxtl v27.4s, v31.4h
- uxtl2 v0.4s, v3.8h
- mla v5.4s, v26.4s, v27.4s
- sxtl2 v28.4s, v31.8h
- subs w2, w2, #8
- mla v6.4s, v0.4s, v28.4s
-
- sshl v5.4s, v5.4s, v17.4s
- sshl v6.4s, v6.4s, v17.4s
-
- smin v5.4s, v5.4s, v18.4s
- smin v6.4s, v6.4s, v18.4s
-
- st1 {v5.4s, v6.4s}, [x1], #32
- add sp, sp, #64 // restore stack
- cbnz w2, 2f
+ ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
+ ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+
+ uxtl v26.4s, v0.4h
+ sxtl v27.4s, v28.4h
+ uxtl2 v0.4s, v0.8h
+ mul v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v28.8h
+ uxtl v26.4s, v1.4h
+ mul v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v29.4h
+ uxtl2 v0.4s, v1.8h
+ mla v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v29.8h
+ uxtl v26.4s, v2.4h
+ mla v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v30.4h
+ uxtl2 v0.4s, v2.8h
+ mla v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v30.8h
+ uxtl v26.4s, v3.4h
+ mla v6.4s, v0.4s, v28.4s
+
+ sxtl v27.4s, v31.4h
+ uxtl2 v0.4s, v3.8h
+ mla v5.4s, v26.4s, v27.4s
+ sxtl2 v28.4s, v31.8h
+ subs w2, w2, #8
+ mla v6.4s, v0.4s, v28.4s
+
+ sshl v5.4s, v5.4s, v17.4s
+ sshl v6.4s, v6.4s, v17.4s
+
+ smin v5.4s, v5.4s, v18.4s
+ smin v6.4s, v6.4s, v18.4s
+
+ st1 {v5.4s, v6.4s}, [x1], #32
+ add sp, sp, #64 // restore stack
+ cbnz w2, 2f
ret
2:
- ldr w8, [x5], #4 // load filterPos
- lsl w8, w8, #1
- add x9, x3, w8, uxtw // src + filterPos
- ld1 {v0.4h}, [x9] // load 4 * uint16_t
- ld1 {v31.4h}, [x4], #8
-
- uxtl v0.4s, v0.4h
- sxtl v31.4s, v31.4h
- subs w2, w2, #1
- mul v5.4s, v0.4s, v31.4s
- addv s0, v5.4s
- sshl v0.4s, v0.4s, v17.4s
- smin v0.4s, v0.4s, v18.4s
- st1 {v0.s}[0], [x1], #4
- cbnz w2, 2b // if iterations remain jump to beginning
+ ldr w8, [x5], #4 // load filterPos
+ lsl w8, w8, #1
+ add x9, x3, w8, uxtw // src + filterPos
+ ld1 {v0.4h}, [x9] // load 4 * uint16_t
+ ld1 {v31.4h}, [x4], #8
+
+ uxtl v0.4s, v0.4h
+ sxtl v31.4s, v31.4h
+ subs w2, w2, #1
+ mul v5.4s, v0.4s, v31.4s
+ addv s0, v5.4s
+ sshl v0.4s, v0.4s, v17.4s
+ smin v0.4s, v0.4s, v18.4s
+ st1 {v0.s}[0], [x1], #4
+ cbnz w2, 2b // if iterations remain jump to beginning
ret
endfunc
@@ -1250,77 +1250,77 @@ function ff_hscale16to19_X8_neon_asm, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- movi v20.4s, #1
- movi v21.4s, #1
- shl v20.4s, v20.4s, #19
- sub v20.4s, v20.4s, v21.4s
- dup v21.4s, w0
- neg v21.4s, v21.4s
-
- sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
-1: ldr w8, [x5], #4 // filterPos[idx]
- ldr w10, [x5], #4 // filterPos[idx + 1]
- lsl w8, w8, #1
- ldr w11, [x5], #4 // filterPos[idx + 2]
- ldr w9, [x5], #4 // filterPos[idx + 3]
- mov x16, x4 // filter0 = filter
- lsl w11, w11, #1
- add x12, x16, x7 // filter1 = filter0 + filterSize*2
- lsl w9, w9, #1
- add x13, x12, x7 // filter2 = filter1 + filterSize*2
- lsl w10, w10, #1
- add x4, x13, x7 // filter3 = filter2 + filterSize*2
- movi v0.2d, #0 // val sum part 1 (for dst[0])
- movi v1.2d, #0 // val sum part 2 (for dst[1])
- movi v2.2d, #0 // val sum part 3 (for dst[2])
- movi v3.2d, #0 // val sum part 4 (for dst[3])
- add x17, x3, w8, uxtw // srcp + filterPos[0]
- add x8, x3, w10, uxtw // srcp + filterPos[1]
- add x10, x3, w11, uxtw // srcp + filterPos[2]
- add x11, x3, w9, uxtw // srcp + filterPos[3]
- mov w15, w6 // filterSize counter
-2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
- ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
- ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
- ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
- uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
- sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
- uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
- mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
- sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
- uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
- mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
- sxtl v27.4s, v7.4h // exted filter lower half
- uxtl2 v6.4s, v6.8h // extend srcp upper half
- sxtl2 v7.4s, v7.8h // extend filter upper half
- ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
- mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
- ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
- uxtl v22.4s, v16.4h // extend srcp lower half
- sxtl v23.4s, v17.4h // extend filter lower half
- uxtl2 v16.4s, v16.8h // extend srcp upper half
- sxtl2 v17.4s, v17.8h // extend filter upper half
- mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
- mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
- ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
- mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
- ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
- subs w15, w15, #8 // j -= 8: processed 8/filterSize
- uxtl v28.4s, v18.4h // extend srcp lower half
- sxtl v29.4s, v19.4h // extend filter lower half
- uxtl2 v18.4s, v18.8h // extend srcp upper half
- sxtl2 v19.4s, v19.8h // extend filter upper half
- mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
- mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
- b.gt 2b // inner loop if filterSize not consumed completely
- addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
- addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
- addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
- subs w2, w2, #4 // dstW -= 4
- sshl v0.4s, v0.4s, v21.4s // shift right (effectively rigth, as shift is negative); overflow expected
- smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
- st1 {v0.4s}, [x1], #16 // write to destination part0123
- b.gt 1b // loop until end of line
+ movi v20.4s, #1
+ movi v21.4s, #1
+ shl v20.4s, v20.4s, #19
+ sub v20.4s, v20.4s, v21.4s
+ dup v21.4s, w0
+ neg v21.4s, v21.4s
+
+ sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
+1: ldr w8, [x5], #4 // filterPos[idx]
+ ldr w10, [x5], #4 // filterPos[idx + 1]
+ lsl w8, w8, #1
+ ldr w11, [x5], #4 // filterPos[idx + 2]
+ ldr w9, [x5], #4 // filterPos[idx + 3]
+ mov x16, x4 // filter0 = filter
+ lsl w11, w11, #1
+ add x12, x16, x7 // filter1 = filter0 + filterSize*2
+ lsl w9, w9, #1
+ add x13, x12, x7 // filter2 = filter1 + filterSize*2
+ lsl w10, w10, #1
+ add x4, x13, x7 // filter3 = filter2 + filterSize*2
+ movi v0.2d, #0 // val sum part 1 (for dst[0])
+ movi v1.2d, #0 // val sum part 2 (for dst[1])
+ movi v2.2d, #0 // val sum part 3 (for dst[2])
+ movi v3.2d, #0 // val sum part 4 (for dst[3])
+ add x17, x3, w8, uxtw // srcp + filterPos[0]
+ add x8, x3, w10, uxtw // srcp + filterPos[1]
+ add x10, x3, w11, uxtw // srcp + filterPos[2]
+ add x11, x3, w9, uxtw // srcp + filterPos[3]
+ mov w15, w6 // filterSize counter
+2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
+ ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
+ ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
+ ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
+ uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
+ sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
+ uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
+ mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
+ sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
+ uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
+ mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
+ sxtl v27.4s, v7.4h // extend filter lower half
+ uxtl2 v6.4s, v6.8h // extend srcp upper half
+ sxtl2 v7.4s, v7.8h // extend filter upper half
+ ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
+ mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+ ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
+ uxtl v22.4s, v16.4h // extend srcp lower half
+ sxtl v23.4s, v17.4h // extend filter lower half
+ uxtl2 v16.4s, v16.8h // extend srcp upper half
+ sxtl2 v17.4s, v17.8h // extend filter upper half
+ mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+ mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+ ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
+ mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+ ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
+ subs w15, w15, #8 // j -= 8: processed 8/filterSize
+ uxtl v28.4s, v18.4h // extend srcp lower half
+ sxtl v29.4s, v19.4h // extend filter lower half
+ uxtl2 v18.4s, v18.8h // extend srcp upper half
+ sxtl2 v19.4s, v19.8h // extend filter upper half
+ mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+ mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+ b.gt 2b // inner loop if filterSize not consumed completely
+ addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
+ addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
+ addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
+ subs w2, w2, #4 // dstW -= 4
+ sshl v0.4s, v0.4s, v21.4s // shift right (effectively right, as shift is negative); overflow expected
+ smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
+ st1 {v0.4s}, [x1], #16 // write to destination part0123
+ b.gt 1b // loop until end of line
ret
endfunc
@@ -1333,117 +1333,117 @@ function ff_hscale16to19_X4_neon_asm, export=1
// x5 const int32_t *filterPos
// w6 int filterSize
- stp d8, d9, [sp, #-0x20]!
- stp d10, d11, [sp, #0x10]
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
- movi v18.4s, #1
- movi v17.4s, #1
- shl v18.4s, v18.4s, #19
- sub v21.4s, v18.4s, v17.4s // max allowed value
- dup v17.4s, w0 // read shift
- neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
+ movi v18.4s, #1 // v18 = 1 in each 32-bit lane
+ movi v17.4s, #1 // v17 = 1 in each 32-bit lane
+ shl v18.4s, v18.4s, #19 // v18 = 1 << 19
+ sub v21.4s, v18.4s, v17.4s // v21 = (1 << 19) - 1: max allowed value, applied with smin before the store
+ dup v17.4s, w0 // read shift (broadcast w0 to all lanes; w0 is reused as a loop counter later)
+ neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right)
- lsl w7, w6, #1
+ lsl w7, w6, #1
1:
- ldp w8, w9, [x5]
- ldp w10, w11, [x5, #8]
-
- movi v16.2d, #0 // initialize accumulator for idx + 0
- movi v17.2d, #0 // initialize accumulator for idx + 1
- movi v18.2d, #0 // initialize accumulator for idx + 2
- movi v19.2d, #0 // initialize accumulator for idx + 3
-
- mov x12, x4 // filter + 0
- add x13, x4, x7 // filter + 1
- add x8, x3, x8, lsl #1 // srcp + filterPos 0
- add x14, x13, x7 // filter + 2
- add x9, x3, x9, lsl #1 // srcp + filterPos 1
- add x15, x14, x7 // filter + 3
- add x10, x3, x10, lsl #1 // srcp + filterPos 2
- mov w0, w6 // save the filterSize to temporary variable
- add x11, x3, x11, lsl #1 // srcp + filterPos 3
- add x5, x5, #16 // advance filter position
- mov x16, xzr // clear the register x16 used for offsetting the filter values
+ ldp w8, w9, [x5]
+ ldp w10, w11, [x5, #8]
+
+ movi v16.2d, #0 // initialize accumulator for idx + 0
+ movi v17.2d, #0 // initialize accumulator for idx + 1
+ movi v18.2d, #0 // initialize accumulator for idx + 2
+ movi v19.2d, #0 // initialize accumulator for idx + 3
+
+ mov x12, x4 // filter + 0
+ add x13, x4, x7 // filter + 1
+ add x8, x3, x8, lsl #1 // srcp + filterPos 0
+ add x14, x13, x7 // filter + 2
+ add x9, x3, x9, lsl #1 // srcp + filterPos 1
+ add x15, x14, x7 // filter + 3
+ add x10, x3, x10, lsl #1 // srcp + filterPos 2
+ mov w0, w6 // save the filterSize to temporary variable
+ add x11, x3, x11, lsl #1 // srcp + filterPos 3
+ add x5, x5, #16 // advance filter position
+ mov x16, xzr // clear the register x16 used for offsetting the filter values
2:
- ldr q4, [x8], #16 // load src values for idx 0
- ldr q5, [x9], #16 // load src values for idx 1
- uxtl v26.4s, v4.4h
- uxtl2 v4.4s, v4.8h
- ldr q31, [x12, x16] // load filter values for idx 0
- ldr q6, [x10], #16 // load src values for idx 2
- sxtl v22.4s, v31.4h
- sxtl2 v31.4s, v31.8h
- mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0
- uxtl v25.4s, v5.4h
- uxtl2 v5.4s, v5.8h
- ldr q30, [x13, x16] // load filter values for idx 1
- ldr q7, [x11], #16 // load src values for idx 3
- mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0
- uxtl v24.4s, v6.4h
- sxtl v8.4s, v30.4h
- sxtl2 v30.4s, v30.8h
- mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1
- ldr q29, [x14, x16] // load filter values for idx 2
- uxtl2 v6.4s, v6.8h
- sxtl v9.4s, v29.4h
- sxtl2 v29.4s, v29.8h
- mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1
- ldr q28, [x15, x16] // load filter values for idx 3
- mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2
- uxtl v23.4s, v7.4h
- sxtl v10.4s, v28.4h
- mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2
- uxtl2 v7.4s, v7.8h
- sxtl2 v28.4s, v28.8h
- mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3
- sub w0, w0, #8
- cmp w0, #8
- mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3
-
- add x16, x16, #16 // advance filter values indexing
-
- b.ge 2b
+ ldr q4, [x8], #16 // load src values for idx 0
+ ldr q5, [x9], #16 // load src values for idx 1
+ uxtl v26.4s, v4.4h
+ uxtl2 v4.4s, v4.8h
+ ldr q31, [x12, x16] // load filter values for idx 0
+ ldr q6, [x10], #16 // load src values for idx 2
+ sxtl v22.4s, v31.4h
+ sxtl2 v31.4s, v31.8h
+ mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0
+ uxtl v25.4s, v5.4h
+ uxtl2 v5.4s, v5.8h
+ ldr q30, [x13, x16] // load filter values for idx 1
+ ldr q7, [x11], #16 // load src values for idx 3
+ mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0
+ uxtl v24.4s, v6.4h
+ sxtl v8.4s, v30.4h
+ sxtl2 v30.4s, v30.8h
+ mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1
+ ldr q29, [x14, x16] // load filter values for idx 2
+ uxtl2 v6.4s, v6.8h
+ sxtl v9.4s, v29.4h
+ sxtl2 v29.4s, v29.8h
+ mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1
+ ldr q28, [x15, x16] // load filter values for idx 3
+ mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2
+ uxtl v23.4s, v7.4h
+ sxtl v10.4s, v28.4h
+ mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2
+ uxtl2 v7.4s, v7.8h
+ sxtl2 v28.4s, v28.8h
+ mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3
+ sub w0, w0, #8
+ cmp w0, #8
+ mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3
+
+ add x16, x16, #16 // advance filter values indexing
+
+ b.ge 2b
// 4 iterations left
- sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements
-
- ldr d4, [x8] // load src values for idx 0
- ldr d31, [x12, x17] // load filter values for idx 0
- uxtl v4.4s, v4.4h
- sxtl v31.4s, v31.4h
- ldr d5, [x9] // load src values for idx 1
- mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0
- ldr d30, [x13, x17] // load filter values for idx 1
- uxtl v5.4s, v5.4h
- sxtl v30.4s, v30.4h
- ldr d6, [x10] // load src values for idx 2
- mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1
- ldr d29, [x14, x17] // load filter values for idx 2
- uxtl v6.4s, v6.4h
- sxtl v29.4s, v29.4h
- ldr d7, [x11] // load src values for idx 3
- ldr d28, [x15, x17] // load filter values for idx 3
- mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2
- uxtl v7.4s, v7.4h
- sxtl v28.4s, v28.4h
- addp v16.4s, v16.4s, v17.4s
- mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3
- subs w2, w2, #4
- addp v18.4s, v18.4s, v19.4s
- addp v16.4s, v16.4s, v18.4s
- sshl v16.4s, v16.4s, v20.4s
- smin v16.4s, v16.4s, v21.4s
-
- st1 {v16.4s}, [x1], #16
- add x4, x4, x7, lsl #2
- b.gt 1b
-
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
-
- add sp, sp, #0x20
+ sub x17, x7, #8 // x17 = 2 * filterSize - 8: byte offset of the last 4 coefficients within a filter row
+
+ ldr d4, [x8] // load last 4 src values for idx 0
+ ldr d31, [x12, x17] // load last 4 filter values for idx 0
+ uxtl v4.4s, v4.4h // zero-extend src to 32-bit
+ sxtl v31.4s, v31.4h // sign-extend filter to 32-bit
+ ldr d5, [x9] // load last 4 src values for idx 1
+ mla v16.4s, v4.4s, v31.4s // accumulate last 4 elements for idx 0
+ ldr d30, [x13, x17] // load last 4 filter values for idx 1
+ uxtl v5.4s, v5.4h // zero-extend src to 32-bit
+ sxtl v30.4s, v30.4h // sign-extend filter to 32-bit
+ ldr d6, [x10] // load last 4 src values for idx 2
+ mla v17.4s, v5.4s, v30.4s // accumulate last 4 elements for idx 1
+ ldr d29, [x14, x17] // load last 4 filter values for idx 2
+ uxtl v6.4s, v6.4h // zero-extend src to 32-bit
+ sxtl v29.4s, v29.4h // sign-extend filter to 32-bit
+ ldr d7, [x11] // load last 4 src values for idx 3
+ ldr d28, [x15, x17] // load last 4 filter values for idx 3
+ mla v18.4s, v6.4s, v29.4s // accumulate last 4 elements for idx 2
+ uxtl v7.4s, v7.4h // zero-extend src to 32-bit
+ sxtl v28.4s, v28.4h // sign-extend filter to 32-bit
+ addp v16.4s, v16.4s, v17.4s // pairwise add: partial sums for idx 0 and idx 1
+ mla v19.4s, v7.4s, v28.4s // accumulate last 4 elements for idx 3
+ subs w2, w2, #4 // 4 outputs done; flags are consumed by the b.gt below
+ addp v18.4s, v18.4s, v19.4s // pairwise add: partial sums for idx 2 and idx 3
+ addp v16.4s, v16.4s, v18.4s // v16 = {sum idx 0, sum idx 1, sum idx 2, sum idx 3}
+ sshl v16.4s, v16.4s, v20.4s // shift right (v20 holds the negated shift)
+ smin v16.4s, v16.4s, v21.4s // clamp to the max allowed value
+
+ st1 {v16.4s}, [x1], #16 // store 4x32-bit results through x1 and advance it
+ add x4, x4, x7, lsl #2 // filter += 4 rows (4 * 2 * filterSize bytes)
+ b.gt 1b // loop while outputs remain
+
+ ldp d8, d9, [sp] // restore d8-d11 saved on entry
+ ldp d10, d11, [sp, #0x10]
+
+ add sp, sp, #0x20 // release the 0x20-byte save area
ret
endfunc
@@ -29,178 +29,178 @@ function ff_yuv2planeX_8_neon, export=1
// x5 - const uint8_t *dither,
// w6 - int offset
- ld1 {v0.8b}, [x5] // load 8x8-bit dither
- and w6, w6, #7
- cbz w6, 1f // check if offsetting present
- ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
-1: uxtl v0.8h, v0.8b // extend dither to 16-bit
- ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
- ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
- cmp w1, #8 // if filterSize == 8, branch to specialized version
- b.eq 6f
- cmp w1, #4 // if filterSize == 4, branch to specialized version
- b.eq 8f
- cmp w1, #2 // if filterSize == 2, branch to specialized version
- b.eq 10f
+ ld1 {v0.8b}, [x5] // load 8x8-bit dither
+ and w6, w6, #7
+ cbz w6, 1f // check if offsetting present
+ ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
+1: uxtl v0.8h, v0.8b // extend dither to 16-bit
+ ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
+ ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
+ cmp w1, #8 // if filterSize == 8, branch to specialized version
+ b.eq 6f
+ cmp w1, #4 // if filterSize == 4, branch to specialized version
+ b.eq 8f
+ cmp w1, #2 // if filterSize == 2, branch to specialized version
+ b.eq 10f
// The filter size does not match of the of specialized implementations. It is either even or odd. If it is even
// then use the first section below.
- mov x7, #0 // i = 0
- tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version
+ mov x7, #0 // i = 0
+ tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version
// fs % 2 == 0
-2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
- mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
- mov w8, w1 // tmpfilterSize = filterSize
- mov x9, x2 // srcp = src
- mov x10, x0 // filterp = filter
-3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1]
- ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
- add x11, x11, x7, lsl #1 // &src[j ][i]
- add x12, x12, x7, lsl #1 // &src[j+1][i]
- ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
- ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
- smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X
- smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X
- smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y
- smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y
- subs w8, w8, #2 // tmpfilterSize -= 2
- b.gt 3b // loop until filterSize consumed
-
- sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
- sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
- uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
- st1 {v3.8b}, [x3], #8 // write to destination
- subs w4, w4, #8 // dstW -= 8
- add x7, x7, #8 // i += 8
- b.gt 2b // loop until width consumed
+2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
+ mov w8, w1 // tmpfilterSize = filterSize
+ mov x9, x2 // srcp = src
+ mov x10, x0 // filterp = filter
+3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1]
+ ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
+ add x11, x11, x7, lsl #1 // &src[j ][i]
+ add x12, x12, x7, lsl #1 // &src[j+1][i]
+ ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
+ ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
+ smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X
+ smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X
+ smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y
+ smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y
+ subs w8, w8, #2 // tmpfilterSize -= 2
+ b.gt 3b // loop until filterSize consumed
+
+ sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
+ sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
+ uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
+ st1 {v3.8b}, [x3], #8 // write to destination
+ subs w4, w4, #8 // dstW -= 8
+ add x7, x7, #8 // i += 8
+ b.gt 2b // loop until width consumed
ret
// If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0
-4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
- mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
- mov w8, w1 // tmpfilterSize = filterSize
- mov x9, x2 // srcp = src
- mov x10, x0 // filterp = filter
-5: ldr x11, [x9], #8 // get 1 pointer: src[j]
- ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j]
- add x11, x11, x7, lsl #1 // &src[j ][i]
- ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
- smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X
- smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X
- subs w8, w8, #1 // tmpfilterSize -= 2
- b.gt 5b // loop until filterSize consumed
-
- sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
- sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
- uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
- st1 {v3.8b}, [x3], #8 // write to destination
- subs w4, w4, #8 // dstW -= 8
- add x7, x7, #8 // i += 8
- b.gt 4b // loop until width consumed
+4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
+ mov w8, w1 // tmpfilterSize = filterSize
+ mov x9, x2 // srcp = src
+ mov x10, x0 // filterp = filter
+5: ldr x11, [x9], #8 // get 1 pointer: src[j]
+ ldr h6, [x10], #2 // read one 16-bit coeff X at filter[j]
+ add x11, x11, x7, lsl #1 // &src[j][i]
+ ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j][i + {0..7}]: A,B,C,D,E,F,G,H
+ smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X
+ smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X
+ subs w8, w8, #1 // tmpfilterSize -= 1
+ b.gt 5b // loop until filterSize consumed
+
+ sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
+ sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
+ uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
+ st1 {v3.8b}, [x3], #8 // write to destination
+ subs w4, w4, #8 // dstW -= 8
+ add x7, x7, #8 // i += 8
+ b.gt 4b // loop until width consumed
ret
6: // fs=8
- ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
- ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
- ldp x10, x11, [x2, #32] // load 2 pointers: src[j+4] and src[j+5]
- ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7]
+ ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
+ ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
+ ldp x10, x11, [x2, #32] // load 2 pointers: src[j+4] and src[j+5]
+ ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7]
// load 8x16-bit values for filter[j], where j=0..7
- ld1 {v6.8h}, [x0]
+ ld1 {v6.8h}, [x0]
7:
- mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
- mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
-
- ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
- ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
- ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
- ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
- ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
- ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
- ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
- ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
-
- smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
- smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
- smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
- smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
- smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
- smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
- smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
- smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
- smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4]
- smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4]
- smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5]
- smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5]
- smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6]
- smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6]
- smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7]
- smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7]
-
- sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
- sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
- uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
- subs w4, w4, #8 // dstW -= 8
- st1 {v3.8b}, [x3], #8 // write to destination
- b.gt 7b // loop until width consumed
+ mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
+
+ ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
+ ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
+ ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
+ ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
+ ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
+ ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
+ ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
+ ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
+
+ smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
+ smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
+ smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
+ smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
+ smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
+ smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
+ smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
+ smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
+ smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4]
+ smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4]
+ smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5]
+ smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5]
+ smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6]
+ smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6]
+ smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7]
+ smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7]
+
+ sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
+ sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
+ uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
+ subs w4, w4, #8 // dstW -= 8
+ st1 {v3.8b}, [x3], #8 // write to destination
+ b.gt 7b // loop until width consumed
ret
8: // fs=4
- ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
- ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
+ ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
+ ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
// load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes
- ld1 {v6.4h}, [x0]
+ ld1 {v6.4h}, [x0]
9:
- mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
- mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
-
- ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
- ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
- ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
- ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
-
- smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
- smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
- smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
- smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
- smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
- smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
- smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
- smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
-
- sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
- sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
- uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
- st1 {v3.8b}, [x3], #8 // write to destination
- subs w4, w4, #8 // dstW -= 8
- b.gt 9b // loop until width consumed
+ mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
+
+ ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
+ ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
+ ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
+ ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
+
+ smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
+ smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
+ smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
+ smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
+ smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
+ smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
+ smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
+ smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
+
+ sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
+ sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
+ uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
+ st1 {v3.8b}, [x3], #8 // write to destination
+ subs w4, w4, #8 // dstW -= 8
+ b.gt 9b // loop until width consumed
ret
10: // fs=2
- ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
+ ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
// load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes
- ldr s6, [x0]
+ ldr s6, [x0]
11:
- mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
- mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
-
- ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
- ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
-
- smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
- smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
- smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
- smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
-
- sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
- sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
- uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
- st1 {v3.8b}, [x3], #8 // write to destination
- subs w4, w4, #8 // dstW -= 8
- b.gt 11b // loop until width consumed
+ mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
+ mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
+
+ ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
+ ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
+
+ smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
+ smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
+ smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
+ smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
+
+ sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
+ sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
+ uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
+ st1 {v3.8b}, [x3], #8 // write to destination
+ subs w4, w4, #8 // dstW -= 8
+ b.gt 11b // loop until width consumed
ret
endfunc
@@ -210,25 +210,25 @@ function ff_yuv2plane1_8_neon, export=1
// w2 - int dstW,
// x3 - const uint8_t *dither,
// w4 - int offset
- ld1 {v0.8b}, [x3] // load 8x8-bit dither
- and w4, w4, #7
- cbz w4, 1f // check if offsetting present
- ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
-1: uxtl v0.8h, v0.8b // extend dither to 32-bit
- uxtl v1.4s, v0.4h
- uxtl2 v2.4s, v0.8h
+ ld1 {v0.8b}, [x3] // load 8x8-bit dither
+ and w4, w4, #7 // offset &= 7
+ cbz w4, 1f // check if offsetting present
+ ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
+1: uxtl v0.8h, v0.8b // extend dither to 16-bit
+ uxtl v1.4s, v0.4h // extend dither to 32-bit (part 1)
+ uxtl2 v2.4s, v0.8h // extend dither to 32-bit (part 2)
2:
- ld1 {v3.8h}, [x0], #16 // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
- sxtl v4.4s, v3.4h
- sxtl2 v5.4s, v3.8h
- add v4.4s, v4.4s, v1.4s
- add v5.4s, v5.4s, v2.4s
- sqshrun v4.4h, v4.4s, #6
- sqshrun2 v4.8h, v5.4s, #6
-
- uqshrn v3.8b, v4.8h, #1 // clip8(val>>7)
- subs w2, w2, #8 // dstW -= 8
- st1 {v3.8b}, [x1], #8 // write to destination
- b.gt 2b // loop until width consumed
+ ld1 {v3.8h}, [x0], #16 // read 8x16-bit @ src[i + {0..7}]: A,B,C,D,E,F,G,H
+ sxtl v4.4s, v3.4h // val0 = sign-extended {A,B,C,D}
+ sxtl2 v5.4s, v3.8h // val1 = sign-extended {E,F,G,H}
+ add v4.4s, v4.4s, v1.4s // val0 += dither (part 1)
+ add v5.4s, v5.4s, v2.4s // val1 += dither (part 2)
+ sqshrun v4.4h, v4.4s, #6 // clip16(val0>>6)
+ sqshrun2 v4.8h, v5.4s, #6 // clip16(val1>>6)
+
+ uqshrn v3.8b, v4.8h, #1 // clip8(val>>7)
+ subs w2, w2, #8 // dstW -= 8
+ st1 {v3.8b}, [x1], #8 // write to destination
+ b.gt 2b // loop until width consumed
ret
endfunc
@@ -23,23 +23,23 @@
.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
- ldp w9, w10, [sp, #\yoff]
+ ldp w9, w10, [sp, #\yoff] // w9 = y_offset, w10 = y_coeff: two adjacent 32-bit slots at sp + yoff (\ycoeff is not used on this path)
#else
- ldr w9, [sp, #\yoff]
- ldr w10, [sp, #\ycoeff]
+ ldr w9, [sp, #\yoff] // w9 = y_offset
+ ldr w10, [sp, #\ycoeff] // w10 = y_coeff
#endif
.endm
.macro load_args_nv12
- ldr x8, [sp] // table
- load_yoff_ycoeff 8, 16 // y_offset, y_coeff
- ld1 {v1.1d}, [x8]
- dup v0.8h, w10
- dup v3.8h, w9
- sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
- sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
- sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
- neg w11, w0
+ ldr x8, [sp] // table
+ load_yoff_ycoeff 8, 16 // y_offset, y_coeff
+ ld1 {v1.1d}, [x8] // v1.h[0..3] = four 16-bit coefficients from table (v2r, u2g, v2g, u2b)
+ dup v0.8h, w10 // v0 = y_coeff in all 8 lanes
+ dup v3.8h, w9 // v3 = y_offset in all 8 lanes
+ sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
+ sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
+ sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
+ neg w11, w0 // w11 = -width (rewinds srcC in increment_nv12)
.endm
.macro load_args_nv21
@@ -47,52 +47,52 @@
.endm
.macro load_args_yuv420p
- ldr x13, [sp] // srcV
- ldr w14, [sp, #8] // linesizeV
- ldr x8, [sp, #16] // table
- load_yoff_ycoeff 24, 32 // y_offset, y_coeff
- ld1 {v1.1d}, [x8]
- dup v0.8h, w10
- dup v3.8h, w9
- sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
- sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
- sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
- sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
- lsr w11, w0, #1
- neg w11, w11
+ ldr x13, [sp] // srcV
+ ldr w14, [sp, #8] // linesizeV
+ ldr x8, [sp, #16] // table
+ load_yoff_ycoeff 24, 32 // y_offset, y_coeff
+ ld1 {v1.1d}, [x8] // v1.h[0..3] = four 16-bit coefficients from table (v2r, u2g, v2g, u2b)
+ dup v0.8h, w10 // v0 = y_coeff in all 8 lanes
+ dup v3.8h, w9 // v3 = y_offset in all 8 lanes
+ sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
+ sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
+ sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
+ sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
+ lsr w11, w0, #1 // w11 = width / 2
+ neg w11, w11 // w11 = -(width / 2) (rewinds srcU/srcV in increment_yuv420p)
.endm
.macro load_args_yuv422p
- ldr x13, [sp] // srcV
- ldr w14, [sp, #8] // linesizeV
- ldr x8, [sp, #16] // table
- load_yoff_ycoeff 24, 32 // y_offset, y_coeff
- ld1 {v1.1d}, [x8]
- dup v0.8h, w10
- dup v3.8h, w9
- sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
- sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
- sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
- sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
+ ldr x13, [sp] // srcV
+ ldr w14, [sp, #8] // linesizeV
+ ldr x8, [sp, #16] // table
+ load_yoff_ycoeff 24, 32 // y_offset, y_coeff
+ ld1 {v1.1d}, [x8] // v1.h[0..3] = four 16-bit coefficients from table (v2r, u2g, v2g, u2b)
+ dup v0.8h, w10 // v0 = y_coeff in all 8 lanes
+ dup v3.8h, w9 // v3 = y_offset in all 8 lanes
+ sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
+ sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
+ sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
+ sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
.endm
.macro load_chroma_nv12
- ld2 {v16.8b, v17.8b}, [x6], #16
- ushll v18.8h, v16.8b, #3
- ushll v19.8h, v17.8b, #3
+ ld2 {v16.8b, v17.8b}, [x6], #16 // de-interleave 16 chroma bytes: v16 = even bytes, v17 = odd bytes
+ ushll v18.8h, v16.8b, #3 // v18 (U) = even bytes * (1<<3)
+ ushll v19.8h, v17.8b, #3 // v19 (V) = odd bytes * (1<<3)
.endm
.macro load_chroma_nv21
- ld2 {v16.8b, v17.8b}, [x6], #16
- ushll v19.8h, v16.8b, #3
- ushll v18.8h, v17.8b, #3
+ ld2 {v16.8b, v17.8b}, [x6], #16 // de-interleave 16 chroma bytes: v16 = even bytes, v17 = odd bytes
+ ushll v19.8h, v16.8b, #3 // v19 (V) = even bytes * (1<<3) (swapped relative to nv12)
+ ushll v18.8h, v17.8b, #3 // v18 (U) = odd bytes * (1<<3)
.endm
.macro load_chroma_yuv420p
- ld1 {v16.8b}, [ x6], #8
- ld1 {v17.8b}, [x13], #8
- ushll v18.8h, v16.8b, #3
- ushll v19.8h, v17.8b, #3
+ ld1 {v16.8b}, [ x6], #8 // load 8 bytes from srcU
+ ld1 {v17.8b}, [x13], #8 // load 8 bytes from srcV
+ ushll v18.8h, v16.8b, #3 // v18 = U*(1<<3)
+ ushll v19.8h, v17.8b, #3 // v19 = V*(1<<3)
.endm
.macro load_chroma_yuv422p
@@ -100,9 +100,9 @@
.endm
.macro increment_nv12
- ands w15, w1, #1
- csel w16, w7, w11, ne // incC = (h & 1) ? paddincC : -width
- add x6, x6, w16, sxtw // srcC += incC
+ ands w15, w1, #1 // w15 = h & 1; sets the flags used by csel
+ csel w16, w7, w11, ne // incC = (h & 1) ? paddingC : -width
+ add x6, x6, w16, sxtw // srcC += incC
.endm
.macro increment_nv21
@@ -110,100 +110,100 @@
.endm
.macro increment_yuv420p
- ands w15, w1, #1
- csel w16, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2
- csel w17, w14, w11, ne // incV = (h & 1) ? paddincV : -width/2
- add x6, x6, w16, sxtw // srcU += incU
- add x13, x13, w17, sxtw // srcV += incV
+ ands w15, w1, #1 // w15 = h & 1; sets the flags used by both csel
+ csel w16, w7, w11, ne // incU = (h & 1) ? paddingU : -width/2
+ csel w17, w14, w11, ne // incV = (h & 1) ? paddingV : -width/2
+ add x6, x6, w16, sxtw // srcU += incU
+ add x13, x13, w17, sxtw // srcV += incV
.endm
.macro increment_yuv422p
- add x6, x6, w7, sxtw // srcU += incU
- add x13, x13, w14, sxtw // srcV += incV
+ add x6, x6, w7, sxtw // srcU += paddingU (w7 as set by load_args_yuv422p)
+ add x13, x13, w14, sxtw // srcV += paddingV (w14 as set by load_args_yuv422p)
.endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
- add v20.8h, v26.8h, v20.8h // Y1 + R1
- add v21.8h, v27.8h, v21.8h // Y2 + R2
- add v22.8h, v26.8h, v22.8h // Y1 + G1
- add v23.8h, v27.8h, v23.8h // Y2 + G2
- add v24.8h, v26.8h, v24.8h // Y1 + B1
- add v25.8h, v27.8h, v25.8h // Y2 + B2
- sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1)
- sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R1) >> 1)
- sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1)
- sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G1) >> 1)
- sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1)
- sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B1) >> 1)
- movi \a1, #255
- movi \a2, #255
+ add v20.8h, v26.8h, v20.8h // Y1 + R1
+ add v21.8h, v27.8h, v21.8h // Y2 + R2
+ add v22.8h, v26.8h, v22.8h // Y1 + G1
+ add v23.8h, v27.8h, v23.8h // Y2 + G2
+ add v24.8h, v26.8h, v24.8h // Y1 + B1
+ add v25.8h, v27.8h, v25.8h // Y2 + B2
+ sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1), rounding shift
+ sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R2) >> 1), rounding shift
+ sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1), rounding shift
+ sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G2) >> 1), rounding shift
+ sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1), rounding shift
+ sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B2) >> 1), rounding shift
+ movi \a1, #255 // alpha (first half) = 255
+ movi \a2, #255 // alpha (second half) = 255
.endm
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
load_args_\ifmt
- mov w9, w1
+ mov w9, w1 // w9 = height, kept for the return value
1:
- mov w8, w0 // w8 = width
+ mov w8, w0 // w8 = width
2:
- movi v5.8h, #4, lsl #8 // 128 * (1<<3)
+ movi v5.8h, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt
- sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
- sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
- sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
- sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
- sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
- add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
- sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
- zip2 v21.8h, v20.8h, v20.8h // R2
- zip1 v20.8h, v20.8h, v20.8h // R1
- zip2 v23.8h, v22.8h, v22.8h // G2
- zip1 v22.8h, v22.8h, v22.8h // G1
- zip2 v25.8h, v24.8h, v24.8h // B2
- zip1 v24.8h, v24.8h, v24.8h // B1
- ld1 {v2.16b}, [x4], #16 // load luma
- ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
- ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
- sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
- sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
- sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
- sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
+ sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
+ sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
+ sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
+ sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
+ sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
+ add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
+ sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
+ zip2 v21.8h, v20.8h, v20.8h // R2
+ zip1 v20.8h, v20.8h, v20.8h // R1
+ zip2 v23.8h, v22.8h, v22.8h // G2
+ zip1 v22.8h, v22.8h, v22.8h // G1
+ zip2 v25.8h, v24.8h, v24.8h // B2
+ zip1 v24.8h, v24.8h, v24.8h // B1
+ ld1 {v2.16b}, [x4], #16 // load luma
+ ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
+ ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
+ sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
+ sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
+ sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
+ sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0
- compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
+ compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif
.ifc \ofmt,rgba // 0 1 2 3
- compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
+ compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif
.ifc \ofmt,abgr // 3 2 1 0
- compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
+ compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif
.ifc \ofmt,bgra // 2 1 0 3
- compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
+ compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif
- st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
- st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
- subs w8, w8, #16 // width -= 16
- b.gt 2b
- add x2, x2, w3, sxtw // dst += padding
- add x4, x4, w5, sxtw // srcY += paddingY
+ st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32 // store pixels 0-7 (from Y1), 4 interleaved bytes each
+ st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32 // store pixels 8-15 (from Y2), 4 interleaved bytes each
+ subs w8, w8, #16 // width -= 16
+ b.gt 2b // next 16 pixels while width > 0
+ add x2, x2, w3, sxtw // dst += padding
+ add x4, x4, w5, sxtw // srcY += paddingY
increment_\ifmt
- subs w1, w1, #1 // height -= 1
- b.gt 1b
- mov w0, w9
+ subs w1, w1, #1 // height -= 1
+ b.gt 1b // next row while height > 0
+ mov w0, w9 // return the height saved in w9
ret
endfunc
.endm
.macro declare_rgb_funcs ifmt
- declare_func \ifmt, argb
- declare_func \ifmt, rgba
- declare_func \ifmt, abgr
- declare_func \ifmt, bgra
+ declare_func \ifmt, argb // emits ff_\ifmt\()_to_argb_neon
+ declare_func \ifmt, rgba // emits ff_\ifmt\()_to_rgba_neon
+ declare_func \ifmt, abgr // emits ff_\ifmt\()_to_abgr_neon
+ declare_func \ifmt, bgra // emits ff_\ifmt\()_to_bgra_neon
.endm
declare_rgb_funcs nv12