@@ -526,7 +526,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
ld1 {v17.8b}, [x4], x1
ld1 {v19.8b}, [x4], x1
- transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
@@ -554,7 +554,7 @@ h_loop_filter_chroma420_intra:
ld1 {v17.s}[1], [x4], x1
ld1 {v19.s}[1], [x4], x1
- transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
@@ -1017,7 +1017,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
ld1 {v16.8h}, [x4], x1
ld1 {v19.8h}, [x9], x1
- transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
+ transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra_10
@@ -1045,7 +1045,7 @@ h_loop_filter_chroma420_intra_10:
ld1 {v19.4h}, [x4], x1
ld1 {v19.d}[1], [x9], x1
- transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
+ transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra_10
@@ -580,8 +580,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
endfunc
.endm
- h264_qpel16_hv put
- h264_qpel16_hv avg
+ h264_qpel16_hv put
+ h264_qpel16_hv avg
.macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
@@ -759,8 +759,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
endfunc
.endm
- h264_qpel8 put
- h264_qpel8 avg
+ h264_qpel8 put
+ h264_qpel8 avg
.macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
@@ -931,5 +931,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
endfunc
.endm
- h264_qpel16 put
- h264_qpel16 avg
+ h264_qpel16 put
+ h264_qpel16 avg
@@ -239,23 +239,23 @@ function hevc_add_residual_32x32_16_neon, export=0
endfunc
.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
- sshll v20.4s, \in0, #6
- sshll v21.4s, \in0, #6
- smull v22.4s, \in1, v4.h[1]
- smull v23.4s, \in1, v4.h[3]
- smlal v20.4s, \in2, v4.h[0] //e0
- smlsl v21.4s, \in2, v4.h[0] //e1
- smlal v22.4s, \in3, v4.h[3] //o0
- smlsl v23.4s, \in3, v4.h[1] //o1
-
- add v24.4s, v20.4s, v22.4s
- sub v20.4s, v20.4s, v22.4s
- add v22.4s, v21.4s, v23.4s
- sub v21.4s, v21.4s, v23.4s
- sqrshrn \out0, v24.4s, #\shift
- sqrshrn \out3, v20.4s, #\shift
- sqrshrn \out1, v22.4s, #\shift
- sqrshrn \out2, v21.4s, #\shift
+ sshll v20.4s, \in0, #6 // v20 = in0 << 6, widened to 32 bit (start of e0)
+ sshll v21.4s, \in0, #6 // v21 = same value again (start of e1)
+ smull v22.4s, \in1, v4.h[1] // start of o0: in1 * v4.h[1]
+ smull v23.4s, \in1, v4.h[3] // start of o1: in1 * v4.h[3]
+ smlal v20.4s, \in2, v4.h[0] //e0
+ smlsl v21.4s, \in2, v4.h[0] //e1
+ smlal v22.4s, \in3, v4.h[3] //o0
+ smlsl v23.4s, \in3, v4.h[1] //o1
+
+ add v24.4s, v20.4s, v22.4s // e0 + o0
+ sub v20.4s, v20.4s, v22.4s // e0 - o0
+ add v22.4s, v21.4s, v23.4s // e1 + o1
+ sub v21.4s, v21.4s, v23.4s // e1 - o1
+ sqrshrn \out0, v24.4s, #\shift // out0 = (e0 + o0), rounding shift right, saturating narrow
+ sqrshrn \out3, v20.4s, #\shift // out3 = (e0 - o0), same narrowing
+ sqrshrn \out1, v22.4s, #\shift // out1 = (e1 + o1), same narrowing
+ sqrshrn \out2, v21.4s, #\shift // out2 = (e1 - o1), same narrowing
.endm
.macro idct_4x4 bitdepth
@@ -294,19 +294,19 @@ endfunc
// uses and clobbers v28-v31 as temp registers
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
- sshll\p1 v28.4s, \in0, #6
- mov v29.16b, v28.16b
- smull\p1 v30.4s, \in1, v0.h[1]
- smull\p1 v31.4s, \in1, v0.h[3]
- smlal\p2 v28.4s, \in2, v0.h[0] //e0
- smlsl\p2 v29.4s, \in2, v0.h[0] //e1
- smlal\p2 v30.4s, \in3, v0.h[3] //o0
- smlsl\p2 v31.4s, \in3, v0.h[1] //o1
-
- add \out0, v28.4s, v30.4s
- add \out1, v29.4s, v31.4s
- sub \out2, v29.4s, v31.4s
- sub \out3, v28.4s, v30.4s
+ sshll\p1 v28.4s, \in0, #6
+ mov v29.16b, v28.16b
+ smull\p1 v30.4s, \in1, v0.h[1]
+ smull\p1 v31.4s, \in1, v0.h[3]
+ smlal\p2 v28.4s, \in2, v0.h[0] //e0
+ smlsl\p2 v29.4s, \in2, v0.h[0] //e1
+ smlal\p2 v30.4s, \in3, v0.h[3] //o0
+ smlsl\p2 v31.4s, \in3, v0.h[1] //o1
+
+ add \out0, v28.4s, v30.4s
+ add \out1, v29.4s, v31.4s
+ sub \out2, v29.4s, v31.4s
+ sub \out3, v28.4s, v30.4s
.endm
.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
@@ -362,11 +362,11 @@ endfunc
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
//x0 - coeffs
- mov x1, x0
+ mov x1, x0
ld1 {v16.8h-v19.8h}, [x1], #64
ld1 {v20.8h-v23.8h}, [x1]
- movrel x1, trans
+ movrel x1, trans
ld1 {v0.8h}, [x1]
tr_8x4 7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
@@ -379,7 +379,7 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23
- mov x1, x0
+ mov x1, x0
st1 {v16.8h-v19.8h}, [x1], #64
st1 {v20.8h-v23.8h}, [x1]
@@ -388,8 +388,8 @@ endfunc
.endm
.macro butterfly e, o, tmp_p, tmp_m
- add \tmp_p, \e, \o
- sub \tmp_m, \e, \o
+ add \tmp_p, \e, \o // tmp_p = e + o
+ sub \tmp_m, \e, \o // tmp_m = e - o
.endm
.macro tr16_8x4 in0, in1, in2, in3, offset
@@ -418,7 +418,7 @@ endfunc
butterfly v25.4s, v29.4s, v17.4s, v22.4s
butterfly v26.4s, v30.4s, v18.4s, v21.4s
butterfly v27.4s, v31.4s, v19.4s, v20.4s
- add x4, sp, #\offset
+ add x4, sp, #\offset
st1 {v16.4s-v19.4s}, [x4], #64
st1 {v20.4s-v23.4s}, [x4]
.endm
@@ -435,14 +435,14 @@ endfunc
.endm
.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
- sum_sub v21.4s, \in, \t0, \op0, \p
- sum_sub v22.4s, \in, \t1, \op1, \p
- sum_sub v23.4s, \in, \t2, \op2, \p
- sum_sub v24.4s, \in, \t3, \op3, \p
- sum_sub v25.4s, \in, \t4, \op4, \p
- sum_sub v26.4s, \in, \t5, \op5, \p
- sum_sub v27.4s, \in, \t6, \op6, \p
- sum_sub v28.4s, \in, \t7, \op7, \p
+ sum_sub v21.4s, \in, \t0, \op0, \p // one sum_sub per destination v21..v28, pairing tN with opN
+ sum_sub v22.4s, \in, \t1, \op1, \p
+ sum_sub v23.4s, \in, \t2, \op2, \p
+ sum_sub v24.4s, \in, \t3, \op3, \p
+ sum_sub v25.4s, \in, \t4, \op4, \p
+ sum_sub v26.4s, \in, \t5, \op5, \p
+ sum_sub v27.4s, \in, \t6, \op6, \p
+ sum_sub v28.4s, \in, \t7, \op7, \p // the same in and p arguments are passed to all eight calls
.endm
.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
@@ -528,20 +528,20 @@ endfunc
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
- mov x1, x5
- add x3, x5, #(\step * 64)
- mov x2, #(\step * 128)
+ mov x1, x5
+ add x3, x5, #(\step * 64)
+ mov x2, #(\step * 128)
load16 v16.d, v17.d, v18.d, v19.d
- movrel x1, trans
+ movrel x1, trans
ld1 {v0.8h}, [x1]
tr16_8x4 v16, v17, v18, v19, \offset
- add x1, x5, #(\step * 32)
- add x3, x5, #(\step * 3 *32)
- mov x2, #(\step * 128)
+ add x1, x5, #(\step * 32)
+ add x3, x5, #(\step * 3 *32)
+ mov x2, #(\step * 128)
load16 v20.d, v17.d, v18.d, v19.d
- movrel x1, trans, 16
+ movrel x1, trans, 16
ld1 {v1.8h}, [x1]
smull v21.4s, v20.4h, v1.h[0]
smull v22.4s, v20.4h, v1.h[1]
@@ -560,19 +560,19 @@ function func_tr_16x4_\name
add_member v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
add_member v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2
- add x4, sp, #\offset
+ add x4, sp, #\offset
ld1 {v16.4s-v19.4s}, [x4], #64
butterfly16 v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
.if \shift > 0
scale v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
transpose16_4x4_2 v29, v30, v31, v24, v2, v3, v4, v5, v6, v7
- mov x1, x6
- add x3, x6, #(24 +3*32)
- mov x2, #32
- mov x4, #-32
+ mov x1, x6
+ add x3, x6, #(24 +3*32)
+ mov x2, #32
+ mov x4, #-32
store16 v29.d, v30.d, v31.d, v24.d, x4
.else
- store_to_stack \offset, (\offset + 240), v20.4s, v21.4s, v22.4s, v23.4s, v19.4s, v18.4s, v17.4s, v16.4s
+ store_to_stack \offset, (\offset + 240), v20.4s, v21.4s, v22.4s, v23.4s, v19.4s, v18.4s, v17.4s, v16.4s
.endif
add x4, sp, #(\offset + 64)
@@ -582,13 +582,13 @@ function func_tr_16x4_\name
scale v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
transpose16_4x4_2 v29, v30, v31, v20, v2, v3, v4, v5, v6, v7
- add x1, x6, #8
- add x3, x6, #(16 + 3 * 32)
- mov x2, #32
- mov x4, #-32
+ add x1, x6, #8
+ add x3, x6, #(16 + 3 * 32)
+ mov x2, #32
+ mov x4, #-32
store16 v29.d, v30.d, v31.d, v20.d, x4
.else
- store_to_stack (\offset + 64), (\offset + 176), v20.4s, v25.4s, v26.4s, v27.4s, v19.4s, v18.4s, v17.4s, v16.4s
+ store_to_stack (\offset + 64), (\offset + 176), v20.4s, v25.4s, v26.4s, v27.4s, v19.4s, v18.4s, v17.4s, v16.4s
.endif
ret
@@ -601,21 +601,21 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
mov x15, x30
// allocate a temp buffer
- sub sp, sp, #640
+ sub sp, sp, #640
.irp i, 0, 1, 2, 3
- add x5, x0, #(8 * \i)
- add x6, sp, #(8 * \i * 16)
+ add x5, x0, #(8 * \i)
+ add x6, sp, #(8 * \i * 16)
bl func_tr_16x4_firstpass
.endr
.irp i, 0, 1, 2, 3
- add x5, sp, #(8 * \i)
- add x6, x0, #(8 * \i * 16)
+ add x5, sp, #(8 * \i)
+ add x6, x0, #(8 * \i * 16)
bl func_tr_16x4_secondpass_\bitdepth
.endr
- add sp, sp, #640
+ add sp, sp, #640
ret x15
endfunc
@@ -644,10 +644,10 @@ endfunc
.endm
.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
- sum_sub v24.4s, \in, \t0, \op0, \p
- sum_sub v25.4s, \in, \t1, \op1, \p
- sum_sub v26.4s, \in, \t2, \op2, \p
- sum_sub v27.4s, \in, \t3, \op3, \p
+ sum_sub v24.4s, \in, \t0, \op0, \p // one sum_sub per destination v24..v27, pairing tN with opN
+ sum_sub v25.4s, \in, \t1, \op1, \p
+ sum_sub v26.4s, \in, \t2, \op2, \p
+ sum_sub v27.4s, \in, \t3, \op3, \p // the same in and p arguments are passed to all four calls
.endm
.macro butterfly32 in0, in1, in2, in3, out
@@ -841,85 +841,85 @@ idct_32x32 8
idct_32x32 10
.macro tr4_luma_shift r0, r1, r2, r3, shift
- saddl v0.4s, \r0, \r2 // c0 = src0 + src2
- saddl v1.4s, \r2, \r3 // c1 = src2 + src3
- ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
- smull v3.4s, \r1, v21.4h // c3 = 74 * src1
-
- saddl v7.4s, \r0, \r3 // src0 + src3
- ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
- mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
-
- mul v5.4s, v0.4s, v19.4s // 29 * c0
- mul v6.4s, v1.4s, v20.4s // 55 * c1
- add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
- add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
-
- mul v1.4s, v1.4s, v19.4s // 29 * c1
- mul v6.4s, v2.4s, v20.4s // 55 * c2
- sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
- add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
-
- mul v0.4s, v0.4s, v20.4s // 55 * c0
- mul v2.4s, v2.4s, v19.4s // 29 * c2
- add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
- sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
-
- sqrshrn \r0, v5.4s, \shift
- sqrshrn \r1, v6.4s, \shift
- sqrshrn \r2, v7.4s, \shift
- sqrshrn \r3, v0.4s, \shift
+ saddl v0.4s, \r0, \r2 // c0 = src0 + src2
+ saddl v1.4s, \r2, \r3 // c1 = src2 + src3
+ ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
+ smull v3.4s, \r1, v21.4h // c3 = 74 * src1
+
+ saddl v7.4s, \r0, \r3 // src0 + src3
+ ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
+ mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
+
+ mul v5.4s, v0.4s, v19.4s // 29 * c0
+ mul v6.4s, v1.4s, v20.4s // 55 * c1
+ add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
+ add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
+
+ mul v1.4s, v1.4s, v19.4s // 29 * c1
+ mul v6.4s, v2.4s, v20.4s // 55 * c2
+ sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
+ add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
+
+ mul v0.4s, v0.4s, v20.4s // 55 * c0
+ mul v2.4s, v2.4s, v19.4s // 29 * c2
+ add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
+ sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
+
+ sqrshrn \r0, v5.4s, \shift // r0 = dst0, rounding shift right, saturating narrow (overwrites the input register)
+ sqrshrn \r1, v6.4s, \shift // r1 = dst1, same narrowing
+ sqrshrn \r2, v7.4s, \shift // r2 = dst2, same narrowing
+ sqrshrn \r3, v0.4s, \shift // r3 = dst3, same narrowing
.endm
function ff_hevc_transform_luma_4x4_neon_8, export=1
- ld1 {v28.4h-v31.4h}, [x0]
- movi v18.4s, #74
- movi v19.4s, #29
- movi v20.4s, #55
- movi v21.4h, #74
+ ld1 {v28.4h-v31.4h}, [x0] // load 4 x 4 halfwords from x0 into v28-v31
+ movi v18.4s, #74 // 74 in 32-bit lanes, multiplier for dst2 in tr4_luma_shift
+ movi v19.4s, #29 // 29 in 32-bit lanes
+ movi v20.4s, #55 // 55 in 32-bit lanes
+ movi v21.4h, #74 // 74 in 16-bit lanes, smull operand for c3 in tr4_luma_shift
- tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
- transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+ tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7 // first pass, results narrowed with shift 7
+ transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25 // transpose so the next pass works on the other dimension
- tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
- transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+ tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12 // second pass, results narrowed with shift 12
+ transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
- st1 {v28.4h-v31.4h}, [x0]
+ st1 {v28.4h-v31.4h}, [x0] // store the result back to the address it was loaded from
ret
endfunc
// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
- ld1r {v4.8h}, [x0]
- srshr v4.8h, v4.8h, #1
- srshr v0.8h, v4.8h, #(14 - \bitdepth)
- srshr v1.8h, v4.8h, #(14 - \bitdepth)
+ ld1r {v4.8h}, [x0]
+ srshr v4.8h, v4.8h, #1
+ srshr v0.8h, v4.8h, #(14 - \bitdepth)
+ srshr v1.8h, v4.8h, #(14 - \bitdepth)
.if \size > 4
- srshr v2.8h, v4.8h, #(14 - \bitdepth)
- srshr v3.8h, v4.8h, #(14 - \bitdepth)
+ srshr v2.8h, v4.8h, #(14 - \bitdepth)
+ srshr v3.8h, v4.8h, #(14 - \bitdepth)
.if \size > 16 /* dc 32x32 */
- mov x2, #4
+ mov x2, #4
1:
- subs x2, x2, #1
+ subs x2, x2, #1
.endif
add x12, x0, #64
mov x13, #128
.if \size > 8 /* dc 16x16 */
- st1 {v0.8h-v3.8h}, [x0], x13
- st1 {v0.8h-v3.8h}, [x12], x13
- st1 {v0.8h-v3.8h}, [x0], x13
- st1 {v0.8h-v3.8h}, [x12], x13
- st1 {v0.8h-v3.8h}, [x0], x13
- st1 {v0.8h-v3.8h}, [x12], x13
+ st1 {v0.8h-v3.8h}, [x0], x13
+ st1 {v0.8h-v3.8h}, [x12], x13
+ st1 {v0.8h-v3.8h}, [x0], x13
+ st1 {v0.8h-v3.8h}, [x12], x13
+ st1 {v0.8h-v3.8h}, [x0], x13
+ st1 {v0.8h-v3.8h}, [x12], x13
.endif /* dc 8x8 */
- st1 {v0.8h-v3.8h}, [x0], x13
- st1 {v0.8h-v3.8h}, [x12], x13
+ st1 {v0.8h-v3.8h}, [x0], x13
+ st1 {v0.8h-v3.8h}, [x12], x13
.if \size > 16 /* dc 32x32 */
bne 1b
.endif
.else /* dc 4x4 */
- st1 {v0.8h-v1.8h}, [x0]
+ st1 {v0.8h-v1.8h}, [x0]
.endif
ret
endfunc
@@ -840,19 +840,19 @@ function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
endfunc
function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
- b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+ b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon) // branch without link: the 24-wide entry reuses the v12 routine
endfunc
function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
- b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon) // branch without link: the 32-wide entry reuses the v16 routine
endfunc
function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
- b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon) // branch without link: the 48-wide entry reuses the v16 routine
endfunc
function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
- b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon) // branch without link: the 64-wide entry reuses the v16 routine
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
@@ -1560,21 +1560,21 @@ endfunc
#if HAVE_I8MM
.macro calc_all2
- calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+ calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31 // eight calc steps; the argument lists rotate by one even/odd register pair per step
b.eq 2f
- calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+ calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
b.eq 2f
- calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+ calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
b.eq 2f
- calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+ calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
b.eq 2f
- calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+ calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
b.eq 2f
- calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+ calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
b.eq 2f
- calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+ calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
b.eq 2f
- calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+ calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29 // last step; the b.hi after it jumps back to label 1 (presumably on flags set inside calc - confirm)
b.hi 1b
.endm
@@ -34,13 +34,13 @@ endconst
function ff_opus_deemphasis_neon, export=1
movrel x4, tab_st
- ld1 {v4.4s}, [x4]
+ ld1 {v4.4s}, [x4]
movrel x4, tab_x0
- ld1 {v5.4s}, [x4]
+ ld1 {v5.4s}, [x4]
movrel x4, tab_x1
- ld1 {v6.4s}, [x4]
+ ld1 {v6.4s}, [x4]
movrel x4, tab_x2
- ld1 {v7.4s}, [x4]
+ ld1 {v7.4s}, [x4]
fmul v0.4s, v4.4s, v0.s[0]
@@ -330,32 +330,32 @@ endfunc
// v17: hev
// convert to signed value:
- eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
- eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
-
- movi v20.8h, #3
- ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
- ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
- eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
- eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
- mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
- mul v19.8h, v19.8h, v20.8h
-
- sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
- movi v22.16b, #4
- movi v23.16b, #3
+ eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
+ eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
+
+ movi v20.8h, #3
+ ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
+ ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
+ eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
+ eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
+ mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
+ mul v19.8h, v19.8h, v20.8h
+
+ sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
+ movi v22.16b, #4
+ movi v23.16b, #3
.if \inner
- and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
+ and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
.endif
- saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
- saddw2 v19.8h, v19.8h, v20.16b
- sqxtn v18.8b, v18.8h // narrow result back into v18
- sqxtn2 v18.16b, v19.8h
+ saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
+ saddw2 v19.8h, v19.8h, v20.16b
+ sqxtn v18.8b, v18.8h // narrow result back into v18
+ sqxtn2 v18.16b, v19.8h
.if !\inner && !\simple
- eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
- eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
+ eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
+ eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
.endif
- and v18.16b, v18.16b, v16.16b // w &= normal_limit
+ and v18.16b, v18.16b, v16.16b // w &= normal_limit
// registers used at this point..
// v0 -> P3 (don't corrupt)
@@ -375,44 +375,44 @@ endfunc
// P0 = s2u(PS0 + c2);
.if \simple
- sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
- sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
- sshr v19.16b, v19.16b, #3 // c1 >>= 3
- sshr v20.16b, v20.16b, #3 // c2 >>= 3
- sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
- sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
- eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
- eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
- eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
- eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
+ sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
+ sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
+ sshr v19.16b, v19.16b, #3 // c1 >>= 3
+ sshr v20.16b, v20.16b, #3 // c2 >>= 3
+ sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
+ sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
+ eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
+ eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
+ eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
+ eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
.elseif \inner
// the !is4tap case of filter_common, only used for inner blocks
// c3 = ((c1&~hev) + 1) >> 1;
// Q1 = s2u(QS1 - c3);
// P1 = s2u(PS1 + c3);
- sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
- sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
- sshr v19.16b, v19.16b, #3 // c1 >>= 3
- sshr v20.16b, v20.16b, #3 // c2 >>= 3
- sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
- sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
- bic v19.16b, v19.16b, v17.16b // c1 & ~hev
- eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
- srshr v19.16b, v19.16b, #1 // c3 >>= 1
- eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
- sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
- sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
- eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
- eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
+ sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
+ sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
+ sshr v19.16b, v19.16b, #3 // c1 >>= 3
+ sshr v20.16b, v20.16b, #3 // c2 >>= 3
+ sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
+ sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
+ bic v19.16b, v19.16b, v17.16b // c1 & ~hev
+ eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
+ srshr v19.16b, v19.16b, #1 // c3 >>= 1
+ eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
+ sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
+ sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
+ eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
+ eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
.else
- and v20.16b, v18.16b, v17.16b // w & hev
- sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
- sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
- sshr v19.16b, v19.16b, #3 // c1 >>= 3
- sshr v20.16b, v20.16b, #3 // c2 >>= 3
- bic v18.16b, v18.16b, v17.16b // w &= ~hev
- sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
- sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
+ and v20.16b, v18.16b, v17.16b // w & hev
+ sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
+ sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
+ sshr v19.16b, v19.16b, #3 // c1 >>= 3
+ sshr v20.16b, v20.16b, #3 // c2 >>= 3
+ bic v18.16b, v18.16b, v17.16b // w &= ~hev
+ sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
+ sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
// filter_mbedge:
// a = clamp((27*w + 63) >> 7);
@@ -424,35 +424,35 @@ endfunc
// a = clamp((9*w + 63) >> 7);
// Q2 = s2u(QS2 - a);
// P2 = s2u(PS2 + a);
- movi v17.8h, #63
- sshll v22.8h, v18.8b, #3
- sshll2 v23.8h, v18.16b, #3
- saddw v22.8h, v22.8h, v18.8b
- saddw2 v23.8h, v23.8h, v18.16b
- add v16.8h, v17.8h, v22.8h
- add v17.8h, v17.8h, v23.8h // 9*w + 63
- add v19.8h, v16.8h, v22.8h
- add v20.8h, v17.8h, v23.8h // 18*w + 63
- add v22.8h, v19.8h, v22.8h
- add v23.8h, v20.8h, v23.8h // 27*w + 63
- sqshrn v16.8b, v16.8h, #7
- sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
- sqshrn v19.8b, v19.8h, #7
- sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
- sqshrn v22.8b, v22.8h, #7
- sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
- sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
- sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
- sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
- sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
- sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
- sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
- eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
- eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
- eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
- eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
- eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
- eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
+ movi v17.8h, #63
+ sshll v22.8h, v18.8b, #3
+ sshll2 v23.8h, v18.16b, #3
+ saddw v22.8h, v22.8h, v18.8b
+ saddw2 v23.8h, v23.8h, v18.16b
+ add v16.8h, v17.8h, v22.8h
+ add v17.8h, v17.8h, v23.8h // 9*w + 63
+ add v19.8h, v16.8h, v22.8h
+ add v20.8h, v17.8h, v23.8h // 18*w + 63
+ add v22.8h, v19.8h, v22.8h
+ add v23.8h, v20.8h, v23.8h // 27*w + 63
+ sqshrn v16.8b, v16.8h, #7
+ sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
+ sqshrn v19.8b, v19.8h, #7
+ sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
+ sqshrn v22.8b, v22.8h, #7
+ sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
+ sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
+ sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
+ sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
+ sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
+ sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
+ sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
+ eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
+ eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
+ eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
+ eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
+ eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
+ eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
.endif
.endm
@@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
sub x0, x0, x2, lsl #2
sub x1, x1, x2, lsl #2
// Load pixels:
- ld1 {v0.d}[0], [x0], x2 // P3
- ld1 {v0.d}[1], [x1], x2 // P3
- ld1 {v1.d}[0], [x0], x2 // P2
- ld1 {v1.d}[1], [x1], x2 // P2
- ld1 {v2.d}[0], [x0], x2 // P1
- ld1 {v2.d}[1], [x1], x2 // P1
- ld1 {v3.d}[0], [x0], x2 // P0
- ld1 {v3.d}[1], [x1], x2 // P0
- ld1 {v4.d}[0], [x0], x2 // Q0
- ld1 {v4.d}[1], [x1], x2 // Q0
- ld1 {v5.d}[0], [x0], x2 // Q1
- ld1 {v5.d}[1], [x1], x2 // Q1
- ld1 {v6.d}[0], [x0], x2 // Q2
- ld1 {v6.d}[1], [x1], x2 // Q2
- ld1 {v7.d}[0], [x0] // Q3
- ld1 {v7.d}[1], [x1] // Q3
-
- dup v22.16b, w3 // flim_E
- dup v23.16b, w4 // flim_I
+ ld1 {v0.d}[0], [x0], x2 // P3
+ ld1 {v0.d}[1], [x1], x2 // P3
+ ld1 {v1.d}[0], [x0], x2 // P2
+ ld1 {v1.d}[1], [x1], x2 // P2
+ ld1 {v2.d}[0], [x0], x2 // P1
+ ld1 {v2.d}[1], [x1], x2 // P1
+ ld1 {v3.d}[0], [x0], x2 // P0
+ ld1 {v3.d}[1], [x1], x2 // P0
+ ld1 {v4.d}[0], [x0], x2 // Q0
+ ld1 {v4.d}[1], [x1], x2 // Q0
+ ld1 {v5.d}[0], [x0], x2 // Q1
+ ld1 {v5.d}[1], [x1], x2 // Q1
+ ld1 {v6.d}[0], [x0], x2 // Q2
+ ld1 {v6.d}[1], [x1], x2 // Q2
+ ld1 {v7.d}[0], [x0] // Q3
+ ld1 {v7.d}[1], [x1] // Q3
+
+ dup v22.16b, w3 // flim_E
+ dup v23.16b, w4 // flim_I
vp8_loop_filter inner=\inner, hev_thresh=w5
// back up to P2: u,v -= stride * 6
- sub x0, x0, x2, lsl #2
- sub x1, x1, x2, lsl #2
- sub x0, x0, x2, lsl #1
- sub x1, x1, x2, lsl #1
+ sub x0, x0, x2, lsl #2
+ sub x1, x1, x2, lsl #2
+ sub x0, x0, x2, lsl #1
+ sub x1, x1, x2, lsl #1
// Store pixels:
- st1 {v1.d}[0], [x0], x2 // P2
- st1 {v1.d}[1], [x1], x2 // P2
- st1 {v2.d}[0], [x0], x2 // P1
- st1 {v2.d}[1], [x1], x2 // P1
- st1 {v3.d}[0], [x0], x2 // P0
- st1 {v3.d}[1], [x1], x2 // P0
- st1 {v4.d}[0], [x0], x2 // Q0
- st1 {v4.d}[1], [x1], x2 // Q0
- st1 {v5.d}[0], [x0], x2 // Q1
- st1 {v5.d}[1], [x1], x2 // Q1
- st1 {v6.d}[0], [x0] // Q2
- st1 {v6.d}[1], [x1] // Q2
+ st1 {v1.d}[0], [x0], x2 // P2
+ st1 {v1.d}[1], [x1], x2 // P2
+ st1 {v2.d}[0], [x0], x2 // P1
+ st1 {v2.d}[1], [x1], x2 // P1
+ st1 {v3.d}[0], [x0], x2 // P0
+ st1 {v3.d}[1], [x1], x2 // P0
+ st1 {v4.d}[0], [x0], x2 // Q0
+ st1 {v4.d}[1], [x1], x2 // Q0
+ st1 {v5.d}[0], [x0], x2 // Q1
+ st1 {v5.d}[1], [x1], x2 // Q1
+ st1 {v6.d}[0], [x0] // Q2
+ st1 {v6.d}[1], [x1] // Q2
ret
endfunc
@@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
ld1 {v6.d}[1], [x0], x1
ld1 {v7.d}[1], [x0], x1
- transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
dup v22.16b, w2 // flim_E
.if !\simple
@@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
sub x0, x0, x1, lsl #4 // backup 16 rows
- transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
// Store pixels:
st1 {v0.d}[0], [x0], x1
@@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
sub x1, x1, #4
// Load pixels:
- ld1 {v0.d}[0], [x0], x2 // load u
- ld1 {v0.d}[1], [x1], x2 // load v
- ld1 {v1.d}[0], [x0], x2
- ld1 {v1.d}[1], [x1], x2
- ld1 {v2.d}[0], [x0], x2
- ld1 {v2.d}[1], [x1], x2
- ld1 {v3.d}[0], [x0], x2
- ld1 {v3.d}[1], [x1], x2
- ld1 {v4.d}[0], [x0], x2
- ld1 {v4.d}[1], [x1], x2
- ld1 {v5.d}[0], [x0], x2
- ld1 {v5.d}[1], [x1], x2
- ld1 {v6.d}[0], [x0], x2
- ld1 {v6.d}[1], [x1], x2
- ld1 {v7.d}[0], [x0], x2
- ld1 {v7.d}[1], [x1], x2
-
- transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+ ld1 {v0.d}[0], [x0], x2 // load u
+ ld1 {v0.d}[1], [x1], x2 // load v
+ ld1 {v1.d}[0], [x0], x2
+ ld1 {v1.d}[1], [x1], x2
+ ld1 {v2.d}[0], [x0], x2
+ ld1 {v2.d}[1], [x1], x2
+ ld1 {v3.d}[0], [x0], x2
+ ld1 {v3.d}[1], [x1], x2
+ ld1 {v4.d}[0], [x0], x2
+ ld1 {v4.d}[1], [x1], x2
+ ld1 {v5.d}[0], [x0], x2
+ ld1 {v5.d}[1], [x1], x2
+ ld1 {v6.d}[0], [x0], x2
+ ld1 {v6.d}[1], [x1], x2
+ ld1 {v7.d}[0], [x0], x2
+ ld1 {v7.d}[1], [x1], x2
+
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
dup v22.16b, w3 // flim_E
dup v23.16b, w4 // flim_I
@@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
sub x0, x0, x2, lsl #3 // backup u 8 rows
sub x1, x1, x2, lsl #3 // backup v 8 rows
- transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
// Store pixels:
- st1 {v0.d}[0], [x0], x2 // load u
- st1 {v0.d}[1], [x1], x2 // load v
- st1 {v1.d}[0], [x0], x2
- st1 {v1.d}[1], [x1], x2
- st1 {v2.d}[0], [x0], x2
- st1 {v2.d}[1], [x1], x2
- st1 {v3.d}[0], [x0], x2
- st1 {v3.d}[1], [x1], x2
- st1 {v4.d}[0], [x0], x2
- st1 {v4.d}[1], [x1], x2
- st1 {v5.d}[0], [x0], x2
- st1 {v5.d}[1], [x1], x2
- st1 {v6.d}[0], [x0], x2
- st1 {v6.d}[1], [x1], x2
- st1 {v7.d}[0], [x0]
- st1 {v7.d}[1], [x1]
+ st1 {v0.d}[0], [x0], x2 // store u
+ st1 {v0.d}[1], [x1], x2 // store v
+ st1 {v1.d}[0], [x0], x2
+ st1 {v1.d}[1], [x1], x2
+ st1 {v2.d}[0], [x0], x2
+ st1 {v2.d}[1], [x1], x2
+ st1 {v3.d}[0], [x0], x2
+ st1 {v3.d}[1], [x1], x2
+ st1 {v4.d}[0], [x0], x2
+ st1 {v4.d}[1], [x1], x2
+ st1 {v5.d}[0], [x0], x2
+ st1 {v5.d}[1], [x1], x2
+ st1 {v6.d}[0], [x0], x2
+ st1 {v6.d}[1], [x1], x2
+ st1 {v7.d}[0], [x0]
+ st1 {v7.d}[1], [x1]
ret
@@ -729,9 +729,9 @@ FFT16_FN ns_float, 1
.endm
.macro SR_COMBINE_4 len, part, off
- add x10, x1, x21
- add x11, x1, x21, lsl #1
- add x12, x1, x22
+ add x10, x1, x21
+ add x11, x1, x21, lsl #1
+ add x12, x1, x22
ldp q0, q1, [x1, #((0 + \part)*32 + \off)]
ldp q4, q5, [x1, #((2 + \part)*32 + \off)]
@@ -759,9 +759,9 @@ FFT16_FN ns_float, 1
.endm
.macro SR_COMBINE_FULL len, off=0
- add x10, x1, x21
- add x11, x1, x21, lsl #1
- add x12, x1, x22
+ add x10, x1, x21
+ add x11, x1, x21, lsl #1
+ add x12, x1, x22
SR_COMBINE_4 \len, 0, \off
SR_COMBINE_4 \len, 1, \off