@@ -1123,18 +1123,14 @@ endfunc
.endm
function idct32_odd
- ld1 {v0.8h,v1.8h}, [x11]
-
- dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
- dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
- dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
- dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
- dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
- dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
- dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
- dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
-
- ld1 {v0.8h}, [x10]
+ dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
@@ -1153,18 +1149,14 @@ function idct32_odd
endfunc
function idct32_odd_half
- ld1 {v0.8h,v1.8h}, [x11]
-
- dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
- dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
- dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
- dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
- dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
- dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
- dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
- dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
-
- ld1 {v0.8h}, [x10]
+ dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
@@ -1183,18 +1175,14 @@ function idct32_odd_half
endfunc
function idct32_odd_quarter
- ld1 {v0.8h,v1.8h}, [x11]
-
- dsmull_h v4, v5, v16, v0.h[0]
- dsmull_h v28, v29, v19, v0.h[7]
- dsmull_h v30, v31, v16, v0.h[1]
- dsmull_h v22, v23, v17, v1.h[6]
- dsmull_h v7, v6, v17, v1.h[7]
- dsmull_h v26, v27, v19, v0.h[6]
- dsmull_h v20, v21, v18, v1.h[0]
- dsmull_h v24, v25, v18, v1.h[1]
-
- ld1 {v0.8h}, [x10]
+ dsmull_h v4, v5, v16, v8.h[0]
+ dsmull_h v28, v29, v19, v8.h[7]
+ dsmull_h v30, v31, v16, v8.h[1]
+ dsmull_h v22, v23, v17, v9.h[6]
+ dsmull_h v7, v6, v17, v9.h[7]
+ dsmull_h v26, v27, v19, v8.h[6]
+ dsmull_h v20, v21, v18, v9.h[0]
+ dsmull_h v24, v25, v18, v9.h[1]
neg v28.4s, v28.4s
neg v29.4s, v29.4s
@@ -1240,12 +1228,8 @@ endfunc
// x1 = unused
// x2 = src
// x9 = double input stride
-// x10 = idct_coeffs
-// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass1\suffix\()_neon
mov x14, x30
- ld1 {v0.8h,v1.8h}, [x10]
-
movi v2.8h, #0
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon
.macro store_rev a, b
// There's no rev128 instruction, but we reverse each 64 bit
// half, and then flip them using an ext with 8 bytes offset.
- rev64 v1.8h, \b
+ rev64 v3.8h, \b // v3 = operand b with the 16 bit lanes of each 64 bit half reversed (v2/v3 are the scratch registers, v0/v1 are not written)
st1 {\a}, [x0], #16
- rev64 v0.8h, \a
- ext v1.16b, v1.16b, v1.16b, #8
+ rev64 v2.8h, \a // v2 = operand a, same per-half lane reversal
+ ext v3.16b, v3.16b, v3.16b, #8 // swap the two 64 bit halves, v3 is now operand b fully reversed
st1 {\b}, [x0], #16
- ext v0.16b, v0.16b, v0.16b, #8
- st1 {v1.8h}, [x0], #16
- st1 {v0.8h}, [x0], #16
+ ext v2.16b, v2.16b, v2.16b, #8 // swap the two 64 bit halves, v2 is now operand a fully reversed
+ st1 {v3.8h}, [x0], #16 // mirrored half of the row: reversed b is stored first
+ st1 {v2.8h}, [x0], #16 // then reversed a, x0 has advanced by 64 bytes in total
.endm
store_rev v16.8h, v24.8h
store_rev v17.8h, v25.8h
@@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon
// subtracted from the output.
.macro store_rev a, b
ld1 {v4.8h}, [x0]
- rev64 v1.8h, \b
+ rev64 v3.8h, \b // v3 = operand b with the 16 bit lanes of each 64 bit half reversed (v2/v3 are the scratch registers, v0/v1 are not written)
add v4.8h, v4.8h, \a
- rev64 v0.8h, \a
+ rev64 v2.8h, \a // v2 = operand a, same per-half lane reversal
st1 {v4.8h}, [x0], #16
- ext v1.16b, v1.16b, v1.16b, #8
+ ext v3.16b, v3.16b, v3.16b, #8 // swap the two 64 bit halves, v3 is now operand b fully reversed
ld1 {v5.8h}, [x0]
- ext v0.16b, v0.16b, v0.16b, #8
+ ext v2.16b, v2.16b, v2.16b, #8 // swap the two 64 bit halves, v2 is now operand a fully reversed
add v5.8h, v5.8h, \b
st1 {v5.8h}, [x0], #16
ld1 {v6.8h}, [x0]
- sub v6.8h, v6.8h, v1.8h
+ sub v6.8h, v6.8h, v3.8h // third vector of the row: subtract reversed b from the value already in memory
st1 {v6.8h}, [x0], #16
ld1 {v7.8h}, [x0]
- sub v7.8h, v7.8h, v0.8h
+ sub v7.8h, v7.8h, v2.8h // fourth vector of the row: subtract reversed a from the value already in memory
st1 {v7.8h}, [x0], #16
.endm
@@ -1376,12 +1360,8 @@ endfunc
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
-// x10 = idct_coeffs
-// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass2\suffix\()_neon
mov x14, x30
- ld1 {v0.8h,v1.8h}, [x10]
-
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon
sub v6.8h, v6.8h, \c
sub v7.8h, v7.8h, \d
.endif
- ld1 {v0.8b}, [x0], x1
- ld1 {v1.8b}, [x0], x1
+ ld1 {v10.8b}, [x0], x1
+ ld1 {v11.8b}, [x0], x1
srshr v4.8h, v4.8h, #6
ld1 {v2.8b}, [x0], x1
srshr v5.8h, v5.8h, #6
- uaddw v4.8h, v4.8h, v0.8b
+ uaddw v4.8h, v4.8h, v10.8b
ld1 {v3.8b}, [x0], x1
srshr v6.8h, v6.8h, #6
- uaddw v5.8h, v5.8h, v1.8b
+ uaddw v5.8h, v5.8h, v11.8b
srshr v7.8h, v7.8h, #6
sub x0, x0, x1, lsl #2
uaddw v6.8h, v6.8h, v2.8b
@@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
b.eq idct32x32_dc_add_neon
movrel x10, idct_coeffs
- add x11, x10, #32
movrel x12, min_eob_idct_idct_32, 2
mov x15, x30
- stp d14, d15, [sp, #-0x10]!
- stp d12, d13, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
@@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
mov x9, #128
neg x7, x9
+ ld1 {v0.8h,v1.8h}, [x10], #32
+ ld1 {v8.8h,v9.8h}, [x10]
+
cmp w3, #34
b.le idct32x32_quarter_add_neon
cmp w3, #135
@@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
br x15
endfunc
@@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
- ldp d12, d13, [sp], 0x10
- ldp d14, d15, [sp], 0x10
br x15
endfunc