@@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
+ sub w4, w4, #1 // we need to make h-1 iterations
ld1 {v0.8b}, [x1], x3
+ cmp w4, #3
movi v16.4s, #0
- sub w4, w4, #1 // we need to make h-1 iterations
- cmp w4, #3
b.lt 2f
1:
@@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1
ld1 {v2.8b}, [x1], x3
uabd v30.8b, v0.8b, v1.8b
ld1 {v3.8b}, [x1], x3
- umull v29.8h, v30.8b, v30.8b
uabd v27.8b, v1.8b, v2.8b
- uadalp v16.4s, v29.8h
- umull v26.8h, v27.8b, v27.8b
+ umull v29.8h, v30.8b, v30.8b
uabd v25.8b, v2.8b, v3.8b
- uadalp v16.4s, v26.8h
+ umull v26.8h, v27.8b, v27.8b
+ uadalp v16.4s, v29.8h
umull v24.8h, v25.8b, v25.8b
+ uadalp v16.4s, v26.8h
sub w4, w4, #3
uadalp v16.4s, v24.8h
cmp w4, #3
From: Martin Storsjö <martin@martin.st>

Before:       Cortex A53    A72    A73
vsse_5_neon:        74.7   31.5   26.0
After:
vsse_5_neon:        62.7   32.5   25.7
---
 libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)