diff mbox

[FFmpeg-devel,1/4] x86/af_afir: fix processing the last element

Message ID 20190103004357.5604-1-jamrial@gmail.com
State Accepted
Commit 9b5bd665e105894919cdcfa0ed9818919538e5f6
Headers show

Commit Message

James Almer Jan. 3, 2019, 12:43 a.m. UTC
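Stop padding the byte length by mmsize*2 before the vector loop, and instead
process the last element with scalar movss/mulss/addss after the loop.
ff_fcmul_add_sse3() is now identical to the C version.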

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavfilter/x86/af_afir.asm | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

Comments

Paul B Mahol Jan. 3, 2019, 8:43 a.m. UTC | #1
On 1/3/19, James Almer <jamrial@gmail.com> wrote:
> ff_fcmul_add_sse3() is now identical to the C version.
>
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>  libavfilter/x86/af_afir.asm | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/libavfilter/x86/af_afir.asm b/libavfilter/x86/af_afir.asm
> index 849d85e70f..fcc1f426db 100644
> --- a/libavfilter/x86/af_afir.asm
> +++ b/libavfilter/x86/af_afir.asm
> @@ -30,7 +30,6 @@ SECTION .text
>  INIT_XMM sse3
>  cglobal fcmul_add, 4,4,6, sum, t, c, len
>      shl       lend, 3
> -    add       lend, mmsize*2
>      add         tq, lenq
>      add         cq, lenq
>      add       sumq, lenq
> @@ -57,4 +56,8 @@ ALIGN 16
>      movaps    [sumq + lenq+mmsize], m3
>      add       lenq, mmsize*2
>      jl .loop
> -    REP_RET
> +    movss xm0, [tq + lenq]
> +    mulss xm0, [cq + lenq]
> +    addss xm0, [sumq + lenq]
> +    movss [sumq + lenq], xm0
> +    RET
> --
> 2.20.1

OK
diff mbox

Patch

diff --git a/libavfilter/x86/af_afir.asm b/libavfilter/x86/af_afir.asm
index 849d85e70f..fcc1f426db 100644
--- a/libavfilter/x86/af_afir.asm
+++ b/libavfilter/x86/af_afir.asm
@@ -30,7 +30,6 @@  SECTION .text
 INIT_XMM sse3
 cglobal fcmul_add, 4,4,6, sum, t, c, len
     shl       lend, 3
-    add       lend, mmsize*2
     add         tq, lenq
     add         cq, lenq
     add       sumq, lenq
@@ -57,4 +56,8 @@  ALIGN 16
     movaps    [sumq + lenq+mmsize], m3
     add       lenq, mmsize*2
     jl .loop
-    REP_RET
+    movss xm0, [tq + lenq]
+    mulss xm0, [cq + lenq]
+    addss xm0, [sumq + lenq]
+    movss [sumq + lenq], xm0
+    RET