diff mbox

[FFmpeg-devel] x86/aacpsdsp: add ff_ps_hybrid_analysis_ileave_sse

Message ID 20170613003505.4460-1-jamrial@gmail.com
State New
Headers show

Commit Message

James Almer June 13, 2017, 12:35 a.m. UTC
About 2x faster than the c version.

Signed-off-by: James Almer <jamrial@gmail.com>
---
Depends on "[PATCH] x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}"

 libavcodec/x86/aacpsdsp.asm    | 106 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/aacpsdsp_init.c |   3 ++
 2 files changed, 109 insertions(+)

Comments

James Almer June 19, 2017, 1:51 a.m. UTC | #1
On 6/12/2017 9:35 PM, James Almer wrote:
> About 2x faster than the c version.
> 
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
> Depends on "[PATCH] x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}"
> 
>  libavcodec/x86/aacpsdsp.asm    | 106 +++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/aacpsdsp_init.c |   3 ++
>  2 files changed, 109 insertions(+)

Pushed.
liyoubdu June 19, 2017, 2:14 a.m. UTC | #2
On 6/12/2017 9:35 PM, James Almer wrote:
> About 2x faster than the c version.

> 

> Signed-off-by: James Almer <jamrial@gmail.com>

> ---

> Depends on "[PATCH] x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}"

> 

>  libavcodec/x86/aacpsdsp.asm    | 106 +++++++++++++++++++++++++++++++++++++++++

>  libavcodec/x86/aacpsdsp_init.c |   3 ++

>  2 files changed, 109 insertions(+)


Pushed.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..

where can i watch the patch?
Steven Liu June 19, 2017, 3:25 a.m. UTC | #3
2017-06-19 10:14 GMT+08:00 liyoubdu <liyoubdu@qq.com>:
> On 6/12/2017 9:35 PM, James Almer wrote:
>> About 2x faster than the c version.
>>
>> Signed-off-by: James Almer <jamrial@gmail.com>
>> ---
>> Depends on "[PATCH] x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}"
>>
>>  libavcodec/x86/aacpsdsp.asm    | 106 +++++++++++++++++++++++++++++++++++++++++
>>  libavcodec/x86/aacpsdsp_init.c |   3 ++
>>  2 files changed, 109 insertions(+)
>
> Pushed.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..
> where can i watch the patch?
Now you can git clone FFmpeg mainline and use git log to
see the commit,
or look at this mail thread to see the patch context.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
diff mbox

Patch

diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index cdcadefcdc..70a3d84780 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -172,6 +172,112 @@  align 16
 .ret:
     REP_RET
 
+;**********************************************************
+;void ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
+;                                   float L[2][38][64],
+;                                   int i, int len)
+;**********************************************************
+INIT_XMM sse
+cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
+    movsxdifnidn        iq, id               ; widen i so it can be used in addresses
+    mov               lend, 32 << 3          ; lenq = 256 (= 32*2*4 bytes); the len argument is overwritten unread
+    lea                inq, [inq+iq*4]       ; in  += i floats
+    mov               tmpd, id
+    shl               tmpd, 8
+    add               outq, tmpq             ; out += i * 256 bytes
+    mov               tmpd, 64
+    sub               tmpd, id
+    mov                 id, tmpd             ; i = 64 - i, used below as a down-counter
+
+    test                id, 1
+    jne .loop4                               ; odd count: handle a single i first
+    test                id, 2
+    jne .loop8                               ; count % 4 == 2: handle a pair first
+
+align 16
+.loop16:                                     ; handles 4 consecutive i values per pass
+    mov               in0q, inq
+    mov               in1q, 38*64*4          ; byte size of one [38][64] float plane
+    add               in1q, in0q             ; in1 = same position in the second plane
+    mov               tmpd, lend             ; inner byte counter: 256 / mmsize iterations
+
+.inner_loop16:
+    movaps              m0, [in0q]           ; 4 floats, plane 0, current row
+    movaps              m1, [in1q]           ; 4 floats, plane 1, current row
+    movaps              m2, [in0q+lenq]      ; plane 0, 256 bytes (64 floats) further
+    movaps              m3, [in1q+lenq]      ; plane 1, 256 bytes (64 floats) further
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4             ; m4 is passed as the extra (scratch) register
+    movaps          [outq], m0
+    movaps     [outq+lenq], m1
+    movaps   [outq+lenq*2], m2
+    movaps [outq+3*32*2*4], m3               ; outq + 3*256; a *3 index scale cannot be encoded
+    lea               in0q, [in0q+lenq*2]    ; skip the two input rows just read
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop16
+    add                inq, 16               ; in += 4 floats
+    add               outq, 3*32*2*4         ; inner loop advanced outq by 256; total 4*256
+    sub                 id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:                                      ; handles 2 consecutive i values
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q             ; in1 = same position in the second plane
+    mov               tmpd, lend
+
+.inner_loop8:
+    movlps              m0, [in0q]           ; low half:  2 floats, plane 0, current row
+    movlps              m1, [in1q]           ; low half:  2 floats, plane 1, current row
+    movhps              m0, [in0q+lenq]      ; high half: plane 0, 256 bytes further
+    movhps              m1, [in1q+lenq]      ; high half: plane 1, 256 bytes further
+    SBUTTERFLYPS 0, 1, 2                     ; m2 is passed as the extra (scratch) register
+    SBUTTERFLYPD 0, 1, 2
+    movaps          [outq], m0
+    movaps     [outq+lenq], m1
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop8
+    add                inq, 8                ; in += 2 floats
+    add               outq, lenq             ; inner loop advanced outq by 256; total 2*256
+    sub                 id, 2
+    jg .loop16                               ; anything left is processed 4 at a time
+    RET
+
+align 16
+.loop4:                                      ; handles a single i value
+    mov               in0q, inq
+    mov               in1q, 38*64*4
+    add               in1q, in0q             ; in1 = same position in the second plane
+    mov               tmpd, lend
+
+.inner_loop4:
+    movss               m0, [in0q]
+    movss               m1, [in1q]
+    movss               m2, [in0q+lenq]
+    movss               m3, [in1q+lenq]
+    movlhps             m0, m1               ; m0 = { in0[0], 0, in1[0], 0 }
+    movlhps             m2, m3               ; m2 = { in0[len], 0, in1[len], 0 }
+    shufps              m0, m2, q2020        ; m0 = { in0[0], in1[0], in0[len], in1[len] }
+    movaps          [outq], m0
+    lea               in0q, [in0q+lenq*2]
+    lea               in1q, [in1q+lenq*2]
+    add               outq, mmsize
+    sub               tmpd, mmsize
+    jg .inner_loop4
+    add                inq, 4                ; in += 1 float; outq already advanced by 256
+    sub                 id, 1
+    test                id, 2
+    jne .loop8                               ; a pair remains before the 4-wide loop
+    cmp                 id, 4
+    jge .loop16                              ; at least 4 left: continue 4 at a time
+    RET
+
 ;***********************************************************
 ;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
 ;                                    float (*in)[32][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index 25e089c395..056e23e59e 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -44,6 +44,8 @@  void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
                                       int i, int len);
 void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
                                        int i, int len);
+void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
+                                      int i, int len);
 
 av_cold void ff_psdsp_init_x86(PSDSPContext *s)
 {
@@ -52,6 +54,7 @@  av_cold void ff_psdsp_init_x86(PSDSPContext *s)
     if (EXTERNAL_SSE(cpu_flags)) {
         s->add_squares            = ff_ps_add_squares_sse;
         s->mul_pair_single        = ff_ps_mul_pair_single_sse;
+        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
         s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
         s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
     }