[FFmpeg-devel] JPEG2000: SSE optimisation of DWT decoding

Submitted by Nicolas Bertrand on Oct. 6, 2017, 3:30 p.m.

Details

Message ID 20171006153057.12159-1-nicoinattendu@gmail.com
State New
Headers show

Commit Message

Nicolas Bertrand Oct. 6, 2017, 3:30 p.m.
From: Maxime Taisant <maximetaisant@hotmail.fr>

---
 libavcodec/jpeg2000dwt.c          |   45 +-
 libavcodec/jpeg2000dwt.h          |    5 +
 libavcodec/x86/jpeg2000dsp.asm    | 1339 +++++++++++++++++++++++++++++++++++++
 libavcodec/x86/jpeg2000dsp_init.c |  119 ++++
 tests/checkasm/jpeg2000dsp.c      |    1 +
 5 files changed, 1496 insertions(+), 13 deletions(-)

Comments

Carl Eugen Hoyos Oct. 6, 2017, 9:43 p.m.
2017-10-06 17:30 GMT+02:00 Nicolas Bertrand <nicoinattendu@gmail.com>:
> From: Maxime Taisant <maximetaisant@hotmail.fr>
>
> ---
>  libavcodec/jpeg2000dwt.c          |   45 +-
>  libavcodec/jpeg2000dwt.h          |    5 +
>  libavcodec/x86/jpeg2000dsp.asm    | 1339 +++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/jpeg2000dsp_init.c |  119 ++++
>  tests/checkasm/jpeg2000dsp.c      |    1 +
>  5 files changed, 1496 insertions(+), 13 deletions(-)
>
> diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
> index 55dd5e89b5..1a0c3fc034 100644
> --- a/libavcodec/jpeg2000dwt.c
> +++ b/libavcodec/jpeg2000dwt.c
> @@ -30,6 +30,7 @@
>  #include "libavutil/mem.h"
>  #include "jpeg2000dwt.h"
>  #include "internal.h"
> +#include "libavutil/timer.h"

This include should not be part of the final patch, same
for the commented TIMER calls.
But the results of your performance tests should be
at least part of the email, can be part of the commit
message.

> +    s->sse = 0;

I originally assumed this is a leftover but iiuc, you
need to add function_pointers to the context
and replace every function call with a call to the
function in the context.

> -    switch (s->type) {
> -    case FF_DWT97:
> -        dwt_decode97_float(s, t);
> -        break;
> -    case FF_DWT97_INT:
> -        dwt_decode97_int(s, t);
> -        break;
> -    case FF_DWT53:
> -        dwt_decode53(s, t);
> -        break;
> -    default:
> -        return -1;
> +    switch(s->type){

> +        case FF_DWT97:

We like the indentation here as it is.

> +            if (s->sse)

> +            //{

Please keep the brackets in the final patch,
this variant is a very good example on why
they are useful;-)

> +            //    START_TIMER

But remove this.

Consider waiting for a review of the asm code.

Thank you, Carl Eugen
Michael Niedermayer Oct. 6, 2017, 11:44 p.m.
On Fri, Oct 06, 2017 at 05:30:57PM +0200, Nicolas Bertrand wrote:
> From: Maxime Taisant <maximetaisant@hotmail.fr>
> 
> ---
>  libavcodec/jpeg2000dwt.c          |   45 +-
>  libavcodec/jpeg2000dwt.h          |    5 +
>  libavcodec/x86/jpeg2000dsp.asm    | 1339 +++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/jpeg2000dsp_init.c |  119 ++++
>  tests/checkasm/jpeg2000dsp.c      |    1 +
>  5 files changed, 1496 insertions(+), 13 deletions(-)

This fails to build on x86-32 linux

src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r7q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r10q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r10q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r10q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r10q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r10q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r11q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r10q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r8q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r9q' undefined
src/libavcodec/x86/jpeg2000dsp.asm:938: error: symbol `r10q' undefined
...

[...]

Patch hide | download patch | download mbox

diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 55dd5e89b5..1a0c3fc034 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -30,6 +30,7 @@ 
 #include "libavutil/mem.h"
 #include "jpeg2000dwt.h"
 #include "internal.h"
+#include "libavutil/timer.h"
 
 /* Defines for 9/7 DWT lifting parameters.
  * Parameters are in float. */
@@ -558,7 +559,7 @@  int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
         }
     switch (type) {
     case FF_DWT97:
-        s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
+        s->f_linebuf = av_malloc_array(4*(maxlen + 12), sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
         break;
@@ -575,6 +576,11 @@  int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
     default:
         return -1;
     }
+
+    s->sse = 0;
+    if (ARCH_X86)
+        ff_jpeg2000dwt_init_x86(s, type);
+
     return 0;
 }
 
@@ -601,18 +607,31 @@  int ff_dwt_decode(DWTContext *s, void *t)
     if (s->ndeclevels == 0)
         return 0;
 
-    switch (s->type) {
-    case FF_DWT97:
-        dwt_decode97_float(s, t);
-        break;
-    case FF_DWT97_INT:
-        dwt_decode97_int(s, t);
-        break;
-    case FF_DWT53:
-        dwt_decode53(s, t);
-        break;
-    default:
-        return -1;
+    switch(s->type){
+        case FF_DWT97:
+            if (s->sse)
+            //{
+            //    START_TIMER
+                dwt_decode97_float_sse(s, t);
+            //    STOP_TIMER("dwt_decode97_float_sse");
+            //}
+            else            
+            //{
+            //    START_TIMER
+                dwt_decode97_float(s, t);
+            //    STOP_TIMER("dwt_decode97_float");
+            //}
+            /*{
+                START_TIMER
+                STOP_TIMER("decode_NULL");
+            }*/
+            break;
+        case FF_DWT97_INT:
+            dwt_decode97_int(s, t); break;
+        case FF_DWT53:
+            dwt_decode53(s, t); break;
+        default:
+            return -1;
     }
     return 0;
 }
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 718d183ac1..622a404b79 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -48,6 +48,7 @@  typedef struct DWTContext {
     uint8_t type;                        ///< 0 for 9/7; 1 for 5/3
     int32_t *i_linebuf;                  ///< int buffer used by transform
     float   *f_linebuf;                  ///< float buffer used by transform
+    int sse;
 } DWTContext;
 
 /**
@@ -65,4 +66,8 @@  int ff_dwt_decode(DWTContext *s, void *t);
 
 void ff_dwt_destroy(DWTContext *s);
 
+void dwt_decode97_float_sse(DWTContext *s, float *t);
+
+void ff_jpeg2000dwt_init_x86(DWTContext *s, int type);
+
 #endif /* AVCODEC_JPEG2000DWT_H */
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
index 56b5fbd606..b5d5b9a04b 100644
--- a/libavcodec/x86/jpeg2000dsp.asm
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -2,6 +2,7 @@ 
 ;* SIMD-optimized JPEG2000 DSP functions
 ;* Copyright (c) 2014 Nicolas Bertrand
 ;* Copyright (c) 2015 James Almer
+;* Copyright (c) 2017 Maxime Taisant
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -29,6 +30,16 @@  pf_ict1: times 8 dd 0.34413
 pf_ict2: times 8 dd 0.71414
 pf_ict3: times 8 dd 1.772
 
+F_LFTG_K: dd 1.230174104914001
+F_LFTG_X: dd 0.812893066115961
+
+F_LFTG_ALPHA: times 8 dd 1.586134342059924
+F_LFTG_BETA: times 8 dd 0.052980118572961
+F_LFTG_GAMMA: times 8 dd 0.882911075530934
+F_LFTG_DELTA: times 8 dd 0.443506852043971
+
+TWO: dd 2.0
+
 SECTION .text
 
 ;***********************************************************************
@@ -142,3 +153,1331 @@  RCT_INT
 INIT_YMM avx2
 RCT_INT
 %endif
+
+;***********************************************************************
+; ff_sr_1d97_float_<opt>(float *line, int i0, int i1)
+;***********************************************************************
+%macro SR1D97FLOAT 1
+cglobal sr_1d97_float, 3, 5, %1, line, i0, i1, j0, j1
+    mov   j0q, i0q
+    mov   j1q, i1q
+    add   j0q, 1
+    cmp   j1q, j0q
+    jg %%extend
+    sub   j0q, 2
+    jnz %%else
+    movss  m0, [lineq+4]
+    movss  m1, [F_LFTG_K]
+    movss  m2, [TWO]
+    divss  m1, m2
+    mulss  m0, m1
+    movss  [lineq+4], m0
+    jmp %%end
+
+%%else:
+    movss  m0, [lineq]
+    movss  m1, [F_LFTG_X]
+    mulss  m0, m1
+    movss [lineq], m0
+    jmp %%end
+
+%%extend:
+    shl   i0d, 2
+    shl   i1d, 2
+    mov   j0q, i0q
+    mov   j1q, i1q
+    movups m0, [lineq+j0q+4]
+    shufps m0, m0, q0123
+    movups [lineq+j0q-16], m0
+    movups m0, [lineq+j1q-20]
+    shufps m0, m0, q0123
+    movups [lineq+j1q], m0
+
+    movups m3, [F_LFTG_DELTA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 8
+    cmp   j0q, j1q
+    jge %%beginloop2
+%%loop1:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop1
+ 
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} -= F_LFTG_DELTA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28]
+    movups m4, [lineq+2*j0q-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-24]
+    movups m5, [lineq+2*j0q-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%beginloop2
+    jmp %%loop1
+  
+%%endloop1:
+    sub   j0q, 12
+%%littleloop1:
+    movss  m0, [lineq+2*j0q]
+    movss  m1, [lineq+2*j0q-4]
+    movss  m2, [lineq+2*j0q+4]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss [lineq+2*j0q], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop1
+
+%%beginloop2:
+    movups m3, [F_LFTG_GAMMA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 4
+    cmp   j0q, j1q
+    jge %%beginloop3
+%%loop2:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop2
+ 
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} -= F_LFTG_GAMMA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24]
+    movups m4, [lineq+2*j0q-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%beginloop3
+    jmp %%loop2
+  
+%%endloop2:
+    sub   j0q, 12
+%%littleloop2:
+    movss  m0, [lineq+2*j0q+4]
+    movss  m1, [lineq+2*j0q]
+    movss  m2, [lineq+2*j0q+8]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss  [lineq+2*j0q+4], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop2
+
+%%beginloop3:
+    movups m3, [F_LFTG_BETA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 8
+    cmp   j0q, j1q
+    jge %%beginloop4
+%%loop3:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop3
+ 
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} += F_LFTG_BETA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28]
+    movups m4, [lineq+2*j0q-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-24]
+    movups m5, [lineq+2*j0q-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%beginloop4
+    jmp %%loop3
+  
+%%endloop3:
+    sub   j0q, 12
+%%littleloop3:
+    movss  m0, [lineq+2*j0q]
+    movss  m1, [lineq+2*j0q-4]
+    movss  m2, [lineq+2*j0q+4]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss [lineq+2*j0q], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop3
+
+%%beginloop4:
+    movups m3, [F_LFTG_ALPHA]
+    mov   j0q, i0q
+    mov   j1q, i1q
+    shr   j0q, 1
+    sub   j0q, 4
+    shr   j1q, 1
+    add   j1q, 4
+    cmp   j0q, j1q
+    jge %%end
+%%loop4:
+    add   j0q, 12
+    cmp   j0q, j1q
+    jge %%endloop4
+ 
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} += F_LFTG_ALPHA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24]
+    movups m4, [lineq+2*j0q-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add   j0q, 4
+    cmp   j0q, j1q
+    jge %%end
+    jmp %%loop4
+  
+%%endloop4:
+    sub   j0q, 12
+%%littleloop4:
+    movss  m0, [lineq+2*j0q+4]
+    movss  m1, [lineq+2*j0q]
+    movss  m2, [lineq+2*j0q+8]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss [lineq+2*j0q+4], m0
+    add   j0q, 4
+    cmp   j0q, j1q
+    jl %%littleloop4
+
+%%end:
+    REP_RET
+%endmacro
+    
+INIT_XMM sse
+SR1D97FLOAT 6
+
+%macro SR1D97FLOAT_ 5      ; p, i0, i1, tmp0, tmp1
+    mov    %4, %2
+    mov    %5, %3
+    add    %4, 1
+    cmp    %5, %4
+    jg %%extend
+    sub    %4, 2
+    jnz %%else
+    movss  m0, [%1+4]
+    movss  m1, [F_LFTG_K]
+    movss  m2, [TWO]
+    divss  m1, m2
+    mulss  m0, m1
+    movss  [%1+4], m0
+    jmp %%end
+
+%%else:
+    movss  m0, [%1]
+    movss  m1, [F_LFTG_X]
+    mulss  m0, m1
+    movss  [%1], m0
+    jmp %%end
+
+%%extend:
+    shl    %2, 2
+    shl    %3, 2
+    mov    %4, %2
+    mov    %5, %3
+    movups m0, [%1+%4+4]
+    shufps m0, m0, q0123
+    movups [%1+%4-16], m0
+    movups m0, [%1+%5-20]
+    shufps m0, m0, q0123
+    movups [%1+%5], m0
+
+    movups m3, [F_LFTG_DELTA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 8
+    cmp    %4, %5
+    jge %%beginloop2
+%%loop1:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop1
+ 
+    movups m0, [%1+2*%4-28]
+    movups m4, [%1+2*%4-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-24]
+    movups m5, [%1+2*%4-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%beginloop2
+    jmp %%loop1
+  
+%%endloop1:
+    sub    %4, 12
+%%littleloop1:
+    movss  m0, [%1+2*%4]
+    movss  m1, [%1+2*%4-4]
+    movss  m2, [%1+2*%4+4]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss  [%1+2*%4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop1
+
+%%beginloop2:
+    movups m3, [F_LFTG_GAMMA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 4
+    cmp    %4, %5
+    jge %%beginloop3
+%%loop2:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop2
+ 
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    subps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%beginloop3
+    jmp %%loop2
+  
+%%endloop2:
+    sub    %4, 12
+%%littleloop2:
+    movss  m0, [%1+2*%4+4]
+    movss  m1, [%1+2*%4]
+    movss  m2, [%1+2*%4+8]
+    addss  m1, m2
+    mulss  m1, m3
+    subss  m0, m1
+    movss  [%1+2*%4+4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop2
+
+%%beginloop3:
+    movups m3, [F_LFTG_BETA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 8
+    cmp    %4, %5
+    jge %%beginloop4
+%%loop3:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop3
+
+    movups m0, [%1+2*%4-28]
+    movups m4, [%1+2*%4-12]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-24]
+    movups m5, [%1+2*%4-8] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%beginloop4
+    jmp %%loop3
+  
+%%endloop3:
+    sub    %4, 12
+%%littleloop3:
+    movss  m0, [%1+2*%4]
+    movss  m1, [%1+2*%4-4]
+    movss  m2, [%1+2*%4+4]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss  [%1+2*%4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop3
+
+%%beginloop4:
+    movups m3, [F_LFTG_ALPHA]
+    mov    %4, %2
+    mov    %5, %3
+    shr    %4, 1
+    sub    %4, 4
+    shr    %5, 1
+    add    %5, 4
+    cmp    %4, %5
+    jge %%end
+%%loop4:
+    add    %4, 12
+    cmp    %4, %5
+    jge %%endloop4
+ 
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movaps m1, m0
+    shufps m0, m4, q3131
+    shufps m1, m4, q2020
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4] 
+    shufps m2, m5, q3131
+    addps  m2, m1
+    mulps  m2, m3
+    addps  m0, m2
+    movaps m4, m1
+    movlhps m1, m0
+    shufps m1, m1, q3120
+    shufps m4, m0, q3232
+    shufps m4, m4, q3120
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add    %4, 4
+    cmp    %4, %5
+    jge %%end
+    jmp %%loop4
+  
+%%endloop4:
+    sub    %4, 12
+%%littleloop4:
+    movss  m0, [%1+2*%4+4]
+    movss  m1, [%1+2*%4]
+    movss  m2, [%1+2*%4+8]
+    addss  m1, m2
+    mulss  m1, m3
+    addss  m0, m1
+    movss [%1+2*%4+4], m0
+    add    %4, 4
+    cmp    %4, %5
+    jl %%littleloop4
+
+%%end:
+    shr    %2, 2
+    shr    %3, 2
+%endmacro
+
+
+;***********************************************************************
+; ff_hor_sd_float_<opt>(float *line, float *data, int mh, int lh, int lv, int w)
+;***********************************************************************
+%macro HORSDFLOAT 1
+cglobal hor_sd_float, 6, 12, %1, line, data, mh, lh, lv, w, l, lp, i0, i1, j0, j1
+    mov    lq, mhq
+    shl    lq, 2
+    add    lq, lineq
+    shl   lhq, 2
+    
+    mov   lpq, 0
+%%mainloop:
+    ;j0 = w*lp+j
+    mov   j0q, wq
+    imul  j0q, lpq
+
+    ;j1 = (lh-mh+1)/2 + j0
+    mov   j1q, lhq
+    shr   j1q, 2
+    sub   j1q, mhq
+    add   j1q, 1
+    shr   j1q, 1
+    add   j1q, j0q
+
+    shl   j0q, 2
+    shl   j1q, 2
+
+    ;i1 = 1-mh
+    mov   i1q, 1
+    sub   i1q, mhq
+    shl   i1q, 2
+
+    ;i0 = mh
+    mov   i0q, mhq
+    shl   i0q, 2
+ 
+    cmp   i0q, i1q
+    jg %%i1i0
+
+;i0 < i1
+    cmp   i1q, lhq
+    jge %%i0
+    
+    add   i0q, 4
+    cmp   i0q, i1q
+    jne %%inci0
+ 
+;i1 = i0+1
+%%beginloopi0i1   
+    sub   i0q, 4
+
+%%loopi0i1:
+    add   i1q, 24
+    cmp   i1q, lhq
+    jge %%endloopi0i1
+
+    ;l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    ;l{i0+1,i0+3,i0+5,i0+7} = l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    movups m0, [dataq+j0q]
+    movups m2, [dataq+j1q]
+    movaps m1, m0
+    movlhps m0, m2
+    shufps m0, m0, q3120
+    shufps m1, m2, q3232
+    shufps m1, m1, q3120
+    movups [lq+i0q], m0
+    movups [lq+i0q+16], m1
+
+    add   i1q, 8
+    add   i0q, 32
+    add   j0q, 16
+    add   j1q, 16
+    cmp   i1q, lhq
+    jl %%loopi0i1  
+    cmp   i0q, lhq
+    jge %%sr_1d
+
+    ;i1>=lh & i0<lh
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    jmp %%sr_1d
+
+;i1 + 6 >= lh
+%%endloopi0i1:
+    sub   i1q, 24
+%%littleloopi0i1:
+
+    ;l[i0] <- data[j0]
+    ;l[i1] <- data[j1]
+    movss m0, [dataq+j0q]
+    movss m1, [dataq+j1q]
+    movss [lq+i0q], m0
+    movss [lq+i1q], m1
+
+    add   i0q, 8
+    add   i1q, 8
+    add   j0q, 4
+    add   j1q, 4
+    cmp   i1q, lhq
+    jl %%littleloopi0i1
+    cmp   i0q, lhq
+    jge %%sr_1d
+
+    ;i1>=lh & i0<lh
+    movss  m0, [dataq+j0q]
+    movss  [lq+i0q], m0
+    jmp %%sr_1d
+
+;i1 < i0
+%%i1i0:
+    cmp   i0q, lhq
+    jge %%i1
+    
+    add   i1q, 4
+    cmp   i0q, i1q
+    jne %%inci1
+
+;i0 = i1+1
+%%beginloopi1i0    
+    sub   i1q, 4
+
+%%loopi1i0:
+    add   i0q, 24
+    cmp   i0q, lhq
+    jge %%endloopi1i0
+
+    ;l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    ;l{i1+1,i1+3,i1+5,i1+7} = l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    movups m0, [dataq+j1q]
+    movups m2, [dataq+j0q]
+    movaps m1, m0
+    movlhps m0, m2
+    shufps m0, m0, q3120
+    shufps m1, m2, q3232
+    shufps m1, m1, q3120
+    movups [lq+i1q], m0
+    movups [lq+i1q+16], m1
+
+    add   i0q, 8
+    add   i1q, 32
+    add   j0q, 16
+    add   j1q, 16
+    cmp   i0q, lhq
+    jl %%loopi1i0  
+    cmp   i1q, lhq
+    jge %%sr_1d
+
+    ;i0>=lh & i1<lh
+    movss  m0, [dataq+j1q]
+    movss  [lq+i1q], m0
+    jmp %%sr_1d
+
+%%endloopi1i0:
+    sub   i1q, 24
+%%littleloopi1i0:
+
+    ;l[i0] <- data[j0]
+    ;l[i1] <- data[j1]
+    movss  m0, [dataq+j1q]
+    movss  m1, [dataq+j0q]
+    movss  [lq+i1q], m0
+    movss  [lq+i0q], m1
+
+    add   i0q, 8
+    add   i1q, 8
+    add   j0q, 4
+    add   j1q, 4
+    cmp   i1q, lhq
+    jl %%littleloopi1i0
+    cmp   i0q, lhq
+    jge %%sr_1d
+
+    ;i0>=lh & i1<lh
+    movss  m0, [dataq+j0q]
+    movss  [lq+i0q], m0
+    jmp %%sr_1d
+
+;i0<i1 & i1>=lh
+%%i0:
+    cmp   i0q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j0q]
+    movss  [lq+i0q], m0
+    add   i0q, 8
+    add   j0q, 4
+    jmp %%i0
+
+;i1<i0 & i0>=lh
+%%i1:
+    cmp   i1q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j1q]
+    movss [lq+i1q], m0
+    add   i1q, 8
+    add   j1q, 4
+    jmp %%i1
+
+;i0 < i1-1
+%%inci0:
+    cmp   i0q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j0q]
+    movss  [lq+i0q-4], m0
+    add   i0q, 8
+    add   j0q, 4
+    cmp   i0q, i1q
+    je %%beginloopi0i1
+    jmp %%inci0
+
+;i1 < i0-1
+%%inci1:
+    cmp   i1q, lhq
+    jge %%sr_1d
+    movss  m0, [dataq+j1q]
+    movss  [lq+i1q-4], m0
+    add   i1q, 8
+    add   j1q, 4
+    cmp   i0q, i1q
+    je %%beginloopi1i0
+    jmp %%inci1
+
+%%sr_1d:
+    mov   i0q, mhq
+    mov   i1q, lhq
+    shr   i1q, 2
+    add   i1q, mhq
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+
+    mov   i0q, 0
+    cmp   i0q, lhq
+    jge %%endmainloop
+    mov   j0q, wq
+    imul  j0q, lpq
+    shl   j0q, 2
+%%subloop3:
+    add   i0q, 12
+    cmp   i0q, lhq
+    jge %%endsubloop3
+
+    movups  m0, [lq+i0q-12]
+    movups  [dataq+j0q], m0
+
+    add   i0q, 4
+    add   j0q, 16
+    cmp   i0q, lhq
+    jge %%endmainloop
+    jmp %%subloop3  
+
+%%endsubloop3:
+    sub   i0q, 12
+%%littlesubloop3:
+    movss  m0, [lq+i0q]
+    movss  [dataq+j0q], m0
+
+    add   i0q, 4
+    add   j0q, 4
+    cmp   i0q, lhq
+    jl %%littlesubloop3  
+
+%%endmainloop:
+    add   lpq, 1
+    cmp   lpq, lvq
+    jl %%mainloop
+
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+HORSDFLOAT 6
+
+;***********************************************************************
+; ff_ver_sd_float_<opt>(float *line, float *data, int mv, int lv, int lh, int w)
+;***********************************************************************
+%macro VERSDFLOAT 1
+cglobal ver_sd_float, 6, 12, %1, line, data, mv, lh, lv, w, lp, i0, i1, j0, j1, inc
+    shl   mvq, 2
+    add   lineq, mvq
+    mov   incq, lvq
+    add   incq, 12
+    shl   incq, 2
+    shl   lvq, 2
+    shl   wq, 2    
+    
+    mov   lpq, 0
+
+%%mainloop:
+    ;j0 = w*j+lp
+    mov   j0q, lpq
+
+    add   lpq, 3
+    cmp   lpq, lhq
+    jge %%beginmainloop2
+
+    shr   lvq, 2
+    shr   wq, 2
+    ;j1 = w*(lv-mv+1)/2 + j0
+    mov   j1q, lvq
+    sub   j1q, mvq
+    add   j1q, 1
+    shr   j1q, 1
+    imul  j1q, wq
+    add   j1q, j0q
+
+    shl   lvq, 2
+    shl   wq, 2
+    shl   j1q, 2
+    shl   j0q, 2
+
+    ;i1 = 1-mv
+    mov   i1q, 4
+    sub   i1q, mvq
+
+    ;i0 = mv
+    mov   i0q, mvq
+ 
+    cmp   i0q, i1q
+    jg %%i1i0
+
+;i0 < i1
+    cmp   i1q, lvq
+    jge %%i0
+  
+    add   i0q, 4
+    cmp   i0q, i1q
+    jne %%inci0
+ 
+;i1 = i0+1
+%%beginloopi0i1   
+    sub   i0q, 4
+
+%%loopi0i1:
+;    add   i1q, 12
+;    cmp   i1q, lvq
+;    jge %%endloopi0i1
+    
+;    movlps  m0, [dataq+j0q]
+;    movhps  m0, [dataq+j1q]
+;    movlps  m1, [dataq+j0q+8]
+;    movhps  m1, [dataq+j1q+8]
+;    add     j0q, wq
+;    add     j1q, wq
+;    movlps  m2, [dataq+j0q]
+;    movhps  m2, [dataq+j1q]
+;    movlps  m3, [dataq+j0q+8]
+;    movhps  m3, [dataq+j1q+8]
+;    movaps  m4, m0
+;    shufps  m0, m2, q2020
+;    shufps  m4, m2, q3131
+;    movaps  m2, m4
+;    movaps  m4, m1
+;    shufps  m1, m3, q2020
+;    shufps  m4, m3, q3131
+;    movaps  m3, m4
+;    movups  [lineq+i0q], m0
+;    add     lineq, incq
+;    movups  [lineq+i0q], m2
+;    add     lineq, incq
+;    movups  [lineq+i0q], m1
+;    add     lineq, incq
+;    movups  [lineq+i0q], m3
+;    sub     lineq, incq
+;    sub     lineq, incq
+;    sub     lineq, incq
+
+;    add     i1q, 4
+;    add     i0q, 16
+;    add     j0q, wq
+;    add     j1q, wq
+;    cmp     i1q, lvq
+;    jl %%loopi0i1 
+;    cmp     i0q, lvq
+;    jl %%lasti0 
+;    jmp %%sr_1d
+
+;i1 + 3 >= lv
+%%endloopi0i1:
+;    sub     i1q, 12
+%%littleloopi0i1:
+
+    movss   m0, [dataq+j0q]
+    movss   m1, [dataq+j1q]
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    movss   m0, [dataq+j0q+4]
+    movss   m1, [dataq+j1q+4]
+    add     lineq, incq
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    movss   m0, [dataq+j0q+8]
+    movss   m1, [dataq+j1q+8]
+    add     lineq, incq
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    movss   m0, [dataq+j0q+12]
+    movss   m1, [dataq+j1q+12]
+    add     lineq, incq
+    movss   [lineq+i0q], m0
+    movss   [lineq+i0q+4], m1
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i1q, 8
+    add     i0q, 8
+    add     j0q, wq
+    add     j1q, wq
+    cmp     i1q, lvq
+    jl %%littleloopi0i1
+    cmp     i0q, lvq
+    jge %%sr_1d
+
+%%lasti0:
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+4]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+8]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+12]
+    movss   [lineq+i0q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    jmp %%sr_1d
+
+;i1 < i0
+%%i1i0:
+    cmp     i0q, lvq
+    jge %%i1
+    
+    add     i1q, 4
+    cmp     i0q, i1q
+    jne %%inci1
+
+;i0 = i1+1
+%%beginloopi1i0    
+    sub     i1q, 4
+
+%%loopi1i0:
+;    add   i0q, 12
+;    cmp   i0q, lvq
+;    jge %%endloopi0i1
+    
+;    movlps  m0, [dataq+j1q]
+;    movhps  m0, [dataq+j0q]
+;    movlps  m1, [dataq+j1q+8]
+;    movhps  m1, [dataq+j0q+8]
+;    add     j0q, wq
+;    add     j1q, wq
+;    movlps  m2, [dataq+j1q]
+;    movhps  m2, [dataq+j0q]
+;    movlps  m3, [dataq+j1q+8]
+;    movhps  m3, [dataq+j0q+8]
+;    movaps  m4, m0
+;    shufps  m0, m2, q2020
+;    shufps  m4, m2, q3131
+;    movaps  m2, m4
+;    movaps  m4, m1
+;    shufps  m1, m3, q2020
+;    shufps  m4, m3, q3131
+;    movaps  m3, m4
+;    movups  [lineq+i1q], m0
+;    add     lineq, incq
+;    movups  [lineq+i1q], m2
+;    add     lineq, incq
+;    movups  [lineq+i1q], m1
+;    add     lineq, incq
+;    movups  [lineq+i1q], m3
+;    sub     lineq, incq
+;    sub     lineq, incq
+;    sub     lineq, incq
+
+;    add     i0q, 4
+;    add     i1q, 16
+;    add     j1q, wq
+;    add     j0q, wq
+;    cmp     i0q, lvq
+;    jl %%loopi1i0 
+;    cmp     i1q, lvq
+;    jl %%lasti1 
+;    jmp %%sr_1d
+
+%%endloopi1i0:
+;    sub     i1q, 12
+%%littleloopi1i0:
+
+    movss   m0, [dataq+j1q]
+    movss   m1, [dataq+j0q]
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    movss   m0, [dataq+j1q+4]
+    movss   m1, [dataq+j0q+4]
+    add     lineq, incq
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    movss   m0, [dataq+j1q+8]
+    movss   m1, [dataq+j0q+8]
+    add     lineq, incq
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    movss   m0, [dataq+j1q+12]
+    movss   m1, [dataq+j0q+12]
+    add     lineq, incq
+    movss   [lineq+i1q], m0
+    movss   [lineq+i1q+4], m1
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i1q, 8
+    add     i0q, 8
+    add     j0q, wq
+    add     j1q, wq
+    cmp     i0q, lvq
+    jl %%littleloopi1i0
+    cmp     i1q, lvq
+    jge %%sr_1d
+
+%%lasti1:
+    movss   m0, [dataq+j1q]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+4]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+8]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+12]
+    movss   [lineq+i1q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    jmp %%sr_1d
+
+;i0<i1 & i1>=lv
+%%i0:
+    cmp     i0q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+4]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+8]
+    movss   [lineq+i0q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+12]
+    movss   [lineq+i0q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i0q, 8
+    add     j0q, wq
+    jmp %%i0
+
+;i1<i0 & i0>=lv
+%%i1:
+    cmp     i1q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j1q]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+4]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+8]
+    movss   [lineq+i1q], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+12]
+    movss   [lineq+i1q], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i1q, 8
+    add     j1q, wq
+    jmp %%i1
+
+;i0 < i1-1
+%%inci0:
+    cmp     i0q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+4]
+    movss   [lineq+i0q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+8]
+    movss   [lineq+i0q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j0q+12]
+    movss   [lineq+i0q-4], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i0q, 8
+    add     j0q, wq
+    cmp     i0q, i1q
+    je %%beginloopi0i1
+    jmp %%inci0
+
+;i1 < i0-1
+%%inci1:
+    cmp     i1q, lvq
+    jge %%sr_1d
+    movss   m0, [dataq+j1q]
+    movss   [lineq+i1q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+4]
+    movss   [lineq+i1q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+8]
+    movss   [lineq+i1q-4], m0
+    add     lineq, incq
+    movss   m0, [dataq+j1q+12]
+    movss   [lineq+i1q-4], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     i1q, 8
+    add     j1q, wq
+    cmp     i0q, i1q
+    je %%beginloopi1i0
+    jmp %%inci1
+
+%%sr_1d:
+    sub     lineq, mvq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, incq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, incq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, incq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i0q, 2
+    shr     i1q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+    add     lineq, mvq
+
+    mov     i0q, 0
+    ;cmp     i0q, lvq
+    ;jge %%endmainloop
+    mov     j0q, lpq
+    sub     j0q, 3
+    shl     j0q, 2
+%%loop3:
+    add     i0q, 12
+    cmp     i0q, lvq
+    jge %%endloop3
+
+    movups  m0, [lineq+i0q-12]
+    add     lineq, incq
+    movups  m1, [lineq+i0q-12]
+    add     lineq, incq
+    movups  m2, [lineq+i0q-12]
+    add     lineq, incq
+    movups  m3, [lineq+i0q-12]
+    movaps  m4, m0
+    movaps  m5, m2
+    movlhps m0, m1
+    movlhps m2, m3
+    movaps  m6, m0
+    shufps  m0, m2, q2020
+    shufps  m6, m2, q3131
+    movaps  m2, m6
+    movhlps m1, m4
+    movhlps m3, m5
+    movaps  m6, m1
+    shufps  m1, m3, q2020
+    shufps  m6, m3, q3131
+    movaps  m3, m6
+    movups  [dataq+j0q], m0
+    add     j0q, wq
+    movups  [dataq+j0q], m2
+    add     j0q, wq
+    movups  [dataq+j0q], m1
+    add     j0q, wq
+    movups  [dataq+j0q], m3
+    add     j0q, wq
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i0q, 4
+    cmp     i0q, lvq
+    jge %%endmainloop
+    jmp %%loop3  
+
+%%endloop3:
+    sub     i0q, 12
+
+%%littleloop3:
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q], m0
+    add     lineq, incq
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q+4], m0
+    add     lineq, incq
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q+8], m0
+    add     lineq, incq
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q+12], m0
+    sub     lineq, incq
+    sub     lineq, incq
+    sub     lineq, incq
+
+    add     i0q, 4
+    add     j0q, wq
+    cmp     i0q, lvq
+    jl %%littleloop3  
+
+%%endmainloop:
+    add     lpq, 1
+    cmp     lpq, lhq
+    jl %%mainloop
+    jmp %%end
+
+%%beginmainloop2:
+    sub     lpq, 3
+%%mainloop2:
+    ;j0 = w*j+lp
+    mov   j0q, lpq
+    shl   j0q, 2
+
+    ;i0 = mv
+    mov   i0q, mvq
+ 
+    cmp     i0q, lvq
+    jge %%beginloop5
+%%loop4:
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+
+    add     j0q, wq
+    add     i0q, 8
+    cmp     i0q, lvq
+    jl      %%loop4
+
+%%beginloop5:
+    ;i0 = 1-mv
+    mov   i0q, 4
+    sub   i0q, mvq
+    cmp     i0q, lvq
+    jge %%sr_1d_2
+%%loop5:
+    movss   m0, [dataq+j0q]
+    movss   [lineq+i0q], m0
+
+    add     j0q, wq
+    add     i0q, 8
+    cmp     i0q, lvq
+    jl      %%loop5
+
+%%sr_1d_2:
+    sub     lineq, mvq
+    mov     i0q, mvq
+    mov     i1q, lvq
+    add     i1q, mvq
+    shr     i1q, 2
+    shr     i0q, 2
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+    add     lineq, mvq
+
+    mov     i0q, 0
+    cmp     i0q, lvq
+    jge %%endmainloop
+    mov     j0q, lpq
+    shl     j0q, 2
+%%loop6:
+    movss   m0, [lineq+i0q]
+    movss   [dataq+j0q], m0
+
+    add     j0q, wq
+    add     i0q, 4
+    cmp     i0q, lvq
+    jl %%loop6 
+
+%%endmainloop2:
+    add   lpq, 1
+    cmp   lpq, lhq
+    jl %%mainloop2
+
+%%end:
+    REP_RET
+%endmacro
+
+INIT_XMM sse ; x86inc setup for the instantiation below; presumably selects XMM registers and the "_sse" symbol suffix -- confirm against x86inc.asm
+VERSDFLOAT 6 ; expand the VERSDFLOAT macro once; NOTE(review): the meaning of the argument 6 is set by the macro header, which is outside this excerpt
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
index baa81383ea..04cd01379d 100644
--- a/libavcodec/x86/jpeg2000dsp_init.c
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -19,16 +19,23 @@ 
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
+#include <stdio.h>
 
 void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
 void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
 
+void ff_sr_1d97_float_sse(float *line, int i0, int i1);
+void ff_hor_sd_float_sse(float *line, float *data, int mh, int lh, int lv, int w);
+void ff_ver_sd_float_sse(float *line, float *data, int mv, int lh, int lv, int w);
+
 av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,3 +55,115 @@  av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
         c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
     }
 }
+
+/* Set s->sse to 1 when the CPU flags report external SSE and type is
+ * FF_DWT97; s is not written in any other case. */
+av_cold void ff_jpeg2000dwt_init_x86(DWTContext *s, int type)
+{
+    const int flags = av_get_cpu_flags();
+    if (!EXTERNAL_SSE(flags) || type != FF_DWT97)
+        return;
+    s->sse = 1;
+}
+
+/**
+ * Run the SSE horizontal and vertical synthesis kernels over every
+ * decomposition level of a float 9/7 DWT plane.
+ *
+ * For each level, from 0 up to s->ndeclevels - 1, ff_hor_sd_float_sse() is
+ * called first and ff_ver_sd_float_sse() second. Both receive the scratch
+ * line, the coefficient plane, the level's dimensions and the plane stride.
+ *
+ * @param s DWT context. linelen, mod, ndeclevels and f_linebuf are read.
+ *          s->ndeclevels must be at least 1, because the stride is taken
+ *          from s->linelen[s->ndeclevels - 1][0].
+ * @param t coefficient plane handed to the kernels as their data argument
+ */
+void dwt_decode97_float_sse(DWTContext *s, float *t)
+{
+    /* Row stride passed to the kernels: width of the last level. */
+    int w       = s->linelen[s->ndeclevels - 1][0];
+    /* Position at index 0 of the line range [0-5, w+5] (cf. the extend
+     * function): the scratch line starts 5 floats into f_linebuf. */
+    float *line = s->f_linebuf + 5;
+    float *data = t;
+    int lev;
+
+    for (lev = 0; lev < s->ndeclevels; lev++) {
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1];
+
+        // HOR_SD
+        ff_hor_sd_float_sse(line, data, mh, lh, lv, w);
+
+        // VER_SD
+        ff_ver_sd_float_sse(line, data, mv, lh, lv, w);
+    }
+}
diff --git a/tests/checkasm/jpeg2000dsp.c b/tests/checkasm/jpeg2000dsp.c
index 48559df085..92f3264674 100644
--- a/tests/checkasm/jpeg2000dsp.c
+++ b/tests/checkasm/jpeg2000dsp.c
@@ -20,6 +20,7 @@ 
 
 #include "checkasm.h"
 #include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
 #include "libavutil/common.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"