[FFmpeg-devel] JPEG2000: SSE optimisation of DWT decoding

Submitted by maxime taisant on Aug. 8, 2017, 9:09 a.m.

Details

Message ID HE1PR0901MB159411BA24CED96B355D9459DB8A0@HE1PR0901MB1594.eurprd09.prod.outlook.com
State New
Headers show

Commit Message

maxime taisant Aug. 8, 2017, 9:09 a.m.
From: Maxime Taisant <maximetaisant@hotmail.fr>

Hi,

Here are some SSE optimisations for the dwt function used to decode JPEG2000.
I tested this code by using the time command while reading a JPEG2000 encoded video with ffmpeg and, on average, I observed a 4.05% general improvement, and a 12.67% improvement on the dwt decoding part alone.
In the nasm code, you can notice that the SR1DFLOAT macro appears twice. One version is called in the nasm code by the HORSD macro and the other is called in the C code of the dwt function; I couldn't figure out a way to make only one macro.
I also couldn't figure out a good way to optimize the VER_SD part, which is why I left it unchanged, with just an SSE-optimized version of the SR_1D_FLOAT function.

Regards.

---
 libavcodec/jpeg2000dwt.c          |  21 +-
 libavcodec/jpeg2000dwt.h          |   6 +
 libavcodec/x86/jpeg2000dsp.asm    | 794 ++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/jpeg2000dsp_init.c |  55 +++
 4 files changed, 863 insertions(+), 13 deletions(-)

Comments

Ivan Kalvachev Aug. 8, 2017, 6:23 p.m.
On 8/8/17, maxime taisant <maximetaisant@hotmail.fr> wrote:
> From: Maxime Taisant <maximetaisant@hotmail.fr>
>
> Hi,
>
> Here is some SSE optimisations for the dwt function used to decode JPEG2000.
> I tested this code by using the time command while reading a JPEG2000
> encoded video with ffmpeg and, on average, I observed a 4.05% general
> improvement, and a 12.67% improvement on the dwt decoding part alone.
> In the nasm code, you can notice that the SR1DFLOAT macro appear twice. One
> version is called in the nasm code by the HORSD macro and the other is
> called in the C code of the dwt function, I couldn't figure out a way to
> make only one macro.

You want to use the same macro at two locations or
you want to have 1 function and "call" it from 2 places?

For the former, I'd guess that you might have been getting
errors about duplicated labels, since you use the local to the file form
instead local to the macro form. aka: ".loop" vs "%%loop".

> I also couldn't figure out a good way to optimize the VER_SD part, so that
> is why I left it unchanged, with just a SSE-optimized version of the
> SR_1D_FLOAT function.

[...]
> +.extend:
> +    shl i0d, 2
> +    shl i1d, 2
> +    mov j0q, i0q
> +    mov j1q, i1q
> +    movups m0, [lineq+j0q+4]
> +    shufps m0, m0, 0x1B

x86inc provides a readable notation for the shuffle constant:
qXXXX, where each X is an element index in the source register.
Using q3210 would generate a constant that leaves all elements in their
original places.
0x1B is q0123, i.e. a swap (full reversal) of the elements, isn't it?

Also, minor cosmetic nitpick.
 usually the first parameters are placed so their commas are vertically aligned.
This applies only when the parameter is register (so no jmp labels or
[] addresses ).

[...]
> +    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} -=
> F_LFTG_DELTA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
> +    movups m0, [lineq+2*j0q-28]
> +    movups m4, [lineq+2*j0q-12]
> +    movups m1, m0
> +    shufps m0, m4, 0xDD
> +    shufps m1, m4, 0x88

x86inc provides a way to emulate 3-operand AVX instructions.
This means it hides one of the movaps (use the 'a' form for register-to-register moves).
    shufps m1, m0, m4, 0x88
    shufps m0, m4, 0xDD

[...]
> +    movups m2, [lineq+2*j0q-24]
> +    movups m5, [lineq+2*j0q-8]
> +    shufps m2, m5, 0xDD
> +    addps m2, m1
> +    mulps m2, m3
> +    subps m0, m2
> +    movups m4, m1
> +    shufps m1, m0, 0x44 ; 0100'0100 q1010
Is that movlhps m1, m0 ?

> +    shufps m1, m1, q3120
> +    shufps m4, m0, 0xEE
> +    shufps m4, m4, 0xD8


This is not a comprehensive review, so others should still join in.

Best Regards.
Clément Bœsch Aug. 8, 2017, 7:41 p.m.
On Tue, Aug 08, 2017 at 09:09:44AM +0000, maxime taisant wrote:
> From: Maxime Taisant <maximetaisant@hotmail.fr>
> 
> Hi,
> 
> Here is some SSE optimisations for the dwt function used to decode JPEG2000.
> I tested this code by using the time command while reading a JPEG2000 encoded video with ffmpeg and, on average, I observed a 4.05% general improvement, and a 12.67% improvement on the dwt decoding part alone.
> In the nasm code, you can notice that the SR1DFLOAT macro appear twice. One version is called in the nasm code by the HORSD macro and the other is called in the C code of the dwt function, I couldn't figure out a way to make only one macro.
> I also couldn't figure out a good way to optimize the VER_SD part, so that is why I left it unchanged, with just a SSE-optimized version of the SR_1D_FLOAT function.
> 
> Regards.
> 
> ---
>  libavcodec/jpeg2000dwt.c          |  21 +-
>  libavcodec/jpeg2000dwt.h          |   6 +
>  libavcodec/x86/jpeg2000dsp.asm    | 794 ++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/jpeg2000dsp_init.c |  55 +++
>  4 files changed, 863 insertions(+), 13 deletions(-)
> 
> diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
> index 55dd5e89b5..69c935980d 100644
> --- a/libavcodec/jpeg2000dwt.c
> +++ b/libavcodec/jpeg2000dwt.c
> @@ -558,16 +558,19 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
>          }
>      switch (type) {
>      case FF_DWT97:
> +        dwt_decode = dwt_decode97_float;
>          s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
>          if (!s->f_linebuf)
>              return AVERROR(ENOMEM);
>          break;
>       case FF_DWT97_INT:
> +        dwt_decode = dwt_decode97_int;
>          s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->i_linebuf));
>          if (!s->i_linebuf)
>              return AVERROR(ENOMEM);
>          break;
>      case FF_DWT53:
> +        dwt_decode = dwt_decode53;
>          s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s->i_linebuf));
>          if (!s->i_linebuf)
>              return AVERROR(ENOMEM);

Using globals is not acceptable, you need to fix that.

[...]
Michael Bradshaw Aug. 9, 2017, 2:11 p.m.
On Tue, Aug 8, 2017 at 2:09 AM, maxime taisant <maximetaisant@hotmail.fr>
wrote:
>
> [...]
> +void (*dwt_decode)(DWTContext *s, void *t);


Why the global variable? It seems unnecessary, and as Clément pointed out,
is unsafe and should not be used in the FFmpeg code base (at least not
without a very good justification and synchronization).
maxime taisant Aug. 10, 2017, 8:03 p.m.
> From: Clément Bœsch <u@pkh.me>

> 

> On Tue, Aug 08, 2017 at 09:09:44AM +0000, maxime taisant wrote:

> > From: Maxime Taisant <maximetaisant@hotmail.fr>

> >

> > Hi,

> >

> > Here is some SSE optimisations for the dwt function used to decode

> JPEG2000.

> > I tested this code by using the time command while reading a

> JPEG2000 encoded video with ffmpeg and, on average, I observed a

> 4.05% general improvement, and a 12.67% improvement on the dwt

> decoding part alone.

> > In the nasm code, you can notice that the SR1DFLOAT macro appear

> twice. One version is called in the nasm code by the HORSD macro

> and the other is called in the C code of the dwt function, I couldn't

> figure out a way to make only one macro.

> > I also couldn't figure out a good way to optimize the VER_SD part, so

> that is why I left it unchanged, with just a SSE-optimized version of

> the SR_1D_FLOAT function.

> >

> > Regards.

> >

> > ---

> >  libavcodec/jpeg2000dwt.c          |  21 +-

> >  libavcodec/jpeg2000dwt.h          |   6 +

> >  libavcodec/x86/jpeg2000dsp.asm    | 794

> ++++++++++++++++++++++++++++++++++++++

> >  libavcodec/x86/jpeg2000dsp_init.c |  55 +++

> >  4 files changed, 863 insertions(+), 13 deletions(-)

> >

> > diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c

> index

> > 55dd5e89b5..69c935980d 100644

> > --- a/libavcodec/jpeg2000dwt.c

> > +++ b/libavcodec/jpeg2000dwt.c

> > @@ -558,16 +558,19 @@ int ff_jpeg2000_dwt_init(DWTContext *s,

> int border[2][2],

> >          }

> >      switch (type) {

> >      case FF_DWT97:

> > +        dwt_decode = dwt_decode97_float;

> >          s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s-

> >f_linebuf));

> >          if (!s->f_linebuf)

> >              return AVERROR(ENOMEM);

> >          break;

> >       case FF_DWT97_INT:

> > +        dwt_decode = dwt_decode97_int;

> >          s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s-

> >i_linebuf));

> >          if (!s->i_linebuf)

> >              return AVERROR(ENOMEM);

> >          break;

> >      case FF_DWT53:

> > +        dwt_decode = dwt_decode53;

> >          s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s-

> >i_linebuf));

> >          if (!s->i_linebuf)

> >              return AVERROR(ENOMEM);

> 

> Using globals is not acceptable, you need to fix that.

> 


Yeah, I can't even remember why I did that... I will fix it.
Thank you.

Patch hide | download patch | download mbox

diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 55dd5e89b5..69c935980d 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -558,16 +558,19 @@  int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
         }
     switch (type) {
     case FF_DWT97:
+        dwt_decode = dwt_decode97_float;
         s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
         break;
      case FF_DWT97_INT:
+        dwt_decode = dwt_decode97_int;
         s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
     case FF_DWT53:
+        dwt_decode = dwt_decode53;
         s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
@@ -575,6 +578,10 @@  int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
     default:
         return -1;
     }
+
+    if (ARCH_X86)
+        ff_jpeg2000dwt_init_x86(s, type);
+
     return 0;
 }
 
@@ -601,19 +608,7 @@  int ff_dwt_decode(DWTContext *s, void *t)
     if (s->ndeclevels == 0)
         return 0;
 
-    switch (s->type) {
-    case FF_DWT97:
-        dwt_decode97_float(s, t);
-        break;
-    case FF_DWT97_INT:
-        dwt_decode97_int(s, t);
-        break;
-    case FF_DWT53:
-        dwt_decode53(s, t);
-        break;
-    default:
-        return -1;
-    }
+    dwt_decode(s,t);
     return 0;
 }
 
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 718d183ac1..8462ddf8cd 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -50,6 +50,8 @@  typedef struct DWTContext {
     float   *f_linebuf;                  ///< float buffer used by transform
 } DWTContext;
 
+void (*dwt_decode)(DWTContext *s, void *t);
+
 /**
  * Initialize DWT.
  * @param s                 DWT context
@@ -65,4 +67,8 @@  int ff_dwt_decode(DWTContext *s, void *t);
 
 void ff_dwt_destroy(DWTContext *s);
 
+void dwt_decode97_float_sse(DWTContext *s, float *t);
+
+void ff_jpeg2000dwt_init_x86(DWTContext *s, int type);
+
 #endif /* AVCODEC_JPEG2000DWT_H */
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
index 56b5fbd606..262704f288 100644
--- a/libavcodec/x86/jpeg2000dsp.asm
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -2,6 +2,7 @@ 
 ;* SIMD-optimized JPEG2000 DSP functions
 ;* Copyright (c) 2014 Nicolas Bertrand
 ;* Copyright (c) 2015 James Almer
+;* Copyright (c) 2017 Maxime Taisant
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -29,6 +30,16 @@  pf_ict1: times 8 dd 0.34413
 pf_ict2: times 8 dd 0.71414
 pf_ict3: times 8 dd 1.772
 
+F_LFTG_K: dd 1.230174104914001
+F_LFTG_X: dd 0.812893066115961
+
+F_LFTG_ALPHA: times 8 dd 1.586134342059924
+F_LFTG_BETA: times 8 dd 0.052980118572961
+F_LFTG_GAMMA: times 8 dd 0.882911075530934
+F_LFTG_DELTA: times 8 dd 0.443506852043971
+
+TWO: dd 2.0
+
 SECTION .text
 
 ;***********************************************************************
@@ -142,3 +153,786 @@  RCT_INT
 INIT_YMM avx2
 RCT_INT
 %endif
+
+;***********************************************************************
+; ff_sr_1d97_float_<opt>(float *line, int i0, int i1)
+;***********************************************************************
+%macro SR1D97FLOAT 1            ; %1 = XMM register count forwarded to cglobal
+cglobal sr_1d97_float, 3, 5, %1, line, i0, i1, j0, j1
+    mov j0d, i0d                ; i0/i1 are C ints: read only their 32-bit views
+    mov j1d, i1d
+    add j0d, 1
+    cmp j1d, j0d
+    jg .extend                  ; i1 > i0 + 1: run the four lifting passes
+    sub j0d, 2                  ; j0 = i0 - 1, so ZF is set exactly when i0 == 1
+    jnz .else
+    movss m0, [lineq+4]         ; i0 == 1: line[1] *= F_LFTG_K / 2
+    movss m1, [F_LFTG_K]
+    movss m2, [TWO]
+    divss m1, m2
+    mulss m0, m1
+    movss [lineq+4], m0
+    jmp .end
+
+.else:
+    movss m0, [lineq]           ; i0 != 1: line[0] *= F_LFTG_X
+    movss m1, [F_LFTG_X]
+    mulss m0, m1
+    movss [lineq], m0
+    jmp .end
+
+.extend:
+    shl i0d, 2                  ; sample index -> byte offset (4 bytes per float)
+    shl i1d, 2
+    mov j0q, i0q
+    mov j1q, i1q
+    movups m0, [lineq+j0q+4]    ; line[i0+1 .. i0+4]
+    shufps m0, m0, 0x1B         ; reverse the four floats
+    movups [lineq+j0q-16], m0   ; -> line[i0-4 .. i0-1]
+    movups m0, [lineq+j1q-20]   ; line[i1-5 .. i1-2]
+    shufps m0, m0, 0x1B
+    movups [lineq+j1q], m0      ; -> line[i1 .. i1+3]
+
+    movups m3, [F_LFTG_DELTA]
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 3                  ; i0 >> 1; rounding down keeps 2*i even when i0 is odd
+    lea j0q, [j0q*4-4]          ; j0 = 4*((i0 >> 1) - 1); 2*j0 is the byte offset of line[2*i]
+    shr j1q, 3                  ; i1 >> 1
+    lea j1q, [j1q*4+8]          ; j1 = 4*((i1 >> 1) + 2), exclusive end
+    cmp j0q, j1q
+    jge .beginloop2
+.loop1:
+    add j0q, 12                 ; look ahead: are four more i values available?
+    cmp j0q, j1q
+    jge .endloop1
+
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} -= F_LFTG_DELTA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28] ; line[2i-1 .. 2i+2] (j0 is 12 ahead here)
+    movups m4, [lineq+2*j0q-12] ; line[2i+3 .. 2i+6]
+    movups m1, m0
+    shufps m0, m4, 0xDD         ; m0 = line[2i], [2i+2], [2i+4], [2i+6]
+    shufps m1, m4, 0x88         ; m1 = line[2i-1], [2i+1], [2i+3], [2i+5]
+    movups m2, [lineq+2*j0q-24]
+    movups m5, [lineq+2*j0q-8]
+    shufps m2, m5, 0xDD         ; m2 = line[2i+1], [2i+3], [2i+5], [2i+7]
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44         ; low halves of m1 and m0
+    shufps m1, m1, 0xD8         ; re-interleave into memory order
+    shufps m4, m0, 0xEE         ; high halves of m1 and m0
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add j0q, 4                  ; net advance: four i values
+    cmp j0q, j1q
+    jge .beginloop2
+    jmp .loop1
+
+.endloop1:
+    sub j0q, 12                 ; undo the look-ahead
+.littleloop1:                   ; scalar tail, one i per iteration
+    movss m0, [lineq+2*j0q]
+    movss m1, [lineq+2*j0q-4]
+    movss m2, [lineq+2*j0q+4]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [lineq+2*j0q], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop1
+
+.beginloop2:
+    movups m3, [F_LFTG_GAMMA]
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 3                  ; i0 >> 1
+    lea j0q, [j0q*4-4]          ; j0 = 4*((i0 >> 1) - 1)
+    shr j1q, 3                  ; i1 >> 1
+    lea j1q, [j1q*4+4]          ; j1 = 4*((i1 >> 1) + 1), exclusive end
+    cmp j0q, j1q
+    jge .beginloop3
+.loop2:
+    add j0q, 12
+    cmp j0q, j1q
+    jge .endloop2
+
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} -= F_LFTG_GAMMA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24] ; line[2i .. 2i+3]
+    movups m4, [lineq+2*j0q-8]  ; line[2i+4 .. 2i+7]
+    movups m1, m0
+    shufps m0, m4, 0xDD         ; m0 = odd samples being updated
+    shufps m1, m4, 0x88         ; m1 = their left (even) neighbours
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4]
+    shufps m2, m5, 0xDD         ; m2 = their right (even) neighbours
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add j0q, 4
+    cmp j0q, j1q
+    jge .beginloop3
+    jmp .loop2
+
+.endloop2:
+    sub j0q, 12
+.littleloop2:
+    movss m0, [lineq+2*j0q+4]
+    movss m1, [lineq+2*j0q]
+    movss m2, [lineq+2*j0q+8]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [lineq+2*j0q+4], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop2
+
+.beginloop3:
+    movups m3, [F_LFTG_BETA]
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 3                  ; i0 >> 1
+    lea j0q, [j0q*4-4]          ; j0 = 4*((i0 >> 1) - 1)
+    shr j1q, 3                  ; i1 >> 1
+    lea j1q, [j1q*4+8]          ; j1 = 4*((i1 >> 1) + 2), exclusive end
+    cmp j0q, j1q
+    jge .beginloop4
+.loop3:
+    add j0q, 12
+    cmp j0q, j1q
+    jge .endloop3
+
+    ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} += F_LFTG_BETA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+    movups m0, [lineq+2*j0q-28]
+    movups m4, [lineq+2*j0q-12]
+    movups m1, m0
+    shufps m0, m4, 0xDD         ; m0 = even samples being updated
+    shufps m1, m4, 0x88         ; m1 = their left (odd) neighbours
+    movups m2, [lineq+2*j0q-24]
+    movups m5, [lineq+2*j0q-8]
+    shufps m2, m5, 0xDD         ; m2 = their right (odd) neighbours
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-28], m1
+    movups [lineq+2*j0q-12], m4
+
+    add j0q, 4
+    cmp j0q, j1q
+    jge .beginloop4
+    jmp .loop3
+
+.endloop3:
+    sub j0q, 12
+.littleloop3:
+    movss m0, [lineq+2*j0q]
+    movss m1, [lineq+2*j0q-4]
+    movss m2, [lineq+2*j0q+4]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [lineq+2*j0q], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop3
+
+.beginloop4:
+    movups m3, [F_LFTG_ALPHA]
+    mov j0q, i0q
+    mov j1q, i1q
+    shr j0q, 3                  ; i0 >> 1
+    lea j0q, [j0q*4-4]          ; j0 = 4*((i0 >> 1) - 1)
+    shr j1q, 3                  ; i1 >> 1
+    lea j1q, [j1q*4+4]          ; j1 = 4*((i1 >> 1) + 1), exclusive end
+    cmp j0q, j1q
+    jge .end
+.loop4:
+    add j0q, 12
+    cmp j0q, j1q
+    jge .endloop4
+
+    ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} += F_LFTG_ALPHA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+    movups m0, [lineq+2*j0q-24]
+    movups m4, [lineq+2*j0q-8]
+    movups m1, m0
+    shufps m0, m4, 0xDD         ; m0 = odd samples being updated
+    shufps m1, m4, 0x88         ; m1 = their left (even) neighbours
+    movups m2, [lineq+2*j0q-20]
+    movups m5, [lineq+2*j0q-4]
+    shufps m2, m5, 0xDD         ; m2 = their right (even) neighbours
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [lineq+2*j0q-24], m1
+    movups [lineq+2*j0q-8], m4
+
+    add j0q, 4
+    cmp j0q, j1q
+    jge .end
+    jmp .loop4
+
+.endloop4:
+    sub j0q, 12
+.littleloop4:
+    movss m0, [lineq+2*j0q+4]
+    movss m1, [lineq+2*j0q]
+    movss m2, [lineq+2*j0q+8]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [lineq+2*j0q+4], m0
+    add j0q, 4
+    cmp j0q, j1q
+    jl .littleloop4
+
+.end:
+    REP_RET
+%endmacro
+    
+INIT_XMM sse
+SR1D97FLOAT 6
+
+%macro SR1D97FLOAT_ 5      ; p, i0, i1, tmp0, tmp1 (inline body; uses function-local labels, so one expansion per function)
+    mov %4, %2
+    mov %5, %3
+    add %4, 1
+    cmp %5, %4
+    jg .extend                  ; i1 > i0 + 1: run the four lifting passes
+    sub %4, 2                   ; tmp0 = i0 - 1, so ZF is set exactly when i0 == 1
+    jnz .else
+    movss m0, [%1+4]            ; i0 == 1: p[1] *= F_LFTG_K / 2
+    movss m1, [F_LFTG_K]
+    movss m2, [TWO]
+    divss m1, m2
+    mulss m0, m1
+    movss [%1+4], m0
+    jmp .end
+
+.else:
+    movss m0, [%1]              ; i0 != 1: p[0] *= F_LFTG_X
+    movss m1, [F_LFTG_X]
+    mulss m0, m1
+    movss [%1], m0
+    jmp .end
+
+.extend:
+    shl %2, 2                   ; sample index -> byte offset (undone at .end)
+    shl %3, 2
+    mov %4, %2
+    mov %5, %3
+    movups m0, [%1+%4+4]        ; p[i0+1 .. i0+4]
+    shufps m0, m0, 0x1B         ; reverse the four floats
+    movups [%1+%4-16], m0       ; -> p[i0-4 .. i0-1]
+    movups m0, [%1+%5-20]       ; p[i1-5 .. i1-2]
+    shufps m0, m0, 0x1B
+    movups [%1+%5], m0          ; -> p[i1 .. i1+3]
+
+    movups m3, [F_LFTG_DELTA]
+    mov %4, %2
+    mov %5, %3
+    shr %4, 3                   ; i0 >> 1; rounding down keeps 2*i even when i0 is odd
+    lea %4, [%4*4-4]            ; tmp0 = 4*((i0 >> 1) - 1); 2*tmp0 is the byte offset of p[2*i]
+    shr %5, 3                   ; i1 >> 1
+    lea %5, [%5*4+8]            ; tmp1 = 4*((i1 >> 1) + 2), exclusive end
+    cmp %4, %5
+    jge .beginloop2
+.loop1:
+    add %4, 12                  ; look ahead: are four more i values available?
+    cmp %4, %5
+    jge .endloop1
+    ; p[2*i] -= F_LFTG_DELTA * (p[2*i-1] + p[2*i+1]), four i values per iteration
+    movups m0, [%1+2*%4-28]
+    movups m4, [%1+2*%4-12]
+    movups m1, m0
+    shufps m0, m4, 0xDD         ; m0 = even samples being updated
+    shufps m1, m4, 0x88         ; m1 = their left (odd) neighbours
+    movups m2, [%1+2*%4-24]
+    movups m5, [%1+2*%4-8]
+    shufps m2, m5, 0xDD         ; m2 = their right (odd) neighbours
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44         ; low halves of m1 and m0
+    shufps m1, m1, 0xD8         ; re-interleave into memory order
+    shufps m4, m0, 0xEE         ; high halves of m1 and m0
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add %4, 4                   ; net advance: four i values
+    cmp %4, %5
+    jge .beginloop2
+    jmp .loop1
+
+.endloop1:
+    sub %4, 12                  ; undo the look-ahead
+.littleloop1:                   ; scalar tail, one i per iteration
+    movss m0, [%1+2*%4]
+    movss m1, [%1+2*%4-4]
+    movss m2, [%1+2*%4+4]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [%1+2*%4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop1
+
+.beginloop2:
+    movups m3, [F_LFTG_GAMMA]
+    mov %4, %2
+    mov %5, %3
+    shr %4, 3                   ; i0 >> 1
+    lea %4, [%4*4-4]            ; tmp0 = 4*((i0 >> 1) - 1)
+    shr %5, 3                   ; i1 >> 1
+    lea %5, [%5*4+4]            ; tmp1 = 4*((i1 >> 1) + 1), exclusive end
+    cmp %4, %5
+    jge .beginloop3
+.loop2:
+    add %4, 12
+    cmp %4, %5
+    jge .endloop2
+    ; p[2*i+1] -= F_LFTG_GAMMA * (p[2*i] + p[2*i+2]), four i values per iteration
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movups m1, m0
+    shufps m0, m4, 0xDD
+    shufps m1, m4, 0x88
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4]
+    shufps m2, m5, 0xDD
+    addps m2, m1
+    mulps m2, m3
+    subps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add %4, 4
+    cmp %4, %5
+    jge .beginloop3
+    jmp .loop2
+
+.endloop2:
+    sub %4, 12
+.littleloop2:
+    movss m0, [%1+2*%4+4]
+    movss m1, [%1+2*%4]
+    movss m2, [%1+2*%4+8]
+    addss m1, m2
+    mulss m1, m3
+    subss m0, m1
+    movss [%1+2*%4+4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop2
+
+.beginloop3:
+    movups m3, [F_LFTG_BETA]
+    mov %4, %2
+    mov %5, %3
+    shr %4, 3                   ; i0 >> 1
+    lea %4, [%4*4-4]            ; tmp0 = 4*((i0 >> 1) - 1)
+    shr %5, 3                   ; i1 >> 1
+    lea %5, [%5*4+8]            ; tmp1 = 4*((i1 >> 1) + 2), exclusive end
+    cmp %4, %5
+    jge .beginloop4
+.loop3:
+    add %4, 12
+    cmp %4, %5
+    jge .endloop3
+    ; p[2*i] += F_LFTG_BETA * (p[2*i-1] + p[2*i+1]), four i values per iteration
+    movups m0, [%1+2*%4-28]
+    movups m4, [%1+2*%4-12]
+    movups m1, m0
+    shufps m0, m4, 0xDD
+    shufps m1, m4, 0x88
+    movups m2, [%1+2*%4-24]
+    movups m5, [%1+2*%4-8]
+    shufps m2, m5, 0xDD
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-28], m1
+    movups [%1+2*%4-12], m4
+
+    add %4, 4
+    cmp %4, %5
+    jge .beginloop4
+    jmp .loop3
+
+.endloop3:
+    sub %4, 12
+.littleloop3:
+    movss m0, [%1+2*%4]
+    movss m1, [%1+2*%4-4]
+    movss m2, [%1+2*%4+4]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [%1+2*%4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop3
+
+.beginloop4:
+    movups m3, [F_LFTG_ALPHA]
+    mov %4, %2
+    mov %5, %3
+    shr %4, 3                   ; i0 >> 1
+    lea %4, [%4*4-4]            ; tmp0 = 4*((i0 >> 1) - 1)
+    shr %5, 3                   ; i1 >> 1
+    lea %5, [%5*4+4]            ; tmp1 = 4*((i1 >> 1) + 1), exclusive end
+    cmp %4, %5
+    jge .end
+.loop4:
+    add %4, 12
+    cmp %4, %5
+    jge .endloop4
+    ; p[2*i+1] += F_LFTG_ALPHA * (p[2*i] + p[2*i+2]), four i values per iteration
+    movups m0, [%1+2*%4-24]
+    movups m4, [%1+2*%4-8]
+    movups m1, m0
+    shufps m0, m4, 0xDD
+    shufps m1, m4, 0x88
+    movups m2, [%1+2*%4-20]
+    movups m5, [%1+2*%4-4]
+    shufps m2, m5, 0xDD
+    addps m2, m1
+    mulps m2, m3
+    addps m0, m2
+    movups m4, m1
+    shufps m1, m0, 0x44
+    shufps m1, m1, 0xD8
+    shufps m4, m0, 0xEE
+    shufps m4, m4, 0xD8
+    movups [%1+2*%4-24], m1
+    movups [%1+2*%4-8], m4
+
+    add %4, 4
+    cmp %4, %5
+    jge .end
+    jmp .loop4
+
+.endloop4:
+    sub %4, 12
+.littleloop4:
+    movss m0, [%1+2*%4+4]
+    movss m1, [%1+2*%4]
+    movss m2, [%1+2*%4+8]
+    addss m1, m2
+    mulss m1, m3
+    addss m0, m1
+    movss [%1+2*%4+4], m0
+    add %4, 4
+    cmp %4, %5
+    jl .littleloop4
+
+.end:
+    shr %2, 2                   ; byte offset -> sample index; NOTE(review): also reached from the
+    shr %3, 2                   ; single-sample paths, where %2/%3 were never scaled up
+%endmacro
+
+
+;***********************************************************************
+; ff_hor_sd_float_<opt>(float *line, float *data, int mh, int lh, int lv, int w)
+;***********************************************************************
+%macro HORSDFLOAT 1             ; %1 = XMM register count forwarded to cglobal
+cglobal hor_sd_float, 6, 12, %1, line, data, mh, lh, lv, w, l, lp, i0, i1, j0, j1
+    mov lq, mhq                 ; NOTE(review): mh/lh/lv/w are C ints read as full-width registers - confirm the upper halves are clean
+    shl lq, 2
+    add lq, lineq               ; l = line + mh (byte address)
+    shl lhq, 2                  ; lh is kept as a byte count from here on
+    ; NOTE(review): 12 GPRs are requested above - presumably x86-64 only, confirm a build guard exists
+    mov lpq, 0
+.mainloop:
+    ;j0 = w*lp
+    mov j0q, wq
+    imul j0q, lpq
+
+    ;j1 = (lh-mh+1)/2 + j0
+    mov j1q, lhq
+    shr j1q, 2
+    sub j1q, mhq
+    add j1q, 1
+    shr j1q, 1
+    add j1q, j0q
+
+    shl j0q, 2                  ; both source indices -> byte offsets into data
+    shl j1q, 2
+
+    ;i1 = 1-mh
+    mov i1q, 1
+    sub i1q, mhq
+    shl i1q, 2
+
+    ;i0 = mh
+    mov i0q, mhq
+    shl i0q, 2
+
+    cmp i0q, i1q
+    jg .i1i0
+
+;i0 < i1
+    cmp i1q, lhq
+    jge .i0
+
+    add i0q, 4
+    cmp i0q, i1q
+    jne .inci0
+
+;i1 = i0+1
+.beginloopi0i1:
+    sub i0q, 4
+
+.loopi0i1:
+    add i1q, 24                 ; look ahead: is l[i1+6] still inside the row?
+    cmp i1q, lhq
+    jge .endloopi0i1
+
+    ;l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    ;l{i0+1,i0+3,i0+5,i0+7} = l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    movups m0, [dataq+j0q]
+    movups m2, [dataq+j1q]
+    movups m1, m0
+    shufps m0, m2, 0x44         ; low halves of both inputs
+    shufps m0, m0, 0xD8         ; interleave them
+    shufps m1, m2, 0xEE         ; high halves of both inputs
+    shufps m1, m1, 0xD8
+    movups [lq+i0q], m0
+    movups [lq+i0q+16], m1
+
+    add i1q, 8                  ; together with the look-ahead: i1 += 8 samples
+    add i0q, 32
+    add j0q, 16
+    add j1q, 16
+    cmp i1q, lhq
+    jl .loopi0i1
+    cmp i0q, lhq
+    jge .sr_1d
+
+    ;i1>=lh & i0<lh
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    jmp .sr_1d
+
+;i1 + 6 >= lh
+.endloopi0i1:
+    sub i1q, 24                 ; undo the look-ahead
+.littleloopi0i1:
+
+    ;l[i0] <- data[j0]
+    ;l[i1] <- data[j1]
+    movss m0, [dataq+j0q]
+    movss m1, [dataq+j1q]
+    movss [lq+i0q], m0
+    movss [lq+i1q], m1
+
+    add i0q, 8
+    add i1q, 8
+    add j0q, 4
+    add j1q, 4
+    cmp i1q, lhq
+    jl .littleloopi0i1
+    cmp i0q, lhq
+    jge .sr_1d
+
+    ;i1>=lh & i0<lh
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    jmp .sr_1d
+
+;i1 < i0
+.i1i0:
+    cmp i0q, lhq
+    jge .i1
+
+    add i1q, 4
+    cmp i0q, i1q
+    jne .inci1
+
+;i0 = i1+1
+.beginloopi1i0:
+    sub i1q, 4
+
+.loopi1i0:
+    add i0q, 24                 ; look ahead: is l[i0+6] still inside the row?
+    cmp i0q, lhq
+    jge .endloopi1i0
+
+    ;l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+    ;l{i1+1,i1+3,i1+5,i1+7} = l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+    movups m0, [dataq+j1q]
+    movups m2, [dataq+j0q]
+    movups m1, m0
+    shufps m0, m2, 0x44
+    shufps m0, m0, 0xD8
+    shufps m1, m2, 0xEE
+    shufps m1, m1, 0xD8
+    movups [lq+i1q], m0
+    movups [lq+i1q+16], m1
+
+    add i0q, 8
+    add i1q, 32
+    add j0q, 16
+    add j1q, 16
+    cmp i0q, lhq
+    jl .loopi1i0
+    cmp i1q, lhq
+    jge .sr_1d
+
+    ;i0>=lh & i1<lh
+    movss m0, [dataq+j1q]
+    movss [lq+i1q], m0
+    jmp .sr_1d
+
+.endloopi1i0:
+    sub i0q, 24                 ; undo the look-ahead (it was added to i0, not i1)
+.littleloopi1i0:
+
+    ;l[i1] <- data[j1]
+    ;l[i0] <- data[j0]
+    movss m0, [dataq+j1q]
+    movss m1, [dataq+j0q]
+    movss [lq+i1q], m0
+    movss [lq+i0q], m1
+
+    add i0q, 8
+    add i1q, 8
+    add j0q, 4
+    add j1q, 4
+    cmp i0q, lhq                ; i0 is the larger index here, so it bounds the paired loop
+    jl .littleloopi1i0
+    cmp i1q, lhq
+    jge .sr_1d
+
+    ;i0>=lh & i1<lh
+    movss m0, [dataq+j1q]
+    movss [lq+i1q], m0
+    jmp .sr_1d
+
+;i0<i1 & i1>=lh
+.i0:
+    cmp i0q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j0q]
+    movss [lq+i0q], m0
+    add i0q, 8
+    add j0q, 4
+    jmp .i0
+
+;i1<i0 & i0>=lh
+.i1:
+    cmp i1q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j1q]
+    movss [lq+i1q], m0
+    add i1q, 8
+    add j1q, 4
+    jmp .i1
+
+;i0 < i1-1
+.inci0:
+    cmp i0q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j0q]
+    movss [lq+i0q-4], m0        ; -4 compensates the "add i0q, 4" done before the jump here
+    add i0q, 8
+    add j0q, 4
+    cmp i0q, i1q
+    je .beginloopi0i1
+    jmp .inci0
+
+;i1 < i0-1
+.inci1:
+    cmp i1q, lhq
+    jge .sr_1d
+    movss m0, [dataq+j1q]
+    movss [lq+i1q-4], m0        ; -4 compensates the "add i1q, 4" done before the jump here
+    add i1q, 8
+    add j1q, 4
+    cmp i0q, i1q
+    je .beginloopi1i0
+    jmp .inci1
+
+.sr_1d:
+    mov i0q, mhq                ; i0 = mh
+    mov i1q, lhq
+    shr i1q, 2
+    add i1q, mhq                ; i1 = mh + lh (in samples)
+    SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+
+    mov i0q, 0                  ; copy l[0 .. lh-1] back to row lp of data
+    cmp i0q, lhq
+    jge .endmainloop
+    mov j0q, wq
+    imul j0q, lpq
+    shl j0q, 2
+.subloop3:
+    add i0q, 12                 ; look ahead: are four more floats left?
+    cmp i0q, lhq
+    jge .endsubloop3
+
+    movups m0, [lq+i0q-12]
+    movups [dataq+j0q], m0
+
+    add i0q, 4
+    add j0q, 16
+    cmp i0q, lhq
+    jge .endmainloop
+    jmp .subloop3
+
+.endsubloop3:
+    sub i0q, 12
+.littlesubloop3:
+    movss m0, [lq+i0q]
+    movss [dataq+j0q], m0
+
+    add i0q, 4
+    add j0q, 4
+    cmp i0q, lhq
+    jl .littlesubloop3
+
+.endmainloop:
+    add lpq, 1
+    cmp lpq, lvq
+    jl .mainloop                ; the test is at the bottom, so the body runs at least once
+
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+HORSDFLOAT 6
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
index baa81383ea..177330ea47 100644
--- a/libavcodec/x86/jpeg2000dsp_init.c
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -23,12 +23,16 @@ 
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
 
 void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
 void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
 void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
 
+void ff_sr_1d97_float_sse(float *line, int i0, int i1);
+void ff_hor_sd_float_sse(float *line, float *data, int mh, int lh, int lv, int w);
+
 av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,3 +52,54 @@  av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
         c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
     }
 }
+
+av_cold void ff_jpeg2000dwt_init_x86(DWTContext *s, int type)
+{   /* Installs the SSE float 9/7 decoder when the CPU allows it; s is not used here. */
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_SSE(cpu_flags)) {
+        if (type == FF_DWT97){ /* the other transform types keep whatever was set before */
+            dwt_decode = dwt_decode97_float_sse; /* NOTE(review): writes the file-scope dwt_decode pointer from jpeg2000dwt.h (shared by every context), and the assigned function takes float * where the pointer type says void * */
+        }
+    }
+}
+
+/* Float 9/7 DWT decode: per level, rows go through ff_hor_sd_float_sse(), columns through ff_sr_1d97_float_sse(). */
+void dwt_decode97_float_sse(DWTContext *s, float *t)
+{
+    int lev, i, j;
+    int w       = s->linelen[s->ndeclevels - 1][0];
+    float *line = s->f_linebuf;
+    float *data = t;
+
+    /* position at index 0 of line range [0-5,w+5] cf. extend function */
+    line += 5;
+
+    for (lev = 0; lev < s->ndeclevels; lev++) {
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        float *l;
+        // HOR_SD
+        ff_hor_sd_float_sse(line, data, mh, lh, lv, w);
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            /* j indexes the source samples of column lp; it is not reset
+             * between the two copy loops, so the second continues after the first */
+            j = 0;
+            // copy with interleaving
+            for (i = mv; i < lv; i += 2, j++)
+                l[i] = data[w * j + lp];
+            for (i = 1 - mv; i < lv; i += 2, j++)
+                l[i] = data[w * j + lp];
+
+            ff_sr_1d97_float_sse(line, mv, mv + lv);
+
+            for (i = 0; i < lv; i++)
+                data[w * i + lp] = l[i];
+        }
+    }
+}