@@ -425,7 +425,10 @@ static void dwt_decode97_float(DWTContext *s, float *t)
for (i = 1 - mh; i < lh; i += 2, j++)
l[i] = data[w * lp + j];
- sr_1d97_float(line, mh, mh + lh);
+ if (ARCH_X86)
+ ff_sr_1d97_float_sse(line, mh, mh + lh);
+ else
+ sr_1d97_float(line, mh, mh + lh);
for (i = 0; i < lh; i++)
data[w * lp + i] = l[i];
@@ -65,4 +65,6 @@ int ff_dwt_decode(DWTContext *s, void *t);
void ff_dwt_destroy(DWTContext *s);
+void ff_sr_1d97_float_sse(float *p, int i0, int i1); /* x86 SSE assembly routine; dwt_decode97_float() calls it when ARCH_X86 is set */
+
#endif /* AVCODEC_JPEG2000DWT_H */
@@ -29,6 +29,16 @@ pf_ict1: times 8 dd 0.34413
pf_ict2: times 8 dd 0.71414
pf_ict3: times 8 dd 1.772
+F_LFTG_K: dd 1.230174104914001 ; scalar; divided by TWO and multiplied into p[1] on the short-line path (i1 <= i0 + 1, i0 == 1)
+F_LFTG_X: dd 0.812893066115961 ; scalar; multiplied into p[0] on the short-line path when i0 != 1
+
+F_LFTG_ALPHA: times 8 dd 1.586134342059924 ; replicated 8 times; loaded with movups as the multiplier of the fourth pass (.loop4)
+F_LFTG_BETA: times 8 dd 0.052980118572961 ; replicated 8 times; multiplier of the third pass (.loop3)
+F_LFTG_GAMMA: times 8 dd 0.882911075530934 ; replicated 8 times; multiplier of the second pass (.loop2)
+F_LFTG_DELTA: times 8 dd 0.443506852043971 ; replicated 8 times; multiplier of the first pass (.loop1)
+
+TWO: dd 2.0 ; divisor used to form F_LFTG_K / 2 at run time
+
SECTION .text
;***********************************************************************
@@ -142,3 +152,261 @@ RCT_INT
INIT_YMM avx2
RCT_INT
%endif
+
+;***********************************************************************
+; ff_sr_1d97_float_<opt>(float *p, int i0, int i1)
+;***********************************************************************
+%macro SR1D97FLOAT 0
+cglobal sr_1d97_float, 3, 5, 6, p, i0, i1, tmp0, tmp1 ; p = float line, i0/i1 = int bounds; only m0-m5 are used, so 6 XMM regs are declared
+ movsxd tmp0q, i0d ; i0 is a 32-bit int: sign-extend instead of reading undefined upper register bits
+ movsxd tmp1q, i1d ; same for i1
+ add tmp0q, 1 ; tmp0 = i0 + 1
+ cmp tmp1q, tmp0q
+ jg .extend ; i1 > i0 + 1: run the four filtering passes
+ sub tmp0q, 2 ; tmp0 = i0 - 1, zero only when i0 == 1
+ jnz .else
+ movss m0, [pq+4] ; short line, i0 == 1: p[1] *= F_LFTG_K / 2
+ movss m1, [F_LFTG_K]
+ movss m2, [TWO]
+ divss m1, m2
+ mulss m0, m1
+ movss [pq+4], m0
+ jmp .end
+
+.else:
+ movss m0, [pq] ; short line, i0 != 1: p[0] *= F_LFTG_X
+ movss m1, [F_LFTG_X]
+ mulss m0, m1
+ movss [pq], m0
+ jmp .end
+
+.extend:
+ shl i0d, 2 ; i0 becomes a byte offset (4 bytes per float); the 32-bit write zero-extends the register
+ shl i1d, 2 ; i1 becomes a byte offset likewise
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ movups m0, [pq+tmp0q+4] ; load p[i0+1 .. i0+4]; NOTE(review): one-shot vector copy, lines shorter than 5 samples read values not yet written - confirm against the C reference
+ shufps m0, m0, 0x1B ; reverse the order of the four floats
+ movups [pq+tmp0q-16], m0 ; store mirrored into p[i0-4 .. i0-1]
+ movups m0, [pq+tmp1q-20] ; load p[i1-5 .. i1-2]
+ shufps m0, m0, 0x1B ; reverse
+ movups [pq+tmp1q], m0 ; store mirrored into p[i1 .. i1+3]
+
+ movups m3, [F_LFTG_DELTA] ; pass 1: target -= DELTA * (left + right)
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1 ; counters hold half the byte offset; addresses below use 2*tmp0q
+ sub tmp0q, 4 ; first target is p[i0-2]; NOTE(review): target parity follows i0 - confirm for odd i0
+ shr tmp1q, 1
+ add tmp1q, 8 ; end bound = 2*i1 + 8 in half-byte-offset units
+ cmp tmp0q, tmp1q
+ jge .beginloop2
+.loop1:
+ add tmp0q, 12 ; look ahead: the vector body needs four targets below the bound
+ cmp tmp0q, tmp1q
+ jge .endloop1 ; fewer than four left: finish with the scalar loop
+
+ movups m0, [pq+2*tmp0q-28] ; eight consecutive floats starting one before the first target
+ movups m4, [pq+2*tmp0q-12]
+ movups m1, m0
+ shufps m0, m4, 0xDD ; m0 = odd lanes = the four targets
+ shufps m1, m4, 0x88 ; m1 = even lanes = left neighbours
+ movups m2, [pq+2*tmp0q-24] ; same window shifted by one float
+ movups m5, [pq+2*tmp0q-8]
+ shufps m2, m5, 0xDD ; m2 = right neighbours
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2 ; updated targets
+ movups m4, m1
+ shufps m1, m0, 0x44 ; low halves: left0 left1 tgt0 tgt1
+ shufps m1, m1, 0xD8 ; re-interleave: left0 tgt0 left1 tgt1
+ shufps m4, m0, 0xEE ; high halves: left2 left3 tgt2 tgt3
+ shufps m4, m4, 0xD8 ; re-interleave: left2 tgt2 left3 tgt3
+ movups [pq+2*tmp0q-28], m1
+ movups [pq+2*tmp0q-12], m4
+
+ add tmp0q, 4 ; net advance of 16 per vector iteration = four targets
+ cmp tmp0q, tmp1q
+ jge .beginloop2
+ jmp .loop1
+
+.endloop1:
+ sub tmp0q, 12 ; undo the look-ahead
+.littleloop1:
+ movss m0, [pq+2*tmp0q] ; scalar tail: one target per iteration
+ movss m1, [pq+2*tmp0q-4] ; left neighbour
+ movss m2, [pq+2*tmp0q+4] ; right neighbour
+ addss m1, m2
+ mulss m1, m3 ; low lane of m3 holds DELTA
+ subss m0, m1
+ movss [pq+2*tmp0q], m0
+ add tmp0q, 4 ; next target, two floats further
+ cmp tmp0q, tmp1q
+ jl .littleloop1
+
+.beginloop2:
+ movups m3, [F_LFTG_GAMMA] ; pass 2: target -= GAMMA * (left + right), targets one float after those of pass 1
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1
+ sub tmp0q, 4 ; first target is p[i0-1] (address 2*tmp0q+4)
+ shr tmp1q, 1
+ add tmp1q, 4 ; end bound = 2*i1 + 4
+ cmp tmp0q, tmp1q
+ jge .beginloop3
+.loop2:
+ add tmp0q, 12 ; look ahead for four targets
+ cmp tmp0q, tmp1q
+ jge .endloop2
+
+ movups m0, [pq+2*tmp0q-24] ; eight floats starting one before the first target
+ movups m4, [pq+2*tmp0q-8]
+ movups m1, m0
+ shufps m0, m4, 0xDD ; targets
+ shufps m1, m4, 0x88 ; left neighbours
+ movups m2, [pq+2*tmp0q-20]
+ movups m5, [pq+2*tmp0q-4]
+ shufps m2, m5, 0xDD ; right neighbours
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2
+ movups m4, m1
+ shufps m1, m0, 0x44 ; re-interleave neighbours and updated targets (low half)
+ shufps m1, m1, 0xD8
+ shufps m4, m0, 0xEE ; re-interleave (high half)
+ shufps m4, m4, 0xD8
+ movups [pq+2*tmp0q-24], m1
+ movups [pq+2*tmp0q-8], m4
+
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jge .beginloop3
+ jmp .loop2
+
+.endloop2:
+ sub tmp0q, 12 ; undo the look-ahead
+.littleloop2:
+ movss m0, [pq+2*tmp0q+4] ; scalar tail of pass 2
+ movss m1, [pq+2*tmp0q]
+ movss m2, [pq+2*tmp0q+8]
+ addss m1, m2
+ mulss m1, m3
+ subss m0, m1
+ movss [pq+2*tmp0q+4], m0
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jl .littleloop2
+
+.beginloop3:
+ movups m3, [F_LFTG_BETA] ; pass 3: target += BETA * (left + right), same addressing as pass 1
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1
+ sub tmp0q, 4
+ shr tmp1q, 1
+ add tmp1q, 8
+ cmp tmp0q, tmp1q
+ jge .beginloop4
+.loop3:
+ add tmp0q, 12 ; look ahead for four targets
+ cmp tmp0q, tmp1q
+ jge .endloop3
+
+ movups m0, [pq+2*tmp0q-28]
+ movups m4, [pq+2*tmp0q-12]
+ movups m1, m0
+ shufps m0, m4, 0xDD ; targets
+ shufps m1, m4, 0x88 ; left neighbours
+ movups m2, [pq+2*tmp0q-24]
+ movups m5, [pq+2*tmp0q-8]
+ shufps m2, m5, 0xDD ; right neighbours
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2 ; this pass adds instead of subtracting
+ movups m4, m1
+ shufps m1, m0, 0x44
+ shufps m1, m1, 0xD8
+ shufps m4, m0, 0xEE
+ shufps m4, m4, 0xD8
+ movups [pq+2*tmp0q-28], m1
+ movups [pq+2*tmp0q-12], m4
+
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jge .beginloop4
+ jmp .loop3
+
+.endloop3:
+ sub tmp0q, 12 ; undo the look-ahead
+.littleloop3:
+ movss m0, [pq+2*tmp0q] ; scalar tail of pass 3
+ movss m1, [pq+2*tmp0q-4]
+ movss m2, [pq+2*tmp0q+4]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [pq+2*tmp0q], m0
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jl .littleloop3
+
+.beginloop4:
+ movups m3, [F_LFTG_ALPHA] ; pass 4: target += ALPHA * (left + right), same addressing as pass 2
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1
+ sub tmp0q, 4
+ shr tmp1q, 1
+ add tmp1q, 4
+ cmp tmp0q, tmp1q
+ jge .end
+.loop4:
+ add tmp0q, 12 ; look ahead for four targets
+ cmp tmp0q, tmp1q
+ jge .endloop4
+
+ movups m0, [pq+2*tmp0q-24]
+ movups m4, [pq+2*tmp0q-8]
+ movups m1, m0
+ shufps m0, m4, 0xDD ; targets
+ shufps m1, m4, 0x88 ; left neighbours
+ movups m2, [pq+2*tmp0q-20]
+ movups m5, [pq+2*tmp0q-4]
+ shufps m2, m5, 0xDD ; right neighbours
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2
+ movups m4, m1
+ shufps m1, m0, 0x44
+ shufps m1, m1, 0xD8
+ shufps m4, m0, 0xEE
+ shufps m4, m4, 0xD8
+ movups [pq+2*tmp0q-24], m1
+ movups [pq+2*tmp0q-8], m4
+
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jge .end
+ jmp .loop4
+
+.endloop4:
+ sub tmp0q, 12 ; undo the look-ahead
+.littleloop4:
+ movss m0, [pq+2*tmp0q+4] ; scalar tail of pass 4
+ movss m1, [pq+2*tmp0q]
+ movss m2, [pq+2*tmp0q+8]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [pq+2*tmp0q+4], m0
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jl .littleloop4
+
+.end:
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+SR1D97FLOAT
+
@@ -23,12 +23,15 @@
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+void ff_sr_1d97_float_sse(float *p, int i0, int i1); /* NOTE(review): the same prototype is declared in libavcodec/jpeg2000dwt.h, included above - this copy looks redundant */
+
av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
From: Maxime Taisant <maximetaisant@hotmail.fr> Hi, I am currently working on SSE optimizations for the DWT functions used to decode JPEG2000. For the moment, I have only managed to produce an SSE-optimized version of the sr_1d97_float function (with relatively good results). I would like to have some comments on my work so far, to know whether I am on the right track or whether there are some parts that I need to improve or modify. Thank you. --- libavcodec/jpeg2000dwt.c | 5 +- libavcodec/jpeg2000dwt.h | 2 + libavcodec/x86/jpeg2000dsp.asm | 268 ++++++++++++++++++++++++++++++++++++++ libavcodec/x86/jpeg2000dsp_init.c | 3 + 4 files changed, 277 insertions(+), 1 deletion(-)