@@ -24,6 +24,7 @@
SECTION_RODATA
cextern pd_1
+; Four packed dwords, each equal to 2 (16 bytes in total). Loaded by
+; LEGALL53_VERTICAL_LO as the rounding term added before its shift by 2.
+; NOTE(review): 16 bytes only fills one XMM register; a wider-vector
+; instantiation that loads mmsize bytes from here would need more entries.
+pd_2: times 4 dd 2
SECTION .text
@@ -100,9 +101,63 @@ REP_RET
%endmacro
+; LEGALL53_VERTICAL_LO
+; Emits legall53_vertical_lo(b0, b1, b2, w). For every 32-bit element i of
+; the rows it computes, in place,
+;     b1[i] -= (b0[i] + b2[i] + 2) >> 2      (arithmetic shift)
+; b0 and b2 are only read; b1 is read and written; w is an element count.
+;
+; Caller contract visible from the loop below:
+;   * the counter is tested after the body, so w == 0 still processes one
+;     full vector;
+;   * the counter advances by mmsize bytes per iteration, so w must be a
+;     non-zero multiple of mmsize/4 elements or the loop overruns the rows.
+; NOTE(review): all row accesses use mova, so the rows presumably have to be
+; mmsize-aligned -- confirm against x86inc's mova and the callers.
+%macro LEGALL53_VERTICAL_LO 0
+
+cglobal legall53_vertical_lo, 4, 4, 4, b0, b1, b2, w
+ mova m3, [pd_2] ; m3 = rounding term (pd_2: dwords of 2)
+ shl wd, 2 ; element count -> byte count (4 bytes per element)
+ add b0q, wq ; advance every row pointer to the end of its row ...
+ add b1q, wq
+ add b2q, wq
+ neg wq ; ... and index from there with a negative offset that counts up to 0
+
+ ALIGN 16
+ .loop:
+ mova m0, [b0q + wq]
+ mova m1, [b1q + wq]
+ mova m2, [b2q + wq]
+ paddd m0, m2 ; m0 = b0 + b2
+ paddd m0, m3 ; m0 = b0 + b2 + 2
+ psrad m0, 2 ; m0 = (b0 + b2 + 2) >> 2, sign-preserving
+ psubd m1, m0 ; m1 = b1 - m0
+ mova [b1q + wq], m1 ; write the result back over the b1 row
+ add wq, mmsize ; step to the next vector; the flags feed the jl below
+ jl .loop ; keep going while the offset is still negative
+RET
+
+%endmacro
+
+; LEGALL53_VERTICAL_HI
+; Emits legall53_vertical_hi(b0, b1, b2, w). For every 32-bit element i of
+; the rows it computes, in place,
+;     b1[i] += (b0[i] + b2[i] + r) >> 1      (arithmetic shift)
+; where r is the per-lane value loaded from pd_1.
+; NOTE(review): pd_1 is an external symbol (cextern); presumably 1 in every
+; dword lane -- confirm at its definition.
+; b0 and b2 are only read; b1 is read and written; w is an element count.
+;
+; Caller contract visible from the loop below:
+;   * the counter is tested after the body, so w == 0 still processes one
+;     full vector;
+;   * the counter advances by mmsize bytes per iteration, so w must be a
+;     non-zero multiple of mmsize/4 elements or the loop overruns the rows.
+; NOTE(review): all row accesses use mova, so the rows presumably have to be
+; mmsize-aligned -- confirm against x86inc's mova and the callers.
+%macro LEGALL53_VERTICAL_HI 0
+
+cglobal legall53_vertical_hi, 4, 4, 4, b0, b1, b2, w
+ mova m3, [pd_1] ; m3 = rounding term r
+ shl wd, 2 ; element count -> byte count (4 bytes per element)
+ add b0q, wq ; advance every row pointer to the end of its row ...
+ add b1q, wq
+ add b2q, wq
+ neg wq ; ... and index from there with a negative offset that counts up to 0
+
+ ALIGN 16
+ .loop:
+ mova m0, [b0q + wq]
+ mova m1, [b1q + wq]
+ mova m2, [b2q + wq]
+ paddd m0, m2 ; m0 = b0 + b2
+ paddd m0, m3 ; m0 = b0 + b2 + r
+ psrad m0, 1 ; m0 = (b0 + b2 + r) >> 1, sign-preserving
+ paddd m1, m0 ; m1 = b1 + m0
+ mova [b1q + wq], m1 ; write the result back over the b1 row
+ add wq, mmsize ; step to the next vector; the flags feed the jl below
+ jl .loop ; keep going while the offset is still negative
+RET
+
+%endmacro
+
INIT_XMM sse2
HAAR_HORIZONTAL
HAAR_VERTICAL
+; Expand the LeGall 5/3 vertical kernels under the INIT_XMM sse2 setup above;
+; the C side declares them as ff_legall53_vertical_{hi,lo}_sse2.
+LEGALL53_VERTICAL_HI
+LEGALL53_VERTICAL_LO
INIT_XMM avx
HAAR_HORIZONTAL
@@ -23,6 +23,9 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/dirac_dwt.h"
+/* Assembly kernels for the LeGall 5/3 vertical steps. Each one rewrites b1 in
+ * place from b0, b1 and b2. The asm loop handles 16 bytes (four int32_t) per
+ * iteration and checks its counter only after the first iteration, so width
+ * should be a non-zero multiple of 4. */
+void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
+void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
+
void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
@@ -91,6 +94,22 @@ static void horizontal_compose_haar_avx2(int32_t *b, int32_t *tmp, int width)
}
}
+/**
+ * LeGall 5/3 vertical "lo" step over one row, in place:
+ * b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]) for every i in [0, width).
+ *
+ * The leading (width & ~3) elements are handled by the SSE2 kernel, the
+ * remaining 0..3 elements by the scalar macro.
+ *
+ * @param b0    first input row (read only)
+ * @param b1    row that is read and overwritten with the result
+ * @param b2    second input row (read only)
+ * @param width number of int32_t elements to process in each row
+ */
+static void legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width)
+{
+    int i = width & ~3;
+
+    /* The asm loop tests its counter only after the body, so a count of 0
+     * would still process four elements. For width < 4 that would clobber
+     * b1[width..3] and make the scalar loop below apply the step a second
+     * time to b1[0..width-1]. Skip the kernel when there is no full vector. */
+    if (i > 0)
+        ff_legall53_vertical_lo_sse2(b0, b1, b2, i);
+
+    for (; i < width; i++) {
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]);
+    }
+}
+
+/**
+ * LeGall 5/3 vertical "hi" step over one row, in place:
+ * b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]) for every i in [0, width).
+ *
+ * The leading (width & ~3) elements are handled by the SSE2 kernel, the
+ * remaining 0..3 elements by the scalar macro.
+ *
+ * @param b0    first input row (read only)
+ * @param b1    row that is read and overwritten with the result
+ * @param b2    second input row (read only)
+ * @param width number of int32_t elements to process in each row
+ */
+static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width)
+{
+    int i = width & ~3;
+
+    /* The asm loop tests its counter only after the body, so a count of 0
+     * would still process four elements. For width < 4 that would clobber
+     * b1[width..3] and make the scalar loop below apply the step a second
+     * time to b1[0..width-1]. Skip the kernel when there is no full vector. */
+    if (i > 0)
+        ff_legall53_vertical_hi_sse2(b0, b1, b2, i);
+
+    for (; i < width; i++) {
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
+    }
+}
+
av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
@@ -98,6 +117,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
if (EXTERNAL_SSE2(cpu_flags)) {
switch (type) {
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_h0 = (void*)legall53_vertical_hi_sse2;
+ d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
+ break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
break;