@@ -227,7 +227,7 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
punpckhbw m1, m4
mova m2, [obmcq+i]
mova m3, m2
- punpcklbw m2, m4
+ punpcklbw m2, m4
punpckhbw m3, m4
pmullw m0, m2
pmullw m1, m3
@@ -247,9 +247,6 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
RET
%endm
-INIT_MMX
-ADD_OBMC 8, mmx
-
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
@@ -258,6 +255,24 @@ HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
+; void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride,
+;                              const uint8_t *obmc_weight, int yblen)
+; Per row, for 8 pixels: dst[x] += src[x] * obmc_weight[x], computed in
+; 16-bit lanes. Runs yblen rows; the loop body always executes at least once.
+; Only the five real arguments are requested, so no nonexistent sixth
+; argument is loaded on x86-32.
+cglobal add_dirac_obmc8_sse2, 5,5,5, dst, src, stride, obmc, yblen
+    movsxdifnidn strideq, strided       ; stride is a C int: upper 32 bits are undefined on x86-64
+    pxor        m4, m4                  ; m4 = 0, used to zero-extend bytes to words
+.loop:
+    movh        m0, [srcq]              ; load 8 source bytes
+    punpcklbw   m0, m4                  ; widen to 8 x 16-bit
+    movh        m1, [obmcq]             ; load 8 weight bytes
+    punpcklbw   m1, m4                  ; widen to 8 x 16-bit
+    pmullw      m0, m1                  ; src * weight, low 16 bits of each product
+    movu        m1, [dstq]              ; unaligned load: no alignment assumed for dst
+    paddw       m0, m1                  ; accumulate into the existing dst values
+    movu        [dstq], m0
+    lea         srcq, [srcq+strideq]    ; next source row (stride in bytes)
+    lea         dstq, [dstq+2*strideq]  ; dst advances 2*stride bytes per row
+    add         obmcq, 32               ; weight pointer advances 32 bytes per row
+    sub         yblend, 1
+    jg          .loop
+    RET
+
INIT_XMM sse4
; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
@@ -24,8 +24,7 @@
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
-void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-
+void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
@@ -94,15 +93,12 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
#if HAVE_X86ASM
int mm_flags = av_get_cpu_flags();
- if (EXTERNAL_MMX(mm_flags)) {
- c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
- }
-
if (EXTERNAL_SSE2(mm_flags)) {
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
c->add_rect_clamped = ff_add_rect_clamped_sse2;
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
+ c->add_dirac_obmc[0] = ff_add_dirac_obmc8_sse2;
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
@@ -116,5 +112,5 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
}
-#endif
+#endif // HAVE_X86ASM
}