@@ -236,7 +236,11 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
-#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_C_H(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+
+#define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
@@ -372,13 +376,9 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
-#define H264_MC_4816(MMX)\
-H264_MC(put_, 4, MMX, 8)\
-H264_MC(put_, 8, MMX, 8)\
-H264_MC(put_, 16,MMX, 8)\
-H264_MC(avg_, 4, MMX, 8)\
-H264_MC(avg_, 8, MMX, 8)\
-H264_MC(avg_, 16,MMX, 8)\
+#define H264_MC(QPEL, SIZE, MMX, ALIGN)\
+QPEL(put_, SIZE, MMX, ALIGN) \
+QPEL(avg_, SIZE, MMX, ALIGN) \
#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
@@ -397,7 +397,14 @@ QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
-H264_MC_4816(mmxext)
+H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
+#if ARCH_X86_32
+H264_MC(H264_MC_C_V_H_HV, 8, mmxext, 8)
+H264_MC(H264_MC_C_V_H_HV, 16, mmxext, 8)
+#else
+H264_MC(H264_MC_C_H, 8, mmxext, 8)
+H264_MC(H264_MC_C_H, 16, mmxext, 8)
+#endif
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
@@ -499,12 +506,16 @@ QPEL16(mmxext)
#endif /* HAVE_X86ASM */
-#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
+#define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
+ } while (0)
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
+ do { \
+ SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX); \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
@@ -543,11 +554,16 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
+#if ARCH_X86_32
+#define SET_MMXEXT_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)
+#else
+#define SET_MMXEXT_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX)
+#endif
+ SET_MMXEXT_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
+ SET_MMXEXT_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
+ SET_MMXEXT_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
+ SET_MMXEXT_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
#if ARCH_X86_32
@@ -461,9 +461,11 @@ cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride,
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
+%endif
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
@@ -581,8 +583,10 @@ cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
+%endif
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
x64 always has MMX, MMXEXT, SSE and SSE2 and this means that some functions for MMX, MMXEXT, SSE and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2). This commit therefore disables several MMXEXT functions (that are overridden by SSE2 functions) at compile-time for x64. Notice that some 10-bit SSE2 functions are overridden by sse2_cache64 functions in the same code block. This is suboptimal and the functions that are overridden should either be removed or the sse2_cache64 functions be put behind suitable checks. This commit does neither. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- I would love to get input on what to do with these sse2_cache64 functions. If no one says anything, I will send a patch that retains the current behaviour and removes the functions overridden by the sse2_cache64 functions. libavcodec/x86/h264_qpel.c | 44 +++++++++++++++++++++---------- libavcodec/x86/h264_qpel_8bit.asm | 4 +++ 2 files changed, 34 insertions(+), 14 deletions(-)