old mode 100644
new mode 100755
@@ -154,3 +154,107 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c
INIT_XMM sse4
FILTER_3X3
%endif
+
+; void filter_row_sse4(uint8_t *dst, int width,
+;                      float rdiv, float bias, const int *const matrix,
+;                      const uint8_t *c[], int peak, int radius,
+;                      int dstride, int stride)
+;
+; For x in [0, width): dst[x] = clip_u8(trunc(sum * rdiv + bias + 0.5)), with
+; sum = c[0][x] * matrix[0] + ... + c[2*radius][x] * matrix[2*radius].
+; Nothing is written when width <= 0. peak, dstride and stride are not read.
+; The main loop handles mmsize/4 pixels per pass; a scalar loop does the rest.
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_row, 6, 10, 6, dst, width, matrix, ptr, mult, rad, r, x, i, ci
+%else
+cglobal filter_row, 4, 10, 6, dst, width, rdiv, bias, matrix, ptr, mult, rad, r, x, i, ci
+%endif
+
+%if WIN64
+    SWAP m0, m2                     ; rdiv is the 3rd argument (xmm2); address it as m0
+    SWAP m1, m3                     ; bias is the 4th argument (xmm3); address it as m1
+    mov r2q, matrixmp               ; remaining arguments are fetched from the stack
+    mov r3q, ptrmp
+    mov r5q, radmp                  ; upper half is fixed by the sign extension below
+    DEFINE_ARGS dst, width, matrix, ptr, mult, rad, r, x, i, ci
+%endif
+
+    movsxdifnidn widthq, widthd
+    test widthq, widthq
+    jle .end                        ; both loops test at the bottom: skip them for width <= 0
+    movsxdifnidn radq, radd
+    lea radq, [radq*2 + 1]          ; number of taps: 2*radius+1
+    VBROADCASTSS m0, m0             ; rdiv in every lane
+    VBROADCASTSS m1, m1             ; bias in every lane
+    movss m5, [half]
+    VBROADCASTSS m5, m5             ; rounding term in every lane
+
+    xor xq, xq                      ; x = 0
+    cmp widthq, mmsize/4
+    jl .loop2                       ; too narrow for one vector pass: scalar only
+
+    mov rq, widthq
+    and rq, mmsize/4-1              ; r = pixels left for the scalar tail
+    sub widthq, rq                  ; width = part covered by .loop1
+
+.loop1:                             ; mmsize/4 output pixels per iteration
+    pxor m4, m4                     ; m4 = per-pixel 32-bit sums
+    xor iq, iq
+.loop1_1:
+    movss m2, [matrixq + 4*iq]
+    VBROADCASTSS m2, m2             ; matrix[i] in every lane
+    mov ciq, [ptrq + iq * gprsize]  ; ci = c[i]
+    pmovzxbd m3, [ciq + xq]         ; 4 source bytes zero-extended to dwords
+    pmulld m3, m2
+    paddd m4, m3                    ; sum += c[i][x..x+3] * matrix[i]
+    add iq, 1
+    cmp iq, radq
+    jl .loop1_1
+
+    cvtdq2ps m4, m4
+    mulps m4, m0                    ; sum *= rdiv
+    addps m4, m1                    ; sum += bias
+    addps m4, m5                    ; sum += 0.5
+    cvttps2dq m4, m4                ; truncate back to integers
+    packssdw m4, m4
+    packuswb m4, m4                 ; saturate to 0..255
+    movd [dstq + xq], m4            ; store 4 result bytes
+
+    add xq, mmsize/4
+    cmp xq, widthq
+    jl .loop1
+
+    add widthq, rq                  ; restore the full width
+    cmp xq, widthq
+    jge .end                        ; no tail pixels left
+
+.loop2:                             ; one output pixel per iteration
+    xor rd, rd                      ; r now holds the running sum
+    xor iq, iq
+.loop2_2:
+    mov ciq, [ptrq + iq * gprsize]  ; ci = c[i]
+    movzx multd, byte [ciq + xq]
+    imul multd, [matrixq + 4*iq]
+    add rd, multd                   ; sum += c[i][x] * matrix[i]
+    add iq, 1
+    cmp iq, radq
+    jl .loop2_2
+
+    pxor m4, m4                     ; clear m4 before the scalar convert
+    cvtsi2ss m4, rd
+    mulss m4, m0                    ; sum *= rdiv
+    addss m4, m1                    ; sum += bias
+    addss m4, m5                    ; sum += 0.5
+    cvttps2dq m4, m4
+    packssdw m4, m4
+    packuswb m4, m4                 ; saturate to 0..255
+    movd rd, m4
+    mov [dstq + xq], rb             ; store the low byte only
+    add xq, 1
+    cmp xq, widthq
+    jl .loop2
+.end:
+    RET
+%endif
@@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
const uint8_t *c[], int peak, int radius,
int dstride, int stride);
+void ff_filter_row_sse4(uint8_t *dst, int width,
+ float rdiv, float bias, const int *const matrix,
+ const uint8_t *c[], int peak, int radius,
+ int dstride, int stride); /* NOTE(review): presumably the asm filter_row built with INIT_XMM sse4 - confirm */
+
av_cold void ff_convolution_init_x86(ConvolutionContext *s)
{
#if ARCH_X86_64
@@ -41,6 +46,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
s->filter[i] = ff_filter_3x3_sse4;
}
}
+ if (s->mode[i] == MATRIX_ROW) {
+ if (EXTERNAL_SSE4(cpu_flags))
+ s->filter[i] = ff_filter_row_sse4;
+ }
}
#endif
}