@@ -389,19 +389,27 @@ static void filter_row(uint8_t *dst, int width,
static void filter_column(uint8_t *dst, int height,
float rdiv, float bias, const int *const matrix,
- const uint8_t *c[], int peak, int radius,
+ const uint8_t *c[], int length, int radius,
int dstride, int stride)
{
- int y;
+ int y, off16;
+ // NOTE: aligned to 64 bytes so that the 16 int accumulators can fill one whole cache line
+ DECLARE_ALIGNED(64, int, sum)[16];
for (y = 0; y < height; y++) {
- int i, sum = 0;
+ int i;
+ memset(sum, 0, sizeof(sum));
- for (i = 0; i < 2 * radius + 1; i++)
- sum += c[i][0 + y * stride] * matrix[i];
+ for (i = 0; i < 2 * radius + 1; i++) {
+ for (off16 = 0; off16 < length; off16++) {
+ sum[off16] += c[i][0 + y * stride + off16] * matrix[i];
+ }
+ }
- sum = (int)(sum * rdiv + bias + 0.5f);
- dst[0] = av_clip_uint8(sum);
+ for (off16 = 0; off16 < length; off16++) {
+ sum[off16] = (int)(sum[off16] * rdiv + bias + 0.5f);
+ dst[off16] = av_clip_uint8(sum[off16]);
+ }
dst += dstride;
}
}
@@ -521,7 +529,10 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
continue;
}
- for (y = slice_start; y < slice_end; y++) {
+ const int step = mode == MATRIX_COLUMN ? 16 : 1;
+ int smax = mode == MATRIX_COLUMN ? 16: s->max;
+ for (y = slice_start; y < slice_end; y += step) {
+ if (mode == MATRIX_COLUMN && slice_end - y < 16) smax = slice_end - y;
const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : radius * bpc;
const int yoff = mode == MATRIX_COLUMN ? radius * stride : 0;
@@ -531,12 +542,12 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
s->filter[plane](dst + yoff + xoff, 1, rdiv,
- bias, matrix, c, s->max, radius,
+ bias, matrix, c, smax, radius,
dstride, stride);
}
s->setup[plane](radius, c, src, stride, radius, width, y, height, bpc);
s->filter[plane](dst + yoff + xoff, sizew - 2 * radius,
- rdiv, bias, matrix, c, s->max, radius,
+ rdiv, bias, matrix, c, smax, radius,
dstride, stride);
for (x = sizew - radius; x < sizew; x++) {
const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : x * bpc;
@@ -544,7 +555,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
s->setup[plane](radius, c, src, stride, x, width, y, height, bpc);
s->filter[plane](dst + yoff + xoff, 1, rdiv,
- bias, matrix, c, s->max, radius,
+ bias, matrix, c, smax, radius,
dstride, stride);
}
if (mode != MATRIX_COLUMN)