diff mbox series

[FFmpeg-devel] Add support for "omp simd" pragma.

Message ID 20210110164351.86350-1-Reimar.Doeffinger@gmx.de
State New
Headers show
Series [FFmpeg-devel] Add support for "omp simd" pragma.
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Reimar Döffinger Jan. 10, 2021, 4:43 p.m. UTC
From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>

This requests loops to be vectorized using SIMD
instructions.
The performance increase is far from hand-optimized
assembly but still significant over the plain C version.
Typical values are a 2-4x speedup where a hand-written
version would achieve 4x-10x.
So it is far from a replacement; however, some architectures
will get hand-written assembler quite late or not at all,
and this is a good improvement for a trivial amount of work.
The cause, besides the compiler being a compiler, is
usually that it does not manage to use saturating instructions
and thus has to use 32-bit operations where actually
saturating 16-bit operations would be sufficient.
Other causes are for example the av_clip functions that
are not ideal for vectorization (and even as scalar code
not optimal for any modern CPU that has either CSEL or
MAX/MIN instructions).
And of course this only works for relatively simple
loops; the IDCT functions, for example, did not seem
possible to optimize this way.
Also note that while clang may accept the code and sometimes
produces warnings, it does not seem to do anything actually
useful at all.
Here are example measurements using gcc 10 under Linux (in a VM unfortunately)
on AArch64 on Apple M1:
Command:
time ./ffplay_g LG\ 4K\ HDR\ Demo\ -\ New\ York.ts -t 10 -autoexit -threads 1 -noframedrop

Original code:
real    0m19.572s
user    0m23.386s
sys     0m0.213s

Changing all put_hevc:
real    0m15.648s
user    0m19.503s (83.4% of original)
sys     0m0.186s

In addition changing add_residual:
real    0m15.424s
user    0m19.278s (82.4% of original)
sys     0m0.133s

In addition changing planar copy dither:
real    0m15.040s
user    0m18.874s (80.7% of original)
sys     0m0.168s

Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
---
 configure                     | 23 +++++++++++++++++
 libavcodec/hevcdsp_template.c | 47 +++++++++++++++++++++++++++++++++++
 libavutil/internal.h          |  6 +++++
 libswscale/swscale_unscaled.c |  3 +++
 4 files changed, 79 insertions(+)

Comments

Lynne Jan. 10, 2021, 6:55 p.m. UTC | #1
Jan 10, 2021, 17:43 by Reimar.Doeffinger@gmx.de:

> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>

I think I have to disagree.
The performance gains are marginal, it's definitely something the compiler should
be able to decide on its own, and it makes performance highly compiler-dependent.
And I'm not even resorting to the painfully obvious FUD arguments that could be made.

Most of the loops this is added to are trivially SIMDable. Just because no one has
had the motivation to do SIMD for a pretty unpopular codec doesn't mean we should
compromise.
Carl Eugen Hoyos Jan. 11, 2021, 12:26 a.m. UTC | #2
Am So., 10. Jan. 2021 um 19:55 Uhr schrieb Lynne <dev@lynne.ee>:
>
> Jan 10, 2021, 17:43 by Reimar.Doeffinger@gmx.de:
>
> > From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
> >
> > This requests loops to be vectorized using SIMD
> > instructions.
> > The performance increase is far from hand-optimized
> > assembly but still significant over the plain C version.
> > Typical values are a 2-4x speedup where a hand-written
> > version would achieve 4x-10x.
> > So it is far from a replacement, however some architures
> > will get hand-written assembler quite late or not at all,
> > and this is a good improvement for a trivial amount of work.
> > The cause, besides the compiler being a compiler, is
> > usually that it does not manage to use saturating instructions
> > and thus has to use 32-bit operations where actually
> > saturating 16-bit operations would be sufficient.
> > Other causes are for example the av_clip functions that
> > are not ideal for vectorization (and even as scalar code
> > not optimal for any modern CPU that has either CSEL or
> > MAX/MIN instructions).
> > And of course this only works for relatively simple
> > loops, the IDCT functions for example seemed not possible
> > to optimize that way.
> > Also note that while clang may accept the code and sometimes
> > produces warnings, it does not seem to do anything actually
> > useful at all.
> > Here are example measurements using gcc 10 under Linux (in a VM unfortunately)
> > on AArch64 on Apple M1:
> > Commad:
> > time ./ffplay_g LG\ 4K\ HDR\ Demo\ -\ New\ York.ts -t 10 -autoexit -threads 1 -noframedrop
> >
> > Original code:
> > real    0m19.572s
> > user    0m23.386s
> > sys     0m0.213s
> >
> > Changing all put_hevc:
> > real    0m15.648s
> > user    0m19.503s (83.4% of original)
> > sys     0m0.186s
> >
> > In addition changing add_residual:
> > real    0m15.424s
> > user    0m19.278s (82.4% of original)
> > sys     0m0.133s
> >
> > In addition changing planar copy dither:
> > real    0m15.040s
> > user    0m18.874s (80.7% of original)
> > sys     0m0.168s
> >
>
> I think I have to disagree.

> The performance gains are marginal

This sounds wrong.

Carl Eugen
Paul B Mahol Jan. 11, 2021, 11:03 a.m. UTC | #3
On Mon, Jan 11, 2021 at 1:26 AM Carl Eugen Hoyos <ceffmpeg@gmail.com> wrote:

> Am So., 10. Jan. 2021 um 19:55 Uhr schrieb Lynne <dev@lynne.ee>:
> >
> > Jan 10, 2021, 17:43 by Reimar.Doeffinger@gmx.de:
> > I think I have to disagree.
>
> > The performance gains are marginal
>
> This sounds wrong.
>

I disagree with Carl.


>
> Carl Eugen
Reimar Döffinger Jan. 12, 2021, 6:28 p.m. UTC | #4
> 
> On 10 Jan 2021, at 19:55, Lynne <dev@lynne.ee> wrote:
> 
> Jan 10, 2021, 17:43 by Reimar.Doeffinger@gmx.de:
> 
>> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
>> 
>> real    0m15.040s
>> user    0m18.874s (80.7% of original)
>> sys     0m0.168s
>> 
> 
> I think I have to disagree.
> The performance gains are marginal,

It’s almost 20%. At least for this combination of
codec and stream a large amount of time is spent in
non-DSP functions, so even hand-written assembler
won’t give you huge gains.


> its definitely something the compiler should
> be able to decide on its own,

So you object to unlikely() macros as well?
It’s really just giving the compiler a hint it should try, though I admit the configure part makes it
look otherwise.

> Most of the loops this is added to are trivially SIMDable.

How many hours of effort do you consider “trivial”?
Especially if it’s someone not experienced?
It might be fairly trivial with intrinsics, however
many of your counter-arguments also apply
to intrinsics (and to a degree inline assembly).
That’s btw not just a rhetorical question because
I’m pretty sure I am not going to all the trouble
to port more of the arm 32-bit assembler functions
since it’s a huge PITA, and I was wondering if there
was a point to even have a try with intrinsics...

> Just because no one has
> had the motivation to do SIMD for a pretty unpopular codec doesn't mean we should
> compromise.

If you think of AArch64 specifically, I can
kind of agree.
However I wouldn’t say the word “compromise”
is appropriate when there’s a good chance nothing
better will ever come to exist.
But the real point is not AArch64, that is just
a very convenient test platform.
The point is to raise the minimum bar.
A new architecture, RISC-V for example or something
else should not be stuck at scalar performance
until someone actually gets around to implementing
assembler optimizations.
And just to be clear: I don’t actually care about
HEVC, it just seemed a nice target to do some
experiments.

Best regards,
Reimar
Soft Works Jan. 12, 2021, 6:52 p.m. UTC | #5
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Reimar.Doeffinger@gmx.de
> Sent: Sunday, January 10, 2021 5:44 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
> Subject: [FFmpeg-devel] [PATCH] Add support for "omp simd" pragma.
> 
> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>

...

> +if enabled openmp_simd; then
> +    ompopt="-fopenmp"
> +    if ! test_cflags $ompopt ; then
> +        test_cflags -Xpreprocessor -fopenmp && ompopt="-Xpreprocessor -
> fopenmp"

Isn't it sufficient to specify -fopenmp-simd instead of -fopenmp for this patch?

As OMP SIMD is the only openmp feature that is used, there's no need to link
to the openmp lib. 
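A configure-style probe along these lines could look roughly like the following sketch (a hypothetical standalone script, not the actual configure code): it prefers -fopenmp-simd, which enables only the simd pragmas and pulls in no OpenMP runtime, and falls back to -fopenmp:

```shell
#!/bin/sh
# Sketch of an "omp simd" compiler-flag probe (illustrative only).
tmpc=$(mktemp /tmp/omp_test_XXXXXX.c)
tmpo=${tmpc%.c}.o
cat > "$tmpc" <<'EOF'
int sum(const int *a, int n)
{
    int s = 0;
#pragma omp simd reduction(+:s)
    for (int i = 0; i < n; i++)
        s += a[i];
    return s;
}
EOF
CC=${CC:-cc}
if $CC -fopenmp-simd -c "$tmpc" -o "$tmpo" 2>/dev/null; then
    echo "omp_simd: enabled via -fopenmp-simd"   # no libgomp needed
elif $CC -fopenmp -c "$tmpc" -o "$tmpo" 2>/dev/null; then
    echo "omp_simd: enabled via -fopenmp"
else
    echo "omp_simd: disabled"
fi
rm -f "$tmpc" "$tmpo"
```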

softworkz
Reimar Döffinger Jan. 12, 2021, 7:17 p.m. UTC | #6
> On 12 Jan 2021, at 19:52, Soft Works <softworkz@hotmail.com> wrote:
> 
> 
> 
>> -----Original Message-----
>> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
>> Reimar.Doeffinger@gmx.de
>> Sent: Sunday, January 10, 2021 5:44 PM
>> To: ffmpeg-devel@ffmpeg.org
>> Cc: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
>> Subject: [FFmpeg-devel] [PATCH] Add support for "omp simd" pragma.
>> 
>> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
> 
> ...
> 
>> +if enabled openmp_simd; then
>> +    ompopt="-fopenmp"
>> +    if ! test_cflags $ompopt ; then
>> +        test_cflags -Xpreprocessor -fopenmp && ompopt="-Xpreprocessor -
>> fopenmp"
> 
> Isn't it sufficient to specify -fopenmp-simd instead of -fopenmp for this patch?

I think so, I just didn’t know/even expect that option to exist!
Thanks a lot for the tip!

> As OMP SIMD is the only openmp feature that is used, there's no need to link
> to the openmp lib. 


That it doesn’t do anyway because -fopenmp is not in the linker flags,
but I admit that was a bit of a hacky solution.

Thanks,
Reimar
Lynne Jan. 12, 2021, 8:46 p.m. UTC | #7
Jan 12, 2021, 19:28 by Reimar.Doeffinger@gmx.de:

>>
>> On 10 Jan 2021, at 19:55, Lynne <dev@lynne.ee> wrote:
>>
>> Jan 10, 2021, 17:43 by Reimar.Doeffinger@gmx.de:
>>
>>> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
>>>
>>> real    0m15.040s
>>> user    0m18.874s (80.7% of original)
>>> sys     0m0.168s
>>>
>>
>> I think I have to disagree.
>> The performance gains are marginal,
>>
>
> It’s almost 20%. At least for this combination of
> codec and stream a large amount of time is spend in
> non-DSP functions, so even hand-written assembler
> won’t give you huge gains.
>
It's non-guaranteed 20% on a single system. It could change, and it could very
well mess up like gcc does with autovectorization, which we still explicitly disable
(via -fno-tree-vectorize) because FATE fails. I was the one who sent an RFC somewhat
recently to try to undo that; even though it was only an RFC, the reaction from devs
was quite cold.



>> its definitely something the compiler should
>> be able to decide on its own,
>>
>
> So you object to unlikely() macros as well?
> It’s really just giving the compiler a hint it should try, though I admit the configure part makes it
> look otherwise.
>
I'm more against the macro and changes to the code itself. If you can make it
work without adding a macro to individual loops or the likes of av_cold/av_hot or
any other changes to the code, I'll be more welcoming.
I really _hate_ compiler hints. Take a look at the upipe source code to see what
a Cthulhian monstrosity made of hint flags looks like. Every single branch had
a cold/hot macro and it was the project's coding style. It's completely irredeemable.



>> Most of the loops this is added to are trivially SIMDable.
>>
>
> How many hours of effort do you consider “trivial”?
> Especially if it’s someone not experienced?
> It might be fairly trivial with intrinsics, however
> many of your counter-arguments also apply
> to intrinsics (and to a degree inline assembly).
> That’s btw not just a rhetorical question because
> I’m pretty sure I am not going to all the trouble
> to port more of the arm 32-bit assembler functions
> since it’s a huge PITA, and I was wondering if there
> was a point to even have a try with intrinsics...
>
Intrinsics and inline assembly are a whole different thing than magic
macros that tell and force the compiler what a well written compiler
should already very well know about.



>> Just because no one has
>> had the motivation to do SIMD for a pretty unpopular codec doesn't mean we should
>> compromise.
>>
>
> If you think of AArch64 specifically, I can
> kind of agree.
> However I wouldn’t say the word “compromise”
> is appropriate when there’s a good chance nothing
> better will ever come to exist.
> But the real point is not AArch64, that is just
> a very convenient test platform.
> The point is to raise the minimum bar.
> A new architecture, RISC-V for example or something
> else should not be stuck at scalar performance
> until someone actually gets around to implementing
> assembler optimizations.
> And just to be clear: I don’t actually care about
> HEVC, it just seemed a nice target to do some
> experiments.
>
I already said all that can be said here: this will halt efforts on actually
optimizing the code in exchange for naive trust in compilers.
New platforms will be stuck at scalar performance anyway until
the compilers for the arch are smart enough to deal with vectorization.
Reimar Döffinger Jan. 12, 2021, 9:32 p.m. UTC | #8
> On 12 Jan 2021, at 21:46, Lynne <dev@lynne.ee> wrote:
> 
> Jan 12, 2021, 19:28 by Reimar.Doeffinger@gmx.de:
> 
>> It’s almost 20%. At least for this combination of
>> codec and stream a large amount of time is spend in
>> non-DSP functions, so even hand-written assembler
>> won’t give you huge gains.
>> 
> It's non-guaranteed 20% on a single system. It could change, and it could very
> well mess up like gcc does with autovectorization, which we still explicitly disable
> because FATE fails (-fno-tree-vectorize, and I was the one who sent an RFC to
> try to undo it somewhat recently. Even though it was an RFC the reaction from devs
> was quite cold).

Oh, thanks for the reminder, I thought that was gone because it seems
it’s not used for clang, and MPlayer does not seem to set that.
I need to compare it, however the problem with the auto-vectorization
is exactly that the compiler will try to apply it to everything,
which has at least 2 issues:
1) it gigantically increases the risk of bugs when it applies to every
single loop, instead of only to loops that we already wrote assembler
for somewhere.
2) it will quite often make things worse, by vectorizing loops
that are rarely iterated over more than a few times (and the
compiler needs to generate a whole lot of extra code to handle
loop counts that are not a multiple of the vector size) - because all too often the compiler
can only take a wild guess if “width” is usually 1 or 1920,
while we DO know.

>>> its definitely something the compiler should
>>> be able to decide on its own,
>>> 
>> 
>> So you object to unlikely() macros as well?
>> It’s really just giving the compiler a hint it should try, though I admit the configure part makes it
>> look otherwise.
>> 
> I'm more against the macro and changes to the code itself. If you can make it
> work without adding a macro to individual loops or the likes of av_cold/av_hot or
> any other changes to the code, I'll be more welcoming.

I expect that will just run into the same issue as the tree-vectorize...

> I really _hate_ compiler hints. Take a look at the upipe source code to see what
> a cthulian monstrosity made of hint flags looks like. Every single branch had
> a cold/hot macro and it was the project's coding style. It's completely irredeemable.

I guess my suggested solution would be to require proof of
clearly measurable performance benefit.
But I see the point that if it gets “randomly” added to loops
that might turn out quite a mess.

>>> Most of the loops this is added to are trivially SIMDable.
>>> 
>> 
>> How many hours of effort do you consider “trivial”?
>> Especially if it’s someone not experienced?
>> It might be fairly trivial with intrinsics, however
>> many of your counter-arguments also apply
>> to intrinsics (and to a degree inline assembly).
>> That’s btw not just a rhetorical question because
>> I’m pretty sure I am not going to all the trouble
>> to port more of the arm 32-bit assembler functions
>> since it’s a huge PITA, and I was wondering if there
>> was a point to even have a try with intrinsics...
>> 
> Intrinsics and inline assembly are a whole different thing than magic
> macros that tell and force the compiler what a well written compiler
> should already very well know about.

There are no well written compilers, in a way ;)
I would also argue that most of what intrinsics do,
such a compiler should figure out on its own, too.
And the first time I tried intrinsics they slowed the
loop down by a factor 2 because the compiler stored and
loaded the value to the stack between every intrinsic,
so it’s not like they are without problems.
But I was actually thinking that it might be somewhat
interesting to have a kind of “generic SIMD intrinsics”.
Though I think I read that such a thing has already been
tried, so it might just be wasted time.

> I already said all that can be said here: this will halt efforts on actually
> optimizing the code in exchange for naive trust in compilers.

I’m not sure it will discourage it more than having to write
the optimizations over and over, for Armv7 NEON, for Armv8 Linux,
for Armv8 Windows, then SVE/SVE2, who knows maybe Armv9
will also need a rewrite.
SSE2, AVX2, AVX-512 for x86; so much stuff never gets ported
to the new versions.
I’d also claim anyone naively trusting in compilers is unlikely
to write SIMD optimizations either way :)

> New platforms will be stuck at scalar performance anyway until
> the compilers for the arch are smart enough to deal with vectorization.

That seems to happen a long time before someone gets around to
optimising FFmpeg though.
This is particularly true when it’s a new OS and not CPU architecture
platform.
For example macOS we are lucky enough that the assembler etc. are
largely compatible to Linux.
But for Windows-on-Arm there is no GNU assembler, and the Microsoft
assembler needs a completely different syntax, so even the assembly
we DO have just doesn’t work.

Anyway, thanks for the discussion.
I still think the situation with SIMD optimizations should be improved
SOMEHOW, but I have nothing but wild ideas on the HOW.
If anyone feels the same, I’d welcome further discussion.

Thanks,
Reimar
Soft Works Jan. 12, 2021, 9:32 p.m. UTC | #9
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of
> Lynne
> Sent: Tuesday, January 12, 2021 9:47 PM
> To: FFmpeg development discussions and patches <ffmpeg-
> devel@ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH] Add support for "omp simd" pragma.
> 
> Jan 12, 2021, 19:28 by Reimar.Doeffinger@gmx.de:
> 
> >>
> >> On 10 Jan 2021, at 19:55, Lynne <dev@lynne.ee> wrote:
> >>
> >> Jan 10, 2021, 17:43 by Reimar.Doeffinger@gmx.de:
> >>
> >>> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
> >>>
> >>> real    0m15.040s
> >>> user    0m18.874s (80.7% of original)
> >>> sys     0m0.168s
> >>>
> >>
> >> I think I have to disagree.
> >> The performance gains are marginal,
> >>
> >
> > It’s almost 20%. At least for this combination of codec and stream a
> > large amount of time is spend in non-DSP functions, so even
> > hand-written assembler won’t give you huge gains.
> >
> It's non-guaranteed 20% on a single system. It could change, and it could very
> well mess up like gcc does with autovectorization, which we still explicitly
> disable because FATE fails (-fno-tree-vectorize, and I was the one who sent
> an RFC to try to undo it somewhat recently. Even though it was an RFC the
> reaction from devs was quite cold).

I wonder whether there's a way to enable autovectorization only for 
specific loops? But that would probably be compiler-specific.

> >> its definitely something the compiler should be able to decide on its
> >> own,
> >>
> >
> > So you object to unlikely() macros as well?
> > It’s really just giving the compiler a hint it should try, though I
> > admit the configure part makes it look otherwise.
> >
> I'm more against the macro and changes to the code itself. If you can make it
> work without adding a macro to individual loops or the likes of
> av_cold/av_hot or any other changes to the code, I'll be more welcoming.
> I really _hate_ compiler hints. Take a look at the upipe source code to see
> what a cthulian monstrosity made of hint flags looks like. Every single branch
> had a cold/hot macro and it was the project's coding style. It's completely
> irredeemable.

OpenMP is a standard at least, which is supported by many compilers and
#pragma omp simd is not really a "monstrosity".


> >> Most of the loops this is added to are trivially SIMDable.

Could you provide some examples? What constructs would you suggest
that can be applied trivially? And would they be compiled as SIMD even
though -fno-tree-vectorize is set?

Thanks,
softworkz
Martin Storsjö Jan. 13, 2021, 8:04 a.m. UTC | #10
Hi,

On Tue, 12 Jan 2021, Reimar Döffinger wrote:

> I’m not sure it will discourage it more than having to write
> the optimizations over and over, for Armv7 NEON, for Armv8 Linux,
> for Armv8 Windows, then SVE/SVE2, who knows maybe Armv9
> will also need a rewrite.

NEON code for armv8 windows and armv8 linux uses the exact same 
source, no need to write it twice.

> For example macOS we are lucky enough that the assembler etc. are
> largely compatible to Linux.

I'm not sure I'd say it's luck, it's pretty much by design there.

Historically, macOS build tools used an ancient fork of GAS, with very 
limited macroing capabilities. To remedy this, the gas-preprocessor tool 
was invented, for expanding modern gas macros, producing just a straight 
up feed of instructions, passed on to the native platform tools.

In modern times, the build tools are based on Clang/LLVM, and they support 
essentially all modern gas macro features (including altmacro, which was 
added in Clang 5). There are many parties that have an interest in this 
feature, e.g. support for building the Linux kernel with Clang.

Due to backwards compatibility with the old GAS fork's macroing capability 
(I think), there are some subtle differences between LLVM's macro 
support for other platforms and darwin, e.g. on other platforms, it's ok 
to invoke a macro as either "mymacro param1, param2" or "mymacro param1 
param2" (without commas between the arguments). On darwin targets, only 
the former works as intended.

All other platform differences are abstracted away with our macros in 
libavutil/aarch64/asm.S, see e.g. 
http://git.videolan.org/?p=ffmpeg.git;a=blob;f=libavutil/aarch64/asm.S;h=d1fa72b3c65a4a58e76029e94b998d935649aa90;hb=ca21cb1e36ccae2ee71d4299d477fa9284c1f551#l85.
For darwin platforms, the movrel macro expands to "add rX, rX, 
symbol@PAGEOFF" while it expands to "add rX, rX, :lo12:symbol" on other 
platforms (ELF and COFF).

All the source files just use the high level macros function, endfunc, 
movrel, etc, which handle the few platform specific details that differ.


If you write code with just one tool, it's of course certainly possible to 
accidentally use some corner case detail that another tool objects to, but 
that's why one needs testing on multiple platforms, via a CI system or 
FATE or whatever. Just like you regularly need to test C code on various 
platforms, even if you'd expect it to work if it's properly written.

> But for Windows-on-Arm there is no GNU assembler, and the Microsoft
> assembler needs a completely different syntax, so even the assembly
> we DO have just doesn’t work.

This is not true at all.

GCC and binutils don't support windows on arm/arm64 at all, that far is 
true.

But Clang/LLVM do (with https://github.com/mstorsjo/llvm-mingw you have an 
easily available packaged cross compiler and all), and they support the 
GAS syntax asm just fine.

If building with MSVC tools, yes you're right that armasm.exe/armasm64.exe 
takes a different syntax. But the gas-preprocessor tool (which is picked 
up automatically by our configure, one just needs to make sure it's 
available) handles expanding all the macros and rewriting directives into 
the armasm form, and feeding it to the armasm tools. Works fine and have 
done so for years. There's even a wiki page which tries to explain how to 
do it (although it's probably outdated in some aspects), see 
https://trac.ffmpeg.org/wiki/CompilationGuide/WinRT.

We even have regular fate tests of these configurations, see e.g. these:

http://fate.ffmpeg.org/report.cgi?slot=aarch64-mingw32-clang-trunk&time=20210113064430

http://fate.ffmpeg.org/report.cgi?time=20210109152105&slot=arm64-msvc2019

http://fate.ffmpeg.org/report.cgi?slot=armv7-mingw32-clang-trunk&time=20210113055653

http://fate.ffmpeg.org/report.cgi?time=20210109163844&slot=arm-msvc2019-phone

All of these run with full assembly optimizations enabled. So please don't 
tell me that our assembly doesn't work on windows on arm, because it does, 
and it has for years.

// Martin
Reimar Döffinger Jan. 13, 2021, 1:48 p.m. UTC | #11
> If building with MSVC tools, yes you're right that armasm.exe/armasm64.exe takes a different syntax. But the gas-preprocessor tool (which is picked up automatically by our configure, one just needs to make sure it's available) handles expanding all the macros and rewriting directives into the armasm form, and feeding it to the armasm tools. Works fine and have done so for years. There's even a wiki page which tries to explain how to do it (although it's probably outdated in some aspects), see https://trac.ffmpeg.org/wiki/CompilationGuide/WinRT.
> 

I went with the instructions in doc/platform.texi and that did not work at all;
it even tried to use cl.exe to compile the assembler files!

> All of these run with full assembly optimizations enabled. So please don't tell me that our assembly doesn't work on windows on arm, because it does, and it has for years.
> 

My apologies. I’ll correct that to: it doesn’t work using the instructions
shipped with the source code (as far as I can tell) :)
Martin Storsjö Jan. 13, 2021, 2:16 p.m. UTC | #12
On Wed, 13 Jan 2021, Reimar Döffinger wrote:

>> If building with MSVC tools, yes you're right that armasm.exe/armasm64.exe takes a different syntax. But the gas-preprocessor tool (which is picked up automatically by our configure, one just needs to make sure it's available) handles expanding all the macros and rewriting directives into the armasm form, and feeding it to the armasm tools. Works fine and have done so for years. There's even a wiki page which tries to explain how to do it (although it's probably outdated in some aspects), see https://trac.ffmpeg.org/wiki/CompilationGuide/WinRT.
>> 
>
> I went with the instructions in doc/platform.texi and that did not work at
> all; it even tried to use cl.exe to compile the assembler files!

What did you end up trying/doing in this case? That sounds rather broken 
to me.

The main issue is just having gas-preprocessor available (but since you 
need a POSIX make, e.g. from msys2, it should be pretty easy to have a 
perl installation for gas-preprocessor there). If it isn't found, 
configure really should be erroring out rather than silently using cl as 
the assembler...

My own setups for fate are a bit special as they're cross-compiled from 
Linux (with MSVC wrapped in Wine), but it should essentially just be 
"./configure --arch=arm64 --target-os=win32 --toolchain=msvc 
--enable-cross-compile", assuming you have MSVC targeting arm64 in $PATH.

// Martin
diff mbox series

Patch

diff --git a/configure b/configure
index 900505756b..73b7c3daeb 100755
--- a/configure
+++ b/configure
@@ -406,6 +406,7 @@  Toolchain options:
   --enable-pic             build position-independent code
   --enable-thumb           compile for Thumb instruction set
   --enable-lto             use link-time optimization
+  --enable-openmp-simd     use the "omp simd" pragma to optimize code
   --env="ENV=override"     override the environment variables
 
 Advanced options (experts only):
@@ -2335,6 +2336,7 @@  HAVE_LIST="
     opencl_dxva2
     opencl_vaapi_beignet
     opencl_vaapi_intel_media
+    openmp_simd
     perl
     pod2man
     texi2html
@@ -2446,6 +2448,7 @@  CMDLINE_SELECT="
     extra_warnings
     logging
     lto
+    openmp_simd
     optimizations
     rpath
     stripping
@@ -6926,6 +6929,26 @@  if enabled lto; then
     disable inline_asm_direct_symbol_refs
 fi
 
+if enabled openmp_simd; then
+    ompopt="-fopenmp"
+    if ! test_cflags $ompopt ; then
+        test_cflags -Xpreprocessor -fopenmp && ompopt="-Xpreprocessor -fopenmp"
+    fi
+    test_cc $ompopt <<EOF && add_cflags "$ompopt" || die "failed to enable openmp SIMD"
+#ifndef _OPENMP
+#error _OPENMP is not defined
+#endif
+void test(unsigned char *c)
+{
+    _Pragma("omp simd")
+    for (int i = 0; i < 256; i++)
+    {
+        c[i] *= 16;
+    }
+}
+EOF
+fi
+
 enabled ftrapv && check_cflags -ftrapv
 
 test_cc -mno-red-zone <<EOF && noredzone_flags="-mno-red-zone"
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 56cd9e605d..1a8b4160ec 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -50,6 +50,7 @@  static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
     stride /= sizeof(pixel);
 
     for (y = 0; y < size; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < size; x++) {
             dst[x] = av_clip_pixel(dst[x] + *res);
             res++;
@@ -247,6 +248,7 @@  static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs,          \
     int16_t *src   = coeffs;                                      \
     IDCT_VAR ## H(H);                                             \
                                                                   \
+    FF_OMP_SIMD                                                   \
     for (i = 0; i < H; i++) {                                     \
         TR_ ## H(src, src, H, H, SCALE, limit2);                  \
         if (limit2 < H && i%4 == 0 && !!i)                        \
@@ -256,6 +258,7 @@  static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs,          \
                                                                   \
     shift = 20 - BIT_DEPTH;                                       \
     add   = 1 << (shift - 1);                                     \
+    FF_OMP_SIMD                                                   \
     for (i = 0; i < H; i++) {                                     \
         TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);             \
         coeffs += H;                                              \
@@ -502,6 +505,7 @@  static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = src[x] << (14 - BIT_DEPTH);
         src += srcstride;
@@ -543,6 +547,7 @@  static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, ui
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
         src  += srcstride;
@@ -568,6 +573,7 @@  static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
 
     ox     = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
         src += srcstride;
@@ -592,6 +598,7 @@  static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++) {
             dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
         }
@@ -623,6 +630,7 @@  static void FUNC(put_hevc_qpel_h)(int16_t *dst,
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -639,6 +647,7 @@  static void FUNC(put_hevc_qpel_v)(int16_t *dst,
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
     for (y = 0; y < height; y++)  {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -662,6 +671,7 @@  static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
     src   -= QPEL_EXTRA_BEFORE * srcstride;
     filter = ff_hevc_qpel_filters[mx - 1];
     for (y = 0; y < height + QPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -671,6 +681,7 @@  static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
     filter = ff_hevc_qpel_filters[my - 1];
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
         tmp += MAX_PB_SIZE;
@@ -697,6 +708,7 @@  static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
@@ -724,6 +736,7 @@  static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
         src  += srcstride;
@@ -751,6 +764,7 @@  static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
@@ -779,6 +793,7 @@  static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
         src  += srcstride;
@@ -810,6 +825,7 @@  static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
     src   -= QPEL_EXTRA_BEFORE * srcstride;
     filter = ff_hevc_qpel_filters[mx - 1];
     for (y = 0; y < height + QPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -820,6 +836,7 @@  static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
     filter = ff_hevc_qpel_filters[my - 1];
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
         tmp += MAX_PB_SIZE;
@@ -849,6 +866,7 @@  static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
     src   -= QPEL_EXTRA_BEFORE * srcstride;
     filter = ff_hevc_qpel_filters[mx - 1];
     for (y = 0; y < height + QPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -859,6 +877,7 @@  static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
     filter = ff_hevc_qpel_filters[my - 1];
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
         tmp  += MAX_PB_SIZE;
@@ -887,6 +906,7 @@  static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 
     ox = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         src += srcstride;
@@ -913,6 +933,7 @@  static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
                                     ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -942,6 +963,7 @@  static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 
     ox = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         src += srcstride;
@@ -968,6 +990,7 @@  static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
                                     ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1000,6 +1023,7 @@  static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
     src   -= QPEL_EXTRA_BEFORE * srcstride;
     filter = ff_hevc_qpel_filters[mx - 1];
     for (y = 0; y < height + QPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1011,6 +1035,7 @@  static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 
     ox = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
         tmp += MAX_PB_SIZE;
@@ -1037,6 +1062,7 @@  static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
     src   -= QPEL_EXTRA_BEFORE * srcstride;
     filter = ff_hevc_qpel_filters[mx - 1];
     for (y = 0; y < height + QPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1049,6 +1075,7 @@  static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
                                     ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1076,6 +1103,7 @@  static void FUNC(put_hevc_epel_h)(int16_t *dst,
     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1093,6 +1121,7 @@  static void FUNC(put_hevc_epel_v)(int16_t *dst,
     const int8_t *filter = ff_hevc_epel_filters[my - 1];
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1114,6 +1143,7 @@  static void FUNC(put_hevc_epel_hv)(int16_t *dst,
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1124,6 +1154,7 @@  static void FUNC(put_hevc_epel_hv)(int16_t *dst,
     filter = ff_hevc_epel_filters[my - 1];
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
         tmp += MAX_PB_SIZE;
@@ -1148,6 +1179,7 @@  static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
@@ -1173,6 +1205,7 @@  static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++) {
             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
         }
@@ -1199,6 +1232,7 @@  static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
@@ -1224,6 +1258,7 @@  static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
 #endif
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
         dst  += dststride;
@@ -1253,6 +1288,7 @@  static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1263,6 +1299,7 @@  static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint
     filter = ff_hevc_epel_filters[my - 1];
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
         tmp += MAX_PB_SIZE;
@@ -1292,6 +1329,7 @@  static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1302,6 +1340,7 @@  static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
     filter = ff_hevc_epel_filters[my - 1];
 
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
         tmp  += MAX_PB_SIZE;
@@ -1328,6 +1367,7 @@  static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uin
 
     ox     = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++) {
             dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         }
@@ -1353,6 +1393,7 @@  static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
                                     ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1380,6 +1421,7 @@  static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uin
 
     ox     = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++) {
             dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         }
@@ -1405,6 +1447,7 @@  static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
                                     ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
@@ -1435,6 +1478,7 @@  static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ui
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1446,6 +1490,7 @@  static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ui
 
     ox     = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
         tmp += MAX_PB_SIZE;
@@ -1472,6 +1517,7 @@  static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
@@ -1484,6 +1530,7 @@  static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
+        FF_OMP_SIMD
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
                                     ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
diff --git a/libavutil/internal.h b/libavutil/internal.h
index 93ea57c324..b0543bbf02 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -299,4 +299,10 @@  int avpriv_dict_set_timestamp(AVDictionary **dict, const char *key, int64_t time
 #define FF_PSEUDOPAL 0
 #endif
 
+#if HAVE_OPENMP_SIMD
+#define FF_OMP_SIMD _Pragma("omp simd")
+#else
+#define FF_OMP_SIMD
+#endif
+
 #endif /* AVUTIL_INTERNAL_H */
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index c4dd8a4d83..c112a61037 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1743,6 +1743,7 @@  static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
     unsigned shift= src_depth-dst_depth, tmp;\
     if (c->dither == SWS_DITHER_NONE) {\
         for (i = 0; i < height; i++) {\
+            FF_OMP_SIMD \
             for (j = 0; j < length-7; j+=8) {\
                 dst[j+0] = dbswap(bswap(src[j+0])>>shift);\
                 dst[j+1] = dbswap(bswap(src[j+1])>>shift);\
@@ -1762,6 +1763,7 @@  static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
     } else if (shiftonly) {\
         for (i = 0; i < height; i++) {\
             const uint8_t *dither= dithers[shift-1][i&7];\
+            FF_OMP_SIMD \
             for (j = 0; j < length-7; j+=8) {\
                 tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\
                 tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\
@@ -1781,6 +1783,7 @@  static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
     } else {\
         for (i = 0; i < height; i++) {\
             const uint8_t *dither= dithers[shift-1][i&7];\
+            FF_OMP_SIMD \
             for (j = 0; j < length-7; j+=8) {\
                 tmp = bswap(src[j+0]); dst[j+0] = dbswap((tmp - (tmp>>dst_depth) + dither[0])>>shift);\
                 tmp = bswap(src[j+1]); dst[j+1] = dbswap((tmp - (tmp>>dst_depth) + dither[1])>>shift);\
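To illustrate what the FF_OMP_SIMD macro introduced in libavutil/internal.h does, here is a minimal standalone sketch. Note the names clip_u8 and add_residual_sketch are hypothetical stand-ins invented for illustration (for av_clip_pixel at 8-bit depth and the add_residual loop, respectively), and the guard is keyed on _OPENMP so the file compiles on its own, unlike the HAVE_OPENMP_SIMD configure-generated macro used in the patch:

```c
#include <stdint.h>

/* Sketch of the patch's idea: FF_OMP_SIMD expands to _Pragma("omp simd")
 * when OpenMP is enabled (gcc: -fopenmp or the lighter -fopenmp-simd),
 * asking the compiler to vectorize the following loop, and expands to
 * nothing otherwise, leaving the plain scalar C code unchanged. */
#ifdef _OPENMP
#define FF_OMP_SIMD _Pragma("omp simd")
#else
#define FF_OMP_SIMD
#endif

/* Hypothetical stand-in for av_clip_pixel() at 8-bit depth. */
static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Hypothetical stand-in for the add_residual() inner loop in the patch:
 * add a 16-bit residual to each pixel with saturation to [0, 255]. */
void add_residual_sketch(uint8_t *dst, const int16_t *res, int n)
{
    FF_OMP_SIMD
    for (int i = 0; i < n; i++)
        dst[i] = clip_u8(dst[i] + res[i]);
}
```

Compiled without OpenMP the pragma is a no-op; with it enabled the loop becomes a vectorization candidate, which is all the pragma promises: a hint, not a guarantee, as the measurements in the commit message show.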