From patchwork Sat May 25 20:57:21 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Lynne X-Patchwork-Id: 49264 Delivered-To: ffmpegpatchwork2@gmail.com Received: by 2002:a59:542:0:b0:460:55fa:d5ed with SMTP id 63csp2477291vqf; Sat, 25 May 2024 13:58:00 -0700 (PDT) X-Forwarded-Encrypted: i=2; AJvYcCXB93DTmWjBRsx5QV819xnQGd2RUd8Twiye0Ip385840NOdZXMioggpGfYoZlvNl7ziuKxN/ddsh7FvntmVOcT/SmQwDRosD4FofA== X-Google-Smtp-Source: AGHT+IGssQHLsu6cMcPKslUurH/GFTSOIbqq9Zl+So/dJKxUCic3CRqRHxjysOVcaW60XHcI9S8w X-Received: by 2002:a05:6402:2313:b0:579:c121:39d7 with SMTP id 4fb4d7f45d1cf-579c1213e08mr1009444a12.14.1716670679832; Sat, 25 May 2024 13:57:59 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1716670679; cv=none; d=google.com; s=arc-20160816; b=n/MhgIqlormwmgi4NEa7G+wfpAMPoLGl4jJIa8rdAWM1Tr/3Y5oLA8lmZHS7wEJAJ8 SMo2CvJ4OgRiiCuTJvFWIwLlBhqOEyuGNGXFQ4iCHv8HOVBtUHniB/FOL/ISxmo1SUgK GtGYyf7lidlC8RQ8ns3g8qWrqfHnGJ7JhlcGlbJomajMfKZJh6cT9OxWkIJHIEVCcu3h zEw59SKswKlTeMMA9nL7HXnRAAhCfrNY9Nuj6BuUX1J89NugZbbcE2SOok75HhQYYKn9 iffl3gk723pI+YqycN4WAYJYGDgXW8QydH1YmQQqbSx7EfHHrD3+ireUFxhYVk4Yjs4A +Auw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:cc:reply-to:from :list-subscribe:list-help:list-post:list-archive:list-unsubscribe :list-id:precedence:subject:mime-version:message-id:date:to :delivered-to; bh=Qk/R4ZsRCMkppCGl3Rr+z+63mKgF1OtSAmeXDXJ1gXE=; fh=nenT92/WZoU6unXd3J6UhGUdod4piddKfVtctNBOh6k=; b=xIp4CfC0KeKCQ7mSsfVU8e19mOi1mp9WwKC37j0PnpwKn+ME0xoihpGvkrPQkx+Wo+ zqjyg1zUtRghs6y+8fA6sBW72Sf/kX6fdZ6Um0NRWEkHBPsoQB7XhgPTAfdm8lMcXUpq qe/XLiou3LqltYeAxBEOsQC9Dk/F/oeDdc0q/inksO12mOFHbt76KFOayAtfEC3aY/1s 1mRsmpCEuejecH67vCMdTkHVbsflJfbBReiQUG80XZDBXsvGChf4m5/UY5myG/8Cz5Fd hHV3xhiQ9cxIrYK7ZNxU6WuBJbN4sq7pvDf/f1LsQH7E/I95ftsTs7/d80fLNnV4gFbo sINA==; dara=google.com ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: 
domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id 4fb4d7f45d1cf-57867e89113si1145992a12.213.2024.05.25.13.57.59; Sat, 25 May 2024 13:57:59 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 6D99A68D4EB; Sat, 25 May 2024 23:57:56 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from vidala.lynne.ee (vidala.pars.ee [116.203.72.101]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id EFD7E68D29F for ; Sat, 25 May 2024 23:57:49 +0300 (EEST) To: ffmpeg-devel@ffmpeg.org Date: Sat, 25 May 2024 22:57:21 +0200 Message-ID: <20240525205731.2578146-1-dev@lynne.ee> X-Mailer: git-send-email 2.43.0.381.gb435a96ce8 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Lynne via ffmpeg-devel From: Lynne Reply-To: FFmpeg development discussions and patches Cc: Lynne Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" X-TUID: KUZT73EcrgwP The inline asm function had issues running under checkasm. So I came to finish what I started, and wrote the last part of LPC computation in assembly. 
autocorr_10_c: 135525.8 autocorr_10_sse2: 50729.8 autocorr_10_fma3: 19007.8 autocorr_30_c: 390100.8 autocorr_30_sse2: 142478.8 autocorr_30_fma3: 50559.8 autocorr_32_c: 407058.3 autocorr_32_sse2: 151633.3 autocorr_32_fma3: 50517.3 --- libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++ libavcodec/x86/lpc_init.c | 87 ++++--------------------------------- 2 files changed, 100 insertions(+), 78 deletions(-) diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm index a585c17ef5..790841b7f4 100644 --- a/libavcodec/x86/lpc.asm +++ b/libavcodec/x86/lpc.asm @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0 dec_tab_scalar: times 2 dq -1.0 seq_tab_sse2: dq 1.0, 0.0 +autoc_init_tab: times 4 dq 1.0 + SECTION .text %macro APPLY_WELCH_FN 0 @@ -261,3 +263,92 @@ APPLY_WELCH_FN INIT_YMM avx2 APPLY_WELCH_FN %endif +; Autocorrelation kernel; C prototype of the inline version it replaces: void (const double *data, ptrdiff_t len, int lag, double *autoc) +%macro COMPUTE_AUTOCORR_FN 0 +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p + ; m0/m1 hold the two operands and m2 the running sum, so 3 vector registers are declared + shl lagd, 3 + shl lenq, 3 + xor lag_pq, lag_pq + +.lag_l: + movaps m2, [autoc_init_tab] ; every lane of the sum starts at 1.0 + + mov len_pq, lag_pq + + lea data_lq, [lag_pq + mmsize - 8] + neg data_lq ; -j - mmsize + add data_lq, dataq ; data[-j - mmsize] +.len_l: + ; We waste the upper value here on SSE2, + ; but we use it on AVX. + movupd xm0, [dataq + len_pq] ; data[i] + movupd m1, [data_lq + len_pq] ; data[i - j] + +%if cpuflag(avx) + vbroadcastsd m0, [dataq + len_pq] ; memory-source form is plain AVX; the register-source form needs AVX2 + vperm2f128 m1, m1, m1, 0x01 +%endif + + shufpd m0, m0, m0, 1100b + shufpd m1, m1, m1, 0101b + +%if cpuflag(fma3) + fmaddpd m2, m0, m1, m2 ; sum += data[i]*data[i-j] +%else + mulpd m0, m1 + addpd m2, m0 ; sum += data[i]*data[i-j] +%endif + + add len_pq, 8 + cmp len_pq, lenq + jl .len_l + + movups [autocq + lag_pq], m2 ; autoc[j] = sum + add lag_pq, mmsize + cmp lag_pq, lagq + jl .lag_l + + ; The tail computation is guaranteed never to happen + ; as long as we're doing multiples of 4, rather than 2. + ; It is trivial to convert this to avx if ever needed. 
+%if !cpuflag(avx) + jg .end + ; If lag_p == lag fallthrough + +.tail: + movaps xm2, [autoc_init_tab] + + mov len_pq, lag_pq + sub len_pq, mmsize + + lea data_lq, [lag_pq] + neg data_lq ; -j + add data_lq, dataq ; data[-j] + +.tail_l: + movupd xm0, [dataq + len_pq] + movupd xm1, [data_lq + len_pq] + + mulpd xm0, xm1 + addpd xm2, xm0 ; sum += data[i]*data[i-j] + + add len_pq, mmsize + cmp len_pq, lenq + jl .tail_l + + shufpd xm1, xm2, xm2, 01b + addpd xm2, xm1 + + movhpd [autocq + lag_pq], xm2 +%endif + +.end: + RET + +%endmacro + +INIT_XMM sse2 +COMPUTE_AUTOCORR_FN +INIT_YMM fma3 +COMPUTE_AUTOCORR_FN diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c index f2fca53799..96469fae40 100644 --- a/libavcodec/x86/lpc_init.c +++ b/libavcodec/x86/lpc_init.c @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len, double *w_data); void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len, double *w_data); - -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; - -#if HAVE_SSE2_INLINE - -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag, - double *autoc) -{ - int j; - - if((x86_reg)data & 15) - data++; - - for(j=0; jlpc_compute_autocorr = lpc_compute_autocorr_sse2; -#endif + if (EXTERNAL_SSE2(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; + + if (EXTERNAL_FMA3(cpu_flags)) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_fma3; if (EXTERNAL_SSE2(cpu_flags)) c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;