From patchwork Sun Nov 26 22:51:06 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: James Darnley <james.darnley@gmail.com>
X-Patchwork-Id: 6376
Delivered-To: ffmpegpatchwork@gmail.com
Received: by 10.2.161.94 with SMTP id m30csp966761jah;
	Sun, 26 Nov 2017 14:58:32 -0800 (PST)
X-Google-Smtp-Source: 
 AGs4zMYYguhhoJBoVwh2nHIYuNnr1PDgk3TiTFzUYlbNnUdGYG/n0la60OOTNYNWj5K5X+JF8d6T
X-Received: by 10.28.122.11 with SMTP id v11mr15099438wmc.104.1511737112065;
	Sun, 26 Nov 2017 14:58:32 -0800 (PST)
ARC-Seal: i=1; a=rsa-sha256; t=1511737112; cv=none;
	d=google.com; s=arc-20160816;
	b=Kn2KCp89FVnVwlqj8n9xjQIWitUpWqGb4oJizsW4hm5c8FPc1+BIMrBP4sVnTGEzMH
	eqgP+ktd2jvHyypFK1jJBhtqaeRWKSug7X/uoMmGvK/jYT0i/UbG8kgGzTycv86BmDTa
	ebG2cRWaqHZkgahkQRmVo+1aSJ8qw0Fg6PBeXhvwOtPe8X2/83e8HMi4V00AL6fcnPLC
	NfUXftlpeK3agfo9k+AgpNRQPU6BJxzG/6Im8VL72As7EZDiLdaXJ/C7vm1jNWPC00x0
	pbr87JaAbnF9aRqrvpQzyh73rkewxMZN/vxk6iZLafdRvZLFsCGYENL+K2kx1cLFrQAK
	Q+0A==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com;
	s=arc-20160816;
	h=sender:errors-to:content-transfer-encoding:mime-version:reply-to
	:list-subscribe:list-help:list-post:list-archive:list-unsubscribe
	:list-id:precedence:subject:references:in-reply-to:message-id:date
	:to:from:dkim-signature:delivered-to:arc-authentication-results;
	bh=zyGy3IFQPqcQ4VSMOFM1FmXDIUpAcg3AiQfvNOq77To=;
	b=Sz6/KLOIi8HWTkKedZYZHMteYV072AdYr/EQ3+U7u5Hwz/Qk20Ih9aFMGvyAuUOBYl
	GgYCTwAKn3LzK8bZfY826/ZWeLwKvaEXoENaefU+J8ZQDwrLb+rk7p+REn07zxRh9D09
	u8I/5aZR1L9EvIIH3Y8RRwYRxx30+jlmttEjYDCRwHl7FOfyt2QanFj8WPr+w/puecI9
	VJYxBptQiD2XFIr++wvel3P3VnnxGFZ7VUcifgkixvDT0BoQOFUteV8i11BhkbR0POvt
	k03Y29+5H7+HoKGbSEYXW9PJWyNnirOuQ7feJlxow8jW9YXV/FF5UFAR4VcpnrI7wH7I
	1S9w==
ARC-Authentication-Results: i=1; mx.google.com;
	dkim=neutral (body hash did not verify) header.i=@gmail.com
	header.s=20161025 header.b=MIjGCekd;
	spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
	designates 79.124.17.100 as permitted sender)
	smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org;
	dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=gmail.com
Return-Path: <ffmpeg-devel-bounces@ffmpeg.org>
Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100])
	by mx.google.com with ESMTP id
	g34si21224013wra.332.2017.11.26.14.58.31;
	Sun, 26 Nov 2017 14:58:32 -0800 (PST)
Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
	designates 79.124.17.100 as permitted sender)
	client-ip=79.124.17.100;
Authentication-Results: mx.google.com;
	dkim=neutral (body hash did not verify) header.i=@gmail.com
	header.s=20161025 header.b=MIjGCekd;
	spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
	designates 79.124.17.100 as permitted sender)
	smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org;
	dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=gmail.com
Received: from [127.0.1.1] (localhost [127.0.0.1])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 3A12D68A345;
	Mon, 27 Nov 2017 00:58:29 +0200 (EET)
X-Original-To: ffmpeg-devel@ffmpeg.org
Delivered-To: ffmpeg-devel@ffmpeg.org
Received: from mail-wm0-f65.google.com (mail-wm0-f65.google.com
	[74.125.82.65])
	by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 4704C68A2DE
	for <ffmpeg-devel@ffmpeg.org>; Mon, 27 Nov 2017 00:58:27 +0200 (EET)
Received: by mail-wm0-f65.google.com with SMTP id b189so31291067wmd.5
	for <ffmpeg-devel@ffmpeg.org>; Sun, 26 Nov 2017 14:58:29 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025;
	h=from:to:subject:date:message-id:in-reply-to:references;
	bh=8bSC4fHyd8M1HZKcQ89znjr6R4D6X28De3M48vr3KDk=;
	b=MIjGCekdPQn+6WuOXoHoLAp+WKZGFBIGAHrInWdJ4QZTovbrNtpRrTTddZcuBNx1rE
	Sv2y3L/jOhehFBF3f+5pwSZn/Xkt1348feO8zjai3/lC27cDAGA1vw7l2lOLs9u7pgQv
	tjwjdlDpKVi+V5Nnvx3D+CuaZnNVlAXaOpr/PYzSgqpaJ8qicwstcp2f1PiFUAGIQ8oA
	q/1W5EHjNhEyX37qnUC79RlGU8C4OFIGRfRy1CRG+Kc73jjMZnbBNXGIlX+PK+GEJnfH
	BLkXrvUrpY+YB1GAsRsa5jmTC24U77IyQ+/PGrHgW83S1WsnIiud0sjhQY+HGIfb1OU+
	Xq4A==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
	d=1e100.net; s=20161025;
	h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to
	:references;
	bh=8bSC4fHyd8M1HZKcQ89znjr6R4D6X28De3M48vr3KDk=;
	b=OXPPG817iXTMvxEBFSKvg2vyGAuy/Rp+K+QR6idjWkuCrgh1arydhY7dSmFNQlOWOi
	BMDwY/L0vaeS0ILu4Mi49wFW0D4h+oVzPpzyym6FWapYTWiYnkRWFnbvgC/Yf1VS4S4H
	8aqBylJpCB/8wPvyMnqD13qI9FGryD315xHM9TuN7mkmddEKCIQbC/+KH+qGGuppmWxA
	6OooUDm2h165H+nci2Pjc+98xUMe1OYZYaFl7AGjNj16TtjpnsxEMyz1Ff3SzLrkBYY0
	dJlaUSSphameSEqLmfGBai46tK2Vq+b9Vbk2D/xl2G7jTeH8lBSzxhzYVYswZNWFaav/
	IVCA==
X-Gm-Message-State: AJaThX4ZSND+1eJGoUXoiGC0lVP8/RKDjEbZkoIBKQAUFXyo+Nd8wYjT
	BW3U1V2fxBQmWTXNaHXsNQ5xuw==
X-Received: by 10.80.241.218 with SMTP id y26mr48985476edl.120.1511736721674;
	Sun, 26 Nov 2017 14:52:01 -0800 (PST)
Received: from Highwind.systemlords.lan (d51a44418.access.telenet.be.
	[81.164.68.24]) by smtp.gmail.com with ESMTPSA id
	h56sm22545791ede.15.2017.11.26.14.52.00
	for <ffmpeg-devel@ffmpeg.org>
	(version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128);
	Sun, 26 Nov 2017 14:52:01 -0800 (PST)
From: James Darnley <james.darnley@gmail.com>
To: ffmpeg-devel@ffmpeg.org
Date: Sun, 26 Nov 2017 23:51:06 +0100
Message-Id: <20171126225111.5108-4-james.darnley@gmail.com>
X-Mailer: git-send-email 2.15.0
In-Reply-To: <20171126225111.5108-1-james.darnley@gmail.com>
References: <20171126225111.5108-1-james.darnley@gmail.com>
Subject: [FFmpeg-devel] [PATCH 3/8] avcodec/flac: add SSE4.2 version of the
	32-bit lpc encoder
X-BeenThere: ffmpeg-devel@ffmpeg.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: FFmpeg development discussions and patches <ffmpeg-devel.ffmpeg.org>
List-Unsubscribe: <http://ffmpeg.org/mailman/options/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=unsubscribe>
List-Archive: <http://ffmpeg.org/pipermail/ffmpeg-devel/>
List-Post: <mailto:ffmpeg-devel@ffmpeg.org>
List-Help: <mailto:ffmpeg-devel-request@ffmpeg.org?subject=help>
List-Subscribe: <http://ffmpeg.org/mailman/listinfo/ffmpeg-devel>,
	<mailto:ffmpeg-devel-request@ffmpeg.org?subject=subscribe>
Reply-To: FFmpeg development discussions and patches
	<ffmpeg-devel@ffmpeg.org>
MIME-Version: 1.0
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

From 1.3 to 2.5 times faster.  Runtime reduced by 4 to 58%.  As with the
16-bit version the speed-up generally increases with compression_level.

Also like the 16-bit version, it is not used with levels less than 3.

Since a bug fix made long, long ago in e609cfd697, this 32-bit lpc
encoder has been heavily used with 16-bit samples.
---
 libavcodec/x86/flac_dsp_gpl.asm | 106 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/flacdsp_init.c   |   5 ++
 2 files changed, 111 insertions(+)

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index c461c666be..618306eb5f 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -22,6 +22,12 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+; Constants used by flac_enc_lpc_32 for the 64-bit shift emulation and clip.
+pd_0_int_min: times  2 dd 0, -2147483648 ; per qword: only bit 63 set (seed for the sign-extend mask)
+pq_int_min:   times  2 dq -2147483648 ; lower clip bound (INT32_MIN) as signed qwords
+pq_int_max:   times  2 dq  2147483647 ; upper clip bound (INT32_MAX) as signed qwords
+
 SECTION .text
 
 %macro FUNCTION_BODY_16 0
@@ -116,8 +122,108 @@ RET
 
 %endmacro
 
+%macro PMINSQ 3 ; a (in/out), b, tmp -- a = min(a, b) per signed qword; clobbers tmp
+    pcmpgtq %3, %2, %1 ; tmp = all-ones where b > a
+    pand    %1, %3     ; keep a in lanes where a is the smaller value
+    pandn   %3, %2     ; keep b in the remaining lanes
+    por     %1, %3     ; merge both selections
+%endmacro
+
+%macro PMAXSQ 3 ; a (in/out), b, tmp -- a = max(a, b) per signed qword; clobbers tmp
+    pcmpgtq %3, %1, %2 ; tmp = all-ones where a > b
+    pand    %1, %3     ; keep a in lanes where a is the larger value
+    pandn   %3, %2     ; keep b in the remaining lanes
+    por     %1, %3     ; merge both selections
+%endmacro
+
+%macro CLIPQ 4 ; reg, min, max, tmp -- clamp signed qwords of reg to [min, max]; clobbers tmp
+    PMAXSQ %1, %2, %4 ; reg = max(reg, min)
+    PMINSQ %1, %3, %4 ; reg = min(reg, max)
+%endmacro
+
+%macro HACK_PSRAQ 4 ; dst, src (shift), sign extend mask, tmp -- emulated arithmetic right shift of qwords
+    pxor    %4, %4 ; zero
+    pcmpgtq %4, %1 ; mask where 0 > dst
+    pand    %4, %3 ; mask & sign extend mask
+    psrlq   %1, %2 ; dst >>= shift (logical, high bits become zero)
+    por     %1, %4 ; dst | mask (refill the high bits of negative lanes)
+%endmacro
+
+%macro FUNCTION_BODY_32 0
+; res[i] = smp[i] - clip_int32(sum(coefs[j] * smp[i-j-1]) >> shift); two samples per pass, shift is r5m
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_32, 5, 7, 5, mmsize, res, smp, len, order, coefs
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd ; order is used as a 64-bit index below
+%else
+    cglobal flac_enc_lpc_32, 5, 6, 5, mmsize, res, smp, len, order, coefs
+    DECLARE_REG_TMP 2, 5
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter] ; always reads 32 samples, whatever order is
+    movu [resq+iter],  m0         ; res[k] = smp[k]
+    %assign iter iter+mmsize
+%endrep
+; Skip the first order samples; coefs is addressed from its end with a negative index.
+lea    resq,   [resq+orderq*4]    ; res   += order
+lea    smpq,   [smpq+orderq*4]    ; smp   += order
+lea    coefsq, [coefsq+orderq*4]  ; coefs += order
+sub    length,  orderd            ; len   -= order
+movd   m3,      r5m               ; m3 = shift; read before t0 may overwrite r5 in the loop
+neg    orderq                     ; orderq = -order, start value of posj
+; Build the sign-extend mask for HACK_PSRAQ in m4 (the 5th vector register) and keep it on the stack.
+movu   m4,     [pd_0_int_min] ; load 1 bit
+psrad  m4,      m3            ; turn that into shift+1 bits
+pslld  m4,      1             ; reduce that
+mova  [rsp],    m4            ; save sign extend mask
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0,   m0      ; p = 0 (two 64-bit accumulators)
+    mov  posj, orderq  ; posj = -order
+    xor  negj, negj    ; negj = 0
+
+    .looporder:
+        movd   m2,  [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        pmovzxdq m1,  [smpq+negj*4-4] ; s = smp[i-j-1] (and smp[i-j] in the second lane)
+        pmuldq m1,   m2             ; signed 32x32->64 multiply of the low dword of each qword
+        paddq  m0,   m1             ; p += c * s
+
+        dec    negj
+        inc    posj
+    jnz .looporder                  ; loop until posj reaches 0
+
+    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
+    CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+    pshufd  m0,    m0, q0020 ; pack into first 2 dwords
+    movh    m1,   [smpq]
+    psubd   m1,    m0               ; smp[i] - p
+    movh   [resq], m1               ; res[i] = smp[i] - (p >> shift)
+
+    add resq,   mmsize/2            ; advance by the mmsize/2 bytes just stored
+    add smpq,   mmsize/2
+    sub length, mmsize/8            ; mmsize/8 samples were produced this pass
+jg .looplen
+RET
+
+%endmacro ; FUNCTION_BODY_32
+
 INIT_XMM sse4
 FUNCTION_BODY_16
 
+INIT_XMM sse42
+FUNCTION_BODY_32
+
 INIT_YMM avx2
 FUNCTION_BODY_16
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 0a5c01859f..f827186c26 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -29,6 +29,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
 
 void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
 void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
+void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
 
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
@@ -111,6 +112,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
         if (CONFIG_GPL)
             c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
     }
+    if (EXTERNAL_SSE42(cpu_flags)) {
+        if (CONFIG_GPL)
+            c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
+    }
     if (EXTERNAL_AVX2(cpu_flags)) {
         if (CONFIG_GPL)
             c->lpc16_encode = ff_flac_enc_lpc_16_avx2;