From patchwork Tue Sep 15 16:11:58 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alan Kelly X-Patchwork-Id: 22422 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 5CE6944B4D2 for ; Tue, 15 Sep 2020 19:12:10 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 3883368B9BC; Tue, 15 Sep 2020 19:12:10 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-ej1-f74.google.com (mail-ej1-f74.google.com [209.85.218.74]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id BD72768B976 for ; Tue, 15 Sep 2020 19:12:03 +0300 (EEST) Received: by mail-ej1-f74.google.com with SMTP id e13so1531133ejk.1 for ; Tue, 15 Sep 2020 09:12:03 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=20161025; h=sender:date:message-id:mime-version:subject:from:to:cc; bh=rJk0OcG5bgCBdZQOxBpwVEnsCqFFfEULADrnNl5eKHY=; b=ngofbo2gKKoYeDiz+8I5EsZ+Fv5wcW5rWYTJ4iOeILeBjZUDxkOumg/xXLllA0rprY KdjfnzqbjspeBnG3jBfZUm6HltuyK0ei+6O/CoDfVYpYeKDSu47EK+Wtgb/U22kDsSvu sTLDCGF2r2sl8y73JWWShc9iXZoJGnzK1ayyWwqvG1a+5iKPqe8jgjC3ITgex03BzTEm BzX/GaXATssANtlfU2LttQAnXpbeNkbakgkDQGrC0juAbbbIqmug7cIXoLbcZw9h581y LyNXHm8ulxi9erFC1dyU3PlIYY0ycGQ+98o07pDBATsLnwKEB0u7tjzITeya6Qn+0PDW vYGQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:sender:date:message-id:mime-version:subject:from :to:cc; bh=rJk0OcG5bgCBdZQOxBpwVEnsCqFFfEULADrnNl5eKHY=; b=lavERrlvr7Da/Pzc1bhvVp19KD1149o2b3IYSL81I7rc28v/gNiiJ9Wyjo5XtLKUA+ y5jIzLa2qw1+Nfjni9bWFPWkZ/3P3g6pRa61TCtGEXgqv9kaKeqRXmwqR3nnzEpUYVmL GDnQnvNIpCW7ucgK8LKuJSHSppKpoDlpH0lviJCmx20txBhyNR6z180Ek+bArmvTjvcj 1xTSMigM2rN+vd3e1vEvH8FveUoxgLfG5IYQkkzNf5CFmDiji+iCqm9GfkeTQYYI8wEg vu7Elxxiov2Zjvf5otHQkJ4a0sOi2SPFKo9drmHBB8heTUw4sYN45YecVDowYXK5kFWq VS2w== X-Gm-Message-State: AOAM5308kalv8Mq0e7z39ur2dMZZHtnd7Ry2IbujpHAPuN90j2yubucE rDxWNM2w7LO3naQlwuXYPGPQT4iZgbAiJhaxKoQFW56eu/yqtplqveC6OtmAK470hjy3uEsxJrh LPa7tkZKL8czUkpu//RtMIxH8YYEOl5guxcYcmQjtGoJayznYBn28n+VkDNeL+shXD7Og/NA= X-Google-Smtp-Source: ABdhPJwwzXnrBLV/kv7oqBdUFeowRokaA1BGbWuYWRFXcr78UCtEw/sNL015sXzUCO7E9Zglr3C4op8i9KL9VMk= X-Received: from alankelly0.zrh.corp.google.com ([2a00:79e0:42:205:f693:9fff:fef7:aa73]) (user=alankelly job=sendgmr) by 2002:a17:907:408d:: with SMTP id nt21mr20379346ejb.355.1600186322767; Tue, 15 Sep 2020 09:12:02 -0700 (PDT) Date: Tue, 15 Sep 2020 18:11:58 +0200 Message-Id: <20200915161158.2051301-1-alankelly@google.com> Mime-Version: 1.0 X-Mailer: git-send-email 2.28.0.618.gf4bc123cb7-goog From: Alan Kelly To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [PATCH] Unrolls main loop of yuv2yuvX_sse3 and general code tidying for ~20% speedup X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Alan Kelly Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" --- libswscale/x86/swscale.c | 138 ++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 66 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..e47fee2bbd 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { - if(((uintptr_t)dest) & 15){ + if(((uintptr_t)dest) & 31){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } filterSize--; -#define MAIN_FUNCTION \ - "pxor %%xmm0, %%xmm0 \n\t" \ - "punpcklbw %%xmm0, %%xmm3 \n\t" \ - "movd %4, %%xmm1 \n\t" \ - "punpcklwd %%xmm1, %%xmm1 \n\t" \ - "punpckldq %%xmm1, %%xmm1 \n\t" \ - "punpcklqdq %%xmm1, %%xmm1 \n\t" \ - "psllw $3, %%xmm1 \n\t" \ - "paddw %%xmm1, %%xmm3 \n\t" \ - "psraw $4, %%xmm3 \n\t" \ - "movdqa %%xmm3, %%xmm4 \n\t" \ - "movdqa %%xmm3, %%xmm7 \n\t" \ - "movl %3, %%ecx \n\t" \ - "mov %0, %%"FF_REG_d" \n\t"\ - "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ - ".p2align 4 \n\t" /* FIXME Unroll? */\ - "1: \n\t"\ - "movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ - "movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ - "movdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ - "add $16, %%"FF_REG_d" \n\t"\ - "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ - "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ - "pmulhw %%xmm0, %%xmm2 \n\t"\ - "pmulhw %%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm2, %%xmm3 \n\t"\ - "paddw %%xmm5, %%xmm4 \n\t"\ - " jnz 1b \n\t"\ - "psraw $3, %%xmm3 \n\t"\ - "psraw $3, %%xmm4 \n\t"\ - "packuswb %%xmm4, %%xmm3 \n\t"\ - "movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ - "add $16, %%"FF_REG_c" \n\t"\ - "cmp %2, %%"FF_REG_c" \n\t"\ - "movdqa %%xmm7, %%xmm3 \n\t" \ - "movdqa %%xmm7, %%xmm4 \n\t" \ - "mov %0, %%"FF_REG_d" \n\t"\ - "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ - "jb 1b \n\t" - - if (offset) { - __asm__ volatile( - "movq %5, %%xmm3 \n\t" - "movdqa %%xmm3, %%xmm4 \n\t" - "psrlq $24, %%xmm3 \n\t" - "psllq $40, %%xmm4 \n\t" - "por %%xmm4, %%xmm3 \n\t" - MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) - "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); - } else { - __asm__ volatile( - "movq %5, %%xmm3 \n\t" - MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) - "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); - } + __asm__ volatile( + "vmovq %5, %%xmm3 \n\t" + "cmpl $0, %3 \n\t" + "jz 2f \n\t" + + "# offset != 0 path. \n\t" + "vpsrlq $24, %%xmm3, %%xmm5 \n\t" + "vpsllq $40, %%xmm3, %%xmm3 \n\t" + "vpor %%xmm3, %%xmm5, %%xmm3 \n\t" + + "2: \n\t" + "vpxor %%xmm0, %%xmm0, %%xmm0 \n\t" + "mov (%0), %%"FF_REG_S" \n\t" + "vpunpcklbw %%xmm0, %%xmm3, %%xmm3 \n\t" + "vpbroadcastw %4, %%xmm1 \n\t" + "vpsllw $3, %%xmm1, %%xmm1 \n\t" + "mov %0, %%"FF_REG_d" \n\t" + "vpaddw %%xmm1, %%xmm3, %%xmm3 \n\t" + "vpsraw $4, %%xmm3, %%xmm3 \n\t" + "vmovdqa %%xmm3, %%xmm4 \n\t" + "vmovdqa %%xmm3, %%xmm7 \n\t" + "vmovdqa %%xmm3, %%xmm9 \n\t" + "vmovdqa %%xmm3, %%xmm10 \n\t" + "movl %3, %%ecx \n\t" + + ".p2align 4 \n\t" + "1: \n\t" + "vpbroadcastq 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */ + "vmovdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */ + "vmovdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */ + "vmovdqa 32(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm11 \n\t" /* srcData */ + "vmovdqa 48(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm12 \n\t" /* srcData */ + "add $16, %%"FF_REG_d" \n\t" + "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t" + "vpmulhw %%xmm0, %%xmm2, %%xmm2 \n\t" + "vpmulhw %%xmm0, %%xmm5, %%xmm5 \n\t" + "vpmulhw %%xmm0, %%xmm11, %%xmm11 \n\t" + "vpmulhw %%xmm0, %%xmm12, %%xmm12 \n\t" + "vpaddw %%xmm2, %%xmm3, %%xmm3 \n\t" + "vpaddw %%xmm5, %%xmm4, %%xmm4 \n\t" + "vpaddw %%xmm11, %%xmm9, %%xmm9 \n\t" + "vpaddw %%xmm12, %%xmm10, %%xmm10 \n\t" + "test %%"FF_REG_S", %%"FF_REG_S" \n\t" + "jnz 1b \n\t" + + "vpsraw $3, %%xmm3, %%xmm3 \n\t" + "vpsraw $3, %%xmm4, %%xmm4 \n\t" + "vpsraw $3, %%xmm9, %%xmm9 \n\t" + "vpsraw $3, %%xmm10, %%xmm10 \n\t" + "vpackuswb %%xmm4, %%xmm3, %%xmm3 \n\t" + "vpackuswb %%xmm10, %%xmm9, %%xmm9 \n\t" + "mov (%0), %%"FF_REG_S" \n\t" + "vmovntdq %%xmm3, (%1, %%"FF_REG_c")\n\t" + "vmovntdq %%xmm9, 16(%1, %%"FF_REG_c")\n\t" + "add $32, %%"FF_REG_c" \n\t" + "vmovdqa %%xmm7, %%xmm3 \n\t" + "vmovdqa %%xmm7, %%xmm4 \n\t" + "vmovdqa %%xmm7, %%xmm9 \n\t" + "vmovdqa %%xmm7, %%xmm10 \n\t" + "mov %0, %%"FF_REG_d" \n\t" + "cmp %2, %%"FF_REG_c" \n\t" + "jb 1b \n\t" + + : + : "r" (filter), + "r" (dest-offset), "r" ((int64_t)(dstW+offset)), "m" (offset), + "m"(filterSize), "m"(((uint64_t *) dither)[0]) + : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , + "%xmm5" , "%xmm7" , "%xmm9", "%xmm10" , "%xmm11" , "%xmm12" ,) + "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c + ); } #endif