From patchwork Thu Apr 1 10:00:15 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alan Kelly X-Patchwork-Id: 26678 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 83CED449135 for ; Thu, 1 Apr 2021 13:06:31 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 5142D689E05; Thu, 1 Apr 2021 13:06:31 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-pj1-f74.google.com (mail-pj1-f74.google.com [209.85.216.74]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 076126802C0 for ; Thu, 1 Apr 2021 13:06:24 +0300 (EEST) Received: by mail-pj1-f74.google.com with SMTP id e15so3010455pjg.6 for ; Thu, 01 Apr 2021 03:06:23 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=20161025; h=date:message-id:mime-version:subject:from:to:cc; bh=ejwcvvfqJRfXfKQ9/GHdzQk+KY/EKuiW/oMqVDxMEoI=; b=AJWAPBlD+D0oE/ZIz5cxLk3t5MPCD5VAXXuaGk6KBTdiH5GifzjyfmBA0DZQUDWTRe oeGv/UYPI3neGsSIxoaUYMXCWpq7HLF6vDYFb3Wov86JPiAVoUc2GqIwInvgRxnxCBAx NXdYR9hlULpuTJVYRDXr0AzMUYKckGbsObFIoG9Fm1JYtTvneiYBTbmPTm4aMO8GX45P jHTqGEfM/dnBjr9NAHf5OTdT0Nq+eVzZNH0sJtWodQdq7gOEj6l45DkBvRf3oen3HGFR ZzOJqudldAHnObttHhcTtnqF1qyLARB39/mFTeCuEBiT9DZ5rHTkqHBHfqoWNCbTD0My vhWg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:date:message-id:mime-version:subject:from:to:cc; bh=ejwcvvfqJRfXfKQ9/GHdzQk+KY/EKuiW/oMqVDxMEoI=; b=rIITk5foIW+uvJ83HVIlfhIXgc5UxUUl3kamNL+jPOnYoMyNKUuyZs1bfHL7miO37L IFZDy3ZA4unx5j5c7ilmD4/RkM2hII7/krVtP7H3OHREKzH1rKDJDHwnbDX8r/E1iunJ gbbrSryJ7gsYL9LMUoscBCSebs9HoNy3kz6vOHhkdbhmcwv9zMtDEhkqlIbuPGItxQgQ 
Jsv3umHuvMbHCc9H8z7yTxo6Hf5g71MXCxo3dvAt1kxQ8QgOIxKwrz7NND8SrqnzipO8 ivLIMeWKrRH6aDtT7g1zPOZ9EBG3xwNbxe8CJV2lnWwXnUF8VfnHQ+wgk1whxl9FkWwR nm3A== X-Gm-Message-State: AOAM531TIw7QUyx74gPhKmSWsbEkD4k4wQNsjtk/GqoLxF9sfIPol5Aq whxOmG4fldcM7BH6gKHu+NsZRTZK0+yYUSZ6PgE9eM2eiOt3HgnFJ8Ltdzrk2Dg5aIxE+luY3N3 qGf42YLlI7s5EKa9OdDH0//sS2yrlc1LCxfUW1tA1z9HKtgwwCyRYQpbEF0hL36SDVTlwSZs= X-Google-Smtp-Source: ABdhPJzA89PhtQyjUj/rJi6FpPlfoHT7MqxAwWmgvDEFgV+kJ26JCQHBH9XAjptrEF6DxNARg7cA9XxM7V7W4ow= X-Received: from alankelly0.zrh.corp.google.com ([2a00:79e0:42:205:f5f7:ddab:dba1:8bb5]) (user=alankelly job=sendgmr) by 2002:ad4:4aa8:: with SMTP id i8mr7504973qvx.22.1617271227895; Thu, 01 Apr 2021 03:00:27 -0700 (PDT) Date: Thu, 1 Apr 2021 12:00:15 +0200 Message-Id: <20210401100017.2863838-1-alankelly@google.com> Mime-Version: 1.0 X-Mailer: git-send-email 2.31.0.291.g576ba9dcdaf-goog From: Alan Kelly To: ffmpeg-devel@ffmpeg.org Subject: [FFmpeg-devel] [PATCH 1/3] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Alan Kelly Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" --- This restores support for inputs of size 8, as was the case with the original implementation: a bug was found with inputs whose size is not divisible by 16. To fix it, the mmx and mmxext versions (built with notcpuflag(sse3)) no longer unroll the loop and advance the offset by one mmsize per iteration instead of two; code built with cpuflag(sse3) keeps the 2x unroll. 
libswscale/x86/yuv2yuvX.asm | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 521880dabe..b6294cb919 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -37,8 +37,10 @@ SECTION .text cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %if notcpuflag(sse3) %define movr mova +%define unroll 1 %else %define movr movdqu +%define unroll 2 %endif movsxdifnidn dstWq, dstWd movsxdifnidn offsetq, offsetd @@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset .outerloop: mova m4, m7 mova m3, m7 +%if cpuflag(sse3) mova m6, m7 mova m1, m7 +%endif .loop: %if cpuflag(avx2) vpbroadcastq m0, [filterSizeq + 8] @@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] paddw m3, m3, m2 paddw m4, m4, m5 +%if cpuflag(sse3) pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] paddw m6, m6, m2 paddw m1, m1, m5 +%endif add filterSizeq, $10 mov srcq, [filterSizeq] test srcq, srcq jnz .loop psraw m3, m3, 3 psraw m4, m4, 3 +%if cpuflag(sse3) psraw m6, m6, 3 psraw m1, m1, 3 +%endif packuswb m3, m3, m4 +%if cpuflag(sse3) packuswb m6, m6, m1 +%endif mov srcq, [filterq] %if cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif movr [destq + offsetq], m3 +%if cpuflag(sse3) movr [destq + offsetq + mmsize], m6 - add offsetq, mmsize * 2 +%endif + add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq jb .outerloop