diff mbox series

[FFmpeg-devel] sws/swscale_unscaled.c: Faster yuv422p10 -> yuv422p conversion

Message ID f0e143013659ac66b7903a5b163507ad45253403.camel@haerdin.se
State New
Headers show
Series [FFmpeg-devel] sws/swscale_unscaled.c: Faster yuv422p10 -> yuv422p conversion | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Tomas Härdin Jan. 20, 2023, 3:20 p.m. UTC
I have in mind a more general solution that also handles 9-, 12-, 14- and
16-bit input, as well as 444p and maybe 420p.

/Tomas
diff mbox series

Patch

From 99cc73053cc9a544ae923e5c8e3f4686f3c05454 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomas=20H=C3=A4rdin?= <git@haerdin.se>
Date: Wed, 18 Jan 2023 17:28:53 +0100
Subject: [PATCH] sws/swscale_unscaled.c: Faster yuv422p10 -> yuv422p
 conversion

Based on work by Paul B Mahol.
---
 libswscale/swscale_unscaled.c | 46 +++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 9af2e7ecc3..6c71ecb34d 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -371,6 +371,50 @@  static int yuv422pToUyvyWrapper(SwsContext *c, const uint8_t *src[],
     return srcSliceH;
 }
 
+/**
+ * Unscaled planar conversion from yuv422p10 to yuv422p.
+ *
+ * Every 16-bit source sample is reduced to 8 bits with a plain right shift
+ * by 2; no rounding or dithering is applied.
+ *
+ * @param c          context; only dstW (output width in luma samples) is read
+ * @param src        source planes (Y, U, V), read directly as uint16_t and
+ *                   used as-is (not offset by srcSliceY)
+ * @param srcStride  source line sizes in bytes
+ * @param srcSliceY  first row of the slice, used to locate the output rows
+ * @param srcSliceH  number of rows to convert
+ * @param dstParam   destination planes (Y, U, V), one byte per sample
+ * @param dstStride  destination line sizes in bytes
+ * @return srcSliceH
+ */
+static int yuv422p10ToYuv422p(SwsContext *c, const uint8_t *src[],
+                               int srcStride[], int srcSliceY, int srcSliceH,
+                               uint8_t *dstParam[], int dstStride[])
+{
+    const uint16_t *ysrc = (const uint16_t *)(src[0]);
+    const uint16_t *usrc = (const uint16_t *)(src[1]);
+    const uint16_t *vsrc = (const uint16_t *)(src[2]);
+
+    uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
+    uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY;
+    uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY;
+
+    const int width       = c->dstW;
+    const int full_pairs  = width / 2;       /* chroma columns with two luma samples */
+    const int chroma_cols = (width + 1) / 2; /* 4:2:2 chroma width rounds up */
+
+    for (int y = 0; y < srcSliceH; y++) {
+        int x = 0;
+
+        /* Bulk of the row in fixed-size groups of BLOCK chroma columns
+         * (presumably to make unrolling/vectorisation easier). */
+#define BLOCK 4
+        for (; x < (full_pairs / BLOCK)*BLOCK; x += BLOCK) {
+            for (int x2 = x; x2 < x + BLOCK; x2++) {
+                ydst[2*x2+0] = ysrc[2*x2+0] >> 2;
+                ydst[2*x2+1] = ysrc[2*x2+1] >> 2;
+                udst[x2] = usrc[x2] >> 2;
+                vdst[x2] = vsrc[x2] >> 2;
+            }
+        }
+#undef BLOCK
+
+        /* Remaining columns. Iterating up to chroma_cols (not full_pairs)
+         * also converts the final luma/chroma column of an odd width, which
+         * would otherwise be left unwritten. */
+        for (; x < chroma_cols; x++) {
+            ydst[2*x+0] = ysrc[2*x+0] >> 2;
+            if (2*x+1 < width) /* absent only in the last column of an odd width */
+                ydst[2*x+1] = ysrc[2*x+1] >> 2;
+            udst[x] = usrc[x] >> 2;
+            vdst[x] = vsrc[x] >> 2;
+        }
+
+        /* Source strides are in bytes; the pointers step in uint16_t units. */
+        ysrc += srcStride[0] / 2;
+        usrc += srcStride[1] / 2;
+        vsrc += srcStride[2] / 2;
+
+        ydst += dstStride[0];
+        udst += dstStride[1];
+        vdst += dstStride[2];
+    }
+
+    return srcSliceH;
+}
+
 static int yuyvToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
                                int srcStride[], int srcSliceY, int srcSliceH,
                                uint8_t *dstParam[], int dstStride[])
@@ -2223,6 +2267,8 @@  void ff_get_unscaled_swscale(SwsContext *c)
             c->convert_unscaled = planarCopyWrapper;
     }
 
+    if (srcFormat == AV_PIX_FMT_YUV422P10 && dstFormat == AV_PIX_FMT_YUV422P)
+        c->convert_unscaled = yuv422p10ToYuv422p;
 #if ARCH_PPC
     ff_get_unscaled_swscale_ppc(c);
 #elif ARCH_ARM
-- 
2.30.2