diff mbox

[FFmpeg-devel] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

Message ID 39349041-c48c-77ec-b29e-f1352ad84753@poczta.onet.pl
State New
Headers show

Commit Message

Mateusz Sept. 27, 2017, 10:04 a.m. UTC
W dniu 2017-09-26 o 13:31, Carl Eugen Hoyos pisze:
> 2017-09-26 1:33 GMT+02:00 Mateusz <mateuszb@poczta.onet.pl>:
> 
>> I've sent C code patch 2017-09-06 (and nothing) so I thought that the
>> problem is with speed. For simplicity I've attached this patch.
> 
> You could (wait a day or two and) either add an option to
> select your dithering code or put it under #ifdef so more
> people can test it.

I've attached patch that do nothing unless you specify
--extra-cflags="-DNEW_DITHER_COPY" or export CFLAGS="-DNEW_DITHER_COPY"

>> In theory it is enough to make only dst = (src + dither)>>shift;
>> -- white in limited range has 0 on bits to remove (235*4 for example)
>> so overflow is impossible. For files with full range not marked as
>> full range overflow is possible (for dither > 0) and white goes
>> to black. tmp - (tmp>>dst_depth) undoing this overflow.
> 
> (Not necessarily related, sorry if I misunderstand:)
> Valid limited-range frames can contain some pixels with peak
> values outside of the defined range.
> 
> Carl Eugen

OK, so this fight with possible overflow is even more needed.

Mateusz
From 2c0adb2d9a0fc0fbbffc643d27860fbb779c08fc Mon Sep 17 00:00:00 2001
From: Mateusz <mateuszb@poczta.onet.pl>
Date: Tue, 26 Sep 2017 22:20:10 +0200
Subject: [PATCH] swscale: new precise DITHER_COPY macro (if
 "-DNEW_DITHER_COPY" CFLAGS)

---
 libswscale/swscale_unscaled.c | 47 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

Comments

Hendrik Leppkes Sept. 27, 2017, 10:39 a.m. UTC | #1
On Wed, Sep 27, 2017 at 12:04 PM, Mateusz <mateuszb@poczta.onet.pl> wrote:
>
> OK, so this fight with possible overflow is even more needed.
>

Luckily x86 SIMD has saturating instructions which don't overflow, so
if we device a way to properly optimize this in yasm/nasm assembly,
then this should be pretty simple to do.

- Hendrik
diff mbox

Patch

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index ef36aec..0d41695 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -110,6 +110,7 @@  DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={
   { 112, 16,104,  8,118, 22,110, 14,},
 }};
 
+#ifndef NEW_DITHER_COPY
 static const uint16_t dither_scale[15][16]={
 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
@@ -127,7 +128,7 @@  static const uint16_t dither_scale[15][16]={
 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 };
-
+#endif
 
 static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
                       uint8_t val)
@@ -1501,6 +1502,7 @@  static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
     return srcSliceH;
 }
 
+#ifndef NEW_DITHER_COPY
 #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
     uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
     int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
@@ -1521,6 +1523,49 @@  static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
         dst += dstStride;\
         src += srcStride;\
     }
+#else
+#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
+    unsigned shift= src_depth-dst_depth, tmp;\
+    if (shiftonly) {\
+        for (i = 0; i < height; i++) {\
+            const uint8_t *dither= dithers[shift-1][i&7];\
+            for (j = 0; j < length-7; j+=8){\
+                tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = dbswap(tmp - (tmp>>dst_depth));\
+            }\
+            for (; j < length; j++){\
+                tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = dbswap(tmp - (tmp>>dst_depth));\
+            }\
+            dst += dstStride;\
+            src += srcStride;\
+        }\
+    } else {\
+        for (i = 0; i < height; i++) {\
+            const uint8_t *dither= dithers[shift-1][i&7];\
+            for (j = 0; j < length-7; j+=8){\
+                tmp = bswap(src[j+0]); dst[j+0] = dbswap((tmp - (tmp>>dst_depth) + dither[0])>>shift);\
+                tmp = bswap(src[j+1]); dst[j+1] = dbswap((tmp - (tmp>>dst_depth) + dither[1])>>shift);\
+                tmp = bswap(src[j+2]); dst[j+2] = dbswap((tmp - (tmp>>dst_depth) + dither[2])>>shift);\
+                tmp = bswap(src[j+3]); dst[j+3] = dbswap((tmp - (tmp>>dst_depth) + dither[3])>>shift);\
+                tmp = bswap(src[j+4]); dst[j+4] = dbswap((tmp - (tmp>>dst_depth) + dither[4])>>shift);\
+                tmp = bswap(src[j+5]); dst[j+5] = dbswap((tmp - (tmp>>dst_depth) + dither[5])>>shift);\
+                tmp = bswap(src[j+6]); dst[j+6] = dbswap((tmp - (tmp>>dst_depth) + dither[6])>>shift);\
+                tmp = bswap(src[j+7]); dst[j+7] = dbswap((tmp - (tmp>>dst_depth) + dither[7])>>shift);\
+            }\
+            for (; j < length; j++){\
+                tmp = bswap(src[j]); dst[j] = dbswap((tmp - (tmp>>dst_depth) + dither[j&7])>>shift);\
+            }\
+            dst += dstStride;\
+            src += srcStride;\
+        }\
+    }
+#endif
 
 static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
                              int srcStride[], int srcSliceY, int srcSliceH,