diff mbox

[FFmpeg-devel] libswscale/swscale_unscaled: fix DITHER_COPY macro

Message ID c6fb39a1-7cb3-69be-8646-b43203fa4e1a@poczta.onet.pl
State New
Headers show

Commit Message

Mateusz Sept. 6, 2017, 7:27 a.m. UTC
W dniu 2017-09-06 o 02:07, Michael Niedermayer pisze:
> On Wed, Sep 06, 2017 at 01:25:45AM +0200, Mateusz wrote:
>> W dniu 2017-09-05 o 23:37, Michael Niedermayer pisze:
>>> On Tue, Sep 05, 2017 at 04:42:06PM +0200, Mateusz wrote:
>>>> W dniu 2017-09-05 o 15:40, Michael Niedermayer pisze:
>>>>> On Mon, Sep 04, 2017 at 09:33:34AM +0200, Mateusz wrote:
>>>>>> If ffmpeg reduces bit-depth to 8-bit or more, it uses DITHER_COPY macro.
>>>>>> The problem is DITHER_COPY macro make images darker (on all planes).
>>>>>>
>>>>>> In x265 project there is issue #365 which is caused by this DITHER_COPY bug.
>>>>>> I think it is time to fix -- there are more and more high bit-depth sources.
>>>>>>
>>>>>> Please review.
>>>>>>
>>>>>>  libswscale/swscale_unscaled.c | 44 +++++++++++++------------------------------
>>>>>>  1 file changed, 13 insertions(+), 31 deletions(-)
>>>>>
>>>>> please provide a git compatible patch with with commit message as produced
>>>>> by git format-patch or git send-email
>>>>>
>>>>> this mail is not accepted as is by git
>>>>> Applying: libswscale/swscale_unscaled: fix DITHER_COPY macro
>>>>> error: corrupt patch at line 6
>>>>> error: could not build fake ancestor
>>>>> Patch failed at 0001 libswscale/swscale_unscaled: fix DITHER_COPY macro
>>>>>
>>>>> [...]
>>>>
>>>> I've attached the result of git format-patch command.
>>>>
>>>> Sorry for 1 private e-mail (I clicked wrong button).
>>>>
>>>> Mateusz
>>>
>>>>  swscale_unscaled.c |   44 +++++++++++++-------------------------------
>>>>  1 file changed, 13 insertions(+), 31 deletions(-)
>>>> 9973b13b3f74319abe9c97302ee87b2b3468b3b6  0001-fix-DITHER_COPY-macro-to-avoid-make-images-darker.patch
>>>> From 9b96d612fef46ccec7e148cd7f8e8666b4e7a4cd Mon Sep 17 00:00:00 2001
>>>> From: Mateusz <mateuszb@poczta.onet.pl>
>>>> Date: Tue, 5 Sep 2017 16:09:28 +0200
>>>> Subject: [PATCH] fix DITHER_COPY macro to avoid make images darker
>>>>
>>>> ---
>>>>  libswscale/swscale_unscaled.c | 44 +++++++++++++------------------------------
>>>>  1 file changed, 13 insertions(+), 31 deletions(-)
>>>>
>>>> diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
>>>> index ef36aec..3b7a5a9 100644
>>>> --- a/libswscale/swscale_unscaled.c
>>>> +++ b/libswscale/swscale_unscaled.c
>>>> @@ -110,24 +110,6 @@ DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={
>>>>    { 112, 16,104,  8,118, 22,110, 14,},
>>>>  }};
>>>>  
>>>> -static const uint16_t dither_scale[15][16]={
>>>> -{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
>>>> -{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
>>>> -{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
>>>> -{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
>>>> -{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
>>>> -{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
>>>> -{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
>>>> -{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
>>>> -{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
>>>> -{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
>>>> -{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
>>>> -{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
>>>> -{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
>>>> -{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
>>>> -{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
>>>> -};
>>>> -
>>>>  
>>>>  static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
>>>>                        uint8_t val)
>>>> @@ -1502,22 +1484,22 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
>>>>  }
>>>>  
>>>>  #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
>>>> -    uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
>>>> -    int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
>>>> +    unsigned shift= src_depth-dst_depth, tmp;\
>>>>      for (i = 0; i < height; i++) {\
>>>> -        const uint8_t *dither= dithers[src_depth-9][i&7];\
>>>> +        const uint8_t *dither= dithers[shift-1][i&7];\
>>>>          for (j = 0; j < length-7; j+=8){\
>>>> -            dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
>>>> -            dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
>>>> -            dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
>>>> -            dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
>>>> -            dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
>>>> -            dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
>>>> -            dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
>>>> -            dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
>>>> +            tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\
>>>> +            tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\
>>>> +            tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = dbswap(tmp - (tmp>>dst_depth));\
>>>> +            tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = dbswap(tmp - (tmp>>dst_depth));\
>>>> +            tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = dbswap(tmp - (tmp>>dst_depth));\
>>>> +            tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = dbswap(tmp - (tmp>>dst_depth));\
>>>> +            tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = dbswap(tmp - (tmp>>dst_depth));\
>>>> +            tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = dbswap(tmp - (tmp>>dst_depth));\
>>>
>>> This does not look correct
>>>
>>> flat black should be mapped to flat black (ok)
>>> flat white should be mapped to flat white (ok)
>>> black+1 should not be mapped to flat black but to a dithered pattern mixing black with the next brighter color (ok)
>>> white-1 should not be mapped to flat white but to a dithered pattern mixing white with the next darker color (not ok)
>>>
>>> example if you take 9 bit to 7 bit
>>> 511 + {0..3} = {511..514}
>>> {511..514} >> 2 = {127,128}
>>> {127,128} - ({127,128} >> 7) = 127
>>>
>>> 510 + {0..3} = {510..513}
>>> {510..513} >> 2 = {127,128}
>>> {127,128} - ({127,128} >> 7) = 127
>>>
>>> Thus multiple of the brigher colors all get mapped to the same output,
>>> that way the brightest shades become indistingishable and this is not
>>> correct.
>>>
>>> above is based on code review not testing so i might have missed
>>> something
>>>
>>> [...]
>>
>> Yes, you are right.
>>
>> There is one more rule, that is important and (partially) in contradiction to white-1 to not flat white rule:
> 
>> if you convert 7-bit source to 9-bit and next resulting 9-bit to 7-bit, it should be the same picture as original.
> 
> this would be desirable
> 
> 
>> if we convert 7 bit white (or max value at plane) == 127, we have 127 << 2 == 508
>> now 508 goes always to 127
> 
> This would only be the case if 508 was the white point and not 511
> otherwise
> If you convert 7bit to 9bit, 127 is converted to 511, this would be a
> requirement to maintain white.
> if you use fewer bits it becomes more obvious
> 2 bits to 4 bits
> 3 -> 15 not 3 -> 12
> 
> whereever the white and black points are in the input and output
> colorspace they must get converted correctly.
> 
> 
> [...]

line #1538 in file libswscale/swscale_unscaled.c
int shiftonly = plane == 1 || plane == 2 || (!c->srcRange && plane == 0);

line #421 in file libswscale/swscale_internal.h
int srcRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (source      image).

For movies (without special options) ffmpeg always upscale bit-depth by simple shift (127 to 508).

Only for plane 0 and srcRange == 1 (full range) ffmpeg upscale 127 to 511.

For me it is impossible to make DITHER_COPY macro that works OK for both cases, so I attached new version that check shiftonly state and works optimal according to the shiftonly state.

Now in both cases it is original picture after upscale bit-depth -> downscale bit-depth sequence.

Please review.

Mateusz
From fd9e271ea531d25bc5a708d0dfeb1be5415b90d0 Mon Sep 17 00:00:00 2001
From: Mateusz <mateuszb@poczta.onet.pl>
Date: Wed, 6 Sep 2017 09:05:02 +0200
Subject: [PATCH] fix DITHER_COPY macro according to shiftonly state

---
 libswscale/swscale_unscaled.c | 73 ++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 35 deletions(-)

Comments

Mateusz Sept. 11, 2017, 11:14 p.m. UTC | #1
W dniu 2017-09-06 o 09:27, Mateusz pisze:
> W dniu 2017-09-06 o 02:07, Michael Niedermayer pisze:
>> On Wed, Sep 06, 2017 at 01:25:45AM +0200, Mateusz wrote:
>>> W dniu 2017-09-05 o 23:37, Michael Niedermayer pisze:
>>>> On Tue, Sep 05, 2017 at 04:42:06PM +0200, Mateusz wrote:
>>>>> W dniu 2017-09-05 o 15:40, Michael Niedermayer pisze:
>>>>>> On Mon, Sep 04, 2017 at 09:33:34AM +0200, Mateusz wrote:
>>>>>>> If ffmpeg reduces bit-depth to 8-bit or more, it uses DITHER_COPY macro.
>>>>>>> The problem is DITHER_COPY macro make images darker (on all planes).
>>>>>>>
>>>>>>> In x265 project there is issue #365 which is caused by this DITHER_COPY bug.
>>>>>>> I think it is time to fix -- there are more and more high bit-depth sources.
>>>>>>>
>>>>>>> Please review.
>>>>>>>
>>>>>>>  libswscale/swscale_unscaled.c | 44 +++++++++++++------------------------------
>>>>>>>  1 file changed, 13 insertions(+), 31 deletions(-)
>>>>>>
>>>>>> please provide a git compatible patch with with commit message as produced
>>>>>> by git format-patch or git send-email
>>>>>>
>>>>>> this mail is not accepted as is by git
>>>>>> Applying: libswscale/swscale_unscaled: fix DITHER_COPY macro
>>>>>> error: corrupt patch at line 6
>>>>>> error: could not build fake ancestor
>>>>>> Patch failed at 0001 libswscale/swscale_unscaled: fix DITHER_COPY macro
>>>>>>
>>>>>> [...]
>>>>>
>>>>> I've attached the result of git format-patch command.
>>>>>
>>>>> Sorry for 1 private e-mail (I clicked wrong button).
>>>>>
>>>>> Mateusz
>>>>
>>>>>  swscale_unscaled.c |   44 +++++++++++++-------------------------------
>>>>>  1 file changed, 13 insertions(+), 31 deletions(-)
>>>>> 9973b13b3f74319abe9c97302ee87b2b3468b3b6  0001-fix-DITHER_COPY-macro-to-avoid-make-images-darker.patch
>>>>> From 9b96d612fef46ccec7e148cd7f8e8666b4e7a4cd Mon Sep 17 00:00:00 2001
>>>>> From: Mateusz <mateuszb@poczta.onet.pl>
>>>>> Date: Tue, 5 Sep 2017 16:09:28 +0200
>>>>> Subject: [PATCH] fix DITHER_COPY macro to avoid make images darker
>>>>>
>>>>> ---
>>>>>  libswscale/swscale_unscaled.c | 44 +++++++++++++------------------------------
>>>>>  1 file changed, 13 insertions(+), 31 deletions(-)
>>>>>
>>>>> diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
>>>>> index ef36aec..3b7a5a9 100644
>>>>> --- a/libswscale/swscale_unscaled.c
>>>>> +++ b/libswscale/swscale_unscaled.c
>>>>> @@ -110,24 +110,6 @@ DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={
>>>>>    { 112, 16,104,  8,118, 22,110, 14,},
>>>>>  }};
>>>>>  
>>>>> -static const uint16_t dither_scale[15][16]={
>>>>> -{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
>>>>> -{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
>>>>> -{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
>>>>> -{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
>>>>> -{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
>>>>> -{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
>>>>> -{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
>>>>> -{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
>>>>> -{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
>>>>> -{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
>>>>> -{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
>>>>> -{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
>>>>> -{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
>>>>> -{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
>>>>> -{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
>>>>> -};
>>>>> -
>>>>>  
>>>>>  static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
>>>>>                        uint8_t val)
>>>>> @@ -1502,22 +1484,22 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
>>>>>  }
>>>>>  
>>>>>  #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
>>>>> -    uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
>>>>> -    int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
>>>>> +    unsigned shift= src_depth-dst_depth, tmp;\
>>>>>      for (i = 0; i < height; i++) {\
>>>>> -        const uint8_t *dither= dithers[src_depth-9][i&7];\
>>>>> +        const uint8_t *dither= dithers[shift-1][i&7];\
>>>>>          for (j = 0; j < length-7; j+=8){\
>>>>> -            dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
>>>>> -            dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
>>>>> -            dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
>>>>> -            dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
>>>>> -            dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
>>>>> -            dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
>>>>> -            dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
>>>>> -            dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
>>>>> +            tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\
>>>>> +            tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\
>>>>> +            tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = dbswap(tmp - (tmp>>dst_depth));\
>>>>> +            tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = dbswap(tmp - (tmp>>dst_depth));\
>>>>> +            tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = dbswap(tmp - (tmp>>dst_depth));\
>>>>> +            tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = dbswap(tmp - (tmp>>dst_depth));\
>>>>> +            tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = dbswap(tmp - (tmp>>dst_depth));\
>>>>> +            tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = dbswap(tmp - (tmp>>dst_depth));\
>>>>
>>>> This does not look correct
>>>>
>>>> flat black should be mapped to flat black (ok)
>>>> flat white should be mapped to flat white (ok)
>>>> black+1 should not be mapped to flat black but to a dithered pattern mixing black with the next brighter color (ok)
>>>> white-1 should not be mapped to flat white but to a dithered pattern mixing white with the next darker color (not ok)
>>>>
>>>> example if you take 9 bit to 7 bit
>>>> 511 + {0..3} = {511..514}
>>>> {511..514} >> 2 = {127,128}
>>>> {127,128} - ({127,128} >> 7) = 127
>>>>
>>>> 510 + {0..3} = {510..513}
>>>> {510..513} >> 2 = {127,128}
>>>> {127,128} - ({127,128} >> 7) = 127
>>>>
>>>> Thus multiple of the brigher colors all get mapped to the same output,
>>>> that way the brightest shades become indistingishable and this is not
>>>> correct.
>>>>
>>>> above is based on code review not testing so i might have missed
>>>> something
>>>>
>>>> [...]
>>>
>>> Yes, you are right.
>>>
>>> There is one more rule, that is important and (partially) in contradiction to white-1 to not flat white rule:
>>
>>> if you convert 7-bit source to 9-bit and next resulting 9-bit to 7-bit, it should be the same picture as original.
>>
>> this would be desirable
>>
>>
>>> if we convert 7 bit white (or max value at plane) == 127, we have 127 << 2 == 508
>>> now 508 goes always to 127
>>
>> This would only be the case if 508 was the white point and not 511
>> otherwise
>> If you convert 7bit to 9bit, 127 is converted to 511, this would be a
>> requirement to maintain white.
>> if you use fewer bits it becomes more obvious
>> 2 bits to 4 bits
>> 3 -> 15 not 3 -> 12
>>
>> whereever the white and black points are in the input and output
>> colorspace they must get converted correctly.
>>
>>
>> [...]
> 
> line #1538 in file libswscale/swscale_unscaled.c
> int shiftonly = plane == 1 || plane == 2 || (!c->srcRange && plane == 0);
> 
> line #421 in file libswscale/swscale_internal.h
> int srcRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (source      image).
> 
> For movies (without special options) ffmpeg always upscale bit-depth by simple shift (127 to 508).
> 
> Only for plane 0 and srcRange == 1 (full range) ffmpeg upscale 127 to 511.
> 
> For me it is impossible to make DITHER_COPY macro that works OK for both cases, so I attached new version that check shiftonly state and works optimal according to the shiftonly state.
> 
> Now in both cases it is original picture after upscale bit-depth -> downscale bit-depth sequence.
> 
> Please review.
> 
> Mateusz

Examples to watch (for what is this patch):
100th iteration of 8-bit -> 10-bit -> 8-bit sequence with limited range (clean ffmpeg)
www.msystem.waw.pl/x265/limited.mkv

100th iteration of 8-bit -> 10-bit -> 8-bit sequence with full range (clean ffmpeg)
www.msystem.waw.pl/x265/full.mkv

100th iteration of 8-bit -> 10-bit -> 8-bit sequence with full or limited range with patch (bit exact source movie)
www.msystem.waw.pl/x265/patched.mkv

Batch file used to produce this movies (Windows):

copy /B /Y ..\KristenAndSara_1280x720_60.y4m c.y4m
copy /B /Y ..\KristenAndSara_1280x720_60.y4m c2.y4m

for /L %%i in (1 1 100) do (
@echo %%i
ffmpeg -i c.y4m -y -v warning -strict -1 -pix_fmt yuv420p10 t.y4m
ffmpeg -i t.y4m -y -v warning -strict -1 -pix_fmt yuv420p c.y4m
)
for /L %%i in (1 1 100) do (
@echo %%i
ffmpeg -color_range 2 -i c2.y4m -y -v warning -strict -1 -pix_fmt yuv420p10 t.y4m
ffmpeg -color_range 2 -i t.y4m -y -v warning -strict -1 -pix_fmt yuv420p c2.y4m
)
)
diff mbox

Patch

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index ef36aec..e3e375a 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -110,24 +110,6 @@  DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={
   { 112, 16,104,  8,118, 22,110, 14,},
 }};
 
-static const uint16_t dither_scale[15][16]={
-{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
-{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
-{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
-{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
-{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
-{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
-{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
-{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
-{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
-{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
-{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
-{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
-{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
-{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
-{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
-};
-
 
 static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
                       uint8_t val)
@@ -1502,24 +1484,45 @@  static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
 }
 
 #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
-    uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
-    int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
-    for (i = 0; i < height; i++) {\
-        const uint8_t *dither= dithers[src_depth-9][i&7];\
-        for (j = 0; j < length-7; j+=8){\
-            dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
-            dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
-            dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
-            dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
-            dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
-            dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
-            dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
-            dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
+    unsigned shift= src_depth-dst_depth, tmp;\
+    if (shiftonly) {\
+        for (i = 0; i < height; i++) {\
+            const uint8_t *dither= dithers[shift-1][i&7];\
+            for (j = 0; j < length-7; j+=8){\
+                tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = dbswap(tmp - (tmp>>dst_depth));\
+                tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = dbswap(tmp - (tmp>>dst_depth));\
+            }\
+            for (; j < length; j++){\
+                tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = dbswap(tmp - (tmp>>dst_depth));\
+            }\
+            dst += dstStride;\
+            src += srcStride;\
+        }\
+    } else {\
+        for (i = 0; i < height; i++) {\
+            const uint8_t *dither= dithers[shift-1][i&7];\
+            for (j = 0; j < length-7; j+=8){\
+                tmp = bswap(src[j+0]); dst[j+0] = dbswap((tmp - (tmp>>dst_depth) + dither[0])>>shift);\
+                tmp = bswap(src[j+1]); dst[j+1] = dbswap((tmp - (tmp>>dst_depth) + dither[1])>>shift);\
+                tmp = bswap(src[j+2]); dst[j+2] = dbswap((tmp - (tmp>>dst_depth) + dither[2])>>shift);\
+                tmp = bswap(src[j+3]); dst[j+3] = dbswap((tmp - (tmp>>dst_depth) + dither[3])>>shift);\
+                tmp = bswap(src[j+4]); dst[j+4] = dbswap((tmp - (tmp>>dst_depth) + dither[4])>>shift);\
+                tmp = bswap(src[j+5]); dst[j+5] = dbswap((tmp - (tmp>>dst_depth) + dither[5])>>shift);\
+                tmp = bswap(src[j+6]); dst[j+6] = dbswap((tmp - (tmp>>dst_depth) + dither[6])>>shift);\
+                tmp = bswap(src[j+7]); dst[j+7] = dbswap((tmp - (tmp>>dst_depth) + dither[7])>>shift);\
+            }\
+            for (; j < length; j++){\
+                tmp = bswap(src[j]); dst[j] = dbswap((tmp - (tmp>>dst_depth) + dither[j&7])>>shift);\
+            }\
+            dst += dstStride;\
+            src += srcStride;\
         }\
-        for (; j < length; j++)\
-            dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\
-        dst += dstStride;\
-        src += srcStride;\
     }
 
 static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],