diff mbox

[FFmpeg-devel] avcodec/magicyuv: add SIMD for median of 10bits

Message ID 1482514373-28939-1-git-send-email-onemda@gmail.com
State Rejected
Headers show

Commit Message

Paul B Mahol Dec. 23, 2016, 5:32 p.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavcodec/lossless_videodsp.c          | 18 ++++++++++
 libavcodec/lossless_videodsp.h          |  1 +
 libavcodec/magicyuv.c                   | 23 +-----------
 libavcodec/x86/lossless_videodsp.asm    | 62 +++++++++++++++++++++++++++++++++
 libavcodec/x86/lossless_videodsp_init.c |  2 ++
 5 files changed, 84 insertions(+), 22 deletions(-)

Comments

Ronald S. Bultje Dec. 23, 2016, 11 p.m. UTC | #1
Hi,

On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com> wrote:

> diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_
> videodsp.h
>
[..]

> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
>
[..]

> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const uint16_t
> *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
>

That seems wrong. Why would you add a magicyuv-specific function to
losslessdsp-context which is intended for functions shared between many
(not just one) lossless codecs? You probably want a new dsp for magicyuv
specifically.

I know this is tedious, but we're very specifically trying to prevent
dsputil from ever happening again.

Ronald
James Almer Dec. 23, 2016, 11:18 p.m. UTC | #2
On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
> Hi,
> 
> On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com> wrote:
> 
>> diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_
>> videodsp.h
>>
> [..]
> 
>> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
>>
> [..]
> 
>> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const uint16_t
>> *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
>>
> 
> That seems wrong. Why would you add a magicuv-specific function to
> losslessdsp-context which is intended for functions shared between many
> (not just one) lossless codecs? You probably want a new dsp for magicyuv
> specifically.
> 
> I know this is tedious, but we're very specifically trying to prevent
> dsputil from ever happening again.
> 
> Ronald

Some functions in this dsp are used only by huffyuv. Only one is used by
both huffyuv and magicyuv.
To properly apply what you mention, it would need to be split in two,
huffyuvdsp and lldsp, then this new function added to a new dsp called
magicyuvdsp.
Ronald S. Bultje Dec. 23, 2016, 11:21 p.m. UTC | #3
Hi,

On Fri, Dec 23, 2016 at 6:18 PM, James Almer <jamrial@gmail.com> wrote:

> On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
> > Hi,
> >
> > On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com> wrote:
> >
> >> diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_
> >> videodsp.h
> >>
> > [..]
> >
> >> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
> >>
> > [..]
> >
> >> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const uint16_t
> >> *top, const uint16_t *diff, unsigned mask, int w, int *left, int
> *left_top);
> >>
> >
> > That seems wrong. Why would you add a magicuv-specific function to
> > losslessdsp-context which is intended for functions shared between many
> > (not just one) lossless codecs? You probably want a new dsp for magicyuv
> > specifically.
> >
> > I know this is tedious, but we're very specifically trying to prevent
> > dsputil from ever happening again.
> >
> > Ronald
>
> Some functions in this dsp are used only by huffyuv. Only one is used by
> both huffyuv and magicyuv.
> To properly apply what you mention, it would need to be split in two,
> huffyuvdsp and lldsp, then this new function added to a new dsp called
> magicyuvdsp.


That would be even better, yes.

Ronald
Paul B Mahol Dec. 24, 2016, 11:09 a.m. UTC | #4
On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
> Hi,
>
> On Fri, Dec 23, 2016 at 6:18 PM, James Almer <jamrial@gmail.com> wrote:
>
>> On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
>> > Hi,
>> >
>> > On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com> wrote:
>> >
>> >> diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_
>> >> videodsp.h
>> >>
>> > [..]
>> >
>> >> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
>> >>
>> > [..]
>> >
>> >> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const uint16_t
>> >> *top, const uint16_t *diff, unsigned mask, int w, int *left, int
>> *left_top);
>> >>
>> >
>> > That seems wrong. Why would you add a magicuv-specific function to
>> > losslessdsp-context which is intended for functions shared between many
>> > (not just one) lossless codecs? You probably want a new dsp for magicyuv
>> > specifically.
>> >
>> > I know this is tedious, but we're very specifically trying to prevent
>> > dsputil from ever happening again.
>> >
>> > Ronald
>>
>> Some functions in this dsp are used only by huffyuv. Only one is used by
>> both huffyuv and magicyuv.
>> To properly apply what you mention, it would need to be split in two,
>> huffyuvdsp and lldsp, then this new function added to a new dsp called
>> magicyuvdsp.
>
>
> That would be even better, yes.

What about yasm code?

I wanted that to be commented.
Ronald S. Bultje Dec. 24, 2016, 1:18 p.m. UTC | #5
Hi,

On Sat, Dec 24, 2016 at 6:09 AM, Paul B Mahol <onemda@gmail.com> wrote:

> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
> > Hi,
> >
> > On Fri, Dec 23, 2016 at 6:18 PM, James Almer <jamrial@gmail.com> wrote:
> >
> >> On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
> >> > Hi,
> >> >
> >> > On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com>
> wrote:
> >> >
> >> >> diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_
> >> >> videodsp.h
> >> >>
> >> > [..]
> >> >
> >> >> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
> >> >>
> >> > [..]
> >> >
> >> >> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const
> uint16_t
> >> >> *top, const uint16_t *diff, unsigned mask, int w, int *left, int
> >> *left_top);
> >> >>
> >> >
> >> > That seems wrong. Why would you add a magicuv-specific function to
> >> > losslessdsp-context which is intended for functions shared between
> many
> >> > (not just one) lossless codecs? You probably want a new dsp for
> magicyuv
> >> > specifically.
> >> >
> >> > I know this is tedious, but we're very specifically trying to prevent
> >> > dsputil from ever happening again.
> >> >
> >> > Ronald
> >>
> >> Some functions in this dsp are used only by huffyuv. Only one is used by
> >> both huffyuv and magicyuv.
> >> To properly apply what you mention, it would need to be split in two,
> >> huffyuvdsp and lldsp, then this new function added to a new dsp called
> >> magicyuvdsp.
> >
> >
> > That would be even better, yes.
>
> What about yasm code?
>
> I wanted that to be commented.


It's like dithering, it uses the immediately adjacent pixel in the next
loop iteration, can you really simd this effectively?

Ronald
Paul B Mahol Dec. 24, 2016, 2:29 p.m. UTC | #6
On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
> Hi,
>
> On Sat, Dec 24, 2016 at 6:09 AM, Paul B Mahol <onemda@gmail.com> wrote:
>
>> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
>> > Hi,
>> >
>> > On Fri, Dec 23, 2016 at 6:18 PM, James Almer <jamrial@gmail.com> wrote:
>> >
>> >> On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
>> >> > Hi,
>> >> >
>> >> > On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com>
>> wrote:
>> >> >
>> >> >> diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_
>> >> >> videodsp.h
>> >> >>
>> >> > [..]
>> >> >
>> >> >> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
>> >> >>
>> >> > [..]
>> >> >
>> >> >> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const
>> uint16_t
>> >> >> *top, const uint16_t *diff, unsigned mask, int w, int *left, int
>> >> *left_top);
>> >> >>
>> >> >
>> >> > That seems wrong. Why would you add a magicuv-specific function to
>> >> > losslessdsp-context which is intended for functions shared between
>> many
>> >> > (not just one) lossless codecs? You probably want a new dsp for
>> magicyuv
>> >> > specifically.
>> >> >
>> >> > I know this is tedious, but we're very specifically trying to prevent
>> >> > dsputil from ever happening again.
>> >> >
>> >> > Ronald
>> >>
>> >> Some functions in this dsp are used only by huffyuv. Only one is used
>> >> by
>> >> both huffyuv and magicyuv.
>> >> To properly apply what you mention, it would need to be split in two,
>> >> huffyuvdsp and lldsp, then this new function added to a new dsp called
>> >> magicyuvdsp.
>> >
>> >
>> > That would be even better, yes.
>>
>> What about yasm code?
>>
>> I wanted that to be commented.
>
>
> It's like dithering, it uses the immediately adjacent pixel in the next
> loop iteration, can you really simd this effectively?

Apparently, and someone is making money from it.
Ronald S. Bultje Dec. 25, 2016, 4:11 p.m. UTC | #7
Hi,

On Sat, Dec 24, 2016 at 9:29 AM, Paul B Mahol <onemda@gmail.com> wrote:

> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
> > Hi,
> >
> > On Sat, Dec 24, 2016 at 6:09 AM, Paul B Mahol <onemda@gmail.com> wrote:
> >
> >> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
> >> > Hi,
> >> >
> >> > On Fri, Dec 23, 2016 at 6:18 PM, James Almer <jamrial@gmail.com>
> wrote:
> >> >
> >> >> On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
> >> >> > Hi,
> >> >> >
> >> >> > On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com>
> >> wrote:
> >> >> >
> >> >> >> diff --git a/libavcodec/lossless_videodsp.h
> b/libavcodec/lossless_
> >> >> >> videodsp.h
> >> >> >>
> >> >> > [..]
> >> >> >
> >> >> >> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
> >> >> >>
> >> >> > [..]
> >> >> >
> >> >> >> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const
> >> uint16_t
> >> >> >> *top, const uint16_t *diff, unsigned mask, int w, int *left, int
> >> >> *left_top);
> >> >> >>
> >> >> >
> >> >> > That seems wrong. Why would you add a magicuv-specific function to
> >> >> > losslessdsp-context which is intended for functions shared between
> >> many
> >> >> > (not just one) lossless codecs? You probably want a new dsp for
> >> magicyuv
> >> >> > specifically.
> >> >> >
> >> >> > I know this is tedious, but we're very specifically trying to
> prevent
> >> >> > dsputil from ever happening again.
> >> >> >
> >> >> > Ronald
> >> >>
> >> >> Some functions in this dsp are used only by huffyuv. Only one is used
> >> >> by
> >> >> both huffyuv and magicyuv.
> >> >> To properly apply what you mention, it would need to be split in two,
> >> >> huffyuvdsp and lldsp, then this new function added to a new dsp
> called
> >> >> magicyuvdsp.
> >> >
> >> >
> >> > That would be even better, yes.
> >>
> >> What about yasm code?
> >>
> >> I wanted that to be commented.
> >
> >
> > It's like dithering, it uses the immediately adjacent pixel in the next
> > loop iteration, can you really simd this effectively?
>
> Apparently, and someone is making money from it.


The parallelizable portion of it is the top-topleft, and you seem to do
that already. Other than that, I don't see much to be done. You can
probably use some mmxext instructions like pshufw to make life easier, but
I think you'll always be limited by the inherent limitation.

Ronald
James Almer Dec. 25, 2016, 6:14 p.m. UTC | #8
On 12/25/2016 1:11 PM, Ronald S. Bultje wrote:
> Hi,
> 
> On Sat, Dec 24, 2016 at 9:29 AM, Paul B Mahol <onemda@gmail.com> wrote:
> 
>> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
>>> Hi,
>>>
>>> On Sat, Dec 24, 2016 at 6:09 AM, Paul B Mahol <onemda@gmail.com> wrote:
>>>
>>>> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
>>>>> Hi,
>>>>>
>>>>> On Fri, Dec 23, 2016 at 6:18 PM, James Almer <jamrial@gmail.com>
>> wrote:
>>>>>
>>>>>> On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com>
>>>> wrote:
>>>>>>>
>>>>>>>> diff --git a/libavcodec/lossless_videodsp.h
>> b/libavcodec/lossless_
>>>>>>>> videodsp.h
>>>>>>>>
>>>>>>> [..]
>>>>>>>
>>>>>>>> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
>>>>>>>>
>>>>>>> [..]
>>>>>>>
>>>>>>>> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const
>>>> uint16_t
>>>>>>>> *top, const uint16_t *diff, unsigned mask, int w, int *left, int
>>>>>> *left_top);
>>>>>>>>
>>>>>>>
>>>>>>> That seems wrong. Why would you add a magicuv-specific function to
>>>>>>> losslessdsp-context which is intended for functions shared between
>>>> many
>>>>>>> (not just one) lossless codecs? You probably want a new dsp for
>>>> magicyuv
>>>>>>> specifically.
>>>>>>>
>>>>>>> I know this is tedious, but we're very specifically trying to
>> prevent
>>>>>>> dsputil from ever happening again.
>>>>>>>
>>>>>>> Ronald
>>>>>>
>>>>>> Some functions in this dsp are used only by huffyuv. Only one is used
>>>>>> by
>>>>>> both huffyuv and magicyuv.
>>>>>> To properly apply what you mention, it would need to be split in two,
>>>>>> huffyuvdsp and lldsp, then this new function added to a new dsp
>> called
>>>>>> magicyuvdsp.
>>>>>
>>>>>
>>>>> That would be even better, yes.
>>>>
>>>> What about yasm code?
>>>>
>>>> I wanted that to be commented.
>>>
>>>
>>> It's like dithering, it uses the immediately adjacent pixel in the next
>>> loop iteration, can you really simd this effectively?
>>
>> Apparently, and someone is making money from it.
> 
> 
> The parallelizable portion of it is the top-topleft, and you seem to do
> that already. Other than that, I don't see much to be done. You can
> probably use some mmxext instructions like pshufw to make life easier, but
> I think you'll always be limited by the inherent limitation.
> 
> Ronald

He can turn the movq + psrlq + psllq + por at the end of the loop into two
movq + palignr for an ssse3 version of the function (still using mmx regs),
but not much more than that, I guess.
And even that will probably not make a noticeable difference, assuming it's
actually faster.
James Almer Dec. 28, 2016, 1:19 a.m. UTC | #9
On 12/25/2016 3:14 PM, James Almer wrote:
> On 12/25/2016 1:11 PM, Ronald S. Bultje wrote:
>> Hi,
>>
>> On Sat, Dec 24, 2016 at 9:29 AM, Paul B Mahol <onemda@gmail.com> wrote:
>>
>>> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
>>>> Hi,
>>>>
>>>> On Sat, Dec 24, 2016 at 6:09 AM, Paul B Mahol <onemda@gmail.com> wrote:
>>>>
>>>>> On 12/24/16, Ronald S. Bultje <rsbultje@gmail.com> wrote:
>>>>>> Hi,
>>>>>>
>>>>>> On Fri, Dec 23, 2016 at 6:18 PM, James Almer <jamrial@gmail.com>
>>> wrote:
>>>>>>
>>>>>>> On 12/23/2016 8:00 PM, Ronald S. Bultje wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> On Fri, Dec 23, 2016 at 12:32 PM, Paul B Mahol <onemda@gmail.com>
>>>>> wrote:
>>>>>>>>
>>>>>>>>> diff --git a/libavcodec/lossless_videodsp.h
>>> b/libavcodec/lossless_
>>>>>>>>> videodsp.h
>>>>>>>>>
>>>>>>>> [..]
>>>>>>>>
>>>>>>>>> @@ -32,6 +32,7 @@ typedef struct LLVidDSPContext {
>>>>>>>>>
>>>>>>>> [..]
>>>>>>>>
>>>>>>>>> +    void (*add_magy_median_pred_int16)(uint16_t *dst, const
>>>>> uint16_t
>>>>>>>>> *top, const uint16_t *diff, unsigned mask, int w, int *left, int
>>>>>>> *left_top);
>>>>>>>>>
>>>>>>>>
>>>>>>>> That seems wrong. Why would you add a magicuv-specific function to
>>>>>>>> losslessdsp-context which is intended for functions shared between
>>>>> many
>>>>>>>> (not just one) lossless codecs? You probably want a new dsp for
>>>>> magicyuv
>>>>>>>> specifically.
>>>>>>>>
>>>>>>>> I know this is tedious, but we're very specifically trying to
>>> prevent
>>>>>>>> dsputil from ever happening again.
>>>>>>>>
>>>>>>>> Ronald
>>>>>>>
>>>>>>> Some functions in this dsp are used only by huffyuv. Only one is used
>>>>>>> by
>>>>>>> both huffyuv and magicyuv.
>>>>>>> To properly apply what you mention, it would need to be split in two,
>>>>>>> huffyuvdsp and lldsp, then this new function added to a new dsp
>>> called
>>>>>>> magicyuvdsp.
>>>>>>
>>>>>>
>>>>>> That would be even better, yes.
>>>>>
>>>>> What about yasm code?
>>>>>
>>>>> I wanted that to be commented.
>>>>
>>>>
>>>> It's like dithering, it uses the immediately adjacent pixel in the next
>>>> loop iteration, can you really simd this effectively?
>>>
>>> Apparently, and someone is making money from it.
>>
>>
>> The parallelizable portion of it is the top-topleft, and you seem to do
>> that already. Other than that, I don't see much to be done. You can
>> probably use some mmxext instructions like pshufw to make life easier, but
>> I think you'll always be limited by the inherent limitation.
>>
>> Ronald
> 
> He can turn the movq + psrlq + psllq + por at the end of the loop into two
> movq + palignr for an ssse3 version of the function (still using mmx regs),
> but not much more than that i guess.
> And even that will probably not make a noticeable difference, assuming it's
> actually faster.

Looks like it's about 3% faster.
diff mbox

Patch

diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
index 3491621..15189f1 100644
--- a/libavcodec/lossless_videodsp.c
+++ b/libavcodec/lossless_videodsp.c
@@ -77,6 +77,23 @@  static void add_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src, con
     *left_top = lt;
 }
 
+static void add_magy_median_pred_int16_c(uint16_t *dst, const uint16_t *src, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top){ /* reconstructs w samples into dst from the row in src plus the residuals in diff; *left and *left_top carry state in and out */
+    int i;
+    uint16_t l, lt; /* running left and top-left samples, truncated to 16 bits */
+
+    l  = *left; /* seed both from the caller-supplied state */
+    lt = *left_top;
+
+    for(i=0; i<w; i++){
+        l  = (mid_pred(l, src[i], (l + src[i] - lt)) + diff[i]) & mask; /* mid_pred() of left, top and left + top - topleft, then add the residual; the mask is applied after the add */
+        lt = src[i]; /* the current top sample is the next sample's top-left */
+        dst[i] = l; /* diff[i] is read before dst[i] is written, so the visible caller can pass the same buffer for both */
+    }
+
+    *left     = l; /* hand the final state back to the caller */
+    *left_top = lt;
+}
+
 static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
     int i;
     uint16_t l, lt;
@@ -122,6 +139,7 @@  void ff_llviddsp_init(LLVidDSPContext *c, AVCodecContext *avctx)
     c->add_hfyu_left_pred_int16   = add_hfyu_left_pred_int16_c;
     c->add_hfyu_median_pred_int16 = add_hfyu_median_pred_int16_c;
     c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
+    c->add_magy_median_pred_int16 = add_magy_median_pred_int16_c;
 
     if (ARCH_X86)
         ff_llviddsp_init_x86(c, avctx);
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
index 040902e..c7a6881 100644
--- a/libavcodec/lossless_videodsp.h
+++ b/libavcodec/lossless_videodsp.h
@@ -32,6 +32,7 @@  typedef struct LLVidDSPContext {
     void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
     void (*add_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
     int  (*add_hfyu_left_pred_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned left);
+    void (*add_magy_median_pred_int16)(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
 } LLVidDSPContext;
 
 void ff_llviddsp_init(LLVidDSPContext *llviddsp, AVCodecContext *avctx);
diff --git a/libavcodec/magicyuv.c b/libavcodec/magicyuv.c
index 16d7027..f65c434 100644
--- a/libavcodec/magicyuv.c
+++ b/libavcodec/magicyuv.c
@@ -144,27 +144,6 @@  static int huff_build(VLC *vlc, uint8_t *len)
                               syms,  sizeof(*syms),  sizeof(*syms), 0);
 }
 
-static void magicyuv_median_pred10(uint16_t *dst, const uint16_t *src1,
-                                   const uint16_t *diff, intptr_t w,
-                                   int *left, int *left_top)
-{
-    int i;
-    uint16_t l, lt;
-
-    l  = *left;
-    lt = *left_top;
-
-    for (i = 0; i < w; i++) {
-        l      = mid_pred(l, src1[i], (l + src1[i] - lt)) + diff[i];
-        l     &= 0x3FF;
-        lt     = src1[i];
-        dst[i] = l;
-    }
-
-    *left     = l;
-    *left_top = lt;
-}
-
 static int magy_decode_slice10(AVCodecContext *avctx, void *tdata,
                                int j, int threadnr)
 {
@@ -265,7 +244,7 @@  static int magy_decode_slice10(AVCodecContext *avctx, void *tdata,
                 dst += stride;
             }
             for (k = 1 + interlaced; k < height; k++) {
-                magicyuv_median_pred10(dst, dst - fake_stride, dst, width, &left, &lefttop);
+                s->llviddsp.add_magy_median_pred_int16(dst, dst - fake_stride, dst, 1023, width, &left, &lefttop);
                 lefttop = left = dst[0];
                 dst += stride;
             }
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index f06fcdf..8a2eb26 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -292,3 +292,65 @@  cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_
     movzx maskd, word [src2q + wq - 2]
     mov [leftq], maskd
     RET
+
+cglobal add_magy_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+    add      wd, wd ; double w: with 16-bit samples this is the row length in bytes
+    movd    mm6, maskd
+    SPLATW  mm6, mm6
+    movq    mm0, [topq] ; first four top samples
+    movq    mm2, mm0
+    movd    mm4, [left_topq] ; NOTE(review): loads 32 bits; assumes *left_top fits in the low word - confirm
+    psllq   mm2, 16 ; move the top samples up by one word lane
+    movq    mm1, mm0 ; t
+    por     mm4, mm2 ; tl: *left_top in the low word, shifted top samples above it
+    movd    mm3, [leftq] ; l
+    psubw   mm0, mm4 ; t-tl
+    add    dstq, wq ; point dst/top/diff at the end of the row ...
+    add    topq, wq
+    add   diffq, wq
+    neg      wq ; ... and index them with a negative offset that counts up to zero
+    jmp .skip ; first group: t and t-tl were already set up above
+.loop:
+    movq    mm4, [topq+wq] ; next four top samples
+    movq    mm0, mm4
+    psllq   mm4, 16
+    por     mm4, mm1 ; tl: low word is what remains of the previous t after its three shifts
+    movq    mm1, mm0 ; t
+    psubw   mm0, mm4 ; t-tl
+.skip:
+    movq    mm2, [diffq+wq] ; four residuals
+%assign i 0
+%rep 4
+    movq    mm4, mm0
+    paddw   mm4, mm3 ; t-tl+l
+    movq    mm5, mm3
+    pmaxsw  mm3, mm1 ; max(l, t); signed word compare - NOTE(review): presumably fine for the <16-bit depths the init code gates on
+    pminsw  mm5, mm1 ; min(l, t)
+    pminsw  mm3, mm4 ; min(max(l, t), t-tl+l)
+    pmaxsw  mm3, mm5 ; median
+    paddw   mm3, mm2 ; +residual
+    pand    mm3, mm6 ; apply mask; the low word is the new l for the next sample
+%if i==0
+    movq    mm7, mm3
+    psllq   mm7, 48 ; keep only the low word, parked in the top lane
+%else
+    movq    mm4, mm3
+    psrlq   mm7, 16 ; slide earlier results down one lane
+    psllq   mm4, 48
+    por     mm7, mm4 ; insert this result in the top lane
+%endif
+%if i<3
+    psrlq   mm0, 16 ; drop the finished lane so the next sample sits in the low word
+    psrlq   mm1, 16
+    psrlq   mm2, 16
+%endif
+%assign i i+1
+%endrep
+    movq [dstq+wq], mm7 ; store four results, first sample in the lowest lane
+    add      wq, 8 ; advance by four 16-bit samples
+    jl .loop
+    movzx   r2d, word [dstq-2] ; dstq points at the row end, so this is the last sample written
+    mov [leftq], r2d
+    movzx   r2d, word [topq-2] ; last top sample of the row
+    mov [left_topq], r2d
+    RET
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 548d043..8112c70 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -30,6 +30,7 @@  int ff_add_hfyu_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsign
 int ff_add_hfyu_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
 void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
 void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+void ff_add_magy_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
 
 
 void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
@@ -44,6 +45,7 @@  void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
 
     if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
         c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
+        c->add_magy_median_pred_int16 = ff_add_magy_median_pred_int16_mmxext;
         c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
     }