@@ -102,6 +102,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct = ff_simple_idct8_sse2;
+ c->idct_put = ff_simple_idct8_put_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
@@ -113,6 +114,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct = ff_simple_idct8_avx;
+ c->idct_put = ff_simple_idct8_put_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
@@ -32,6 +32,9 @@ void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
void ff_simple_idct8_sse2(int16_t *block);
void ff_simple_idct8_avx(int16_t *block);
+void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); // installed as c->idct_put next to ff_simple_idct8_sse2 in ff_idctdsp_init_x86()
+void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); // installed as c->idct_put next to ff_simple_idct8_avx in ff_idctdsp_init_x86()
+
void ff_simple_idct10_sse2(int16_t *block);
void ff_simple_idct10_avx(int16_t *block);
@@ -71,11 +71,34 @@ CONST_DEC w7_min_w5, W7sh2, -W5sh2
SECTION .text
+%macro STORE_HI_LO 12 ; args 1-8: eight 64-bit store destinations, args 9-12: four source vector registers
+ movq %1, %9 ; low quadword of each source goes to destinations 1, 3, 5, 7
+ movq %3, %10
+ movq %5, %11
+ movq %7, %12
+ movhps %2, %9 ; high quadword of the same source goes to destinations 2, 4, 6, 8
+ movhps %4, %10
+ movhps %6, %11
+ movhps %8, %12
+%endmacro
+
%macro idct_fn 0
cglobal simple_idct8, 1, 1, 16, block
IDCT_FN "", 11, pw_round_20_div_w4, 20, "store"
RET
+; TODO: optimise by not writing the final data to the block.
+cglobal simple_idct8_put, 3, 4, 16, pixels, lsize, block ; x86inc cglobal: 3 arguments, 4 GPRs, 16 vector registers
+ IDCT_FN "", 11, pw_round_20_div_w4, 20 ; same arguments as simple_idct8, minus its trailing store argument
+ lea r3, [3*lsizeq] ; r3 = 3 * lsize
+ lea r2, [pixelsq + r3] ; r2 = pixels + 3 * lsize; NOTE(review): r2 presumably aliases the block argument - confirm block is dead after IDCT_FN
+ packuswb m8, m0 ; saturate signed words to unsigned bytes: m8 -> low quadword, m0 -> high quadword of m8
+ packuswb m1, m2 ; likewise m1 (low) / m2 (high)
+ packuswb m4, m11 ; likewise m4 (low) / m11 (high)
+ packuswb m9, m10 ; likewise m9 (low) / m10 (high)
+ STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9 ; eight quadword stores; PASS8ROWS is defined elsewhere, presumably expanding to the 8 row addresses - TODO confirm order
+RET
+
cglobal simple_idct10, 1, 1, 16, block
IDCT_FN "", 12, "", 19, "store"
RET