@@ -54,7 +54,13 @@ void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \
const uint8_t nnzc[5 * 8]); \
void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \
int16_t *s, int stride, \
- const uint8_t nnzc[5 * 8]);
+ const uint8_t nnzc[5 * 8]); \
+void ff_h264_idct4_add8_##depth##_rvv(uint8_t **d, const int *soffset, \
+ int16_t *s, int stride, \
+ const uint8_t nnzc[5 * 8]); \
+void ff_h264_idct4_add8_422_##depth##_rvv(uint8_t **d, const int *soffset, \
+ int16_t *s, int stride, \
+ const uint8_t nnzc[5 * 8]);
IDCT_DEPTH(8)
IDCT_DEPTH(9)
@@ -104,6 +110,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
# if __riscv_xlen == 64
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
+ if (chroma_format_idc <= 1)
+ dsp->h264_idct_add8 = ff_h264_idct4_add8_8_rvv;
+ else
+ dsp->h264_idct_add8 = ff_h264_idct4_add8_422_8_rvv;
# endif
}
if (flags & AV_CPU_FLAG_RVV_I64) {
@@ -123,10 +133,16 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_##depth##_rvv; \
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \
+ dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
+ dsp->h264_idct_add16intra = \
+ ff_h264_idct_add16intra_##depth##_rvv; \
if (__riscv_xlen == 64) { \
- dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
- dsp->h264_idct_add16intra = \
- ff_h264_idct_add16intra_##depth##_rvv; \
+ if (chroma_format_idc <= 1) \
+ dsp->h264_idct_add8 = \
+ ff_h264_idct4_add8_##depth##_rvv; \
+ else \
+ dsp->h264_idct_add8 = \
+ ff_h264_idct4_add8_422_##depth##_rvv; \
} \
} \
if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB)) \
@@ -57,7 +57,7 @@ endfunc
func ff_h264_idct_add_8_rvv, zve32x
lpad 0
csrwi vxrm, 0
-.Lidct_add4_8_rvv:
+.Lidct4_add_8_rvv:
vsetivli zero, 4, e16, mf2, ta, ma
addi t1, a1, 1 * 4 * 2
vle16.v v0, (a1)
@@ -111,7 +111,7 @@ endfunc
func ff_h264_idct_add_16_rvv, zve32x
csrwi vxrm, 0
-.Lidct_add4_16_rvv:
+.Lidct4_add_16_rvv:
vsetivli zero, 4, e32, m1, ta, ma
addi t1, a1, 1 * 4 * 4
vle32.v v0, (a1)
@@ -543,19 +543,26 @@ endfunc
.endr
const ff_h264_scan8
- .byte 014, 015, 024, 025, 016, 017, 026, 027
- .byte 034, 035, 044, 045, 036, 037, 046, 047
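+ # scan8[i] = grid row * 8 + column, written in octal so the low digit is the
+ # column and the upper digits the row; entries 16-47 cover the chroma blocks.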
+ .byte  014,  015,  024,  025,  016,  017,  026,  027
+ .byte  034,  035,  044,  045,  036,  037,  046,  047
+ .byte  064,  065,  074,  075,  066,  067,  076,  077
+ .byte 0104, 0105, 0114, 0115, 0106, 0107, 0116, 0117
+ .byte 0134, 0135, 0144, 0145, 0136, 0137, 0146, 0147
+ .byte 0154, 0155, 0164, 0165, 0156, 0157, 0166, 0167
endconst
-.macro idct4_adds type, depth
+.macro idct4_add16 type, depth
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
.if \depth == 8
lpad 0
.endif
csrwi vxrm, 0
lla t0, ff_h264_scan8
- li t1, 32 * (\depth / 8)
vsetivli zero, 16, e8, m1, ta, ma
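+ # .Lidct4_add4 (emitted for the 16intra variant below) is reused by the
+ # chroma add8 functions: callers set up a0-a4, point t0 at the right
+ # ff_h264_scan8 slice and set vl beforehand (16 here for luma, 4 per chroma pass).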
+.ifc \type, 16intra
+.Lidct4_add4_\depth\()_rvv:
+.endif
+ li t1, 32 * (\depth / 8)
vle8.v v8, (t0)
.if \depth == 8
vlse16.v v16, (a2), t1
@@ -587,7 +594,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
mv t5, a1
mv a1, a2
mv a2, a3
- li a3, 16
+ csrr a3, vl # loop over vl blocks (16 for luma, 4 per chroma pass)
mv a7, ra
1:
andi t0, a4, 1
@@ -603,7 +610,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
.else
beqz t0, 2f # if (nnzc[scan8[i]])
.endif
- jal .Lidct_add4_\depth\()_rvv
+ jal .Lidct4_add_\depth\()_rvv
j 3f
2:
.ifnc \type, 16
@@ -621,9 +628,67 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
endfunc
.endm
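+
+# h264_idct_add8: add the inverse transforms of the coded chroma blocks.
+# Each pass runs 4 blocks of one plane through .Lidct4_add4: Cb 16-19 and
+# Cr 32-35 for 4:2:0, plus Cb 20-23 and Cr 36-39 for 4:2:2.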
+.macro idct4_add8 type, depth
+func ff_h264_idct4_add\type\()_\depth\()_rvv, zve32x
+.if \depth == 8
+ lpad 0
+.endif
+ csrwi vxrm, 0
+ addi sp, sp, -32
+ addi a2, a2, 16 * 16 * 2 * (\depth / 8) # &block[16 * 16]
+ lla t0, ff_h264_scan8 + 16
+ sd s0, 0(sp)
+ sd ra, 8(sp)
+ mv s0, sp
+ sd a0, 16(sp) # dest
+ sd a4, 24(sp) # nnzc
+ ld a0, 0(a0) # dest[0]
+ addi a1, a1, 16 * 4 # &block_offset[16]
+ vsetivli zero, 4, e8, mf4, ta, ma
+ jal .Lidct4_add4_\depth\()_rvv
+
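+ # second pass: Cr blocks 32-35, added to dest[1]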
+ ld a4, 24(sp) # nnzc
+ ld a0, 16(sp)
+ mv a3, a2 # stride (left in a2 by the shared loop)
+ addi a2, a1, (16 - 4) * 16 * 2 * (\depth / 8) # &block[32 * 16]
+ addi a1, t5, (16 - 4) * 4 # &block_offset[32]
+ ld a0, 8(a0) # dest[1]
+ lla t0, ff_h264_scan8 + 32
+.ifc \type, 8_422
+ vsetivli zero, 4, e8, mf4, ta, ma
+ jal .Lidct4_add4_\depth\()_rvv
+
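+ # 4:2:2 only: third pass, Cb blocks 20-23 (offset/nnzc indices 24-27), dest[0]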
+ ld a4, 24(sp) # nnzc
+ ld a0, 16(sp)
+ mv a3, a2 # stride
+ addi a2, a1, (-12 - 4) * 16 * 2 * (\depth / 8) # &block[20 * 16]
+ addi a1, t5, (-8 - 4) * 4 # &block_offset[24]
+ ld a0, 0(a0) # dest[0]
+ lla t0, ff_h264_scan8 + 24
+ vsetivli zero, 4, e8, mf4, ta, ma
+ jal .Lidct4_add4_\depth\()_rvv
+
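+ # 4:2:2 only: fourth pass, Cr blocks 36-39 (offset/nnzc indices 40-43), dest[1]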
+ ld a4, 24(sp) # nnzc
+ ld a0, 16(sp)
+ mv a3, a2 # stride
+ addi a2, a1, (16 - 4) * 16 * 2 * (\depth / 8) # &block[36 * 16]
+ addi a1, t5, (16 - 4) * 4 # &block_offset[40]
+ ld a0, 8(a0) # dest[1]
+ lla t0, ff_h264_scan8 + 40
+.endif
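+ # the remaining pass runs via tail call once the stack has been restored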
+ ld ra, 8(sp)
+ ld s0, 0(sp)
+ addi sp, sp, 32
+ vsetivli zero, 4, e8, mf4, ta, ma
+ j .Lidct4_add4_\depth\()_rvv
+endfunc
+.endm
+
.irp depth, 8, 16
-idct4_adds 16, \depth
-idct4_adds 16intra, \depth
+idct4_add16 16, \depth
+idct4_add16 16intra, \depth
+idct4_add8 8, \depth
+idct4_add8 8_422, \depth
#if (__riscv_xlen == 64)
func ff_h264_idct8_add4_\depth\()_rvv, zve32x, b
@@ -724,5 +789,17 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
j ff_h264_idct8_add4_16_rvv
endfunc
+
+func ff_h264_idct4_add8_\depth\()_rvv, zve32x
+ lpad 0
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct4_add8_16_rvv
+endfunc
+
+func ff_h264_idct4_add8_422_\depth\()_rvv, zve32x
+ lpad 0
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct4_add8_422_16_rvv
+endfunc
#endif
.endr