From: Kostya Shishkov Date: Fri, 11 Aug 2023 16:27:48 +0000 (+0200) Subject: h264: more micro-optimisations X-Git-Url: https://git.nihav.org/?a=commitdiff_plain;h=fe64781def821c3900abf44bdfbb38f3b3d21345;p=nihav.git h264: more micro-optimisations * split IDCT function so it has only two parameters * evade instantiating frame references for each MB or even sub-block * other small code changes --- diff --git a/nihav-itu/src/codecs/h264/decoder_mt.rs b/nihav-itu/src/codecs/h264/decoder_mt.rs index 1824042..f66ce2f 100644 --- a/nihav-itu/src/codecs/h264/decoder_mt.rs +++ b/nihav-itu/src/codecs/h264/decoder_mt.rs @@ -39,17 +39,19 @@ impl FrameDecoder { let sslice_refs = SimplifiedSliceRefs::new(refs); let mut br = BitReader::new(&nal[hdr_size / 8..], BitReaderMode::BE); + let mut dst_pic = self.cur_pic.clone(); + let mut dst_frm = NASimpleVideoFrame::from_video_buf(&mut dst_pic.buf).unwrap(); if !self.pps.entropy_coding_mode { br.skip((hdr_size & 7) as u32)?; - self.decode_slice_cavlc(&mut br, full_size - (hdr_size & !7), hdr, &sslice_refs) + self.decode_slice_cavlc(&mut br, full_size - (hdr_size & !7), hdr, &sslice_refs, &mut dst_frm) } else { let csrc = &nal[(hdr_size + 7) / 8..]; validate!(csrc.len() >= 2); let mut cabac = CABAC::new(csrc, hdr.slice_type, hdr.slice_qp, hdr.cabac_init_idc as usize)?; - self.decode_slice_cabac(&mut cabac, hdr, &sslice_refs) + self.decode_slice_cabac(&mut cabac, hdr, &sslice_refs, &mut dst_frm) } } - fn decode_slice_cavlc(&mut self, br: &mut BitReader, full_size: usize, slice_hdr: &SliceHeader, refs: &SimplifiedSliceRefs) -> DecoderResult { + fn decode_slice_cavlc(&mut self, br: &mut BitReader, full_size: usize, slice_hdr: &SliceHeader, refs: &SimplifiedSliceRefs, frm: &mut NASimpleVideoFrame) -> DecoderResult { const INTRA_CBP: [u8; 48] = [ 47, 31, 15, 0, 23, 27, 29, 30, 7, 11, 13, 14, 39, 43, 45, 46, 16, 3, 5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44, 1, 2, 4, @@ -79,7 +81,7 @@ impl FrameDecoder { validate!(mb_idx + mb_skip_run <= self.num_mbs); mb_info.mb_type = skip_type; for _ in 0..mb_skip_run { - self.handle_macroblock(slice_hdr, &mut mb_info, refs)?; + self.handle_macroblock(slice_hdr, &mut mb_info, refs, frm)?; mb_idx += 1; } if mb_idx == self.num_mbs || br.tell() >= full_size { @@ -145,7 +147,7 @@ impl FrameDecoder { decode_residual_cavlc(br, &mut self.sstate, &mut mb_info, &self.cavlc_cb)?; } } - self.handle_macroblock(slice_hdr, &mut mb_info, refs)?; + self.handle_macroblock(slice_hdr, &mut mb_info, refs, frm)?; } mb_idx += 1; if let Ok(disp) = self.dispatch.read() { @@ -154,7 +156,7 @@ impl FrameDecoder { } Ok(mb_idx) } - fn decode_slice_cabac(&mut self, cabac: &mut CABAC, slice_hdr: &SliceHeader, refs: &SimplifiedSliceRefs) -> DecoderResult { + fn decode_slice_cabac(&mut self, cabac: &mut CABAC, slice_hdr: &SliceHeader, refs: &SimplifiedSliceRefs, frm: &mut NASimpleVideoFrame) -> DecoderResult { let mut mb_idx = slice_hdr.first_mb_in_slice; let mut prev_mb_skipped = false; let skip_type = if slice_hdr.slice_type.is_p() { MBType::PSkip } else { MBType::BSkip }; @@ -250,7 +252,7 @@ impl FrameDecoder { mb_info.transform_size_8x8 = false; last_qp_diff = false; } - self.handle_macroblock(slice_hdr, &mut mb_info, refs)?; + self.handle_macroblock(slice_hdr, &mut mb_info, refs, frm)?; prev_mb_skipped = mb_skip; if !(self.is_mbaff && ((mb_idx & 1) == 0)) && cabac.decode_terminate() { if let Ok(disp) = self.dispatch.read() { @@ -266,7 +268,7 @@ impl FrameDecoder { Err(DecoderError::InvalidData) } #[allow(clippy::cognitive_complexity)] - fn handle_macroblock(&mut self, slice_hdr: &SliceHeader, mb_info: &mut CurrentMBInfo, refs: &SimplifiedSliceRefs) -> DecoderResult<()> { + fn handle_macroblock(&mut self, slice_hdr: &SliceHeader, mb_info: &mut CurrentMBInfo, refs: &SimplifiedSliceRefs, frm: &mut NASimpleVideoFrame) -> DecoderResult<()> { let qp_y = mb_info.qp_y; let qpr = ((qp_y as i8) + self.pps.chroma_qp_index_offset).max(0).min(51) as usize; let qp_u = CHROMA_QUANTS[qpr]; @@ -291,16 +293,31 @@ impl FrameDecoder { } if !mb_info.transform_size_8x8 { let quant_dc = !mb_info.mb_type.is_intra16x16(); - for i in 0..16 { - if mb_info.coded[i] { - if !tx_bypass { - idct(&mut mb_info.coeffs[i], qp_y, quant_dc); + if quant_dc { + for i in 0..16 { + if mb_info.coded[i] { + if !tx_bypass { + idct(&mut mb_info.coeffs[i], qp_y); + } + } else if has_dc { + if !tx_bypass { + idct_dc(&mut mb_info.coeffs[i], qp_y, quant_dc); + } + mb_info.coded[i] = true; } - } else if has_dc { - if !tx_bypass { - idct_dc(&mut mb_info.coeffs[i], qp_y, quant_dc); + } + } else { + for i in 0..16 { + if mb_info.coded[i] { + if !tx_bypass { + idct_skip_dc(&mut mb_info.coeffs[i], qp_y); + } + } else if has_dc { + if !tx_bypass { + idct_dc(&mut mb_info.coeffs[i], qp_y, quant_dc); + } + mb_info.coded[i] = true; } - mb_info.coded[i] = true; } } } else { @@ -320,7 +337,7 @@ impl FrameDecoder { let blk_no = 16 + chroma * 4 + i; mb_info.coeffs[blk_no][0] = mb_info.chroma_dc[chroma][i]; if mb_info.coded[blk_no] { - idct(&mut mb_info.coeffs[blk_no], qp_c, false); + idct_skip_dc(&mut mb_info.coeffs[blk_no], qp_c); } else if mb_info.coeffs[blk_no][0] != 0 { idct_dc(&mut mb_info.coeffs[blk_no], qp_c, false); mb_info.coded[blk_no] = true; @@ -347,7 +364,6 @@ impl FrameDecoder { let xpos = self.sstate.mb_x * 16; let ypos = self.sstate.mb_y * 16; - let mut frm = NASimpleVideoFrame::from_video_buf(&mut self.cur_pic.buf).unwrap(); if mb_info.mb_type != MBType::PCM { let weight_mode = if self.pps.weighted_pred && slice_hdr.slice_type.is_p() { 1 @@ -356,7 +372,7 @@ impl FrameDecoder { } else { 0 }; - recon_mb_mt(&mut frm, slice_hdr, mb_info, &mut self.sstate, refs, &mut self.mc_dsp, weight_mode, &self.dispatch)?; + recon_mb_mt(frm, slice_hdr, mb_info, &mut self.sstate, refs, &mut self.mc_dsp, weight_mode, &self.dispatch)?; } else { for (dline, src) in frm.data[frm.offset[0] + xpos + ypos * frm.stride[0]..].chunks_mut(frm.stride[0]).take(16).zip(self.ipcm_buf.chunks(16)) { dline[..16].copy_from_slice(src); @@ -368,7 +384,7 @@ impl FrameDecoder { dline[..8].copy_from_slice(src); } } - self.sstate.save_ipred_context(&frm); + self.sstate.save_ipred_context(frm); let mv_info = &mut self.cur_pic.mv_info; let mb_pos = self.sstate.mb_x + self.sstate.mb_y * mv_info.mb_stride; diff --git a/nihav-itu/src/codecs/h264/decoder_st.rs b/nihav-itu/src/codecs/h264/decoder_st.rs index b9b7308..94b3977 100644 --- a/nihav-itu/src/codecs/h264/decoder_st.rs +++ b/nihav-itu/src/codecs/h264/decoder_st.rs @@ -218,15 +218,23 @@ println!("PAFF?"); self.transform_8x8_mode = pps.transform_8x8_mode; self.sstate.reset(sps.pic_width_in_mbs, sps.pic_height_in_mbs, slice_hdr.first_mb_in_slice); + + let mut dst_pic = if let Some(ref pic) = self.cur_pic { + pic.clone() + } else { + return Err(DecoderError::InvalidData); + }; + let mut dst_frm = NASimpleVideoFrame::from_video_buf(&mut dst_pic.buf).unwrap(); + let dst_mv_info = &mut dst_pic.mv_info; if !pps.entropy_coding_mode { - self.has_pic = self.decode_slice_cavlc(&mut br, &slice_hdr, full_size)?; + self.has_pic = self.decode_slice_cavlc(&mut br, &slice_hdr, full_size, &mut dst_frm, dst_mv_info)?; } else { br.align(); let start = br.tell() / 8; let csrc = &src[start..]; validate!(csrc.len() >= 2); let mut cabac = CABAC::new(csrc, slice_hdr.slice_type, slice_hdr.slice_qp, slice_hdr.cabac_init_idc as usize)?; - self.has_pic = self.decode_slice_cabac(&mut cabac, &slice_hdr)?; + self.has_pic = self.decode_slice_cabac(&mut cabac, &slice_hdr, &mut dst_frm, dst_mv_info)?; } }, 2 => { // slice data partition A @@ -337,7 +345,7 @@ println!("PAFF?"); } } #[allow(clippy::cognitive_complexity)] - fn handle_macroblock(&mut self, slice_hdr: &SliceHeader, mb_info: &mut CurrentMBInfo, slice_refs: &SimplifiedSliceRefs) { + fn handle_macroblock(&mut self, slice_hdr: &SliceHeader, mb_info: &mut CurrentMBInfo, slice_refs: &SimplifiedSliceRefs, frm: &mut NASimpleVideoFrame, mv_info: &mut FrameMV) { let pps = &self.pps[self.cur_pps]; let qp_y = mb_info.qp_y; @@ -365,12 +373,23 @@ println!("PAFF?"); if !tx_bypass { if !mb_info.transform_size_8x8 { let quant_dc = !mb_info.mb_type.is_intra16x16(); - for (coded, coeffs) in mb_info.coded[..16].iter_mut().zip(mb_info.coeffs[..16].iter_mut()) { - if *coded { - idct(coeffs, qp_y, quant_dc); - } else if has_dc { - idct_dc(coeffs, qp_y, quant_dc); - *coded = true; + if quant_dc { + for (coded, coeffs) in mb_info.coded[..16].iter_mut().zip(mb_info.coeffs[..16].iter_mut()) { + if *coded { + idct(coeffs, qp_y); + } else if has_dc { + idct_dc(coeffs, qp_y, quant_dc); + *coded = true; + } + } + } else { + for (coded, coeffs) in mb_info.coded[..16].iter_mut().zip(mb_info.coeffs[..16].iter_mut()) { + if *coded { + idct_skip_dc(coeffs, qp_y); + } else if has_dc { + idct_dc(coeffs, qp_y, quant_dc); + *coded = true; + } } } } else { @@ -397,7 +416,7 @@ println!("PAFF?"); let blk_no = 16 + chroma * 4 + i; mb_info.coeffs[blk_no][0] = mb_info.chroma_dc[chroma][i]; if mb_info.coded[blk_no] { - idct(&mut mb_info.coeffs[blk_no], qp_c, false); + idct_skip_dc(&mut mb_info.coeffs[blk_no], qp_c); } else if mb_info.coeffs[blk_no][0] != 0 { idct_dc(&mut mb_info.coeffs[blk_no], qp_c, false); mb_info.coded[blk_no] = true; @@ -416,28 +435,27 @@ println!("PAFF?"); let xpos = self.sstate.mb_x * 16; let ypos = self.sstate.mb_y * 16; - if let Some(ref mut pic) = self.cur_pic { - let mut frm = NASimpleVideoFrame::from_video_buf(&mut pic.buf).unwrap(); - if mb_info.mb_type != MBType::PCM { - let weight_mode = if self.pps[self.cur_pps].weighted_pred && slice_hdr.slice_type.is_p() { - 1 - } else if slice_hdr.slice_type.is_b() { - self.pps[self.cur_pps].weighted_bipred_idc - } else { - 0 - }; - recon_mb(&mut frm, slice_hdr, mb_info, &mut self.sstate, slice_refs, &mut self.mc_dsp, weight_mode); - } else { - for (dline, src) in frm.data[frm.offset[0] + xpos + ypos * frm.stride[0]..].chunks_mut(frm.stride[0]).take(16).zip(self.ipcm_buf.chunks(16)) { - dline[..16].copy_from_slice(src); - } - for (dline, src) in frm.data[frm.offset[1] + xpos/2 + ypos/2 * frm.stride[1]..].chunks_mut(frm.stride[1]).take(8).zip(self.ipcm_buf[256..].chunks(8)) { - dline[..8].copy_from_slice(src); - } - for (dline, src) in frm.data[frm.offset[2] + xpos/2 + ypos/2 * frm.stride[2]..].chunks_mut(frm.stride[2]).take(8).zip(self.ipcm_buf[256 + 64..].chunks(8)) { - dline[..8].copy_from_slice(src); - } + + if mb_info.mb_type != MBType::PCM { + let weight_mode = if self.pps[self.cur_pps].weighted_pred && slice_hdr.slice_type.is_p() { + 1 + } else if slice_hdr.slice_type.is_b() { + self.pps[self.cur_pps].weighted_bipred_idc + } else { + 0 + }; + recon_mb(frm, slice_hdr, mb_info, &mut self.sstate, slice_refs, &mut self.mc_dsp, weight_mode); + } else { + for (dline, src) in frm.data[frm.offset[0] + xpos + ypos * frm.stride[0]..].chunks_mut(frm.stride[0]).take(16).zip(self.ipcm_buf.chunks(16)) { + dline[..16].copy_from_slice(src); + } + for (dline, src) in frm.data[frm.offset[1] + xpos/2 + ypos/2 * frm.stride[1]..].chunks_mut(frm.stride[1]).take(8).zip(self.ipcm_buf[256..].chunks(8)) { + dline[..8].copy_from_slice(src); + } + for (dline, src) in frm.data[frm.offset[2] + xpos/2 + ypos/2 * frm.stride[2]..].chunks_mut(frm.stride[2]).take(8).zip(self.ipcm_buf[256 + 64..].chunks(8)) { + dline[..8].copy_from_slice(src); } + } /*match mb_info.mb_type { MBType::BSkip | MBType::Direct | MBType::B16x16(_) | MBType::B16x8(_, _) | MBType::B8x16(_, _) | MBType::B8x8 => { let dstride = frm.stride[0]; @@ -449,32 +467,27 @@ MBType::BSkip | MBType::Direct | MBType::B16x16(_) | MBType::B16x8(_, _) | MBTyp }, _ => {}, };*/ - self.sstate.save_ipred_context(&frm); + self.sstate.save_ipred_context(frm); + + let mb_pos = self.sstate.mb_x + self.sstate.mb_y * mv_info.mb_stride; + let mut mb = FrameMBInfo::new(); + mb.mb_type = mb_info.mb_type.into(); + for blk4 in 0..16 { + mb.mv[blk4] = self.sstate.get_cur_blk4(blk4).mv; } - if let Some(ref mut pic) = self.cur_pic { - let mv_info = &mut pic.mv_info; - let mb_pos = self.sstate.mb_x + self.sstate.mb_y * mv_info.mb_stride; - let mut mb = FrameMBInfo::new(); - mb.mb_type = mb_info.mb_type.into(); - for blk4 in 0..16 { - mb.mv[blk4] = self.sstate.get_cur_blk4(blk4).mv; - } - for blk8 in 0..4 { - mb.ref_poc[blk8] = slice_refs.map_refs(self.sstate.get_cur_blk8(blk8).ref_idx); - mb.ref_idx[blk8] = self.sstate.get_cur_blk8(blk8).ref_idx; - } - mv_info.mbs[mb_pos] = mb; + for blk8 in 0..4 { + mb.ref_poc[blk8] = slice_refs.map_refs(self.sstate.get_cur_blk8(blk8).ref_idx); + mb.ref_idx[blk8] = self.sstate.get_cur_blk8(blk8).ref_idx; } + mv_info.mbs[mb_pos] = mb; + if !self.deblock_skip && self.deblock_mode != 1 { self.sstate.fill_deblock(slice_refs, self.deblock_mode, self.is_s); - if let Some(ref mut pic) = self.cur_pic { - let mut frm = NASimpleVideoFrame::from_video_buf(&mut pic.buf).unwrap(); - loop_filter_mb(&mut frm, &self.sstate, self.lf_alpha, self.lf_beta); - } + loop_filter_mb(frm, &self.sstate, self.lf_alpha, self.lf_beta); } self.sstate.next_mb(); } - fn decode_slice_cavlc(&mut self, br: &mut BitReader, slice_hdr: &SliceHeader, full_size: usize) -> DecoderResult { + fn decode_slice_cavlc(&mut self, br: &mut BitReader, slice_hdr: &SliceHeader, full_size: usize, frm: &mut NASimpleVideoFrame, mv_info: &mut FrameMV) -> DecoderResult { const INTRA_CBP: [u8; 48] = [ 47, 31, 15, 0, 23, 27, 29, 30, 7, 11, 13, 14, 39, 43, 45, 46, 16, 3, 5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44, 1, 2, 4, @@ -508,7 +521,7 @@ _ => {}, validate!(mb_idx + mb_skip_run <= self.num_mbs); mb_info.mb_type = skip_type; for _ in 0..mb_skip_run { - self.handle_macroblock(slice_hdr, &mut mb_info, &sslice_refs); + self.handle_macroblock(slice_hdr, &mut mb_info, &sslice_refs, frm, mv_info); mb_idx += 1; } if mb_idx == self.num_mbs || br.tell() >= full_size { @@ -574,7 +587,7 @@ _ => {}, decode_residual_cavlc(br, &mut self.sstate, &mut mb_info, &self.cavlc_cb)?; } } - self.handle_macroblock(slice_hdr, &mut mb_info, &sslice_refs); + self.handle_macroblock(slice_hdr, &mut mb_info, &sslice_refs, frm, mv_info); } mb_idx += 1; } @@ -583,7 +596,7 @@ _ => {}, } Ok(mb_idx == self.num_mbs) } - fn decode_slice_cabac(&mut self, cabac: &mut CABAC, slice_hdr: &SliceHeader) -> DecoderResult { + fn decode_slice_cabac(&mut self, cabac: &mut CABAC, slice_hdr: &SliceHeader, frm: &mut NASimpleVideoFrame, mv_info: &mut FrameMV) -> DecoderResult { let mut mb_idx = slice_hdr.first_mb_in_slice; let mut prev_mb_skipped = false; let skip_type = if slice_hdr.slice_type.is_p() { MBType::PSkip } else { MBType::BSkip }; @@ -682,7 +695,7 @@ _ => {}, mb_info.transform_size_8x8 = false; last_qp_diff = false; } - self.handle_macroblock(slice_hdr, &mut mb_info, &sslice_refs); + self.handle_macroblock(slice_hdr, &mut mb_info, &sslice_refs, frm, mv_info); prev_mb_skipped = mb_skip; if !(self.is_mbaff && ((mb_idx & 1) == 0)) && cabac.decode_terminate() { if let Some(ref mut pic) = self.cur_pic { diff --git a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs index 5845d92..615563e 100644 --- a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs +++ b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs @@ -199,7 +199,21 @@ impl H264MC { pub fn do_mc_avg(&mut self, frm: &mut NASimpleVideoFrame, refpic: &SimpleFrame, xpos: usize, ypos: usize, w: usize, h: usize, mv: MV) { let mut abuf = self.avg_buf.clone(); - let mut afrm = NASimpleVideoFrame::from_video_buf(&mut abuf).unwrap(); + let stride_y = abuf.get_stride(0); + let stride_c = abuf.get_stride(1); + let off_y = abuf.get_offset(0); + let off_u = abuf.get_offset(1); + let off_v = abuf.get_offset(2); + let data = abuf.get_data_mut().unwrap(); + let mut afrm = NASimpleVideoFrame { + width: [64, 32, 32, 0], + height: [64, 32, 32, 0], + flip: false, + stride: [stride_y, stride_c, stride_c, 0], + offset: [off_y, off_u, off_v, 0], + components: 3, + data, + }; let amv = MV { x: mv.x + (xpos as i16) * 4, y: mv.y + (ypos as i16) * 4 }; self.do_mc(&mut afrm, refpic, 0, 0, w, h, amv); let wsize = match w { diff --git a/nihav-itu/src/codecs/h264/dsp/mod.rs b/nihav-itu/src/codecs/h264/dsp/mod.rs index 76936ad..a2a58a4 100644 --- a/nihav-itu/src/codecs/h264/dsp/mod.rs +++ b/nihav-itu/src/codecs/h264/dsp/mod.rs @@ -135,7 +135,7 @@ pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) { } } -pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) { +pub fn idct_skip_dc(blk: &mut [i16; 16], qp: u8) { const BLK_INDEX: [usize; 16] = [ 0, 2, 0, 2, 2, 1, 2, 1, @@ -144,8 +144,27 @@ pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) { ]; let qidx = (qp % 6) as usize; let shift = qp / 6; - let start = if quant_dc { 0 } else { 1 }; - for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(start) { + for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(1) { + *el = (*el * LEVEL_SCALE[idx][qidx]) << shift; + } + for row in blk.chunks_exact_mut(4) { + transform!(row[0], row[1], row[2], row[3], 0); + } + for i in 0..4 { + transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6); + } +} + +pub fn idct(blk: &mut [i16; 16], qp: u8) { + const BLK_INDEX: [usize; 16] = [ + 0, 2, 0, 2, + 2, 1, 2, 1, + 0, 2, 0, 2, + 2, 1, 2, 1 + ]; + let qidx = (qp % 6) as usize; + let shift = qp / 6; + for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()) { *el = (*el * LEVEL_SCALE[idx][qidx]) << shift; } for row in blk.chunks_exact_mut(4) { diff --git a/nihav-itu/src/codecs/h264/types.rs b/nihav-itu/src/codecs/h264/types.rs index 1310daa..4bcdb49 100644 --- a/nihav-itu/src/codecs/h264/types.rs +++ b/nihav-itu/src/codecs/h264/types.rs @@ -383,8 +383,9 @@ pub struct MBData { } pub fn blk4_to_blk8(blk4: usize) -> usize { - const MAP: [usize; 16] = [ 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 ]; - MAP[blk4 & 0xF] + /*const MAP: [usize; 16] = [ 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 ]; + MAP[blk4 & 0xF]*/ + ((blk4 & 2) >> 1) | ((blk4 & 8) >> 2) } #[derive(Clone,Copy)]