From: Kostya Shishkov Date: Mon, 7 Aug 2023 17:01:42 +0000 (+0200) Subject: h264: miscellaneous micro-optimisations X-Git-Url: https://git.nihav.org/?a=commitdiff_plain;h=754ab49a62c862e8c6e66ec88bb7ad626247140e;p=nihav.git h264: miscellaneous micro-optimisations --- diff --git a/nihav-itu/src/codecs/h264/cabac_coder.rs b/nihav-itu/src/codecs/h264/cabac_coder.rs index 3e9278e..82c20b0 100644 --- a/nihav-itu/src/codecs/h264/cabac_coder.rs +++ b/nihav-itu/src/codecs/h264/cabac_coder.rs @@ -235,8 +235,10 @@ impl<'a> CABAC<'a> { pub fn decode_012(&mut self, start: usize) -> u8 { if !self.decode_bit(start) { 0 + } else if !self.decode_bit(start + 1) { + 1 } else { - self.decode_bit(start + 1) as u8 + 1 + 2 } } fn refill(&mut self) { diff --git a/nihav-itu/src/codecs/h264/decoder_st.rs b/nihav-itu/src/codecs/h264/decoder_st.rs index 13fe1bd..7e8b83a 100644 --- a/nihav-itu/src/codecs/h264/decoder_st.rs +++ b/nihav-itu/src/codecs/h264/decoder_st.rs @@ -362,25 +362,29 @@ println!("PAFF?"); mb_info.coeffs[i][0] = mb_info.coeffs[24][i]; } } - if !mb_info.transform_size_8x8 { - let quant_dc = !mb_info.mb_type.is_intra16x16(); - for i in 0..16 { - if mb_info.coded[i] { - if !tx_bypass { - idct(&mut mb_info.coeffs[i], qp_y, quant_dc); + if !tx_bypass { + if !mb_info.transform_size_8x8 { + let quant_dc = !mb_info.mb_type.is_intra16x16(); + for (coded, coeffs) in mb_info.coded[..16].iter_mut().zip(mb_info.coeffs[..16].iter_mut()) { + if *coded { + idct(coeffs, qp_y, quant_dc); + } else if has_dc { + idct_dc(coeffs, qp_y, quant_dc); + *coded = true; } - } else if has_dc { - if !tx_bypass { - idct_dc(&mut mb_info.coeffs[i], qp_y, quant_dc); + } + } else { + for i in 0..4 { + if mb_info.coded[(i & 1) * 2 + (i & 2) * 4] { + dequant8x8(&mut mb_info.coeffs8x8[i].coeffs, &pps.scaling_list_8x8[!mb_info.mb_type.is_intra() as usize]); + idct8x8(&mut mb_info.coeffs8x8[i].coeffs, qp_y); } - mb_info.coded[i] = true; } } - } else { - for i in 0..4 { - if mb_info.coded[(i & 1) * 2 + (i & 2) * 4] && !tx_bypass { - dequant8x8(&mut mb_info.coeffs8x8[i].coeffs, &pps.scaling_list_8x8[!mb_info.mb_type.is_intra() as usize]); - idct8x8(&mut mb_info.coeffs8x8[i].coeffs, qp_y); + } else if !mb_info.transform_size_8x8 { + for i in 0..16 { + if !mb_info.coded[i] && has_dc { + mb_info.coded[i] = true; } } } diff --git a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs index 19f2f28..f558441 100644 --- a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs +++ b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs @@ -273,7 +273,7 @@ fn put_block_weighted(dst: &mut [u8], stride: usize, src: &[u8], w: usize, h: us let wshift = wparams[2] as u8; let bias = (1 << wshift) >> 1; - for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) { + for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks_exact(16)).take(h) { for (dst, &src) in drow[..w].iter_mut().zip(srow.iter()) { *dst = clip_u8(((i16::from(src) * weight + bias) >> wshift) + offset); } @@ -302,7 +302,7 @@ fn put_block_weighted2(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], let offset = (offset0 + offset1 + 1) >> 1; let bias = (1 << wshift) >> 1; - for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks(16).zip(src1.chunks(16))).take(h) { + for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks_exact(16).zip(src1.chunks_exact(16))).take(h) { for (dst, (&src0, &src1)) in drow[..w].iter_mut().zip(srow0.iter().zip(srow1.iter())) { *dst = clip_u8(((i16::from(src0) * weight0 + i16::from(src1) * weight1 + bias) >> wshift) + offset); } diff --git a/nihav-itu/src/codecs/h264/dsp/mod.rs b/nihav-itu/src/codecs/h264/dsp/mod.rs index b07ffe8..76936ad 100644 --- a/nihav-itu/src/codecs/h264/dsp/mod.rs +++ b/nihav-itu/src/codecs/h264/dsp/mod.rs @@ -130,7 +130,7 @@ pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) { for i in 0..4 { transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]); } - for row in blk.chunks_mut(4) { + for row in blk.chunks_exact_mut(4) { transform!(luma_dc; row[0], row[1], row[2], row[3]); } } @@ -148,7 +148,7 @@ pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) { for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(start) { *el = (*el * LEVEL_SCALE[idx][qidx]) << shift; } - for row in blk.chunks_mut(4) { + for row in blk.chunks_exact_mut(4) { transform!(row[0], row[1], row[2], row[3], 0); } for i in 0..4 { @@ -228,7 +228,7 @@ pub fn idct8x8(blk: &mut [i16; 64], qp: u8) { *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift; } } - for row in tmp.chunks_mut(8) { + for row in tmp.chunks_exact_mut(8) { transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]); } for col in 0..8 { @@ -242,7 +242,7 @@ pub fn idct8x8(blk: &mut [i16; 64], qp: u8) { pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) { let out = &mut dst[offset..][..stride * 3 + 4]; - for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks(4)) { + for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) { for (dst, src) in line.iter_mut().take(4).zip(src.iter()) { *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8; } @@ -251,7 +251,7 @@ pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) { let out = &mut dst[offset..]; - for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks(8)) { + for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) { for (dst, src) in line.iter_mut().take(8).zip(src.iter()) { *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8; } diff --git a/nihav-itu/src/codecs/h264/mb_recon.rs b/nihav-itu/src/codecs/h264/mb_recon.rs index 5d82503..5a204f3 100644 --- a/nihav-itu/src/codecs/h264/mb_recon.rs +++ b/nihav-itu/src/codecs/h264/mb_recon.rs @@ -364,7 +364,7 @@ fn do_b_mc(frm: &mut NASimpleVideoFrame, mode: BMode, xpos: usize, ypos: usi } } -fn do_b_mc_4x4bi(frm: &mut NASimpleVideoFrame, xpos: usize, ypos: usize, mv: [MV; 2], ref_pic0: Option>, weight0: &WeightInfo, ref_pic1: Option>, weight1: &WeightInfo, mc_dsp: &mut H264MC) { +fn do_b_mc_4x4bi(frm: &mut NASimpleVideoFrame, xpos: usize, ypos: usize, mv: &[MV; 2], ref_pic0: Option>, weight0: &WeightInfo, ref_pic1: Option>, weight1: &WeightInfo, mc_dsp: &mut H264MC) { if !weight0.is_weighted() || !weight1.is_weighted() { match (ref_pic0, ref_pic1) { (Some(buf0), Some(buf1)) => { @@ -585,11 +585,11 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame, slice_hdr: &SliceHeader, mb_in do_b_mc(frm, BMode::Bi, xpos, ypos, 16, 16, mv[0], rpic0, &weight0, mv[1], rpic1, &weight1, mc_dsp); } else { for blk4 in 0..16 { - let mv = sstate.get_cur_blk4(blk4).mv; let ref_idx = sstate.get_cur_blk8(blk4_to_blk8(blk4)).ref_idx; let rpic0 = frame_refs.select_ref_pic(0, ref_idx[0].index()); let rpic1 = frame_refs.select_ref_pic(1, ref_idx[1].index()); let (weight0, weight1) = get_weights(slice_hdr, frame_refs, BMode::Bi, weight_mode, ref_idx[0], ref_idx[1]); + let mv = &sstate.get_cur_blk4(blk4).mv; do_b_mc_4x4bi(frm, xpos + (blk4 & 3) * 4, ypos + (blk4 >> 2) * 4, mv, rpic0, &weight0, rpic1, &weight1, mc_dsp); } } @@ -607,11 +607,11 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame, slice_hdr: &SliceHeader, mb_in match subtype { SubMBType::Direct8x8 => { for blk in 0..4 { - let mv = sstate.get_cur_blk4(bx / 4 + (by / 4) * 4).mv; let ref_idx = sstate.get_cur_blk8(bx / 8 + (by / 8) * 2).ref_idx; let rpic0 = frame_refs.select_ref_pic(0, ref_idx[0].index()); let rpic1 = frame_refs.select_ref_pic(1, ref_idx[1].index()); let (weight0, weight1) = get_weights(slice_hdr, frame_refs, BMode::Bi, weight_mode, ref_idx[0], ref_idx[1]); + let mv = &sstate.get_cur_blk4(bx / 4 + (by / 4) * 4).mv; do_b_mc_4x4bi(frm, xpos + bx, ypos + by, mv, rpic0, &weight0, rpic1, &weight1, mc_dsp); bx += 4; if blk == 1 { diff --git a/nihav-itu/src/codecs/h264/types.rs b/nihav-itu/src/codecs/h264/types.rs index 00aa72e..4cc1fca 100644 --- a/nihav-itu/src/codecs/h264/types.rs +++ b/nihav-itu/src/codecs/h264/types.rs @@ -551,7 +551,7 @@ impl SliceState { if cur_cc || top_cc { self.deblock[y * 4 + x] |= 0x20; } else { - if mvdiff4(cur_mv[0], top_mv[0]) || mvdiff4(cur_mv[1], top_mv[1]) || !frefs.cmp_refs(cur_ref, top_ref) { + if mvdiff4(cur_mv, top_mv) || !frefs.cmp_refs(cur_ref, top_ref) { self.deblock[y * 4 + x] |= 0x10; } } @@ -574,7 +574,7 @@ impl SliceState { } else if cur_cc || left_cc { self.deblock[y * 4 + x] |= 2; } else { - if mvdiff4(cur_mv[0], left_mv[0]) || mvdiff4(cur_mv[1], left_mv[1]) || !frefs.cmp_refs(cur_ref, left_ref) { + if mvdiff4(cur_mv, left_mv) || !frefs.cmp_refs(cur_ref, left_ref) { self.deblock[y * 4 + x] |= 1; } } @@ -917,7 +917,27 @@ impl SliceState { } } -fn mvdiff4(mv1: MV, mv2: MV) -> bool { - let mv = mv1 - mv2; - (mv.x.abs() >= 4) || (mv.y.abs() >= 4) +#[cfg(not(target_arch="x86_64"))] +fn mvdiff4(mv1: &[MV; 2], mv2: &[MV; 2]) -> bool { + let mvd0 = mv1[0] - mv2[0]; + let mvd1 = mv1[1] - mv2[1]; + (mvd0.x.abs() >= 4) || (mvd0.y.abs() >= 4) || (mvd1.x.abs() >= 4) || (mvd1.y.abs() >= 4) +} + +#[cfg(target_arch="x86_64")] +fn mvdiff4(mv1: &[MV; 2], mv2: &[MV; 2]) -> bool { + unsafe { + let mut flag = false; + let ptr = std::mem::transmute::<*const MV, *const u64>(mv1.as_ptr()); + let mut m0 = *ptr; + let ptr = std::mem::transmute::<*const MV, *const u64>(mv2.as_ptr()); + let mut m1 = *ptr; + for _ in 0..4 { + let tmp = m0.wrapping_sub(m1) as u16; + flag |= tmp.wrapping_add(3) > 6; + m0 >>= 16; + m1 >>= 16; + } + flag + } }