From: Kostya Shishkov <kostya.shishkov@gmail.com>
Date: Mon, 7 Aug 2023 17:01:42 +0000 (+0200)
Subject: h264: miscellaneous micro-optimisations
X-Git-Url: https://git.nihav.org/?a=commitdiff_plain;h=754ab49a62c862e8c6e66ec88bb7ad626247140e;p=nihav.git

h264: miscellaneous micro-optimisations
---

diff --git a/nihav-itu/src/codecs/h264/cabac_coder.rs b/nihav-itu/src/codecs/h264/cabac_coder.rs
index 3e9278e..82c20b0 100644
--- a/nihav-itu/src/codecs/h264/cabac_coder.rs
+++ b/nihav-itu/src/codecs/h264/cabac_coder.rs
@@ -235,8 +235,10 @@ impl<'a> CABAC<'a> {
     pub fn decode_012(&mut self, start: usize) -> u8 {
         if !self.decode_bit(start) {
             0
+        } else if !self.decode_bit(start + 1) {
+            1
         } else {
-            self.decode_bit(start + 1) as u8 + 1
+            2
         }
     }
     fn refill(&mut self) {
diff --git a/nihav-itu/src/codecs/h264/decoder_st.rs b/nihav-itu/src/codecs/h264/decoder_st.rs
index 13fe1bd..7e8b83a 100644
--- a/nihav-itu/src/codecs/h264/decoder_st.rs
+++ b/nihav-itu/src/codecs/h264/decoder_st.rs
@@ -362,25 +362,29 @@ println!("PAFF?");
                 mb_info.coeffs[i][0] = mb_info.coeffs[24][i];
             }
         }
-        if !mb_info.transform_size_8x8 {
-            let quant_dc = !mb_info.mb_type.is_intra16x16();
-            for i in 0..16 {
-                if mb_info.coded[i] {
-                    if !tx_bypass {
-                        idct(&mut mb_info.coeffs[i], qp_y, quant_dc);
+        if !tx_bypass {
+            if !mb_info.transform_size_8x8 {
+                let quant_dc = !mb_info.mb_type.is_intra16x16();
+                for (coded, coeffs) in mb_info.coded[..16].iter_mut().zip(mb_info.coeffs[..16].iter_mut()) {
+                    if *coded {
+                        idct(coeffs, qp_y, quant_dc);
+                    } else if has_dc {
+                        idct_dc(coeffs, qp_y, quant_dc);
+                        *coded = true;
                     }
-                } else if has_dc {
-                    if !tx_bypass {
-                        idct_dc(&mut mb_info.coeffs[i], qp_y, quant_dc);
+                }
+            } else {
+                for i in 0..4 {
+                    if mb_info.coded[(i & 1) * 2 + (i & 2) * 4] {
+                        dequant8x8(&mut mb_info.coeffs8x8[i].coeffs, &pps.scaling_list_8x8[!mb_info.mb_type.is_intra() as usize]);
+                        idct8x8(&mut mb_info.coeffs8x8[i].coeffs, qp_y);
                     }
-                    mb_info.coded[i] = true;
                 }
             }
-        } else {
-            for i in 0..4 {
-                if mb_info.coded[(i & 1) * 2 + (i & 2) * 4] && !tx_bypass {
-                    dequant8x8(&mut mb_info.coeffs8x8[i].coeffs, &pps.scaling_list_8x8[!mb_info.mb_type.is_intra() as usize]);
-                    idct8x8(&mut mb_info.coeffs8x8[i].coeffs, qp_y);
+        } else if !mb_info.transform_size_8x8 {
+            for i in 0..16 {
+                if !mb_info.coded[i] && has_dc {
+                    mb_info.coded[i] = true;
                 }
             }
         }
diff --git a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs
index 19f2f28..f558441 100644
--- a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs
+++ b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs
@@ -273,7 +273,7 @@ fn put_block_weighted(dst: &mut [u8], stride: usize, src: &[u8], w: usize, h: us
     let wshift = wparams[2] as u8;
     let bias = (1 << wshift) >> 1;
 
-    for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) {
+    for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks_exact(16)).take(h) {
         for (dst, &src) in drow[..w].iter_mut().zip(srow.iter()) {
             *dst = clip_u8(((i16::from(src) * weight + bias) >> wshift) + offset);
         }
@@ -302,7 +302,7 @@ fn put_block_weighted2(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8],
     let offset = (offset0 + offset1 + 1) >> 1;
     let bias = (1 << wshift) >> 1;
 
-    for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks(16).zip(src1.chunks(16))).take(h) {
+    for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks_exact(16).zip(src1.chunks_exact(16))).take(h) {
         for (dst, (&src0, &src1)) in drow[..w].iter_mut().zip(srow0.iter().zip(srow1.iter())) {
             *dst = clip_u8(((i16::from(src0) * weight0 + i16::from(src1) * weight1 + bias) >> wshift) + offset);
         }
diff --git a/nihav-itu/src/codecs/h264/dsp/mod.rs b/nihav-itu/src/codecs/h264/dsp/mod.rs
index b07ffe8..76936ad 100644
--- a/nihav-itu/src/codecs/h264/dsp/mod.rs
+++ b/nihav-itu/src/codecs/h264/dsp/mod.rs
@@ -130,7 +130,7 @@ pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
     for i in 0..4 {
         transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
     }
-    for row in blk.chunks_mut(4) {
+    for row in blk.chunks_exact_mut(4) {
         transform!(luma_dc; row[0], row[1], row[2], row[3]);
     }
 }
@@ -148,7 +148,7 @@ pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
     for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(start) {
         *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
     }
-    for row in blk.chunks_mut(4) {
+    for row in blk.chunks_exact_mut(4) {
         transform!(row[0], row[1], row[2], row[3], 0);
     }
     for i in 0..4 {
@@ -228,7 +228,7 @@ pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
             *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
         }
     }
-    for row in tmp.chunks_mut(8) {
+    for row in tmp.chunks_exact_mut(8) {
         transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
     }
     for col in 0..8 {
@@ -242,7 +242,7 @@ pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
 
 pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
     let out = &mut dst[offset..][..stride * 3 + 4];
-    for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks(4)) {
+    for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
         for (dst, src) in line.iter_mut().take(4).zip(src.iter()) {
             *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
         }
@@ -251,7 +251,7 @@ pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16])
 
 pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
     let out = &mut dst[offset..];
-    for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks(8)) {
+    for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
         for (dst, src) in line.iter_mut().take(8).zip(src.iter()) {
             *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
         }
diff --git a/nihav-itu/src/codecs/h264/mb_recon.rs b/nihav-itu/src/codecs/h264/mb_recon.rs
index 5d82503..5a204f3 100644
--- a/nihav-itu/src/codecs/h264/mb_recon.rs
+++ b/nihav-itu/src/codecs/h264/mb_recon.rs
@@ -364,7 +364,7 @@ fn do_b_mc(frm: &mut NASimpleVideoFrame<u8>, mode: BMode, xpos: usize, ypos: usi
     }
 }
 
-fn do_b_mc_4x4bi(frm: &mut NASimpleVideoFrame<u8>, xpos: usize, ypos: usize, mv: [MV; 2], ref_pic0: Option<NAVideoBufferRef<u8>>, weight0: &WeightInfo, ref_pic1: Option<NAVideoBufferRef<u8>>, weight1: &WeightInfo, mc_dsp: &mut H264MC) {
+fn do_b_mc_4x4bi(frm: &mut NASimpleVideoFrame<u8>, xpos: usize, ypos: usize, mv: &[MV; 2], ref_pic0: Option<NAVideoBufferRef<u8>>, weight0: &WeightInfo, ref_pic1: Option<NAVideoBufferRef<u8>>, weight1: &WeightInfo, mc_dsp: &mut H264MC) {
     if !weight0.is_weighted() || !weight1.is_weighted() {
         match (ref_pic0, ref_pic1) {
             (Some(buf0), Some(buf1)) => {
@@ -585,11 +585,11 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame<u8>, slice_hdr: &SliceHeader, mb_in
                 do_b_mc(frm, BMode::Bi, xpos, ypos, 16, 16, mv[0], rpic0, &weight0, mv[1], rpic1, &weight1, mc_dsp);
             } else {
                 for blk4 in 0..16 {
-                    let mv = sstate.get_cur_blk4(blk4).mv;
                     let ref_idx = sstate.get_cur_blk8(blk4_to_blk8(blk4)).ref_idx;
                     let rpic0 = frame_refs.select_ref_pic(0, ref_idx[0].index());
                     let rpic1 = frame_refs.select_ref_pic(1, ref_idx[1].index());
                     let (weight0, weight1) = get_weights(slice_hdr, frame_refs, BMode::Bi, weight_mode, ref_idx[0], ref_idx[1]);
+                    let mv = &sstate.get_cur_blk4(blk4).mv;
                     do_b_mc_4x4bi(frm, xpos + (blk4 & 3) * 4, ypos + (blk4 >> 2) * 4, mv, rpic0, &weight0, rpic1, &weight1, mc_dsp);
                 }
             }
@@ -607,11 +607,11 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame<u8>, slice_hdr: &SliceHeader, mb_in
                 match subtype {
                     SubMBType::Direct8x8 => {
                         for blk in 0..4 {
-                            let mv = sstate.get_cur_blk4(bx / 4 + (by / 4) * 4).mv;
                             let ref_idx = sstate.get_cur_blk8(bx / 8 + (by / 8) * 2).ref_idx;
                             let rpic0 = frame_refs.select_ref_pic(0, ref_idx[0].index());
                             let rpic1 = frame_refs.select_ref_pic(1, ref_idx[1].index());
                             let (weight0, weight1) = get_weights(slice_hdr, frame_refs, BMode::Bi, weight_mode, ref_idx[0], ref_idx[1]);
+                            let mv = &sstate.get_cur_blk4(bx / 4 + (by / 4) * 4).mv;
                             do_b_mc_4x4bi(frm, xpos + bx, ypos + by, mv, rpic0, &weight0, rpic1, &weight1, mc_dsp);
                             bx += 4;
                             if blk == 1 {
diff --git a/nihav-itu/src/codecs/h264/types.rs b/nihav-itu/src/codecs/h264/types.rs
index 00aa72e..4cc1fca 100644
--- a/nihav-itu/src/codecs/h264/types.rs
+++ b/nihav-itu/src/codecs/h264/types.rs
@@ -551,7 +551,7 @@ impl SliceState {
                         if cur_cc || top_cc {
                             self.deblock[y * 4 + x] |= 0x20;
                         } else {
-                            if mvdiff4(cur_mv[0], top_mv[0]) || mvdiff4(cur_mv[1], top_mv[1]) || !frefs.cmp_refs(cur_ref, top_ref) {
+                            if mvdiff4(cur_mv, top_mv) || !frefs.cmp_refs(cur_ref, top_ref) {
                                 self.deblock[y * 4 + x] |= 0x10;
                             }
                         }
@@ -574,7 +574,7 @@ impl SliceState {
                 } else if cur_cc || left_cc {
                     self.deblock[y * 4 + x] |= 2;
                 } else {
-                    if mvdiff4(cur_mv[0], left_mv[0]) || mvdiff4(cur_mv[1], left_mv[1]) || !frefs.cmp_refs(cur_ref, left_ref) {
+                    if mvdiff4(cur_mv, left_mv) || !frefs.cmp_refs(cur_ref, left_ref) {
                         self.deblock[y * 4 + x] |= 1;
                     }
                 }
@@ -917,7 +917,34 @@ impl SliceState {
     }
 }
 
-fn mvdiff4(mv1: MV, mv2: MV) -> bool {
-    let mv = mv1 - mv2;
-    (mv.x.abs() >= 4) || (mv.y.abs() >= 4)
+#[cfg(not(target_arch="x86_64"))]
+fn mvdiff4(mv1: &[MV; 2], mv2: &[MV; 2]) -> bool {
+    let mvd0 = mv1[0] - mv2[0];
+    let mvd1 = mv1[1] - mv2[1];
+    (mvd0.x.abs() >= 4) || (mvd0.y.abs() >= 4) || (mvd1.x.abs() >= 4) || (mvd1.y.abs() >= 4)
+}
+
+/// Reports whether any component of the two motion vector pairs differs by four or more.
+///
+/// Each pair is loaded as one 64-bit word and the words are compared 16 bits at a time.
+#[cfg(target_arch="x86_64")]
+fn mvdiff4(mv1: &[MV; 2], mv2: &[MV; 2]) -> bool {
+    // The word-sized loads below are only valid if a pair occupies exactly eight bytes.
+    debug_assert_eq!(std::mem::size_of::<[MV; 2]>(), std::mem::size_of::<u64>());
+    // SAFETY: both pointers come from live references to eight-byte arrays (checked above);
+    // `read_unaligned` is used because nothing here guarantees 8-byte alignment for `[MV; 2]`.
+    let (mut m0, mut m1) = unsafe {
+        (mv1.as_ptr().cast::<u64>().read_unaligned(),
+         mv2.as_ptr().cast::<u64>().read_unaligned())
+    };
+    let mut flag = false;
+    for _ in 0..4 {
+        // the low 16 bits hold the wrapped difference of one component
+        let tmp = m0.wrapping_sub(m1) as u16;
+        // d + 3 lands in 0..=6 exactly when -3 <= d <= 3
+        flag |= tmp.wrapping_add(3) > 6;
+        m0 >>= 16;
+        m1 >>= 16;
+    }
+    flag
 }