X-Git-Url: https://git.nihav.org/?p=nihav.git;a=blobdiff_plain;f=nihav-rad%2Fsrc%2Fcodecs%2Fbinkvidenc%2Fdsp.rs;fp=nihav-rad%2Fsrc%2Fcodecs%2Fbinkvidenc%2Fdsp.rs;h=e7809092408773793ff52a3ad297ad049b01b0ea;hp=0000000000000000000000000000000000000000;hb=217de10bc01258d3a479c2784d6c5542c30b0f63;hpb=94cebb38b3bf2252d0863d517838718787bb4529 diff --git a/nihav-rad/src/codecs/binkvidenc/dsp.rs b/nihav-rad/src/codecs/binkvidenc/dsp.rs new file mode 100644 index 0000000..e780909 --- /dev/null +++ b/nihav-rad/src/codecs/binkvidenc/dsp.rs @@ -0,0 +1,598 @@ +use super::{BlockTokens, MAX_DIST, RateControl, calc_diff}; +use super::super::binkviddata::*; + +const BINK_INV_SCAN: [usize; 64] = [ + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 24, 25, 44, 45, 16, 17, 20, 21, + 26, 27, 46, 47, 18, 19, 22, 23, + 28, 29, 32, 33, 48, 49, 52, 53, + 30, 31, 34, 35, 50, 51, 54, 55, + 36, 37, 40, 41, 56, 57, 60, 61, + 38, 39, 42, 43, 58, 59, 62, 63 +]; + +trait WriteBit { + fn write_bit(&mut self, val: u16); + fn write_coef(&mut self, aval: u32, sign: bool, bits: u8); +} + +impl WriteBit for BlockTokens { + fn write_bit(&mut self, val: u16) { + self.other.push((val, 1)); + } + fn write_coef(&mut self, aval: u32, sign: bool, bits: u8) { + if bits > 1 { + self.other.push((aval as u16 & ((1 << (bits - 1)) - 1), bits - 1)); + } + self.write_bit(sign as u16); + } +} + +pub struct DSP { + diff: [i16; 64], + dct_i: [u8; 64], + dct_p: [u8; 64], + qmats: QuantMats, + i_start: usize, + i_len: usize, + p_start: usize, + p_len: usize, +} + +impl DSP { + pub fn new() -> Self { + let mut qmats = QuantMats::default(); + qmats.calc_binkb_quants(); + Self { + diff: [0; 64], + dct_i: [0; 64], + dct_p: [0; 64], + qmats, + i_start: 0, + i_len: 16, + p_start: 0, + p_len: 16, + } + } + pub fn get_diff(&mut self, mc_blk: &[u8; 64], cur_blk: &[u8; 64]) { + for (dst, (&prev, &cur)) in self.diff.iter_mut() + .zip(mc_blk.iter().zip(cur_blk.iter())) { + *dst = i16::from(cur) - i16::from(prev); + } + } + pub fn recon_residue(&self, dst: &mut [u8], dstride: usize, mc_blk: &[u8; 64]) { + for (dline, (prow, drow)) in dst.chunks_mut(dstride) + .zip(mc_blk.chunks_exact(8).zip(self.diff.chunks_exact(8))) { + for (dst, (&prev, &diff)) in dline.iter_mut().zip(prow.iter().zip(drow.iter())) { + *dst = (i16::from(prev) + diff) as u8; + } + } + } + pub fn recon_dct_i(&self, dst: &mut [u8], dstride: usize) { + for (dline, srow) in dst.chunks_mut(dstride).zip(self.dct_i.chunks_exact(8)) { + dline[..8].copy_from_slice(srow); + } + } + pub fn recon_dct_p(&self, dst: &mut [u8], dstride: usize) { + for (dline, srow) in dst.chunks_mut(dstride).zip(self.dct_p.chunks_exact(8)) { + dline[..8].copy_from_slice(srow); + } + } + + pub fn try_residue(&self, tokens: &mut BlockTokens) -> u32 { + let mut tree = Tree::new(true); + let mut flat = [0; 64]; + let mut blen = [0; 64]; + + for (&idx, &val) in BINK_INV_SCAN.iter().zip(self.diff.iter()) { + flat[idx] = val; + let aval = val.unsigned_abs(); + let mut b = 0u8; + while (1 << b) <= aval { + b += 1; + } + blen[idx] = b.saturating_sub(1); + } + + let mut max_val = 0; + let mut max_bits = 0; + let mut bits = 0; + let mut avals = [0; 64]; + let mut signs = [false; 64]; + for ((aval, sign), (&val, &vlen)) in avals.iter_mut().zip(signs.iter_mut()) + .zip(flat.iter().zip(blen.iter())) { + *aval = val.unsigned_abs(); + *sign = val < 0; + max_val = max_val.max(*aval); + max_bits = max_bits.max(vlen); + bits += aval.count_ones(); + } + + if max_bits > 7 || bits > 127 { + return MAX_DIST; + } + + tokens.nresidues.push(bits as u8); + tokens.other.push((max_bits as u16, 3)); + + let mut nz_cand = Vec::with_capacity(64); + let mut masks_left = bits + 1; + 'tree_loop: for cur_bits in (0..=max_bits).rev() { + let mask = 1 << cur_bits; + for &idx in nz_cand.iter() { + tokens.write_bit(((avals[idx] & mask) != 0) as u16); + if (avals[idx] & mask) != 0 { + masks_left -= 1; + if masks_left == 0 { + break 'tree_loop; + } + } + } + + let mut pos = tree.start; + while pos < tree.end { + if tree.state[pos] == TreeState::None { + pos += 1; + continue; + } + if let TreeState::Candidate(idx) = tree.state[pos] { + let idx = usize::from(idx); + if blen[idx] == cur_bits && flat[idx] != 0 { + tree.state[pos] = TreeState::None; + tokens.write_bit(1); + tokens.write_bit(signs[idx] as u16); + nz_cand.push(idx); + masks_left -= 1; + if masks_left == 0 { + break 'tree_loop; + } + } else { + tokens.write_bit(0); + pos += 1; + } + continue; + } + let range = tree.state[pos].get_range(); + let cur_max_bits = blen[range].iter().fold(0u8, |acc, &a| acc.max(a)); + if cur_max_bits == cur_bits { + tokens.write_bit(1); + match tree.state[pos] { + TreeState::Twenty(val) => { + tree.state[pos] = TreeState::Sixteen(val + 4); + for i in 0..4 { + let idx = usize::from(val) + i; + if blen[idx] == cur_bits && flat[idx] != 0 { + tokens.write_bit(0); + tokens.write_bit(signs[idx] as u16); + nz_cand.push(idx); + masks_left -= 1; + if masks_left == 0 { + break 'tree_loop; + } + } else { + tokens.write_bit(1); + tree.add_to_head(TreeState::Candidate(idx as u8)); + } + } + }, + TreeState::Sixteen(val) => { + tree.state[pos] = TreeState::Four(val); + for i in 1u8..4 { + tree.add_to_tail(TreeState::Four(val + i * 4)); + } + }, + TreeState::Four(val) => { + tree.state[pos] = TreeState::None; + for i in 0..4 { + let idx = usize::from(val) + i; + if blen[idx] == cur_bits && flat[idx] != 0 { + tokens.write_bit(0); + tokens.write_bit(signs[idx] as u16); + nz_cand.push(idx); + masks_left -= 1; + if masks_left == 0 { + break 'tree_loop; + } + } else { + tokens.write_bit(1); + tree.add_to_head(TreeState::Candidate(idx as u8)); + } + } + }, + _ => unreachable!(), + }; + } else { + tokens.write_bit(0); + pos += 1; + } + } + } + + 0 + } + + pub fn set_quant_ranges(&mut self, ranges: [u8; 4]) { + self.i_start = usize::from(ranges[0]); + self.i_len = usize::from(ranges[1]); + self.p_start = usize::from(ranges[2]); + self.p_len = usize::from(ranges[3]); + } + pub fn try_dct_intra(&mut self, blk: &[u8; 64], tokens: &mut BlockTokens, tmp_tok: &mut BlockTokens, is_b: bool, rc: &RateControl, mut best_dist: u32) -> u32 { + tokens.clear(); + if self.i_len == 0 { + return MAX_DIST; + } + + let mut ref_coeffs = [0i32; 64]; + for (dst, &src) in ref_coeffs.iter_mut().zip(blk.iter()) { + *dst = i32::from(src); + } + dct(&mut ref_coeffs); + + let mut dct_out = [0u8; 64]; + let qmats = if is_b { &self.qmats.intra_qmat } else { BINK_INTRA_QUANT }; + for (qidx, qmat) in qmats.iter().enumerate().skip(self.i_start).take(self.i_len) { + let mut coeffs = ref_coeffs; + for (idx, el) in coeffs.iter_mut().enumerate() { + *el /= qmat[BINK_INV_SCAN[idx]]; + } + + if coeffs[0] >= 2048 { + continue; + } + + tmp_tok.clear(); + tmp_tok.intradc.push(coeffs[0] as u16); + Self::code_dct_coeffs(&coeffs, tmp_tok); + if is_b { + tmp_tok.intraq.push(qidx as u8); + } else { + tmp_tok.other.push((qidx as u16, 4)); + } + let bits = tmp_tok.bits(is_b); + if rc.metric(0, bits) >= best_dist { + continue; + } + + for (idx, el) in coeffs.iter_mut().enumerate() { + if *el != 0 { + *el = (*el * qmat[BINK_INV_SCAN[idx]]) >> 11; + } + } + idct(&mut coeffs); + for (dst, &src) in dct_out.iter_mut().zip(coeffs.iter()) { + *dst = src as u8; + } + let diff = calc_diff(&dct_out, 8, blk, 8); + let dist = rc.metric(diff, bits); + if dist < best_dist { + best_dist = dist; + std::mem::swap(tokens, tmp_tok); + self.dct_i.copy_from_slice(&dct_out); + } + } + + best_dist + } + pub fn try_dct_inter(&mut self, ref_blk: &[u8; 64], cur_blk: &[u8; 64], tokens: &mut BlockTokens, tmp_tok: &mut BlockTokens, is_b: bool, rc: &RateControl, mut best_dist: u32) -> u32 { + let mv_x = tokens.xoff[0]; + let mv_y = tokens.yoff[0]; + + let mut ref_coeffs = [0i32; 64]; + for (dst, &src) in ref_coeffs.iter_mut().zip(self.diff.iter()) { + *dst = i32::from(src); + } + dct(&mut ref_coeffs); + + let mut dct_out = [0u8; 64]; + let qmats = if is_b { &self.qmats.inter_qmat } else { BINK_INTER_QUANT }; + for (qidx, qmat) in qmats.iter().enumerate().skip(self.p_start).take(self.p_len) { + let mut coeffs = ref_coeffs; + + for (idx, el) in coeffs.iter_mut().enumerate() { + *el /= qmat[BINK_INV_SCAN[idx]]; + } + + if coeffs[0].unsigned_abs() >= 1024 { + continue; + } + + tmp_tok.clear(); + tmp_tok.interdc.push(coeffs[0] as i16); + tmp_tok.xoff.push(mv_x); + tmp_tok.yoff.push(mv_y); + Self::code_dct_coeffs(&coeffs, tmp_tok); + if is_b { + tmp_tok.interq.push(qidx as u8); + } else { + tmp_tok.other.push((qidx as u16, 4)); + } + let bits = tmp_tok.bits(is_b); + if rc.metric(0, bits) >= best_dist { + continue; + } + + for (idx, el) in coeffs.iter_mut().enumerate() { + if *el != 0 { + *el = (*el * qmat[BINK_INV_SCAN[idx]]) >> 11; + } + } + idct(&mut coeffs); + for (dst, (&prev, &diff)) in dct_out.iter_mut().zip(ref_blk.iter().zip(coeffs.iter())) { + *dst = (i32::from(prev) + diff) as u8; + } + let diff = calc_diff(&dct_out, 8, cur_blk, 8); + let dist = rc.metric(diff, bits); + if dist < best_dist { + best_dist = dist; + std::mem::swap(tokens, tmp_tok); + self.dct_p.copy_from_slice(&dct_out); + } + } + + best_dist + } + + fn code_dct_coeffs(coeffs: &[i32; 64], tokens: &mut BlockTokens) { + let mut tree = Tree::new(false); + let mut flat = [0; 64]; + let mut blen = [0; 64]; + + for (&idx, &val) in BINK_INV_SCAN.iter().zip(coeffs.iter()).skip(1) { + flat[idx] = val; + let aval = val.unsigned_abs(); + let mut b = 0u8; + while (1 << b) <= aval { + b += 1; + } + blen[idx] = b; + } + + let mut max_val = 0; + let mut max_bits = 0; + let mut avals = [0; 64]; + let mut signs = [false; 64]; + for ((aval, sign), (&val, &vlen)) in avals.iter_mut().zip(signs.iter_mut()) + .zip(flat.iter().zip(blen.iter())) { + *aval = val.unsigned_abs(); + *sign = val < 0; + max_val = max_val.max(*aval); + max_bits = max_bits.max(vlen); + } + + tokens.other.push((u16::from(max_bits), 4)); + for cur_bits in (1..=max_bits).rev() { + let mut pos = tree.start; + while pos < tree.end { + if tree.state[pos] == TreeState::None { + pos += 1; + continue; + } + if let TreeState::Candidate(idx) = tree.state[pos] { + let idx = usize::from(idx); + if blen[idx] == cur_bits && flat[idx] != 0 { + tree.state[pos] = TreeState::None; + tokens.write_bit(1); + tokens.write_coef(avals[idx], signs[idx], cur_bits); + } else { + tokens.write_bit(0); + pos += 1; + } + continue; + } + let range = tree.state[pos].get_range(); + let cur_max_bits = blen[range].iter().fold(0u8, |acc, &a| acc.max(a)); + if cur_max_bits == cur_bits { + tokens.write_bit(1); + match tree.state[pos] { + TreeState::Twenty(val) => { + tree.state[pos] = TreeState::Sixteen(val + 4); + for i in 0..4 { + let idx = usize::from(val) + i; + if blen[idx] == cur_bits && flat[idx] != 0 { + tokens.write_bit(0); + tokens.write_coef(avals[idx], signs[idx], cur_bits); + } else { + tokens.write_bit(1); + tree.add_to_head(TreeState::Candidate(idx as u8)); + } + } + }, + TreeState::Sixteen(val) => { + tree.state[pos] = TreeState::Four(val); + for i in 1u8..4 { + tree.add_to_tail(TreeState::Four(val + i * 4)); + } + }, + TreeState::Four(val) => { + tree.state[pos] = TreeState::None; + for i in 0..4 { + let idx = usize::from(val) + i; + if blen[idx] == cur_bits && flat[idx] != 0 { + tokens.write_bit(0); + tokens.write_coef(avals[idx], signs[idx], cur_bits); + } else { + tokens.write_bit(1); + tree.add_to_head(TreeState::Candidate(idx as u8)); + } + } + }, + _ => unreachable!(), + }; + } else { + tokens.write_bit(0); + pos += 1; + } + } + } + } +} + +#[derive(Clone,Copy,Debug,PartialEq)] +enum TreeState { + None, + Twenty(u8), + Sixteen(u8), + Four(u8), + Candidate(u8), +} + +impl TreeState { + fn get_range(self) -> std::ops::Range { + let (base, len) = match self { + TreeState::None => (0, 0), + TreeState::Twenty(val) => (val, 20), + TreeState::Sixteen(val) => (val, 16), + TreeState::Four(val) => (val, 4), + TreeState::Candidate(val) => (val, 1), + }; + usize::from(base)..usize::from(base + len) + } +} + +struct Tree { + state: [TreeState; 128], + start: usize, + end: usize, +} + +impl Tree { + fn new(is_res: bool) -> Self { + let mut state = [TreeState::None; 128]; + let start = 64; + let mut end = start; + + state[end] = TreeState::Twenty(4); + end += 1; + state[end] = TreeState::Twenty(24); + end += 1; + state[end] = TreeState::Twenty(44); + end += 1; + if is_res { + state[end] = TreeState::Four(0); + end += 1; + } else { + for i in 1..4 { + state[end] = TreeState::Candidate(i); + end += 1; + } + } + Self { state, start, end } + } + fn add_to_tail(&mut self, ts: TreeState) { + self.state[self.end] = ts; + self.end += 1; + } + fn add_to_head(&mut self, ts: TreeState) { + self.start -= 1; + self.state[self.start] = ts; + } +} + +const A1: i32 = 2896; +const A2: i32 = 2217; +const A3: i32 = 3784; +const A4: i32 = -5352; + +macro_rules! idct { + ($src: expr, $sstep: expr, $dst: expr, $dstep: expr, $off: expr, $bias: expr, $shift: expr) => { + let a0 = $src[$off + 0 * $sstep] + $src[$off + 4 * $sstep]; + let a1 = $src[$off + 0 * $sstep] - $src[$off + 4 * $sstep]; + let a2 = $src[$off + 2 * $sstep] + $src[$off + 6 * $sstep]; + let a3 = A1.wrapping_mul($src[$off + 2 * $sstep] - $src[$off + 6 * $sstep]) >> 11; + let a4 = $src[$off + 5 * $sstep] + $src[$off + 3 * $sstep]; + let a5 = $src[$off + 5 * $sstep] - $src[$off + 3 * $sstep]; + let a6 = $src[$off + 1 * $sstep] + $src[$off + 7 * $sstep]; + let a7 = $src[$off + 1 * $sstep] - $src[$off + 7 * $sstep]; + let b0 = a4 + a6; + let b1 = A3.wrapping_mul(a5 + a7) >> 11; + let b2 = (A4.wrapping_mul(a5) >> 11) - b0 + b1; + let b3 = (A1.wrapping_mul(a6 - a4) >> 11) - b2; + let b4 = (A2.wrapping_mul(a7) >> 11) + b3 - b1; + let c0 = a0 + a2; + let c1 = a0 - a2; + let c2 = a1 + (a3 - a2); + let c3 = a1 - (a3 - a2); + + $dst[$off + 0 * $dstep] = (c0 + b0 + $bias) >> $shift; + $dst[$off + 1 * $dstep] = (c2 + b2 + $bias) >> $shift; + $dst[$off + 2 * $dstep] = (c3 + b3 + $bias) >> $shift; + $dst[$off + 3 * $dstep] = (c1 - b4 + $bias) >> $shift; + $dst[$off + 4 * $dstep] = (c1 + b4 + $bias) >> $shift; + $dst[$off + 5 * $dstep] = (c3 - b3 + $bias) >> $shift; + $dst[$off + 6 * $dstep] = (c2 - b2 + $bias) >> $shift; + $dst[$off + 7 * $dstep] = (c0 - b0 + $bias) >> $shift; + }; +} + +fn idct(coeffs: &mut [i32; 64]) { + let mut tmp: [i32; 64] = [0; 64]; + let mut row: [i32; 8] = [0; 8]; + for i in 0..8 { + idct!(coeffs, 8, tmp, 8, i, 0, 0); + } + for (drow, srow) in coeffs.chunks_exact_mut(8).zip(tmp.chunks_exact(8)) { + idct!(srow, 1, row, 1, 0, 0x7F, 8); + drow.copy_from_slice(&row); + } +} + +const B1: i32 = 2896; +const B2: i32 = 3789; +const B3: i32 = 1569; +const B4: i32 = 4464; +const B5: i32 = 6679; +const B6: i32 = 1327; +const B7: i32 = 888; +macro_rules! dct { + ($src: expr, $sstep: expr, $dst: expr, $dstep: expr, $off: expr, $bias: expr, $shift: expr) => { + let a0 = $src[$off + 0 * $sstep] + $src[$off + 7 * $sstep]; + let a1 = $src[$off + 0 * $sstep] - $src[$off + 7 * $sstep]; + let a2 = $src[$off + 1 * $sstep] + $src[$off + 6 * $sstep]; + let a3 = $src[$off + 1 * $sstep] - $src[$off + 6 * $sstep]; + let a4 = $src[$off + 2 * $sstep] + $src[$off + 5 * $sstep]; + let a5 = $src[$off + 2 * $sstep] - $src[$off + 5 * $sstep]; + let a6 = $src[$off + 3 * $sstep] + $src[$off + 4 * $sstep]; + let a7 = $src[$off + 3 * $sstep] - $src[$off + 4 * $sstep]; + + let b0 = (a0 + a4) << 7; + let b1 = (a0 - a4) << 7; + let b2 = (a2 + a6) << 7; + let b3 = (a2 - a6) << 7; + + $dst[$off + 0 * $dstep] = (b0 + b2 + $bias) >> $shift; + $dst[$off + 4 * $dstep] = (b1 - b3 + $bias) >> $shift; + + let c0 = (a0 - a6) << 7; + let c1 = B1.wrapping_mul((b1 + b3) >> 7) >> 5; + $dst[$off + 2 * $dstep] = (c0 + c1 + $bias) >> $shift; + $dst[$off + 6 * $dstep] = (c0 - c1 + $bias) >> $shift; + + let d0 = B2.wrapping_mul(a1) + B3.wrapping_mul(a7); + let d1 = ( d0 + B4.wrapping_mul(a5) + B5.wrapping_mul(a3) + (1 << 4)) >> 5; + let d2 = (-d0 + B6.wrapping_mul(a5) - B7.wrapping_mul(a3) + (1 << 4)) >> 5; + $dst[$off + 1 * $dstep] = ((a1 << 7) + d1 + $bias) >> $shift; + $dst[$off + 7 * $dstep] = ((a1 << 7) + d2 + $bias) >> $shift; + + let e0 = B3.wrapping_mul(a1) - B2.wrapping_mul(a7); + let e1 = ( e0 - B6.wrapping_mul(a3) - B5.wrapping_mul(a5) + (1 << 4)) >> 5; + let e2 = (-e0 - B4.wrapping_mul(a3) + B7.wrapping_mul(a5) + (1 << 4)) >> 5; + + $dst[$off + 3 * $dstep] = ((a1 << 7) + e1 + $bias) >> $shift; + $dst[$off + 5 * $dstep] = ((a1 << 7) + e2 + $bias) >> $shift; + }; +} + +fn dct(coeffs: &mut [i32; 64]) { + let mut tmp: [i32; 64] = [0; 64]; + let mut row: [i32; 8] = [0; 8]; + for i in 0..8 { + dct!(coeffs, 8, tmp, 8, i, 1, 1); + } + for (drow, srow) in coeffs.chunks_exact_mut(8).zip(tmp.chunks_exact(8)) { + dct!(srow, 1, row, 1, 0, 0, 0); + drow.copy_from_slice(&row); + } +} +