| 1 | use nihav_core::frame::NAVideoBufferRef; |
| 2 | use nihav_codec_support::codecs::blockdsp::edge_emu; |
| 3 | |
/// Saturates a signed 16-bit intermediate to the unsigned 8-bit sample range.
///
/// Anything below 0 becomes 0 and anything above 255 becomes 255.
fn clip_u8(val: i16) -> u8 {
    if val < 0 {
        0
    } else if val > 255 {
        255
    } else {
        // In range, so the narrowing cast is lossless.
        val as u8
    }
}
| 7 | |
/// Computes the clamped filter adjustment for the four samples straddling an edge.
///
/// The outer-tap difference `p1 - q1` is clamped to `[-128, 127]`, three times
/// the inner difference `q0 - p0` is added, and the sum is clamped to
/// `[-128, 127]` again.
fn delta(p1: i16, p0: i16, q0: i16, q1: i16) -> i16 {
    let outer = (p1 - q1).max(-128).min(127);
    let inner = 3 * (q0 - p0);
    (outer + inner).max(-128).min(127)
}
| 11 | |
/// Function-pointer signature shared by the public loop-filter entry points.
///
/// Arguments, in order:
/// * `buf` - sample buffer that is filtered in place;
/// * `off` - index of the first sample on the far side of the edge;
/// * `step` - distance between neighbouring samples across the edge;
/// * `stride` - distance between consecutive positions along the edge;
/// * `len` - number of positions along the edge to process;
/// * `thr` - overall edge threshold;
/// * `thr_inner` - interior difference threshold;
/// * `thr_hev` - high edge variance threshold.
pub type LoopFilterFunc = fn(
    buf: &mut [u8],
    off: usize,
    step: usize,
    stride: usize,
    len: usize,
    thr: i16,
    thr_inner: i16,
    thr_hev: i16,
);
| 13 | |
| 14 | pub fn simple_loop_filter(buf: &mut [u8], mut off: usize, step: usize, stride: usize, len: usize, thr: i16, _thr_inner: i16, _thr_hev: i16) { |
| 15 | for _ in 0..len { |
| 16 | let p1 = i16::from(buf[off - step * 2]); |
| 17 | let p0 = i16::from(buf[off - step * 1]); |
| 18 | let q0 = i16::from(buf[off + step * 0]); |
| 19 | let q1 = i16::from(buf[off + step * 1]); |
| 20 | let diff = (p0 - q0).abs() * 2 + ((p1 - q1).abs() >> 1); |
| 21 | if diff <= thr { |
| 22 | let diff = delta(p1, p0, q0, q1); |
| 23 | let diffq0 = (diff + 4).min(127) >> 3; |
| 24 | let diffp0 = (diff + 3).min(127) >> 3; |
| 25 | buf[off - step * 1] = clip_u8(p0 + diffp0); |
| 26 | buf[off + step * 0] = clip_u8(q0 - diffq0); |
| 27 | } |
| 28 | off += stride; |
| 29 | } |
| 30 | } |
| 31 | |
| 32 | fn normal_loop_filter(buf: &mut [u8], mut off: usize, step: usize, stride: usize, len: usize, thr: i16, thr_inner: i16, thr_hev: i16, edge: bool) { |
| 33 | for _i in 0..len { |
| 34 | let p1 = i16::from(buf[off - step * 2]); |
| 35 | let p0 = i16::from(buf[off - step * 1]); |
| 36 | let q0 = i16::from(buf[off + step * 0]); |
| 37 | let q1 = i16::from(buf[off + step * 1]); |
| 38 | let diff = (p0 - q0).abs() * 2 + ((p1 - q1).abs() >> 1); |
| 39 | if diff <= thr { |
| 40 | let p3 = i16::from(buf[off - step * 4]); |
| 41 | let p2 = i16::from(buf[off - step * 3]); |
| 42 | let p1 = i16::from(buf[off - step * 2]); |
| 43 | let q1 = i16::from(buf[off + step * 1]); |
| 44 | let q2 = i16::from(buf[off + step * 2]); |
| 45 | let q3 = i16::from(buf[off + step * 3]); |
| 46 | let dp2 = p3 - p2; |
| 47 | let dp1 = p2 - p1; |
| 48 | let dp0 = p1 - p0; |
| 49 | let dq0 = q1 - q0; |
| 50 | let dq1 = q2 - q1; |
| 51 | let dq2 = q3 - q2; |
| 52 | if (dp0.abs() <= thr_inner) && (dp1.abs() <= thr_inner) && |
| 53 | (dp2.abs() <= thr_inner) && (dq0.abs() <= thr_inner) && |
| 54 | (dq1.abs() <= thr_inner) && (dq2.abs() <= thr_inner) { |
| 55 | let high_edge_variation = (dp0.abs() > thr_hev) || (dq0.abs() > thr_hev); |
| 56 | if high_edge_variation { |
| 57 | let diff = delta(p1, p0, q0, q1); |
| 58 | let diffq0 = (diff + 4).min(127) >> 3; |
| 59 | let diffp0 = (diff + 3).min(127) >> 3; |
| 60 | buf[off - step * 1] = clip_u8(p0 + diffp0); |
| 61 | buf[off + step * 0] = clip_u8(q0 - diffq0); |
| 62 | } else if edge { |
| 63 | let d = delta(p1, p0, q0, q1); |
| 64 | let diff0 = (d * 27 + 63) >> 7; |
| 65 | buf[off - step * 1] = clip_u8(p0 + diff0); |
| 66 | buf[off + step * 0] = clip_u8(q0 - diff0); |
| 67 | let diff1 = (d * 18 + 63) >> 7; |
| 68 | buf[off - step * 2] = clip_u8(p1 + diff1); |
| 69 | buf[off + step * 1] = clip_u8(q1 - diff1); |
| 70 | let diff2 = (d * 9 + 63) >> 7; |
| 71 | buf[off - step * 3] = clip_u8(p2 + diff2); |
| 72 | buf[off + step * 2] = clip_u8(q2 - diff2); |
| 73 | } else { |
| 74 | let diff = (3 * (q0 - p0)).max(-128).min(127); |
| 75 | let diffq0 = (diff + 4).min(127) >> 3; |
| 76 | let diffp0 = (diff + 3).min(127) >> 3; |
| 77 | buf[off - step * 1] = clip_u8(p0 + diffp0); |
| 78 | buf[off + step * 0] = clip_u8(q0 - diffq0); |
| 79 | let diff2 = (diffq0 + 1) >> 1; |
| 80 | buf[off - step * 2] = clip_u8(p1 + diff2); |
| 81 | buf[off + step * 1] = clip_u8(q1 - diff2); |
| 82 | } |
| 83 | } |
| 84 | } |
| 85 | off += stride; |
| 86 | } |
| 87 | } |
| 88 | |
| 89 | pub fn normal_loop_filter_inner(buf: &mut [u8], off: usize, step: usize, stride: usize, len: usize, thr: i16, thr_inner: i16, thr_hev: i16) { |
| 90 | normal_loop_filter(buf, off, step, stride, len, thr, thr_inner, thr_hev, false); |
| 91 | } |
| 92 | |
| 93 | pub fn normal_loop_filter_edge(buf: &mut [u8], off: usize, step: usize, stride: usize, len: usize, thr: i16, thr_inner: i16, thr_hev: i16) { |
| 94 | normal_loop_filter(buf, off, step, stride, len, thr, thr_inner, thr_hev, true); |
| 95 | } |
| 96 | |
/// In-place inverse 4x4 Walsh-Hadamard transform.
///
/// The first pass combines the four rows column by column; the second pass
/// transforms each row and scales the result with `(x + 3) >> 3`.
///
/// All arithmetic is done in `i32` and narrowed back to `i16` only when a
/// value is stored, the same way the `idct4!` macro in this file works.
/// Sums of up to four `i16` values do not fit in `i16`, so doing the
/// additions directly in `i16` would panic in debug builds and wrap before
/// the final shift in release builds for large coefficients.
pub fn iwht4x4(coeffs: &mut [i16; 16]) {
    // Vertical pass.
    for i in 0..4 {
        let s0 = i32::from(coeffs[i]);
        let s1 = i32::from(coeffs[i + 4]);
        let s2 = i32::from(coeffs[i + 8]);
        let s3 = i32::from(coeffs[i + 12]);
        let a1 = s0 + s3;
        let b1 = s1 + s2;
        let c1 = s1 - s2;
        let d1 = s0 - s3;
        // Intermediate values keep 16 bits; the cast truncates on purpose.
        coeffs[i] = (a1 + b1) as i16;
        coeffs[i + 4] = (c1 + d1) as i16;
        coeffs[i + 8] = (a1 - b1) as i16;
        coeffs[i + 12] = (d1 - c1) as i16;
    }
    // Horizontal pass with rounding and scaling.
    for row in coeffs.chunks_mut(4) {
        let r0 = i32::from(row[0]);
        let r1 = i32::from(row[1]);
        let r2 = i32::from(row[2]);
        let r3 = i32::from(row[3]);
        let a1 = r0 + r3;
        let b1 = r1 + r2;
        let c1 = r1 - r2;
        let d1 = r0 - r3;
        // After `>> 3` the result always fits in `i16` again.
        row[0] = ((a1 + b1 + 3) >> 3) as i16;
        row[1] = ((c1 + d1 + 3) >> 3) as i16;
        row[2] = ((a1 - b1 + 3) >> 3) as i16;
        row[3] = ((d1 - c1 + 3) >> 3) as i16;
    }
}
| 123 | |
/// DC-only shortcut for the inverse Walsh-Hadamard transform.
///
/// Fills the whole block with `(coeffs[0] + 3) >> 3`, which is what
/// `iwht4x4` produces when only the first coefficient is non-zero.
/// The addition is done in `i32` so that a DC value close to `i16::MAX`
/// cannot overflow; the shifted result always fits in `i16`.
pub fn iwht4x4_dc(coeffs: &mut [i16; 16]) {
    let dc = ((i32::from(coeffs[0]) + 3) >> 3) as i16;
    *coeffs = [dc; 16];
}
| 128 | |
/// `sqrt(2) * cos(pi / 8) - 1` scaled by 65536 (used with `>> 16`).
const COS_PI8_SQRT2_MINUS1: i32 = 20091;
/// `sqrt(2) * sin(pi / 8)` scaled by 65536 (used with `>> 16`).
const SIN_PI8_SQRT2: i32 = 35468;

/// One-dimensional 4-point inverse DCT applied in place to four `i16` places.
///
/// `$s0..$s3` must be assignable expressions; they are read, transformed in
/// `i32` precision and written back. `$shift` is the number of bits the result
/// is shifted right by, with a rounding bias of half the divisor (zero when
/// `$shift` is 0).
macro_rules! idct4 {
    ($s0: expr, $s1: expr, $s2: expr, $s3: expr, $shift: expr) => {{
        let in0 = i32::from($s0);
        let in1 = i32::from($s1);
        let in2 = i32::from($s2);
        let in3 = i32::from($s3);

        // Even part: plain butterfly.
        let even_sum = in0 + in2;
        let even_diff = in0 - in2;
        // Odd part: rotation using the two fixed-point constants.
        let odd_diff = ((in1 * SIN_PI8_SQRT2) >> 16)
            - (in3 + ((in3 * COS_PI8_SQRT2_MINUS1) >> 16));
        let odd_sum = (in1 + ((in1 * COS_PI8_SQRT2_MINUS1) >> 16))
            + ((in3 * SIN_PI8_SQRT2) >> 16);

        let bias = (1 << $shift) >> 1;
        $s0 = ((even_sum + odd_sum + bias) >> $shift) as i16;
        $s3 = ((even_sum - odd_sum + bias) >> $shift) as i16;
        $s1 = ((even_diff + odd_diff + bias) >> $shift) as i16;
        $s2 = ((even_diff - odd_diff + bias) >> $shift) as i16;
    }}
}
| 150 | |
| 151 | pub fn idct4x4(coeffs: &mut [i16; 16]) { |
| 152 | for i in 0..4 { |
| 153 | idct4!(coeffs[i], coeffs[i + 4], coeffs[i + 8], coeffs[i + 12], 0); |
| 154 | } |
| 155 | for row in coeffs.chunks_mut(4) { |
| 156 | idct4!(row[0], row[1], row[2], row[3], 3); |
| 157 | } |
| 158 | } |
| 159 | |
/// DC-only shortcut for the inverse 4x4 DCT.
///
/// Fills the whole block with `(coeffs[0] + 4) >> 3`, which is what
/// `idct4x4` produces when only the first coefficient is non-zero.
/// The addition is done in `i32` so that a DC value close to `i16::MAX`
/// cannot overflow; the shifted result always fits in `i16`.
pub fn idct4x4_dc(coeffs: &mut [i16; 16]) {
    let dc = ((i32::from(coeffs[0]) + 4) >> 3) as i16;
    *coeffs = [dc; 16];
}
/// Two-tap linear interpolation between `$src[$off]` and `$src[$off + $step]`.
///
/// `$mode` is the weight of the second sample in eighths; the first sample
/// gets `8 - $mode`. The weighted sum is rounded (`+ 4`, `>> 3`), clamped to
/// `0..=255` and returned as `u8`.
macro_rules! interpolate {
    ($src: expr, $off: expr, $step: expr, $mode: expr) => {{
        let near = i32::from($src[$off]);
        let far = i32::from($src[$off + $step]);
        let w_near = (8 - $mode) as i32;
        let w_far = $mode as i32;
        let blended = (w_near * near + w_far * far + 4) >> 3;
        blended.max(0).min(255) as u8
    }}
}
| 173 | |
| 174 | const TMP_STRIDE: usize = 16; |
| 175 | |
| 176 | fn mc_block_common(dst: &mut [u8], mut doff: usize, dstride: usize, src: &[u8], sstride: usize, size: usize, mx: usize, my: usize) { |
| 177 | if (mx == 0) && (my == 0) { |
| 178 | let dst = &mut dst[doff..]; |
| 179 | for (out, src) in dst.chunks_mut(dstride).take(size).zip(src.chunks(sstride)) { |
| 180 | (&mut out[0..size]).copy_from_slice(&src[0..size]); |
| 181 | } |
| 182 | } else if my == 0 { |
| 183 | for src in src.chunks(sstride).take(size) { |
| 184 | for x in 0..size { |
| 185 | dst[doff + x] = interpolate!(src, x, 1, mx); |
| 186 | } |
| 187 | doff += dstride; |
| 188 | } |
| 189 | } else if mx == 0 { |
| 190 | for y in 0..size { |
| 191 | for x in 0..size { |
| 192 | dst[doff + x] = interpolate!(src, x + y * sstride, sstride, my); |
| 193 | } |
| 194 | doff += dstride; |
| 195 | } |
| 196 | } else { |
| 197 | let mut tmp = [0u8; TMP_STRIDE * (16 + 1)]; |
| 198 | for (y, dst) in tmp.chunks_mut(TMP_STRIDE).take(size + 1).enumerate() { |
| 199 | for x in 0..size { |
| 200 | dst[x] = interpolate!(src, x + y * sstride, 1, mx); |
| 201 | } |
| 202 | } |
| 203 | for y in 0..size { |
| 204 | for x in 0..size { |
| 205 | dst[doff + x] = interpolate!(tmp, x + y * TMP_STRIDE, TMP_STRIDE, my); |
| 206 | } |
| 207 | doff += dstride; |
| 208 | } |
| 209 | } |
| 210 | } |
| 211 | fn mc_block(dst: &mut [u8], doff: usize, dstride: usize, xpos: usize, ypos: usize, |
| 212 | mvx: i16, mvy: i16, reffrm: NAVideoBufferRef<u8>, plane: usize, |
| 213 | mc_buf: &mut [u8], size: usize) { |
| 214 | if (mvx == 0) && (mvy == 0) { |
| 215 | let dst = &mut dst[doff..]; |
| 216 | let sstride = reffrm.get_stride(plane); |
| 217 | let srcoff = reffrm.get_offset(plane) + xpos + ypos * sstride; |
| 218 | let src = &reffrm.get_data(); |
| 219 | let src = &src[srcoff..]; |
| 220 | for (out, src) in dst.chunks_mut(dstride).take(size).zip(src.chunks(sstride)) { |
| 221 | (&mut out[0..size]).copy_from_slice(&src[0..size]); |
| 222 | } |
| 223 | return; |
| 224 | } |
| 225 | let (w, h) = reffrm.get_dimensions(plane); |
| 226 | let wa = if plane == 0 { (w + 15) & !15 } else { (w + 7) & !7 } as isize; |
| 227 | let ha = if plane == 0 { (h + 15) & !15 } else { (h + 7) & !7 } as isize; |
| 228 | let bsize = (size as isize) + 1; |
| 229 | let ref_x = (xpos as isize) + ((mvx >> 3) as isize); |
| 230 | let ref_y = (ypos as isize) + ((mvy >> 3) as isize); |
| 231 | |
| 232 | let (src, sstride) = if (ref_x < 0) || (ref_x + bsize > wa) || (ref_y < 0) || (ref_y + bsize > ha) { |
| 233 | edge_emu(&reffrm, ref_x, ref_y, bsize as usize, bsize as usize, mc_buf, 32, plane, 4); |
| 234 | (mc_buf as &[u8], 32) |
| 235 | } else { |
| 236 | let off = reffrm.get_offset(plane); |
| 237 | let stride = reffrm.get_stride(plane); |
| 238 | let data = reffrm.get_data(); |
| 239 | (&data[off + (ref_x as usize) + (ref_y as usize) * stride..], stride) |
| 240 | }; |
| 241 | let mx = (mvx & 7) as usize; |
| 242 | let my = (mvy & 7) as usize; |
| 243 | mc_block_common(dst, doff, dstride, src, sstride, size, mx, my); |
| 244 | } |
| 245 | pub fn mc_block16x16_bilin(dst: &mut [u8], doff: usize, dstride: usize, xpos: usize, ypos: usize, |
| 246 | mvx: i16, mvy: i16, src: NAVideoBufferRef<u8>, plane: usize, mc_buf: &mut [u8]) { |
| 247 | mc_block(dst, doff, dstride, xpos, ypos, mvx, mvy, src, plane, mc_buf, 16); |
| 248 | } |
| 249 | pub fn mc_block8x8_bilin(dst: &mut [u8], doff: usize, dstride: usize, xpos: usize, ypos: usize, |
| 250 | mvx: i16, mvy: i16, src: NAVideoBufferRef<u8>, plane: usize, mc_buf: &mut [u8]) { |
| 251 | mc_block(dst, doff, dstride, xpos, ypos, mvx, mvy, src, plane, mc_buf, 8); |
| 252 | } |
| 253 | pub fn mc_block4x4_bilin(dst: &mut [u8], doff: usize, dstride: usize, xpos: usize, ypos: usize, |
| 254 | mvx: i16, mvy: i16, src: NAVideoBufferRef<u8>, plane: usize, mc_buf: &mut [u8]) { |
| 255 | mc_block(dst, doff, dstride, xpos, ypos, mvx, mvy, src, plane, mc_buf, 4); |
| 256 | } |