1 #[allow(unexpected_cfgs)]
3 pub use mc::{H264MC, McBlock};
4 #[cfg(target_arch="x86_64")]
7 pub const CHROMA_QUANTS: [u8; 52] = [
8 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
9 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
10 31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
14 pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
15 pub const ZIGZAG: [usize; 16] = [
16 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
18 pub const ZIGZAG1: [usize; 15] = [
19 0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
21 /*pub const IL_SCAN: [usize; 16] = [
22 0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
24 pub const ZIGZAG8X8: [usize; 64] = [
25 0, 1, 8, 16, 9, 2, 3, 10,
26 17, 24, 32, 25, 18, 11, 4, 5,
27 12, 19, 26, 33, 40, 48, 41, 34,
28 27, 20, 13, 6, 7, 14, 21, 28,
29 35, 42, 49, 56, 57, 50, 43, 36,
30 29, 22, 15, 23, 30, 37, 44, 51,
31 58, 59, 52, 45, 38, 31, 39, 46,
32 53, 60, 61, 54, 47, 55, 62, 63
35 const LEVEL_SCALE: [[i16; 6]; 3] = [
36 [ 10, 11, 13, 14, 16, 18 ],
37 [ 16, 18, 20, 23, 25, 29 ],
38 [ 13, 14, 16, 18, 20, 23 ]
41 pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
42 let t0 = blk[0] + blk[2];
43 let t1 = blk[0] - blk[2];
44 let t2 = blk[1] + blk[3];
45 let t3 = blk[1] - blk[3];
51 let mul = LEVEL_SCALE[0][qp as usize];
52 for el in blk.iter_mut() {
53 *el = el.wrapping_mul(mul) >> 1;
56 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
57 let shift = qp / 6 - 1;
58 for el in blk.iter_mut() {
59 *el = el.wrapping_mul(mul) << shift;
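// Note: the 2x2 butterfly above is the chroma DC inverse transform; the two branches then
// apply the dequant factor mul * 2^(qp/6 - 1). For example, qp = 20 gives
// mul = LEVEL_SCALE[0][2] = 13 and shift = 2, while for qp < 6 the half step is folded
// into the `>> 1` of the first branch.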
64 macro_rules! transform {
65 (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
66 let t0 = $a.wrapping_add($c);
67 let t1 = $a.wrapping_sub($c);
68 let t2 = $b.wrapping_add($d);
69 let t3 = $b.wrapping_sub($d);
70 $a = t0.wrapping_add(t2);
71 $b = t1.wrapping_add(t3);
72 $c = t1.wrapping_sub(t3);
73 $d = t0.wrapping_sub(t2);
75 ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
76 let t0 = $a.wrapping_add($c);
77 let t1 = $a.wrapping_sub($c);
78 let t2 = ($b >> 1).wrapping_sub($d);
79 let t3 = $b.wrapping_add($d >> 1);
80 let bias = 1 << $shift >> 1;
81 $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
82 $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
83 $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
84 $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
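// The rule above is the plain 4-point inverse transform used for both passes of the
// 4x4 IDCT below: $shift is 0 for the first (row) pass and 6 for the final pass, where
// bias = 1 << $shift >> 1 provides the usual (x + 32) >> 6 rounding.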
86 ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
88 let e1 = -$d + $f - $h - ($h >> 1);
90 let e3 = $b + $h - $d - ($d >> 1);
91 let e4 = ($c >> 1) - $g;
92 let e5 = -$b + $h + $f + ($f >> 1);
93 let e6 = $c + ($g >> 1);
94 let e7 = $d + $f + $b + ($b >> 1);
97 let f1 = e1 + (e7 >> 2);
99 let f3 = e3 + (e5 >> 2);
101 let f5 = (e3 >> 2) - e5;
103 let f7 = e7 - (e1 >> 2);
116 pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
118 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
119 let shift = 2 - qp / 6;
120 let bias = 1 << shift >> 1;
121 for el in blk.iter_mut() {
122 *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
125 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
126 let shift = qp / 6 - 2;
127 for el in blk.iter_mut() {
128 *el = el.wrapping_mul(mul) << shift;
132 transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
134 for row in blk.chunks_exact_mut(4) {
135 transform!(luma_dc; row[0], row[1], row[2], row[3]);
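// idct_luma_dc: dequantize the 16 luma DC coefficients (for small qp the scale is a right
// shift, hence the rounding bias) and run them through the `luma_dc` butterfly, which omits
// the >> 1 terms of the regular transform, columns first and then rows.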
139 pub fn idct_skip_dc(blk: &mut [i16; 16], qp: u8) {
140 const BLK_INDEX: [usize; 16] = [
146 let qidx = (qp % 6) as usize;
148 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(1) { // skip(1): the DC coefficient is reconstructed separately and must stay untouched here
149 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
151 for row in blk.chunks_exact_mut(4) {
152 transform!(row[0], row[1], row[2], row[3], 0);
155 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
159 pub fn idct(blk: &mut [i16; 16], qp: u8) {
160 const BLK_INDEX: [usize; 16] = [
166 let qidx = (qp % 6) as usize;
168 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()) {
169 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
171 for row in blk.chunks_exact_mut(4) {
172 transform!(row[0], row[1], row[2], row[3], 0);
175 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
179 pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
180 let dc = if quant_dc {
181 (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
185 *blk = [(dc + 0x20) >> 6; 16];
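// DC-only shortcut: every sample of the 4x4 residual block becomes (dc + 32) >> 6.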
188 const QMAT_8X8: [[u8; 16]; 6] = [
222 pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
223 for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
225 *el = el.wrapping_mul(i16::from(slist[scan]));
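// Each 8x8 coefficient is multiplied by its scaling-list entry; ZIGZAG8X8 maps between
// zigzag and raster ordering so each coefficient picks up the matching list entry.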
230 pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
231 let mut tmp = [0i32; 64];
232 let qmat = &QMAT_8X8[(qp % 6) as usize];
234 let shift = qp / 6 - 6;
235 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
238 let idx = (x & 3) + (y & 3) * 4;
239 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
242 let shift = 6 - qp / 6;
243 let bias = (1 << shift) >> 1;
244 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
247 let idx = (x & 3) + (y & 3) * 4;
248 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
251 for row in tmp.chunks_exact_mut(8) {
252 transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
255 transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
256 tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
258 for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
259 *dst = ((src + 0x20) >> 6) as i16;
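// idct8x8: QMAT_8X8 stores only a 4x4 pattern that repeats over the 8x8 block, hence the
// (x & 3, y & 3) indexing; after dequantization the 8-point transform runs over the rows,
// then the columns, and the result is rounded with (x + 32) >> 6.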
263 pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
264 let out = &mut dst[offset..][..stride * 3 + 4]; // three full strides plus four pixels: just enough to cover the 4x4 block
265 for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
266 for (dst, src) in line.iter_mut().take(4).zip(src.iter()) {
267 *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
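// Adds a decoded 4x4 residual to the already predicted pixels, clamping to 0..255.
// A usage sketch (frame_luma, bx, by are illustrative names, not from this file):
//     idct(&mut coeffs, qp);
//     add_coeffs(&mut frame_luma, bx + by * stride, stride, &coeffs);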
272 pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
273 let out = &mut dst[offset..];
274 for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
275 for (dst, src) in line.iter_mut().take(8).zip(src.iter()) {
276 *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
281 fn clip8(val: i16) -> u8 { val.max(0).min(255) as u8 }
283 fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
284 for row in buf.chunks_mut(stride).take(bsize) {
285 for el in row[..bsize].iter_mut() {
290 fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
291 for row in buf.chunks_mut(stride).take(bsize) {
292 row[..bsize].copy_from_slice(&top[..bsize]);
295 fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
296 for (row, &left) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
297 for el in row[..bsize].iter_mut() {
302 fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
303 let mut adc: u16 = 0;
304 for i in 0..bsize { adc += u16::from(top[i]); }
305 for i in 0..bsize { adc += u16::from(left[i + 1]); }
306 let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
308 for row in buf.chunks_mut(stride).take(bsize) {
309 for el in row[..bsize].iter_mut() {
314 fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
315 let mut adc: u16 = 0;
316 for i in 0..bsize { adc += u16::from(left[i + 1]); }
317 let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
319 for row in buf.chunks_mut(stride).take(bsize) {
320 for el in row[..bsize].iter_mut() {
325 fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
326 let mut adc: u16 = 0;
327 for i in 0..bsize { adc += u16::from(top[i]); }
328 let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
330 for row in buf.chunks_mut(stride).take(bsize) {
331 for el in row[..bsize].iter_mut() {
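// In the DC helpers above, `shift` is log2 of the number of neighbour samples summed
// (e.g. 3 when both 4-pixel edges contribute, 2 when only one does), and `left[0]` holds
// the top-left sample, so the left column proper starts at left[1].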
337 fn load(dst: &mut [u16], src: &[u8]) {
338 for (dst, &src) in dst.iter_mut().zip(src.iter()) {
339 *dst = u16::from(src);
343 fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
344 ipred_ver(buf, stride, top, 4);
346 fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
347 ipred_hor(buf, stride, left, 4);
349 fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
350 let mut t: [u16; 9] = [0; 9];
351 load(&mut t[..4], top);
352 load(&mut t[4..8], tr);
356 buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
358 let dst = &mut buf[stride..];
360 dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
362 let dst = &mut buf[stride * 2..];
364 dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
366 let dst = &mut buf[stride * 3..];
368 dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
371 fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
372 let mut t: [u16; 5] = [0; 5];
373 t[0] = u16::from(left[0]);
374 load(&mut t[1..], top);
375 let mut l: [u16; 5] = [0; 5];
381 dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
383 dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
385 dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
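// For the diagonal-down-right / vertical-right / horizontal-down modes both t[0] and l[0]
// are seeded with the top-left sample, so t[1..] and l[1..] hold the actual top row and
// left column; this keeps the spec's p[-1,-1] corner cases simple to index.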
389 fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
390 let mut t: [u16; 5] = [0; 5];
391 t[0] = u16::from(left[0]);
392 load(&mut t[1..], top);
393 let mut l: [u16; 5] = [0; 5];
399 let zvr = ((2 * i) as i8) - (j as i8);
403 pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
405 pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
409 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
411 pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
414 dst[i + j * stride] = pix as u8;
418 fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
419 let mut t: [u16; 8] = [0; 8];
420 load(&mut t[..4], top);
421 load(&mut t[4..], tr);
424 dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
425 let pix = ((t[1] + t[2] + 1) >> 1) as u8;
426 dst[1 + 0 * stride] = pix;
427 dst[0 + 2 * stride] = pix;
428 let pix = ((t[2] + t[3] + 1) >> 1) as u8;
429 dst[2 + 0 * stride] = pix;
430 dst[1 + 2 * stride] = pix;
431 let pix = ((t[3] + t[4] + 1) >> 1) as u8;
432 dst[3 + 0 * stride] = pix;
433 dst[2 + 2 * stride] = pix;
434 dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
435 dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
436 let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
437 dst[1 + 1 * stride] = pix;
438 dst[0 + 3 * stride] = pix;
439 let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
440 dst[2 + 1 * stride] = pix;
441 dst[1 + 3 * stride] = pix;
442 let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
443 dst[3 + 1 * stride] = pix;
444 dst[2 + 3 * stride] = pix;
445 dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
447 fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
448 let mut t: [u16; 5] = [0; 5];
449 t[0] = u16::from(left[0]);
450 load(&mut t[1..], top);
451 let mut l: [u16; 5] = [0; 5];
457 let zhd = ((2 * j) as i8) - (i as i8);
461 pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
463 pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
467 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
469 pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
472 dst[i + j * stride] = pix as u8;
476 fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
477 let mut l: [u16; 8] = [0; 8];
478 load(&mut l, &left[1..]);
481 dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
482 dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
483 let pix = ((l[1] + l[2] + 1) >> 1) as u8;
484 dst[2 + 0 * stride] = pix;
485 dst[0 + 1 * stride] = pix;
486 let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
487 dst[3 + 0 * stride] = pix;
488 dst[1 + 1 * stride] = pix;
489 let pix = ((l[2] + l[3] + 1) >> 1) as u8;
490 dst[2 + 1 * stride] = pix;
491 dst[0 + 2 * stride] = pix;
492 let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
493 dst[3 + 1 * stride] = pix;
494 dst[1 + 2 * stride] = pix;
495 dst[3 + 2 * stride] = l[3] as u8;
496 dst[1 + 3 * stride] = l[3] as u8;
497 dst[0 + 3 * stride] = l[3] as u8;
498 dst[2 + 2 * stride] = l[3] as u8;
499 dst[2 + 3 * stride] = l[3] as u8;
500 dst[3 + 3 * stride] = l[3] as u8;
502 fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
503 ipred_dc(buf, stride, top, left, 4, 3);
505 fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
506 ipred_left_dc(buf, stride, left, 4, 2);
508 fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
509 ipred_top_dc(buf, stride, top, 4, 2);
511 fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
512 ipred_dc128(buf, stride, 4);
515 pub struct IPred8Context {
522 pub fn new() -> Self {
529 pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
530 let mut t = [0x80u8; 19];
531 let mut l = [0x80u8; 11];
533 t[1..8 + 1].copy_from_slice(&top[..8]);
536 t[8 + 1..16 + 1].copy_from_slice(&top[8..][..8]);
537 t[16 + 1] = t[15 + 1];
538 t[17 + 1] = t[15 + 1];
540 let (t0, t1) = t.split_at_mut(8 + 1);
541 for el in t1.iter_mut() {
546 l[1..9].copy_from_slice(&left[1..9]);
559 self.t[i] = ((u16::from(t[i]) + 2 * u16::from(t[i + 1]) + u16::from(t[i + 2]) + 2) >> 2) as u8;
562 self.l[i] = ((u16::from(l[i]) + 2 * u16::from(l[i + 1]) + u16::from(l[i + 2]) + 2) >> 2) as u8;
564 self.tl = if has_t && has_l {
565 ((u16::from(t[1]) + 2 * u16::from(t[0]) + u16::from(l[1]) + 2) >> 2) as u8
567 ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
569 ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
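// The 8x8 luma predictors operate on smoothed neighbours: fill() gathers up to 16 top
// samples plus padding, 8 left samples and the corner, substitutes 0x80 wherever nothing is
// available, and stores the [1, 2, 1]/4 filtered values in self.t, self.l and self.tl.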
576 fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
577 for row in buf.chunks_mut(stride).take(8) {
578 row[..8].copy_from_slice(&ctx.t[..8]);
581 fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
582 for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
583 row[..8].copy_from_slice(&[l; 8]);
586 fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
587 let mut t = [0u16; 16];
588 load(&mut t, &ctx.t);
590 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
591 for (x, pix) in row.iter_mut().take(8).enumerate() {
592 *pix = ((if (x != 7) || (y != 7) {
593 t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
600 fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
601 let mut t = [0u16; 9];
602 t[0] = u16::from(ctx.tl);
603 load(&mut t[1..], &ctx.t);
604 let mut l = [0u16; 9];
605 l[0] = u16::from(ctx.tl);
606 load(&mut l[1..], &ctx.l);
607 let diag = t[1] + 2 * t[0] + l[1];
609 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
610 for (x, pix) in row.iter_mut().take(8).enumerate() {
612 t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
614 l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
621 fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
622 let mut t = [0u16; 9];
623 t[0] = u16::from(ctx.tl);
624 load(&mut t[1..], &ctx.t);
625 let mut l = [0u16; 9];
626 l[0] = u16::from(ctx.tl);
627 load(&mut l[1..], &ctx.l);
629 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
630 for (x, pix) in row.iter_mut().take(8).enumerate() {
631 let zvr = 2 * (x as i8) - (y as i8);
633 let ix = x - (y >> 1);
635 (t[ix] + t[ix + 1] + 1) >> 1
637 (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
639 } else if zvr == -1 {
640 (l[1] + 2 * l[0] + t[1] + 2) >> 2
643 (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
648 fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
649 let mut t = [0u16; 16];
650 load(&mut t, &ctx.t);
652 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
653 for (x, pix) in row.iter_mut().take(8).enumerate() {
654 let ix = x + (y >> 1);
655 *pix = if (y & 1) == 0 {
656 (t[ix] + t[ix + 1] + 1) >> 1
658 (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
664 fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
665 let mut t = [0u16; 9];
666 t[0] = u16::from(ctx.tl);
667 load(&mut t[1..], &ctx.t);
668 let mut l = [0u16; 9];
669 l[0] = u16::from(ctx.tl);
670 load(&mut l[1..], &ctx.l);
672 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
673 for (x, pix) in row.iter_mut().take(8).enumerate() {
674 let zhd = 2 * (y as i8) - (x as i8);
676 let ix = y - (x >> 1);
678 (l[ix] + l[ix + 1] + 1) >> 1
680 (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
682 } else if zhd == -1 {
683 (l[1] + 2 * l[0] + t[1] + 2) >> 2
686 (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
691 fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
692 let mut l = [0u16; 8];
693 load(&mut l, &ctx.l);
695 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
696 for (x, pix) in row.iter_mut().take(8).enumerate() {
698 let ix = y + (x >> 1);
701 } else if zhu == 13 {
702 (l[6] + 3 * l[7] + 2) >> 2
703 } else if (zhu & 1) != 0 {
704 (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
706 (l[ix] + l[ix + 1] + 1) >> 1
711 fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
713 for &t in ctx.t[..8].iter() {
716 for &l in ctx.l[..8].iter() {
719 let dc = ((sum + 8) >> 4) as u8;
720 for row in buf.chunks_mut(stride).take(8) {
721 for pix in row.iter_mut().take(8) {
726 fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
728 for &l in ctx.l[..8].iter() {
731 let dc = ((sum + 4) >> 3) as u8;
732 for row in buf.chunks_mut(stride).take(8) {
733 for pix in row.iter_mut().take(8) {
738 fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
740 for &t in ctx.t[..8].iter() {
743 let dc = ((sum + 4) >> 3) as u8;
744 for row in buf.chunks_mut(stride).take(8) {
745 for pix in row.iter_mut().take(8) {
750 fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
751 ipred_dc128(buf, stride, 8);
754 fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
755 ipred_ver(buf, stride, top, 8);
757 fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
758 ipred_hor(buf, stride, left, 8);
760 fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
762 load(&mut l, &left[1..]);
766 let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
767 let sum1 = t[4] + t[5] + t[6] + t[7];
768 let dc1 = ((sum1 + 2) >> 2) as u8;
769 let sum2 = l[4] + l[5] + l[6] + l[7];
770 let dc2 = ((sum2 + 2) >> 2) as u8;
771 let dc3 = ((sum1 + sum2 + 4) >> 3) as u8;
773 for row in buf.chunks_mut(stride).take(4) {
774 row[..4].copy_from_slice(&[dc0; 4]);
775 row[4..8].copy_from_slice(&[dc1; 4]);
777 for row in buf.chunks_mut(stride).skip(4).take(4) {
778 row[..4].copy_from_slice(&[dc2; 4]);
779 row[4..8].copy_from_slice(&[dc3; 4]);
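// Chroma 8x8 DC prediction works per 4x4 quadrant: the top-left quadrant averages both
// edges, the top-right uses only its top samples, the bottom-left only its left samples,
// and the bottom-right averages the remaining top and left halves.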
782 fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
783 let mut left_dc0 = 0;
784 let mut left_dc1 = 0;
785 for &el in left[1..].iter().take(4) {
786 left_dc0 += u16::from(el);
788 for &el in left[1..].iter().skip(4).take(4) {
789 left_dc1 += u16::from(el);
791 let dc0 = ((left_dc0 + 2) >> 2) as u8;
792 let dc2 = ((left_dc1 + 2) >> 2) as u8;
793 for row in buf.chunks_mut(stride).take(4) {
794 row[..8].copy_from_slice(&[dc0; 8]);
796 for row in buf.chunks_mut(stride).skip(4).take(4) {
797 row[..8].copy_from_slice(&[dc2; 8]);
800 fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
801 ipred_top_dc(buf, stride, top, 4, 2);
802 ipred_top_dc(&mut buf[4..], stride, &top[4..], 4, 2);
803 let mut top = [0; 8];
804 top.copy_from_slice(&buf[stride * 3..][..8]);
805 ipred_top_dc(&mut buf[4 * stride..], stride, &top, 4, 2);
806 ipred_top_dc(&mut buf[4 + 4 * stride..], stride, &top[4..], 4, 2);
808 fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
809 ipred_dc128(buf, stride, 8);
811 fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
812 let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
813 let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
815 let i1 = (i + 1) as i32;
816 h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
817 v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
819 let b = (17 * h + 16) >> 5;
820 let c = (17 * v + 16) >> 5;
821 let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
822 for line in buf.chunks_mut(stride).take(8) {
824 for el in line.iter_mut().take(8) {
825 *el = clip8((acc >> 5) as i16);
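// Plane mode: h and v estimate the horizontal and vertical gradients from the outer
// neighbours, b = (17*h + 16) >> 5 and c = (17*v + 16) >> 5 become the per-pixel steps,
// and the block is filled with clip8((a + b*x + c*y) >> 5); the usual "+ 16 - 3*(b + c)"
// centring term is folded into `a` above.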
832 fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
833 ipred_ver(buf, stride, top, 16);
835 fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
836 ipred_hor(buf, stride, left, 16);
838 fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
839 ipred_dc(buf, stride, top, left, 16, 5);
841 fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
842 ipred_left_dc(buf, stride, left, 16, 4);
844 fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
845 ipred_top_dc(buf, stride, top, 16, 4);
847 fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
848 ipred_dc128(buf, stride, 16);
850 fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
851 let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
852 let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
854 h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
855 v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
858 h = (5 * h + 32) >> 6;
859 v = (5 * v + 32) >> 6;
861 let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);
863 for row in buf.chunks_mut(stride).take(16) {
867 for dst in row.chunks_exact_mut(4).take(4) {
868 dst[0] = clip8(((b ) >> 5) as i16);
869 dst[1] = clip8(((b + h) >> 5) as i16);
870 dst[2] = clip8(((b + 2*h) >> 5) as i16);
871 dst[3] = clip8(((b + 3*h) >> 5) as i16);
877 pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
878 pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
879 pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);
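// The tables below are indexed by the prediction mode as coded in the bitstream; the extra
// *_DC_LEFT, *_DC_TOP and *_DC128 entries are fallbacks selected when some of the
// neighbouring samples are unavailable.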
881 pub const IPRED4_DC128: usize = 11;
882 pub const IPRED4_DC_TOP: usize = 10;
883 pub const IPRED4_DC_LEFT: usize = 9;
884 pub const IPRED8_DC128: usize = 6;
885 pub const IPRED8_DC_TOP: usize = 5;
886 pub const IPRED8_DC_LEFT: usize = 4;
888 pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
889 ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
890 ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
891 ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
892 ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
895 pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
896 ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
897 ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
898 ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
899 ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
900 ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
903 pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
904 ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
905 ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
908 pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
909 ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
910 ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
913 macro_rules! loop_filter {
914 (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
915 let p2 = i16::from($buf[$off - $step * 3]);
916 let p1 = i16::from($buf[$off - $step * 2]);
917 let p0 = i16::from($buf[$off - $step]);
918 let q0 = i16::from($buf[$off]);
919 let q1 = i16::from($buf[$off + $step]);
920 let q2 = i16::from($buf[$off + $step * 2]);
921 let a_p = (p2 - p0).abs() < $beta;
922 let a_q = (q2 - q0).abs() < $beta;
923 if a_p && (p0 - q0).abs() < (($alpha >> 2) + 2) {
924 let p3 = i16::from($buf[$off - $step * 4]);
925 $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
926 $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
927 $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
929 $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
931 if a_q && (p0 - q0).abs() < (($alpha >> 2) + 2) {
932 let q3 = i16::from($buf[$off + $step * 3]);
933 $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
934 $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
935 $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
937 $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
940 (chromaedge; $buf: expr, $off: expr, $step: expr) => {
941 let p1 = i16::from($buf[$off - $step * 2]);
942 let p0 = i16::from($buf[$off - $step]);
943 let q0 = i16::from($buf[$off]);
944 let q1 = i16::from($buf[$off + $step]);
945 $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
946 $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
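// `lumaedge`/`chromaedge` are the strong filters (the bS == 4 case in the spec): luma
// switches to the 4/5-tap smoothing above only when the extra |p2 - p0| / |q2 - q0| and
// |p0 - q0| thresholds pass, while chroma always uses the simple 2-tap form.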
948 (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
949 let p2 = i16::from($buf[$off - $step * 3]);
950 let p1 = i16::from($buf[$off - $step * 2]);
951 let p0 = i16::from($buf[$off - $step]);
952 let q0 = i16::from($buf[$off]);
953 let q1 = i16::from($buf[$off + $step]);
954 let q2 = i16::from($buf[$off + $step * 2]);
955 let a_p = (p2 - p0).abs() < $beta;
956 let a_q = (q2 - q0).abs() < $beta;
957 let tc = $tc0 + (a_p as i16) + (a_q as i16);
958 let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
959 if a_p && ($tc0 > 0) {
960 $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
962 $buf[$off - $step] = clip8(p0 + delta);
963 $buf[$off] = clip8(q0 - delta);
964 if a_q && ($tc0 > 0) {
965 $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
968 (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
969 let p1 = i16::from($buf[$off - $step * 2]);
970 let p0 = i16::from($buf[$off - $step]);
971 let q0 = i16::from($buf[$off]);
972 let q1 = i16::from($buf[$off + $step]);
974 let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
975 $buf[$off - $step] = clip8(p0 + delta);
976 $buf[$off] = clip8(q0 - delta);
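// `lumanormal`/`chromanormal` implement the normal filter: delta is clipped to +/-tc, and
// for luma the inner p1/q1 samples are additionally adjusted (clipped to +/-tc0) when the
// corresponding |p2 - p0| / |q2 - q0| threshold passes.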
980 fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
981 let p1 = i16::from(buf[off - step * 2]);
982 let p0 = i16::from(buf[off - step]);
983 let q0 = i16::from(buf[off]);
984 let q1 = i16::from(buf[off + step]);
985 (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
988 #[cfg(not(target_arch="x86_64"))]
989 fn check_filter4(buf: &[u8], mut off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
990 let mut flags = [false; 4];
991 for flag in flags.iter_mut() {
992 let p1 = i16::from(buf[off - step * 2]);
993 let p0 = i16::from(buf[off - step]);
994 let q0 = i16::from(buf[off]);
995 let q1 = i16::from(buf[off + step]);
996 *flag = (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
1002 #[cfg(target_arch="x86_64")]
1003 fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
1005 let mut flags = [false; 4];
1006 let src = buf[off - step * 2..].as_ptr();
1007 let load_stride = step.max(stride); // whichever of step/stride is the actual row pitch (the caller passes 1 for the other)
1008 let fptr = flags.as_mut_ptr();
1009 let tflag = u32::from(step == 1); // step == 1 means a vertical edge, so the loaded 4x4 block has to be transposed
1013 "movd xmm0, dword ptr [{src}]",
1014 "lea {tmp}, [{src} + {stride} * 2]",
1015 "movd xmm1, dword ptr [{src} + {stride}]",
1016 "movd xmm2, dword ptr [{tmp}]",
1017 "movd xmm3, dword ptr [{tmp} + {stride}]",
1018 "punpcklbw xmm0, xmm4",
1019 "punpcklbw xmm1, xmm4",
1020 "punpcklbw xmm2, xmm4",
1021 "punpcklbw xmm3, xmm4",
1023 // transpose block if necessary so it's always processed by rows
1024 "test {tflag:e}, {tflag:e}",
1026 "punpcklwd xmm0, xmm1",
1027 "movhlps xmm4, xmm0",
1028 "punpcklwd xmm2, xmm3",
1029 "movhlps xmm1, xmm2",
1030 "punpckldq xmm0, xmm2",
1031 "punpckldq xmm4, xmm1",
1032 "movhlps xmm1, xmm0",
1033 "movhlps xmm3, xmm4",
1034 "movaps xmm2, xmm4",
1037 // calculate deltas and flags
1038 "movd xmm4, {alpha:r}",
1039 "movd xmm5, {beta:r}",
1043 "pshuflw xmm4, xmm4, 0",
1044 "pshuflw xmm5, xmm5, 0",
1045 "pabsw xmm0, xmm0", // |p1 - p0|
1046 "pabsw xmm1, xmm1", // |p0 - q0|
1047 "pabsw xmm2, xmm3", // |q1 - q0|
1048 "movaps xmm3, xmm5",
1049 "pcmpgtw xmm4, xmm1",
1050 "pcmpgtw xmm5, xmm0",
1051 "pcmpgtw xmm3, xmm2",
1054 "packsswb xmm4, xmm4",
1055 "movd [{flags}], xmm4",
1058 stride = in(reg) load_stride,
1059 alpha = in(reg) alpha,
1060 beta = in(reg) beta,
1061 flags = in(reg) fptr,
1062 tflag = in(reg) tflag,
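// The x86_64 version above computes the same four per-line decisions as the scalar
// fallback: it loads four 4-pixel groups around the edge, transposes them for vertical
// edges, forms |p0 - q0|, |p1 - p0| and |q1 - q0| as packed words, compares them against
// alpha/beta and stores the four booleans directly into `flags`.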
1074 pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
1075 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1076 for &flag in flags.iter() {
1078 loop_filter!(lumaedge; dst, off, 1, alpha, beta);
1083 pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
1084 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1085 for (x, &flag) in flags.iter().enumerate() {
1087 loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
1091 pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1092 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1093 for &flag in flags.iter() {
1095 loop_filter!(lumanormal; dst, off, 1, tc0, beta);
1100 pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1101 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1102 for (x, &flag) in flags.iter().enumerate() {
1104 loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
1108 pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
1110 if check_filter(dst, off, 1, alpha, beta) {
1111 loop_filter!(chromaedge; dst, off, 1);
1116 pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
1118 if check_filter(dst, off + x, stride, alpha, beta) {
1119 loop_filter!(chromaedge; dst, off + x, stride);
1123 pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1125 if check_filter(dst, off, 1, alpha, beta) {
1126 loop_filter!(chromanormal; dst, off, 1, tc0);
1131 pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1133 if check_filter(dst, off + x, stride, alpha, beta) {
1134 loop_filter!(chromanormal; dst, off + x, stride, tc0);