2 pub use mc::{H264MC, McBlock};
3 #[cfg(target_arch="x86_64")]
// Luma-QP -> chroma-QP mapping table: chroma QP follows luma QP directly for
// low values and saturates/compresses at the high end (cf. the H.264 spec table).
pub const CHROMA_QUANTS: [u8; 52] = [
 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
// Scan order for the 2x2 chroma DC block (plain raster order).
pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
// Zigzag scan order for a 4x4 block.
pub const ZIGZAG: [usize; 16] = [
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
// 15-entry scan variant; presumably the scan for 4x4 blocks whose DC
// coefficient is coded separately — TODO confirm at the use site.
pub const ZIGZAG1: [usize; 15] = [
    0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
/*pub const IL_SCAN: [usize; 16] = [
    0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
// Zigzag scan order for an 8x8 block.
pub const ZIGZAG8X8: [usize; 64] = [
 0,  1,  8, 16,  9,  2,  3, 10,
17, 24, 32, 25, 18, 11,  4,  5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13,  6,  7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
// Dequantisation scale factors indexed by [coefficient class][qp % 6].
const LEVEL_SCALE: [[i16; 6]; 3] = [
    [ 10, 11, 13, 14, 16, 18 ],
    [ 16, 18, 20, 23, 25, 29 ],
    [ 13, 14, 16, 18, 20, 23 ]
/// Inverse 2x2 Hadamard transform plus dequantisation of the chroma DC block,
/// in place. `qp` is the chroma quantiser.
pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
    // 2x2 butterfly over the four DC coefficients.
    let t0 = blk[0] + blk[2];
    let t1 = blk[0] - blk[2];
    let t2 = blk[1] + blk[3];
    let t3 = blk[1] - blk[3];
    // Scale by the per-QP factor, then halve.
    // NOTE(review): the guard selecting this branch is not visible here;
    // indexing LEVEL_SCALE directly by `qp` implies qp < 6 on this path.
    let mul = LEVEL_SCALE[0][qp as usize];
    for el in blk.iter_mut() {
        *el = el.wrapping_mul(mul) >> 1;
    // Alternate branch: scale by the qp%6 factor and shift up by qp/6 - 1.
    let mul = LEVEL_SCALE[0][(qp % 6) as usize];
    let shift = qp / 6 - 1;
    for el in blk.iter_mut() {
        *el = el.wrapping_mul(mul) << shift;
// One-dimensional inverse transform kernels shared by the IDCT routines below.
// All arms operate on their arguments in place.
macro_rules! transform {
    // 4-point Hadamard butterfly for the luma DC block (no rounding or shift).
    (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = $b.wrapping_add($d);
        let t3 = $b.wrapping_sub($d);
        $a = t0.wrapping_add(t2);
        $b = t1.wrapping_add(t3);
        $c = t1.wrapping_sub(t3);
        $d = t0.wrapping_sub(t2);
    // 4-point inverse transform; results are rounded (`bias`) and scaled
    // down by $shift (0 for the row pass, 6 for the final column pass).
    ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = ($b >> 1).wrapping_sub($d);
        let t3 = $b.wrapping_add($d >> 1);
        let bias = 1 << $shift >> 1;
        $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
        $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
        $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
        $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
    // 8-point inverse transform: odd-part intermediates e1..e7, then the
    // f-stage combines (even-part lines are elided from this view).
    ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
        let e1 = -$d + $f - $h - ($h >> 1);
        let e3 = $b + $h - $d - ($d >> 1);
        let e4 = ($c >> 1) - $g;
        let e5 = -$b + $h + $f + ($f >> 1);
        let e6 = $c + ($g >> 1);
        let e7 = $d + $f + $b + ($b >> 1);
        let f1 = e1 + (e7 >> 2);
        let f3 = e3 + (e5 >> 2);
        let f5 = (e3 >> 2) - e5;
        let f7 = e7 - (e1 >> 2);
/// Dequantises and inverse-transforms the 4x4 luma DC coefficient block in place.
pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
    // Branch scaling down with rounding; `shift = 2 - qp/6` implies this path
    // is taken for small qp (guard not visible in this view).
    let mul = LEVEL_SCALE[0][(qp % 6) as usize];
    let shift = 2 - qp / 6;
    let bias = 1 << shift >> 1;
    for el in blk.iter_mut() {
        *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
    // Branch scaling up for larger qp.
    let mul = LEVEL_SCALE[0][(qp % 6) as usize];
    let shift = qp / 6 - 2;
    for el in blk.iter_mut() {
        *el = el.wrapping_mul(mul) << shift;
    // Hadamard pass over columns, then over rows.
    transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
    for row in blk.chunks_mut(4) {
        transform!(luma_dc; row[0], row[1], row[2], row[3]);
/// Dequantises and inverse-transforms a 4x4 coefficient block in place.
/// When `quant_dc` is false the DC coefficient (element 0) is assumed to have
/// been dequantised elsewhere and is skipped in the scaling loop.
pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
    // Maps each block position to its LEVEL_SCALE row (coefficient class).
    const BLK_INDEX: [usize; 16] = [
    let qidx = (qp % 6) as usize;
    let start = if quant_dc { 0 } else { 1 };
    for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(start) {
        *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
    // Row pass with no scaling, then column pass normalised by >> 6.
    for row in blk.chunks_mut(4) {
        transform!(row[0], row[1], row[2], row[3], 0);
    transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
/// Fast path for a 4x4 block that has only a DC coefficient: dequantise it
/// (unless already done by the caller) and flood-fill the rounded value.
pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
    let dc = if quant_dc {
        (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
    // 0x20 is the rounding bias for the >> 6 normalisation.
    *blk = [(dc + 0x20) >> 6; 16];
// 8x8 dequantisation matrices, one per qp % 6. Only a 4x4 pattern is stored;
// idct8x8 below expands it over the 8x8 block via (x & 3, y & 3) indexing.
const QMAT_8X8: [[u8; 16]; 6] = [
/// Applies a custom 8x8 scaling list to a coefficient block; `slist` is read
/// through the zigzag scan so it matches the coefficients' stored order.
pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
    for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
        *el = el.wrapping_mul(i16::from(slist[scan]));
/// Dequantises and inverse-transforms an 8x8 block (8x8 transform mode),
/// working in i32 to avoid overflow, then writing back saturated-shifted i16.
pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
    let mut tmp = [0i32; 64];
    let qmat = &QMAT_8X8[(qp % 6) as usize];
    // Branch scaling up (qp/6 >= 6 on this path, judging by the subtraction;
    // the guard is not visible in this view).
    let shift = qp / 6 - 6;
    for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
        // Expand the stored 4x4 quant pattern across the 8x8 block.
        let idx = (x & 3) + (y & 3) * 4;
        *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
    // Branch scaling down with rounding.
    let shift = 6 - qp / 6;
    let bias = (1 << shift) >> 1;
    for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
        let idx = (x & 3) + (y & 3) * 4;
        *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
    // 8-point transform: row pass, then column pass.
    for row in tmp.chunks_mut(8) {
        transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
    transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
               tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
    // Final normalisation with rounding.
    for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
        *dst = ((src + 0x20) >> 6) as i16;
/// Adds a 4x4 residual block to the picture at `offset`, saturating each
/// sample to the 0..=255 range.
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
    // Slice the exact output window up front (3 full strides + 4 pixels)
    // so later indexing cannot run past the destination.
    let out = &mut dst[offset..][..stride * 3 + 4];
    for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks(4)) {
        for (dst, src) in line.iter_mut().take(4).zip(src.iter()) {
            *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
/// Adds an 8x8 residual block to the picture at `offset`, saturating each
/// sample to the 0..=255 range.
pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
    let out = &mut dst[offset..];
    for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks(8)) {
        for (dst, src) in line.iter_mut().take(8).zip(src.iter()) {
            *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
/// Saturates a 16-bit intermediate value to the 8-bit pixel range 0..=255.
fn clip8(val: i16) -> u8 {
    if val < 0 {
        0
    } else if val > 255 {
        255
    } else {
        val as u8
    }
}
// Fills a bsize x bsize block with the flat value 128 (used when no
// neighbouring pixels are available).
fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
// Vertical prediction: copy the row of top neighbours into every output row.
fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].copy_from_slice(&top[..bsize]);
// Horizontal prediction: replicate each left neighbour across its row.
// left[0] is the top-left corner pixel, hence the [1..] skip.
fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
    for (row, &left) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
        for el in row[..bsize].iter_mut() {
// DC prediction averaging both top and left neighbours; `shift` is
// log2(2 * bsize) so the sum is rounded to the mean.
fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(top[i]); }
    for i in 0..bsize { adc += u16::from(left[i + 1]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
// DC prediction from the left neighbours only (top row unavailable).
fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(left[i + 1]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
// DC prediction from the top neighbours only (left column unavailable).
fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(top[i]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
// Widens u8 neighbour pixels into a u16 scratch array for filtering maths.
fn load(dst: &mut [u16], src: &[u8]) {
    for (dst, &src) in dst.iter_mut().zip(src.iter()) {
        *dst = u16::from(src);
// 4x4 vertical prediction (thin wrapper; `tr` = top-right neighbours, unused).
fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_ver(buf, stride, top, 4);
// 4x4 horizontal prediction.
fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_hor(buf, stride, left, 4);
// 4x4 diagonal-down-left prediction: each row is a (1,2,1)/4 filtered copy of
// the top + top-right neighbours, shifted one pixel per row.
fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut t: [u16; 9] = [0; 9];
    load(&mut t[..4], top);
    load(&mut t[4..8], tr);
        buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
    let dst = &mut buf[stride..];
        dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
    let dst = &mut buf[stride * 2..];
        dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
    let dst = &mut buf[stride * 3..];
        dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
// 4x4 diagonal-down-right prediction; t[0]/l[0] hold the top-left corner so
// the diagonal can be filtered across the corner.
fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
            // Below the main diagonal: filter along the left neighbours.
            dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
        // On the diagonal: filter across the corner pixel.
        dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
            // Above the diagonal: filter along the top neighbours.
            dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
// 4x4 vertical-right prediction; `zvr` selects among the half-pel average,
// full-pel filter, corner and left-edge cases.
fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
            let zvr = ((2 * i) as i8) - (j as i8);
                    // Even zvr: two-tap rounded average of top pixels.
                    pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
                    // Odd zvr: three-tap (1,2,1) filter of top pixels.
                    pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
                // Corner case (zvr == -1).
                pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                // Left-edge case: filter down the left neighbours.
                pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
            dst[i + j * stride] = pix as u8;
// 4x4 vertical-left prediction, fully unrolled: even rows use the two-tap
// rounded average, odd rows the three-tap (1,2,1) filter, with rows 2/3
// reusing the row-0/1 values shifted by one pixel.
fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut t: [u16; 8] = [0; 8];
    load(&mut t[..4], top);
    load(&mut t[4..], tr);
    dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
    let pix = ((t[1] + t[2] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((t[2] + t[3] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    let pix = ((t[3] + t[4] + 1) >> 1) as u8;
    dst[3 + 0 * stride] = pix;
    dst[2 + 2 * stride] = pix;
    dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
    dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
    let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
    dst[1 + 1 * stride] = pix;
    dst[0 + 3 * stride] = pix;
    let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
    dst[2 + 1 * stride] = pix;
    dst[1 + 3 * stride] = pix;
    let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[2 + 3 * stride] = pix;
    dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
// 4x4 horizontal-down prediction; mirror image of ver_right with the roles
// of top and left neighbours swapped (`zhd` picks the case).
fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
            let zhd = ((2 * j) as i8) - (i as i8);
                    // Even zhd: two-tap rounded average of left pixels.
                    pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
                    // Odd zhd: three-tap (1,2,1) filter of left pixels.
                    pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
                // Corner case (zhd == -1).
                pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                // Top-edge case: filter along the top neighbours.
                pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
            dst[i + j * stride] = pix as u8;
// 4x4 horizontal-up prediction, fully unrolled; pixels past the last left
// neighbour are flat-filled with l[3] per the standard's extrapolation.
fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut l: [u16; 8] = [0; 8];
    load(&mut l, &left[1..]);
    dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
    let pix = ((l[1] + l[2] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[0 + 1 * stride] = pix;
    let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
    dst[3 + 0 * stride] = pix;
    dst[1 + 1 * stride] = pix;
    let pix = ((l[2] + l[3] + 1) >> 1) as u8;
    dst[2 + 1 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    dst[3 + 2 * stride] = l[3] as u8;
    dst[1 + 3 * stride] = l[3] as u8;
    dst[0 + 3 * stride] = l[3] as u8;
    dst[2 + 2 * stride] = l[3] as u8;
    dst[2 + 3 * stride] = l[3] as u8;
    dst[3 + 3 * stride] = l[3] as u8;
// 4x4 DC prediction wrappers; shift 3 = log2(8 samples), shift 2 = log2(4).
fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_dc(buf, stride, top, left, 4, 3);
fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_left_dc(buf, stride, left, 4, 2);
fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_dc128(buf, stride, 4);
/// Pre-filtered neighbour context for 8x8 luma intra prediction: the 8x8 mode
/// filters all reference pixels once up front (per the spec) and the
/// individual prediction functions then read `t`, `l` and `tl` directly.
pub struct IPred8Context {
    pub fn new() -> Self {
    /// Gathers and (1,2,1)-filters the neighbour pixels. Unavailable
    /// neighbours are substituted: 0x80 defaults, replication of the last
    /// available pixel, or edge-specific corner formulas.
    pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
        let mut t = [0x80u8; 19];
        let mut l = [0x80u8; 11];
        t[1..8 + 1].copy_from_slice(&top[..8]);
        t[8 + 1..16 + 1].copy_from_slice(&top[8..][..8]);
        // No top-right: replicate the last top pixel.
        t[16 + 1] = t[15 + 1];
        t[17 + 1] = t[15 + 1];
        let (t0, t1) = t.split_at_mut(8 + 1);
        for el in t1.iter_mut() {
        l[1..9].copy_from_slice(&left[1..9]);
        // Low-pass (1,2,1)/4 filter over the gathered top and left pixels.
        self.t[i] = ((u16::from(t[i]) + 2 * u16::from(t[i + 1]) + u16::from(t[i + 2]) + 2) >> 2) as u8;
        self.l[i] = ((u16::from(l[i]) + 2 * u16::from(l[i + 1]) + u16::from(l[i + 2]) + 2) >> 2) as u8;
        // Filtered top-left corner, with one-sided formulas when only one
        // neighbouring edge is available.
        self.tl = if has_t && has_l {
                ((u16::from(t[1]) + 2 * u16::from(t[0]) + u16::from(l[1]) + 2) >> 2) as u8
                ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
                ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
// 8x8 luma vertical prediction from the pre-filtered top row.
fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for row in buf.chunks_mut(stride).take(8) {
        row[..8].copy_from_slice(&ctx.t[..8]);
// 8x8 luma horizontal prediction from the pre-filtered left column.
fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
        row[..8].copy_from_slice(&[l; 8]);
// 8x8 diagonal-down-left: (1,2,1) filter along the top + top-right pixels;
// the bottom-right corner (x == y == 7) uses a special end-of-row formula.
fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            *pix = ((if (x != 7) || (y != 7) {
                    t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
// 8x8 diagonal-down-right: filtered top pixels above the diagonal, filtered
// left pixels below it, and a corner formula on the diagonal itself.
fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);
    let diag = t[1] + 2 * t[0] + l[1];
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
                t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
                l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
// 8x8 vertical-right: same case analysis as the 4x4 variant but computed
// per-pixel from zvr = 2*x - y.
fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zvr = 2 * (x as i8) - (y as i8);
                let ix = x - (y >> 1);
                    // Even zvr: two-tap rounded average of top pixels.
                    (t[ix] + t[ix + 1] + 1) >> 1
                    // Odd zvr: three-tap filter of top pixels.
                    (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
            } else if zvr == -1 {
                // Corner pixel.
                (l[1] + 2 * l[0] + t[1] + 2) >> 2
                // Left-edge pixels.
                (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
// 8x8 vertical-left: even rows use the two-tap average, odd rows the
// three-tap filter, advancing one top pixel every two rows.
fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let ix = x + (y >> 1);
            *pix = if (y & 1) == 0 {
                    (t[ix] + t[ix + 1] + 1) >> 1
                    (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
// 8x8 horizontal-down: mirror of vertical-right with top/left roles swapped
// (zhd = 2*y - x selects the case).
fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zhd = 2 * (y as i8) - (x as i8);
                let ix = y - (x >> 1);
                    // Even zhd: two-tap rounded average of left pixels.
                    (l[ix] + l[ix + 1] + 1) >> 1
                    // Odd zhd: three-tap filter of left pixels.
                    (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
            } else if zhd == -1 {
                // Corner pixel.
                (l[1] + 2 * l[0] + t[1] + 2) >> 2
                // Top-edge pixels.
                (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
// 8x8 horizontal-up: interpolates down the left column; positions past the
// last left pixel saturate (zhu == 13 gets the end formula, beyond it l[7]).
fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut l = [0u16; 8];
    load(&mut l, &ctx.l);
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let ix = y + (x >> 1);
            } else if zhu == 13 {
                (l[6] + 3 * l[7] + 2) >> 2
            } else if (zhu & 1) != 0 {
                (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
                (l[ix] + l[ix + 1] + 1) >> 1
// 8x8 DC prediction: mean of the 16 filtered top+left pixels.
fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for &t in ctx.t[..8].iter() {
    for &l in ctx.l[..8].iter() {
    let dc = ((sum + 8) >> 4) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
// 8x8 DC prediction from the left column only.
fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for &l in ctx.l[..8].iter() {
    let dc = ((sum + 4) >> 3) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
// 8x8 DC prediction from the top row only.
fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for &t in ctx.t[..8].iter() {
    let dc = ((sum + 4) >> 3) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
// 8x8 flat-128 fill when no neighbours are available.
fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
    ipred_dc128(buf, stride, 8);
// 8x8 (chroma-style) vertical prediction.
fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_ver(buf, stride, top, 8);
// 8x8 (chroma-style) horizontal prediction.
fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_hor(buf, stride, left, 8);
// 8x8 chroma DC prediction: four independent 4x4 DC values — top-left uses
// both edges, top-right uses top only, bottom-left uses left only,
// bottom-right averages the two partial sums.
fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    load(&mut l, &left[1..]);
    let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
    let sum1 = t[4] + t[5] + t[6] + t[7];
    let dc1 = ((sum1 + 2) >> 2) as u8;
    let sum2 = l[4] + l[5] + l[6] + l[7];
    let dc2 = ((sum2 + 2) >> 2) as u8;
    let dc3 = ((sum1 + sum2 + 4) >> 3) as u8;
    for row in buf.chunks_mut(stride).take(4) {
        row[..4].copy_from_slice(&[dc0; 4]);
        row[4..8].copy_from_slice(&[dc1; 4]);
    for row in buf.chunks_mut(stride).skip(4).take(4) {
        row[..4].copy_from_slice(&[dc2; 4]);
        row[4..8].copy_from_slice(&[dc3; 4]);
// 8x8 chroma left-only DC: one DC per 4-row half, each filling all 8 columns.
fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    let mut left_dc0 = 0;
    let mut left_dc1 = 0;
    for &el in left[1..].iter().take(4) {
        left_dc0 += u16::from(el);
    for &el in left[1..].iter().skip(4).take(4) {
        left_dc1 += u16::from(el);
    let dc0 = ((left_dc0 + 2) >> 2) as u8;
    let dc2 = ((left_dc1 + 2) >> 2) as u8;
    for row in buf.chunks_mut(stride).take(4) {
        row[..8].copy_from_slice(&[dc0; 8]);
    for row in buf.chunks_mut(stride).skip(4).take(4) {
        row[..8].copy_from_slice(&[dc2; 8]);
// 8x8 chroma top-only DC, built from four 4x4 top-DC fills. The lower half
// reads its "top" from the already-predicted row above it, which yields the
// same DC values since those rows are constant copies of the upper DCs.
fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
    ipred_top_dc(&mut buf[4..], stride, &top[4..], 4, 2);
    let mut top = [0; 8];
    top.copy_from_slice(&buf[stride * 3..][..8]);
    ipred_top_dc(&mut buf[4 * stride..], stride, &top, 4, 2);
    ipred_top_dc(&mut buf[4 + 4 * stride..], stride, &top[4..], 4, 2);
// 8x8 flat-128 fill when no neighbours are available.
fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 8);
// 8x8 plane (gradient) prediction: fit horizontal/vertical slopes h and v
// from the border pixels, then fill with a clipped linear ramp.
fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
    let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
        let i1 = (i + 1) as i32;
        h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
        v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
    // Weighted slopes per the spec (17/32 scaling with rounding).
    let b = (17 * h + 16) >> 5;
    let c = (17 * v + 16) >> 5;
    // Plane origin placed so the ramp is centred on the block.
    let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
    for line in buf.chunks_mut(stride).take(8) {
        for el in line.iter_mut().take(8) {
            *el = clip8((acc >> 5) as i16);
// 16x16 vertical prediction.
fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_ver(buf, stride, top, 16);
// 16x16 horizontal prediction.
fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_hor(buf, stride, left, 16);
// 16x16 DC prediction; shift 5 = log2(32 samples).
fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    ipred_dc(buf, stride, top, left, 16, 5);
fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_left_dc(buf, stride, left, 16, 4);
fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 16, 4);
fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 16);
// 16x16 plane prediction: fit slopes from the borders, fill with a clipped
// linear ramp; the inner loop is unrolled four pixels at a time.
fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
    let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
        h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
        v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
    // 5/64 slope scaling with rounding.
    h = (5 * h + 32) >> 6;
    v = (5 * v + 32) >> 6;
    let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);
    for row in buf.chunks_mut(stride).take(16) {
        // Four pixels per chunk, advancing by the horizontal slope.
        for dst in row.chunks_exact_mut(4).take(4) {
            dst[0] = clip8(((b ) >> 5) as i16);
            dst[1] = clip8(((b + h) >> 5) as i16);
            dst[2] = clip8(((b + 2*h) >> 5) as i16);
            dst[3] = clip8(((b + 3*h) >> 5) as i16);
// Function-pointer types for the three intra prediction families.
pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);
// Indices of the fallback DC modes appended after the standard modes in the
// dispatch tables below.
pub const IPRED4_DC128: usize = 11;
pub const IPRED4_DC_TOP: usize = 10;
pub const IPRED4_DC_LEFT: usize = 9;
pub const IPRED8_DC128: usize = 6;
pub const IPRED8_DC_TOP: usize = 5;
pub const IPRED8_DC_LEFT: usize = 4;
// 4x4 mode dispatch: entries 0..=8 follow the H.264 mode numbering,
// 9..=11 are the availability fallbacks named above.
pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
    ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
    ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
    ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
    ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
// 8x8 luma mode dispatch, same layout as the 4x4 table.
pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
    ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
    ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
    ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
    ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
    ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
// Chroma 8x8 dispatch: DC/hor/ver/plane plus the availability fallbacks.
pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
    ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
    ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
// 16x16 dispatch: ver/hor/DC/plane plus the availability fallbacks.
pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
    ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
    ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
// Deblocking filter kernels. `$off` addresses the first pixel on the q side
// of the edge; `$step` is 1 for vertical edges (pixels in a row) or the
// stride for horizontal edges (pixels in a column).
macro_rules! loop_filter {
    // Strong (bS=4) luma filter for one line across the edge.
    (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        // Interior-smoothness tests on each side of the edge.
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        // p side: full 3-pixel smoothing when smooth, else weak 1-pixel fix.
        if a_p && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let p3 = i16::from($buf[$off - $step * 4]);
            $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
            $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
            $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
            $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        // q side: mirror image of the p-side handling.
        if a_q && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let q3 = i16::from($buf[$off + $step * 3]);
            $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
            $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
            $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
            $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    // Strong (bS=4) chroma filter: only p0/q0 are modified.
    (chromaedge; $buf: expr, $off: expr, $step: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    // Normal (bS<4) luma filter: clamp a delta by tc, optionally also
    // adjusting p1/q1 when the respective side is smooth.
    (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        // tc grows by one per smooth side.
        let tc = $tc0 + (a_p as i16) + (a_q as i16);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        if a_p && ($tc0 > 0) {
            $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
        if a_q && ($tc0 > 0) {
            $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
    // Normal (bS<4) chroma filter: delta-clamped update of p0/q0 only.
    (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
// Standard deblocking edge-activity test: filter only when the edge looks
// like a coding artifact (small cross-edge step, flat on both sides).
fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
    let p1 = i16::from(buf[off - step * 2]);
    let p0 = i16::from(buf[off - step]);
    let q0 = i16::from(buf[off]);
    let q1 = i16::from(buf[off + step]);
    (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
// Scalar fallback: evaluates the activity test for four consecutive lines
// along the edge (`off` advances by `stride` per line — advance elided here).
#[cfg(not(target_arch="x86_64"))]
fn check_filter4(buf: &[u8], mut off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let mut flags = [false; 4];
    for flag in flags.iter_mut() {
        let p1 = i16::from(buf[off - step * 2]);
        let p0 = i16::from(buf[off - step]);
        let q0 = i16::from(buf[off]);
        let q1 = i16::from(buf[off + step]);
        *flag = (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
// SSE version of check_filter4: loads four 4-pixel groups, transposes when
// processing a vertical edge (step == 1) so rows always hold one line each,
// then compares |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta in parallel.
// NOTE(review): asm body left untouched; several interleaving instructions
// are elided from this view, so the comments only describe visible lines.
#[cfg(target_arch="x86_64")]
fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let mut flags = [false; 4];
    let src = buf[off - step * 2..].as_ptr();
    let load_stride = step.max(stride);
    let fptr = flags.as_mut_ptr();
    let tflag = u32::from(step == 1);
            "movd xmm0, dword ptr [{src}]",
            "lea {tmp}, [{src} + {stride} * 2]",
            "movd xmm1, dword ptr [{src} + {stride}]",
            "movd xmm2, dword ptr [{tmp}]",
            "movd xmm3, dword ptr [{tmp} + {stride}]",
            "punpcklbw xmm0, xmm4",
            "punpcklbw xmm1, xmm4",
            "punpcklbw xmm2, xmm4",
            "punpcklbw xmm3, xmm4",

            // transpose block if necessary so it's always processed by rows
            "test {tflag:e}, {tflag:e}",
            "punpcklwd xmm0, xmm1",
            "movhlps xmm4, xmm0",
            "punpcklwd xmm2, xmm3",
            "movhlps xmm1, xmm2",
            "punpckldq xmm0, xmm2",
            "punpckldq xmm4, xmm1",
            "movhlps xmm1, xmm0",
            "movhlps xmm3, xmm4",
            "movaps xmm2, xmm4",

            // calculate deltas and flags
            "movd xmm4, {alpha:r}",
            "movd xmm5, {beta:r}",
            "pshuflw xmm4, xmm4, 0",
            "pshuflw xmm5, xmm5, 0",
            "pabsw xmm0, xmm0", // |p1 - p0|
            "pabsw xmm1, xmm1", // |p0 - q0|
            "pabsw xmm2, xmm3", // |q1 - q0|
            "movaps xmm3, xmm5",
            "pcmpgtw xmm4, xmm1",
            "pcmpgtw xmm5, xmm0",
            "pcmpgtw xmm3, xmm2",
            "packsswb xmm4, xmm4",
            "movd [{flags}], xmm4",
            stride = in(reg) load_stride,
            alpha = in(reg) alpha,
            beta = in(reg) beta,
            flags = in(reg) fptr,
            tflag = in(reg) tflag,
/// Strong luma deblocking over a vertical edge: gate each of the four lines
/// with the activity flags, then apply the bS=4 filter.
pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
    let flags = check_filter4(dst, off, 1, stride, alpha, beta);
    for &flag in flags.iter() {
            loop_filter!(lumaedge; dst, off, 1, alpha, beta);
/// Strong luma deblocking over a horizontal edge.
pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
    let flags = check_filter4(dst, off, stride, 1, alpha, beta);
    for (x, &flag) in flags.iter().enumerate() {
            loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
/// Normal (bS<4) luma deblocking over a vertical edge.
pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    let flags = check_filter4(dst, off, 1, stride, alpha, beta);
    for &flag in flags.iter() {
            loop_filter!(lumanormal; dst, off, 1, tc0, beta);
/// Normal (bS<4) luma deblocking over a horizontal edge.
pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    let flags = check_filter4(dst, off, stride, 1, alpha, beta);
    for (x, &flag) in flags.iter().enumerate() {
            loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
/// Strong chroma deblocking over a vertical edge (per-line activity check).
pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
        if check_filter(dst, off, 1, alpha, beta) {
            loop_filter!(chromaedge; dst, off, 1);
/// Strong chroma deblocking over a horizontal edge.
pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
        if check_filter(dst, off + x, stride, alpha, beta) {
            loop_filter!(chromaedge; dst, off + x, stride);
/// Normal chroma deblocking over a vertical edge.
pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
        if check_filter(dst, off, 1, alpha, beta) {
            loop_filter!(chromanormal; dst, off, 1, tc0);
/// Normal chroma deblocking over a horizontal edge.
pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
        if check_filter(dst, off + x, stride, alpha, beta) {
            loop_filter!(chromanormal; dst, off + x, stride, tc0);