2 pub use mc::{H264MC, McBlock};
3 #[cfg(target_arch="x86_64")]
6 pub const CHROMA_QUANTS: [u8; 52] = [
7 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
9 31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
13 pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
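// Illustrative sanity check, not part of the original source: the zigzag scan tables
// in this module are permutations of the block positions, so dequantizing in scan
// order touches every coefficient exactly once.
#[cfg(test)]
#[test]
fn scan_tables_are_permutations() {
    let mut seen = [false; 16];
    for &idx in ZIGZAG.iter() {
        assert!(!seen[idx]);
        seen[idx] = true;
    }
    assert!(seen.iter().all(|&f| f));
    let mut seen = [false; 64];
    for &idx in ZIGZAG8X8.iter() {
        assert!(!seen[idx]);
        seen[idx] = true;
    }
    assert!(seen.iter().all(|&f| f));
}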
14 pub const ZIGZAG: [usize; 16] = [
15 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
17 pub const ZIGZAG1: [usize; 15] = [
18 0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
20 /*pub const IL_SCAN: [usize; 16] = [
21 0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
23 pub const ZIGZAG8X8: [usize; 64] = [
24 0, 1, 8, 16, 9, 2, 3, 10,
25 17, 24, 32, 25, 18, 11, 4, 5,
26 12, 19, 26, 33, 40, 48, 41, 34,
27 27, 20, 13, 6, 7, 14, 21, 28,
28 35, 42, 49, 56, 57, 50, 43, 36,
29 29, 22, 15, 23, 30, 37, 44, 51,
30 58, 59, 52, 45, 38, 31, 39, 46,
31 53, 60, 61, 54, 47, 55, 62, 63
34 const LEVEL_SCALE: [[i16; 6]; 3] = [
35 [ 10, 11, 13, 14, 16, 18 ],
36 [ 16, 18, 20, 23, 25, 29 ],
37 [ 13, 14, 16, 18, 20, 23 ]
40 pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
41 let t0 = blk[0] + blk[2];
42 let t1 = blk[0] - blk[2];
43 let t2 = blk[1] + blk[3];
44 let t3 = blk[1] - blk[3];
50 let mul = LEVEL_SCALE[0][qp as usize];
51 for el in blk.iter_mut() {
52 *el = el.wrapping_mul(mul) >> 1;
55 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
56 let shift = qp / 6 - 1;
57 for el in blk.iter_mut() {
58 *el = el.wrapping_mul(mul) << shift;
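// Worked example (illustrative): for qp = 30, qp % 6 = 0 and qp / 6 - 1 = 4, so each
// chroma DC coefficient is scaled by LEVEL_SCALE[0][0] = 10 and shifted left by 4,
// i.e. multiplied by 160; for qp < 6 the branch above scales by LEVEL_SCALE[0][qp]
// and halves the result instead.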
63 macro_rules! transform {
64 (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
65 let t0 = $a.wrapping_add($c);
66 let t1 = $a.wrapping_sub($c);
67 let t2 = $b.wrapping_add($d);
68 let t3 = $b.wrapping_sub($d);
69 $a = t0.wrapping_add(t2);
70 $b = t1.wrapping_add(t3);
71 $c = t1.wrapping_sub(t3);
72 $d = t0.wrapping_sub(t2);
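// Added note: the luma_dc arm is the 4-point inverse Hadamard butterfly used on the
// 16 luma DC coefficients; e.g. an input of (4, 0, 0, 0) produces (4, 4, 4, 4),
// spreading the DC value evenly.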
74 ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
75 let t0 = $a.wrapping_add($c);
76 let t1 = $a.wrapping_sub($c);
77 let t2 = ($b >> 1).wrapping_sub($d);
78 let t3 = $b.wrapping_add($d >> 1);
79 let bias = 1 << $shift >> 1;
80 $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
81 $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
82 $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
83 $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
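// Added note: `bias = 1 << $shift >> 1` is the rounding offset, so the row pass
// invoked with shift 0 adds no bias while the final column pass with shift 6 adds 32
// before the downshift.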
85 ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
87 let e1 = -$d + $f - $h - ($h >> 1);
89 let e3 = $b + $h - $d - ($d >> 1);
90 let e4 = ($c >> 1) - $g;
91 let e5 = -$b + $h + $f + ($f >> 1);
92 let e6 = $c + ($g >> 1);
93 let e7 = $d + $f + $b + ($b >> 1);
96 let f1 = e1 + (e7 >> 2);
98 let f3 = e3 + (e5 >> 2);
100 let f5 = (e3 >> 2) - e5;
102 let f7 = e7 - (e1 >> 2);
115 pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
117 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
118 let shift = 2 - qp / 6;
119 let bias = 1 << shift >> 1;
120 for el in blk.iter_mut() {
121 *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
124 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
125 let shift = qp / 6 - 2;
126 for el in blk.iter_mut() {
127 *el = el.wrapping_mul(mul) << shift;
131 transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
133 for row in blk.chunks_exact_mut(4) {
134 transform!(luma_dc; row[0], row[1], row[2], row[3]);
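// Worked example (illustrative): for qp = 28 the dequantization step multiplies each
// luma DC value by LEVEL_SCALE[0][28 % 6] << (28 / 6 - 2) = 16 << 2 = 64; the two
// passes above then run the 4x4 inverse Hadamard over columns and rows.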
138 pub fn idct_skip_dc(blk: &mut [i16; 16], qp: u8) {
139 const BLK_INDEX: [usize; 16] = [
145 let qidx = (qp % 6) as usize;
147 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(1) {
148 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
150 for row in blk.chunks_exact_mut(4) {
151 transform!(row[0], row[1], row[2], row[3], 0);
154 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
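// Added note: the `.skip(1)` in the dequantization loop above leaves element 0 alone,
// since the DC of such a block is typically reconstructed separately through
// idct_luma_dc(); plain idct() below scales all 16 coefficients.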
158 pub fn idct(blk: &mut [i16; 16], qp: u8) {
159 const BLK_INDEX: [usize; 16] = [
165 let qidx = (qp % 6) as usize;
167 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()) {
168 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
170 for row in blk.chunks_exact_mut(4) {
171 transform!(row[0], row[1], row[2], row[3], 0);
174 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
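// Worked example (illustrative): a block holding only a dequantized DC of 64 is spread
// by the row pass into (64, 64, 64, 64) on the first row, and the column pass with
// shift 6 and bias 32 then yields a flat residual of (64 + 32) >> 6 = 1 in all 16
// positions.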
178 pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
179 let dc = if quant_dc {
180 (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
184 *blk = [(dc + 0x20) >> 6; 16];
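// Worked example (illustrative): with quant_dc set and qp = 40, the DC is scaled by
// LEVEL_SCALE[0][40 % 6] = 16 and shifted left by 40 / 6 = 6, and the whole 4x4 block
// is filled with the rounded value (dc + 0x20) >> 6.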
187 const QMAT_8X8: [[u8; 16]; 6] = [
221 pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
222 for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
224 *el = el.wrapping_mul(i16::from(slist[scan]));
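// Added note: every coefficient is multiplied by its entry from the 8x8 scaling list,
// looked up through ZIGZAG8X8; with the flat default list of all 16s this is a uniform
// scale by 16.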
229 pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
230 let mut tmp = [0i32; 64];
231 let qmat = &QMAT_8X8[(qp % 6) as usize];
233 let shift = qp / 6 - 6;
234 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
237 let idx = (x & 3) + (y & 3) * 4;
238 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
241 let shift = 6 - qp / 6;
242 let bias = (1 << shift) >> 1;
243 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
246 let idx = (x & 3) + (y & 3) * 4;
247 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
250 for row in tmp.chunks_exact_mut(8) {
251 transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
254 transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
255 tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
257 for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
258 *dst = ((src + 0x20) >> 6) as i16;
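// Added note: the row and column passes above work on i32 intermediates with a
// combined gain of 2^6, which the final (src + 0x20) >> 6 removes with rounding before
// narrowing back to i16.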
262 pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
263 let out = &mut dst[offset..][..stride * 3 + 4];
264 for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
265 for (dst, src) in line.iter_mut().take(4).zip(src.iter()) {
266 *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
271 pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
272 let out = &mut dst[offset..];
273 for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
274 for (dst, src) in line.iter_mut().take(8).zip(src.iter()) {
275 *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
280 fn clip8(val: i16) -> u8 { val.max(0).min(255) as u8 }
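// Illustrative end-to-end sketch (not in the original source): reconstruct a DC-only
// 4x4 residual with idct_dc() and add it onto a flat prediction with add_coeffs().
#[cfg(test)]
#[test]
fn dc_only_block_reconstruction() {
    let mut blk = [0i16; 16];
    blk[0] = 20;
    // qp = 0 and quant_dc = true: dc = 20 * LEVEL_SCALE[0][0] = 200,
    // so every residual sample becomes (200 + 0x20) >> 6 = 3.
    idct_dc(&mut blk, 0, true);
    assert!(blk.iter().all(|&coef| coef == 3));
    let mut dst = [100u8; 32]; // four rows of stride 8
    add_coeffs(&mut dst, 0, 8, &blk);
    assert_eq!(dst[0], 103);
    assert_eq!(dst[8 * 3 + 3], 103);
    assert_eq!(dst[4], 100); // samples right of the 4x4 block stay untouched
}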
282 fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
283 for row in buf.chunks_mut(stride).take(bsize) {
284 for el in row[..bsize].iter_mut() {
289 fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
290 for row in buf.chunks_mut(stride).take(bsize) {
291 row[..bsize].copy_from_slice(&top[..bsize]);
294 fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
295 for (row, &left) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
296 for el in row[..bsize].iter_mut() {
301 fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
302 let mut adc: u16 = 0;
303 for i in 0..bsize { adc += u16::from(top[i]); }
304 for i in 0..bsize { adc += u16::from(left[i + 1]); }
305 let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
307 for row in buf.chunks_mut(stride).take(bsize) {
308 for el in row[..bsize].iter_mut() {
313 fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
314 let mut adc: u16 = 0;
315 for i in 0..bsize { adc += u16::from(left[i + 1]); }
316 let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
318 for row in buf.chunks_mut(stride).take(bsize) {
319 for el in row[..bsize].iter_mut() {
324 fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
325 let mut adc: u16 = 0;
326 for i in 0..bsize { adc += u16::from(top[i]); }
327 let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;
329 for row in buf.chunks_mut(stride).take(bsize) {
330 for el in row[..bsize].iter_mut() {
336 fn load(dst: &mut [u16], src: &[u8]) {
337 for (dst, &src) in dst.iter_mut().zip(src.iter()) {
338 *dst = u16::from(src);
342 fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
343 ipred_ver(buf, stride, top, 4);
345 fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
346 ipred_hor(buf, stride, left, 4);
348 fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
349 let mut t: [u16; 9] = [0; 9];
350 load(&mut t[..4], top);
351 load(&mut t[4..8], tr);
355 buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
357 let dst = &mut buf[stride..];
359 dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
361 let dst = &mut buf[stride * 2..];
363 dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
365 let dst = &mut buf[stride * 3..];
367 dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
370 fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
371 let mut t: [u16; 5] = [0; 5];
372 t[0] = u16::from(left[0]);
373 load(&mut t[1..], top);
374 let mut l: [u16; 5] = [0; 5];
380 dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
382 dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
384 dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
388 fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
389 let mut t: [u16; 5] = [0; 5];
390 t[0] = u16::from(left[0]);
391 load(&mut t[1..], top);
392 let mut l: [u16; 5] = [0; 5];
398 let zvr = ((2 * i) as i8) - (j as i8);
402 pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
404 pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
408 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
410 pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
413 dst[i + j * stride] = pix as u8;
417 fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
418 let mut t: [u16; 8] = [0; 8];
419 load(&mut t[..4], top);
420 load(&mut t[4..], tr);
423 dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
424 let pix = ((t[1] + t[2] + 1) >> 1) as u8;
425 dst[1 + 0 * stride] = pix;
426 dst[0 + 2 * stride] = pix;
427 let pix = ((t[2] + t[3] + 1) >> 1) as u8;
428 dst[2 + 0 * stride] = pix;
429 dst[1 + 2 * stride] = pix;
430 let pix = ((t[3] + t[4] + 1) >> 1) as u8;
431 dst[3 + 0 * stride] = pix;
432 dst[2 + 2 * stride] = pix;
433 dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
434 dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
435 let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
436 dst[1 + 1 * stride] = pix;
437 dst[0 + 3 * stride] = pix;
438 let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
439 dst[2 + 1 * stride] = pix;
440 dst[1 + 3 * stride] = pix;
441 let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
442 dst[3 + 1 * stride] = pix;
443 dst[2 + 3 * stride] = pix;
444 dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
446 fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
447 let mut t: [u16; 5] = [0; 5];
448 t[0] = u16::from(left[0]);
449 load(&mut t[1..], top);
450 let mut l: [u16; 5] = [0; 5];
456 let zhd = ((2 * j) as i8) - (i as i8);
460 pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
462 pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
466 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
468 pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
471 dst[i + j * stride] = pix as u8;
475 fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
476 let mut l: [u16; 8] = [0; 8];
477 load(&mut l, &left[1..]);
480 dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
481 dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
482 let pix = ((l[1] + l[2] + 1) >> 1) as u8;
483 dst[2 + 0 * stride] = pix;
484 dst[0 + 1 * stride] = pix;
485 let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
486 dst[3 + 0 * stride] = pix;
487 dst[1 + 1 * stride] = pix;
488 let pix = ((l[2] + l[3] + 1) >> 1) as u8;
489 dst[2 + 1 * stride] = pix;
490 dst[0 + 2 * stride] = pix;
491 let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
492 dst[3 + 1 * stride] = pix;
493 dst[1 + 2 * stride] = pix;
494 dst[3 + 2 * stride] = l[3] as u8;
495 dst[1 + 3 * stride] = l[3] as u8;
496 dst[0 + 3 * stride] = l[3] as u8;
497 dst[2 + 2 * stride] = l[3] as u8;
498 dst[2 + 3 * stride] = l[3] as u8;
499 dst[3 + 3 * stride] = l[3] as u8;
501 fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
502 ipred_dc(buf, stride, top, left, 4, 3);
504 fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
505 ipred_left_dc(buf, stride, left, 4, 2);
507 fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
508 ipred_top_dc(buf, stride, top, 4, 2);
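// Added note: the DC predictors average the neighbouring samples, hence shift 3
// (8 samples, top plus left) for the full 4x4 DC above versus shift 2 (4 samples)
// when only one edge is available.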
510 fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
511 ipred_dc128(buf, stride, 4);
514 pub struct IPred8Context {
521 pub fn new() -> Self {
528 pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
529 let mut t = [0x80u8; 19];
530 let mut l = [0x80u8; 11];
532 t[1..8 + 1].copy_from_slice(&top[..8]);
535 t[8 + 1..16 + 1].copy_from_slice(&top[8..][..8]);
536 t[16 + 1] = t[15 + 1];
537 t[17 + 1] = t[15 + 1];
539 let (t0, t1) = t.split_at_mut(8 + 1);
540 for el in t1.iter_mut() {
545 l[1..9].copy_from_slice(&left[1..9]);
558 self.t[i] = ((u16::from(t[i]) + 2 * u16::from(t[i + 1]) + u16::from(t[i + 2]) + 2) >> 2) as u8;
561 self.l[i] = ((u16::from(l[i]) + 2 * u16::from(l[i + 1]) + u16::from(l[i + 2]) + 2) >> 2) as u8;
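// Added note: IPred8Context keeps the [1 2 1]-filtered top and left neighbours (plus
// the filtered top-left corner) that the 8x8 luma prediction modes below read instead
// of the raw reconstructed samples.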
563 self.tl = if has_t && has_l {
564 ((u16::from(t[1]) + 2 * u16::from(t[0]) + u16::from(l[1]) + 2) >> 2) as u8
566 ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
568 ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
575 fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
576 for row in buf.chunks_mut(stride).take(8) {
577 row[..8].copy_from_slice(&ctx.t[..8]);
580 fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
581 for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
582 row[..8].copy_from_slice(&[l; 8]);
585 fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
586 let mut t = [0u16; 16];
587 load(&mut t, &ctx.t);
589 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
590 for (x, pix) in row.iter_mut().take(8).enumerate() {
591 *pix = ((if (x != 7) || (y != 7) {
592 t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
599 fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
600 let mut t = [0u16; 9];
601 t[0] = u16::from(ctx.tl);
602 load(&mut t[1..], &ctx.t);
603 let mut l = [0u16; 9];
604 l[0] = u16::from(ctx.tl);
605 load(&mut l[1..], &ctx.l);
606 let diag = t[1] + 2 * t[0] + l[1];
608 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
609 for (x, pix) in row.iter_mut().take(8).enumerate() {
611 t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
613 l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
620 fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
621 let mut t = [0u16; 9];
622 t[0] = u16::from(ctx.tl);
623 load(&mut t[1..], &ctx.t);
624 let mut l = [0u16; 9];
625 l[0] = u16::from(ctx.tl);
626 load(&mut l[1..], &ctx.l);
628 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
629 for (x, pix) in row.iter_mut().take(8).enumerate() {
630 let zvr = 2 * (x as i8) - (y as i8);
632 let ix = x - (y >> 1);
634 (t[ix] + t[ix + 1] + 1) >> 1
636 (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
638 } else if zvr == -1 {
639 (l[1] + 2 * l[0] + t[1] + 2) >> 2
642 (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
647 fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
648 let mut t = [0u16; 16];
649 load(&mut t, &ctx.t);
651 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
652 for (x, pix) in row.iter_mut().take(8).enumerate() {
653 let ix = x + (y >> 1);
654 *pix = if (y & 1) == 0 {
655 (t[ix] + t[ix + 1] + 1) >> 1
657 (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
663 fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
664 let mut t = [0u16; 9];
665 t[0] = u16::from(ctx.tl);
666 load(&mut t[1..], &ctx.t);
667 let mut l = [0u16; 9];
668 l[0] = u16::from(ctx.tl);
669 load(&mut l[1..], &ctx.l);
671 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
672 for (x, pix) in row.iter_mut().take(8).enumerate() {
673 let zhd = 2 * (y as i8) - (x as i8);
675 let ix = y - (x >> 1);
677 (l[ix] + l[ix + 1] + 1) >> 1
679 (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
681 } else if zhd == -1 {
682 (l[1] + 2 * l[0] + t[1] + 2) >> 2
685 (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
690 fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
691 let mut l = [0u16; 8];
692 load(&mut l, &ctx.l);
694 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
695 for (x, pix) in row.iter_mut().take(8).enumerate() {
697 let ix = y + (x >> 1);
700 } else if zhu == 13 {
701 (l[6] + 3 * l[7] + 2) >> 2
702 } else if (zhu & 1) != 0 {
703 (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
705 (l[ix] + l[ix + 1] + 1) >> 1
710 fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
712 for &t in ctx.t[..8].iter() {
715 for &l in ctx.l[..8].iter() {
718 let dc = ((sum + 8) >> 4) as u8;
719 for row in buf.chunks_mut(stride).take(8) {
720 for pix in row.iter_mut().take(8) {
725 fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
727 for &l in ctx.l[..8].iter() {
730 let dc = ((sum + 4) >> 3) as u8;
731 for row in buf.chunks_mut(stride).take(8) {
732 for pix in row.iter_mut().take(8) {
737 fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
739 for &t in ctx.t[..8].iter() {
742 let dc = ((sum + 4) >> 3) as u8;
743 for row in buf.chunks_mut(stride).take(8) {
744 for pix in row.iter_mut().take(8) {
749 fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
750 ipred_dc128(buf, stride, 8);
753 fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
754 ipred_ver(buf, stride, top, 8);
756 fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
757 ipred_hor(buf, stride, left, 8);
759 fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
761 load(&mut l, &left[1..]);
765 let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
766 let sum1 = t[4] + t[5] + t[6] + t[7];
767 let dc1 = ((sum1 + 2) >> 2) as u8;
768 let sum2 = l[4] + l[5] + l[6] + l[7];
769 let dc2 = ((sum2 + 2) >> 2) as u8;
770 let dc3 = ((sum1 + sum2 + 4) >> 3) as u8;
772 for row in buf.chunks_mut(stride).take(4) {
773 row[..4].copy_from_slice(&[dc0; 4]);
774 row[4..8].copy_from_slice(&[dc1; 4]);
776 for row in buf.chunks_mut(stride).skip(4).take(4) {
777 row[..4].copy_from_slice(&[dc2; 4]);
778 row[4..8].copy_from_slice(&[dc3; 4]);
781 fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
782 let mut left_dc0 = 0;
783 let mut left_dc1 = 0;
784 for &el in left[1..].iter().take(4) {
785 left_dc0 += u16::from(el);
787 for &el in left[1..].iter().skip(4).take(4) {
788 left_dc1 += u16::from(el);
790 let dc0 = ((left_dc0 + 2) >> 2) as u8;
791 let dc2 = ((left_dc1 + 2) >> 2) as u8;
792 for row in buf.chunks_mut(stride).take(4) {
793 row[..8].copy_from_slice(&[dc0; 8]);
795 for row in buf.chunks_mut(stride).skip(4).take(4) {
796 row[..8].copy_from_slice(&[dc2; 8]);
799 fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
800 ipred_top_dc(buf, stride, top, 4, 2);
801 ipred_top_dc(&mut buf[4..], stride, &top[4..], 4, 2);
802 let mut top = [0; 8];
803 top.copy_from_slice(&buf[stride * 3..][..8]);
804 ipred_top_dc(&mut buf[4 * stride..], stride, &top, 4, 2);
805 ipred_top_dc(&mut buf[4 + 4 * stride..], stride, &top[4..], 4, 2);
807 fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
808 ipred_dc128(buf, stride, 8);
810 fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
811 let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
812 let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
814 let i1 = (i + 1) as i32;
815 h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
816 v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
818 let b = (17 * h + 16) >> 5;
819 let c = (17 * v + 16) >> 5;
820 let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
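// Added note: b and c are the horizontal and vertical gradients of the plane mode;
// `a` already folds in the -3*(b + c) centring offset and the rounding term, so the
// loop below only has to step the accumulator by b per column and by c per row.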
821 for line in buf.chunks_mut(stride).take(8) {
823 for el in line.iter_mut().take(8) {
824 *el = clip8((acc >> 5) as i16);
831 fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
832 ipred_ver(buf, stride, top, 16);
834 fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
835 ipred_hor(buf, stride, left, 16);
837 fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
838 ipred_dc(buf, stride, top, left, 16, 5);
840 fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
841 ipred_left_dc(buf, stride, left, 16, 4);
843 fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
844 ipred_top_dc(buf, stride, top, 16, 4);
846 fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
847 ipred_dc128(buf, stride, 16);
849 fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
850 let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
851 let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
853 h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
854 v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
857 h = (5 * h + 32) >> 6;
858 v = (5 * v + 32) >> 6;
860 let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);
862 for row in buf.chunks_mut(stride).take(16) {
866 for dst in row.chunks_exact_mut(4).take(4) {
867 dst[0] = clip8(((b ) >> 5) as i16);
868 dst[1] = clip8(((b + h) >> 5) as i16);
869 dst[2] = clip8(((b + 2*h) >> 5) as i16);
870 dst[3] = clip8(((b + 3*h) >> 5) as i16);
876 pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
877 pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
878 pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);
880 pub const IPRED4_DC128: usize = 11;
881 pub const IPRED4_DC_TOP: usize = 10;
882 pub const IPRED4_DC_LEFT: usize = 9;
883 pub const IPRED8_DC128: usize = 6;
884 pub const IPRED8_DC_TOP: usize = 5;
885 pub const IPRED8_DC_LEFT: usize = 4;
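// Illustrative usage sketch (not part of the original source): prediction is
// dispatched through the function tables below; e.g. the DC-128 fallback for a 4x4
// block with no available neighbours fills it with the mid-level value 128.
#[cfg(test)]
#[test]
fn ipred4_dc128_fills_with_128() {
    let mut blk = [0u8; 16];
    let top = [0u8; 8];
    let left = [0u8; 9];
    let tr = [0u8; 4];
    IPRED_FUNCS4X4[IPRED4_DC128](&mut blk, 4, &top, &left, &tr);
    assert!(blk.iter().all(|&pix| pix == 128));
}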
887 pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
888 ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
889 ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
890 ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
891 ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
894 pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
895 ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
896 ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
897 ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
898 ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
899 ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
902 pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
903 ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
904 ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
907 pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
908 ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
909 ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
912 macro_rules! loop_filter {
913 (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
914 let p2 = i16::from($buf[$off - $step * 3]);
915 let p1 = i16::from($buf[$off - $step * 2]);
916 let p0 = i16::from($buf[$off - $step]);
917 let q0 = i16::from($buf[$off]);
918 let q1 = i16::from($buf[$off + $step]);
919 let q2 = i16::from($buf[$off + $step * 2]);
920 let a_p = (p2 - p0).abs() < $beta;
921 let a_q = (q2 - q0).abs() < $beta;
922 if a_p && (p0 - q0).abs() < (($alpha >> 2) + 2) {
923 let p3 = i16::from($buf[$off - $step * 4]);
924 $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
925 $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
926 $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
928 $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
930 if a_q && (p0 - q0).abs() < (($alpha >> 2) + 2) {
931 let q3 = i16::from($buf[$off + $step * 3]);
932 $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
933 $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
934 $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
936 $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
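// Added note: `lumaedge` is the strong (boundary strength 4) filter: when
// |p0 - q0| < (alpha >> 2) + 2 and a side is flat enough, three pixels on that side
// are rewritten with the longer taps, otherwise only the single boundary pixel is
// smoothed as above.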
939 (chromaedge; $buf: expr, $off: expr, $step: expr) => {
940 let p1 = i16::from($buf[$off - $step * 2]);
941 let p0 = i16::from($buf[$off - $step]);
942 let q0 = i16::from($buf[$off]);
943 let q1 = i16::from($buf[$off + $step]);
944 $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
945 $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
947 (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
948 let p2 = i16::from($buf[$off - $step * 3]);
949 let p1 = i16::from($buf[$off - $step * 2]);
950 let p0 = i16::from($buf[$off - $step]);
951 let q0 = i16::from($buf[$off]);
952 let q1 = i16::from($buf[$off + $step]);
953 let q2 = i16::from($buf[$off + $step * 2]);
954 let a_p = (p2 - p0).abs() < $beta;
955 let a_q = (q2 - q0).abs() < $beta;
956 let tc = $tc0 + (a_p as i16) + (a_q as i16);
957 let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
958 if a_p && ($tc0 > 0) {
959 $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
961 $buf[$off - $step] = clip8(p0 + delta);
962 $buf[$off] = clip8(q0 - delta);
963 if a_q && ($tc0 > 0) {
964 $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
967 (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
968 let p1 = i16::from($buf[$off - $step * 2]);
969 let p0 = i16::from($buf[$off - $step]);
970 let q0 = i16::from($buf[$off]);
971 let q1 = i16::from($buf[$off + $step]);
973 let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
974 $buf[$off - $step] = clip8(p0 + delta);
975 $buf[$off] = clip8(q0 - delta);
979 fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
980 let p1 = i16::from(buf[off - step * 2]);
981 let p0 = i16::from(buf[off - step]);
982 let q0 = i16::from(buf[off]);
983 let q1 = i16::from(buf[off + step]);
984 (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
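// Added note: this is the standard edge-activity test; filtering runs only when
// |p0 - q0| is below alpha and both |p1 - p0| and |q1 - q0| are below beta.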
987 #[cfg(not(target_arch="x86_64"))]
988 fn check_filter4(buf: &[u8], mut off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
989 let mut flags = [false; 4];
990 for flag in flags.iter_mut() {
991 let p1 = i16::from(buf[off - step * 2]);
992 let p0 = i16::from(buf[off - step]);
993 let q0 = i16::from(buf[off]);
994 let q1 = i16::from(buf[off + step]);
995 *flag = (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
1001 #[cfg(target_arch="x86_64")]
1002 fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
1004 let mut flags = [false; 4];
1005 let src = buf[off - step * 2..].as_ptr();
1006 let load_stride = step.max(stride);
1007 let fptr = flags.as_mut_ptr();
1008 let tflag = u32::from(step == 1);
1012 "movd xmm0, dword ptr [{src}]",
1013 "lea {tmp}, [{src} + {stride} * 2]",
1014 "movd xmm1, dword ptr [{src} + {stride}]",
1015 "movd xmm2, dword ptr [{tmp}]",
1016 "movd xmm3, dword ptr [{tmp} + {stride}]",
1017 "punpcklbw xmm0, xmm4",
1018 "punpcklbw xmm1, xmm4",
1019 "punpcklbw xmm2, xmm4",
1020 "punpcklbw xmm3, xmm4",
1022 // transpose block if necessary so it's always processed by rows
1023 "test {tflag:e}, {tflag:e}",
1025 "punpcklwd xmm0, xmm1",
1026 "movhlps xmm4, xmm0",
1027 "punpcklwd xmm2, xmm3",
1028 "movhlps xmm1, xmm2",
1029 "punpckldq xmm0, xmm2",
1030 "punpckldq xmm4, xmm1",
1031 "movhlps xmm1, xmm0",
1032 "movhlps xmm3, xmm4",
1033 "movaps xmm2, xmm4",
1036 // calculate deltas and flags
1037 "movd xmm4, {alpha:r}",
1038 "movd xmm5, {beta:r}",
1042 "pshuflw xmm4, xmm4, 0",
1043 "pshuflw xmm5, xmm5, 0",
1044 "pabsw xmm0, xmm0", // |p1 - p0|
1045 "pabsw xmm1, xmm1", // |p0 - q0|
1046 "pabsw xmm2, xmm3", // |q1 - q0|
1047 "movaps xmm3, xmm5",
1048 "pcmpgtw xmm4, xmm1",
1049 "pcmpgtw xmm5, xmm0",
1050 "pcmpgtw xmm3, xmm2",
1053 "packsswb xmm4, xmm4",
1054 "movd [{flags}], xmm4",
1057 stride = in(reg) load_stride,
1058 alpha = in(reg) alpha,
1059 beta = in(reg) beta,
1060 flags = in(reg) fptr,
1061 tflag = in(reg) tflag,
1073 pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
1074 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1075 for &flag in flags.iter() {
1077 loop_filter!(lumaedge; dst, off, 1, alpha, beta);
1082 pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
1083 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1084 for (x, &flag) in flags.iter().enumerate() {
1086 loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
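// Added note: the _v functions filter a vertical edge (neighbouring pixels along each
// row, step 1) and move down by `stride` between rows, while the _h functions filter a
// horizontal edge with `stride` as the step and walk along x.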
1090 pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1091 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1092 for &flag in flags.iter() {
1094 loop_filter!(lumanormal; dst, off, 1, tc0, beta);
1099 pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1100 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1101 for (x, &flag) in flags.iter().enumerate() {
1103 loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
1107 pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
1109 if check_filter(dst, off, 1, alpha, beta) {
1110 loop_filter!(chromaedge; dst, off, 1);
1115 pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
1117 if check_filter(dst, off + x, stride, alpha, beta) {
1118 loop_filter!(chromaedge; dst, off + x, stride);
1122 pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1124 if check_filter(dst, off, 1, alpha, beta) {
1125 loop_filter!(chromanormal; dst, off, 1, tc0);
1130 pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1132 if check_filter(dst, off + x, stride, alpha, beta) {
1133 loop_filter!(chromanormal; dst, off + x, stride, tc0);