add LinePack decoder
[nihav.git] / nihav-itu / src / codecs / h264 / dsp / mod.rs
CommitLineData
2f9923e6 1mod mc;
834e7b28 2pub use mc::{H264MC, McBlock};
932ae27b
KS
3#[cfg(target_arch="x86_64")]
4use std::arch::asm;
999fbb83 5
696e4e20
KS
/// Chroma quantiser lookup with one entry per index in `0..52`.
///
/// Entries below 30 map to themselves; larger indices are compressed so the
/// result never exceeds 39.
pub const CHROMA_QUANTS: [u8; 52] = [
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
    31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
    39, 39, 39, 39
];

/// Scan order for the four chroma DC coefficients (identity mapping).
pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
/// Zigzag scan for a 4x4 block: scan position -> raster index.
pub const ZIGZAG: [usize; 16] = [
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
];
/// `ZIGZAG` without its first entry and with every index reduced by one.
pub const ZIGZAG1: [usize; 15] = [
    0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
];
// Disabled alternative 4x4 scan, kept for reference.
/*pub const IL_SCAN: [usize; 16] = [
    0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
];*/
/// Zigzag scan for an 8x8 block: scan position -> raster index.
pub const ZIGZAG8X8: [usize; 64] = [
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
];

/// Dequantisation multipliers, indexed as `[coefficient class][qp % 6]`.
///
/// The class of each 4x4 coefficient position is chosen by the caller.
const LEVEL_SCALE: [[i16; 6]; 3] = [
    [ 10, 11, 13, 14, 16, 18 ],
    [ 16, 18, 20, 23, 25, 29 ],
    [ 13, 14, 16, 18, 20, 23 ]
];
39
40pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
41 let t0 = blk[0] + blk[2];
42 let t1 = blk[0] - blk[2];
43 let t2 = blk[1] + blk[3];
44 let t3 = blk[1] - blk[3];
45 blk[0] = t0 + t2;
46 blk[1] = t0 - t2;
47 blk[2] = t1 + t3;
48 blk[3] = t1 - t3;
49 if qp < 6 {
4a1ca15c 50 let mul = LEVEL_SCALE[0][qp as usize];
696e4e20
KS
51 for el in blk.iter_mut() {
52 *el = el.wrapping_mul(mul) >> 1;
53 }
54 } else {
4a1ca15c 55 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
56 let shift = qp / 6 - 1;
57 for el in blk.iter_mut() {
58 *el = el.wrapping_mul(mul) << shift;
59 }
60 }
61}
62
/// In-place butterfly kernels used by the inverse transforms.
///
/// * `transform!(luma_dc; a, b, c, d)` - four-point add/subtract butterfly
///   using wrapping arithmetic.
/// * `transform!(a, b, c, d, shift)` - four-point kernel with half-weighted
///   odd terms; each output is rounded and shifted right by `shift`.
/// * `transform!(a, b, c, d, e, f, g, h)` - eight-point kernel using plain
///   (non-wrapping) arithmetic.
///
/// Every argument must be a place expression: it is read, possibly several
/// times, and then overwritten with the corresponding output.
macro_rules! transform {
    (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = $b.wrapping_add($d);
        let t3 = $b.wrapping_sub($d);
        $a = t0.wrapping_add(t2);
        $b = t1.wrapping_add(t3);
        $c = t1.wrapping_sub(t3);
        $d = t0.wrapping_sub(t2);
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = ($b >> 1).wrapping_sub($d);
        let t3 = $b.wrapping_add($d >> 1);
        // Rounding term: half of the final divisor, zero when `shift` is zero.
        let bias = 1 << $shift >> 1;
        $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
        $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
        $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
        $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
        // First stage: even part (e0, e2, e4, e6) and odd part (e1, e3, e5, e7).
        let e0 = $a + $e;
        let e1 = -$d + $f - $h - ($h >> 1);
        let e2 = $a - $e;
        let e3 = $b + $h - $d - ($d >> 1);
        let e4 = ($c >> 1) - $g;
        let e5 = -$b + $h + $f + ($f >> 1);
        let e6 = $c + ($g >> 1);
        let e7 = $d + $f + $b + ($b >> 1);

        // Second stage.
        let f0 = e0 + e6;
        let f1 = e1 + (e7 >> 2);
        let f2 = e2 + e4;
        let f3 = e3 + (e5 >> 2);
        let f4 = e2 - e4;
        let f5 = (e3 >> 2) - e5;
        let f6 = e0 - e6;
        let f7 = e7 - (e1 >> 2);

        // Final butterflies written back to the arguments.
        $a = f0 + f7;
        $b = f2 + f5;
        $c = f4 + f3;
        $d = f6 + f1;
        $e = f6 - f1;
        $f = f4 - f3;
        $g = f2 - f5;
        $h = f0 - f7;
    };
}
114
115pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
116 if qp < 12 {
4a1ca15c 117 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
118 let shift = 2 - qp / 6;
119 let bias = 1 << shift >> 1;
120 for el in blk.iter_mut() {
121 *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
122 }
123 } else {
4a1ca15c 124 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
125 let shift = qp / 6 - 2;
126 for el in blk.iter_mut() {
127 *el = el.wrapping_mul(mul) << shift;
128 }
129 }
130 for i in 0..4 {
131 transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
132 }
754ab49a 133 for row in blk.chunks_exact_mut(4) {
696e4e20
KS
134 transform!(luma_dc; row[0], row[1], row[2], row[3]);
135 }
136}
137
fe64781d 138pub fn idct_skip_dc(blk: &mut [i16; 16], qp: u8) {
696e4e20
KS
139 const BLK_INDEX: [usize; 16] = [
140 0, 2, 0, 2,
141 2, 1, 2, 1,
142 0, 2, 0, 2,
143 2, 1, 2, 1
144 ];
145 let qidx = (qp % 6) as usize;
146 let shift = qp / 6;
fe64781d
KS
147 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(1) {
148 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
149 }
150 for row in blk.chunks_exact_mut(4) {
151 transform!(row[0], row[1], row[2], row[3], 0);
152 }
153 for i in 0..4 {
154 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
155 }
156}
157
158pub fn idct(blk: &mut [i16; 16], qp: u8) {
159 const BLK_INDEX: [usize; 16] = [
160 0, 2, 0, 2,
161 2, 1, 2, 1,
162 0, 2, 0, 2,
163 2, 1, 2, 1
164 ];
165 let qidx = (qp % 6) as usize;
166 let shift = qp / 6;
167 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()) {
696e4e20
KS
168 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
169 }
754ab49a 170 for row in blk.chunks_exact_mut(4) {
3ad9bf2b
KS
171 transform!(row[0], row[1], row[2], row[3], 0);
172 }
173 for i in 0..4 {
174 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
696e4e20
KS
175 }
176}
177
178pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
179 let dc = if quant_dc {
180 (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
181 } else {
182 blk[0]
183 };
184 *blk = [(dc + 0x20) >> 6; 16];
185}
186
/// Dequantisation multipliers for 8x8 blocks, indexed as `[qp % 6][pos]`.
///
/// Each entry is a 4x4 pattern in raster order; `pos` is
/// `(x & 3) + (y & 3) * 4` for the coefficient at column `x`, row `y`.
const QMAT_8X8: [[u8; 16]; 6] = [
  [
    20, 19, 25, 19,
    19, 18, 24, 18,
    25, 24, 32, 24,
    19, 18, 24, 18
  ], [
    22, 21, 28, 21,
    21, 19, 26, 19,
    28, 26, 35, 26,
    21, 19, 26, 19
  ], [
    26, 24, 33, 24,
    24, 23, 31, 23,
    33, 31, 42, 31,
    24, 23, 31, 23
  ], [
    28, 26, 35, 26,
    26, 25, 33, 25,
    35, 33, 45, 33,
    26, 25, 33, 25
  ], [
    32, 30, 40, 30,
    30, 28, 38, 28,
    40, 38, 51, 38,
    30, 28, 38, 28
  ], [
    36, 34, 46, 34,
    34, 32, 43, 32,
    46, 43, 58, 43,
    34, 32, 43, 32
  ]
];
220
221pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
222 for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
223 if *el != 0 {
224 *el = el.wrapping_mul(i16::from(slist[scan]));
225 }
226 }
227}
228
229pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
230 let mut tmp = [0i32; 64];
231 let qmat = &QMAT_8X8[(qp % 6) as usize];
232 if qp >= 36 {
233 let shift = qp / 6 - 6;
234 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
235 let x = i & 7;
236 let y = i >> 3;
237 let idx = (x & 3) + (y & 3) * 4;
238 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
239 }
240 } else {
241 let shift = 6 - qp / 6;
242 let bias = (1 << shift) >> 1;
243 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
244 let x = i & 7;
245 let y = i >> 3;
246 let idx = (x & 3) + (y & 3) * 4;
247 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
248 }
249 }
754ab49a 250 for row in tmp.chunks_exact_mut(8) {
696e4e20
KS
251 transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
252 }
253 for col in 0..8 {
254 transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
255 tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
256 }
257 for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
258 *dst = ((src + 0x20) >> 6) as i16;
259 }
260}
261
/// Adds a 4x4 block of residual coefficients to the pixels at `dst[offset..]`.
///
/// `stride` is the distance between output rows and `coeffs` holds rows of
/// four values. Sums are saturated to `0..=255`.
///
/// # Panics
///
/// Panics if `dst` is shorter than `offset + stride * 3 + 4`, or if `stride`
/// is zero.
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
    let out = &mut dst[offset..][..stride * 3 + 4];
    for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
        for (pix, &coef) in line.iter_mut().take(4).zip(src.iter()) {
            *pix = (i32::from(*pix) + i32::from(coef)).clamp(0, 255) as u8;
        }
    }
}
270
/// Adds an 8x8 block of residual coefficients to the pixels at `dst[offset..]`.
///
/// `stride` is the distance between output rows. Sums are saturated to
/// `0..=255`.
///
/// # Panics
///
/// Panics if `offset` is past the end of `dst` or if `stride` is zero.
pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
    let out = &mut dst[offset..];
    for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
        for (pix, &coef) in line.iter_mut().take(8).zip(src.iter()) {
            *pix = (i32::from(*pix) + i32::from(coef)).clamp(0, 255) as u8;
        }
    }
}
279
696e4e20
KS
/// Saturates `val` to the `0..=255` range and narrows it to a byte.
fn clip8(val: i16) -> u8 { val.clamp(0, 255) as u8 }
281
22de733b
KS
/// Fills a `bsize` x `bsize` block with the mid-grey value 128.
///
/// `stride` is the distance between rows in `buf`.
fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(128);
    }
}
22de733b
KS
/// Vertical prediction: copies the first `bsize` pixels of `top` into each of
/// the `bsize` rows of the block.
fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
    let src = &top[..bsize];
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].copy_from_slice(src);
    }
}
22de733b
KS
/// Horizontal prediction: fills row `y` of the block with `left[y + 1]`.
///
/// `left[0]` is skipped; the neighbours used start at index 1.
fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
    for (row, &pix) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
        row[..bsize].fill(pix);
    }
}
/// DC prediction from both neighbours.
///
/// Sums `top[..bsize]` and `left[1..=bsize]`, rounds, shifts right by `shift`
/// and fills the `bsize` x `bsize` block with the result. `shift` must be at
/// least 1.
fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
    let top_sum: u16 = top[..bsize].iter().map(|&p| u16::from(p)).sum();
    let left_sum: u16 = left[1..=bsize].iter().map(|&p| u16::from(p)).sum();
    let adc = top_sum + left_sum;
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
/// DC prediction from the left neighbours only.
///
/// Sums `left[1..=bsize]`, rounds, shifts right by `shift` and fills the
/// `bsize` x `bsize` block with the result. `shift` must be at least 1.
fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
    let adc: u16 = left[1..=bsize].iter().map(|&p| u16::from(p)).sum();
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
/// DC prediction from the top neighbours only.
///
/// Sums `top[..bsize]`, rounds, shifts right by `shift` and fills the
/// `bsize` x `bsize` block with the result. `shift` must be at least 1.
fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
    let adc: u16 = top[..bsize].iter().map(|&p| u16::from(p)).sum();
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
335
22de733b
KS
/// Widens bytes from `src` into `dst`, element by element.
///
/// Copying stops at the end of the shorter slice; remaining `dst` entries are
/// left untouched.
fn load(dst: &mut [u16], src: &[u8]) {
    for (out, &byte) in dst.iter_mut().zip(src.iter()) {
        *out = u16::from(byte);
    }
}
341
22de733b
KS
342fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
343 ipred_ver(buf, stride, top, 4);
696e4e20 344}
22de733b
KS
345fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
346 ipred_hor(buf, stride, left, 4);
696e4e20 347}
22de733b 348fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
696e4e20 349 let mut t: [u16; 9] = [0; 9];
22de733b
KS
350 load(&mut t[..4], top);
351 load(&mut t[4..8], tr);
696e4e20
KS
352 t[8] = t[7];
353
696e4e20 354 for i in 0..4 {
22de733b 355 buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
696e4e20 356 }
22de733b 357 let dst = &mut buf[stride..];
696e4e20
KS
358 for i in 0..4 {
359 dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
360 }
22de733b 361 let dst = &mut buf[stride * 2..];
696e4e20
KS
362 for i in 0..4 {
363 dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
364 }
22de733b 365 let dst = &mut buf[stride * 3..];
696e4e20
KS
366 for i in 0..4 {
367 dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
368 }
369}
22de733b 370fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 371 let mut t: [u16; 5] = [0; 5];
22de733b 372 t[0] = u16::from(left[0]);
42005e25 373 load(&mut t[1..], top);
696e4e20 374 let mut l: [u16; 5] = [0; 5];
22de733b
KS
375 load(&mut l, left);
376 let dst = buf;
696e4e20
KS
377
378 for j in 0..4 {
379 for i in 0..j {
380 dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
381 }
382 dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
383 for i in (j+1)..4 {
384 dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
385 }
386 }
387}
22de733b 388fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 389 let mut t: [u16; 5] = [0; 5];
22de733b 390 t[0] = u16::from(left[0]);
42005e25 391 load(&mut t[1..], top);
696e4e20 392 let mut l: [u16; 5] = [0; 5];
22de733b
KS
393 load(&mut l, left);
394 let dst = buf;
696e4e20
KS
395
396 for j in 0..4 {
397 for i in 0..4 {
398 let zvr = ((2 * i) as i8) - (j as i8);
399 let pix;
400 if zvr >= 0 {
401 if (zvr & 1) == 0 {
402 pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
403 } else {
404 pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
405 }
406 } else {
407 if zvr == -1 {
408 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
409 } else {
410 pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
411 }
412 }
413 dst[i + j * stride] = pix as u8;
414 }
415 }
416}
22de733b 417fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
696e4e20 418 let mut t: [u16; 8] = [0; 8];
42005e25 419 load(&mut t[..4], top);
22de733b
KS
420 load(&mut t[4..], tr);
421 let dst = buf;
696e4e20
KS
422
423 dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
424 let pix = ((t[1] + t[2] + 1) >> 1) as u8;
425 dst[1 + 0 * stride] = pix;
426 dst[0 + 2 * stride] = pix;
427 let pix = ((t[2] + t[3] + 1) >> 1) as u8;
428 dst[2 + 0 * stride] = pix;
429 dst[1 + 2 * stride] = pix;
430 let pix = ((t[3] + t[4] + 1) >> 1) as u8;
431 dst[3 + 0 * stride] = pix;
432 dst[2 + 2 * stride] = pix;
433 dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
434 dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
435 let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
436 dst[1 + 1 * stride] = pix;
437 dst[0 + 3 * stride] = pix;
438 let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
439 dst[2 + 1 * stride] = pix;
440 dst[1 + 3 * stride] = pix;
441 let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
442 dst[3 + 1 * stride] = pix;
443 dst[2 + 3 * stride] = pix;
444 dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
445}
22de733b 446fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 447 let mut t: [u16; 5] = [0; 5];
22de733b 448 t[0] = u16::from(left[0]);
42005e25 449 load(&mut t[1..], top);
696e4e20 450 let mut l: [u16; 5] = [0; 5];
22de733b
KS
451 load(&mut l, left);
452 let dst = buf;
696e4e20
KS
453
454 for j in 0..4 {
455 for i in 0..4 {
456 let zhd = ((2 * j) as i8) - (i as i8);
457 let pix;
458 if zhd >= 0 {
459 if (zhd & 1) == 0 {
460 pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
461 } else {
462 pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
463 }
464 } else {
465 if zhd == -1 {
466 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
467 } else {
468 pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
469 }
470 }
471 dst[i + j * stride] = pix as u8;
472 }
473 }
474}
22de733b 475fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 476 let mut l: [u16; 8] = [0; 8];
22de733b
KS
477 load(&mut l, &left[1..]);
478 let dst = buf;
696e4e20
KS
479
480 dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
481 dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
482 let pix = ((l[1] + l[2] + 1) >> 1) as u8;
483 dst[2 + 0 * stride] = pix;
484 dst[0 + 1 * stride] = pix;
485 let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
486 dst[3 + 0 * stride] = pix;
487 dst[1 + 1 * stride] = pix;
488 let pix = ((l[2] + l[3] + 1) >> 1) as u8;
489 dst[2 + 1 * stride] = pix;
490 dst[0 + 2 * stride] = pix;
491 let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
492 dst[3 + 1 * stride] = pix;
493 dst[1 + 2 * stride] = pix;
494 dst[3 + 2 * stride] = l[3] as u8;
495 dst[1 + 3 * stride] = l[3] as u8;
496 dst[0 + 3 * stride] = l[3] as u8;
497 dst[2 + 2 * stride] = l[3] as u8;
498 dst[2 + 3 * stride] = l[3] as u8;
499 dst[3 + 3 * stride] = l[3] as u8;
500}
22de733b
KS
501fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
502 ipred_dc(buf, stride, top, left, 4, 3);
696e4e20 503}
22de733b
KS
504fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
505 ipred_left_dc(buf, stride, left, 4, 2);
696e4e20 506}
22de733b
KS
507fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
508 ipred_top_dc(buf, stride, top, 4, 2);
696e4e20 509}
22de733b
KS
510fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
511 ipred_dc128(buf, stride, 4);
696e4e20
KS
512}
513
/// Low-pass filtered neighbour pixels used by the 8x8 luma intra predictors.
pub struct IPred8Context {
    /// Filtered top row (eight pixels above plus eight above-right).
    pub t:  [u8; 16],
    /// Filtered left column.
    pub l:  [u8; 8],
    /// Filtered top-left corner pixel.
    pub tl: u8,
}

impl IPred8Context {
    /// Creates a context with every pixel set to the mid-grey value 128.
    pub fn new() -> Self {
        Self {
            t:  [128; 16],
            l:  [128; 8],
            tl: 128,
        }
    }
    /// Gathers the neighbours of an 8x8 block and stores their filtered versions.
    ///
    /// * `top` - pixels above the block; `top[..8]` is read when `has_t` is set
    ///   and `top[8..16]` when `has_tr` is set.
    /// * `left` - `left[0]` is the top-left corner (read when `has_tl` is set)
    ///   and `left[1..9]` the left column (read when `has_l` is set).
    ///
    /// Unavailable neighbours default to 128. A missing top-right part repeats
    /// the last top pixel, and a missing corner repeats the first top / left
    /// pixel.
    pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
        // Both scratch rows keep the corner at index 0 and two padding entries
        // at the end so the three-tap filter below never reads out of bounds.
        let mut t = [0x80u8; 19];
        let mut l = [0x80u8; 11];
        if has_t {
            t[1..9].copy_from_slice(&top[..8]);
        }
        if has_tr {
            t[9..17].copy_from_slice(&top[8..16]);
            t[17] = t[16];
            t[18] = t[16];
        } else {
            let last = t[8];
            t[9..].fill(last);
        }
        if has_l {
            l[1..9].copy_from_slice(&left[1..9]);
            l[9] = l[8];
            l[10] = l[8];
        }
        if has_tl {
            t[0] = left[0];
            l[0] = left[0];
        } else {
            t[0] = t[1];
            l[0] = l[1];
        }

        for i in 0..16 {
            self.t[i] = ((u16::from(t[i]) + 2 * u16::from(t[i + 1]) + u16::from(t[i + 2]) + 2) >> 2) as u8;
        }
        for i in 0..8 {
            self.l[i] = ((u16::from(l[i]) + 2 * u16::from(l[i + 1]) + u16::from(l[i + 2]) + 2) >> 2) as u8;
        }
        self.tl = if has_t && has_l {
            ((u16::from(t[1]) + 2 * u16::from(t[0]) + u16::from(l[1]) + 2) >> 2) as u8
        } else if has_t {
            ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
        } else if has_l {
            ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
        } else {
            t[0]
        };
    }
}
574
575fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
576 for row in buf.chunks_mut(stride).take(8) {
577 row[..8].copy_from_slice(&ctx.t[..8]);
578 }
579}
580fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
581 for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
582 row[..8].copy_from_slice(&[l; 8]);
583 }
584}
585fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
586 let mut t = [0u16; 16];
22de733b 587 load(&mut t, &ctx.t);
696e4e20
KS
588
589 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
590 for (x, pix) in row.iter_mut().take(8).enumerate() {
591 *pix = ((if (x != 7) || (y != 7) {
592 t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
593 } else {
594 t[14] + 3 * t[15]
595 } + 2) >> 2) as u8;
596 }
597 }
598}
599fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
600 let mut t = [0u16; 9];
601 t[0] = u16::from(ctx.tl);
22de733b 602 load(&mut t[1..], &ctx.t);
696e4e20
KS
603 let mut l = [0u16; 9];
604 l[0] = u16::from(ctx.tl);
22de733b 605 load(&mut l[1..], &ctx.l);
696e4e20
KS
606 let diag = t[1] + 2 * t[0] + l[1];
607
608 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
609 for (x, pix) in row.iter_mut().take(8).enumerate() {
610 *pix = ((if x > y {
611 t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
612 } else if x < y {
613 l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
614 } else {
615 diag
616 } + 2) >> 2) as u8;
617 }
618 }
619}
620fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
621 let mut t = [0u16; 9];
622 t[0] = u16::from(ctx.tl);
22de733b 623 load(&mut t[1..], &ctx.t);
696e4e20
KS
624 let mut l = [0u16; 9];
625 l[0] = u16::from(ctx.tl);
22de733b 626 load(&mut l[1..], &ctx.l);
696e4e20
KS
627
628 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
629 for (x, pix) in row.iter_mut().take(8).enumerate() {
630 let zvr = 2 * (x as i8) - (y as i8);
631 *pix = if zvr >= 0 {
632 let ix = x - (y >> 1);
633 if (zvr & 1) == 0 {
634 (t[ix] + t[ix + 1] + 1) >> 1
635 } else {
636 (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
637 }
638 } else if zvr == -1 {
2b6a8fdc 639 (l[1] + 2 * l[0] + t[1] + 2) >> 2
696e4e20
KS
640 } else {
641 let ix = y - 2 * x;
642 (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
643 } as u8;
644 }
645 }
646}
647fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
648 let mut t = [0u16; 16];
22de733b 649 load(&mut t, &ctx.t);
696e4e20
KS
650
651 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
652 for (x, pix) in row.iter_mut().take(8).enumerate() {
653 let ix = x + (y >> 1);
654 *pix = if (y & 1) == 0 {
655 (t[ix] + t[ix + 1] + 1) >> 1
656 } else {
657 (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
658 } as u8;
659 }
660 }
661
662}
663fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
664 let mut t = [0u16; 9];
665 t[0] = u16::from(ctx.tl);
22de733b 666 load(&mut t[1..], &ctx.t);
696e4e20
KS
667 let mut l = [0u16; 9];
668 l[0] = u16::from(ctx.tl);
22de733b 669 load(&mut l[1..], &ctx.l);
696e4e20
KS
670
671 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
672 for (x, pix) in row.iter_mut().take(8).enumerate() {
673 let zhd = 2 * (y as i8) - (x as i8);
674 *pix = if zhd >= 0 {
675 let ix = y - (x >> 1);
676 if (zhd & 1) == 0 {
677 (l[ix] + l[ix + 1] + 1) >> 1
678 } else {
679 (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
680 }
681 } else if zhd == -1 {
2b6a8fdc 682 (l[1] + 2 * l[0] + t[1] + 2) >> 2
696e4e20
KS
683 } else {
684 let ix = x - 2 * y;
685 (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
686 } as u8;
687 }
688 }
689}
690fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
691 let mut l = [0u16; 8];
22de733b 692 load(&mut l, &ctx.l);
696e4e20
KS
693
694 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
695 for (x, pix) in row.iter_mut().take(8).enumerate() {
696 let zhu = x + 2 * y;
697 let ix = y + (x >> 1);
698 *pix = if zhu > 13 {
699 l[7]
700 } else if zhu == 13 {
701 (l[6] + 3 * l[7] + 2) >> 2
702 } else if (zhu & 1) != 0 {
703 (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
704 } else {
705 (l[ix] + l[ix + 1] + 1) >> 1
706 } as u8;
707 }
708 }
709}
710fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
711 let mut sum = 0u16;
712 for &t in ctx.t[..8].iter() {
713 sum += u16::from(t);
714 }
715 for &l in ctx.l[..8].iter() {
716 sum += u16::from(l);
717 }
718 let dc = ((sum + 8) >> 4) as u8;
719 for row in buf.chunks_mut(stride).take(8) {
720 for pix in row.iter_mut().take(8) {
721 *pix = dc;
722 }
723 }
724}
725fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
726 let mut sum = 0u16;
727 for &l in ctx.l[..8].iter() {
728 sum += u16::from(l);
729 }
730 let dc = ((sum + 4) >> 3) as u8;
731 for row in buf.chunks_mut(stride).take(8) {
732 for pix in row.iter_mut().take(8) {
733 *pix = dc;
734 }
735 }
736}
737fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
738 let mut sum = 0u16;
739 for &t in ctx.t[..8].iter() {
740 sum += u16::from(t);
741 }
742 let dc = ((sum + 4) >> 3) as u8;
743 for row in buf.chunks_mut(stride).take(8) {
744 for pix in row.iter_mut().take(8) {
745 *pix = dc;
746 }
747 }
748}
749fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
22de733b 750 ipred_dc128(buf, stride, 8);
696e4e20
KS
751}
752
22de733b
KS
753fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
754 ipred_ver(buf, stride, top, 8);
696e4e20 755}
22de733b
KS
756fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
757 ipred_hor(buf, stride, left, 8);
696e4e20 758}
22de733b
KS
/// 8x8 DC prediction computed separately for each 4x4 quadrant.
///
/// * top-left: average of the first four top and first four left pixels;
/// * top-right: average of the last four top pixels;
/// * bottom-left: average of the last four left pixels;
/// * bottom-right: average of the last four top and last four left pixels.
///
/// The left column starts at `left[1]`.
fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    // Sum of up to four pixels starting at `from`; missing pixels count as zero.
    let sum4 = |px: &[u8], from: usize| -> u16 {
        px.iter().skip(from).take(4).map(|&p| u16::from(p)).sum()
    };
    let left = &left[1..];
    let (top0, top1) = (sum4(top, 0), sum4(top, 4));
    let (left0, left1) = (sum4(left, 0), sum4(left, 4));

    let dc0 = ((top0 + left0 + 4) >> 3) as u8;
    let dc1 = ((top1 + 2) >> 2) as u8;
    let dc2 = ((left1 + 2) >> 2) as u8;
    let dc3 = ((top1 + left1 + 4) >> 3) as u8;

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        let (lhs, rhs) = if y < 4 { (dc0, dc1) } else { (dc2, dc3) };
        row[..4].fill(lhs);
        row[4..8].fill(rhs);
    }
}
/// 8x8 DC prediction from the left column only.
///
/// The upper four rows use the average of `left[1..5]` and the lower four rows
/// the average of `left[5..9]`.
fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    let left = &left[1..];
    let upper: u16 = left.iter().take(4).map(|&p| u16::from(p)).sum();
    let lower: u16 = left.iter().skip(4).take(4).map(|&p| u16::from(p)).sum();
    let dc0 = ((upper + 2) >> 2) as u8;
    let dc2 = ((lower + 2) >> 2) as u8;
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        row[..8].fill(if y < 4 { dc0 } else { dc2 });
    }
}
22de733b
KS
/// 8x8 DC prediction from the top row only.
///
/// The left four columns use the rounded average of `top[..4]` and the right
/// four columns the rounded average of `top[4..8]`, for all eight rows.
fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    let avg4 = |px: &[u8]| -> u8 {
        let sum: u16 = px.iter().map(|&p| u16::from(p)).sum();
        ((sum + 2) >> 2) as u8
    };
    let dc_left = avg4(&top[..4]);
    let dc_right = avg4(&top[4..8]);
    // Both halves are computed once from `top`, so nothing is read back from `buf`.
    for row in buf.chunks_mut(stride).take(8) {
        row[..4].fill(dc_left);
        row[4..8].fill(dc_right);
    }
}
22de733b
KS
807fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
808 ipred_dc128(buf, stride, 8);
696e4e20 809}
22de733b
KS
810fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
811 let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
812 let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
813 for i in 0..3 {
696e4e20 814 let i1 = (i + 1) as i32;
22de733b
KS
815 h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
816 v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
696e4e20
KS
817 }
818 let b = (17 * h + 16) >> 5;
819 let c = (17 * v + 16) >> 5;
22de733b
KS
820 let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
821 for line in buf.chunks_mut(stride).take(8) {
696e4e20
KS
822 let mut acc = a;
823 for el in line.iter_mut().take(8) {
824 *el = clip8((acc >> 5) as i16);
825 acc += b;
826 }
827 a += c;
828 }
829}
830
22de733b
KS
831fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
832 ipred_ver(buf, stride, top, 16);
696e4e20 833}
22de733b
KS
834fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
835 ipred_hor(buf, stride, left, 16);
696e4e20 836}
22de733b
KS
837fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
838 ipred_dc(buf, stride, top, left, 16, 5);
696e4e20 839}
22de733b
KS
840fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
841 ipred_left_dc(buf, stride, left, 16, 4);
696e4e20 842}
22de733b
KS
843fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
844 ipred_top_dc(buf, stride, top, 16, 4);
696e4e20 845}
22de733b
KS
846fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
847 ipred_dc128(buf, stride, 16);
696e4e20 848}
22de733b
KS
849fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
850 let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
851 let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
852 for k in 0..7 {
853 h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
854 v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
696e4e20 855 }
22de733b 856
696e4e20
KS
857 h = (5 * h + 32) >> 6;
858 v = (5 * v + 32) >> 6;
859
22de733b 860 let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);
696e4e20 861
22de733b 862 for row in buf.chunks_mut(stride).take(16) {
696e4e20
KS
863 let mut b = a;
864 a += v;
865
866 for dst in row.chunks_exact_mut(4).take(4) {
867 dst[0] = clip8(((b ) >> 5) as i16);
868 dst[1] = clip8(((b + h) >> 5) as i16);
869 dst[2] = clip8(((b + 2*h) >> 5) as i16);
870 dst[3] = clip8(((b + 3*h) >> 5) as i16);
871 b += h * 4;
872 }
873 }
874}
875
22de733b
KS
876pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
877pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
696e4e20
KS
878pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);
879
/// Index of the "no neighbours" DC predictor in the twelve-entry tables.
pub const IPRED4_DC128: usize = 11;
/// Index of the top-only DC predictor in the twelve-entry tables.
pub const IPRED4_DC_TOP: usize = 10;
/// Index of the left-only DC predictor in the twelve-entry tables.
pub const IPRED4_DC_LEFT: usize = 9;
/// Index of the "no neighbours" DC predictor in the seven-entry tables.
pub const IPRED8_DC128: usize = 6;
/// Index of the top-only DC predictor in the seven-entry tables.
pub const IPRED8_DC_TOP: usize = 5;
/// Index of the left-only DC predictor in the seven-entry tables.
pub const IPRED8_DC_LEFT: usize = 4;
886
887pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
888 ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
889 ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
890 ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
891 ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
892];
893
894pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
895 ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
896 ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
897 ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
898 ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
899 ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
900];
901
902pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
903 ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
904 ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
905];
906
907pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
908 ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
909 ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
910];
911
696e4e20
KS
/// Deblocking kernels operating on one line of pixels across an edge.
///
/// `$buf[$off]` is the first pixel on the far side of the edge (`q0`) and
/// `$step` is the distance between successive pixels across it, so `p0` is at
/// `$off - $step`, `p1` at `$off - $step * 2`, and so on.
///
/// * `lumaedge` - strong luma filter; may rewrite three pixels on each side.
/// * `chromaedge` - strong chroma filter; rewrites `p0` and `q0`.
/// * `lumanormal` - normal luma filter limited by `$tc0`; rewrites `p0`/`q0`
///   and, where the side is smooth and `$tc0 > 0`, also `p1`/`q1`.
/// * `chromanormal` - normal chroma filter limited by `$tc0 + 1`.
///
/// `$alpha`, `$beta` and `$tc0` are compared against `i16` values. Each arm
/// expands to statements, so it must be invoked in statement position.
macro_rules! loop_filter {
    (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        // Smoothness of each side decides between the long and the short filter.
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        if a_p && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let p3 = i16::from($buf[$off - $step * 4]);
            $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
            $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
            $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
        } else {
            $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        }
        if a_q && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let q3 = i16::from($buf[$off + $step * 3]);
            $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
            $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
            $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
        } else {
            $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
        }
    };
    (chromaedge; $buf: expr, $off: expr, $step: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    };
    (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        // Each smooth side widens the clipping range by one.
        let tc = $tc0 + (a_p as i16) + (a_q as i16);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        if a_p && ($tc0 > 0) {
            $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
        }
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
        if a_q && ($tc0 > 0) {
            $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
        }
    };
    (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let tc = $tc0 + 1;
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
    };
}
978
/// Decides whether the line of samples crossing the edge at `off` is filtered.
///
/// Reads the four samples `p1`, `p0`, `q0`, `q1` found at `off - 2 * step`,
/// `off - step`, `off` and `off + step`, and returns `true` only when
/// `|p0 - q0| < alpha`, `|p1 - p0| < beta` and `|q1 - q0| < beta` all hold.
///
/// Panics if any of the four positions is outside `buf`.
fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
    let sample = |idx: usize| i16::from(buf[idx]);
    let (p1, p0) = (sample(off - step * 2), sample(off - step));
    let (q0, q1) = (sample(off), sample(off + step));

    let edge_diff = (p0 - q0).abs();
    let p_diff = (p1 - p0).abs();
    let q_diff = (q1 - q0).abs();
    edge_diff < alpha && p_diff < beta && q_diff < beta
}
986
932ae27b
KS
/// Portable variant: evaluates the filtering decision for four consecutive
/// lines of samples at once.
///
/// Line `i` (0..4) is anchored at `off + i * stride`; within a line the
/// samples `p1`, `p0`, `q0`, `q1` are `step` apart, with `q0` at the anchor.
/// A flag is `true` when `|p0 - q0| < alpha`, `|p1 - p0| < beta` and
/// `|q1 - q0| < beta` all hold for that line.
///
/// Panics if any accessed position is outside `buf`.
#[cfg(not(target_arch="x86_64"))]
fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let decide = |pos: usize| {
        let p1 = i16::from(buf[pos - step * 2]);
        let p0 = i16::from(buf[pos - step]);
        let q0 = i16::from(buf[pos]);
        let q1 = i16::from(buf[pos + step]);
        (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
    };
    [
        decide(off),
        decide(off + stride),
        decide(off + stride * 2),
        decide(off + stride * 3),
    ]
}
1000
1001#[cfg(target_arch="x86_64")]
1002fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
1003 unsafe {
1004 let mut flags = [false; 4];
1005 let src = buf[off - step * 2..].as_ptr();
1006 let load_stride = step.max(stride);
1007 let fptr = flags.as_mut_ptr();
1008 let tflag = u32::from(step == 1);
1009 asm! {
1010 // load block
1011 "pxor xmm4, xmm4",
1012 "movd xmm0, dword ptr [{src}]",
1013 "lea {tmp}, [{src} + {stride} * 2]",
1014 "movd xmm1, dword ptr [{src} + {stride}]",
1015 "movd xmm2, dword ptr [{tmp}]",
1016 "movd xmm3, dword ptr [{tmp} + {stride}]",
1017 "punpcklbw xmm0, xmm4",
1018 "punpcklbw xmm1, xmm4",
1019 "punpcklbw xmm2, xmm4",
1020 "punpcklbw xmm3, xmm4",
1021
1022 // transpose block if necessary so it's always processed by rows
1023 "test {tflag:e}, {tflag:e}",
1024 "jz 1f",
1025 "punpcklwd xmm0, xmm1",
1026 "movhlps xmm4, xmm0",
1027 "punpcklwd xmm2, xmm3",
1028 "movhlps xmm1, xmm2",
1029 "punpckldq xmm0, xmm2",
1030 "punpckldq xmm4, xmm1",
1031 "movhlps xmm1, xmm0",
1032 "movhlps xmm3, xmm4",
1033 "movaps xmm2, xmm4",
1034 "1:",
1035
1036 // calculate deltas and flags
1037 "movd xmm4, {alpha:r}",
1038 "movd xmm5, {beta:r}",
1039 "psubw xmm0, xmm1",
1040 "psubw xmm1, xmm2",
1041 "psubw xmm3, xmm2",
1042 "pshuflw xmm4, xmm4, 0",
1043 "pshuflw xmm5, xmm5, 0",
1044 "pabsw xmm0, xmm0", // |p1 - p0|
1045 "pabsw xmm1, xmm1", // |p0 - q0|
1046 "pabsw xmm2, xmm3", // |q1 - q0|
1047 "movaps xmm3, xmm5",
1048 "pcmpgtw xmm4, xmm1",
1049 "pcmpgtw xmm5, xmm0",
1050 "pcmpgtw xmm3, xmm2",
1051 "pand xmm4, xmm5",
1052 "pand xmm4, xmm3",
1053 "packsswb xmm4, xmm4",
1054 "movd [{flags}], xmm4",
1055 tmp = out(reg) _,
1056 src = in(reg) src,
1057 stride = in(reg) load_stride,
1058 alpha = in(reg) alpha,
1059 beta = in(reg) beta,
1060 flags = in(reg) fptr,
1061 tflag = in(reg) tflag,
1062 out("xmm0") _,
1063 out("xmm1") _,
1064 out("xmm2") _,
1065 out("xmm3") _,
1066 out("xmm4") _,
1067 out("xmm5") _,
1068 }
1069 flags
1070 }
1071}
1072
696e4e20 1073pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
932ae27b
KS
1074 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1075 for &flag in flags.iter() {
1076 if flag {
696e4e20
KS
1077 loop_filter!(lumaedge; dst, off, 1, alpha, beta);
1078 }
1079 off += stride;
1080 }
1081}
1082pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
932ae27b
KS
1083 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1084 for (x, &flag) in flags.iter().enumerate() {
1085 if flag {
696e4e20
KS
1086 loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
1087 }
1088 }
1089}
1090pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
932ae27b
KS
1091 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1092 for &flag in flags.iter() {
1093 if flag {
696e4e20
KS
1094 loop_filter!(lumanormal; dst, off, 1, tc0, beta);
1095 }
1096 off += stride;
1097 }
1098}
1099pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
932ae27b
KS
1100 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1101 for (x, &flag) in flags.iter().enumerate() {
1102 if flag {
696e4e20
KS
1103 loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
1104 }
1105 }
1106}
1107pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
22de733b 1108 for _ in 0..2 {
696e4e20
KS
1109 if check_filter(dst, off, 1, alpha, beta) {
1110 loop_filter!(chromaedge; dst, off, 1);
1111 }
1112 off += stride;
1113 }
1114}
1115pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
22de733b 1116 for x in 0..2 {
696e4e20
KS
1117 if check_filter(dst, off + x, stride, alpha, beta) {
1118 loop_filter!(chromaedge; dst, off + x, stride);
1119 }
1120 }
1121}
1122pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
22de733b 1123 for _ in 0..2 {
696e4e20
KS
1124 if check_filter(dst, off, 1, alpha, beta) {
1125 loop_filter!(chromanormal; dst, off, 1, tc0);
1126 }
1127 off += stride;
1128 }
1129}
1130pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
22de733b 1131 for x in 0..2 {
696e4e20
KS
1132 if check_filter(dst, off + x, stride, alpha, beta) {
1133 loop_filter!(chromanormal; dst, off + x, stride, tc0);
1134 }
1135 }
1136}