git.nihav.org Git - nihav.git/blame - nihav-itu/src/codecs/h264/dsp/mod.rs
avimux: do not record palette change chunks in OpenDML index
[nihav.git] / nihav-itu / src / codecs / h264 / dsp / mod.rs
CommitLineData
e5ab37bc 1#[allow(unexpected_cfgs)]
2f9923e6 2mod mc;
834e7b28 3pub use mc::{H264MC, McBlock};
932ae27b
KS
4#[cfg(target_arch="x86_64")]
5use std::arch::asm;
999fbb83 6
696e4e20
KS
/// 52-entry chroma quantiser table: entries 0..=29 are the identity, later
/// entries grow more slowly and saturate at 39.
/// NOTE(review): presumably indexed by the (offset) luma quantiser — confirm at the call sites.
pub const CHROMA_QUANTS: [u8; 52] = [
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
    31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
    39, 39, 39, 39
];

/// Scan order for the four chroma DC coefficients (the identity permutation).
pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
/// Zigzag scan: position in scan order -> raster index inside a 4x4 block.
pub const ZIGZAG: [usize; 16] = [
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
];
/// `ZIGZAG` with its first entry dropped and every remaining index reduced by one.
pub const ZIGZAG1: [usize; 15] = [
    0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
];
/*pub const IL_SCAN: [usize; 16] = [
    0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
];*/
/// Zigzag scan: position in scan order -> raster index inside an 8x8 block.
pub const ZIGZAG8X8: [usize; 64] = [
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
];
34
/// Dequantisation multipliers for 4x4 blocks.
///
/// The first index is a coefficient position class (see the `BLK_INDEX`
/// tables in the 4x4 IDCT functions), the second is `qp % 6`.
const LEVEL_SCALE: [[i16; 6]; 3] = [
    [ 10, 11, 13, 14, 16, 18 ],
    [ 16, 18, 20, 23, 25, 29 ],
    [ 13, 14, 16, 18, 20, 23 ]
];
40
41pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
42 let t0 = blk[0] + blk[2];
43 let t1 = blk[0] - blk[2];
44 let t2 = blk[1] + blk[3];
45 let t3 = blk[1] - blk[3];
46 blk[0] = t0 + t2;
47 blk[1] = t0 - t2;
48 blk[2] = t1 + t3;
49 blk[3] = t1 - t3;
50 if qp < 6 {
4a1ca15c 51 let mul = LEVEL_SCALE[0][qp as usize];
696e4e20
KS
52 for el in blk.iter_mut() {
53 *el = el.wrapping_mul(mul) >> 1;
54 }
55 } else {
4a1ca15c 56 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
57 let shift = qp / 6 - 1;
58 for el in blk.iter_mut() {
59 *el = el.wrapping_mul(mul) << shift;
60 }
61 }
62}
63
/// In-place 1-D transform passes shared by the IDCT routines.
///
/// * `luma_dc; a, b, c, d` — 4-point butterfly using wrapping arithmetic.
/// * `a, b, c, d, shift` — 4-point pass; each output gets a rounding bias of
///   `(1 << shift) >> 1` added and is then shifted right by `shift`.
/// * eight operands — 8-point pass using plain arithmetic on the operands
///   (the caller in this file uses `i32` intermediates).
///
/// Every operand must be a place expression, as it is both read and assigned.
macro_rules! transform {
    (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = $b.wrapping_add($d);
        let t3 = $b.wrapping_sub($d);
        $a = t0.wrapping_add(t2);
        $b = t1.wrapping_add(t3);
        $c = t1.wrapping_sub(t3);
        $d = t0.wrapping_sub(t2);
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = ($b >> 1).wrapping_sub($d);
        let t3 = $b.wrapping_add($d >> 1);
        let bias = 1 << $shift >> 1;
        $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
        $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
        $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
        $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => ({
        let e0 = $a + $e;
        let e1 = -$d + $f - $h - ($h >> 1);
        let e2 = $a - $e;
        let e3 = $b + $h - $d - ($d >> 1);
        let e4 = ($c >> 1) - $g;
        let e5 = -$b + $h + $f + ($f >> 1);
        let e6 = $c + ($g >> 1);
        let e7 = $d + $f + $b + ($b >> 1);

        let f0 = e0 + e6;
        let f1 = e1 + (e7 >> 2);
        let f2 = e2 + e4;
        let f3 = e3 + (e5 >> 2);
        let f4 = e2 - e4;
        let f5 = (e3 >> 2) - e5;
        let f6 = e0 - e6;
        let f7 = e7 - (e1 >> 2);

        $a = f0 + f7;
        $b = f2 + f5;
        $c = f4 + f3;
        $d = f6 + f1;
        $e = f6 - f1;
        $f = f4 - f3;
        $g = f2 - f5;
        $h = f0 - f7;
    });
}
115
116pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
117 if qp < 12 {
4a1ca15c 118 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
119 let shift = 2 - qp / 6;
120 let bias = 1 << shift >> 1;
121 for el in blk.iter_mut() {
122 *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
123 }
124 } else {
4a1ca15c 125 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
126 let shift = qp / 6 - 2;
127 for el in blk.iter_mut() {
128 *el = el.wrapping_mul(mul) << shift;
129 }
130 }
131 for i in 0..4 {
132 transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
133 }
754ab49a 134 for row in blk.chunks_exact_mut(4) {
696e4e20
KS
135 transform!(luma_dc; row[0], row[1], row[2], row[3]);
136 }
137}
138
fe64781d 139pub fn idct_skip_dc(blk: &mut [i16; 16], qp: u8) {
696e4e20
KS
140 const BLK_INDEX: [usize; 16] = [
141 0, 2, 0, 2,
142 2, 1, 2, 1,
143 0, 2, 0, 2,
144 2, 1, 2, 1
145 ];
146 let qidx = (qp % 6) as usize;
147 let shift = qp / 6;
fe64781d
KS
148 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(1) {
149 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
150 }
151 for row in blk.chunks_exact_mut(4) {
152 transform!(row[0], row[1], row[2], row[3], 0);
153 }
154 for i in 0..4 {
155 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
156 }
157}
158
159pub fn idct(blk: &mut [i16; 16], qp: u8) {
160 const BLK_INDEX: [usize; 16] = [
161 0, 2, 0, 2,
162 2, 1, 2, 1,
163 0, 2, 0, 2,
164 2, 1, 2, 1
165 ];
166 let qidx = (qp % 6) as usize;
167 let shift = qp / 6;
168 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()) {
696e4e20
KS
169 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
170 }
754ab49a 171 for row in blk.chunks_exact_mut(4) {
3ad9bf2b
KS
172 transform!(row[0], row[1], row[2], row[3], 0);
173 }
174 for i in 0..4 {
175 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
696e4e20
KS
176 }
177}
178
179pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
180 let dc = if quant_dc {
181 (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
182 } else {
183 blk[0]
184 };
185 *blk = [(dc + 0x20) >> 6; 16];
186}
187
/// Dequantisation multipliers for 8x8 blocks.
///
/// The outer index is `qp % 6`; each inner table is a 4x4 tile that
/// `idct8x8` addresses as `(x & 3) + (y & 3) * 4`.
const QMAT_8X8: [[u8; 16]; 6] = [
  [
    20, 19, 25, 19,
    19, 18, 24, 18,
    25, 24, 32, 24,
    19, 18, 24, 18
  ], [
    22, 21, 28, 21,
    21, 19, 26, 19,
    28, 26, 35, 26,
    21, 19, 26, 19
  ], [
    26, 24, 33, 24,
    24, 23, 31, 23,
    33, 31, 42, 31,
    24, 23, 31, 23
  ], [
    28, 26, 35, 26,
    26, 25, 33, 25,
    35, 33, 45, 33,
    26, 25, 33, 25
  ], [
    32, 30, 40, 30,
    30, 28, 38, 28,
    40, 38, 51, 38,
    30, 28, 38, 28
  ], [
    36, 34, 46, 34,
    34, 32, 43, 32,
    46, 43, 58, 43,
    34, 32, 43, 32
  ]
];
221
222pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
223 for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
224 if *el != 0 {
225 *el = el.wrapping_mul(i16::from(slist[scan]));
226 }
227 }
228}
229
230pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
231 let mut tmp = [0i32; 64];
232 let qmat = &QMAT_8X8[(qp % 6) as usize];
233 if qp >= 36 {
234 let shift = qp / 6 - 6;
235 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
236 let x = i & 7;
237 let y = i >> 3;
238 let idx = (x & 3) + (y & 3) * 4;
239 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
240 }
241 } else {
242 let shift = 6 - qp / 6;
243 let bias = (1 << shift) >> 1;
244 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
245 let x = i & 7;
246 let y = i >> 3;
247 let idx = (x & 3) + (y & 3) * 4;
248 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
249 }
250 }
754ab49a 251 for row in tmp.chunks_exact_mut(8) {
696e4e20
KS
252 transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
253 }
254 for col in 0..8 {
255 transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
256 tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
257 }
258 for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
259 *dst = ((src + 0x20) >> 6) as i16;
260 }
261}
262
/// Adds a 4x4 block of residuals to the pixels of `dst`, saturating to 0..=255.
///
/// The block starts at `dst[offset]` and consecutive rows are `stride` bytes
/// apart. `coeffs` is read in rows of four.
///
/// # Panics
/// Panics if `dst` is shorter than `offset + stride * 3 + 4` or if `stride` is zero.
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
    let out = &mut dst[offset..][..stride * 3 + 4];
    for (line, residuals) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
        for (pix, &res) in line.iter_mut().take(4).zip(residuals.iter()) {
            *pix = (i32::from(*pix) + i32::from(res)).clamp(0, 255) as u8;
        }
    }
}
271
/// Adds an 8x8 block of residuals to the pixels of `dst`, saturating to 0..=255.
///
/// The block starts at `dst[offset]` and consecutive rows are `stride` bytes
/// apart.
///
/// # Panics
/// Panics if `offset` is past the end of `dst` or if `stride` is zero.
pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
    let out = &mut dst[offset..];
    for (line, residuals) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
        for (pix, &res) in line.iter_mut().take(8).zip(residuals.iter()) {
            *pix = (i32::from(*pix) + i32::from(res)).clamp(0, 255) as u8;
        }
    }
}
280
696e4e20
KS
/// Saturates a signed 16-bit value to the 0..=255 pixel range.
fn clip8(val: i16) -> u8 { val.clamp(0, 255) as u8 }
282
22de733b
KS
/// Fills a `bsize` x `bsize` block (rows `stride` bytes apart) with the value 128.
fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(128);
    }
}
22de733b
KS
/// Vertical prediction: copies the first `bsize` bytes of `top` into each of
/// the `bsize` rows of the block.
fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
    let edge = &top[..bsize];
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].copy_from_slice(edge);
    }
}
22de733b
KS
/// Horizontal prediction: row `y` of the block is filled with `left[y + 1]`
/// (`left[0]` is skipped).
fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
    for (row, &edge) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
        row[..bsize].fill(edge);
    }
}
/// DC prediction from both edges: averages `top[..bsize]` and
/// `left[1..=bsize]` with rounding (`shift` is the log2 of the sample count)
/// and fills the block with the result.
fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
    let sum: u16 = top[..bsize]
        .iter()
        .chain(left[1..=bsize].iter())
        .map(|&p| u16::from(p))
        .sum();
    let dc = ((sum + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
/// DC prediction from the left edge only: averages `left[1..=bsize]` with
/// rounding (`shift` is the log2 of the sample count) and fills the block.
fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
    let sum: u16 = left[1..=bsize].iter().map(|&p| u16::from(p)).sum();
    let dc = ((sum + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
/// DC prediction from the top edge only: averages `top[..bsize]` with
/// rounding (`shift` is the log2 of the sample count) and fills the block.
fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
    let sum: u16 = top[..bsize].iter().map(|&p| u16::from(p)).sum();
    let dc = ((sum + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
336
22de733b
KS
/// Widens bytes from `src` into `dst`, stopping at the shorter of the two;
/// any remaining `dst` entries are left untouched.
fn load(dst: &mut [u16], src: &[u8]) {
    for (wide, &byte) in dst.iter_mut().zip(src.iter()) {
        *wide = u16::from(byte);
    }
}
342
22de733b
KS
343fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
344 ipred_ver(buf, stride, top, 4);
696e4e20 345}
22de733b
KS
346fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
347 ipred_hor(buf, stride, left, 4);
696e4e20 348}
22de733b 349fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
696e4e20 350 let mut t: [u16; 9] = [0; 9];
22de733b
KS
351 load(&mut t[..4], top);
352 load(&mut t[4..8], tr);
696e4e20
KS
353 t[8] = t[7];
354
696e4e20 355 for i in 0..4 {
22de733b 356 buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
696e4e20 357 }
22de733b 358 let dst = &mut buf[stride..];
696e4e20
KS
359 for i in 0..4 {
360 dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
361 }
22de733b 362 let dst = &mut buf[stride * 2..];
696e4e20
KS
363 for i in 0..4 {
364 dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
365 }
22de733b 366 let dst = &mut buf[stride * 3..];
696e4e20
KS
367 for i in 0..4 {
368 dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
369 }
370}
22de733b 371fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 372 let mut t: [u16; 5] = [0; 5];
22de733b 373 t[0] = u16::from(left[0]);
42005e25 374 load(&mut t[1..], top);
696e4e20 375 let mut l: [u16; 5] = [0; 5];
22de733b
KS
376 load(&mut l, left);
377 let dst = buf;
696e4e20
KS
378
379 for j in 0..4 {
380 for i in 0..j {
381 dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
382 }
383 dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
384 for i in (j+1)..4 {
385 dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
386 }
387 }
388}
22de733b 389fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 390 let mut t: [u16; 5] = [0; 5];
22de733b 391 t[0] = u16::from(left[0]);
42005e25 392 load(&mut t[1..], top);
696e4e20 393 let mut l: [u16; 5] = [0; 5];
22de733b
KS
394 load(&mut l, left);
395 let dst = buf;
696e4e20
KS
396
397 for j in 0..4 {
398 for i in 0..4 {
399 let zvr = ((2 * i) as i8) - (j as i8);
400 let pix;
401 if zvr >= 0 {
402 if (zvr & 1) == 0 {
403 pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
404 } else {
405 pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
406 }
407 } else {
408 if zvr == -1 {
409 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
410 } else {
411 pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
412 }
413 }
414 dst[i + j * stride] = pix as u8;
415 }
416 }
417}
22de733b 418fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
696e4e20 419 let mut t: [u16; 8] = [0; 8];
42005e25 420 load(&mut t[..4], top);
22de733b
KS
421 load(&mut t[4..], tr);
422 let dst = buf;
696e4e20
KS
423
424 dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
425 let pix = ((t[1] + t[2] + 1) >> 1) as u8;
426 dst[1 + 0 * stride] = pix;
427 dst[0 + 2 * stride] = pix;
428 let pix = ((t[2] + t[3] + 1) >> 1) as u8;
429 dst[2 + 0 * stride] = pix;
430 dst[1 + 2 * stride] = pix;
431 let pix = ((t[3] + t[4] + 1) >> 1) as u8;
432 dst[3 + 0 * stride] = pix;
433 dst[2 + 2 * stride] = pix;
434 dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
435 dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
436 let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
437 dst[1 + 1 * stride] = pix;
438 dst[0 + 3 * stride] = pix;
439 let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
440 dst[2 + 1 * stride] = pix;
441 dst[1 + 3 * stride] = pix;
442 let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
443 dst[3 + 1 * stride] = pix;
444 dst[2 + 3 * stride] = pix;
445 dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
446}
22de733b 447fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 448 let mut t: [u16; 5] = [0; 5];
22de733b 449 t[0] = u16::from(left[0]);
42005e25 450 load(&mut t[1..], top);
696e4e20 451 let mut l: [u16; 5] = [0; 5];
22de733b
KS
452 load(&mut l, left);
453 let dst = buf;
696e4e20
KS
454
455 for j in 0..4 {
456 for i in 0..4 {
457 let zhd = ((2 * j) as i8) - (i as i8);
458 let pix;
459 if zhd >= 0 {
460 if (zhd & 1) == 0 {
461 pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
462 } else {
463 pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
464 }
465 } else {
466 if zhd == -1 {
467 pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
468 } else {
469 pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
470 }
471 }
472 dst[i + j * stride] = pix as u8;
473 }
474 }
475}
22de733b 476fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
696e4e20 477 let mut l: [u16; 8] = [0; 8];
22de733b
KS
478 load(&mut l, &left[1..]);
479 let dst = buf;
696e4e20
KS
480
481 dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
482 dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
483 let pix = ((l[1] + l[2] + 1) >> 1) as u8;
484 dst[2 + 0 * stride] = pix;
485 dst[0 + 1 * stride] = pix;
486 let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
487 dst[3 + 0 * stride] = pix;
488 dst[1 + 1 * stride] = pix;
489 let pix = ((l[2] + l[3] + 1) >> 1) as u8;
490 dst[2 + 1 * stride] = pix;
491 dst[0 + 2 * stride] = pix;
492 let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
493 dst[3 + 1 * stride] = pix;
494 dst[1 + 2 * stride] = pix;
495 dst[3 + 2 * stride] = l[3] as u8;
496 dst[1 + 3 * stride] = l[3] as u8;
497 dst[0 + 3 * stride] = l[3] as u8;
498 dst[2 + 2 * stride] = l[3] as u8;
499 dst[2 + 3 * stride] = l[3] as u8;
500 dst[3 + 3 * stride] = l[3] as u8;
501}
22de733b
KS
502fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
503 ipred_dc(buf, stride, top, left, 4, 3);
696e4e20 504}
22de733b
KS
505fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
506 ipred_left_dc(buf, stride, left, 4, 2);
696e4e20 507}
22de733b
KS
508fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
509 ipred_top_dc(buf, stride, top, 4, 2);
696e4e20 510}
22de733b
KS
511fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
512 ipred_dc128(buf, stride, 4);
696e4e20
KS
513}
514
/// Filtered neighbour samples consumed by the 8x8 luma intra predictors.
pub struct IPred8Context {
    /// Filtered top edge (16 samples: top followed by top-right).
    pub t:      [u8; 16],
    /// Filtered left edge (8 samples).
    pub l:      [u8; 8],
    /// Filtered top-left corner sample.
    pub tl:     u8,
}

impl IPred8Context {
    /// Creates a context with every sample set to 128.
    pub fn new() -> Self {
        Self {
            t:      [128; 16],
            l:      [128; 8],
            tl:     128,
        }
    }
    /// Builds the filtered edges from raw neighbours.
    ///
    /// `top` supplies the top row (`top[..8]`) and, when `has_tr` is set, the
    /// top-right row (`top[8..16]`). `left[0]` is the corner sample and
    /// `left[1..9]` the left column. Unavailable neighbours default to 0x80;
    /// a missing top-right is replaced by the last top sample and a missing
    /// corner by the first sample of the respective edge. Each output sample
    /// is a rounded 1-2-1 filter of its raw neighbourhood.
    pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
        // Index 0 holds the corner; the trailing entries pad the filter window.
        let mut t = [0x80u8; 19];
        let mut l = [0x80u8; 11];
        if has_t {
            t[1..8 + 1].copy_from_slice(&top[..8]);
        }
        if has_tr {
            t[8 + 1..16 + 1].copy_from_slice(&top[8..][..8]);
            t[16 + 1] = t[15 + 1];
            t[17 + 1] = t[15 + 1];
        } else {
            let last_top = t[7 + 1];
            t[8 + 1..].fill(last_top);
        }
        if has_l {
            l[1..9].copy_from_slice(&left[1..9]);
            l[8 + 1] = l[7 + 1];
            l[9 + 1] = l[7 + 1];
        }
        if has_tl {
            t[0] = left[0];
            l[0] = left[0];
        } else {
            t[0] = t[1];
            l[0] = l[1];
        }

        let smooth = |a: u8, b: u8, c: u8| -> u8 {
            ((u16::from(a) + 2 * u16::from(b) + u16::from(c) + 2) >> 2) as u8
        };
        for (dst, win) in self.t.iter_mut().zip(t.windows(3)) {
            *dst = smooth(win[0], win[1], win[2]);
        }
        for (dst, win) in self.l.iter_mut().zip(l.windows(3)) {
            *dst = smooth(win[0], win[1], win[2]);
        }
        self.tl = if has_t && has_l {
                smooth(t[1], t[0], l[1])
            } else if has_t {
                ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
            } else if has_l {
                ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
            } else {
                t[0]
            };
    }
}
575
576fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
577 for row in buf.chunks_mut(stride).take(8) {
578 row[..8].copy_from_slice(&ctx.t[..8]);
579 }
580}
581fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
582 for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
583 row[..8].copy_from_slice(&[l; 8]);
584 }
585}
586fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
587 let mut t = [0u16; 16];
22de733b 588 load(&mut t, &ctx.t);
696e4e20
KS
589
590 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
591 for (x, pix) in row.iter_mut().take(8).enumerate() {
592 *pix = ((if (x != 7) || (y != 7) {
593 t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
594 } else {
595 t[14] + 3 * t[15]
596 } + 2) >> 2) as u8;
597 }
598 }
599}
600fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
601 let mut t = [0u16; 9];
602 t[0] = u16::from(ctx.tl);
22de733b 603 load(&mut t[1..], &ctx.t);
696e4e20
KS
604 let mut l = [0u16; 9];
605 l[0] = u16::from(ctx.tl);
22de733b 606 load(&mut l[1..], &ctx.l);
696e4e20
KS
607 let diag = t[1] + 2 * t[0] + l[1];
608
609 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
610 for (x, pix) in row.iter_mut().take(8).enumerate() {
611 *pix = ((if x > y {
612 t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
613 } else if x < y {
614 l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
615 } else {
616 diag
617 } + 2) >> 2) as u8;
618 }
619 }
620}
621fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
622 let mut t = [0u16; 9];
623 t[0] = u16::from(ctx.tl);
22de733b 624 load(&mut t[1..], &ctx.t);
696e4e20
KS
625 let mut l = [0u16; 9];
626 l[0] = u16::from(ctx.tl);
22de733b 627 load(&mut l[1..], &ctx.l);
696e4e20
KS
628
629 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
630 for (x, pix) in row.iter_mut().take(8).enumerate() {
631 let zvr = 2 * (x as i8) - (y as i8);
632 *pix = if zvr >= 0 {
633 let ix = x - (y >> 1);
634 if (zvr & 1) == 0 {
635 (t[ix] + t[ix + 1] + 1) >> 1
636 } else {
637 (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
638 }
639 } else if zvr == -1 {
2b6a8fdc 640 (l[1] + 2 * l[0] + t[1] + 2) >> 2
696e4e20
KS
641 } else {
642 let ix = y - 2 * x;
643 (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
644 } as u8;
645 }
646 }
647}
648fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
649 let mut t = [0u16; 16];
22de733b 650 load(&mut t, &ctx.t);
696e4e20
KS
651
652 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
653 for (x, pix) in row.iter_mut().take(8).enumerate() {
654 let ix = x + (y >> 1);
655 *pix = if (y & 1) == 0 {
656 (t[ix] + t[ix + 1] + 1) >> 1
657 } else {
658 (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
659 } as u8;
660 }
661 }
662
663}
664fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
665 let mut t = [0u16; 9];
666 t[0] = u16::from(ctx.tl);
22de733b 667 load(&mut t[1..], &ctx.t);
696e4e20
KS
668 let mut l = [0u16; 9];
669 l[0] = u16::from(ctx.tl);
22de733b 670 load(&mut l[1..], &ctx.l);
696e4e20
KS
671
672 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
673 for (x, pix) in row.iter_mut().take(8).enumerate() {
674 let zhd = 2 * (y as i8) - (x as i8);
675 *pix = if zhd >= 0 {
676 let ix = y - (x >> 1);
677 if (zhd & 1) == 0 {
678 (l[ix] + l[ix + 1] + 1) >> 1
679 } else {
680 (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
681 }
682 } else if zhd == -1 {
2b6a8fdc 683 (l[1] + 2 * l[0] + t[1] + 2) >> 2
696e4e20
KS
684 } else {
685 let ix = x - 2 * y;
686 (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
687 } as u8;
688 }
689 }
690}
691fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
692 let mut l = [0u16; 8];
22de733b 693 load(&mut l, &ctx.l);
696e4e20
KS
694
695 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
696 for (x, pix) in row.iter_mut().take(8).enumerate() {
697 let zhu = x + 2 * y;
698 let ix = y + (x >> 1);
699 *pix = if zhu > 13 {
700 l[7]
701 } else if zhu == 13 {
702 (l[6] + 3 * l[7] + 2) >> 2
703 } else if (zhu & 1) != 0 {
704 (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
705 } else {
706 (l[ix] + l[ix + 1] + 1) >> 1
707 } as u8;
708 }
709 }
710}
711fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
712 let mut sum = 0u16;
713 for &t in ctx.t[..8].iter() {
714 sum += u16::from(t);
715 }
716 for &l in ctx.l[..8].iter() {
717 sum += u16::from(l);
718 }
719 let dc = ((sum + 8) >> 4) as u8;
720 for row in buf.chunks_mut(stride).take(8) {
721 for pix in row.iter_mut().take(8) {
722 *pix = dc;
723 }
724 }
725}
726fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
727 let mut sum = 0u16;
728 for &l in ctx.l[..8].iter() {
729 sum += u16::from(l);
730 }
731 let dc = ((sum + 4) >> 3) as u8;
732 for row in buf.chunks_mut(stride).take(8) {
733 for pix in row.iter_mut().take(8) {
734 *pix = dc;
735 }
736 }
737}
738fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
739 let mut sum = 0u16;
740 for &t in ctx.t[..8].iter() {
741 sum += u16::from(t);
742 }
743 let dc = ((sum + 4) >> 3) as u8;
744 for row in buf.chunks_mut(stride).take(8) {
745 for pix in row.iter_mut().take(8) {
746 *pix = dc;
747 }
748 }
749}
750fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
22de733b 751 ipred_dc128(buf, stride, 8);
696e4e20
KS
752}
753
22de733b
KS
754fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
755 ipred_ver(buf, stride, top, 8);
696e4e20 756}
22de733b
KS
757fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
758 ipred_hor(buf, stride, left, 8);
696e4e20 759}
22de733b
KS
760fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
761 let mut l = [0; 8];
762 load(&mut l, &left[1..]);
763 let mut t = [0; 8];
42005e25 764 load(&mut t, top);
696e4e20
KS
765
766 let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
767 let sum1 = t[4] + t[5] + t[6] + t[7];
768 let dc1 = ((sum1 + 2) >> 2) as u8;
769 let sum2 = l[4] + l[5] + l[6] + l[7];
770 let dc2 = ((sum2 + 2) >> 2) as u8;
771 let dc3 = ((sum1 + sum2 + 4) >> 3) as u8;
772
22de733b 773 for row in buf.chunks_mut(stride).take(4) {
696e4e20
KS
774 row[..4].copy_from_slice(&[dc0; 4]);
775 row[4..8].copy_from_slice(&[dc1; 4]);
776 }
22de733b 777 for row in buf.chunks_mut(stride).skip(4).take(4) {
696e4e20
KS
778 row[..4].copy_from_slice(&[dc2; 4]);
779 row[4..8].copy_from_slice(&[dc3; 4]);
780 }
781}
/// 8x8 DC prediction from the left edge only.
///
/// Rows 0..4 are filled with the rounded mean of `left[1..5]` and rows 4..8
/// with the rounded mean of `left[5..9]`.
fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    let upper: u16 = left[1..].iter().take(4).map(|&p| u16::from(p)).sum();
    let lower: u16 = left[1..].iter().skip(4).take(4).map(|&p| u16::from(p)).sum();
    let dc0 = ((upper + 2) >> 2) as u8;
    let dc2 = ((lower + 2) >> 2) as u8;
    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        row[..8].fill(if y < 4 { dc0 } else { dc2 });
    }
}
22de733b
KS
/// 8x8 DC prediction from the top edge only.
///
/// Columns 0..4 of every row are filled with the rounded mean of `top[..4]`
/// and columns 4..8 with the rounded mean of `top[4..8]`. Each half is
/// computed once and written directly, rather than deriving the lower rows
/// from pixels already written to `buf`.
fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    let mean4 = |samples: &[u8]| -> u8 {
        let sum: u16 = samples.iter().map(|&p| u16::from(p)).sum();
        ((sum + 2) >> 2) as u8
    };
    let dc_lhs = mean4(&top[..4]);
    let dc_rhs = mean4(&top[4..8]);
    for row in buf.chunks_mut(stride).take(8) {
        row[..4].fill(dc_lhs);
        row[4..8].fill(dc_rhs);
    }
}
22de733b
KS
808fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
809 ipred_dc128(buf, stride, 8);
696e4e20 810}
22de733b
KS
811fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
812 let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
813 let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
814 for i in 0..3 {
696e4e20 815 let i1 = (i + 1) as i32;
22de733b
KS
816 h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
817 v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
696e4e20
KS
818 }
819 let b = (17 * h + 16) >> 5;
820 let c = (17 * v + 16) >> 5;
22de733b
KS
821 let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
822 for line in buf.chunks_mut(stride).take(8) {
696e4e20
KS
823 let mut acc = a;
824 for el in line.iter_mut().take(8) {
825 *el = clip8((acc >> 5) as i16);
826 acc += b;
827 }
828 a += c;
829 }
830}
831
22de733b
KS
832fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
833 ipred_ver(buf, stride, top, 16);
696e4e20 834}
22de733b
KS
835fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
836 ipred_hor(buf, stride, left, 16);
696e4e20 837}
22de733b
KS
838fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
839 ipred_dc(buf, stride, top, left, 16, 5);
696e4e20 840}
22de733b
KS
841fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
842 ipred_left_dc(buf, stride, left, 16, 4);
696e4e20 843}
22de733b
KS
844fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
845 ipred_top_dc(buf, stride, top, 16, 4);
696e4e20 846}
22de733b
KS
847fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
848 ipred_dc128(buf, stride, 16);
696e4e20 849}
22de733b
KS
850fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
851 let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
852 let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
853 for k in 0..7 {
854 h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
855 v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
696e4e20 856 }
22de733b 857
696e4e20
KS
858 h = (5 * h + 32) >> 6;
859 v = (5 * v + 32) >> 6;
860
22de733b 861 let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);
696e4e20 862
22de733b 863 for row in buf.chunks_mut(stride).take(16) {
696e4e20
KS
864 let mut b = a;
865 a += v;
866
867 for dst in row.chunks_exact_mut(4).take(4) {
868 dst[0] = clip8(((b ) >> 5) as i16);
869 dst[1] = clip8(((b + h) >> 5) as i16);
870 dst[2] = clip8(((b + 2*h) >> 5) as i16);
871 dst[3] = clip8(((b + 3*h) >> 5) as i16);
872 b += h * 4;
873 }
874 }
875}
876
22de733b
KS
877pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
878pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
696e4e20
KS
879pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);
880
881pub const IPRED4_DC128: usize = 11;
882pub const IPRED4_DC_TOP: usize = 10;
883pub const IPRED4_DC_LEFT: usize = 9;
884pub const IPRED8_DC128: usize = 6;
885pub const IPRED8_DC_TOP: usize = 5;
886pub const IPRED8_DC_LEFT: usize = 4;
887
888pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
889 ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
890 ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
891 ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
892 ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
893];
894
895pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
896 ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
897 ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
898 ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
899 ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
900 ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
901];
902
903pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
904 ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
905 ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
906];
907
908pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
909 ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
910 ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
911];
912
696e4e20
KS
/// Deblocking filter kernels applied across one edge position.
///
/// `$buf[$off]` is the first sample on the far side of the edge (`q0`) and
/// `$step` is the distance between consecutive samples across the edge, so
/// `p0` is `$buf[$off - $step]`.
///
/// * `lumaedge` — strong luma filter; each side is smoothed over three
///   samples when its `beta` test passes and `|p0 - q0| < (alpha >> 2) + 2`,
///   otherwise only the sample next to the edge is replaced.
/// * `chromaedge` — strong chroma filter; rewrites `p0` and `q0` only.
/// * `lumanormal` — normal luma filter; the `p0`/`q0` correction is limited
///   to `tc0` plus one for each side passing its `beta` test, and `p1`/`q1`
///   are adjusted (limited to `tc0`) when their side passes and `tc0 > 0`.
/// * `chromanormal` — normal chroma filter; correction limited to `tc0 + 1`.
///
/// The normal arms call `clip8`, which must be in scope at the call site.
macro_rules! loop_filter {
    (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        // Both sides share the same small-step condition across the edge.
        let small_step = (p0 - q0).abs() < (($alpha >> 2) + 2);
        if a_p && small_step {
            let p3 = i16::from($buf[$off - $step * 4]);
            $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
            $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
            $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
        } else {
            $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        }
        if a_q && small_step {
            let q3 = i16::from($buf[$off + $step * 3]);
            $buf[$off]             = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
            $buf[$off + $step]     = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
            $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
        } else {
            $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
        }
    };
    (chromaedge; $buf: expr, $off: expr, $step: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        $buf[$off]         = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    };
    (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        let tc = $tc0 + (a_p as i16) + (a_q as i16);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        if a_p && ($tc0 > 0) {
            $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
        }
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off]         = clip8(q0 - delta);
        if a_q && ($tc0 > 0) {
            $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
        }
    };
    (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let tc = $tc0 + 1;
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off]         = clip8(q0 - delta);
    }
}
979
980fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
981 let p1 = i16::from(buf[off - step * 2]);
982 let p0 = i16::from(buf[off - step]);
983 let q0 = i16::from(buf[off]);
984 let q1 = i16::from(buf[off + step]);
985 (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
986}
987
932ae27b
KS
/// Portable version of the filter decision for four consecutive edge positions.
///
/// Starting at `off` and advancing by `stride` each time, evaluates the same
/// test as the single-position check: the step across the edge must be below
/// `alpha` and the difference between the two samples on each side below
/// `beta`. `step` is the distance between samples across the edge.
///
/// Returns one flag per position, in the order visited. Panics if a sample
/// position lies outside `buf`.
#[cfg(not(target_arch="x86_64"))]
fn check_filter4(buf: &[u8], mut off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let sample = |idx: usize| i16::from(buf[idx]);
    let mut flags = [false; 4];
    for slot in &mut flags {
        let (p1, p0, q0, q1) = (sample(off - step * 2), sample(off - step), sample(off), sample(off + step));
        let edge_ok = (p0 - q0).abs() < alpha;
        let sides_ok = (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
        *slot = edge_ok && sides_ok;
        off += stride;
    }
    flags
}
1001
/// SIMD version of the filter decision for four consecutive edge positions.
///
/// Computes the same four flags as the portable implementation: for each of
/// the four positions the step across the edge must be below `alpha` and the
/// difference between the two samples on each side below `beta`.
///
/// The routine loads four groups of four bytes, `max(step, stride)` bytes
/// apart, beginning at `off - 2 * step`. When `step == 1` each group holds
/// `p1 p0 q0 q1` of one position, so the 4x4 block is transposed first;
/// otherwise each group already holds the same tap for all four positions.
///
/// Panics if the 4x4 window is not fully inside `buf`.
///
/// NOTE(review): `pabsw` is an SSSE3 instruction, which is not part of the
/// x86_64 baseline; confirm that all supported targets provide it.
#[cfg(target_arch="x86_64")]
fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let load_stride = step.max(stride);
    let first = off - step * 2;
    // Slice out exactly the bytes the assembly touches so that a too-short
    // buffer panics here instead of being read out of bounds.
    let window = &buf[first..first + load_stride * 3 + 4];
    let src = window.as_ptr();
    let tflag = u32::from(step == 1);
    // The comparison results are 0x00/0xFF bytes, which are not valid `bool`
    // values, so they are stored as plain bytes and converted afterwards.
    let mut masks = [0u8; 4];
    let mptr = masks.as_mut_ptr();
    // SAFETY: the assembly reads four dwords at `src + k * load_stride` for
    // k in 0..4, all of which lie inside `window`; it writes one dword to
    // `masks`, which is four bytes long; every XMM register it modifies is
    // declared as clobbered and `tmp` is a scratch output.
    unsafe {
        asm! {
            // load block
            "pxor xmm4, xmm4",
            "movd xmm0, dword ptr [{src}]",
            "lea {tmp}, [{src} + {stride} * 2]",
            "movd xmm1, dword ptr [{src} + {stride}]",
            "movd xmm2, dword ptr [{tmp}]",
            "movd xmm3, dword ptr [{tmp} + {stride}]",
            "punpcklbw xmm0, xmm4",
            "punpcklbw xmm1, xmm4",
            "punpcklbw xmm2, xmm4",
            "punpcklbw xmm3, xmm4",

            // transpose block if necessary so it's always processed by rows
            "test {tflag:e}, {tflag:e}",
            "jz 2f",
            "punpcklwd xmm0, xmm1",
            "movhlps xmm4, xmm0",
            "punpcklwd xmm2, xmm3",
            "movhlps xmm1, xmm2",
            "punpckldq xmm0, xmm2",
            "punpckldq xmm4, xmm1",
            "movhlps xmm1, xmm0",
            "movhlps xmm3, xmm4",
            "movaps xmm2, xmm4",
            "2:",

            // calculate deltas and flags
            "movd xmm4, {alpha:r}",
            "movd xmm5, {beta:r}",
            "psubw xmm0, xmm1",
            "psubw xmm1, xmm2",
            "psubw xmm3, xmm2",
            "pshuflw xmm4, xmm4, 0",
            "pshuflw xmm5, xmm5, 0",
            "pabsw xmm0, xmm0", // |p1 - p0|
            "pabsw xmm1, xmm1", // |p0 - q0|
            "pabsw xmm2, xmm3", // |q1 - q0|
            "movaps xmm3, xmm5",
            "pcmpgtw xmm4, xmm1",
            "pcmpgtw xmm5, xmm0",
            "pcmpgtw xmm3, xmm2",
            "pand xmm4, xmm5",
            "pand xmm4, xmm3",
            "packsswb xmm4, xmm4",
            "movd [{flags}], xmm4",
            tmp = out(reg) _,
            src = in(reg) src,
            stride = in(reg) load_stride,
            alpha = in(reg) alpha,
            beta = in(reg) beta,
            flags = in(reg) mptr,
            tflag = in(reg) tflag,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
            out("xmm4") _,
            out("xmm5") _,
        }
    }
    masks.map(|mask| mask != 0)
}
1073
696e4e20 1074pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
932ae27b
KS
1075 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1076 for &flag in flags.iter() {
1077 if flag {
696e4e20
KS
1078 loop_filter!(lumaedge; dst, off, 1, alpha, beta);
1079 }
1080 off += stride;
1081 }
1082}
1083pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
932ae27b
KS
1084 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1085 for (x, &flag) in flags.iter().enumerate() {
1086 if flag {
696e4e20
KS
1087 loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
1088 }
1089 }
1090}
1091pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
932ae27b
KS
1092 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1093 for &flag in flags.iter() {
1094 if flag {
696e4e20
KS
1095 loop_filter!(lumanormal; dst, off, 1, tc0, beta);
1096 }
1097 off += stride;
1098 }
1099}
1100pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
932ae27b
KS
1101 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1102 for (x, &flag) in flags.iter().enumerate() {
1103 if flag {
696e4e20
KS
1104 loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
1105 }
1106 }
1107}
1108pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
22de733b 1109 for _ in 0..2 {
696e4e20
KS
1110 if check_filter(dst, off, 1, alpha, beta) {
1111 loop_filter!(chromaedge; dst, off, 1);
1112 }
1113 off += stride;
1114 }
1115}
1116pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
22de733b 1117 for x in 0..2 {
696e4e20
KS
1118 if check_filter(dst, off + x, stride, alpha, beta) {
1119 loop_filter!(chromaedge; dst, off + x, stride);
1120 }
1121 }
1122}
1123pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
22de733b 1124 for _ in 0..2 {
696e4e20
KS
1125 if check_filter(dst, off, 1, alpha, beta) {
1126 loop_filter!(chromanormal; dst, off, 1, tc0);
1127 }
1128 off += stride;
1129 }
1130}
1131pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
22de733b 1132 for x in 0..2 {
696e4e20
KS
1133 if check_filter(dst, off + x, stride, alpha, beta) {
1134 loop_filter!(chromanormal; dst, off + x, stride, tc0);
1135 }
1136 }
1137}