]> git.nihav.org Git - nihav.git/blob - dsp/mod.rs
add MPEG-4 ASP decoder
[nihav.git] / dsp / mod.rs
1 mod mc;
2 pub use mc::{H264MC, McBlock};
3 #[cfg(target_arch="x86_64")]
4 use std::arch::asm;
5
/// Chroma quantiser lookup table with 52 entries.
///
/// Entries 0..=29 map to themselves; above that the values grow more slowly
/// and top out at 39. Presumably indexed by the (clipped) luma QP — confirm
/// against the callers.
pub const CHROMA_QUANTS: [u8; 52] = [
     0,  1,  2,  3,  4,  5,  6,  7,
     8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 29, 29, 30,
    31, 32, 32, 33, 34, 34, 35, 35,
    36, 36, 37, 37, 37, 38, 38, 38,
    39, 39, 39, 39,
];
12
/// Scan order for the four chroma DC coefficients (identity order).
pub const CHROMA_DC_SCAN: [usize; 4] = [0, 1, 2, 3];

/// Zigzag scan for a 4x4 block: entry `n` is the raster position of the
/// `n`-th coefficient in scan order.
pub const ZIGZAG: [usize; 16] = [
    0,  1,  4,  8,
    5,  2,  3,  6,
    9, 12, 13, 10,
    7, 11, 14, 15,
];

/// 15-entry variant of [`ZIGZAG`] with the first entry removed and every
/// remaining position reduced by one.
pub const ZIGZAG1: [usize; 15] = [
     0,  3,  7,  4,  1,
     2,  5,  8, 11, 12,
     9,  6, 10, 13, 14,
];
20 /*pub const IL_SCAN: [usize; 16] = [
21 0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
22 ];*/
/// Zigzag scan for an 8x8 block: entry `n` is the raster position
/// (`row * 8 + column`) of the `n`-th coefficient in scan order.
pub const ZIGZAG8X8: [usize; 64] = [
     0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
];
33
/// Dequantisation multipliers: three rows of six values, the column being
/// selected by `qp % 6`. Row 0 is the one used for DC coefficients.
const LEVEL_SCALE: [[i16; 6]; 3] = [
    [ 10, 11, 13, 14, 16, 18 ],
    [ 16, 18, 20, 23, 25, 29 ],
    [ 13, 14, 16, 18, 20, 23 ]
];

/// Applies the 2x2 butterfly to the four chroma DC coefficients in place and
/// then rescales them for quantiser `qp`.
///
/// For `qp < 6` the scaled values are halved (arithmetic shift right by one);
/// otherwise they are shifted left by `qp / 6 - 1`.
///
/// All additions, subtractions and multiplications wrap on overflow, like the
/// other transforms in this module, so out-of-range coefficients from a
/// damaged stream cannot trigger an arithmetic-overflow panic.
pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
    let [a, b, c, d] = *blk;
    let t0 = a.wrapping_add(c);
    let t1 = a.wrapping_sub(c);
    let t2 = b.wrapping_add(d);
    let t3 = b.wrapping_sub(d);
    *blk = [
        t0.wrapping_add(t2),
        t0.wrapping_sub(t2),
        t1.wrapping_add(t3),
        t1.wrapping_sub(t3),
    ];

    // For qp < 6 `qp % 6 == qp`, so one lookup serves both branches.
    let mul = LEVEL_SCALE[0][usize::from(qp % 6)];
    if qp < 6 {
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) >> 1;
        }
    } else {
        let shift = qp / 6 - 1;
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) << shift;
        }
    }
}
62
/// In-place butterfly transforms on 4 or 8 assignable expressions.
///
/// * `transform!(luma_dc; a, b, c, d)` — four-point add/subtract butterfly,
///   wrapping arithmetic, no scaling.
/// * `transform!(a, b, c, d, shift)` — four-point transform whose odd inputs
///   are also used halved; every result gets the rounding bias
///   `(1 << shift) >> 1` added and is shifted right by `shift`.
/// * `transform!(a, b, c, d, e, f, g, h)` — eight-point transform using plain
///   (non-wrapping) arithmetic; the caller applies any scaling.
macro_rules! transform {
    (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
        let sum_ac = $a.wrapping_add($c);
        let dif_ac = $a.wrapping_sub($c);
        let sum_bd = $b.wrapping_add($d);
        let dif_bd = $b.wrapping_sub($d);
        $a = sum_ac.wrapping_add(sum_bd);
        $b = dif_ac.wrapping_add(dif_bd);
        $c = dif_ac.wrapping_sub(dif_bd);
        $d = sum_ac.wrapping_sub(sum_bd);
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
        let even_sum = $a.wrapping_add($c);
        let even_dif = $a.wrapping_sub($c);
        let odd_lo = ($b >> 1).wrapping_sub($d);
        let odd_hi = $b.wrapping_add($d >> 1);
        // Half of the divisor, so the final shift rounds to nearest.
        let round = (1 << $shift) >> 1;
        $a = even_sum.wrapping_add(odd_hi).wrapping_add(round) >> $shift;
        $b = even_dif.wrapping_add(odd_lo).wrapping_add(round) >> $shift;
        $c = even_dif.wrapping_sub(odd_lo).wrapping_add(round) >> $shift;
        $d = even_sum.wrapping_sub(odd_hi).wrapping_add(round) >> $shift;
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
        // First stage: even-indexed terms come from a/c/e/g, odd from b/d/f/h.
        let s0 = $a + $e;
        let s1 = -$d + $f - $h - ($h >> 1);
        let s2 = $a - $e;
        let s3 = $b + $h - $d - ($d >> 1);
        let s4 = ($c >> 1) - $g;
        let s5 = -$b + $h + $f + ($f >> 1);
        let s6 = $c + ($g >> 1);
        let s7 = $d + $f + $b + ($b >> 1);

        // Second stage.
        let m0 = s0 + s6;
        let m1 = s1 + (s7 >> 2);
        let m2 = s2 + s4;
        let m3 = s3 + (s5 >> 2);
        let m4 = s2 - s4;
        let m5 = (s3 >> 2) - s5;
        let m6 = s0 - s6;
        let m7 = s7 - (s1 >> 2);

        // Final butterflies: outputs mirror around the centre.
        $a = m0 + m7;
        $b = m2 + m5;
        $c = m4 + m3;
        $d = m6 + m1;
        $e = m6 - m1;
        $f = m4 - m3;
        $g = m2 - m5;
        $h = m0 - m7;
    };
}
114
115 pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
116 if qp < 12 {
117 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
118 let shift = 2 - qp / 6;
119 let bias = 1 << shift >> 1;
120 for el in blk.iter_mut() {
121 *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
122 }
123 } else {
124 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
125 let shift = qp / 6 - 2;
126 for el in blk.iter_mut() {
127 *el = el.wrapping_mul(mul) << shift;
128 }
129 }
130 for i in 0..4 {
131 transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
132 }
133 for row in blk.chunks_mut(4) {
134 transform!(luma_dc; row[0], row[1], row[2], row[3]);
135 }
136 }
137
138 pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
139 const BLK_INDEX: [usize; 16] = [
140 0, 2, 0, 2,
141 2, 1, 2, 1,
142 0, 2, 0, 2,
143 2, 1, 2, 1
144 ];
145 let qidx = (qp % 6) as usize;
146 let shift = qp / 6;
147 let start = if quant_dc { 0 } else { 1 };
148 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(start) {
149 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
150 }
151 for row in blk.chunks_mut(4) {
152 transform!(row[0], row[1], row[2], row[3], 0);
153 }
154 for i in 0..4 {
155 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
156 }
157 }
158
159 pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
160 let dc = if quant_dc {
161 (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
162 } else {
163 blk[0]
164 };
165 *blk = [(dc + 0x20) >> 6; 16];
166 }
167
/// Dequantisation multipliers for 8x8 blocks: one 4x4 pattern (row-major)
/// for each value of `qp % 6`. Each pattern is symmetric.
const QMAT_8X8: [[u8; 16]; 6] = [
    // qp % 6 == 0
    [20, 19, 25, 19,  19, 18, 24, 18,  25, 24, 32, 24,  19, 18, 24, 18],
    // qp % 6 == 1
    [22, 21, 28, 21,  21, 19, 26, 19,  28, 26, 35, 26,  21, 19, 26, 19],
    // qp % 6 == 2
    [26, 24, 33, 24,  24, 23, 31, 23,  33, 31, 42, 31,  24, 23, 31, 23],
    // qp % 6 == 3
    [28, 26, 35, 26,  26, 25, 33, 25,  35, 33, 45, 33,  26, 25, 33, 25],
    // qp % 6 == 4
    [32, 30, 40, 30,  30, 28, 38, 28,  40, 38, 51, 38,  30, 28, 38, 28],
    // qp % 6 == 5
    [36, 34, 46, 34,  34, 32, 43, 32,  46, 43, 58, 43,  34, 32, 43, 32],
];
201
202 pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
203 for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
204 if *el != 0 {
205 *el = el.wrapping_mul(i16::from(slist[scan]));
206 }
207 }
208 }
209
210 pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
211 let mut tmp = [0i32; 64];
212 let qmat = &QMAT_8X8[(qp % 6) as usize];
213 if qp >= 36 {
214 let shift = qp / 6 - 6;
215 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
216 let x = i & 7;
217 let y = i >> 3;
218 let idx = (x & 3) + (y & 3) * 4;
219 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
220 }
221 } else {
222 let shift = 6 - qp / 6;
223 let bias = (1 << shift) >> 1;
224 for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
225 let x = i & 7;
226 let y = i >> 3;
227 let idx = (x & 3) + (y & 3) * 4;
228 *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
229 }
230 }
231 for row in tmp.chunks_mut(8) {
232 transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
233 }
234 for col in 0..8 {
235 transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
236 tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
237 }
238 for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
239 *dst = ((src + 0x20) >> 6) as i16;
240 }
241 }
242
/// Adds a 4x4 residue (`coeffs`, four values per row) to the pixels of `dst`
/// starting at `offset`, with rows `stride` bytes apart, saturating each sum
/// to `0..=255`.
///
/// Panics if `dst` does not hold `stride * 3 + 4` bytes after `offset`.
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
    let area = &mut dst[offset..][..stride * 3 + 4];
    let rows = area.chunks_mut(stride).take(4);
    for (pixels, residues) in rows.zip(coeffs.chunks(4)) {
        pixels.iter_mut().take(4).zip(residues).for_each(|(pix, &res)| {
            *pix = (i32::from(*pix) + i32::from(res)).clamp(0, 255) as u8;
        });
    }
}
251
/// Adds an 8x8 residue to the pixels of `dst` starting at `offset`, with rows
/// `stride` bytes apart, saturating each sum to `0..=255`.
pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
    let rows = dst[offset..].chunks_mut(stride).take(8);
    for (pixels, residues) in rows.zip(coeffs.chunks(8)) {
        pixels.iter_mut().take(8).zip(residues).for_each(|(pix, &res)| {
            *pix = (i32::from(*pix) + i32::from(res)).clamp(0, 255) as u8;
        });
    }
}
260
/// Saturates a signed 16-bit value to the `0..=255` range of a pixel.
fn clip8(val: i16) -> u8 {
    val.clamp(0, 255) as u8
}
262
/// Fills the top-left `bsize` x `bsize` square of `buf` (rows `stride` apart)
/// with the mid-grey value 128.
fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
    buf.chunks_mut(stride)
        .take(bsize)
        .for_each(|line| line[..bsize].fill(128));
}
/// Vertical prediction: copies the first `bsize` bytes of `top` into each of
/// the first `bsize` rows of `buf`.
fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
    buf.chunks_mut(stride)
        .take(bsize)
        .for_each(|line| line[..bsize].copy_from_slice(&top[..bsize]));
}
/// Horizontal prediction: row `n` is filled with `left[n + 1]`
/// (`left[0]` is skipped).
fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
    let rows = buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize);
    for (line, &px) in rows {
        line[..bsize].fill(px);
    }
}
/// DC prediction from both edges: averages `top[..bsize]` and
/// `left[1..=bsize]` with rounding (`shift` is the log2 of the sample count)
/// and fills the `bsize` x `bsize` block with the result.
fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
    let top_sum: u16 = top[..bsize].iter().map(|&v| u16::from(v)).sum();
    let left_sum: u16 = left[1..=bsize].iter().map(|&v| u16::from(v)).sum();
    let total = top_sum + left_sum;
    let dc = ((total + (1 << (shift - 1))) >> shift) as u8;

    for line in buf.chunks_mut(stride).take(bsize) {
        line[..bsize].fill(dc);
    }
}
/// DC prediction from the left edge only: rounded average of
/// `left[1..=bsize]`, written to the whole `bsize` x `bsize` block.
fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
    let total: u16 = left[1..=bsize].iter().map(|&v| u16::from(v)).sum();
    let dc = ((total + (1 << (shift - 1))) >> shift) as u8;

    for line in buf.chunks_mut(stride).take(bsize) {
        line[..bsize].fill(dc);
    }
}
/// DC prediction from the top edge only: rounded average of `top[..bsize]`,
/// written to the whole `bsize` x `bsize` block.
fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
    let total: u16 = top[..bsize].iter().map(|&v| u16::from(v)).sum();
    let dc = ((total + (1 << (shift - 1))) >> shift) as u8;

    for line in buf.chunks_mut(stride).take(bsize) {
        line[..bsize].fill(dc);
    }
}
316
/// Widens bytes from `src` into `dst`; stops at the shorter of the two and
/// leaves any remaining `dst` entries untouched.
fn load(dst: &mut [u16], src: &[u8]) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(wide, &byte)| *wide = u16::from(byte));
}
322
323 fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
324 ipred_ver(buf, stride, top, 4);
325 }
326 fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
327 ipred_hor(buf, stride, left, 4);
328 }
/// 4x4 diagonal-down-left prediction.
///
/// The edge is `top[..4]` followed by `tr[..4]`, with the last sample
/// repeated once; pixel `(x, y)` is the 1-2-1 filtered edge value centred at
/// `x + y + 1`.
fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut edge = [0u16; 9];
    for (wide, &px) in edge[..4].iter_mut().zip(top.iter()) {
        *wide = u16::from(px);
    }
    for (wide, &px) in edge[4..8].iter_mut().zip(tr.iter()) {
        *wide = u16::from(px);
    }
    edge[8] = edge[7];

    for y in 0..4 {
        let line = &mut buf[y * stride..];
        for x in 0..4 {
            let k = x + y;
            line[x] = ((edge[k] + 2 * edge[k + 1] + edge[k + 2] + 2) >> 2) as u8;
        }
    }
}
/// 4x4 diagonal-down-right prediction.
///
/// `left[0]` is the corner sample. Pixels above the main diagonal are 1-2-1
/// filtered from the corner plus `top`, pixels below it from `left`, and the
/// diagonal itself from `left[1]`, the corner and `top[0]`.
fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut above = [0u16; 5];
    above[0] = u16::from(left[0]);
    for (wide, &px) in above[1..].iter_mut().zip(top.iter()) {
        *wide = u16::from(px);
    }
    let mut side = [0u16; 5];
    for (wide, &px) in side.iter_mut().zip(left.iter()) {
        *wide = u16::from(px);
    }

    for y in 0..4 {
        for x in 0..4 {
            let sum = if x < y {
                let d = y - x;
                side[d - 1] + 2 * side[d] + side[d + 1]
            } else if x > y {
                let d = x - y;
                above[d - 1] + 2 * above[d] + above[d + 1]
            } else {
                side[1] + 2 * side[0] + above[1]
            };
            buf[x + y * stride] = ((sum + 2) >> 2) as u8;
        }
    }
}
/// 4x4 vertical-right prediction.
///
/// `left[0]` is the corner sample. The selector `2 * x - y` decides whether a
/// pixel is a two-tap average or a 1-2-1 filter of the top edge, the corner
/// filter, or a 1-2-1 filter of the left edge.
fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut above = [0u16; 5];
    above[0] = u16::from(left[0]);
    for (wide, &px) in above[1..].iter_mut().zip(top.iter()) {
        *wide = u16::from(px);
    }
    let mut side = [0u16; 5];
    for (wide, &px) in side.iter_mut().zip(left.iter()) {
        *wide = u16::from(px);
    }

    for y in 0..4 {
        for x in 0..4 {
            let zvr = ((2 * x) as i8) - (y as i8);
            let pix = match zvr {
                -1 => (side[1] + 2 * side[0] + above[1] + 2) >> 2,
                z if z < 0 => (side[y] + 2 * side[y - 1] + side[y - 2] + 2) >> 2,
                z if (z & 1) == 0 => {
                    let k = x - (y >> 1);
                    (above[k] + above[k + 1] + 1) >> 1
                }
                _ => {
                    let k = x - (y >> 1);
                    (above[k - 1] + 2 * above[k] + above[k + 1] + 2) >> 2
                }
            };
            buf[x + y * stride] = pix as u8;
        }
    }
}
/// 4x4 vertical-left prediction from `top[..4]` followed by `tr[..4]`.
///
/// With `k = x + (y >> 1)`, even rows take the rounded average of edge
/// samples `k` and `k + 1`; odd rows take the 1-2-1 filter of `k..=k + 2`.
fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut edge = [0u16; 8];
    for (wide, &px) in edge[..4].iter_mut().zip(top.iter()) {
        *wide = u16::from(px);
    }
    for (wide, &px) in edge[4..].iter_mut().zip(tr.iter()) {
        *wide = u16::from(px);
    }

    for y in 0..4 {
        for x in 0..4 {
            let k = x + (y >> 1);
            let pix = if (y & 1) == 0 {
                (edge[k] + edge[k + 1] + 1) >> 1
            } else {
                (edge[k] + 2 * edge[k + 1] + edge[k + 2] + 2) >> 2
            };
            buf[x + y * stride] = pix as u8;
        }
    }
}
/// 4x4 horizontal-down prediction.
///
/// `left[0]` is the corner sample. The selector `2 * y - x` decides whether a
/// pixel is a two-tap average or a 1-2-1 filter of the left edge, the corner
/// filter, or a 1-2-1 filter of the top edge.
fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut above = [0u16; 5];
    above[0] = u16::from(left[0]);
    for (wide, &px) in above[1..].iter_mut().zip(top.iter()) {
        *wide = u16::from(px);
    }
    let mut side = [0u16; 5];
    for (wide, &px) in side.iter_mut().zip(left.iter()) {
        *wide = u16::from(px);
    }

    for y in 0..4 {
        for x in 0..4 {
            let zhd = ((2 * y) as i8) - (x as i8);
            let pix = match zhd {
                -1 => (side[1] + 2 * side[0] + above[1] + 2) >> 2,
                z if z < 0 => (above[x - 2] + 2 * above[x - 1] + above[x] + 2) >> 2,
                z if (z & 1) == 0 => {
                    let k = y - (x >> 1);
                    (side[k] + side[k + 1] + 1) >> 1
                }
                _ => {
                    let k = y - (x >> 1);
                    (side[k - 1] + 2 * side[k] + side[k + 1] + 2) >> 2
                }
            };
            buf[x + y * stride] = pix as u8;
        }
    }
}
/// 4x4 horizontal-up prediction from `left[1..=4]`.
///
/// With `z = x + 2 * y`: positions past `z == 5` repeat the last left sample,
/// `z == 5` blends the last two samples 1:3, odd `z` uses a 1-2-1 filter and
/// even `z` a two-tap average.
fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut side = [0u16; 4];
    for (wide, &px) in side.iter_mut().zip(left[1..].iter()) {
        *wide = u16::from(px);
    }

    for y in 0..4 {
        for x in 0..4 {
            let zhu = x + 2 * y;
            let k = y + (x >> 1);
            let pix = if zhu > 5 {
                side[3]
            } else if zhu == 5 {
                (side[2] + 3 * side[3] + 2) >> 2
            } else if (zhu & 1) != 0 {
                (side[k] + 2 * side[k + 1] + side[k + 2] + 2) >> 2
            } else {
                (side[k] + side[k + 1] + 1) >> 1
            };
            buf[x + y * stride] = pix as u8;
        }
    }
}
482 fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
483 ipred_dc(buf, stride, top, left, 4, 3);
484 }
485 fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
486 ipred_left_dc(buf, stride, left, 4, 2);
487 }
488 fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
489 ipred_top_dc(buf, stride, top, 4, 2);
490 }
491 fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
492 ipred_dc128(buf, stride, 4);
493 }
494
/// Pre-filtered neighbour samples for the 8x8 luma predictors.
pub struct IPred8Context {
    /// Filtered top edge: eight samples above the block plus eight to the
    /// right of them.
    pub t: [u8; 16],
    /// Filtered left edge.
    pub l: [u8; 8],
    /// Filtered top-left corner sample.
    pub tl: u8,
}

impl IPred8Context {
    /// Creates a context with every sample set to 128.
    pub fn new() -> Self {
        Self {
            t: [128; 16],
            l: [128; 8],
            tl: 128,
        }
    }

    /// Gathers the neighbours that are available and stores their 1-2-1
    /// low-pass filtered versions in `self`.
    ///
    /// * `top` — samples above the block; `top[..8]` is read when `has_t`,
    ///   `top[8..16]` when `has_tr`.
    /// * `left` — `left[0]` is the corner (read when `has_tl`), `left[1..9]`
    ///   the left column (read when `has_l`).
    ///
    /// Unavailable samples default to 128, except that a missing top-right
    /// part repeats the last top sample and a missing corner repeats the
    /// first sample of each edge.
    pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
        // Raw edges: slot 0 is the corner, the tail is padding for the filter.
        let mut raw_top = [0x80u8; 19];
        let mut raw_left = [0x80u8; 11];

        if has_t {
            raw_top[1..9].copy_from_slice(&top[..8]);
        }
        if has_tr {
            raw_top[9..17].copy_from_slice(&top[8..][..8]);
            raw_top[17] = raw_top[16];
            raw_top[18] = raw_top[16];
        } else {
            let last = raw_top[8];
            raw_top[9..].fill(last);
        }
        if has_l {
            raw_left[1..9].copy_from_slice(&left[1..9]);
            raw_left[9] = raw_left[8];
            raw_left[10] = raw_left[8];
        }
        if has_tl {
            raw_top[0] = left[0];
            raw_left[0] = left[0];
        } else {
            raw_top[0] = raw_top[1];
            raw_left[0] = raw_left[1];
        }

        let smooth = |a: u8, b: u8, c: u8| -> u8 {
            ((u16::from(a) + 2 * u16::from(b) + u16::from(c) + 2) >> 2) as u8
        };

        for (dst, win) in self.t.iter_mut().zip(raw_top.windows(3)) {
            *dst = smooth(win[0], win[1], win[2]);
        }
        for (dst, win) in self.l.iter_mut().zip(raw_left.windows(3)) {
            *dst = smooth(win[0], win[1], win[2]);
        }
        self.tl = match (has_t, has_l) {
            (true, true) => smooth(raw_top[1], raw_top[0], raw_left[1]),
            (true, false) => smooth(raw_top[0], raw_top[0], raw_top[1]),
            (false, true) => smooth(raw_left[0], raw_left[0], raw_left[1]),
            (false, false) => raw_top[0],
        };
    }
}
555
556 fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
557 for row in buf.chunks_mut(stride).take(8) {
558 row[..8].copy_from_slice(&ctx.t[..8]);
559 }
560 }
561 fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
562 for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
563 row[..8].copy_from_slice(&[l; 8]);
564 }
565 }
566 fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
567 let mut t = [0u16; 16];
568 load(&mut t, &ctx.t);
569
570 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
571 for (x, pix) in row.iter_mut().take(8).enumerate() {
572 *pix = ((if (x != 7) || (y != 7) {
573 t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
574 } else {
575 t[14] + 3 * t[15]
576 } + 2) >> 2) as u8;
577 }
578 }
579 }
580 fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
581 let mut t = [0u16; 9];
582 t[0] = u16::from(ctx.tl);
583 load(&mut t[1..], &ctx.t);
584 let mut l = [0u16; 9];
585 l[0] = u16::from(ctx.tl);
586 load(&mut l[1..], &ctx.l);
587 let diag = t[1] + 2 * t[0] + l[1];
588
589 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
590 for (x, pix) in row.iter_mut().take(8).enumerate() {
591 *pix = ((if x > y {
592 t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
593 } else if x < y {
594 l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
595 } else {
596 diag
597 } + 2) >> 2) as u8;
598 }
599 }
600 }
601 fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
602 let mut t = [0u16; 9];
603 t[0] = u16::from(ctx.tl);
604 load(&mut t[1..], &ctx.t);
605 let mut l = [0u16; 9];
606 l[0] = u16::from(ctx.tl);
607 load(&mut l[1..], &ctx.l);
608
609 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
610 for (x, pix) in row.iter_mut().take(8).enumerate() {
611 let zvr = 2 * (x as i8) - (y as i8);
612 *pix = if zvr >= 0 {
613 let ix = x - (y >> 1);
614 if (zvr & 1) == 0 {
615 (t[ix] + t[ix + 1] + 1) >> 1
616 } else {
617 (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
618 }
619 } else if zvr == -1 {
620 (l[1] + 2 * l[0] + t[1] + 2) >> 2
621 } else {
622 let ix = y - 2 * x;
623 (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
624 } as u8;
625 }
626 }
627 }
628 fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
629 let mut t = [0u16; 16];
630 load(&mut t, &ctx.t);
631
632 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
633 for (x, pix) in row.iter_mut().take(8).enumerate() {
634 let ix = x + (y >> 1);
635 *pix = if (y & 1) == 0 {
636 (t[ix] + t[ix + 1] + 1) >> 1
637 } else {
638 (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
639 } as u8;
640 }
641 }
642
643 }
644 fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
645 let mut t = [0u16; 9];
646 t[0] = u16::from(ctx.tl);
647 load(&mut t[1..], &ctx.t);
648 let mut l = [0u16; 9];
649 l[0] = u16::from(ctx.tl);
650 load(&mut l[1..], &ctx.l);
651
652 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
653 for (x, pix) in row.iter_mut().take(8).enumerate() {
654 let zhd = 2 * (y as i8) - (x as i8);
655 *pix = if zhd >= 0 {
656 let ix = y - (x >> 1);
657 if (zhd & 1) == 0 {
658 (l[ix] + l[ix + 1] + 1) >> 1
659 } else {
660 (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
661 }
662 } else if zhd == -1 {
663 (l[1] + 2 * l[0] + t[1] + 2) >> 2
664 } else {
665 let ix = x - 2 * y;
666 (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
667 } as u8;
668 }
669 }
670 }
671 fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
672 let mut l = [0u16; 8];
673 load(&mut l, &ctx.l);
674
675 for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
676 for (x, pix) in row.iter_mut().take(8).enumerate() {
677 let zhu = x + 2 * y;
678 let ix = y + (x >> 1);
679 *pix = if zhu > 13 {
680 l[7]
681 } else if zhu == 13 {
682 (l[6] + 3 * l[7] + 2) >> 2
683 } else if (zhu & 1) != 0 {
684 (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
685 } else {
686 (l[ix] + l[ix + 1] + 1) >> 1
687 } as u8;
688 }
689 }
690 }
691 fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
692 let mut sum = 0u16;
693 for &t in ctx.t[..8].iter() {
694 sum += u16::from(t);
695 }
696 for &l in ctx.l[..8].iter() {
697 sum += u16::from(l);
698 }
699 let dc = ((sum + 8) >> 4) as u8;
700 for row in buf.chunks_mut(stride).take(8) {
701 for pix in row.iter_mut().take(8) {
702 *pix = dc;
703 }
704 }
705 }
706 fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
707 let mut sum = 0u16;
708 for &l in ctx.l[..8].iter() {
709 sum += u16::from(l);
710 }
711 let dc = ((sum + 4) >> 3) as u8;
712 for row in buf.chunks_mut(stride).take(8) {
713 for pix in row.iter_mut().take(8) {
714 *pix = dc;
715 }
716 }
717 }
718 fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
719 let mut sum = 0u16;
720 for &t in ctx.t[..8].iter() {
721 sum += u16::from(t);
722 }
723 let dc = ((sum + 4) >> 3) as u8;
724 for row in buf.chunks_mut(stride).take(8) {
725 for pix in row.iter_mut().take(8) {
726 *pix = dc;
727 }
728 }
729 }
730 fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
731 ipred_dc128(buf, stride, 8);
732 }
733
734 fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
735 ipred_ver(buf, stride, top, 8);
736 }
737 fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
738 ipred_hor(buf, stride, left, 8);
739 }
/// 8x8 DC prediction computed separately for each 4x4 quadrant.
///
/// * top-left: average of the first four top and first four left samples
/// * top-right: average of the last four top samples
/// * bottom-left: average of the last four left samples
/// * bottom-right: average of the last four top and last four left samples
///
/// `left[0]` is skipped; the left column starts at `left[1]`.
fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut side = [0u16; 8];
    for (wide, &px) in side.iter_mut().zip(left[1..].iter()) {
        *wide = u16::from(px);
    }
    let mut above = [0u16; 8];
    for (wide, &px) in above.iter_mut().zip(top.iter()) {
        *wide = u16::from(px);
    }

    let sum4 = |vals: &[u16]| vals.iter().sum::<u16>();
    let (top_lo, top_hi) = (sum4(&above[..4]), sum4(&above[4..]));
    let (left_lo, left_hi) = (sum4(&side[..4]), sum4(&side[4..]));

    let dc_tl = ((top_lo + left_lo + 4) >> 3) as u8;
    let dc_tr = ((top_hi + 2) >> 2) as u8;
    let dc_bl = ((left_hi + 2) >> 2) as u8;
    let dc_br = ((top_hi + left_hi + 4) >> 3) as u8;

    for (y, line) in buf.chunks_mut(stride).take(8).enumerate() {
        let (lhs, rhs) = if y < 4 { (dc_tl, dc_tr) } else { (dc_bl, dc_br) };
        line[..4].fill(lhs);
        line[4..8].fill(rhs);
    }
}
/// 8x8 DC prediction from the left edge only: the top four rows get the
/// average of `left[1..5]`, the bottom four rows the average of `left[5..9]`.
fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    let column = &left[1..];
    let upper_sum: u16 = column.iter().take(4).map(|&v| u16::from(v)).sum();
    let lower_sum: u16 = column.iter().skip(4).take(4).map(|&v| u16::from(v)).sum();
    let upper_dc = ((upper_sum + 2) >> 2) as u8;
    let lower_dc = ((lower_sum + 2) >> 2) as u8;

    for (y, line) in buf.chunks_mut(stride).take(8).enumerate() {
        line[..8].fill(if y < 4 { upper_dc } else { lower_dc });
    }
}
780 fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
781 ipred_top_dc(buf, stride, top, 4, 2);
782 ipred_top_dc(&mut buf[4..], stride, &top[4..], 4, 2);
783 let mut top = [0; 8];
784 top.copy_from_slice(&buf[stride * 3..][..8]);
785 ipred_top_dc(&mut buf[4 * stride..], stride, &top, 4, 2);
786 ipred_top_dc(&mut buf[4 + 4 * stride..], stride, &top[4..], 4, 2);
787 }
788 fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
789 ipred_dc128(buf, stride, 8);
790 }
/// 8x8 plane prediction: fits a linear gradient to the top row and the left
/// column and evaluates it for every pixel, saturating to `0..=255`.
///
/// `left[0]` is the corner sample; the left column is `left[1..=8]`.
fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let corner = i32::from(left[0]);
    let mut grad_h = 4 * (i32::from(top[7]) - corner);
    let mut grad_v = 4 * (i32::from(left[8]) - corner);
    for (k, weight) in (1i32..=3).enumerate() {
        grad_h += weight * (i32::from(top[4 + k]) - i32::from(top[2 - k]));
        grad_v += weight * (i32::from(left[5 + k]) - i32::from(left[3 - k]));
    }
    let step_x = (17 * grad_h + 16) >> 5;
    let step_y = (17 * grad_v + 16) >> 5;
    // Value of the plane at the top-left pixel, including the rounding term.
    let mut row_start = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (step_x + step_y) + 16;
    for line in buf.chunks_mut(stride).take(8) {
        let mut acc = row_start;
        for pix in line.iter_mut().take(8) {
            *pix = ((acc >> 5) as i16).clamp(0, 255) as u8;
            acc += step_x;
        }
        row_start += step_y;
    }
}
811
812 fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
813 ipred_ver(buf, stride, top, 16);
814 }
815 fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
816 ipred_hor(buf, stride, left, 16);
817 }
818 fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
819 ipred_dc(buf, stride, top, left, 16, 5);
820 }
821 fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
822 ipred_left_dc(buf, stride, left, 16, 4);
823 }
824 fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
825 ipred_top_dc(buf, stride, top, 16, 4);
826 }
827 fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
828 ipred_dc128(buf, stride, 16);
829 }
/// 16x16 plane prediction: fits a linear gradient to the top row and the left
/// column and evaluates it for every pixel, saturating to `0..=255`.
///
/// `left[0]` is the corner sample; the left column is `left[1..=16]`.
/// Rows are written in groups of four pixels, so a row shorter than 16 bytes
/// only receives its complete groups.
fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let corner = i32::from(left[0]);
    let mut grad_h = 8 * (i32::from(top[15]) - corner);
    let mut grad_v = 8 * (i32::from(left[16]) - corner);
    for (k, weight) in (1i32..=7).enumerate() {
        grad_h += weight * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
        grad_v += weight * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
    }

    let step_x = (5 * grad_h + 32) >> 6;
    let step_y = (5 * grad_v + 32) >> 6;

    // Plane value at the top-left pixel; the "+ 1" carries the rounding term.
    let mut row_start = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (step_y + step_x);

    for line in buf.chunks_mut(stride).take(16) {
        let mut acc = row_start;
        row_start += step_y;

        for quad in line.chunks_exact_mut(4).take(4) {
            for pix in quad.iter_mut() {
                *pix = ((acc >> 5) as i16).clamp(0, 255) as u8;
                acc += step_x;
            }
        }
    }
}
856
857 pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
858 pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
859 pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);
860
/// Index of the left-only DC predictor in the 12-entry predictor tables.
pub const IPRED4_DC_LEFT: usize = 9;
/// Index of the top-only DC predictor in the 12-entry predictor tables.
pub const IPRED4_DC_TOP: usize = 10;
/// Index of the constant-128 predictor in the 12-entry predictor tables.
pub const IPRED4_DC128: usize = 11;
/// Index of the left-only DC predictor in the 7-entry predictor tables.
pub const IPRED8_DC_LEFT: usize = 4;
/// Index of the top-only DC predictor in the 7-entry predictor tables.
pub const IPRED8_DC_TOP: usize = 5;
/// Index of the constant-128 predictor in the 7-entry predictor tables.
pub const IPRED8_DC128: usize = 6;
867
868 pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
869 ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
870 ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
871 ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
872 ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
873 ];
874
875 pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
876 ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
877 ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
878 ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
879 ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
880 ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
881 ];
882
883 pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
884 ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
885 ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
886 ];
887
888 pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
889 ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
890 ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
891 ];
892
/// Deblocking kernels for one line of samples crossing an edge.
///
/// `$buf[$off]` is the first sample on the far side of the edge (`q0`) and
/// `$step` is the distance between neighbouring samples across the edge, so
/// `p0` lives at `$off - $step`.
///
/// * `lumaedge` — strong luma filter; each side is smoothed over three
///   samples when it is flat (`< $beta`) and the step across the edge is
///   below `($alpha >> 2) + 2`, otherwise only the sample next to the edge
///   is changed.
/// * `chromaedge` — strong chroma filter; changes `p0` and `q0` only.
/// * `lumanormal` — normal luma filter with clipping value `$tc0`, widened by
///   one for each flat side.
/// * `chromanormal` — normal chroma filter with clipping value `$tc0 + 1`.
macro_rules! loop_filter {
    (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let flat_p = (p2 - p0).abs() < $beta;
        let flat_q = (q2 - q0).abs() < $beta;
        // Both sides share the same "small step across the edge" test.
        let small_step = (p0 - q0).abs() < (($alpha >> 2) + 2);
        if flat_p && small_step {
            let p3 = i16::from($buf[$off - $step * 4]);
            $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
            $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
            $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
        } else {
            $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        }
        if flat_q && small_step {
            let q3 = i16::from($buf[$off + $step * 3]);
            $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
            $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
            $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
        } else {
            $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
        }
    };
    (chromaedge; $buf: expr, $off: expr, $step: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    };
    (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let flat_p = (p2 - p0).abs() < $beta;
        let flat_q = (q2 - q0).abs() < $beta;
        let tc = $tc0 + (flat_p as i16) + (flat_q as i16);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        // Rounded mean of the two samples next to the edge.
        let edge_avg = (p0 + q0 + 1) >> 1;
        if flat_p && ($tc0 > 0) {
            $buf[$off - $step * 2] = clip8(p1 + ((p2 + edge_avg - p1 * 2) >> 1).max(-$tc0).min($tc0));
        }
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
        if flat_q && ($tc0 > 0) {
            $buf[$off + $step] = clip8(q1 + ((q2 + edge_avg - q1 * 2) >> 1).max(-$tc0).min($tc0));
        }
    };
    (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let tc = $tc0 + 1;
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
    }
}
959
/// Returns true when the edge at `off` should be filtered: the step across
/// the edge must be below `alpha` and the step between the two samples on
/// each side must be below `beta`. `step` is the distance between
/// neighbouring samples across the edge.
fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
    let sample = |pos: usize| i16::from(buf[pos]);
    let (p1, p0, q0, q1) = (
        sample(off - step * 2),
        sample(off - step),
        sample(off),
        sample(off + step),
    );
    let across = (p0 - q0).abs();
    let inner_p = (p1 - p0).abs();
    let inner_q = (q1 - q0).abs();
    across < alpha && inner_p < beta && inner_q < beta
}
967
/// Portable version of the four-line filter decision.
///
/// Evaluates the same test as `check_filter` for four consecutive lines,
/// starting at `off` and advancing by `stride` per line; `step` is the
/// distance between neighbouring samples across the edge.
#[cfg(not(target_arch="x86_64"))]
fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let mut decisions = [false; 4];
    for (line, decision) in decisions.iter_mut().enumerate() {
        let pos = off + line * stride;
        let p1 = i16::from(buf[pos - step * 2]);
        let p0 = i16::from(buf[pos - step]);
        let q0 = i16::from(buf[pos]);
        let q1 = i16::from(buf[pos + step]);
        let across = (p0 - q0).abs();
        *decision = across < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
    }
    decisions
}
981
982 #[cfg(target_arch="x86_64")]
983 fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
984 unsafe {
985 let mut flags = [false; 4];
986 let src = buf[off - step * 2..].as_ptr();
987 let load_stride = step.max(stride);
988 let fptr = flags.as_mut_ptr();
989 let tflag = u32::from(step == 1);
990 asm! {
991 // load block
992 "pxor xmm4, xmm4",
993 "movd xmm0, dword ptr [{src}]",
994 "lea {tmp}, [{src} + {stride} * 2]",
995 "movd xmm1, dword ptr [{src} + {stride}]",
996 "movd xmm2, dword ptr [{tmp}]",
997 "movd xmm3, dword ptr [{tmp} + {stride}]",
998 "punpcklbw xmm0, xmm4",
999 "punpcklbw xmm1, xmm4",
1000 "punpcklbw xmm2, xmm4",
1001 "punpcklbw xmm3, xmm4",
1002
1003 // transpose block if necessary so it's always processed by rows
1004 "test {tflag:e}, {tflag:e}",
1005 "jz 1f",
1006 "punpcklwd xmm0, xmm1",
1007 "movhlps xmm4, xmm0",
1008 "punpcklwd xmm2, xmm3",
1009 "movhlps xmm1, xmm2",
1010 "punpckldq xmm0, xmm2",
1011 "punpckldq xmm4, xmm1",
1012 "movhlps xmm1, xmm0",
1013 "movhlps xmm3, xmm4",
1014 "movaps xmm2, xmm4",
1015 "1:",
1016
1017 // calculate deltas and flags
1018 "movd xmm4, {alpha:r}",
1019 "movd xmm5, {beta:r}",
1020 "psubw xmm0, xmm1",
1021 "psubw xmm1, xmm2",
1022 "psubw xmm3, xmm2",
1023 "pshuflw xmm4, xmm4, 0",
1024 "pshuflw xmm5, xmm5, 0",
1025 "pabsw xmm0, xmm0", // |p1 - p0|
1026 "pabsw xmm1, xmm1", // |p0 - q0|
1027 "pabsw xmm2, xmm3", // |q1 - q0|
1028 "movaps xmm3, xmm5",
1029 "pcmpgtw xmm4, xmm1",
1030 "pcmpgtw xmm5, xmm0",
1031 "pcmpgtw xmm3, xmm2",
1032 "pand xmm4, xmm5",
1033 "pand xmm4, xmm3",
1034 "packsswb xmm4, xmm4",
1035 "movd [{flags}], xmm4",
1036 tmp = out(reg) _,
1037 src = in(reg) src,
1038 stride = in(reg) load_stride,
1039 alpha = in(reg) alpha,
1040 beta = in(reg) beta,
1041 flags = in(reg) fptr,
1042 tflag = in(reg) tflag,
1043 out("xmm0") _,
1044 out("xmm1") _,
1045 out("xmm2") _,
1046 out("xmm3") _,
1047 out("xmm4") _,
1048 out("xmm5") _,
1049 }
1050 flags
1051 }
1052 }
1053
1054 pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
1055 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1056 for &flag in flags.iter() {
1057 if flag {
1058 loop_filter!(lumaedge; dst, off, 1, alpha, beta);
1059 }
1060 off += stride;
1061 }
1062 }
1063 pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
1064 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1065 for (x, &flag) in flags.iter().enumerate() {
1066 if flag {
1067 loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
1068 }
1069 }
1070 }
1071 pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1072 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1073 for &flag in flags.iter() {
1074 if flag {
1075 loop_filter!(lumanormal; dst, off, 1, tc0, beta);
1076 }
1077 off += stride;
1078 }
1079 }
1080 pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1081 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1082 for (x, &flag) in flags.iter().enumerate() {
1083 if flag {
1084 loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
1085 }
1086 }
1087 }
1088 pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
1089 for _ in 0..2 {
1090 if check_filter(dst, off, 1, alpha, beta) {
1091 loop_filter!(chromaedge; dst, off, 1);
1092 }
1093 off += stride;
1094 }
1095 }
1096 pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
1097 for x in 0..2 {
1098 if check_filter(dst, off + x, stride, alpha, beta) {
1099 loop_filter!(chromaedge; dst, off + x, stride);
1100 }
1101 }
1102 }
1103 pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1104 for _ in 0..2 {
1105 if check_filter(dst, off, 1, alpha, beta) {
1106 loop_filter!(chromanormal; dst, off, 1, tc0);
1107 }
1108 off += stride;
1109 }
1110 }
1111 pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
1112 for x in 0..2 {
1113 if check_filter(dst, off + x, stride, alpha, beta) {
1114 loop_filter!(chromanormal; dst, off + x, stride, tc0);
1115 }
1116 }
1117 }