// nihav-itu/src/codecs/h264/dsp/mod.rs
mod mc;
pub use mc::{H264MC, McBlock};
#[cfg(target_arch="x86_64")]
use std::arch::asm;

pub const CHROMA_QUANTS: [u8; 52] = [
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
    31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
    39, 39, 39, 39
];

pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
pub const ZIGZAG: [usize; 16] = [
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
];
pub const ZIGZAG1: [usize; 15] = [
    0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
];
/*pub const IL_SCAN: [usize; 16] = [
    0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
];*/
pub const ZIGZAG8X8: [usize; 64] = [
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
];

const LEVEL_SCALE: [[i16; 6]; 3] = [
    [ 10, 11, 13, 14, 16, 18 ],
    [ 16, 18, 20, 23, 25, 29 ],
    [ 13, 14, 16, 18, 20, 23 ]
];

pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
    let t0 = blk[0] + blk[2];
    let t1 = blk[0] - blk[2];
    let t2 = blk[1] + blk[3];
    let t3 = blk[1] - blk[3];
    blk[0] = t0 + t2;
    blk[1] = t0 - t2;
    blk[2] = t1 + t3;
    blk[3] = t1 - t3;
    if qp < 6 {
        let mul = LEVEL_SCALE[0][qp as usize];
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) >> 1;
        }
    } else {
        let mul = LEVEL_SCALE[0][(qp % 6) as usize];
        let shift = qp / 6 - 1;
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) << shift;
        }
    }
}
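#[cfg(test)]
mod chroma_dc_example {
    use super::*;

    // A minimal worked example with made-up values: a lone DC coefficient is
    // spread over all four chroma blocks by the 2x2 Hadamard in
    // chroma_dc_transform() and then rescaled. For qp = 0 the scale is
    // LEVEL_SCALE[0][0] = 10 and the result is halved, so an input of 8
    // becomes (8 * 10) >> 1 = 40 in every block.
    #[test]
    fn lone_dc_spreads_evenly() {
        let mut blk = [8i16, 0, 0, 0];
        chroma_dc_transform(&mut blk, 0);
        assert_eq!(blk, [40; 4]);
    }
}

// The three arms below implement, in order, the 4x4 luma DC Hadamard
// transform, the one-dimensional 4x4 inverse transform (with a final
// rounding shift) and the one-dimensional 8x8 inverse transform.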
macro_rules! transform {
    (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = $b.wrapping_add($d);
        let t3 = $b.wrapping_sub($d);
        $a = t0.wrapping_add(t2);
        $b = t1.wrapping_add(t3);
        $c = t1.wrapping_sub(t3);
        $d = t0.wrapping_sub(t2);
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = ($b >> 1).wrapping_sub($d);
        let t3 = $b.wrapping_add($d >> 1);
        let bias = 1 << $shift >> 1;
        $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
        $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
        $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
        $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
        let e0 = $a + $e;
        let e1 = -$d + $f - $h - ($h >> 1);
        let e2 = $a - $e;
        let e3 = $b + $h - $d - ($d >> 1);
        let e4 = ($c >> 1) - $g;
        let e5 = -$b + $h + $f + ($f >> 1);
        let e6 = $c + ($g >> 1);
        let e7 = $d + $f + $b + ($b >> 1);

        let f0 = e0 + e6;
        let f1 = e1 + (e7 >> 2);
        let f2 = e2 + e4;
        let f3 = e3 + (e5 >> 2);
        let f4 = e2 - e4;
        let f5 = (e3 >> 2) - e5;
        let f6 = e0 - e6;
        let f7 = e7 - (e1 >> 2);

        $a = f0 + f7;
        $b = f2 + f5;
        $c = f4 + f3;
        $d = f6 + f1;
        $e = f6 - f1;
        $f = f4 - f3;
        $g = f2 - f5;
        $h = f0 - f7;
    };
}

pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
    if qp < 12 {
        let mul = LEVEL_SCALE[0][(qp % 6) as usize];
        let shift = 2 - qp / 6;
        let bias = 1 << shift >> 1;
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
        }
    } else {
        let mul = LEVEL_SCALE[0][(qp % 6) as usize];
        let shift = qp / 6 - 2;
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) << shift;
        }
    }
    for i in 0..4 {
        transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
    }
    for row in blk.chunks_exact_mut(4) {
        transform!(luma_dc; row[0], row[1], row[2], row[3]);
    }
}

pub fn idct_skip_dc(blk: &mut [i16; 16], qp: u8) {
    const BLK_INDEX: [usize; 16] = [
        0, 2, 0, 2,
        2, 1, 2, 1,
        0, 2, 0, 2,
        2, 1, 2, 1
    ];
    let qidx = (qp % 6) as usize;
    let shift = qp / 6;
    for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(1) {
        *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
    }
    for row in blk.chunks_exact_mut(4) {
        transform!(row[0], row[1], row[2], row[3], 0);
    }
    for i in 0..4 {
        transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
    }
}

pub fn idct(blk: &mut [i16; 16], qp: u8) {
    const BLK_INDEX: [usize; 16] = [
        0, 2, 0, 2,
        2, 1, 2, 1,
        0, 2, 0, 2,
        2, 1, 2, 1
    ];
    let qidx = (qp % 6) as usize;
    let shift = qp / 6;
    for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()) {
        *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
    }
    for row in blk.chunks_exact_mut(4) {
        transform!(row[0], row[1], row[2], row[3], 0);
    }
    for i in 0..4 {
        transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
    }
}

pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
    let dc = if quant_dc {
        (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
    } else {
        blk[0]
    };
    *blk = [(dc + 0x20) >> 6; 16];
}
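#[cfg(test)]
mod idct_dc_example {
    use super::*;

    // A small consistency sketch with an arbitrary DC level and qp: for a
    // block that holds only a quantised DC coefficient, the full idct() and
    // the idct_dc() shortcut are expected to produce the same flat output.
    #[test]
    fn dc_only_block_matches_shortcut() {
        let qp = 26;
        let mut full = [0i16; 16];
        full[0] = 5;
        let mut dc_only = full;
        idct(&mut full, qp);
        idct_dc(&mut dc_only, qp, true);
        assert_eq!(full, dc_only);
    }
}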
const QMAT_8X8: [[u8; 16]; 6] = [
    [
        20, 19, 25, 19,
        19, 18, 24, 18,
        25, 24, 32, 24,
        19, 18, 24, 18
    ], [
        22, 21, 28, 21,
        21, 19, 26, 19,
        28, 26, 35, 26,
        21, 19, 26, 19
    ], [
        26, 24, 33, 24,
        24, 23, 31, 23,
        33, 31, 42, 31,
        24, 23, 31, 23
    ], [
        28, 26, 35, 26,
        26, 25, 33, 25,
        35, 33, 45, 33,
        26, 25, 33, 25
    ], [
        32, 30, 40, 30,
        30, 28, 38, 28,
        40, 38, 51, 38,
        30, 28, 38, 28
    ], [
        36, 34, 46, 34,
        34, 32, 43, 32,
        46, 43, 58, 43,
        34, 32, 43, 32
    ]
];

pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
    for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
        if *el != 0 {
            *el = el.wrapping_mul(i16::from(slist[scan]));
        }
    }
}

pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
    let mut tmp = [0i32; 64];
    let qmat = &QMAT_8X8[(qp % 6) as usize];
    if qp >= 36 {
        let shift = qp / 6 - 6;
        for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
            let x = i & 7;
            let y = i >> 3;
            let idx = (x & 3) + (y & 3) * 4;
            *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
        }
    } else {
        let shift = 6 - qp / 6;
        let bias = (1 << shift) >> 1;
        for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
            let x = i & 7;
            let y = i >> 3;
            let idx = (x & 3) + (y & 3) * 4;
            *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
        }
    }
    for row in tmp.chunks_exact_mut(8) {
        transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
    }
    for col in 0..8 {
        transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
                   tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
    }
    for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
        *dst = ((src + 0x20) >> 6) as i16;
    }
}
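#[cfg(test)]
mod idct8x8_example {
    use super::*;

    // A usage sketch with arbitrary values: the scaling list is applied with
    // dequant8x8() (a flat list of 16 just scales every non-zero coefficient),
    // then idct8x8() finishes dequantisation and performs the inverse
    // transform. A DC-only block must come out flat.
    #[test]
    fn dc_only_8x8_block_is_flat() {
        let mut coeffs = [0i16; 64];
        coeffs[0] = 40;
        dequant8x8(&mut coeffs, &[16u8; 64]);
        assert_eq!(coeffs[0], 640);
        idct8x8(&mut coeffs, 20);
        let first = coeffs[0];
        assert!(coeffs.iter().all(|&v| v == first));
    }
}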
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
    let out = &mut dst[offset..][..stride * 3 + 4];
    for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
        for (dst, src) in line.iter_mut().take(4).zip(src.iter()) {
            *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
        }
    }
}

pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
    let out = &mut dst[offset..];
    for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
        for (dst, src) in line.iter_mut().take(8).zip(src.iter()) {
            *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
        }
    }
}
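#[cfg(test)]
mod add_coeffs_example {
    use super::*;

    // A minimal reconstruction sketch: the residual produced by idct() is
    // added on top of the prediction (here just a constant, made-up value)
    // and clamped to the 0..=255 sample range.
    #[test]
    fn residual_is_added_and_clamped() {
        let stride = 8;
        let mut frame = [200u8; 8 * 4];
        let mut residual = [0i16; 16];
        residual[0] = 100;  // saturates at 255
        residual[1] = -250; // saturates at 0
        residual[2] = 10;
        add_coeffs(&mut frame, 0, stride, &residual);
        assert_eq!(&frame[..4], &[255, 0, 210, 200]);
    }
}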
fn clip8(val: i16) -> u8 { val.max(0).min(255) as u8 }

fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = 128;
        }
    }
}
fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].copy_from_slice(&top[..bsize]);
    }
}
fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
    for (row, &left) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = left;
        }
    }
}
fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(top[i]); }
    for i in 0..bsize { adc += u16::from(left[i + 1]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = dc;
        }
    }
}
fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(left[i + 1]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = dc;
        }
    }
}
fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(top[i]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = dc;
        }
    }
}
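#[cfg(test)]
mod ipred_dc_example {
    use super::*;

    // A worked example for the generic DC predictor (the samples are made
    // up): with four top and four left neighbours the rounding shift is 3,
    // so the block is filled with (4 * 10 + 4 * 20 + 4) >> 3 = 15. Note that
    // left[0] holds the top-left sample and is not used by the DC predictor.
    #[test]
    fn dc_4x4_average() {
        let top = [10u8; 4];
        let left = [0u8, 20, 20, 20, 20];
        let mut blk = [0u8; 4 * 8];
        ipred_dc(&mut blk, 8, &top, &left, 4, 3);
        assert!(blk.chunks(8).take(4).all(|row| row[..4].iter().all(|&p| p == 15)));
    }
}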
fn load(dst: &mut [u16], src: &[u8]) {
    for (dst, &src) in dst.iter_mut().zip(src.iter()) {
        *dst = u16::from(src);
    }
}

fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_ver(buf, stride, top, 4);
}
fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_hor(buf, stride, left, 4);
}
fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut t: [u16; 9] = [0; 9];
    load(&mut t[..4], top);
    load(&mut t[4..8], tr);
    t[8] = t[7];

    for i in 0..4 {
        buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
    }
    let dst = &mut buf[stride..];
    for i in 0..4 {
        dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
    }
    let dst = &mut buf[stride * 2..];
    for i in 0..4 {
        dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
    }
    let dst = &mut buf[stride * 3..];
    for i in 0..4 {
        dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
    }
}
fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..j {
            dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
        }
        dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
        for i in (j+1)..4 {
            dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
        }
    }
}
fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..4 {
            let zvr = ((2 * i) as i8) - (j as i8);
            let pix;
            if zvr >= 0 {
                if (zvr & 1) == 0 {
                    pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
                } else {
                    pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
                }
            } else {
                if zvr == -1 {
                    pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                } else {
                    pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
                }
            }
            dst[i + j * stride] = pix as u8;
        }
    }
}
fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut t: [u16; 8] = [0; 8];
    load(&mut t[..4], top);
    load(&mut t[4..], tr);
    let dst = buf;

    dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
    let pix = ((t[1] + t[2] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((t[2] + t[3] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    let pix = ((t[3] + t[4] + 1) >> 1) as u8;
    dst[3 + 0 * stride] = pix;
    dst[2 + 2 * stride] = pix;
    dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
    dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
    let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
    dst[1 + 1 * stride] = pix;
    dst[0 + 3 * stride] = pix;
    let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
    dst[2 + 1 * stride] = pix;
    dst[1 + 3 * stride] = pix;
    let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[2 + 3 * stride] = pix;
    dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
}
fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..4 {
            let zhd = ((2 * j) as i8) - (i as i8);
            let pix;
            if zhd >= 0 {
                if (zhd & 1) == 0 {
                    pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
                } else {
                    pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
                }
            } else {
                if zhd == -1 {
                    pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                } else {
                    pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
                }
            }
            dst[i + j * stride] = pix as u8;
        }
    }
}
fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut l: [u16; 8] = [0; 8];
    load(&mut l, &left[1..]);
    let dst = buf;

    dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
    let pix = ((l[1] + l[2] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[0 + 1 * stride] = pix;
    let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
    dst[3 + 0 * stride] = pix;
    dst[1 + 1 * stride] = pix;
    let pix = ((l[2] + l[3] + 1) >> 1) as u8;
    dst[2 + 1 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    dst[3 + 2 * stride] = l[3] as u8;
    dst[1 + 3 * stride] = l[3] as u8;
    dst[0 + 3 * stride] = l[3] as u8;
    dst[2 + 2 * stride] = l[3] as u8;
    dst[2 + 3 * stride] = l[3] as u8;
    dst[3 + 3 * stride] = l[3] as u8;
}
fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_dc(buf, stride, top, left, 4, 3);
}
fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_left_dc(buf, stride, left, 4, 2);
}
fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
}
fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_dc128(buf, stride, 4);
}

pub struct IPred8Context {
    pub t: [u8; 16],
    pub l: [u8; 8],
    pub tl: u8,
}

impl IPred8Context {
    pub fn new() -> Self {
        Self {
            t: [128; 16],
            l: [128; 8],
            tl: 128,
        }
    }
    pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
        let mut t = [0x80u8; 19];
        let mut l = [0x80u8; 11];
        if has_t {
            t[1..8 + 1].copy_from_slice(&top[..8]);
        }
        if has_tr {
            t[8 + 1..16 + 1].copy_from_slice(&top[8..][..8]);
            t[16 + 1] = t[15 + 1];
            t[17 + 1] = t[15 + 1];
        } else {
            let (t0, t1) = t.split_at_mut(8 + 1);
            for el in t1.iter_mut() {
                *el = t0[7 + 1];
            }
        }
        if has_l {
            l[1..9].copy_from_slice(&left[1..9]);
            l[8 + 1] = l[7 + 1];
            l[9 + 1] = l[7 + 1];
        }
        if has_tl {
            t[0] = left[0];
            l[0] = left[0];
        } else {
            t[0] = t[1];
            l[0] = l[1];
        }

        for i in 0..16 {
            self.t[i] = ((u16::from(t[i]) + 2 * u16::from(t[i + 1]) + u16::from(t[i + 2]) + 2) >> 2) as u8;
        }
        for i in 0..8 {
            self.l[i] = ((u16::from(l[i]) + 2 * u16::from(l[i + 1]) + u16::from(l[i + 2]) + 2) >> 2) as u8;
        }
        self.tl = if has_t && has_l {
            ((u16::from(t[1]) + 2 * u16::from(t[0]) + u16::from(l[1]) + 2) >> 2) as u8
        } else if has_t {
            ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
        } else if has_l {
            ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
        } else {
            t[0]
        };
    }
}
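#[cfg(test)]
mod ipred8_ctx_example {
    use super::*;

    // A small sketch of preparing the filtered context for 8x8 luma intra
    // prediction (the neighbour values are made up): with a constant border
    // the low-pass filter in fill() keeps the samples intact, and the context
    // is then handed to one of the ipred_y_8x8_* functions.
    #[test]
    fn flat_context_stays_flat() {
        let top = [100u8; 16];
        let left = [100u8; 9];
        let mut ctx = IPred8Context::new();
        ctx.fill(&top, &left, true, true, true, true);
        assert!(ctx.t.iter().all(|&t| t == 100));
        assert!(ctx.l.iter().all(|&l| l == 100));
        assert_eq!(ctx.tl, 100);

        let mut blk = [0u8; 8 * 16];
        ipred_y_8x8_dc(&mut blk, 16, &ctx);
        assert_eq!(blk[0], 100);
    }
}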
fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for row in buf.chunks_mut(stride).take(8) {
        row[..8].copy_from_slice(&ctx.t[..8]);
    }
}
fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
        row[..8].copy_from_slice(&[l; 8]);
    }
}
fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            *pix = ((if (x != 7) || (y != 7) {
                    t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
                } else {
                    t[14] + 3 * t[15]
                } + 2) >> 2) as u8;
        }
    }
}
fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);
    let diag = t[1] + 2 * t[0] + l[1];

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            *pix = ((if x > y {
                    t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
                } else if x < y {
                    l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
                } else {
                    diag
                } + 2) >> 2) as u8;
        }
    }
}
fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zvr = 2 * (x as i8) - (y as i8);
            *pix = if zvr >= 0 {
                    let ix = x - (y >> 1);
                    if (zvr & 1) == 0 {
                        (t[ix] + t[ix + 1] + 1) >> 1
                    } else {
                        (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
                    }
                } else if zvr == -1 {
                    (l[1] + 2 * l[0] + t[1] + 2) >> 2
                } else {
                    let ix = y - 2 * x;
                    (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
                } as u8;
        }
    }
}
fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let ix = x + (y >> 1);
            *pix = if (y & 1) == 0 {
                    (t[ix] + t[ix + 1] + 1) >> 1
                } else {
                    (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
                } as u8;
        }
    }
}
fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zhd = 2 * (y as i8) - (x as i8);
            *pix = if zhd >= 0 {
                    let ix = y - (x >> 1);
                    if (zhd & 1) == 0 {
                        (l[ix] + l[ix + 1] + 1) >> 1
                    } else {
                        (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
                    }
                } else if zhd == -1 {
                    (l[1] + 2 * l[0] + t[1] + 2) >> 2
                } else {
                    let ix = x - 2 * y;
                    (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
                } as u8;
        }
    }
}
fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut l = [0u16; 8];
    load(&mut l, &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zhu = x + 2 * y;
            let ix = y + (x >> 1);
            *pix = if zhu > 13 {
                    l[7]
                } else if zhu == 13 {
                    (l[6] + 3 * l[7] + 2) >> 2
                } else if (zhu & 1) != 0 {
                    (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
                } else {
                    (l[ix] + l[ix + 1] + 1) >> 1
                } as u8;
        }
    }
}
fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut sum = 0u16;
    for &t in ctx.t[..8].iter() {
        sum += u16::from(t);
    }
    for &l in ctx.l[..8].iter() {
        sum += u16::from(l);
    }
    let dc = ((sum + 8) >> 4) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
            *pix = dc;
        }
    }
}
fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut sum = 0u16;
    for &l in ctx.l[..8].iter() {
        sum += u16::from(l);
    }
    let dc = ((sum + 4) >> 3) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
            *pix = dc;
        }
    }
}
fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut sum = 0u16;
    for &t in ctx.t[..8].iter() {
        sum += u16::from(t);
    }
    let dc = ((sum + 4) >> 3) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
            *pix = dc;
        }
    }
}
fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
    ipred_dc128(buf, stride, 8);
}

fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_ver(buf, stride, top, 8);
}
fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_hor(buf, stride, left, 8);
}
fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut l = [0; 8];
    load(&mut l, &left[1..]);
    let mut t = [0; 8];
    load(&mut t, top);

    let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
    let sum1 = t[4] + t[5] + t[6] + t[7];
    let dc1 = ((sum1 + 2) >> 2) as u8;
    let sum2 = l[4] + l[5] + l[6] + l[7];
    let dc2 = ((sum2 + 2) >> 2) as u8;
    let dc3 = ((sum1 + sum2 + 4) >> 3) as u8;

    for row in buf.chunks_mut(stride).take(4) {
        row[..4].copy_from_slice(&[dc0; 4]);
        row[4..8].copy_from_slice(&[dc1; 4]);
    }
    for row in buf.chunks_mut(stride).skip(4).take(4) {
        row[..4].copy_from_slice(&[dc2; 4]);
        row[4..8].copy_from_slice(&[dc3; 4]);
    }
}
fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    let mut left_dc0 = 0;
    let mut left_dc1 = 0;
    for &el in left[1..].iter().take(4) {
        left_dc0 += u16::from(el);
    }
    for &el in left[1..].iter().skip(4).take(4) {
        left_dc1 += u16::from(el);
    }
    let dc0 = ((left_dc0 + 2) >> 2) as u8;
    let dc2 = ((left_dc1 + 2) >> 2) as u8;
    for row in buf.chunks_mut(stride).take(4) {
        row[..8].copy_from_slice(&[dc0; 8]);
    }
    for row in buf.chunks_mut(stride).skip(4).take(4) {
        row[..8].copy_from_slice(&[dc2; 8]);
    }
}
fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
    ipred_top_dc(&mut buf[4..], stride, &top[4..], 4, 2);
    let mut top = [0; 8];
    top.copy_from_slice(&buf[stride * 3..][..8]);
    ipred_top_dc(&mut buf[4 * stride..], stride, &top, 4, 2);
    ipred_top_dc(&mut buf[4 + 4 * stride..], stride, &top[4..], 4, 2);
}
fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 8);
}
fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
    let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
    for i in 0..3 {
        let i1 = (i + 1) as i32;
        h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
        v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
    }
    let b = (17 * h + 16) >> 5;
    let c = (17 * v + 16) >> 5;
    let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
    for line in buf.chunks_mut(stride).take(8) {
        let mut acc = a;
        for el in line.iter_mut().take(8) {
            *el = clip8((acc >> 5) as i16);
            acc += b;
        }
        a += c;
    }
}

fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_ver(buf, stride, top, 16);
}
fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_hor(buf, stride, left, 16);
}
fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    ipred_dc(buf, stride, top, left, 16, 5);
}
fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_left_dc(buf, stride, left, 16, 4);
}
fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 16, 4);
}
fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 16);
}
fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
    let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
    for k in 0..7 {
        h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
        v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
    }

    h = (5 * h + 32) >> 6;
    v = (5 * v + 32) >> 6;

    let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);

    for row in buf.chunks_mut(stride).take(16) {
        let mut b = a;
        a += v;

        for dst in row.chunks_exact_mut(4).take(4) {
            dst[0] = clip8(((b      ) >> 5) as i16);
            dst[1] = clip8(((b +   h) >> 5) as i16);
            dst[2] = clip8(((b + 2*h) >> 5) as i16);
            dst[3] = clip8(((b + 3*h) >> 5) as i16);
            b += h * 4;
        }
    }
}

pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);

pub const IPRED4_DC128: usize = 11;
pub const IPRED4_DC_TOP: usize = 10;
pub const IPRED4_DC_LEFT: usize = 9;
pub const IPRED8_DC128: usize = 6;
pub const IPRED8_DC_TOP: usize = 5;
pub const IPRED8_DC_LEFT: usize = 4;

pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
    ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
    ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
    ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
    ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
];

pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
    ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
    ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
    ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
    ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
    ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
];

pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
    ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
    ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
];

pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
    ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
    ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
];
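#[cfg(test)]
mod ipred_dispatch_example {
    use super::*;

    // A minimal sketch of how a caller is expected to use the prediction
    // tables: pick an index (here the "no neighbours available" DC fallback)
    // and invoke the function pointer on a scratch block. The buffer sizes
    // and stride are arbitrary.
    #[test]
    fn dc128_via_table() {
        let mut blk = [0u8; 4 * 8];
        let top = [0u8; 8];
        let left = [0u8; 9];
        let tr = [0u8; 4];
        (IPRED_FUNCS4X4[IPRED4_DC128])(&mut blk, 8, &top, &left, &tr);
        for row in blk.chunks(8) {
            assert!(row[..4].iter().all(|&p| p == 128));
        }
    }
}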
macro_rules! loop_filter {
    (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        if a_p && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let p3 = i16::from($buf[$off - $step * 4]);
            $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
            $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
            $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
        } else {
            $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        }
        if a_q && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let q3 = i16::from($buf[$off + $step * 3]);
            $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
            $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
            $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
        } else {
            $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
        }
    };
    (chromaedge; $buf: expr, $off: expr, $step: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    };
    (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        let tc = $tc0 + (a_p as i16) + (a_q as i16);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        if a_p && ($tc0 > 0) {
            $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
        }
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
        if a_q && ($tc0 > 0) {
            $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
        }
    };
    (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let tc = $tc0 + 1;
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
    }
}

fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
    let p1 = i16::from(buf[off - step * 2]);
    let p0 = i16::from(buf[off - step]);
    let q0 = i16::from(buf[off]);
    let q1 = i16::from(buf[off + step]);
    (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
}
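#[cfg(test)]
mod check_filter_example {
    use super::*;

    // A small sketch of the edge activity check that gates the deblocking
    // filter (sample values are arbitrary): a step of 20 across the edge
    // passes an alpha of 40, but not an alpha of 10. check_filter4() below
    // batches the same test over four rows or columns of a block edge.
    #[test]
    fn edge_activity_thresholds() {
        //             p1  p0  q0  q1
        let buf = [0u8, 60, 60, 80, 80, 0];
        assert!(check_filter(&buf, 3, 1, 40, 4));  // |p0 - q0| = 20 < 40
        assert!(!check_filter(&buf, 3, 1, 10, 4)); // 20 >= alpha = 10
    }
}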
#[cfg(not(target_arch="x86_64"))]
fn check_filter4(buf: &[u8], mut off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let mut flags = [false; 4];
    for flag in flags.iter_mut() {
        let p1 = i16::from(buf[off - step * 2]);
        let p0 = i16::from(buf[off - step]);
        let q0 = i16::from(buf[off]);
        let q1 = i16::from(buf[off + step]);
        *flag = (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
        off += stride;
    }
    flags
}

#[cfg(target_arch="x86_64")]
fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    unsafe {
        let mut flags = [false; 4];
        let src = buf[off - step * 2..].as_ptr();
        let load_stride = step.max(stride);
        let fptr = flags.as_mut_ptr();
        let tflag = u32::from(step == 1);
        asm! {
            // load block
            "pxor xmm4, xmm4",
            "movd xmm0, dword ptr [{src}]",
            "lea {tmp}, [{src} + {stride} * 2]",
            "movd xmm1, dword ptr [{src} + {stride}]",
            "movd xmm2, dword ptr [{tmp}]",
            "movd xmm3, dword ptr [{tmp} + {stride}]",
            "punpcklbw xmm0, xmm4",
            "punpcklbw xmm1, xmm4",
            "punpcklbw xmm2, xmm4",
            "punpcklbw xmm3, xmm4",

            // transpose block if necessary so it's always processed by rows
            "test {tflag:e}, {tflag:e}",
            "jz 1f",
            "punpcklwd xmm0, xmm1",
            "movhlps xmm4, xmm0",
            "punpcklwd xmm2, xmm3",
            "movhlps xmm1, xmm2",
            "punpckldq xmm0, xmm2",
            "punpckldq xmm4, xmm1",
            "movhlps xmm1, xmm0",
            "movhlps xmm3, xmm4",
            "movaps xmm2, xmm4",
            "1:",

            // calculate deltas and flags
            "movd xmm4, {alpha:r}",
            "movd xmm5, {beta:r}",
            "psubw xmm0, xmm1",
            "psubw xmm1, xmm2",
            "psubw xmm3, xmm2",
            "pshuflw xmm4, xmm4, 0",
            "pshuflw xmm5, xmm5, 0",
            "pabsw xmm0, xmm0", // |p1 - p0|
            "pabsw xmm1, xmm1", // |p0 - q0|
            "pabsw xmm2, xmm3", // |q1 - q0|
            "movaps xmm3, xmm5",
            "pcmpgtw xmm4, xmm1",
            "pcmpgtw xmm5, xmm0",
            "pcmpgtw xmm3, xmm2",
            "pand xmm4, xmm5",
            "pand xmm4, xmm3",
            "packsswb xmm4, xmm4",
            "movd [{flags}], xmm4",
            tmp = out(reg) _,
            src = in(reg) src,
            stride = in(reg) load_stride,
            alpha = in(reg) alpha,
            beta = in(reg) beta,
            flags = in(reg) fptr,
            tflag = in(reg) tflag,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
            out("xmm4") _,
            out("xmm5") _,
        }
        flags
    }
}
pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
    let flags = check_filter4(dst, off, 1, stride, alpha, beta);
    for &flag in flags.iter() {
        if flag {
            loop_filter!(lumaedge; dst, off, 1, alpha, beta);
        }
        off += stride;
    }
}
pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
    let flags = check_filter4(dst, off, stride, 1, alpha, beta);
    for (x, &flag) in flags.iter().enumerate() {
        if flag {
            loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
        }
    }
}
pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    let flags = check_filter4(dst, off, 1, stride, alpha, beta);
    for &flag in flags.iter() {
        if flag {
            loop_filter!(lumanormal; dst, off, 1, tc0, beta);
        }
        off += stride;
    }
}
pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    let flags = check_filter4(dst, off, stride, 1, alpha, beta);
    for (x, &flag) in flags.iter().enumerate() {
        if flag {
            loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
        }
    }
}
pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
    for _ in 0..2 {
        if check_filter(dst, off, 1, alpha, beta) {
            loop_filter!(chromaedge; dst, off, 1);
        }
        off += stride;
    }
}
pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
    for x in 0..2 {
        if check_filter(dst, off + x, stride, alpha, beta) {
            loop_filter!(chromaedge; dst, off + x, stride);
        }
    }
}
pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    for _ in 0..2 {
        if check_filter(dst, off, 1, alpha, beta) {
            loop_filter!(chromanormal; dst, off, 1, tc0);
        }
        off += stride;
    }
}
pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    for x in 0..2 {
        if check_filter(dst, off + x, stride, alpha, beta) {
            loop_filter!(chromanormal; dst, off + x, stride, tc0);
        }
    }
}
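
#[cfg(test)]
mod loop_filter_example {
    use super::*;

    // A usage sketch for the strong ("edge") luma filter on a vertical block
    // edge. The buffer holds a hard step between two flat areas and the
    // thresholds are arbitrary values that let the filter run; since
    // |p0 - q0| = 20 is not below (alpha >> 2) + 2, the weaker 3-tap branch
    // of the lumaedge macro is taken and the step is reduced to 65/75.
    #[test]
    fn vertical_edge_gets_smoothed() {
        let stride = 16;
        let mut buf = [0u8; 16 * 8];
        for row in buf.chunks_mut(stride) {
            for (x, el) in row.iter_mut().enumerate() {
                *el = if x < 8 { 60 } else { 80 };
            }
        }
        let off = 8; // the edge column, starting at the first row
        loop_filter_lumaedge_v(&mut buf, off, stride, 40, 4);
        assert_eq!((buf[off - 1], buf[off]), (65, 75));
    }
}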