// nihav-itu/src/codecs/h264/dsp/mod.rs
#[allow(unexpected_cfgs)]
mod mc;
pub use mc::{H264MC, McBlock};
#[cfg(target_arch="x86_64")]
use std::arch::asm;

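// Mapping from luma QP (0..51) to the chroma QP used for dequantization;
// chroma QP follows luma QP up to 29 and then grows more slowly.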
pub const CHROMA_QUANTS: [u8; 52] = [
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
    31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
    39, 39, 39, 39
];

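// Coefficient scan orders: CHROMA_DC_SCAN for the 2x2 chroma DC block,
// ZIGZAG for full 4x4 blocks, ZIGZAG1 for 15-element AC-only blocks
// (the 4x4 zigzag with the DC position dropped and indices shifted down by one),
// and ZIGZAG8X8 for 8x8 transform blocks.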
pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
pub const ZIGZAG: [usize; 16] = [
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
];
pub const ZIGZAG1: [usize; 15] = [
    0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
];
/*pub const IL_SCAN: [usize; 16] = [
    0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
];*/
pub const ZIGZAG8X8: [usize; 64] = [
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
];

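// Dequantization multipliers for the 4x4 transform, indexed by coefficient class
// (see BLK_INDEX in idct()) and by qp % 6.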
const LEVEL_SCALE: [[i16; 6]; 3] = [
    [ 10, 11, 13, 14, 16, 18 ],
    [ 16, 18, 20, 23, 25, 29 ],
    [ 13, 14, 16, 18, 20, 23 ]
];

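// 2x2 Hadamard transform of the chroma DC coefficients followed by scaling
// with the QP-derived multiplier.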
pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
    let t0 = blk[0] + blk[2];
    let t1 = blk[0] - blk[2];
    let t2 = blk[1] + blk[3];
    let t3 = blk[1] - blk[3];
    blk[0] = t0 + t2;
    blk[1] = t0 - t2;
    blk[2] = t1 + t3;
    blk[3] = t1 - t3;
    if qp < 6 {
        let mul = LEVEL_SCALE[0][qp as usize];
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) >> 1;
        }
    } else {
        let mul = LEVEL_SCALE[0][(qp % 6) as usize];
        let shift = qp / 6 - 1;
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) << shift;
        }
    }
}

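// Shared butterfly kernels: the `luma_dc` arm is a 4-point Hadamard stage,
// the 5-argument arm is one stage of the 4x4 inverse transform with rounding,
// and the 8-argument arm is one stage of the 8x8 inverse transform.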
macro_rules! transform {
    (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = $b.wrapping_add($d);
        let t3 = $b.wrapping_sub($d);
        $a = t0.wrapping_add(t2);
        $b = t1.wrapping_add(t3);
        $c = t1.wrapping_sub(t3);
        $d = t0.wrapping_sub(t2);
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = ($b >> 1).wrapping_sub($d);
        let t3 = $b.wrapping_add($d >> 1);
        let bias = 1 << $shift >> 1;
        $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
        $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
        $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
        $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
        let e0 = $a + $e;
        let e1 = -$d + $f - $h - ($h >> 1);
        let e2 = $a - $e;
        let e3 = $b + $h - $d - ($d >> 1);
        let e4 = ($c >> 1) - $g;
        let e5 = -$b + $h + $f + ($f >> 1);
        let e6 = $c + ($g >> 1);
        let e7 = $d + $f + $b + ($b >> 1);

        let f0 = e0 + e6;
        let f1 = e1 + (e7 >> 2);
        let f2 = e2 + e4;
        let f3 = e3 + (e5 >> 2);
        let f4 = e2 - e4;
        let f5 = (e3 >> 2) - e5;
        let f6 = e0 - e6;
        let f7 = e7 - (e1 >> 2);

        $a = f0 + f7;
        $b = f2 + f5;
        $c = f4 + f3;
        $d = f6 + f1;
        $e = f6 - f1;
        $f = f4 - f3;
        $g = f2 - f5;
        $h = f0 - f7;
    };
}

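// Inverse 4x4 Hadamard transform plus scaling for the luma DC block of
// intra 16x16 macroblocks.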
pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
    if qp < 12 {
        let mul = LEVEL_SCALE[0][(qp % 6) as usize];
        let shift = 2 - qp / 6;
        let bias = 1 << shift >> 1;
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
        }
    } else {
        let mul = LEVEL_SCALE[0][(qp % 6) as usize];
        let shift = qp / 6 - 2;
        for el in blk.iter_mut() {
            *el = el.wrapping_mul(mul) << shift;
        }
    }
    for i in 0..4 {
        transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
    }
    for row in blk.chunks_exact_mut(4) {
        transform!(luma_dc; row[0], row[1], row[2], row[3]);
    }
}

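// Dequantization and inverse 4x4 transform. idct_skip_dc() leaves the DC
// coefficient untouched (it has already been reconstructed separately),
// while idct() processes all 16 coefficients.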
pub fn idct_skip_dc(blk: &mut [i16; 16], qp: u8) {
    const BLK_INDEX: [usize; 16] = [
        0, 2, 0, 2,
        2, 1, 2, 1,
        0, 2, 0, 2,
        2, 1, 2, 1
    ];
    let qidx = (qp % 6) as usize;
    let shift = qp / 6;
    for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(1) {
        *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
    }
    for row in blk.chunks_exact_mut(4) {
        transform!(row[0], row[1], row[2], row[3], 0);
    }
    for i in 0..4 {
        transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
    }
}

pub fn idct(blk: &mut [i16; 16], qp: u8) {
    const BLK_INDEX: [usize; 16] = [
        0, 2, 0, 2,
        2, 1, 2, 1,
        0, 2, 0, 2,
        2, 1, 2, 1
    ];
    let qidx = (qp % 6) as usize;
    let shift = qp / 6;
    for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()) {
        *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
    }
    for row in blk.chunks_exact_mut(4) {
        transform!(row[0], row[1], row[2], row[3], 0);
    }
    for i in 0..4 {
        transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
    }
}

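// DC-only block: optionally dequantize the single DC coefficient and replicate
// the rounded result over the whole 4x4 block.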
pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
    let dc = if quant_dc {
            (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
        } else {
            blk[0]
        };
    *blk = [(dc + 0x20) >> 6; 16];
}

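// Dequantization multipliers for the 8x8 transform, one 4x4 pattern per qp % 6
// (the pattern repeats over the 8x8 block), plus dequant8x8() for applying an
// explicit scaling list in zigzag order.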
const QMAT_8X8: [[u8; 16]; 6] = [
  [
    20, 19, 25, 19,
    19, 18, 24, 18,
    25, 24, 32, 24,
    19, 18, 24, 18
  ], [
    22, 21, 28, 21,
    21, 19, 26, 19,
    28, 26, 35, 26,
    21, 19, 26, 19
  ], [
    26, 24, 33, 24,
    24, 23, 31, 23,
    33, 31, 42, 31,
    24, 23, 31, 23
  ], [
    28, 26, 35, 26,
    26, 25, 33, 25,
    35, 33, 45, 33,
    26, 25, 33, 25
  ], [
    32, 30, 40, 30,
    30, 28, 38, 28,
    40, 38, 51, 38,
    30, 28, 38, 28
  ], [
    36, 34, 46, 34,
    34, 32, 43, 32,
    46, 43, 58, 43,
    34, 32, 43, 32
  ]
];

pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
    for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
        if *el != 0 {
            *el = el.wrapping_mul(i16::from(slist[scan]));
        }
    }
}

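// Dequantization and inverse 8x8 transform, computed in 32-bit intermediates.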
pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
    let mut tmp = [0i32; 64];
    let qmat = &QMAT_8X8[(qp % 6) as usize];
    if qp >= 36 {
        let shift = qp / 6 - 6;
        for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
            let x = i & 7;
            let y = i >> 3;
            let idx = (x & 3) + (y & 3) * 4;
            *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
        }
    } else {
        let shift = 6 - qp / 6;
        let bias = (1 << shift) >> 1;
        for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
            let x = i & 7;
            let y = i >> 3;
            let idx = (x & 3) + (y & 3) * 4;
            *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
        }
    }
    for row in tmp.chunks_exact_mut(8) {
        transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
    }
    for col in 0..8 {
        transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
                   tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
    }
    for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
        *dst = ((src + 0x20) >> 6) as i16;
    }
}

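// Add decoded residuals to the prediction in the frame buffer, clamping to 0..255
// (4x4 and 8x8 variants).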
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
    let out = &mut dst[offset..][..stride * 3 + 4];
    for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
        for (dst, src) in line.iter_mut().take(4).zip(src.iter()) {
            *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
        }
    }
}

pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
    let out = &mut dst[offset..];
    for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
        for (dst, src) in line.iter_mut().take(8).zip(src.iter()) {
            *dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
        }
    }
}

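// Pixel clamping and generic intra prediction helpers shared by the 4x4/8x8/16x16
// modes below. `left` slices carry the top-left sample at index 0, so the actual
// left neighbours start at index 1.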
fn clip8(val: i16) -> u8 { val.max(0).min(255) as u8 }

fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = 128;
        }
    }
}
fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].copy_from_slice(&top[..bsize]);
    }
}
fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
    for (row, &left) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = left;
        }
    }
}
fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(top[i]); }
    for i in 0..bsize { adc += u16::from(left[i + 1]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = dc;
        }
    }
}
fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(left[i + 1]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = dc;
        }
    }
}
fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
    let mut adc: u16 = 0;
    for i in 0..bsize { adc += u16::from(top[i]); }
    let dc = ((adc + (1 << (shift - 1))) >> shift) as u8;

    for row in buf.chunks_mut(stride).take(bsize) {
        for el in row[..bsize].iter_mut() {
            *el = dc;
        }
    }
}

fn load(dst: &mut [u16], src: &[u8]) {
    for (dst, &src) in dst.iter_mut().zip(src.iter()) {
        *dst = u16::from(src);
    }
}

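// The individual 4x4 intra prediction modes; `tr` holds the top-right neighbour
// samples used by the diagonal-down-left and vertical-left modes.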
fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_ver(buf, stride, top, 4);
}
fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_hor(buf, stride, left, 4);
}
fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut t: [u16; 9] = [0; 9];
    load(&mut t[..4], top);
    load(&mut t[4..8], tr);
    t[8] = t[7];

    for i in 0..4 {
        buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
    }
    let dst = &mut buf[stride..];
    for i in 0..4 {
        dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
    }
    let dst = &mut buf[stride * 2..];
    for i in 0..4 {
        dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
    }
    let dst = &mut buf[stride * 3..];
    for i in 0..4 {
        dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
    }
}
fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..j {
            dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
        }
        dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
        for i in (j+1)..4 {
            dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
        }
    }
}
fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..4 {
            let zvr = ((2 * i) as i8) - (j as i8);
            let pix;
            if zvr >= 0 {
                if (zvr & 1) == 0 {
                    pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
                } else {
                    pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
                }
            } else {
                if zvr == -1 {
                    pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                } else {
                    pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
                }
            }
            dst[i + j * stride] = pix as u8;
        }
    }
}
fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    let mut t: [u16; 8] = [0; 8];
    load(&mut t[..4], top);
    load(&mut t[4..], tr);
    let dst = buf;

    dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
    let pix = ((t[1] + t[2] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((t[2] + t[3] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    let pix = ((t[3] + t[4] + 1) >> 1) as u8;
    dst[3 + 0 * stride] = pix;
    dst[2 + 2 * stride] = pix;
    dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
    dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
    let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
    dst[1 + 1 * stride] = pix;
    dst[0 + 3 * stride] = pix;
    let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
    dst[2 + 1 * stride] = pix;
    dst[1 + 3 * stride] = pix;
    let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[2 + 3 * stride] = pix;
    dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
}
fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..4 {
            let zhd = ((2 * j) as i8) - (i as i8);
            let pix;
            if zhd >= 0 {
                if (zhd & 1) == 0 {
                    pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
                } else {
                    pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
                }
            } else {
                if zhd == -1 {
                    pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                } else {
                    pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
                }
            }
            dst[i + j * stride] = pix as u8;
        }
    }
}
fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    let mut l: [u16; 8] = [0; 8];
    load(&mut l, &left[1..]);
    let dst = buf;

    dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
    let pix = ((l[1] + l[2] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[0 + 1 * stride] = pix;
    let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
    dst[3 + 0 * stride] = pix;
    dst[1 + 1 * stride] = pix;
    let pix = ((l[2] + l[3] + 1) >> 1) as u8;
    dst[2 + 1 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    dst[3 + 2 * stride] = l[3] as u8;
    dst[1 + 3 * stride] = l[3] as u8;
    dst[0 + 3 * stride] = l[3] as u8;
    dst[2 + 2 * stride] = l[3] as u8;
    dst[2 + 3 * stride] = l[3] as u8;
    dst[3 + 3 * stride] = l[3] as u8;
}
fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_dc(buf, stride, top, left, 4, 3);
}
fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_left_dc(buf, stride, left, 4, 2);
}
fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
}
fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_dc128(buf, stride, 4);
}

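// Reference sample context for 8x8 luma intra prediction: fill() gathers the
// top, top-right, left and top-left neighbours (substituting 0x80 or edge samples
// when a neighbour is unavailable) and applies the [1 2 1] smoothing filter
// required by the 8x8 prediction modes.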
pub struct IPred8Context {
    pub t: [u8; 16],
    pub l: [u8; 8],
    pub tl: u8,
}

impl IPred8Context {
    pub fn new() -> Self {
        Self {
            t: [128; 16],
            l: [128; 8],
            tl: 128,
        }
    }
    pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
        let mut t = [0x80u8; 19];
        let mut l = [0x80u8; 11];
        if has_t {
            t[1..8 + 1].copy_from_slice(&top[..8]);
        }
        if has_tr {
            t[8 + 1..16 + 1].copy_from_slice(&top[8..][..8]);
            t[16 + 1] = t[15 + 1];
            t[17 + 1] = t[15 + 1];
        } else {
            let (t0, t1) = t.split_at_mut(8 + 1);
            for el in t1.iter_mut() {
                *el = t0[7 + 1];
            }
        }
        if has_l {
            l[1..9].copy_from_slice(&left[1..9]);
            l[8 + 1] = l[7 + 1];
            l[9 + 1] = l[7 + 1];
        }
        if has_tl {
            t[0] = left[0];
            l[0] = left[0];
        } else {
            t[0] = t[1];
            l[0] = l[1];
        }

        for i in 0..16 {
            self.t[i] = ((u16::from(t[i]) + 2 * u16::from(t[i + 1]) + u16::from(t[i + 2]) + 2) >> 2) as u8;
        }
        for i in 0..8 {
            self.l[i] = ((u16::from(l[i]) + 2 * u16::from(l[i + 1]) + u16::from(l[i + 2]) + 2) >> 2) as u8;
        }
        self.tl = if has_t && has_l {
                ((u16::from(t[1]) + 2 * u16::from(t[0]) + u16::from(l[1]) + 2) >> 2) as u8
            } else if has_t {
                ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
            } else if has_l {
                ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
            } else {
                t[0]
            };
    }
}

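// 8x8 luma intra prediction modes operating on the filtered IPred8Context samples.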
fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for row in buf.chunks_mut(stride).take(8) {
        row[..8].copy_from_slice(&ctx.t[..8]);
    }
}
fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
        row[..8].copy_from_slice(&[l; 8]);
    }
}
fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            *pix = ((if (x != 7) || (y != 7) {
                    t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
                } else {
                    t[14] + 3 * t[15]
                } + 2) >> 2) as u8;
        }
    }
}
fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);
    let diag = t[1] + 2 * t[0] + l[1];

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            *pix = ((if x > y {
                    t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
                } else if x < y {
                    l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
                } else {
                    diag
                } + 2) >> 2) as u8;
        }
    }
}
fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zvr = 2 * (x as i8) - (y as i8);
            *pix = if zvr >= 0 {
                    let ix = x - (y >> 1);
                    if (zvr & 1) == 0 {
                        (t[ix] + t[ix + 1] + 1) >> 1
                    } else {
                        (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
                    }
                } else if zvr == -1 {
                    (l[1] + 2 * l[0] + t[1] + 2) >> 2
                } else {
                    let ix = y - 2 * x;
                    (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
                } as u8;
        }
    }
}
fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let ix = x + (y >> 1);
            *pix = if (y & 1) == 0 {
                    (t[ix] + t[ix + 1] + 1) >> 1
                } else {
                    (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
                } as u8;
        }
    }
}
fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zhd = 2 * (y as i8) - (x as i8);
            *pix = if zhd >= 0 {
                    let ix = y - (x >> 1);
                    if (zhd & 1) == 0 {
                        (l[ix] + l[ix + 1] + 1) >> 1
                    } else {
                        (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
                    }
                } else if zhd == -1 {
                    (l[1] + 2 * l[0] + t[1] + 2) >> 2
                } else {
                    let ix = x - 2 * y;
                    (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
                } as u8;
        }
    }
}
fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut l = [0u16; 8];
    load(&mut l, &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zhu = x + 2 * y;
            let ix = y + (x >> 1);
            *pix = if zhu > 13 {
                    l[7]
                } else if zhu == 13 {
                    (l[6] + 3 * l[7] + 2) >> 2
                } else if (zhu & 1) != 0 {
                    (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
                } else {
                    (l[ix] + l[ix + 1] + 1) >> 1
                } as u8;
        }
    }
}
fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut sum = 0u16;
    for &t in ctx.t[..8].iter() {
        sum += u16::from(t);
    }
    for &l in ctx.l[..8].iter() {
        sum += u16::from(l);
    }
    let dc = ((sum + 8) >> 4) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
            *pix = dc;
        }
    }
}
fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut sum = 0u16;
    for &l in ctx.l[..8].iter() {
        sum += u16::from(l);
    }
    let dc = ((sum + 4) >> 3) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
            *pix = dc;
        }
    }
}
fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut sum = 0u16;
    for &t in ctx.t[..8].iter() {
        sum += u16::from(t);
    }
    let dc = ((sum + 4) >> 3) as u8;
    for row in buf.chunks_mut(stride).take(8) {
        for pix in row.iter_mut().take(8) {
            *pix = dc;
        }
    }
}
fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
    ipred_dc128(buf, stride, 8);
}

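// 8x8 chroma intra prediction modes. The DC mode computes a separate average
// for each 4x4 quadrant from the available top and left neighbours.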
fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_ver(buf, stride, top, 8);
}
fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_hor(buf, stride, left, 8);
}
fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut l = [0; 8];
    load(&mut l, &left[1..]);
    let mut t = [0; 8];
    load(&mut t, top);

    let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
    let sum1 = t[4] + t[5] + t[6] + t[7];
    let dc1 = ((sum1 + 2) >> 2) as u8;
    let sum2 = l[4] + l[5] + l[6] + l[7];
    let dc2 = ((sum2 + 2) >> 2) as u8;
    let dc3 = ((sum1 + sum2 + 4) >> 3) as u8;

    for row in buf.chunks_mut(stride).take(4) {
        row[..4].copy_from_slice(&[dc0; 4]);
        row[4..8].copy_from_slice(&[dc1; 4]);
    }
    for row in buf.chunks_mut(stride).skip(4).take(4) {
        row[..4].copy_from_slice(&[dc2; 4]);
        row[4..8].copy_from_slice(&[dc3; 4]);
    }
}
fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    let mut left_dc0 = 0;
    let mut left_dc1 = 0;
    for &el in left[1..].iter().take(4) {
        left_dc0 += u16::from(el);
    }
    for &el in left[1..].iter().skip(4).take(4) {
        left_dc1 += u16::from(el);
    }
    let dc0 = ((left_dc0 + 2) >> 2) as u8;
    let dc2 = ((left_dc1 + 2) >> 2) as u8;
    for row in buf.chunks_mut(stride).take(4) {
        row[..8].copy_from_slice(&[dc0; 8]);
    }
    for row in buf.chunks_mut(stride).skip(4).take(4) {
        row[..8].copy_from_slice(&[dc2; 8]);
    }
}
fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
    ipred_top_dc(&mut buf[4..], stride, &top[4..], 4, 2);
    let mut top = [0; 8];
    top.copy_from_slice(&buf[stride * 3..][..8]);
    ipred_top_dc(&mut buf[4 * stride..], stride, &top, 4, 2);
    ipred_top_dc(&mut buf[4 + 4 * stride..], stride, &top[4..], 4, 2);
}
fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 8);
}
fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
    let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
    for i in 0..3 {
        let i1 = (i + 1) as i32;
        h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
        v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
    }
    let b = (17 * h + 16) >> 5;
    let c = (17 * v + 16) >> 5;
    let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
    for line in buf.chunks_mut(stride).take(8) {
        let mut acc = a;
        for el in line.iter_mut().take(8) {
            *el = clip8((acc >> 5) as i16);
            acc += b;
        }
        a += c;
    }
}

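// 16x16 luma intra prediction modes (vertical, horizontal, DC variants and plane).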
fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_ver(buf, stride, top, 16);
}
fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_hor(buf, stride, left, 16);
}
fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    ipred_dc(buf, stride, top, left, 16, 5);
}
fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_left_dc(buf, stride, left, 16, 4);
}
fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 16, 4);
}
fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 16);
}
fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
    let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
    for k in 0..7 {
        h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
        v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
    }

    h = (5 * h + 32) >> 6;
    v = (5 * v + 32) >> 6;

    let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);

    for row in buf.chunks_mut(stride).take(16) {
        let mut b = a;
        a += v;

        for dst in row.chunks_exact_mut(4).take(4) {
            dst[0] = clip8(((b      ) >> 5) as i16);
            dst[1] = clip8(((b +   h) >> 5) as i16);
            dst[2] = clip8(((b + 2*h) >> 5) as i16);
            dst[3] = clip8(((b + 3*h) >> 5) as i16);
            b += h * 4;
        }
    }
}

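// Function tables indexed by intra prediction mode; the *_DC_LEFT/*_DC_TOP/*_DC128
// indices select the fallback DC modes used when some neighbours are unavailable.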
pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);

pub const IPRED4_DC128: usize = 11;
pub const IPRED4_DC_TOP: usize = 10;
pub const IPRED4_DC_LEFT: usize = 9;
pub const IPRED8_DC128: usize = 6;
pub const IPRED8_DC_TOP: usize = 5;
pub const IPRED8_DC_LEFT: usize = 4;

pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
    ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
    ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
    ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
    ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
];

pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
    ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
    ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
    ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
    ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
    ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
];

pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
    ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
    ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
];

pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
    ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
    ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
];

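// Deblocking filter kernels: `lumaedge`/`chromaedge` implement the strong filter
// applied at edges with the highest boundary strength, `lumanormal`/`chromanormal`
// the tc0-clipped normal filter.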
macro_rules! loop_filter {
    (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        if a_p && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let p3 = i16::from($buf[$off - $step * 4]);
            $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
            $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
            $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
        } else {
            $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        }
        if a_q && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            let q3 = i16::from($buf[$off + $step * 3]);
            $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
            $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
            $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
        } else {
            $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
        }
    };
    (chromaedge; $buf: expr, $off: expr, $step: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    };
    (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        let tc = $tc0 + (a_p as i16) + (a_q as i16);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        if a_p && ($tc0 > 0) {
            $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
        }
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
        if a_q && ($tc0 > 0) {
            $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
        }
    };
    (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let tc = $tc0 + 1;
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
    }
}

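// Edge activity checks: a position is filtered only when |p0 - q0| < alpha and
// both |p1 - p0| and |q1 - q0| are below beta.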
fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
    let p1 = i16::from(buf[off - step * 2]);
    let p0 = i16::from(buf[off - step]);
    let q0 = i16::from(buf[off]);
    let q1 = i16::from(buf[off + step]);
    (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
}

#[cfg(not(target_arch="x86_64"))]
fn check_filter4(buf: &[u8], mut off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let mut flags = [false; 4];
    for flag in flags.iter_mut() {
        let p1 = i16::from(buf[off - step * 2]);
        let p0 = i16::from(buf[off - step]);
        let q0 = i16::from(buf[off]);
        let q1 = i16::from(buf[off + step]);
        *flag = (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
        off += stride;
    }
    flags
}

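// SSE version of check_filter4(): loads p1/p0/q0/q1 for four positions along the
// edge (transposing first for vertical edges so the data is always processed by
// rows), compares the absolute differences against alpha and beta with packed
// 16-bit operations and stores all four flags at once.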
#[cfg(target_arch="x86_64")]
fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    unsafe {
        let mut flags = [false; 4];
        let src = buf[off - step * 2..].as_ptr();
        let load_stride = step.max(stride);
        let fptr = flags.as_mut_ptr();
        let tflag = u32::from(step == 1);
        asm! {
            // load block
            "pxor xmm4, xmm4",
            "movd xmm0, dword ptr [{src}]",
            "lea {tmp}, [{src} + {stride} * 2]",
            "movd xmm1, dword ptr [{src} + {stride}]",
            "movd xmm2, dword ptr [{tmp}]",
            "movd xmm3, dword ptr [{tmp} + {stride}]",
            "punpcklbw xmm0, xmm4",
            "punpcklbw xmm1, xmm4",
            "punpcklbw xmm2, xmm4",
            "punpcklbw xmm3, xmm4",

            // transpose block if necessary so it's always processed by rows
            "test {tflag:e}, {tflag:e}",
            "jz 2f",
            "punpcklwd xmm0, xmm1",
            "movhlps xmm4, xmm0",
            "punpcklwd xmm2, xmm3",
            "movhlps xmm1, xmm2",
            "punpckldq xmm0, xmm2",
            "punpckldq xmm4, xmm1",
            "movhlps xmm1, xmm0",
            "movhlps xmm3, xmm4",
            "movaps xmm2, xmm4",
            "2:",

            // calculate deltas and flags
            "movd xmm4, {alpha:r}",
            "movd xmm5, {beta:r}",
            "psubw xmm0, xmm1",
            "psubw xmm1, xmm2",
            "psubw xmm3, xmm2",
            "pshuflw xmm4, xmm4, 0",
            "pshuflw xmm5, xmm5, 0",
            "pabsw xmm0, xmm0", // |p1 - p0|
            "pabsw xmm1, xmm1", // |p0 - q0|
            "pabsw xmm2, xmm3", // |q1 - q0|
            "movaps xmm3, xmm5",
            "pcmpgtw xmm4, xmm1",
            "pcmpgtw xmm5, xmm0",
            "pcmpgtw xmm3, xmm2",
            "pand xmm4, xmm5",
            "pand xmm4, xmm3",
            "packsswb xmm4, xmm4",
            "movd [{flags}], xmm4",
            tmp = out(reg) _,
            src = in(reg) src,
            stride = in(reg) load_stride,
            alpha = in(reg) alpha,
            beta = in(reg) beta,
            flags = in(reg) fptr,
            tflag = in(reg) tflag,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
            out("xmm4") _,
            out("xmm5") _,
        }
        flags
    }
}

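// Public deblocking entry points: the _v functions walk four rows down a vertical
// edge (two for chroma), the _h functions walk the columns along a horizontal edge,
// filtering only the positions that pass the alpha/beta checks.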
pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
    let flags = check_filter4(dst, off, 1, stride, alpha, beta);
    for &flag in flags.iter() {
        if flag {
            loop_filter!(lumaedge; dst, off, 1, alpha, beta);
        }
        off += stride;
    }
}
pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
    let flags = check_filter4(dst, off, stride, 1, alpha, beta);
    for (x, &flag) in flags.iter().enumerate() {
        if flag {
            loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
        }
    }
}
pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    let flags = check_filter4(dst, off, 1, stride, alpha, beta);
    for &flag in flags.iter() {
        if flag {
            loop_filter!(lumanormal; dst, off, 1, tc0, beta);
        }
        off += stride;
    }
}
pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    let flags = check_filter4(dst, off, stride, 1, alpha, beta);
    for (x, &flag) in flags.iter().enumerate() {
        if flag {
            loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
        }
    }
}
pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
    for _ in 0..2 {
        if check_filter(dst, off, 1, alpha, beta) {
            loop_filter!(chromaedge; dst, off, 1);
        }
        off += stride;
    }
}
pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
    for x in 0..2 {
        if check_filter(dst, off + x, stride, alpha, beta) {
            loop_filter!(chromaedge; dst, off + x, stride);
        }
    }
}
pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    for _ in 0..2 {
        if check_filter(dst, off, 1, alpha, beta) {
            loop_filter!(chromanormal; dst, off, 1, tc0);
        }
        off += stride;
    }
}
pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
    for x in 0..2 {
        if check_filter(dst, off + x, stride, alpha, beta) {
            loop_filter!(chromanormal; dst, off + x, stride, tc0);
        }
    }
}