h264: optimise check_filter() for AMD64
[nihav.git] / nihav-itu / src / codecs / h264 / dsp / mod.rs
CommitLineData
2f9923e6
KS
1mod mc;
2pub use mc::H264MC;
932ae27b
KS
3#[cfg(target_arch="x86_64")]
4use std::arch::asm;
999fbb83 5
696e4e20
KS
/// Mapping from luma quantiser (0..51) to the chroma quantiser actually used;
/// above QP 29 the chroma value grows more slowly and saturates at 39.
pub const CHROMA_QUANTS: [u8; 52] = [
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30,
    31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38,
    39, 39, 39, 39
];

/// Scan order for the 2x2 chroma DC block (plain raster order).
pub const CHROMA_DC_SCAN: [usize; 4] = [ 0, 1, 2, 3];
/// Zigzag scan order for a full 4x4 coefficient block.
pub const ZIGZAG: [usize; 16] = [
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
];
/// 15-entry zigzag scan — presumably for 4x4 blocks whose DC coefficient is
/// coded separately (AC-only blocks); TODO confirm against callers.
pub const ZIGZAG1: [usize; 15] = [
    0, 3, 7, 4, 1, 2, 5, 8, 11, 12, 9, 6, 10, 13, 14
];
/*pub const IL_SCAN: [usize; 16] = [
    0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
];*/
/// Zigzag scan order for an 8x8 coefficient block.
pub const ZIGZAG8X8: [usize; 64] = [
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
];

/// Dequantisation multipliers indexed by [coefficient class][qp % 6].
/// The three classes correspond to the three distinct positions of the 4x4
/// quantisation pattern (see `BLK_INDEX` in `idct`).
const LEVEL_SCALE: [[i16; 6]; 3] = [
    [ 10, 11, 13, 14, 16, 18 ],
    [ 16, 18, 20, 23, 25, 29 ],
    [ 13, 14, 16, 18, 20, 23 ]
];
39
40pub fn chroma_dc_transform(blk: &mut [i16; 4], qp: u8) {
41 let t0 = blk[0] + blk[2];
42 let t1 = blk[0] - blk[2];
43 let t2 = blk[1] + blk[3];
44 let t3 = blk[1] - blk[3];
45 blk[0] = t0 + t2;
46 blk[1] = t0 - t2;
47 blk[2] = t1 + t3;
48 blk[3] = t1 - t3;
49 if qp < 6 {
4a1ca15c 50 let mul = LEVEL_SCALE[0][qp as usize];
696e4e20
KS
51 for el in blk.iter_mut() {
52 *el = el.wrapping_mul(mul) >> 1;
53 }
54 } else {
4a1ca15c 55 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
56 let shift = qp / 6 - 1;
57 for el in blk.iter_mut() {
58 *el = el.wrapping_mul(mul) << shift;
59 }
60 }
61}
62
/// One-dimensional inverse transform passes shared by the decoding paths.
///
/// * `luma_dc; a, b, c, d` — 4-point Hadamard pass for the luma DC block:
///   pure butterflies, no scaling or rounding.
/// * `a, b, c, d, shift` — 4-point H.264 inverse core transform pass with
///   round-to-nearest controlled by `shift`.
/// * `a .. h` — 8-point inverse transform pass for the 8x8 path (final
///   rounding is applied by the caller).
macro_rules! transform {
    (luma_dc; $a: expr, $b: expr, $c: expr, $d: expr) => ({
        // butterfly stage
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        let t2 = $b.wrapping_add($d);
        let t3 = $b.wrapping_sub($d);
        // recombination stage
        $a = t0.wrapping_add(t2);
        $b = t1.wrapping_add(t3);
        $c = t1.wrapping_sub(t3);
        $d = t0.wrapping_sub(t2);
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $shift: expr) => ({
        let t0 = $a.wrapping_add($c);
        let t1 = $a.wrapping_sub($c);
        // odd inputs use the half-weight taps of the core transform
        let t2 = ($b >> 1).wrapping_sub($d);
        let t3 = $b.wrapping_add($d >> 1);
        // round-to-nearest bias (0 when $shift == 0)
        let bias = 1 << $shift >> 1;
        $a = t0.wrapping_add(t3).wrapping_add(bias) >> $shift;
        $b = t1.wrapping_add(t2).wrapping_add(bias) >> $shift;
        $c = t1.wrapping_sub(t2).wrapping_add(bias) >> $shift;
        $d = t0.wrapping_sub(t3).wrapping_add(bias) >> $shift;
    });
    ($a: expr, $b: expr, $c: expr, $d: expr, $e: expr, $f: expr, $g: expr, $h: expr) => {
        // even/odd decomposition into e0..e7
        let e0 = $a + $e;
        let e1 = -$d + $f - $h - ($h >> 1);
        let e2 = $a - $e;
        let e3 = $b + $h - $d - ($d >> 1);
        let e4 = ($c >> 1) - $g;
        let e5 = -$b + $h + $f + ($f >> 1);
        let e6 = $c + ($g >> 1);
        let e7 = $d + $f + $b + ($b >> 1);

        // second butterfly stage
        let f0 = e0 + e6;
        let f1 = e1 + (e7 >> 2);
        let f2 = e2 + e4;
        let f3 = e3 + (e5 >> 2);
        let f4 = e2 - e4;
        let f5 = (e3 >> 2) - e5;
        let f6 = e0 - e6;
        let f7 = e7 - (e1 >> 2);

        // final recombination
        $a = f0 + f7;
        $b = f2 + f5;
        $c = f4 + f3;
        $d = f6 + f1;
        $e = f6 - f1;
        $f = f4 - f3;
        $g = f2 - f5;
        $h = f0 - f7;
    };
}
114
115pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
116 if qp < 12 {
4a1ca15c 117 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
118 let shift = 2 - qp / 6;
119 let bias = 1 << shift >> 1;
120 for el in blk.iter_mut() {
121 *el = el.wrapping_mul(mul).wrapping_add(bias) >> shift;
122 }
123 } else {
4a1ca15c 124 let mul = LEVEL_SCALE[0][(qp % 6) as usize];
696e4e20
KS
125 let shift = qp / 6 - 2;
126 for el in blk.iter_mut() {
127 *el = el.wrapping_mul(mul) << shift;
128 }
129 }
130 for i in 0..4 {
131 transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
132 }
133 for row in blk.chunks_mut(4) {
134 transform!(luma_dc; row[0], row[1], row[2], row[3]);
135 }
136}
137
138pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
139 const BLK_INDEX: [usize; 16] = [
140 0, 2, 0, 2,
141 2, 1, 2, 1,
142 0, 2, 0, 2,
143 2, 1, 2, 1
144 ];
145 let qidx = (qp % 6) as usize;
146 let shift = qp / 6;
147 let start = if quant_dc { 0 } else { 1 };
148 for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(start) {
149 *el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
150 }
696e4e20 151 for row in blk.chunks_mut(4) {
3ad9bf2b
KS
152 transform!(row[0], row[1], row[2], row[3], 0);
153 }
154 for i in 0..4 {
155 transform!(blk[i], blk[i + 4], blk[i + 8], blk[i + 12], 6);
696e4e20
KS
156 }
157}
158
159pub fn idct_dc(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
160 let dc = if quant_dc {
161 (blk[0] * LEVEL_SCALE[0][(qp % 6) as usize]) << (qp / 6)
162 } else {
163 blk[0]
164 };
165 *blk = [(dc + 0x20) >> 6; 16];
166}
167
/// Dequantisation tables for the 8x8 transform, one per `qp % 6`.
/// Only a 4x4 quadrant is stored; `idct8x8` indexes it with
/// `(x & 3) + (y & 3) * 4`, repeating the pattern over the full 8x8 block.
const QMAT_8X8: [[u8; 16]; 6] = [
    [
        20, 19, 25, 19,
        19, 18, 24, 18,
        25, 24, 32, 24,
        19, 18, 24, 18
    ], [
        22, 21, 28, 21,
        21, 19, 26, 19,
        28, 26, 35, 26,
        21, 19, 26, 19
    ], [
        26, 24, 33, 24,
        24, 23, 31, 23,
        33, 31, 42, 31,
        24, 23, 31, 23
    ], [
        28, 26, 35, 26,
        26, 25, 33, 25,
        35, 33, 45, 33,
        26, 25, 33, 25
    ], [
        32, 30, 40, 30,
        30, 28, 38, 28,
        40, 38, 51, 38,
        30, 28, 38, 28
    ], [
        36, 34, 46, 34,
        34, 32, 43, 32,
        46, 43, 58, 43,
        34, 32, 43, 32
    ]
];
201
202pub fn dequant8x8(blk: &mut [i16; 64], slist: &[u8; 64]) {
203 for (el, &scan) in blk.iter_mut().zip(ZIGZAG8X8.iter()) {
204 if *el != 0 {
205 *el = el.wrapping_mul(i16::from(slist[scan]));
206 }
207 }
208}
209
/// Dequantises and inverse-transforms an 8x8 coefficient block in place.
/// Intermediates are widened to i32 so the 8-point transform cannot overflow.
pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
    let mut tmp = [0i32; 64];
    let qmat = &QMAT_8X8[(qp % 6) as usize];
    if qp >= 36 {
        // high QPs scale up without rounding
        let shift = qp / 6 - 6;
        for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
            let x = i & 7;
            let y = i >> 3;
            // the quant table stores a 4x4 quadrant repeated every 4 samples
            let idx = (x & 3) + (y & 3) * 4;
            *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])) << shift;
        }
    } else {
        // low QPs scale down with round-to-nearest
        let shift = 6 - qp / 6;
        let bias = (1 << shift) >> 1;
        for (i, (dst, &src)) in tmp.iter_mut().zip(blk.iter()).enumerate() {
            let x = i & 7;
            let y = i >> 3;
            let idx = (x & 3) + (y & 3) * 4;
            *dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
        }
    }
    // 8-point inverse transform: rows first, then columns.
    for row in tmp.chunks_mut(8) {
        transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
    }
    for col in 0..8 {
        transform!(tmp[col], tmp[col + 8], tmp[col + 8 * 2], tmp[col + 8 * 3],
                   tmp[col + 8 * 4], tmp[col + 8 * 5], tmp[col + 8 * 6], tmp[col + 8 * 7]);
    }
    // final rounding back to 16-bit coefficients
    for (dst, &src) in blk.iter_mut().zip(tmp.iter()) {
        *dst = ((src + 0x20) >> 6) as i16;
    }
}
242
/// Adds a 4x4 residual block to the picture at `offset`, clamping each
/// result to the 0..=255 pixel range.
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
    // Exactly the 4x4 area touched: three full strides plus four pixels.
    let area = &mut dst[offset..][..stride * 3 + 4];
    for (row, res) in area.chunks_mut(stride).zip(coeffs.chunks(4)).take(4) {
        for (pix, &delta) in row.iter_mut().zip(res.iter()).take(4) {
            *pix = (i32::from(*pix) + i32::from(delta)).clamp(0, 255) as u8;
        }
    }
}
251
/// Adds an 8x8 residual block to the picture at `offset`, clamping each
/// result to the 0..=255 pixel range.
pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
    for (row, res) in dst[offset..].chunks_mut(stride).zip(coeffs.chunks(8)).take(8) {
        for (pix, &delta) in row.iter_mut().zip(res.iter()).take(8) {
            *pix = (i32::from(*pix) + i32::from(delta)).clamp(0, 255) as u8;
        }
    }
}
260
696e4e20
KS
/// Saturates a 16-bit value to the 8-bit pixel range.
fn clip8(val: i16) -> u8 { val.clamp(0, 255) as u8 }
262
22de733b
KS
/// Fills a `bsize` x `bsize` block with the mid-grey DC fallback value 128.
fn ipred_dc128(buf: &mut [u8], stride: usize, bsize: usize) {
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(128);
    }
}
22de733b
KS
/// Vertical prediction: every row repeats the top neighbour samples.
fn ipred_ver(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize) {
    let src = &top[..bsize];
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].copy_from_slice(src);
    }
}
22de733b
KS
/// Horizontal prediction: each row is filled with its left neighbour sample.
/// `left[0]` is the corner sample, so rows start at `left[1]`.
fn ipred_hor(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize) {
    for (row, &lval) in buf.chunks_mut(stride).zip(left[1..].iter()).take(bsize) {
        row[..bsize].fill(lval);
    }
}
/// DC prediction from both neighbours: the block is filled with the rounded
/// average of `bsize` top and `bsize` left samples (`shift` = log2 of the
/// sample count).
fn ipred_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], bsize: usize, shift: u8) {
    let sum: u16 = top[..bsize].iter()
        .chain(left[1..=bsize].iter())
        .map(|&p| u16::from(p))
        .sum();
    let dc = ((sum + (1 << (shift - 1))) >> shift) as u8;
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
/// DC prediction from the left neighbours only (used when the top row is
/// unavailable); `shift` = log2 of the sample count.
fn ipred_left_dc(buf: &mut [u8], stride: usize, left: &[u8], bsize: usize, shift: u8) {
    let sum: u16 = left[1..=bsize].iter().map(|&p| u16::from(p)).sum();
    let dc = ((sum + (1 << (shift - 1))) >> shift) as u8;
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
/// DC prediction from the top neighbours only (used when the left column is
/// unavailable); `shift` = log2 of the sample count.
fn ipred_top_dc(buf: &mut [u8], stride: usize, top: &[u8], bsize: usize, shift: u8) {
    let sum: u16 = top[..bsize].iter().map(|&p| u16::from(p)).sum();
    let dc = ((sum + (1 << (shift - 1))) >> shift) as u8;
    for row in buf.chunks_mut(stride).take(bsize) {
        row[..bsize].fill(dc);
    }
}
316
22de733b
KS
317fn load(dst: &mut [u16], src: &[u8]) {
318 for (dst, &src) in dst.iter_mut().zip(src.iter()) {
319 *dst = u16::from(src);
320 }
696e4e20
KS
321}
322
22de733b
KS
323fn ipred_4x4_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
324 ipred_ver(buf, stride, top, 4);
696e4e20 325}
22de733b
KS
326fn ipred_4x4_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
327 ipred_hor(buf, stride, left, 4);
696e4e20 328}
22de733b 329fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
696e4e20 330 let mut t: [u16; 9] = [0; 9];
22de733b
KS
331 load(&mut t[..4], top);
332 load(&mut t[4..8], tr);
696e4e20
KS
333 t[8] = t[7];
334
696e4e20 335 for i in 0..4 {
22de733b 336 buf[i] = ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2) as u8;
696e4e20 337 }
22de733b 338 let dst = &mut buf[stride..];
696e4e20
KS
339 for i in 0..4 {
340 dst[i] = ((t[i + 1] + 2 * t[i + 2] + t[i + 3] + 2) >> 2) as u8;
341 }
22de733b 342 let dst = &mut buf[stride * 2..];
696e4e20
KS
343 for i in 0..4 {
344 dst[i] = ((t[i + 2] + 2 * t[i + 3] + t[i + 4] + 2) >> 2) as u8;
345 }
22de733b 346 let dst = &mut buf[stride * 3..];
696e4e20
KS
347 for i in 0..4 {
348 dst[i] = ((t[i + 3] + 2 * t[i + 4] + t[i + 5] + 2) >> 2) as u8;
349 }
350}
/// 4x4 diagonal-down-right prediction: pixels are (1,2,1)-filtered along the
/// top-left -> bottom-right diagonal from the top and left neighbours.
fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    // t[0] holds the corner sample, t[1..5] the row above.
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    // l[0] holds the corner sample, l[1..5] the column to the left.
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        // below the main diagonal: filter along the left samples
        for i in 0..j {
            dst[i + j * stride] = ((l[j - i - 1] + 2 * l[j - i] + l[j - i + 1] + 2) >> 2) as u8;
        }
        // on the diagonal: filter across the corner
        dst[j + j * stride] = ((l[1] + 2 * l[0] + t[1] + 2) >> 2) as u8;
        // above the main diagonal: filter along the top samples
        for i in (j+1)..4 {
            dst[i + j * stride] = ((t[i - j - 1] + 2 * t[i - j] + t[i - j + 1] + 2) >> 2) as u8;
        }
    }
}
/// 4x4 vertical-right prediction.  `zvr = 2*i - j` classifies each pixel:
/// non-negative even -> 2-tap average of top samples, non-negative odd ->
/// 3-tap filter of top samples, -1 -> filtered corner, below -1 -> 3-tap
/// filter of left samples.
fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    // t[0] = corner, t[1..5] = top row; l[0] = corner, l[1..5] = left column.
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..4 {
            let zvr = ((2 * i) as i8) - (j as i8);
            let pix;
            if zvr >= 0 {
                if (zvr & 1) == 0 {
                    pix = (t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 1) >> 1;
                } else {
                    pix = (t[i - (j >> 1) - 1] + 2 * t[i - (j >> 1)] + t[i - (j >> 1) + 1] + 2) >> 2;
                }
            } else {
                if zvr == -1 {
                    pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                } else {
                    pix = (l[j] + 2 * l[j - 1] + l[j - 2] + 2) >> 2;
                }
            }
            dst[i + j * stride] = pix as u8;
        }
    }
}
/// 4x4 vertical-left prediction, fully unrolled: even rows use 2-tap
/// averages of adjacent top samples, odd rows use the (1,2,1) 3-tap filter;
/// many filtered values are shared between two output positions.
fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
    // t[0..4] = top row, t[4..8] = top-right extension.
    let mut t: [u16; 8] = [0; 8];
    load(&mut t[..4], top);
    load(&mut t[4..], tr);
    let dst = buf;

    dst[0 + 0 * stride] = ((t[0] + t[1] + 1) >> 1) as u8;
    let pix = ((t[1] + t[2] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((t[2] + t[3] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    let pix = ((t[3] + t[4] + 1) >> 1) as u8;
    dst[3 + 0 * stride] = pix;
    dst[2 + 2 * stride] = pix;
    dst[3 + 2 * stride] = ((t[4] + t[5] + 1) >> 1) as u8;
    dst[0 + 1 * stride] = ((t[0] + 2*t[1] + t[2] + 2) >> 2) as u8;
    let pix = ((t[1] + 2*t[2] + t[3] + 2) >> 2) as u8;
    dst[1 + 1 * stride] = pix;
    dst[0 + 3 * stride] = pix;
    let pix = ((t[2] + 2*t[3] + t[4] + 2) >> 2) as u8;
    dst[2 + 1 * stride] = pix;
    dst[1 + 3 * stride] = pix;
    let pix = ((t[3] + 2*t[4] + t[5] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[2 + 3 * stride] = pix;
    dst[3 + 3 * stride] = ((t[4] + 2*t[5] + t[6] + 2) >> 2) as u8;
}
/// 4x4 horizontal-down prediction — mirror of vertical-right.
/// `zhd = 2*j - i` classifies each pixel: non-negative even -> 2-tap average
/// of left samples, non-negative odd -> 3-tap filter of left samples,
/// -1 -> filtered corner, below -1 -> 3-tap filter of top samples.
fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    // t[0] = corner, t[1..5] = top row; l[0] = corner, l[1..5] = left column.
    let mut t: [u16; 5] = [0; 5];
    t[0] = u16::from(left[0]);
    load(&mut t[1..], top);
    let mut l: [u16; 5] = [0; 5];
    load(&mut l, left);
    let dst = buf;

    for j in 0..4 {
        for i in 0..4 {
            let zhd = ((2 * j) as i8) - (i as i8);
            let pix;
            if zhd >= 0 {
                if (zhd & 1) == 0 {
                    pix = (l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 1) >> 1;
                } else {
                    pix = (l[j - (i >> 1) - 1] + 2 * l[j - (i >> 1)] + l[j - (i >> 1) + 1] + 2) >> 2;
                }
            } else {
                if zhd == -1 {
                    pix = (l[1] + 2 * l[0] + t[1] + 2) >> 2;
                } else {
                    pix = (t[i - 2] + 2 * t[i - 1] + t[i] + 2) >> 2;
                }
            }
            dst[i + j * stride] = pix as u8;
        }
    }
}
/// 4x4 horizontal-up prediction, unrolled: interpolates along the left
/// samples going downwards; positions past the last left sample are
/// saturated to it.
fn ipred_4x4_hor_up(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    // l[0..4] = left column (the corner sample at left[0] is skipped).
    let mut l: [u16; 8] = [0; 8];
    load(&mut l, &left[1..]);
    let dst = buf;

    dst[0 + 0 * stride] = ((l[0] + l[1] + 1) >> 1) as u8;
    dst[1 + 0 * stride] = ((l[0] + 2*l[1] + l[2] + 2) >> 2) as u8;
    let pix = ((l[1] + l[2] + 1) >> 1) as u8;
    dst[2 + 0 * stride] = pix;
    dst[0 + 1 * stride] = pix;
    let pix = ((l[1] + 2*l[2] + l[3] + 2) >> 2) as u8;
    dst[3 + 0 * stride] = pix;
    dst[1 + 1 * stride] = pix;
    let pix = ((l[2] + l[3] + 1) >> 1) as u8;
    dst[2 + 1 * stride] = pix;
    dst[0 + 2 * stride] = pix;
    let pix = ((l[2] + 3*l[3] + 2) >> 2) as u8;
    dst[3 + 1 * stride] = pix;
    dst[1 + 2 * stride] = pix;
    // everything further down-right repeats the last left sample
    dst[3 + 2 * stride] = l[3] as u8;
    dst[1 + 3 * stride] = l[3] as u8;
    dst[0 + 3 * stride] = l[3] as u8;
    dst[2 + 2 * stride] = l[3] as u8;
    dst[2 + 3 * stride] = l[3] as u8;
    dst[3 + 3 * stride] = l[3] as u8;
}
22de733b
KS
/// 4x4 DC prediction averaging 4 top + 4 left samples (shift 3).
fn ipred_4x4_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_dc(buf, stride, top, left, 4, 3);
}
/// 4x4 DC prediction from the left samples only (top row unavailable).
fn ipred_4x4_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8], _tr: &[u8]) {
    ipred_left_dc(buf, stride, left, 4, 2);
}
/// 4x4 DC prediction from the top samples only (left column unavailable).
fn ipred_4x4_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
}
/// 4x4 DC fallback when no neighbours are available: constant 128.
fn ipred_4x4_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8], _tr: &[u8]) {
    ipred_dc128(buf, stride, 4);
}
494
/// Pre-filtered neighbour samples used by the 8x8 luma intra prediction modes.
pub struct IPred8Context {
    pub t: [u8; 16], // filtered top and top-right samples
    pub l: [u8; 8],  // filtered left samples
    pub tl: u8,      // filtered top-left corner sample
}
500
impl IPred8Context {
    /// Creates a context with all samples set to mid-grey (128).
    pub fn new() -> Self {
        Self {
            t: [128; 16],
            l: [128; 8],
            tl: 128,
        }
    }
    /// Gathers and low-pass filters the neighbour samples for 8x8 luma intra
    /// prediction.  Missing neighbours are substituted: an absent top-right
    /// repeats the last top sample, two extra entries past the left column
    /// repeat the last left sample, and an absent corner falls back to the
    /// first available sample.
    pub fn fill(&mut self, top: &[u8], left: &[u8], has_t: bool, has_tr: bool, has_l: bool, has_tl: bool) {
        // raw neighbours padded with 0x80; index 0 holds the corner sample
        let mut t = [0x80u8; 19];
        let mut l = [0x80u8; 11];
        if has_t {
            t[1..8 + 1].copy_from_slice(&top[..8]);
        }
        if has_tr {
            t[8 + 1..16 + 1].copy_from_slice(&top[8..][..8]);
            // two extra taps for the 3-tap filter at the right edge
            t[16 + 1] = t[15 + 1];
            t[17 + 1] = t[15 + 1];
        } else {
            // replicate the last top sample over the whole top-right range
            let (t0, t1) = t.split_at_mut(8 + 1);
            for el in t1.iter_mut() {
                *el = t0[7 + 1];
            }
        }
        if has_l {
            l[1..9].copy_from_slice(&left[1..9]);
            l[8 + 1] = l[7 + 1];
            l[9 + 1] = l[7 + 1];
        }
        if has_tl {
            t[0] = left[0];
            l[0] = left[0];
        } else {
            t[0] = t[1];
            l[0] = l[1];
        }

        // (1,2,1)/4 low-pass filter over the padded neighbours
        for i in 0..16 {
            self.t[i] = ((u16::from(t[i]) + 2 * u16::from(t[i + 1]) + u16::from(t[i + 2]) + 2) >> 2) as u8;
        }
        for i in 0..8 {
            self.l[i] = ((u16::from(l[i]) + 2 * u16::from(l[i + 1]) + u16::from(l[i + 2]) + 2) >> 2) as u8;
        }
        // the corner is filtered from whichever neighbours are present
        self.tl = if has_t && has_l {
            ((u16::from(t[1]) + 2 * u16::from(t[0]) + u16::from(l[1]) + 2) >> 2) as u8
        } else if has_t {
            ((3 * u16::from(t[0]) + u16::from(t[1]) + 2) >> 2) as u8
        } else if has_l {
            ((3 * u16::from(l[0]) + u16::from(l[1]) + 2) >> 2) as u8
        } else {
            t[0]
        };
    }
}
555
/// 8x8 luma vertical prediction: rows repeat the filtered top samples.
fn ipred_y_8x8_ver(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for row in buf.chunks_mut(stride).take(8) {
        row[..8].copy_from_slice(&ctx.t[..8]);
    }
}
/// 8x8 luma horizontal prediction: each row is filled with its filtered left sample.
fn ipred_y_8x8_hor(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    for (row, &l) in buf.chunks_mut(stride).zip(ctx.l.iter()).take(8) {
        row[..8].copy_from_slice(&[l; 8]);
    }
}
/// 8x8 luma diagonal-down-left prediction over the filtered top samples;
/// the bottom-right pixel uses a special 2-tap end filter.
fn ipred_y_8x8_diag_down_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            *pix = ((if (x != 7) || (y != 7) {
                t[x + y] + 2 * t[x + y + 1] + t[x + y + 2]
            } else {
                // last pixel: no sample beyond t[15]
                t[14] + 3 * t[15]
            } + 2) >> 2) as u8;
        }
    }
}
/// 8x8 luma diagonal-down-right prediction: above the diagonal pixels are
/// filtered top samples, below it filtered left samples, on it the corner.
fn ipred_y_8x8_diag_down_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    // t[0]/l[0] hold the corner, the rest the filtered top/left neighbours.
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);
    let diag = t[1] + 2 * t[0] + l[1];

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            *pix = ((if x > y {
                t[x - y - 1] + 2 * t[x - y] + t[x - y + 1]
            } else if x < y {
                l[y - x - 1] + 2 * l[y - x] + l[y - x + 1]
            } else {
                diag
            } + 2) >> 2) as u8;
        }
    }
}
/// 8x8 luma vertical-right prediction.  `zvr = 2*x - y` classifies each
/// pixel: non-negative even -> 2-tap average of top samples, odd -> 3-tap
/// filter, -1 -> corner, below -1 -> 3-tap filter of left samples.
fn ipred_y_8x8_ver_right(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zvr = 2 * (x as i8) - (y as i8);
            *pix = if zvr >= 0 {
                let ix = x - (y >> 1);
                if (zvr & 1) == 0 {
                    (t[ix] + t[ix + 1] + 1) >> 1
                } else {
                    (t[ix - 1] + 2 * t[ix] + t[ix + 1] + 2) >> 2
                }
            } else if zvr == -1 {
                (l[1] + 2 * l[0] + t[1] + 2) >> 2
            } else {
                let ix = y - 2 * x;
                (l[ix] + 2 * l[ix - 1] + l[ix - 2] + 2) >> 2
            } as u8;
        }
    }
}
/// 8x8 luma vertical-left prediction: even rows use 2-tap averages of the
/// filtered top samples, odd rows the (1,2,1) 3-tap filter, with the read
/// position advancing by one every two rows.
fn ipred_y_8x8_ver_left(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 16];
    load(&mut t, &ctx.t);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let ix = x + (y >> 1);
            *pix = if (y & 1) == 0 {
                (t[ix] + t[ix + 1] + 1) >> 1
            } else {
                (t[ix] + 2 * t[ix + 1] + t[ix + 2] + 2) >> 2
            } as u8;
        }
    }

}
/// 8x8 luma horizontal-down prediction — mirror of vertical-right.
/// `zhd = 2*y - x` classifies each pixel: non-negative even -> 2-tap average
/// of left samples, odd -> 3-tap filter, -1 -> corner, below -1 -> 3-tap
/// filter of top samples.
fn ipred_y_8x8_hor_down(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut t = [0u16; 9];
    t[0] = u16::from(ctx.tl);
    load(&mut t[1..], &ctx.t);
    let mut l = [0u16; 9];
    l[0] = u16::from(ctx.tl);
    load(&mut l[1..], &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zhd = 2 * (y as i8) - (x as i8);
            *pix = if zhd >= 0 {
                let ix = y - (x >> 1);
                if (zhd & 1) == 0 {
                    (l[ix] + l[ix + 1] + 1) >> 1
                } else {
                    (l[ix - 1] + 2 * l[ix] + l[ix + 1] + 2) >> 2
                }
            } else if zhd == -1 {
                (l[1] + 2 * l[0] + t[1] + 2) >> 2
            } else {
                let ix = x - 2 * y;
                (t[ix] + 2 * t[ix - 1] + t[ix - 2] + 2) >> 2
            } as u8;
        }
    }
}
/// 8x8 luma horizontal-up prediction: interpolates downwards along the
/// filtered left samples; `zhu = x + 2*y` saturates positions beyond the
/// last left sample.
fn ipred_y_8x8_hor_up(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
    let mut l = [0u16; 8];
    load(&mut l, &ctx.l);

    for (y, row) in buf.chunks_mut(stride).take(8).enumerate() {
        for (x, pix) in row.iter_mut().take(8).enumerate() {
            let zhu = x + 2 * y;
            let ix = y + (x >> 1);
            *pix = if zhu > 13 {
                // past the last sample: replicate it
                l[7]
            } else if zhu == 13 {
                // boundary: 2-tap end filter
                (l[6] + 3 * l[7] + 2) >> 2
            } else if (zhu & 1) != 0 {
                (l[ix] + 2 * l[ix + 1] + l[ix + 2] + 2) >> 2
            } else {
                (l[ix] + l[ix + 1] + 1) >> 1
            } as u8;
        }
    }
}
691fn ipred_y_8x8_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
692 let mut sum = 0u16;
693 for &t in ctx.t[..8].iter() {
694 sum += u16::from(t);
695 }
696 for &l in ctx.l[..8].iter() {
697 sum += u16::from(l);
698 }
699 let dc = ((sum + 8) >> 4) as u8;
700 for row in buf.chunks_mut(stride).take(8) {
701 for pix in row.iter_mut().take(8) {
702 *pix = dc;
703 }
704 }
705}
706fn ipred_y_8x8_left_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
707 let mut sum = 0u16;
708 for &l in ctx.l[..8].iter() {
709 sum += u16::from(l);
710 }
711 let dc = ((sum + 4) >> 3) as u8;
712 for row in buf.chunks_mut(stride).take(8) {
713 for pix in row.iter_mut().take(8) {
714 *pix = dc;
715 }
716 }
717}
718fn ipred_y_8x8_top_dc(buf: &mut [u8], stride: usize, ctx: &IPred8Context) {
719 let mut sum = 0u16;
720 for &t in ctx.t[..8].iter() {
721 sum += u16::from(t);
722 }
723 let dc = ((sum + 4) >> 3) as u8;
724 for row in buf.chunks_mut(stride).take(8) {
725 for pix in row.iter_mut().take(8) {
726 *pix = dc;
727 }
728 }
729}
730fn ipred_y_8x8_dc128(buf: &mut [u8], stride: usize, _ctx: &IPred8Context) {
22de733b 731 ipred_dc128(buf, stride, 8);
696e4e20
KS
732}
733
22de733b
KS
734fn ipred_8x8_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
735 ipred_ver(buf, stride, top, 8);
696e4e20 736}
22de733b
KS
737fn ipred_8x8_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
738 ipred_hor(buf, stride, left, 8);
696e4e20 739}
22de733b
KS
740fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
741 let mut l = [0; 8];
742 load(&mut l, &left[1..]);
743 let mut t = [0; 8];
42005e25 744 load(&mut t, top);
696e4e20
KS
745
746 let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
747 let sum1 = t[4] + t[5] + t[6] + t[7];
748 let dc1 = ((sum1 + 2) >> 2) as u8;
749 let sum2 = l[4] + l[5] + l[6] + l[7];
750 let dc2 = ((sum2 + 2) >> 2) as u8;
751 let dc3 = ((sum1 + sum2 + 4) >> 3) as u8;
752
22de733b 753 for row in buf.chunks_mut(stride).take(4) {
696e4e20
KS
754 row[..4].copy_from_slice(&[dc0; 4]);
755 row[4..8].copy_from_slice(&[dc1; 4]);
756 }
22de733b 757 for row in buf.chunks_mut(stride).skip(4).take(4) {
696e4e20
KS
758 row[..4].copy_from_slice(&[dc2; 4]);
759 row[4..8].copy_from_slice(&[dc3; 4]);
760 }
761}
/// 8x8 chroma DC prediction from the left samples only: the upper half uses
/// the average of the first four left samples, the lower half the average of
/// the next four.
fn ipred_8x8_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    let upper: u16 = left.iter().skip(1).take(4).map(|&p| u16::from(p)).sum();
    let lower: u16 = left.iter().skip(5).take(4).map(|&p| u16::from(p)).sum();
    let dc0 = ((upper + 2) >> 2) as u8;
    let dc2 = ((lower + 2) >> 2) as u8;
    for row in buf.chunks_mut(stride).take(4) {
        row[..8].fill(dc0);
    }
    for row in buf.chunks_mut(stride).skip(4).take(4) {
        row[..8].fill(dc2);
    }
}
22de733b
KS
/// 8x8 chroma top-DC prediction built from four 4x4 top-DC fills.  The lower
/// half reuses row 3 of the freshly predicted block as its "top" line, which
/// yields the same per-quadrant DC values as the real top row.
fn ipred_8x8_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 4, 2);
    ipred_top_dc(&mut buf[4..], stride, &top[4..], 4, 2);
    let mut top = [0; 8];
    top.copy_from_slice(&buf[stride * 3..][..8]);
    ipred_top_dc(&mut buf[4 * stride..], stride, &top, 4, 2);
    ipred_top_dc(&mut buf[4 + 4 * stride..], stride, &top[4..], 4, 2);
}
/// 8x8 chroma DC fallback when no neighbours are available: constant 128.
fn ipred_8x8_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 8);
}
22de733b
KS
791fn ipred_8x8_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
792 let mut h: i32 = 4 * (i32::from(top[7]) - i32::from(left[0]));
793 let mut v: i32 = 4 * (i32::from(left[8]) - i32::from(left[0]));
794 for i in 0..3 {
696e4e20 795 let i1 = (i + 1) as i32;
22de733b
KS
796 h += i1 * (i32::from(top[4 + i]) - i32::from(top[2 - i]));
797 v += i1 * (i32::from(left[5 + i]) - i32::from(left[3 - i]));
696e4e20
KS
798 }
799 let b = (17 * h + 16) >> 5;
800 let c = (17 * v + 16) >> 5;
22de733b
KS
801 let mut a = 16 * (i32::from(left[8]) + i32::from(top[7])) - 3 * (b + c) + 16;
802 for line in buf.chunks_mut(stride).take(8) {
696e4e20
KS
803 let mut acc = a;
804 for el in line.iter_mut().take(8) {
805 *el = clip8((acc >> 5) as i16);
806 acc += b;
807 }
808 a += c;
809 }
810}
811
22de733b
KS
/// 16x16 vertical prediction: rows repeat the top samples.
fn ipred_16x16_ver(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_ver(buf, stride, top, 16);
}
/// 16x16 horizontal prediction: columns repeat the left samples.
fn ipred_16x16_hor(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_hor(buf, stride, left, 16);
}
/// 16x16 DC prediction averaging 16 top + 16 left samples (shift 5).
fn ipred_16x16_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
    ipred_dc(buf, stride, top, left, 16, 5);
}
/// 16x16 DC prediction from the left samples only.
fn ipred_16x16_left_dc(buf: &mut [u8], stride: usize, _top: &[u8], left: &[u8]) {
    ipred_left_dc(buf, stride, left, 16, 4);
}
/// 16x16 DC prediction from the top samples only.
fn ipred_16x16_top_dc(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8]) {
    ipred_top_dc(buf, stride, top, 16, 4);
}
/// 16x16 DC fallback when no neighbours are available: constant 128.
fn ipred_16x16_dc128(buf: &mut [u8], stride: usize, _top: &[u8], _left: &[u8]) {
    ipred_dc128(buf, stride, 16);
}
22de733b
KS
830fn ipred_16x16_plane(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) {
831 let mut h = 8 * (i32::from(top[15]) - i32::from(left[0]));
832 let mut v = 8 * (i32::from(left[16]) - i32::from(left[0]));
833 for k in 0..7 {
834 h += ((k as i32) + 1) * (i32::from(top[8 + k]) - i32::from(top[6 - k]));
835 v += ((k as i32) + 1) * (i32::from(left[9 + k]) - i32::from(left[7 - k]));
696e4e20 836 }
22de733b 837
696e4e20
KS
838 h = (5 * h + 32) >> 6;
839 v = (5 * v + 32) >> 6;
840
22de733b 841 let mut a = 16 * (i32::from(left[16]) + i32::from(top[15]) + 1) - 7 * (v + h);
696e4e20 842
22de733b 843 for row in buf.chunks_mut(stride).take(16) {
696e4e20
KS
844 let mut b = a;
845 a += v;
846
847 for dst in row.chunks_exact_mut(4).take(4) {
848 dst[0] = clip8(((b ) >> 5) as i16);
849 dst[1] = clip8(((b + h) >> 5) as i16);
850 dst[2] = clip8(((b + 2*h) >> 5) as i16);
851 dst[3] = clip8(((b + 3*h) >> 5) as i16);
852 b += h * 4;
853 }
854 }
855}
856
22de733b
KS
857pub type IPred4x4Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], tr: &[u8]);
858pub type IPred8x8Func = fn(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]);
696e4e20
KS
859pub type IPred8x8LumaFunc = fn(buf: &mut [u8], stride: usize, ctx: &IPred8Context);
860
861pub const IPRED4_DC128: usize = 11;
862pub const IPRED4_DC_TOP: usize = 10;
863pub const IPRED4_DC_LEFT: usize = 9;
864pub const IPRED8_DC128: usize = 6;
865pub const IPRED8_DC_TOP: usize = 5;
866pub const IPRED8_DC_LEFT: usize = 4;
867
868pub const IPRED_FUNCS4X4: [IPred4x4Func; 12] = [
869 ipred_4x4_ver, ipred_4x4_hor, ipred_4x4_dc,
870 ipred_4x4_diag_down_left, ipred_4x4_diag_down_right,
871 ipred_4x4_ver_right, ipred_4x4_hor_down, ipred_4x4_ver_left, ipred_4x4_hor_up,
872 ipred_4x4_left_dc, ipred_4x4_top_dc, ipred_4x4_dc128
873];
874
875pub const IPRED_FUNCS8X8_LUMA: [IPred8x8LumaFunc; 12] = [
876 ipred_y_8x8_ver, ipred_y_8x8_hor, ipred_y_8x8_dc,
877 ipred_y_8x8_diag_down_left, ipred_y_8x8_diag_down_right,
878 ipred_y_8x8_ver_right, ipred_y_8x8_hor_down,
879 ipred_y_8x8_ver_left, ipred_y_8x8_hor_up,
880 ipred_y_8x8_left_dc, ipred_y_8x8_top_dc, ipred_y_8x8_dc128
881];
882
883pub const IPRED_FUNCS8X8_CHROMA: [IPred8x8Func; 7] = [
884 ipred_8x8_dc, ipred_8x8_hor, ipred_8x8_ver, ipred_8x8_plane,
885 ipred_8x8_left_dc, ipred_8x8_top_dc, ipred_8x8_dc128
886];
887
888pub const IPRED_FUNCS16X16: [IPred8x8Func; 7] = [
889 ipred_16x16_ver, ipred_16x16_hor, ipred_16x16_dc, ipred_16x16_plane,
890 ipred_16x16_left_dc, ipred_16x16_top_dc, ipred_16x16_dc128
891];
892
696e4e20
KS
/// Deblocking filter kernels working on one pixel line across an edge;
/// `$step` is 1 for vertical edges and the row stride for horizontal ones.
///
/// * `lumaedge`     — strong luma filter (used for the strongest edges);
///   conditionally filters three pixels on each side.
/// * `chromaedge`   — strong chroma filter; one pixel on each side.
/// * `lumanormal`   — normal luma filter with a `tc` clipping range derived
///   from `$tc0` and the local activity.
/// * `chromanormal` — normal chroma filter with `tc0 + 1` clipping.
macro_rules! loop_filter {
    (lumaedge; $buf: expr, $off: expr, $step: expr, $alpha: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        // side activity checks against beta
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        if a_p && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            // strong filtering of the p side (three pixels)
            let p3 = i16::from($buf[$off - $step * 4]);
            $buf[$off - $step * 3] = ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) as u8;
            $buf[$off - $step * 2] = ((p2 + p1 + p0 + q0 + 2) >> 2) as u8;
            $buf[$off - $step] = ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) as u8;
        } else {
            // weak fallback: only p0 is filtered
            $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        }
        if a_q && (p0 - q0).abs() < (($alpha >> 2) + 2) {
            // strong filtering of the q side (three pixels)
            let q3 = i16::from($buf[$off + $step * 3]);
            $buf[$off] = ((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) as u8;
            $buf[$off + $step] = ((p0 + q0 + q1 + q2 + 2) >> 2) as u8;
            $buf[$off + $step * 2] = ((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) as u8;
        } else {
            $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
        }
    };
    (chromaedge; $buf: expr, $off: expr, $step: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        $buf[$off - $step] = ((2 * p1 + p0 + q1 + 2) >> 2) as u8;
        $buf[$off] = ((2 * q1 + q0 + p1 + 2) >> 2) as u8;
    };
    (lumanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr, $beta: expr) => {
        let p2 = i16::from($buf[$off - $step * 3]);
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let q2 = i16::from($buf[$off + $step * 2]);
        let a_p = (p2 - p0).abs() < $beta;
        let a_q = (q2 - q0).abs() < $beta;
        // clipping range grows by one for each active side
        let tc = $tc0 + (a_p as i16) + (a_q as i16);
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        if a_p && ($tc0 > 0) {
            $buf[$off - $step * 2] = clip8(p1 + ((p2 + ((p0 + q0 + 1) >> 1) - p1 * 2) >> 1).max(-$tc0).min($tc0));
        }
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
        if a_q && ($tc0 > 0) {
            $buf[$off + $step] = clip8(q1 + ((q2 + ((p0 + q0 + 1) >> 1) - q1 * 2) >> 1).max(-$tc0).min($tc0));
        }
    };
    (chromanormal; $buf: expr, $off: expr, $step: expr, $tc0: expr) => {
        let p1 = i16::from($buf[$off - $step * 2]);
        let p0 = i16::from($buf[$off - $step]);
        let q0 = i16::from($buf[$off]);
        let q1 = i16::from($buf[$off + $step]);
        let tc = $tc0 + 1;
        let delta = (((q0 - p0) * 4 + (p1 - q1) + 4) >> 3).max(-tc).min(tc);
        $buf[$off - $step] = clip8(p0 + delta);
        $buf[$off] = clip8(q0 - delta);
    }
}
959
/// Per-pixel deblocking decision: the edge at `off` is filtered only when
/// the step across the edge is below `alpha` and both side gradients are
/// below `beta` (samples are `step` bytes apart in `buf`).
fn check_filter(buf: &[u8], off: usize, step: usize, alpha: i16, beta: i16) -> bool {
    let sample = |idx: usize| i16::from(buf[idx]);
    let (p1, p0) = (sample(off - step * 2), sample(off - step));
    let (q0, q1) = (sample(off), sample(off + step));
    let edge_small = (p0 - q0).abs() < alpha;
    edge_small && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta
}
967
932ae27b
KS
/// Scalar fallback: evaluates the deblocking decision for four consecutive
/// edge positions spaced `stride` bytes apart; samples across the edge are
/// `step` bytes apart. Returns one flag per position.
#[cfg(not(target_arch="x86_64"))]
fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
    let mut flags = [false; 4];
    for (row, flag) in flags.iter_mut().enumerate() {
        let base = off + row * stride;
        let p1 = i16::from(buf[base - step * 2]);
        let p0 = i16::from(buf[base - step]);
        let q0 = i16::from(buf[base]);
        let q1 = i16::from(buf[base + step]);
        *flag = (p0 - q0).abs() < alpha && (p1 - p0).abs() < beta && (q1 - q0).abs() < beta;
    }
    flags
}
981
982#[cfg(target_arch="x86_64")]
983fn check_filter4(buf: &[u8], off: usize, step: usize, stride: usize, alpha: i16, beta: i16) -> [bool; 4] {
984 unsafe {
985 let mut flags = [false; 4];
986 let src = buf[off - step * 2..].as_ptr();
987 let load_stride = step.max(stride);
988 let fptr = flags.as_mut_ptr();
989 let tflag = u32::from(step == 1);
990 asm! {
991 // load block
992 "pxor xmm4, xmm4",
993 "movd xmm0, dword ptr [{src}]",
994 "lea {tmp}, [{src} + {stride} * 2]",
995 "movd xmm1, dword ptr [{src} + {stride}]",
996 "movd xmm2, dword ptr [{tmp}]",
997 "movd xmm3, dword ptr [{tmp} + {stride}]",
998 "punpcklbw xmm0, xmm4",
999 "punpcklbw xmm1, xmm4",
1000 "punpcklbw xmm2, xmm4",
1001 "punpcklbw xmm3, xmm4",
1002
1003 // transpose block if necessary so it's always processed by rows
1004 "test {tflag:e}, {tflag:e}",
1005 "jz 1f",
1006 "punpcklwd xmm0, xmm1",
1007 "movhlps xmm4, xmm0",
1008 "punpcklwd xmm2, xmm3",
1009 "movhlps xmm1, xmm2",
1010 "punpckldq xmm0, xmm2",
1011 "punpckldq xmm4, xmm1",
1012 "movhlps xmm1, xmm0",
1013 "movhlps xmm3, xmm4",
1014 "movaps xmm2, xmm4",
1015 "1:",
1016
1017 // calculate deltas and flags
1018 "movd xmm4, {alpha:r}",
1019 "movd xmm5, {beta:r}",
1020 "psubw xmm0, xmm1",
1021 "psubw xmm1, xmm2",
1022 "psubw xmm3, xmm2",
1023 "pshuflw xmm4, xmm4, 0",
1024 "pshuflw xmm5, xmm5, 0",
1025 "pabsw xmm0, xmm0", // |p1 - p0|
1026 "pabsw xmm1, xmm1", // |p0 - q0|
1027 "pabsw xmm2, xmm3", // |q1 - q0|
1028 "movaps xmm3, xmm5",
1029 "pcmpgtw xmm4, xmm1",
1030 "pcmpgtw xmm5, xmm0",
1031 "pcmpgtw xmm3, xmm2",
1032 "pand xmm4, xmm5",
1033 "pand xmm4, xmm3",
1034 "packsswb xmm4, xmm4",
1035 "movd [{flags}], xmm4",
1036 tmp = out(reg) _,
1037 src = in(reg) src,
1038 stride = in(reg) load_stride,
1039 alpha = in(reg) alpha,
1040 beta = in(reg) beta,
1041 flags = in(reg) fptr,
1042 tflag = in(reg) tflag,
1043 out("xmm0") _,
1044 out("xmm1") _,
1045 out("xmm2") _,
1046 out("xmm3") _,
1047 out("xmm4") _,
1048 out("xmm5") _,
1049 }
1050 flags
1051 }
1052}
1053
696e4e20 1054pub fn loop_filter_lumaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
932ae27b
KS
1055 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1056 for &flag in flags.iter() {
1057 if flag {
696e4e20
KS
1058 loop_filter!(lumaedge; dst, off, 1, alpha, beta);
1059 }
1060 off += stride;
1061 }
1062}
1063pub fn loop_filter_lumaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
932ae27b
KS
1064 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1065 for (x, &flag) in flags.iter().enumerate() {
1066 if flag {
696e4e20
KS
1067 loop_filter!(lumaedge; dst, off + x, stride, alpha, beta);
1068 }
1069 }
1070}
1071pub fn loop_filter_lumanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
932ae27b
KS
1072 let flags = check_filter4(dst, off, 1, stride, alpha, beta);
1073 for &flag in flags.iter() {
1074 if flag {
696e4e20
KS
1075 loop_filter!(lumanormal; dst, off, 1, tc0, beta);
1076 }
1077 off += stride;
1078 }
1079}
1080pub fn loop_filter_lumanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
932ae27b
KS
1081 let flags = check_filter4(dst, off, stride, 1, alpha, beta);
1082 for (x, &flag) in flags.iter().enumerate() {
1083 if flag {
696e4e20
KS
1084 loop_filter!(lumanormal; dst, off + x, stride, tc0, beta);
1085 }
1086 }
1087}
1088pub fn loop_filter_chromaedge_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16) {
22de733b 1089 for _ in 0..2 {
696e4e20
KS
1090 if check_filter(dst, off, 1, alpha, beta) {
1091 loop_filter!(chromaedge; dst, off, 1);
1092 }
1093 off += stride;
1094 }
1095}
1096pub fn loop_filter_chromaedge_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16) {
22de733b 1097 for x in 0..2 {
696e4e20
KS
1098 if check_filter(dst, off + x, stride, alpha, beta) {
1099 loop_filter!(chromaedge; dst, off + x, stride);
1100 }
1101 }
1102}
1103pub fn loop_filter_chromanormal_v(dst: &mut [u8], mut off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
22de733b 1104 for _ in 0..2 {
696e4e20
KS
1105 if check_filter(dst, off, 1, alpha, beta) {
1106 loop_filter!(chromanormal; dst, off, 1, tc0);
1107 }
1108 off += stride;
1109 }
1110}
1111pub fn loop_filter_chromanormal_h(dst: &mut [u8], off: usize, stride: usize, alpha: i16, beta: i16, tc0: i16) {
22de733b 1112 for x in 0..2 {
696e4e20
KS
1113 if check_filter(dst, off + x, stride, alpha, beta) {
1114 loop_filter!(chromanormal; dst, off + x, stride, tc0);
1115 }
1116 }
1117}