// h264: add SIMD optimisations for x86_64 (not enabled by default)
// [nihav.git] / nihav-itu / src / codecs / h264 / dsp / mc / x86 / chroma_mc.rs
1 use std::arch::asm;
2
/// Scalar fallback for chroma motion compensation, used on 32-bit x86 where
/// the bilinear SIMD kernel below is unavailable (it needs `xmm8`).
///
/// Interpolates a `w`×`h` block with eighth-pel fractional offsets `dx`/`dy`
/// (0..=7) per the H.264 chroma MC formula.  The horizontal cases read
/// `w + 1` bytes per source row and the vertical cases read `h + 1` source
/// rows, matching the access pattern of the SIMD versions.
#[cfg(target_arch = "x86")]
fn chroma_interp(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, w: usize, h: usize) {
    // Interpolation weights: a0/a1 act horizontally, b0/b1 vertically.
    let a0 = 8 - dx;
    let a1 = dx;
    let b0 = 8 - dy;
    let b1 = dy;

    if a0 == 8 && b0 == 8 {
        // No fractional offset: plain block copy.
        for row in 0..h {
            dst[row * dstride..][..w].copy_from_slice(&src[row * sstride..][..w]);
        }
    } else if a0 == 8 {
        // Vertical-only filter between each row and the row below it.
        for row in 0..h {
            for x in 0..w {
                let t = u16::from(src[row * sstride + x]);
                let b = u16::from(src[(row + 1) * sstride + x]);
                dst[row * dstride + x] = ((t * b0 + b * b1 + 4) >> 3) as u8;
            }
        }
    } else if b0 == 8 {
        // Horizontal-only filter between each pixel and its right neighbour.
        for row in 0..h {
            for x in 0..w {
                let l = u16::from(src[row * sstride + x]);
                let r = u16::from(src[row * sstride + x + 1]);
                dst[row * dstride + x] = ((l * a0 + r * a1 + 4) >> 3) as u8;
            }
        }
    } else {
        // Full bilinear filter over a 2x2 neighbourhood with final rounding
        // by 32 and shift by 6 (the weights sum to 64).
        for row in 0..h {
            for x in 0..w {
                let a = u16::from(src[row * sstride + x]);
                let b = u16::from(src[row * sstride + x + 1]);
                let c = u16::from(src[(row + 1) * sstride + x]);
                let d = u16::from(src[(row + 1) * sstride + x + 1]);
                dst[row * dstride + x] = ((a * a0 * b0 + b * a1 * b0 + c * a0 * b1 + d * a1 * b1 + 0x20) >> 6) as u8;
            }
        }
    }
}
73
/// Motion-compensates an 8-pixel-wide chroma block with SSE2 inline assembly.
///
/// `dx`/`dy` are the eighth-pel fractional motion offsets (0..=7) and `h` is
/// the block height in rows.  Depending on which offsets are zero this is a
/// plain copy, a one-dimensional filter `((8-d)*A + d*B + 4) >> 3`, or the
/// full bilinear H.264 chroma filter
/// `((8-dx)*(8-dy)*A + dx*(8-dy)*B + (8-dx)*dy*C + dx*dy*D + 32) >> 6`.
///
/// NOTE(review): the (0, 0) path copies four rows per loop iteration, so `h`
/// must be a positive multiple of 4 — presumably callers pass 4 or 8, TODO
/// confirm.  The paths with dx != 0 load 9 bytes per source row
/// (`[{src}]` plus `[{src} + 1]`), so each row needs `w + 1` valid bytes.
pub fn chroma_interp_8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
    unsafe {
        match (dx, dy) {
            (0, 0) => {
                // Integer position: straight copy of 8 bytes per row, four
                // rows per iteration.  stmp/dtmp run two rows ahead of
                // src/dst so loads, stores and pointer updates interleave.
                asm!(
                    "lea {stmp}, [{src} + {sstride} * 2]",
                    "lea {dtmp}, [{dst} + {dstride} * 2]",
                    "2:",
                    "movq xmm0, [{src}]",
                    "movq xmm1, [{src} + {sstride}]",
                    "movq xmm2, [{stmp}]",
                    "movq xmm3, [{stmp} + {sstride}]",
                    "movq [{dst}], xmm0",
                    "lea {src}, [{src} + {sstride} * 4]",
                    "movq [{dst} + {dstride}], xmm1",
                    "lea {stmp}, [{stmp} + {sstride} * 4]",
                    "movq [{dtmp}], xmm2",
                    "lea {dst}, [{dst} + {dstride} * 4]",
                    "movq [{dtmp} + {dstride}], xmm3",
                    "lea {dtmp}, [{dtmp} + {dstride} * 4]",
                    "sub {h}, 4",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    h = inout(reg) h => _,
                    stmp = out(reg) _,
                    dtmp = out(reg) _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                );
            },
            (0, _) => {
                // Vertical-only filter: out = ((8-dy)*cur + dy*next + 4) >> 3.
                // xmm3/xmm4 hold the two weights broadcast to eight 16-bit
                // lanes, xmm5 the rounding constant 4, xmm0 stays zero for
                // byte->word unpacking.  xmm6 carries the already-widened
                // previous row so each source row is loaded only once.
                asm!(
                    "pxor xmm0, xmm0",
                    "movd xmm3, {a0:e}",
                    "movd xmm4, {a1:e}",
                    "mov {a1:e}, 0x0004",
                    "movd xmm5, {a1:e}",
                    // Broadcast the low word across all eight lanes.
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm4, xmm4, 0",
                    "pshuflw xmm5, xmm5, 0",
                    "movlhps xmm3, xmm3",
                    "movlhps xmm4, xmm4",
                    "movlhps xmm5, xmm5",
                    // Prime xmm6 with the first source row, widened to words.
                    "movq xmm6, [{src}]",
                    "add {src}, {sstride}",
                    "punpcklbw xmm6, xmm0",
                    "2:",
                    "movaps xmm1, xmm6",
                    "movq xmm2, [{src}]",
                    "punpcklbw xmm2, xmm0",
                    "movaps xmm6, xmm2",
                    // Weight both rows, add, round, shift and pack back to bytes.
                    "pmullw xmm1, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm1, xmm2",
                    "paddw xmm1, xmm5",
                    "psraw xmm1, 3",
                    "packuswb xmm1, xmm1",
                    "movq [{dst}], xmm1",
                    "add {dst}, {dstride}",
                    "dec {h}",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    h = inout(reg) h => _,
                    a0 = in(reg) i32::from(8 - dy),
                    a1 = inout(reg) i32::from(dy) => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                    out("xmm6") _,
                );
            },
            (_, 0) => {
                // Horizontal-only filter: out = ((8-dx)*cur + dx*right + 4) >> 3.
                // Same weight/rounding setup as the vertical case; both the
                // row and the row shifted by one byte are loaded each line.
                asm!(
                    "pxor xmm0, xmm0",
                    "movd xmm3, {a0:e}",
                    "movd xmm4, {a1:e}",
                    "mov {a1:e}, 0x0004",
                    "movd xmm5, {a1:e}",
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm4, xmm4, 0",
                    "pshuflw xmm5, xmm5, 0",
                    "movlhps xmm3, xmm3",
                    "movlhps xmm4, xmm4",
                    "movlhps xmm5, xmm5",
                    "2:",
                    "movq xmm1, [{src}]",
                    "movq xmm2, [{src} + 1]",
                    "punpcklbw xmm1, xmm0",
                    "punpcklbw xmm2, xmm0",
                    "pmullw xmm1, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm1, xmm2",
                    "paddw xmm1, xmm5",
                    "psraw xmm1, 3",
                    "packuswb xmm1, xmm1",
                    "movq [{dst}], xmm1",
                    "add {dst}, {dstride}",
                    "dec {h}",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = inout(reg) sstride => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = inout(reg) dstride => _,
                    h = inout(reg) h => _,
                    a0 = inout(reg) i32::from(8 - dx) => _,
                    a1 = inout(reg) i32::from(dx) => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                );
            },
            // 32-bit x86 cannot run the bilinear kernel below (it uses xmm8,
            // which only exists on x86_64), so fall back to scalar code.
            #[cfg(target_arch = "x86")]
            _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 8, h),
            #[cfg(target_arch = "x86_64")]
            _ => {
                // Full bilinear filter.  xmm3/xmm4 = horizontal weights,
                // xmm5/xmm6 = vertical weights, xmm7 = rounding constant 32.
                // The horizontally filtered previous row is kept in xmm8 so
                // the horizontal pass runs once per source row.
                asm!(
                    "pxor xmm0, xmm0",
                    "movd xmm3, {a0:e}",
                    "movd xmm4, {a1:e}",
                    "movd xmm5, {b0:e}",
                    "movd xmm6, {b1:e}",
                    "mov {a1:e}, 0x0020",
                    "movd xmm7, {a1:e}",
                    // Broadcast each weight to all eight 16-bit lanes.
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm4, xmm4, 0",
                    "pshuflw xmm5, xmm5, 0",
                    "pshuflw xmm6, xmm6, 0",
                    "pshuflw xmm7, xmm7, 0",
                    "movlhps xmm3, xmm3",
                    "movlhps xmm4, xmm4",
                    "movlhps xmm5, xmm5",
                    "movlhps xmm6, xmm6",
                    "movlhps xmm7, xmm7",

                    // Prologue: horizontally filter the first source row into xmm8.
                    "movq xmm8, [{src}]",
                    "movq xmm2, [{src} + 1]",
                    "punpcklbw xmm8, xmm0",
                    "punpcklbw xmm2, xmm0",
                    "pmullw xmm8, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm8, xmm2",

                    "2:",
                    // Horizontal pass for the next row.
                    "movq xmm1, [{src}]",
                    "movq xmm2, [{src} + 1]",
                    "punpcklbw xmm1, xmm0",
                    "punpcklbw xmm2, xmm0",
                    "pmullw xmm1, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm1, xmm2",
                    // Rotate: xmm2 = previous row's result, xmm8 = current.
                    "movaps xmm2, xmm8",
                    "movaps xmm8, xmm1",

                    // Vertical pass: blend the two row results, round, >>6, pack.
                    "pmullw xmm1, xmm6",
                    "pmullw xmm2, xmm5",
                    "paddw xmm1, xmm2",
                    "paddw xmm1, xmm7",
                    "psraw xmm1, 6",
                    "packuswb xmm1, xmm1",
                    "movq [{dst}], xmm1",
                    "add {dst}, {dstride}",
                    "dec {h}",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = inout(reg) sstride => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = inout(reg) dstride => _,
                    h = inout(reg) h => _,
                    a0 = inout(reg) i32::from(8 - dx) => _,
                    a1 = inout(reg) i32::from(dx) => _,
                    b0 = inout(reg) i32::from(8 - dy) => _,
                    b1 = inout(reg) i32::from(dy) => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                    out("xmm6") _,
                    out("xmm7") _,
                    out("xmm8") _,
                );
            },
        };
    }
}
278
/// Motion-compensates a 4-pixel-wide chroma block with SSE2 inline assembly.
///
/// Same interpolation as [`chroma_interp_8`] but on `movd` (4-byte) loads and
/// stores, so only the low four 16-bit lanes of each vector are meaningful
/// and the weight broadcasts need no `movlhps`.
///
/// NOTE(review): the (0, 0) path copies two rows per loop iteration, so `h`
/// must be a positive even number — presumably callers pass 2, 4 or 8, TODO
/// confirm.  The paths with dx != 0 read 5 bytes per source row
/// (`[{src}]` plus `[{src} + 1]`).
pub fn chroma_interp_4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
    unsafe {
        match (dx, dy) {
            (0, 0) => {
                // Integer position: copy 4 bytes per row, two rows per iteration.
                asm!(
                    "2:",
                    "movd xmm0, [{src}]",
                    "movd xmm1, [{src} + {sstride}]",
                    "movd [{dst}], xmm0",
                    "lea {src}, [{src} + {sstride} * 2]",
                    "movd [{dst} + {dstride}], xmm1",
                    "lea {dst}, [{dst} + {dstride} * 2]",
                    "sub {h}, 2",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    h = inout(reg) h => _,
                    out("xmm0") _,
                    out("xmm1") _,
                );
            },
            (0, _) => {
                // Vertical-only filter: out = ((8-dy)*cur + dy*next + 4) >> 3.
                // xmm3/xmm4 = weights, xmm5 = rounding constant 4, broadcast
                // to the low four lanes only; xmm6 caches the widened
                // previous row so each source row is loaded once.
                asm!(
                    "pxor xmm0, xmm0",
                    "movd xmm3, {a0:e}",
                    "movd xmm4, {a1:e}",
                    "mov {a1:e}, 0x0004",
                    "movd xmm5, {a1:e}",
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm4, xmm4, 0",
                    "pshuflw xmm5, xmm5, 0",
                    // Prime xmm6 with the first source row, widened to words.
                    "movd xmm6, [{src}]",
                    "add {src}, {sstride}",
                    "punpcklbw xmm6, xmm0",
                    "2:",
                    "movaps xmm1, xmm6",
                    "movd xmm2, [{src}]",
                    "punpcklbw xmm2, xmm0",
                    "movaps xmm6, xmm2",
                    "pmullw xmm1, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm1, xmm2",
                    "paddw xmm1, xmm5",
                    "psraw xmm1, 3",
                    "packuswb xmm1, xmm1",
                    "movd [{dst}], xmm1",
                    "add {dst}, {dstride}",
                    "dec {h}",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = inout(reg) sstride => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = inout(reg) dstride => _,
                    h = inout(reg) h => _,
                    a0 = inout(reg) i32::from(8 - dy) => _,
                    a1 = inout(reg) i32::from(dy) => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                    out("xmm6") _,
                );
            },
            (_, 0) => {
                // Horizontal-only filter: out = ((8-dx)*cur + dx*right + 4) >> 3.
                asm!(
                    "pxor xmm0, xmm0",
                    "movd xmm3, {a0:e}",
                    "movd xmm4, {a1:e}",
                    "mov {a1:e}, 0x0004",
                    "movd xmm5, {a1:e}",
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm4, xmm4, 0",
                    "pshuflw xmm5, xmm5, 0",
                    "2:",
                    // Load the row and the row shifted right by one byte.
                    "movd xmm1, [{src}]",
                    "movd xmm2, [{src} + 1]",
                    "punpcklbw xmm1, xmm0",
                    "punpcklbw xmm2, xmm0",
                    "pmullw xmm1, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm1, xmm2",
                    "paddw xmm1, xmm5",
                    "psraw xmm1, 3",
                    "packuswb xmm1, xmm1",
                    "movd [{dst}], xmm1",
                    "add {dst}, {dstride}",
                    "dec {h}",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = inout(reg) sstride => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = inout(reg) dstride => _,
                    h = inout(reg) h => _,
                    a0 = inout(reg) i32::from(8 - dx) => _,
                    a1 = inout(reg) i32::from(dx) => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                );
            },
            // 32-bit x86 cannot run the bilinear kernel below (it uses xmm8),
            // so fall back to scalar code.
            #[cfg(target_arch = "x86")]
            _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 4, h),
            #[cfg(target_arch = "x86_64")]
            _ => {
                // Full bilinear filter.  xmm3/xmm4 = horizontal weights,
                // xmm5/xmm6 = vertical weights, xmm7 = rounding constant 32;
                // xmm8 keeps the horizontally filtered previous row.
                asm!(
                    "pxor xmm0, xmm0",
                    "movd xmm3, {a0:e}",
                    "movd xmm4, {a1:e}",
                    "movd xmm5, {b0:e}",
                    "movd xmm6, {b1:e}",
                    "mov {a1:e}, 0x0020",
                    "movd xmm7, {a1:e}",
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm4, xmm4, 0",
                    "pshuflw xmm5, xmm5, 0",
                    "pshuflw xmm6, xmm6, 0",
                    "pshuflw xmm7, xmm7, 0",

                    // Prologue: horizontally filter the first source row into xmm8.
                    "movd xmm8, [{src}]",
                    "movd xmm2, [{src} + 1]",
                    "punpcklbw xmm8, xmm0",
                    "punpcklbw xmm2, xmm0",
                    "pmullw xmm8, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm8, xmm2",

                    "2:",
                    // Horizontal pass for the next row.
                    "movd xmm1, [{src}]",
                    "movd xmm2, [{src} + 1]",
                    "punpcklbw xmm1, xmm0",
                    "punpcklbw xmm2, xmm0",
                    "pmullw xmm1, xmm3",
                    "pmullw xmm2, xmm4",
                    "add {src}, {sstride}",
                    "paddw xmm1, xmm2",
                    // Rotate: xmm2 = previous row's result, xmm8 = current.
                    "movaps xmm2, xmm8",
                    "movaps xmm8, xmm1",

                    // Vertical pass: blend row results, round, >>6, pack, store.
                    "pmullw xmm1, xmm6",
                    "pmullw xmm2, xmm5",
                    "paddw xmm1, xmm2",
                    "paddw xmm1, xmm7",
                    "psraw xmm1, 6",
                    "packuswb xmm1, xmm1",
                    "movd [{dst}], xmm1",
                    "add {dst}, {dstride}",
                    "dec {h}",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = inout(reg) sstride => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = inout(reg) dstride => _,
                    h = inout(reg) h => _,
                    a0 = inout(reg) i32::from(8 - dx) => _,
                    a1 = inout(reg) i32::from(dx) => _,
                    b0 = inout(reg) i32::from(8 - dy) => _,
                    b1 = inout(reg) i32::from(dy) => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                    out("xmm6") _,
                    out("xmm7") _,
                    out("xmm8") _,
                );
            },
        };
    }
}
460
/// Blends two samples with eighth-pel weights (`b0 + b1 == 8`) and rounds:
/// `(a*b0 + b*b1 + 4) >> 3`.
#[inline]
fn chr_interp2(a: u8, b: u8, b0: u16, b1: u16) -> u8 {
    let acc = u16::from(a) * b0 + u16::from(b) * b1;
    ((acc + 4) >> 3) as u8
}
/// Bilinear blend of a 2x2 neighbourhood: each row is filtered horizontally
/// with `a0`/`a1`, the two row results are blended vertically with `b0`/`b1`,
/// then rounded by 32 and shifted by 6 (both weight pairs sum to 8, so the
/// intermediate fits in `u16`).
#[inline]
fn chr_interp4(a: u8, b: u8, c: u8, d: u8, a0: u16, a1: u16, b0: u16, b1: u16) -> u8 {
    let top = u16::from(a) * a0 + u16::from(b) * a1;
    let bot = u16::from(c) * a0 + u16::from(d) * a1;
    ((top * b0 + bot * b1 + 0x20) >> 6) as u8
}
469
470 pub fn chroma_interp_2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
471 let a0 = 8 - dx;
472 let a1 = dx;
473 let b0 = 8 - dy;
474 let b1 = dy;
475
476 if a0 == 8 && b0 == 8 {
477 unsafe {
478 let mut src = src.as_ptr();
479 let mut dst = dst.as_mut_ptr();
480 std::ptr::copy_nonoverlapping(src, dst, 2);
481 src = src.add(sstride);
482 dst = dst.add(dstride);
483 std::ptr::copy_nonoverlapping(src, dst, 2);
484 if h == 4 {
485 src = src.add(sstride);
486 dst = dst.add(dstride);
487 std::ptr::copy_nonoverlapping(src, dst, 2);
488 src = src.add(sstride);
489 dst = dst.add(dstride);
490 std::ptr::copy_nonoverlapping(src, dst, 2);
491 }
492 }
493 } else if a0 == 8 {
494 unsafe {
495 let mut src0 = src.as_ptr();
496 let mut src1 = src0.add(sstride);
497 let mut dst = dst.as_mut_ptr();
498 *dst = chr_interp2(*src0, *src1, b0, b1);
499 *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
500 *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
501 *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
502 if h == 4 {
503 src0 = src0.add(sstride * 2);
504 src1 = src1.add(sstride * 2);
505 dst = dst.add(dstride * 2);
506 *dst = chr_interp2(*src0, *src1, b0, b1);
507 *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
508 *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
509 *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
510 }
511 }
512 } else if b0 == 8 {
513 unsafe {
514 let mut src = src.as_ptr();
515 let mut dst = dst.as_mut_ptr();
516 let (a, b, c) = (*src, *src.add(1), *src.add(2));
517 *dst = chr_interp2(a, b, a0, a1);
518 *dst.add(1) = chr_interp2(b, c, a0, a1);
519 let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
520 *dst.add(dstride) = chr_interp2(a, b, a0, a1);
521 *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
522 if h == 4 {
523 src = src.add(sstride * 2);
524 dst = dst.add(dstride * 2);
525 let (a, b, c) = (*src, *src.add(1), *src.add(2));
526 *dst = chr_interp2(a, b, a0, a1);
527 *dst.add(1) = chr_interp2(b, c, a0, a1);
528 let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
529 *dst.add(dstride) = chr_interp2(a, b, a0, a1);
530 *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
531 }
532 }
533 } else {
534 unsafe {
535 let height = h;
536 let mut src0 = src.as_ptr();
537 let mut src1 = src0.add(sstride);
538 let mut dst = dst.as_mut_ptr();
539
540 let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
541 let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
542 let (g, h, i) = (*src1.add(sstride), *src1.add(sstride + 1), *src1.add(sstride + 2));
543 *dst = chr_interp4(a, b, d, e, a0, a1, b0, b1);
544 *dst.add(1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);
545 *dst.add(dstride) = chr_interp4(d, e, g, h, a0, a1, b0, b1);
546 *dst.add(dstride + 1) = chr_interp4(e, f, h, i, a0, a1, b0, b1);
547 if height == 4 {
548 src0 = src0.add(sstride * 3);
549 src1 = src1.add(sstride * 3);
550 dst = dst.add(dstride * 2);
551 let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
552 let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
553 *dst = chr_interp4(g, h, a, b, a0, a1, b0, b1);
554 *dst.add(1) = chr_interp4(h, i, b, c, a0, a1, b0, b1);
555 *dst.add(dstride) = chr_interp4(a, b, d, e, a0, a1, b0, b1);
556 *dst.add(dstride + 1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);
557 }
558 }
559 }
560 }
561