/// Scalar chroma interpolation for a block `w` pixels wide, built only for
/// 32-bit x86 targets.
///
/// The visible branches are: a plain row copy, a blend of vertically adjacent
/// pixels (weights `b0`/`b1`), a blend of horizontally adjacent pixels
/// (weights `a0`/`a1`) and a four-tap blend over a 2x2 neighbourhood.
/// `sstride` and `dstride` are the pointer steps between consecutive rows of
/// `src` and `dst`.
// NOTE(review): presumably a0 = 8 - dx, a1 = dx, b0 = 8 - dy, b1 = dy and the
// row loops run `h` times — confirm against the weight setup and loop headers.
3 #[cfg(target_arch = "x86")]
4 fn chroma_interp(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, w: usize, h: usize) {
// Taken when `a0` and `b0` are both 8: rows are copied through unchanged.
10 if a0 == 8 && b0 == 8 {
12 let mut src = src.as_ptr();
13 let mut dst = dst.as_mut_ptr();
// Copy `w` bytes of the current row, then step each pointer by its own stride.
15 std::ptr::copy_nonoverlapping(src, dst, w);
16 src = src.add(sstride);
17 dst = dst.add(dstride);
// Two-row blend: `src1` tracks the row one stride after `src0`.
22 let mut src0 = src.as_ptr();
23 let mut src1 = src0.add(sstride);
24 let mut dst = dst.as_mut_ptr();
// Weighted sum of two taps, rounded (+4) and scaled back down by 8 (>> 3).
// NOTE(review): `a`/`b` are presumably the pixels at column `x` of
// `src0`/`src1` — confirm.
29 *dst.add(x) = ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8;
31 src0 = src0.add(sstride);
32 src1 = src1.add(sstride);
33 dst = dst.add(dstride);
// Two-column blend: each output mixes a pixel with its right-hand neighbour.
38 let mut src = src.as_ptr();
39 let mut dst = dst.as_mut_ptr();
// `b` is the pixel one byte to the right of column `x`.
43 let b = *src.add(x + 1);
44 *dst.add(x) = ((u16::from(a) * a0 + u16::from(b) * a1 + 4) >> 3) as u8;
47 src = src.add(sstride);
48 dst = dst.add(dstride);
// Four-tap blend over the current row (`src0`) and the following row (`src1`).
53 let mut src0 = src.as_ptr();
54 let mut src1 = src0.add(sstride);
55 let mut dst = dst.as_mut_ptr();
// `b` and `d` are the right-hand neighbours in the upper and lower rows.
60 let b = *src0.add(x + 1);
61 let d = *src1.add(x + 1);
// Each tap is weighted by the product of one `a*` and one `b*` weight; the sum
// is rounded (+0x20) and scaled back down by 64 (>> 6).
62 *dst.add(x) = ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8;
66 src0 = src0.add(sstride);
67 src1 = src1.add(sstride);
68 dst = dst.add(dstride);
/// Chroma interpolation for blocks 8 pixels wide, written with inline SSE
/// assembly.
///
/// Four variants are visible: a straight copy, a vertical blend (weights
/// `8 - dy` and `dy`), a horizontal blend (weights `8 - dx` and `dx`) and a
/// combined two-dimensional blend. On 32-bit x86 the last case is delegated
/// to the scalar `chroma_interp` with a width of 8.
// NOTE(review): the variant is presumably selected from `dx`/`dy`, and `h` is
// presumably the row counter consumed by the asm loops — confirm.
74 pub fn chroma_interp_8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
// Copy variant: `stmp`/`dtmp` point two rows past `src`/`dst`, so rows at
// +1, +2 and +3 strides can each be addressed with a single operand.
79 "lea {stmp}, [{src} + {sstride} * 2]",
80 "lea {dtmp}, [{dst} + {dstride} * 2]",
// 8-byte loads of the rows at +1, +2 and +3 strides.
83 "movq xmm1, [{src} + {sstride}]",
84 "movq xmm2, [{stmp}]",
85 "movq xmm3, [{stmp} + {sstride}]",
// Stores are interleaved with stepping every pointer forward by four rows.
87 "lea {src}, [{src} + {sstride} * 4]",
88 "movq [{dst} + {dstride}], xmm1",
89 "lea {stmp}, [{stmp} + {sstride} * 4]",
90 "movq [{dtmp}], xmm2",
91 "lea {dst}, [{dst} + {dstride} * 4]",
92 "movq [{dtmp} + {dstride}], xmm3",
93 "lea {dtmp}, [{dtmp} + {dstride} * 4]",
// Operands: the pointers and `h` are modified by the asm and their final
// values discarded (`=> _`); the strides are read-only inputs.
96 src = inout(reg) src.as_ptr() => _,
97 sstride = in(reg) sstride,
98 dst = inout(reg) dst.as_mut_ptr() => _,
99 dstride = in(reg) dstride,
100 h = inout(reg) h => _,
// Vertical variant. The `a1` register is overwritten with the constant 4
// (`:e` selects its 32-bit name).
// NOTE(review): presumably the rounding term, loaded after the weight itself
// has been moved to an XMM register — confirm.
114 "mov {a1:e}, 0x0004",
// Broadcast the low word of xmm3/xmm4/xmm5 to all eight word lanes:
// pshuflw fills the low four words, movlhps copies them to the high half.
116 "pshuflw xmm3, xmm3, 0",
117 "pshuflw xmm4, xmm4, 0",
118 "pshuflw xmm5, xmm5, 0",
119 "movlhps xmm3, xmm3",
120 "movlhps xmm4, xmm4",
121 "movlhps xmm5, xmm5",
// Load 8 pixels of the first row, step to the next row, and interleave the
// bytes with xmm0.
// NOTE(review): xmm0 is presumably zeroed beforehand so this widens bytes to
// words — confirm.
122 "movq xmm6, [{src}]",
123 "add {src}, {sstride}",
124 "punpcklbw xmm6, xmm0",
// Load and widen the next source row.
127 "movq xmm2, [{src}]",
128 "punpcklbw xmm2, xmm0",
132 "add {src}, {sstride}",
// Narrow the words back to bytes with unsigned saturation, store 8 output
// pixels and move to the next destination row.
136 "packuswb xmm1, xmm1",
137 "movq [{dst}], xmm1",
138 "add {dst}, {dstride}",
// Operands for the vertical variant: weights are `8 - dy` and `dy`.
141 src = inout(reg) src.as_ptr() => _,
142 sstride = in(reg) sstride,
143 dst = inout(reg) dst.as_mut_ptr() => _,
144 dstride = in(reg) dstride,
145 h = inout(reg) h => _,
146 a0 = in(reg) i32::from(8 - dy),
147 a1 = inout(reg) i32::from(dy) => _,
// Horizontal variant: same constant load and broadcasts as the vertical one.
162 "mov {a1:e}, 0x0004",
164 "pshuflw xmm3, xmm3, 0",
165 "pshuflw xmm4, xmm4, 0",
166 "pshuflw xmm5, xmm5, 0",
167 "movlhps xmm3, xmm3",
168 "movlhps xmm4, xmm4",
169 "movlhps xmm5, xmm5",
// Load 8 pixels and the 8 pixels starting one byte to the right, then
// interleave both with xmm0.
171 "movq xmm1, [{src}]",
172 "movq xmm2, [{src} + 1]",
173 "punpcklbw xmm1, xmm0",
174 "punpcklbw xmm2, xmm0",
177 "add {src}, {sstride}",
// Pack to bytes with unsigned saturation and store one 8-pixel output row.
181 "packuswb xmm1, xmm1",
182 "movq [{dst}], xmm1",
183 "add {dst}, {dstride}",
// Operands for the horizontal variant: weights are `8 - dx` and `dx`; every
// operand here is declared `inout` with its final value discarded.
186 src = inout(reg) src.as_ptr() => _,
187 sstride = inout(reg) sstride => _,
188 dst = inout(reg) dst.as_mut_ptr() => _,
189 dstride = inout(reg) dstride => _,
190 h = inout(reg) h => _,
191 a0 = inout(reg) i32::from(8 - dx) => _,
192 a1 = inout(reg) i32::from(dx) => _,
// Remaining case: 32-bit x86 builds call the scalar routine with width 8,
// while x86_64 builds use the asm that follows (it references xmm8).
201 #[cfg(target_arch = "x86")]
202 _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 8, h),
203 #[cfg(target_arch = "x86_64")]
// Two-dimensional variant. The `a1` register is overwritten with 0x20.
// NOTE(review): presumably the rounding term for the four-tap blend — confirm.
211 "mov {a1:e}, 0x0020",
// Broadcast the low word of xmm3..xmm7 to all eight word lanes.
213 "pshuflw xmm3, xmm3, 0",
214 "pshuflw xmm4, xmm4, 0",
215 "pshuflw xmm5, xmm5, 0",
216 "pshuflw xmm6, xmm6, 0",
217 "pshuflw xmm7, xmm7, 0",
218 "movlhps xmm3, xmm3",
219 "movlhps xmm4, xmm4",
220 "movlhps xmm5, xmm5",
221 "movlhps xmm6, xmm6",
222 "movlhps xmm7, xmm7",
// First source row: pixels into xmm8, their right-hand neighbours into xmm2,
// both interleaved with xmm0.
224 "movq xmm8, [{src}]",
225 "movq xmm2, [{src} + 1]",
226 "punpcklbw xmm8, xmm0",
227 "punpcklbw xmm2, xmm0",
230 "add {src}, {sstride}",
// Next source row: pixels into xmm1, right-hand neighbours into xmm2.
234 "movq xmm1, [{src}]",
235 "movq xmm2, [{src} + 1]",
236 "punpcklbw xmm1, xmm0",
237 "punpcklbw xmm2, xmm0",
240 "add {src}, {sstride}",
// Pack to bytes with unsigned saturation and store one 8-pixel output row.
250 "packuswb xmm1, xmm1",
251 "movq [{dst}], xmm1",
252 "add {dst}, {dstride}",
// Operands for the two-dimensional variant: `a0`/`a1` carry the `dx` weights,
// `b0`/`b1` the `dy` weights.
255 src = inout(reg) src.as_ptr() => _,
256 sstride = inout(reg) sstride => _,
257 dst = inout(reg) dst.as_mut_ptr() => _,
258 dstride = inout(reg) dstride => _,
259 h = inout(reg) h => _,
260 a0 = inout(reg) i32::from(8 - dx) => _,
261 a1 = inout(reg) i32::from(dx) => _,
262 b0 = inout(reg) i32::from(8 - dy) => _,
263 b1 = inout(reg) i32::from(dy) => _,
/// Chroma interpolation for blocks 4 pixels wide, written with inline SSE
/// assembly.
///
/// Four variants are visible: a straight copy, a vertical blend (weights
/// `8 - dy` and `dy`), a horizontal blend (weights `8 - dx` and `dx`) and a
/// combined two-dimensional blend. On 32-bit x86 the last case is delegated
/// to the scalar `chroma_interp` with a width of 4. Rows are moved with
/// 4-byte `movd` loads and stores.
// NOTE(review): the variant is presumably selected from `dx`/`dy`, and `h` is
// presumably the row counter consumed by the asm loops — confirm.
279 pub fn chroma_interp_4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
// Copy variant: two 4-byte rows are loaded and stored, then both pointers
// advance by two rows.
285 "movd xmm0, [{src}]",
286 "movd xmm1, [{src} + {sstride}]",
287 "movd [{dst}], xmm0",
288 "lea {src}, [{src} + {sstride} * 2]",
289 "movd [{dst} + {dstride}], xmm1",
290 "lea {dst}, [{dst} + {dstride} * 2]",
// Operands: the pointers and `h` are modified by the asm and their final
// values discarded (`=> _`); the strides are read-only inputs.
293 src = inout(reg) src.as_ptr() => _,
294 sstride = in(reg) sstride,
295 dst = inout(reg) dst.as_mut_ptr() => _,
296 dstride = in(reg) dstride,
297 h = inout(reg) h => _,
// Vertical variant. The `a1` register is overwritten with the constant 4
// (`:e` selects its 32-bit name).
// NOTE(review): presumably the rounding term — confirm.
307 "mov {a1:e}, 0x0004",
// Broadcast the low word of xmm3/xmm4/xmm5 across the four low word lanes,
// which is all a widened 4-pixel row occupies.
309 "pshuflw xmm3, xmm3, 0",
310 "pshuflw xmm4, xmm4, 0",
311 "pshuflw xmm5, xmm5, 0",
// Load 4 pixels of the first row, step to the next row, and interleave the
// bytes with xmm0.
// NOTE(review): xmm0 is presumably zeroed beforehand so this widens bytes to
// words — confirm.
312 "movd xmm6, [{src}]",
313 "add {src}, {sstride}",
314 "punpcklbw xmm6, xmm0",
// Load and widen the next source row.
317 "movd xmm2, [{src}]",
318 "punpcklbw xmm2, xmm0",
322 "add {src}, {sstride}",
// Narrow the words back to bytes with unsigned saturation, store 4 output
// pixels and move to the next destination row.
326 "packuswb xmm1, xmm1",
327 "movd [{dst}], xmm1",
328 "add {dst}, {dstride}",
// Operands for the vertical variant: weights are `8 - dy` and `dy`.
331 src = inout(reg) src.as_ptr() => _,
332 sstride = inout(reg) sstride => _,
333 dst = inout(reg) dst.as_mut_ptr() => _,
334 dstride = inout(reg) dstride => _,
335 h = inout(reg) h => _,
336 a0 = inout(reg) i32::from(8 - dy) => _,
337 a1 = inout(reg) i32::from(dy) => _,
// Horizontal variant: same constant load and broadcasts as the vertical one.
352 "mov {a1:e}, 0x0004",
354 "pshuflw xmm3, xmm3, 0",
355 "pshuflw xmm4, xmm4, 0",
356 "pshuflw xmm5, xmm5, 0",
// Load 4 pixels and the 4 pixels starting one byte to the right, then
// interleave both with xmm0.
358 "movd xmm1, [{src}]",
359 "movd xmm2, [{src} + 1]",
360 "punpcklbw xmm1, xmm0",
361 "punpcklbw xmm2, xmm0",
364 "add {src}, {sstride}",
// Pack to bytes with unsigned saturation and store one 4-pixel output row.
368 "packuswb xmm1, xmm1",
369 "movd [{dst}], xmm1",
370 "add {dst}, {dstride}",
// Operands for the horizontal variant: weights are `8 - dx` and `dx`.
373 src = inout(reg) src.as_ptr() => _,
374 sstride = inout(reg) sstride => _,
375 dst = inout(reg) dst.as_mut_ptr() => _,
376 dstride = inout(reg) dstride => _,
377 h = inout(reg) h => _,
378 a0 = inout(reg) i32::from(8 - dx) => _,
379 a1 = inout(reg) i32::from(dx) => _,
// Remaining case: 32-bit x86 builds call the scalar routine with width 4,
// while x86_64 builds use the asm that follows (it references xmm8).
388 #[cfg(target_arch = "x86")]
389 _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 4, h),
390 #[cfg(target_arch = "x86_64")]
// Two-dimensional variant. The `a1` register is overwritten with 0x20.
// NOTE(review): presumably the rounding term for the four-tap blend — confirm.
398 "mov {a1:e}, 0x0020",
// Broadcast the low word of xmm3..xmm7 across the four low word lanes.
400 "pshuflw xmm3, xmm3, 0",
401 "pshuflw xmm4, xmm4, 0",
402 "pshuflw xmm5, xmm5, 0",
403 "pshuflw xmm6, xmm6, 0",
404 "pshuflw xmm7, xmm7, 0",
// First source row: pixels into xmm8, their right-hand neighbours into xmm2,
// both interleaved with xmm0.
406 "movd xmm8, [{src}]",
407 "movd xmm2, [{src} + 1]",
408 "punpcklbw xmm8, xmm0",
409 "punpcklbw xmm2, xmm0",
412 "add {src}, {sstride}",
// Next source row: pixels into xmm1, right-hand neighbours into xmm2.
416 "movd xmm1, [{src}]",
417 "movd xmm2, [{src} + 1]",
418 "punpcklbw xmm1, xmm0",
419 "punpcklbw xmm2, xmm0",
422 "add {src}, {sstride}",
// Pack to bytes with unsigned saturation and store one 4-pixel output row.
432 "packuswb xmm1, xmm1",
433 "movd [{dst}], xmm1",
434 "add {dst}, {dstride}",
// Operands for the two-dimensional variant: `a0`/`a1` carry the `dx` weights,
// `b0`/`b1` the `dy` weights.
437 src = inout(reg) src.as_ptr() => _,
438 sstride = inout(reg) sstride => _,
439 dst = inout(reg) dst.as_mut_ptr() => _,
440 dstride = inout(reg) dstride => _,
441 h = inout(reg) h => _,
442 a0 = inout(reg) i32::from(8 - dx) => _,
443 a1 = inout(reg) i32::from(dx) => _,
444 b0 = inout(reg) i32::from(8 - dy) => _,
445 b1 = inout(reg) i32::from(dy) => _,
/// Two-tap weighted blend of pixels `a` and `b`:
/// `(a * b0 + b * b1 + 4) >> 3`, evaluated in `u16` and truncated to `u8`.
// NOTE(review): the result only fits in a byte when b0 + b1 == 8 — presumably
// guaranteed by the callers; confirm.
462 fn chr_interp2(a: u8, b: u8, b0: u16, b1: u16) -> u8 {
463 ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8
/// Four-tap weighted blend of a 2x2 pixel neighbourhood.
///
/// `a`/`b` are weighted by `a0 * b0` and `a1 * b0`, `c`/`d` by `a0 * b1` and
/// `a1 * b1`; the sum is rounded (+0x20), shifted right by 6 and truncated to
/// `u8`. All arithmetic is done in `u16`.
// NOTE(review): the result only fits in a byte when a0 + a1 == 8 and
// b0 + b1 == 8 — presumably guaranteed by the callers; confirm.
466 fn chr_interp4(a: u8, b: u8, c: u8, d: u8, a0: u16, a1: u16, b0: u16, b1: u16) -> u8 {
467 ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8
/// Chroma interpolation for blocks 2 pixels wide, written in plain Rust with
/// raw pointers.
///
/// Four variants are visible: a straight copy, a vertical blend via
/// `chr_interp2` with `b0`/`b1`, a horizontal blend via `chr_interp2` with
/// `a0`/`a1`, and a two-dimensional blend via `chr_interp4`. Each variant
/// handles two rows, then a further two rows.
// NOTE(review): presumably a0 = 8 - dx, a1 = dx, b0 = 8 - dy, b1 = dy, and the
// second pair of rows is only processed for taller blocks — confirm against
// the weight setup and the height checks.
470 pub fn chroma_interp_2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
// Taken when `a0` and `b0` are both 8: rows are copied through unchanged.
476 if a0 == 8 && b0 == 8 {
478 let mut src = src.as_ptr();
479 let mut dst = dst.as_mut_ptr();
// Two bytes per row; pointers step by their own stride between rows.
480 std::ptr::copy_nonoverlapping(src, dst, 2);
481 src = src.add(sstride);
482 dst = dst.add(dstride);
483 std::ptr::copy_nonoverlapping(src, dst, 2);
// Third and fourth rows.
485 src = src.add(sstride);
486 dst = dst.add(dstride);
487 std::ptr::copy_nonoverlapping(src, dst, 2);
488 src = src.add(sstride);
489 dst = dst.add(dstride);
490 std::ptr::copy_nonoverlapping(src, dst, 2);
// Vertical blend: `src1` is one row after `src0`, so each output pixel mixes
// a pixel with the one directly below it.
495 let mut src0 = src.as_ptr();
496 let mut src1 = src0.add(sstride);
497 let mut dst = dst.as_mut_ptr();
// Output rows 0 and 1, two pixels each.
498 *dst = chr_interp2(*src0, *src1, b0, b1);
499 *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
500 *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
501 *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
// Step all pointers two rows down and produce output rows 2 and 3.
503 src0 = src0.add(sstride * 2);
504 src1 = src1.add(sstride * 2);
505 dst = dst.add(dstride * 2);
506 *dst = chr_interp2(*src0, *src1, b0, b1);
507 *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
508 *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
509 *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
// Horizontal blend: three source pixels per row yield two outputs, each
// mixing a pixel with its right-hand neighbour.
514 let mut src = src.as_ptr();
515 let mut dst = dst.as_mut_ptr();
516 let (a, b, c) = (*src, *src.add(1), *src.add(2));
517 *dst = chr_interp2(a, b, a0, a1);
518 *dst.add(1) = chr_interp2(b, c, a0, a1);
519 let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
520 *dst.add(dstride) = chr_interp2(a, b, a0, a1);
521 *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
// Step two rows down and repeat for output rows 2 and 3.
523 src = src.add(sstride * 2);
524 dst = dst.add(dstride * 2);
525 let (a, b, c) = (*src, *src.add(1), *src.add(2));
526 *dst = chr_interp2(a, b, a0, a1);
527 *dst.add(1) = chr_interp2(b, c, a0, a1);
528 let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
529 *dst.add(dstride) = chr_interp2(a, b, a0, a1);
530 *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
// Two-dimensional blend: a 3x3 window of source pixels yields a 2x2 block of
// outputs.
536 let mut src0 = src.as_ptr();
537 let mut src1 = src0.add(sstride);
538 let mut dst = dst.as_mut_ptr();
// (a, b, c), (d, e, f) and (g, h, i) are source rows 0, 1 and 2.
// Note: the `h` bound here is a pixel value and shadows the `h` parameter
// from this point on.
540 let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
541 let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
542 let (g, h, i) = (*src1.add(sstride), *src1.add(sstride + 1), *src1.add(sstride + 2));
543 *dst = chr_interp4(a, b, d, e, a0, a1, b0, b1);
544 *dst.add(1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);
545 *dst.add(dstride) = chr_interp4(d, e, g, h, a0, a1, b0, b1);
546 *dst.add(dstride + 1) = chr_interp4(e, f, h, i, a0, a1, b0, b1);
// Advance three rows so `src0`/`src1` land on source rows 3 and 4; row 2 is
// still held in (g, h, i) and serves as the upper row for output row 2.
548 src0 = src0.add(sstride * 3);
549 src1 = src1.add(sstride * 3);
550 dst = dst.add(dstride * 2);
551 let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
552 let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
553 *dst = chr_interp4(g, h, a, b, a0, a1, b0, b1);
554 *dst.add(1) = chr_interp4(h, i, b, c, a0, a1, b0, b1);
555 *dst.add(dstride) = chr_interp4(a, b, d, e, a0, a1, b0, b1);
556 *dst.add(dstride + 1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);