diff --git a/nihav-itu/src/codecs/h264/dsp/mc/x86/chroma_mc.rs b/nihav-itu/src/codecs/h264/dsp/mc/x86/chroma_mc.rs
new file mode 100644
index 0000000..12ccc65
--- /dev/null
+++ b/nihav-itu/src/codecs/h264/dsp/mc/x86/chroma_mc.rs
@@ -0,0 +1,561 @@
+use std::arch::asm;
+
+#[cfg(target_arch = "x86")]
+fn chroma_interp(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, w: usize, h: usize) {
+    let a0 = 8 - dx;
+    let a1 = dx;
+    let b0 = 8 - dy;
+    let b1 = dy;
+
+    if a0 == 8 && b0 == 8 {
+        unsafe {
+            let mut src = src.as_ptr();
+            let mut dst = dst.as_mut_ptr();
+            for _ in 0..h {
+                std::ptr::copy_nonoverlapping(src, dst, w);
+                src = src.add(sstride);
+                dst = dst.add(dstride);
+            }
+        }
+    } else if a0 == 8 {
+        unsafe {
+            let mut src0 = src.as_ptr();
+            let mut src1 = src0.add(sstride);
+            let mut dst = dst.as_mut_ptr();
+            for _ in 0..h {
+                for x in 0..w {
+                    let a = *src0.add(x);
+                    let b = *src1.add(x);
+                    *dst.add(x) = ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8;
+                }
+                src0 = src0.add(sstride);
+                src1 = src1.add(sstride);
+                dst = dst.add(dstride);
+            }
+        }
+    } else if b0 == 8 {
+        unsafe {
+            let mut src = src.as_ptr();
+            let mut dst = dst.as_mut_ptr();
+            for _ in 0..h {
+                let mut a = *src;
+                for x in 0..w {
+                    let b = *src.add(x + 1);
+                    *dst.add(x) = ((u16::from(a) * a0 + u16::from(b) * a1 + 4) >> 3) as u8;
+                    a = b;
+                }
+                src = src.add(sstride);
+                dst = dst.add(dstride);
+            }
+        }
+    } else {
+        unsafe {
+            let mut src0 = src.as_ptr();
+            let mut src1 = src0.add(sstride);
+            let mut dst = dst.as_mut_ptr();
+            for _ in 0..h {
+                let mut a = *src0;
+                let mut c = *src1;
+                for x in 0..w {
+                    let b = *src0.add(x + 1);
+                    let d = *src1.add(x + 1);
+                    *dst.add(x) = ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8;
+                    a = b;
+                    c = d;
+                }
+                src0 = src0.add(sstride);
+                src1 = src1.add(sstride);
+                dst = dst.add(dstride);
+            }
+        }
+    }
+}
+
+pub fn chroma_interp_8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
+    unsafe {
+        match (dx, dy) {
+            (0, 0) => {
+                asm!(
+                    "lea {stmp}, [{src} + {sstride} * 2]",
+                    "lea {dtmp}, [{dst} + {dstride} * 2]",
+                    "2:",
+                    "movq xmm0, [{src}]",
+                    "movq xmm1, [{src} + {sstride}]",
+                    "movq xmm2, [{stmp}]",
+                    "movq xmm3, [{stmp} + {sstride}]",
+                    "movq [{dst}], xmm0",
+                    "lea {src}, [{src} + {sstride} * 4]",
+                    "movq [{dst} + {dstride}], xmm1",
+                    "lea {stmp}, [{stmp} + {sstride} * 4]",
+                    "movq [{dtmp}], xmm2",
+                    "lea {dst}, [{dst} + {dstride} * 4]",
+                    "movq [{dtmp} + {dstride}], xmm3",
+                    "lea {dtmp}, [{dtmp} + {dstride} * 4]",
+                    "sub {h}, 4",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = in(reg) sstride,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = in(reg) dstride,
+                    h = inout(reg) h => _,
+                    stmp = out(reg) _,
+                    dtmp = out(reg) _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                    out("xmm2") _,
+                    out("xmm3") _,
+                );
+            },
+            (0, _) => {
+                asm!(
+                    "pxor xmm0, xmm0",
+                    "movd xmm3, {a0:e}",
+                    "movd xmm4, {a1:e}",
+                    "mov {a1:e}, 0x0004",
+                    "movd xmm5, {a1:e}",
+                    "pshuflw xmm3, xmm3, 0",
+                    "pshuflw xmm4, xmm4, 0",
+                    "pshuflw xmm5, xmm5, 0",
+                    "movlhps xmm3, xmm3",
+                    "movlhps xmm4, xmm4",
+                    "movlhps xmm5, xmm5",
+                    "movq xmm6, [{src}]",
+                    "add {src}, {sstride}",
+                    "punpcklbw xmm6, xmm0",
+                    "2:",
+                    "movaps xmm1, xmm6",
+                    "movq xmm2, [{src}]",
+                    "punpcklbw xmm2, xmm0",
+                    "movaps xmm6, xmm2",
+                    "pmullw xmm1, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm1, xmm2",
+                    "paddw xmm1, xmm5",
+                    "psraw xmm1, 3",
+                    "packuswb xmm1, xmm1",
+                    "movq [{dst}], xmm1",
+                    "add {dst}, {dstride}",
+                    "dec {h}",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = in(reg) sstride,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = in(reg) dstride,
+                    h = inout(reg) h => _,
+                    a0 = in(reg) i32::from(8 - dy),
+                    a1 = inout(reg) i32::from(dy) => _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                    out("xmm2") _,
+                    out("xmm3") _,
+                    out("xmm4") _,
+                    out("xmm5") _,
+                    out("xmm6") _,
+                );
+            },
+            (_, 0) => {
+                asm!(
+                    "pxor xmm0, xmm0",
+                    "movd xmm3, {a0:e}",
+                    "movd xmm4, {a1:e}",
+                    "mov {a1:e}, 0x0004",
+                    "movd xmm5, {a1:e}",
+                    "pshuflw xmm3, xmm3, 0",
+                    "pshuflw xmm4, xmm4, 0",
+                    "pshuflw xmm5, xmm5, 0",
+                    "movlhps xmm3, xmm3",
+                    "movlhps xmm4, xmm4",
+                    "movlhps xmm5, xmm5",
+                    "2:",
+                    "movq xmm1, [{src}]",
+                    "movq xmm2, [{src} + 1]",
+                    "punpcklbw xmm1, xmm0",
+                    "punpcklbw xmm2, xmm0",
+                    "pmullw xmm1, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm1, xmm2",
+                    "paddw xmm1, xmm5",
+                    "psraw xmm1, 3",
+                    "packuswb xmm1, xmm1",
+                    "movq [{dst}], xmm1",
+                    "add {dst}, {dstride}",
+                    "dec {h}",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = inout(reg) sstride => _,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = inout(reg) dstride => _,
+                    h = inout(reg) h => _,
+                    a0 = inout(reg) i32::from(8 - dx) => _,
+                    a1 = inout(reg) i32::from(dx) => _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                    out("xmm2") _,
+                    out("xmm3") _,
+                    out("xmm4") _,
+                    out("xmm5") _,
+                );
+            },
+            #[cfg(target_arch = "x86")]
+            _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 8, h),
+            #[cfg(target_arch = "x86_64")]
+            _ => {
+                asm!(
+                    "pxor xmm0, xmm0",
+                    "movd xmm3, {a0:e}",
+                    "movd xmm4, {a1:e}",
+                    "movd xmm5, {b0:e}",
+                    "movd xmm6, {b1:e}",
+                    "mov {a1:e}, 0x0020",
+                    "movd xmm7, {a1:e}",
+                    "pshuflw xmm3, xmm3, 0",
+                    "pshuflw xmm4, xmm4, 0",
+                    "pshuflw xmm5, xmm5, 0",
+                    "pshuflw xmm6, xmm6, 0",
+                    "pshuflw xmm7, xmm7, 0",
+                    "movlhps xmm3, xmm3",
+                    "movlhps xmm4, xmm4",
+                    "movlhps xmm5, xmm5",
+                    "movlhps xmm6, xmm6",
+                    "movlhps xmm7, xmm7",
+
+                    "movq xmm8, [{src}]",
+                    "movq xmm2, [{src} + 1]",
+                    "punpcklbw xmm8, xmm0",
+                    "punpcklbw xmm2, xmm0",
+                    "pmullw xmm8, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm8, xmm2",
+
+                    "2:",
+                    "movq xmm1, [{src}]",
+                    "movq xmm2, [{src} + 1]",
+                    "punpcklbw xmm1, xmm0",
+                    "punpcklbw xmm2, xmm0",
+                    "pmullw xmm1, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm1, xmm2",
+                    "movaps xmm2, xmm8",
+                    "movaps xmm8, xmm1",
+
+                    "pmullw xmm1, xmm6",
+                    "pmullw xmm2, xmm5",
+                    "paddw xmm1, xmm2",
+                    "paddw xmm1, xmm7",
+                    "psraw xmm1, 6",
+                    "packuswb xmm1, xmm1",
+                    "movq [{dst}], xmm1",
+                    "add {dst}, {dstride}",
+                    "dec {h}",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = inout(reg) sstride => _,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = inout(reg) dstride => _,
+                    h = inout(reg) h => _,
+                    a0 = inout(reg) i32::from(8 - dx) => _,
+                    a1 = inout(reg) i32::from(dx) => _,
+                    b0 = inout(reg) i32::from(8 - dy) => _,
+                    b1 = inout(reg) i32::from(dy) => _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                    out("xmm2") _,
+                    out("xmm3") _,
+                    out("xmm4") _,
+                    out("xmm5") _,
+                    out("xmm6") _,
+                    out("xmm7") _,
+                    out("xmm8") _,
+                );
+            },
+        };
+    }
+}
+
+pub fn chroma_interp_4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
+    unsafe {
+        match (dx, dy) {
+            (0, 0) => {
+                asm!(
+                    "2:",
+                    "movd xmm0, [{src}]",
+                    "movd xmm1, [{src} + {sstride}]",
+                    "movd [{dst}], xmm0",
+                    "lea {src}, [{src} + {sstride} * 2]",
+                    "movd [{dst} + {dstride}], xmm1",
+                    "lea {dst}, [{dst} + {dstride} * 2]",
+                    "sub {h}, 2",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = in(reg) sstride,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = in(reg) dstride,
+                    h = inout(reg) h => _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                );
+            },
+            (0, _) => {
+                asm!(
+                    "pxor xmm0, xmm0",
+                    "movd xmm3, {a0:e}",
+                    "movd xmm4, {a1:e}",
+                    "mov {a1:e}, 0x0004",
+                    "movd xmm5, {a1:e}",
+                    "pshuflw xmm3, xmm3, 0",
+                    "pshuflw xmm4, xmm4, 0",
+                    "pshuflw xmm5, xmm5, 0",
+                    "movd xmm6, [{src}]",
+                    "add {src}, {sstride}",
+                    "punpcklbw xmm6, xmm0",
+                    "2:",
+                    "movaps xmm1, xmm6",
+                    "movd xmm2, [{src}]",
+                    "punpcklbw xmm2, xmm0",
+                    "movaps xmm6, xmm2",
+                    "pmullw xmm1, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm1, xmm2",
+                    "paddw xmm1, xmm5",
+                    "psraw xmm1, 3",
+                    "packuswb xmm1, xmm1",
+                    "movd [{dst}], xmm1",
+                    "add {dst}, {dstride}",
+                    "dec {h}",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = inout(reg) sstride => _,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = inout(reg) dstride => _,
+                    h = inout(reg) h => _,
+                    a0 = inout(reg) i32::from(8 - dy) => _,
+                    a1 = inout(reg) i32::from(dy) => _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                    out("xmm2") _,
+                    out("xmm3") _,
+                    out("xmm4") _,
+                    out("xmm5") _,
+                    out("xmm6") _,
+                );
+            },
+            (_, 0) => {
+                asm!(
+                    "pxor xmm0, xmm0",
+                    "movd xmm3, {a0:e}",
+                    "movd xmm4, {a1:e}",
+                    "mov {a1:e}, 0x0004",
+                    "movd xmm5, {a1:e}",
+                    "pshuflw xmm3, xmm3, 0",
+                    "pshuflw xmm4, xmm4, 0",
+                    "pshuflw xmm5, xmm5, 0",
+                    "2:",
+                    "movd xmm1, [{src}]",
+                    "movd xmm2, [{src} + 1]",
+                    "punpcklbw xmm1, xmm0",
+                    "punpcklbw xmm2, xmm0",
+                    "pmullw xmm1, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm1, xmm2",
+                    "paddw xmm1, xmm5",
+                    "psraw xmm1, 3",
+                    "packuswb xmm1, xmm1",
+                    "movd [{dst}], xmm1",
+                    "add {dst}, {dstride}",
+                    "dec {h}",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = inout(reg) sstride => _,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = inout(reg) dstride => _,
+                    h = inout(reg) h => _,
+                    a0 = inout(reg) i32::from(8 - dx) => _,
+                    a1 = inout(reg) i32::from(dx) => _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                    out("xmm2") _,
+                    out("xmm3") _,
+                    out("xmm4") _,
+                    out("xmm5") _,
+                );
+            },
+            #[cfg(target_arch = "x86")]
+            _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 4, h),
+            #[cfg(target_arch = "x86_64")]
+            _ => {
+                asm!(
+                    "pxor xmm0, xmm0",
+                    "movd xmm3, {a0:e}",
+                    "movd xmm4, {a1:e}",
+                    "movd xmm5, {b0:e}",
+                    "movd xmm6, {b1:e}",
+                    "mov {a1:e}, 0x0020",
+                    "movd xmm7, {a1:e}",
+                    "pshuflw xmm3, xmm3, 0",
+                    "pshuflw xmm4, xmm4, 0",
+                    "pshuflw xmm5, xmm5, 0",
+                    "pshuflw xmm6, xmm6, 0",
+                    "pshuflw xmm7, xmm7, 0",
+
+                    "movd xmm8, [{src}]",
+                    "movd xmm2, [{src} + 1]",
+                    "punpcklbw xmm8, xmm0",
+                    "punpcklbw xmm2, xmm0",
+                    "pmullw xmm8, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm8, xmm2",
+
+                    "2:",
+                    "movd xmm1, [{src}]",
+                    "movd xmm2, [{src} + 1]",
+                    "punpcklbw xmm1, xmm0",
+                    "punpcklbw xmm2, xmm0",
+                    "pmullw xmm1, xmm3",
+                    "pmullw xmm2, xmm4",
+                    "add {src}, {sstride}",
+                    "paddw xmm1, xmm2",
+                    "movaps xmm2, xmm8",
+                    "movaps xmm8, xmm1",
+
+                    "pmullw xmm1, xmm6",
+                    "pmullw xmm2, xmm5",
+                    "paddw xmm1, xmm2",
+                    "paddw xmm1, xmm7",
+                    "psraw xmm1, 6",
+                    "packuswb xmm1, xmm1",
+                    "movd [{dst}], xmm1",
+                    "add {dst}, {dstride}",
+                    "dec {h}",
+                    "jnz 2b",
+                    src = inout(reg) src.as_ptr() => _,
+                    sstride = inout(reg) sstride => _,
+                    dst = inout(reg) dst.as_mut_ptr() => _,
+                    dstride = inout(reg) dstride => _,
+                    h = inout(reg) h => _,
+                    a0 = inout(reg) i32::from(8 - dx) => _,
+                    a1 = inout(reg) i32::from(dx) => _,
+                    b0 = inout(reg) i32::from(8 - dy) => _,
+                    b1 = inout(reg) i32::from(dy) => _,
+                    out("xmm0") _,
+                    out("xmm1") _,
+                    out("xmm2") _,
+                    out("xmm3") _,
+                    out("xmm4") _,
+                    out("xmm5") _,
+                    out("xmm6") _,
+                    out("xmm7") _,
+                    out("xmm8") _,
+                );
+            },
+        };
+    }
+}
+
+#[inline]
+fn chr_interp2(a: u8, b: u8, b0: u16, b1: u16) -> u8 {
+    ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8
+}
+#[inline]
+fn chr_interp4(a: u8, b: u8, c: u8, d: u8, a0: u16, a1: u16, b0: u16, b1: u16) -> u8 {
+    ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8
+}
+
+pub fn chroma_interp_2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
+    let a0 = 8 - dx;
+    let a1 = dx;
+    let b0 = 8 - dy;
+    let b1 = dy;
+
+    if a0 == 8 && b0 == 8 {
+        unsafe {
+            let mut src = src.as_ptr();
+            let mut dst = dst.as_mut_ptr();
+            std::ptr::copy_nonoverlapping(src, dst, 2);
+            src = src.add(sstride);
+            dst = dst.add(dstride);
+            std::ptr::copy_nonoverlapping(src, dst, 2);
+            if h == 4 {
+                src = src.add(sstride);
+                dst = dst.add(dstride);
+                std::ptr::copy_nonoverlapping(src, dst, 2);
+                src = src.add(sstride);
+                dst = dst.add(dstride);
+                std::ptr::copy_nonoverlapping(src, dst, 2);
+            }
+        }
+    } else if a0 == 8 {
+        unsafe {
+            let mut src0 = src.as_ptr();
+            let mut src1 = src0.add(sstride);
+            let mut dst = dst.as_mut_ptr();
+            *dst = chr_interp2(*src0, *src1, b0, b1);
+            *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
+            *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
+            *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
+            if h == 4 {
+                src0 = src0.add(sstride * 2);
+                src1 = src1.add(sstride * 2);
+                dst = dst.add(dstride * 2);
+                *dst = chr_interp2(*src0, *src1, b0, b1);
+                *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
+                *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
+                *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
+            }
+        }
+    } else if b0 == 8 {
+        unsafe {
+            let mut src = src.as_ptr();
+            let mut dst = dst.as_mut_ptr();
+            let (a, b, c) = (*src, *src.add(1), *src.add(2));
+            *dst = chr_interp2(a, b, a0, a1);
+            *dst.add(1) = chr_interp2(b, c, a0, a1);
+            let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
+            *dst.add(dstride) = chr_interp2(a, b, a0, a1);
+            *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
+            if h == 4 {
+                src = src.add(sstride * 2);
+                dst = dst.add(dstride * 2);
+                let (a, b, c) = (*src, *src.add(1), *src.add(2));
+                *dst = chr_interp2(a, b, a0, a1);
+                *dst.add(1) = chr_interp2(b, c, a0, a1);
+                let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
+                *dst.add(dstride) = chr_interp2(a, b, a0, a1);
+                *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
+            }
+        }
+    } else {
+        unsafe {
+            let height = h;
+            let mut src0 = src.as_ptr();
+            let mut src1 = src0.add(sstride);
+            let mut dst = dst.as_mut_ptr();
+
+            let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
+            let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
+            let (g, h, i) = (*src1.add(sstride), *src1.add(sstride + 1), *src1.add(sstride + 2));
+            *dst = chr_interp4(a, b, d, e, a0, a1, b0, b1);
+            *dst.add(1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);
+            *dst.add(dstride) = chr_interp4(d, e, g, h, a0, a1, b0, b1);
+            *dst.add(dstride + 1) = chr_interp4(e, f, h, i, a0, a1, b0, b1);
+            if height == 4 {
+                src0 = src0.add(sstride * 3);
+                src1 = src1.add(sstride * 3);
+                dst = dst.add(dstride * 2);
+                let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
+                let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
+                *dst = chr_interp4(g, h, a, b, a0, a1, b0, b1);
+                *dst.add(1) = chr_interp4(h, i, b, c, a0, a1, b0, b1);
+                *dst.add(dstride) = chr_interp4(a, b, d, e, a0, a1, b0, b1);
+                *dst.add(dstride + 1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);
+            }
+        }
+    }
+}
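
Note: every path above specializes the single bilinear chroma prediction formula from H.264 subclause 8.4.2.2.2, ((8-dx)*(8-dy)*A + dx*(8-dy)*B + (8-dx)*dy*C + dx*dy*D + 32) >> 6. The copy, vertical-only, and horizontal-only arms are bit-exact shortcuts because the 3-bit rounding form ((w0*P0 + w1*P1 + 4) >> 3) equals the 6-bit form whenever one weight pair collapses to (8, 0). That makes the SIMD arms easy to verify against a scalar reference. Below is a minimal test sketch of such a check; everything in it other than chroma_interp_8 (the tests module, the ref_chroma helper, the stride, and the fill pattern) is hypothetical scaffolding invented for illustration, and it only compiles on the x86/x86_64 targets this file is built for.

#[cfg(test)]
mod tests {
    use super::chroma_interp_8;

    // Hypothetical scalar reference implementing the spec formula directly:
    // ((8-dx)(8-dy)A + dx(8-dy)B + (8-dx)dy*C + dx*dy*D + 32) >> 6
    fn ref_chroma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize,
                  dx: u16, dy: u16, w: usize, h: usize) {
        for y in 0..h {
            for x in 0..w {
                let a = u16::from(src[x     +  y      * sstride]);
                let b = u16::from(src[x + 1 +  y      * sstride]);
                let c = u16::from(src[x     + (y + 1) * sstride]);
                let d = u16::from(src[x + 1 + (y + 1) * sstride]);
                // Max sum is 64 * 255 + 32, so u16 arithmetic cannot overflow.
                dst[x + y * dstride] = (((8 - dx) * (8 - dy) * a
                                       + dx * (8 - dy) * b
                                       + (8 - dx) * dy * c
                                       + dx * dy * d + 0x20) >> 6) as u8;
            }
        }
    }

    #[test]
    fn chroma_mc8_matches_reference() {
        const STRIDE: usize = 32; // arbitrary stride wide enough for the 9x9 read area
        let mut src = [0u8; STRIDE * 16];
        for (i, el) in src.iter_mut().enumerate() {
            *el = (i * 7 + 3) as u8; // arbitrary deterministic pattern
        }
        // Exercise all 64 fractional phases; h = 8 satisfies the
        // multiple-of-4 assumption of the (0, 0) copy loop.
        for dy in 0u16..8 {
            for dx in 0u16..8 {
                let mut simd = [0u8; STRIDE * 8];
                let mut refd = [0u8; STRIDE * 8];
                chroma_interp_8(&mut simd, STRIDE, &src, STRIDE, dx, dy, 8);
                ref_chroma(&mut refd, STRIDE, &src, STRIDE, dx, dy, 8, 8);
                assert_eq!(&simd[..], &refd[..], "mismatch at dx={} dy={}", dx, dy);
            }
        }
    }
}

The same harness extends to chroma_interp_4 and chroma_interp_2 by shrinking w and using the block heights those functions assume (h a multiple of 2, and h == 2 or 4 respectively).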