[features]
default = ["all_decoders"]
+simd = [] # enable once the minimum supported rustc is >= 1.62
all_decoders = ["all_video_decoders"]
decoders = []
use nihav_codec_support::codecs::MV;
use nihav_codec_support::codecs::blockdsp::*;
-#[cfg(not(debug_assertions))]
-mod release;
-#[cfg(not(debug_assertions))]
-use release::*;
-#[cfg(debug_assertions)]
-mod debug;
-#[cfg(debug_assertions)]
-use debug::*;
+macro_rules! module_selector {
+ ($( ($cond:meta, $module:ident) ),*) => {
+ module_selector!(list; r#false; $(($cond, $module)),*);
+ };
+ (list; $nocond:meta; ($ccar:meta, $carmod:ident), $(($condcdr:meta, $cdrmod:ident)),*) => {
+ module_selector!(single; $nocond; $ccar; $carmod);
+ module_selector!(list; any($nocond, $ccar); $(($condcdr, $cdrmod)),*);
+ };
+ (list; $nocond:meta; ($yescond:meta, $module:ident)) => {
+ module_selector!(single; $nocond; $yescond; $module);
+ };
+ (list; $_:meta; ) => {};
+ (single; $nocond:meta; $yescond:meta; $module:ident) => {
+ #[cfg(all(not($nocond), $yescond))]
+ mod $module;
+ #[cfg(all(not($nocond), $yescond))]
+ use $module::*;
+ };
+}
+
+module_selector! (
+ (all(feature = "simd", target_arch = "x86_64"), x86),
+ (debug_assertions, debug),
+ (not(debug_assertions), release)
+);
type MCFunc = fn (dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize);
fn register_simd(&mut self);
}
+#[allow(clippy::type_complexity)]
pub struct H264MC {
avg_buf: NAVideoBufferRef<u8>,
pub put_block_weighted: [fn (dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]); 4],
let (ysrc, ystride) = if (src_x - pre < 0) || (src_x + (w as isize) + post > (yw as isize)) || (src_y - pre < 0) || (src_y + (h as isize) + post > (yh as isize)) {
let add = (pre + post) as usize;
edge_emu(&refpic, src_x - pre, src_y - pre, w + add, h + add, &mut ebuf, 22, 0, 0);
- (ebuf.as_slice(), 22)
+ (&ebuf[..], 22)
} else {
(&src[refpic.get_offset(0) + ((src_x - pre) as usize) + ((src_y - pre) as usize) * systride..], systride)
};
--- /dev/null
+use std::arch::asm;
+
+macro_rules! avg_template {
+ ($name: ident, $mov: expr) => {
+ pub fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
+ unsafe {
+ asm!(
+ "2:",
+ concat!($mov, " xmm1, [{src}]"),
+ concat!($mov, " xmm3, [{src} + {sstride}]"),
+ concat!($mov, " xmm0, [{dst}]"),
+ concat!($mov, " xmm2, [{dst} + {dstride}]"),
+ "lea {src}, [{src} + {sstride} * 2]",
+ "pavgb xmm0, xmm1",
+ "pavgb xmm2, xmm3",
+ concat!($mov, " [{dst}], xmm0"),
+ concat!($mov, " [{dst} + {dstride}], xmm2"),
+ "lea {dst}, [{dst} + {dstride} * 2]",
+ "sub {h}, 2",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = in(reg) sstride,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = in(reg) dstride,
+ h = inout(reg) bh => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ );
+ }
+ }
+ }
+}
+
+avg_template!(avg_4, "movd");
+avg_template!(avg_8, "movq");
+
+pub fn avg_16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
+ unsafe {
+ asm!(
+ "lea {stmp}, [{src} + {sstride} * 2]",
+ "lea {dtmp}, [{dst} + {dstride} * 2]",
+ "2:",
+ "movaps xmm0, [{src}]",
+ "movaps xmm1, [{src} + {sstride}]",
+ "movaps xmm2, [{stmp}]",
+ "movaps xmm3, [{stmp} + {sstride}]",
+ "pavgb xmm0, [{dst}]",
+ "pavgb xmm1, [{dst} + {dstride}]",
+ "pavgb xmm2, [{dtmp}]",
+ "pavgb xmm3, [{dtmp} + {dstride}]",
+ "lea {src}, [{src} + {sstride} * 4]",
+ "movaps [{dst}], xmm0",
+ "lea {stmp}, [{stmp} + {sstride} * 4]",
+ "movaps [{dst} + {dstride}], xmm1",
+ "lea {dst}, [{dst} + {dstride} * 4]",
+ "movaps [{dtmp}], xmm2",
+ "movaps [{dtmp} + {dstride}], xmm3",
+ "lea {dtmp}, [{dtmp} + {dstride} * 4]",
+ "sub {h}, 4",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = in(reg) sstride,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = in(reg) dstride,
+ h = inout(reg) bh => _,
+ stmp = out(reg) _,
+ dtmp = out(reg) _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ );
+ }
+}
+
+macro_rules! put_block_weighted {
+ ($func:ident, $width:expr, $load:expr, $store:expr) => {
+ pub fn $func(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) {
+ if wparams == [1, 0, 0] {
+ for (dst, src) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) {
+ dst[..$width].copy_from_slice(&src[..$width]);
+ }
+ } else {
+ let weight = i32::from(wparams[0]);
+ let offset = i32::from(wparams[1]);
+ let wshift = i32::from(wparams[2]);
+ let bias = (1 << wshift) >> 1;
+
+ unsafe {
+ asm!(
+ "xorps xmm0, xmm0",
+ "movd xmm1, {weight:e}",
+ "movd xmm2, {offset:e}",
+ "movd xmm3, {wshift:e}",
+ "movd xmm4, {bias:e}",
+ "pshuflw xmm1, xmm1, 0",
+ "pshuflw xmm2, xmm2, 0",
+ "pshuflw xmm4, xmm4, 0",
+ "movlhps xmm1, xmm1",
+ "movlhps xmm2, xmm2",
+ "movlhps xmm4, xmm4",
+ "2:",
+ concat!($load, " xmm5, [{src}]"),
+ "add {src}, 16",
+ "movaps xmm7, xmm5",
+ "punpcklbw xmm5, xmm0",
+ "punpckhbw xmm7, xmm0",
+ "pmullw xmm5, xmm1",
+ "pmullw xmm7, xmm1",
+ "paddw xmm5, xmm4",
+ "paddw xmm7, xmm4",
+ "psraw xmm5, xmm3",
+ "psraw xmm7, xmm3",
+ "paddw xmm5, xmm2",
+ "paddw xmm7, xmm2",
+ "packuswb xmm5, xmm7",
+ concat!($store, " [{dst}], xmm5"),
+ "add {dst}, {stride}",
+ "dec {h}",
+ "jnz 2b",
+ h = inout(reg) h => _,
+ src = inout(reg) src.as_ptr() => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ stride = in(reg) stride,
+ weight = in(reg) weight,
+ offset = in(reg) offset,
+ wshift = in(reg) wshift,
+ bias = in(reg) bias,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ out("xmm7") _,
+ );
+ }
+ }
+ }
+ }
+}
+
+put_block_weighted!(put_block_weighted_16, 16, "movups", "movaps");
+put_block_weighted!(put_block_weighted_8, 8, "movq", "movq");
+put_block_weighted!(put_block_weighted_4, 4, "movd", "movd");
+
+macro_rules! put_block_weighted2 {
+ ($func:ident, $mov:expr) => {
+ pub fn $func(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
+ if wparams == [1, 0, 1, 0, 0] {
+ unsafe {
+ asm!(
+ "2:",
+ concat!($mov, " xmm0, [{src0}]"),
+ concat!($mov, " xmm1, [{src1}]"),
+ "add {src0}, 16",
+ "pavgb xmm0, xmm1",
+ "add {src1}, 16",
+ concat!($mov, " [{dst}], xmm0"),
+ "add {dst}, {stride}",
+ "dec {h}",
+ "jnz 2b",
+ src0 = inout(reg) src0.as_ptr() => _,
+ src1 = inout(reg) src1.as_ptr() => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ stride = in(reg) stride,
+ h = inout(reg) h => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ );
+ }
+ return;
+ }
+ let weight0 = i32::from(wparams[0]);
+ let offset0 = i32::from(wparams[1]);
+ let weight1 = i32::from(wparams[2]);
+ let offset1 = i32::from(wparams[3]);
+ let wshift = i32::from(wparams[4]) + 1;
+ let offset = (offset0 + offset1 + 1) >> 1;
+ let bias = (1 << wshift) >> 1;
+
+ unsafe {
+ asm!(
+ "xorps xmm0, xmm0",
+ "movd xmm1, {weight0:e}",
+ "movd xmm2, {weight1:e}",
+ "movd xmm3, {offset:e}",
+ "movd xmm4, {wshift:e}",
+ "movd xmm5, {bias:e}",
+ "pshuflw xmm1, xmm1, 0",
+ "pshuflw xmm2, xmm2, 0",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "movlhps xmm1, xmm1",
+ "movlhps xmm2, xmm2",
+ "movlhps xmm3, xmm3",
+ "movlhps xmm5, xmm5",
+ "2:",
+ concat!($mov, " xmm6, [{src0}]"),
+ "add {src0}, 16",
+ concat!($mov, " xmm7, [{src1}]"),
+ "add {src1}, 16",
+ "punpcklbw xmm6, xmm0",
+ "punpcklbw xmm7, xmm0",
+ "pmullw xmm6, xmm1",
+ "pmullw xmm7, xmm2",
+ "paddw xmm6, xmm5",
+ "paddw xmm6, xmm7",
+ "psraw xmm6, xmm4",
+ "paddw xmm6, xmm3",
+ "movhlps xmm7, xmm6",
+ "packuswb xmm6, xmm7",
+ concat!($mov, " [{dst}], xmm6"),
+ "add {dst}, {stride}",
+ "dec {h}",
+ "jnz 2b",
+ h = inout(reg) h => _,
+ src0 = inout(reg) src0.as_ptr() => _,
+ src1 = inout(reg) src1.as_ptr() => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ stride = in(reg) stride,
+ weight0 = in(reg) weight0,
+ weight1 = in(reg) weight1,
+ offset = in(reg) offset,
+ wshift = in(reg) wshift,
+ bias = in(reg) bias,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ out("xmm6") _,
+ out("xmm7") _,
+ );
+ }
+ }
+ }
+}
+
+put_block_weighted2!(put_block_weighted2_8, "movq");
+put_block_weighted2!(put_block_weighted2_4, "movd");
+
+pub fn put_block_weighted2_16(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
+ if wparams == [1, 0, 1, 0, 0] {
+ unsafe {
+ asm!(
+ "2:",
+ "movups xmm0, [{src0}]",
+ "movups xmm1, [{src1}]",
+ "add {src0}, 16",
+ "pavgb xmm0, xmm1",
+ "add {src1}, 16",
+ "movaps [{dst}], xmm0",
+ "add {dst}, {stride}",
+ "dec {h}",
+ "jnz 2b",
+ src0 = inout(reg) src0.as_ptr() => _,
+ src1 = inout(reg) src1.as_ptr() => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ stride = in(reg) stride,
+ h = inout(reg) h => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ );
+ }
+ return;
+ }
+ let weight0 = i32::from(wparams[0]);
+ let offset0 = i32::from(wparams[1]);
+ let weight1 = i32::from(wparams[2]);
+ let offset1 = i32::from(wparams[3]);
+ let wshift = i32::from(wparams[4]) + 1;
+ let offset = (offset0 + offset1 + 1) >> 1;
+ let bias = (1 << wshift) >> 1;
+
+ unsafe {
+ asm!(
+ "xorps xmm0, xmm0",
+ "movd xmm1, {weight0:e}",
+ "movd xmm2, {weight1:e}",
+ "movd xmm3, {offset:e}",
+ "movd xmm4, {wshift:e}",
+ "movd xmm5, {bias:e}",
+ "pshuflw xmm1, xmm1, 0",
+ "pshuflw xmm2, xmm2, 0",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "movlhps xmm1, xmm1",
+ "movlhps xmm2, xmm2",
+ "movlhps xmm3, xmm3",
+ "movlhps xmm5, xmm5",
+ "2:",
+ "movq xmm6, [{src0}]",
+ "movq xmm7, [{src1}]",
+ "punpcklbw xmm6, xmm0",
+ "punpcklbw xmm7, xmm0",
+ "pmullw xmm6, xmm1",
+ "pmullw xmm7, xmm2",
+ "paddw xmm6, xmm5",
+ "paddw xmm6, xmm7",
+ "psraw xmm6, xmm4",
+ "paddw xmm6, xmm3",
+ "movhlps xmm7, xmm6",
+ "packuswb xmm6, xmm7",
+ "movq [{dst}], xmm6",
+ "movq xmm6, [{src0} + 8]",
+ "add {src0}, 16",
+ "movq xmm7, [{src1} + 8]",
+ "add {src1}, 16",
+ "punpcklbw xmm6, xmm0",
+ "punpcklbw xmm7, xmm0",
+ "pmullw xmm6, xmm1",
+ "pmullw xmm7, xmm2",
+ "paddw xmm6, xmm5",
+ "paddw xmm6, xmm7",
+ "psraw xmm6, xmm4",
+ "paddw xmm6, xmm3",
+ "movhlps xmm7, xmm6",
+ "packuswb xmm6, xmm7",
+ "movq [{dst} + 8], xmm6",
+ "add {dst}, {stride}",
+ "dec {h}",
+ "jnz 2b",
+ h = inout(reg) h => _,
+ src0 = inout(reg) src0.as_ptr() => _,
+ src1 = inout(reg) src1.as_ptr() => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ stride = in(reg) stride,
+ weight0 = in(reg) weight0,
+ weight1 = in(reg) weight1,
+ offset = in(reg) offset,
+ wshift = in(reg) wshift,
+ bias = in(reg) bias,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ out("xmm6") _,
+ out("xmm7") _,
+ );
+ }
+}
--- /dev/null
+use std::arch::asm;
+
+#[cfg(target_arch = "x86")]
+fn chroma_interp(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, w: usize, h: usize) {
+ let a0 = 8 - dx;
+ let a1 = dx;
+ let b0 = 8 - dy;
+ let b1 = dy;
+
+ if a0 == 8 && b0 == 8 {
+ unsafe {
+ let mut src = src.as_ptr();
+ let mut dst = dst.as_mut_ptr();
+ for _ in 0..h {
+ std::ptr::copy_nonoverlapping(src, dst, w);
+ src = src.add(sstride);
+ dst = dst.add(dstride);
+ }
+ }
+ } else if a0 == 8 {
+ unsafe {
+ let mut src0 = src.as_ptr();
+ let mut src1 = src0.add(sstride);
+ let mut dst = dst.as_mut_ptr();
+ for _ in 0..h {
+ for x in 0..w {
+ let a = *src0.add(x);
+ let b = *src1.add(x);
+ *dst.add(x) = ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8;
+ }
+ src0 = src0.add(sstride);
+ src1 = src1.add(sstride);
+ dst = dst.add(dstride);
+ }
+ }
+ } else if b0 == 8 {
+ unsafe {
+ let mut src = src.as_ptr();
+ let mut dst = dst.as_mut_ptr();
+ for _ in 0..h {
+ let mut a = *src;
+ for x in 0..w {
+ let b = *src.add(x + 1);
+ *dst.add(x) = ((u16::from(a) * a0 + u16::from(b) * a1 + 4) >> 3) as u8;
+ a = b;
+ }
+ src = src.add(sstride);
+ dst = dst.add(dstride);
+ }
+ }
+ } else {
+ unsafe {
+ let mut src0 = src.as_ptr();
+ let mut src1 = src0.add(sstride);
+ let mut dst = dst.as_mut_ptr();
+ for _ in 0..h {
+ let mut a = *src0;
+ let mut c = *src1;
+ for x in 0..w {
+ let b = *src0.add(x + 1);
+ let d = *src1.add(x + 1);
+ *dst.add(x) = ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8;
+ a = b;
+ c = d;
+ }
+ src0 = src0.add(sstride);
+ src1 = src1.add(sstride);
+ dst = dst.add(dstride);
+ }
+ }
+ }
+}
+
+pub fn chroma_interp_8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
+ unsafe {
+ match (dx, dy) {
+ (0, 0) => {
+ asm!(
+ "lea {stmp}, [{src} + {sstride} * 2]",
+ "lea {dtmp}, [{dst} + {dstride} * 2]",
+ "2:",
+ "movq xmm0, [{src}]",
+ "movq xmm1, [{src} + {sstride}]",
+ "movq xmm2, [{stmp}]",
+ "movq xmm3, [{stmp} + {sstride}]",
+ "movq [{dst}], xmm0",
+ "lea {src}, [{src} + {sstride} * 4]",
+ "movq [{dst} + {dstride}], xmm1",
+ "lea {stmp}, [{stmp} + {sstride} * 4]",
+ "movq [{dtmp}], xmm2",
+ "lea {dst}, [{dst} + {dstride} * 4]",
+ "movq [{dtmp} + {dstride}], xmm3",
+ "lea {dtmp}, [{dtmp} + {dstride} * 4]",
+ "sub {h}, 4",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = in(reg) sstride,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = in(reg) dstride,
+ h = inout(reg) h => _,
+ stmp = out(reg) _,
+ dtmp = out(reg) _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ );
+ },
+ (0, _) => {
+ asm!(
+ "pxor xmm0, xmm0",
+ "movd xmm3, {a0:e}",
+ "movd xmm4, {a1:e}",
+ "mov {a1:e}, 0x0004",
+ "movd xmm5, {a1:e}",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm4, xmm4, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "movlhps xmm3, xmm3",
+ "movlhps xmm4, xmm4",
+ "movlhps xmm5, xmm5",
+ "movq xmm6, [{src}]",
+ "add {src}, {sstride}",
+ "punpcklbw xmm6, xmm0",
+ "2:",
+ "movaps xmm1, xmm6",
+ "movq xmm2, [{src}]",
+ "punpcklbw xmm2, xmm0",
+ "movaps xmm6, xmm2",
+ "pmullw xmm1, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm1, xmm2",
+ "paddw xmm1, xmm5",
+ "psraw xmm1, 3",
+ "packuswb xmm1, xmm1",
+ "movq [{dst}], xmm1",
+ "add {dst}, {dstride}",
+ "dec {h}",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = in(reg) sstride,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = in(reg) dstride,
+ h = inout(reg) h => _,
+ a0 = in(reg) i32::from(8 - dy),
+ a1 = inout(reg) i32::from(dy) => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ out("xmm6") _,
+ );
+ },
+ (_, 0) => {
+ asm!(
+ "pxor xmm0, xmm0",
+ "movd xmm3, {a0:e}",
+ "movd xmm4, {a1:e}",
+ "mov {a1:e}, 0x0004",
+ "movd xmm5, {a1:e}",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm4, xmm4, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "movlhps xmm3, xmm3",
+ "movlhps xmm4, xmm4",
+ "movlhps xmm5, xmm5",
+ "2:",
+ "movq xmm1, [{src}]",
+ "movq xmm2, [{src} + 1]",
+ "punpcklbw xmm1, xmm0",
+ "punpcklbw xmm2, xmm0",
+ "pmullw xmm1, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm1, xmm2",
+ "paddw xmm1, xmm5",
+ "psraw xmm1, 3",
+ "packuswb xmm1, xmm1",
+ "movq [{dst}], xmm1",
+ "add {dst}, {dstride}",
+ "dec {h}",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = inout(reg) sstride => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = inout(reg) dstride => _,
+ h = inout(reg) h => _,
+ a0 = inout(reg) i32::from(8 - dx) => _,
+ a1 = inout(reg) i32::from(dx) => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ );
+ },
+ #[cfg(target_arch = "x86")]
+ _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 8, h),
+ #[cfg(target_arch = "x86_64")]
+ _ => {
+ asm!(
+ "pxor xmm0, xmm0",
+ "movd xmm3, {a0:e}",
+ "movd xmm4, {a1:e}",
+ "movd xmm5, {b0:e}",
+ "movd xmm6, {b1:e}",
+ "mov {a1:e}, 0x0020",
+ "movd xmm7, {a1:e}",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm4, xmm4, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "pshuflw xmm6, xmm6, 0",
+ "pshuflw xmm7, xmm7, 0",
+ "movlhps xmm3, xmm3",
+ "movlhps xmm4, xmm4",
+ "movlhps xmm5, xmm5",
+ "movlhps xmm6, xmm6",
+ "movlhps xmm7, xmm7",
+
+ "movq xmm8, [{src}]",
+ "movq xmm2, [{src} + 1]",
+ "punpcklbw xmm8, xmm0",
+ "punpcklbw xmm2, xmm0",
+ "pmullw xmm8, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm8, xmm2",
+
+ "2:",
+ "movq xmm1, [{src}]",
+ "movq xmm2, [{src} + 1]",
+ "punpcklbw xmm1, xmm0",
+ "punpcklbw xmm2, xmm0",
+ "pmullw xmm1, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm1, xmm2",
+ "movaps xmm2, xmm8",
+ "movaps xmm8, xmm1",
+
+ "pmullw xmm1, xmm6",
+ "pmullw xmm2, xmm5",
+ "paddw xmm1, xmm2",
+ "paddw xmm1, xmm7",
+ "psraw xmm1, 6",
+ "packuswb xmm1, xmm1",
+ "movq [{dst}], xmm1",
+ "add {dst}, {dstride}",
+ "dec {h}",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = inout(reg) sstride => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = inout(reg) dstride => _,
+ h = inout(reg) h => _,
+ a0 = inout(reg) i32::from(8 - dx) => _,
+ a1 = inout(reg) i32::from(dx) => _,
+ b0 = inout(reg) i32::from(8 - dy) => _,
+ b1 = inout(reg) i32::from(dy) => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ out("xmm6") _,
+ out("xmm7") _,
+ out("xmm8") _,
+ );
+ },
+ };
+ }
+}
+
+pub fn chroma_interp_4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
+ unsafe {
+ match (dx, dy) {
+ (0, 0) => {
+ asm!(
+ "2:",
+ "movd xmm0, [{src}]",
+ "movd xmm1, [{src} + {sstride}]",
+ "movd [{dst}], xmm0",
+ "lea {src}, [{src} + {sstride} * 2]",
+ "movd [{dst} + {dstride}], xmm1",
+ "lea {dst}, [{dst} + {dstride} * 2]",
+ "sub {h}, 2",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = in(reg) sstride,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = in(reg) dstride,
+ h = inout(reg) h => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ );
+ },
+ (0, _) => {
+ asm!(
+ "pxor xmm0, xmm0",
+ "movd xmm3, {a0:e}",
+ "movd xmm4, {a1:e}",
+ "mov {a1:e}, 0x0004",
+ "movd xmm5, {a1:e}",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm4, xmm4, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "movd xmm6, [{src}]",
+ "add {src}, {sstride}",
+ "punpcklbw xmm6, xmm0",
+ "2:",
+ "movaps xmm1, xmm6",
+ "movd xmm2, [{src}]",
+ "punpcklbw xmm2, xmm0",
+ "movaps xmm6, xmm2",
+ "pmullw xmm1, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm1, xmm2",
+ "paddw xmm1, xmm5",
+ "psraw xmm1, 3",
+ "packuswb xmm1, xmm1",
+ "movd [{dst}], xmm1",
+ "add {dst}, {dstride}",
+ "dec {h}",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = inout(reg) sstride => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = inout(reg) dstride => _,
+ h = inout(reg) h => _,
+ a0 = inout(reg) i32::from(8 - dy) => _,
+ a1 = inout(reg) i32::from(dy) => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ out("xmm6") _,
+ );
+ },
+ (_, 0) => {
+ asm!(
+ "pxor xmm0, xmm0",
+ "movd xmm3, {a0:e}",
+ "movd xmm4, {a1:e}",
+ "mov {a1:e}, 0x0004",
+ "movd xmm5, {a1:e}",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm4, xmm4, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "2:",
+ "movd xmm1, [{src}]",
+ "movd xmm2, [{src} + 1]",
+ "punpcklbw xmm1, xmm0",
+ "punpcklbw xmm2, xmm0",
+ "pmullw xmm1, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm1, xmm2",
+ "paddw xmm1, xmm5",
+ "psraw xmm1, 3",
+ "packuswb xmm1, xmm1",
+ "movd [{dst}], xmm1",
+ "add {dst}, {dstride}",
+ "dec {h}",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = inout(reg) sstride => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = inout(reg) dstride => _,
+ h = inout(reg) h => _,
+ a0 = inout(reg) i32::from(8 - dx) => _,
+ a1 = inout(reg) i32::from(dx) => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ );
+ },
+ #[cfg(target_arch = "x86")]
+ _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 4, h),
+ #[cfg(target_arch = "x86_64")]
+ _ => {
+ asm!(
+ "pxor xmm0, xmm0",
+ "movd xmm3, {a0:e}",
+ "movd xmm4, {a1:e}",
+ "movd xmm5, {b0:e}",
+ "movd xmm6, {b1:e}",
+ "mov {a1:e}, 0x0020",
+ "movd xmm7, {a1:e}",
+ "pshuflw xmm3, xmm3, 0",
+ "pshuflw xmm4, xmm4, 0",
+ "pshuflw xmm5, xmm5, 0",
+ "pshuflw xmm6, xmm6, 0",
+ "pshuflw xmm7, xmm7, 0",
+
+ "movd xmm8, [{src}]",
+ "movd xmm2, [{src} + 1]",
+ "punpcklbw xmm8, xmm0",
+ "punpcklbw xmm2, xmm0",
+ "pmullw xmm8, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm8, xmm2",
+
+ "2:",
+ "movd xmm1, [{src}]",
+ "movd xmm2, [{src} + 1]",
+ "punpcklbw xmm1, xmm0",
+ "punpcklbw xmm2, xmm0",
+ "pmullw xmm1, xmm3",
+ "pmullw xmm2, xmm4",
+ "add {src}, {sstride}",
+ "paddw xmm1, xmm2",
+ "movaps xmm2, xmm8",
+ "movaps xmm8, xmm1",
+
+ "pmullw xmm1, xmm6",
+ "pmullw xmm2, xmm5",
+ "paddw xmm1, xmm2",
+ "paddw xmm1, xmm7",
+ "psraw xmm1, 6",
+ "packuswb xmm1, xmm1",
+ "movd [{dst}], xmm1",
+ "add {dst}, {dstride}",
+ "dec {h}",
+ "jnz 2b",
+ src = inout(reg) src.as_ptr() => _,
+ sstride = inout(reg) sstride => _,
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = inout(reg) dstride => _,
+ h = inout(reg) h => _,
+ a0 = inout(reg) i32::from(8 - dx) => _,
+ a1 = inout(reg) i32::from(dx) => _,
+ b0 = inout(reg) i32::from(8 - dy) => _,
+ b1 = inout(reg) i32::from(dy) => _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ out("xmm4") _,
+ out("xmm5") _,
+ out("xmm6") _,
+ out("xmm7") _,
+ out("xmm8") _,
+ );
+ },
+ };
+ }
+}
+
+#[inline]
+fn chr_interp2(a: u8, b: u8, b0: u16, b1: u16) -> u8 {
+ ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8
+}
+#[inline]
+fn chr_interp4(a: u8, b: u8, c: u8, d: u8, a0: u16, a1: u16, b0: u16, b1: u16) -> u8 {
+ ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8
+}
+
+pub fn chroma_interp_2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) {
+ let a0 = 8 - dx;
+ let a1 = dx;
+ let b0 = 8 - dy;
+ let b1 = dy;
+
+ if a0 == 8 && b0 == 8 {
+ unsafe {
+ let mut src = src.as_ptr();
+ let mut dst = dst.as_mut_ptr();
+ std::ptr::copy_nonoverlapping(src, dst, 2);
+ src = src.add(sstride);
+ dst = dst.add(dstride);
+ std::ptr::copy_nonoverlapping(src, dst, 2);
+ if h == 4 {
+ src = src.add(sstride);
+ dst = dst.add(dstride);
+ std::ptr::copy_nonoverlapping(src, dst, 2);
+ src = src.add(sstride);
+ dst = dst.add(dstride);
+ std::ptr::copy_nonoverlapping(src, dst, 2);
+ }
+ }
+ } else if a0 == 8 {
+ unsafe {
+ let mut src0 = src.as_ptr();
+ let mut src1 = src0.add(sstride);
+ let mut dst = dst.as_mut_ptr();
+ *dst = chr_interp2(*src0, *src1, b0, b1);
+ *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
+ *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
+ *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
+ if h == 4 {
+ src0 = src0.add(sstride * 2);
+ src1 = src1.add(sstride * 2);
+ dst = dst.add(dstride * 2);
+ *dst = chr_interp2(*src0, *src1, b0, b1);
+ *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1);
+ *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1);
+ *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1);
+ }
+ }
+ } else if b0 == 8 {
+ unsafe {
+ let mut src = src.as_ptr();
+ let mut dst = dst.as_mut_ptr();
+ let (a, b, c) = (*src, *src.add(1), *src.add(2));
+ *dst = chr_interp2(a, b, a0, a1);
+ *dst.add(1) = chr_interp2(b, c, a0, a1);
+ let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
+ *dst.add(dstride) = chr_interp2(a, b, a0, a1);
+ *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
+ if h == 4 {
+ src = src.add(sstride * 2);
+ dst = dst.add(dstride * 2);
+ let (a, b, c) = (*src, *src.add(1), *src.add(2));
+ *dst = chr_interp2(a, b, a0, a1);
+ *dst.add(1) = chr_interp2(b, c, a0, a1);
+ let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2));
+ *dst.add(dstride) = chr_interp2(a, b, a0, a1);
+ *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1);
+ }
+ }
+ } else {
+ unsafe {
+ let height = h;
+ let mut src0 = src.as_ptr();
+ let mut src1 = src0.add(sstride);
+ let mut dst = dst.as_mut_ptr();
+
+ let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
+ let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
+ let (g, h, i) = (*src1.add(sstride), *src1.add(sstride + 1), *src1.add(sstride + 2));
+ *dst = chr_interp4(a, b, d, e, a0, a1, b0, b1);
+ *dst.add(1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);
+ *dst.add(dstride) = chr_interp4(d, e, g, h, a0, a1, b0, b1);
+ *dst.add(dstride + 1) = chr_interp4(e, f, h, i, a0, a1, b0, b1);
+ if height == 4 {
+ src0 = src0.add(sstride * 3);
+ src1 = src1.add(sstride * 3);
+ dst = dst.add(dstride * 2);
+ let (a, b, c) = (*src0, *src0.add(1), *src0.add(2));
+ let (d, e, f) = (*src1, *src1.add(1), *src1.add(2));
+ *dst = chr_interp4(g, h, a, b, a0, a1, b0, b1);
+ *dst.add(1) = chr_interp4(h, i, b, c, a0, a1, b0, b1);
+ *dst.add(dstride) = chr_interp4(a, b, d, e, a0, a1, b0, b1);
+ *dst.add(dstride + 1) = chr_interp4(b, c, e, f, a0, a1, b0, b1);
+ }
+ }
+ }
+}
+
--- /dev/null
+use std::arch::asm;
+use super::super::clip_u8;
+
+const TMP_BUF_STRIDE: usize = 32;
+
+fn interp_block1(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize, hor: bool, avg0: bool) {
+ unsafe {
+ let step = if hor { 1 } else { sstride };
+ let avgidx = if avg0 { step * 2 } else { step * 3 };
+ let mut src = src.as_ptr();
+ let mut dst = dst.as_mut_ptr();
+ for _ in 0..h {
+ for _ in 0..w {
+ let t = clip_u8(( i16::from(*src)
+ - 5 * i16::from(*src.add(step))
+ + 20 * i16::from(*src.add(step * 2))
+ + 20 * i16::from(*src.add(step * 3))
+ - 5 * i16::from(*src.add(step * 4))
+ + i16::from(*src.add(step * 5))
+ + 16) >> 5);
+ *dst = ((u16::from(t) + u16::from(*src.add(avgidx)) + 1) >> 1) as u8;
+ src = src.add(1);
+ dst = dst.add(1);
+ }
+ dst = dst.sub(w).add(dstride);
+ src = src.sub(w).add(sstride);
+ }
+ }
+}
+
+fn interp_block2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize, hor: bool) {
+ unsafe {
+ let step = if hor { 1 } else { sstride };
+ let mut pix = dst.as_mut_ptr();
+ let mut src = src.as_ptr();
+ for _ in 0..h {
+ for x in 0..w {
+ *pix.add(x) = clip_u8(( i16::from(*src)
+ - 5 * i16::from(*src.add(step))
+ + 20 * i16::from(*src.add(step * 2))
+ + 20 * i16::from(*src.add(step * 3))
+ - 5 * i16::from(*src.add(step * 4))
+ + i16::from(*src.add(step * 5))
+ + 16) >> 5);
+ src = src.add(1);
+ }
+ pix = pix.add(dstride);
+ src = src.sub(w);
+ src = src.add(sstride);
+ }
+ }
+}
+
+fn mc_avg_tmp(dst: &mut [u8], dstride: usize, w: usize, h: usize, tmp: &[u8], tmp2: &[u8]) {
+ unsafe {
+ let mut src1 = tmp.as_ptr();
+ let mut src2 = tmp2.as_ptr();
+ let mut dst = dst.as_mut_ptr();
+ for _ in 0..h {
+ for x in 0..w {
+ let a = *src1.add(x);
+ let b = *src2.add(x);
+ *dst.add(x) = ((u16::from(a) + u16::from(b) + 1) >> 1) as u8;
+ }
+ dst = dst.add(dstride);
+ src1 = src1.add(TMP_BUF_STRIDE);
+ src2 = src2.add(TMP_BUF_STRIDE);
+ }
+ }
+}
+
+fn h264_mc01(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ interp_block1(dst, dstride, &src[sstride * 2..], sstride, w, h, true, true);
+}
+
+fn h264_mc02(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ interp_block2(dst, dstride, &src[sstride * 2..], sstride, w, h, true);
+}
+
+fn h264_mc03(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ interp_block1(dst, dstride, &src[sstride * 2..], sstride, w, h, true, false);
+}
+
+fn h264_mc10(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ interp_block1(dst, dstride, &src[2..], sstride, w, h, false, true);
+}
+
+fn h264_mc11(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h);
+ h264_mc20(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h);
+ mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+fn h264_mc12(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h);
+ h264_mc22(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h);
+ mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+fn h264_mc13(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h);
+ h264_mc20(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h);
+ mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+fn h264_mc20(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ interp_block2(dst, dstride, &src[2..], sstride, w, h, false);
+}
+
+fn h264_mc21(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h);
+ h264_mc20(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h);
+ mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+fn h264_mc22(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ let mut tmp: [i32; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ unsafe {
+ let mut src = src.as_ptr();
+ let mut dst = tmp.as_mut_ptr();
+ for _ in 0..h {
+ for _ in 0..w+5 {
+ *dst = i32::from(*src)
+ - 5 * i32::from(*src.add(sstride))
+ + 20 * i32::from(*src.add(sstride * 2))
+ + 20 * i32::from(*src.add(sstride * 3))
+ - 5 * i32::from(*src.add(sstride * 4))
+ + i32::from(*src.add(sstride * 5));
+ dst = dst.add(1);
+ src = src.add(1);
+ }
+ src = src.sub(w+5).add(sstride);
+ dst = dst.sub(w+5).add(TMP_BUF_STRIDE);
+ }
+ }
+ unsafe {
+ let mut dst = dst.as_mut_ptr();
+ let mut src = tmp.as_ptr();
+ for _ in 0..h {
+ for _ in 0..w {
+ *dst = clip_u8(((*src - 5 * *src.add(1) + 20 * *src.add(2) + 20 * *src.add(3) - 5 * *src.add(4) + *src.add(5) + 512) >> 10) as i16);
+ dst = dst.add(1);
+ src = src.add(1);
+ }
+ dst = dst.sub(w).add(dstride);
+ src = src.sub(w).add(TMP_BUF_STRIDE);
+ }
+ }
+}
+
+fn h264_mc23(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h);
+ h264_mc20(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h);
+ mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+fn h264_mc30(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ interp_block1(dst, dstride, &src[2..], sstride, w, h, false, false);
+}
+
+fn h264_mc31(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+ let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
+ h264_mc20(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h);
+ h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h);
+ mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+/// Interpolates luma at quarter-pel position (3,2): the average of the
+/// centre (2,2) half-pel sample and the vertical half-pel one column to
+/// the right, per H.264 8.4.2.2.1.
+fn h264_mc32(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    // Zero-initialise instead of `MaybeUninit::uninit().assume_init()` —
+    // reading uninitialized integers is undefined behaviour.
+    let mut tmp  = [0u8; TMP_BUF_STRIDE * 16];
+    let mut tmp2 = [0u8; TMP_BUF_STRIDE * 16];
+    h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h);
+    // Fixed offset: was `&src[sstride..]` (one row down); (3,2) needs the
+    // vertical half-pel one *column* right.
+    h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h);
+    mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+/// Interpolates luma at quarter-pel position (3,3): the average of the
+/// horizontal half-pel of the row below and the vertical half-pel of the
+/// column to the right, per H.264 8.4.2.2.1.
+fn h264_mc33(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    // Zero-initialise instead of `MaybeUninit::uninit().assume_init()` —
+    // reading uninitialized integers is undefined behaviour.
+    let mut tmp  = [0u8; TMP_BUF_STRIDE * 16];
+    let mut tmp2 = [0u8; TMP_BUF_STRIDE * 16];
+    // Fixed offsets: the row/column displacements were swapped — the
+    // horizontal filter takes the row below, the vertical filter the
+    // column to the right.
+    h264_mc20(&mut tmp, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h);
+    h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h);
+    mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2);
+}
+
+/// Generates the three width-specialised wrappers (4-, 8- and 16-pixel
+/// wide) around a generic `fn(dst, dstride, src, sstride, w, h)` luma
+/// interpolator, so the results fit the `MCFunc` table signature that
+/// only takes a height.
+macro_rules! luma_mc {
+ ($orig:ident, $func4:ident, $func8:ident, $func16:ident) => {
+ fn $func4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) {
+ $orig(dst, dstride, src, sstride, 4, h);
+ }
+ fn $func8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) {
+ $orig(dst, dstride, src, sstride, 8, h);
+ }
+ fn $func16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) {
+ $orig(dst, dstride, src, sstride, 16, h);
+ }
+ }
+}
+
+/// Generates a full-pel copy (mc00) routine using inline SSE asm: four rows
+/// are loaded and stored per loop iteration via the `$load`/`$store` mnemonic
+/// pair, with `{tmps}`/`{tmpd}` tracking rows +2 to halve address arithmetic.
+/// The `sub {h}, 4` / `jnz` loop structure requires `h` to be a positive
+/// multiple of 4 — true for H.264 luma block heights (4/8/16).
+/// NOTE(review): a `movaps` store (used by the 16-wide instantiation) faults
+/// on a destination that is not 16-byte aligned — confirm every caller
+/// guarantees that, or it should be `movups`.
+macro_rules! mc00_template {
+ ($func:ident, $load:expr, $store:expr) => {
+ fn $func(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) {
+ unsafe {
+ asm!(
+ "lea {tmps}, [{src} + {sstride} * 2]",
+ "lea {tmpd}, [{dst} + {dstride} * 2]",
+ "2:",
+ concat!($load, " xmm0, [{src}]"),
+ concat!($load, " xmm1, [{src} + {sstride}]"),
+ concat!($load, " xmm2, [{tmps}]"),
+ concat!($load, " xmm3, [{tmps} + {sstride}]"),
+ concat!($store, " [{dst}], xmm0"),
+ "lea {src}, [{src} + {sstride}*4]",
+ concat!($store, " [{dst} + {dstride}], xmm1"),
+ "lea {tmps}, [{tmps} + {sstride}*4]",
+ concat!($store, " [{tmpd}], xmm2"),
+ "lea {dst}, [{dst} + {dstride}*4]",
+ concat!($store, " [{tmpd} + {dstride}], xmm3"),
+ "lea {tmpd}, [{tmpd} + {dstride}*4]",
+ "sub {h}, 4",
+ "jnz 2b",
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ dstride = in(reg) dstride,
+ src = inout(reg) src.as_ptr() => _,
+ sstride = in(reg) sstride,
+ h = inout(reg) h => _,
+ tmps = out(reg) _,
+ tmpd = out(reg) _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ );
+ }
+ }
+ }
+}
+
+// Full-pel copies: 16-wide uses unaligned 128-bit loads with aligned stores
+// (see the alignment note on mc00_template!), 8-wide moves 64 bits (movq)
+// and 4-wide 32 bits (movd) per row.
+mc00_template!(h264_mc00_16, "movups", "movaps");
+mc00_template!(h264_mc00_8, "movq", "movq");
+mc00_template!(h264_mc00_4, "movd", "movd");
+
+// Width-specialised wrappers for the 15 fractional positions (mc01..mc33);
+// mc00 is produced directly by mc00_template! above.
+luma_mc!(h264_mc01, h264_mc01_4, h264_mc01_8, h264_mc01_16);
+luma_mc!(h264_mc02, h264_mc02_4, h264_mc02_8, h264_mc02_16);
+luma_mc!(h264_mc03, h264_mc03_4, h264_mc03_8, h264_mc03_16);
+luma_mc!(h264_mc10, h264_mc10_4, h264_mc10_8, h264_mc10_16);
+luma_mc!(h264_mc11, h264_mc11_4, h264_mc11_8, h264_mc11_16);
+luma_mc!(h264_mc12, h264_mc12_4, h264_mc12_8, h264_mc12_16);
+luma_mc!(h264_mc13, h264_mc13_4, h264_mc13_8, h264_mc13_16);
+luma_mc!(h264_mc20, h264_mc20_4, h264_mc20_8, h264_mc20_16);
+luma_mc!(h264_mc21, h264_mc21_4, h264_mc21_8, h264_mc21_16);
+luma_mc!(h264_mc22, h264_mc22_4, h264_mc22_8, h264_mc22_16);
+luma_mc!(h264_mc23, h264_mc23_4, h264_mc23_8, h264_mc23_16);
+luma_mc!(h264_mc30, h264_mc30_4, h264_mc30_8, h264_mc30_16);
+luma_mc!(h264_mc31, h264_mc31_4, h264_mc31_8, h264_mc31_16);
+luma_mc!(h264_mc32, h264_mc32_4, h264_mc32_8, h264_mc32_16);
+luma_mc!(h264_mc33, h264_mc33_4, h264_mc33_8, h264_mc33_16);
+
+/// Luma motion-compensation dispatch tables, outer index = width class
+/// (0 => 4-wide, 1 => 8-wide, 2 => 16-wide). Entries are listed in mcXY
+/// order (mc00, mc01, ..., mc33), so the inner index is presumably
+/// xfrac * 4 + yfrac — confirm against the caller that consumes MCFunc
+/// tables, which is outside this file.
+pub const H264_LUMA_INTERP: &[[super::super::MCFunc; 16]; 3] = &[
+ [
+ h264_mc00_4, h264_mc01_4, h264_mc02_4, h264_mc03_4,
+ h264_mc10_4, h264_mc11_4, h264_mc12_4, h264_mc13_4,
+ h264_mc20_4, h264_mc21_4, h264_mc22_4, h264_mc23_4,
+ h264_mc30_4, h264_mc31_4, h264_mc32_4, h264_mc33_4
+ ], [
+ h264_mc00_8, h264_mc01_8, h264_mc02_8, h264_mc03_8,
+ h264_mc10_8, h264_mc11_8, h264_mc12_8, h264_mc13_8,
+ h264_mc20_8, h264_mc21_8, h264_mc22_8, h264_mc23_8,
+ h264_mc30_8, h264_mc31_8, h264_mc32_8, h264_mc33_8
+ ], [
+ h264_mc00_16, h264_mc01_16, h264_mc02_16, h264_mc03_16,
+ h264_mc10_16, h264_mc11_16, h264_mc12_16, h264_mc13_16,
+ h264_mc20_16, h264_mc21_16, h264_mc22_16, h264_mc23_16,
+ h264_mc30_16, h264_mc31_16, h264_mc32_16, h264_mc33_16
+ ]
+];
--- /dev/null
+// x86_64 SIMD implementations of the H.264 DSP routines, selected by the
+// module_selector! machinery in the parent module.
+// The allow covers luma_mc's MaybeUninit scratch buffers, which skip
+// zero-initialisation for speed.
+#[allow(clippy::uninit_assumed_init)]
+mod luma_mc;
+pub use luma_mc::H264_LUMA_INTERP;
+mod chroma_mc;
+pub use chroma_mc::*;
+mod blockdsp;
+use blockdsp::*;
+
+/// Hooks the SIMD routines into the H264MC dispatch tables at runtime.
+/// Only slots 1..=3 are replaced (presumably the 4-, 8- and 16-wide
+/// variants, matching the `_4`/`_8`/`_16` suffixes); slot 0 keeps the
+/// generic implementation — confirm the index-to-width mapping against
+/// the H264MC struct definition.
+impl super::RegisterSIMD for super::H264MC {
+ fn register_simd(&mut self) {
+ self.avg[1] = avg_4;
+ self.avg[2] = avg_8;
+ self.avg[3] = avg_16;
+ self.put_block_weighted[1] = put_block_weighted_4;
+ self.put_block_weighted[2] = put_block_weighted_8;
+ self.put_block_weighted[3] = put_block_weighted_16;
+ self.put_block_weighted2[1] = put_block_weighted2_4;
+ self.put_block_weighted2[2] = put_block_weighted2_8;
+ self.put_block_weighted2[3] = put_block_weighted2_16;
+ }
+}
fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
let mut t: [u16; 5] = [0; 5];
t[0] = u16::from(left[0]);
- load(&mut t[1..], &top);
+ load(&mut t[1..], top);
let mut l: [u16; 5] = [0; 5];
load(&mut l, left);
let dst = buf;
fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
let mut t: [u16; 5] = [0; 5];
t[0] = u16::from(left[0]);
- load(&mut t[1..], &top);
+ load(&mut t[1..], top);
let mut l: [u16; 5] = [0; 5];
load(&mut l, left);
let dst = buf;
}
fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) {
let mut t: [u16; 8] = [0; 8];
- load(&mut t[..4], &top);
+ load(&mut t[..4], top);
load(&mut t[4..], tr);
let dst = buf;
fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) {
let mut t: [u16; 5] = [0; 5];
t[0] = u16::from(left[0]);
- load(&mut t[1..], &top);
+ load(&mut t[1..], top);
let mut l: [u16; 5] = [0; 5];
load(&mut l, left);
let dst = buf;
let mut l = [0; 8];
load(&mut l, &left[1..]);
let mut t = [0; 8];
- load(&mut t, &top);
+ load(&mut t, top);
let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8;
let sum1 = t[4] + t[5] + t[6] + t[7];
}
}
+#[allow(clippy::match_like_matches_macro)]
fn do_b_mc(frm: &mut NASimpleVideoFrame<u8>, mode: BMode, xpos: usize, ypos: usize, w: usize, h: usize, mv0: MV, ref_pic0: Option<NAVideoBufferRef<u8>>, weight0: &WeightInfo, mv1: MV, ref_pic1: Option<NAVideoBufferRef<u8>>, weight1: &WeightInfo, mc_dsp: &mut H264MC) {
let do_weight = match (mode, weight0.is_weighted(), weight1.is_weighted()) {
(BMode::L0, true, _) => true,
match mb_info.mb_type {
MBType::Intra16x16(_, _, _) => {
- pred_intra(frm, &sstate, &mb_info);
+ pred_intra(frm, sstate, mb_info);
},
MBType::Intra4x4 | MBType::Intra8x8 => {
- pred_intra(frm, &sstate, &mb_info);
+ pred_intra(frm, sstate, mb_info);
},
MBType::PCM => {},
MBType::PSkip => {
};
if !mb_info.mb_type.is_skip() {
if mb_info.mb_type != MBType::Intra4x4 && mb_info.mb_type != MBType::Intra8x8 {
- add_luma(frm, &sstate, &mb_info);
+ add_luma(frm, sstate, mb_info);
}
- add_chroma(frm, &sstate, &mb_info);
+ add_chroma(frm, sstate, mb_info);
}
}
} else {
0
};
- recon_mb(&mut frm, slice_hdr, &mb_info, &mut self.sstate, &self.frame_refs, &mut self.mc_dsp, weight_mode);
+ recon_mb(&mut frm, slice_hdr, mb_info, &mut self.sstate, &self.frame_refs, &mut self.mc_dsp, weight_mode);
} else {
for (dline, src) in frm.data[frm.offset[0] + xpos + ypos * frm.stride[0]..].chunks_mut(frm.stride[0]).take(16).zip(self.ipcm_buf.chunks(16)) {
dline[..16].copy_from_slice(src);
];
let mut mb_idx = slice_hdr.first_mb_in_slice as usize;
- let mut mb_info = CurrentMBInfo::default();
- mb_info.qp_y = slice_hdr.slice_qp;
+ let mut mb_info = CurrentMBInfo { qp_y: slice_hdr.slice_qp, ..Default::default() };
let skip_type = if slice_hdr.slice_type.is_p() { MBType::PSkip } else { MBType::BSkip };
while br.tell() < full_size && mb_idx < self.num_mbs {
mb_info.coded = [false; 25];
let skip_type = if slice_hdr.slice_type.is_p() { MBType::PSkip } else { MBType::BSkip };
let mut last_qp_diff = false;
- let mut mb_info = CurrentMBInfo::default();
- mb_info.qp_y = slice_hdr.slice_qp;
+ let mut mb_info = CurrentMBInfo { qp_y: slice_hdr.slice_qp, ..Default::default() };
while mb_idx < self.num_mbs {
mb_info.coded = [false; 25];
if self.is_mbaff && (((mb_idx & 1) == 0) || (prev_mb_skipped && ((mb_idx & 1) == 1))) {
let _mb_field_decoding = cabac.decode_bit(70);
}
- let mut mb_type = cabac_decode_mb_type(cabac, &slice_hdr, &self.sstate);
+ let mut mb_type = cabac_decode_mb_type(cabac, slice_hdr, &self.sstate);
mb_info.mb_type = mb_type;
mb_info.transform_size_8x8 = false;
if mb_type == MBType::PCM {
+/// Returns the video buffer of reference picture `ref_id` from reference
+/// list 0 (when `list_id == 0`) or list 1, or `None` when the index is out
+/// of range or the list slot is empty.
pub fn select_ref_pic(&self, list_id: u8, ref_id: usize) -> Option<NAVideoBufferRef<u8>> {
let ref_list = if list_id == 0 { &self.ref_list0 } else { &self.ref_list1 };
if ref_list.len() > ref_id {
-        if let Some(ref pic) = ref_list[ref_id] {
-            Some(pic.buf.clone())
-        } else {
-            None
-        }
+        ref_list[ref_id].as_ref().map(|pic| pic.buf.clone())
} else {
None
}
}
pub fn is_high_profile(profile: u8) -> bool {
- match profile {
- 100 | 110 | 122 | 244 | 44 | 83 | 86 | 118 | 128 | 138 | 139 | 134 | 125 => true,
- _ => false,
- }
+ matches!(profile, 100 | 110 | 122 | 244 | 44 | 83 | 86 | 118 | 128 | 138 | 139 | 134 | 125)
}
#[allow(clippy::cognitive_complexity)]
impl SliceType {
pub fn is_intra(self) -> bool {
- match self {
- SliceType::I | SliceType::SI => true,
- _ => false,
- }
+ matches!(self, SliceType::I | SliceType::SI)
}
pub fn is_p(self) -> bool {
- match self {
- SliceType::P | SliceType::SP => true,
- _ => false,
- }
+ matches!(self, SliceType::P | SliceType::SP)
}
pub fn is_b(self) -> bool { self == SliceType::B }
pub fn is_s(self) -> bool {
- match self {
- SliceType::SI | SliceType::SP => true,
- _ => false,
- }
+ matches!(self, SliceType::SI | SliceType::SP)
}
pub fn to_frame_type(self) -> FrameType {
match self {
}
#[allow(clippy::cognitive_complexity)]
+#[allow(clippy::manual_range_contains)]
pub fn parse_slice_header(br: &mut BitReader, sps_arr: &[SeqParameterSet], pps_arr: &[PicParameterSet], is_idr: bool, nal_ref_idc: u8) -> DecoderResult<SliceHeader> {
let mut hdr: SliceHeader = unsafe { std::mem::zeroed() };
impl MBType {
pub fn is_intra(self) -> bool {
- match self {
- MBType::Intra4x4 | MBType::Intra8x8 | MBType::Intra16x16(_, _, _) | MBType::PCM => true,
- _ => false,
- }
+ matches!(self, MBType::Intra4x4 | MBType::Intra8x8 | MBType::Intra16x16(_, _, _) | MBType::PCM)
}
pub fn is_intra16x16(self) -> bool {
- if let MBType::Intra16x16(_, _, _) = self {
- true
- } else {
- false
- }
+ matches!(self, MBType::Intra16x16(_, _, _))
}
pub fn is_skip(self) -> bool {
- match self {
- MBType::PSkip | MBType::BSkip => true,
- _ => false,
- }
+ matches!(self, MBType::PSkip | MBType::BSkip)
}
pub fn is_4x4(self) -> bool { self.num_parts() == 4 }
pub fn is_l0(self, part: usize) -> bool {
impl CompactMBType {
pub fn is_intra(self) -> bool {
- match self {
- CompactMBType::Intra4x4 | CompactMBType::Intra8x8 | CompactMBType::Intra16x16 => true,
- _ => false,
- }
+ matches!(self, CompactMBType::Intra4x4 | CompactMBType::Intra8x8 | CompactMBType::Intra16x16)
}
pub fn is_intra16orpcm(self) -> bool {
- match self {
- CompactMBType::Intra16x16 | CompactMBType::PCM => true,
- _ => false,
- }
+ matches!(self, CompactMBType::Intra16x16 | CompactMBType::PCM)
}
pub fn is_skip(self) -> bool {
- match self {
- CompactMBType::PSkip | CompactMBType::BSkip => true,
- _ => false,
- }
+ matches!(self, CompactMBType::PSkip | CompactMBType::BSkip)
}
pub fn is_direct(self) -> bool {
- match self {
- CompactMBType::BSkip | CompactMBType::Direct | CompactMBType::None => true,
- _ => false,
- }
+ matches!(self, CompactMBType::BSkip | CompactMBType::Direct | CompactMBType::None)
}
pub fn is_inter(self) -> bool {
!self.is_intra() && !self.is_skip() && self != CompactMBType::PCM
}
pub fn is_16x16_ref(self) -> bool {
- match self {
+ matches!(self,
CompactMBType::Intra4x4 |
CompactMBType::Intra8x8 |
CompactMBType::Intra16x16 |
CompactMBType::PCM |
CompactMBType::P16x16 |
- CompactMBType::B16x16 => true,
- _ => false,
- }
+ CompactMBType::B16x16)
}
}
}
}
-impl Into<u8> for IntraPredMode {
- fn into(self) -> u8 {
- match self {
+impl From<IntraPredMode> for u8 {
+ fn from(val: IntraPredMode) -> Self {
+ match val {
IntraPredMode::Vertical => 0,
IntraPredMode::Horizontal => 1,
IntraPredMode::DC => 2,
self.get_cur_blk4(blk4).mv = [mv0, mv1];
self.get_cur_blk8(blk4_to_blk8(blk4)).ref_idx = [ref0, ref1];
}
+ #[allow(clippy::nonminimal_bool)]
pub fn get_direct_mv(&self, frame_refs: &FrameRefs, mbi: &FrameMBInfo, r1_poc: u16, r1_long: bool, temporal_mv: bool, cur_id: u16, blk4: usize) -> (MV, PicRef, MV, PicRef) {
let blk8 = blk4_to_blk8(blk4);
let (col_mv, r0_poc, col_idx) = if mbi.ref_poc[blk8] == [MISSING_POC; 2] {
($a:expr) => { if !$a { println!("check failed at {}:{}", file!(), line!()); return Err(DecoderError::InvalidData); } };
}
+#[allow(clippy::collapsible_else_if)]
#[allow(clippy::too_many_arguments)]
+#[allow(clippy::upper_case_acronyms)]
#[cfg(feature="decoder_h264")]
mod h264;