From 42005e259dd77147b77c7a0057aa3cf033e331d0 Mon Sep 17 00:00:00 2001 From: Kostya Shishkov Date: Mon, 5 Sep 2022 18:20:48 +0200 Subject: [PATCH] h264: add SIMD optimisations for x86_64 (not enabled by default) --- nihav-itu/Cargo.toml | 1 + nihav-itu/src/codecs/h264/dsp/mc/mod.rs | 36 +- .../src/codecs/h264/dsp/mc/x86/blockdsp.rs | 347 +++++++++++ .../src/codecs/h264/dsp/mc/x86/chroma_mc.rs | 561 ++++++++++++++++++ .../src/codecs/h264/dsp/mc/x86/luma_mc.rs | 285 +++++++++ nihav-itu/src/codecs/h264/dsp/mc/x86/mod.rs | 21 + nihav-itu/src/codecs/h264/dsp/mod.rs | 10 +- nihav-itu/src/codecs/h264/mb_recon.rs | 9 +- nihav-itu/src/codecs/h264/mod.rs | 10 +- nihav-itu/src/codecs/h264/pic_ref.rs | 6 +- nihav-itu/src/codecs/h264/sets.rs | 5 +- nihav-itu/src/codecs/h264/slice.rs | 16 +- nihav-itu/src/codecs/h264/types.rs | 49 +- nihav-itu/src/codecs/mod.rs | 2 + 14 files changed, 1277 insertions(+), 81 deletions(-) create mode 100644 nihav-itu/src/codecs/h264/dsp/mc/x86/blockdsp.rs create mode 100644 nihav-itu/src/codecs/h264/dsp/mc/x86/chroma_mc.rs create mode 100644 nihav-itu/src/codecs/h264/dsp/mc/x86/luma_mc.rs create mode 100644 nihav-itu/src/codecs/h264/dsp/mc/x86/mod.rs diff --git a/nihav-itu/Cargo.toml b/nihav-itu/Cargo.toml index 8134981..19eaa50 100644 --- a/nihav-itu/Cargo.toml +++ b/nihav-itu/Cargo.toml @@ -15,6 +15,7 @@ nihav_commonfmt = { path = "../nihav-commonfmt", default-features=false, feature [features] default = ["all_decoders"] +simd = [] #enable when the default rustc is >=1.62 all_decoders = ["all_video_decoders"] decoders = [] diff --git a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs index ca4e77f..27bffe5 100644 --- a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs +++ b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs @@ -2,14 +2,31 @@ use nihav_core::frame::*; use nihav_codec_support::codecs::MV; use nihav_codec_support::codecs::blockdsp::*; -#[cfg(not(debug_assertions))] -mod release; -#[cfg(not(debug_assertions))] -use release::*; -#[cfg(debug_assertions)] -mod debug; -#[cfg(debug_assertions)] -use debug::*; +macro_rules! module_selector { + ($( ($cond:meta, $module:ident) ),*) => { + module_selector!(list; r#false; $(($cond, $module)),*); + }; + (list; $nocond:meta; ($ccar:meta, $carmod:ident), $(($condcdr:meta, $cdrmod:ident)),*) => { + module_selector!(single; $nocond; $ccar; $carmod); + module_selector!(list; any($nocond, $ccar); $(($condcdr, $cdrmod)),*); + }; + (list; $nocond:meta; ($yescond:meta, $module:ident)) => { + module_selector!(single; $nocond; $yescond; $module); + }; + (list; $_:meta; ) => {}; + (single; $nocond:meta; $yescond:meta; $module:ident) => { + #[cfg(all(not($nocond), $yescond))] + mod $module; + #[cfg(all(not($nocond), $yescond))] + use $module::*; + }; +} + +module_selector! 
( + (all(feature = "simd", target_arch = "x86_64"), x86), + (debug_assertions, debug), + (not(debug_assertions), release) +); type MCFunc = fn (dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize); @@ -19,6 +36,7 @@ trait RegisterSIMD { fn register_simd(&mut self); } +#[allow(clippy::type_complexity)] pub struct H264MC { avg_buf: NAVideoBufferRef, pub put_block_weighted: [fn (dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]); 4], @@ -62,7 +80,7 @@ impl H264MC { let (ysrc, ystride) = if (src_x - pre < 0) || (src_x + (w as isize) + post > (yw as isize)) || (src_y - pre < 0) || (src_y + (h as isize) + post > (yh as isize)) { let add = (pre + post) as usize; edge_emu(&refpic, src_x - pre, src_y - pre, w + add, h + add, &mut ebuf, 22, 0, 0); - (ebuf.as_slice(), 22) + (&ebuf[..], 22) } else { (&src[refpic.get_offset(0) + ((src_x - pre) as usize) + ((src_y - pre) as usize) * systride..], systride) }; diff --git a/nihav-itu/src/codecs/h264/dsp/mc/x86/blockdsp.rs b/nihav-itu/src/codecs/h264/dsp/mc/x86/blockdsp.rs new file mode 100644 index 0000000..cd52586 --- /dev/null +++ b/nihav-itu/src/codecs/h264/dsp/mc/x86/blockdsp.rs @@ -0,0 +1,347 @@ +use std::arch::asm; + +macro_rules! avg_template { + ($name: ident, $mov: expr) => { + pub fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) { + unsafe { + asm!( + "2:", + concat!($mov, " xmm1, [{src}]"), + concat!($mov, " xmm3, [{src} + {sstride}]"), + concat!($mov, " xmm0, [{dst}]"), + concat!($mov, " xmm2, [{dst} + {dstride}]"), + "lea {src}, [{src} + {sstride} * 2]", + "pavgb xmm0, xmm1", + "pavgb xmm2, xmm3", + concat!($mov, " [{dst}], xmm0"), + concat!($mov, " [{dst} + {dstride}], xmm2"), + "lea {dst}, [{dst} + {dstride} * 2]", + "sub {h}, 2", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + h = inout(reg) bh => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + ); + } + } + } +} + +avg_template!(avg_4, "movd"); +avg_template!(avg_8, "movq"); + +pub fn avg_16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) { + unsafe { + asm!( + "lea {stmp}, [{src} + {sstride} * 2]", + "lea {dtmp}, [{dst} + {dstride} * 2]", + "2:", + "movaps xmm0, [{src}]", + "movaps xmm1, [{src} + {sstride}]", + "movaps xmm2, [{stmp}]", + "movaps xmm3, [{stmp} + {sstride}]", + "pavgb xmm0, [{dst}]", + "pavgb xmm1, [{dst} + {dstride}]", + "pavgb xmm2, [{dtmp}]", + "pavgb xmm3, [{dtmp} + {dstride}]", + "lea {src}, [{src} + {sstride} * 4]", + "movaps [{dst}], xmm0", + "lea {stmp}, [{stmp} + {sstride} * 4]", + "movaps [{dst} + {dstride}], xmm1", + "lea {dst}, [{dst} + {dstride} * 4]", + "movaps [{dtmp}], xmm2", + "movaps [{dtmp} + {dstride}], xmm3", + "lea {dtmp}, [{dtmp} + {dstride} * 4]", + "sub {h}, 4", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + h = inout(reg) bh => _, + stmp = out(reg) _, + dtmp = out(reg) _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + ); + } +} + +macro_rules! 
put_block_weighted { + ($func:ident, $width:expr, $load:expr, $store:expr) => { + pub fn $func(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) { + if wparams == [1, 0, 0] { + for (dst, src) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) { + dst[..$width].copy_from_slice(&src[..$width]); + } + } else { + let weight = i32::from(wparams[0]); + let offset = i32::from(wparams[1]); + let wshift = i32::from(wparams[2]); + let bias = (1 << wshift) >> 1; + + unsafe { + asm!( + "xorps xmm0, xmm0", + "movd xmm1, {weight:e}", + "movd xmm2, {offset:e}", + "movd xmm3, {wshift:e}", + "movd xmm4, {bias:e}", + "pshuflw xmm1, xmm1, 0", + "pshuflw xmm2, xmm2, 0", + "pshuflw xmm4, xmm4, 0", + "movlhps xmm1, xmm1", + "movlhps xmm2, xmm2", + "movlhps xmm4, xmm4", + "2:", + concat!($load, " xmm5, [{src}]"), + "add {src}, 16", + "movaps xmm7, xmm5", + "punpcklbw xmm5, xmm0", + "punpckhbw xmm7, xmm0", + "pmullw xmm5, xmm1", + "pmullw xmm7, xmm1", + "paddw xmm5, xmm4", + "paddw xmm7, xmm4", + "psraw xmm5, xmm3", + "psraw xmm7, xmm3", + "paddw xmm5, xmm2", + "paddw xmm7, xmm2", + "packuswb xmm5, xmm7", + concat!($store, " [{dst}], xmm5"), + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + h = inout(reg) h => _, + src = inout(reg) src.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + weight = in(reg) weight, + offset = in(reg) offset, + wshift = in(reg) wshift, + bias = in(reg) bias, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm7") _, + ); + } + } + } + } +} + +put_block_weighted!(put_block_weighted_16, 16, "movups", "movaps"); +put_block_weighted!(put_block_weighted_8, 8, "movq", "movq"); +put_block_weighted!(put_block_weighted_4, 4, "movd", "movd"); + +macro_rules! 
put_block_weighted2 { + ($func:ident, $mov:expr) => { + pub fn $func(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + if wparams == [1, 0, 1, 0, 0] { + unsafe { + asm!( + "2:", + concat!($mov, " xmm0, [{src0}]"), + concat!($mov, " xmm1, [{src1}]"), + "add {src0}, 16", + "pavgb xmm0, xmm1", + "add {src1}, 16", + concat!($mov, " [{dst}], xmm0"), + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + h = inout(reg) h => _, + out("xmm0") _, + out("xmm1") _, + ); + } + return; + } + let weight0 = i32::from(wparams[0]); + let offset0 = i32::from(wparams[1]); + let weight1 = i32::from(wparams[2]); + let offset1 = i32::from(wparams[3]); + let wshift = i32::from(wparams[4]) + 1; + let offset = (offset0 + offset1 + 1) >> 1; + let bias = (1 << wshift) >> 1; + + unsafe { + asm!( + "xorps xmm0, xmm0", + "movd xmm1, {weight0:e}", + "movd xmm2, {weight1:e}", + "movd xmm3, {offset:e}", + "movd xmm4, {wshift:e}", + "movd xmm5, {bias:e}", + "pshuflw xmm1, xmm1, 0", + "pshuflw xmm2, xmm2, 0", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm5, xmm5, 0", + "movlhps xmm1, xmm1", + "movlhps xmm2, xmm2", + "movlhps xmm3, xmm3", + "movlhps xmm5, xmm5", + "2:", + concat!($mov, " xmm6, [{src0}]"), + "add {src0}, 16", + concat!($mov, " xmm7, [{src1}]"), + "add {src1}, 16", + "punpcklbw xmm6, xmm0", + "punpcklbw xmm7, xmm0", + "pmullw xmm6, xmm1", + "pmullw xmm7, xmm2", + "paddw xmm6, xmm5", + "paddw xmm6, xmm7", + "psraw xmm6, xmm4", + "paddw xmm6, xmm3", + "movhlps xmm7, xmm6", + "packuswb xmm6, xmm7", + concat!($mov, " [{dst}], xmm6"), + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + h = inout(reg) h => _, + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + weight0 = in(reg) weight0, + weight1 = in(reg) weight1, + offset = in(reg) offset, + wshift = in(reg) wshift, + bias = in(reg) bias, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + out("xmm7") _, + ); + } + } + } +} + +put_block_weighted2!(put_block_weighted2_8, "movq"); +put_block_weighted2!(put_block_weighted2_4, "movd"); + +pub fn put_block_weighted2_16(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + if wparams == [1, 0, 1, 0, 0] { + unsafe { + asm!( + "2:", + "movups xmm0, [{src0}]", + "movups xmm1, [{src1}]", + "add {src0}, 16", + "pavgb xmm0, xmm1", + "add {src1}, 16", + "movaps [{dst}], xmm0", + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + h = inout(reg) h => _, + out("xmm0") _, + out("xmm1") _, + ); + } + return; + } + let weight0 = i32::from(wparams[0]); + let offset0 = i32::from(wparams[1]); + let weight1 = i32::from(wparams[2]); + let offset1 = i32::from(wparams[3]); + let wshift = i32::from(wparams[4]) + 1; + let offset = (offset0 + offset1 + 1) >> 1; + let bias = (1 << wshift) >> 1; + + unsafe { + asm!( + "xorps xmm0, xmm0", + "movd xmm1, {weight0:e}", + "movd xmm2, {weight1:e}", + "movd xmm3, {offset:e}", + "movd xmm4, {wshift:e}", + "movd xmm5, {bias:e}", + "pshuflw xmm1, xmm1, 0", + "pshuflw xmm2, xmm2, 0", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm5, xmm5, 0", + "movlhps xmm1, xmm1", + "movlhps xmm2, xmm2", 
+ "movlhps xmm3, xmm3", + "movlhps xmm5, xmm5", + "2:", + "movq xmm6, [{src0}]", + "movq xmm7, [{src1}]", + "punpcklbw xmm6, xmm0", + "punpcklbw xmm7, xmm0", + "pmullw xmm6, xmm1", + "pmullw xmm7, xmm2", + "paddw xmm6, xmm5", + "paddw xmm6, xmm7", + "psraw xmm6, xmm4", + "paddw xmm6, xmm3", + "movhlps xmm7, xmm6", + "packuswb xmm6, xmm7", + "movq [{dst}], xmm6", + "movq xmm6, [{src0} + 8]", + "add {src0}, 16", + "movq xmm7, [{src1} + 8]", + "add {src1}, 16", + "punpcklbw xmm6, xmm0", + "punpcklbw xmm7, xmm0", + "pmullw xmm6, xmm1", + "pmullw xmm7, xmm2", + "paddw xmm6, xmm5", + "paddw xmm6, xmm7", + "psraw xmm6, xmm4", + "paddw xmm6, xmm3", + "movhlps xmm7, xmm6", + "packuswb xmm6, xmm7", + "movq [{dst} + 8], xmm6", + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + h = inout(reg) h => _, + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + weight0 = in(reg) weight0, + weight1 = in(reg) weight1, + offset = in(reg) offset, + wshift = in(reg) wshift, + bias = in(reg) bias, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + out("xmm7") _, + ); + } +} diff --git a/nihav-itu/src/codecs/h264/dsp/mc/x86/chroma_mc.rs b/nihav-itu/src/codecs/h264/dsp/mc/x86/chroma_mc.rs new file mode 100644 index 0000000..12ccc65 --- /dev/null +++ b/nihav-itu/src/codecs/h264/dsp/mc/x86/chroma_mc.rs @@ -0,0 +1,561 @@ +use std::arch::asm; + +#[cfg(target_arch = "x86")] +fn chroma_interp(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, w: usize, h: usize) { + let a0 = 8 - dx; + let a1 = dx; + let b0 = 8 - dy; + let b1 = dy; + + if a0 == 8 && b0 == 8 { + unsafe { + let mut src = src.as_ptr(); + let mut dst = dst.as_mut_ptr(); + for _ in 0..h { + std::ptr::copy_nonoverlapping(src, dst, w); + src = src.add(sstride); + dst = dst.add(dstride); + } + } + } else if a0 == 8 { + unsafe { + let mut src0 = src.as_ptr(); + let mut src1 = src0.add(sstride); + let mut dst = dst.as_mut_ptr(); + for _ in 0..h { + for x in 0..w { + let a = *src0.add(x); + let b = *src1.add(x); + *dst.add(x) = ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8; + } + src0 = src0.add(sstride); + src1 = src1.add(sstride); + dst = dst.add(dstride); + } + } + } else if b0 == 8 { + unsafe { + let mut src = src.as_ptr(); + let mut dst = dst.as_mut_ptr(); + for _ in 0..h { + let mut a = *src; + for x in 0..w { + let b = *src.add(x + 1); + *dst.add(x) = ((u16::from(a) * a0 + u16::from(b) * a1 + 4) >> 3) as u8; + a = b; + } + src = src.add(sstride); + dst = dst.add(dstride); + } + } + } else { + unsafe { + let mut src0 = src.as_ptr(); + let mut src1 = src0.add(sstride); + let mut dst = dst.as_mut_ptr(); + for _ in 0..h { + let mut a = *src0; + let mut c = *src1; + for x in 0..w { + let b = *src0.add(x + 1); + let d = *src1.add(x + 1); + *dst.add(x) = ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8; + a = b; + c = d; + } + src0 = src0.add(sstride); + src1 = src1.add(sstride); + dst = dst.add(dstride); + } + } + } +} + +pub fn chroma_interp_8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) { + unsafe { + match (dx, dy) { + (0, 0) => { + asm!( + "lea {stmp}, [{src} + {sstride} * 2]", + "lea {dtmp}, [{dst} + {dstride} * 2]", + "2:", + "movq xmm0, [{src}]", + "movq xmm1, [{src} + {sstride}]", + "movq xmm2, [{stmp}]", + "movq xmm3, [{stmp} + {sstride}]", + 
"movq [{dst}], xmm0", + "lea {src}, [{src} + {sstride} * 4]", + "movq [{dst} + {dstride}], xmm1", + "lea {stmp}, [{stmp} + {sstride} * 4]", + "movq [{dtmp}], xmm2", + "lea {dst}, [{dst} + {dstride} * 4]", + "movq [{dtmp} + {dstride}], xmm3", + "lea {dtmp}, [{dtmp} + {dstride} * 4]", + "sub {h}, 4", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + h = inout(reg) h => _, + stmp = out(reg) _, + dtmp = out(reg) _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + ); + }, + (0, _) => { + asm!( + "pxor xmm0, xmm0", + "movd xmm3, {a0:e}", + "movd xmm4, {a1:e}", + "mov {a1:e}, 0x0004", + "movd xmm5, {a1:e}", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm4, xmm4, 0", + "pshuflw xmm5, xmm5, 0", + "movlhps xmm3, xmm3", + "movlhps xmm4, xmm4", + "movlhps xmm5, xmm5", + "movq xmm6, [{src}]", + "add {src}, {sstride}", + "punpcklbw xmm6, xmm0", + "2:", + "movaps xmm1, xmm6", + "movq xmm2, [{src}]", + "punpcklbw xmm2, xmm0", + "movaps xmm6, xmm2", + "pmullw xmm1, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm1, xmm2", + "paddw xmm1, xmm5", + "psraw xmm1, 3", + "packuswb xmm1, xmm1", + "movq [{dst}], xmm1", + "add {dst}, {dstride}", + "dec {h}", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + h = inout(reg) h => _, + a0 = in(reg) i32::from(8 - dy), + a1 = inout(reg) i32::from(dy) => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + ); + }, + (_, 0) => { + asm!( + "pxor xmm0, xmm0", + "movd xmm3, {a0:e}", + "movd xmm4, {a1:e}", + "mov {a1:e}, 0x0004", + "movd xmm5, {a1:e}", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm4, xmm4, 0", + "pshuflw xmm5, xmm5, 0", + "movlhps xmm3, xmm3", + "movlhps xmm4, xmm4", + "movlhps xmm5, xmm5", + "2:", + "movq xmm1, [{src}]", + "movq xmm2, [{src} + 1]", + "punpcklbw xmm1, xmm0", + "punpcklbw xmm2, xmm0", + "pmullw xmm1, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm1, xmm2", + "paddw xmm1, xmm5", + "psraw xmm1, 3", + "packuswb xmm1, xmm1", + "movq [{dst}], xmm1", + "add {dst}, {dstride}", + "dec {h}", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = inout(reg) sstride => _, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = inout(reg) dstride => _, + h = inout(reg) h => _, + a0 = inout(reg) i32::from(8 - dx) => _, + a1 = inout(reg) i32::from(dx) => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + ); + }, + #[cfg(target_arch = "x86")] + _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 8, h), + #[cfg(target_arch = "x86_64")] + _ => { + asm!( + "pxor xmm0, xmm0", + "movd xmm3, {a0:e}", + "movd xmm4, {a1:e}", + "movd xmm5, {b0:e}", + "movd xmm6, {b1:e}", + "mov {a1:e}, 0x0020", + "movd xmm7, {a1:e}", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm4, xmm4, 0", + "pshuflw xmm5, xmm5, 0", + "pshuflw xmm6, xmm6, 0", + "pshuflw xmm7, xmm7, 0", + "movlhps xmm3, xmm3", + "movlhps xmm4, xmm4", + "movlhps xmm5, xmm5", + "movlhps xmm6, xmm6", + "movlhps xmm7, xmm7", + + "movq xmm8, [{src}]", + "movq xmm2, [{src} + 1]", + "punpcklbw xmm8, xmm0", + "punpcklbw xmm2, xmm0", + "pmullw xmm8, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm8, xmm2", + + "2:", + "movq xmm1, [{src}]", + "movq xmm2, [{src} + 1]", + "punpcklbw xmm1, xmm0", + "punpcklbw xmm2, xmm0", + "pmullw 
xmm1, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm1, xmm2", + "movaps xmm2, xmm8", + "movaps xmm8, xmm1", + + "pmullw xmm1, xmm6", + "pmullw xmm2, xmm5", + "paddw xmm1, xmm2", + "paddw xmm1, xmm7", + "psraw xmm1, 6", + "packuswb xmm1, xmm1", + "movq [{dst}], xmm1", + "add {dst}, {dstride}", + "dec {h}", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = inout(reg) sstride => _, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = inout(reg) dstride => _, + h = inout(reg) h => _, + a0 = inout(reg) i32::from(8 - dx) => _, + a1 = inout(reg) i32::from(dx) => _, + b0 = inout(reg) i32::from(8 - dy) => _, + b1 = inout(reg) i32::from(dy) => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + out("xmm7") _, + out("xmm8") _, + ); + }, + }; + } +} + +pub fn chroma_interp_4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) { + unsafe { + match (dx, dy) { + (0, 0) => { + asm!( + "2:", + "movd xmm0, [{src}]", + "movd xmm1, [{src} + {sstride}]", + "movd [{dst}], xmm0", + "lea {src}, [{src} + {sstride} * 2]", + "movd [{dst} + {dstride}], xmm1", + "lea {dst}, [{dst} + {dstride} * 2]", + "sub {h}, 2", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + h = inout(reg) h => _, + out("xmm0") _, + out("xmm1") _, + ); + }, + (0, _) => { + asm!( + "pxor xmm0, xmm0", + "movd xmm3, {a0:e}", + "movd xmm4, {a1:e}", + "mov {a1:e}, 0x0004", + "movd xmm5, {a1:e}", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm4, xmm4, 0", + "pshuflw xmm5, xmm5, 0", + "movd xmm6, [{src}]", + "add {src}, {sstride}", + "punpcklbw xmm6, xmm0", + "2:", + "movaps xmm1, xmm6", + "movd xmm2, [{src}]", + "punpcklbw xmm2, xmm0", + "movaps xmm6, xmm2", + "pmullw xmm1, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm1, xmm2", + "paddw xmm1, xmm5", + "psraw xmm1, 3", + "packuswb xmm1, xmm1", + "movd [{dst}], xmm1", + "add {dst}, {dstride}", + "dec {h}", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = inout(reg) sstride => _, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = inout(reg) dstride => _, + h = inout(reg) h => _, + a0 = inout(reg) i32::from(8 - dy) => _, + a1 = inout(reg) i32::from(dy) => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + ); + }, + (_, 0) => { + asm!( + "pxor xmm0, xmm0", + "movd xmm3, {a0:e}", + "movd xmm4, {a1:e}", + "mov {a1:e}, 0x0004", + "movd xmm5, {a1:e}", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm4, xmm4, 0", + "pshuflw xmm5, xmm5, 0", + "2:", + "movd xmm1, [{src}]", + "movd xmm2, [{src} + 1]", + "punpcklbw xmm1, xmm0", + "punpcklbw xmm2, xmm0", + "pmullw xmm1, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm1, xmm2", + "paddw xmm1, xmm5", + "psraw xmm1, 3", + "packuswb xmm1, xmm1", + "movd [{dst}], xmm1", + "add {dst}, {dstride}", + "dec {h}", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = inout(reg) sstride => _, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = inout(reg) dstride => _, + h = inout(reg) h => _, + a0 = inout(reg) i32::from(8 - dx) => _, + a1 = inout(reg) i32::from(dx) => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + ); + }, + #[cfg(target_arch = "x86")] + _ => chroma_interp(dst, dstride, src, sstride, dx, dy, 4, h), + #[cfg(target_arch = "x86_64")] + _ => 
{ + asm!( + "pxor xmm0, xmm0", + "movd xmm3, {a0:e}", + "movd xmm4, {a1:e}", + "movd xmm5, {b0:e}", + "movd xmm6, {b1:e}", + "mov {a1:e}, 0x0020", + "movd xmm7, {a1:e}", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm4, xmm4, 0", + "pshuflw xmm5, xmm5, 0", + "pshuflw xmm6, xmm6, 0", + "pshuflw xmm7, xmm7, 0", + + "movd xmm8, [{src}]", + "movd xmm2, [{src} + 1]", + "punpcklbw xmm8, xmm0", + "punpcklbw xmm2, xmm0", + "pmullw xmm8, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm8, xmm2", + + "2:", + "movd xmm1, [{src}]", + "movd xmm2, [{src} + 1]", + "punpcklbw xmm1, xmm0", + "punpcklbw xmm2, xmm0", + "pmullw xmm1, xmm3", + "pmullw xmm2, xmm4", + "add {src}, {sstride}", + "paddw xmm1, xmm2", + "movaps xmm2, xmm8", + "movaps xmm8, xmm1", + + "pmullw xmm1, xmm6", + "pmullw xmm2, xmm5", + "paddw xmm1, xmm2", + "paddw xmm1, xmm7", + "psraw xmm1, 6", + "packuswb xmm1, xmm1", + "movd [{dst}], xmm1", + "add {dst}, {dstride}", + "dec {h}", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = inout(reg) sstride => _, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = inout(reg) dstride => _, + h = inout(reg) h => _, + a0 = inout(reg) i32::from(8 - dx) => _, + a1 = inout(reg) i32::from(dx) => _, + b0 = inout(reg) i32::from(8 - dy) => _, + b1 = inout(reg) i32::from(dy) => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + out("xmm7") _, + out("xmm8") _, + ); + }, + }; + } +} + +#[inline] +fn chr_interp2(a: u8, b: u8, b0: u16, b1: u16) -> u8 { + ((u16::from(a) * b0 + u16::from(b) * b1 + 4) >> 3) as u8 +} +#[inline] +fn chr_interp4(a: u8, b: u8, c: u8, d: u8, a0: u16, a1: u16, b0: u16, b1: u16) -> u8 { + ((u16::from(a) * a0 * b0 + u16::from(b) * a1 * b0 + u16::from(c) * a0 * b1 + u16::from(d) * a1 * b1 + 0x20) >> 6) as u8 +} + +pub fn chroma_interp_2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, dx: u16, dy: u16, h: usize) { + let a0 = 8 - dx; + let a1 = dx; + let b0 = 8 - dy; + let b1 = dy; + + if a0 == 8 && b0 == 8 { + unsafe { + let mut src = src.as_ptr(); + let mut dst = dst.as_mut_ptr(); + std::ptr::copy_nonoverlapping(src, dst, 2); + src = src.add(sstride); + dst = dst.add(dstride); + std::ptr::copy_nonoverlapping(src, dst, 2); + if h == 4 { + src = src.add(sstride); + dst = dst.add(dstride); + std::ptr::copy_nonoverlapping(src, dst, 2); + src = src.add(sstride); + dst = dst.add(dstride); + std::ptr::copy_nonoverlapping(src, dst, 2); + } + } + } else if a0 == 8 { + unsafe { + let mut src0 = src.as_ptr(); + let mut src1 = src0.add(sstride); + let mut dst = dst.as_mut_ptr(); + *dst = chr_interp2(*src0, *src1, b0, b1); + *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1); + *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1); + *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1); + if h == 4 { + src0 = src0.add(sstride * 2); + src1 = src1.add(sstride * 2); + dst = dst.add(dstride * 2); + *dst = chr_interp2(*src0, *src1, b0, b1); + *dst.add(1) = chr_interp2(*src0.add(1), *src1.add(1), b0, b1); + *dst.add(dstride) = chr_interp2(*src0.add(sstride), *src1.add(sstride), b0, b1); + *dst.add(dstride + 1) = chr_interp2(*src0.add(sstride + 1), *src1.add(sstride + 1), b0, b1); + } + } + } else if b0 == 8 { + unsafe { + let mut src = src.as_ptr(); + let mut dst = dst.as_mut_ptr(); + let (a, b, c) = (*src, *src.add(1), *src.add(2)); + *dst = chr_interp2(a, b, a0, a1); + *dst.add(1) = chr_interp2(b, c, a0, a1); + let (a, b, 
c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2)); + *dst.add(dstride) = chr_interp2(a, b, a0, a1); + *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1); + if h == 4 { + src = src.add(sstride * 2); + dst = dst.add(dstride * 2); + let (a, b, c) = (*src, *src.add(1), *src.add(2)); + *dst = chr_interp2(a, b, a0, a1); + *dst.add(1) = chr_interp2(b, c, a0, a1); + let (a, b, c) = (*src.add(sstride), *src.add(sstride + 1), *src.add(sstride + 2)); + *dst.add(dstride) = chr_interp2(a, b, a0, a1); + *dst.add(dstride + 1) = chr_interp2(b, c, a0, a1); + } + } + } else { + unsafe { + let height = h; + let mut src0 = src.as_ptr(); + let mut src1 = src0.add(sstride); + let mut dst = dst.as_mut_ptr(); + + let (a, b, c) = (*src0, *src0.add(1), *src0.add(2)); + let (d, e, f) = (*src1, *src1.add(1), *src1.add(2)); + let (g, h, i) = (*src1.add(sstride), *src1.add(sstride + 1), *src1.add(sstride + 2)); + *dst = chr_interp4(a, b, d, e, a0, a1, b0, b1); + *dst.add(1) = chr_interp4(b, c, e, f, a0, a1, b0, b1); + *dst.add(dstride) = chr_interp4(d, e, g, h, a0, a1, b0, b1); + *dst.add(dstride + 1) = chr_interp4(e, f, h, i, a0, a1, b0, b1); + if height == 4 { + src0 = src0.add(sstride * 3); + src1 = src1.add(sstride * 3); + dst = dst.add(dstride * 2); + let (a, b, c) = (*src0, *src0.add(1), *src0.add(2)); + let (d, e, f) = (*src1, *src1.add(1), *src1.add(2)); + *dst = chr_interp4(g, h, a, b, a0, a1, b0, b1); + *dst.add(1) = chr_interp4(h, i, b, c, a0, a1, b0, b1); + *dst.add(dstride) = chr_interp4(a, b, d, e, a0, a1, b0, b1); + *dst.add(dstride + 1) = chr_interp4(b, c, e, f, a0, a1, b0, b1); + } + } + } +} + diff --git a/nihav-itu/src/codecs/h264/dsp/mc/x86/luma_mc.rs b/nihav-itu/src/codecs/h264/dsp/mc/x86/luma_mc.rs new file mode 100644 index 0000000..8109b59 --- /dev/null +++ b/nihav-itu/src/codecs/h264/dsp/mc/x86/luma_mc.rs @@ -0,0 +1,285 @@ +use std::arch::asm; +use super::super::clip_u8; + +const TMP_BUF_STRIDE: usize = 32; + +fn interp_block1(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize, hor: bool, avg0: bool) { + unsafe { + let step = if hor { 1 } else { sstride }; + let avgidx = if avg0 { step * 2 } else { step * 3 }; + let mut src = src.as_ptr(); + let mut dst = dst.as_mut_ptr(); + for _ in 0..h { + for _ in 0..w { + let t = clip_u8(( i16::from(*src) + - 5 * i16::from(*src.add(step)) + + 20 * i16::from(*src.add(step * 2)) + + 20 * i16::from(*src.add(step * 3)) + - 5 * i16::from(*src.add(step * 4)) + + i16::from(*src.add(step * 5)) + + 16) >> 5); + *dst = ((u16::from(t) + u16::from(*src.add(avgidx)) + 1) >> 1) as u8; + src = src.add(1); + dst = dst.add(1); + } + dst = dst.sub(w).add(dstride); + src = src.sub(w).add(sstride); + } + } +} + +fn interp_block2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize, hor: bool) { + unsafe { + let step = if hor { 1 } else { sstride }; + let mut pix = dst.as_mut_ptr(); + let mut src = src.as_ptr(); + for _ in 0..h { + for x in 0..w { + *pix.add(x) = clip_u8(( i16::from(*src) + - 5 * i16::from(*src.add(step)) + + 20 * i16::from(*src.add(step * 2)) + + 20 * i16::from(*src.add(step * 3)) + - 5 * i16::from(*src.add(step * 4)) + + i16::from(*src.add(step * 5)) + + 16) >> 5); + src = src.add(1); + } + pix = pix.add(dstride); + src = src.sub(w); + src = src.add(sstride); + } + } +} + +fn mc_avg_tmp(dst: &mut [u8], dstride: usize, w: usize, h: usize, tmp: &[u8], tmp2: &[u8]) { + unsafe { + let mut src1 = tmp.as_ptr(); + let mut src2 = tmp2.as_ptr(); + let mut dst = dst.as_mut_ptr(); + for _ in 0..h 
{ + for x in 0..w { + let a = *src1.add(x); + let b = *src2.add(x); + *dst.add(x) = ((u16::from(a) + u16::from(b) + 1) >> 1) as u8; + } + dst = dst.add(dstride); + src1 = src1.add(TMP_BUF_STRIDE); + src2 = src2.add(TMP_BUF_STRIDE); + } + } +} + +fn h264_mc01(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + interp_block1(dst, dstride, &src[sstride * 2..], sstride, w, h, true, true); +} + +fn h264_mc02(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + interp_block2(dst, dstride, &src[sstride * 2..], sstride, w, h, true); +} + +fn h264_mc03(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + interp_block1(dst, dstride, &src[sstride * 2..], sstride, w, h, true, false); +} + +fn h264_mc10(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + interp_block1(dst, dstride, &src[2..], sstride, w, h, false, true); +} + +fn h264_mc11(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); + h264_mc20(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +fn h264_mc12(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); + h264_mc22(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +fn h264_mc13(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); + h264_mc20(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +fn h264_mc20(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + interp_block2(dst, dstride, &src[2..], sstride, w, h, false); +} + +fn h264_mc21(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); + h264_mc20(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +fn h264_mc22(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp: [i32; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + unsafe { + let mut src = src.as_ptr(); + let mut dst = tmp.as_mut_ptr(); + for _ in 0..h { + for _ in 0..w+5 { + *dst = i32::from(*src) + - 5 * i32::from(*src.add(sstride)) + + 20 * i32::from(*src.add(sstride * 2)) + + 20 * i32::from(*src.add(sstride * 3)) + - 5 * i32::from(*src.add(sstride * 4)) + + i32::from(*src.add(sstride * 5)); + dst = 
dst.add(1); + src = src.add(1); + } + src = src.sub(w+5).add(sstride); + dst = dst.sub(w+5).add(TMP_BUF_STRIDE); + } + } + unsafe { + let mut dst = dst.as_mut_ptr(); + let mut src = tmp.as_ptr(); + for _ in 0..h { + for _ in 0..w { + *dst = clip_u8(((*src - 5 * *src.add(1) + 20 * *src.add(2) + 20 * *src.add(3) - 5 * *src.add(4) + *src.add(5) + 512) >> 10) as i16); + dst = dst.add(1); + src = src.add(1); + } + dst = dst.sub(w).add(dstride); + src = src.sub(w).add(TMP_BUF_STRIDE); + } + } +} + +fn h264_mc23(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); + h264_mc20(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +fn h264_mc30(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + interp_block1(dst, dstride, &src[2..], sstride, w, h, false, false); +} + +fn h264_mc31(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc20(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); + h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +fn h264_mc32(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); + h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +fn h264_mc33(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + h264_mc20(&mut tmp, TMP_BUF_STRIDE, &src[1..], sstride, w, h); + h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h); + mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); +} + +macro_rules! luma_mc { + ($orig:ident, $func4:ident, $func8:ident, $func16:ident) => { + fn $func4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 4, h); + } + fn $func8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 8, h); + } + fn $func16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 16, h); + } + } +} + +macro_rules! 
mc00_template { + ($func:ident, $load:expr, $store:expr) => { + fn $func(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + unsafe { + asm!( + "lea {tmps}, [{src} + {sstride} * 2]", + "lea {tmpd}, [{dst} + {dstride} * 2]", + "2:", + concat!($load, " xmm0, [{src}]"), + concat!($load, " xmm1, [{src} + {sstride}]"), + concat!($load, " xmm2, [{tmps}]"), + concat!($load, " xmm3, [{tmps} + {sstride}]"), + concat!($store, " [{dst}], xmm0"), + "lea {src}, [{src} + {sstride}*4]", + concat!($store, " [{dst} + {dstride}], xmm1"), + "lea {tmps}, [{tmps} + {sstride}*4]", + concat!($store, " [{tmpd}], xmm2"), + "lea {dst}, [{dst} + {dstride}*4]", + concat!($store, " [{tmpd} + {dstride}], xmm3"), + "lea {tmpd}, [{tmpd} + {dstride}*4]", + "sub {h}, 4", + "jnz 2b", + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + h = inout(reg) h => _, + tmps = out(reg) _, + tmpd = out(reg) _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + ); + } + } + } +} + +mc00_template!(h264_mc00_16, "movups", "movaps"); +mc00_template!(h264_mc00_8, "movq", "movq"); +mc00_template!(h264_mc00_4, "movd", "movd"); + +luma_mc!(h264_mc01, h264_mc01_4, h264_mc01_8, h264_mc01_16); +luma_mc!(h264_mc02, h264_mc02_4, h264_mc02_8, h264_mc02_16); +luma_mc!(h264_mc03, h264_mc03_4, h264_mc03_8, h264_mc03_16); +luma_mc!(h264_mc10, h264_mc10_4, h264_mc10_8, h264_mc10_16); +luma_mc!(h264_mc11, h264_mc11_4, h264_mc11_8, h264_mc11_16); +luma_mc!(h264_mc12, h264_mc12_4, h264_mc12_8, h264_mc12_16); +luma_mc!(h264_mc13, h264_mc13_4, h264_mc13_8, h264_mc13_16); +luma_mc!(h264_mc20, h264_mc20_4, h264_mc20_8, h264_mc20_16); +luma_mc!(h264_mc21, h264_mc21_4, h264_mc21_8, h264_mc21_16); +luma_mc!(h264_mc22, h264_mc22_4, h264_mc22_8, h264_mc22_16); +luma_mc!(h264_mc23, h264_mc23_4, h264_mc23_8, h264_mc23_16); +luma_mc!(h264_mc30, h264_mc30_4, h264_mc30_8, h264_mc30_16); +luma_mc!(h264_mc31, h264_mc31_4, h264_mc31_8, h264_mc31_16); +luma_mc!(h264_mc32, h264_mc32_4, h264_mc32_8, h264_mc32_16); +luma_mc!(h264_mc33, h264_mc33_4, h264_mc33_8, h264_mc33_16); + +pub const H264_LUMA_INTERP: &[[super::super::MCFunc; 16]; 3] = &[ + [ + h264_mc00_4, h264_mc01_4, h264_mc02_4, h264_mc03_4, + h264_mc10_4, h264_mc11_4, h264_mc12_4, h264_mc13_4, + h264_mc20_4, h264_mc21_4, h264_mc22_4, h264_mc23_4, + h264_mc30_4, h264_mc31_4, h264_mc32_4, h264_mc33_4 + ], [ + h264_mc00_8, h264_mc01_8, h264_mc02_8, h264_mc03_8, + h264_mc10_8, h264_mc11_8, h264_mc12_8, h264_mc13_8, + h264_mc20_8, h264_mc21_8, h264_mc22_8, h264_mc23_8, + h264_mc30_8, h264_mc31_8, h264_mc32_8, h264_mc33_8 + ], [ + h264_mc00_16, h264_mc01_16, h264_mc02_16, h264_mc03_16, + h264_mc10_16, h264_mc11_16, h264_mc12_16, h264_mc13_16, + h264_mc20_16, h264_mc21_16, h264_mc22_16, h264_mc23_16, + h264_mc30_16, h264_mc31_16, h264_mc32_16, h264_mc33_16 + ] +]; diff --git a/nihav-itu/src/codecs/h264/dsp/mc/x86/mod.rs b/nihav-itu/src/codecs/h264/dsp/mc/x86/mod.rs new file mode 100644 index 0000000..d472c76 --- /dev/null +++ b/nihav-itu/src/codecs/h264/dsp/mc/x86/mod.rs @@ -0,0 +1,21 @@ +#[allow(clippy::uninit_assumed_init)] +mod luma_mc; +pub use luma_mc::H264_LUMA_INTERP; +mod chroma_mc; +pub use chroma_mc::*; +mod blockdsp; +use blockdsp::*; + +impl super::RegisterSIMD for super::H264MC { + fn register_simd(&mut self) { + self.avg[1] = avg_4; + self.avg[2] = avg_8; + self.avg[3] = avg_16; + self.put_block_weighted[1] = put_block_weighted_4; + self.put_block_weighted[2] = put_block_weighted_8; + 
self.put_block_weighted[3] = put_block_weighted_16; + self.put_block_weighted2[1] = put_block_weighted2_4; + self.put_block_weighted2[2] = put_block_weighted2_8; + self.put_block_weighted2[3] = put_block_weighted2_16; + } +} diff --git a/nihav-itu/src/codecs/h264/dsp/mod.rs b/nihav-itu/src/codecs/h264/dsp/mod.rs index 2d98ddf..c95e124 100644 --- a/nihav-itu/src/codecs/h264/dsp/mod.rs +++ b/nihav-itu/src/codecs/h264/dsp/mod.rs @@ -349,7 +349,7 @@ fn ipred_4x4_diag_down_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[ fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) { let mut t: [u16; 5] = [0; 5]; t[0] = u16::from(left[0]); - load(&mut t[1..], &top); + load(&mut t[1..], top); let mut l: [u16; 5] = [0; 5]; load(&mut l, left); let dst = buf; @@ -367,7 +367,7 @@ fn ipred_4x4_diag_down_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[ fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) { let mut t: [u16; 5] = [0; 5]; t[0] = u16::from(left[0]); - load(&mut t[1..], &top); + load(&mut t[1..], top); let mut l: [u16; 5] = [0; 5]; load(&mut l, left); let dst = buf; @@ -395,7 +395,7 @@ fn ipred_4x4_ver_right(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _ } fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], tr: &[u8]) { let mut t: [u16; 8] = [0; 8]; - load(&mut t[..4], &top); + load(&mut t[..4], top); load(&mut t[4..], tr); let dst = buf; @@ -425,7 +425,7 @@ fn ipred_4x4_ver_left(buf: &mut [u8], stride: usize, top: &[u8], _left: &[u8], t fn ipred_4x4_hor_down(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8], _tr: &[u8]) { let mut t: [u16; 5] = [0; 5]; t[0] = u16::from(left[0]); - load(&mut t[1..], &top); + load(&mut t[1..], top); let mut l: [u16; 5] = [0; 5]; load(&mut l, left); let dst = buf; @@ -739,7 +739,7 @@ fn ipred_8x8_dc(buf: &mut [u8], stride: usize, top: &[u8], left: &[u8]) { let mut l = [0; 8]; load(&mut l, &left[1..]); let mut t = [0; 8]; - load(&mut t, &top); + load(&mut t, top); let dc0 = ((t[0] + t[1] + t[2] + t[3] + l[0] + l[1] + l[2] + l[3] + 4) >> 3) as u8; let sum1 = t[4] + t[5] + t[6] + t[7]; diff --git a/nihav-itu/src/codecs/h264/mb_recon.rs b/nihav-itu/src/codecs/h264/mb_recon.rs index e78c134..e17095b 100644 --- a/nihav-itu/src/codecs/h264/mb_recon.rs +++ b/nihav-itu/src/codecs/h264/mb_recon.rs @@ -244,6 +244,7 @@ fn do_p_mc(frm: &mut NASimpleVideoFrame, xpos: usize, ypos: usize, w: usize, } } +#[allow(clippy::match_like_matches_macro)] fn do_b_mc(frm: &mut NASimpleVideoFrame, mode: BMode, xpos: usize, ypos: usize, w: usize, h: usize, mv0: MV, ref_pic0: Option>, weight0: &WeightInfo, mv1: MV, ref_pic1: Option>, weight1: &WeightInfo, mc_dsp: &mut H264MC) { let do_weight = match (mode, weight0.is_weighted(), weight1.is_weighted()) { (BMode::L0, true, _) => true, @@ -423,10 +424,10 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame, slice_hdr: &SliceHeader, mb_in match mb_info.mb_type { MBType::Intra16x16(_, _, _) => { - pred_intra(frm, &sstate, &mb_info); + pred_intra(frm, sstate, mb_info); }, MBType::Intra4x4 | MBType::Intra8x8 => { - pred_intra(frm, &sstate, &mb_info); + pred_intra(frm, sstate, mb_info); }, MBType::PCM => {}, MBType::PSkip => { @@ -600,8 +601,8 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame, slice_hdr: &SliceHeader, mb_in }; if !mb_info.mb_type.is_skip() { if mb_info.mb_type != MBType::Intra4x4 && mb_info.mb_type != MBType::Intra8x8 { - add_luma(frm, &sstate, &mb_info); + add_luma(frm, sstate, mb_info); } - 
add_chroma(frm, &sstate, &mb_info); + add_chroma(frm, sstate, mb_info); } } diff --git a/nihav-itu/src/codecs/h264/mod.rs b/nihav-itu/src/codecs/h264/mod.rs index a0a94a4..220ed9c 100644 --- a/nihav-itu/src/codecs/h264/mod.rs +++ b/nihav-itu/src/codecs/h264/mod.rs @@ -594,7 +594,7 @@ println!("PAFF?"); } else { 0 }; - recon_mb(&mut frm, slice_hdr, &mb_info, &mut self.sstate, &self.frame_refs, &mut self.mc_dsp, weight_mode); + recon_mb(&mut frm, slice_hdr, mb_info, &mut self.sstate, &self.frame_refs, &mut self.mc_dsp, weight_mode); } else { for (dline, src) in frm.data[frm.offset[0] + xpos + ypos * frm.stride[0]..].chunks_mut(frm.stride[0]).take(16).zip(self.ipcm_buf.chunks(16)) { dline[..16].copy_from_slice(src); @@ -655,8 +655,7 @@ _ => {}, ]; let mut mb_idx = slice_hdr.first_mb_in_slice as usize; - let mut mb_info = CurrentMBInfo::default(); - mb_info.qp_y = slice_hdr.slice_qp; + let mut mb_info = CurrentMBInfo { qp_y: slice_hdr.slice_qp, ..Default::default() }; let skip_type = if slice_hdr.slice_type.is_p() { MBType::PSkip } else { MBType::BSkip }; while br.tell() < full_size && mb_idx < self.num_mbs { mb_info.coded = [false; 25]; @@ -754,8 +753,7 @@ _ => {}, let skip_type = if slice_hdr.slice_type.is_p() { MBType::PSkip } else { MBType::BSkip }; let mut last_qp_diff = false; - let mut mb_info = CurrentMBInfo::default(); - mb_info.qp_y = slice_hdr.slice_qp; + let mut mb_info = CurrentMBInfo { qp_y: slice_hdr.slice_qp, ..Default::default() }; while mb_idx < self.num_mbs { mb_info.coded = [false; 25]; @@ -771,7 +769,7 @@ _ => {}, if self.is_mbaff && (((mb_idx & 1) == 0) || (prev_mb_skipped && ((mb_idx & 1) == 1))) { let _mb_field_decoding = cabac.decode_bit(70); } - let mut mb_type = cabac_decode_mb_type(cabac, &slice_hdr, &self.sstate); + let mut mb_type = cabac_decode_mb_type(cabac, slice_hdr, &self.sstate); mb_info.mb_type = mb_type; mb_info.transform_size_8x8 = false; if mb_type == MBType::PCM { diff --git a/nihav-itu/src/codecs/h264/pic_ref.rs b/nihav-itu/src/codecs/h264/pic_ref.rs index 0d69292..cafcc56 100644 --- a/nihav-itu/src/codecs/h264/pic_ref.rs +++ b/nihav-itu/src/codecs/h264/pic_ref.rs @@ -371,11 +371,7 @@ impl FrameRefs { pub fn select_ref_pic(&self, list_id: u8, ref_id: usize) -> Option> { let ref_list = if list_id == 0 { &self.ref_list0 } else { &self.ref_list1 }; if ref_list.len() > ref_id { - if let Some(ref pic) = ref_list[ref_id] { - Some(pic.buf.clone()) - } else { - None - } + ref_list[ref_id].as_ref().map(|pic| pic.buf.clone()) } else { None } diff --git a/nihav-itu/src/codecs/h264/sets.rs b/nihav-itu/src/codecs/h264/sets.rs index b02faf1..61cf0a8 100644 --- a/nihav-itu/src/codecs/h264/sets.rs +++ b/nihav-itu/src/codecs/h264/sets.rs @@ -44,10 +44,7 @@ pub struct SeqParameterSet { } pub fn is_high_profile(profile: u8) -> bool { - match profile { - 100 | 110 | 122 | 244 | 44 | 83 | 86 | 118 | 128 | 138 | 139 | 134 | 125 => true, - _ => false, - } + matches!(profile, 100 | 110 | 122 | 244 | 44 | 83 | 86 | 118 | 128 | 138 | 139 | 134 | 125) } #[allow(clippy::cognitive_complexity)] diff --git a/nihav-itu/src/codecs/h264/slice.rs b/nihav-itu/src/codecs/h264/slice.rs index 5c70729..e5a72ad 100644 --- a/nihav-itu/src/codecs/h264/slice.rs +++ b/nihav-itu/src/codecs/h264/slice.rs @@ -18,23 +18,14 @@ pub enum SliceType { impl SliceType { pub fn is_intra(self) -> bool { - match self { - SliceType::I | SliceType::SI => true, - _ => false, - } + matches!(self, SliceType::I | SliceType::SI) } pub fn is_p(self) -> bool { - match self { - SliceType::P | SliceType::SP => true, - _ 
=> false, - } + matches!(self, SliceType::P | SliceType::SP) } pub fn is_b(self) -> bool { self == SliceType::B } pub fn is_s(self) -> bool { - match self { - SliceType::SI | SliceType::SP => true, - _ => false, - } + matches!(self, SliceType::SI | SliceType::SP) } pub fn to_frame_type(self) -> FrameType { match self { @@ -162,6 +153,7 @@ pub fn parse_slice_header_minimal(br: &mut BitReader) -> DecoderResult<(usize, S } #[allow(clippy::cognitive_complexity)] +#[allow(clippy::manual_range_contains)] pub fn parse_slice_header(br: &mut BitReader, sps_arr: &[SeqParameterSet], pps_arr: &[PicParameterSet], is_idr: bool, nal_ref_idc: u8) -> DecoderResult { let mut hdr: SliceHeader = unsafe { std::mem::zeroed() }; diff --git a/nihav-itu/src/codecs/h264/types.rs b/nihav-itu/src/codecs/h264/types.rs index f70819b..319ebf5 100644 --- a/nihav-itu/src/codecs/h264/types.rs +++ b/nihav-itu/src/codecs/h264/types.rs @@ -36,23 +36,13 @@ pub enum MBType { impl MBType { pub fn is_intra(self) -> bool { - match self { - MBType::Intra4x4 | MBType::Intra8x8 | MBType::Intra16x16(_, _, _) | MBType::PCM => true, - _ => false, - } + matches!(self, MBType::Intra4x4 | MBType::Intra8x8 | MBType::Intra16x16(_, _, _) | MBType::PCM) } pub fn is_intra16x16(self) -> bool { - if let MBType::Intra16x16(_, _, _) = self { - true - } else { - false - } + matches!(self, MBType::Intra16x16(_, _, _)) } pub fn is_skip(self) -> bool { - match self { - MBType::PSkip | MBType::BSkip => true, - _ => false, - } + matches!(self, MBType::PSkip | MBType::BSkip) } pub fn is_4x4(self) -> bool { self.num_parts() == 4 } pub fn is_l0(self, part: usize) -> bool { @@ -200,42 +190,28 @@ pub enum CompactMBType { impl CompactMBType { pub fn is_intra(self) -> bool { - match self { - CompactMBType::Intra4x4 | CompactMBType::Intra8x8 | CompactMBType::Intra16x16 => true, - _ => false, - } + matches!(self, CompactMBType::Intra4x4 | CompactMBType::Intra8x8 | CompactMBType::Intra16x16) } pub fn is_intra16orpcm(self) -> bool { - match self { - CompactMBType::Intra16x16 | CompactMBType::PCM => true, - _ => false, - } + matches!(self, CompactMBType::Intra16x16 | CompactMBType::PCM) } pub fn is_skip(self) -> bool { - match self { - CompactMBType::PSkip | CompactMBType::BSkip => true, - _ => false, - } + matches!(self, CompactMBType::PSkip | CompactMBType::BSkip) } pub fn is_direct(self) -> bool { - match self { - CompactMBType::BSkip | CompactMBType::Direct | CompactMBType::None => true, - _ => false, - } + matches!(self, CompactMBType::BSkip | CompactMBType::Direct | CompactMBType::None) } pub fn is_inter(self) -> bool { !self.is_intra() && !self.is_skip() && self != CompactMBType::PCM } pub fn is_16x16_ref(self) -> bool { - match self { + matches!(self, CompactMBType::Intra4x4 | CompactMBType::Intra8x8 | CompactMBType::Intra16x16 | CompactMBType::PCM | CompactMBType::P16x16 | - CompactMBType::B16x16 => true, - _ => false, - } + CompactMBType::B16x16) } } @@ -313,9 +289,9 @@ impl From for IntraPredMode { } } -impl Into for IntraPredMode { - fn into(self) -> u8 { - match self { +impl From for u8 { + fn from(val: IntraPredMode) -> Self { + match val { IntraPredMode::Vertical => 0, IntraPredMode::Horizontal => 1, IntraPredMode::DC => 2, @@ -795,6 +771,7 @@ impl SliceState { self.get_cur_blk4(blk4).mv = [mv0, mv1]; self.get_cur_blk8(blk4_to_blk8(blk4)).ref_idx = [ref0, ref1]; } + #[allow(clippy::nonminimal_bool)] pub fn get_direct_mv(&self, frame_refs: &FrameRefs, mbi: &FrameMBInfo, r1_poc: u16, r1_long: bool, temporal_mv: bool, cur_id: u16, blk4: usize) -> (MV, 
PicRef, MV, PicRef) { let blk8 = blk4_to_blk8(blk4); let (col_mv, r0_poc, col_idx) = if mbi.ref_poc[blk8] == [MISSING_POC; 2] { diff --git a/nihav-itu/src/codecs/mod.rs b/nihav-itu/src/codecs/mod.rs index efb24bc..ac1f7d6 100644 --- a/nihav-itu/src/codecs/mod.rs +++ b/nihav-itu/src/codecs/mod.rs @@ -4,7 +4,9 @@ macro_rules! validate { ($a:expr) => { if !$a { println!("check failed at {}:{}", file!(), line!()); return Err(DecoderError::InvalidData); } }; } +#[allow(clippy::collapsible_else_if)] #[allow(clippy::too_many_arguments)] +#[allow(clippy::upper_case_acronyms)] #[cfg(feature="decoder_h264")] mod h264; -- 2.39.5
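
A note on the weighted-prediction kernels in blockdsp.rs (this paragraph and the sketch below are commentary, not part of the patch): the `simd` feature is off by default, so these SSE2 paths are only compiled in when the crate is built with the feature enabled (e.g. `cargo build --features simd` on a new enough rustc). Unlike the chroma and luma interpolation files, blockdsp.rs keeps no scalar fallback next to the assembly, so the plain-Rust sketch below spells out the arithmetic that the `put_block_weighted_*` and `put_block_weighted2_*` kernels implement. It assumes the same 16-byte-packed temporary rows the asm reads; the `*_ref` names are illustrative, not symbols from the patch.

// Unidirectional weighting, as in put_block_weighted_{4,8,16}:
// out = clip_u8(((src * weight + round) >> shift) + offset)
fn put_block_weighted_ref(dst: &mut [u8], stride: usize, src: &[u8],
                          w: usize, h: usize, wparams: [i8; 3]) {
    let weight = i32::from(wparams[0]);
    let offset = i32::from(wparams[1]);
    let wshift = i32::from(wparams[2]);
    let bias   = (1i32 << wshift) >> 1;            // rounding term added before the shift
    for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) {
        for (d, &s) in drow[..w].iter_mut().zip(srow.iter()) {
            let val = ((i32::from(s) * weight + bias) >> wshift) + offset;
            *d = val.clamp(0, 255) as u8;          // packuswb-style saturation
        }
    }
}

// Bidirectional weighting, as in put_block_weighted2_{4,8,16}:
// both references are scaled, summed with rounding, shifted, then offset.
fn put_block_weighted2_ref(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8],
                           w: usize, h: usize, wparams: [i8; 5]) {
    let (w0, o0) = (i32::from(wparams[0]), i32::from(wparams[1]));
    let (w1, o1) = (i32::from(wparams[2]), i32::from(wparams[3]));
    let shift  = i32::from(wparams[4]) + 1;
    let offset = (o0 + o1 + 1) >> 1;
    let bias   = (1i32 << shift) >> 1;
    for ((drow, s0row), s1row) in dst.chunks_mut(stride)
                                     .zip(src0.chunks(16)).zip(src1.chunks(16)).take(h) {
        for ((d, &a), &b) in drow[..w].iter_mut().zip(s0row.iter()).zip(s1row.iter()) {
            let val = ((i32::from(a) * w0 + i32::from(b) * w1 + bias) >> shift) + offset;
            *d = val.clamp(0, 255) as u8;
        }
    }
}

For the unweighted cases ([1, 0, 0] and [1, 0, 1, 0, 0]) these formulas reduce to a plain copy and a rounded average, matching the early-exit branches the asm versions take before setting up the multiply/shift pipeline.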