X-Git-Url: https://git.nihav.org/?a=blobdiff_plain;f=nihav-itu%2Fsrc%2Fcodecs%2Fh264%2Fdsp%2Fmc%2Fx86%2Fblockdsp.rs;fp=nihav-itu%2Fsrc%2Fcodecs%2Fh264%2Fdsp%2Fmc%2Fx86%2Fblockdsp.rs;h=cd5258693f2084c7af95320a4504c1f8f77516c2;hb=42005e259dd77147b77c7a0057aa3cf033e331d0;hp=0000000000000000000000000000000000000000;hpb=15845d1a4eafe534d6f9064ef8a3622d377c4be6;p=nihav.git diff --git a/nihav-itu/src/codecs/h264/dsp/mc/x86/blockdsp.rs b/nihav-itu/src/codecs/h264/dsp/mc/x86/blockdsp.rs new file mode 100644 index 0000000..cd52586 --- /dev/null +++ b/nihav-itu/src/codecs/h264/dsp/mc/x86/blockdsp.rs @@ -0,0 +1,347 @@ +use std::arch::asm; + +macro_rules! avg_template { + ($name: ident, $mov: expr) => { + pub fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) { + unsafe { + asm!( + "2:", + concat!($mov, " xmm1, [{src}]"), + concat!($mov, " xmm3, [{src} + {sstride}]"), + concat!($mov, " xmm0, [{dst}]"), + concat!($mov, " xmm2, [{dst} + {dstride}]"), + "lea {src}, [{src} + {sstride} * 2]", + "pavgb xmm0, xmm1", + "pavgb xmm2, xmm3", + concat!($mov, " [{dst}], xmm0"), + concat!($mov, " [{dst} + {dstride}], xmm2"), + "lea {dst}, [{dst} + {dstride} * 2]", + "sub {h}, 2", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + h = inout(reg) bh => _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + ); + } + } + } +} + +avg_template!(avg_4, "movd"); +avg_template!(avg_8, "movq"); + +pub fn avg_16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) { + unsafe { + asm!( + "lea {stmp}, [{src} + {sstride} * 2]", + "lea {dtmp}, [{dst} + {dstride} * 2]", + "2:", + "movaps xmm0, [{src}]", + "movaps xmm1, [{src} + {sstride}]", + "movaps xmm2, [{stmp}]", + "movaps xmm3, [{stmp} + {sstride}]", + "pavgb xmm0, [{dst}]", + "pavgb xmm1, [{dst} + {dstride}]", + "pavgb xmm2, [{dtmp}]", + "pavgb xmm3, [{dtmp} + {dstride}]", + "lea {src}, [{src} + {sstride} * 4]", + "movaps [{dst}], xmm0", + "lea {stmp}, [{stmp} + {sstride} * 4]", + "movaps [{dst} + {dstride}], xmm1", + "lea {dst}, [{dst} + {dstride} * 4]", + "movaps [{dtmp}], xmm2", + "movaps [{dtmp} + {dstride}], xmm3", + "lea {dtmp}, [{dtmp} + {dstride} * 4]", + "sub {h}, 4", + "jnz 2b", + src = inout(reg) src.as_ptr() => _, + sstride = in(reg) sstride, + dst = inout(reg) dst.as_mut_ptr() => _, + dstride = in(reg) dstride, + h = inout(reg) bh => _, + stmp = out(reg) _, + dtmp = out(reg) _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + ); + } +} + +macro_rules! 
put_block_weighted { + ($func:ident, $width:expr, $load:expr, $store:expr) => { + pub fn $func(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) { + if wparams == [1, 0, 0] { + for (dst, src) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) { + dst[..$width].copy_from_slice(&src[..$width]); + } + } else { + let weight = i32::from(wparams[0]); + let offset = i32::from(wparams[1]); + let wshift = i32::from(wparams[2]); + let bias = (1 << wshift) >> 1; + + unsafe { + asm!( + "xorps xmm0, xmm0", + "movd xmm1, {weight:e}", + "movd xmm2, {offset:e}", + "movd xmm3, {wshift:e}", + "movd xmm4, {bias:e}", + "pshuflw xmm1, xmm1, 0", + "pshuflw xmm2, xmm2, 0", + "pshuflw xmm4, xmm4, 0", + "movlhps xmm1, xmm1", + "movlhps xmm2, xmm2", + "movlhps xmm4, xmm4", + "2:", + concat!($load, " xmm5, [{src}]"), + "add {src}, 16", + "movaps xmm7, xmm5", + "punpcklbw xmm5, xmm0", + "punpckhbw xmm7, xmm0", + "pmullw xmm5, xmm1", + "pmullw xmm7, xmm1", + "paddw xmm5, xmm4", + "paddw xmm7, xmm4", + "psraw xmm5, xmm3", + "psraw xmm7, xmm3", + "paddw xmm5, xmm2", + "paddw xmm7, xmm2", + "packuswb xmm5, xmm7", + concat!($store, " [{dst}], xmm5"), + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + h = inout(reg) h => _, + src = inout(reg) src.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + weight = in(reg) weight, + offset = in(reg) offset, + wshift = in(reg) wshift, + bias = in(reg) bias, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm7") _, + ); + } + } + } + } +} + +put_block_weighted!(put_block_weighted_16, 16, "movups", "movaps"); +put_block_weighted!(put_block_weighted_8, 8, "movq", "movq"); +put_block_weighted!(put_block_weighted_4, 4, "movd", "movd"); + +macro_rules! 
put_block_weighted2 { + ($func:ident, $mov:expr) => { + pub fn $func(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + if wparams == [1, 0, 1, 0, 0] { + unsafe { + asm!( + "2:", + concat!($mov, " xmm0, [{src0}]"), + concat!($mov, " xmm1, [{src1}]"), + "add {src0}, 16", + "pavgb xmm0, xmm1", + "add {src1}, 16", + concat!($mov, " [{dst}], xmm0"), + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + h = inout(reg) h => _, + out("xmm0") _, + out("xmm1") _, + ); + } + return; + } + let weight0 = i32::from(wparams[0]); + let offset0 = i32::from(wparams[1]); + let weight1 = i32::from(wparams[2]); + let offset1 = i32::from(wparams[3]); + let wshift = i32::from(wparams[4]) + 1; + let offset = (offset0 + offset1 + 1) >> 1; + let bias = (1 << wshift) >> 1; + + unsafe { + asm!( + "xorps xmm0, xmm0", + "movd xmm1, {weight0:e}", + "movd xmm2, {weight1:e}", + "movd xmm3, {offset:e}", + "movd xmm4, {wshift:e}", + "movd xmm5, {bias:e}", + "pshuflw xmm1, xmm1, 0", + "pshuflw xmm2, xmm2, 0", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm5, xmm5, 0", + "movlhps xmm1, xmm1", + "movlhps xmm2, xmm2", + "movlhps xmm3, xmm3", + "movlhps xmm5, xmm5", + "2:", + concat!($mov, " xmm6, [{src0}]"), + "add {src0}, 16", + concat!($mov, " xmm7, [{src1}]"), + "add {src1}, 16", + "punpcklbw xmm6, xmm0", + "punpcklbw xmm7, xmm0", + "pmullw xmm6, xmm1", + "pmullw xmm7, xmm2", + "paddw xmm6, xmm5", + "paddw xmm6, xmm7", + "psraw xmm6, xmm4", + "paddw xmm6, xmm3", + "movhlps xmm7, xmm6", + "packuswb xmm6, xmm7", + concat!($mov, " [{dst}], xmm6"), + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + h = inout(reg) h => _, + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + weight0 = in(reg) weight0, + weight1 = in(reg) weight1, + offset = in(reg) offset, + wshift = in(reg) wshift, + bias = in(reg) bias, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + out("xmm7") _, + ); + } + } + } +} + +put_block_weighted2!(put_block_weighted2_8, "movq"); +put_block_weighted2!(put_block_weighted2_4, "movd"); + +pub fn put_block_weighted2_16(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + if wparams == [1, 0, 1, 0, 0] { + unsafe { + asm!( + "2:", + "movups xmm0, [{src0}]", + "movups xmm1, [{src1}]", + "add {src0}, 16", + "pavgb xmm0, xmm1", + "add {src1}, 16", + "movaps [{dst}], xmm0", + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + h = inout(reg) h => _, + out("xmm0") _, + out("xmm1") _, + ); + } + return; + } + let weight0 = i32::from(wparams[0]); + let offset0 = i32::from(wparams[1]); + let weight1 = i32::from(wparams[2]); + let offset1 = i32::from(wparams[3]); + let wshift = i32::from(wparams[4]) + 1; + let offset = (offset0 + offset1 + 1) >> 1; + let bias = (1 << wshift) >> 1; + + unsafe { + asm!( + "xorps xmm0, xmm0", + "movd xmm1, {weight0:e}", + "movd xmm2, {weight1:e}", + "movd xmm3, {offset:e}", + "movd xmm4, {wshift:e}", + "movd xmm5, {bias:e}", + "pshuflw xmm1, xmm1, 0", + "pshuflw xmm2, xmm2, 0", + "pshuflw xmm3, xmm3, 0", + "pshuflw xmm5, xmm5, 0", + "movlhps xmm1, xmm1", + "movlhps xmm2, xmm2", 
+ "movlhps xmm3, xmm3", + "movlhps xmm5, xmm5", + "2:", + "movq xmm6, [{src0}]", + "movq xmm7, [{src1}]", + "punpcklbw xmm6, xmm0", + "punpcklbw xmm7, xmm0", + "pmullw xmm6, xmm1", + "pmullw xmm7, xmm2", + "paddw xmm6, xmm5", + "paddw xmm6, xmm7", + "psraw xmm6, xmm4", + "paddw xmm6, xmm3", + "movhlps xmm7, xmm6", + "packuswb xmm6, xmm7", + "movq [{dst}], xmm6", + "movq xmm6, [{src0} + 8]", + "add {src0}, 16", + "movq xmm7, [{src1} + 8]", + "add {src1}, 16", + "punpcklbw xmm6, xmm0", + "punpcklbw xmm7, xmm0", + "pmullw xmm6, xmm1", + "pmullw xmm7, xmm2", + "paddw xmm6, xmm5", + "paddw xmm6, xmm7", + "psraw xmm6, xmm4", + "paddw xmm6, xmm3", + "movhlps xmm7, xmm6", + "packuswb xmm6, xmm7", + "movq [{dst} + 8], xmm6", + "add {dst}, {stride}", + "dec {h}", + "jnz 2b", + h = inout(reg) h => _, + src0 = inout(reg) src0.as_ptr() => _, + src1 = inout(reg) src1.as_ptr() => _, + dst = inout(reg) dst.as_mut_ptr() => _, + stride = in(reg) stride, + weight0 = in(reg) weight0, + weight1 = in(reg) weight1, + offset = in(reg) offset, + wshift = in(reg) wshift, + bias = in(reg) bias, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + out("xmm4") _, + out("xmm5") _, + out("xmm6") _, + out("xmm7") _, + ); + } +}