From a439fb0b8b38fc265d79140a2fa9592232744161 Mon Sep 17 00:00:00 2001
From: Kostya Shishkov
Date: Sat, 21 Oct 2023 09:54:50 +0200
Subject: [PATCH] hwdec-vaapi: optimise chroma deinterleaving

---
 hwdec-vaapi/src/lib.rs | 145 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 135 insertions(+), 10 deletions(-)

diff --git a/hwdec-vaapi/src/lib.rs b/hwdec-vaapi/src/lib.rs
index ca843c2..7b580d5 100644
--- a/hwdec-vaapi/src/lib.rs
+++ b/hwdec-vaapi/src/lib.rs
@@ -283,6 +283,140 @@ pub struct VaapiH264Decoder {
     tb_den: u32,
 }
 
+/// Splits interleaved chroma rows from `src` into the two chroma planes of `frm`.
+///
+/// Each source row starts `sstride` bytes after the previous one and holds byte
+/// pairs: the first byte of a pair goes to plane 1, the second to plane 2.
+#[cfg(not(target_arch="x86_64"))]
+fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) {
+    let mut uoff = frm.offset[1];
+    let mut voff = frm.offset[2];
+    for cline in src.chunks(sstride).take(frm.height[1]) {
+        for (x, pair) in cline.chunks_exact(2).take(frm.width[1]).enumerate() {
+            frm.data[uoff + x] = pair[0];
+            frm.data[voff + x] = pair[1];
+        }
+        uoff += frm.stride[1];
+        voff += frm.stride[2];
+    }
+}
+
+#[cfg(target_arch="x86_64")]
+use std::arch::asm;
+/// Splits interleaved chroma rows from `src` into the two chroma planes of `frm`.
+///
+/// Each source row starts `sstride` bytes after the previous one and holds byte
+/// pairs: the first byte of a pair goes to plane 1, the second to plane 2.
+/// Groups of eight pairs are handled with SIMD assembly and the remaining
+/// columns of every row with plain Rust code.
+///
+/// # Panics
+///
+/// Panics if `src` or the destination planes are too small for the chroma
+/// dimensions stored in `frm`.
+#[cfg(target_arch="x86_64")]
+fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) {
+    let width = frm.width[1];
+    let height = frm.height[1];
+    if width == 0 || height == 0 {
+        return;
+    }
+
+    // The assembly below does no bounds checking, so make sure every row it
+    // touches lies inside the source and destination buffers.
+    assert!(sstride >= width * 2);
+    assert!(src.len() >= (height - 1) * sstride + width * 2);
+    for plane in 1..=2 {
+        assert!(frm.stride[plane] >= width);
+        assert!(frm.data.len() >= frm.offset[plane] + (height - 1) * frm.stride[plane] + width);
+    }
+
+    // Only whole groups of eight chroma samples go through the assembly.
+    let simd_width = width & !7;
+    if simd_width > 0 {
+        // SAFETY: the checks above guarantee that `height` rows of
+        // `simd_width` byte pairs are readable from `src` and that as many
+        // bytes are writable in both chroma planes. All memory loads and
+        // stores are unaligned, so no alignment is required of any pointer.
+        unsafe {
+            let dst = frm.data.as_mut_ptr();
+            asm!(
+                "2:",
+                "  mov {tmp}, {width}",
+                "  test {width}, 8",
+                "  jz 3f",
+                "  movups xmm0, [{src}]",
+                "  movaps xmm1, xmm0",
+                "  psllw xmm0, 8",
+                "  psrlw xmm1, 8",
+                "  psrlw xmm0, 8",
+                "  packuswb xmm1, xmm1",
+                "  packuswb xmm0, xmm0",
+                "  movq [{vdst}], xmm1",
+                "  movq [{udst}], xmm0",
+                "  add {src}, 16",
+                "  add {vdst}, 8",
+                "  add {udst}, 8",
+                "  sub {tmp}, 8",
+                "  jz 4f",
+                "3:",
+                "  movups xmm0, [{src}]",
+                "  movups xmm1, [{src} + 16]",
+                "  movaps xmm2, xmm0",
+                "  movaps xmm3, xmm1",
+                "  psllw xmm0, 8",
+                "  psllw xmm1, 8",
+                "  psrlw xmm2, 8",
+                "  psrlw xmm3, 8",
+                "  psrlw xmm0, 8",
+                "  psrlw xmm1, 8",
+                "  packuswb xmm2, xmm3",
+                "  packuswb xmm0, xmm1",
+                "  movups [{vdst}], xmm2",
+                "  movups [{udst}], xmm0",
+                "  add {src}, 32",
+                "  add {vdst}, 16",
+                "  add {udst}, 16",
+                "  sub {tmp}, 16",
+                "  jnz 3b",
+                "4:",
+                "  add {udst}, {ustep}",
+                "  add {vdst}, {vstep}",
+                "  add {src}, {sstep}",
+                "  dec {height}",
+                "  jnz 2b",
+                src = inout(reg) src.as_ptr() => _,
+                udst = inout(reg) dst.add(frm.offset[1]) => _,
+                vdst = inout(reg) dst.add(frm.offset[2]) => _,
+                width = in(reg) simd_width,
+                height = inout(reg) height => _,
+                ustep = in(reg) frm.stride[1] - simd_width,
+                vstep = in(reg) frm.stride[2] - simd_width,
+                sstep = in(reg) sstride - simd_width * 2,
+                tmp = out(reg) _,
+                out("xmm0") _,
+                out("xmm1") _,
+                out("xmm2") _,
+                out("xmm3") _,
+            );
+        }
+    }
+
+    // Leftover columns (fewer than eight per row) are handled one pair at a time.
+    if simd_width < width {
+        let mut uoff = frm.offset[1];
+        let mut voff = frm.offset[2];
+        for cline in src.chunks(sstride).take(height) {
+            for (x, pair) in cline.chunks_exact(2).enumerate().take(width).skip(simd_width) {
+                frm.data[uoff + x] = pair[0];
+                frm.data[voff + x] = pair[1];
+            }
+            uoff += frm.stride[1];
+            voff += frm.stride[2];
+        }
+    }
+}
+
 fn fill_frame(ifmt: VAImageFormat, pic: &Picture, frm: &mut NABufferType) -> DecoderResult<()> {
     let mut vbuf = frm.get_vbuf().unwrap();
     let (w, h) = pic.surface_size();
@@ -305,16 +439,7 @@ fn fill_frame(ifmt: VAImageFormat, pic: &Picture, frm: &mut NABuffe
                 dline[..frm.width[0]].copy_from_slice(&sline[..frm.width[0]]);
             }
 
-            let mut uoff = frm.offset[1];
-            let mut voff = frm.offset[2];
-            for cline in imgdata[iimg.offsets[1] as usize..].chunks(iimg.pitches[1] as usize).take(frm.height[1]) {
-                for (x, pair) in cline.chunks_exact(2).take(frm.width[1]).enumerate() {
-                    frm.data[uoff + x] = pair[0];
-                    frm.data[voff + x] = pair[1];
-                }
-                uoff += frm.stride[1];
-                voff += frm.stride[2];
-            }
+            deint_chroma(frm, &imgdata[iimg.offsets[1] as usize..], iimg.pitches[1] as usize);
         },
         _ => unimplemented!(),
     };
-- 
2.39.5