#[cfg(target_arch="x86_64")]
use std::arch::asm;

/// Portable row-by-row plane copy shared by every `copy_luma` variant.
///
/// Copies the first `w` bytes of each `sstride`-sized row of `src` into the
/// matching `dstride`-sized row of `dst`, for at most `h` rows. Copying stops
/// early when either buffer runs out of rows.
///
/// # Panics
///
/// Panics if either stride is zero or if a visited row is shorter than `w`.
fn copy_luma_rows(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
    for (dline, sline) in dst.chunks_mut(dstride)
            .zip(src.chunks(sstride))
            .take(h) {
        dline[..w].copy_from_slice(&sline[..w]);
    }
}

/// Copies a `w`x`h` byte plane from `src` (row pitch `sstride`) into `dst`
/// (row pitch `dstride`).
///
/// # Panics
///
/// Panics if either stride is zero or if a visited row is shorter than `w`.
#[cfg(not(target_arch="x86_64"))]
fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
    copy_luma_rows(dst, dstride, src, sstride, w, h);
}

/// Reports whether the AVX loop in `copy_luma` may run on these arguments.
///
/// Every condition below is something the assembly relies on but cannot check
/// for itself: a non-empty area, 64-byte row granularity, 32-byte alignment of
/// both base pointers and both strides (required by `vmovdqa`), strides that
/// are not smaller than the row width, buffers long enough to hold all `h`
/// rows, and a CPU that actually implements AVX.
#[cfg(target_arch="x86_64")]
fn can_copy_luma_avx(dst: &[u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) -> bool {
    if w == 0 || h == 0 || (w % 64) != 0 {
        return false;
    }
    if dstride < w || sstride < w || ((dstride | sstride) % 32) != 0 {
        return false;
    }
    if dst.as_ptr().align_offset(32) != 0 || src.as_ptr().align_offset(32) != 0 {
        return false;
    }
    // The last row only needs `w` bytes, not a whole stride.
    let fits = |len: usize, stride: usize| {
        matches!((h - 1).checked_mul(stride).and_then(|v| v.checked_add(w)), Some(need) if need <= len)
    };
    fits(dst.len(), dstride) && fits(src.len(), sstride) && is_x86_feature_detected!("avx")
}

/// Copies a `w`x`h` byte plane from `src` (row pitch `sstride`) into `dst`
/// (row pitch `dstride`).
///
/// When `can_copy_luma_avx` accepts the arguments the copy is done 64 bytes at
/// a time with aligned AVX loads and stores; otherwise the portable
/// row-by-row loop is used.
///
/// # Panics
///
/// On the portable path, panics if either stride is zero or if a visited row
/// is shorter than `w`.
#[cfg(target_arch="x86_64")]
fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
    if can_copy_luma_avx(dst, dstride, src, sstride, w, h) {
        // SAFETY: `can_copy_luma_avx` has verified that AVX is available, that
        // both pointers and both strides are 32-byte aligned, that `w` is a
        // non-zero multiple of 64, that `h` is non-zero, that `stride - w`
        // cannot underflow, and that `(h - 1) * stride + w` bytes lie inside
        // each slice. `dst` and `src` cannot overlap because `dst` is a unique
        // borrow. Every register the assembly writes is declared as an output.
        unsafe {
            asm!(
                "2:",
                "  mov {x}, {w}",
                "  3:",
                "    vmovdqa ymm0, [{src}]",
                "    vmovdqa ymm1, [{src}+32]",
                "    vmovdqa [{dst}],    ymm0",
                "    vmovdqa [{dst}+32], ymm1",
                "    add {src}, 64",
                "    add {dst}, 64",
                "    sub {x}, 64",
                "    jnz 3b",
                "  add {src}, {sstep}",
                "  add {dst}, {dstep}",
                "  dec {h}",
                "  jnz 2b",
                dst = inout(reg) dst.as_mut_ptr() => _,
                src = inout(reg) src.as_ptr() => _,
                sstep = in(reg) sstride - w,
                dstep = in(reg) dstride - w,
                w = in(reg) w,
                // The row counter is decremented by the assembly, so it must
                // be an in/out operand rather than a read-only input.
                h = inout(reg) h => _,
                x = out(reg) _,
                out("ymm0") _,
                out("ymm1") _,
            );
        }
    } else {
        copy_luma_rows(dst, dstride, src, sstride, w, h);
    }
}
+#[cfg(target_arch="x86_64")] fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) { unsafe { let width = frm.width[1]; @@ -384,11 +432,7 @@ fn fill_frame(ifmt: VAImageFormat, pic: &Picture, frm: &mut NABuffe validate!(iimg.width == (frm.width[0] as u16)); validate!(iimg.height == (frm.height[0] as u16)); - for (dline, sline) in frm.data[frm.offset[0]..].chunks_mut(frm.stride[0]) - .zip(imgdata[iimg.offsets[0] as usize..].chunks(iimg.pitches[0] as usize)) - .take(frm.height[0]) { - dline[..frm.width[0]].copy_from_slice(&sline[..frm.width[0]]); - } + copy_luma(&mut frm.data[frm.offset[0]..], frm.stride[0], &imgdata[iimg.offsets[0] as usize..], iimg.pitches[0] as usize, frm.width[0], frm.height[0]); deint_chroma(frm, &imgdata[iimg.offsets[1] as usize..], iimg.pitches[1] as usize); },