X-Git-Url: https://git.nihav.org/?a=blobdiff_plain;f=hwdec-vaapi%2Fsrc%2Flib.rs;h=cdc8b080a52f3df32df7fc5a920c80cda04f42a0;hb=HEAD;hp=7b580d5917a006c044c811f05b725692175fbc58;hpb=a439fb0b8b38fc265d79140a2fa9592232744161;p=nihav-player.git

diff --git a/hwdec-vaapi/src/lib.rs b/hwdec-vaapi/src/lib.rs
index 7b580d5..cdc8b08 100644
--- a/hwdec-vaapi/src/lib.rs
+++ b/hwdec-vaapi/src/lib.rs
@@ -283,6 +283,23 @@ pub struct VaapiH264Decoder {
     tb_den: u32,
 }
 
+/// Portable luma plane copy: copies the first `w` bytes of each of the first
+/// `h` rows from `src` (row pitch `sstride`) to `dst` (row pitch `dstride`).
+///
+/// Panics if a visited row in either buffer is shorter than `w` bytes.
+fn copy_luma_default(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    for (dline, sline) in dst.chunks_mut(dstride)
+            .zip(src.chunks(sstride))
+            .take(h) {
+        dline[..w].copy_from_slice(&sline[..w]);
+    }
+}
+/// Copies a `w`x`h` luma block; this target has no SIMD path, so it simply
+/// forwards to `copy_luma_default`.
+#[cfg(not(target_arch="x86_64"))]
+fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    copy_luma_default(dst, dstride, src, sstride, w, h);
+}
 #[cfg(not(target_arch="x86_64"))]
 fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) {
     let mut uoff = frm.offset[1];
@@ -300,6 +317,69 @@ fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) {
 #[cfg(target_arch="x86_64")]
 use std::arch::asm;
 #[cfg(target_arch="x86_64")]
+/// Copies a `w`x`h` block of luma samples from `src` to `dst`.
+///
+/// `sstride` and `dstride` are the byte distances between the starts of
+/// consecutive rows in `src` and `dst`. Rows are moved 64 bytes at a time
+/// with aligned AVX loads and stores when AVX is available, both buffers are
+/// 32-byte aligned, both strides are multiples of 32, `w` is a non-zero
+/// multiple of 64 and the whole block lies inside both slices. In every other
+/// case the copy is delegated to `copy_luma_default`.
+fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    // The last byte touched in a plane is at (h - 1) * stride + w - 1, so the
+    // block fits when (h - 1) * stride + w <= len. Only called with h > 0.
+    let fits = |len: usize, stride: usize| {
+        stride >= w && (h - 1).checked_mul(stride)
+                              .and_then(|off| off.checked_add(w))
+                              .map_or(false, |end| end <= len)
+    };
+    // The assembly loops test their counters only after the first pass, so an
+    // empty block (w == 0 or h == 0) must never reach them.
+    let use_avx = w > 0 && h > 0 &&
+                  (w % 64) == 0 && ((dstride | sstride) % 32) == 0 &&
+                  dst.as_ptr().align_offset(32) == 0 && src.as_ptr().align_offset(32) == 0 &&
+                  fits(dst.len(), dstride) && fits(src.len(), sstride) &&
+                  is_x86_feature_detected!("avx");
+    if !use_avx {
+        copy_luma_default(dst, dstride, src, sstride, w, h);
+        return;
+    }
+    // SAFETY: AVX support was detected at run time; every 64-byte step stays
+    // inside both slices because the block was bounds-checked above; all
+    // accesses are 32-byte aligned because the base pointers are aligned and
+    // the strides and `w` are multiples of 32; `stride - w` cannot underflow
+    // because `fits` requires `stride >= w`.
+    unsafe {
+        asm!(
+            "2:",
+            "  mov {x}, {w}",
+            "  3:",
+            "    vmovdqa ymm0, [{src}]",
+            "    vmovdqa ymm1, [{src}+32]",
+            "    vmovdqa [{dst}], ymm0",
+            "    vmovdqa [{dst}+32], ymm1",
+            "    add {src}, 64",
+            "    add {dst}, 64",
+            "    sub {x}, 64",
+            "    jnz 3b",
+            "  add {src}, {sstep}",
+            "  add {dst}, {dstep}",
+            "  dec {h}",
+            "  jnz 2b",
+            dst = inout(reg) dst.as_mut_ptr() => _,
+            src = inout(reg) src.as_ptr() => _,
+            sstep = in(reg) sstride - w,
+            dstep = in(reg) dstride - w,
+            w = in(reg) w,
+            // `dec {h}` modifies the register, so it must not be a plain input
+            h = inout(reg) h => _,
+            x = out(reg) _,
+            out("ymm0") _,
+            out("ymm1") _,
+        );
+    }
+}
+#[cfg(target_arch="x86_64")]
 fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) {
     unsafe {
         let width = frm.width[1];
@@ -384,11 +464,7 @@ fn fill_frame(ifmt: VAImageFormat, pic: &Picture, frm: &mut NABuffe
             validate!(iimg.width == (frm.width[0] as u16));
             validate!(iimg.height == (frm.height[0] as u16));
 
-            for (dline, sline) in frm.data[frm.offset[0]..].chunks_mut(frm.stride[0])
-                    .zip(imgdata[iimg.offsets[0] as usize..].chunks(iimg.pitches[0] as usize))
-                    .take(frm.height[0]) {
-                dline[..frm.width[0]].copy_from_slice(&sline[..frm.width[0]]);
-            }
+            copy_luma(&mut frm.data[frm.offset[0]..], frm.stride[0], &imgdata[iimg.offsets[0] as usize..], iimg.pitches[0] as usize, frm.width[0], frm.height[0]);
 
             deint_chroma(frm, &imgdata[iimg.offsets[1] as usize..], iimg.pitches[1] as usize);
         },
@@ -484,7 +560,10 @@ impl VaapiH264Decoder {
         match profile {
             100 | 110 | 122 | 144 => {
                 let b = br.read_byte()?;
-                validate!((b & 0xFC) == 0xFC);
+                // some encoders put something different here
+                if (b & 0xFC) != 0xFC {
+                    return Ok(());
+                }
                 // b & 3 -> chroma format
                 let b = br.read_byte()?;
                 validate!((b & 0xF8) == 0xF8);