X-Git-Url: https://git.nihav.org/?a=blobdiff_plain;f=hwdec-vaapi%2Fsrc%2Flib.rs;h=cdc8b080a52f3df32df7fc5a920c80cda04f42a0;hb=c84ea16daef963fe73d4b2e69b20b68524563ae0;hp=ca843c26f0b278287e279a9782e62c316ee7c9eb;hpb=e5ccd68db9e8cf512c1506e8769ca2e0a07d0b0e;p=nihav-player.git diff --git a/hwdec-vaapi/src/lib.rs b/hwdec-vaapi/src/lib.rs index ca843c2..cdc8b08 100644 --- a/hwdec-vaapi/src/lib.rs +++ b/hwdec-vaapi/src/lib.rs @@ -283,6 +283,146 @@ pub struct VaapiH264Decoder { tb_den: u32, } +fn copy_luma_default(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + for (dline, sline) in dst.chunks_mut(dstride) + .zip(src.chunks(sstride)) + .take(h) { + dline[..w].copy_from_slice(&sline[..w]); + } +} +#[cfg(not(target_arch="x86_64"))] +fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + copy_luma_default(dst, dstride, src, sstride, w, h); +} +#[cfg(not(target_arch="x86_64"))] +fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) { + let mut uoff = frm.offset[1]; + let mut voff = frm.offset[2]; + for cline in src.chunks(sstride).take(frm.height[1]) { + for (x, pair) in cline.chunks_exact(2).take(frm.width[1]).enumerate() { + frm.data[uoff + x] = pair[0]; + frm.data[voff + x] = pair[1]; + } + uoff += frm.stride[1]; + voff += frm.stride[2]; + } +} + +#[cfg(target_arch="x86_64")] +use std::arch::asm; +#[cfg(target_arch="x86_64")] +fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { + if !is_x86_feature_detected!("avx") { + copy_luma_default(dst, dstride, src, sstride, w, h); + return; + } + if dst.as_ptr().align_offset(32) == 0 && src.as_ptr().align_offset(32) == 0 && + (w % 64) == 0 && ((dstride | sstride) % 32) == 0 { + unsafe { + asm!( + "2:", + " mov {x}, {w}", + " 3:", + " vmovdqa ymm0, [{src}]", + " vmovdqa ymm1, [{src}+32]", + " vmovdqa [{dst}], ymm0", + " vmovdqa [{dst}+32], ymm1", + " add {src}, 64", + " add {dst}, 64", + " sub {x}, 64", + " jnz 3b", + " add {src}, {sstep}", + " add {dst}, {dstep}", + " dec {h}", + " jnz 2b", + dst = inout(reg) dst.as_mut_ptr() => _, + src = inout(reg) src.as_ptr() => _, + sstep = in(reg) sstride - w, + dstep = in(reg) dstride - w, + w = in(reg) w, + h = in(reg) h, + x = out(reg) _, + out("ymm0") _, + out("ymm1") _, + ); + } + } else { + for (dline, sline) in dst.chunks_mut(dstride) + .zip(src.chunks(sstride)) + .take(h) { + dline[..w].copy_from_slice(&sline[..w]); + } + } +} +#[cfg(target_arch="x86_64")] +fn deint_chroma(frm: NASimpleVideoFrame, src: &[u8], sstride: usize) { + unsafe { + let width = frm.width[1]; + let height = frm.height[1]; + let dst = frm.data.as_mut_ptr(); + let udst = dst.add(frm.offset[1]); + let vdst = dst.add(frm.offset[2]); + let dstep = frm.stride[1] - width; + let sstep = sstride - width * 2; + asm!( + "2:", + " mov {tmp}, {width}", + " test {width}, 8", + " jz 3f", + " movaps xmm0, [{src}]", + " movaps xmm1, xmm0", + " psllw xmm0, 8", + " psrlw xmm1, 8", + " psrlw xmm0, 8", + " packuswb xmm1, xmm1", + " packuswb xmm0, xmm0", + " movq [{vdst}], xmm1", + " movq [{udst}], xmm0", + " add {src}, 16", + " add {vdst}, 8", + " add {udst}, 8", + " sub {tmp}, 8", + " 3:", + " movaps xmm0, [{src}]", + " movaps xmm1, [{src} + 16]", + " movaps xmm2, xmm0", + " movaps xmm3, xmm1", + " psllw xmm0, 8", + " psllw xmm1, 8", + " psrlw xmm2, 8", + " psrlw xmm3, 8", + " psrlw xmm0, 8", + " psrlw xmm1, 8", + " packuswb xmm2, xmm3", + " packuswb xmm0, xmm1", + " movups [{vdst}], xmm2", + " movups [{udst}], xmm0", + " add {src}, 32", + " add {vdst}, 16", + " add {udst}, 16", + " sub {tmp}, 16", + " jnz 3b", + " add {udst}, {dstep}", + " add {vdst}, {dstep}", + " add {src}, {sstep}", + " dec {height}", + " jnz 2b", + src = inout(reg) src.as_ptr() => _, + udst = inout(reg) udst => _, + vdst = inout(reg) vdst => _, + width = in(reg) width, + height = inout(reg) height => _, + dstep = in(reg) dstep, + sstep = in(reg) sstep, + tmp = out(reg) _, + out("xmm0") _, + out("xmm1") _, + out("xmm2") _, + out("xmm3") _, + ); + } +} + fn fill_frame(ifmt: VAImageFormat, pic: &Picture, frm: &mut NABufferType) -> DecoderResult<()> { let mut vbuf = frm.get_vbuf().unwrap(); let (w, h) = pic.surface_size(); @@ -299,22 +439,9 @@ fn fill_frame(ifmt: VAImageFormat, pic: &Picture, frm: &mut NABuffe validate!(iimg.width == (frm.width[0] as u16)); validate!(iimg.height == (frm.height[0] as u16)); - for (dline, sline) in frm.data[frm.offset[0]..].chunks_mut(frm.stride[0]) - .zip(imgdata[iimg.offsets[0] as usize..].chunks(iimg.pitches[0] as usize)) - .take(frm.height[0]) { - dline[..frm.width[0]].copy_from_slice(&sline[..frm.width[0]]); - } + copy_luma(&mut frm.data[frm.offset[0]..], frm.stride[0], &imgdata[iimg.offsets[0] as usize..], iimg.pitches[0] as usize, frm.width[0], frm.height[0]); - let mut uoff = frm.offset[1]; - let mut voff = frm.offset[2]; - for cline in imgdata[iimg.offsets[1] as usize..].chunks(iimg.pitches[1] as usize).take(frm.height[1]) { - for (x, pair) in cline.chunks_exact(2).take(frm.width[1]).enumerate() { - frm.data[uoff + x] = pair[0]; - frm.data[voff + x] = pair[1]; - } - uoff += frm.stride[1]; - voff += frm.stride[2]; - } + deint_chroma(frm, &imgdata[iimg.offsets[1] as usize..], iimg.pitches[1] as usize); }, _ => unimplemented!(), }; @@ -408,7 +535,10 @@ impl VaapiH264Decoder { match profile { 100 | 110 | 122 | 144 => { let b = br.read_byte()?; - validate!((b & 0xFC) == 0xFC); + // some encoders put something different here + if (b & 0xFC) != 0xFC { + return Ok(()); + } // b & 3 -> chroma format let b = br.read_byte()?; validate!((b & 0xF8) == 0xF8);