hwdec-vaapi: handle copying data into padded chroma planes

[nihav-player.git] / hwdec-vaapi / src / lib.rs
diff --git a/hwdec-vaapi/src/lib.rs b/hwdec-vaapi/src/lib.rs

index 7b580d5917a006c044c811f05b725692175fbc58..9dca7b17b69407b2b01d3d3fa2331647568b3dce 100644 (file)
--- a/hwdec-vaapi/src/lib.rs
+++ b/hwdec-vaapi/src/lib.rs
@@ -273,6 +273,7 @@ struct VaapiInternals {
  pub struct VaapiH264Decoder {
      info:           NACodecInfoRef,
      vaapi:          Option<VaapiInternals>,
+    needs_derive:   bool,
      spses:          Vec<SeqParameterSet>,
      ppses:          Vec<PicParameterSet>,
      frame_refs:     FrameRefs,
@@ -283,6 +284,17 @@ pub struct VaapiH264Decoder {
      tb_den:         u32,
  }
  
+fn copy_luma_default(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    for (dline, sline) in dst.chunks_mut(dstride)
+            .zip(src.chunks(sstride))
+            .take(h) {
+        dline[..w].copy_from_slice(&sline[..w]);
+    }
+}
+#[cfg(not(target_arch="x86_64"))]
+fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    copy_luma_default(dst, dstride, src, sstride, w, h);
+}
  #[cfg(not(target_arch="x86_64"))]
  fn deint_chroma(frm: NASimpleVideoFrame<u8>, src: &[u8], sstride: usize) {
      let mut uoff = frm.offset[1];
@@ -300,10 +312,55 @@ fn deint_chroma(frm: NASimpleVideoFrame<u8>, src: &[u8], sstride: usize) {
  #[cfg(target_arch="x86_64")]
  use std::arch::asm;
  #[cfg(target_arch="x86_64")]
+fn copy_luma(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) {
+    if !is_x86_feature_detected!("avx") {
+        copy_luma_default(dst, dstride, src, sstride, w, h);
+        return;
+    }
+    if dst.as_ptr().align_offset(32) == 0 && src.as_ptr().align_offset(32) == 0 &&
+            (w % 64) == 0 && ((dstride | sstride) % 32) == 0 {
+        unsafe {
+            asm!(
+                "2:",
+                "  mov {x}, {w}",
+                "  3:",
+                "    vmovdqa ymm0, [{src}]",
+                "    vmovdqa ymm1, [{src}+32]",
+                "    vmovdqa [{dst}], ymm0",
+                "    vmovdqa [{dst}+32], ymm1",
+                "    add {src}, 64",
+                "    add {dst}, 64",
+                "    sub {x},   64",
+                "    jnz 3b",
+                "  add {src}, {sstep}",
+                "  add {dst}, {dstep}",
+                "  dec {h}",
+                "  jnz 2b",
+                dst = inout(reg) dst.as_mut_ptr() => _,
+                src = inout(reg) src.as_ptr() => _,
+                sstep = in(reg) sstride - w,
+                dstep = in(reg) dstride - w,
+                w = in(reg) w,
+                h = in(reg) h,
+                x = out(reg) _,
+                out("ymm0") _,
+                out("ymm1") _,
+            );
+        }
+    } else {
+        let copy_len = dstride.min(w);
+        for (dline, sline) in dst.chunks_mut(dstride)
+                .zip(src.chunks(sstride))
+                .take(h) {
+            dline[..copy_len].copy_from_slice(&sline[..copy_len]);
+        }
+    }
+}
+#[cfg(target_arch="x86_64")]
  fn deint_chroma(frm: NASimpleVideoFrame<u8>, src: &[u8], sstride: usize) {
      unsafe {
-        let width = frm.width[1];
-        let height = frm.height[1];
+        let width = (frm.width[1] + 7) & !7;
+        let height = (frm.height[1] + 7) & !7;
          let dst = frm.data.as_mut_ptr();
          let udst = dst.add(frm.offset[1]);
          let vdst = dst.add(frm.offset[2]);
@@ -368,12 +425,12 @@ fn deint_chroma(frm: NASimpleVideoFrame<u8>, src: &[u8], sstride: usize) {
      }
  }
  
-fn fill_frame(ifmt: VAImageFormat, pic: &Picture<PictureSync>, frm: &mut NABufferType) -> DecoderResult<()> {
+fn fill_frame(ifmt: VAImageFormat, pic: &Picture<PictureSync>, frm: &mut NABufferType, needs_derive: bool) -> DecoderResult<()> {
      let mut vbuf = frm.get_vbuf().unwrap();
      let (w, h) = pic.surface_size();
      //let cur_ts = pic.timestamp();
  
-    let img = Image::new(pic, ifmt, w, h, true).expect("get image");
+    let img = Image::new(pic, ifmt, w, h, !needs_derive).expect("get image");
  
      let iimg = img.image();
      let imgdata: &[u8] = img.as_ref();
@@ -381,17 +438,22 @@ fn fill_frame(ifmt: VAImageFormat, pic: &Picture<PictureSync>, frm: &mut NABuffe
      match iimg.format.fourcc().map_err(|_| DecoderError::InvalidData)? {
          VAFourcc::NV12 => {
              let frm = NASimpleVideoFrame::from_video_buf(&mut vbuf).unwrap();
-            validate!(iimg.width == (frm.width[0] as u16));
-            validate!(iimg.height == (frm.height[0] as u16));
+            validate!(iimg.width  == (((frm.width[0]  + 15) & !15) as u16));
+            validate!(iimg.height == (((frm.height[0] + 15) & !15) as u16));
  
-            for (dline, sline) in frm.data[frm.offset[0]..].chunks_mut(frm.stride[0])
-                    .zip(imgdata[iimg.offsets[0] as usize..].chunks(iimg.pitches[0] as usize))
-                    .take(frm.height[0]) {
-                dline[..frm.width[0]].copy_from_slice(&sline[..frm.width[0]]);
-            }
+            copy_luma(&mut frm.data[frm.offset[0]..], frm.stride[0], &imgdata[iimg.offsets[0] as usize..], iimg.pitches[0] as usize, (frm.width[0] + 15) & !15, (frm.height[0] + 15) & !15);
  
              deint_chroma(frm, &imgdata[iimg.offsets[1] as usize..], iimg.pitches[1] as usize);
          },
+        VAFourcc::YV12 => {
+            let frm = NASimpleVideoFrame::from_video_buf(&mut vbuf).unwrap();
+            validate!(iimg.width  == (((frm.width[0]  + 15) & !15) as u16));
+            validate!(iimg.height == (((frm.height[0] + 15) & !15) as u16));
+
+            copy_luma(&mut frm.data[frm.offset[0]..], frm.stride[0], &imgdata[iimg.offsets[0] as usize..], iimg.pitches[0] as usize, (frm.width[0] + 15) & !15, (frm.height[0] + 15) & !15);
+            copy_luma(&mut frm.data[frm.offset[2]..], frm.stride[2], &imgdata[iimg.offsets[1] as usize..], iimg.pitches[1] as usize, (frm.width[1] + 15) & !15, (frm.height[1] + 15) & !15);
+            copy_luma(&mut frm.data[frm.offset[1]..], frm.stride[1], &imgdata[iimg.offsets[2] as usize..], iimg.pitches[2] as usize, (frm.width[2] + 15) & !15, (frm.height[2] + 15) & !15);
+        },
          _ => unimplemented!(),
      };
      Ok(())
@@ -402,6 +464,7 @@ impl Default for VaapiH264Decoder {
          Self {
              info:           NACodecInfoRef::default(),
              vaapi:          None,
+            needs_derive:   false,
              spses:          Vec::with_capacity(1),
              ppses:          Vec::with_capacity(4),
              frame_refs:     FrameRefs::new(),
@@ -433,10 +496,10 @@ impl VaapiH264Decoder {
                  let _compatibility      = br.read_byte()?;
                  let _level              = br.read_byte()?;
                  let b                   = br.read_byte()?;
-                validate!((b & 0xFC) == 0xFC);
+                //validate!((b & 0xFC) == 0xFC);
                  self.nal_len            = (b & 3) + 1;
                  let b                   = br.read_byte()?;
-                validate!((b & 0xE0) == 0xE0);
+                //validate!((b & 0xE0) == 0xE0);
                  let num_sps = (b & 0x1F) as usize;
                  for _ in 0..num_sps {
                      let len             = br.read_u16be()? as usize;
@@ -484,7 +547,10 @@ impl VaapiH264Decoder {
                      match profile {
                          100 | 110 | 122 | 144 => {
                              let b       = br.read_byte()?;
-                            validate!((b & 0xFC) == 0xFC);
+                            // some encoders put something different here
+                            if (b & 0xFC) != 0xFC {
+                                return Ok(());
+                            }
                              // b & 3 -> chroma format
                              let b       = br.read_byte()?;
                              validate!((b & 0xF8) == 0xF8);
@@ -537,6 +603,10 @@ println!("no decoding support for this profile");
                  return Err(DecoderError::Bug);
              }
  
+            let needs_derive= if let Ok(vendor) = display.query_vendor_string() {
+                    vendor.contains("Kaby Lake")
+                } else { false };
+
              let config = display.create_config(vec![
                      VAConfigAttrib { type_: VAConfigAttribType::VAConfigAttribRTFormat, value: RTFormat::YUV420.into() },
                  ], va_profile, VAEntrypoint::VAEntrypointVLD).map_err(|_| {
@@ -559,8 +629,9 @@ println!("config creation failed!");
              }
  
              self.vaapi = Some(VaapiInternals { display, context, ref_pics, surfaces, ifmt });
+            self.needs_derive = needs_derive;
  
-            let vinfo = NAVideoInfo::new(width, height, false, YUV420_FORMAT);
+            let vinfo = NAVideoInfo::new(vinfo.get_width(), vinfo.get_height(), false, YUV420_FORMAT);
              self.info = NACodecInfo::new_ref(info.get_name(), NACodecTypeInfo::Video(vinfo), info.get_extradata()).into_ref();
              self.out_frm = alloc_video_buffer(vinfo, 4)?;
  
@@ -986,7 +1057,7 @@ panic!("ran out of free surfaces");
                  let is_ref = frm.is_ref;
                  let ftype = frm.ftype;
                  if let Ok(pic) = frm.pic.sync() {
-                    let _ = fill_frame(vactx.ifmt, &pic, &mut self.out_frm);
+                    let _ = fill_frame(vactx.ifmt, &pic, &mut self.out_frm, self.needs_derive);
  
                      if !is_ref {
                          if let Ok(surf) = pic.take_surface() {
@@ -1019,7 +1090,7 @@ panic!("ran out of free surfaces");
                  let is_ref = frm.is_ref;
                  let ftype = frm.ftype;
                  if let Ok(pic) = frm.pic.sync() {
-                    let _ = fill_frame(vactx.ifmt, &pic, &mut self.out_frm);
+                    let _ = fill_frame(vactx.ifmt, &pic, &mut self.out_frm, self.needs_derive);
  
                      if !is_ref {
                          if let Ok(surf) = pic.take_surface() {