From 76431444843f6800c20ce046ad2e30a976402c38 Mon Sep 17 00:00:00 2001 From: Kostya Shishkov Date: Sat, 27 Aug 2022 10:46:18 +0200 Subject: [PATCH] h264: split put_block_weighted() by size --- nihav-itu/src/codecs/h264/dsp/mc/debug.rs | 55 ++++++++-- nihav-itu/src/codecs/h264/dsp/mc/mod.rs | 108 ++++++++++++++------ nihav-itu/src/codecs/h264/dsp/mc/release.rs | 55 ++++++++-- nihav-itu/src/codecs/h264/mb_recon.rs | 30 ++++-- 4 files changed, 195 insertions(+), 53 deletions(-) diff --git a/nihav-itu/src/codecs/h264/dsp/mc/debug.rs b/nihav-itu/src/codecs/h264/dsp/mc/debug.rs index 640e597..9f773ac 100644 --- a/nihav-itu/src/codecs/h264/dsp/mc/debug.rs +++ b/nihav-itu/src/codecs/h264/dsp/mc/debug.rs @@ -1,5 +1,3 @@ -use nihav_codec_support::codecs::blockdsp::*; - use super::clip_u8; const TMP_BUF_STRIDE: usize = 32; @@ -204,9 +202,52 @@ pub fn chroma_interp(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, } } -pub const H264_LUMA_INTERP: &[BlkInterpFunc] = &[ - h264_mc00, h264_mc01, h264_mc02, h264_mc03, - h264_mc10, h264_mc11, h264_mc12, h264_mc13, - h264_mc20, h264_mc21, h264_mc22, h264_mc23, - h264_mc30, h264_mc31, h264_mc32, h264_mc33 +macro_rules! luma_mc { + ($orig:ident, $func4:ident, $func8:ident, $func16:ident) => { + fn $func4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 4, h); + } + fn $func8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 8, h); + } + fn $func16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 16, h); + } + } +} + +luma_mc!(h264_mc00, h264_mc00_4, h264_mc00_8, h264_mc00_16); +luma_mc!(h264_mc01, h264_mc01_4, h264_mc01_8, h264_mc01_16); +luma_mc!(h264_mc02, h264_mc02_4, h264_mc02_8, h264_mc02_16); +luma_mc!(h264_mc03, h264_mc03_4, h264_mc03_8, h264_mc03_16); +luma_mc!(h264_mc10, h264_mc10_4, h264_mc10_8, h264_mc10_16); +luma_mc!(h264_mc11, h264_mc11_4, h264_mc11_8, h264_mc11_16); +luma_mc!(h264_mc12, h264_mc12_4, h264_mc12_8, h264_mc12_16); +luma_mc!(h264_mc13, h264_mc13_4, h264_mc13_8, h264_mc13_16); +luma_mc!(h264_mc20, h264_mc20_4, h264_mc20_8, h264_mc20_16); +luma_mc!(h264_mc21, h264_mc21_4, h264_mc21_8, h264_mc21_16); +luma_mc!(h264_mc22, h264_mc22_4, h264_mc22_8, h264_mc22_16); +luma_mc!(h264_mc23, h264_mc23_4, h264_mc23_8, h264_mc23_16); +luma_mc!(h264_mc30, h264_mc30_4, h264_mc30_8, h264_mc30_16); +luma_mc!(h264_mc31, h264_mc31_4, h264_mc31_8, h264_mc31_16); +luma_mc!(h264_mc32, h264_mc32_4, h264_mc32_8, h264_mc32_16); +luma_mc!(h264_mc33, h264_mc33_4, h264_mc33_8, h264_mc33_16); + +pub const H264_LUMA_INTERP: &[[super::MCFunc; 16]; 3] = &[ + [ + h264_mc00_4, h264_mc01_4, h264_mc02_4, h264_mc03_4, + h264_mc10_4, h264_mc11_4, h264_mc12_4, h264_mc13_4, + h264_mc20_4, h264_mc21_4, h264_mc22_4, h264_mc23_4, + h264_mc30_4, h264_mc31_4, h264_mc32_4, h264_mc33_4 + ], [ + h264_mc00_8, h264_mc01_8, h264_mc02_8, h264_mc03_8, + h264_mc10_8, h264_mc11_8, h264_mc12_8, h264_mc13_8, + h264_mc20_8, h264_mc21_8, h264_mc22_8, h264_mc23_8, + h264_mc30_8, h264_mc31_8, h264_mc32_8, h264_mc33_8 + ], [ + h264_mc00_16, h264_mc01_16, h264_mc02_16, h264_mc03_16, + h264_mc10_16, h264_mc11_16, h264_mc12_16, h264_mc13_16, + h264_mc20_16, h264_mc21_16, h264_mc22_16, h264_mc23_16, + h264_mc30_16, h264_mc31_16, h264_mc32_16, h264_mc33_16 + ] ]; diff --git a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs index 1f5ff20..3ed248c 100644 --- a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs +++ b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs @@ -11,16 +11,22 @@ mod debug; #[cfg(debug_assertions)] use debug::*; +type MCFunc = fn (dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize); + fn clip_u8(val: i16) -> u8 { val.max(0).min(255) as u8 } pub struct H264MC { avg_buf: NAVideoBufferRef, + pub put_block_weighted: [fn (dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]); 4], + pub put_block_weighted2: [fn (dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]); 4], } impl H264MC { pub fn new(avg_buf: NAVideoBufferRef) -> Self { Self { - avg_buf + avg_buf, + put_block_weighted: [put_blk_w_2, put_blk_w_4, put_blk_w_8, put_blk_w_16], + put_block_weighted2: [put_blk_w2_2, put_blk_w2_4, put_blk_w2_8, put_blk_w2_16], } } pub fn do_mc(&mut self, frm: &mut NASimpleVideoFrame, refpic: NAVideoBufferRef, xpos: usize, ypos: usize, w: usize, h: usize, mv: MV) { @@ -42,7 +48,12 @@ impl H264MC { } else { (&src[refpic.get_offset(0) + ((src_x - pre) as usize) + ((src_y - pre) as usize) * systride..], systride) }; - (H264_LUMA_INTERP[mode])(&mut frm.data[frm.offset[0] + xpos + ypos * frm.stride[0]..], frm.stride[0], ysrc, ystride, w, h); + let wmode = match w { + 4 => 0, + 8 => 1, + _ => 2, + }; + (H264_LUMA_INTERP[wmode][mode])(&mut frm.data[frm.offset[0] + xpos + ypos * frm.stride[0]..], frm.stride[0], ysrc, ystride, h); let (cw, ch) = refpic.get_dimensions(1); let mvx = mv.x >> 3; @@ -84,19 +95,24 @@ impl H264MC { const EBUF_STRIDE: usize = 32; let mut ebuf = [0u8; EBUF_STRIDE * (16 + 2 + 3)]; + let wmode = match w { + 4 => 0, + 8 => 1, + _ => 2, + }; if (sx - pre < 0) || (sx + (w as isize) + post > (width as isize)) || (sy - pre < 0) || (sy + (h as isize) + post > (height as isize)) { let edge = (pre + post) as usize; edge_emu(&refpic, sx - pre, sy - pre, w + edge, h + edge, &mut ebuf, EBUF_STRIDE, 0, 0); - (H264_LUMA_INTERP[mode])(ydst, 16, &ebuf, EBUF_STRIDE, w, h); + (H264_LUMA_INTERP[wmode][mode])(ydst, 16, &ebuf, EBUF_STRIDE, h); } else { let sstride = refpic.get_stride(0); let soff = refpic.get_offset(0); let sdta = refpic.get_data(); let sbuf: &[u8] = sdta.as_slice(); let saddr = soff + ((sx - pre) as usize) + ((sy - pre) as usize) * sstride; - (H264_LUMA_INTERP[mode])(ydst, 16, &sbuf[saddr..], sstride, w, h); + (H264_LUMA_INTERP[wmode][mode])(ydst, 16, &sbuf[saddr..], sstride, h); } let (cw, ch) = refpic.get_dimensions(1); @@ -148,35 +164,6 @@ impl H264MC { } } - pub fn put_block_weighted(&mut self, dst: &mut [u8], stride: usize, src: &[u8], w: usize, h: usize, wparams: [i8; 3]) { - let weight = i16::from(wparams[0]); - let offset = i16::from(wparams[1]); - let wshift = wparams[2] as u8; - let bias = (1 << wshift) >> 1; - - for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) { - for (dst, &src) in drow[..w].iter_mut().zip(srow.iter()) { - *dst = clip_u8(((i16::from(src) * weight + bias) >> wshift) + offset); - } - } - } - - pub fn put_block_weighted2(&mut self, dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], w: usize, h: usize, wparams: [i8; 5]) { - let weight0 = i16::from(wparams[0]); - let offset0 = i16::from(wparams[1]); - let weight1 = i16::from(wparams[2]); - let offset1 = i16::from(wparams[3]); - let wshift = (wparams[4] as u8) + 1; - let offset = (offset0 + offset1 + 1) >> 1; - let bias = (1 << wshift) >> 1; - - for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks(16).zip(src1.chunks(16))).take(h) { - for (dst, (&src0, &src1)) in drow[..w].iter_mut().zip(srow0.iter().zip(srow1.iter())) { - *dst = clip_u8(((i16::from(src0) * weight0 + i16::from(src1) * weight1 + bias) >> wshift) + offset); - } - } - } - pub fn gray_block(&mut self, frm: &mut NASimpleVideoFrame, x: usize, y: usize, w: usize, h: usize) { let yoff = frm.offset[0] + x + y * frm.stride[0]; let coff = [frm.offset[1] + x / 2 + y / 2 * frm.stride[1], @@ -195,3 +182,58 @@ impl H264MC { } } } + +fn put_block_weighted(dst: &mut [u8], stride: usize, src: &[u8], w: usize, h: usize, wparams: [i8; 3]) { + let weight = i16::from(wparams[0]); + let offset = i16::from(wparams[1]); + let wshift = wparams[2] as u8; + let bias = (1 << wshift) >> 1; + + for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) { + for (dst, &src) in drow[..w].iter_mut().zip(srow.iter()) { + *dst = clip_u8(((i16::from(src) * weight + bias) >> wshift) + offset); + } + } +} + +fn put_blk_w_2(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) { + put_block_weighted(dst, stride, src, 2, h, wparams); +} +fn put_blk_w_4(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) { + put_block_weighted(dst, stride, src, 4, h, wparams); +} +fn put_blk_w_8(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) { + put_block_weighted(dst, stride, src, 8, h, wparams); +} +fn put_blk_w_16(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) { + put_block_weighted(dst, stride, src, 16, h, wparams); +} + +fn put_block_weighted2(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], w: usize, h: usize, wparams: [i8; 5]) { + let weight0 = i16::from(wparams[0]); + let offset0 = i16::from(wparams[1]); + let weight1 = i16::from(wparams[2]); + let offset1 = i16::from(wparams[3]); + let wshift = (wparams[4] as u8) + 1; + let offset = (offset0 + offset1 + 1) >> 1; + let bias = (1 << wshift) >> 1; + + for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks(16).zip(src1.chunks(16))).take(h) { + for (dst, (&src0, &src1)) in drow[..w].iter_mut().zip(srow0.iter().zip(srow1.iter())) { + *dst = clip_u8(((i16::from(src0) * weight0 + i16::from(src1) * weight1 + bias) >> wshift) + offset); + } + } +} + +fn put_blk_w2_2(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + put_block_weighted2(dst, stride, src0, src1, 2, h, wparams); +} +fn put_blk_w2_4(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + put_block_weighted2(dst, stride, src0, src1, 4, h, wparams); +} +fn put_blk_w2_8(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + put_block_weighted2(dst, stride, src0, src1, 8, h, wparams); +} +fn put_blk_w2_16(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) { + put_block_weighted2(dst, stride, src0, src1, 16, h, wparams); +} diff --git a/nihav-itu/src/codecs/h264/dsp/mc/release.rs b/nihav-itu/src/codecs/h264/dsp/mc/release.rs index 87b1bc5..3a43500 100644 --- a/nihav-itu/src/codecs/h264/dsp/mc/release.rs +++ b/nihav-itu/src/codecs/h264/dsp/mc/release.rs @@ -1,5 +1,3 @@ -use nihav_codec_support::codecs::blockdsp::*; - use super::clip_u8; const TMP_BUF_STRIDE: usize = 32; @@ -276,9 +274,52 @@ pub fn chroma_interp(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, } } -pub const H264_LUMA_INTERP: &[BlkInterpFunc] = &[ - h264_mc00, h264_mc01, h264_mc02, h264_mc03, - h264_mc10, h264_mc11, h264_mc12, h264_mc13, - h264_mc20, h264_mc21, h264_mc22, h264_mc23, - h264_mc30, h264_mc31, h264_mc32, h264_mc33 +macro_rules! luma_mc { + ($orig:ident, $func4:ident, $func8:ident, $func16:ident) => { + fn $func4(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 4, h); + } + fn $func8(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 8, h); + } + fn $func16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) { + $orig(dst, dstride, src, sstride, 16, h); + } + } +} + +luma_mc!(h264_mc00, h264_mc00_4, h264_mc00_8, h264_mc00_16); +luma_mc!(h264_mc01, h264_mc01_4, h264_mc01_8, h264_mc01_16); +luma_mc!(h264_mc02, h264_mc02_4, h264_mc02_8, h264_mc02_16); +luma_mc!(h264_mc03, h264_mc03_4, h264_mc03_8, h264_mc03_16); +luma_mc!(h264_mc10, h264_mc10_4, h264_mc10_8, h264_mc10_16); +luma_mc!(h264_mc11, h264_mc11_4, h264_mc11_8, h264_mc11_16); +luma_mc!(h264_mc12, h264_mc12_4, h264_mc12_8, h264_mc12_16); +luma_mc!(h264_mc13, h264_mc13_4, h264_mc13_8, h264_mc13_16); +luma_mc!(h264_mc20, h264_mc20_4, h264_mc20_8, h264_mc20_16); +luma_mc!(h264_mc21, h264_mc21_4, h264_mc21_8, h264_mc21_16); +luma_mc!(h264_mc22, h264_mc22_4, h264_mc22_8, h264_mc22_16); +luma_mc!(h264_mc23, h264_mc23_4, h264_mc23_8, h264_mc23_16); +luma_mc!(h264_mc30, h264_mc30_4, h264_mc30_8, h264_mc30_16); +luma_mc!(h264_mc31, h264_mc31_4, h264_mc31_8, h264_mc31_16); +luma_mc!(h264_mc32, h264_mc32_4, h264_mc32_8, h264_mc32_16); +luma_mc!(h264_mc33, h264_mc33_4, h264_mc33_8, h264_mc33_16); + +pub const H264_LUMA_INTERP: &[[super::MCFunc; 16]; 3] = &[ + [ + h264_mc00_4, h264_mc01_4, h264_mc02_4, h264_mc03_4, + h264_mc10_4, h264_mc11_4, h264_mc12_4, h264_mc13_4, + h264_mc20_4, h264_mc21_4, h264_mc22_4, h264_mc23_4, + h264_mc30_4, h264_mc31_4, h264_mc32_4, h264_mc33_4 + ], [ + h264_mc00_8, h264_mc01_8, h264_mc02_8, h264_mc03_8, + h264_mc10_8, h264_mc11_8, h264_mc12_8, h264_mc13_8, + h264_mc20_8, h264_mc21_8, h264_mc22_8, h264_mc23_8, + h264_mc30_8, h264_mc31_8, h264_mc32_8, h264_mc33_8 + ], [ + h264_mc00_16, h264_mc01_16, h264_mc02_16, h264_mc03_16, + h264_mc10_16, h264_mc11_16, h264_mc12_16, h264_mc13_16, + h264_mc20_16, h264_mc21_16, h264_mc22_16, h264_mc23_16, + h264_mc30_16, h264_mc31_16, h264_mc32_16, h264_mc33_16 + ] ]; diff --git a/nihav-itu/src/codecs/h264/mb_recon.rs b/nihav-itu/src/codecs/h264/mb_recon.rs index 97637a2..e78c134 100644 --- a/nihav-itu/src/codecs/h264/mb_recon.rs +++ b/nihav-itu/src/codecs/h264/mb_recon.rs @@ -219,7 +219,13 @@ fn do_p_mc(frm: &mut NASimpleVideoFrame, xpos: usize, ypos: usize, w: usize, } else { [1, 0, 0] }; - mc_dsp.put_block_weighted(&mut frm.data[yoff..], frm.stride[0], &ytmp, w, h, yw); + let wmode = match w { + 2 => 0, + 4 => 1, + 8 => 2, + _ => 3, + }; + (mc_dsp.put_block_weighted[wmode])(&mut frm.data[yoff..], frm.stride[0], &ytmp, h, yw); for chroma in 0..2 { let cstride = frm.stride[chroma + 1]; @@ -230,7 +236,7 @@ fn do_p_mc(frm: &mut NASimpleVideoFrame, xpos: usize, ypos: usize, w: usize, [1, 0, 0] }; let csrc = if chroma == 0 { &utmp } else { &vtmp }; - mc_dsp.put_block_weighted(&mut frm.data[coff..], cstride, csrc, w / 2, h / 2, cw); + (mc_dsp.put_block_weighted[wmode - 1])(&mut frm.data[coff..], cstride, csrc, h / 2, cw); } } } else { @@ -297,7 +303,13 @@ fn do_b_mc(frm: &mut NASimpleVideoFrame, mode: BMode, xpos: usize, ypos: usi } else { [1, 0, 0] }; - mc_dsp.put_block_weighted(&mut frm.data[yoff..], frm.stride[0], &ytmp0, w, h, yw); + let wmode = match w { + 2 => 0, + 4 => 1, + 8 => 2, + _ => 3, + }; + (mc_dsp.put_block_weighted[wmode])(&mut frm.data[yoff..], frm.stride[0], &ytmp0, h, yw); for chroma in 0..2 { let cstride = frm.stride[chroma + 1]; @@ -308,7 +320,7 @@ fn do_b_mc(frm: &mut NASimpleVideoFrame, mode: BMode, xpos: usize, ypos: usi [1, 0, 0] }; let csrc = if chroma == 0 { &utmp0 } else { &vtmp0 }; - mc_dsp.put_block_weighted(&mut frm.data[coff..], cstride, csrc, w / 2, h / 2, cw); + (mc_dsp.put_block_weighted[wmode - 1])(&mut frm.data[coff..], cstride, csrc, h / 2, cw); } }, (BMode::Bi, Some(buf0), Some(buf1)) => { // do both and avg @@ -322,7 +334,13 @@ fn do_b_mc(frm: &mut NASimpleVideoFrame, mode: BMode, xpos: usize, ypos: usi (false, true) => [1 << weight1.luma_shift, 0, weight1.luma_weight, weight1.luma_offset, weight1.luma_shift as i8], (false, false) => [1, 0, 1, 0, 0], }; - mc_dsp.put_block_weighted2(&mut frm.data[yoff..], frm.stride[0], &ytmp0, &ytmp1, w, h, yw); + let wmode = match w { + 2 => 0, + 4 => 1, + 8 => 2, + _ => 3, + }; + (mc_dsp.put_block_weighted2[wmode])(&mut frm.data[yoff..], frm.stride[0], &ytmp0, &ytmp1, h, yw); for chroma in 0..2 { let cstride = frm.stride[chroma + 1]; @@ -339,7 +357,7 @@ fn do_b_mc(frm: &mut NASimpleVideoFrame, mode: BMode, xpos: usize, ypos: usi }; let csrc0 = if chroma == 0 { &utmp0 } else { &vtmp0 }; let csrc1 = if chroma == 0 { &utmp1 } else { &vtmp1 }; - mc_dsp.put_block_weighted2(&mut frm.data[coff..], cstride, csrc0, csrc1, w / 2, h / 2, cw); + (mc_dsp.put_block_weighted2[wmode - 1])(&mut frm.data[coff..], cstride, csrc0, csrc1, h / 2, cw); } }, _ => { -- 2.39.5