From: Kostya Shishkov
Date: Sat, 12 Aug 2017 09:16:11 +0000 (+0200)
Subject: indeo4: faster recombine_plane
X-Git-Url: https://git.nihav.org/?a=commitdiff_plain;h=0694edb9f7373c853b23802230b16f86ada19dd7;p=nihav.git

indeo4: faster recombine_plane
---

diff --git a/src/codecs/indeo/indeo4.rs b/src/codecs/indeo/indeo4.rs
index b426254..e43fc45 100644
--- a/src/codecs/indeo/indeo4.rs
+++ b/src/codecs/indeo/indeo4.rs
@@ -5,6 +5,12 @@ use super::super::*;
 use super::ivi::*;
 use super::ivibr::*;
 
+#[inline(always)]
+fn mclip8(a: i32) -> u8 {
+    if (a as u16) > 255 { !(a >> 16) as u8 }
+    else { a as u8 }
+}
+
 struct Indeo4Parser {
     mb_cb: IVICodebook,
     blk_cb: IVICodebook,
@@ -331,7 +337,7 @@ impl IndeoXParser for Indeo4Parser {
     }
 
     fn recombine_plane(&mut self, src: &[i16], sstride: usize, dst: &mut [u8], dstride: usize, w: usize, h: usize) {
-        let mut idx0 = 0;
+/*      let mut idx0 = 0;
         let mut idx1 = w / 2;
         let mut idx2 = (h / 2) * sstride;
         let mut idx3 = idx2 + idx1;
@@ -344,10 +350,14 @@ impl IndeoXParser for Indeo4Parser {
                 let p1 = src[idx1 + x];
                 let p2 = src[idx2 + x];
                 let p3 = src[idx3 + x];
-                dst[oidx0 + x * 2 + 0] = clip8(((p0 + p1 + p2 + p3 + 2) >> 2) + 128);
-                dst[oidx0 + x * 2 + 1] = clip8(((p0 + p1 - p2 - p3 + 2) >> 2) + 128);
-                dst[oidx1 + x * 2 + 0] = clip8(((p0 - p1 + p2 - p3 + 2) >> 2) + 128);
-                dst[oidx1 + x * 2 + 1] = clip8(((p0 - p1 - p2 + p3 + 2) >> 2) + 128);
+                let s0 = p0 + p2;
+                let d0 = p0 - p2;
+                let s1 = p1 + p3;
+                let d1 = p1 - p3;
+                dst[oidx0 + x * 2 + 0] = clip8(((s0 + s1 + 2) >> 2) + 128);
+                dst[oidx0 + x * 2 + 1] = clip8(((d0 + d1 + 2) >> 2) + 128);
+                dst[oidx1 + x * 2 + 0] = clip8(((s0 - s1 + 2) >> 2) + 128);
+                dst[oidx1 + x * 2 + 1] = clip8(((d0 - d1 + 2) >> 2) + 128);
             }
             idx0 += sstride;
             idx1 += sstride;
@@ -355,6 +365,54 @@ impl IndeoXParser for Indeo4Parser {
             idx3 += sstride;
             oidx0 += dstride * 2;
             oidx1 += dstride * 2;
+        }*/
+        unsafe {
+            let hw = (w / 2) as isize;
+            let hh = (h / 2) as isize;
+            let mut band0 = src.as_ptr();
+            let mut band1 = band0.offset(hw);
+            let mut band2 = band0.offset(((h / 2) * sstride) as isize);
+            let mut band3 = band2.offset(hw);
+            let mut dst0 = dst.as_mut_ptr();
+            let mut dst1 = dst0.offset(dstride as isize);
+            for _ in 0..hh {
+                let mut b0_ptr = band0;
+                let mut b1_ptr = band1;
+                let mut b2_ptr = band2;
+                let mut b3_ptr = band3;
+                let mut d0_ptr = dst0;
+                let mut d1_ptr = dst1;
+                for _ in 0..hw {
+                    let p0 = *b0_ptr as i32;
+                    let p1 = *b1_ptr as i32;
+                    let p2 = *b2_ptr as i32;
+                    let p3 = *b3_ptr as i32;
+                    let s0 = p0.wrapping_add(p2);
+                    let s1 = p1.wrapping_add(p3);
+                    let d0 = p0.wrapping_sub(p2);
+                    let d1 = p1.wrapping_sub(p3);
+                    let o0 = s0.wrapping_add(s1).wrapping_add(2);
+                    let o1 = d0.wrapping_add(d1).wrapping_add(2);
+                    let o2 = s0.wrapping_sub(s1).wrapping_add(2);
+                    let o3 = d0.wrapping_sub(d1).wrapping_add(2);
+                    *d0_ptr.offset(0) = mclip8((o0 >> 2).wrapping_add(128));
+                    *d0_ptr.offset(1) = mclip8((o1 >> 2).wrapping_add(128));
+                    *d1_ptr.offset(0) = mclip8((o2 >> 2).wrapping_add(128));
+                    *d1_ptr.offset(1) = mclip8((o3 >> 2).wrapping_add(128));
+                    b0_ptr = b0_ptr.offset(1);
+                    b1_ptr = b1_ptr.offset(1);
+                    b2_ptr = b2_ptr.offset(1);
+                    b3_ptr = b3_ptr.offset(1);
+                    d0_ptr = d0_ptr.offset(2);
+                    d1_ptr = d1_ptr.offset(2);
+                }
+                band0 = band0.offset(sstride as isize);
+                band1 = band1.offset(sstride as isize);
+                band2 = band2.offset(sstride as isize);
+                band3 = band3.offset(sstride as isize);
+                dst0 = dst0.offset((dstride * 2) as isize);
+                dst1 = dst1.offset((dstride * 2) as isize);
+            }
         }
     }
 }