use nihav_core::frame::*;
use nihav_codec_support::codecs::blockdsp::edge_emu;

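/// Copies a 12x12 pixel block (8x8 plus a two-pixel margin on each side,
/// presumably to give the fractional-pel filters below their read margin)
/// from the reference frame into `dst`, falling back to edge emulation
/// when part of the area lies outside the plane.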
#[allow(clippy::too_many_arguments)]
pub fn get_block(dst: &mut [u8], dstride: usize, src: NAVideoBufferRef<u8>, comp: usize,
                 dx: usize, dy: usize, mv_x: i16, mv_y: i16)
{
    let (w, h) = src.get_dimensions(comp);
    let sx = (dx as isize) + (mv_x as isize);
    let sy = (dy as isize) + (mv_y as isize);

    if (sx - 2 < 0) || (sx + 8 + 2 > (w as isize)) ||
       (sy - 2 < 0) || (sy + 8 + 2 > (h as isize)) {
        // part of the 12x12 source area is outside the plane,
        // so replicate border pixels into the destination instead
        edge_emu(&src, sx - 2, sy - 2, 8 + 2 + 2, 8 + 2 + 2,
                 dst, dstride, comp, 0);
    } else {
        let sstride = src.get_stride(comp);
        let soff    = src.get_offset(comp);
        let sdta    = src.get_data();
        let sbuf: &[u8] = sdta.as_slice();
        let saddr = soff + ((sx - 2) as usize) + ((sy - 2) as usize) * sstride;
        let src = &sbuf[saddr..];
        for (dline, sline) in dst.chunks_mut(dstride).zip(src.chunks(sstride)).take(12) {
            dline[..12].copy_from_slice(&sline[..12]);
        }
    }
}

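/// Computes the variance of an 8x8 block subsampled to 4x4
/// (every second pixel of every second line, 16 samples in total).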
pub fn calc_variance(src: &[u8], stride: usize) -> u16 {
    let mut sum = 0;
    let mut ssum = 0;
    for line in src.chunks(stride * 2).take(4) {
        for el in line.iter().take(8).step_by(2) {
            let pix = u32::from(*el);
            sum += pix;
            ssum += pix * pix;
        }
    }
    // variance of the 16 samples: (16 * sum(x^2) - sum(x)^2) / 16^2
    ((ssum * 16 - sum * sum) >> 8) as u16
}

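// Pixel interpolation kernels used by the MC routines below:
// `bilinear` blends two samples with a 3-bit fractional weight
// ((a * (8 - f) + b * f + 4) >> 3), `bicubic` applies a four-tap
// filter with 7-bit coefficients and saturates the result to 0..255.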
macro_rules! mc_filter {
    (bilinear; $a: expr, $b: expr, $c: expr) => {
        ((u16::from($a) * (8 - $c) + u16::from($b) * $c + 4) >> 3) as u8
    };
    (bicubic; $src: expr, $off: expr, $step: expr, $coeffs: expr) => {
        ((i32::from($src[$off - $step]   ) * i32::from($coeffs[0]) +
          i32::from($src[$off]           ) * i32::from($coeffs[1]) +
          i32::from($src[$off + $step]   ) * i32::from($coeffs[2]) +
          i32::from($src[$off + $step * 2]) * i32::from($coeffs[3]) + 64) >> 7).min(255).max(0) as u8
    }
}

//#[allow(snake_case)]
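/// Bilinear 8x8 motion compensation with 3-bit fractional weights
/// `mx`/`my` (0..8). Scalar fallback; an SSE2 version for x86_64 follows.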
#[cfg(not(target_arch = "x86_64"))]
pub fn mc_bilinear(dst: &mut [u8], dstride: usize, src: &[u8], mut soff: usize, sstride: usize, mx: u16, my: u16) {
    if my == 0 { // horizontal interpolation only
        for dline in dst.chunks_mut(dstride).take(8) {
            for i in 0..8 {
                dline[i] = mc_filter!(bilinear; src[soff + i], src[soff + i + 1], mx);
            }
            soff += sstride;
        }
    } else if mx == 0 { // vertical interpolation only
        for dline in dst.chunks_mut(dstride).take(8) {
            for i in 0..8 {
                dline[i] = mc_filter!(bilinear; src[soff + i], src[soff + i + sstride], my);
            }
            soff += sstride;
        }
    } else { // filter horizontally, then blend consecutive filtered rows vertically
        let mut tmp = [0u8; 8];
        for i in 0..8 {
            tmp[i] = mc_filter!(bilinear; src[soff + i], src[soff + i + 1], mx);
        }
        soff += sstride;
        for dline in dst.chunks_mut(dstride).take(8) {
            for i in 0..8 {
                let cur = mc_filter!(bilinear; src[soff + i], src[soff + i + 1], mx);
                dline[i] = mc_filter!(bilinear; tmp[i], cur, my);
                tmp[i] = cur;
            }
            soff += sstride;
        }
    }
}

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

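// SSE2 version of mc_bilinear(): processes a whole 8-pixel row per iteration
// by widening the pixels to 16-bit lanes (SSE2 is always present on x86_64).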
#[cfg(target_arch = "x86_64")]
pub fn mc_bilinear(dst: &mut [u8], dstride: usize, src: &[u8], soff: usize, sstride: usize, mx: u16, my: u16) {
    if my == 0 { // horizontal interpolation only
        unsafe {
            let mut sptr = src[soff..].as_ptr();
            let mut dptr = dst.as_mut_ptr();
            let bias = _mm_set1_epi16(4);
            let a = _mm_set1_epi16((8 - mx) as i16);
            let b = _mm_set1_epi16(      mx as i16);
            let z = _mm_setzero_si128();
            for _ in 0..8 {
                // load pixels i and i + 1 of the row, widened to 16-bit lanes
                let s0 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr as *const __m128i), z);
                let s1 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr.add(1) as *const __m128i), z);
                let s0 = _mm_mullo_epi16(s0, a);
                let s1 = _mm_mullo_epi16(s1, b);
                sptr = sptr.add(sstride);
                // (s0 * (8 - mx) + s1 * mx + 4) >> 3, packed back to bytes
                let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, bias), s1), 3);
                let t = _mm_packus_epi16(t, t);
                _mm_storel_epi64(dptr as *mut __m128i, t);
                dptr = dptr.add(dstride);
            }
        }
    } else if mx == 0 { // vertical interpolation only
        unsafe {
            let mut sptr = src[soff..].as_ptr();
            let mut dptr = dst.as_mut_ptr();
            let bias = _mm_set1_epi16(4);
            let a = _mm_set1_epi16((8 - my) as i16);
            let b = _mm_set1_epi16(      my as i16);
            let z = _mm_setzero_si128();
            // keep the previous row pre-multiplied by (8 - my)
            let mut last = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr as *const __m128i), z);
            last = _mm_mullo_epi16(last, a);
            sptr = sptr.add(sstride);
            for _ in 0..8 {
                let s = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr as *const __m128i), z);
                sptr = sptr.add(sstride);
                let s1 = _mm_mullo_epi16(s, b);
                let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(last, bias), s1), 3);
                last = _mm_mullo_epi16(s, a);
                let t = _mm_packus_epi16(t, t);
                _mm_storel_epi64(dptr as *mut __m128i, t);
                dptr = dptr.add(dstride);
            }
        }
    } else { // filter horizontally, then blend consecutive filtered rows vertically
        unsafe {
            let mut sptr = src[soff..].as_ptr();
            let mut dptr = dst.as_mut_ptr();
            let bias = _mm_set1_epi16(4);
            let a = _mm_set1_epi16((8 - mx) as i16);
            let b = _mm_set1_epi16(      mx as i16);
            let c = _mm_set1_epi16((8 - my) as i16);
            let d = _mm_set1_epi16(      my as i16);
            let z = _mm_setzero_si128();

            // horizontally filter the first row and pre-multiply it by (8 - my)
            let s0 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr as *const __m128i), z);
            let s1 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr.add(1) as *const __m128i), z);
            let s0 = _mm_mullo_epi16(s0, a);
            let s1 = _mm_mullo_epi16(s1, b);
            let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, bias), s1), 3);
            let mut last = _mm_mullo_epi16(t, c);
            sptr = sptr.add(sstride);
            for _ in 0..8 {
                let s0 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr as *const __m128i), z);
                let s1 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr.add(1) as *const __m128i), z);
                let s0 = _mm_mullo_epi16(s0, a);
                let s1 = _mm_mullo_epi16(s1, b);
                sptr = sptr.add(sstride);
                let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, bias), s1), 3);
                let t1 = _mm_add_epi16(_mm_add_epi16(last, bias), _mm_mullo_epi16(t, d));
                last = _mm_mullo_epi16(t, c);
                let out = _mm_srai_epi16(t1, 3);
                _mm_storel_epi64(dptr as *mut __m128i, _mm_packus_epi16(out, out));
                dptr = dptr.add(dstride);
            }
        }
    }
}

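/// Bicubic 8x8 motion compensation with separable four-tap filters;
/// `coeffs[1] == 128` marks the identity coefficient set, so the
/// corresponding direction needs no filtering.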
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn mc_bicubic(dst: &mut [u8], dstride: usize, src: &[u8], mut soff: usize, sstride: usize, coeffs_w: &[i16; 4], coeffs_h: &[i16; 4]) {
    if coeffs_h[1] == 128 { // horizontal-only interpolation
        for dline in dst.chunks_mut(dstride).take(8) {
            for i in 0..8 {
                dline[i] = mc_filter!(bicubic; src, soff + i, 1, coeffs_w);
            }
            soff += sstride;
        }
    } else if coeffs_w[1] == 128 { // vertical-only interpolation
        for dline in dst.chunks_mut(dstride).take(8) {
            for i in 0..8 {
                dline[i] = mc_filter!(bicubic; src, soff + i, sstride, coeffs_h);
            }
            soff += sstride;
        }
    } else {
        // filter 11 rows horizontally into a temporary buffer (one row above
        // and two below the block, since the vertical four-tap filter reads
        // offsets -1..=2), then filter that buffer vertically
        let mut buf = [0u8; 16 * 11];
        soff -= sstride;
        for dline in buf.chunks_mut(16) {
            for i in 0..8 {
                dline[i] = mc_filter!(bicubic; src, soff + i, 1, coeffs_w);
            }
            soff += sstride;
        }
        let mut soff = 16;
        for dline in dst.chunks_mut(dstride).take(8) {
            for i in 0..8 {
                dline[i] = mc_filter!(bicubic; buf, soff + i, 16, coeffs_h);
            }
            soff += 16;
        }
    }
}