]>
Commit | Line | Data |
---|---|---|
42005e25 KS |
1 | use std::arch::asm; |
2 | use super::super::clip_u8; | |
3 | ||
/// Row stride, in elements, of the on-stack scratch blocks used by the
/// multi-pass interpolators in this file (each block holds
/// `TMP_BUF_STRIDE * 16` elements).
const TMP_BUF_STRIDE: usize = 32;
5 | ||
6 | fn interp_block1(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize, hor: bool, avg0: bool) { | |
7 | unsafe { | |
8 | let step = if hor { 1 } else { sstride }; | |
9 | let avgidx = if avg0 { step * 2 } else { step * 3 }; | |
10 | let mut src = src.as_ptr(); | |
11 | let mut dst = dst.as_mut_ptr(); | |
12 | for _ in 0..h { | |
13 | for _ in 0..w { | |
14 | let t = clip_u8(( i16::from(*src) | |
15 | - 5 * i16::from(*src.add(step)) | |
16 | + 20 * i16::from(*src.add(step * 2)) | |
17 | + 20 * i16::from(*src.add(step * 3)) | |
18 | - 5 * i16::from(*src.add(step * 4)) | |
19 | + i16::from(*src.add(step * 5)) | |
20 | + 16) >> 5); | |
21 | *dst = ((u16::from(t) + u16::from(*src.add(avgidx)) + 1) >> 1) as u8; | |
22 | src = src.add(1); | |
23 | dst = dst.add(1); | |
24 | } | |
25 | dst = dst.sub(w).add(dstride); | |
26 | src = src.sub(w).add(sstride); | |
27 | } | |
28 | } | |
29 | } | |
30 | ||
31 | fn interp_block2(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize, hor: bool) { | |
32 | unsafe { | |
33 | let step = if hor { 1 } else { sstride }; | |
34 | let mut pix = dst.as_mut_ptr(); | |
35 | let mut src = src.as_ptr(); | |
36 | for _ in 0..h { | |
37 | for x in 0..w { | |
38 | *pix.add(x) = clip_u8(( i16::from(*src) | |
39 | - 5 * i16::from(*src.add(step)) | |
40 | + 20 * i16::from(*src.add(step * 2)) | |
41 | + 20 * i16::from(*src.add(step * 3)) | |
42 | - 5 * i16::from(*src.add(step * 4)) | |
43 | + i16::from(*src.add(step * 5)) | |
44 | + 16) >> 5); | |
45 | src = src.add(1); | |
46 | } | |
47 | pix = pix.add(dstride); | |
48 | src = src.sub(w); | |
49 | src = src.add(sstride); | |
50 | } | |
51 | } | |
52 | } | |
53 | ||
54 | fn mc_avg_tmp(dst: &mut [u8], dstride: usize, w: usize, h: usize, tmp: &[u8], tmp2: &[u8]) { | |
55 | unsafe { | |
56 | let mut src1 = tmp.as_ptr(); | |
57 | let mut src2 = tmp2.as_ptr(); | |
58 | let mut dst = dst.as_mut_ptr(); | |
59 | for _ in 0..h { | |
60 | for x in 0..w { | |
61 | let a = *src1.add(x); | |
62 | let b = *src2.add(x); | |
63 | *dst.add(x) = ((u16::from(a) + u16::from(b) + 1) >> 1) as u8; | |
64 | } | |
65 | dst = dst.add(dstride); | |
66 | src1 = src1.add(TMP_BUF_STRIDE); | |
67 | src2 = src2.add(TMP_BUF_STRIDE); | |
68 | } | |
69 | } | |
70 | } | |
71 | ||
72 | fn h264_mc01(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
73 | interp_block1(dst, dstride, &src[sstride * 2..], sstride, w, h, true, true); | |
74 | } | |
75 | ||
76 | fn h264_mc02(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
77 | interp_block2(dst, dstride, &src[sstride * 2..], sstride, w, h, true); | |
78 | } | |
79 | ||
80 | fn h264_mc03(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
81 | interp_block1(dst, dstride, &src[sstride * 2..], sstride, w, h, true, false); | |
82 | } | |
83 | ||
84 | fn h264_mc10(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
85 | interp_block1(dst, dstride, &src[2..], sstride, w, h, false, true); | |
86 | } | |
87 | ||
88 | fn h264_mc11(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
89 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
90 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
91 | h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); |
92 | h264_mc20(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h); | |
93 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
94 | } | |
95 | ||
96 | fn h264_mc12(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
97 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
98 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
99 | h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); |
100 | h264_mc22(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h); | |
101 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
102 | } | |
103 | ||
104 | fn h264_mc13(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
105 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
106 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
107 | h264_mc02(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); |
108 | h264_mc20(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h); | |
109 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
110 | } | |
111 | ||
112 | fn h264_mc20(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
113 | interp_block2(dst, dstride, &src[2..], sstride, w, h, false); | |
114 | } | |
115 | ||
116 | fn h264_mc21(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
117 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
118 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
119 | h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); |
120 | h264_mc20(&mut tmp2, TMP_BUF_STRIDE, src, sstride, w, h); | |
121 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
122 | } | |
123 | ||
124 | fn h264_mc22(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee | 125 | let mut tmp: [i32; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
42005e25 KS |
126 | unsafe { |
127 | let mut src = src.as_ptr(); | |
128 | let mut dst = tmp.as_mut_ptr(); | |
129 | for _ in 0..h { | |
130 | for _ in 0..w+5 { | |
131 | *dst = i32::from(*src) | |
132 | - 5 * i32::from(*src.add(sstride)) | |
133 | + 20 * i32::from(*src.add(sstride * 2)) | |
134 | + 20 * i32::from(*src.add(sstride * 3)) | |
135 | - 5 * i32::from(*src.add(sstride * 4)) | |
136 | + i32::from(*src.add(sstride * 5)); | |
137 | dst = dst.add(1); | |
138 | src = src.add(1); | |
139 | } | |
140 | src = src.sub(w+5).add(sstride); | |
141 | dst = dst.sub(w+5).add(TMP_BUF_STRIDE); | |
142 | } | |
143 | } | |
144 | unsafe { | |
145 | let mut dst = dst.as_mut_ptr(); | |
146 | let mut src = tmp.as_ptr(); | |
147 | for _ in 0..h { | |
148 | for _ in 0..w { | |
149 | *dst = clip_u8(((*src - 5 * *src.add(1) + 20 * *src.add(2) + 20 * *src.add(3) - 5 * *src.add(4) + *src.add(5) + 512) >> 10) as i16); | |
150 | dst = dst.add(1); | |
151 | src = src.add(1); | |
152 | } | |
153 | dst = dst.sub(w).add(dstride); | |
154 | src = src.sub(w).add(TMP_BUF_STRIDE); | |
155 | } | |
156 | } | |
157 | } | |
158 | ||
159 | fn h264_mc23(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
160 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
161 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
162 | h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); |
163 | h264_mc20(&mut tmp2, TMP_BUF_STRIDE, &src[1..], sstride, w, h); | |
164 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
165 | } | |
166 | ||
167 | fn h264_mc30(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
168 | interp_block1(dst, dstride, &src[2..], sstride, w, h, false, false); | |
169 | } | |
170 | ||
171 | fn h264_mc31(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
172 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
173 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
174 | h264_mc20(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); |
175 | h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h); | |
176 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
177 | } | |
178 | ||
179 | fn h264_mc32(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
180 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
181 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
182 | h264_mc22(&mut tmp, TMP_BUF_STRIDE, src, sstride, w, h); |
183 | h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h); | |
184 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
185 | } | |
186 | ||
187 | fn h264_mc33(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, w: usize, h: usize) { | |
3fdbd1ee KS |
188 | let mut tmp : [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; |
189 | let mut tmp2: [u8; TMP_BUF_STRIDE * 16] = unsafe { let arr = std::mem::MaybeUninit::uninit(); arr.assume_init() }; | |
42005e25 KS |
190 | h264_mc20(&mut tmp, TMP_BUF_STRIDE, &src[1..], sstride, w, h); |
191 | h264_mc02(&mut tmp2, TMP_BUF_STRIDE, &src[sstride..], sstride, w, h); | |
192 | mc_avg_tmp(dst, dstride, w, h, &tmp, &tmp2); | |
193 | } | |
194 | ||
/// Generates three fixed-width entry points for a
/// `fn(dst, dstride, src, sstride, w, h)` routine: `$func4`, `$func8` and
/// `$func16` call `$orig` with `w` set to 4, 8 and 16 respectively and forward
/// every other argument unchanged.
macro_rules! luma_mc {
    // Internal rule: one wrapper that pins the width argument.
    (@wrap $orig:ident, $name:ident, $width:expr) => {
        fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) {
            $orig(dst, dstride, src, sstride, $width, h);
        }
    };
    ($orig:ident, $func4:ident, $func8:ident, $func16:ident) => {
        luma_mc!(@wrap $orig, $func4, 4);
        luma_mc!(@wrap $orig, $func8, 8);
        luma_mc!(@wrap $orig, $func16, 16);
    };
}
208 | ||
/// Generates `fn $func(dst, dstride, src, sstride, h)`, a plain row copy
/// written in x86 inline assembly.
///
/// Each loop iteration loads four consecutive source rows into `xmm0..xmm3`
/// with the `$load` instruction and writes them to four destination rows with
/// the `$store` instruction, then advances both pointers by four rows and
/// subtracts 4 from the row counter. The number of bytes copied per row is
/// whatever `$load`/`$store` move.
///
/// Requirements the generated function cannot verify on its own:
/// * `h` must be a positive multiple of 4 — the loop only stops when the
///   counter reaches exactly zero;
/// * `src` and `dst` must cover `h` rows at their strides;
/// * any alignment demanded by `$load`/`$store` must hold for every row.
macro_rules! mc00_template {
    ($func:ident, $load:expr, $store:expr) => {
        fn $func(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, h: usize) {
            // `sub {h}, 4` / `jnz` never terminates cleanly for any other row
            // count and would walk far past both buffers.
            debug_assert!(h > 0 && h % 4 == 0, "row count must be a positive multiple of 4");
            // SAFETY: the assembly dereferences raw pointers derived from the
            // slices; soundness relies on the requirements listed on the macro.
            unsafe {
                asm!(
                    // tmps/tmpd point two rows below src/dst.
                    "lea {tmps}, [{src} + {sstride} * 2]",
                    "lea {tmpd}, [{dst} + {dstride} * 2]",
                    "2:",
                    concat!($load, " xmm0, [{src}]"),
                    concat!($load, " xmm1, [{src} + {sstride}]"),
                    concat!($load, " xmm2, [{tmps}]"),
                    concat!($load, " xmm3, [{tmps} + {sstride}]"),
                    // Stores are interleaved with the pointer updates.
                    concat!($store, " [{dst}], xmm0"),
                    "lea {src}, [{src} + {sstride}*4]",
                    concat!($store, " [{dst} + {dstride}], xmm1"),
                    "lea {tmps}, [{tmps} + {sstride}*4]",
                    concat!($store, " [{tmpd}], xmm2"),
                    "lea {dst}, [{dst} + {dstride}*4]",
                    concat!($store, " [{tmpd} + {dstride}], xmm3"),
                    "lea {tmpd}, [{tmpd} + {dstride}*4]",
                    "sub {h}, 4",
                    "jnz 2b",
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    h = inout(reg) h => _,
                    tmps = out(reg) _,
                    tmpd = out(reg) _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                );
            }
        }
    }
}
247 | ||
// Plain copy routines, one per row width moved by the chosen instructions
// (`movups`/`movaps`: 16 bytes, `movq`: 8 bytes, `movd`: 4 bytes).
// NOTE(review): the 16-byte variant stores with `movaps`, which faults when the
// destination address is not 16-byte aligned — confirm every caller guarantees
// aligned destination rows.
mc00_template!(h264_mc00_16, "movups", "movaps");
mc00_template!(h264_mc00_8, "movq", "movq");
mc00_template!(h264_mc00_4, "movd", "movd");

// Fixed-width (4/8/16) entry points for each remaining `h264_mcNN` routine.
luma_mc!(h264_mc01, h264_mc01_4, h264_mc01_8, h264_mc01_16);
luma_mc!(h264_mc02, h264_mc02_4, h264_mc02_8, h264_mc02_16);
luma_mc!(h264_mc03, h264_mc03_4, h264_mc03_8, h264_mc03_16);
luma_mc!(h264_mc10, h264_mc10_4, h264_mc10_8, h264_mc10_16);
luma_mc!(h264_mc11, h264_mc11_4, h264_mc11_8, h264_mc11_16);
luma_mc!(h264_mc12, h264_mc12_4, h264_mc12_8, h264_mc12_16);
luma_mc!(h264_mc13, h264_mc13_4, h264_mc13_8, h264_mc13_16);
luma_mc!(h264_mc20, h264_mc20_4, h264_mc20_8, h264_mc20_16);
luma_mc!(h264_mc21, h264_mc21_4, h264_mc21_8, h264_mc21_16);
luma_mc!(h264_mc22, h264_mc22_4, h264_mc22_8, h264_mc22_16);
luma_mc!(h264_mc23, h264_mc23_4, h264_mc23_8, h264_mc23_16);
luma_mc!(h264_mc30, h264_mc30_4, h264_mc30_8, h264_mc30_16);
luma_mc!(h264_mc31, h264_mc31_4, h264_mc31_8, h264_mc31_16);
luma_mc!(h264_mc32, h264_mc32_4, h264_mc32_8, h264_mc32_16);
luma_mc!(h264_mc33, h264_mc33_4, h264_mc33_8, h264_mc33_16);
267 | ||
268 | pub const H264_LUMA_INTERP: &[[super::super::MCFunc; 16]; 3] = &[ | |
269 | [ | |
270 | h264_mc00_4, h264_mc01_4, h264_mc02_4, h264_mc03_4, | |
271 | h264_mc10_4, h264_mc11_4, h264_mc12_4, h264_mc13_4, | |
272 | h264_mc20_4, h264_mc21_4, h264_mc22_4, h264_mc23_4, | |
273 | h264_mc30_4, h264_mc31_4, h264_mc32_4, h264_mc33_4 | |
274 | ], [ | |
275 | h264_mc00_8, h264_mc01_8, h264_mc02_8, h264_mc03_8, | |
276 | h264_mc10_8, h264_mc11_8, h264_mc12_8, h264_mc13_8, | |
277 | h264_mc20_8, h264_mc21_8, h264_mc22_8, h264_mc23_8, | |
278 | h264_mc30_8, h264_mc31_8, h264_mc32_8, h264_mc33_8 | |
279 | ], [ | |
280 | h264_mc00_16, h264_mc01_16, h264_mc02_16, h264_mc03_16, | |
281 | h264_mc10_16, h264_mc11_16, h264_mc12_16, h264_mc13_16, | |
282 | h264_mc20_16, h264_mc21_16, h264_mc22_16, h264_mc23_16, | |
283 | h264_mc30_16, h264_mc31_16, h264_mc32_16, h264_mc33_16 | |
284 | ] | |
285 | ]; |