]>
Commit | Line | Data |
---|---|---|
42005e25 KS |
1 | use std::arch::asm; |
2 | ||
// Generates `pub fn $name(dst, dstride, src, sstride, bh)`, which averages a block
// of `src` into `dst` in place, two rows per loop iteration.
//
// `$mov` is the SSE move mnemonic used to load and store one row and therefore
// fixes the block width: `"movd"` moves 4 bytes, `"movq"` moves 8 bytes.
macro_rules! avg_template {
    ($name: ident, $mov: expr) => {
        /// Replaces each pixel of a block in `dst` with the `pavgb` average of
        /// itself and the corresponding pixel of `src`, i.e.
        /// `dst = (dst + src + 1) >> 1` per byte.
        ///
        /// * `dst` / `dstride` - destination pixels and the distance in bytes between rows
        /// * `src` / `sstride` - source pixels and the distance in bytes between rows
        /// * `bh` - number of rows to process; rows are handled in pairs
        ///
        /// A height of zero is a no-op.
        ///
        /// # Panics
        ///
        /// Panics if `bh` is odd, or if either slice is too short to hold `bh`
        /// rows at the given stride.
        pub fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
            // Bytes moved per row by `$mov`; unknown mnemonics are treated as a full
            // 16-byte XMM access so the bounds check stays conservative.
            let row_bytes: usize = match $mov {
                "movd" => 4,
                "movq" => 8,
                _ => 16,
            };

            // The loop below counts down with `sub h, 2` / `jnz`, so a zero or odd
            // height would wrap around and run far past the end of both buffers.
            if bh == 0 {
                return;
            }
            assert!(bh % 2 == 0, "block height must be even");

            // The last row starts at `(bh - 1) * stride` and spans `row_bytes` bytes.
            let rows_fit = |len: usize, stride: usize| {
                (bh - 1)
                    .checked_mul(stride)
                    .and_then(|last_row| last_row.checked_add(row_bytes))
                    .map_or(false, |end| end <= len)
            };
            assert!(rows_fit(src.len(), sstride), "source slice is too short for the block");
            assert!(rows_fit(dst.len(), dstride), "destination slice is too short for the block");

            // SAFETY: `bh` is a non-zero multiple of two, so the loop runs exactly
            // `bh / 2` times, and the checks above guarantee that every row access
            // of `row_bytes` bytes stays inside `src` and `dst`.
            unsafe {
                asm!(
                    "2:",
                    concat!($mov, " xmm1, [{src}]"),
                    concat!($mov, " xmm3, [{src} + {sstride}]"),
                    concat!($mov, " xmm0, [{dst}]"),
                    concat!($mov, " xmm2, [{dst} + {dstride}]"),
                    "lea {src}, [{src} + {sstride} * 2]",
                    "pavgb xmm0, xmm1",
                    "pavgb xmm2, xmm3",
                    concat!($mov, " [{dst}], xmm0"),
                    concat!($mov, " [{dst} + {dstride}], xmm2"),
                    "lea {dst}, [{dst} + {dstride} * 2]",
                    "sub {h}, 2",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    h = inout(reg) bh => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                );
            }
        }
    }
}

avg_template!(avg_4, "movd");
avg_template!(avg_8, "movq");
38 | ||
/// Replaces each pixel of a 16-pixel-wide block in `dst` with the `pavgb`
/// average of itself and the corresponding pixel of `src`, i.e.
/// `dst = (dst + src + 1) >> 1` per byte.
///
/// * `dst` / `dstride` - destination pixels and the distance in bytes between rows
/// * `src` / `sstride` - source pixels and the distance in bytes between rows
/// * `bh` - number of rows to process; rows are handled four at a time
///
/// A height of zero is a no-op.
///
/// All memory accesses use `movaps` or a `pavgb` memory operand, which require
/// 16-byte-aligned addresses: both slices must start on a 16-byte boundary and
/// both strides must be multiples of 16.
///
/// # Panics
///
/// Panics if `bh` is not a multiple of four, or if either slice is too short to
/// hold `bh` rows of 16 bytes at the given stride.
pub fn avg_16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
    const ROW_BYTES: usize = 16;
    const ROWS_PER_PASS: usize = 4;

    // The loop counts down with `sub h, 4` / `jnz`; a zero height or one that is
    // not a multiple of four would wrap around and run past both buffers.
    if bh == 0 {
        return;
    }
    assert!(bh % ROWS_PER_PASS == 0, "block height must be a multiple of 4");

    // The last row starts at `(bh - 1) * stride` and spans `ROW_BYTES` bytes.
    let rows_fit = |len: usize, stride: usize| {
        (bh - 1)
            .checked_mul(stride)
            .and_then(|last_row| last_row.checked_add(ROW_BYTES))
            .map_or(false, |end| end <= len)
    };
    assert!(rows_fit(src.len(), sstride), "source slice is too short for the block");
    assert!(rows_fit(dst.len(), dstride), "destination slice is too short for the block");

    // SAFETY: `bh` is a non-zero multiple of four, so the loop runs exactly
    // `bh / 4` times, and the checks above guarantee that all 16-byte row
    // accesses stay inside `src` and `dst`.
    unsafe {
        asm!(
            // `stmp`/`dtmp` track rows 2 and 3 of the current group of four.
            "lea {stmp}, [{src} + {sstride} * 2]",
            "lea {dtmp}, [{dst} + {dstride} * 2]",
            "2:",
            "movaps xmm0, [{src}]",
            "movaps xmm1, [{src} + {sstride}]",
            "movaps xmm2, [{stmp}]",
            "movaps xmm3, [{stmp} + {sstride}]",
            "pavgb xmm0, [{dst}]",
            "pavgb xmm1, [{dst} + {dstride}]",
            "pavgb xmm2, [{dtmp}]",
            "pavgb xmm3, [{dtmp} + {dstride}]",
            "lea {src}, [{src} + {sstride} * 4]",
            "movaps [{dst}], xmm0",
            "lea {stmp}, [{stmp} + {sstride} * 4]",
            "movaps [{dst} + {dstride}], xmm1",
            "lea {dst}, [{dst} + {dstride} * 4]",
            "movaps [{dtmp}], xmm2",
            "movaps [{dtmp} + {dstride}], xmm3",
            "lea {dtmp}, [{dtmp} + {dstride} * 4]",
            "sub {h}, 4",
            "jnz 2b",
            src = inout(reg) src.as_ptr() => _,
            sstride = in(reg) sstride,
            dst = inout(reg) dst.as_mut_ptr() => _,
            dstride = in(reg) dstride,
            h = inout(reg) bh => _,
            stmp = out(reg) _,
            dtmp = out(reg) _,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
        );
    }
}
77 | ||
// Generates `pub fn $func(dst, stride, src, h, wparams)`, which writes a weighted
// copy of a `$width`-pixel-wide block from `src` (fixed row pitch of 16 bytes)
// into `dst` (row pitch `stride`).
//
// `$load` / `$store` are the SSE move mnemonics used for one row; they are
// expected to move exactly `$width` bytes.
macro_rules! put_block_weighted {
    ($func:ident, $width:expr, $load:expr, $store:expr) => {
        /// Writes `h` rows of weighted pixels from `src` into `dst`.
        ///
        /// `wparams` is `[weight, offset, shift]`. Each output pixel is
        /// `((pixel * weight + bias) >> shift) + offset` with
        /// `bias = (1 << shift) >> 1`, computed in 16-bit lanes and saturated to
        /// `0..=255` by `packuswb`. `wparams == [1, 0, 0]` is a plain row copy.
        ///
        /// * `dst` / `stride` - destination pixels and the distance in bytes between rows
        /// * `src` - source pixels, rows are 16 bytes apart
        /// * `h` - number of rows; zero is a no-op
        ///
        /// When the store mnemonic is `movaps`, `dst` must start on a 16-byte
        /// boundary and `stride` must be a multiple of 16.
        ///
        /// # Panics
        ///
        /// On the weighted path, panics if `src` or `dst` is too short to hold
        /// `h` rows. The copy path keeps its slice-indexing panics.
        pub fn $func(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) {
            let width: usize = $width;

            if wparams == [1, 0, 0] {
                for (dst, src) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) {
                    dst[..width].copy_from_slice(&src[..width]);
                }
                return;
            }

            // The asm loop counts down with `dec h` / `jnz`; a zero height would
            // wrap around and run far past the end of both buffers.
            if h == 0 {
                return;
            }

            // The last row starts at `(h - 1) * pitch` and spans `width` bytes.
            let rows_fit = |len: usize, pitch: usize| {
                (h - 1)
                    .checked_mul(pitch)
                    .and_then(|last_row| last_row.checked_add(width))
                    .map_or(false, |end| end <= len)
            };
            assert!(rows_fit(src.len(), 16), "source slice is too short for the block");
            assert!(rows_fit(dst.len(), stride), "destination slice is too short for the block");

            let weight = i32::from(wparams[0]);
            let offset = i32::from(wparams[1]);
            let wshift = i32::from(wparams[2]);
            let bias = (1 << wshift) >> 1;

            // SAFETY: `h` is non-zero, so the loop runs exactly `h` times, and the
            // checks above guarantee that every row access of `width` bytes stays
            // inside `src` and `dst`.
            unsafe {
                asm!(
                    // xmm0 = zero, xmm1/xmm2/xmm4 = weight/offset/bias broadcast to
                    // all eight 16-bit lanes, xmm3 = shift count.
                    "xorps xmm0, xmm0",
                    "movd xmm1, {weight:e}",
                    "movd xmm2, {offset:e}",
                    "movd xmm3, {wshift:e}",
                    "movd xmm4, {bias:e}",
                    "pshuflw xmm1, xmm1, 0",
                    "pshuflw xmm2, xmm2, 0",
                    "pshuflw xmm4, xmm4, 0",
                    "movlhps xmm1, xmm1",
                    "movlhps xmm2, xmm2",
                    "movlhps xmm4, xmm4",
                    "2:",
                    concat!($load, " xmm5, [{src}]"),
                    "add {src}, 16",
                    "movaps xmm7, xmm5",
                    // Widen the low and high eight bytes to 16-bit lanes.
                    "punpcklbw xmm5, xmm0",
                    "punpckhbw xmm7, xmm0",
                    "pmullw xmm5, xmm1",
                    "pmullw xmm7, xmm1",
                    "paddw xmm5, xmm4",
                    "paddw xmm7, xmm4",
                    "psraw xmm5, xmm3",
                    "psraw xmm7, xmm3",
                    "paddw xmm5, xmm2",
                    "paddw xmm7, xmm2",
                    "packuswb xmm5, xmm7",
                    concat!($store, " [{dst}], xmm5"),
                    "add {dst}, {stride}",
                    "dec {h}",
                    "jnz 2b",
                    h = inout(reg) h => _,
                    src = inout(reg) src.as_ptr() => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    stride = in(reg) stride,
                    weight = in(reg) weight,
                    offset = in(reg) offset,
                    wshift = in(reg) wshift,
                    bias = in(reg) bias,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                    out("xmm7") _,
                );
            }
        }
    }
}

put_block_weighted!(put_block_weighted_16, 16, "movups", "movaps");
put_block_weighted!(put_block_weighted_8, 8, "movq", "movq");
put_block_weighted!(put_block_weighted_4, 4, "movd", "movd");
148 | ||
// Generates `pub fn $func(dst, stride, src0, src1, h, wparams)`, which blends two
// source blocks (fixed row pitch of 16 bytes each) into `dst` (row pitch `stride`).
//
// `$mov` is the SSE move mnemonic used to load and store one row and therefore
// fixes the block width: `"movd"` moves 4 bytes, `"movq"` moves 8 bytes.
macro_rules! put_block_weighted2 {
    ($func:ident, $mov:expr) => {
        /// Writes `h` rows of pixels blended from `src0` and `src1` into `dst`.
        ///
        /// `wparams` is `[weight0, offset0, weight1, offset1, shift]`.
        /// With `wparams == [1, 0, 1, 0, 0]` each output pixel is the `pavgb`
        /// average `(a + b + 1) >> 1`. Otherwise it is
        /// `((a * weight0 + b * weight1 + bias) >> (shift + 1)) + offset`, where
        /// `bias = (1 << (shift + 1)) >> 1` and
        /// `offset = (offset0 + offset1 + 1) >> 1`, computed in 16-bit lanes and
        /// saturated to `0..=255` by `packuswb`.
        ///
        /// * `dst` / `stride` - destination pixels and the distance in bytes between rows
        /// * `src0`, `src1` - source pixels, rows are 16 bytes apart
        /// * `h` - number of rows; zero is a no-op
        ///
        /// # Panics
        ///
        /// Panics if `src0`, `src1` or `dst` is too short to hold `h` rows.
        pub fn $func(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
            // Bytes moved per row by `$mov`; unknown mnemonics are treated as a full
            // 16-byte XMM access so the bounds check stays conservative.
            let row_bytes: usize = match $mov {
                "movd" => 4,
                "movq" => 8,
                _ => 16,
            };

            // Both asm loops count down with `dec h` / `jnz`; a zero height would
            // wrap around and run far past the end of the buffers.
            if h == 0 {
                return;
            }

            // The last row starts at `(h - 1) * pitch` and spans `row_bytes` bytes.
            let rows_fit = |len: usize, pitch: usize| {
                (h - 1)
                    .checked_mul(pitch)
                    .and_then(|last_row| last_row.checked_add(row_bytes))
                    .map_or(false, |end| end <= len)
            };
            assert!(rows_fit(src0.len(), 16), "first source slice is too short for the block");
            assert!(rows_fit(src1.len(), 16), "second source slice is too short for the block");
            assert!(rows_fit(dst.len(), stride), "destination slice is too short for the block");

            if wparams == [1, 0, 1, 0, 0] {
                // SAFETY: `h` is non-zero, so the loop runs exactly `h` times, and
                // the checks above keep every row access inside its slice.
                unsafe {
                    asm!(
                        "2:",
                        concat!($mov, " xmm0, [{src0}]"),
                        concat!($mov, " xmm1, [{src1}]"),
                        "add {src0}, 16",
                        "pavgb xmm0, xmm1",
                        "add {src1}, 16",
                        concat!($mov, " [{dst}], xmm0"),
                        "add {dst}, {stride}",
                        "dec {h}",
                        "jnz 2b",
                        src0 = inout(reg) src0.as_ptr() => _,
                        src1 = inout(reg) src1.as_ptr() => _,
                        dst = inout(reg) dst.as_mut_ptr() => _,
                        stride = in(reg) stride,
                        h = inout(reg) h => _,
                        out("xmm0") _,
                        out("xmm1") _,
                    );
                }
                return;
            }
            let weight0 = i32::from(wparams[0]);
            let offset0 = i32::from(wparams[1]);
            let weight1 = i32::from(wparams[2]);
            let offset1 = i32::from(wparams[3]);
            let wshift = i32::from(wparams[4]) + 1;
            let offset = (offset0 + offset1 + 1) >> 1;
            let bias = (1 << wshift) >> 1;

            // SAFETY: `h` is non-zero, so the loop runs exactly `h` times, and the
            // checks above keep every row access inside its slice.
            unsafe {
                asm!(
                    // xmm0 = zero, xmm1/xmm2 = weights, xmm3 = offset, xmm5 = bias,
                    // each broadcast to all eight 16-bit lanes; xmm4 = shift count.
                    "xorps xmm0, xmm0",
                    "movd xmm1, {weight0:e}",
                    "movd xmm2, {weight1:e}",
                    "movd xmm3, {offset:e}",
                    "movd xmm4, {wshift:e}",
                    "movd xmm5, {bias:e}",
                    "pshuflw xmm1, xmm1, 0",
                    "pshuflw xmm2, xmm2, 0",
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm5, xmm5, 0",
                    "movlhps xmm1, xmm1",
                    "movlhps xmm2, xmm2",
                    "movlhps xmm3, xmm3",
                    "movlhps xmm5, xmm5",
                    "2:",
                    concat!($mov, " xmm6, [{src0}]"),
                    "add {src0}, 16",
                    concat!($mov, " xmm7, [{src1}]"),
                    "add {src1}, 16",
                    "punpcklbw xmm6, xmm0",
                    "punpcklbw xmm7, xmm0",
                    "pmullw xmm6, xmm1",
                    "pmullw xmm7, xmm2",
                    "paddw xmm6, xmm5",
                    "paddw xmm6, xmm7",
                    "psraw xmm6, xmm4",
                    "paddw xmm6, xmm3",
                    "movhlps xmm7, xmm6",
                    "packuswb xmm6, xmm7",
                    concat!($mov, " [{dst}], xmm6"),
                    "add {dst}, {stride}",
                    "dec {h}",
                    "jnz 2b",
                    h = inout(reg) h => _,
                    src0 = inout(reg) src0.as_ptr() => _,
                    src1 = inout(reg) src1.as_ptr() => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    stride = in(reg) stride,
                    weight0 = in(reg) weight0,
                    weight1 = in(reg) weight1,
                    offset = in(reg) offset,
                    wshift = in(reg) wshift,
                    bias = in(reg) bias,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                    out("xmm6") _,
                    out("xmm7") _,
                );
            }
        }
    }
}

put_block_weighted2!(put_block_weighted2_8, "movq");
put_block_weighted2!(put_block_weighted2_4, "movd");
245 | ||
/// Writes `h` rows of 16 pixels blended from `src0` and `src1` into `dst`.
///
/// `wparams` is `[weight0, offset0, weight1, offset1, shift]`.
/// With `wparams == [1, 0, 1, 0, 0]` each output pixel is the `pavgb` average
/// `(a + b + 1) >> 1`. Otherwise it is
/// `((a * weight0 + b * weight1 + bias) >> (shift + 1)) + offset`, where
/// `bias = (1 << (shift + 1)) >> 1` and `offset = (offset0 + offset1 + 1) >> 1`,
/// computed in 16-bit lanes and saturated to `0..=255` by `packuswb`.
///
/// * `dst` / `stride` - destination pixels and the distance in bytes between rows
/// * `src0`, `src1` - source pixels, rows are 16 bytes apart
/// * `h` - number of rows; zero is a no-op
///
/// The plain-average path stores with `movaps`, so for it `dst` must start on a
/// 16-byte boundary and `stride` must be a multiple of 16.
///
/// # Panics
///
/// Panics if `src0`, `src1` or `dst` is too short to hold `h` rows of 16 bytes.
pub fn put_block_weighted2_16(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
    const ROW_BYTES: usize = 16;
    const SRC_PITCH: usize = 16;

    // Both asm loops count down with `dec h` / `jnz`; a zero height would wrap
    // around and run far past the end of the buffers.
    if h == 0 {
        return;
    }

    // The last row starts at `(h - 1) * pitch` and spans `ROW_BYTES` bytes.
    let rows_fit = |len: usize, pitch: usize| {
        (h - 1)
            .checked_mul(pitch)
            .and_then(|last_row| last_row.checked_add(ROW_BYTES))
            .map_or(false, |end| end <= len)
    };
    assert!(rows_fit(src0.len(), SRC_PITCH), "first source slice is too short for the block");
    assert!(rows_fit(src1.len(), SRC_PITCH), "second source slice is too short for the block");
    assert!(rows_fit(dst.len(), stride), "destination slice is too short for the block");

    if wparams == [1, 0, 1, 0, 0] {
        // SAFETY: `h` is non-zero, so the loop runs exactly `h` times, and the
        // checks above keep every 16-byte row access inside its slice.
        unsafe {
            asm!(
                "2:",
                "movups xmm0, [{src0}]",
                "movups xmm1, [{src1}]",
                "add {src0}, 16",
                "pavgb xmm0, xmm1",
                "add {src1}, 16",
                "movaps [{dst}], xmm0",
                "add {dst}, {stride}",
                "dec {h}",
                "jnz 2b",
                src0 = inout(reg) src0.as_ptr() => _,
                src1 = inout(reg) src1.as_ptr() => _,
                dst = inout(reg) dst.as_mut_ptr() => _,
                stride = in(reg) stride,
                h = inout(reg) h => _,
                out("xmm0") _,
                out("xmm1") _,
            );
        }
        return;
    }
    let weight0 = i32::from(wparams[0]);
    let offset0 = i32::from(wparams[1]);
    let weight1 = i32::from(wparams[2]);
    let offset1 = i32::from(wparams[3]);
    let wshift = i32::from(wparams[4]) + 1;
    let offset = (offset0 + offset1 + 1) >> 1;
    let bias = (1 << wshift) >> 1;

    // SAFETY: `h` is non-zero, so the loop runs exactly `h` times, and the checks
    // above keep both 8-byte halves of every row inside their slices.
    unsafe {
        asm!(
            // xmm0 = zero, xmm1/xmm2 = weights, xmm3 = offset, xmm5 = bias, each
            // broadcast to all eight 16-bit lanes; xmm4 = shift count.
            "xorps xmm0, xmm0",
            "movd xmm1, {weight0:e}",
            "movd xmm2, {weight1:e}",
            "movd xmm3, {offset:e}",
            "movd xmm4, {wshift:e}",
            "movd xmm5, {bias:e}",
            "pshuflw xmm1, xmm1, 0",
            "pshuflw xmm2, xmm2, 0",
            "pshuflw xmm3, xmm3, 0",
            "pshuflw xmm5, xmm5, 0",
            "movlhps xmm1, xmm1",
            "movlhps xmm2, xmm2",
            "movlhps xmm3, xmm3",
            "movlhps xmm5, xmm5",
            "2:",
            // Left eight pixels of the row.
            "movq xmm6, [{src0}]",
            "movq xmm7, [{src1}]",
            "punpcklbw xmm6, xmm0",
            "punpcklbw xmm7, xmm0",
            "pmullw xmm6, xmm1",
            "pmullw xmm7, xmm2",
            "paddw xmm6, xmm5",
            "paddw xmm6, xmm7",
            "psraw xmm6, xmm4",
            "paddw xmm6, xmm3",
            "movhlps xmm7, xmm6",
            "packuswb xmm6, xmm7",
            "movq [{dst}], xmm6",
            // Right eight pixels of the row.
            "movq xmm6, [{src0} + 8]",
            "add {src0}, 16",
            "movq xmm7, [{src1} + 8]",
            "add {src1}, 16",
            "punpcklbw xmm6, xmm0",
            "punpcklbw xmm7, xmm0",
            "pmullw xmm6, xmm1",
            "pmullw xmm7, xmm2",
            "paddw xmm6, xmm5",
            "paddw xmm6, xmm7",
            "psraw xmm6, xmm4",
            "paddw xmm6, xmm3",
            "movhlps xmm7, xmm6",
            "packuswb xmm6, xmm7",
            "movq [{dst} + 8], xmm6",
            "add {dst}, {stride}",
            "dec {h}",
            "jnz 2b",
            h = inout(reg) h => _,
            src0 = inout(reg) src0.as_ptr() => _,
            src1 = inout(reg) src1.as_ptr() => _,
            dst = inout(reg) dst.as_mut_ptr() => _,
            stride = in(reg) stride,
            weight0 = in(reg) weight0,
            weight1 = in(reg) weight1,
            offset = in(reg) offset,
            wshift = in(reg) wshift,
            bias = in(reg) bias,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
            out("xmm4") _,
            out("xmm5") _,
            out("xmm6") _,
            out("xmm7") _,
        );
    }
}