h264: add SIMD optimisations for x86_64 (not enabled by default)
[nihav.git] / nihav-itu / src / codecs / h264 / dsp / mc / x86 / blockdsp.rs
1 use std::arch::asm;
2
/// Generates an in-place byte-averaging function `dst = avg(dst, src)` built
/// on the SSE `pavgb` instruction (rounded average, `(a + b + 1) >> 1`).
/// `$mov` selects the row width: `movd` loads/stores 4 bytes, `movq` 8 bytes
/// (neither has an alignment requirement).
macro_rules! avg_template {
    ($name: ident, $mov: expr) => {
        /// Averages `bh` rows of `src` into `dst` in place.
        ///
        /// NOTE(review): two rows are processed per iteration and the loop
        /// terminates when `sub {h}, 2` reaches zero, so `bh` must be even
        /// and non-zero — otherwise the counter wraps and the buffers are
        /// overrun. Confirm all callers honour this.
        pub fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
            // SAFETY: the asm performs no bounds checks; the caller must
            // guarantee that both slices cover `bh` rows at the given
            // strides.
            unsafe {
                asm!(
                    // Per iteration: load two source rows and two
                    // destination rows, average with pavgb, store back.
                    "2:",
                    concat!($mov, " xmm1, [{src}]"),
                    concat!($mov, " xmm3, [{src} + {sstride}]"),
                    concat!($mov, " xmm0, [{dst}]"),
                    concat!($mov, " xmm2, [{dst} + {dstride}]"),
                    "lea {src}, [{src} + {sstride} * 2]",
                    "pavgb xmm0, xmm1",
                    "pavgb xmm2, xmm3",
                    concat!($mov, " [{dst}], xmm0"),
                    concat!($mov, " [{dst} + {dstride}], xmm2"),
                    "lea {dst}, [{dst} + {dstride} * 2]",
                    "sub {h}, 2",
                    "jnz 2b",
                    src = inout(reg) src.as_ptr() => _,
                    sstride = in(reg) sstride,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    dstride = in(reg) dstride,
                    h = inout(reg) bh => _,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                );
            }
        }
    }
}
35
// 4- and 8-pixel-wide averaging instantiations (unaligned movd/movq access).
avg_template!(avg_4, "movd");
avg_template!(avg_8, "movq");
38
/// Averages a 16-pixel-wide block in place: `dst = avg(dst, src)` via
/// `pavgb` (rounded average, `(a + b + 1) >> 1`).
///
/// Four rows are handled per iteration through two row-pointer pairs
/// (`src`/`stmp`, `dst`/`dtmp`), and the loop exits when `sub {h}, 4`
/// reaches zero, so `bh` must be a non-zero multiple of four.
///
/// NOTE(review): `movaps` and `pavgb xmm, m128` require 16-byte-aligned
/// memory operands, so every `src` and `dst` row (and therefore both
/// strides) must be 16-byte aligned — confirm callers guarantee this.
pub fn avg_16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
    // SAFETY: the asm performs no bounds checks; the caller must ensure the
    // slices cover `bh` rows at the given strides and satisfy the alignment
    // requirements above.
    unsafe {
        asm!(
            // Secondary pointers start two rows ahead so four rows per
            // iteration can use simple base+stride addressing.
            "lea {stmp}, [{src} + {sstride} * 2]",
            "lea {dtmp}, [{dst} + {dstride} * 2]",
            "2:",
            "movaps xmm0, [{src}]",
            "movaps xmm1, [{src} + {sstride}]",
            "movaps xmm2, [{stmp}]",
            "movaps xmm3, [{stmp} + {sstride}]",
            // Average directly against the destination rows in memory.
            "pavgb xmm0, [{dst}]",
            "pavgb xmm1, [{dst} + {dstride}]",
            "pavgb xmm2, [{dtmp}]",
            "pavgb xmm3, [{dtmp} + {dstride}]",
            "lea {src}, [{src} + {sstride} * 4]",
            "movaps [{dst}], xmm0",
            "lea {stmp}, [{stmp} + {sstride} * 4]",
            "movaps [{dst} + {dstride}], xmm1",
            "lea {dst}, [{dst} + {dstride} * 4]",
            "movaps [{dtmp}], xmm2",
            "movaps [{dtmp} + {dstride}], xmm3",
            "lea {dtmp}, [{dtmp} + {dstride} * 4]",
            "sub {h}, 4",
            "jnz 2b",
            src = inout(reg) src.as_ptr() => _,
            sstride = in(reg) sstride,
            dst = inout(reg) dst.as_mut_ptr() => _,
            dstride = in(reg) dstride,
            h = inout(reg) bh => _,
            stmp = out(reg) _,
            dtmp = out(reg) _,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
        );
    }
}
77
/// Generates a single-reference weighted-prediction store: each output pixel
/// becomes `clip_u8(((pix * weight + bias) >> wshift) + offset)`.
///
/// `wparams` is `[weight, offset, wshift]`; the identity triple `[1, 0, 0]`
/// takes a plain row-copy fast path. Source rows are packed with a fixed
/// 16-byte stride regardless of the block width `$width`.
/// `$load`/`$store` choose the per-row access width (`movups`/`movq`/`movd`).
macro_rules! put_block_weighted {
    ($func:ident, $width:expr, $load:expr, $store:expr) => {
        pub fn $func(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) {
            if wparams == [1, 0, 0] {
                // Identity weighting: copy the rows verbatim.
                for (dst, src) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) {
                    dst[..$width].copy_from_slice(&src[..$width]);
                }
            } else {
                let weight = i32::from(wparams[0]);
                let offset = i32::from(wparams[1]);
                let wshift = i32::from(wparams[2]);
                // Rounding term added before the arithmetic right shift.
                let bias = (1 << wshift) >> 1;

                // SAFETY: the asm performs no bounds checks; the caller must
                // supply `h` source rows of 16 bytes and `h` destination
                // rows of `stride` bytes, with `h` non-zero (the dec/jnz
                // loop would otherwise wrap).
                unsafe {
                    asm!(
                        // xmm0 = 0 for byte->word unpacking. Broadcast the
                        // 16-bit constants weight/offset/bias to all eight
                        // word lanes via pshuflw (low four words) followed
                        // by movlhps (duplicate into the high half). xmm3
                        // keeps the shift count in its low quadword, which
                        // is where psraw reads it from.
                        "xorps xmm0, xmm0",
                        "movd xmm1, {weight:e}",
                        "movd xmm2, {offset:e}",
                        "movd xmm3, {wshift:e}",
                        "movd xmm4, {bias:e}",
                        "pshuflw xmm1, xmm1, 0",
                        "pshuflw xmm2, xmm2, 0",
                        "pshuflw xmm4, xmm4, 0",
                        "movlhps xmm1, xmm1",
                        "movlhps xmm2, xmm2",
                        "movlhps xmm4, xmm4",
                        "2:",
                        // Load one row and widen bytes to words, low half in
                        // xmm5 and high half in xmm7.
                        concat!($load, " xmm5, [{src}]"),
                        "add {src}, 16",
                        "movaps xmm7, xmm5",
                        "punpcklbw xmm5, xmm0",
                        "punpckhbw xmm7, xmm0",
                        // (pix * weight + bias) >> wshift, then + offset.
                        "pmullw xmm5, xmm1",
                        "pmullw xmm7, xmm1",
                        "paddw xmm5, xmm4",
                        "paddw xmm7, xmm4",
                        "psraw xmm5, xmm3",
                        "psraw xmm7, xmm3",
                        "paddw xmm5, xmm2",
                        "paddw xmm7, xmm2",
                        // Saturate back to unsigned bytes and store the row.
                        "packuswb xmm5, xmm7",
                        concat!($store, " [{dst}], xmm5"),
                        "add {dst}, {stride}",
                        "dec {h}",
                        "jnz 2b",
                        h = inout(reg) h => _,
                        src = inout(reg) src.as_ptr() => _,
                        dst = inout(reg) dst.as_mut_ptr() => _,
                        stride = in(reg) stride,
                        weight = in(reg) weight,
                        offset = in(reg) offset,
                        wshift = in(reg) wshift,
                        bias = in(reg) bias,
                        out("xmm0") _,
                        out("xmm1") _,
                        out("xmm2") _,
                        out("xmm3") _,
                        out("xmm4") _,
                        out("xmm5") _,
                        out("xmm7") _,
                    );
                }
            }
        }
    }
}
144
// Width-specific instantiations. NOTE(review): the 16-pixel variant loads
// with unaligned movups but stores with movaps, so its destination rows must
// be 16-byte aligned; movq/movd have no alignment requirement.
put_block_weighted!(put_block_weighted_16, 16, "movups", "movaps");
put_block_weighted!(put_block_weighted_8, 8, "movq", "movq");
put_block_weighted!(put_block_weighted_4, 4, "movd", "movd");
148
/// Generates a two-reference (bidirectional) weighted-prediction store:
/// each output pixel is
/// `clip_u8(((p0*w0 + p1*w1 + bias) >> (logWD + 1)) + ((o0 + o1 + 1) >> 1))`.
///
/// `wparams` is `[w0, o0, w1, o1, logWD]`; the identity set
/// `[1, 0, 1, 0, 0]` reduces to a rounded average and takes a `pavgb` fast
/// path. Both sources are packed with a fixed 16-byte row stride.
/// `$mov` selects the row width (`movq` for 8 pixels, `movd` for 4).
macro_rules! put_block_weighted2 {
    ($func:ident, $mov:expr) => {
        pub fn $func(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
            if wparams == [1, 0, 1, 0, 0] {
                // Fast path: plain rounded average of the two references.
                // SAFETY: no bounds checks in the asm; both sources must
                // hold `h` rows of 16 bytes, `dst` must hold `h` rows of
                // `stride` bytes, and `h` must be non-zero.
                unsafe {
                    asm!(
                        "2:",
                        concat!($mov, " xmm0, [{src0}]"),
                        concat!($mov, " xmm1, [{src1}]"),
                        "add {src0}, 16",
                        "pavgb xmm0, xmm1",
                        "add {src1}, 16",
                        concat!($mov, " [{dst}], xmm0"),
                        "add {dst}, {stride}",
                        "dec {h}",
                        "jnz 2b",
                        src0 = inout(reg) src0.as_ptr() => _,
                        src1 = inout(reg) src1.as_ptr() => _,
                        dst = inout(reg) dst.as_mut_ptr() => _,
                        stride = in(reg) stride,
                        h = inout(reg) h => _,
                        out("xmm0") _,
                        out("xmm1") _,
                    );
                }
                return;
            }
            let weight0 = i32::from(wparams[0]);
            let offset0 = i32::from(wparams[1]);
            let weight1 = i32::from(wparams[2]);
            let offset1 = i32::from(wparams[3]);
            // Bidirectional weighting shifts by logWD + 1.
            let wshift = i32::from(wparams[4]) + 1;
            // Combined offset: rounded mean of the two per-reference offsets.
            let offset = (offset0 + offset1 + 1) >> 1;
            // Rounding term added before the arithmetic right shift.
            let bias = (1 << wshift) >> 1;

            // SAFETY: no bounds checks in the asm; same caller obligations
            // as the fast path above (row counts, 16-byte source stride,
            // non-zero `h`).
            unsafe {
                asm!(
                    // xmm0 = 0 for byte->word unpacking. Broadcast the
                    // 16-bit constants w0/w1/offset/bias to all eight word
                    // lanes via pshuflw + movlhps; xmm4 keeps the shift
                    // count in its low quadword for psraw.
                    "xorps xmm0, xmm0",
                    "movd xmm1, {weight0:e}",
                    "movd xmm2, {weight1:e}",
                    "movd xmm3, {offset:e}",
                    "movd xmm4, {wshift:e}",
                    "movd xmm5, {bias:e}",
                    "pshuflw xmm1, xmm1, 0",
                    "pshuflw xmm2, xmm2, 0",
                    "pshuflw xmm3, xmm3, 0",
                    "pshuflw xmm5, xmm5, 0",
                    "movlhps xmm1, xmm1",
                    "movlhps xmm2, xmm2",
                    "movlhps xmm3, xmm3",
                    "movlhps xmm5, xmm5",
                    "2:",
                    // Load one row from each reference, widen to words.
                    concat!($mov, " xmm6, [{src0}]"),
                    "add {src0}, 16",
                    concat!($mov, " xmm7, [{src1}]"),
                    "add {src1}, 16",
                    "punpcklbw xmm6, xmm0",
                    "punpcklbw xmm7, xmm0",
                    // (p0*w0 + p1*w1 + bias) >> wshift, then + offset.
                    "pmullw xmm6, xmm1",
                    "pmullw xmm7, xmm2",
                    "paddw xmm6, xmm5",
                    "paddw xmm6, xmm7",
                    "psraw xmm6, xmm4",
                    "paddw xmm6, xmm3",
                    // Pack the words back to saturated bytes; only the low
                    // half of xmm6 is stored by $mov.
                    "movhlps xmm7, xmm6",
                    "packuswb xmm6, xmm7",
                    concat!($mov, " [{dst}], xmm6"),
                    "add {dst}, {stride}",
                    "dec {h}",
                    "jnz 2b",
                    h = inout(reg) h => _,
                    src0 = inout(reg) src0.as_ptr() => _,
                    src1 = inout(reg) src1.as_ptr() => _,
                    dst = inout(reg) dst.as_mut_ptr() => _,
                    stride = in(reg) stride,
                    weight0 = in(reg) weight0,
                    weight1 = in(reg) weight1,
                    offset = in(reg) offset,
                    wshift = in(reg) wshift,
                    bias = in(reg) bias,
                    out("xmm0") _,
                    out("xmm1") _,
                    out("xmm2") _,
                    out("xmm3") _,
                    out("xmm4") _,
                    out("xmm5") _,
                    out("xmm6") _,
                    out("xmm7") _,
                );
            }
        }
    }
}
242
// 8- and 4-pixel-wide bidirectional weighting instantiations.
put_block_weighted2!(put_block_weighted2_8, "movq");
put_block_weighted2!(put_block_weighted2_4, "movd");
245
/// 16-pixel-wide two-reference weighted prediction (hand-unrolled
/// counterpart of `put_block_weighted2!`): each output pixel is
/// `clip_u8(((p0*w0 + p1*w1 + bias) >> (logWD + 1)) + ((o0 + o1 + 1) >> 1))`.
///
/// `wparams` is `[w0, o0, w1, o1, logWD]`; `[1, 0, 1, 0, 0]` reduces to a
/// rounded average and takes a `pavgb` fast path. Both sources are packed
/// with a fixed 16-byte row stride. The weighted path processes each row as
/// two 8-pixel halves so the intermediates fit in 16-bit word lanes.
///
/// NOTE(review): the fast path stores with `movaps`, so destination rows
/// must be 16-byte aligned there (the weighted path stores with movq and
/// has no such requirement) — confirm callers guarantee this.
pub fn put_block_weighted2_16(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
    if wparams == [1, 0, 1, 0, 0] {
        // Fast path: plain rounded average of the two references.
        // SAFETY: no bounds checks in the asm; both sources must hold `h`
        // rows of 16 bytes, `dst` must hold `h` rows of `stride` bytes
        // (16-byte aligned, see above), and `h` must be non-zero.
        unsafe {
            asm!(
                "2:",
                "movups xmm0, [{src0}]",
                "movups xmm1, [{src1}]",
                "add {src0}, 16",
                "pavgb xmm0, xmm1",
                "add {src1}, 16",
                "movaps [{dst}], xmm0",
                "add {dst}, {stride}",
                "dec {h}",
                "jnz 2b",
                src0 = inout(reg) src0.as_ptr() => _,
                src1 = inout(reg) src1.as_ptr() => _,
                dst = inout(reg) dst.as_mut_ptr() => _,
                stride = in(reg) stride,
                h = inout(reg) h => _,
                out("xmm0") _,
                out("xmm1") _,
            );
        }
        return;
    }
    let weight0 = i32::from(wparams[0]);
    let offset0 = i32::from(wparams[1]);
    let weight1 = i32::from(wparams[2]);
    let offset1 = i32::from(wparams[3]);
    // Bidirectional weighting shifts by logWD + 1.
    let wshift = i32::from(wparams[4]) + 1;
    // Combined offset: rounded mean of the two per-reference offsets.
    let offset = (offset0 + offset1 + 1) >> 1;
    // Rounding term added before the arithmetic right shift.
    let bias = (1 << wshift) >> 1;

    // SAFETY: no bounds checks in the asm; same caller obligations as the
    // fast path (row counts, 16-byte source stride, non-zero `h`), but no
    // destination alignment requirement (movq stores).
    unsafe {
        asm!(
            // xmm0 = 0 for byte->word unpacking. Broadcast the 16-bit
            // constants w0/w1/offset/bias to all eight word lanes via
            // pshuflw + movlhps; xmm4 keeps the shift count in its low
            // quadword for psraw.
            "xorps xmm0, xmm0",
            "movd xmm1, {weight0:e}",
            "movd xmm2, {weight1:e}",
            "movd xmm3, {offset:e}",
            "movd xmm4, {wshift:e}",
            "movd xmm5, {bias:e}",
            "pshuflw xmm1, xmm1, 0",
            "pshuflw xmm2, xmm2, 0",
            "pshuflw xmm3, xmm3, 0",
            "pshuflw xmm5, xmm5, 0",
            "movlhps xmm1, xmm1",
            "movlhps xmm2, xmm2",
            "movlhps xmm3, xmm3",
            "movlhps xmm5, xmm5",
            "2:",
            // First 8 pixels of the row: widen, weight, shift, offset,
            // pack to saturated bytes and store.
            "movq xmm6, [{src0}]",
            "movq xmm7, [{src1}]",
            "punpcklbw xmm6, xmm0",
            "punpcklbw xmm7, xmm0",
            "pmullw xmm6, xmm1",
            "pmullw xmm7, xmm2",
            "paddw xmm6, xmm5",
            "paddw xmm6, xmm7",
            "psraw xmm6, xmm4",
            "paddw xmm6, xmm3",
            "movhlps xmm7, xmm6",
            "packuswb xmm6, xmm7",
            "movq [{dst}], xmm6",
            // Second 8 pixels of the same row.
            "movq xmm6, [{src0} + 8]",
            "add {src0}, 16",
            "movq xmm7, [{src1} + 8]",
            "add {src1}, 16",
            "punpcklbw xmm6, xmm0",
            "punpcklbw xmm7, xmm0",
            "pmullw xmm6, xmm1",
            "pmullw xmm7, xmm2",
            "paddw xmm6, xmm5",
            "paddw xmm6, xmm7",
            "psraw xmm6, xmm4",
            "paddw xmm6, xmm3",
            "movhlps xmm7, xmm6",
            "packuswb xmm6, xmm7",
            "movq [{dst} + 8], xmm6",
            "add {dst}, {stride}",
            "dec {h}",
            "jnz 2b",
            h = inout(reg) h => _,
            src0 = inout(reg) src0.as_ptr() => _,
            src1 = inout(reg) src1.as_ptr() => _,
            dst = inout(reg) dst.as_mut_ptr() => _,
            stride = in(reg) stride,
            weight0 = in(reg) weight0,
            weight1 = in(reg) weight1,
            offset = in(reg) offset,
            wshift = in(reg) wshift,
            bias = in(reg) bias,
            out("xmm0") _,
            out("xmm1") _,
            out("xmm2") _,
            out("xmm3") _,
            out("xmm4") _,
            out("xmm5") _,
            out("xmm6") _,
            out("xmm7") _,
        );
    }
}
347 }