/// Generates a `pub fn $name` that combines rows of `src` into `dst` with x86
/// SSE inline assembly. `$mov` is the load/store mnemonic spliced into every
/// memory access, so it fixes how many bytes of each row are touched.
// NOTE(review): the `asm!` opener, the loop label/branch and the instruction(s)
// between the loads and the stores are not visible in this excerpt; the
// comments below describe only the visible lines.
3 macro_rules! avg_template {
4 ($name: ident, $mov: expr) => {
// `dstride` and `sstride` are added directly to the pointers inside the asm,
// so they are byte distances between consecutive rows. `bh` is handed to the
// asm as the `h` operand.
5 pub fn $name(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
// Load two consecutive source rows (xmm1, xmm3) and the two matching
// destination rows (xmm0, xmm2).
9 concat!($mov, " xmm1, [{src}]"),
10 concat!($mov, " xmm3, [{src} + {sstride}]"),
11 concat!($mov, " xmm0, [{dst}]"),
12 concat!($mov, " xmm2, [{dst} + {dstride}]"),
// Step the source pointer past the two rows just loaded.
13 "lea {src}, [{src} + {sstride} * 2]",
// NOTE(review): the instruction(s) combining xmm1/xmm3 into xmm0/xmm2 are not
// visible here; presumably `pavgb`, as in `avg_16` -- confirm.
// Write both result rows back, then step the destination by two rows.
16 concat!($mov, " [{dst}], xmm0"),
17 concat!($mov, " [{dst} + {dstride}], xmm2"),
18 "lea {dst}, [{dst} + {dstride} * 2]",
// The pointers and `h` are `inout ... => _`: the asm may modify these
// registers and their final values are discarded. The strides are read-only.
21 src = inout(reg) src.as_ptr() => _,
22 sstride = in(reg) sstride,
23 dst = inout(reg) dst.as_mut_ptr() => _,
24 dstride = in(reg) dstride,
// presumably the remaining-row counter of a loop whose control instructions
// are not visible here -- TODO confirm
25 h = inout(reg) bh => _,
// Instantiate the template for two row widths: `movd` moves 4 bytes per
// access (`avg_4`), `movq` moves 8 bytes per access (`avg_8`).
36 avg_template!(avg_4, "movd");
37 avg_template!(avg_8, "movq");
/// Averages 16-byte-wide rows of `src` into `dst` in place using `pavgb`
/// (per-byte unsigned average with rounding), four rows per pass of the
/// visible instruction sequence.
///
/// `dstride` and `sstride` are byte distances between consecutive rows; `bh`
/// is passed to the asm as `h`.
// NOTE(review): every access uses `movaps` or a `pavgb` memory operand, both
// of which require 16-byte-aligned addresses. Callers must therefore keep every
// row of `src` and `dst` aligned -- confirm this holds (the weighted 16-wide
// functions below load their sources with `movups` instead).
// NOTE(review): the `asm!` opener, loop label/branch and the `stmp`/`dtmp`
// operand declarations are not visible in this excerpt.
39 pub fn avg_16(dst: &mut [u8], dstride: usize, src: &[u8], sstride: usize, bh: usize) {
// Secondary pointers two rows ahead, so rows 2 and 3 can be addressed as
// `[tmp]` and `[tmp + stride]` without a stride*3 term.
42 "lea {stmp}, [{src} + {sstride} * 2]",
43 "lea {dtmp}, [{dst} + {dstride} * 2]",
// Load four consecutive source rows into xmm0..xmm3.
45 "movaps xmm0, [{src}]",
46 "movaps xmm1, [{src} + {sstride}]",
47 "movaps xmm2, [{stmp}]",
48 "movaps xmm3, [{stmp} + {sstride}]",
// Average each source row with the matching destination row.
49 "pavgb xmm0, [{dst}]",
50 "pavgb xmm1, [{dst} + {dstride}]",
51 "pavgb xmm2, [{dtmp}]",
52 "pavgb xmm3, [{dtmp} + {dstride}]",
// Store the four results, interleaved with advancing all four pointers by
// four rows. Each destination pointer is advanced only after its last store.
53 "lea {src}, [{src} + {sstride} * 4]",
54 "movaps [{dst}], xmm0",
55 "lea {stmp}, [{stmp} + {sstride} * 4]",
56 "movaps [{dst} + {dstride}], xmm1",
57 "lea {dst}, [{dst} + {dstride} * 4]",
58 "movaps [{dtmp}], xmm2",
59 "movaps [{dtmp} + {dstride}], xmm3",
60 "lea {dtmp}, [{dtmp} + {dstride} * 4]",
// The pointers and `h` are `inout ... => _`: the asm may modify these
// registers and their final values are discarded. The strides are read-only.
63 src = inout(reg) src.as_ptr() => _,
64 sstride = in(reg) sstride,
65 dst = inout(reg) dst.as_mut_ptr() => _,
66 dstride = in(reg) dstride,
// presumably the remaining-row counter of a loop whose control instructions
// are not visible here -- TODO confirm
67 h = inout(reg) bh => _,
/// Generates a `pub fn $func` that writes `h` rows of `$width` bytes from `src`
/// to `dst`, applying the weighting parameters in `wparams`.
///
/// `$load` and `$store` are the SSE mnemonics used to read a source row and to
/// write a destination row in the inline-assembly path.
// NOTE(review): the `asm!` opener, the loop control, the multiply/add/shift
// instructions and the tail of the operand list are not visible in this
// excerpt; the comments below describe only the visible lines.
78 macro_rules! put_block_weighted {
79 ($func:ident, $width:expr, $load:expr, $store:expr) => {
// `wparams` is read as [weight, offset, shift] (see the `let` bindings below).
80 pub fn $func(dst: &mut [u8], stride: usize, src: &[u8], h: usize, wparams: [i8; 3]) {
// Fast path for weight 1, offset 0, shift 0: copy the first `$width` bytes of
// each row. Source rows are taken 16 bytes apart, destination rows `stride`
// bytes apart, and at most `h` rows are copied.
81 if wparams == [1, 0, 0] {
82 for (dst, src) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) {
83 dst[..$width].copy_from_slice(&src[..$width]);
// NOTE(review): the lines closing this fast path (return or `else`) are not
// visible here.
// Widen the parameters to i32 so they can be moved into XMM registers through
// 32-bit general-purpose registers.
86 let weight = i32::from(wparams[0]);
87 let offset = i32::from(wparams[1]);
88 let wshift = i32::from(wparams[2]);
// Half of `1 << wshift` (0 when `wshift` is 0); presumably the rounding term
// added before the shift -- the shift itself is not visible here.
89 let bias = (1 << wshift) >> 1;
// Move each parameter into the low 32 bits of an XMM register. The `:e`
// modifier selects the 32-bit name of the allocated register.
94 "movd xmm1, {weight:e}",
95 "movd xmm2, {offset:e}",
96 "movd xmm3, {wshift:e}",
97 "movd xmm4, {bias:e}",
// Broadcast weight, offset and bias to all eight 16-bit lanes: `pshuflw ..., 0`
// replicates word 0 across the low four words, and `movlhps` copies the low
// quadword into the high quadword. xmm3 (the shift) is left as a scalar.
98 "pshuflw xmm1, xmm1, 0",
99 "pshuflw xmm2, xmm2, 0",
100 "pshuflw xmm4, xmm4, 0",
101 "movlhps xmm1, xmm1",
102 "movlhps xmm2, xmm2",
103 "movlhps xmm4, xmm4",
// Load one source row.
105 concat!($load, " xmm5, [{src}]"),
// Interleave the low bytes of xmm5 and the high bytes of xmm7 with xmm0.
// NOTE(review): presumably xmm0 is zero and xmm7 is a copy of xmm5, making
// this a byte-to-word zero extension -- their setup is not visible here.
108 "punpcklbw xmm5, xmm0",
109 "punpckhbw xmm7, xmm0",
// Narrow the 16-bit lanes of xmm5 and xmm7 back to bytes with unsigned
// saturation, store the row and step `dst` to the next row.
118 "packuswb xmm5, xmm7",
119 concat!($store, " [{dst}], xmm5"),
120 "add {dst}, {stride}",
// `h` and the pointers are `inout ... => _`: the asm may modify these
// registers and their final values are discarded. The rest are read-only.
123 h = inout(reg) h => _,
124 src = inout(reg) src.as_ptr() => _,
125 dst = inout(reg) dst.as_mut_ptr() => _,
126 stride = in(reg) stride,
127 weight = in(reg) weight,
128 offset = in(reg) offset,
129 wshift = in(reg) wshift,
// Instantiate the single-source weighted writer for widths 16, 8 and 4.
// The 16-wide variant loads with `movups` (no alignment requirement) but
// stores with `movaps`, which requires each destination row to be 16-byte
// aligned. The narrower variants use `movq` (8 bytes) and `movd` (4 bytes).
145 put_block_weighted!(put_block_weighted_16, 16, "movups", "movaps");
146 put_block_weighted!(put_block_weighted_8, 8, "movq", "movq");
147 put_block_weighted!(put_block_weighted_4, 4, "movd", "movd");
/// Generates a `pub fn $func` that combines rows from two sources (`src0`,
/// `src1`) into `dst` using the weighting parameters in `wparams`.
///
/// `$mov` is the SSE mnemonic used for every row load and store, so it fixes
/// the number of bytes handled per row.
// NOTE(review): the `asm!` openers, the loop control, the source-pointer
// advances, the arithmetic instructions and the tail of the operand lists are
// not visible in this excerpt; the comments below describe only the visible
// lines.
149 macro_rules! put_block_weighted2 {
150 ($func:ident, $mov:expr) => {
// `wparams` is read as [weight0, offset0, weight1, offset1, shift] (see the
// `let` bindings below).
151 pub fn $func(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
// Fast path for both weights 1, both offsets 0 and shift 0.
152 if wparams == [1, 0, 1, 0, 0] {
// Load one row from each source.
156 concat!($mov, " xmm0, [{src0}]"),
157 concat!($mov, " xmm1, [{src1}]"),
// NOTE(review): the instruction combining xmm1 into xmm0 is not visible here;
// presumably a `pavgb` -- confirm.
// Store the result row and step `dst` to the next row.
161 concat!($mov, " [{dst}], xmm0"),
162 "add {dst}, {stride}",
// The pointers and `h` are `inout ... => _`: the asm may modify these
// registers and their final values are discarded.
165 src0 = inout(reg) src0.as_ptr() => _,
166 src1 = inout(reg) src1.as_ptr() => _,
167 dst = inout(reg) dst.as_mut_ptr() => _,
168 stride = in(reg) stride,
169 h = inout(reg) h => _,
// General path: widen the parameters to i32 so they can be moved into XMM
// registers through 32-bit general-purpose registers.
176 let weight0 = i32::from(wparams[0]);
177 let offset0 = i32::from(wparams[1]);
178 let weight1 = i32::from(wparams[2]);
179 let offset1 = i32::from(wparams[3]);
// One more than the stored shift.
180 let wshift = i32::from(wparams[4]) + 1;
// Rounded mean of the two per-source offsets.
181 let offset = (offset0 + offset1 + 1) >> 1;
// Half of `1 << wshift`; presumably the rounding term added before the shift
// -- the shift itself is not visible here.
182 let bias = (1 << wshift) >> 1;
// Move each parameter into the low 32 bits of an XMM register. The `:e`
// modifier selects the 32-bit name of the allocated register.
187 "movd xmm1, {weight0:e}",
188 "movd xmm2, {weight1:e}",
189 "movd xmm3, {offset:e}",
190 "movd xmm4, {wshift:e}",
191 "movd xmm5, {bias:e}",
// Broadcast both weights, the offset and the bias to all eight 16-bit lanes:
// `pshuflw ..., 0` replicates word 0 across the low four words, and `movlhps`
// copies the low quadword into the high quadword. xmm4 (the shift) is left as
// a scalar.
192 "pshuflw xmm1, xmm1, 0",
193 "pshuflw xmm2, xmm2, 0",
194 "pshuflw xmm3, xmm3, 0",
195 "pshuflw xmm5, xmm5, 0",
196 "movlhps xmm1, xmm1",
197 "movlhps xmm2, xmm2",
198 "movlhps xmm3, xmm3",
199 "movlhps xmm5, xmm5",
// Load one row from each source.
201 concat!($mov, " xmm6, [{src0}]"),
203 concat!($mov, " xmm7, [{src1}]"),
// Interleave the low eight bytes of each row with xmm0.
// NOTE(review): presumably xmm0 is zero, making this a byte-to-word zero
// extension -- its setup is not visible here.
205 "punpcklbw xmm6, xmm0",
206 "punpcklbw xmm7, xmm0",
// Copy the high quadword of xmm6 into the low quadword of xmm7, then narrow
// the words of xmm6 and xmm7 to bytes with unsigned saturation. This leaves
// the eight result bytes in the low quadword of xmm6.
213 "movhlps xmm7, xmm6",
214 "packuswb xmm6, xmm7",
// Store the result row and step `dst` to the next row.
215 concat!($mov, " [{dst}], xmm6"),
216 "add {dst}, {stride}",
// `h` and the pointers are `inout ... => _`: the asm may modify these
// registers and their final values are discarded. The rest are read-only.
219 h = inout(reg) h => _,
220 src0 = inout(reg) src0.as_ptr() => _,
221 src1 = inout(reg) src1.as_ptr() => _,
222 dst = inout(reg) dst.as_mut_ptr() => _,
223 stride = in(reg) stride,
224 weight0 = in(reg) weight0,
225 weight1 = in(reg) weight1,
226 offset = in(reg) offset,
227 wshift = in(reg) wshift,
// Instantiate the two-source weighted writer for 8-byte rows (`movq`) and
// 4-byte rows (`movd`).
243 put_block_weighted2!(put_block_weighted2_8, "movq");
244 put_block_weighted2!(put_block_weighted2_4, "movd");
/// Combines 16-byte-wide rows from two sources (`src0`, `src1`) into `dst`
/// using the weighting parameters in `wparams`, read as
/// [weight0, offset0, weight1, offset1, shift].
///
/// `stride` is added directly to the destination pointer after each row, so
/// it is the byte distance between destination rows.
// NOTE(review): this definition is cut off at the end of the excerpt, and the
// `asm!` openers, loop control, source-pointer advances and arithmetic
// instructions are not visible; the comments below describe only the visible
// lines.
246 pub fn put_block_weighted2_16(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8], h: usize, wparams: [i8; 5]) {
// Fast path for both weights 1, both offsets 0 and shift 0.
247 if wparams == [1, 0, 1, 0, 0] {
// Load one row from each source. `movups` has no alignment requirement.
251 "movups xmm0, [{src0}]",
252 "movups xmm1, [{src1}]",
// NOTE(review): the instruction combining xmm1 into xmm0 is not visible here;
// presumably a `pavgb` -- confirm.
// Store the result row and step `dst` to the next row. `movaps` requires each
// destination row to be 16-byte aligned.
256 "movaps [{dst}], xmm0",
257 "add {dst}, {stride}",
// The pointers and `h` are `inout ... => _`: the asm may modify these
// registers and their final values are discarded.
260 src0 = inout(reg) src0.as_ptr() => _,
261 src1 = inout(reg) src1.as_ptr() => _,
262 dst = inout(reg) dst.as_mut_ptr() => _,
263 stride = in(reg) stride,
264 h = inout(reg) h => _,
// General path: widen the parameters to i32 so they can be moved into XMM
// registers through 32-bit general-purpose registers.
271 let weight0 = i32::from(wparams[0]);
272 let offset0 = i32::from(wparams[1]);
273 let weight1 = i32::from(wparams[2]);
274 let offset1 = i32::from(wparams[3]);
// One more than the stored shift.
275 let wshift = i32::from(wparams[4]) + 1;
// Rounded mean of the two per-source offsets.
276 let offset = (offset0 + offset1 + 1) >> 1;
// Half of `1 << wshift`; presumably the rounding term added before the shift
// -- the shift itself is not visible here.
277 let bias = (1 << wshift) >> 1;
// Move each parameter into the low 32 bits of an XMM register. The `:e`
// modifier selects the 32-bit name of the allocated register.
282 "movd xmm1, {weight0:e}",
283 "movd xmm2, {weight1:e}",
284 "movd xmm3, {offset:e}",
285 "movd xmm4, {wshift:e}",
286 "movd xmm5, {bias:e}",
// Broadcast both weights, the offset and the bias to all eight 16-bit lanes:
// `pshuflw ..., 0` replicates word 0 across the low four words, and `movlhps`
// copies the low quadword into the high quadword. xmm4 (the shift) is left as
// a scalar.
287 "pshuflw xmm1, xmm1, 0",
288 "pshuflw xmm2, xmm2, 0",
289 "pshuflw xmm3, xmm3, 0",
290 "pshuflw xmm5, xmm5, 0",
291 "movlhps xmm1, xmm1",
292 "movlhps xmm2, xmm2",
293 "movlhps xmm3, xmm3",
294 "movlhps xmm5, xmm5",
// First half of the row: load bytes 0..8 of each source and interleave them
// with xmm0.
// NOTE(review): presumably xmm0 is zero, making this a byte-to-word zero
// extension -- its setup is not visible here.
296 "movq xmm6, [{src0}]",
297 "movq xmm7, [{src1}]",
298 "punpcklbw xmm6, xmm0",
299 "punpcklbw xmm7, xmm0",
// Copy the high quadword of xmm6 into the low quadword of xmm7, narrow the
// words to bytes with unsigned saturation, and store bytes 0..8 of the row.
306 "movhlps xmm7, xmm6",
307 "packuswb xmm6, xmm7",
308 "movq [{dst}], xmm6",
// Second half of the row: the same visible sequence on bytes 8..16.
309 "movq xmm6, [{src0} + 8]",
311 "movq xmm7, [{src1} + 8]",
313 "punpcklbw xmm6, xmm0",
314 "punpcklbw xmm7, xmm0",
321 "movhlps xmm7, xmm6",
322 "packuswb xmm6, xmm7",
323 "movq [{dst} + 8], xmm6",
// Step `dst` to the next row.
324 "add {dst}, {stride}",
// `h` and the pointers are `inout ... => _`: the asm may modify these
// registers and their final values are discarded. The rest are read-only.
327 h = inout(reg) h => _,
328 src0 = inout(reg) src0.as_ptr() => _,
329 src1 = inout(reg) src1.as_ptr() => _,
330 dst = inout(reg) dst.as_mut_ptr() => _,
331 stride = in(reg) stride,
332 weight0 = in(reg) weight0,
333 weight1 = in(reg) weight1,
334 offset = in(reg) offset,
335 wshift = in(reg) wshift,