ape: switch large filter to 16-bit data and add x86_64 optimisation
[nihav.git] / nihav-llaudio / src / codecs / apepred.rs
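//! Prediction (adaptive filtering) stage of the Monkey's Audio (APE) decoder:
//! residues from the entropy decoder are passed through version-dependent
//! adaptive filters to reconstruct PCM samples. Comments in this file are
//! editorial summaries inferred from the code itself.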
const HISTORY_SIZE: usize = 512;

// Returns the negated sign of `val`: -1 for positive input, 1 for negative,
// 0 for zero. The inverted convention matches the direction of the
// sign-sign LMS coefficient updates performed by the filters below.
fn val2sign(val: i32) -> i32 {
    if val > 0 {
        -1
    } else if val < 0 {
        1
    } else {
        0
    }
}

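// Two generations of filter chains: `OldFilt` serves stream versions below
// 3930 with fixed cascades picked by the compression level, while `NewFilt`
// runs up to three adaptive FIR stages followed by a final low-order
// predictor per channel.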
pub struct OldFilt {
    version: u16,
    compression: u16,
}

pub struct NewFilt {
    version: u16,
    filters: [NFilterContext; 3],
    lfilt: LastFilterContext,
    rfilt: LastFilterContext,
}

#[allow(clippy::large_enum_variant)]
pub enum FilterMode {
    Old(OldFilt),
    New(NewFilt),
    None,
}
impl FilterMode {
    pub fn new(version: u16, compression: u16) -> Self {
        if version < 3930 {
            FilterMode::Old(OldFilt::new(version, compression))
        } else {
            FilterMode::New(NewFilt::new(version, compression))
        }
    }
    pub fn filter_mono(&mut self, l: &mut [i32]) {
        match *self {
            FilterMode::Old(ref mut ofilt) => ofilt.filter(l),
            FilterMode::New(ref mut nfilt) => nfilt.filter_mono(l),
            FilterMode::None => unreachable!(),
        };
    }
    pub fn filter_stereo(&mut self, l: &mut [i32], r: &mut [i32]) {
        match *self {
            FilterMode::Old(ref mut ofilt) => {
                ofilt.filter(l);
                ofilt.filter(r);
                // Undo mid/side decorrelation; in the old variant the left
                // channel carries the average and the right the difference.
                for (l, r) in l.iter_mut().zip(r.iter_mut()) {
                    let new_l = *l - *r / 2;
                    let new_r = *r + new_l;
                    *l = new_l;
                    *r = new_r;
                }
            },
            FilterMode::New(ref mut nfilt) => {
                nfilt.filter_stereo(l, r);
                // Same reconstruction with the channel roles swapped: here
                // the left channel carries the difference signal.
                for (l, r) in l.iter_mut().zip(r.iter_mut()) {
                    let new_l = *r - *l / 2;
                    let new_r = *l + new_l;
                    *l = new_l;
                    *r = new_r;
                }
            },
            FilterMode::None => unreachable!(),
        };
    }
}

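// Per-compression-level filter setup (levels 1000..=5000 map to rows 0..=4):
// each entry is (order / 16, fractional bits) for one of up to three
// cascaded stages; (0, 0) marks an unused stage.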
const NEW_FILTER_PARAMS: [[(u8, u8); 3]; 5] = [
    [ ( 0,  0), ( 0,  0), ( 0,  0) ],
    [ ( 1, 11), ( 0,  0), ( 0,  0) ],
    [ ( 4, 11), ( 0,  0), ( 0,  0) ],
    [ ( 2, 10), (16, 13), ( 0,  0) ],
    [ ( 1, 11), (16, 13), (80, 15) ],
];

#[derive(Clone,Default)]
struct NFilterContext {
    buf: Vec<i16>,      // adaptation values and delayed samples (see below)
    coeffs: Vec<i16>,   // adaptive FIR coefficients
    order: usize,
    bits: u8,           // fractional bits used when rounding the prediction
    avg: i32,           // running average of residual magnitudes (new mode)
    new: bool,          // version >= 3980 adaptation rules
}

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

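// SSE2 kernel: each iteration handles 16 taps, accumulating the prediction
// with `_mm_madd_epi16` and nudging the coefficients by the stored
// adaptation values in the direction given by `val`. The loop body is
// written three times so the sign test on `val` is hoisted out of the
// per-tap work.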
#[cfg(target_arch = "x86_64")]
fn adapt_loop(filt: &mut [i16], coeffs: &[i16], adapt: &[i16], val: i32) -> i32 {
    let mut sum = [0i32; 4];
    let iters = filt.len() / 16; // the filter order is always a multiple of 16
    unsafe {
        let mut sumv = _mm_setzero_si128();
        let mut fptr = filt.as_mut_ptr();
        let mut cptr = coeffs.as_ptr();
        let mut aptr = adapt.as_ptr();
        if val < 0 {
            for _ in 0..iters {
                let r0 = _mm_loadu_si128(cptr as *const __m128i);
                let r1 = _mm_loadu_si128(cptr.add(8) as *const __m128i);
                // Aligned loads/stores rely on the coefficient vector being
                // 16-byte aligned, which the default allocator provides in
                // practice for these allocation sizes.
                let c0 = _mm_load_si128(fptr as *const __m128i);
                let c1 = _mm_load_si128(fptr.add(8) as *const __m128i);
                sumv = _mm_add_epi32(sumv, _mm_madd_epi16(r0, c0));
                sumv = _mm_add_epi32(sumv, _mm_madd_epi16(r1, c1));
                let a0 = _mm_loadu_si128(aptr as *const __m128i);
                let a1 = _mm_loadu_si128(aptr.add(8) as *const __m128i);
                let c0 = _mm_add_epi16(c0, a0);
                let c1 = _mm_add_epi16(c1, a1);
                _mm_store_si128(fptr as *mut __m128i, c0);
                _mm_store_si128(fptr.add(8) as *mut __m128i, c1);
                fptr = fptr.add(16);
                cptr = cptr.add(16);
                aptr = aptr.add(16);
            }
        } else if val > 0 {
            for _ in 0..iters {
                let r0 = _mm_loadu_si128(cptr as *const __m128i);
                let r1 = _mm_loadu_si128(cptr.add(8) as *const __m128i);
                let c0 = _mm_load_si128(fptr as *const __m128i);
                let c1 = _mm_load_si128(fptr.add(8) as *const __m128i);
                sumv = _mm_add_epi32(sumv, _mm_madd_epi16(r0, c0));
                sumv = _mm_add_epi32(sumv, _mm_madd_epi16(r1, c1));
                let a0 = _mm_loadu_si128(aptr as *const __m128i);
                let a1 = _mm_loadu_si128(aptr.add(8) as *const __m128i);
                let c0 = _mm_sub_epi16(c0, a0);
                let c1 = _mm_sub_epi16(c1, a1);
                _mm_store_si128(fptr as *mut __m128i, c0);
                _mm_store_si128(fptr.add(8) as *mut __m128i, c1);
                fptr = fptr.add(16);
                cptr = cptr.add(16);
                aptr = aptr.add(16);
            }
        } else {
            // Zero residue: accumulate only, no coefficient adaptation.
            for _ in 0..iters {
                let r0 = _mm_loadu_si128(cptr as *const __m128i);
                let r1 = _mm_loadu_si128(cptr.add(8) as *const __m128i);
                let c0 = _mm_load_si128(fptr as *const __m128i);
                let c1 = _mm_load_si128(fptr.add(8) as *const __m128i);
                sumv = _mm_add_epi32(sumv, _mm_madd_epi16(r0, c0));
                sumv = _mm_add_epi32(sumv, _mm_madd_epi16(r1, c1));
                fptr = fptr.add(16);
                cptr = cptr.add(16);
                aptr = aptr.add(16);
            }
        }
        _mm_storeu_si128(sum.as_mut_ptr() as *mut __m128i, sumv);
    }
    sum[0] + sum[1] + sum[2] + sum[3]
}

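// Portable fallback with the same semantics as the SSE2 kernel above:
// one multiply-accumulate plus a sign-directed coefficient update per tap.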
#[cfg(not(target_arch = "x86_64"))]
fn adapt_loop(filt: &mut [i16], coeffs: &[i16], adapt: &[i16], val: i32) -> i32 {
    let mut sum = 0i32;
    for (coef, (res, adapt)) in filt.iter_mut().zip(coeffs.iter().zip(adapt.iter())) {
        sum += i32::from(*coef) * i32::from(*res);
        if val < 0 {
            *coef += *adapt;
        } else if val > 0 {
            *coef -= *adapt;
        }
    }
    sum
}

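// `buf` holds two adjacent sliding windows: adaptation values in
// [adapt_pos - order, adapt_pos) and delayed samples in
// [delay_pos - order, delay_pos). Once HISTORY_SIZE samples have been
// processed, the last `order * 2` entries are copied back to the start and
// both cursors rewind.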
impl NFilterContext {
    fn new(ord16: u8, bits: u8, new: bool) -> Self {
        let order = ord16 as usize * 16;
        Self {
            buf: if order > 0 { vec![0; order * 2 + HISTORY_SIZE] } else { Vec::new() },
            coeffs: vec![0; order],
            order,
            bits,
            avg: 0,
            new,
        }
    }
    fn reset(&mut self) {
        for el in self.buf[..self.order * 2].iter_mut() { *el = 0; }
        for el in self.coeffs.iter_mut() { *el = 0; }
        self.avg = 0;
    }
    fn apply(&mut self, dst: &mut [i32]) {
        if self.order == 0 { return; }
        let mut adapt_pos = self.order;
        let mut delay_pos = self.order * 2;
        for el in dst.iter_mut() {
            let sum = adapt_loop(&mut self.coeffs,
                                 &self.buf[delay_pos - self.order..],
                                 &self.buf[adapt_pos - self.order..], *el);
            // Round the fixed-point prediction and add it to the residue.
            let pred = (sum + (1 << (self.bits - 1))) >> self.bits;
            let val = *el + pred;
            *el = val;
            // Store the saturated 16-bit sample for future predictions.
            self.buf[delay_pos] = val.min(32767).max(-32768) as i16;
            if self.new {
                // Newer versions scale the stored sign by how far the current
                // magnitude deviates from its running average.
                let aval = val.abs();
                let sign = val2sign(val) as i16;
                self.buf[adapt_pos] = if aval == 0 {
                    0
                } else if aval <= self.avg * 4 / 3 {
                    sign * 8
                } else if aval <= self.avg * 3 {
                    sign * 16
                } else {
                    sign * 32
                };
                self.avg += (aval - self.avg) / 16;
                self.buf[adapt_pos - 1] >>= 1;
                self.buf[adapt_pos - 2] >>= 1;
                self.buf[adapt_pos - 8] >>= 1;
            } else {
                self.buf[adapt_pos] = 4 * (val2sign(val) as i16);
                self.buf[adapt_pos - 4] >>= 1;
                self.buf[adapt_pos - 8] >>= 1;
            }
            delay_pos += 1;
            adapt_pos += 1;
            // History window exhausted: move the live state back to the
            // front of the buffer and rewind the cursors.
            if delay_pos == HISTORY_SIZE + self.order * 2 {
                delay_pos = self.order * 2;
                adapt_pos = self.order;
                for i in 0..self.order * 2 {
                    self.buf[i] = self.buf[HISTORY_SIZE + i];
                }
            }
        }
    }
}

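// Final per-channel predictor of the new filter chain: an adaptive 4-tap
// stage (`*_a`) plus, for stereo, a 5-tap stage (`*_b`) that is fed the
// other channel's smoothed output.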
#[derive(Clone,Copy,Default)]
struct LastFilterContext {
    last_a: i32,
    filter_a: i32,
    filter_b: i32,
    coeffs_a: [i32; 4],
    coeffs_b: [i32; 5],
    delay_a: [i32; 4],
    adapt_a: [i32; 4],
    delay_b: [i32; 5],
    adapt_b: [i32; 5],
}

impl LastFilterContext {
    fn init(&mut self) {
        const COEFFS_A_NEW: [i32; 4] = [360, 317, -109, 98];

        self.filter_a = 0;
        self.filter_b = 0;
        self.coeffs_a = COEFFS_A_NEW;
        self.coeffs_b = [0; 5];
        self.last_a = 0;
        self.delay_a = [0; 4];
        self.adapt_a = [0; 4];
        self.delay_b = [0; 5];
        self.adapt_b = [0; 5];
    }
    fn predict_a(&mut self) -> i32 {
        // Shift the delay and adaptation lines; delay_a[1] holds the first
        // difference of the reconstructed signal.
        for i in (0..3).rev() {
            self.delay_a[i + 1] = self.delay_a[i];
            self.adapt_a[i + 1] = self.adapt_a[i];
        }
        self.delay_a[0] = self.last_a;
        self.delay_a[1] = self.last_a - self.delay_a[1];
        self.adapt_a[0] = val2sign(self.delay_a[0]);
        self.adapt_a[1] = val2sign(self.delay_a[1]);

        self.delay_a[0] * self.coeffs_a[0] +
        self.delay_a[1] * self.coeffs_a[1] +
        self.delay_a[2] * self.coeffs_a[2] +
        self.delay_a[3] * self.coeffs_a[3]
    }
    fn predict_b(&mut self, other_a: i32) -> i32 {
        for i in (0..4).rev() {
            self.delay_b[i + 1] = self.delay_b[i];
            self.adapt_b[i + 1] = self.adapt_b[i];
        }
        // High-pass the other channel's output before using it as context.
        self.delay_b[0] = other_a - ((self.filter_b * 31) >> 5);
        self.delay_b[1] = self.delay_b[0] - self.delay_b[1];
        self.adapt_b[0] = val2sign(self.delay_b[0]);
        self.adapt_b[1] = val2sign(self.delay_b[1]);

        self.filter_b = other_a;

        (self.delay_b[0] * self.coeffs_b[0] +
         self.delay_b[1] * self.coeffs_b[1] +
         self.delay_b[2] * self.coeffs_b[2] +
         self.delay_b[3] * self.coeffs_b[3] +
         self.delay_b[4] * self.coeffs_b[4]) >> 1
    }
    fn update_a(&mut self, pred: i32, diff: i32) -> i32 {
        // The prediction carries 10 fractional bits.
        self.last_a = diff + (pred >> 10);
        let sign = val2sign(diff);
        for i in 0..4 {
            self.coeffs_a[i] += self.adapt_a[i] * sign;
        }
        // Leaky integrator that smooths the reconstructed output.
        self.filter_a = self.last_a + ((self.filter_a * 31) >> 5);

        self.filter_a
    }
    fn update_b(&mut self, diff: i32) {
        let sign = val2sign(diff);
        for i in 0..5 {
            self.coeffs_b[i] += self.adapt_b[i] * sign;
        }
    }
    fn predict_3930(&mut self, diff: i32) -> i32 {
        // Combined predict-and-update variant used for versions 3930..=3949.
        for i in (0..3).rev() {
            self.delay_a[i + 1] = self.delay_a[i];
        }
        self.delay_a[0] = self.last_a;
        let d0 = self.delay_a[0];
        let d1 = self.delay_a[0] - self.delay_a[1];
        let d2 = self.delay_a[1] - self.delay_a[2];
        let d3 = self.delay_a[2] - self.delay_a[3];

        let pred = (self.coeffs_a[0] * d0 +
                    self.coeffs_a[1] * d1 +
                    self.coeffs_a[2] * d2 +
                    self.coeffs_a[3] * d3) >> 9;
        self.last_a = diff + pred;
        self.filter_a = self.last_a + ((self.filter_a * 31) >> 5);

        let sign = val2sign(diff);
        self.coeffs_a[0] += if d0 < 0 { sign } else { -sign };
        self.coeffs_a[1] += if d1 < 0 { sign } else { -sign };
        self.coeffs_a[2] += if d2 < 0 { sign } else { -sign };
        self.coeffs_a[3] += if d3 < 0 { sign } else { -sign };

        self.filter_a
    }
}

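// New filter chain driver: each call resets the FIR cascade, applies the
// configured stages, then runs the version-appropriate final predictor.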
impl NewFilt {
    fn new(version: u16, compression: u16) -> Self {
        let cidx = (compression / 1000) as usize - 1;
        let mut obj = Self {
            version,
            filters: [NFilterContext::default(), NFilterContext::default(), NFilterContext::default()],
            lfilt: LastFilterContext::default(),
            rfilt: LastFilterContext::default(),
        };
        let new = version >= 3980;
        for i in 0..3 {
            let (ord16, bits) = NEW_FILTER_PARAMS[cidx][i];
            obj.filters[i] = NFilterContext::new(ord16, bits, new);
        }
        obj
    }
    fn filter_mono(&mut self, dst: &mut [i32]) {
        for filt in self.filters.iter_mut() {
            filt.reset();
            filt.apply(dst);
        }
        self.lfilt.init();
        if self.version >= 3950 {
            for el in dst.iter_mut() {
                let pred = self.lfilt.predict_a();
                *el = self.lfilt.update_a(pred, *el);
            }
        } else {
            for el in dst.iter_mut() {
                *el = self.lfilt.predict_3930(*el);
            }
        }
    }
    fn filter_stereo(&mut self, l: &mut [i32], r: &mut [i32]) {
        for filt in self.filters.iter_mut() {
            filt.reset();
            filt.apply(l);
            filt.reset();
            filt.apply(r);
        }
        self.lfilt.init();
        self.rfilt.init();
        if self.version >= 3950 {
            for (l, r) in l.iter_mut().zip(r.iter_mut()) {
                // Each channel is predicted from its own history plus the
                // other channel's smoothed output.
                let mut pred = self.lfilt.predict_a();
                pred += self.lfilt.predict_b(self.rfilt.filter_a);
                let new_l = self.lfilt.update_a(pred, *l);
                self.lfilt.update_b(*l);
                *l = new_l;

                let mut pred = self.rfilt.predict_a();
                pred += self.rfilt.predict_b(self.lfilt.filter_a);
                let new_r = self.rfilt.update_a(pred, *r);
                self.rfilt.update_b(*r);
                *r = new_r;
            }
        } else {
            for (l, r) in l.iter_mut().zip(r.iter_mut()) {
                let new_l = self.lfilt.predict_3930(*r);
                let new_r = self.rfilt.predict_3930(*l);
                *l = new_l;
                *r = new_r;
            }
        }
    }
}

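// Old (pre-3930) filter chains; the compression level selects a fixed
// cascade: 1000 is "fast", 2000 "normal", 3000 "high", 4000 "extra high".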
impl OldFilt {
    fn new(version: u16, compression: u16) -> Self {
        Self {
            version, compression
        }
    }
    fn filter(&mut self, dst: &mut [i32]) {
        match self.compression {
            1000 => {
                Self::filter_fast(dst);
            },
            2000 => {
                Self::filter_normal(dst, 4, 10);
            },
            3000 => {
                Self::filter_high(dst, 16, 9);
                Self::filter_normal(dst, 16, 10);
            },
            4000 => {
                if self.version < 3830 {
                    Self::filter_high(dst, 128, 11);
                    Self::filter_normal(dst, 128, 10);
                } else {
                    Self::filter_extra_high(dst);
                    Self::filter_high(dst, 256, 12);
                    Self::filter_normal(dst, 256, 11);
                }
            },
            _ => unreachable!(),
        };
    }
    fn filter_fast(dst: &mut [i32]) {
        const COEFF_A_FAST: i32 = 375;

        if dst.len() <= 3 {
            return;
        }
        let mut delay = [dst[1], dst[0]];
        let mut last = dst[2];
        let mut filter = dst[2];
        let mut weight = COEFF_A_FAST;
        for el in dst[3..].iter_mut() {
            delay[1] = delay[0];
            delay[0] = last;
            // Linear extrapolation from the two previous samples with a
            // single adaptive weight; the output integrates the result.
            let pred = delay[0] * 2 - delay[1];
            last = *el + ((pred * weight) >> 9);
            if (*el ^ pred) > 0 {
                weight += 1;
            } else {
                weight -= 1;
            }
            filter += last;
            *el = filter;
        }
    }
    fn filter_normal(dst: &mut [i32], start: usize, shift: u8) {
        const COEFFS_A_NORMAL: [i32; 3] = [64, 115, 64];
        const COEFFS_B_NORMAL: [i32; 2] = [740, 0];

        let mut last = 0;
        let mut coeffs_a = COEFFS_A_NORMAL;
        let mut coeffs_b = COEFFS_B_NORMAL;
        let mut filter_a = 0;
        let mut filter_b = 0;
        let mut delay_a = [0; 3];
        let mut delay_b = [0; 2];

        for (i, el) in dst.iter_mut().enumerate() {
            delay_a[2] = delay_a[1]; delay_a[1] = delay_a[0]; delay_a[0] = last;
            delay_b[1] = delay_b[0]; delay_b[0] = filter_b;
            if i < start {
                // Warm-up: prime the delay lines while the output is just
                // the running sum of the residues.
                let val = *el + filter_a;
                last = *el;
                filter_b = *el;
                filter_a = val;
                *el = val;
                continue;
            }
            let a0 = delay_a[0] + (delay_a[2] - delay_a[1]) * 8;
            let a1 = (delay_a[0] - delay_a[1]) * 2;
            let a2 = delay_a[0];
            let b0 = delay_b[0] * 2 - delay_b[1];
            let b1 = delay_b[0];

            let pred_a = a0 * coeffs_a[0] + a1 * coeffs_a[1] + a2 * coeffs_a[2];
            let pred_b = b0 * coeffs_b[0] - b1 * coeffs_b[1];

            // Each `((x >> k) & m) - m/2` expression yields +m/2 for
            // negative x and -m/2 otherwise: sign-sign LMS updates with
            // per-tap step sizes.
            let sign = val2sign(*el);
            coeffs_a[0] += (((a0 >> 30) & 2) - 1) * sign;
            coeffs_a[1] += (((a1 >> 28) & 8) - 4) * sign;
            coeffs_a[2] += (((a2 >> 28) & 8) - 4) * sign;
            last = *el + (pred_a >> 11);

            let sign = val2sign(last);
            coeffs_b[0] += (((b0 >> 29) & 4) - 2) * sign;
            coeffs_b[1] -= (((b1 >> 30) & 2) - 1) * sign;

            filter_b = last + (pred_b >> shift);
            filter_a = filter_b + ((filter_a * 31) >> 5);

            *el = filter_a;
        }
    }
    fn filter_high(dst: &mut [i32], order: usize, shift: u8) {
        let mut coeffs = [0i32; 256];
        let mut delay = [0i32; 256];
        if dst.len() <= order {
            return;
        }
        // Seed the delay line with the first `order` raw samples.
        delay[..order].copy_from_slice(&dst[..order]);
        for el in dst[order..].iter_mut() {
            let sign = val2sign(*el);
            let mut sum = 0;
            for i in 0..order {
                sum += delay[i] * coeffs[i];
                coeffs[i] -= (((delay[i] >> 30) & 2) - 1) * sign;
            }
            *el -= sum >> shift;
            for i in 0..order - 1 {
                delay[i] = delay[i + 1];
            }
            delay[order - 1] = *el;
        }
    }
    fn filter_extra_high(dst: &mut [i32]) {
        let mut coeffs = [0i32; 8];
        let mut delay = [0i32; 8];
        // The order-8 stage starts 256 samples in; unlike filter_high it
        // feeds the delay line with the pre-filter values.
        for el in dst[256..].iter_mut() {
            let sign = val2sign(*el);
            let mut sum = 0;
            for i in 0..8 {
                sum += delay[i] * coeffs[i];
                coeffs[i] -= (((delay[i] >> 30) & 2) - 1) * sign;
            }
            for i in (0..7).rev() {
                delay[i + 1] = delay[i];
            }
            delay[0] = *el;
            *el -= sum >> 9;
        }
    }
}
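
// Editorial sanity checks (not part of the original file); they exercise
// only invariants evident from the code above: the negated-sign convention
// of `val2sign`, and that all-zero input stays zero because every adaptive
// coefficient starts from zero.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn val2sign_returns_negated_sign() {
        assert_eq!(val2sign(5), -1);
        assert_eq!(val2sign(-5), 1);
        assert_eq!(val2sign(0), 0);
    }

    #[test]
    fn silence_stays_silent() {
        // Version/compression pair chosen arbitrarily: 3980 selects the new
        // filter chain, 2000 a single 16-tap stage.
        let mut fmode = FilterMode::new(3980, 2000);
        let mut samples = [0i32; 64];
        fmode.filter_mono(&mut samples);
        assert!(samples.iter().all(|&s| s == 0));
    }
}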