FastLED 3.9.15
Loading...
Searching...
No Matches

◆ inverse_mdct()

static void fl::third_party::vorbis::inverse_mdct ( float * buffer,
int32_t n,
vorb * f,
int32_t blocktype )
static

Definition at line 2359 of file stb_vorbis.cpp.hpp.

2360{ // Inverse MDCT, in place: reads buffer[0..n/2) as input and overwrites buffer[0..n) with the result.
2361 int32_t n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l; // n/2, n/4, n/8 (NOTE(review): n is presumably a power of two - confirm against caller)
2362 int32_t ld; // set to ilog(n)-1 before step 3; bounds the step-3 iteration loops
2363 // @OPTIMIZE: reduce register pressure by using fewer variables?
2364 int32_t save_point = fl_stbv_temp_alloc_save(f); // handed back to fl_stbv_temp_alloc_restore at the end
2365 float *buf2 = (float *) fl_stbv_temp_alloc(f, n2 * sizeof(*buf2)); // scratch buffer of n2 floats
2366 float *u=nullptr,*v=nullptr; // symbolic names for buffer / buf2, assigned after step 0
2367 // twiddle factors for this block type (table A; tables B, C and bit_reverse are fetched in later steps)
2368 float *A = f->A[blocktype];
2369
2370 // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
2371 // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' after this function.
2372
2373 // kernel from paper
2374
2375
2376 // merged:
2377 // copy and reflect spectral data
2378 // step 0
2379
2380 // note that it turns out that the items added together during
2381 // this step are, in fact, being added to themselves (as reflected
2382 // by step 0). inexplicable inefficiency! this became obvious
2383 // once I combined the passes.
2384
2385 // so there's a missing 'times 2' here (for adding X to itself).
2386 // this propagates through linearly to the end, where the numbers
2387 // are 1/2 too small, and need to be compensated for.
2388 // the block below fills buf2 from the end (d starts at buf2[n2-2] and moves down two floats per iteration)
2389 {
2390 float *d,*e, *AA, *e_stop;
2391 d = &buf2[n2-2];
2392 AA = A;
2393 e = &buffer[0];
2394 e_stop = &buffer[n2];
2395 while (e != e_stop) {
2396 d[1] = (e[0] * AA[0] - e[2]*AA[1]);
2397 d[0] = (e[0] * AA[1] + e[2]*AA[0]);
2398 d -= 2;
2399 AA += 2;
2400 e += 4;
2401 }
2402 // second pass: walk buffer backwards from buffer[n2-3] with negated inputs until d has reached the start of buf2
2403 e = &buffer[n2-3];
2404 while (d >= buf2) {
2405 d[1] = (-e[2] * AA[0] - -e[0]*AA[1]);
2406 d[0] = (-e[2] * AA[1] + -e[0]*AA[0]);
2407 d -= 2;
2408 AA += 2;
2409 e -= 4;
2410 }
2411 }
2412
2413 // now we use symbolic names for these, so that we can
2414 // possibly swap their meaning as we change which operations
2415 // are in place
2416
2417 u = buffer;
2418 v = buf2;
2419
2420 // step 2 (paper output is w, now u)
2421 // this could be in place, but the data ends up in the wrong
2422 // place... _somebody_'s got to swap it, so this is nominated
2423 {
2424 float *AA = &A[n2-8];
2425 float *d0,*d1, *e0, *e1;
2426 // inputs come from v (buf2): e0 starts at v[n4], e1 at v[0]
2427 e0 = &v[n4];
2428 e1 = &v[0];
2429 // outputs go to u (buffer): d0 starts at u[n4], d1 at u[0]
2430 d0 = &u[n4];
2431 d1 = &u[0];
2432 // AA walks table A backwards 8 floats per iteration; each iteration handles 4 floats from each input pointer
2433 while (AA >= A) {
2434 float v40_20, v41_21;
2435 // sums are stored through d0; differences are multiplied by entries of A and stored through d1
2436 v41_21 = e0[1] - e1[1];
2437 v40_20 = e0[0] - e1[0];
2438 d0[1] = e0[1] + e1[1];
2439 d0[0] = e0[0] + e1[0];
2440 d1[1] = v41_21*AA[4] - v40_20*AA[5];
2441 d1[0] = v40_20*AA[4] + v41_21*AA[5];
2442
2443 v41_21 = e0[3] - e1[3];
2444 v40_20 = e0[2] - e1[2];
2445 d0[3] = e0[3] + e1[3];
2446 d0[2] = e0[2] + e1[2];
2447 d1[3] = v41_21*AA[0] - v40_20*AA[1];
2448 d1[2] = v40_20*AA[0] + v41_21*AA[1];
2449
2450 AA -= 8;
2451
2452 d0 += 4;
2453 d1 += 4;
2454 e0 += 4;
2455 e1 += 4;
2456 }
2457 }
2458
2459 // step 3
2460 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
2461
2462 // optimized step 3:
2463
2464 // the original step3 loop can be nested r inside s or s inside r;
2465 // it's written originally as s inside r, but this is dumb when r
2466 // iterates many times, and s few. So I have two copies of it and
2467 // switch between them halfway.
2468
2469 // this is iteration 0 of step 3
2470 imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n >> 3), A);
2471 imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n >> 3), A);
2472
2473 // this is iteration 1 of step 3
2474 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n >> 4), A, 16);
2475 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n >> 4), A, 16);
2476 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n >> 4), A, 16);
2477 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n >> 4), A, 16);
2478 // iterations 2 up to (but excluding) (ld-3)>>1 use the inner_r helper, called once per value of i
2479 l=2;
2480 for (; l < (ld-3)>>1; ++l) {
2481 int32_t k0 = n >> (l+2), k0_2 = k0>>1;
2482 int32_t lim = 1 << (l+1);
2483 int32_t i;
2484 for (i=0; i < lim; ++i)
2485 imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3));
2486 }
2487 // the following iterations, up to (but excluding) ld-6, use the inner_s helper, called once per value of r
2488 for (; l < ld-6; ++l) {
2489 int32_t k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1;
2490 int32_t rlim = n >> (l+6), r;
2491 int32_t lim = 1 << (l+1);
2492 int32_t i_off;
2493 float *Aptr = A; // Renamed from A0 to avoid conflict with Arduino analog pin macro
2494 i_off = n2-1;
2495 for (r=rlim; r > 0; --r) {
2496 imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, Aptr, k1, k0);
2497 Aptr += k1*4;
2498 i_off -= 8;
2499 }
2500 }
2501
2502 // iterations with count:
2503 // ld-6,-5,-4 all interleaved together
2504 // the big win comes from getting rid of needless flops
2505 // due to the constants on pass 5 & 4 being all 1 and 0;
2506 // combining them to be simultaneous to improve cache made little difference
2507 imdct_step3_inner_s_loop_ld654(n >> 5, u, n2-1, A, n);
2508
2509 // output is u
2510
2511 // step 4, 5, and 6
2512 // cannot be in-place because of step 5
2513 {
2514 uint16 *bitrev = f->bit_reverse[blocktype];
2515 // weirdly, I'd have thought reading sequentially and writing
2516 // erratically would have been better than vice-versa, but in
2517 // fact that's not what my testing showed. (That is, with
2518 // j = bitreverse(i), do you read i and write j, or read j and write i.)
2519 // d0 fills v[0..n4) and d1 fills v[n4..n2), both from the top down, 4 floats per iteration
2520 float *d0 = &v[n4-4];
2521 float *d1 = &v[n2-4];
2522 while (d0 >= v) {
2523 int32_t k4;
2524 // each bitrev entry is the base index of 4 consecutive floats in u; two entries are consumed per iteration
2525 k4 = bitrev[0];
2526 d1[3] = u[k4+0];
2527 d1[2] = u[k4+1];
2528 d0[3] = u[k4+2];
2529 d0[2] = u[k4+3];
2530
2531 k4 = bitrev[1];
2532 d1[1] = u[k4+0];
2533 d1[0] = u[k4+1];
2534 d0[1] = u[k4+2];
2535 d0[0] = u[k4+3];
2536
2537 d0 -= 4;
2538 d1 -= 4;
2539 bitrev += 2;
2540 }
2541 }
2542 // (paper output is u, now v)
2543
2544
2545 // data must be in buf2 (the remaining steps read it through both v and buf2)
2546 FL_ASSERT(v == buf2, "v must equal buf2");
2547
2548 // step 7 (paper output is v, now v)
2549 // this is now in place
2550 {
2551 float *C = f->C[blocktype];
2552 float *d, *e;
2553
2554 d = v;
2555 e = v + n2 - 4;
2556 // d walks up from the start of v and e walks down from its end, 4 floats each per iteration, until they meet
2557 while (d < e) {
2558 float a02,a11,b0,b1,b2,b3;
2559
2560 a02 = d[0] - e[2];
2561 a11 = d[1] + e[3];
2562
2563 b0 = C[1]*a02 + C[0]*a11;
2564 b1 = C[1]*a11 - C[0]*a02;
2565
2566 b2 = d[0] + e[ 2];
2567 b3 = d[1] - e[ 3];
2568
2569 d[0] = b2 + b0;
2570 d[1] = b3 + b1;
2571 e[2] = b2 - b0;
2572 e[3] = b1 - b3;
2573
2574 a02 = d[2] - e[0];
2575 a11 = d[3] + e[1];
2576
2577 b0 = C[3]*a02 + C[2]*a11;
2578 b1 = C[3]*a11 - C[2]*a02;
2579
2580 b2 = d[2] + e[ 0];
2581 b3 = d[3] - e[ 1];
2582
2583 d[2] = b2 + b0;
2584 d[3] = b3 + b1;
2585 e[0] = b2 - b0;
2586 e[1] = b1 - b3;
2587
2588 C += 4;
2589 d += 4;
2590 e -= 4;
2591 }
2592 }
2593
2594 // data must be in buf2
2595
2596
2597 // step 8+decode (paper output is X, now buffer)
2598 // this generates pairs of data a la 8 and pushes them directly through
2599 // the decode kernel (pushing rather than pulling) to avoid having
2600 // to make another pass later
2601
2602 // this cannot POSSIBLY be in place, so we refer to the buffers directly
2603
2604 {
2605 float *d0,*d1,*d2,*d3;
2606 // B and e walk backwards 8 floats per iteration, starting from the last 8 floats of table B and of buf2
2607 float *B = f->B[blocktype] + n2 - 8;
2608 float *e = buf2 + n2 - 8;
2609 d0 = &buffer[0];
2610 d1 = &buffer[n2-4];
2611 d2 = &buffer[n2];
2612 d3 = &buffer[n-4];
2613 while (e >= v) {
2614 float p0,p1,p2,p3;
2615 // d0/d1 fill buffer[0..n2) from both ends (d1 gets the negated d0 values); d2/d3 fill buffer[n2..n) from both ends (d3 gets the same values as d2)
2616 p3 = e[6]*B[7] - e[7]*B[6];
2617 p2 = -e[6]*B[6] - e[7]*B[7];
2618
2619 d0[0] = p3;
2620 d1[3] = - p3;
2621 d2[0] = p2;
2622 d3[3] = p2;
2623
2624 p1 = e[4]*B[5] - e[5]*B[4];
2625 p0 = -e[4]*B[4] - e[5]*B[5];
2626
2627 d0[1] = p1;
2628 d1[2] = - p1;
2629 d2[1] = p0;
2630 d3[2] = p0;
2631
2632 p3 = e[2]*B[3] - e[3]*B[2];
2633 p2 = -e[2]*B[2] - e[3]*B[3];
2634
2635 d0[2] = p3;
2636 d1[1] = - p3;
2637 d2[2] = p2;
2638 d3[1] = p2;
2639
2640 p1 = e[0]*B[1] - e[1]*B[0];
2641 p0 = -e[0]*B[0] - e[1]*B[1];
2642
2643 d0[3] = p1;
2644 d1[0] = - p1;
2645 d2[3] = p0;
2646 d3[0] = p0;
2647
2648 B -= 8;
2649 e -= 8;
2650 d0 += 4;
2651 d2 += 4;
2652 d1 -= 4;
2653 d3 -= 4;
2654 }
2655 }
2656 // hand buf2 back via fl_stbv_temp_free, then restore the temp-allocation save point taken at entry
2657 fl_stbv_temp_free(f,buf2);
2658 fl_stbv_temp_alloc_restore(f,save_point);
2659}
#define FL_ASSERT(x, MSG)
Definition assert.h:6
static void imdct_step3_inner_s_loop(int32_t n, float *e, int32_t i_off, int32_t k_off, float *A, int32_t a_off, int32_t k0) FL_NOEXCEPT
static int32_t ilog(int32 n) FL_NOEXCEPT
static void imdct_step3_iter0_loop(int32_t n, float *e, int32_t i_off, int32_t k_off, float *A) FL_NOEXCEPT
static constexpr int8_t C
static void imdct_step3_inner_s_loop_ld654(int32_t n, float *e, int32_t i_off, float *A, int32_t base_n) FL_NOEXCEPT
static void imdct_step3_inner_r_loop(int32_t lim, float *e, int32_t d0, int32_t k_off, float *A, int32_t k1) FL_NOEXCEPT
fl::i32 int32_t
Definition coder.h:220
FL_DISABLE_WARNING_PUSH unsigned char * B
#define fl_stbv_temp_free(f, p)
#define fl_stbv_temp_alloc_save(f)
#define fl_stbv_temp_alloc(f, size)
#define fl_stbv_temp_alloc_restore(f, p)

References fl::B, C, FL_ASSERT, FL_NOEXCEPT, fl_stbv_temp_alloc, fl_stbv_temp_alloc_restore, fl_stbv_temp_alloc_save, fl_stbv_temp_free, ilog(), imdct_step3_inner_r_loop(), imdct_step3_inner_s_loop(), imdct_step3_inner_s_loop_ld654(), and imdct_step3_iter0_loop().

Referenced by vorbis_decode_packet_rest().

+ Here is the call graph for this function:
+ Here is the caller graph for this function: