◆ inverse_mdct()

static void fl::third_party::vorbis::inverse_mdct(float *buffer, int32_t n, vorb *f, int32_t blocktype)
static
Definition at line 2359 of file stb_vorbis.cpp.hpp.
{
   // Inverse MDCT performed on `buffer`, which is both input and output.
   //
   //   buffer    - the first n/2 floats are read as input (see the first
   //               block below); all n floats are overwritten by the final
   //               pass (step 8).
   //   n         - transform size; n/2, n/4 and n/8 are derived by shifting.
   //               NOTE(review): presumably a power of two -- confirm with caller.
   //   f         - decoder state supplying the temp allocator and the
   //               per-blocktype tables A, B, C and bit_reverse.
   //   blocktype - index selecting which set of those tables to use.
   //
   // A scratch buffer of n/2 floats (buf2) is taken from f's temp allocator
   // and released again before returning.
   int32_t n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
   int32_t ld;
   // @OPTIMIZE: reduce register pressure by using fewer variables?
   int32_t save_point = fl_stbv_temp_alloc_save(f);
   float *buf2 = (float *) fl_stbv_temp_alloc(f, n2 * sizeof(*buf2));
   float *u=nullptr,*v=nullptr;
   // twiddle factors
   float *A = f->A[blocktype];
 
   // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
   // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' after this function.
 
   // kernel from paper
 
 
   // merged:
   //   copy and reflect spectral data
   //   step 0
 
   // note that it turns out that the items added together during
   // this step are, in fact, being added to themselves (as reflected
   // by step 0). inexplicable inefficiency! this became obvious
   // once I combined the passes.
 
   // so there's a missing 'times 2' here (for adding X to itself).
   // this propagates through linearly to the end, where the numbers
   // are 1/2 too small, and need to be compensated for.
 
   {
      // d fills buf2 from its last pair down to its first; AA walks the
      // A table forward two entries per pair across both loops.
      float *d,*e, *AA, *e_stop;
      d = &buf2[n2-2];
      AA = A;
      e = &buffer[0];
      e_stop = &buffer[n2];
      // First loop: reads the even-indexed inputs buffer[0], buffer[2], ...
      // in ascending order (e advances by 4, using e[0] and e[2]).
      while (e != e_stop) {
         d[1] = (e[0] * AA[0] - e[2]*AA[1]);
         d[0] = (e[0] * AA[1] + e[2]*AA[0]);
         d -= 2;
         AA += 2;
         e += 4;
      }
 
      // Second loop: reads the odd-indexed inputs buffer[n2-1], buffer[n2-3], ...
      // in descending order, negated, until d has stepped below buf2.
      e = &buffer[n2-3];
      while (d >= buf2) {
         d[1] = (-e[2] * AA[0] - -e[0]*AA[1]);
         d[0] = (-e[2] * AA[1] + -e[0]*AA[0]);
         d -= 2;
         AA += 2;
         e -= 4;
      }
   }
 
   // now we use symbolic names for these, so that we can
   // possibly swap their meaning as we change which operations
   // are in place
 
   u = buffer;
   v = buf2;
 
   // step 2    (paper output is w, now u)
   // this could be in place, but the data ends up in the wrong
   // place... _somebody_'s got to swap it, so this is nominated
   {
      // AA starts 8 entries before A[n2] and moves back 8 per iteration;
      // the loop ends once AA has stepped below A.
      float *AA = &A[n2-8];
      float *d0,*d1, *e0, *e1;
 
      // Inputs: e1 reads v from index 0, e0 reads v from index n4.
      e0 = &v[n4];
      e1 = &v[0];
 
      // Outputs: sums (e0 + e1) go to u from index n4; the differences
      // (e0 - e1), combined with AA coefficients, go to u from index 0.
      d0 = &u[n4];
      d1 = &u[0];
 
      while (AA >= A) {
         float v40_20, v41_21;
 
         // first pair of this group uses AA[4], AA[5]
         v41_21 = e0[1] - e1[1];
         v40_20 = e0[0] - e1[0];
         d0[1]  = e0[1] + e1[1];
         d0[0]  = e0[0] + e1[0];
         d1[1]  = v41_21*AA[4] - v40_20*AA[5];
         d1[0]  = v40_20*AA[4] + v41_21*AA[5];
 
         // second pair of this group uses AA[0], AA[1]
         v41_21 = e0[3] - e1[3];
         v40_20 = e0[2] - e1[2];
         d0[3]  = e0[3] + e1[3];
         d0[2]  = e0[2] + e1[2];
         d1[3]  = v41_21*AA[0] - v40_20*AA[1];
         d1[2]  = v40_20*AA[0] + v41_21*AA[1];
 
         AA -= 8;
 
         d0 += 4;
         d1 += 4;
         e0 += 4;
         e1 += 4;
      }
   }
 
   // step 3
   ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
 
   // optimized step 3:
 
   // the original step3 loop can be nested r inside s or s inside r;
   // it's written originally as s inside r, but this is dumb when r
   // iterates many times, and s few. So I have two copies of it and
   // switch between them halfway.
 
   // NOTE(review): the imdct_step3_* helpers are defined outside this block;
   // they are all handed u (== buffer) and the A table, so step 3 presumably
   // updates u in place -- confirm against their definitions.
 
   // this is iteration 0 of step 3
   imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n >> 3), A);
   imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n >> 3), A);
 
   // this is iteration 1 of step 3
   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n >> 4), A, 16);
   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n >> 4), A, 16);
   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n >> 4), A, 16);
   imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n >> 4), A, 16);
 
   // Iterations 2 .. ((ld-3)>>1)-1 use the r-loop helper, called once per
   // starting offset n2-1 - k0*i for i in [0, lim).
   l=2;
   for (; l < (ld-3)>>1; ++l) {
      int32_t k0 = n >> (l+2), k0_2 = k0>>1;
      int32_t lim = 1 << (l+1);
      int32_t i;
      for (i=0; i < lim; ++i)
         imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3));
   }
 
   // Remaining iterations up to ld-7 switch to the s-loop helper: the outer
   // loop runs rlim times, advancing the A pointer by k1*4 and moving the
   // starting offset back by 8 on each pass. l carries over from the loop above.
   for (; l < ld-6; ++l) {
      int32_t k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1;
      int32_t rlim = n >> (l+6), r;
      int32_t lim = 1 << (l+1);
      int32_t i_off;
      float *Aptr = A;  // Renamed from A0 to avoid conflict with Arduino analog pin macro
      i_off = n2-1;
      for (r=rlim; r > 0; --r) {
         imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, Aptr, k1, k0);
         Aptr += k1*4;
         i_off -= 8;
      }
   }
 
   // iterations with count:
   //   ld-6,-5,-4 all interleaved together
   //       the big win comes from getting rid of needless flops
   //         due to the constants on pass 5 & 4 being all 1 and 0;
   //       combining them to be simultaneous to improve cache made little difference
   imdct_step3_inner_s_loop_ld654(n >> 5, u, n2-1, A, n);
 
   // output is u
 
   // step 4, 5, and 6
   // cannot be in-place because of step 5
   {
      uint16 *bitrev = f->bit_reverse[blocktype];
      // weirdly, I'd have thought reading sequentially and writing
      // erratically would have been better than vice-versa, but in
      // fact that's not what my testing showed. (That is, with
      // j = bitreverse(i), do you read i and write j, or read j and write i.)
 
      // Each bitrev entry is used directly as an index into u, and four
      // consecutive floats are gathered from there. Two entries are consumed
      // per iteration. d0 writes v[0..n4) and d1 writes v[n4..n2), both
      // moving backward 4 floats at a time; the loop ends once d0 has
      // stepped below v.
      float *d0 = &v[n4-4];
      float *d1 = &v[n2-4];
      while (d0 >= v) {
         int32_t k4;
 
         k4 = bitrev[0];
         d1[3] = u[k4+0];
         d1[2] = u[k4+1];
         d0[3] = u[k4+2];
         d0[2] = u[k4+3];
 
         k4 = bitrev[1];
         d1[1] = u[k4+0];
         d1[0] = u[k4+1];
         d0[1] = u[k4+2];
         d0[0] = u[k4+3];
 
         d0 -= 4;
         d1 -= 4;
         bitrev += 2;
      }
   }
   // (paper output is u, now v)
 
 
   // data must be in buf2
   FL_ASSERT(v == buf2, "v must equal buf2");
 
   // step 7   (paper output is v, now v)
   // this is now in place
   {
      float *C = f->C[blocktype];
      float *d, *e;
 
      // d walks forward from the start of v and e walks backward from the
      // last group of four; the loop stops when they meet. Each iteration
      // updates d[0..3] and e[0..3] in place and consumes four C entries.
      d = v;
      e = v + n2 - 4;
 
      while (d < e) {
         float a02,a11,b0,b1,b2,b3;
 
         // pair (d[0],d[1]) with (e[2],e[3]), using C[0], C[1]
         a02 = d[0] - e[2];
         a11 = d[1] + e[3];
 
         b0 = C[1]*a02 + C[0]*a11;
         b1 = C[1]*a11 - C[0]*a02;
 
         b2 = d[0] + e[ 2];
         b3 = d[1] - e[ 3];
 
         d[0] = b2 + b0;
         d[1] = b3 + b1;
         e[2] = b2 - b0;
         e[3] = b1 - b3;
 
         // pair (d[2],d[3]) with (e[0],e[1]), using C[2], C[3]
         a02 = d[2] - e[0];
         a11 = d[3] + e[1];
 
         b0 = C[3]*a02 + C[2]*a11;
         b1 = C[3]*a11 - C[2]*a02;
 
         b2 = d[2] + e[ 0];
         b3 = d[3] - e[ 1];
 
         d[2] = b2 + b0;
         d[3] = b3 + b1;
         e[0] = b2 - b0;
         e[1] = b1 - b3;
 
         C += 4;
         d += 4;
         e -= 4;
      }
   }
 
   // data must be in buf2
 
 
   // step 8+decode   (paper output is X, now buffer)
   // this generates pairs of data a la 8 and pushes them directly through
   // the decode kernel (pushing rather than pulling) to avoid having
   // to make another pass later
 
   // this cannot POSSIBLY be in place, so we refer to the buffers directly
 
   {
      float *d0,*d1,*d2,*d3;
 
      // B and e both start at the last group of 8 entries and move backward
      // together; the loop ends once e has stepped below v (== buf2).
      float *B = f->B[blocktype] + n2 - 8;
      float *e = buf2 + n2 - 8;
      // Four output cursors over buffer[0..n):
      //   d0 ascends from buffer[0], d1 descends from buffer[n2-4],
      //   d2 ascends from buffer[n2], d3 descends from buffer[n-4].
      // d1 receives the negation of what d0 receives (mirrored order);
      // d3 receives the same values as d2 (mirrored order).
      d0 = &buffer[0];
      d1 = &buffer[n2-4];
      d2 = &buffer[n2];
      d3 = &buffer[n-4];
      while (e >= v) {
         float p0,p1,p2,p3;
 
         p3 =  e[6]*B[7] - e[7]*B[6];
         p2 = -e[6]*B[6] - e[7]*B[7];
 
         d0[0] =   p3;
         d1[3] = - p3;
         d2[0] =   p2;
         d3[3] =   p2;
 
         p1 =  e[4]*B[5] - e[5]*B[4];
         p0 = -e[4]*B[4] - e[5]*B[5];
 
         d0[1] =   p1;
         d1[2] = - p1;
         d2[1] =   p0;
         d3[2] =   p0;
 
         p3 =  e[2]*B[3] - e[3]*B[2];
         p2 = -e[2]*B[2] - e[3]*B[3];
 
         d0[2] =   p3;
         d1[1] = - p3;
         d2[2] =   p2;
         d3[1] =   p2;
 
         p1 =  e[0]*B[1] - e[1]*B[0];
         p0 = -e[0]*B[0] - e[1]*B[1];
 
         d0[3] =   p1;
         d1[0] = - p1;
         d2[3] =   p0;
         d3[0] =   p0;
 
         B -= 8;
         e -= 8;
         d0 += 4;
         d2 += 4;
         d1 -= 4;
         d3 -= 4;
      }
   }
 
   // Release the scratch buffer and return the temp allocator to the
   // position saved on entry.
   fl_stbv_temp_free(f,buf2);
   fl_stbv_temp_alloc_restore(f,save_point);
}
References fl::B, C, FL_ASSERT, FL_NOEXCEPT, fl_stbv_temp_alloc, fl_stbv_temp_alloc_restore, fl_stbv_temp_alloc_save, fl_stbv_temp_free, ilog(), imdct_step3_inner_r_loop(), imdct_step3_inner_s_loop(), imdct_step3_inner_s_loop_ld654(), and imdct_step3_iter0_loop().
Referenced by vorbis_decode_packet_rest().
Here is the call graph for this function:
Here is the caller graph for this function: