d1/d07/dct32_8hpp_source.html

/* ***** BEGIN LICENSE BLOCK *****

 * Version: RCSL 1.0/RPSL 1.0

 *

 * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved.

 *

 * The contents of this file, and the files included with this file, are

 * subject to the current version of the RealNetworks Public Source License

 * Version 1.0 (the "RPSL") available at

 * http://www.helixcommunity.org/content/rpsl unless you have licensed

 * the file under the RealNetworks Community Source License Version 1.0

 * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl,

 * in which case the RCSL will apply. You may also obtain the license terms

 * directly from RealNetworks.  You may not use this file except in

 * compliance with the RPSL or, if you have a valid RCSL with RealNetworks

 * applicable to this file, the RCSL.  Please see the applicable RPSL or

 * RCSL for the rights, obligations and limitations governing use of the

 * contents of the file.

 *

 * This file is part of the Helix DNA Technology. RealNetworks is the

 * developer of the Original Code and owns the copyrights in the portions

 * it created.

 *

 * This file, and the files included with this file, is distributed and made

 * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER

 * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES,

 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS

 * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.

 *

 * Technology Compatibility Kit Test Suite(s) Location:

 *    http://www.helixcommunity.org/content/tck

 *

 * Contributor(s):

 *

 * ***** END LICENSE BLOCK ***** */


/**************************************************************************************

 * Fixed-point MP3 decoder

 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)

 * June 2003

 *

 * dct32.c - optimized implementations of 32-point DCT for matrixing stage of

 *             polyphase filter

 **************************************************************************************/


#include "coder.h"

#include "fl/stl/stdint.h"

#include "fl/stl/noexcept.h"

#include "assembly.h"

namespace fl {

namespace third_party {


/*
 * Fixed-point coefficients for the 32-point DCT.
 *
 * Each constant is a positive 32-bit value; the trailing comment gives its
 * Q format (number of fractional bits). The Q format differs per constant,
 * and the code that consumes a coefficient multiplies the MULSHIFT32 result
 * by 2^(32 - Q) for that coefficient (e.g. Q31 -> *2, Q27 -> *32).
 *
 * NOTE(review): numerically the COS0_k values look like
 * 1 / (2 * cos((2k + 1) * pi / 64)), which matches the "1/cos() factors"
 * mentioned in the FDCT32 notes - confirm against the upstream derivation.
 */
#define COS0_0  0x4013c251  /* Q31 */

#define COS0_1  0x40b345bd  /* Q31 */

#define COS0_2  0x41fa2d6d  /* Q31 */

#define COS0_3  0x43f93421  /* Q31 */

#define COS0_4  0x46cc1bc4  /* Q31 */

#define COS0_5  0x4a9d9cf0  /* Q31 */

#define COS0_6  0x4fae3711  /* Q31 */

#define COS0_7  0x56601ea7  /* Q31 */

#define COS0_8  0x5f4cf6eb  /* Q31 */

#define COS0_9  0x6b6fcf26  /* Q31 */

#define COS0_10 0x7c7d1db3  /* Q31 */

#define COS0_11 0x4ad81a97  /* Q30 */

#define COS0_12 0x5efc8d96  /* Q30 */

#define COS0_13 0x41d95790  /* Q29 */

#define COS0_14 0x6d0b20cf  /* Q29 */

#define COS0_15 0x518522fb  /* Q27 */


/* COS1_k: third coefficient of each first-pass row in dcttab.
 * NOTE(review): values look like 1 / (2 * cos((2k + 1) * pi / 32)) - confirm. */
#define COS1_0  0x404f4672  /* Q31 */

#define COS1_1  0x42e13c10  /* Q31 */

#define COS1_2  0x48919f44  /* Q31 */

#define COS1_3  0x52cb0e63  /* Q31 */

#define COS1_4  0x64e2402e  /* Q31 */

#define COS1_5  0x43e224a9  /* Q30 */

#define COS1_6  0x6e3c92c1  /* Q30 */

#define COS1_7  0x519e4e04  /* Q28 */


/* COS2_k: second-pass coefficients (also stored negated in dcttab).
 * NOTE(review): values look like 1 / (2 * cos((2k + 1) * pi / 16)) - confirm. */
#define COS2_0  0x4140fb46  /* Q31 */

#define COS2_1  0x4cf8de88  /* Q31 */

#define COS2_2  0x73326bbf  /* Q31 */

#define COS2_3  0x52036742  /* Q29 */


/* COS3_k: third coefficient of each second-pass row in dcttab. */
#define COS3_0  0x4545e9ef  /* Q31 */

#define COS3_1  0x539eba45  /* Q30 */


/* COS4_0: used directly (not through dcttab) in the last butterflies of the
 * second pass. Unlike the other constants it is explicitly cast to int32_t. */
#define COS4_0  (static_cast<int32_t>(0x5a82799aU)) /* Q31 */


// faster in ROM


/*
 * Coefficient stream for FDCT32, read strictly in order through a running
 * pointer: 8 rows of three values for the first pass, then 8 rows of three
 * values for the second pass (two rows per second-pass iteration).
 *
 * The comment on each row lists the Q format of its three entries; the code
 * consuming a row scales each product by 2^(32 - Q) accordingly.
 */
static const int32_t dcttab[48] = {
    /* ---- pass 1: { COS0_k, COS0_(15-k), COS1_k }, k = 0..7 ---- */
    COS0_0, COS0_15, COS1_0,        /* Q31, Q27, Q31 */
    COS0_1, COS0_14, COS1_1,        /* Q31, Q29, Q31 */
    COS0_2, COS0_13, COS1_2,        /* Q31, Q29, Q31 */
    COS0_3, COS0_12, COS1_3,        /* Q31, Q30, Q31 */
    COS0_4, COS0_11, COS1_4,        /* Q31, Q30, Q31 */
    COS0_5, COS0_10, COS1_5,        /* Q31, Q31, Q30 */
    COS0_6, COS0_9,  COS1_6,        /* Q31, Q31, Q30 */
    COS0_7, COS0_8,  COS1_7,        /* Q31, Q31, Q28 */

    /* ---- pass 2: four row pairs, signs alternating +, -, +, - ---- */
    +COS2_0, +COS2_3, COS3_0,       /* Q31, Q29, Q31 */
    +COS2_1, +COS2_2, COS3_1,       /* Q31, Q31, Q30 */

    -COS2_0, -COS2_3, COS3_0,       /* Q31, Q29, Q31 */
    -COS2_1, -COS2_2, COS3_1,       /* Q31, Q31, Q30 */

    +COS2_0, +COS2_3, COS3_0,       /* Q31, Q29, Q31 */
    +COS2_1, +COS2_2, COS3_1,       /* Q31, Q31, Q30 */

    -COS2_0, -COS2_3, COS3_0,       /* Q31, Q29, Q31 */
    -COS2_1, -COS2_2, COS3_1,       /* Q31, Q31, Q30 */
};


/*
 * D32FP(i, s0, s1, s2) - one first-pass butterfly of the 32-point DCT.
 *
 * Operates on the four samples buf[i], buf[15-i], buf[16+i], buf[31-i] in
 * place and consumes three consecutive coefficients from *cptr (cptr is
 * advanced by 3). s0, s1 and s2 are the power-of-two scale factors applied
 * after the multiply by the first, second and third coefficient.
 *
 * Expects these names in the invoking scope: buf, cptr, the temporaries
 * a0..a3 and b0..b3, and MULSHIFT32.
 *
 * Scaling is written as a multiplication by (1 << s) rather than a left
 * shift, so a negative product is never the left operand of <<.
 *
 * The body is wrapped in do { } while (0) so that `D32FP(...);` is a single
 * statement (safe in an unbraced if/else), and every use of `i` is
 * parenthesised so expression arguments index correctly.
 */
#define D32FP(i, s0, s1, s2) do { \
    a0 = buf[(i)];          a3 = buf[31-(i)]; \
    a1 = buf[15-(i)];       a2 = buf[16+(i)]; \
    b0 = a0 + a3;           b3 = MULSHIFT32(*cptr++, a0 - a3) * (1 << (s0));    \
    b1 = a1 + a2;           b2 = MULSHIFT32(*cptr++, a1 - a2) * (1 << (s1));    \
    buf[(i)] = b0 + b1;     buf[15-(i)] = MULSHIFT32(*cptr,   b0 - b1) * (1 << (s2)); \
    buf[16+(i)] = b2 + b3;  buf[31-(i)] = MULSHIFT32(*cptr++, b3 - b2) * (1 << (s2)); \
} while (0)


/**************************************************************************************
 * Function:    FDCT32
 *
 * Description: Ken's highly-optimized 32-point DCT (radix-4 + radix-8), followed by
 *                a hardcoded shuffle of the results into the polyphase filter buffer
 *
 * Inputs:      buf      - input buffer, length = 32 samples; transformed IN PLACE
 *                         (and pre-shifted right when gb < 6), so its contents are
 *                         destroyed
 *                         require at least 6 guard bits in the input vector to avoid
 *                         possibility of overflow in internal calculations
 *                         (see bbtest_imdct test app)
 *              dest     - polyphase filter input buffer that receives the output
 *              offset   - buffer offset within dest
 *              oddBlock - oddblock flag; selects which half of dest (base or
 *                         base + VBUF_LENGTH) each output group lands in
 *              gb       - number of guard bits in input
 *
 * Outputs:     output buffer, data copied and interleaved for polyphase filter:
 *                every result is stored twice, at d[0] and d[8], and successive
 *                results are 64 entries apart
 *              no guarantees about number of guard bits in output
 *
 * Return:      none
 *
 * Notes:       number of muls = 4*8 + 12*4 = 80
 *              final stage of DCT is hardcoded to shuffle data into the proper order
 *                for the polyphase filterbank
 *              fully unrolled stage 1, for max precision (scale the 1/cos() factors
 *                differently, depending on magnitude)
 *              guard bit analysis verified by exhaustive testing of all 2^32
 *                combinations of max pos/max neg values in x[]
 *              when gb < 6 the input is scaled down by es = 6 - gb bits first and
 *                the stored outputs are clipped and scaled back up by 2^es at the
 *                end; the scale-up is a multiplication (not <<) because the values
 *                can be negative
 *
 * TODO:        code organization and optimization for ARM
 *              possibly interleave stereo (cut # of coef loads in half - may not have
 *                enough registers)
 **************************************************************************************/
// about 1ms faster in RAM
void FDCT32(int32_t *buf, int32_t *dest, int32_t offset, int32_t oddBlock, int32_t gb) FL_NOEXCEPT
{
    int32_t i, s, tmp, es;
    const int32_t *cptr = dcttab;   /* running pointer into the coefficient stream */
    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
    int32_t b0, b1, b2, b3, b4, b5, b6, b7;
    int32_t *d;

    /* scaling - ensure at least 6 guard bits for DCT
     * (in practice this is already true 99% of time, so this code is
     *  almost never triggered)
     */
    es = 0;
    if (gb < 6) {
        es = 6 - gb;
        for (i = 0; i < 32; i++)
            buf[i] >>= es;
    }

    /* first pass - 8 unrolled butterflies; the three scale arguments of each
     * call are 32 - Q for the three coefficients of the matching dcttab row
     */
    D32FP(0, 1, 5, 1);
    D32FP(1, 1, 3, 1);
    D32FP(2, 1, 3, 1);
    D32FP(3, 1, 2, 1);
    D32FP(4, 1, 2, 1);
    D32FP(5, 1, 1, 2);
    D32FP(6, 1, 1, 2);
    D32FP(7, 1, 1, 4);

    /* second pass - four groups of 8 samples, each consuming 6 coefficients
     * from dcttab plus COS4_0; buf is advanced by 8 per group
     */
    for (i = 4; i > 0; i--) {
        a0 = buf[0];        a7 = buf[7];        a3 = buf[3];        a4 = buf[4];
        b0 = a0 + a7;       b7 = MULSHIFT32(*cptr++, a0 - a7) * 2L;
        b3 = a3 + a4;       b4 = MULSHIFT32(*cptr++, a3 - a4) * 8L;
        a0 = b0 + b3;       a3 = MULSHIFT32(*cptr,   b0 - b3) * 2L;
        a4 = b4 + b7;       a7 = MULSHIFT32(*cptr++, b7 - b4) * 2L;

        a1 = buf[1];        a6 = buf[6];        a2 = buf[2];        a5 = buf[5];
        b1 = a1 + a6;       b6 = MULSHIFT32(*cptr++, a1 - a6) * 2L;
        b2 = a2 + a5;       b5 = MULSHIFT32(*cptr++, a2 - a5) * 2L;
        a1 = b1 + b2;       a2 = MULSHIFT32(*cptr,   b1 - b2) * 4L;
        a5 = b5 + b6;       a6 = MULSHIFT32(*cptr++, b6 - b5) * 4L;

        b0 = a0 + a1;       b1 = MULSHIFT32(COS4_0, a0 - a1) * 2L;
        b2 = a2 + a3;       b3 = MULSHIFT32(COS4_0, a3 - a2) * 2L;
        buf[0] = b0;        buf[1] = b1;
        buf[2] = b2 + b3;   buf[3] = b3;

        b4 = a4 + a5;       b5 = MULSHIFT32(COS4_0, a4 - a5) * 2L;
        b6 = a6 + a7;       b7 = MULSHIFT32(COS4_0, a7 - a6) * 2L;
        b6 += b7;
        buf[4] = b4 + b6;   buf[5] = b5 + b7;
        buf[6] = b5 + b6;   buf[7] = b7;

        buf += 8;
    }
    buf -= 32;  /* reset */

    /* sample 0 - always delayed one block */
    d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
    s = buf[ 0];                d[0] = d[8] = s;

    /* samples 16 to 31 */
    d = dest + offset + (oddBlock ? VBUF_LENGTH  : 0);

    s = buf[ 1];                d[0] = d[8] = s;    d += 64;

    tmp = buf[25] + buf[29];
    s = buf[17] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[ 9] + buf[13];      d[0] = d[8] = s;    d += 64;
    s = buf[21] + tmp;          d[0] = d[8] = s;    d += 64;

    tmp = buf[29] + buf[27];
    s = buf[ 5];                d[0] = d[8] = s;    d += 64;
    s = buf[21] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[13] + buf[11];      d[0] = d[8] = s;    d += 64;
    s = buf[19] + tmp;          d[0] = d[8] = s;    d += 64;

    tmp = buf[27] + buf[31];
    s = buf[ 3];                d[0] = d[8] = s;    d += 64;
    s = buf[19] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[11] + buf[15];      d[0] = d[8] = s;    d += 64;
    s = buf[23] + tmp;          d[0] = d[8] = s;    d += 64;

    tmp = buf[31];
    s = buf[ 7];                d[0] = d[8] = s;    d += 64;
    s = buf[23] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[15];                d[0] = d[8] = s;    d += 64;
    s = tmp;                    d[0] = d[8] = s;

    /* samples 16 to 1 (sample 16 used again) */
    d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);

    s = buf[ 1];                d[0] = d[8] = s;    d += 64;

    tmp = buf[30] + buf[25];
    s = buf[17] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[14] + buf[ 9];      d[0] = d[8] = s;    d += 64;
    s = buf[22] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[ 6];                d[0] = d[8] = s;    d += 64;

    tmp = buf[26] + buf[30];
    s = buf[22] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[10] + buf[14];      d[0] = d[8] = s;    d += 64;
    s = buf[18] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[ 2];                d[0] = d[8] = s;    d += 64;

    tmp = buf[28] + buf[26];
    s = buf[18] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[12] + buf[10];      d[0] = d[8] = s;    d += 64;
    s = buf[20] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[ 4];                d[0] = d[8] = s;    d += 64;

    tmp = buf[24] + buf[28];
    s = buf[20] + tmp;          d[0] = d[8] = s;    d += 64;
    s = buf[ 8] + buf[12];      d[0] = d[8] = s;    d += 64;
    s = buf[16] + tmp;          d[0] = d[8] = s;

    /* this is so rarely invoked that it's not worth making two versions of the output
     *   shuffle code (one for no shift, one for clip + variable shift) like in IMDCT
     * here we just load, clip, scale, and store on the rare instances that es != 0
     *
     * the three groups below revisit exactly the locations written above, in the
     *   same order (1 + 16 + 16 entries)
     * the scale-up is s * (1 << es) instead of s << es: s may be negative, and
     *   left-shifting a negative value is undefined before C++20
     * NOTE(review): assumes CLIP_2N(s, 31 - es) saturates s so that s * 2^es fits
     *   in 32 bits - CLIP_2N is defined in coder.h, confirm there
     */
    if (es) {
        d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
        s = d[0];   CLIP_2N(s, 31 - es); d[0] = d[8] = s * (1 << es);

        d = dest + offset + (oddBlock ? VBUF_LENGTH  : 0);
        for (i = 16; i <= 31; i++) {
            s = d[0];   CLIP_2N(s, 31 - es); d[0] = d[8] = s * (1 << es);    d += 64;
        }

        d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
        for (i = 15; i >= 0; i--) {
            s = d[0];   CLIP_2N(s, 31 - es); d[0] = d[8] = s * (1 << es);    d += 64;
        }
    }
}


}  // namespace third_party

}  // namespace fl

assembly.h

VBUF_LENGTH
#define VBUF_LENGTH
Definition coder.h:109

CLIP_2N
#define CLIP_2N(y, n)
Definition coder.h:89

coder.h

COS0_3
#define COS0_3
Definition dct32.hpp:58

COS0_4
#define COS0_4
Definition dct32.hpp:59

COS1_6
#define COS1_6
Definition dct32.hpp:78

COS0_8
#define COS0_8
Definition dct32.hpp:63

COS0_10
#define COS0_10
Definition dct32.hpp:65

COS3_1
#define COS3_1
Definition dct32.hpp:87

COS0_0
#define COS0_0
Definition dct32.hpp:55

COS4_0
#define COS4_0
Definition dct32.hpp:89

COS1_7
#define COS1_7
Definition dct32.hpp:79

COS0_5
#define COS0_5
Definition dct32.hpp:60

COS1_2
#define COS1_2
Definition dct32.hpp:74

COS2_0
#define COS2_0
Definition dct32.hpp:81

COS0_1
#define COS0_1
Definition dct32.hpp:56

COS0_9
#define COS0_9
Definition dct32.hpp:64

D32FP
#define D32FP(i, s0, s1, s2)
Definition dct32.hpp:113

COS3_0
#define COS3_0
Definition dct32.hpp:86

COS1_0
#define COS1_0
Definition dct32.hpp:72

COS2_2
#define COS2_2
Definition dct32.hpp:83

COS1_5
#define COS1_5
Definition dct32.hpp:77

COS1_4
#define COS1_4
Definition dct32.hpp:76

COS1_1
#define COS1_1
Definition dct32.hpp:73

COS2_1
#define COS2_1
Definition dct32.hpp:82

COS0_2
#define COS0_2
Definition dct32.hpp:57

COS0_7
#define COS0_7
Definition dct32.hpp:62

COS2_3
#define COS2_3
Definition dct32.hpp:84

COS1_3
#define COS1_3
Definition dct32.hpp:75

COS0_12
#define COS0_12
Definition dct32.hpp:67

COS0_14
#define COS0_14
Definition dct32.hpp:69

COS0_11
#define COS0_11
Definition dct32.hpp:66

COS0_15
#define COS0_15
Definition dct32.hpp:70

COS0_6
#define COS0_6
Definition dct32.hpp:61

COS0_13
#define COS0_13
Definition dct32.hpp:68

offset
fl::UISlider offset("Offset", 0.0f, 0.0f, 1.0f, 0.01f)

fl::third_party::FDCT32
void FDCT32(int32_t *x, int32_t *d, int32_t offset, int32_t oddBlock, int32_t gb) FL_NOEXCEPT
Definition dct32.hpp:151

fl::third_party::MULSHIFT32
__inline int32_t MULSHIFT32(int32_t x, int32_t y) FL_NOEXCEPT
Multiply together two 32-bit numbers and return the top 32-bits of the result.
Definition assembly.h:503

fl::third_party::dcttab
static const int32_t dcttab[48]
Definition dct32.hpp:92

fl::third_party::int32_t
fl::i32 int32_t
Definition coder.h:220

fl::third_party
Definition mp3.cpp.hpp:12

fl
Base definition for an LED controller.
Definition crgb.hpp:179

noexcept.h

FL_NOEXCEPT
#define FL_NOEXCEPT

stdint.h