FastLED 3.9.15
Loading...
Searching...
No Matches
dct32.hpp
Go to the documentation of this file.
1/* ***** BEGIN LICENSE BLOCK *****
2 * Version: RCSL 1.0/RPSL 1.0
3 *
4 * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved.
5 *
6 * The contents of this file, and the files included with this file, are
7 * subject to the current version of the RealNetworks Public Source License
8 * Version 1.0 (the "RPSL") available at
9 * http://www.helixcommunity.org/content/rpsl unless you have licensed
10 * the file under the RealNetworks Community Source License Version 1.0
11 * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl,
12 * in which case the RCSL will apply. You may also obtain the license terms
13 * directly from RealNetworks. You may not use this file except in
14 * compliance with the RPSL or, if you have a valid RCSL with RealNetworks
15 * applicable to this file, the RCSL. Please see the applicable RPSL or
16 * RCSL for the rights, obligations and limitations governing use of the
17 * contents of the file.
18 *
19 * This file is part of the Helix DNA Technology. RealNetworks is the
20 * developer of the Original Code and owns the copyrights in the portions
21 * it created.
22 *
23 * This file, and the files included with this file, is distributed and made
24 * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
25 * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES,
26 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
27 * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
28 *
29 * Technology Compatibility Kit Test Suite(s) Location:
30 * http://www.helixcommunity.org/content/tck
31 *
32 * Contributor(s):
33 *
34 * ***** END LICENSE BLOCK ***** */
35
36/**************************************************************************************
37 * Fixed-point MP3 decoder
38 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
39 * June 2003
40 *
41 * dct32.c - optimized implementations of 32-point DCT for matrixing stage of
42 * polyphase filter
43 **************************************************************************************/
44
45#include "coder.h"
46#include "fl/stl/stdint.h"
47#include "fl/stl/noexcept.h"
48#include "assembly.h"
49namespace fl {
50namespace third_party {
51
52
53
54
/*
 * Fixed-point cosine coefficients for the 32-point DCT.
 *
 * The trailing comment on each constant gives the Q format it is stored in
 * (Q31 = 31 fractional bits, Q30 = 30, ...). Larger factors are stored with
 * fewer fractional bits so that they still fit in a signed 32-bit word.
 *
 * Each coefficient is applied with MULSHIFT32 (top 32 bits of the 64-bit
 * product) followed by a scale of 2^(32 - Q); the matching scale factors are
 * the s0/s1/s2 arguments of D32FP and the "* 2L / * 4L / * 8L" multipliers in
 * the second pass of FDCT32, so they must be kept in sync with the Q formats
 * listed here.
 *
 * COSn_* belongs to butterfly stage n: COS0_x/COS1_x feed the first pass,
 * COS2_x/COS3_x the second pass, and COS4_0 is used directly in the code.
 */
55#define COS0_0 0x4013c251 /* Q31 */
56#define COS0_1 0x40b345bd /* Q31 */
57#define COS0_2 0x41fa2d6d /* Q31 */
58#define COS0_3 0x43f93421 /* Q31 */
59#define COS0_4 0x46cc1bc4 /* Q31 */
60#define COS0_5 0x4a9d9cf0 /* Q31 */
61#define COS0_6 0x4fae3711 /* Q31 */
62#define COS0_7 0x56601ea7 /* Q31 */
63#define COS0_8 0x5f4cf6eb /* Q31 */
64#define COS0_9 0x6b6fcf26 /* Q31 */
65#define COS0_10 0x7c7d1db3 /* Q31 */
66#define COS0_11 0x4ad81a97 /* Q30 */
67#define COS0_12 0x5efc8d96 /* Q30 */
68#define COS0_13 0x41d95790 /* Q29 */
69#define COS0_14 0x6d0b20cf /* Q29 */
70#define COS0_15 0x518522fb /* Q27 */
71
72#define COS1_0 0x404f4672 /* Q31 */
73#define COS1_1 0x42e13c10 /* Q31 */
74#define COS1_2 0x48919f44 /* Q31 */
75#define COS1_3 0x52cb0e63 /* Q31 */
76#define COS1_4 0x64e2402e /* Q31 */
77#define COS1_5 0x43e224a9 /* Q30 */
78#define COS1_6 0x6e3c92c1 /* Q30 */
79#define COS1_7 0x519e4e04 /* Q28 */
80
81#define COS2_0 0x4140fb46 /* Q31 */
82#define COS2_1 0x4cf8de88 /* Q31 */
83#define COS2_2 0x73326bbf /* Q31 */
84#define COS2_3 0x52036742 /* Q29 */
85
86#define COS3_0 0x4545e9ef /* Q31 */
87#define COS3_1 0x539eba45 /* Q30 */
88
89#define COS4_0 (static_cast<int32_t>(0x5a82799aU)) /* Q31 */
90
/*
 * Coefficient table walked sequentially by FDCT32 through a single pointer
 * (cptr), so the ORDER of the entries is part of the algorithm.
 *
 *  - first pass:  8 rows of 3 entries, one row per D32FP(i, ...) call for
 *                 i = 0..7 (entries 0..23)
 *  - second pass: 4 groups of 6 entries, one group per iteration of the
 *                 second-pass loop (entries 24..47); groups 2 and 4 use the
 *                 negated COS2_x values
 *
 * The comment at the end of each row lists the Q format of the three
 * coefficients in that row.
 */
91// faster in ROM
92static const int32_t dcttab[48] = {
93 /* first pass */
94 COS0_0, COS0_15, COS1_0, /* 31, 27, 31 */
95 COS0_1, COS0_14, COS1_1, /* 31, 29, 31 */
96 COS0_2, COS0_13, COS1_2, /* 31, 29, 31 */
97 COS0_3, COS0_12, COS1_3, /* 31, 30, 31 */
98 COS0_4, COS0_11, COS1_4, /* 31, 30, 31 */
99 COS0_5, COS0_10, COS1_5, /* 31, 31, 30 */
100 COS0_6, COS0_9, COS1_6, /* 31, 31, 30 */
101 COS0_7, COS0_8, COS1_7, /* 31, 31, 28 */
102 /* second pass */
103 COS2_0, COS2_3, COS3_0, /* 31, 29, 31 */
104 COS2_1, COS2_2, COS3_1, /* 31, 31, 30 */
105 -COS2_0, -COS2_3, COS3_0, /* 31, 29, 31 */
106 -COS2_1, -COS2_2, COS3_1, /* 31, 31, 30 */
107 COS2_0, COS2_3, COS3_0, /* 31, 29, 31 */
108 COS2_1, COS2_2, COS3_1, /* 31, 31, 30 */
109 -COS2_0, -COS2_3, COS3_0, /* 31, 29, 31 */
110 -COS2_1, -COS2_2, COS3_1, /* 31, 31, 30 */
111};
112
/*
 * D32FP(i, s0, s1, s2) - one first-pass butterfly of the 32-point DCT.
 *
 * Operates in place on buf[i], buf[15-i], buf[16+i] and buf[31-i] and
 * consumes three consecutive coefficients through cptr (cptr ends up advanced
 * by 3). s0, s1 and s2 are the power-of-two exponents that undo the Q format
 * of the first, second and third coefficient respectively; the third
 * coefficient is used twice.
 *
 * Expects these names in the calling scope: buf, cptr, a0..a3, b0..b3 and
 * MULSHIFT32.
 *
 * The results are scaled with "* (1 << s)" rather than "<< s" so that negative
 * intermediate values are never left-shifted.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in an un-braced if/else), and the index argument is
 * parenthesized so that an expression such as "k + 1" indexes correctly.
 */
#define D32FP(i, s0, s1, s2) do { \
    a0 = buf[(i)];      a3 = buf[31 - (i)]; \
    a1 = buf[15 - (i)]; a2 = buf[16 + (i)]; \
    b0 = a0 + a3; b3 = MULSHIFT32(*cptr++, a0 - a3) * (1 << (s0)); \
    b1 = a1 + a2; b2 = MULSHIFT32(*cptr++, a1 - a2) * (1 << (s1)); \
    buf[(i)]      = b0 + b1; buf[15 - (i)] = MULSHIFT32(*cptr,   b0 - b1) * (1 << (s2)); \
    buf[16 + (i)] = b2 + b3; buf[31 - (i)] = MULSHIFT32(*cptr++, b3 - b2) * (1 << (s2)); \
} while (0)
121
122/**************************************************************************************
123 * Function: FDCT32
124 *
125 * Description: Ken's highly-optimized 32-point DCT (radix-4 + radix-8)
126 *
127 * Inputs: input buffer, length = 32 samples
128 * require at least 6 guard bits in input vector x to avoid possibility
129 * of overflow in internal calculations (see bbtest_imdct test app)
130 * buffer offset and oddblock flag for polyphase filter input buffer
131 * number of guard bits in input
132 *
133 * Outputs: output buffer, data copied and interleaved for polyphase filter
134 * no guarantees about number of guard bits in output
135 *
136 * Return: none
137 *
138 * Notes: number of muls = 4*8 + 12*4 = 80
139 * final stage of DCT is hardcoded to shuffle data into the proper order
140 * for the polyphase filterbank
141 * fully unrolled stage 1, for max precision (scale the 1/cos() factors
142 * differently, depending on magnitude)
143 * guard bit analysis verified by exhaustive testing of all 2^32
144 * combinations of max pos/max neg values in x[]
145 *
146 * TODO: code organization and optimization for ARM
147 * possibly interleave stereo (cut # of coef loads in half - may not have
148 * enough registers)
149 **************************************************************************************/
150// about 1ms faster in RAM
152{
153 int32_t i, s, tmp, es;
154 const int32_t *cptr = dcttab;
155 int32_t a0, a1, a2, a3, a4, a5, a6, a7;
156 int32_t b0, b1, b2, b3, b4, b5, b6, b7;
157 int32_t *d;
158
159 /* scaling - ensure at least 6 guard bits for DCT
160 * (in practice this is already true 99% of time, so this code is
161 * almost never triggered)
162 */
163 es = 0;
164 if (gb < 6) {
165 es = 6 - gb;
166 for (i = 0; i < 32; i++)
167 buf[i] >>= es;
168 }
169
170 /* first pass */
171 D32FP(0, 1, 5, 1);
172 D32FP(1, 1, 3, 1);
173 D32FP(2, 1, 3, 1);
174 D32FP(3, 1, 2, 1);
175 D32FP(4, 1, 2, 1);
176 D32FP(5, 1, 1, 2);
177 D32FP(6, 1, 1, 2);
178 D32FP(7, 1, 1, 4);
179
180 /* second pass */
181 for (i = 4; i > 0; i--) {
182 a0 = buf[0]; a7 = buf[7]; a3 = buf[3]; a4 = buf[4];
183 b0 = a0 + a7; b7 = MULSHIFT32(*cptr++, a0 - a7) * 2L;
184 b3 = a3 + a4; b4 = MULSHIFT32(*cptr++, a3 - a4) * 8L;
185 a0 = b0 + b3; a3 = MULSHIFT32(*cptr, b0 - b3) * 2L;
186 a4 = b4 + b7; a7 = MULSHIFT32(*cptr++, b7 - b4) * 2L;
187
188 a1 = buf[1]; a6 = buf[6]; a2 = buf[2]; a5 = buf[5];
189 b1 = a1 + a6; b6 = MULSHIFT32(*cptr++, a1 - a6) * 2L;
190 b2 = a2 + a5; b5 = MULSHIFT32(*cptr++, a2 - a5) * 2L;
191 a1 = b1 + b2; a2 = MULSHIFT32(*cptr, b1 - b2) * 4L;
192 a5 = b5 + b6; a6 = MULSHIFT32(*cptr++, b6 - b5) * 4L;
193
194 b0 = a0 + a1; b1 = MULSHIFT32(COS4_0, a0 - a1) * 2L;
195 b2 = a2 + a3; b3 = MULSHIFT32(COS4_0, a3 - a2) * 2L;
196 buf[0] = b0; buf[1] = b1;
197 buf[2] = b2 + b3; buf[3] = b3;
198
199 b4 = a4 + a5; b5 = MULSHIFT32(COS4_0, a4 - a5) * 2L;
200 b6 = a6 + a7; b7 = MULSHIFT32(COS4_0, a7 - a6) * 2L;
201 b6 += b7;
202 buf[4] = b4 + b6; buf[5] = b5 + b7;
203 buf[6] = b5 + b6; buf[7] = b7;
204
205 buf += 8;
206 }
207 buf -= 32; /* reset */
208
209 /* sample 0 - always delayed one block */
210 d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
211 s = buf[ 0]; d[0] = d[8] = s;
212
213 /* samples 16 to 31 */
214 d = dest + offset + (oddBlock ? VBUF_LENGTH : 0);
215
216 s = buf[ 1]; d[0] = d[8] = s; d += 64;
217
218 tmp = buf[25] + buf[29];
219 s = buf[17] + tmp; d[0] = d[8] = s; d += 64;
220 s = buf[ 9] + buf[13]; d[0] = d[8] = s; d += 64;
221 s = buf[21] + tmp; d[0] = d[8] = s; d += 64;
222
223 tmp = buf[29] + buf[27];
224 s = buf[ 5]; d[0] = d[8] = s; d += 64;
225 s = buf[21] + tmp; d[0] = d[8] = s; d += 64;
226 s = buf[13] + buf[11]; d[0] = d[8] = s; d += 64;
227 s = buf[19] + tmp; d[0] = d[8] = s; d += 64;
228
229 tmp = buf[27] + buf[31];
230 s = buf[ 3]; d[0] = d[8] = s; d += 64;
231 s = buf[19] + tmp; d[0] = d[8] = s; d += 64;
232 s = buf[11] + buf[15]; d[0] = d[8] = s; d += 64;
233 s = buf[23] + tmp; d[0] = d[8] = s; d += 64;
234
235 tmp = buf[31];
236 s = buf[ 7]; d[0] = d[8] = s; d += 64;
237 s = buf[23] + tmp; d[0] = d[8] = s; d += 64;
238 s = buf[15]; d[0] = d[8] = s; d += 64;
239 s = tmp; d[0] = d[8] = s;
240
241 /* samples 16 to 1 (sample 16 used again) */
242 d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
243
244 s = buf[ 1]; d[0] = d[8] = s; d += 64;
245
246 tmp = buf[30] + buf[25];
247 s = buf[17] + tmp; d[0] = d[8] = s; d += 64;
248 s = buf[14] + buf[ 9]; d[0] = d[8] = s; d += 64;
249 s = buf[22] + tmp; d[0] = d[8] = s; d += 64;
250 s = buf[ 6]; d[0] = d[8] = s; d += 64;
251
252 tmp = buf[26] + buf[30];
253 s = buf[22] + tmp; d[0] = d[8] = s; d += 64;
254 s = buf[10] + buf[14]; d[0] = d[8] = s; d += 64;
255 s = buf[18] + tmp; d[0] = d[8] = s; d += 64;
256 s = buf[ 2]; d[0] = d[8] = s; d += 64;
257
258 tmp = buf[28] + buf[26];
259 s = buf[18] + tmp; d[0] = d[8] = s; d += 64;
260 s = buf[12] + buf[10]; d[0] = d[8] = s; d += 64;
261 s = buf[20] + tmp; d[0] = d[8] = s; d += 64;
262 s = buf[ 4]; d[0] = d[8] = s; d += 64;
263
264 tmp = buf[24] + buf[28];
265 s = buf[20] + tmp; d[0] = d[8] = s; d += 64;
266 s = buf[ 8] + buf[12]; d[0] = d[8] = s; d += 64;
267 s = buf[16] + tmp; d[0] = d[8] = s;
268
269 /* this is so rarely invoked that it's not worth making two versions of the output
270 * shuffle code (one for no shift, one for clip + variable shift) like in IMDCT
271 * here we just load, clip, shift, and store on the rare instances that es != 0
272 */
273 if (es) {
274 d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
275 s = d[0]; CLIP_2N(s, 31 - es); d[0] = d[8] = (s << es);
276
277 d = dest + offset + (oddBlock ? VBUF_LENGTH : 0);
278 for (i = 16; i <= 31; i++) {
279 s = d[0]; CLIP_2N(s, 31 - es); d[0] = d[8] = (s << es); d += 64;
280 }
281
282 d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
283 for (i = 15; i >= 0; i--) {
284 s = d[0]; CLIP_2N(s, 31 - es); d[0] = d[8] = (s << es); d += 64;
285 }
286 }
287}
288
289} // namespace third_party
290} // namespace fl
#define VBUF_LENGTH
Definition coder.h:109
#define CLIP_2N(y, n)
Definition coder.h:89
#define COS0_3
Definition dct32.hpp:58
#define COS0_4
Definition dct32.hpp:59
#define COS1_6
Definition dct32.hpp:78
#define COS0_8
Definition dct32.hpp:63
#define COS0_10
Definition dct32.hpp:65
#define COS3_1
Definition dct32.hpp:87
#define COS0_0
Definition dct32.hpp:55
#define COS4_0
Definition dct32.hpp:89
#define COS1_7
Definition dct32.hpp:79
#define COS0_5
Definition dct32.hpp:60
#define COS1_2
Definition dct32.hpp:74
#define COS2_0
Definition dct32.hpp:81
#define COS0_1
Definition dct32.hpp:56
#define COS0_9
Definition dct32.hpp:64
#define D32FP(i, s0, s1, s2)
Definition dct32.hpp:113
#define COS3_0
Definition dct32.hpp:86
#define COS1_0
Definition dct32.hpp:72
#define COS2_2
Definition dct32.hpp:83
#define COS1_5
Definition dct32.hpp:77
#define COS1_4
Definition dct32.hpp:76
#define COS1_1
Definition dct32.hpp:73
#define COS2_1
Definition dct32.hpp:82
#define COS0_2
Definition dct32.hpp:57
#define COS0_7
Definition dct32.hpp:62
#define COS2_3
Definition dct32.hpp:84
#define COS1_3
Definition dct32.hpp:75
#define COS0_12
Definition dct32.hpp:67
#define COS0_14
Definition dct32.hpp:69
#define COS0_11
Definition dct32.hpp:66
#define COS0_15
Definition dct32.hpp:70
#define COS0_6
Definition dct32.hpp:61
#define COS0_13
Definition dct32.hpp:68
fl::UISlider offset("Offset", 0.0f, 0.0f, 1.0f, 0.01f)
void FDCT32(int32_t *x, int32_t *d, int32_t offset, int32_t oddBlock, int32_t gb) FL_NOEXCEPT
Definition dct32.hpp:151
__inline int32_t MULSHIFT32(int32_t x, int32_t y) FL_NOEXCEPT
Multiply together two 32-bit numbers and return the top 32-bits of the result.
Definition assembly.h:503
static const int32_t dcttab[48]
Definition dct32.hpp:92
fl::i32 int32_t
Definition coder.h:220
Base definition for an LED controller.
Definition crgb.hpp:179
#define FL_NOEXCEPT