FastLED 3.9.15
Loading...
Searching...
No Matches
sin32.h
Go to the documentation of this file.
1#pragma once
2
3
4#include "fl/stl/int.h"
6#include "fastled_progmem.h"
7#include "fl/math/simd.h" // Platform-dispatched SIMD (SSE2/NEON/scalar)
8#include "fl/stl/align.h"
9#include "fl/stl/noexcept.h"
10namespace fl {
11
12// Paired sin/cos quarter-wave LUT with interleaved values and derivatives.
13// Layout per entry [qi]: { y_sin(qi), m_sin(qi), y_cos(qi), m_cos(qi) }
14// where y_cos(qi) = y_sin(64-qi), m_cos(qi) = m_sin(64-qi).
15// 65 entries (indices 0..64), covering 0 to pi/2.
16// Full sine/cosine reconstructed via quarter-wave symmetry.
17// Quadratic interpolation with exact stored derivative for O(h^3) accuracy.
18//
19// Output range: [-2147418112, 2147418112] (= 32767 * 65536)
20// Stride: 4 i32 per entry (16 bytes). 65 entries = 1040 bytes.
21// Stored in FL_PROGMEM for AVR flash placement.
22extern const i32 FL_PROGMEM sinCosPairedLut[];
23
24struct SinCos32 {
27};
28
29// Read an i32 from the PROGMEM-qualified LUT.
31 return (i32)FL_PGM_READ_DWORD_ALIGNED(addr);
32}
33
34// Core branchless quadratic interpolation from paired LUT.
35// qi: quarter-wave table index (0..64)
36// qi_next: adjacent index (qi+1 for direct, qi-1 for mirrored)
37// dmask: 0x00000000 (direct) or 0xFFFFFFFF (mirrored, negates derivative)
38// t: fraction in [0, 65535]
39// offset: 0 for sin, 2 for cos (selects which pair within the stride-4 entry)
40FASTLED_FORCE_INLINE i32 sin32_interp(u8 qi, u8 qi_next, i32 dmask, u32 t, u8 offset = 0) FL_NOEXCEPT {
41 i32 y0 = read_sin32_lut(&sinCosPairedLut[qi * 4 + offset]);
42 i32 m0 = read_sin32_lut(&sinCosPairedLut[qi * 4 + offset + 1]);
43 i32 y1 = read_sin32_lut(&sinCosPairedLut[qi_next * 4 + offset]);
44
45 // Branchless conditional negate derivative
46 m0 = (m0 ^ dmask) - dmask;
47
48 // Quadratic interpolation (Horner form, 2 muls):
49 // P(t) = y0 + T*(m0 + T*(y1 - y0 - m0)) where T = t/65536
50 i32 c = y1 - y0 - m0;
51 i32 r = (i32)((i64)c * t >> 16) + m0;
52 return (i32)(((i64)r * t >> 16) + (i64)y0);
53}
54
55// 0 to 16777216 is a full circle
56// output is between -2147418112 and 2147418112
57// Branchless quarter-wave lookup with quadratic interpolation.
58// Cost: 3 table loads, 2 i64 multiplies, no branches.
60 u8 angle256 = static_cast<u8>(angle >> 16); // 0..255
61 u32 t = angle & 0xFFFF; // 0..65535
62
63 u8 quadrant = angle256 >> 6; // 0..3
64 u8 pos = angle256 & 0x3F; // 0..63
65
66 // Branchless quarter-wave mapping
67 u8 mirror = quadrant & 1;
68 u8 qi = static_cast<u8>(pos + mirror * (64 - 2 * pos));
69 u8 qi_next = static_cast<u8>(qi + 1 - 2 * mirror);
70
71 i32 dmask = -static_cast<i32>(mirror);
72 i32 raw = sin32_interp(qi, qi_next, dmask, t);
73
74 // Branchless sign: negative in quadrants 2, 3
75 i32 vmask = -static_cast<i32>((quadrant >> 1) & 1);
76 return (i32)(((i64)raw ^ vmask) - vmask);
77}
78
79// 0 to 16777216 is a full circle
80// output is between -2147418112 and 2147418112
82 return sin32(angle + 4194304u);
83}
84
85// Compute sin and cos simultaneously, faster than separate sin32+cos32 calls.
86// Uses paired LUT: sin and cos data colocated at same index (no qi_c computation).
87// Cost: 6 table loads, 4 i64 multiplies, no branches.
89 u8 angle256 = static_cast<u8>(angle >> 16);
90 u32 t = angle & 0xFFFF;
91
92 u8 quadrant = angle256 >> 6;
93 u8 pos = angle256 & 0x3F;
94
95 // Quarter-wave mapping (same for both sin and cos)
96 u8 mirror_s = quadrant & 1;
97 u8 qi = static_cast<u8>(pos + mirror_s * (64 - 2 * pos));
98 u8 qi_next = static_cast<u8>(qi + 1 - 2 * mirror_s);
99
100 // Derivative masks: sin and cos have opposite mirror states
101 i32 sdmask = -static_cast<i32>(mirror_s);
102 i32 cdmask = ~sdmask; // opposite mirror
103
104 // Sin at offset 0, cos at offset 2 — same qi, same cache line
105 i32 s_raw = sin32_interp(qi, qi_next, sdmask, t, 0);
106 i32 c_raw = sin32_interp(qi, qi_next, cdmask, t, 2);
107
108 // Sin sign: negative in quadrants 2, 3
109 i32 svmask = -static_cast<i32>((quadrant >> 1) & 1);
110 // Cos sign: negative in quadrants 1, 2 (XOR of quadrant bits)
111 i32 cvmask = -static_cast<i32>((quadrant ^ (quadrant >> 1)) & 1);
112
113 SinCos32 out;
114 out.sin_val = (i32)(((i64)s_raw ^ svmask) - svmask);
115 out.cos_val = (i32)(((i64)c_raw ^ cvmask) - cvmask);
116 return out;
117}
118
119// 0 to 65536 is a full circle
120// output is between -32767 and 32767
122 u32 angle32 = static_cast<u32>(angle) << 8;
123 return static_cast<i16>(sin32(angle32) >> 16);
124}
125
126// 0 to 65536 is a full circle
127// output is between -32767 and 32767
129 u32 angle32 = static_cast<u32>(angle) << 8;
130 return static_cast<i16>(cos32(angle32) >> 16);
131}
132
134struct FL_ALIGNAS(16) SinCos32_simd {
135 simd::simd_u32x4 sin_vals; // 4 sin results (raw i32, range: -2147418112 to 2147418112)
136 simd::simd_u32x4 cos_vals; // 4 cos results (raw i32, range: -2147418112 to 2147418112)
137};
138
146 // ========== Phase 1a: SIMD angle decomposition ==========
147 // Break down each of the 4 angles into components needed for LUT lookup
148
149 // Extract high 8 bits (angle >> 16) → 0..255 range for quadrant + position
150 simd::simd_u32x4 angle256_vec = simd::srl_u32_4(angles, 16);
151
152 // Extract low 16 bits → fractional part for interpolation (0..65535)
153 simd::simd_u32x4 t_vec = simd::and_u32_4(angles, simd::set1_u32_4(0xFFFF));
154
155 // Determine quadrant (0=0°-90°, 1=90°-180°, 2=180°-270°, 3=270°-360°)
156 simd::simd_u32x4 quadrant_vec = simd::srl_u32_4(angle256_vec, 6);
157
158 // Position within quadrant (0..63) → maps to LUT indices 0..63
159 simd::simd_u32x4 pos_vec = simd::and_u32_4(angle256_vec, simd::set1_u32_4(0x3F));
160
161 // Mirror flag: quadrants 1 and 3 need mirrored LUT access
162 simd::simd_u32x4 mirror_s_vec = simd::and_u32_4(quadrant_vec, simd::set1_u32_4(1));
163
164 // ========== Phase 1b: SIMD mask computation ==========
165 // Compute masks for branchless conditional negation in interpolation and sign application
166
167 // Sin derivative mask: 0x00000000 (direct) or 0xFFFFFFFF (negate derivative in mirrored quadrants)
168 simd::simd_u32x4 sdmask_vec = simd::sub_i32_4(simd::set1_u32_4(0), mirror_s_vec);
169
170 // Cos derivative mask: opposite of sin (cos mirrors opposite direction)
171 simd::simd_u32x4 cdmask_vec = simd::xor_u32_4(sdmask_vec, simd::set1_u32_4(0xFFFFFFFF));
172
173 // Sin value sign mask: negative in quadrants 2 and 3 (bit 1 of quadrant)
174 simd::simd_u32x4 quadrant_bit1 = simd::and_u32_4(simd::srl_u32_4(quadrant_vec, 1), simd::set1_u32_4(1));
175 simd::simd_u32x4 svmask_vec = simd::sub_i32_4(simd::set1_u32_4(0), quadrant_bit1);
176
177 // Cos value sign mask: negative in quadrants 1 and 2 (XOR of quadrant bits 0 and 1)
178 simd::simd_u32x4 quadrant_xor = simd::xor_u32_4(quadrant_vec, simd::srl_u32_4(quadrant_vec, 1));
179 simd::simd_u32x4 quadrant_xor_bit0 = simd::and_u32_4(quadrant_xor, simd::set1_u32_4(1));
180 simd::simd_u32x4 cvmask_vec = simd::sub_i32_4(simd::set1_u32_4(0), quadrant_xor_bit0);
181
182 // ========== Phase 1c: Vectorized quarter-wave index mapping ==========
183 // Map position to LUT index using quarter-wave symmetry (branchless mirror logic)
184
185 // Compute qi = pos + mirror * (64 - 2*pos)
186 // This mirrors the position for odd quadrants: pos 0→64, pos 63→1
187 simd::simd_u32x4 two_pos = simd::add_i32_4(pos_vec, pos_vec);
188 simd::simd_u32x4 term_vec = simd::sub_i32_4(simd::set1_u32_4(64), two_pos);
189 simd::simd_u32x4 masked_term = simd::and_u32_4(term_vec, sdmask_vec); // Zero term if not mirrored
190 simd::simd_u32x4 qi_s_vec = simd::add_i32_4(pos_vec, masked_term);
191
192 // Compute qi_next = qi + 1 - 2*mirror
193 // Direct: qi_next = qi + 1 (forward), Mirrored: qi_next = qi - 1 (backward)
194 simd::simd_u32x4 two_mirror_s = simd::add_i32_4(mirror_s_vec, mirror_s_vec);
195 simd::simd_u32x4 qi_next_s_vec = simd::sub_i32_4(simd::add_i32_4(qi_s_vec, simd::set1_u32_4(1)), two_mirror_s);
196
197 // ========== Phase 2: Vector LUT loads + 4x4 transpose (AoS → SoA) ==========
198 // Each LUT entry is 16 bytes: {y_sin, m_sin, y_cos, m_cos} (4 × i32)
199 // We load 4 full entries (one per angle) and transpose to separate vectors
200 // Input: 4 rows × 4 columns (AoS: each row is one angle's data)
201 // Output: 4 columns as separate vectors (SoA: each vector contains same field for 4 angles)
202
203 // Extract indices from SIMD registers for scalar LUT indexing
204 // (SIMD gather is not available on all platforms, so we use scalar indexing)
205 u32 qi0 = simd::extract_u32_4(qi_s_vec, 0);
206 u32 qi1 = simd::extract_u32_4(qi_s_vec, 1);
207 u32 qi2 = simd::extract_u32_4(qi_s_vec, 2);
208 u32 qi3 = simd::extract_u32_4(qi_s_vec, 3);
209
210 u32 qn0 = simd::extract_u32_4(qi_next_s_vec, 0);
211 u32 qn1 = simd::extract_u32_4(qi_next_s_vec, 1);
212 u32 qn2 = simd::extract_u32_4(qi_next_s_vec, 2);
213 u32 qn3 = simd::extract_u32_4(qi_next_s_vec, 3);
214
215 // Load full 16-byte qi entries: {y_sin, m_sin, y_cos, m_cos}
216 // e0 = {y_s0, m_s0, y_c0, m_c0}, e1 = {y_s1, m_s1, y_c1, m_c1}, ...
217 simd::simd_u32x4 e0 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi0 * 4])); // ok reinterpret cast
218 simd::simd_u32x4 e1 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi1 * 4])); // ok reinterpret cast
219 simd::simd_u32x4 e2 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi2 * 4])); // ok reinterpret cast
220 simd::simd_u32x4 e3 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi3 * 4])); // ok reinterpret cast
221
222 // 4x4 transpose qi entries: AoS {y_s, m_s, y_c, m_c} → SoA
223 // Classic SIMD matrix transpose using interleaved unpacks
224 // Step 1: interleave pairs at 32-bit granularity
225 simd::simd_u32x4 t01lo = simd::unpacklo_u32_4(e0, e1); // {y_s0, y_s1, m_s0, m_s1}
226 simd::simd_u32x4 t01hi = simd::unpackhi_u32_4(e0, e1); // {y_c0, y_c1, m_c0, m_c1}
227 simd::simd_u32x4 t23lo = simd::unpacklo_u32_4(e2, e3); // {y_s2, y_s3, m_s2, m_s3}
228 simd::simd_u32x4 t23hi = simd::unpackhi_u32_4(e2, e3); // {y_c2, y_c3, m_c2, m_c3}
229 // Step 2: final interleave at 64-bit granularity → complete SoA vectors
230 simd::simd_u32x4 y0_s_v = simd::unpacklo_u64_as_u32_4(t01lo, t23lo); // {y_s0, y_s1, y_s2, y_s3}
231 simd::simd_u32x4 m0_s_v = simd::unpackhi_u64_as_u32_4(t01lo, t23lo); // {m_s0, m_s1, m_s2, m_s3}
232 simd::simd_u32x4 y0_c_v = simd::unpacklo_u64_as_u32_4(t01hi, t23hi); // {y_c0, y_c1, y_c2, y_c3}
233 simd::simd_u32x4 m0_c_v = simd::unpackhi_u64_as_u32_4(t01hi, t23hi); // {m_c0, m_c1, m_c2, m_c3}
234
235 // Load full 16-byte qi_next entries (only need y_sin and y_cos for interpolation endpoint)
236 simd::simd_u32x4 n0 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn0 * 4])); // ok reinterpret cast
237 simd::simd_u32x4 n1 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn1 * 4])); // ok reinterpret cast
238 simd::simd_u32x4 n2 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn2 * 4])); // ok reinterpret cast
239 simd::simd_u32x4 n3 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn3 * 4])); // ok reinterpret cast
240
241 // Partial transpose qi_next: only extract y_sin and y_cos (derivatives not needed)
242 simd::simd_u32x4 n01lo = simd::unpacklo_u32_4(n0, n1); // {y_s0, y_s1, m_s0, m_s1}
243 simd::simd_u32x4 n01hi = simd::unpackhi_u32_4(n0, n1); // {y_c0, y_c1, m_c0, m_c1}
244 simd::simd_u32x4 n23lo = simd::unpacklo_u32_4(n2, n3); // {y_s2, y_s3, m_s2, m_s3}
245 simd::simd_u32x4 n23hi = simd::unpackhi_u32_4(n2, n3); // {y_c2, y_c3, m_c2, m_c3}
246 simd::simd_u32x4 y1_s_v = simd::unpacklo_u64_as_u32_4(n01lo, n23lo); // {y1_s0, y1_s1, y1_s2, y1_s3}
247 simd::simd_u32x4 y1_c_v = simd::unpacklo_u64_as_u32_4(n01hi, n23hi); // {y1_c0, y1_c1, y1_c2, y1_c3}
248
249 // ========== Phase 3: SIMD quadratic interpolation ==========
250 // Vectorized version of scalar interpolation formula from sin32_interp()
251 // P(t) = y0 + T*(m0 + T*(y1 - y0 - m0)) where T = t/65536
252 // Implemented as: c = y1 - y0 - m0; r = c*T + m0; result = r*T + y0
253 //
254 // Optimization: Uses mulhi_su32_4 (signed × unsigned-positive, 11 ops on SSE2)
255 // instead of mulhi_i32_4 (14 ops on SSE2). Safe because t_vec is always in [0, 65535].
256
257 // Sin interpolation
258 // Apply derivative mask: conditionally negate m0 for mirrored quadrants
259 m0_s_v = simd::sub_i32_4(simd::xor_u32_4(m0_s_v, sdmask_vec), sdmask_vec); // (m0 ^ mask) - mask
260 // Compute quadratic coefficient: c = y1 - y0 - m0
261 simd::simd_u32x4 c_s = simd::sub_i32_4(simd::sub_i32_4(y1_s_v, y0_s_v), m0_s_v);
262 // First multiply: r = c*t/65536 + m0 (linear term + derivative)
263 simd::simd_u32x4 r_s = simd::add_i32_4(simd::mulhi_su32_4(c_s, t_vec), m0_s_v);
264 // Second multiply: result = r*t/65536 + y0 (quadratic interpolation complete)
265 simd::simd_u32x4 s_raw = simd::add_i32_4(simd::mulhi_su32_4(r_s, t_vec), y0_s_v);
266
267 // Cos interpolation (identical structure to sin, but uses cos-specific masks and data)
268 m0_c_v = simd::sub_i32_4(simd::xor_u32_4(m0_c_v, cdmask_vec), cdmask_vec); // Conditionally negate m0
269 simd::simd_u32x4 c_c = simd::sub_i32_4(simd::sub_i32_4(y1_c_v, y0_c_v), m0_c_v); // c = y1 - y0 - m0
270 simd::simd_u32x4 r_c = simd::add_i32_4(simd::mulhi_su32_4(c_c, t_vec), m0_c_v); // r = c*t/65536 + m0
271 simd::simd_u32x4 c_raw = simd::add_i32_4(simd::mulhi_su32_4(r_c, t_vec), y0_c_v); // result = r*t/65536 + y0
272
273 // ========== Apply final sign masks ==========
274 // Use branchless negate: (val ^ mask) - mask
275 // If mask = 0x00000000: val unchanged
276 // If mask = 0xFFFFFFFF: val negated (two's complement)
277 SinCos32_simd result;
278 result.sin_vals = simd::sub_i32_4(simd::xor_u32_4(s_raw, svmask_vec), svmask_vec); // Negate if quadrant 2 or 3
279 result.cos_vals = simd::sub_i32_4(simd::xor_u32_4(c_raw, cvmask_vec), cvmask_vec); // Negate if quadrant 1 or 2
280 return result;
281}
282
283} // namespace fl
uint8_t pos
Definition Blur.ino:11
Alignment macros and utilities for FastLED.
fl::UISlider offset("Offset", 0.0f, 0.0f, 1.0f, 0.01f)
#define FL_PGM_READ_DWORD_ALIGNED(addr)
#define FL_PROGMEM
PROGMEM keyword for storage.
Wrapper definitions to allow seamless use of PROGMEM in environments that have it.
platforms::simd_u32x4 simd_u32x4
Definition types.h:26
unsigned char u8
Definition stdint.h:131
FASTLED_FORCE_INLINE i32 cos32(u32 angle) FL_NOEXCEPT
Definition sin32.h:81
FASTLED_FORCE_INLINE i32 read_sin32_lut(const i32 *addr) FL_NOEXCEPT
Definition sin32.h:30
FASTLED_FORCE_INLINE i32 sin32(u32 angle) FL_NOEXCEPT
Definition sin32.h:59
FASTLED_FORCE_INLINE SinCos32_simd sincos32_simd(simd::simd_u32x4 angles) FL_NOEXCEPT
Process 4 angles simultaneously, returning vectorized sin/cos values SIMD-optimized: vectorized angle...
Definition sin32.h:145
const i32 sinCosPairedLut[]
FASTLED_FORCE_INLINE i32 sin32_interp(u8 qi, u8 qi_next, i32 dmask, u32 t, u8 offset=0) FL_NOEXCEPT
Definition sin32.h:40
FASTLED_FORCE_INLINE i16 sin16lut(u16 angle) FL_NOEXCEPT
Definition sin32.h:121
fl::i64 i64
Definition s16x16x4.h:222
expected< T, E > result
Alias for expected (Rust-style naming)
Definition result.h:31
FASTLED_FORCE_INLINE i16 cos16lut(u16 angle) FL_NOEXCEPT
Definition sin32.h:128
FASTLED_FORCE_INLINE SinCos32 sincos32(u32 angle) FL_NOEXCEPT
Definition sin32.h:88
Base definition for an LED controller.
Definition crgb.hpp:179
i32 sin_val
Definition sin32.h:25
i32 cos_val
Definition sin32.h:26
#define FASTLED_FORCE_INLINE
#define FL_ALIGNAS(N)
#define FL_NOEXCEPT
Umbrella header for SIMD subsystem.