FastLED 3.9.15
Loading...
Searching...
No Matches

◆ sincos32_simd()

FASTLED_FORCE_INLINE SinCos32_simd fl::sincos32_simd ( simd::simd_u32x4 angles)

Process 4 angles simultaneously, returning vectorized sin/cos values SIMD-optimized: vectorized angle decomposition, vector LUT loads with 4x4 transpose (AoS→SoA), and vectorized quadratic interpolation.

Parameters
angles4 u32 angles (0 to 16777216 per angle is a full circle)
Returns
SinCos32_simd with raw i32 values (range: -2147418112 to 2147418112)

Definition at line 145 of file sin32.h.

145 {
146 // ========== Phase 1a: SIMD angle decomposition ==========
147 // Break down each of the 4 angles into components needed for LUT lookup
148
149 // Extract high 8 bits (angle >> 16) → 0..255 range for quadrant + position
150 simd::simd_u32x4 angle256_vec = simd::srl_u32_4(angles, 16);
151
152 // Extract low 16 bits → fractional part for interpolation (0..65535)
153 simd::simd_u32x4 t_vec = simd::and_u32_4(angles, simd::set1_u32_4(0xFFFF));
154
155 // Determine quadrant (0=0°-90°, 1=90°-180°, 2=180°-270°, 3=270°-360°)
156 simd::simd_u32x4 quadrant_vec = simd::srl_u32_4(angle256_vec, 6);
157
158 // Position within quadrant (0..63) → maps to LUT indices 0..63
159 simd::simd_u32x4 pos_vec = simd::and_u32_4(angle256_vec, simd::set1_u32_4(0x3F));
160
161 // Mirror flag: quadrants 1 and 3 need mirrored LUT access
162 simd::simd_u32x4 mirror_s_vec = simd::and_u32_4(quadrant_vec, simd::set1_u32_4(1));
163
164 // ========== Phase 1b: SIMD mask computation ==========
165 // Compute masks for branchless conditional negation in interpolation and sign application
166
167 // Sin derivative mask: 0x00000000 (direct) or 0xFFFFFFFF (negate derivative in mirrored quadrants)
168 simd::simd_u32x4 sdmask_vec = simd::sub_i32_4(simd::set1_u32_4(0), mirror_s_vec);
169
170 // Cos derivative mask: opposite of sin (cos mirrors opposite direction)
171 simd::simd_u32x4 cdmask_vec = simd::xor_u32_4(sdmask_vec, simd::set1_u32_4(0xFFFFFFFF));
172
173 // Sin value sign mask: negative in quadrants 2 and 3 (bit 1 of quadrant)
174 simd::simd_u32x4 quadrant_bit1 = simd::and_u32_4(simd::srl_u32_4(quadrant_vec, 1), simd::set1_u32_4(1));
175 simd::simd_u32x4 svmask_vec = simd::sub_i32_4(simd::set1_u32_4(0), quadrant_bit1);
176
177 // Cos value sign mask: negative in quadrants 1 and 2 (XOR of quadrant bits 0 and 1)
178 simd::simd_u32x4 quadrant_xor = simd::xor_u32_4(quadrant_vec, simd::srl_u32_4(quadrant_vec, 1));
179 simd::simd_u32x4 quadrant_xor_bit0 = simd::and_u32_4(quadrant_xor, simd::set1_u32_4(1));
180 simd::simd_u32x4 cvmask_vec = simd::sub_i32_4(simd::set1_u32_4(0), quadrant_xor_bit0);
181
182 // ========== Phase 1c: Vectorized quarter-wave index mapping ==========
183 // Map position to LUT index using quarter-wave symmetry (branchless mirror logic)
184
185 // Compute qi = pos + mirror * (64 - 2*pos)
186 // This mirrors the position for odd quadrants: pos 0→64, pos 63→1
187 simd::simd_u32x4 two_pos = simd::add_i32_4(pos_vec, pos_vec);
188 simd::simd_u32x4 term_vec = simd::sub_i32_4(simd::set1_u32_4(64), two_pos);
189 simd::simd_u32x4 masked_term = simd::and_u32_4(term_vec, sdmask_vec); // Zero term if not mirrored
190 simd::simd_u32x4 qi_s_vec = simd::add_i32_4(pos_vec, masked_term);
191
192 // Compute qi_next = qi + 1 - 2*mirror
193 // Direct: qi_next = qi + 1 (forward), Mirrored: qi_next = qi - 1 (backward)
194 simd::simd_u32x4 two_mirror_s = simd::add_i32_4(mirror_s_vec, mirror_s_vec);
195 simd::simd_u32x4 qi_next_s_vec = simd::sub_i32_4(simd::add_i32_4(qi_s_vec, simd::set1_u32_4(1)), two_mirror_s);
196
197 // ========== Phase 2: Vector LUT loads + 4x4 transpose (AoS → SoA) ==========
198 // Each LUT entry is 16 bytes: {y_sin, m_sin, y_cos, m_cos} (4 × i32)
199 // We load 4 full entries (one per angle) and transpose to separate vectors
200 // Input: 4 rows × 4 columns (AoS: each row is one angle's data)
201 // Output: 4 columns as separate vectors (SoA: each vector contains same field for 4 angles)
202
203 // Extract indices from SIMD registers for scalar LUT indexing
204 // (SIMD gather is not available on all platforms, so we use scalar indexing)
205 u32 qi0 = simd::extract_u32_4(qi_s_vec, 0);
206 u32 qi1 = simd::extract_u32_4(qi_s_vec, 1);
207 u32 qi2 = simd::extract_u32_4(qi_s_vec, 2);
208 u32 qi3 = simd::extract_u32_4(qi_s_vec, 3);
209
210 u32 qn0 = simd::extract_u32_4(qi_next_s_vec, 0);
211 u32 qn1 = simd::extract_u32_4(qi_next_s_vec, 1);
212 u32 qn2 = simd::extract_u32_4(qi_next_s_vec, 2);
213 u32 qn3 = simd::extract_u32_4(qi_next_s_vec, 3);
214
215 // Load full 16-byte qi entries: {y_sin, m_sin, y_cos, m_cos}
216 // e0 = {y_s0, m_s0, y_c0, m_c0}, e1 = {y_s1, m_s1, y_c1, m_c1}, ...
217 simd::simd_u32x4 e0 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi0 * 4])); // ok reinterpret cast
218 simd::simd_u32x4 e1 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi1 * 4])); // ok reinterpret cast
219 simd::simd_u32x4 e2 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi2 * 4])); // ok reinterpret cast
220 simd::simd_u32x4 e3 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qi3 * 4])); // ok reinterpret cast
221
222 // 4x4 transpose qi entries: AoS {y_s, m_s, y_c, m_c} → SoA
223 // Classic SIMD matrix transpose using interleaved unpacks
224 // Step 1: interleave pairs at 32-bit granularity
225 simd::simd_u32x4 t01lo = simd::unpacklo_u32_4(e0, e1); // {y_s0, y_s1, m_s0, m_s1}
226 simd::simd_u32x4 t01hi = simd::unpackhi_u32_4(e0, e1); // {y_c0, y_c1, m_c0, m_c1}
227 simd::simd_u32x4 t23lo = simd::unpacklo_u32_4(e2, e3); // {y_s2, y_s3, m_s2, m_s3}
228 simd::simd_u32x4 t23hi = simd::unpackhi_u32_4(e2, e3); // {y_c2, y_c3, m_c2, m_c3}
229 // Step 2: final interleave at 64-bit granularity → complete SoA vectors
230 simd::simd_u32x4 y0_s_v = simd::unpacklo_u64_as_u32_4(t01lo, t23lo); // {y_s0, y_s1, y_s2, y_s3}
231 simd::simd_u32x4 m0_s_v = simd::unpackhi_u64_as_u32_4(t01lo, t23lo); // {m_s0, m_s1, m_s2, m_s3}
232 simd::simd_u32x4 y0_c_v = simd::unpacklo_u64_as_u32_4(t01hi, t23hi); // {y_c0, y_c1, y_c2, y_c3}
233 simd::simd_u32x4 m0_c_v = simd::unpackhi_u64_as_u32_4(t01hi, t23hi); // {m_c0, m_c1, m_c2, m_c3}
234
235 // Load full 16-byte qi_next entries (only need y_sin and y_cos for interpolation endpoint)
236 simd::simd_u32x4 n0 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn0 * 4])); // ok reinterpret cast
237 simd::simd_u32x4 n1 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn1 * 4])); // ok reinterpret cast
238 simd::simd_u32x4 n2 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn2 * 4])); // ok reinterpret cast
239 simd::simd_u32x4 n3 = simd::load_u32_4(reinterpret_cast<const u32*>(&sinCosPairedLut[qn3 * 4])); // ok reinterpret cast
240
241 // Partial transpose qi_next: only extract y_sin and y_cos (derivatives not needed)
242 simd::simd_u32x4 n01lo = simd::unpacklo_u32_4(n0, n1); // {y_s0, y_s1, m_s0, m_s1}
243 simd::simd_u32x4 n01hi = simd::unpackhi_u32_4(n0, n1); // {y_c0, y_c1, m_c0, m_c1}
244 simd::simd_u32x4 n23lo = simd::unpacklo_u32_4(n2, n3); // {y_s2, y_s3, m_s2, m_s3}
245 simd::simd_u32x4 n23hi = simd::unpackhi_u32_4(n2, n3); // {y_c2, y_c3, m_c2, m_c3}
246 simd::simd_u32x4 y1_s_v = simd::unpacklo_u64_as_u32_4(n01lo, n23lo); // {y1_s0, y1_s1, y1_s2, y1_s3}
247 simd::simd_u32x4 y1_c_v = simd::unpacklo_u64_as_u32_4(n01hi, n23hi); // {y1_c0, y1_c1, y1_c2, y1_c3}
248
249 // ========== Phase 3: SIMD quadratic interpolation ==========
250 // Vectorized version of scalar interpolation formula from sin32_interp()
251 // P(t) = y0 + T*(m0 + T*(y1 - y0 - m0)) where T = t/65536
252 // Implemented as: c = y1 - y0 - m0; r = c*T + m0; result = r*T + y0
253 //
254 // Optimization: Uses mulhi_su32_4 (signed × unsigned-positive, 11 ops on SSE2)
255 // instead of mulhi_i32_4 (14 ops on SSE2). Safe because t_vec is always in [0, 65535].
256
257 // Sin interpolation
258 // Apply derivative mask: conditionally negate m0 for mirrored quadrants
259 m0_s_v = simd::sub_i32_4(simd::xor_u32_4(m0_s_v, sdmask_vec), sdmask_vec); // (m0 ^ mask) - mask
260 // Compute quadratic coefficient: c = y1 - y0 - m0
261 simd::simd_u32x4 c_s = simd::sub_i32_4(simd::sub_i32_4(y1_s_v, y0_s_v), m0_s_v);
262 // First multiply: r = c*t/65536 + m0 (linear term + derivative)
263 simd::simd_u32x4 r_s = simd::add_i32_4(simd::mulhi_su32_4(c_s, t_vec), m0_s_v);
264 // Second multiply: result = r*t/65536 + y0 (quadratic interpolation complete)
265 simd::simd_u32x4 s_raw = simd::add_i32_4(simd::mulhi_su32_4(r_s, t_vec), y0_s_v);
266
267 // Cos interpolation (identical structure to sin, but uses cos-specific masks and data)
268 m0_c_v = simd::sub_i32_4(simd::xor_u32_4(m0_c_v, cdmask_vec), cdmask_vec); // Conditionally negate m0
269 simd::simd_u32x4 c_c = simd::sub_i32_4(simd::sub_i32_4(y1_c_v, y0_c_v), m0_c_v); // c = y1 - y0 - m0
270 simd::simd_u32x4 r_c = simd::add_i32_4(simd::mulhi_su32_4(c_c, t_vec), m0_c_v); // r = c*t/65536 + m0
271 simd::simd_u32x4 c_raw = simd::add_i32_4(simd::mulhi_su32_4(r_c, t_vec), y0_c_v); // result = r*t/65536 + y0
272
273 // ========== Apply final sign masks ==========
274 // Use branchless negate: (val ^ mask) - mask
275 // If mask = 0x00000000: val unchanged
276 // If mask = 0xFFFFFFFF: val negated (two's complement)
277 SinCos32_simd result;
278 result.sin_vals = simd::sub_i32_4(simd::xor_u32_4(s_raw, svmask_vec), svmask_vec); // Negate if quadrant 2 or 3
279 result.cos_vals = simd::sub_i32_4(simd::xor_u32_4(c_raw, cvmask_vec), cvmask_vec); // Negate if quadrant 1 or 2
280 return result;
281}
platforms::simd_u32x4 simd_u32x4
Definition types.h:26
const i32 sinCosPairedLut[]
expected< T, E > result
Alias for expected (Rust-style naming)
Definition result.h:31

References FASTLED_FORCE_INLINE, FL_NOEXCEPT, and sinCosPairedLut.

Referenced by fl::Chasing_Spirals_Q31::draw(), and fl::anonymous_namespace{chasing_spirals.cpp.hpp}::simd4_processChannel().

+ Here is the caller graph for this function: