FastLED 3.9.15
Loading...
Searching...
No Matches
wave8.hpp
Go to the documentation of this file.
1
6
7#pragma once
8
9#include "fl/channels/wave8.h"
12#include "fl/stl/isr/memcpy.h"
13#include "fl/stl/bit_cast.h"
14
16
17namespace fl {
18
19namespace detail {
20
21// ============================================================================
22// Lookup Tables
23// ============================================================================
24
25// 2-lane LUT: Spreads 4 bits into 2-lane interleaved positions (nibble → byte)
26constexpr u8 kTranspose4_16_LUT[16] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11,
27 0x14, 0x15, 0x40, 0x41, 0x44, 0x45,
28 0x50, 0x51, 0x54, 0x55};
29
30// 4-lane LUT: Spreads 2 bits into 4-lane interleaved positions (2-bit → byte)
31// Maps [0b00, 0b01, 0b10, 0b11] → bit patterns at lane positions
32// For lane N: bits placed at positions (bit*4 + N)
33constexpr u8 kTranspose2_4_LUT[4] = {
34 0x00, // 0b00 → no bits set
35 0x01, // 0b01 → bit at position 0 (pulse 1)
36 0x10, // 0b10 → bit at position 4 (pulse 0)
37 0x11 // 0b11 → bits at positions 0 and 4 (both pulses)
38};
39
40// ============================================================================
41// Byte to Wave8Byte Conversion (Force Inline Helper)
42// ============================================================================
43
48 const Wave8BitExpansionLut &lut,
49 Wave8Byte *output) {
50 // ISR-optimized copy: Copy high nibble (4 bytes = 1 x uint32_t)
51 const Wave8Bit *high_nibble_data = lut.lut[(byte_value >> 4) & 0xF];
52 isr::memcpy_32(fl::bit_cast_ptr<u32>(&output->symbols[0]),
53 fl::bit_cast_ptr<const u32>(high_nibble_data),
54 1); // 4 bytes = 1 x uint32_t
55
56 // ISR-optimized copy: Copy low nibble (4 bytes = 1 x uint32_t)
57 const Wave8Bit *low_nibble_data = lut.lut[byte_value & 0xF];
58 isr::memcpy_32(fl::bit_cast_ptr<u32>(&output->symbols[4]),
59 fl::bit_cast_ptr<const u32>(low_nibble_data),
60 1); // 4 bytes = 1 x uint32_t
61}
62
69void wave8_expand_byte(u8 byte_value,
70 const Wave8ByteExpansionLut &lut,
71 Wave8Byte *output) {
73 fl::bit_cast_ptr<const u32>(&lut.lut[byte_value]),
74 2); // 8 bytes = 2 x uint32_t
75}
76
77// ============================================================================
78// 2-Lane Transposition Helper Macro
79// ============================================================================
80
// Bit-interleave two lane bytes into one 16-bit word and OR the result into
// `out_16`. Bit i of `lane_u8_1` lands on even bit 2*i; bit i of `lane_u8_0`
// lands on odd bit 2*i + 1. Each lane argument is evaluated exactly once.
#define FL_WAVE8_SPREAD_TO_16(lane_u8_0, lane_u8_1, out_16)                    \
    do {                                                                       \
        const u8 fl_w8_odd_src_ = static_cast<u8>(lane_u8_0);                  \
        const u8 fl_w8_even_src_ = static_cast<u8>(lane_u8_1);                 \
        const u16 fl_w8_even_bits_ = static_cast<u16>(                         \
            static_cast<u16>(                                                  \
                ::fl::detail::kTranspose4_16_LUT[fl_w8_even_src_ & 0x0Fu]) |   \
            (static_cast<u16>(                                                 \
                 ::fl::detail::kTranspose4_16_LUT[fl_w8_even_src_ >> 4])       \
             << 8));                                                           \
        const u16 fl_w8_odd_bits_ = static_cast<u16>(                          \
            (static_cast<u16>(                                                 \
                 ::fl::detail::kTranspose4_16_LUT[fl_w8_odd_src_ & 0x0Fu]) |   \
             (static_cast<u16>(                                                \
                  ::fl::detail::kTranspose4_16_LUT[fl_w8_odd_src_ >> 4])       \
              << 8))                                                           \
            << 1);                                                             \
        (out_16) |= static_cast<u16>(fl_w8_even_bits_ | fl_w8_odd_bits_);      \
    } while (0)
94
95// ============================================================================
96// 2-Lane Transposition (Force Inline)
97// ============================================================================
98
103void wave8_transpose_2(const Wave8Byte lane_waves[2],
104 u8 output[2 * sizeof(Wave8Byte)]) {
105 for (int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
106 u16 interleaved = 0;
107 // NOTE: FL_WAVE8_SPREAD_TO_16 macro treats first param as ODD bits, second as EVEN bits
108 // This matches wave8Untranspose_2 expectations: lane[0]→odd, lane[1]→even
109 FL_WAVE8_SPREAD_TO_16(lane_waves[0].symbols[symbol_idx].data,
110 lane_waves[1].symbols[symbol_idx].data,
111 interleaved);
112
113 output[symbol_idx * 2] = (u8)(interleaved >> 8); // High byte first (MSB pulses 4-7)
114 output[symbol_idx * 2 + 1] = (u8)(interleaved & 0xFF); // Low byte second (LSB pulses 0-3)
115 }
116}
117
118// ============================================================================
119// 4-Lane Transposition (Force Inline)
120// ============================================================================
121
126void wave8_transpose_4(const Wave8Byte lane_waves[4],
127 u8 output[4 * sizeof(Wave8Byte)]) {
128 // Each symbol (Wave8Bit) has 8 pulses
129 // With 4 lanes, we produce 4 bytes per symbol (2 pulses per byte × 4 lanes)
130 // Output format: [L3_P7, L2_P7, L1_P7, L0_P7, L3_P6, L2_P6, L1_P6, L0_P6, ...]
131 //
132 // OPTIMIZED VERSION: Fully unrolled direct extraction (4.0x speedup vs baseline)
133 // Based on successful 16-lane pattern that achieved 8x speedup
134 // Eliminates triple-nested loops by explicitly extracting and packing bits
135
136 // Process each symbol (8 iterations)
137 for (int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
138 // Pre-load all 4 lane bytes into registers
139 u8 l0 = lane_waves[0].symbols[symbol_idx].data;
140 u8 l1 = lane_waves[1].symbols[symbol_idx].data;
141 u8 l2 = lane_waves[2].symbols[symbol_idx].data;
142 u8 l3 = lane_waves[3].symbols[symbol_idx].data;
143
144 // Explicitly construct all 4 output bytes
145 // Each output byte contains 2 pulses from all 4 lanes
146 // Bit layout: [L3_hi, L2_hi, L1_hi, L0_hi, L3_lo, L2_lo, L1_lo, L0_lo]
147
148 // Byte 0: pulses 7 (hi) and 6 (lo)
149 output[symbol_idx * 4 + 0] =
150 ((l3 >> 7) & 1) << 7 |
151 ((l2 >> 7) & 1) << 6 |
152 ((l1 >> 7) & 1) << 5 |
153 ((l0 >> 7) & 1) << 4 |
154 ((l3 >> 6) & 1) << 3 |
155 ((l2 >> 6) & 1) << 2 |
156 ((l1 >> 6) & 1) << 1 |
157 ((l0 >> 6) & 1);
158
159 // Byte 1: pulses 5 (hi) and 4 (lo)
160 output[symbol_idx * 4 + 1] =
161 ((l3 >> 5) & 1) << 7 |
162 ((l2 >> 5) & 1) << 6 |
163 ((l1 >> 5) & 1) << 5 |
164 ((l0 >> 5) & 1) << 4 |
165 ((l3 >> 4) & 1) << 3 |
166 ((l2 >> 4) & 1) << 2 |
167 ((l1 >> 4) & 1) << 1 |
168 ((l0 >> 4) & 1);
169
170 // Byte 2: pulses 3 (hi) and 2 (lo)
171 output[symbol_idx * 4 + 2] =
172 ((l3 >> 3) & 1) << 7 |
173 ((l2 >> 3) & 1) << 6 |
174 ((l1 >> 3) & 1) << 5 |
175 ((l0 >> 3) & 1) << 4 |
176 ((l3 >> 2) & 1) << 3 |
177 ((l2 >> 2) & 1) << 2 |
178 ((l1 >> 2) & 1) << 1 |
179 ((l0 >> 2) & 1);
180
181 // Byte 3: pulses 1 (hi) and 0 (lo)
182 output[symbol_idx * 4 + 3] =
183 ((l3 >> 1) & 1) << 7 |
184 ((l2 >> 1) & 1) << 6 |
185 ((l1 >> 1) & 1) << 5 |
186 ((l0 >> 1) & 1) << 4 |
187 ((l3 >> 0) & 1) << 3 |
188 ((l2 >> 0) & 1) << 2 |
189 ((l1 >> 0) & 1) << 1 |
190 ((l0 >> 0) & 1);
191 }
192}
193
194// ============================================================================
195// 8-Lane Transposition (Force Inline)
196// ============================================================================
197
205void wave8_transpose_8(const Wave8Byte lane_waves[8],
206 u8 output[8 * sizeof(Wave8Byte)]) {
207 for (int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
208 u8 l[8];
209 for (int lane = 0; lane < 8; lane++) {
210 l[lane] = lane_waves[lane].symbols[symbol_idx].data;
211 }
212 spread_transpose8_symbol(l, output + symbol_idx * 8);
213 }
214}
215
216// ============================================================================
217// 16-Lane Transposition (Force Inline)
218// ============================================================================
219
231void wave8_transpose_16(const Wave8Byte lane_waves[16],
232 u8 output[16 * sizeof(Wave8Byte)]) {
233 for (int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
234 u8 l[16];
235 for (int lane = 0; lane < 16; lane++) {
236 l[lane] = lane_waves[lane].symbols[symbol_idx].data;
237 }
238 spread_transpose16_symbol(l, output + symbol_idx * 16);
239 }
240}
241
249void wave8_transpose_16x2_pipe2(const Wave8Byte lane_waves_a[16],
250 const Wave8Byte lane_waves_b[16],
251 u8 output_a[16 * sizeof(Wave8Byte)],
252 u8 output_b[16 * sizeof(Wave8Byte)]) {
253 for (int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
254 u8 la[16];
255 u8 lb[16];
256 for (int lane = 0; lane < 16; lane++) {
257 la[lane] = lane_waves_a[lane].symbols[symbol_idx].data;
258 lb[lane] = lane_waves_b[lane].symbols[symbol_idx].data;
259 }
260 spread_transpose16_symbol(la, output_a + symbol_idx * 16);
261 spread_transpose16_symbol(lb, output_b + symbol_idx * 16);
262 }
263}
264
273void wave8_transpose_16x4_pipe4(const Wave8Byte lane_waves_a[16],
274 const Wave8Byte lane_waves_b[16],
275 const Wave8Byte lane_waves_c[16],
276 const Wave8Byte lane_waves_d[16],
277 u8 output_a[16 * sizeof(Wave8Byte)],
278 u8 output_b[16 * sizeof(Wave8Byte)],
279 u8 output_c[16 * sizeof(Wave8Byte)],
280 u8 output_d[16 * sizeof(Wave8Byte)]) {
281 for (int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
282 u8 la[16];
283 u8 lb[16];
284 u8 lc[16];
285 u8 ld[16];
286 for (int lane = 0; lane < 16; lane++) {
287 la[lane] = lane_waves_a[lane].symbols[symbol_idx].data;
288 lb[lane] = lane_waves_b[lane].symbols[symbol_idx].data;
289 lc[lane] = lane_waves_c[lane].symbols[symbol_idx].data;
290 ld[lane] = lane_waves_d[lane].symbols[symbol_idx].data;
291 }
292 spread_transpose16_symbol(la, output_a + symbol_idx * 16);
293 spread_transpose16_symbol(lb, output_b + symbol_idx * 16);
294 spread_transpose16_symbol(lc, output_c + symbol_idx * 16);
295 spread_transpose16_symbol(ld, output_d + symbol_idx * 16);
296 }
297}
298
315void wave8_transpose_16_bf1(const u8 lanes[16],
316 u8 W0, u8 W1,
317 u8 output[16 * sizeof(Wave8Byte)]) {
318 u8 d_mask[8];
319 u8 m0_mask[8];
320 const u8 D_byte = W0 ^ W1;
321 for (int p = 0; p < 8; ++p) {
322 const int shift = 7 - p;
323 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
324 m0_mask[p] = ((W0 >> shift) & 1) ? 0xFFu : 0x00u;
325 }
326 // Bit-transpose: spread_transpose16_symbol on input bytes gives
327 // cols[2s+h] = byte where bit L = bit(7-s) of lanes[L+8h].
328 u8 cols[16];
329 spread_transpose16_symbol(lanes, cols);
330 for (int s = 0; s < 8; ++s) {
331 const u8 col_lo = cols[2 * s + 0];
332 const u8 col_hi = cols[2 * s + 1];
333 for (int p = 0; p < 8; ++p) {
334 output[s * 16 + p * 2 + 0] = m0_mask[p] ^ (col_lo & d_mask[p]);
335 output[s * 16 + p * 2 + 1] = m0_mask[p] ^ (col_hi & d_mask[p]);
336 }
337 }
338}
339
345void wave8_transpose_8_bf1(const u8 lanes[8],
346 u8 W0, u8 W1,
347 u8 output[8 * sizeof(Wave8Byte)]) {
348 u8 d_mask[8];
349 u8 m0_mask[8];
350 const u8 D_byte = W0 ^ W1;
351 for (int p = 0; p < 8; ++p) {
352 const int shift = 7 - p;
353 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
354 m0_mask[p] = ((W0 >> shift) & 1) ? 0xFFu : 0x00u;
355 }
356 u8 cols[8];
357 spread_transpose8_symbol(lanes, cols);
358 for (int s = 0; s < 8; ++s) {
359 const u8 col = cols[s];
360 for (int p = 0; p < 8; ++p) {
361 output[s * 8 + p] = m0_mask[p] ^ (col & d_mask[p]);
362 }
363 }
364}
365
370void wave8_transpose_4_bf1(const u8 lanes[4],
371 u8 W0, u8 W1,
372 u8 output[4 * sizeof(Wave8Byte)]) {
373 u8 d_mask[8];
374 u8 m0_mask[8];
375 const u8 D_byte = W0 ^ W1;
376 for (int p = 0; p < 8; ++p) {
377 const int shift = 7 - p;
378 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
379 m0_mask[p] = ((W0 >> shift) & 1) ? 0xFFu : 0x00u;
380 }
381 // Bit transpose: cols[s] (low nibble) = bit(7-s) of lanes 0..3.
382 // spreadA(lanes[L]) puts the 4 high-nibble bits of lanes[L] into bit 0 of
383 // 4 separate bytes; OR'd across lanes with shifts gives 4 bytes (cols[0..3]).
384 const u32 aLo = spreadA(lanes[0]) | (spreadA(lanes[1]) << 1)
385 | (spreadA(lanes[2]) << 2) | (spreadA(lanes[3]) << 3);
386 const u32 bLo = spreadB(lanes[0]) | (spreadB(lanes[1]) << 1)
387 | (spreadB(lanes[2]) << 2) | (spreadB(lanes[3]) << 3);
388 u8 cols[8];
389 cols[0] = static_cast<u8>(aLo);
390 cols[1] = static_cast<u8>(aLo >> 8);
391 cols[2] = static_cast<u8>(aLo >> 16);
392 cols[3] = static_cast<u8>(aLo >> 24);
393 cols[4] = static_cast<u8>(bLo);
394 cols[5] = static_cast<u8>(bLo >> 8);
395 cols[6] = static_cast<u8>(bLo >> 16);
396 cols[7] = static_cast<u8>(bLo >> 24);
397 for (int s = 0; s < 8; ++s) {
398 const u8 col = cols[s]; // bits 0..3 = lanes 0..3
399 for (int k = 0; k < 4; ++k) {
400 const int p_hi = 2 * k;
401 const int p_lo = 2 * k + 1;
402 const u8 hi = static_cast<u8>((m0_mask[p_hi] & 0xF0u) ^ ((col << 4) & d_mask[p_hi]));
403 const u8 lo = static_cast<u8>((m0_mask[p_lo] & 0x0Fu) ^ (col & d_mask[p_lo]));
404 output[s * 4 + k] = static_cast<u8>(hi | lo);
405 }
406 }
407}
408
417void wave8_transpose_2_bf1(const u8 lanes[2],
418 u8 W0, u8 W1,
419 u8 output[2 * sizeof(Wave8Byte)]) {
420 u8 d_mask[8];
421 u8 m0_mask[8];
422 const u8 D_byte = W0 ^ W1;
423 for (int p = 0; p < 8; ++p) {
424 const int shift = 7 - p;
425 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
426 m0_mask[p] = ((W0 >> shift) & 1) ? 0xFFu : 0x00u;
427 }
428 for (int s = 0; s < 8; ++s) {
429 const int bit_idx = 7 - s;
430 const u8 b0 = static_cast<u8>((lanes[0] >> bit_idx) & 1u);
431 const u8 b1 = static_cast<u8>((lanes[1] >> bit_idx) & 1u);
432 u8 byte_hi = 0;
433 u8 byte_lo = 0;
434 for (int q = 0; q < 4; ++q) {
435 // Reference `wave8_transpose_2` lands chipset-byte bit (4+q) at
436 // high-byte bit position (2*q) for lane 1 (even) / (2*q+1) for
437 // lane 0 (odd). Chipset-byte bit (4+q) corresponds to bf1
438 // pulse index (3 - q) because bf1 numbers pulse p ↔ chipset bit
439 // (7 - p). Similarly the low byte uses chipset bits 0..3, which
440 // map to bf1 pulses 7..4.
441 const int p_hi = 3 - q;
442 const int p_lo = 7 - q;
443 const u8 m0_p_hi = static_cast<u8>(m0_mask[p_hi] & 1u);
444 const u8 d_p_hi = static_cast<u8>(d_mask[p_hi] & 1u);
445 const u8 m0_p_lo = static_cast<u8>(m0_mask[p_lo] & 1u);
446 const u8 d_p_lo = static_cast<u8>(d_mask[p_lo] & 1u);
447 const u8 v0_hi = static_cast<u8>(m0_p_hi ^ (b0 & d_p_hi));
448 const u8 v1_hi = static_cast<u8>(m0_p_hi ^ (b1 & d_p_hi));
449 const u8 v0_lo = static_cast<u8>(m0_p_lo ^ (b0 & d_p_lo));
450 const u8 v1_lo = static_cast<u8>(m0_p_lo ^ (b1 & d_p_lo));
451 byte_hi |= static_cast<u8>((v1_hi << (2 * q)) | (v0_hi << (2 * q + 1)));
452 byte_lo |= static_cast<u8>((v1_lo << (2 * q)) | (v0_lo << (2 * q + 1)));
453 }
454 output[s * 2 + 0] = byte_hi;
455 output[s * 2 + 1] = byte_lo;
456 }
457}
458
464void wave8_transpose_16x4_bf1_pipe4(const u8 lanes_a[16],
465 const u8 lanes_b[16],
466 const u8 lanes_c[16],
467 const u8 lanes_d[16],
468 u8 W0, u8 W1,
469 u8 output_a[16 * sizeof(Wave8Byte)],
470 u8 output_b[16 * sizeof(Wave8Byte)],
471 u8 output_c[16 * sizeof(Wave8Byte)],
472 u8 output_d[16 * sizeof(Wave8Byte)]) {
473 u8 d_mask[8];
474 u8 m0_mask[8];
475 const u8 D_byte = W0 ^ W1;
476 for (int p = 0; p < 8; ++p) {
477 const int shift = 7 - p;
478 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
479 m0_mask[p] = ((W0 >> shift) & 1) ? 0xFFu : 0x00u;
480 }
481 u8 cols_a[16], cols_b[16], cols_c[16], cols_d[16];
482 spread_transpose16_symbol(lanes_a, cols_a);
483 spread_transpose16_symbol(lanes_b, cols_b);
484 spread_transpose16_symbol(lanes_c, cols_c);
485 spread_transpose16_symbol(lanes_d, cols_d);
486 for (int s = 0; s < 8; ++s) {
487 const u8 al = cols_a[2*s + 0], ah = cols_a[2*s + 1];
488 const u8 bl = cols_b[2*s + 0], bh = cols_b[2*s + 1];
489 const u8 cl = cols_c[2*s + 0], ch = cols_c[2*s + 1];
490 const u8 dl = cols_d[2*s + 0], dh = cols_d[2*s + 1];
491 for (int p = 0; p < 8; ++p) {
492 const u8 dm = d_mask[p], mm = m0_mask[p];
493 output_a[s*16 + p*2 + 0] = mm ^ (al & dm);
494 output_a[s*16 + p*2 + 1] = mm ^ (ah & dm);
495 output_b[s*16 + p*2 + 0] = mm ^ (bl & dm);
496 output_b[s*16 + p*2 + 1] = mm ^ (bh & dm);
497 output_c[s*16 + p*2 + 0] = mm ^ (cl & dm);
498 output_c[s*16 + p*2 + 1] = mm ^ (ch & dm);
499 output_d[s*16 + p*2 + 0] = mm ^ (dl & dm);
500 output_d[s*16 + p*2 + 1] = mm ^ (dh & dm);
501 }
502 }
503}
504
505} // namespace detail
506
507// ============================================================================
508// Public wave8() Function (Force Inline)
509// ============================================================================
510
514void wave8(u8 lane,
515 const Wave8BitExpansionLut &lut,
516 u8 (&FL_RESTRICT_PARAM output)[sizeof(Wave8Byte)]) {
517 // Convert single lane byte to wave pulse symbols (8 bytes packed)
518 // Use properly aligned local variable to avoid alignment issues
519 Wave8Byte waveformSymbol;
520 detail::wave8_convert_byte_to_wave8byte(lane, lut, &waveformSymbol);
521
522 // ISR-optimized 32-bit copy: Copy 8 bytes as 2 x uint32_t words
523 // Wave8Byte is 8-byte aligned (FL_ALIGNAS(8)), guaranteeing 4-byte alignment
525 fl::bit_cast_ptr<const u32>(&waveformSymbol.symbols[0].data),
526 2); // 8 bytes = 2 x uint32_t
527}
528
529} // namespace fl
530
Shared u32 "spread LUT" bit-matrix transpose primitive (no SIMD, no u64).
ISR-safe memory operations (inline, header-only)
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_4_bf1(const u8 lanes[4], u8 W0, u8 W1, u8 output[4 *sizeof(Wave8Byte)])
BF1 for 4-lane Wave8.
Definition wave8.hpp:370
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_2_bf1(const u8 lanes[2], u8 W0, u8 W1, u8 output[2 *sizeof(Wave8Byte)])
BF1 for 2-lane Wave8.
Definition wave8.hpp:417
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16(const Wave8Byte lane_waves[16], u8 output[16 *sizeof(Wave8Byte)])
Transpose 16 lanes of Wave8Byte data into interleaved format.
Definition wave8.hpp:231
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_8_bf1(const u8 lanes[8], u8 W0, u8 W1, u8 output[8 *sizeof(Wave8Byte)])
BF1 for 8-lane Wave8 — same algebraic identity as 16-lane BF1.
Definition wave8.hpp:345
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_convert_byte_to_wave8byte(u8 byte_value, const Wave8BitExpansionLut &lut, Wave8Byte *output)
Helper: Convert byte to Wave8Byte using nibble LUT (internal use only)
Definition wave8.hpp:47
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_8(const Wave8Byte lane_waves[8], u8 output[8 *sizeof(Wave8Byte)])
Transpose 8 lanes of Wave8Byte data into interleaved format.
Definition wave8.hpp:205
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void spread_transpose16_symbol(const u8 l[16], u8 out[16])
Transpose one symbol of 16 lanes (16 input bytes) into 16 output bytes: 8 pulses × 2 bytes,...
constexpr u8 kTranspose2_4_LUT[4]
Definition wave8.hpp:33
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16x4_pipe4(const Wave8Byte lane_waves_a[16], const Wave8Byte lane_waves_b[16], const Wave8Byte lane_waves_c[16], const Wave8Byte lane_waves_d[16], u8 output_a[16 *sizeof(Wave8Byte)], u8 output_b[16 *sizeof(Wave8Byte)], u8 output_c[16 *sizeof(Wave8Byte)], u8 output_d[16 *sizeof(Wave8Byte)])
Pipe4: transpose 16-lane × 4-byte-positions in one fused call.
Definition wave8.hpp:273
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_2(const Wave8Byte lane_waves[2], u8 output[2 *sizeof(Wave8Byte)])
Transpose 2 lanes of Wave8Byte data into interleaved format.
Definition wave8.hpp:103
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_expand_byte(u8 byte_value, const Wave8ByteExpansionLut &lut, Wave8Byte *output)
Byte-indexed expansion (#2526): one indexed 8-byte copy.
Definition wave8.hpp:69
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16x2_pipe2(const Wave8Byte lane_waves_a[16], const Wave8Byte lane_waves_b[16], u8 output_a[16 *sizeof(Wave8Byte)], u8 output_b[16 *sizeof(Wave8Byte)])
Pipe2: transpose 16-lane × 2-byte-positions in one fused call.
Definition wave8.hpp:249
FASTLED_FORCE_INLINE u32 spreadA(u8 v)
Pulses 7,6,5,4 of v (byte j = bit (7-j)). Depends only on the high nibble.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void spread_transpose8_symbol(const u8 l[8], u8 out[8])
Transpose one symbol of 8 lanes (8 input bytes) into 8 output bytes: 8 pulses × 1 byte (bit L = lane ...
FASTLED_FORCE_INLINE u32 spreadB(u8 v)
Pulses 3,2,1,0 of v (byte j = bit (3-j)). Depends only on the low nibble.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_4(const Wave8Byte lane_waves[4], u8 output[4 *sizeof(Wave8Byte)])
Transpose 4 lanes of Wave8Byte data into interleaved format.
Definition wave8.hpp:126
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16_bf1(const u8 lanes[16], u8 W0, u8 W1, u8 output[16 *sizeof(Wave8Byte)])
BF1: chipset-aware direct encode for Wave8 16-lane (#2548 deep-dive).
Definition wave8.hpp:315
constexpr u8 kTranspose4_16_LUT[16]
Definition wave8.hpp:26
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16x4_bf1_pipe4(const u8 lanes_a[16], const u8 lanes_b[16], const u8 lanes_c[16], const u8 lanes_d[16], u8 W0, u8 W1, u8 output_a[16 *sizeof(Wave8Byte)], u8 output_b[16 *sizeof(Wave8Byte)], u8 output_c[16 *sizeof(Wave8Byte)], u8 output_d[16 *sizeof(Wave8Byte)])
BF1 + pipe4: 4-position software-pipelined BF1 (#2548 deep-dive).
Definition wave8.hpp:464
Compile-time linker keep-alive hook for a single fl::Bus.
Definition bus_traits.h:48
FL_OPTIMIZE_FUNCTION FL_IRAM FASTLED_FORCE_INLINE void memcpy_32(u32 *FL_RESTRICT_PARAM dst, const u32 *FL_RESTRICT_PARAM src, size_t count)
ISR-optimized 32-bit block copy for 4-byte aligned memory.
Definition memcpy.h:32
unsigned char u8
Definition stdint.h:131
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8(u8 lane, const Wave8BitExpansionLut &lut, u8(&FL_RESTRICT_PARAM output)[sizeof(Wave8Byte)])
Convert byte to 8 Wave8Bit structures using nibble LUT.
Definition wave8.hpp:514
To * bit_cast_ptr(void *storage) FL_NOEXCEPT
Definition bit_cast.h:60
@ W1
White is second.
Definition eorder.h:26
@ W0
White is first.
Definition eorder.h:27
Base definition for an LED controller.
Definition crgb.hpp:179
Type-safe container for packed 8-bit wave pulse pattern.
Definition wave8.h:22
#define FL_OPTIMIZATION_LEVEL_O3_BEGIN
#define FASTLED_FORCE_INLINE
#define FL_OPTIMIZATION_LEVEL_O3_END
#define FL_OPTIMIZE_FUNCTION
#define FL_IRAM
#define FL_RESTRICT_PARAM
#define FL_WAVE8_SPREAD_TO_16(lane_u8_0, lane_u8_1, out_16)
Definition wave8.hpp:81