27 0x14, 0x15, 0x40, 0x41, 0x44, 0x45,
28 0x50, 0x51, 0x54, 0x55};
48 const Wave8BitExpansionLut &lut,
51 const Wave8Bit *high_nibble_data = lut.lut[(byte_value >> 4) & 0xF];
57 const Wave8Bit *low_nibble_data = lut.lut[byte_value & 0xF];
70 const Wave8ByteExpansionLut &lut,
81#define FL_WAVE8_SPREAD_TO_16(lane_u8_0, lane_u8_1, out_16) \
83 const u8 _a = (u8)(lane_u8_0); \
84 const u8 _b = (u8)(lane_u8_1); \
86 (u16)((u16)::fl::detail::kTranspose4_16_LUT[_b & 0x0Fu] | \
87 ((u16)::fl::detail::kTranspose4_16_LUT[_b >> 4] << 8)); \
89 (u16)(((u16)::fl::detail::kTranspose4_16_LUT[_a & 0x0Fu] | \
90 ((u16)::fl::detail::kTranspose4_16_LUT[_a >> 4] << 8)) \
92 (out_16) |= (u16)(_even | _odd); \
104 u8 output[2 *
sizeof(Wave8Byte)]) {
105 for (
int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
110 lane_waves[1].symbols[symbol_idx].data,
113 output[symbol_idx * 2] = (
u8)(interleaved >> 8);
114 output[symbol_idx * 2 + 1] = (
u8)(interleaved & 0xFF);
127 u8 output[4 *
sizeof(Wave8Byte)]) {
137 for (
int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
139 u8 l0 = lane_waves[0].symbols[symbol_idx].data;
140 u8 l1 = lane_waves[1].symbols[symbol_idx].data;
141 u8 l2 = lane_waves[2].symbols[symbol_idx].data;
142 u8 l3 = lane_waves[3].symbols[symbol_idx].data;
149 output[symbol_idx * 4 + 0] =
150 ((l3 >> 7) & 1) << 7 |
151 ((l2 >> 7) & 1) << 6 |
152 ((l1 >> 7) & 1) << 5 |
153 ((l0 >> 7) & 1) << 4 |
154 ((l3 >> 6) & 1) << 3 |
155 ((l2 >> 6) & 1) << 2 |
156 ((l1 >> 6) & 1) << 1 |
160 output[symbol_idx * 4 + 1] =
161 ((l3 >> 5) & 1) << 7 |
162 ((l2 >> 5) & 1) << 6 |
163 ((l1 >> 5) & 1) << 5 |
164 ((l0 >> 5) & 1) << 4 |
165 ((l3 >> 4) & 1) << 3 |
166 ((l2 >> 4) & 1) << 2 |
167 ((l1 >> 4) & 1) << 1 |
171 output[symbol_idx * 4 + 2] =
172 ((l3 >> 3) & 1) << 7 |
173 ((l2 >> 3) & 1) << 6 |
174 ((l1 >> 3) & 1) << 5 |
175 ((l0 >> 3) & 1) << 4 |
176 ((l3 >> 2) & 1) << 3 |
177 ((l2 >> 2) & 1) << 2 |
178 ((l1 >> 2) & 1) << 1 |
182 output[symbol_idx * 4 + 3] =
183 ((l3 >> 1) & 1) << 7 |
184 ((l2 >> 1) & 1) << 6 |
185 ((l1 >> 1) & 1) << 5 |
186 ((l0 >> 1) & 1) << 4 |
187 ((l3 >> 0) & 1) << 3 |
188 ((l2 >> 0) & 1) << 2 |
189 ((l1 >> 0) & 1) << 1 |
206 u8 output[8 *
sizeof(Wave8Byte)]) {
207 for (
int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
209 for (
int lane = 0; lane < 8; lane++) {
210 l[lane] = lane_waves[lane].symbols[symbol_idx].data;
232 u8 output[16 *
sizeof(Wave8Byte)]) {
233 for (
int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
235 for (
int lane = 0; lane < 16; lane++) {
236 l[lane] = lane_waves[lane].symbols[symbol_idx].data;
250 const Wave8Byte lane_waves_b[16],
251 u8 output_a[16 *
sizeof(Wave8Byte)],
252 u8 output_b[16 *
sizeof(Wave8Byte)]) {
253 for (
int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
256 for (
int lane = 0; lane < 16; lane++) {
257 la[lane] = lane_waves_a[lane].symbols[symbol_idx].data;
258 lb[lane] = lane_waves_b[lane].symbols[symbol_idx].data;
274 const Wave8Byte lane_waves_b[16],
275 const Wave8Byte lane_waves_c[16],
276 const Wave8Byte lane_waves_d[16],
277 u8 output_a[16 *
sizeof(Wave8Byte)],
278 u8 output_b[16 *
sizeof(Wave8Byte)],
279 u8 output_c[16 *
sizeof(Wave8Byte)],
280 u8 output_d[16 *
sizeof(Wave8Byte)]) {
281 for (
int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
286 for (
int lane = 0; lane < 16; lane++) {
287 la[lane] = lane_waves_a[lane].symbols[symbol_idx].data;
288 lb[lane] = lane_waves_b[lane].symbols[symbol_idx].data;
289 lc[lane] = lane_waves_c[lane].symbols[symbol_idx].data;
290 ld[lane] = lane_waves_d[lane].symbols[symbol_idx].data;
317 u8 output[16 *
sizeof(Wave8Byte)]) {
320 const u8 D_byte =
W0 ^
W1;
321 for (
int p = 0; p < 8; ++p) {
322 const int shift = 7 - p;
323 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
324 m0_mask[p] = ((
W0 >> shift) & 1) ? 0xFFu : 0x00u;
330 for (
int s = 0; s < 8; ++s) {
331 const u8 col_lo = cols[2 * s + 0];
332 const u8 col_hi = cols[2 * s + 1];
333 for (
int p = 0; p < 8; ++p) {
334 output[s * 16 + p * 2 + 0] = m0_mask[p] ^ (col_lo & d_mask[p]);
335 output[s * 16 + p * 2 + 1] = m0_mask[p] ^ (col_hi & d_mask[p]);
347 u8 output[8 *
sizeof(Wave8Byte)]) {
350 const u8 D_byte =
W0 ^
W1;
351 for (
int p = 0; p < 8; ++p) {
352 const int shift = 7 - p;
353 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
354 m0_mask[p] = ((
W0 >> shift) & 1) ? 0xFFu : 0x00u;
358 for (
int s = 0; s < 8; ++s) {
359 const u8 col = cols[s];
360 for (
int p = 0; p < 8; ++p) {
361 output[s * 8 + p] = m0_mask[p] ^ (col & d_mask[p]);
372 u8 output[4 *
sizeof(Wave8Byte)]) {
375 const u8 D_byte =
W0 ^
W1;
376 for (
int p = 0; p < 8; ++p) {
377 const int shift = 7 - p;
378 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
379 m0_mask[p] = ((
W0 >> shift) & 1) ? 0xFFu : 0x00u;
389 cols[0] =
static_cast<u8>(aLo);
390 cols[1] =
static_cast<u8>(aLo >> 8);
391 cols[2] =
static_cast<u8>(aLo >> 16);
392 cols[3] =
static_cast<u8>(aLo >> 24);
393 cols[4] =
static_cast<u8>(bLo);
394 cols[5] =
static_cast<u8>(bLo >> 8);
395 cols[6] =
static_cast<u8>(bLo >> 16);
396 cols[7] =
static_cast<u8>(bLo >> 24);
397 for (
int s = 0; s < 8; ++s) {
398 const u8 col = cols[s];
399 for (
int k = 0; k < 4; ++k) {
400 const int p_hi = 2 * k;
401 const int p_lo = 2 * k + 1;
402 const u8 hi =
static_cast<u8>((m0_mask[p_hi] & 0xF0u) ^ ((col << 4) & d_mask[p_hi]));
403 const u8 lo =
static_cast<u8>((m0_mask[p_lo] & 0x0Fu) ^ (col & d_mask[p_lo]));
404 output[s * 4 + k] =
static_cast<u8>(hi | lo);
419 u8 output[2 *
sizeof(Wave8Byte)]) {
422 const u8 D_byte =
W0 ^
W1;
423 for (
int p = 0; p < 8; ++p) {
424 const int shift = 7 - p;
425 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
426 m0_mask[p] = ((
W0 >> shift) & 1) ? 0xFFu : 0x00u;
428 for (
int s = 0; s < 8; ++s) {
429 const int bit_idx = 7 - s;
430 const u8 b0 =
static_cast<u8>((lanes[0] >> bit_idx) & 1u);
431 const u8 b1 =
static_cast<u8>((lanes[1] >> bit_idx) & 1u);
434 for (
int q = 0; q < 4; ++q) {
441 const int p_hi = 3 - q;
442 const int p_lo = 7 - q;
443 const u8 m0_p_hi =
static_cast<u8>(m0_mask[p_hi] & 1u);
444 const u8 d_p_hi =
static_cast<u8>(d_mask[p_hi] & 1u);
445 const u8 m0_p_lo =
static_cast<u8>(m0_mask[p_lo] & 1u);
446 const u8 d_p_lo =
static_cast<u8>(d_mask[p_lo] & 1u);
447 const u8 v0_hi =
static_cast<u8>(m0_p_hi ^ (b0 & d_p_hi));
448 const u8 v1_hi =
static_cast<u8>(m0_p_hi ^ (b1 & d_p_hi));
449 const u8 v0_lo =
static_cast<u8>(m0_p_lo ^ (b0 & d_p_lo));
450 const u8 v1_lo =
static_cast<u8>(m0_p_lo ^ (b1 & d_p_lo));
451 byte_hi |=
static_cast<u8>((v1_hi << (2 * q)) | (v0_hi << (2 * q + 1)));
452 byte_lo |=
static_cast<u8>((v1_lo << (2 * q)) | (v0_lo << (2 * q + 1)));
454 output[s * 2 + 0] = byte_hi;
455 output[s * 2 + 1] = byte_lo;
465 const u8 lanes_b[16],
466 const u8 lanes_c[16],
467 const u8 lanes_d[16],
469 u8 output_a[16 *
sizeof(Wave8Byte)],
470 u8 output_b[16 *
sizeof(Wave8Byte)],
471 u8 output_c[16 *
sizeof(Wave8Byte)],
472 u8 output_d[16 *
sizeof(Wave8Byte)]) {
475 const u8 D_byte =
W0 ^
W1;
476 for (
int p = 0; p < 8; ++p) {
477 const int shift = 7 - p;
478 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
479 m0_mask[p] = ((
W0 >> shift) & 1) ? 0xFFu : 0x00u;
481 u8 cols_a[16], cols_b[16], cols_c[16], cols_d[16];
486 for (
int s = 0; s < 8; ++s) {
487 const u8 al = cols_a[2*s + 0], ah = cols_a[2*s + 1];
488 const u8 bl = cols_b[2*s + 0], bh = cols_b[2*s + 1];
489 const u8 cl = cols_c[2*s + 0], ch = cols_c[2*s + 1];
490 const u8 dl = cols_d[2*s + 0], dh = cols_d[2*s + 1];
491 for (
int p = 0; p < 8; ++p) {
492 const u8 dm = d_mask[p], mm = m0_mask[p];
493 output_a[s*16 + p*2 + 0] = mm ^ (al & dm);
494 output_a[s*16 + p*2 + 1] = mm ^ (ah & dm);
495 output_b[s*16 + p*2 + 0] = mm ^ (bl & dm);
496 output_b[s*16 + p*2 + 1] = mm ^ (bh & dm);
497 output_c[s*16 + p*2 + 0] = mm ^ (cl & dm);
498 output_c[s*16 + p*2 + 1] = mm ^ (ch & dm);
499 output_d[s*16 + p*2 + 0] = mm ^ (dl & dm);
500 output_d[s*16 + p*2 + 1] = mm ^ (dh & dm);
515 const Wave8BitExpansionLut &lut,
519 Wave8Byte waveformSymbol;
Shared u32 "spread LUT" bit-matrix transpose primitive (no SIMD, no u64).
ISR-safe memory operations (inline, header-only)
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_4_bf1(const u8 lanes[4], u8 W0, u8 W1, u8 output[4 *sizeof(Wave8Byte)])
BF1 for 4-lane Wave8.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_2_bf1(const u8 lanes[2], u8 W0, u8 W1, u8 output[2 *sizeof(Wave8Byte)])
BF1 for 2-lane Wave8.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16(const Wave8Byte lane_waves[16], u8 output[16 *sizeof(Wave8Byte)])
Transpose 16 lanes of Wave8Byte data into interleaved format.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_8_bf1(const u8 lanes[8], u8 W0, u8 W1, u8 output[8 *sizeof(Wave8Byte)])
BF1 for 8-lane Wave8 — same algebraic identity as 16-lane BF1.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_convert_byte_to_wave8byte(u8 byte_value, const Wave8BitExpansionLut &lut, Wave8Byte *output)
Helper: Convert byte to Wave8Byte using nibble LUT (internal use only)
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_8(const Wave8Byte lane_waves[8], u8 output[8 *sizeof(Wave8Byte)])
Transpose 8 lanes of Wave8Byte data into interleaved format.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void spread_transpose16_symbol(const u8 l[16], u8 out[16])
Transpose one symbol of 16 lanes (16 input bytes) into 16 output bytes: 8 pulses × 2 bytes,...
constexpr u8 kTranspose2_4_LUT[4]
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16x4_pipe4(const Wave8Byte lane_waves_a[16], const Wave8Byte lane_waves_b[16], const Wave8Byte lane_waves_c[16], const Wave8Byte lane_waves_d[16], u8 output_a[16 *sizeof(Wave8Byte)], u8 output_b[16 *sizeof(Wave8Byte)], u8 output_c[16 *sizeof(Wave8Byte)], u8 output_d[16 *sizeof(Wave8Byte)])
Pipe4: transpose 16-lane × 4-byte-positions in one fused call.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_2(const Wave8Byte lane_waves[2], u8 output[2 *sizeof(Wave8Byte)])
Transpose 2 lanes of Wave8Byte data into interleaved format.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_expand_byte(u8 byte_value, const Wave8ByteExpansionLut &lut, Wave8Byte *output)
Byte-indexed expansion (#2526): one indexed 8-byte copy.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16x2_pipe2(const Wave8Byte lane_waves_a[16], const Wave8Byte lane_waves_b[16], u8 output_a[16 *sizeof(Wave8Byte)], u8 output_b[16 *sizeof(Wave8Byte)])
Pipe2: transpose 16-lane × 2-byte-positions in one fused call.
FASTLED_FORCE_INLINE u32 spreadA(u8 v)
Pulses 7,6,5,4 of v (byte j = bit (7-j)). Depends only on the high nibble.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void spread_transpose8_symbol(const u8 l[8], u8 out[8])
Transpose one symbol of 8 lanes (8 input bytes) into 8 output bytes: 8 pulses × 1 byte (bit L = lane ...
FASTLED_FORCE_INLINE u32 spreadB(u8 v)
Pulses 3,2,1,0 of v (byte j = bit (3-j)). Depends only on the low nibble.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_4(const Wave8Byte lane_waves[4], u8 output[4 *sizeof(Wave8Byte)])
Transpose 4 lanes of Wave8Byte data into interleaved format.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16_bf1(const u8 lanes[16], u8 W0, u8 W1, u8 output[16 *sizeof(Wave8Byte)])
BF1: chipset-aware direct encode for Wave8 16-lane (#2548 deep-dive).
constexpr u8 kTranspose4_16_LUT[16]
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8_transpose_16x4_bf1_pipe4(const u8 lanes_a[16], const u8 lanes_b[16], const u8 lanes_c[16], const u8 lanes_d[16], u8 W0, u8 W1, u8 output_a[16 *sizeof(Wave8Byte)], u8 output_b[16 *sizeof(Wave8Byte)], u8 output_c[16 *sizeof(Wave8Byte)], u8 output_d[16 *sizeof(Wave8Byte)])
BF1 + pipe4: 4-position software-pipelined BF1 (#2548 deep-dive).
Compile-time linker keep-alive hook for a single fl::Bus.
FL_OPTIMIZE_FUNCTION FL_IRAM FASTLED_FORCE_INLINE void memcpy_32(u32 *FL_RESTRICT_PARAM dst, const u32 *FL_RESTRICT_PARAM src, size_t count)
ISR-optimized 32-bit block copy for 4-byte aligned memory.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void wave8(u8 lane, const Wave8BitExpansionLut &lut, u8(&FL_RESTRICT_PARAM output)[sizeof(Wave8Byte)])
Convert byte to 8 Wave8Bit structures using nibble LUT.
To * bit_cast_ptr(void *storage) FL_NOEXCEPT
Base definition for an LED controller.
Type-safe container for packed 8-bit wave pulse pattern.
#define FL_OPTIMIZATION_LEVEL_O3_BEGIN
#define FASTLED_FORCE_INLINE
#define FL_OPTIMIZATION_LEVEL_O3_END
#define FL_OPTIMIZE_FUNCTION
#define FL_RESTRICT_PARAM
#define FL_WAVE8_SPREAD_TO_16(lane_u8_0, lane_u8_1, out_16)