FastLED 3.9.15
Loading...
Searching...
No Matches
transposition.h
Go to the documentation of this file.
1#pragma once
2
23
25#include "fl/stl/int.h"
26#include "fl/stl/span.h"
27#include "fl/stl/optional.h"
28#include "fl/stl/cstring.h"
29#include "fl/stl/noexcept.h"
30
32
33namespace fl {
34
35// ============================================================================
36// Core 8x1 Bit Transpose Functions
37// ============================================================================
38
39// Note: These transpose functions are used across multiple platforms,
40// so they are defined for all targets
41
43typedef union {
45 struct {
46 fl::u32 a0:1;
47 fl::u32 a1:1;
48 fl::u32 a2:1;
49 fl::u32 a3:1;
50 fl::u32 a4:1;
51 fl::u32 a5:1;
52 fl::u32 a6:1;
53 fl::u32 a7:1;
54 };
55} just8bits;
56
58typedef struct {
59 fl::u32 a0:1;
60 fl::u32 a1:1;
61 fl::u32 a2:1;
62 fl::u32 a3:1;
63 fl::u32 a4:1;
64 fl::u32 a5:1;
65 fl::u32 a6:1;
66 fl::u32 a7:1;
67 fl::u32 b0:1;
68 fl::u32 b1:1;
69 fl::u32 b2:1;
70 fl::u32 b3:1;
71 fl::u32 b4:1;
72 fl::u32 b5:1;
73 fl::u32 b6:1;
74 fl::u32 b7:1;
75 fl::u32 c0:1;
76 fl::u32 c1:1;
77 fl::u32 c2:1;
78 fl::u32 c3:1;
79 fl::u32 c4:1;
80 fl::u32 c5:1;
81 fl::u32 c6:1;
82 fl::u32 c7:1;
83 fl::u32 d0:1;
84 fl::u32 d1:1;
85 fl::u32 d2:1;
86 fl::u32 d3:1;
87 fl::u32 d4:1;
88 fl::u32 d5:1;
89 fl::u32 d6:1;
90 fl::u32 d7:1;
91} sub4;
92
94typedef union {
95 fl::u32 word[2];
97 struct {
98 sub4 a;
99 sub4 b;
100 };
101} bitswap_type;
102
110void transpose8x1_noinline(unsigned char *A, unsigned char *B) FL_NOEXCEPT;
111
116FASTLED_FORCE_INLINE void transpose8x1(unsigned char *A, unsigned char *B) FL_NOEXCEPT {
117 fl::u32 x, y, t;
118
119 // Load the array and pack it into x and y.
120 y = *(fl::u32*)(A);
121 x = *(fl::u32*)(A+4);
122
123 // pre-transform x
124 t = (x ^ (x >> 7)) & 0x00AA00AA; x = x ^ t ^ (t << 7);
125 t = (x ^ (x >>14)) & 0x0000CCCC; x = x ^ t ^ (t <<14);
126
127 // pre-transform y
128 t = (y ^ (y >> 7)) & 0x00AA00AA; y = y ^ t ^ (t << 7);
129 t = (y ^ (y >>14)) & 0x0000CCCC; y = y ^ t ^ (t <<14);
130
131 // final transform
132 t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
133 y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
134 x = t;
135
136 *((u32*)B) = y;
137 *((u32*)(B+4)) = x;
138}
139
146FASTLED_FORCE_INLINE void transpose8x1_MSB(unsigned char *A, unsigned char *B) FL_NOEXCEPT {
147 fl::u32 x, y, t;
148
149 // Load the array and pack it into x and y.
150 y = *(fl::u32*)(A);
151 x = *(fl::u32*)(A+4);
152
153 // pre-transform x
154 t = (x ^ (x >> 7)) & 0x00AA00AA; x = x ^ t ^ (t << 7);
155 t = (x ^ (x >>14)) & 0x0000CCCC; x = x ^ t ^ (t <<14);
156
157 // pre-transform y
158 t = (y ^ (y >> 7)) & 0x00AA00AA; y = y ^ t ^ (t << 7);
159 t = (y ^ (y >>14)) & 0x0000CCCC; y = y ^ t ^ (t <<14);
160
161 // final transform
162 t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
163 y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
164 x = t;
165
166 B[7] = y; y >>= 8;
167 B[6] = y; y >>= 8;
168 B[5] = y; y >>= 8;
169 B[4] = y;
170
171 B[3] = x; x >>= 8;
172 B[2] = x; x >>= 8;
173 B[1] = x; x >>= 8;
174 B[0] = x;
175}
176
185template<int m, int n>
186FASTLED_FORCE_INLINE void transpose8(unsigned char *A, unsigned char *B) FL_NOEXCEPT {
187 fl::u32 x, y, t;
188
189 // Load the array and pack it into x and y.
190 if(m == 1) {
191 y = *(fl::u32*)(A);
192 x = *(fl::u32*)(A+4);
193 } else {
194 x = (fl::u32(A[0])<<24) | (fl::u32(A[m])<<16) | (fl::u32(A[2*m])<<8) | A[3*m];
195 y = (fl::u32(A[4*m])<<24) | (fl::u32(A[5*m])<<16) | (fl::u32(A[6*m])<<8) | A[7*m];
196 }
197
198 // pre-transform x
199 t = (x ^ (x >> 7)) & 0x00AA00AA; x = x ^ t ^ (t << 7);
200 t = (x ^ (x >>14)) & 0x0000CCCC; x = x ^ t ^ (t <<14);
201
202 // pre-transform y
203 t = (y ^ (y >> 7)) & 0x00AA00AA; y = y ^ t ^ (t << 7);
204 t = (y ^ (y >>14)) & 0x0000CCCC; y = y ^ t ^ (t <<14);
205
206 // final transform
207 t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
208 y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
209 x = t;
210
211 B[7*n] = y; y >>= 8;
212 B[6*n] = y; y >>= 8;
213 B[5*n] = y; y >>= 8;
214 B[4*n] = y;
215
216 B[3*n] = x; x >>= 8;
217 B[2*n] = x; x >>= 8;
218 B[n] = x; x >>= 8;
219 B[0] = x;
220}
221
222// ============================================================================
223// Low-Level ISR-Safe Transposition Primitives
224// ============================================================================
225
239inline void transpose_2lane_inline(
240 const u8* lane0_byte,
241 const u8* lane1_byte,
242 u8* output,
243 size_t num_bytes
245
258inline void transpose_4lane_inline(
259 const u8* const lanes[4],
260 u8* output,
261 size_t num_bytes
263
276inline void transpose_8lane_inline(
277 const u8* const lanes[8],
278 u8* output,
279 size_t num_bytes
281
294inline void transpose_16lane_inline(
295 const u8* const lanes[16],
296 u8* output,
297 size_t num_bytes
299
319template<typename TSource>
320inline void transpose_generic_inline(
321 const TSource* const lanes[],
322 size_t num_lanes,
323 u8* output,
324 size_t num_items
326
327// Implementation of inline ISR-safe primitives
328
330 const u8* lane0_byte,
331 const u8* lane1_byte,
332 u8* output,
333 size_t num_bytes
334) FL_NOEXCEPT {
335 for (size_t byte_idx = 0; byte_idx < num_bytes; byte_idx++) {
336 u8 a = lane0_byte[byte_idx];
337 u8 b = lane1_byte[byte_idx];
338
339 // dest[0] contains bit pairs for positions 7,6,5,4 (MSB first)
340 output[byte_idx * 2 + 0] =
341 ((a >> 7) & 0x01) << 0 | ((b >> 7) & 0x01) << 1 |
342 ((a >> 6) & 0x01) << 2 | ((b >> 6) & 0x01) << 3 |
343 ((a >> 5) & 0x01) << 4 | ((b >> 5) & 0x01) << 5 |
344 ((a >> 4) & 0x01) << 6 | ((b >> 4) & 0x01) << 7;
345
346 // dest[1] contains bit pairs for positions 3,2,1,0 (LSB)
347 output[byte_idx * 2 + 1] =
348 ((a >> 3) & 0x01) << 0 | ((b >> 3) & 0x01) << 1 |
349 ((a >> 2) & 0x01) << 2 | ((b >> 2) & 0x01) << 3 |
350 ((a >> 1) & 0x01) << 4 | ((b >> 1) & 0x01) << 5 |
351 ((a >> 0) & 0x01) << 6 | ((b >> 0) & 0x01) << 7;
352 }
353}
354
356 const u8* const lanes[4],
357 u8* output,
358 size_t num_bytes
359) FL_NOEXCEPT {
360 for (size_t byte_idx = 0; byte_idx < num_bytes; byte_idx++) {
361 u8 a = lanes[0][byte_idx];
362 u8 b = lanes[1][byte_idx];
363 u8 c = lanes[2][byte_idx];
364 u8 d = lanes[3][byte_idx];
365
366 u8* dest = &output[byte_idx * 4];
367
368 dest[0] = ((a >> 7) & 0x01) << 0 | ((b >> 7) & 0x01) << 1 | ((c >> 7) & 0x01) << 2 | ((d >> 7) & 0x01) << 3 |
369 ((a >> 6) & 0x01) << 4 | ((b >> 6) & 0x01) << 5 | ((c >> 6) & 0x01) << 6 | ((d >> 6) & 0x01) << 7;
370
371 dest[1] = ((a >> 5) & 0x01) << 0 | ((b >> 5) & 0x01) << 1 | ((c >> 5) & 0x01) << 2 | ((d >> 5) & 0x01) << 3 |
372 ((a >> 4) & 0x01) << 4 | ((b >> 4) & 0x01) << 5 | ((c >> 4) & 0x01) << 6 | ((d >> 4) & 0x01) << 7;
373
374 dest[2] = ((a >> 3) & 0x01) << 0 | ((b >> 3) & 0x01) << 1 | ((c >> 3) & 0x01) << 2 | ((d >> 3) & 0x01) << 3 |
375 ((a >> 2) & 0x01) << 4 | ((b >> 2) & 0x01) << 5 | ((c >> 2) & 0x01) << 6 | ((d >> 2) & 0x01) << 7;
376
377 dest[3] = ((a >> 1) & 0x01) << 0 | ((b >> 1) & 0x01) << 1 | ((c >> 1) & 0x01) << 2 | ((d >> 1) & 0x01) << 3 |
378 ((a >> 0) & 0x01) << 4 | ((b >> 0) & 0x01) << 5 | ((c >> 0) & 0x01) << 6 | ((d >> 0) & 0x01) << 7;
379 }
380}
381
383 const u8* const lanes[8],
384 u8* output,
385 size_t num_bytes
386) FL_NOEXCEPT {
387 for (size_t byte_idx = 0; byte_idx < num_bytes; byte_idx++) {
388 // Pack 8 bytes into a single 64-bit register
389 // This reduces register pressure and enables parallel bit extraction
390 u64 packed =
391 ((u64)lanes[0][byte_idx] << 0) |
392 ((u64)lanes[1][byte_idx] << 8) |
393 ((u64)lanes[2][byte_idx] << 16) |
394 ((u64)lanes[3][byte_idx] << 24) |
395 ((u64)lanes[4][byte_idx] << 32) |
396 ((u64)lanes[5][byte_idx] << 40) |
397 ((u64)lanes[6][byte_idx] << 48) |
398 ((u64)lanes[7][byte_idx] << 56);
399
400 u8* dest = &output[byte_idx * 8];
401
402 // Extract bits in parallel (compiler can optimize independent shifts)
403 for (int bit = 7; bit >= 0; bit--) {
404 dest[7 - bit] =
405 ((packed >> (bit + 0)) & 0x01) << 0 |
406 ((packed >> (bit + 8)) & 0x01) << 1 |
407 ((packed >> (bit + 16)) & 0x01) << 2 |
408 ((packed >> (bit + 24)) & 0x01) << 3 |
409 ((packed >> (bit + 32)) & 0x01) << 4 |
410 ((packed >> (bit + 40)) & 0x01) << 5 |
411 ((packed >> (bit + 48)) & 0x01) << 6 |
412 ((packed >> (bit + 56)) & 0x01) << 7;
413 }
414 }
415}
416
418 const u8* const lanes[16],
419 u8* output,
420 size_t num_bytes
421) FL_NOEXCEPT {
422 for (size_t byte_idx = 0; byte_idx < num_bytes; byte_idx++) {
423 // Pack lanes 0-7 into first 64-bit register
424 u64 packed_lo =
425 ((u64)lanes[0][byte_idx] << 0) |
426 ((u64)lanes[1][byte_idx] << 8) |
427 ((u64)lanes[2][byte_idx] << 16) |
428 ((u64)lanes[3][byte_idx] << 24) |
429 ((u64)lanes[4][byte_idx] << 32) |
430 ((u64)lanes[5][byte_idx] << 40) |
431 ((u64)lanes[6][byte_idx] << 48) |
432 ((u64)lanes[7][byte_idx] << 56);
433
434 // Pack lanes 8-15 into second 64-bit register
435 u64 packed_hi =
436 ((u64)lanes[8][byte_idx] << 0) |
437 ((u64)lanes[9][byte_idx] << 8) |
438 ((u64)lanes[10][byte_idx] << 16) |
439 ((u64)lanes[11][byte_idx] << 24) |
440 ((u64)lanes[12][byte_idx] << 32) |
441 ((u64)lanes[13][byte_idx] << 40) |
442 ((u64)lanes[14][byte_idx] << 48) |
443 ((u64)lanes[15][byte_idx] << 56);
444
445 u8* dest = &output[byte_idx * 16];
446
447 // Extract bits in parallel from both packed registers
448 for (int bit = 7; bit >= 0; bit--) {
449 dest[7 - bit] =
450 ((packed_lo >> (bit + 0)) & 0x01) << 0 |
451 ((packed_lo >> (bit + 8)) & 0x01) << 1 |
452 ((packed_lo >> (bit + 16)) & 0x01) << 2 |
453 ((packed_lo >> (bit + 24)) & 0x01) << 3 |
454 ((packed_lo >> (bit + 32)) & 0x01) << 4 |
455 ((packed_lo >> (bit + 40)) & 0x01) << 5 |
456 ((packed_lo >> (bit + 48)) & 0x01) << 6 |
457 ((packed_lo >> (bit + 56)) & 0x01) << 7;
458
459 dest[15 - bit] =
460 ((packed_hi >> (bit + 0)) & 0x01) << 0 |
461 ((packed_hi >> (bit + 8)) & 0x01) << 1 |
462 ((packed_hi >> (bit + 16)) & 0x01) << 2 |
463 ((packed_hi >> (bit + 24)) & 0x01) << 3 |
464 ((packed_hi >> (bit + 32)) & 0x01) << 4 |
465 ((packed_hi >> (bit + 40)) & 0x01) << 5 |
466 ((packed_hi >> (bit + 48)) & 0x01) << 6 |
467 ((packed_hi >> (bit + 56)) & 0x01) << 7;
468 }
469 }
470}
471
472template<typename TSource>
474 const TSource* const lanes[],
475 size_t num_lanes,
476 u8* output,
477 size_t num_items
478) FL_NOEXCEPT {
479 constexpr size_t bits_per_item = sizeof(TSource) * 8;
480
481 for (size_t item_idx = 0; item_idx < num_items; item_idx++) {
482 u8* dest = &output[item_idx * bits_per_item];
483
484 // Process each bit position in the source data (MSB to LSB)
485 for (size_t bit_pos = 0; bit_pos < bits_per_item; bit_pos++) {
486 size_t src_bit = (bits_per_item - 1) - bit_pos;
487 u8 output_byte = 0;
488
489 // Extract bit from each lane (up to 8 lanes per output byte)
490 for (size_t lane = 0; lane < num_lanes && lane < 8; lane++) {
491 TSource src_value = lanes[lane][item_idx];
492 u8 bit = (src_value >> src_bit) & 0x01;
493 output_byte |= (bit << (7 - lane));
494 }
495
496 dest[bit_pos] = output_byte;
497 }
498 }
499}
500
501// ============================================================================
502// SPI Multi-Lane Transposer
503// ============================================================================
504
584public:
590
// Transpose 2 lanes of data into interleaved dual-SPI format.
// Each lane is optional; `output` receives the interleaved bytes.
// `error` is an optional out-pointer (defaults to nullptr).
// NOTE(review): presumably returns true on success and stores a message in
// *error on failure — confirm against the implementation, which is not in
// this header.
602 static bool transpose2(const fl::optional<LaneData>& lane0,
603 const fl::optional<LaneData>& lane1,
604 fl::span<u8> output,
605 const char** error = nullptr) FL_NOEXCEPT;
606
// Transpose 4 lanes of data into interleaved quad-SPI format.
// Same parameter conventions as transpose2, with four optional lanes.
620 static bool transpose4(const fl::optional<LaneData>& lane0,
621 const fl::optional<LaneData>& lane1,
622 const fl::optional<LaneData>& lane2,
623 const fl::optional<LaneData>& lane3,
624 fl::span<u8> output,
625 const char** error = nullptr) FL_NOEXCEPT;
626
// Transpose 8 lanes of data into interleaved octal-SPI format.
// `lanes` is an array of eight optional lanes.
637 static bool transpose8(const fl::optional<LaneData> lanes[8],
638 fl::span<u8> output,
639 const char** error = nullptr) FL_NOEXCEPT;
640
// Transpose 16 lanes of data into interleaved hex-SPI format.
// `lanes` is an array of sixteen optional lanes.
651 static bool transpose16(const fl::optional<LaneData> lanes[16],
652 fl::span<u8> output,
653 const char** error = nullptr) FL_NOEXCEPT;
654
655private:
// Get byte from lane at given index, handling padding automatically.
// NOTE(review): the exact role of `max_size` is not visible here — confirm
// against the implementation.
657 static u8 getLaneByte(const LaneData& lane, size_t byte_idx, size_t max_size) FL_NOEXCEPT;
658};
659
660// ============================================================================
661// Parallel Strip Transposer (RP2040/RP2350 PIO)
662// ============================================================================
663
681 const u8* const input[8],
682 u8* output,
683 u16 num_leds,
684 u8 bytes_per_led
685) FL_NOEXCEPT {
686 // Process each LED
687 for (u16 led = 0; led < num_leds; led++) {
688 u8 temp_input[8];
689
690 // Process each byte in the LED
691 for (u8 byte_idx = 0; byte_idx < bytes_per_led; byte_idx++) {
692 // Collect one byte from each strip for this byte position
693 for (int strip = 0; strip < 8; strip++) {
694 temp_input[strip] = input[strip][led * bytes_per_led + byte_idx];
695 }
696
697 // Transpose 8 bytes → 8 bytes (1 bit from each strip per output byte)
698 transpose8x1_MSB(temp_input, output);
699
700 // Advance output pointer by 8 bytes
701 output += 8;
702 }
703 }
704}
705
723 const u8* const input[4],
724 u8* output,
725 u16 num_leds,
726 u8 bytes_per_led
727) FL_NOEXCEPT {
728 // Process each LED
729 for (u16 led = 0; led < num_leds; led++) {
730 // Process each byte in the LED
731 for (u8 byte_idx = 0; byte_idx < bytes_per_led; byte_idx++) {
732 // Collect one byte from each strip for this byte position
733 u8 strip_bytes[4];
734 for (int strip = 0; strip < 4; strip++) {
735 strip_bytes[strip] = input[strip][led * bytes_per_led + byte_idx];
736 }
737
738 // Transpose: extract each bit position from all 4 strips
739 for (int bit = 7; bit >= 0; bit--) {
740 u8 output_byte = 0;
741 // Pack bits from all 4 strips into lower 4 bits
742 for (int strip = 0; strip < 4; strip++) {
743 output_byte |= ((strip_bytes[strip] >> bit) & 1) << strip;
744 }
745 *output++ = output_byte;
746 }
747 }
748 }
749}
750
768 const u8* const input[2],
769 u8* output,
770 u16 num_leds,
771 u8 bytes_per_led
772) FL_NOEXCEPT {
773 // Process each LED
774 for (u16 led = 0; led < num_leds; led++) {
775 // Process each byte in the LED
776 for (u8 byte_idx = 0; byte_idx < bytes_per_led; byte_idx++) {
777 // Collect one byte from each strip for this byte position
778 u8 strip_bytes[2];
779 strip_bytes[0] = input[0][led * bytes_per_led + byte_idx];
780 strip_bytes[1] = input[1][led * bytes_per_led + byte_idx];
781
782 // Transpose: extract each bit position from both strips
783 for (int bit = 7; bit >= 0; bit--) {
784 u8 output_byte =
785 ((strip_bytes[0] >> bit) & 1) |
786 (((strip_bytes[1] >> bit) & 1) << 1);
787 *output++ = output_byte;
788 }
789 }
790 }
791}
792
801 return num_leds * bytes_per_led * 8;
802}
803
813 u8 num_strips,
814 const u8* const* input,
815 u8* output,
816 u16 num_leds,
817 u8 bytes_per_led
818) FL_NOEXCEPT {
819 switch (num_strips) {
820 case 8:
821 transpose_8strips(input, output, num_leds, bytes_per_led);
822 return true;
823 case 4:
824 transpose_4strips(input, output, num_leds, bytes_per_led);
825 return true;
826 case 2:
827 transpose_2strips(input, output, num_leds, bytes_per_led);
828 return true;
829 default:
830 return false; // Invalid strip count
831 }
832}
833
834// ============================================================================
835// PARLIO Wave8 Transposer (ESP32-S3 Parallel I/O)
836// ============================================================================
837
844template<size_t DATA_WIDTH>
846 const u8* FL_RESTRICT_PARAM laneWaveforms,
847 u8* FL_RESTRICT_PARAM outputBuffer
848) FL_NOEXCEPT {
849 constexpr size_t bytes_per_lane = 8; // sizeof(Wave8Byte)
850 constexpr size_t pulsesPerByte = 64; // 8 bits × 8 pulses per bit
851 size_t outputIdx = 0;
852
853 // Note: Using regular if statements (C++11 compatible)
854 // Compiler optimizes away dead branches for constant template parameters
855 if (DATA_WIDTH == 8) {
856 // Special optimized case for 8 lanes with bit packing
857 // Optimized: Hoist packing outside inner loop to reduce redundant operations
858 for (size_t bit_pos = 0; bit_pos < 8; bit_pos++) {
859 // Pack 8 wave8_byte values into a single 64-bit register for parallel extraction
860 // This packing is done once per bit_pos (8 times) instead of 64 times
861 u64 packed =
862 ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos] << 0) |
863 ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos] << 8) |
864 ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos] << 16) |
865 ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos] << 24) |
866 ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos] << 32) |
867 ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos] << 40) |
868 ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos] << 48) |
869 ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos] << 56);
870
871 // Inner loop: extract 8 pulses from the packed data
872 for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
873 // Extract pulse bits in parallel (compiler can optimize independent shifts)
874 outputBuffer[outputIdx++] =
875 ((packed >> (7 - pulse_bit + 0)) & 0x01) << 0 |
876 ((packed >> (7 - pulse_bit + 8)) & 0x01) << 1 |
877 ((packed >> (7 - pulse_bit + 16)) & 0x01) << 2 |
878 ((packed >> (7 - pulse_bit + 24)) & 0x01) << 3 |
879 ((packed >> (7 - pulse_bit + 32)) & 0x01) << 4 |
880 ((packed >> (7 - pulse_bit + 40)) & 0x01) << 5 |
881 ((packed >> (7 - pulse_bit + 48)) & 0x01) << 6 |
882 ((packed >> (7 - pulse_bit + 56)) & 0x01) << 7;
883 }
884 }
885 } else if (DATA_WIDTH <= 8) {
886 // Pack into single bytes (compile-time branch elimination via template instantiation)
887 // Guard against division by zero when DATA_WIDTH > 8 (shouldn't execute this branch, but compiler still evaluates it)
888 const size_t ticksPerByte = (DATA_WIDTH > 8) ? 1 : (8 / DATA_WIDTH);
889 const size_t numOutputBytes = (pulsesPerByte + ticksPerByte - 1) / ticksPerByte;
890
891 for (size_t outputByteIdx = 0; outputByteIdx < numOutputBytes; outputByteIdx++) {
892 u8 outputByte = 0;
893
894 FL_UNROLL(8)
895 for (size_t t = 0; t < ticksPerByte; t++) {
896 size_t pulse_idx = outputByteIdx * ticksPerByte + t;
897 if (pulse_idx >= pulsesPerByte)
898 break;
899
900 size_t bit_pos = pulse_idx / 8;
901 size_t pulse_bit = pulse_idx % 8;
902
903 FL_UNROLL(8)
904 for (size_t lane = 0; lane < DATA_WIDTH; lane++) {
905 const u8* laneWaveform = laneWaveforms + (lane * bytes_per_lane);
906 u8 wave8_byte = laneWaveform[bit_pos];
907 u8 pulse = (wave8_byte >> (7 - pulse_bit)) & 1;
908
909 size_t bitPos = t * DATA_WIDTH + lane;
910 outputByte |= (pulse << bitPos);
911 }
912 }
913
914 outputBuffer[outputIdx++] = outputByte;
915 }
916 } else if (DATA_WIDTH == 16) {
917 // Pack into 16-bit words (compile-time branch)
918 // Optimized: Software pipelining + output buffering
919 // Process 2 bit positions in parallel for better ILP, and batch writes for better cache efficiency
920
921 // Output buffer: accumulate 16 words (32 bytes) before writing
922 // This aligns with typical 32-byte cache lines and reduces memory write overhead
923 u8 writeBuffer[32];
924 size_t writeIdx = 0;
925
926 for (size_t bit_pos = 0; bit_pos < 8; bit_pos += 2) {
927 // Pack 16 wave8_byte values for TWO bit positions simultaneously
928 // This enables instruction-level parallelism and better register utilization
929 u64 packed_lo_0 =
930 ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 0] << 0) |
931 ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 0] << 8) |
932 ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 0] << 16) |
933 ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 0] << 24) |
934 ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 0] << 32) |
935 ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 0] << 40) |
936 ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 0] << 48) |
937 ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 0] << 56);
938
939 u64 packed_hi_0 =
940 ((u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 0] << 0) |
941 ((u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 0] << 8) |
942 ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 0] << 16) |
943 ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 0] << 24) |
944 ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 0] << 32) |
945 ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 0] << 40) |
946 ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 0] << 48) |
947 ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 0] << 56);
948
949 u64 packed_lo_1 =
950 ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 1] << 0) |
951 ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 1] << 8) |
952 ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 1] << 16) |
953 ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 1] << 24) |
954 ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 1] << 32) |
955 ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 1] << 40) |
956 ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 1] << 48) |
957 ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 1] << 56);
958
959 u64 packed_hi_1 =
960 ((u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 1] << 0) |
961 ((u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 1] << 8) |
962 ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 1] << 16) |
963 ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 1] << 24) |
964 ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 1] << 32) |
965 ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 1] << 40) |
966 ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 1] << 48) |
967 ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 1] << 56);
968
969 // Inner loop: interleave extraction from both bit positions
970 // This allows CPU to execute independent operations in parallel
971 for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
972 // Extract pulse bits for first bit position
973 u16 outputWord_0 =
974 ((packed_lo_0 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
975 ((packed_lo_0 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
976 ((packed_lo_0 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
977 ((packed_lo_0 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
978 ((packed_lo_0 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
979 ((packed_lo_0 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
980 ((packed_lo_0 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
981 ((packed_lo_0 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
982 ((packed_hi_0 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
983 ((packed_hi_0 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
984 ((packed_hi_0 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
985 ((packed_hi_0 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
986 ((packed_hi_0 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
987 ((packed_hi_0 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
988 ((packed_hi_0 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
989 ((packed_hi_0 >> (7 - pulse_bit + 56)) & 0x01) << 15;
990
991 // Extract pulse bits for second bit position
992 u16 outputWord_1 =
993 ((packed_lo_1 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
994 ((packed_lo_1 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
995 ((packed_lo_1 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
996 ((packed_lo_1 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
997 ((packed_lo_1 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
998 ((packed_lo_1 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
999 ((packed_lo_1 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
1000 ((packed_lo_1 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
1001 ((packed_hi_1 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
1002 ((packed_hi_1 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
1003 ((packed_hi_1 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
1004 ((packed_hi_1 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
1005 ((packed_hi_1 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
1006 ((packed_hi_1 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
1007 ((packed_hi_1 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
1008 ((packed_hi_1 >> (7 - pulse_bit + 56)) & 0x01) << 15;
1009
1010 // Write to buffer instead of directly to output
1011 writeBuffer[writeIdx++] = outputWord_0 & 0xFF;
1012 writeBuffer[writeIdx++] = (outputWord_0 >> 8) & 0xFF;
1013 writeBuffer[writeIdx++] = outputWord_1 & 0xFF;
1014 writeBuffer[writeIdx++] = (outputWord_1 >> 8) & 0xFF;
1015 }
1016
1017 // Flush buffer when full (16 words = 32 bytes)
1018 // This triggers efficient burst writes that align with cache lines
1019 if (writeIdx == 32) {
1020 fl::memcpy(&outputBuffer[outputIdx], writeBuffer, 32);
1021 outputIdx += 32;
1022 writeIdx = 0;
1023 }
1024 }
1025 } else {
1026 // Invalid DATA_WIDTH (compile-time error if template instantiated with wrong value)
1027 return 0;
1028 }
1029
1030 return outputIdx;
1031}
1032
1065 const u8* FL_RESTRICT_PARAM laneWaveforms,
1066 size_t data_width,
1067 u8* FL_RESTRICT_PARAM outputBuffer
1068) FL_NOEXCEPT {
1069 // Dispatch to template specialization based on runtime data_width
1070 // Compiler generates optimized code for each specialization (no runtime branching)
1071 switch (data_width) {
1072 case 1:
1073 return transpose_wave8byte_parlio_template<1>(laneWaveforms, outputBuffer);
1074 case 2:
1075 return transpose_wave8byte_parlio_template<2>(laneWaveforms, outputBuffer);
1076 case 4:
1077 return transpose_wave8byte_parlio_template<4>(laneWaveforms, outputBuffer);
1078 case 8:
1079 return transpose_wave8byte_parlio_template<8>(laneWaveforms, outputBuffer);
1080 case 16:
1081 return transpose_wave8byte_parlio_template<16>(laneWaveforms, outputBuffer);
1082 default:
1083 // Invalid data_width
1084 return 0;
1085 }
1086}
1087
1088} // namespace fl
1089
static bool transpose8(const fl::optional< LaneData > lanes[8], fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 8 lanes of data into interleaved octal-SPI format.
static bool transpose4(const fl::optional< LaneData > &lane0, const fl::optional< LaneData > &lane1, const fl::optional< LaneData > &lane2, const fl::optional< LaneData > &lane3, fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 4 lanes of data into interleaved quad-SPI format.
static bool transpose16(const fl::optional< LaneData > lanes[16], fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 16 lanes of data into interleaved hex-SPI format.
static bool transpose2(const fl::optional< LaneData > &lane0, const fl::optional< LaneData > &lane1, fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 2 lanes of data into interleaved dual-SPI format.
static u8 getLaneByte(const LaneData &lane, size_t byte_idx, size_t max_size) FL_NOEXCEPT
Get byte from lane at given index, handling padding automatically.
Unified stateless bit-interleaving transposer for multi-lane SPI parallel LED transmission.
fl::span< const u8 > payload
Actual LED data for this lane.
fl::span< const u8 > padding_frame
Black LED frame for padding (repeating pattern)
Lane data structure: payload + padding frame.
unsigned char u8
Definition stdint.h:131
unsigned char u8
Definition s16x16x4.h:132
void * memcpy(void *dest, const void *src, size_t n) FL_NOEXCEPT
FASTLED_FORCE_INLINE FL_IRAM size_t transpose_wave8byte_parlio(const u8 *FL_RESTRICT_PARAM laneWaveforms, size_t data_width, u8 *FL_RESTRICT_PARAM outputBuffer) FL_NOEXCEPT
unsigned char u8
Definition stdint.h:131
FASTLED_FORCE_INLINE void transpose8x1_MSB(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Simplified 8x1 bit transpose with MSB-first output.
FL_DISABLE_WARNING_PUSH unsigned char * B
FASTLED_FORCE_INLINE void transpose_2strips(const u8 *const input[2], u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Transpose 2 LED strips into parallel bit format.
void transpose_2lane_inline(const u8 *lane0_byte, const u8 *lane1_byte, u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 2 lanes (ISR-safe)
Optional< T > optional
Definition optional.h:16
FASTLED_FORCE_INLINE void transpose8(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Templated 8x8 bit transpose with custom stride.
void transpose8x1_noinline(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Simplified 8x1 bit transpose (non-inline version)
bool transpose_strips(u8 num_strips, const u8 *const *input, u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Helper to transpose N strips with automatic dispatch.
void transpose_8lane_inline(const u8 *const lanes[8], u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 8 lanes (ISR-safe)
FASTLED_FORCE_INLINE void transpose8x1(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Simplified 8x1 bit transpose (inline version)
FASTLED_FORCE_INLINE void transpose_4strips(const u8 *const input[4], u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Transpose 4 LED strips into parallel bit format.
FASTLED_FORCE_INLINE u32 calculate_transpose_buffer_size(u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Calculate output buffer size needed for transposed data.
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION size_t transpose_wave8byte_parlio_template(const u8 *FL_RESTRICT_PARAM laneWaveforms, u8 *FL_RESTRICT_PARAM outputBuffer) FL_NOEXCEPT
Template specialization of transpose for compile-time data_width (optimization)
void transpose_generic_inline(const TSource *const lanes[], size_t num_lanes, u8 *output, size_t num_items) FL_NOEXCEPT
Generic bit-interleaving primitive for N lanes with M-bit source data (ISR-safe)
void transpose_16lane_inline(const u8 *const lanes[16], u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 16 lanes (ISR-safe)
FASTLED_FORCE_INLINE void transpose_8strips(const u8 *const input[8], u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Transpose 8 LED strips into parallel bit format.
fl::u64 u64
Definition s16x16x4.h:221
void transpose_4lane_inline(const u8 *const lanes[4], u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 4 lanes (ISR-safe)
Base definition for an LED controller.
Definition crgb.hpp:179
fl::u32 b0
byte 'b', bit 0
fl::u8 bytes[8]
eight 8-bit values to load for swapping
sub4 a
32-bit access struct for bit swapping, upper four bytes
fl::u32 c7
byte 'c', bit 7
fl::u32 d7
byte 'd', bit 7
fl::u32 d4
byte 'd', bit 4
fl::u32 d6
byte 'd', bit 6
fl::u32 a5
bit 5 (0x20)
fl::u32 a0
byte 'a', bit 0
fl::u32 b6
byte 'b', bit 6
fl::u32 c6
byte 'c', bit 6
fl::u32 a0
bit 0 (0x01)
fl::u32 b5
byte 'b', bit 5
fl::u32 c0
byte 'c', bit 0
fl::u32 a7
byte 'a', bit 7
fl::u32 a2
bit 2 (0x04)
fl::u32 b1
byte 'b', bit 1
fl::u32 c1
byte 'c', bit 1
fl::u32 c2
byte 'c', bit 2
fl::u32 a4
bit 4 (0x10)
fl::u32 a1
bit 1 (0x02)
fl::u32 word[2]
two 32-bit values to load for swapping
fl::u32 b3
byte 'b', bit 3
sub4 b
32-bit access struct for bit swapping, lower four bytes
fl::u32 c4
byte 'c', bit 4
fl::u32 b4
byte 'b', bit 4
fl::u32 a3
byte 'a', bit 3
fl::u32 d5
byte 'd', bit 5
fl::u32 a3
bit 3 (0x08)
fl::u32 b2
byte 'b', bit 2
fl::u32 a5
byte 'a', bit 5
fl::u32 a4
byte 'a', bit 4
fl::u32 c3
byte 'c', bit 3
fl::u32 d2
byte 'd', bit 2
fl::u8 raw
the entire byte
fl::u32 a2
byte 'a', bit 2
fl::u32 c5
byte 'c', bit 5
fl::u32 a7
bit 7 (0x80)
fl::u32 b7
byte 'b', bit 7
fl::u32 d1
byte 'd', bit 1
fl::u32 d0
byte 'd', bit 0
fl::u32 a6
byte 'a', bit 6
fl::u32 a1
byte 'a', bit 1
fl::u32 a6
bit 6 (0x40)
fl::u32 d3
byte 'd', bit 3
Structure representing 32 bits of access.
#define FL_OPTIMIZATION_LEVEL_O3_BEGIN
#define FL_UNROLL(N)
#define FASTLED_FORCE_INLINE
#define FL_OPTIMIZATION_LEVEL_O3_END
#define FL_OPTIMIZE_FUNCTION
#define FL_IRAM
#define FL_RESTRICT_PARAM
#define FL_NOEXCEPT