FastLED 3.9.15
Loading...
Searching...
No Matches

◆ transpose_wave8byte_parlio_template()

template<size_t DATA_WIDTH>
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION size_t fl::transpose_wave8byte_parlio_template ( const u8 *FL_RESTRICT_PARAM laneWaveforms,
u8 *FL_RESTRICT_PARAM outputBuffer )

Template version of the wave8 PARLIO transpose with the lane count (data width) fixed at compile time (optimization)

This template version eliminates runtime branching by specializing for each data width. The compiler generates optimized code for each DATA_WIDTH value at compile time.

Template Parameters
DATA_WIDTH: Number of parallel lanes (1, 2, 4, 8, or 16); compile-time constant
Examples
/home/runner/work/FastLED/FastLED/src/fl/math/transposition.h.

Definition at line 845 of file transposition.h.

848 {
849 constexpr size_t bytes_per_lane = 8; // sizeof(Wave8Byte)
850 constexpr size_t pulsesPerByte = 64; // 8 bits × 8 pulses per bit
851 size_t outputIdx = 0;
852
853 // Note: Using regular if statements (C++11 compatible)
854 // Compiler optimizes away dead branches for constant template parameters
855 if (DATA_WIDTH == 8) {
856 // Special optimized case for 8 lanes with bit packing
857 // Optimized: Hoist packing outside inner loop to reduce redundant operations
858 for (size_t bit_pos = 0; bit_pos < 8; bit_pos++) {
859 // Pack 8 wave8_byte values into a single 64-bit register for parallel extraction
860 // This packing is done once per bit_pos (8 times) instead of 64 times
861 u64 packed =
862 ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos] << 0) |
863 ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos] << 8) |
864 ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos] << 16) |
865 ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos] << 24) |
866 ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos] << 32) |
867 ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos] << 40) |
868 ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos] << 48) |
869 ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos] << 56);
870
871 // Inner loop: extract 8 pulses from the packed data
872 for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
873 // Extract pulse bits in parallel (compiler can optimize independent shifts)
874 outputBuffer[outputIdx++] =
875 ((packed >> (7 - pulse_bit + 0)) & 0x01) << 0 |
876 ((packed >> (7 - pulse_bit + 8)) & 0x01) << 1 |
877 ((packed >> (7 - pulse_bit + 16)) & 0x01) << 2 |
878 ((packed >> (7 - pulse_bit + 24)) & 0x01) << 3 |
879 ((packed >> (7 - pulse_bit + 32)) & 0x01) << 4 |
880 ((packed >> (7 - pulse_bit + 40)) & 0x01) << 5 |
881 ((packed >> (7 - pulse_bit + 48)) & 0x01) << 6 |
882 ((packed >> (7 - pulse_bit + 56)) & 0x01) << 7;
883 }
884 }
885 } else if (DATA_WIDTH <= 8) {
886 // Pack into single bytes (compile-time branch elimination via template instantiation)
887 // Guard against division by zero when DATA_WIDTH > 8 (shouldn't execute this branch, but compiler still evaluates it)
888 const size_t ticksPerByte = (DATA_WIDTH > 8) ? 1 : (8 / DATA_WIDTH);
889 const size_t numOutputBytes = (pulsesPerByte + ticksPerByte - 1) / ticksPerByte;
890
891 for (size_t outputByteIdx = 0; outputByteIdx < numOutputBytes; outputByteIdx++) {
892 u8 outputByte = 0;
893
894 FL_UNROLL(8)
895 for (size_t t = 0; t < ticksPerByte; t++) {
896 size_t pulse_idx = outputByteIdx * ticksPerByte + t;
897 if (pulse_idx >= pulsesPerByte)
898 break;
899
900 size_t bit_pos = pulse_idx / 8;
901 size_t pulse_bit = pulse_idx % 8;
902
903 FL_UNROLL(8)
904 for (size_t lane = 0; lane < DATA_WIDTH; lane++) {
905 const u8* laneWaveform = laneWaveforms + (lane * bytes_per_lane);
906 u8 wave8_byte = laneWaveform[bit_pos];
907 u8 pulse = (wave8_byte >> (7 - pulse_bit)) & 1;
908
909 size_t bitPos = t * DATA_WIDTH + lane;
910 outputByte |= (pulse << bitPos);
911 }
912 }
913
914 outputBuffer[outputIdx++] = outputByte;
915 }
916 } else if (DATA_WIDTH == 16) {
917 // Pack into 16-bit words (compile-time branch)
918 // Optimized: Software pipelining + output buffering
919 // Process 2 bit positions in parallel for better ILP, and batch writes for better cache efficiency
920
921 // Output buffer: accumulate 16 words (32 bytes) before writing
922 // This aligns with typical 32-byte cache lines and reduces memory write overhead
923 u8 writeBuffer[32];
924 size_t writeIdx = 0;
925
926 for (size_t bit_pos = 0; bit_pos < 8; bit_pos += 2) {
927 // Pack 16 wave8_byte values for TWO bit positions simultaneously
928 // This enables instruction-level parallelism and better register utilization
929 u64 packed_lo_0 =
930 ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 0] << 0) |
931 ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 0] << 8) |
932 ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 0] << 16) |
933 ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 0] << 24) |
934 ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 0] << 32) |
935 ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 0] << 40) |
936 ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 0] << 48) |
937 ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 0] << 56);
938
939 u64 packed_hi_0 =
940 ((u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 0] << 0) |
941 ((u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 0] << 8) |
942 ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 0] << 16) |
943 ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 0] << 24) |
944 ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 0] << 32) |
945 ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 0] << 40) |
946 ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 0] << 48) |
947 ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 0] << 56);
948
949 u64 packed_lo_1 =
950 ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 1] << 0) |
951 ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 1] << 8) |
952 ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 1] << 16) |
953 ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 1] << 24) |
954 ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 1] << 32) |
955 ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 1] << 40) |
956 ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 1] << 48) |
957 ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 1] << 56);
958
959 u64 packed_hi_1 =
960 ((u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 1] << 0) |
961 ((u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 1] << 8) |
962 ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 1] << 16) |
963 ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 1] << 24) |
964 ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 1] << 32) |
965 ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 1] << 40) |
966 ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 1] << 48) |
967 ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 1] << 56);
968
969 // Inner loop: interleave extraction from both bit positions
970 // This allows CPU to execute independent operations in parallel
971 for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
972 // Extract pulse bits for first bit position
973 u16 outputWord_0 =
974 ((packed_lo_0 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
975 ((packed_lo_0 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
976 ((packed_lo_0 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
977 ((packed_lo_0 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
978 ((packed_lo_0 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
979 ((packed_lo_0 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
980 ((packed_lo_0 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
981 ((packed_lo_0 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
982 ((packed_hi_0 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
983 ((packed_hi_0 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
984 ((packed_hi_0 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
985 ((packed_hi_0 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
986 ((packed_hi_0 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
987 ((packed_hi_0 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
988 ((packed_hi_0 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
989 ((packed_hi_0 >> (7 - pulse_bit + 56)) & 0x01) << 15;
990
991 // Extract pulse bits for second bit position
992 u16 outputWord_1 =
993 ((packed_lo_1 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
994 ((packed_lo_1 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
995 ((packed_lo_1 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
996 ((packed_lo_1 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
997 ((packed_lo_1 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
998 ((packed_lo_1 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
999 ((packed_lo_1 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
1000 ((packed_lo_1 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
1001 ((packed_hi_1 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
1002 ((packed_hi_1 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
1003 ((packed_hi_1 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
1004 ((packed_hi_1 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
1005 ((packed_hi_1 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
1006 ((packed_hi_1 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
1007 ((packed_hi_1 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
1008 ((packed_hi_1 >> (7 - pulse_bit + 56)) & 0x01) << 15;
1009
1010 // Write to buffer instead of directly to output
1011 writeBuffer[writeIdx++] = outputWord_0 & 0xFF;
1012 writeBuffer[writeIdx++] = (outputWord_0 >> 8) & 0xFF;
1013 writeBuffer[writeIdx++] = outputWord_1 & 0xFF;
1014 writeBuffer[writeIdx++] = (outputWord_1 >> 8) & 0xFF;
1015 }
1016
1017 // Flush buffer when full (16 words = 32 bytes)
1018 // This triggers efficient burst writes that align with cache lines
1019 if (writeIdx == 32) {
1020 fl::memcpy(&outputBuffer[outputIdx], writeBuffer, 32);
1021 outputIdx += 32;
1022 writeIdx = 0;
1023 }
1024 }
1025 } else {
1026 // Invalid DATA_WIDTH (compile-time error if template instantiated with wrong value)
1027 return 0;
1028 }
1029
1030 return outputIdx;
1031}
void * memcpy(void *dest, const void *src, size_t n) FL_NOEXCEPT
unsigned char u8
Definition stdint.h:131
fl::u64 u64
Definition s16x16x4.h:221
#define FL_UNROLL(N)

References FASTLED_FORCE_INLINE, FL_IRAM, FL_NOEXCEPT, FL_OPTIMIZE_FUNCTION, FL_RESTRICT_PARAM, FL_UNROLL, memcpy(), and t.

Referenced by transpose_wave8byte_parlio().

+ Here is the call graph for this function:
+ Here is the caller graph for this function: