This template version eliminates runtime branching on the data width by specializing for each DATA_WIDTH value. Because DATA_WIDTH is known at compile time, the compiler can resolve the DATA_WIDTH checks and generate optimized code for each instantiation.
848 {
849 constexpr size_t bytes_per_lane = 8;
850 constexpr size_t pulsesPerByte = 64;
851 size_t outputIdx = 0;
852
853
854
855 if (DATA_WIDTH == 8) {
856
857
858 for (size_t bit_pos = 0; bit_pos < 8; bit_pos++) {
859
860
862 ((
u64)laneWaveforms[0 * bytes_per_lane + bit_pos] << 0) |
863 ((
u64)laneWaveforms[1 * bytes_per_lane + bit_pos] << 8) |
864 ((
u64)laneWaveforms[2 * bytes_per_lane + bit_pos] << 16) |
865 ((
u64)laneWaveforms[3 * bytes_per_lane + bit_pos] << 24) |
866 ((
u64)laneWaveforms[4 * bytes_per_lane + bit_pos] << 32) |
867 ((
u64)laneWaveforms[5 * bytes_per_lane + bit_pos] << 40) |
868 ((
u64)laneWaveforms[6 * bytes_per_lane + bit_pos] << 48) |
869 ((
u64)laneWaveforms[7 * bytes_per_lane + bit_pos] << 56);
870
871
872 for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
873
874 outputBuffer[outputIdx++] =
875 ((packed >> (7 - pulse_bit + 0)) & 0x01) << 0 |
876 ((packed >> (7 - pulse_bit + 8)) & 0x01) << 1 |
877 ((packed >> (7 - pulse_bit + 16)) & 0x01) << 2 |
878 ((packed >> (7 - pulse_bit + 24)) & 0x01) << 3 |
879 ((packed >> (7 - pulse_bit + 32)) & 0x01) << 4 |
880 ((packed >> (7 - pulse_bit + 40)) & 0x01) << 5 |
881 ((packed >> (7 - pulse_bit + 48)) & 0x01) << 6 |
882 ((packed >> (7 - pulse_bit + 56)) & 0x01) << 7;
883 }
884 }
885 } else if (DATA_WIDTH <= 8) {
886
887
888 const size_t ticksPerByte = (DATA_WIDTH > 8) ? 1 : (8 / DATA_WIDTH);
889 const size_t numOutputBytes = (pulsesPerByte + ticksPerByte - 1) / ticksPerByte;
890
891 for (size_t outputByteIdx = 0; outputByteIdx < numOutputBytes; outputByteIdx++) {
893
895 for (
size_t t = 0;
t < ticksPerByte;
t++) {
896 size_t pulse_idx = outputByteIdx * ticksPerByte +
t;
897 if (pulse_idx >= pulsesPerByte)
898 break;
899
900 size_t bit_pos = pulse_idx / 8;
901 size_t pulse_bit = pulse_idx % 8;
902
904 for (size_t lane = 0; lane < DATA_WIDTH; lane++) {
905 const u8* laneWaveform = laneWaveforms + (lane * bytes_per_lane);
906 u8 wave8_byte = laneWaveform[bit_pos];
907 u8 pulse = (wave8_byte >> (7 - pulse_bit)) & 1;
908
909 size_t bitPos =
t * DATA_WIDTH + lane;
910 outputByte |= (pulse << bitPos);
911 }
912 }
913
914 outputBuffer[outputIdx++] = outputByte;
915 }
916 } else if (DATA_WIDTH == 16) {
917
918
919
920
921
922
924 size_t writeIdx = 0;
925
926 for (size_t bit_pos = 0; bit_pos < 8; bit_pos += 2) {
927
928
930 ((
u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 0] << 0) |
931 ((
u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 0] << 8) |
932 ((
u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 0] << 16) |
933 ((
u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 0] << 24) |
934 ((
u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 0] << 32) |
935 ((
u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 0] << 40) |
936 ((
u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 0] << 48) |
937 ((
u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 0] << 56);
938
940 ((
u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 0] << 0) |
941 ((
u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 0] << 8) |
942 ((
u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 0] << 16) |
943 ((
u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 0] << 24) |
944 ((
u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 0] << 32) |
945 ((
u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 0] << 40) |
946 ((
u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 0] << 48) |
947 ((
u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 0] << 56);
948
950 ((
u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 1] << 0) |
951 ((
u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 1] << 8) |
952 ((
u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 1] << 16) |
953 ((
u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 1] << 24) |
954 ((
u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 1] << 32) |
955 ((
u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 1] << 40) |
956 ((
u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 1] << 48) |
957 ((
u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 1] << 56);
958
960 ((
u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 1] << 0) |
961 ((
u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 1] << 8) |
962 ((
u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 1] << 16) |
963 ((
u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 1] << 24) |
964 ((
u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 1] << 32) |
965 ((
u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 1] << 40) |
966 ((
u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 1] << 48) |
967 ((
u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 1] << 56);
968
969
970
971 for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
972
973 u16 outputWord_0 =
974 ((packed_lo_0 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
975 ((packed_lo_0 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
976 ((packed_lo_0 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
977 ((packed_lo_0 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
978 ((packed_lo_0 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
979 ((packed_lo_0 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
980 ((packed_lo_0 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
981 ((packed_lo_0 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
982 ((packed_hi_0 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
983 ((packed_hi_0 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
984 ((packed_hi_0 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
985 ((packed_hi_0 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
986 ((packed_hi_0 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
987 ((packed_hi_0 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
988 ((packed_hi_0 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
989 ((packed_hi_0 >> (7 - pulse_bit + 56)) & 0x01) << 15;
990
991
992 u16 outputWord_1 =
993 ((packed_lo_1 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
994 ((packed_lo_1 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
995 ((packed_lo_1 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
996 ((packed_lo_1 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
997 ((packed_lo_1 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
998 ((packed_lo_1 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
999 ((packed_lo_1 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
1000 ((packed_lo_1 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
1001 ((packed_hi_1 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
1002 ((packed_hi_1 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
1003 ((packed_hi_1 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
1004 ((packed_hi_1 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
1005 ((packed_hi_1 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
1006 ((packed_hi_1 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
1007 ((packed_hi_1 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
1008 ((packed_hi_1 >> (7 - pulse_bit + 56)) & 0x01) << 15;
1009
1010
1011 writeBuffer[writeIdx++] = outputWord_0 & 0xFF;
1012 writeBuffer[writeIdx++] = (outputWord_0 >> 8) & 0xFF;
1013 writeBuffer[writeIdx++] = outputWord_1 & 0xFF;
1014 writeBuffer[writeIdx++] = (outputWord_1 >> 8) & 0xFF;
1015 }
1016
1017
1018
1019 if (writeIdx == 32) {
1020 fl::memcpy(&outputBuffer[outputIdx], writeBuffer, 32);
1021 outputIdx += 32;
1022 writeIdx = 0;
1023 }
1024 }
1025 } else {
1026
1027 return 0;
1028 }
1029
1030 return outputIdx;
1031}
void * memcpy(void *dest, const void *src, size_t n) FL_NOEXCEPT