418 const u8*
const lanes[16],
422 for (
size_t byte_idx = 0; byte_idx < num_bytes; byte_idx++) {
425 ((
u64)lanes[0][byte_idx] << 0) |
426 ((
u64)lanes[1][byte_idx] << 8) |
427 ((
u64)lanes[2][byte_idx] << 16) |
428 ((
u64)lanes[3][byte_idx] << 24) |
429 ((
u64)lanes[4][byte_idx] << 32) |
430 ((
u64)lanes[5][byte_idx] << 40) |
431 ((
u64)lanes[6][byte_idx] << 48) |
432 ((
u64)lanes[7][byte_idx] << 56);
436 ((
u64)lanes[8][byte_idx] << 0) |
437 ((
u64)lanes[9][byte_idx] << 8) |
438 ((
u64)lanes[10][byte_idx] << 16) |
439 ((
u64)lanes[11][byte_idx] << 24) |
440 ((
u64)lanes[12][byte_idx] << 32) |
441 ((
u64)lanes[13][byte_idx] << 40) |
442 ((
u64)lanes[14][byte_idx] << 48) |
443 ((
u64)lanes[15][byte_idx] << 56);
445 u8* dest = &output[byte_idx * 16];
448 for (
int bit = 7; bit >= 0; bit--) {
450 ((packed_lo >> (bit + 0)) & 0x01) << 0 |
451 ((packed_lo >> (bit + 8)) & 0x01) << 1 |
452 ((packed_lo >> (bit + 16)) & 0x01) << 2 |
453 ((packed_lo >> (bit + 24)) & 0x01) << 3 |
454 ((packed_lo >> (bit + 32)) & 0x01) << 4 |
455 ((packed_lo >> (bit + 40)) & 0x01) << 5 |
456 ((packed_lo >> (bit + 48)) & 0x01) << 6 |
457 ((packed_lo >> (bit + 56)) & 0x01) << 7;
460 ((packed_hi >> (bit + 0)) & 0x01) << 0 |
461 ((packed_hi >> (bit + 8)) & 0x01) << 1 |
462 ((packed_hi >> (bit + 16)) & 0x01) << 2 |
463 ((packed_hi >> (bit + 24)) & 0x01) << 3 |
464 ((packed_hi >> (bit + 32)) & 0x01) << 4 |
465 ((packed_hi >> (bit + 40)) & 0x01) << 5 |
466 ((packed_hi >> (bit + 48)) & 0x01) << 6 |
467 ((packed_hi >> (bit + 56)) & 0x01) << 7;
849 constexpr size_t bytes_per_lane = 8;
850 constexpr size_t pulsesPerByte = 64;
851 size_t outputIdx = 0;
855 if (DATA_WIDTH == 8) {
858 for (
size_t bit_pos = 0; bit_pos < 8; bit_pos++) {
862 ((
u64)laneWaveforms[0 * bytes_per_lane + bit_pos] << 0) |
863 ((
u64)laneWaveforms[1 * bytes_per_lane + bit_pos] << 8) |
864 ((
u64)laneWaveforms[2 * bytes_per_lane + bit_pos] << 16) |
865 ((
u64)laneWaveforms[3 * bytes_per_lane + bit_pos] << 24) |
866 ((
u64)laneWaveforms[4 * bytes_per_lane + bit_pos] << 32) |
867 ((
u64)laneWaveforms[5 * bytes_per_lane + bit_pos] << 40) |
868 ((
u64)laneWaveforms[6 * bytes_per_lane + bit_pos] << 48) |
869 ((
u64)laneWaveforms[7 * bytes_per_lane + bit_pos] << 56);
872 for (
size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
874 outputBuffer[outputIdx++] =
875 ((packed >> (7 - pulse_bit + 0)) & 0x01) << 0 |
876 ((packed >> (7 - pulse_bit + 8)) & 0x01) << 1 |
877 ((packed >> (7 - pulse_bit + 16)) & 0x01) << 2 |
878 ((packed >> (7 - pulse_bit + 24)) & 0x01) << 3 |
879 ((packed >> (7 - pulse_bit + 32)) & 0x01) << 4 |
880 ((packed >> (7 - pulse_bit + 40)) & 0x01) << 5 |
881 ((packed >> (7 - pulse_bit + 48)) & 0x01) << 6 |
882 ((packed >> (7 - pulse_bit + 56)) & 0x01) << 7;
885 }
else if (DATA_WIDTH <= 8) {
888 const size_t ticksPerByte = (DATA_WIDTH > 8) ? 1 : (8 / DATA_WIDTH);
889 const size_t numOutputBytes = (pulsesPerByte + ticksPerByte - 1) / ticksPerByte;
891 for (
size_t outputByteIdx = 0; outputByteIdx < numOutputBytes; outputByteIdx++) {
895 for (
size_t t = 0;
t < ticksPerByte;
t++) {
896 size_t pulse_idx = outputByteIdx * ticksPerByte +
t;
897 if (pulse_idx >= pulsesPerByte)
900 size_t bit_pos = pulse_idx / 8;
901 size_t pulse_bit = pulse_idx % 8;
904 for (
size_t lane = 0; lane < DATA_WIDTH; lane++) {
905 const u8* laneWaveform = laneWaveforms + (lane * bytes_per_lane);
906 u8 wave8_byte = laneWaveform[bit_pos];
907 u8 pulse = (wave8_byte >> (7 - pulse_bit)) & 1;
909 size_t bitPos =
t * DATA_WIDTH + lane;
910 outputByte |= (pulse << bitPos);
914 outputBuffer[outputIdx++] = outputByte;
916 }
else if (DATA_WIDTH == 16) {
926 for (
size_t bit_pos = 0; bit_pos < 8; bit_pos += 2) {
930 ((
u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 0] << 0) |
931 ((
u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 0] << 8) |
932 ((
u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 0] << 16) |
933 ((
u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 0] << 24) |
934 ((
u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 0] << 32) |
935 ((
u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 0] << 40) |
936 ((
u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 0] << 48) |
937 ((
u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 0] << 56);
940 ((
u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 0] << 0) |
941 ((
u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 0] << 8) |
942 ((
u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 0] << 16) |
943 ((
u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 0] << 24) |
944 ((
u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 0] << 32) |
945 ((
u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 0] << 40) |
946 ((
u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 0] << 48) |
947 ((
u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 0] << 56);
950 ((
u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 1] << 0) |
951 ((
u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 1] << 8) |
952 ((
u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 1] << 16) |
953 ((
u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 1] << 24) |
954 ((
u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 1] << 32) |
955 ((
u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 1] << 40) |
956 ((
u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 1] << 48) |
957 ((
u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 1] << 56);
960 ((
u64)laneWaveforms[8 * bytes_per_lane + bit_pos + 1] << 0) |
961 ((
u64)laneWaveforms[9 * bytes_per_lane + bit_pos + 1] << 8) |
962 ((
u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 1] << 16) |
963 ((
u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 1] << 24) |
964 ((
u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 1] << 32) |
965 ((
u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 1] << 40) |
966 ((
u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 1] << 48) |
967 ((
u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 1] << 56);
971 for (
size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
974 ((packed_lo_0 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
975 ((packed_lo_0 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
976 ((packed_lo_0 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
977 ((packed_lo_0 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
978 ((packed_lo_0 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
979 ((packed_lo_0 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
980 ((packed_lo_0 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
981 ((packed_lo_0 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
982 ((packed_hi_0 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
983 ((packed_hi_0 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
984 ((packed_hi_0 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
985 ((packed_hi_0 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
986 ((packed_hi_0 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
987 ((packed_hi_0 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
988 ((packed_hi_0 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
989 ((packed_hi_0 >> (7 - pulse_bit + 56)) & 0x01) << 15;
993 ((packed_lo_1 >> (7 - pulse_bit + 0)) & 0x01) << 0 |
994 ((packed_lo_1 >> (7 - pulse_bit + 8)) & 0x01) << 1 |
995 ((packed_lo_1 >> (7 - pulse_bit + 16)) & 0x01) << 2 |
996 ((packed_lo_1 >> (7 - pulse_bit + 24)) & 0x01) << 3 |
997 ((packed_lo_1 >> (7 - pulse_bit + 32)) & 0x01) << 4 |
998 ((packed_lo_1 >> (7 - pulse_bit + 40)) & 0x01) << 5 |
999 ((packed_lo_1 >> (7 - pulse_bit + 48)) & 0x01) << 6 |
1000 ((packed_lo_1 >> (7 - pulse_bit + 56)) & 0x01) << 7 |
1001 ((packed_hi_1 >> (7 - pulse_bit + 0)) & 0x01) << 8 |
1002 ((packed_hi_1 >> (7 - pulse_bit + 8)) & 0x01) << 9 |
1003 ((packed_hi_1 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
1004 ((packed_hi_1 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
1005 ((packed_hi_1 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
1006 ((packed_hi_1 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
1007 ((packed_hi_1 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
1008 ((packed_hi_1 >> (7 - pulse_bit + 56)) & 0x01) << 15;
1011 writeBuffer[writeIdx++] = outputWord_0 & 0xFF;
1012 writeBuffer[writeIdx++] = (outputWord_0 >> 8) & 0xFF;
1013 writeBuffer[writeIdx++] = outputWord_1 & 0xFF;
1014 writeBuffer[writeIdx++] = (outputWord_1 >> 8) & 0xFF;
1019 if (writeIdx == 32) {
1020 fl::memcpy(&outputBuffer[outputIdx], writeBuffer, 32);