d9/d31/transposition_8h_source.html

#pragma once


#include "fl/stl/compiler_control.h"

#include "fl/stl/int.h"

#include "fl/stl/span.h"

#include "fl/stl/optional.h"

#include "fl/stl/cstring.h"

#include "fl/stl/noexcept.h"


FL_OPTIMIZATION_LEVEL_O3_BEGIN


namespace fl {


// ============================================================================

// Core 8x1 Bit Transpose Functions

// ============================================================================


// Note: These transpose functions are used across multiple platforms,

// so they are defined for all targets


// One-byte overlay: `raw` gives the whole value, a0..a7 give single-bit fields.
// NOTE(review): the fields are declared on fl::u32 and sit in an anonymous
// struct, so overall size and bit ordering are compiler-specific — confirm on
// each target toolchain before relying on a particular bit <-> field mapping.
typedef union {

    fl::u8 raw;  // whole-value view


    struct {

        fl::u32 a0:1;

        fl::u32 a1:1;

        fl::u32 a2:1;

        fl::u32 a3:1;

        fl::u32 a4:1;

        fl::u32 a5:1;

        fl::u32 a6:1;

        fl::u32 a7:1;

    };


} just8bits;


// 32 single-bit fields laid out as four groups of eight (a0..a7, b0..b7,
// c0..c7, d0..d7), all declared on fl::u32.
// NOTE(review): presumably one group per byte of a 32-bit word; the actual
// bit ordering is implementation-defined — verify per toolchain.
typedef struct {

    // group a
    fl::u32 a0:1;

    fl::u32 a1:1;

    fl::u32 a2:1;

    fl::u32 a3:1;

    fl::u32 a4:1;

    fl::u32 a5:1;

    fl::u32 a6:1;

    fl::u32 a7:1;

    // group b
    fl::u32 b0:1;

    fl::u32 b1:1;

    fl::u32 b2:1;

    fl::u32 b3:1;

    fl::u32 b4:1;

    fl::u32 b5:1;

    fl::u32 b6:1;

    fl::u32 b7:1;

    // group c
    fl::u32 c0:1;

    fl::u32 c1:1;

    fl::u32 c2:1;

    fl::u32 c3:1;

    fl::u32 c4:1;

    fl::u32 c5:1;

    fl::u32 c6:1;

    fl::u32 c7:1;

    // group d
    fl::u32 d0:1;

    fl::u32 d1:1;

    fl::u32 d2:1;

    fl::u32 d3:1;

    fl::u32 d4:1;

    fl::u32 d5:1;

    fl::u32 d6:1;

    fl::u32 d7:1;

} sub4;


// Three overlapping views of the same storage: two 32-bit words, eight bytes,
// or two sub4 bit-field records (a, b).
// NOTE(review): reading a member other than the one last written relies on
// compiler-supported union punning — confirm this is acceptable on all targets.
typedef union {

    fl::u32 word[2];   // word view

    fl::u8 bytes[8];   // byte view


    struct {

        sub4 a;        // bit-field view, first record

        sub4 b;        // bit-field view, second record

    };


} bitswap_type;


// Out-of-line counterpart of transpose8x1(): same (A, B) pointer signature,
// declared here only — the definition is not part of this header section.
void transpose8x1_noinline(unsigned char *A, unsigned char *B) FL_NOEXCEPT;


/// 8x1 bit transpose: reads the 8 bytes at A as two 32-bit words, runs the
/// masked swap stages below, and stores the two result words to the 8 bytes at B.
///
/// @param A Source, 8 bytes; accessed through fl::u32 loads.
/// @param B Destination, 8 bytes; accessed through u32 stores.
FASTLED_FORCE_INLINE void transpose8x1(unsigned char *A, unsigned char *B) FL_NOEXCEPT {
    // Bytes 0-3 of A become `lo`, bytes 4-7 become `hi` (native byte order).
    fl::u32 lo = *(fl::u32*)(A);
    fl::u32 hi = *(fl::u32*)(A+4);
    fl::u32 delta;

    // Swap stages on the upper word
    delta = (hi ^ (hi >> 7)) & 0x00AA00AA;
    hi ^= delta ^ (delta << 7);
    delta = (hi ^ (hi >> 14)) & 0x0000CCCC;
    hi ^= delta ^ (delta << 14);

    // Same swap stages on the lower word
    delta = (lo ^ (lo >> 7)) & 0x00AA00AA;
    lo ^= delta ^ (delta << 7);
    delta = (lo ^ (lo >> 14)) & 0x0000CCCC;
    lo ^= delta ^ (delta << 14);

    // Exchange nibbles between the two words
    const fl::u32 out_hi = (hi & 0xF0F0F0F0) | ((lo >> 4) & 0x0F0F0F0F);
    const fl::u32 out_lo = ((hi << 4) & 0xF0F0F0F0) | (lo & 0x0F0F0F0F);

    // Lower word goes to B[0..3], upper word to B[4..7].
    *((u32*)B) = out_lo;
    *((u32*)(B+4)) = out_hi;
}


/// 8x1 bit transpose with MSB-first output: same swap stages as transpose8x1(),
/// but the result is written byte by byte in reverse order (B[7] down to B[0]).
///
/// @param A Source, 8 bytes; accessed through fl::u32 loads.
/// @param B Destination, 8 bytes; written one byte at a time.
FASTLED_FORCE_INLINE void transpose8x1_MSB(unsigned char *A, unsigned char *B) FL_NOEXCEPT {
    // Bytes 0-3 of A become `lo`, bytes 4-7 become `hi` (native byte order).
    fl::u32 lo = *(fl::u32*)(A);
    fl::u32 hi = *(fl::u32*)(A+4);
    fl::u32 delta;

    // Swap stages on the upper word
    delta = (hi ^ (hi >> 7)) & 0x00AA00AA;
    hi ^= delta ^ (delta << 7);
    delta = (hi ^ (hi >> 14)) & 0x0000CCCC;
    hi ^= delta ^ (delta << 14);

    // Same swap stages on the lower word
    delta = (lo ^ (lo >> 7)) & 0x00AA00AA;
    lo ^= delta ^ (delta << 7);
    delta = (lo ^ (lo >> 14)) & 0x0000CCCC;
    lo ^= delta ^ (delta << 14);

    // Exchange nibbles between the two words
    fl::u32 out_hi = (hi & 0xF0F0F0F0) | ((lo >> 4) & 0x0F0F0F0F);
    fl::u32 out_lo = ((hi << 4) & 0xF0F0F0F0) | (lo & 0x0F0F0F0F);

    // Emit from the end of B backwards: low byte of the lower word lands in
    // B[7], high byte of the upper word lands in B[0].
    unsigned char *dst = B + 8;
    for (int i = 0; i < 4; ++i) {
        *--dst = static_cast<unsigned char>(out_lo);
        out_lo >>= 8;
    }
    for (int i = 0; i < 4; ++i) {
        *--dst = static_cast<unsigned char>(out_hi);
        out_hi >>= 8;
    }
}


/// Templated 8x8 bit transpose with custom strides.
///
/// @tparam m Input stride: row k is read from A[k * m]. For m == 1 the eight
///           bytes are loaded as two 32-bit words instead of byte by byte.
/// @tparam n Output stride: result row k is written to B[k * n].
/// @param A Source rows.
/// @param B Destination rows.
template<int m, int n>
FASTLED_FORCE_INLINE void transpose8(unsigned char *A, unsigned char *B) FL_NOEXCEPT {
    fl::u32 hi, lo, delta;

    // Gather the eight input rows into two 32-bit words.
    if (m != 1) {
        // Strided input: assemble each word byte by byte, first row in the top byte.
        hi = (fl::u32(A[0])<<24)   | (fl::u32(A[m])<<16)   | (fl::u32(A[2*m])<<8) | A[3*m];
        lo = (fl::u32(A[4*m])<<24) | (fl::u32(A[5*m])<<16) | (fl::u32(A[6*m])<<8) | A[7*m];
    } else {
        // Contiguous input: two direct word loads (native byte order).
        lo = *(fl::u32*)(A);
        hi = *(fl::u32*)(A+4);
    }

    // Swap stages on the upper word
    delta = (hi ^ (hi >> 7)) & 0x00AA00AA;
    hi ^= delta ^ (delta << 7);
    delta = (hi ^ (hi >> 14)) & 0x0000CCCC;
    hi ^= delta ^ (delta << 14);

    // Same swap stages on the lower word
    delta = (lo ^ (lo >> 7)) & 0x00AA00AA;
    lo ^= delta ^ (delta << 7);
    delta = (lo ^ (lo >> 14)) & 0x0000CCCC;
    lo ^= delta ^ (delta << 14);

    // Exchange nibbles between the two words
    fl::u32 out_hi = (hi & 0xF0F0F0F0) | ((lo >> 4) & 0x0F0F0F0F);
    fl::u32 out_lo = ((hi << 4) & 0xF0F0F0F0) | (lo & 0x0F0F0F0F);

    // Rows 7..4 come from the lower word (low byte first), rows 3..0 from the
    // upper word, each written at its strided position.
    for (int row = 7; row >= 4; --row) {
        B[row * n] = static_cast<unsigned char>(out_lo);
        out_lo >>= 8;
    }
    for (int row = 3; row >= 0; --row) {
        B[row * n] = static_cast<unsigned char>(out_hi);
        out_hi >>= 8;
    }
}


// ============================================================================

// Low-Level ISR-Safe Transposition Primitives

// ============================================================================


// Low-level bit-interleaving primitive for 2 lanes (ISR-safe).
// Writes 2 output bytes for every input byte position.
inline void transpose_2lane_inline(

    const u8* lane0_byte,

    const u8* lane1_byte,

    u8* output,

    size_t num_bytes

) FL_NOEXCEPT;


// 4-lane variant: `lanes` holds 4 source pointers; writes 4 output bytes per
// input byte position.
inline void transpose_4lane_inline(

    const u8* const lanes[4],

    u8* output,

    size_t num_bytes

) FL_NOEXCEPT;


// Low-level bit-interleaving primitive for 8 lanes (ISR-safe).
// Writes 8 output bytes per input byte position.
inline void transpose_8lane_inline(

    const u8* const lanes[8],

    u8* output,

    size_t num_bytes

) FL_NOEXCEPT;


// 16-lane variant: `lanes` holds 16 source pointers; writes 16 output bytes
// per input byte position.
inline void transpose_16lane_inline(

    const u8* const lanes[16],

    u8* output,

    size_t num_bytes

) FL_NOEXCEPT;


// Generic variant for arbitrary item types: writes sizeof(TSource) * 8 output
// bytes per item and packs at most the first 8 lanes into each output byte.
template<typename TSource>

inline void transpose_generic_inline(

    const TSource* const lanes[],

    size_t num_lanes,

    u8* output,

    size_t num_items

) FL_NOEXCEPT;


// Implementation of inline ISR-safe primitives


/// Interleave two byte lanes bit by bit, most significant bit first.
///
/// Each input position produces two output bytes: the first carries source
/// bits 7..4, the second bits 3..0. Within an output byte the lane-0 bit sits
/// on the even position and the lane-1 bit on the odd position directly above.
///
/// @param lane0_byte First lane, num_bytes long.
/// @param lane1_byte Second lane, num_bytes long.
/// @param output     Destination, 2 * num_bytes long.
/// @param num_bytes  Number of byte positions to process.
inline void transpose_2lane_inline(
    const u8* lane0_byte,
    const u8* lane1_byte,
    u8* output,
    size_t num_bytes
) FL_NOEXCEPT {
    u8* dest = output;
    for (size_t idx = 0; idx < num_bytes; ++idx, dest += 2) {
        const u8 first = lane0_byte[idx];
        const u8 second = lane1_byte[idx];

        // half 0 covers source bits 7..4, half 1 covers source bits 3..0
        for (int half = 0; half < 2; ++half) {
            u8 packed = 0;
            for (int pair = 0; pair < 4; ++pair) {
                const int src_bit = 7 - (half * 4 + pair);
                packed |= static_cast<u8>(((first >> src_bit) & 0x01) << (2 * pair));
                packed |= static_cast<u8>(((second >> src_bit) & 0x01) << (2 * pair + 1));
            }
            dest[half] = packed;
        }
    }
}


/// Interleave four byte lanes bit by bit, most significant bit first.
///
/// Each input position produces four output bytes. Output byte j holds source
/// bit (7 - 2j) of lanes 0..3 in its low nibble and source bit (6 - 2j) of
/// lanes 0..3 in its high nibble.
///
/// @param lanes     Four source pointers, each num_bytes long.
/// @param output    Destination, 4 * num_bytes long.
/// @param num_bytes Number of byte positions to process.
inline void transpose_4lane_inline(
    const u8* const lanes[4],
    u8* output,
    size_t num_bytes
) FL_NOEXCEPT {
    u8* dest = output;
    for (size_t idx = 0; idx < num_bytes; ++idx, dest += 4) {
        // Snapshot this byte position from every lane before writing anything.
        const u8 column[4] = {
            lanes[0][idx], lanes[1][idx], lanes[2][idx], lanes[3][idx]
        };

        for (int out_idx = 0; out_idx < 4; ++out_idx) {
            const int upper_bit = 7 - 2 * out_idx;
            const int lower_bit = upper_bit - 1;
            u8 packed = 0;
            for (int lane = 0; lane < 4; ++lane) {
                packed |= static_cast<u8>(((column[lane] >> upper_bit) & 0x01) << lane);
                packed |= static_cast<u8>(((column[lane] >> lower_bit) & 0x01) << (lane + 4));
            }
            dest[out_idx] = packed;
        }
    }
}


/// Low-level bit-interleaving primitive for 8 lanes.
///
/// Each input position produces eight output bytes. Output byte j holds source
/// bit (7 - j) of every lane, with lane k stored in bit k.
///
/// @param lanes     Eight source pointers, each num_bytes long.
/// @param output    Destination, 8 * num_bytes long.
/// @param num_bytes Number of byte positions to process.
inline void transpose_8lane_inline(
    const u8* const lanes[8],
    u8* output,
    size_t num_bytes
) FL_NOEXCEPT {
    u8* dest = output;
    for (size_t idx = 0; idx < num_bytes; ++idx, dest += 8) {
        // Gather this byte position from all lanes into one 64-bit value:
        // lane k occupies bits [8k, 8k + 7].
        u64 column = 0;
        for (int lane = 0; lane < 8; ++lane) {
            column |= static_cast<u64>(lanes[lane][idx]) << (8 * lane);
        }

        // Emit most significant source bit first.
        for (int out_idx = 0; out_idx < 8; ++out_idx) {
            const int src_bit = 7 - out_idx;
            u64 merged = 0;
            for (int lane = 0; lane < 8; ++lane) {
                merged |= ((column >> (src_bit + 8 * lane)) & 0x01) << lane;
            }
            dest[out_idx] = static_cast<u8>(merged);
        }
    }
}


/// Bit-interleaving primitive for 16 lanes.
///
/// Each input position produces sixteen output bytes. Bytes 0..7 hold lanes
/// 0..7 (byte j = source bit 7 - j, lane k in bit k); bytes 8..15 hold lanes
/// 8..15 in the same arrangement.
///
/// @param lanes     Sixteen source pointers, each num_bytes long.
/// @param output    Destination, 16 * num_bytes long.
/// @param num_bytes Number of byte positions to process.
inline void transpose_16lane_inline(
    const u8* const lanes[16],
    u8* output,
    size_t num_bytes
) FL_NOEXCEPT {
    u8* dest = output;
    for (size_t idx = 0; idx < num_bytes; ++idx, dest += 16) {
        // Gather lanes 0-7 and lanes 8-15 into one 64-bit value each; within a
        // value, lane k (relative to its group) occupies bits [8k, 8k + 7].
        u64 column_lo = 0;
        u64 column_hi = 0;
        for (int lane = 0; lane < 8; ++lane) {
            column_lo |= static_cast<u64>(lanes[lane][idx]) << (8 * lane);
        }
        for (int lane = 0; lane < 8; ++lane) {
            column_hi |= static_cast<u64>(lanes[8 + lane][idx]) << (8 * lane);
        }

        // Emit most significant source bit first, writing the low-group byte
        // and then the matching high-group byte for each bit.
        for (int out_idx = 0; out_idx < 8; ++out_idx) {
            const int src_bit = 7 - out_idx;

            u64 merged_lo = 0;
            for (int lane = 0; lane < 8; ++lane) {
                merged_lo |= ((column_lo >> (src_bit + 8 * lane)) & 0x01) << lane;
            }
            dest[out_idx] = static_cast<u8>(merged_lo);

            u64 merged_hi = 0;
            for (int lane = 0; lane < 8; ++lane) {
                merged_hi |= ((column_hi >> (src_bit + 8 * lane)) & 0x01) << lane;
            }
            dest[8 + out_idx] = static_cast<u8>(merged_hi);
        }
    }
}


/// Generic bit transposer for items of any width.
///
/// For every item index, emits sizeof(TSource) * 8 output bytes, one per
/// source bit from most significant to least significant. In each output byte
/// lane 0 is stored in bit 7, lane 1 in bit 6, and so on; lanes beyond the
/// eighth are ignored.
///
/// @tparam TSource  Item type read from each lane.
/// @param lanes     Source pointers, one per lane, each num_items long.
/// @param num_lanes Number of entries in `lanes`.
/// @param output    Destination, num_items * sizeof(TSource) * 8 bytes long.
/// @param num_items Number of items to process per lane.
template<typename TSource>
inline void transpose_generic_inline(
    const TSource* const lanes[],
    size_t num_lanes,
    u8* output,
    size_t num_items
) FL_NOEXCEPT {
    constexpr size_t bits_per_item = sizeof(TSource) * 8;

    // Only eight lanes fit into one output byte.
    const size_t active_lanes = (num_lanes < 8) ? num_lanes : 8;

    u8* cursor = output;
    for (size_t item = 0; item < num_items; ++item) {
        // Count the source bit down from the top; one output byte per bit.
        for (size_t src_bit = bits_per_item; src_bit-- > 0; ) {
            u8 packed = 0;
            for (size_t lane = 0; lane < active_lanes; ++lane) {
                TSource value = lanes[lane][item];
                u8 bit = (value >> src_bit) & 0x01;
                packed |= (bit << (7 - lane));
            }
            *cursor++ = packed;
        }
    }
}


// ============================================================================

// SPI Multi-Lane Transposer

// ============================================================================


/// Unified stateless bit-interleaving transposer for multi-lane SPI parallel
/// LED transmission. All members are static; the definitions live outside
/// this header section.
class SPITransposer {

public:


    /// Lane data structure: payload + padding frame.
    struct LaneData {

        fl::span<const u8> payload;        ///< Actual LED data for this lane.

        fl::span<const u8> padding_frame;  ///< Black LED frame for padding (repeating pattern).

    };


    /// Transpose 2 lanes of data into interleaved dual-SPI format.
    /// `error` is an optional out-pointer (defaults to nullptr).
    /// NOTE(review): presumably returns false and sets *error on failure — confirm against the definition.
    static bool transpose2(const fl::optional<LaneData>& lane0,

                          const fl::optional<LaneData>& lane1,

                          fl::span<u8> output,

                          const char** error = nullptr) FL_NOEXCEPT;


    /// Transpose 4 lanes of data into interleaved quad-SPI format.
    static bool transpose4(const fl::optional<LaneData>& lane0,

                          const fl::optional<LaneData>& lane1,

                          const fl::optional<LaneData>& lane2,

                          const fl::optional<LaneData>& lane3,

                          fl::span<u8> output,

                          const char** error = nullptr) FL_NOEXCEPT;


    /// Transpose 8 lanes of data into interleaved octal-SPI format.
    static bool transpose8(const fl::optional<LaneData> lanes[8],

                          fl::span<u8> output,

                          const char** error = nullptr) FL_NOEXCEPT;


    /// Transpose 16 lanes of data into interleaved hex-SPI format.
    static bool transpose16(const fl::optional<LaneData> lanes[16],

                           fl::span<u8> output,

                           const char** error = nullptr) FL_NOEXCEPT;


private:

    /// Get byte from lane at given index, handling padding automatically.
    static u8 getLaneByte(const LaneData& lane, size_t byte_idx, size_t max_size) FL_NOEXCEPT;

};


// ============================================================================

// Parallel Strip Transposer (RP2040/RP2350 PIO)

// ============================================================================


/// Transpose 8 LED strips into parallel bit format.
///
/// For every byte of every LED, one byte is collected from each of the eight
/// strips and handed to transpose8x1_MSB(), which writes 8 output bytes.
///
/// @param input         Eight strip pointers, each num_leds * bytes_per_led long.
/// @param output        Destination, num_leds * bytes_per_led * 8 bytes long.
/// @param num_leds      Number of LEDs per strip.
/// @param bytes_per_led Bytes stored per LED in each strip.
FASTLED_FORCE_INLINE void transpose_8strips(
    const u8* const input[8],
    u8* output,
    u16 num_leds,
    u8 bytes_per_led
) FL_NOEXCEPT {
    // transpose8x1_MSB() reads its source through 32-bit loads, so the scratch
    // buffer is explicitly word-aligned rather than relying on how the
    // compiler happens to place a plain byte array on the stack.
    alignas(fl::u32) u8 temp_input[8];

    // Process each LED
    for (u16 led = 0; led < num_leds; led++) {
        // Process each byte in the LED
        for (u8 byte_idx = 0; byte_idx < bytes_per_led; byte_idx++) {
            const size_t offset = static_cast<size_t>(led) * bytes_per_led + byte_idx;

            // Collect one byte from each strip for this byte position
            for (int strip = 0; strip < 8; strip++) {
                temp_input[strip] = input[strip][offset];
            }

            // Transpose 8 bytes -> 8 bytes (1 bit from each strip per output byte)
            transpose8x1_MSB(temp_input, output);

            // Advance output pointer by the 8 bytes just written
            output += 8;
        }
    }
}


/// Transpose 4 LED strips into parallel bit format.
///
/// For every byte of every LED, emits 8 output bytes (source bit 7 first).
/// Each output byte carries strip k's bit in bit k of the low nibble; the high
/// nibble is zero.
///
/// @param input         Four strip pointers, each num_leds * bytes_per_led long.
/// @param output        Destination, num_leds * bytes_per_led * 8 bytes long.
/// @param num_leds      Number of LEDs per strip.
/// @param bytes_per_led Bytes stored per LED in each strip.
FASTLED_FORCE_INLINE void transpose_4strips(
    const u8* const input[4],
    u8* output,
    u16 num_leds,
    u8 bytes_per_led
) FL_NOEXCEPT {
    u8* cursor = output;

    for (u16 led = 0; led < num_leds; ++led) {
        for (u8 byte_idx = 0; byte_idx < bytes_per_led; ++byte_idx) {
            // Snapshot this byte position from every strip.
            u8 column[4];
            for (int strip = 0; strip < 4; ++strip) {
                column[strip] = input[strip][led * bytes_per_led + byte_idx];
            }

            // Walk a single-bit mask from the MSB down to the LSB.
            for (u8 mask = 0x80; mask != 0; mask >>= 1) {
                u8 nibble = 0;
                for (int strip = 0; strip < 4; ++strip) {
                    if (column[strip] & mask) {
                        nibble |= static_cast<u8>(1u << strip);
                    }
                }
                *cursor++ = nibble;
            }
        }
    }
}


/// Transpose 2 LED strips into parallel bit format.
///
/// For every byte of every LED, emits 8 output bytes (source bit 7 first).
/// Each output byte carries strip 0's bit in bit 0 and strip 1's bit in bit 1;
/// all other bits are zero.
///
/// @param input         Two strip pointers, each num_leds * bytes_per_led long.
/// @param output        Destination, num_leds * bytes_per_led * 8 bytes long.
/// @param num_leds      Number of LEDs per strip.
/// @param bytes_per_led Bytes stored per LED in each strip.
FASTLED_FORCE_INLINE void transpose_2strips(
    const u8* const input[2],
    u8* output,
    u16 num_leds,
    u8 bytes_per_led
) FL_NOEXCEPT {
    u8* cursor = output;

    for (u16 led = 0; led < num_leds; ++led) {
        for (u8 byte_idx = 0; byte_idx < bytes_per_led; ++byte_idx) {
            // Snapshot this byte position from both strips.
            const u8 first = input[0][led * bytes_per_led + byte_idx];
            const u8 second = input[1][led * bytes_per_led + byte_idx];

            // Walk a single-bit mask from the MSB down to the LSB.
            for (u8 mask = 0x80; mask != 0; mask >>= 1) {
                u8 pair = 0;
                if (first & mask) {
                    pair |= 0x01;
                }
                if (second & mask) {
                    pair |= 0x02;
                }
                *cursor++ = pair;
            }
        }
    }
}


/// Output buffer size, in bytes, needed by the strip transposers in this
/// header: each of them writes 8 output bytes per input byte position.
///
/// @param num_leds      Number of LEDs per strip.
/// @param bytes_per_led Bytes stored per LED in each strip.
/// @return num_leds * bytes_per_led * 8, computed in 32 bits.
FASTLED_FORCE_INLINE u32 calculate_transpose_buffer_size(u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT {
    // Widen before multiplying: the operands would otherwise be multiplied at
    // promoted int width, which wraps on targets where int is 16 bits.
    return static_cast<u32>(num_leds) * bytes_per_led * 8u;
}


/// Helper to transpose N strips with automatic dispatch.
///
/// Forwards to transpose_8strips(), transpose_4strips() or transpose_2strips()
/// according to num_strips.
///
/// @param num_strips    Number of strips; only 8, 4 and 2 are supported.
/// @param input         Array of num_strips strip pointers.
/// @param output        Destination buffer passed through to the selected transposer.
/// @param num_leds      Number of LEDs per strip.
/// @param bytes_per_led Bytes stored per LED in each strip.
/// @return true if a transposer ran, false for an unsupported strip count
///         (in which case output is left untouched).
inline bool transpose_strips(
    u8 num_strips,
    const u8* const* input,
    u8* output,
    u16 num_leds,
    u8 bytes_per_led
) FL_NOEXCEPT {
    if (num_strips == 8) {
        transpose_8strips(input, output, num_leds, bytes_per_led);
    } else if (num_strips == 4) {
        transpose_4strips(input, output, num_leds, bytes_per_led);
    } else if (num_strips == 2) {
        transpose_2strips(input, output, num_leds, bytes_per_led);
    } else {
        // Invalid strip count
        return false;
    }
    return true;
}


// ============================================================================

// PARLIO Wave8 Transposer (ESP32-S3 Parallel I/O)

// ============================================================================


/// Transpose one Wave8 record per lane into the pulse-ordered PARLIO stream.
///
/// The input is DATA_WIDTH consecutive 8-byte records. Byte `bit_pos` of a
/// record is read MSB-first as 8 pulses, giving 64 pulses per lane. All
/// branches emit pulses in the same order: pulse index = bit_pos * 8 + pulse_bit.
///
/// @tparam DATA_WIDTH   Number of lanes. 8 and 16 take dedicated paths, 1..7
///                      take the generic packed path, anything else returns 0.
/// @param laneWaveforms DATA_WIDTH * 8 bytes of lane records.
/// @param outputBuffer  Destination: 64 bytes for 8 lanes, 128 bytes for 16
///                      lanes (one little-endian 16-bit word per pulse), and
///                      ceil(64 / (8 / DATA_WIDTH)) bytes for narrower widths.
/// @return Number of bytes written to outputBuffer.
template<size_t DATA_WIDTH>
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION size_t transpose_wave8byte_parlio_template(
    const u8* FL_RESTRICT_PARAM laneWaveforms,
    u8* FL_RESTRICT_PARAM outputBuffer
) FL_NOEXCEPT {
    constexpr size_t bytes_per_lane = 8;   // sizeof(Wave8Byte)
    constexpr size_t pulsesPerByte = 64;   // 8 bits x 8 pulses per bit
    size_t outputIdx = 0;

    // Note: Using regular if statements (C++11 compatible)
    // Compiler optimizes away dead branches for constant template parameters
    if (DATA_WIDTH == 8) {
        // Dedicated 8-lane path: one output byte per pulse, lane k in bit k.
        for (size_t bit_pos = 0; bit_pos < 8; bit_pos++) {
            // Pack the 8 lanes' waveform bytes for this bit position into one
            // 64-bit value (lane k in bits [8k, 8k + 7]); done once per bit_pos.
            u64 packed =
                ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos] << 0)  |
                ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos] << 8)  |
                ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos] << 16) |
                ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos] << 24) |
                ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos] << 32) |
                ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos] << 40) |
                ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos] << 48) |
                ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos] << 56);

            // Extract the 8 pulses, most significant waveform bit first.
            for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
                outputBuffer[outputIdx++] =
                    ((packed >> (7 - pulse_bit + 0))  & 0x01) << 0 |
                    ((packed >> (7 - pulse_bit + 8))  & 0x01) << 1 |
                    ((packed >> (7 - pulse_bit + 16)) & 0x01) << 2 |
                    ((packed >> (7 - pulse_bit + 24)) & 0x01) << 3 |
                    ((packed >> (7 - pulse_bit + 32)) & 0x01) << 4 |
                    ((packed >> (7 - pulse_bit + 40)) & 0x01) << 5 |
                    ((packed >> (7 - pulse_bit + 48)) & 0x01) << 6 |
                    ((packed >> (7 - pulse_bit + 56)) & 0x01) << 7;
            }
        }
    } else if (DATA_WIDTH <= 8) {
        // Generic path for fewer than 8 lanes: several pulses share one output byte.
        // The ternary keeps the division well-formed when this dead branch is
        // still compiled for DATA_WIDTH > 8.
        const size_t ticksPerByte = (DATA_WIDTH > 8) ? 1 : (8 / DATA_WIDTH);
        const size_t numOutputBytes = (pulsesPerByte + ticksPerByte - 1) / ticksPerByte;

        for (size_t outputByteIdx = 0; outputByteIdx < numOutputBytes; outputByteIdx++) {
            u8 outputByte = 0;

            FL_UNROLL(8)
            for (size_t t = 0; t < ticksPerByte; t++) {
                size_t pulse_idx = outputByteIdx * ticksPerByte + t;
                if (pulse_idx >= pulsesPerByte)
                    break;

                // Split the running pulse index into waveform byte and bit within it.
                size_t bit_pos = pulse_idx / 8;
                size_t pulse_bit = pulse_idx % 8;

                FL_UNROLL(8)
                for (size_t lane = 0; lane < DATA_WIDTH; lane++) {
                    const u8* laneWaveform = laneWaveforms + (lane * bytes_per_lane);
                    u8 wave8_byte = laneWaveform[bit_pos];
                    u8 pulse = (wave8_byte >> (7 - pulse_bit)) & 1;

                    // Tick t occupies DATA_WIDTH bits starting at t * DATA_WIDTH.
                    size_t bitPos = t * DATA_WIDTH + lane;
                    outputByte |= (pulse << bitPos);
                }
            }

            outputBuffer[outputIdx++] = outputByte;
        }
    } else if (DATA_WIDTH == 16) {
        // Dedicated 16-lane path: one 16-bit word per pulse, lane k in bit k,
        // stored low byte first. Two bit positions are processed per outer
        // iteration and staged in a 32-byte buffer before being copied out.
        u8 writeBuffer[32];

        for (size_t bit_pos = 0; bit_pos < 8; bit_pos += 2) {
            // Pack lanes 0-7 / 8-15 for bit_pos (suffix _0) and bit_pos + 1 (suffix _1).
            u64 packed_lo_0 =
                ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 0] << 0)  |
                ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 0] << 8)  |
                ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 0] << 16) |
                ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 0] << 24) |
                ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 0] << 32) |
                ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 0] << 40) |
                ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 0] << 48) |
                ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 0] << 56);

            u64 packed_hi_0 =
                ((u64)laneWaveforms[8  * bytes_per_lane + bit_pos + 0] << 0)  |
                ((u64)laneWaveforms[9  * bytes_per_lane + bit_pos + 0] << 8)  |
                ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 0] << 16) |
                ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 0] << 24) |
                ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 0] << 32) |
                ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 0] << 40) |
                ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 0] << 48) |
                ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 0] << 56);

            u64 packed_lo_1 =
                ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 1] << 0)  |
                ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 1] << 8)  |
                ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 1] << 16) |
                ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 1] << 24) |
                ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 1] << 32) |
                ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 1] << 40) |
                ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 1] << 48) |
                ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 1] << 56);

            u64 packed_hi_1 =
                ((u64)laneWaveforms[8  * bytes_per_lane + bit_pos + 1] << 0)  |
                ((u64)laneWaveforms[9  * bytes_per_lane + bit_pos + 1] << 8)  |
                ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 1] << 16) |
                ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 1] << 24) |
                ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 1] << 32) |
                ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 1] << 40) |
                ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 1] << 48) |
                ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 1] << 56);

            // The two extractions are independent, so they can overlap in the pipeline.
            for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
                // Word for (bit_pos, pulse_bit)
                u16 outputWord_0 =
                    ((packed_lo_0 >> (7 - pulse_bit + 0))  & 0x01) << 0  |
                    ((packed_lo_0 >> (7 - pulse_bit + 8))  & 0x01) << 1  |
                    ((packed_lo_0 >> (7 - pulse_bit + 16)) & 0x01) << 2  |
                    ((packed_lo_0 >> (7 - pulse_bit + 24)) & 0x01) << 3  |
                    ((packed_lo_0 >> (7 - pulse_bit + 32)) & 0x01) << 4  |
                    ((packed_lo_0 >> (7 - pulse_bit + 40)) & 0x01) << 5  |
                    ((packed_lo_0 >> (7 - pulse_bit + 48)) & 0x01) << 6  |
                    ((packed_lo_0 >> (7 - pulse_bit + 56)) & 0x01) << 7  |
                    ((packed_hi_0 >> (7 - pulse_bit + 0))  & 0x01) << 8  |
                    ((packed_hi_0 >> (7 - pulse_bit + 8))  & 0x01) << 9  |
                    ((packed_hi_0 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
                    ((packed_hi_0 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
                    ((packed_hi_0 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
                    ((packed_hi_0 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
                    ((packed_hi_0 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
                    ((packed_hi_0 >> (7 - pulse_bit + 56)) & 0x01) << 15;

                // Word for (bit_pos + 1, pulse_bit)
                u16 outputWord_1 =
                    ((packed_lo_1 >> (7 - pulse_bit + 0))  & 0x01) << 0  |
                    ((packed_lo_1 >> (7 - pulse_bit + 8))  & 0x01) << 1  |
                    ((packed_lo_1 >> (7 - pulse_bit + 16)) & 0x01) << 2  |
                    ((packed_lo_1 >> (7 - pulse_bit + 24)) & 0x01) << 3  |
                    ((packed_lo_1 >> (7 - pulse_bit + 32)) & 0x01) << 4  |
                    ((packed_lo_1 >> (7 - pulse_bit + 40)) & 0x01) << 5  |
                    ((packed_lo_1 >> (7 - pulse_bit + 48)) & 0x01) << 6  |
                    ((packed_lo_1 >> (7 - pulse_bit + 56)) & 0x01) << 7  |
                    ((packed_hi_1 >> (7 - pulse_bit + 0))  & 0x01) << 8  |
                    ((packed_hi_1 >> (7 - pulse_bit + 8))  & 0x01) << 9  |
                    ((packed_hi_1 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
                    ((packed_hi_1 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
                    ((packed_hi_1 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
                    ((packed_hi_1 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
                    ((packed_hi_1 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
                    ((packed_hi_1 >> (7 - pulse_bit + 56)) & 0x01) << 15;

                // Keep the stream in pulse order: all 8 words of bit_pos fill
                // the first half of the staging buffer, all 8 words of
                // bit_pos + 1 fill the second half. Appending the two words
                // alternately would interleave two bit positions pulse by
                // pulse, unlike the other DATA_WIDTH branches.
                const size_t slot = pulse_bit * 2;
                writeBuffer[slot + 0]      = outputWord_0 & 0xFF;
                writeBuffer[slot + 1]      = (outputWord_0 >> 8) & 0xFF;
                writeBuffer[16 + slot + 0] = outputWord_1 & 0xFF;
                writeBuffer[16 + slot + 1] = (outputWord_1 >> 8) & 0xFF;
            }

            // Every outer iteration fills the staging buffer exactly
            // (2 bit positions x 8 pulses x 2 bytes); copy it out in one go.
            fl::memcpy(&outputBuffer[outputIdx], writeBuffer, 32);
            outputIdx += 32;
        }
    } else {
        // Unsupported DATA_WIDTH: nothing is written and 0 is returned.
        return 0;
    }

    return outputIdx;
}


/// Runtime front end for transpose_wave8byte_parlio_template<>.
///
/// Maps the runtime lane count onto the matching template instantiation.
///
/// @param laneWaveforms Lane records, forwarded unchanged.
/// @param data_width    Number of lanes; supported values are 1, 2, 4, 8 and 16.
/// @param outputBuffer  Destination buffer, forwarded unchanged.
/// @return Bytes written by the selected instantiation, or 0 when data_width
///         is not one of the supported values.
FASTLED_FORCE_INLINE FL_IRAM size_t transpose_wave8byte_parlio(
    const u8* FL_RESTRICT_PARAM laneWaveforms,
    size_t data_width,
    u8* FL_RESTRICT_PARAM outputBuffer
) FL_NOEXCEPT {
    if (data_width == 1) {
        return transpose_wave8byte_parlio_template<1>(laneWaveforms, outputBuffer);
    }
    if (data_width == 2) {
        return transpose_wave8byte_parlio_template<2>(laneWaveforms, outputBuffer);
    }
    if (data_width == 4) {
        return transpose_wave8byte_parlio_template<4>(laneWaveforms, outputBuffer);
    }
    if (data_width == 8) {
        return transpose_wave8byte_parlio_template<8>(laneWaveforms, outputBuffer);
    }
    if (data_width == 16) {
        return transpose_wave8byte_parlio_template<16>(laneWaveforms, outputBuffer);
    }

    // Invalid data_width
    return 0;
}


}  // namespace fl


FL_OPTIMIZATION_LEVEL_O3_END

fl::SPITransposer::transpose8
static bool transpose8(const fl::optional< LaneData > lanes[8], fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 8 lanes of data into interleaved octal-SPI format.
Definition transposition.cpp.hpp:180

fl::SPITransposer::transpose4
static bool transpose4(const fl::optional< LaneData > &lane0, const fl::optional< LaneData > &lane1, const fl::optional< LaneData > &lane2, const fl::optional< LaneData > &lane3, fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 4 lanes of data into interleaved quad-SPI format.
Definition transposition.cpp.hpp:114

fl::SPITransposer::transpose16
static bool transpose16(const fl::optional< LaneData > lanes[16], fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 16 lanes of data into interleaved hex-SPI format.
Definition transposition.cpp.hpp:240

fl::SPITransposer::transpose2
static bool transpose2(const fl::optional< LaneData > &lane0, const fl::optional< LaneData > &lane1, fl::span< u8 > output, const char **error=nullptr) FL_NOEXCEPT
Transpose 2 lanes of data into interleaved dual-SPI format.
Definition transposition.cpp.hpp:53

fl::SPITransposer::getLaneByte
static u8 getLaneByte(const LaneData &lane, size_t byte_idx, size_t max_size) FL_NOEXCEPT
Get byte from lane at given index, handling padding automatically.
Definition transposition.cpp.hpp:300

fl::SPITransposer
Unified stateless bit-interleaving transposer for multi-lane SPI parallel LED transmission.
Definition transposition.h:583

fl::SPITransposer::LaneData::payload
fl::span< const u8 > payload
Actual LED data for this lane.
Definition transposition.h:587

fl::SPITransposer::LaneData::padding_frame
fl::span< const u8 > padding_frame
Black LED frame for padding (repeating pattern)
Definition transposition.h:588

fl::SPITransposer::LaneData
Lane data structure: payload + padding frame.
Definition transposition.h:586

fl::span
Definition span.h:385

compiler_control.h

cstring.h

int.h

u8
unsigned char u8
Definition stdint.h:131

fl::fl::u8
unsigned char u8
Definition s16x16x4.h:132

fl::x
x
Definition transposition.cpp.hpp:24

fl::memcpy
void * memcpy(void *dest, const void *src, size_t n) FL_NOEXCEPT
Definition cstring.cpp.hpp:110

fl::transpose_wave8byte_parlio
FASTLED_FORCE_INLINE FL_IRAM size_t transpose_wave8byte_parlio(const u8 *FL_RESTRICT_PARAM laneWaveforms, size_t data_width, u8 *FL_RESTRICT_PARAM outputBuffer) FL_NOEXCEPT
Definition transposition.h:1064

fl::u8
unsigned char u8
Definition stdint.h:131

fl::transpose8x1_MSB
FASTLED_FORCE_INLINE void transpose8x1_MSB(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Simplified 8x1 bit transpose with MSB-first output.
Definition transposition.h:146

fl::y
y
Definition transposition.cpp.hpp:23

fl::B
FL_DISABLE_WARNING_PUSH unsigned char * B
Definition transposition.cpp.hpp:19

fl::transpose_2strips
FASTLED_FORCE_INLINE void transpose_2strips(const u8 *const input[2], u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Transpose 2 LED strips into parallel bit format.
Definition transposition.h:767

fl::transpose_2lane_inline
void transpose_2lane_inline(const u8 *lane0_byte, const u8 *lane1_byte, u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 2 lanes (ISR-safe)
Definition transposition.h:329

fl::optional
Optional< T > optional
Definition optional.h:16

fl::transpose8
FASTLED_FORCE_INLINE void transpose8(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Templated 8x8 bit transpose with custom stride.
Definition transposition.h:186

fl::transpose8x1_noinline
void transpose8x1_noinline(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Simplified 8x1 bit transpose (non-inline version)

fl::transpose_strips
bool transpose_strips(u8 num_strips, const u8 *const *input, u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Helper to transpose N strips with automatic dispatch.
Definition transposition.h:812

fl::transpose_8lane_inline
void transpose_8lane_inline(const u8 *const lanes[8], u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 8 lanes (ISR-safe)
Definition transposition.h:382

fl::transpose8x1
FASTLED_FORCE_INLINE void transpose8x1(unsigned char *A, unsigned char *B) FL_NOEXCEPT
Simplified 8x1 bit transpose (inline version)
Definition transposition.h:116

fl::transpose_4strips
FASTLED_FORCE_INLINE void transpose_4strips(const u8 *const input[4], u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Transpose 4 LED strips into parallel bit format.
Definition transposition.h:722

fl::calculate_transpose_buffer_size
FASTLED_FORCE_INLINE u32 calculate_transpose_buffer_size(u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Calculate output buffer size needed for transposed data.
Definition transposition.h:800

fl::transpose_wave8byte_parlio_template
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION size_t transpose_wave8byte_parlio_template(const u8 *FL_RESTRICT_PARAM laneWaveforms, u8 *FL_RESTRICT_PARAM outputBuffer) FL_NOEXCEPT
Template variant of transpose_wave8byte_parlio with data_width fixed at compile time (optimization)
Definition transposition.h:845

fl::transpose_generic_inline
void transpose_generic_inline(const TSource *const lanes[], size_t num_lanes, u8 *output, size_t num_items) FL_NOEXCEPT
Generic bit-interleaving primitive for N lanes with M-bit source data (ISR-safe)
Definition transposition.h:473

fl::transpose_16lane_inline
void transpose_16lane_inline(const u8 *const lanes[16], u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 16 lanes (ISR-safe)
Definition transposition.h:417

fl::transpose_8strips
FASTLED_FORCE_INLINE void transpose_8strips(const u8 *const input[8], u8 *output, u16 num_leds, u8 bytes_per_led) FL_NOEXCEPT
Transpose 8 LED strips into parallel bit format.
Definition transposition.h:680

fl::t
t
Definition transposition.cpp.hpp:27

fl::u64
fl::u64 u64
Definition s16x16x4.h:221

fl::transpose_4lane_inline
void transpose_4lane_inline(const u8 *const lanes[4], u8 *output, size_t num_bytes) FL_NOEXCEPT
Low-level bit-interleaving primitive for 4 lanes (ISR-safe)
Definition transposition.h:355

fl
Base definition for an LED controller.
Definition crgb.hpp:179

fl::sub4::b0
fl::u32 b0
byte 'b', bit 0
Definition transposition.h:67

fl::bitswap_type::bytes
fl::u8 bytes[8]
eight 8-bit values to load for swapping
Definition transposition.h:96

fl::bitswap_type.__unnamed0__::a
sub4 a
32-bit access struct for bit swapping, upper four bytes

fl::sub4::c7
fl::u32 c7
byte 'c', bit 7
Definition transposition.h:82

fl::sub4::d7
fl::u32 d7
byte 'd', bit 7
Definition transposition.h:90

fl::sub4::d4
fl::u32 d4
byte 'd', bit 4
Definition transposition.h:87

fl::sub4::d6
fl::u32 d6
byte 'd', bit 6
Definition transposition.h:89

fl::just8bits.__unnamed0__::a5
fl::u32 a5
bit 5 (0x20)

fl::sub4::a0
fl::u32 a0
byte 'a', bit 0
Definition transposition.h:59

fl::sub4::b6
fl::u32 b6
byte 'b', bit 6
Definition transposition.h:73

fl::sub4::c6
fl::u32 c6
byte 'c', bit 6
Definition transposition.h:81

fl::just8bits.__unnamed0__::a0
fl::u32 a0
bit 0 (0x01)

fl::sub4::b5
fl::u32 b5
byte 'b', bit 5
Definition transposition.h:72

fl::sub4::c0
fl::u32 c0
byte 'c', bit 0
Definition transposition.h:75

fl::sub4::a7
fl::u32 a7
byte 'a', bit 7
Definition transposition.h:66

fl::just8bits.__unnamed0__::a2
fl::u32 a2
bit 2 (0x04)

fl::sub4::b1
fl::u32 b1
byte 'b', bit 1
Definition transposition.h:68

fl::sub4::c1
fl::u32 c1
byte 'c', bit 1
Definition transposition.h:76

fl::sub4::c2
fl::u32 c2
byte 'c', bit 2
Definition transposition.h:77

fl::just8bits.__unnamed0__::a4
fl::u32 a4
bit 4 (0x10)

fl::just8bits.__unnamed0__::a1
fl::u32 a1
bit 1 (0x02)

fl::bitswap_type::word
fl::u32 word[2]
two 32-bit values to load for swapping
Definition transposition.h:95

fl::sub4::b3
fl::u32 b3
byte 'b', bit 3
Definition transposition.h:70

fl::bitswap_type.__unnamed0__::b
sub4 b
32-bit access struct for bit swapping, lower four bytes

fl::sub4::c4
fl::u32 c4
byte 'c', bit 4
Definition transposition.h:79

fl::sub4::b4
fl::u32 b4
byte 'b', bit 4
Definition transposition.h:71

fl::sub4::a3
fl::u32 a3
byte 'a', bit 3
Definition transposition.h:62

fl::sub4::d5
fl::u32 d5
byte 'd', bit 5
Definition transposition.h:88

fl::just8bits.__unnamed0__::a3
fl::u32 a3
bit 3 (0x08)

fl::sub4::b2
fl::u32 b2
byte 'b', bit 2
Definition transposition.h:69

fl::sub4::a5
fl::u32 a5
byte 'a', bit 5
Definition transposition.h:64

fl::sub4::a4
fl::u32 a4
byte 'a', bit 4
Definition transposition.h:63

fl::sub4::c3
fl::u32 c3
byte 'c', bit 3
Definition transposition.h:78

fl::sub4::d2
fl::u32 d2
byte 'd', bit 2
Definition transposition.h:85

fl::just8bits::raw
fl::u8 raw
the entire byte
Definition transposition.h:44

fl::sub4::a2
fl::u32 a2
byte 'a', bit 2
Definition transposition.h:61

fl::sub4::c5
fl::u32 c5
byte 'c', bit 5
Definition transposition.h:80

fl::just8bits.__unnamed0__::a7
fl::u32 a7
bit 7 (0x80)

fl::sub4::b7
fl::u32 b7
byte 'b', bit 7
Definition transposition.h:74

fl::sub4::d1
fl::u32 d1
byte 'd', bit 1
Definition transposition.h:84

fl::sub4::d0
fl::u32 d0
byte 'd', bit 0
Definition transposition.h:83

fl::sub4::a6
fl::u32 a6
byte 'a', bit 6
Definition transposition.h:65

fl::sub4::a1
fl::u32 a1
byte 'a', bit 1
Definition transposition.h:60

fl::just8bits.__unnamed0__::a6
fl::u32 a6
bit 6 (0x40)

fl::sub4::d3
fl::u32 d3
byte 'd', bit 3
Definition transposition.h:86

fl::max_size
Definition type_traits.h:904

fl::sub4
Structure representing 32 bits of access.
Definition transposition.h:58

noexcept.h

optional.h

FL_OPTIMIZATION_LEVEL_O3_BEGIN
#define FL_OPTIMIZATION_LEVEL_O3_BEGIN

FL_UNROLL
#define FL_UNROLL(N)

FASTLED_FORCE_INLINE
#define FASTLED_FORCE_INLINE

FL_OPTIMIZATION_LEVEL_O3_END
#define FL_OPTIMIZATION_LEVEL_O3_END

FL_OPTIMIZE_FUNCTION
#define FL_OPTIMIZE_FUNCTION

FL_IRAM
#define FL_IRAM

FL_RESTRICT_PARAM
#define FL_RESTRICT_PARAM

FL_NOEXCEPT
#define FL_NOEXCEPT

span.h