◆ transpose_wave8byte_parlio_template()

template<size_t DATA_WIDTH>
FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION size_t fl::transpose_wave8byte_parlio_template	(	const u8 *FL_RESTRICT_PARAM	laneWaveforms,
		u8 *FL_RESTRICT_PARAM	outputBuffer )
Template version of the wave8 transpose in which the data width is a compile-time constant (an optimization).
This template version eliminates runtime branching by specializing for each data width. The compiler generates optimized code for each DATA_WIDTH value at compile time.
Template Parameters
DATA_WIDTH Number of parallel lanes (1, 2, 4, 8, or 16) - compile-time constant
Examples: /home/runner/work/FastLED/FastLED/src/fl/math/transposition.h.
Definition at line 845 of file transposition.h.
              {
    // Transposes one wave8-expanded byte per lane into the parallel output
    // stream, one lane per output bit.
    //
    // Input:  laneWaveforms is indexed as [lane * 8 + bit_pos]; each byte holds
    //         the 8 pulses of one source bit, emitted MSB first (bit 7 is the
    //         first pulse).
    // Output: 64 ticks in time order (bit_pos major, pulse minor). Bit `lane`
    //         of each tick carries that lane's pulse.
    //           - DATA_WIDTH == 8  : 1 byte per tick           -> 64 bytes
    //           - DATA_WIDTH <  8  : 8/DATA_WIDTH ticks per byte, earlier ticks
    //                                in the lower bits
    //           - DATA_WIDTH == 16 : 1 little-endian 16-bit word per tick
    //                                                           -> 128 bytes
    // Returns the number of bytes written to outputBuffer, or 0 for an
    // unsupported DATA_WIDTH.
    constexpr size_t bytes_per_lane = 8;   // sizeof(Wave8Byte)
    constexpr size_t pulsesPerByte = 64;   // 8 bits x 8 pulses per bit
    size_t outputIdx = 0;

    // Note: Using regular if statements (C++11 compatible)
    // Compiler optimizes away dead branches for constant template parameters
    if (DATA_WIDTH == 8) {
        // Special optimized case for 8 lanes with bit packing
        // Optimized: Hoist packing outside inner loop to reduce redundant operations
        for (size_t bit_pos = 0; bit_pos < 8; bit_pos++) {
            // Pack 8 wave8_byte values into a single 64-bit register for parallel extraction
            // This packing is done once per bit_pos (8 times) instead of 64 times
            u64 packed =
                ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos] << 0)  |
                ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos] << 8)  |
                ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos] << 16) |
                ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos] << 24) |
                ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos] << 32) |
                ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos] << 40) |
                ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos] << 48) |
                ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos] << 56);

            // Inner loop: extract 8 pulses from the packed data
            for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
                // Lane k occupies bits [8k, 8k+7] of `packed`; (7 - pulse_bit)
                // selects the pulse MSB-first, and the result lands in output bit k.
                outputBuffer[outputIdx++] =
                    ((packed >> (7 - pulse_bit + 0))  & 0x01) << 0 |
                    ((packed >> (7 - pulse_bit + 8))  & 0x01) << 1 |
                    ((packed >> (7 - pulse_bit + 16)) & 0x01) << 2 |
                    ((packed >> (7 - pulse_bit + 24)) & 0x01) << 3 |
                    ((packed >> (7 - pulse_bit + 32)) & 0x01) << 4 |
                    ((packed >> (7 - pulse_bit + 40)) & 0x01) << 5 |
                    ((packed >> (7 - pulse_bit + 48)) & 0x01) << 6 |
                    ((packed >> (7 - pulse_bit + 56)) & 0x01) << 7;
            }
        }
    } else if (DATA_WIDTH <= 8) {
        // Pack into single bytes (compile-time branch elimination via template instantiation)
        // Guard against division by zero when DATA_WIDTH > 8 (shouldn't execute this branch, but compiler still evaluates it)
        // NOTE(review): DATA_WIDTH == 0 is not guarded here; the documented
        // widths for this branch are 1, 2 and 4 - confirm no caller passes 0.
        const size_t ticksPerByte = (DATA_WIDTH > 8) ? 1 : (8 / DATA_WIDTH);
        const size_t numOutputBytes = (pulsesPerByte + ticksPerByte - 1) / ticksPerByte;

        for (size_t outputByteIdx = 0; outputByteIdx < numOutputBytes; outputByteIdx++) {
            u8 outputByte = 0;

            FL_UNROLL(8)
            for (size_t t = 0; t < ticksPerByte; t++) {
                size_t pulse_idx = outputByteIdx * ticksPerByte + t;
                // Last byte may be partially filled when 64 is not a multiple of ticksPerByte
                if (pulse_idx >= pulsesPerByte)
                    break;

                size_t bit_pos = pulse_idx / 8;
                size_t pulse_bit = pulse_idx % 8;

                FL_UNROLL(8)
                for (size_t lane = 0; lane < DATA_WIDTH; lane++) {
                    const u8* laneWaveform = laneWaveforms + (lane * bytes_per_lane);
                    u8 wave8_byte = laneWaveform[bit_pos];
                    u8 pulse = (wave8_byte >> (7 - pulse_bit)) & 1;

                    // Tick t occupies bits [t*DATA_WIDTH, (t+1)*DATA_WIDTH) of the output byte
                    size_t bitPos = t * DATA_WIDTH + lane;
                    outputByte |= (pulse << bitPos);
                }
            }

            outputBuffer[outputIdx++] = outputByte;
        }
    } else if (DATA_WIDTH == 16) {
        // Pack into 16-bit words (compile-time branch)
        // Two bit positions are processed per outer iteration for better ILP,
        // and their 16 output words are staged in a local buffer and written
        // with one 32-byte copy.
        //
        // Staging layout (keeps ticks in time order, matching the other branches):
        //   bytes  0..15 : 8 words for bit_pos     (pulses 0..7)
        //   bytes 16..31 : 8 words for bit_pos + 1 (pulses 0..7)
        u8 writeBuffer[32];

        for (size_t bit_pos = 0; bit_pos < 8; bit_pos += 2) {
            // Pack 16 wave8_byte values for TWO bit positions simultaneously
            // (lanes 0-7 in *_lo_*, lanes 8-15 in *_hi_*)
            u64 packed_lo_0 =
                ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 0] << 0)  |
                ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 0] << 8)  |
                ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 0] << 16) |
                ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 0] << 24) |
                ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 0] << 32) |
                ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 0] << 40) |
                ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 0] << 48) |
                ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 0] << 56);

            u64 packed_hi_0 =
                ((u64)laneWaveforms[8  * bytes_per_lane + bit_pos + 0] << 0)  |
                ((u64)laneWaveforms[9  * bytes_per_lane + bit_pos + 0] << 8)  |
                ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 0] << 16) |
                ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 0] << 24) |
                ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 0] << 32) |
                ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 0] << 40) |
                ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 0] << 48) |
                ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 0] << 56);

            u64 packed_lo_1 =
                ((u64)laneWaveforms[0 * bytes_per_lane + bit_pos + 1] << 0)  |
                ((u64)laneWaveforms[1 * bytes_per_lane + bit_pos + 1] << 8)  |
                ((u64)laneWaveforms[2 * bytes_per_lane + bit_pos + 1] << 16) |
                ((u64)laneWaveforms[3 * bytes_per_lane + bit_pos + 1] << 24) |
                ((u64)laneWaveforms[4 * bytes_per_lane + bit_pos + 1] << 32) |
                ((u64)laneWaveforms[5 * bytes_per_lane + bit_pos + 1] << 40) |
                ((u64)laneWaveforms[6 * bytes_per_lane + bit_pos + 1] << 48) |
                ((u64)laneWaveforms[7 * bytes_per_lane + bit_pos + 1] << 56);

            u64 packed_hi_1 =
                ((u64)laneWaveforms[8  * bytes_per_lane + bit_pos + 1] << 0)  |
                ((u64)laneWaveforms[9  * bytes_per_lane + bit_pos + 1] << 8)  |
                ((u64)laneWaveforms[10 * bytes_per_lane + bit_pos + 1] << 16) |
                ((u64)laneWaveforms[11 * bytes_per_lane + bit_pos + 1] << 24) |
                ((u64)laneWaveforms[12 * bytes_per_lane + bit_pos + 1] << 32) |
                ((u64)laneWaveforms[13 * bytes_per_lane + bit_pos + 1] << 40) |
                ((u64)laneWaveforms[14 * bytes_per_lane + bit_pos + 1] << 48) |
                ((u64)laneWaveforms[15 * bytes_per_lane + bit_pos + 1] << 56);

            // Inner loop: interleave extraction from both bit positions
            // (the two words are independent, so the work can overlap)
            for (size_t pulse_bit = 0; pulse_bit < 8; pulse_bit++) {
                // Extract pulse bits for first bit position
                u16 outputWord_0 =
                    ((packed_lo_0 >> (7 - pulse_bit + 0))  & 0x01) << 0  |
                    ((packed_lo_0 >> (7 - pulse_bit + 8))  & 0x01) << 1  |
                    ((packed_lo_0 >> (7 - pulse_bit + 16)) & 0x01) << 2  |
                    ((packed_lo_0 >> (7 - pulse_bit + 24)) & 0x01) << 3  |
                    ((packed_lo_0 >> (7 - pulse_bit + 32)) & 0x01) << 4  |
                    ((packed_lo_0 >> (7 - pulse_bit + 40)) & 0x01) << 5  |
                    ((packed_lo_0 >> (7 - pulse_bit + 48)) & 0x01) << 6  |
                    ((packed_lo_0 >> (7 - pulse_bit + 56)) & 0x01) << 7  |
                    ((packed_hi_0 >> (7 - pulse_bit + 0))  & 0x01) << 8  |
                    ((packed_hi_0 >> (7 - pulse_bit + 8))  & 0x01) << 9  |
                    ((packed_hi_0 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
                    ((packed_hi_0 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
                    ((packed_hi_0 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
                    ((packed_hi_0 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
                    ((packed_hi_0 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
                    ((packed_hi_0 >> (7 - pulse_bit + 56)) & 0x01) << 15;

                // Extract pulse bits for second bit position
                u16 outputWord_1 =
                    ((packed_lo_1 >> (7 - pulse_bit + 0))  & 0x01) << 0  |
                    ((packed_lo_1 >> (7 - pulse_bit + 8))  & 0x01) << 1  |
                    ((packed_lo_1 >> (7 - pulse_bit + 16)) & 0x01) << 2  |
                    ((packed_lo_1 >> (7 - pulse_bit + 24)) & 0x01) << 3  |
                    ((packed_lo_1 >> (7 - pulse_bit + 32)) & 0x01) << 4  |
                    ((packed_lo_1 >> (7 - pulse_bit + 40)) & 0x01) << 5  |
                    ((packed_lo_1 >> (7 - pulse_bit + 48)) & 0x01) << 6  |
                    ((packed_lo_1 >> (7 - pulse_bit + 56)) & 0x01) << 7  |
                    ((packed_hi_1 >> (7 - pulse_bit + 0))  & 0x01) << 8  |
                    ((packed_hi_1 >> (7 - pulse_bit + 8))  & 0x01) << 9  |
                    ((packed_hi_1 >> (7 - pulse_bit + 16)) & 0x01) << 10 |
                    ((packed_hi_1 >> (7 - pulse_bit + 24)) & 0x01) << 11 |
                    ((packed_hi_1 >> (7 - pulse_bit + 32)) & 0x01) << 12 |
                    ((packed_hi_1 >> (7 - pulse_bit + 40)) & 0x01) << 13 |
                    ((packed_hi_1 >> (7 - pulse_bit + 48)) & 0x01) << 14 |
                    ((packed_hi_1 >> (7 - pulse_bit + 56)) & 0x01) << 15;

                // Stage each word at its time-ordered slot (little-endian).
                // All 8 pulses of bit_pos must precede all 8 pulses of
                // bit_pos + 1, so the second word goes into the upper half
                // of the buffer rather than directly after the first.
                const size_t slot_0 = pulse_bit * 2;
                const size_t slot_1 = 16 + pulse_bit * 2;
                writeBuffer[slot_0]     = outputWord_0 & 0xFF;
                writeBuffer[slot_0 + 1] = (outputWord_0 >> 8) & 0xFF;
                writeBuffer[slot_1]     = outputWord_1 & 0xFF;
                writeBuffer[slot_1 + 1] = (outputWord_1 >> 8) & 0xFF;
            }

            // Every pass fills the staging buffer exactly (16 words = 32 bytes),
            // so flush unconditionally.
            fl::memcpy(&outputBuffer[outputIdx], writeBuffer, 32);
            outputIdx += 32;
        }
    } else {
        // Unsupported DATA_WIDTH (greater than 8 and not 16): nothing is
        // written and 0 is returned. This is a runtime result, not a
        // compile-time error.
        return 0;
    }

    return outputIdx;
}
References FASTLED_FORCE_INLINE, FL_IRAM, FL_NOEXCEPT, FL_OPTIMIZE_FUNCTION, FL_RESTRICT_PARAM, FL_UNROLL, memcpy(), and t.
Referenced by transpose_wave8byte_parlio().
Here is the call graph for this function:
Here is the caller graph for this function: