FastLED 3.9.15
Loading...
Searching...
No Matches

◆ wave8_transpose_4()

FASTLED_FORCE_INLINE FL_IRAM FL_OPTIMIZE_FUNCTION void fl::detail::wave8_transpose_4 ( const Wave8Byte lane_waves[4],
u8 output[4 *sizeof(Wave8Byte)] )

Transpose 4 lanes of Wave8Byte data into interleaved format.

Parameters
lane_waves: Array of 4 Wave8Byte structures
output: Output buffer (32 bytes)

Definition at line 126 of file wave8.hpp.

127 {
128 // Bit-interleaves 4 lanes: for each of the 8 symbols, one byte (Wave8Bit::data, 8 pulses) is read per lane
129 // With 4 lanes, we produce 4 bytes per symbol (2 pulses per byte × 4 lanes), so 8 symbols fill output[0..31]
130 // Output format, most significant bit first: [L3_P7, L2_P7, L1_P7, L0_P7, L3_P6, L2_P6, L1_P6, L0_P6, ...]
131 //
132 // OPTIMIZED VERSION: Fully unrolled direct extraction (original author's note: 4.0x speedup vs baseline)
133 // Based on the 16-lane pattern (original author's note: 8x speedup there)
134 // Avoids nested per-lane/per-pulse loops by explicitly extracting and packing each bit
135
136 // Process each symbol (8 iterations); symbol_idx selects the input symbol and the 4-byte output group
137 for (int symbol_idx = 0; symbol_idx < 8; symbol_idx++) {
138 // Read this symbol's byte from each of the 4 lanes once into local variables
139 u8 l0 = lane_waves[0].symbols[symbol_idx].data;
140 u8 l1 = lane_waves[1].symbols[symbol_idx].data;
141 u8 l2 = lane_waves[2].symbols[symbol_idx].data;
142 u8 l3 = lane_waves[3].symbols[symbol_idx].data;
143
144 // Explicitly construct all 4 output bytes
145 // Each output byte contains 2 pulses from all 4 lanes: one pulse per nibble
146 // Bit layout (bit 7 down to bit 0): [L3_hi, L2_hi, L1_hi, L0_hi, L3_lo, L2_lo, L1_lo, L0_lo]
147 // `<<` binds tighter than `|`, so each isolated bit is positioned before OR-ing; the result fits in bits 0-7, so storing it in u8 loses nothing
148 // Byte 0: pulse 7 in the high nibble, pulse 6 in the low nibble
149 output[symbol_idx * 4 + 0] =
150 ((l3 >> 7) & 1) << 7 |
151 ((l2 >> 7) & 1) << 6 |
152 ((l1 >> 7) & 1) << 5 |
153 ((l0 >> 7) & 1) << 4 |
154 ((l3 >> 6) & 1) << 3 |
155 ((l2 >> 6) & 1) << 2 |
156 ((l1 >> 6) & 1) << 1 |
157 ((l0 >> 6) & 1);
158
159 // Byte 1: pulse 5 in the high nibble, pulse 4 in the low nibble
160 output[symbol_idx * 4 + 1] =
161 ((l3 >> 5) & 1) << 7 |
162 ((l2 >> 5) & 1) << 6 |
163 ((l1 >> 5) & 1) << 5 |
164 ((l0 >> 5) & 1) << 4 |
165 ((l3 >> 4) & 1) << 3 |
166 ((l2 >> 4) & 1) << 2 |
167 ((l1 >> 4) & 1) << 1 |
168 ((l0 >> 4) & 1);
169
170 // Byte 2: pulse 3 in the high nibble, pulse 2 in the low nibble
171 output[symbol_idx * 4 + 2] =
172 ((l3 >> 3) & 1) << 7 |
173 ((l2 >> 3) & 1) << 6 |
174 ((l1 >> 3) & 1) << 5 |
175 ((l0 >> 3) & 1) << 4 |
176 ((l3 >> 2) & 1) << 3 |
177 ((l2 >> 2) & 1) << 2 |
178 ((l1 >> 2) & 1) << 1 |
179 ((l0 >> 2) & 1);
180
181 // Byte 3: pulse 1 in the high nibble, pulse 0 in the low nibble
182 output[symbol_idx * 4 + 3] =
183 ((l3 >> 1) & 1) << 7 |
184 ((l2 >> 1) & 1) << 6 |
185 ((l1 >> 1) & 1) << 5 |
186 ((l0 >> 1) & 1) << 4 |
187 ((l3 >> 0) & 1) << 3 |
188 ((l2 >> 0) & 1) << 2 |
189 ((l1 >> 0) & 1) << 1 |
190 ((l0 >> 0) & 1);
191 }
192}
unsigned char u8
Definition stdint.h:131

Referenced by fl::wave8Transpose_4(), and fl::wave8Transpose_4().

+ Here is the caller graph for this function: