FastLED 3.9.15
Loading...
Searching...
No Matches
AutoResearchParlioEncode.h
Go to the documentation of this file.
1
26
27#pragma once
28
30#include "fl/channels/wave8.h"
32#include "fl/log/log.h" // FL_WARN
33#include "fl/stl/bit_cast.h"
34#include "fl/stl/int.h"
35
36#include "fl/stl/cstring.h" // fl::memcpy / fl::memset
37
38#if defined(ARDUINO_ARCH_ESP32) || defined(ESP_PLATFORM)
39#include <Arduino.h> // micros()
40#include <esp_heap_caps.h>
41#define FL_PARLIO_BENCH_ENABLED 1
42#else
43#define FL_PARLIO_BENCH_ENABLED 0
44#endif
45
46namespace autoresearch {
47namespace parlio_bench {
48
// Result record filled in and returned by measureParlioEncode().
// NOTE(review): this listing elides the line that opens the aggregate
// (presumably `struct ParlioEncodeResult {`) and the declarations of the
// scratch_psram_ok / output_psram_ok members that measureParlioEncode()
// assigns -- confirm against the real header.
50 fl::u32 iters; // byte-positions per measurement
51 fl::u32 lanes; // 16
52 fl::u32 leds_per_lane; // 256 (matches the standard P4 test config)
53
54 // Per-byte-position hot loop = 16-lane gather + BF1 pipe4 direct encode.
55 // 4 placements of {scratch, output}: SRAM/SRAM, SRAM/PSRAM, PSRAM/SRAM, PSRAM/PSRAM.
// Each value is the micros() delta returned by one fl_parlio_measure() run
// over `iters` byte positions, i.e. a total for the run rather than a
// per-position average. The three PSRAM variants are only written when the
// corresponding PSRAM allocation succeeded.
56 fl::u32 perpos_ss_us; // scratch SRAM, output SRAM
57 fl::u32 perpos_sp_us; // scratch SRAM, output PSRAM
58 fl::u32 perpos_ps_us; // scratch PSRAM, output SRAM
59 fl::u32 perpos_pp_us; // scratch PSRAM, output PSRAM
60
// Final value of the XOR accumulator that every measurement run folds the
// encode helpers' return values into (copied from the volatile local in
// measureParlioEncode()).
63 fl::u32 sink;
64};
65
66#if FL_PARLIO_BENCH_ENABLED
67
68namespace {
69
70// Mirror of the engine's clockless hot loop for one byte-position, 16-lane
71// Wave8 path (parlio_engine.cpp.hpp ≈line 981-994). lane_stride matches the
72// production layout: lane k reads from scratch[k * lane_stride + byte_offset].
//
// Parameters:
//   scratch      source bytes; indexed as scratch[lane * lane_stride + byte_offset]
//                for lane = 0..15.
//   lane_stride  distance in bytes between consecutive lanes inside scratch.
//   byte_offset  byte position within each lane to encode.
//   byte_lut     expansion LUT forwarded unchanged to the encode call.
//   output       destination buffer; the encoded block is written at
//                output + output_idx and spans 16 * sizeof(fl::Wave8Byte) bytes.
//   output_idx   byte index into output where the block starts.
// Returns the XOR of output[output_idx] and output[output_idx + 127], which
// the caller folds into its sink.
//
// NOTE(review): this listing elides lines 73 and 84. Line 84 presumably
// opens the call to wave8Transpose_16_bf1(...), whose argument list
// continues on lines 85-86 -- confirm against the real header.
// NOTE(review): index 127 is the last byte of the block only when
// 16 * sizeof(fl::Wave8Byte) == 128; the pipe4 sibling derives its last
// index from BLOCK_SIZE instead of hard-coding it.
74fl::u32 fl_parlio_inner_one_byte_position_bf1(const fl::u8 *scratch,
75 fl::size_t lane_stride,
76 fl::size_t byte_offset,
77 const fl::Wave8ByteExpansionLut &byte_lut,
78 fl::u8 *output,
79 fl::size_t output_idx) {
// Gather: one byte from each of the 16 lanes at the same byte position.
80 fl::u8 lanes[16];
81 for (fl::size_t lane = 0; lane < 16; lane++) {
82 lanes[lane] = scratch[lane * lane_stride + byte_offset];
83 }
// Encode the gathered bytes straight into the output block.
85 reinterpret_cast<const fl::u8(&)[16]>(lanes), byte_lut,
86 *reinterpret_cast<fl::u8(*)[16 * sizeof(fl::Wave8Byte)]>(output + output_idx));
// Read two bytes of the freshly written block back and return their XOR.
87 return static_cast<fl::u32>(output[output_idx] ^ output[output_idx + 127]);
88}
89
// Four-position variant of the single-byte-position helper: gathers bytes
// byte_offset .. byte_offset + 3 from each of the 16 lanes and encodes them
// into four consecutive output blocks with one call.
//
// Parameters:
//   scratch      source bytes; lane k is read at
//                scratch[k * lane_stride + byte_offset + 0..3].
//   lane_stride  distance in bytes between consecutive lanes inside scratch.
//   byte_offset  first of the four byte positions. It also selects the
//                output location (output + byte_offset * BLOCK_SIZE); there
//                is no separate output index parameter.
//   byte_lut     expansion LUT forwarded unchanged to the encode call.
//   output       destination buffer base.
// Returns the XOR of the first byte of the first block and the last byte of
// the fourth block.
//
// No bounds checks are performed here; the caller must make sure that
// byte_offset + 3 stays inside each lane and that four blocks fit in output.
//
// NOTE(review): this listing elides lines 90 and 110. Line 110 presumably
// opens the call to wave8Transpose_16x4_bf1_pipe4(...), whose argument list
// continues on lines 111-119 -- confirm against the real header.
91fl::u32 fl_parlio_inner_four_byte_positions_bf1_pipe4(const fl::u8 *scratch,
92 fl::size_t lane_stride,
93 fl::size_t byte_offset,
94 const fl::Wave8ByteExpansionLut &byte_lut,
95 fl::u8 *output) {
96 fl::u8 lanes_a[16];
97 fl::u8 lanes_b[16];
98 fl::u8 lanes_c[16];
99 fl::u8 lanes_d[16];
// Gather four adjacent bytes per lane in a single pass over the lanes.
100 for (fl::size_t lane = 0; lane < 16; lane++) {
101 const fl::u8 *base = scratch + lane * lane_stride + byte_offset;
102 lanes_a[lane] = base[0];
103 lanes_b[lane] = base[1];
104 lanes_c[lane] = base[2];
105 lanes_d[lane] = base[3];
106 }
107
// One encoded block per byte position; the four blocks are contiguous.
108 constexpr fl::size_t BLOCK_SIZE = 16 * sizeof(fl::Wave8Byte);
109 fl::u8 *out_a = output + byte_offset * BLOCK_SIZE;
111 reinterpret_cast<const fl::u8(&)[16]>(lanes_a),
112 reinterpret_cast<const fl::u8(&)[16]>(lanes_b),
113 reinterpret_cast<const fl::u8(&)[16]>(lanes_c),
114 reinterpret_cast<const fl::u8(&)[16]>(lanes_d),
115 byte_lut,
116 *reinterpret_cast<fl::u8(*)[BLOCK_SIZE]>(out_a),
117 *reinterpret_cast<fl::u8(*)[BLOCK_SIZE]>(out_a + BLOCK_SIZE),
118 *reinterpret_cast<fl::u8(*)[BLOCK_SIZE]>(out_a + 2 * BLOCK_SIZE),
119 *reinterpret_cast<fl::u8(*)[BLOCK_SIZE]>(out_a + 3 * BLOCK_SIZE));
120
// Read the first and last written bytes back and return their XOR.
121 return static_cast<fl::u32>(out_a[0] ^ out_a[BLOCK_SIZE * 4 - 1]);
122}
123
124// Time one configuration: scratch + output buffers (caller pre-allocates).
125inline fl::u32 fl_parlio_measure(const fl::u8 *scratch, fl::size_t scratch_size,
126 fl::u8 *output, fl::size_t output_size,
127 const fl::Wave8ByteExpansionLut &byte_lut,
128 int iters_byte_positions,
129 volatile fl::u32 *sink) {
130 constexpr fl::size_t LANES = 16;
131 constexpr fl::size_t BYTES_PER_LANE = 768; // 256 LEDs × 3
132 const fl::size_t lane_stride = BYTES_PER_LANE;
133
134 const fl::size_t required_scratch = LANES * BYTES_PER_LANE;
135 const fl::size_t required_output = BYTES_PER_LANE * LANES * sizeof(fl::Wave8Byte);
136 if (scratch_size < required_scratch || output_size < required_output) {
137 FL_WARN("fl_parlio_measure: undersized buffer "
138 "(scratch=" << scratch_size << " need=" << required_scratch
139 << ", output=" << output_size << " need=" << required_output << ")");
140 return 0u;
141 }
142
143 constexpr fl::size_t BLOCK_SIZE = LANES * sizeof(fl::Wave8Byte);
144 fl::u32 t0 = micros();
145 int it = 0;
146 while (it < iters_byte_positions) {
147 const fl::size_t byte_offset =
148 static_cast<fl::size_t>(it) % BYTES_PER_LANE;
149 if (byte_offset + 3 < BYTES_PER_LANE && it + 3 < iters_byte_positions) {
150 *sink ^= fl_parlio_inner_four_byte_positions_bf1_pipe4(
151 scratch, lane_stride, byte_offset, byte_lut, output);
152 it += 4;
153 } else {
154 const fl::size_t output_idx = byte_offset * BLOCK_SIZE;
155 *sink ^= fl_parlio_inner_one_byte_position_bf1(
156 scratch, lane_stride, byte_offset, byte_lut, output, output_idx);
157 it += 1;
158 }
159 }
160 return micros() - t0;
161}
162
163inline fl::u8 *fl_parlio_alloc(fl::size_t size, bool psram) {
164 return reinterpret_cast<fl::u8 *>(heap_caps_malloc(
165 size,
166 psram ? (MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT)
167 : (MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT)));
168}
169
170} // anonymous namespace
171
// Runs the Parlio encode micro-benchmark for up to four {scratch, output}
// memory placements and returns the timings.
//
// iters_in: number of byte positions per measurement; clamped to [1, 200000].
//
// Returns a result with iters / lanes / leds_per_lane filled in. perpos_ss_us
// is always measured; perpos_sp_us / perpos_ps_us / perpos_pp_us are only
// written when the PSRAM buffer(s) they need could be allocated. If either
// internal-RAM buffer cannot be allocated, the function returns early with
// iters set to 0 and no measurements taken.
//
// NOTE(review): this listing elides line 173, which presumably declares the
// local `result` (the ParlioEncodeResult returned below) -- confirm against
// the real header.
172inline ParlioEncodeResult measureParlioEncode(int iters_in = 12000) {
174 if (iters_in < 1) iters_in = 1;
175 if (iters_in > 200000) iters_in = 200000;
176 result.iters = static_cast<fl::u32>(iters_in);
177 result.lanes = 16;
178 result.leds_per_lane = 256;
179
// Same fixed geometry that fl_parlio_measure() validates buffer sizes against.
180 constexpr fl::size_t LANES = 16;
181 constexpr fl::size_t BYTES_PER_LANE = 768;
182 constexpr fl::size_t SCRATCH_BYTES = LANES * BYTES_PER_LANE; // 12 KB
183 constexpr fl::size_t OUTPUT_BYTES = BYTES_PER_LANE * LANES * sizeof(fl::Wave8Byte); // 96 KB
184
// Build the byte-indexed expansion LUT from fixed chipset timing values.
// NOTE(review): T1/T2/T3 are presumably nanoseconds -- confirm.
185 fl::ChipsetTiming timing;
186 timing.T1 = 400;
187 timing.T2 = 450;
188 timing.T3 = 400;
189 fl::Wave8BitExpansionLut nib_lut = fl::buildWave8ExpansionLUT(timing);
190 fl::Wave8ByteExpansionLut byte_lut = fl::buildWave8ByteExpansionLUT(nib_lut);
191
// The internal-RAM buffers are mandatory: without them nothing is measured.
192 fl::u8 *scratch_sram = fl_parlio_alloc(SCRATCH_BYTES, /*psram=*/false);
193 fl::u8 *output_sram = fl_parlio_alloc(OUTPUT_BYTES, /*psram=*/false);
194
195 if (!scratch_sram || !output_sram) {
// At least one pointer is null here.
// NOTE(review): relies on heap_caps_free() accepting a null pointer -- confirm.
196 heap_caps_free(scratch_sram);
197 heap_caps_free(output_sram);
198 result.iters = 0;
199 return result;
200 }
201
// The PSRAM buffers are optional; the *_psram_ok flags record which exist.
202 fl::u8 *scratch_psram = fl_parlio_alloc(SCRATCH_BYTES, /*psram=*/true);
203 fl::u8 *output_psram = fl_parlio_alloc(OUTPUT_BYTES, /*psram=*/true);
204 result.scratch_psram_ok = scratch_psram != nullptr;
205 result.output_psram_ok = output_psram != nullptr;
206
// Fallback only keeps the pointers non-null: every later use of the *_psram
// pointers (fill, memset, measure, free) is gated on the *_psram_ok flags,
// so the aliased SRAM buffer is never measured as PSRAM or freed twice.
207 if (!scratch_psram) {
208 scratch_psram = scratch_sram;
209 }
210 if (!output_psram) {
211 output_psram = output_sram;
212 }
213
214 // Fill scratch buffers with representative LED data.
215 for (fl::size_t i = 0; i < SCRATCH_BYTES; ++i) {
216 const fl::u8 v = static_cast<fl::u8>((i * 31 + 7) & 0xFF);
217 scratch_sram[i] = v;
218 if (result.scratch_psram_ok) {
219 scratch_psram[i] = v;
220 }
221 }
222 fl::memset(output_sram, 0, OUTPUT_BYTES);
223 if (result.output_psram_ok) {
224 fl::memset(output_psram, 0, OUTPUT_BYTES);
225 }
226
// Accumulates the encode helpers' return values across all runs and is
// reported back through result.sink.
227 volatile fl::u32 sink = 0;
228
229 // Warm caches / icache.
// The 64-position warm-up timing is discarded.
230 fl_parlio_measure(scratch_sram, SCRATCH_BYTES, output_sram, OUTPUT_BYTES,
231 byte_lut, 64, &sink);
232
// Timed runs: SRAM/SRAM always, then each PSRAM combination that is available.
233 result.perpos_ss_us = fl_parlio_measure(
234 scratch_sram, SCRATCH_BYTES, output_sram, OUTPUT_BYTES,
235 byte_lut, iters_in, &sink);
236 if (result.output_psram_ok) {
237 result.perpos_sp_us = fl_parlio_measure(
238 scratch_sram, SCRATCH_BYTES, output_psram, OUTPUT_BYTES,
239 byte_lut, iters_in, &sink);
240 }
241 if (result.scratch_psram_ok) {
242 result.perpos_ps_us = fl_parlio_measure(
243 scratch_psram, SCRATCH_BYTES, output_sram, OUTPUT_BYTES,
244 byte_lut, iters_in, &sink);
245 }
246 if (result.scratch_psram_ok && result.output_psram_ok) {
247 result.perpos_pp_us = fl_parlio_measure(
248 scratch_psram, SCRATCH_BYTES, output_psram, OUTPUT_BYTES,
249 byte_lut, iters_in, &sink);
250 }
251
252 result.sink = static_cast<fl::u32>(sink);
253
// Free the PSRAM pointers only when they are separate allocations; when the
// flag is false the pointer aliases the SRAM buffer freed alongside it.
254 heap_caps_free(scratch_sram);
255 if (result.scratch_psram_ok) {
256 heap_caps_free(scratch_psram);
257 }
258 heap_caps_free(output_sram);
259 if (result.output_psram_ok) {
260 heap_caps_free(output_psram);
261 }
262 return result;
263}
264
265#else // !FL_PARLIO_BENCH_ENABLED
266
267inline ParlioEncodeResult measureParlioEncode(int /*iters*/ = 12000) { return {}; }
268
269#endif
270
271} // namespace parlio_bench
272} // namespace autoresearch
#define BLOCK_SIZE
Definition coder.h:106
Centralized LED chipset timing definitions with nanosecond precision.
#define FL_WARN(X)
Definition log.h:276
Centralized logging categories for FastLED hardware interfaces and subsystems.
ParlioEncodeResult measureParlioEncode(int=12000)
unsigned char u8
Definition stdint.h:131
FL_OPTIMIZE_FUNCTION Wave8BitExpansionLut buildWave8ExpansionLUT(const ChipsetTiming &timing)
Build a Wave8BitExpansionLut from chipset timing data.
FL_OPTIMIZE_FUNCTION FL_IRAM void wave8Transpose_16_bf1(const u8(&FL_RESTRICT_PARAM lanes)[16], const Wave8ByteExpansionLut &lut, u8(&FL_RESTRICT_PARAM output)[16 *sizeof(Wave8Byte)])
BF1: chipset-aware direct encode for 16-lane Wave8 (#2548 deep-dive).
void * memset(void *s, int c, size_t n) FL_NOEXCEPT
Wave8ByteExpansionLut buildWave8ByteExpansionLUT(const Wave8BitExpansionLut &nibble)
Build a byte-indexed expansion LUT (#2526) from the nibble LUT.
fl::size size_t
Definition s16x16x4.h:223
expected< T, E > result
Alias for expected (Rust-style naming)
Definition result.h:31
fl::u32 micros()
Universal microsecond timer - returns microseconds since system startup.
FL_OPTIMIZE_FUNCTION FL_IRAM void wave8Transpose_16x4_bf1_pipe4(const u8(&FL_RESTRICT_PARAM lanes_a)[16], const u8(&FL_RESTRICT_PARAM lanes_b)[16], const u8(&FL_RESTRICT_PARAM lanes_c)[16], const u8(&FL_RESTRICT_PARAM lanes_d)[16], const Wave8ByteExpansionLut &lut, u8(&FL_RESTRICT_PARAM output_a)[16 *sizeof(Wave8Byte)], u8(&FL_RESTRICT_PARAM output_b)[16 *sizeof(Wave8Byte)], u8(&FL_RESTRICT_PARAM output_c)[16 *sizeof(Wave8Byte)], u8(&FL_RESTRICT_PARAM output_d)[16 *sizeof(Wave8Byte)])
BF1 + pipe4: 4-position-pipelined direct encode (#2548 deep-dive).
#define FASTLED_FORCE_INLINE
#define FL_IRAM
Inline implementation details for wave8 transposition.