66#if FL_PARLIO_BENCH_ENABLED
74fl::u32 fl_parlio_inner_one_byte_position_bf1(
const fl::u8 *scratch,
77 const fl::Wave8ByteExpansionLut &byte_lut,
82 lanes[lane] = scratch[lane * lane_stride + byte_offset];
85 reinterpret_cast<const fl::u8(&)[16]
>(lanes), byte_lut,
86 *
reinterpret_cast<fl::u8(*)[16 * sizeof(fl::Wave8Byte)]
>(output + output_idx));
87 return static_cast<fl::u32
>(output[output_idx] ^ output[output_idx + 127]);
91fl::u32 fl_parlio_inner_four_byte_positions_bf1_pipe4(
const fl::u8 *scratch,
94 const fl::Wave8ByteExpansionLut &byte_lut,
100 for (
fl::size_t lane = 0; lane < 16; lane++) {
101 const fl::u8 *base = scratch + lane * lane_stride + byte_offset;
102 lanes_a[lane] = base[0];
103 lanes_b[lane] = base[1];
104 lanes_c[lane] = base[2];
105 lanes_d[lane] = base[3];
111 reinterpret_cast<const fl::u8(&)[16]
>(lanes_a),
112 reinterpret_cast<const fl::u8(&)[16]
>(lanes_b),
113 reinterpret_cast<const fl::u8(&)[16]
>(lanes_c),
114 reinterpret_cast<const fl::u8(&)[16]
>(lanes_d),
121 return static_cast<fl::u32
>(out_a[0] ^ out_a[
BLOCK_SIZE * 4 - 1]);
125inline fl::u32 fl_parlio_measure(
const fl::u8 *scratch,
fl::size_t scratch_size,
127 const fl::Wave8ByteExpansionLut &byte_lut,
128 int iters_byte_positions,
129 volatile fl::u32 *sink) {
132 const fl::size_t lane_stride = BYTES_PER_LANE;
134 const fl::size_t required_scratch = LANES * BYTES_PER_LANE;
135 const fl::size_t required_output = BYTES_PER_LANE * LANES *
sizeof(fl::Wave8Byte);
136 if (scratch_size < required_scratch || output_size < required_output) {
137 FL_WARN(
"fl_parlio_measure: undersized buffer "
138 "(scratch=" << scratch_size <<
" need=" << required_scratch
139 <<
", output=" << output_size <<
" need=" << required_output <<
")");
146 while (it < iters_byte_positions) {
149 if (byte_offset + 3 < BYTES_PER_LANE && it + 3 < iters_byte_positions) {
150 *sink ^= fl_parlio_inner_four_byte_positions_bf1_pipe4(
151 scratch, lane_stride, byte_offset, byte_lut, output);
155 *sink ^= fl_parlio_inner_one_byte_position_bf1(
156 scratch, lane_stride, byte_offset, byte_lut, output, output_idx);
164 return reinterpret_cast<fl::u8 *
>(heap_caps_malloc(
166 psram ? (MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT)
167 : (MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT)));
174 if (iters_in < 1) iters_in = 1;
175 if (iters_in > 200000) iters_in = 200000;
176 result.iters =
static_cast<fl::u32
>(iters_in);
178 result.leds_per_lane = 256;
182 constexpr fl::size_t SCRATCH_BYTES = LANES * BYTES_PER_LANE;
183 constexpr fl::size_t OUTPUT_BYTES = BYTES_PER_LANE * LANES *
sizeof(fl::Wave8Byte);
185 fl::ChipsetTiming timing;
192 fl::u8 *scratch_sram = fl_parlio_alloc(SCRATCH_BYTES,
false);
193 fl::u8 *output_sram = fl_parlio_alloc(OUTPUT_BYTES,
false);
195 if (!scratch_sram || !output_sram) {
196 heap_caps_free(scratch_sram);
197 heap_caps_free(output_sram);
202 fl::u8 *scratch_psram = fl_parlio_alloc(SCRATCH_BYTES,
true);
203 fl::u8 *output_psram = fl_parlio_alloc(OUTPUT_BYTES,
true);
204 result.scratch_psram_ok = scratch_psram !=
nullptr;
205 result.output_psram_ok = output_psram !=
nullptr;
207 if (!scratch_psram) {
208 scratch_psram = scratch_sram;
211 output_psram = output_sram;
215 for (
fl::size_t i = 0; i < SCRATCH_BYTES; ++i) {
216 const fl::u8 v =
static_cast<fl::u8>((i * 31 + 7) & 0xFF);
218 if (
result.scratch_psram_ok) {
219 scratch_psram[i] = v;
223 if (
result.output_psram_ok) {
227 volatile fl::u32 sink = 0;
230 fl_parlio_measure(scratch_sram, SCRATCH_BYTES, output_sram, OUTPUT_BYTES,
231 byte_lut, 64, &sink);
233 result.perpos_ss_us = fl_parlio_measure(
234 scratch_sram, SCRATCH_BYTES, output_sram, OUTPUT_BYTES,
235 byte_lut, iters_in, &sink);
236 if (
result.output_psram_ok) {
237 result.perpos_sp_us = fl_parlio_measure(
238 scratch_sram, SCRATCH_BYTES, output_psram, OUTPUT_BYTES,
239 byte_lut, iters_in, &sink);
241 if (
result.scratch_psram_ok) {
242 result.perpos_ps_us = fl_parlio_measure(
243 scratch_psram, SCRATCH_BYTES, output_sram, OUTPUT_BYTES,
244 byte_lut, iters_in, &sink);
246 if (
result.scratch_psram_ok &&
result.output_psram_ok) {
247 result.perpos_pp_us = fl_parlio_measure(
248 scratch_psram, SCRATCH_BYTES, output_psram, OUTPUT_BYTES,
249 byte_lut, iters_in, &sink);
252 result.sink =
static_cast<fl::u32
>(sink);
254 heap_caps_free(scratch_sram);
255 if (
result.scratch_psram_ok) {
256 heap_caps_free(scratch_psram);
258 heap_caps_free(output_sram);
259 if (
result.output_psram_ok) {
260 heap_caps_free(output_psram);
FL_OPTIMIZE_FUNCTION FL_IRAM void wave8Transpose_16x4_bf1_pipe4(const u8(&FL_RESTRICT_PARAM lanes_a)[16], const u8(&FL_RESTRICT_PARAM lanes_b)[16], const u8(&FL_RESTRICT_PARAM lanes_c)[16], const u8(&FL_RESTRICT_PARAM lanes_d)[16], const Wave8ByteExpansionLut &lut, u8(&FL_RESTRICT_PARAM output_a)[16 *sizeof(Wave8Byte)], u8(&FL_RESTRICT_PARAM output_b)[16 *sizeof(Wave8Byte)], u8(&FL_RESTRICT_PARAM output_c)[16 *sizeof(Wave8Byte)], u8(&FL_RESTRICT_PARAM output_d)[16 *sizeof(Wave8Byte)])
BF1 + pipe4: 4-position-pipelined direct encode (#2548 deep-dive).