FastLED 3.9.15
Loading...
Searching...
No Matches
AutoResearchSimd.h
Go to the documentation of this file.
1// AutoResearchSimd.h - Comprehensive SIMD operation autoresearch tests
2//
3// Full SIMD test suite covering every operation in fl::simd namespace.
4// Tests include normal cases, boundary/edge cases, and signed arithmetic.
5//
6// Uses scalar fallback on platforms without native SIMD support.
7
8#pragma once
9
10#include <FastLED.h>
11#include "fl/math/simd.h"
12#include "fl/stl/sstream.h"
18
19namespace autoresearch {
20namespace simd_check {
21
22using namespace fl::simd;
23
24// ============================================================================
25// Helper Functions
26// ============================================================================
27
// Returns true when the first n bytes of a and b are pairwise equal
// (trivially true for n == 0).
inline bool compare_u8(const uint8_t* a, const uint8_t* b, size_t n) {
    size_t idx = 0;
    // Advance while the bytes agree; stopping early means a mismatch was hit.
    while (idx < n && a[idx] == b[idx]) {
        ++idx;
    }
    return idx == n;
}
34
// Returns true when the first n 32-bit words of a and b are pairwise equal
// (trivially true for n == 0).
inline bool compare_u32(const uint32_t* a, const uint32_t* b, size_t n) {
    size_t remaining = n;
    size_t pos = 0;
    while (remaining != 0) {
        if (a[pos] != b[pos]) {
            return false;
        }
        ++pos;
        --remaining;
    }
    return true;
}
41
// Returns true when every one of the first n lanes of a and b matches.
//
// Two lanes match when they are exactly equal (this covers identical
// infinities, whose difference would be NaN) or when their difference lies
// within [-eps, +eps].
//
// A NaN on either side never matches. The range test is written as
// !(in range) so that a NaN difference, for which every ordered comparison is
// false, is reported as a mismatch instead of slipping through.
//
// a, b : arrays with at least n readable floats each
// n    : number of lanes to compare (n == 0 yields true)
// eps  : absolute tolerance, default 0.001f
inline bool compare_f32(const float* a, const float* b, size_t n, float eps = 0.001f) {
    for (size_t i = 0; i < n; i++) {
        if (a[i] == b[i]) continue;  // exact match, including equal +/-inf
        const float diff = a[i] - b[i];
        if (!(diff >= -eps && diff <= eps)) return false;  // also rejects NaN
    }
    return true;
}
49
50// Reinterpret u32 as i32 for readability
// Converts a u32 lane value to int32_t via static_cast.
inline int32_t as_i32(uint32_t v) {
    const int32_t signed_value = static_cast<int32_t>(v);
    return signed_value;
}
// Converts an int32_t value to uint32_t via static_cast (modular conversion).
inline uint32_t as_u32(int32_t v) {
    const uint32_t unsigned_value = static_cast<uint32_t>(v);
    return unsigned_value;
}
53
54// ============================================================================
55// u8x16 Load/Store Tests
56// ============================================================================
57
58inline bool test_load_store_u8_16() {
59 uint8_t input[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
60 uint8_t output[16] = {0};
61 simd_u8x16 v = load_u8_16(input);
62 store_u8_16(output, v);
63 return compare_u8(input, output, 16);
64}
65
67 uint8_t input[16] = {0,255,0,255, 0,255,0,255, 0,255,0,255, 0,255,0,255};
68 uint8_t output[16] = {0};
69 simd_u8x16 v = load_u8_16(input);
70 store_u8_16(output, v);
71 return compare_u8(input, output, 16);
72}
73
74// ============================================================================
75// u32x4 Load/Store Tests
76// ============================================================================
77
78inline bool test_load_store_u32_4() {
79 uint32_t input[4] = {0x12345678, 0x9ABCDEF0, 0xFEDCBA98, 0x76543210};
80 uint32_t output[4] = {0};
81 simd_u32x4 v = load_u32_4(input);
82 store_u32_4(output, v);
83 return compare_u32(input, output, 4);
84}
85
87 uint32_t input[4] = {0, 0xFFFFFFFF, 1, 0x80000000};
88 uint32_t output[4] = {0};
89 simd_u32x4 v = load_u32_4(input);
90 store_u32_4(output, v);
91 return compare_u32(input, output, 4);
92}
93
95 FL_ALIGNAS(16) uint32_t input[4] = {0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD};
96 FL_ALIGNAS(16) uint32_t output[4] = {0};
97 simd_u32x4 v = load_u32_4_aligned(input);
98 store_u32_4_aligned(output, v);
99 return compare_u32(input, output, 4);
100}
101
102// ============================================================================
103// f32x4 Load/Store Tests
104// ============================================================================
105
107 float input[4] = {1.5f, 2.5f, 3.5f, 4.5f};
108 float output[4] = {0.0f};
109 simd_f32x4 v = load_f32_4(input);
110 store_f32_4(output, v);
111 return compare_f32(input, output, 4);
112}
113
115 float input[4] = {0.0f, -0.0f, 1e30f, -1e30f};
116 float output[4] = {0.0f};
117 simd_f32x4 v = load_f32_4(input);
118 store_f32_4(output, v);
119 // Tolerance comparison (+/-0.001); note this cannot tell +0.0f from -0.0f, so zero-sign preservation is not actually verified
120 for (int i = 0; i < 4; i++) {
121 float diff = input[i] - output[i];
122 if (diff < -0.001f || diff > 0.001f) return false;
123 }
124 return true;
125}
126
127// ============================================================================
128// u8x16 Saturating Arithmetic Tests
129// ============================================================================
130
131inline bool test_add_sat_u8_16() {
132 uint8_t a[16] = {100,150,200,250, 100,150,200,250, 100,150,200,250, 100,150,200,250};
133 uint8_t b[16] = {50,100,50,100, 50,100,50,100, 50,100,50,100, 50,100,50,100};
134 uint8_t expected[16] = {150,250,250,255, 150,250,250,255, 150,250,250,255, 150,250,250,255};
135 uint8_t output[16] = {0};
136 simd_u8x16 va = load_u8_16(a);
137 simd_u8x16 vb = load_u8_16(b);
138 store_u8_16(output, add_sat_u8_16(va, vb));
139 return compare_u8(expected, output, 16);
140}
141
143 // 255 + 255 = 255 (saturated), 0 + 0 = 0
144 uint8_t a[16] = {255,255,0,0, 128,1,254,255, 255,255,0,0, 128,1,254,255};
145 uint8_t b[16] = {255,1,0,0, 128,255,2,0, 255,1,0,0, 128,255,2,0};
146 uint8_t expected[16] = {255,255,0,0, 255,255,255,255, 255,255,0,0, 255,255,255,255};
147 uint8_t output[16] = {0};
148 store_u8_16(output, add_sat_u8_16(load_u8_16(a), load_u8_16(b)));
149 return compare_u8(expected, output, 16);
150}
151
152inline bool test_sub_sat_u8_16() {
153 uint8_t a[16] = {100,50,200,10, 100,50,200,10, 100,50,200,10, 100,50,200,10};
154 uint8_t b[16] = {50,100,50,100, 50,100,50,100, 50,100,50,100, 50,100,50,100};
155 uint8_t expected[16] = {50,0,150,0, 50,0,150,0, 50,0,150,0, 50,0,150,0};
156 uint8_t output[16] = {0};
157 store_u8_16(output, sub_sat_u8_16(load_u8_16(a), load_u8_16(b)));
158 return compare_u8(expected, output, 16);
159}
160
162 // 0 - 255 = 0 (clamped), 255 - 0 = 255, equal values = 0
163 uint8_t a[16] = {0,255,100,0, 1,0,0,0, 0,255,100,0, 1,0,0,0};
164 uint8_t b[16] = {255,0,100,0, 0,1,255,128, 255,0,100,0, 0,1,255,128};
165 uint8_t expected[16] = {0,255,0,0, 1,0,0,0, 0,255,0,0, 1,0,0,0};
166 uint8_t output[16] = {0};
167 store_u8_16(output, sub_sat_u8_16(load_u8_16(a), load_u8_16(b)));
168 return compare_u8(expected, output, 16);
169}
170
171// ============================================================================
172// u8x16 Scale / Blend Tests
173// ============================================================================
174
175inline bool test_scale_u8_16() {
176 uint8_t a[16] = {255,128,64,32, 255,128,64,32, 255,128,64,32, 255,128,64,32};
177 uint8_t output[16] = {0};
178 store_u8_16(output, scale_u8_16(load_u8_16(a), 128));
179 for (int i = 0; i < 16; i++) {
180 int expected = (a[i] * 128) / 256;
181 int diff = (int)output[i] - expected;
182 if (diff < -1 || diff > 1) return false;
183 }
184 return true;
185}
186
188 uint8_t a[16] = {255,128,64,32, 255,128,64,32, 255,128,64,32, 255,128,64,32};
189 uint8_t expected[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
190 uint8_t output[16] = {0};
191 store_u8_16(output, scale_u8_16(load_u8_16(a), 0));
192 return compare_u8(expected, output, 16);
193}
194
196 // scale by 255 should give approximately the original value
197 uint8_t a[16] = {255,128,64,32, 10,200,100,50, 255,128,64,32, 10,200,100,50};
198 uint8_t output[16] = {0};
199 store_u8_16(output, scale_u8_16(load_u8_16(a), 255));
200 for (int i = 0; i < 16; i++) {
201 int expected = (a[i] * 255) / 256;
202 int diff = (int)output[i] - expected;
203 if (diff < -1 || diff > 1) return false;
204 }
205 return true;
206}
207
208inline bool test_blend_u8_16() {
209 uint8_t a[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
210 uint8_t b[16] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255};
211 uint8_t output[16] = {0};
212 store_u8_16(output, blend_u8_16(load_u8_16(a), load_u8_16(b), 128));
213 for (int i = 0; i < 16; i++) {
214 int diff = (int)output[i] - 128;
215 if (diff < -2 || diff > 2) return false;
216 }
217 return true;
218}
219
221 // amount=0 -> all a, amount=255 -> ~all b
222 uint8_t a[16] = {100,100,100,100, 100,100,100,100, 100,100,100,100, 100,100,100,100};
223 uint8_t b[16] = {200,200,200,200, 200,200,200,200, 200,200,200,200, 200,200,200,200};
224 uint8_t output0[16] = {0};
225 uint8_t output255[16] = {0};
226 store_u8_16(output0, blend_u8_16(load_u8_16(a), load_u8_16(b), 0));
227 store_u8_16(output255, blend_u8_16(load_u8_16(a), load_u8_16(b), 255));
228 for (int i = 0; i < 16; i++) {
229 if (output0[i] != 100) return false; // amount=0 -> a
230 int diff = (int)output255[i] - 200;
231 if (diff < -2 || diff > 2) return false; // amount=255 -> ~b
232 }
233 return true;
234}
235
236// ============================================================================
237// u8x16 Comparison Tests
238// ============================================================================
239
240inline bool test_min_u8_16() {
241 uint8_t a[16] = {10,200,30,240, 0,255,1,254, 10,200,30,240, 0,255,1,254};
242 uint8_t b[16] = {20,100,40,120, 0,0,255,255, 20,100,40,120, 0,0,255,255};
243 uint8_t expected[16] = {10,100,30,120, 0,0,1,254, 10,100,30,120, 0,0,1,254};
244 uint8_t output[16] = {0};
245 store_u8_16(output, min_u8_16(load_u8_16(a), load_u8_16(b)));
246 return compare_u8(expected, output, 16);
247}
248
249inline bool test_max_u8_16() {
250 uint8_t a[16] = {10,200,30,240, 0,255,1,254, 10,200,30,240, 0,255,1,254};
251 uint8_t b[16] = {20,100,40,120, 0,0,255,255, 20,100,40,120, 0,0,255,255};
252 uint8_t expected[16] = {20,200,40,240, 0,255,255,255, 20,200,40,240, 0,255,255,255};
253 uint8_t output[16] = {0};
254 store_u8_16(output, max_u8_16(load_u8_16(a), load_u8_16(b)));
255 return compare_u8(expected, output, 16);
256}
257
258inline bool test_avg_u8_16() {
259 uint8_t a[16] = {100,200,50,0, 255,0,1,254, 100,200,50,0, 255,0,1,254};
260 uint8_t b[16] = {200,100,150,0, 255,0,1,254, 200,100,150,0, 255,0,1,254};
261 uint8_t output[16] = {0};
262 store_u8_16(output, avg_u8_16(load_u8_16(a), load_u8_16(b)));
263 for (int i = 0; i < 16; i++) {
264 int expected = (a[i] + b[i]) / 2;
265 int diff = (int)output[i] - expected;
266 if (diff < -1 || diff > 1) return false;
267 }
268 return true;
269}
270
271inline bool test_avg_round_u8_16() {
272 uint8_t a[16] = {101,201,51,1, 255,0,3,253, 101,201,51,1, 255,0,3,253};
273 uint8_t b[16] = {200,100,150,0, 254,1,2,252, 200,100,150,0, 254,1,2,252};
274 uint8_t output[16] = {0};
275 store_u8_16(output, avg_round_u8_16(load_u8_16(a), load_u8_16(b)));
276 for (int i = 0; i < 16; i++) {
277 int expected = (a[i] + b[i] + 1) / 2;
278 int diff = (int)output[i] - expected;
279 if (diff < -1 || diff > 1) return false;
280 }
281 return true;
282}
283
284// ============================================================================
285// u8x16 Bitwise Tests
286// ============================================================================
287
288inline bool test_and_u8_16() {
289 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0xFF,0x0F,0xF0,0xAA, 0xFF,0x0F,0xF0,0xAA, 0xFF,0x0F,0xF0,0xAA};
290 uint8_t b[16] = {0x0F,0xFF,0xFF,0x55, 0x0F,0xFF,0xFF,0x55, 0x0F,0xFF,0xFF,0x55, 0x0F,0xFF,0xFF,0x55};
291 uint8_t expected[16] = {0x0F,0x0F,0xF0,0x00, 0x0F,0x0F,0xF0,0x00, 0x0F,0x0F,0xF0,0x00, 0x0F,0x0F,0xF0,0x00};
292 uint8_t output[16] = {0};
293 store_u8_16(output, and_u8_16(load_u8_16(a), load_u8_16(b)));
294 return compare_u8(expected, output, 16);
295}
296
297inline bool test_or_u8_16() {
298 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0x00,0x00,0xFF,0x80, 0xFF,0x0F,0xF0,0xAA, 0x00,0x00,0xFF,0x80};
299 uint8_t b[16] = {0x0F,0xFF,0xFF,0x55, 0x00,0xFF,0x00,0x01, 0x0F,0xFF,0xFF,0x55, 0x00,0xFF,0x00,0x01};
300 uint8_t expected[16] = {0xFF,0xFF,0xFF,0xFF, 0x00,0xFF,0xFF,0x81, 0xFF,0xFF,0xFF,0xFF, 0x00,0xFF,0xFF,0x81};
301 uint8_t output[16] = {0};
302 store_u8_16(output, or_u8_16(load_u8_16(a), load_u8_16(b)));
303 return compare_u8(expected, output, 16);
304}
305
306inline bool test_xor_u8_16() {
307 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0xFF,0x00,0xAB,0x01, 0xFF,0x0F,0xF0,0xAA, 0xFF,0x00,0xAB,0x01};
308 uint8_t b[16] = {0x0F,0xFF,0xFF,0x55, 0xFF,0x00,0xAB,0x01, 0x0F,0xFF,0xFF,0x55, 0xFF,0x00,0xAB,0x01};
309 uint8_t expected[16] = {0xF0,0xF0,0x0F,0xFF, 0x00,0x00,0x00,0x00, 0xF0,0xF0,0x0F,0xFF, 0x00,0x00,0x00,0x00};
310 uint8_t output[16] = {0};
311 store_u8_16(output, xor_u8_16(load_u8_16(a), load_u8_16(b)));
312 return compare_u8(expected, output, 16);
313}
314
315inline bool test_andnot_u8_16() {
316 // andnot(a, b) = ~a & b
317 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0x00,0xFF,0x55,0x80, 0xFF,0x0F,0xF0,0xAA, 0x00,0xFF,0x55,0x80};
318 uint8_t b[16] = {0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF};
319 uint8_t expected[16] = {0x00,0xF0,0x0F,0x55, 0xFF,0x00,0xAA,0x7F, 0x00,0xF0,0x0F,0x55, 0xFF,0x00,0xAA,0x7F};
320 uint8_t output[16] = {0};
321 store_u8_16(output, andnot_u8_16(load_u8_16(a), load_u8_16(b)));
322 return compare_u8(expected, output, 16);
323}
324
325// ============================================================================
326// u32x4 Broadcast / Construct Tests
327// ============================================================================
328
329inline bool test_set1_u32_4() {
330 uint32_t output[4] = {0};
331 store_u32_4(output, set1_u32_4(0xDEADBEEF));
332 uint32_t expected[4] = {0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF};
333 return compare_u32(expected, output, 4);
334}
335
336inline bool test_set1_u32_4_zero() {
337 uint32_t output[4] = {1,1,1,1};
338 store_u32_4(output, set1_u32_4(0));
339 uint32_t expected[4] = {0, 0, 0, 0};
340 return compare_u32(expected, output, 4);
341}
342
343inline bool test_set_u32_4() {
344 uint32_t output[4] = {0};
345 store_u32_4(output, set_u32_4(0x11111111, 0x22222222, 0x33333333, 0x44444444));
346 uint32_t expected[4] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
347 return compare_u32(expected, output, 4);
348}
349
350inline bool test_set1_f32_4() {
351 float output[4] = {0.0f};
352 store_f32_4(output, set1_f32_4(3.14f));
353 float expected[4] = {3.14f, 3.14f, 3.14f, 3.14f};
354 return compare_f32(expected, output, 4);
355}
356
357// ============================================================================
358// u32x4 Extract Test
359// ============================================================================
360
361inline bool test_extract_u32_4() {
362 simd_u32x4 v = set_u32_4(10, 20, 30, 40);
363 if (extract_u32_4(v, 0) != 10) return false;
364 if (extract_u32_4(v, 1) != 20) return false;
365 if (extract_u32_4(v, 2) != 30) return false;
366 if (extract_u32_4(v, 3) != 40) return false;
367 return true;
368}
369
370// ============================================================================
371// u32x4 Bitwise Tests
372// ============================================================================
373
374inline bool test_xor_u32_4() {
375 uint32_t a[4] = {0xFFFFFFFF, 0x0F0F0F0F, 0xAAAAAAAA, 0x12345678};
376 uint32_t b[4] = {0x0F0F0F0F, 0xFFFFFFFF, 0x55555555, 0x12345678};
377 uint32_t expected[4] = {0xF0F0F0F0, 0xF0F0F0F0, 0xFFFFFFFF, 0x00000000};
378 uint32_t output[4] = {0};
379 store_u32_4(output, xor_u32_4(load_u32_4(a), load_u32_4(b)));
380 return compare_u32(expected, output, 4);
381}
382
383inline bool test_and_u32_4() {
384 uint32_t a[4] = {0xFFFF0000, 0x0F0F0F0F, 0xAAAAAAAA, 0x00000000};
385 uint32_t b[4] = {0x0000FFFF, 0xF0F0F0F0, 0xFFFFFFFF, 0xFFFFFFFF};
386 uint32_t expected[4] = {0x00000000, 0x00000000, 0xAAAAAAAA, 0x00000000};
387 uint32_t output[4] = {0};
388 store_u32_4(output, and_u32_4(load_u32_4(a), load_u32_4(b)));
389 return compare_u32(expected, output, 4);
390}
391
392inline bool test_or_u32_4() {
393 uint32_t a[4] = {0xFFFF0000, 0x0F0F0F0F, 0xAAAAAAAA, 0x00000000};
394 uint32_t b[4] = {0x0000FFFF, 0xF0F0F0F0, 0x55555555, 0x00000000};
395 uint32_t expected[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
396 uint32_t output[4] = {0};
397 store_u32_4(output, or_u32_4(load_u32_4(a), load_u32_4(b)));
398 return compare_u32(expected, output, 4);
399}
400
401// ============================================================================
402// i32x4 Arithmetic Tests
403// ============================================================================
404
405inline bool test_add_i32_4() {
406 // Use signed values stored as u32
407 uint32_t a[4] = {as_u32(100), as_u32(-100), as_u32(0x7FFFFFFF), as_u32(0)};
408 uint32_t b[4] = {as_u32(200), as_u32(-200), as_u32(1), as_u32(0)};
409 uint32_t output[4] = {0};
410 store_u32_4(output, add_i32_4(load_u32_4(a), load_u32_4(b)));
411 if (as_i32(output[0]) != 300) return false;
412 if (as_i32(output[1]) != -300) return false;
413 // 0x7FFFFFFF + 1 wraps (expected behavior, no saturation)
414 if (output[2] != 0x80000000) return false;
415 if (as_i32(output[3]) != 0) return false;
416 return true;
417}
418
419inline bool test_sub_i32_4() {
420 uint32_t a[4] = {as_u32(300), as_u32(-100), as_u32(0), as_u32(1000)};
421 uint32_t b[4] = {as_u32(200), as_u32(200), as_u32(0), as_u32(1000)};
422 uint32_t output[4] = {0};
423 store_u32_4(output, sub_i32_4(load_u32_4(a), load_u32_4(b)));
424 if (as_i32(output[0]) != 100) return false;
425 if (as_i32(output[1]) != -300) return false;
426 if (as_i32(output[2]) != 0) return false;
427 if (as_i32(output[3]) != 0) return false;
428 return true;
429}
430
431// ============================================================================
432// i32x4 Shift Tests
433// ============================================================================
434
435inline bool test_srl_u32_4() {
436 uint32_t input[4] = {0x80000000, 0xFFFFFFFF, 0x00000010, 0x12345678};
437 uint32_t output[4] = {0};
438 store_u32_4(output, srl_u32_4(load_u32_4(input), 4));
439 uint32_t expected[4] = {0x08000000, 0x0FFFFFFF, 0x00000001, 0x01234567};
440 return compare_u32(expected, output, 4);
441}
442
443inline bool test_sll_u32_4() {
444 uint32_t input[4] = {0x00000001, 0x0FFFFFFF, 0x80000000, 0x12345678};
445 uint32_t output[4] = {0};
446 store_u32_4(output, sll_u32_4(load_u32_4(input), 4));
447 uint32_t expected[4] = {0x00000010, 0xFFFFFFF0, 0x00000000, 0x23456780};
448 return compare_u32(expected, output, 4);
449}
450
451inline bool test_sra_i32_4() {
452 // Arithmetic shift preserves sign bit
453 uint32_t input[4] = {as_u32(-16), as_u32(16), 0x80000000, 0x7FFFFFFF};
454 uint32_t output[4] = {0};
455 store_u32_4(output, sra_i32_4(load_u32_4(input), 2));
456 if (as_i32(output[0]) != -4) return false; // -16 >> 2 = -4 (sign-extended)
457 if (as_i32(output[1]) != 4) return false; // 16 >> 2 = 4
458 if (as_i32(output[2]) != as_i32(0xE0000000)) return false; // 0x80000000 >> 2 sign-extended
459 if (as_i32(output[3]) != as_i32(0x1FFFFFFF)) return false; // 0x7FFFFFFF >> 2
460 return true;
461}
462
463// ============================================================================
464// i32x4 Min/Max Tests (signed)
465// ============================================================================
466
467inline bool test_min_i32_4() {
468 uint32_t a[4] = {as_u32(10), as_u32(-10), as_u32(0x7FFFFFFF), as_u32(-1)};
469 uint32_t b[4] = {as_u32(20), as_u32(10), as_u32((int32_t)0x80000000), as_u32(0)};
470 uint32_t output[4] = {0};
471 store_u32_4(output, min_i32_4(load_u32_4(a), load_u32_4(b)));
472 if (as_i32(output[0]) != 10) return false;
473 if (as_i32(output[1]) != -10) return false;
474 if (as_i32(output[2]) != (int32_t)0x80000000) return false; // INT32_MIN < INT32_MAX
475 if (as_i32(output[3]) != -1) return false; // -1 < 0
476 return true;
477}
478
479inline bool test_max_i32_4() {
480 uint32_t a[4] = {as_u32(10), as_u32(-10), as_u32(0x7FFFFFFF), as_u32(-1)};
481 uint32_t b[4] = {as_u32(20), as_u32(10), as_u32((int32_t)0x80000000), as_u32(0)};
482 uint32_t output[4] = {0};
483 store_u32_4(output, max_i32_4(load_u32_4(a), load_u32_4(b)));
484 if (as_i32(output[0]) != 20) return false;
485 if (as_i32(output[1]) != 10) return false;
486 if (as_i32(output[2]) != 0x7FFFFFFF) return false;
487 if (as_i32(output[3]) != 0) return false;
488 return true;
489}
490
491// ============================================================================
492// Fixed-Point Multiply Tests (Q16.16)
493// ============================================================================
494
495inline bool test_mulhi_i32_4() {
496 // mulhi_i32_4: ((i64)a * (i64)b) >> 16
497 // Test: 0x00010000 * 0x00020000 >> 16 = 0x00020000 (1.0 * 2.0 = 2.0 in Q16.16)
498 uint32_t a[4] = {0x00010000, 0x00020000, as_u32(-0x00010000), 0x00008000};
499 uint32_t b[4] = {0x00020000, 0x00010000, 0x00020000, 0x00008000};
500 uint32_t output[4] = {0};
501 store_u32_4(output, mulhi_i32_4(load_u32_4(a), load_u32_4(b)));
502 if (output[0] != 0x00020000) return false; // 1.0 * 2.0 = 2.0
503 if (output[1] != 0x00020000) return false; // 2.0 * 1.0 = 2.0
504 if (as_i32(output[2]) != -0x00020000) return false; // -1.0 * 2.0 = -2.0
505 if (output[3] != 0x00004000) return false; // 0.5 * 0.5 = 0.25
506 return true;
507}
508
509inline bool test_mulhi_u32_4() {
510 // mulhi_u32_4: ((u64)a * (u64)b) >> 16 (unsigned)
511 uint32_t a[4] = {0x00010000, 0x00020000, 0x00030000, 0x00008000};
512 uint32_t b[4] = {0x00020000, 0x00030000, 0x00010000, 0x00008000};
513 uint32_t output[4] = {0};
514 store_u32_4(output, mulhi_u32_4(load_u32_4(a), load_u32_4(b)));
515 if (output[0] != 0x00020000) return false; // 1.0 * 2.0
516 if (output[1] != 0x00060000) return false; // 2.0 * 3.0
517 if (output[2] != 0x00030000) return false; // 3.0 * 1.0
518 if (output[3] != 0x00004000) return false; // 0.5 * 0.5
519 return true;
520}
521
522inline bool test_mulhi_su32_4() {
523 // mulhi_su32_4: signed a * unsigned b >> 16
524 // When b >= 0, should match mulhi_i32_4
525 uint32_t a[4] = {as_u32(-0x00010000), 0x00020000, as_u32(-0x00020000), 0x00010000};
526 uint32_t b[4] = {0x00020000, 0x00010000, 0x00010000, 0x00030000};
527 uint32_t output[4] = {0};
528 store_u32_4(output, mulhi_su32_4(load_u32_4(a), load_u32_4(b)));
529 if (as_i32(output[0]) != -0x00020000) return false; // -1.0 * 2.0 = -2.0
530 if (output[1] != 0x00020000) return false; // 2.0 * 1.0 = 2.0
531 if (as_i32(output[2]) != -0x00020000) return false; // -2.0 * 1.0 = -2.0
532 if (output[3] != 0x00030000) return false; // 1.0 * 3.0 = 3.0
533 return true;
534}
535
536inline bool test_mulhi32_i32_4() {
537 // mulhi32_i32_4: ((i64)a * (i64)b) >> 32
538 // Test: 0x40000000 * 0x40000000 >> 32 = 0x10000000
539 uint32_t a[4] = {0x40000000, as_u32(-0x40000000), 0x7FFFFFFF, 0x00000001};
540 uint32_t b[4] = {0x40000000, 0x40000000, 0x00000002, 0x7FFFFFFF};
541 uint32_t output[4] = {0};
542 store_u32_4(output, mulhi32_i32_4(load_u32_4(a), load_u32_4(b)));
543 if (output[0] != 0x10000000) return false; // 0.5 * 0.5 = 0.25 (Q31)
544 if (as_i32(output[1]) != as_i32(0xF0000000)) return false; // -0.5 * 0.5 = -0.25
545 if (output[2] != 0x00000000) return false; // small * small -> 0
546 if (output[3] != 0x00000000) return false; // 1 * MAX -> 0 (high bits only)
547 return true;
548}
549
550// ============================================================================
551// u32x4 Interleave / Unpack Tests
552// ============================================================================
553
554inline bool test_unpacklo_u32_4() {
555 // {a0, b0, a1, b1}
556 simd_u32x4 a = set_u32_4(1, 2, 3, 4);
557 simd_u32x4 b = set_u32_4(10, 20, 30, 40);
558 uint32_t output[4] = {0};
559 store_u32_4(output, unpacklo_u32_4(a, b));
560 uint32_t expected[4] = {1, 10, 2, 20};
561 return compare_u32(expected, output, 4);
562}
563
564inline bool test_unpackhi_u32_4() {
565 // {a2, b2, a3, b3}
566 simd_u32x4 a = set_u32_4(1, 2, 3, 4);
567 simd_u32x4 b = set_u32_4(10, 20, 30, 40);
568 uint32_t output[4] = {0};
569 store_u32_4(output, unpackhi_u32_4(a, b));
570 uint32_t expected[4] = {3, 30, 4, 40};
571 return compare_u32(expected, output, 4);
572}
573
575 // {a0, a1, b0, b1}
576 simd_u32x4 a = set_u32_4(1, 2, 3, 4);
577 simd_u32x4 b = set_u32_4(10, 20, 30, 40);
578 uint32_t output[4] = {0};
579 store_u32_4(output, unpacklo_u64_as_u32_4(a, b));
580 uint32_t expected[4] = {1, 2, 10, 20};
581 return compare_u32(expected, output, 4);
582}
583
585 // {a2, a3, b2, b3}
586 simd_u32x4 a = set_u32_4(1, 2, 3, 4);
587 simd_u32x4 b = set_u32_4(10, 20, 30, 40);
588 uint32_t output[4] = {0};
589 store_u32_4(output, unpackhi_u64_as_u32_4(a, b));
590 uint32_t expected[4] = {3, 4, 30, 40};
591 return compare_u32(expected, output, 4);
592}
593
594// ============================================================================
595// f32x4 Arithmetic Tests
596// ============================================================================
597
598inline bool test_add_f32_4() {
599 float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
600 float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};
601 float expected[4] = {6.0f, 8.0f, 10.0f, 12.0f};
602 float output[4] = {0.0f};
603 store_f32_4(output, add_f32_4(load_f32_4(a), load_f32_4(b)));
604 return compare_f32(expected, output, 4);
605}
606
608 float a[4] = {-1.0f, 2.0f, -3.0f, 0.0f};
609 float b[4] = {1.0f, -2.0f, -3.0f, 0.0f};
610 float expected[4] = {0.0f, 0.0f, -6.0f, 0.0f};
611 float output[4] = {0.0f};
612 store_f32_4(output, add_f32_4(load_f32_4(a), load_f32_4(b)));
613 return compare_f32(expected, output, 4);
614}
615
616inline bool test_sub_f32_4() {
617 float a[4] = {10.0f, 20.0f, 30.0f, 40.0f};
618 float b[4] = {1.0f, 2.0f, 3.0f, 4.0f};
619 float expected[4] = {9.0f, 18.0f, 27.0f, 36.0f};
620 float output[4] = {0.0f};
621 store_f32_4(output, sub_f32_4(load_f32_4(a), load_f32_4(b)));
622 return compare_f32(expected, output, 4);
623}
624
625inline bool test_mul_f32_4() {
626 float a[4] = {2.0f, 3.0f, 4.0f, 5.0f};
627 float b[4] = {3.0f, 4.0f, 5.0f, 6.0f};
628 float expected[4] = {6.0f, 12.0f, 20.0f, 30.0f};
629 float output[4] = {0.0f};
630 store_f32_4(output, mul_f32_4(load_f32_4(a), load_f32_4(b)));
631 return compare_f32(expected, output, 4);
632}
633
635 float a[4] = {-2.0f, 3.0f, -4.0f, 0.0f};
636 float b[4] = {3.0f, -4.0f, -5.0f, 100.0f};
637 float expected[4] = {-6.0f, -12.0f, 20.0f, 0.0f};
638 float output[4] = {0.0f};
639 store_f32_4(output, mul_f32_4(load_f32_4(a), load_f32_4(b)));
640 return compare_f32(expected, output, 4);
641}
642
643inline bool test_div_f32_4() {
644 float a[4] = {10.0f, 20.0f, 30.0f, 40.0f};
645 float b[4] = {2.0f, 4.0f, 5.0f, 8.0f};
646 float expected[4] = {5.0f, 5.0f, 6.0f, 5.0f};
647 float output[4] = {0.0f};
648 store_f32_4(output, div_f32_4(load_f32_4(a), load_f32_4(b)));
649 return compare_f32(expected, output, 4);
650}
651
652inline bool test_sqrt_f32_4() {
653 float a[4] = {4.0f, 9.0f, 16.0f, 25.0f};
654 float expected[4] = {2.0f, 3.0f, 4.0f, 5.0f};
655 float output[4] = {0.0f};
656 store_f32_4(output, sqrt_f32_4(load_f32_4(a)));
657 return compare_f32(expected, output, 4);
658}
659
660inline bool test_sqrt_f32_4_zero() {
661 float a[4] = {0.0f, 1.0f, 100.0f, 0.25f};
662 float expected[4] = {0.0f, 1.0f, 10.0f, 0.5f};
663 float output[4] = {0.0f};
664 store_f32_4(output, sqrt_f32_4(load_f32_4(a)));
665 return compare_f32(expected, output, 4);
666}
667
668inline bool test_min_f32_4() {
669 float a[4] = {1.0f, 5.0f, -3.0f, 7.0f};
670 float b[4] = {4.0f, 2.0f, 6.0f, -1.0f};
671 float expected[4] = {1.0f, 2.0f, -3.0f, -1.0f};
672 float output[4] = {0.0f};
673 store_f32_4(output, min_f32_4(load_f32_4(a), load_f32_4(b)));
674 return compare_f32(expected, output, 4);
675}
676
677inline bool test_max_f32_4() {
678 float a[4] = {1.0f, 5.0f, -3.0f, 7.0f};
679 float b[4] = {4.0f, 2.0f, 6.0f, -1.0f};
680 float expected[4] = {4.0f, 5.0f, 6.0f, 7.0f};
681 float output[4] = {0.0f};
682 store_f32_4(output, max_f32_4(load_f32_4(a), load_f32_4(b)));
683 return compare_f32(expected, output, 4);
684}
685
686// ============================================================================
687// Cross-AutoResearch Tests: SIMD vs Scalar Reference
688// These tests compute expected results using explicit scalar math, then compare
689// against the SIMD function output. Catches PIE assembly bugs by never trusting
690// the SIMD implementation for expected values.
691// ============================================================================
692
// Scalar reference: writes a[i] & b[i] to out[i] for the 16 byte lanes.
inline void ref_and_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    int lane = 0;
    while (lane < 16) {
        out[lane] = static_cast<uint8_t>(a[lane] & b[lane]);
        ++lane;
    }
}
// Scalar reference: writes a[i] | b[i] to out[i] for the 16 byte lanes.
inline void ref_or_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    for (int lane = 0; lane != 16; ++lane) {
        const uint8_t merged = static_cast<uint8_t>(a[lane] | b[lane]);
        out[lane] = merged;
    }
}
// Scalar reference: writes a[i] ^ b[i] to out[i] for the 16 byte lanes.
inline void ref_xor_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    for (int lane = 0; lane != 16; ++lane) {
        const uint8_t toggled = static_cast<uint8_t>(a[lane] ^ b[lane]);
        out[lane] = toggled;
    }
}
// Scalar reference: writes (~a[i]) & b[i] to out[i] for the 16 byte lanes.
inline void ref_andnot_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    int lane = 0;
    while (lane < 16) {
        // The complement is computed after integer promotion; the cast back
        // to uint8_t keeps only the low 8 bits.
        out[lane] = static_cast<uint8_t>(~a[lane] & b[lane]);
        ++lane;
    }
}
// Scalar reference: writes a[i] ^ b[i] to out[i] for the 4 word lanes.
inline void ref_xor_u32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    int lane = 0;
    while (lane < 4) {
        out[lane] = a[lane] ^ b[lane];
        ++lane;
    }
}
// Scalar reference: writes a[i] & b[i] to out[i] for the 4 word lanes.
inline void ref_and_u32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const uint32_t masked = a[lane] & b[lane];
        out[lane] = masked;
    }
}
// Scalar reference: writes a[i] | b[i] to out[i] for the 4 word lanes.
inline void ref_or_u32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const uint32_t merged = a[lane] | b[lane];
        out[lane] = merged;
    }
}
715
716// Adversarial cross-autoresearch: alternating bit patterns
718 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
719 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
720 uint8_t expected[16], actual[16];
721 ref_and_u8_16(a, b, expected);
722 store_u8_16(actual, and_u8_16(load_u8_16(a), load_u8_16(b)));
723 return compare_u8(expected, actual, 16);
724}
725
727 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
728 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
729 uint8_t expected[16], actual[16];
730 ref_or_u8_16(a, b, expected);
731 store_u8_16(actual, or_u8_16(load_u8_16(a), load_u8_16(b)));
732 return compare_u8(expected, actual, 16);
733}
734
736 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
737 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
738 uint8_t expected[16], actual[16];
739 ref_xor_u8_16(a, b, expected);
740 store_u8_16(actual, xor_u8_16(load_u8_16(a), load_u8_16(b)));
741 return compare_u8(expected, actual, 16);
742}
743
745 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
746 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
747 uint8_t expected[16], actual[16];
749 store_u8_16(actual, andnot_u8_16(load_u8_16(a), load_u8_16(b)));
750 return compare_u8(expected, actual, 16);
751}
752
753// Adversarial u32 cross-autoresearch: powers of 2, all-ones, sign bit
755 uint32_t a[4] = {0x80000001, 0x7FFFFFFE, 0xDEADBEEF, 0x00000000};
756 uint32_t b[4] = {0x80000001, 0x80000001, 0xCAFEBABE, 0xFFFFFFFF};
757 uint32_t expected[4], actual[4];
758 ref_xor_u32_4(a, b, expected);
759 store_u32_4(actual, xor_u32_4(load_u32_4(a), load_u32_4(b)));
760 return compare_u32(expected, actual, 4);
761}
762
764 uint32_t a[4] = {0x80000001, 0x7FFFFFFE, 0xDEADBEEF, 0x00000000};
765 uint32_t b[4] = {0x80000001, 0x80000001, 0xCAFEBABE, 0xFFFFFFFF};
766 uint32_t expected[4], actual[4];
767 ref_and_u32_4(a, b, expected);
768 store_u32_4(actual, and_u32_4(load_u32_4(a), load_u32_4(b)));
769 return compare_u32(expected, actual, 4);
770}
771
773 uint32_t a[4] = {0x80000001, 0x7FFFFFFE, 0xDEADBEEF, 0x00000000};
774 uint32_t b[4] = {0x80000001, 0x80000001, 0xCAFEBABE, 0xFFFFFFFF};
775 uint32_t expected[4], actual[4];
776 ref_or_u32_4(a, b, expected);
777 store_u32_4(actual, or_u32_4(load_u32_4(a), load_u32_4(b)));
778 return compare_u32(expected, actual, 4);
779}
780
// Scalar reference for scaling: writes (v[i] * scale) >> 8, truncated to a
// byte, to out[i] for the 16 byte lanes.
inline void ref_scale_u8_16(const uint8_t* v, uint8_t scale, uint8_t* out) {
    int lane = 0;
    while (lane < 16) {
        // Widen first so the product (at most 255 * 255) is not truncated.
        const uint16_t widened = v[lane];
        out[lane] = static_cast<uint8_t>((widened * scale) >> 8);
        ++lane;
    }
}
787
789 // Adversarial: mix of 0, 1, 127, 128, 254, 255 with various scale factors
790 uint8_t v[16] = {0,1,127,128, 254,255,0x55,0xAA, 0x0F,0xF0,0x80,0x7F, 0x01,0xFE,0xFF,0x00};
791 uint8_t scale_vals[] = {0, 1, 127, 128, 254, 255};
792 for (int s = 0; s < 6; s++) {
793 uint8_t expected[16], actual[16];
794 ref_scale_u8_16(v, scale_vals[s], expected);
795 store_u8_16(actual, scale_u8_16(load_u8_16(v), scale_vals[s]));
796 if (!compare_u8(expected, actual, 16)) return false;
797 }
798 return true;
799}
800
801// Scalar reference: unsigned saturating add/sub
// Scalar model of unsigned saturating addition over 16 byte lanes:
// out[lane] = min(a[lane] + b[lane], 255).
inline void ref_add_sat_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    for (int lane = 0; lane != 16; ++lane) {
        // Sum is formed in a wider type so overflow past 255 is observable.
        const unsigned total = static_cast<unsigned>(a[lane]) + static_cast<unsigned>(b[lane]);
        if (total > 255u) {
            out[lane] = 255;
        } else {
            out[lane] = static_cast<uint8_t>(total);
        }
    }
}
// Scalar model of unsigned saturating subtraction over 16 byte lanes:
// out[lane] = a[lane] - b[lane], clamped at 0 when b[lane] >= a[lane].
inline void ref_sub_sat_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    for (int lane = 0; lane != 16; ++lane) {
        const int delta = static_cast<int>(a[lane]) - static_cast<int>(b[lane]);
        out[lane] = (delta > 0) ? static_cast<uint8_t>(delta) : static_cast<uint8_t>(0);
    }
}
813
814// Adversarial cross-autoresearch: sat add with every combination
816 // Test adversarial: all-255+all-255, alternating, near-overflow
817 uint8_t a[16] = {255,254,128,127, 0,1,200,50, 0xFF,0x80,0x7F,0x01, 100,200,150,250};
818 uint8_t b[16] = {255,2,128,129, 0,255,56,206, 0x01,0x80,0x81,0xFF, 156,56,106,6};
819 uint8_t expected[16], actual[16];
821 store_u8_16(actual, add_sat_u8_16(load_u8_16(a), load_u8_16(b)));
822 return compare_u8(expected, actual, 16);
823}
824
826 uint8_t a[16] = {255,0,128,127, 0,1,200,50, 0xFF,0x80,0x7F,0x01, 100,200,150,250};
827 uint8_t b[16] = {255,255,128,129, 0,255,56,206, 0x01,0x80,0x81,0xFF, 156,56,106,6};
828 uint8_t expected[16], actual[16];
830 store_u8_16(actual, sub_sat_u8_16(load_u8_16(a), load_u8_16(b)));
831 return compare_u8(expected, actual, 16);
832}
833
834// Scalar reference: i32 wrapping add/sub
// Scalar model of a 4-lane 32-bit add. Lanes are carried as uint32_t so the
// addition wraps modulo 2^32 instead of overflowing a signed type.
inline void ref_add_i32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const uint32_t lhs = a[lane];
        const uint32_t rhs = b[lane];
        out[lane] = lhs + rhs;
    }
}
// Scalar model of a 4-lane 32-bit subtract. Lanes are carried as uint32_t so
// the subtraction wraps modulo 2^32 instead of overflowing a signed type.
inline void ref_sub_i32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const uint32_t lhs = a[lane];
        const uint32_t rhs = b[lane];
        out[lane] = lhs - rhs;
    }
}
841
842// Adversarial wrapping add: overflow, underflow, sign flip
844 uint32_t a[4] = {0x7FFFFFFF, 0x80000000, 0xFFFFFFFF, 0x00000001};
845 uint32_t b[4] = {0x00000001, 0x80000000, 0x00000001, 0xFFFFFFFF};
846 uint32_t expected[4], actual[4];
847 ref_add_i32_4(a, b, expected);
848 store_u32_4(actual, add_i32_4(load_u32_4(a), load_u32_4(b)));
849 return compare_u32(expected, actual, 4);
850}
851
853 uint32_t a[4] = {0x00000000, 0x80000000, 0x7FFFFFFF, 0x00000001};
854 uint32_t b[4] = {0x00000001, 0x7FFFFFFF, 0x80000000, 0x00000002};
855 uint32_t expected[4], actual[4];
856 ref_sub_i32_4(a, b, expected);
857 store_u32_4(actual, sub_i32_4(load_u32_4(a), load_u32_4(b)));
858 return compare_u32(expected, actual, 4);
859}
860
861// Scalar reference: shifts
// Scalar model of a 4-lane logical (zero-filling) right shift.
// All lanes are shifted by the same amount.
inline void ref_srl_u32_4(const uint32_t* v, int shift, uint32_t* out) {
    int lane = 0;
    while (lane < 4) {
        const uint32_t word = v[lane];
        out[lane] = word >> shift;
        ++lane;
    }
}
// Scalar model of a 4-lane left shift; bits shifted past bit 31 are discarded.
// All lanes are shifted by the same amount.
inline void ref_sll_u32_4(const uint32_t* v, int shift, uint32_t* out) {
    int lane = 0;
    while (lane < 4) {
        const uint32_t word = v[lane];
        out[lane] = word << shift;
        ++lane;
    }
}
// Scalar model of a 4-lane arithmetic (sign-propagating) right shift.
// Each lane holds the bit pattern of a signed 32-bit value. The result is
// computed with unsigned arithmetic only, so the reference does not depend on
// how the compiler converts out-of-range values to int32_t or shifts negative
// values (both implementation-defined before C++20).
// @param v     4 input lanes
// @param shift shift amount applied to every lane; expected in [0, 31]
// @param out   receives 4 result lanes
inline void ref_sra_i32_4(const uint32_t* v, int shift, uint32_t* out) {
    // Bit positions vacated by a logical right shift; empty when shift == 0.
    const uint32_t vacated = ~(static_cast<uint32_t>(0xFFFFFFFFu) >> shift);
    for (int i = 0; i < 4; i++) {
        const uint32_t logical = v[i] >> shift;
        const bool negative = (v[i] & 0x80000000u) != 0;
        // Negative lanes get the vacated high bits filled with copies of the sign bit.
        out[i] = negative ? (logical | vacated) : logical;
    }
}
871
873 uint32_t v[4] = {0x80000000, 0xFFFFFFFF, 0x00000001, 0xDEADBEEF};
874 for (int shift = 0; shift <= 31; shift += 7) {
875 uint32_t expected[4], actual[4];
876 ref_srl_u32_4(v, shift, expected);
877 store_u32_4(actual, srl_u32_4(load_u32_4(v), shift));
878 if (!compare_u32(expected, actual, 4)) return false;
879 }
880 return true;
881}
882
884 uint32_t v[4] = {0x80000000, 0xFFFFFFFF, 0x00000001, 0xDEADBEEF};
885 for (int shift = 0; shift <= 31; shift += 7) {
886 uint32_t expected[4], actual[4];
887 ref_sll_u32_4(v, shift, expected);
888 store_u32_4(actual, sll_u32_4(load_u32_4(v), shift));
889 if (!compare_u32(expected, actual, 4)) return false;
890 }
891 return true;
892}
893
895 uint32_t v[4] = {0x80000000, 0xFFFFFFFF, 0x7FFFFFFF, 0xDEADBEEF};
896 for (int shift = 0; shift <= 31; shift += 7) {
897 uint32_t expected[4], actual[4];
898 ref_sra_i32_4(v, shift, expected);
899 store_u32_4(actual, sra_i32_4(load_u32_4(v), shift));
900 if (!compare_u32(expected, actual, 4)) return false;
901 }
902 return true;
903}
904
905// Scalar reference: min/max i32
// Scalar model of a 4-lane signed minimum. Lanes are passed as uint32_t bit
// patterns and compared after reinterpretation as int32_t.
inline void ref_min_i32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const int32_t lhs = static_cast<int32_t>(a[lane]);
        const int32_t rhs = static_cast<int32_t>(b[lane]);
        int32_t smaller = rhs;
        if (lhs < rhs) {
            smaller = lhs;
        }
        out[lane] = static_cast<uint32_t>(smaller);
    }
}
// Scalar model of a 4-lane signed maximum. Lanes are passed as uint32_t bit
// patterns and compared after reinterpretation as int32_t.
inline void ref_max_i32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const int32_t lhs = static_cast<int32_t>(a[lane]);
        const int32_t rhs = static_cast<int32_t>(b[lane]);
        int32_t larger = rhs;
        if (lhs > rhs) {
            larger = lhs;
        }
        out[lane] = static_cast<uint32_t>(larger);
    }
}
918
920 uint32_t a[4] = {0x80000000, 0x7FFFFFFF, 0xFFFFFFFF, 0x00000000};
921 uint32_t b[4] = {0x7FFFFFFF, 0x80000000, 0x00000000, 0xFFFFFFFF};
922 uint32_t expected[4], actual[4];
923 ref_min_i32_4(a, b, expected);
924 store_u32_4(actual, min_i32_4(load_u32_4(a), load_u32_4(b)));
925 return compare_u32(expected, actual, 4);
926}
927
929 uint32_t a[4] = {0x80000000, 0x7FFFFFFF, 0xFFFFFFFF, 0x00000000};
930 uint32_t b[4] = {0x7FFFFFFF, 0x80000000, 0x00000000, 0xFFFFFFFF};
931 uint32_t expected[4], actual[4];
932 ref_max_i32_4(a, b, expected);
933 store_u32_4(actual, max_i32_4(load_u32_4(a), load_u32_4(b)));
934 return compare_u32(expected, actual, 4);
935}
936
937// Scalar reference: multiply variants
// Scalar model of a 4-lane signed multiply that keeps bits [47:16] of the
// 64-bit product: out = (int32)((int64)a * (int64)b >> 16).
// Lanes are passed as uint32_t bit patterns of signed values.
inline void ref_mulhi_i32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const int64_t lhs = static_cast<int32_t>(a[lane]);
        const int64_t rhs = static_cast<int32_t>(b[lane]);
        const int64_t wide = lhs * rhs;
        // Narrow to 32 bits first, then hand back the raw bit pattern.
        out[lane] = static_cast<uint32_t>(static_cast<int32_t>(wide >> 16));
    }
}
// Scalar model of a 4-lane unsigned multiply that keeps bits [47:16] of the
// 64-bit product: out = (uint32)((uint64)a * (uint64)b >> 16).
inline void ref_mulhi_u32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const uint64_t lhs = a[lane];
        const uint64_t rhs = b[lane];
        const uint64_t wide = lhs * rhs;
        out[lane] = static_cast<uint32_t>(wide >> 16);
    }
}
// Scalar model of a 4-lane signed multiply that keeps the upper 32 bits of
// the 64-bit product: out = (int32)((int64)a * (int64)b >> 32).
// Lanes are passed as uint32_t bit patterns of signed values.
inline void ref_mulhi32_i32_4(const uint32_t* a, const uint32_t* b, uint32_t* out) {
    for (int lane = 0; lane != 4; ++lane) {
        const int64_t lhs = static_cast<int32_t>(a[lane]);
        const int64_t rhs = static_cast<int32_t>(b[lane]);
        const int64_t wide = lhs * rhs;
        out[lane] = static_cast<uint32_t>(static_cast<int32_t>(wide >> 32));
    }
}
956
958 uint32_t a[4] = {0x7FFFFFFF, 0x80000000, 0x00010000, 0xFFFF0000};
959 uint32_t b[4] = {0x00020000, 0x00020000, 0xFFFF0000, 0xFFFF0000};
960 uint32_t expected[4], actual[4];
962 store_u32_4(actual, mulhi_i32_4(load_u32_4(a), load_u32_4(b)));
963 return compare_u32(expected, actual, 4);
964}
965
967 uint32_t a[4] = {0xFFFFFFFF, 0x80000000, 0x00010000, 0x00000001};
968 uint32_t b[4] = {0x00000002, 0x00000002, 0x00010000, 0xFFFFFFFF};
969 uint32_t expected[4], actual[4];
971 store_u32_4(actual, mulhi_u32_4(load_u32_4(a), load_u32_4(b)));
972 return compare_u32(expected, actual, 4);
973}
974
976 uint32_t a[4] = {0x7FFFFFFF, 0x80000000, 0x40000000, 0xC0000000};
977 uint32_t b[4] = {0x7FFFFFFF, 0x80000000, 0x40000000, 0x40000000};
978 uint32_t expected[4], actual[4];
980 store_u32_4(actual, mulhi32_i32_4(load_u32_4(a), load_u32_4(b)));
981 return compare_u32(expected, actual, 4);
982}
983
984// Scalar reference: min/max u8
// Scalar model of an unsigned per-byte minimum over 16 lanes.
inline void ref_min_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    for (int lane = 0; lane != 16; ++lane) {
        const uint8_t lhs = a[lane];
        const uint8_t rhs = b[lane];
        out[lane] = (lhs < rhs) ? lhs : rhs;
    }
}
// Scalar model of an unsigned per-byte maximum over 16 lanes.
inline void ref_max_u8_16(const uint8_t* a, const uint8_t* b, uint8_t* out) {
    for (int lane = 0; lane != 16; ++lane) {
        const uint8_t lhs = a[lane];
        const uint8_t rhs = b[lane];
        out[lane] = (lhs > rhs) ? lhs : rhs;
    }
}
991
993 uint8_t a[16] = {0,255,128,127, 1,254,0x55,0xAA, 0x0F,0xF0,0x80,0x7F, 0x01,0xFE,0xFF,0x00};
994 uint8_t b[16] = {255,0,127,128, 254,1,0xAA,0x55, 0xF0,0x0F,0x7F,0x80, 0xFE,0x01,0x00,0xFF};
995 uint8_t expected[16], actual[16];
996 ref_min_u8_16(a, b, expected);
997 store_u8_16(actual, min_u8_16(load_u8_16(a), load_u8_16(b)));
998 return compare_u8(expected, actual, 16);
999}
1000
1002 uint8_t a[16] = {0,255,128,127, 1,254,0x55,0xAA, 0x0F,0xF0,0x80,0x7F, 0x01,0xFE,0xFF,0x00};
1003 uint8_t b[16] = {255,0,127,128, 254,1,0xAA,0x55, 0xF0,0x0F,0x7F,0x80, 0xFE,0x01,0x00,0xFF};
1004 uint8_t expected[16], actual[16];
1005 ref_max_u8_16(a, b, expected);
1006 store_u8_16(actual, max_u8_16(load_u8_16(a), load_u8_16(b)));
1007 return compare_u8(expected, actual, 16);
1008}
1009
1010// Cross-autoresearch: float add/sub/mul checked against scalar float arithmetic
1012 float a[4] = {-1.5f, 0.0f, 3.14159f, 1e10f};
1013 float b[4] = {2.5f, -0.0f, -2.71828f, 1e-10f};
1014 float out[4];
1015 // Test add
1016 store_f32_4(out, add_f32_4(load_f32_4(a), load_f32_4(b)));
1017 for (int i = 0; i < 4; i++) {
1018 float diff = out[i] - (a[i] + b[i]);
1019 if (diff < -0.001f || diff > 0.001f) return false;
1020 }
1021 // Test sub
1022 store_f32_4(out, sub_f32_4(load_f32_4(a), load_f32_4(b)));
1023 for (int i = 0; i < 4; i++) {
1024 float diff = out[i] - (a[i] - b[i]);
1025 if (diff < -0.001f || diff > 0.001f) return false;
1026 }
1027 // Test mul
1028 store_f32_4(out, mul_f32_4(load_f32_4(a), load_f32_4(b)));
1029 for (int i = 0; i < 4; i++) {
1030 float diff = out[i] - (a[i] * b[i]);
1031 if (diff < -1.0f && diff > 1.0f) return false; // large values need loose tolerance
1032 }
1033 return true;
1034}
1035
1036// Cross-autoresearch: aligned load/store with adversarial patterns
1038 FL_ALIGNAS(16) uint32_t src[4] = {0x80000000, 0x7FFFFFFF, 0xDEADBEEF, 0x00000000};
1039 FL_ALIGNAS(16) uint32_t dst[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
1040 simd_u32x4 v = load_u32_4_aligned(src);
1041 store_u32_4_aligned(dst, v);
1042 return compare_u32(src, dst, 4);
1043}
1044
1045// Cross-autoresearch: broadcast + adversarial values
1047 // Test adversarial values: sign bit, all ones, zero, alternating bits
1048 uint32_t test_values[] = {0x80000000, 0xFFFFFFFF, 0x00000000, 0xAAAAAAAA, 0x55555555, 0x01010101};
1049 for (int t = 0; t < 6; t++) {
1050 uint32_t val = test_values[t];
1051 uint32_t output[4];
1052 store_u32_4(output, set1_u32_4(val));
1053 for (int i = 0; i < 4; i++) {
1054 if (output[i] != val) return false;
1055 }
1056 }
1057 return true;
1058}
1059
1060// ============================================================================
1061// Chained Operation Tests (pipeline correctness)
1062// ============================================================================
1063
1065 // Simulate a brightness pipeline: scale by 128 then add bias, verify saturation
1066 uint8_t input[16] = {200,200,200,200, 200,200,200,200, 200,200,200,200, 200,200,200,200};
1067 uint8_t bias[16] = {200,200,200,200, 200,200,200,200, 200,200,200,200, 200,200,200,200};
1068 uint8_t output[16] = {0};
1069 simd_u8x16 v = load_u8_16(input);
1070 v = scale_u8_16(v, 128); // ~100
1071 v = add_sat_u8_16(v, load_u8_16(bias)); // ~300 -> saturated to 255
1072 store_u8_16(output, v);
1073 for (int i = 0; i < 16; i++) {
1074 if (output[i] < 250) return false; // Should be 255 (saturated)
1075 }
1076 return true;
1077}
1078
1080 // Simulate fixed-point angle decomposition: shift, add, mask
1081 uint32_t angles[4] = {0x10000000, 0x20000000, 0x30000000, 0x40000000};
1082 simd_u32x4 v = load_u32_4(angles);
1083 v = srl_u32_4(v, 16); // Extract high 16 bits
1084 v = add_i32_4(v, set1_u32_4(0x100)); // Add offset
1085 v = and_u32_4(v, set1_u32_4(0xFFFF)); // Mask to 16 bits
1086 uint32_t output[4] = {0};
1087 store_u32_4(output, v);
1088 uint32_t expected[4] = {0x1100, 0x2100, 0x3100, 0x4100};
1089 return compare_u32(expected, output, 4);
1090}
1091
1093 // Simulate color: multiply by gain, add bias, clamp to [0, 1]
1094 float input[4] = {0.3f, 0.6f, 0.9f, 1.2f};
1095 float gain[4] = {2.0f, 2.0f, 2.0f, 2.0f};
1096 float output[4] = {0.0f};
1097 simd_f32x4 v = load_f32_4(input);
1098 v = mul_f32_4(v, load_f32_4(gain)); // 0.6, 1.2, 1.8, 2.4
1099 v = min_f32_4(v, set1_f32_4(1.0f)); // clamp to 1.0
1100 v = max_f32_4(v, set1_f32_4(0.0f)); // clamp to 0.0
1101 store_f32_4(output, v);
1102 float expected[4] = {0.6f, 1.0f, 1.0f, 1.0f};
1103 return compare_f32(expected, output, 4);
1104}
1105
1106// ============================================================================
1107// Arithmetic Benchmark: add / sub / mul / div across float, s8x8, s16x16, u16x16, s16x16x4
1108// ============================================================================
1109//
1110// Each test runs 4-wide unrolled loops with feedback to prevent elimination.
1111// s16x16x4 has no division operator, so that cell is skipped.
1112
1113static volatile uint32_t g_bench_sink;
1114
1116 int64_t iterations;
1117 // [op][type]: op={add,sub,mul,div}, type={float,s8x8,s16x16,u16x16,simd}
1122};
1123
1124// Helper: time 4-wide scalar float with a binary op
1125template <typename Op>
1126inline int64_t benchFloat4(int iters, Op op) {
1127 float a0 = 1.5f, a1 = 2.3f, a2 = 0.7f, a3 = 3.1f;
1128 float b0 = 0.5f, b1 = 1.2f, b2 = 2.0f, b3 = 0.9f;
1129 uint32_t t0 = micros();
1130 for (int i = 0; i < iters; i++) {
1131 a0 = op(a0, b0); a1 = op(a1, b1);
1132 a2 = op(a2, b2); a3 = op(a3, b3);
1133 b0 = a0 + 0.001f; b1 = a1 + 0.001f;
1134 b2 = a2 + 0.001f; b3 = a3 + 0.001f;
1135 }
1136 uint32_t t1 = micros();
1137 uint32_t tmp; fl::memcpy(&tmp, &a0, sizeof(tmp));
1138 g_bench_sink = tmp;
1139 return static_cast<int64_t>(t1 - t0);
1140}
1141
1142// Helper: time 4-wide scalar s16x16 with a binary op
1143template <typename Op>
1144inline int64_t benchS16x16_4(int iters, Op op) {
1145 fl::s16x16 a0(1.5f), a1(2.3f), a2(0.7f), a3(3.1f);
1146 fl::s16x16 b0(0.5f), b1(1.2f), b2(2.0f), b3(0.9f);
1148 uint32_t t0 = micros();
1149 for (int i = 0; i < iters; i++) {
1150 a0 = op(a0, b0); a1 = op(a1, b1);
1151 a2 = op(a2, b2); a3 = op(a3, b3);
1152 b0 = a0 + bump; b1 = a1 + bump;
1153 b2 = a2 + bump; b3 = a3 + bump;
1154 }
1155 uint32_t t1 = micros();
1156 g_bench_sink = static_cast<uint32_t>(a0.raw());
1157 return static_cast<int64_t>(t1 - t0);
1158}
1159
1160// Helper: time s16x16x4 SIMD with a binary op
1161template <typename Op>
1162inline int64_t benchSimd4(int iters, Op op) {
1164 set_u32_4(as_u32(fl::s16x16(1.5f).raw()), as_u32(fl::s16x16(2.3f).raw()),
1165 as_u32(fl::s16x16(0.7f).raw()), as_u32(fl::s16x16(3.1f).raw())));
1167 set_u32_4(as_u32(fl::s16x16(0.5f).raw()), as_u32(fl::s16x16(1.2f).raw()),
1168 as_u32(fl::s16x16(2.0f).raw()), as_u32(fl::s16x16(0.9f).raw())));
1170 uint32_t t0 = micros();
1171 for (int i = 0; i < iters; i++) {
1172 a = op(a, b);
1173 b = a + bump;
1174 }
1175 uint32_t t1 = micros();
1176 g_bench_sink = extract_u32_4(a.raw, 0);
1177 return static_cast<int64_t>(t1 - t0);
1178}
1179
// Binary functor: returns a + b for any type T that supports operator+.
struct OpAdd {
    template <typename T>
    T operator()(T lhs, T rhs) const {
        return lhs + rhs;
    }
};
// Binary functor: returns a - b for any type T that supports operator-.
struct OpSub {
    template <typename T>
    T operator()(T lhs, T rhs) const {
        return lhs - rhs;
    }
};
// Binary functor: returns a * b for any type T that supports operator*.
struct OpMul {
    template <typename T>
    T operator()(T lhs, T rhs) const {
        return lhs * rhs;
    }
};
1189// Helper: time 4-wide scalar s8x8 with a binary op
1190template <typename Op>
1191inline int64_t benchS8x8_4(int iters, Op op) {
1192 fl::s8x8 a0(1.5f), a1(2.3f), a2(0.7f), a3(3.1f);
1193 fl::s8x8 b0(0.5f), b1(1.2f), b2(2.0f), b3(0.9f);
1194 fl::s8x8 bump = fl::s8x8::from_raw(1);
1195 uint32_t t0 = micros();
1196 for (int i = 0; i < iters; i++) {
1197 a0 = op(a0, b0); a1 = op(a1, b1);
1198 a2 = op(a2, b2); a3 = op(a3, b3);
1199 b0 = a0 + bump; b1 = a1 + bump;
1200 b2 = a2 + bump; b3 = a3 + bump;
1201 }
1202 uint32_t t1 = micros();
1203 g_bench_sink = static_cast<uint32_t>(a0.raw());
1204 return static_cast<int64_t>(t1 - t0);
1205}
1206
1207// Helper: time 4-wide scalar u16x16 with a binary op
1208template <typename Op>
1209inline int64_t benchU16x16_4(int iters, Op op) {
1210 fl::u16x16 a0(1.5f), a1(2.3f), a2(0.7f), a3(3.1f);
1211 fl::u16x16 b0(0.5f), b1(1.2f), b2(2.0f), b3(0.9f);
1213 uint32_t t0 = micros();
1214 for (int i = 0; i < iters; i++) {
1215 a0 = op(a0, b0); a1 = op(a1, b1);
1216 a2 = op(a2, b2); a3 = op(a3, b3);
1217 b0 = a0 + bump; b1 = a1 + bump;
1218 b2 = a2 + bump; b3 = a3 + bump;
1219 }
1220 uint32_t t1 = micros();
1221 g_bench_sink = static_cast<uint32_t>(a0.raw());
1222 return static_cast<int64_t>(t1 - t0);
1223}
1224
1226 float operator()(float a, float b) const { return a / b; }
1227};
1229 fl::s8x8 operator()(fl::s8x8 a, fl::s8x8 b) const { return a / b; }
1230};
1232 fl::s16x16 operator()(fl::s16x16 a, fl::s16x16 b) const { return a / b; }
1233};
1235 fl::u16x16 operator()(fl::u16x16 a, fl::u16x16 b) const { return a / b; }
1236};
1237
1238inline BenchmarkResult runMultiplyBenchmark(int iters = 10000) {
1240 r.iterations = iters;
1241
1242 // Add
1243 r.add_float_us = benchFloat4(iters, OpAdd());
1244 r.add_s8x8_us = benchS8x8_4(iters, OpAdd());
1245 r.add_s16x16_us = benchS16x16_4(iters, OpAdd());
1246 r.add_u16x16_us = benchU16x16_4(iters, OpAdd());
1247 r.add_simd_us = benchSimd4(iters, OpAdd());
1248
1249 // Sub
1250 r.sub_float_us = benchFloat4(iters, OpSub());
1251 r.sub_s8x8_us = benchS8x8_4(iters, OpSub());
1252 r.sub_s16x16_us = benchS16x16_4(iters, OpSub());
1253 r.sub_u16x16_us = benchU16x16_4(iters, OpSub());
1254 r.sub_simd_us = benchSimd4(iters, OpSub());
1255
1256 // Mul
1257 r.mul_float_us = benchFloat4(iters, OpMul());
1258 r.mul_s8x8_us = benchS8x8_4(iters, OpMul());
1259 r.mul_s16x16_us = benchS16x16_4(iters, OpMul());
1260 r.mul_u16x16_us = benchU16x16_4(iters, OpMul());
1261 r.mul_simd_us = benchSimd4(iters, OpMul());
1262
1263 // Div (no SIMD div for s16x16x4)
1264 r.div_float_us = benchFloat4(iters, OpDivFloat());
1265 r.div_s8x8_us = benchS8x8_4(iters, OpDivS8x8());
1268
1269 return r;
1270}
1271
1272// ============================================================================
1273// Test Runner
1274// ============================================================================
1275
1277 const char* name;
1278 bool (*func)();
1279};
1280
1282inline void getTests(const SimdTestEntry** out_tests, int* out_count) {
1283 static const SimdTestEntry tests[] = {
1284 // Load/Store u8x16
1285 {"load/store u8x16", test_load_store_u8_16},
1286 {"load/store u8x16 boundary", test_load_store_u8_16_boundary},
1287 // Load/Store u32x4
1288 {"load/store u32x4", test_load_store_u32_4},
1289 {"load/store u32x4 boundary", test_load_store_u32_4_boundary},
1290 {"load/store u32x4 aligned", test_load_store_u32_4_aligned},
1291 // Load/Store f32x4
1292 {"load/store f32x4", test_load_store_f32_4},
1293 {"load/store f32x4 special", test_load_store_f32_4_special},
1294 // Saturating Arithmetic u8x16
1295 {"add_sat u8x16", test_add_sat_u8_16},
1296 {"add_sat u8x16 full saturate", test_add_sat_u8_16_full_saturate},
1297 {"sub_sat u8x16", test_sub_sat_u8_16},
1298 {"sub_sat u8x16 full clamp", test_sub_sat_u8_16_full_clamp},
1299 // Scale / Blend u8x16
1300 {"scale u8x16", test_scale_u8_16},
1301 {"scale u8x16 zero", test_scale_u8_16_zero},
1302 {"scale u8x16 full", test_scale_u8_16_full},
1303 {"blend u8x16", test_blend_u8_16},
1304 {"blend u8x16 endpoints", test_blend_u8_16_endpoints},
1305 // Comparison u8x16
1306 {"min u8x16", test_min_u8_16},
1307 {"max u8x16", test_max_u8_16},
1308 {"avg u8x16", test_avg_u8_16},
1309 {"avg_round u8x16", test_avg_round_u8_16},
1310 // Bitwise u8x16
1311 {"and u8x16", test_and_u8_16},
1312 {"or u8x16", test_or_u8_16},
1313 {"xor u8x16", test_xor_u8_16},
1314 {"andnot u8x16", test_andnot_u8_16},
1315 // Broadcast / Construct u32x4
1316 {"set1 u32x4", test_set1_u32_4},
1317 {"set1 u32x4 zero", test_set1_u32_4_zero},
1318 {"set u32x4", test_set_u32_4},
1319 {"set1 f32x4", test_set1_f32_4},
1320 // Extract u32x4
1321 {"extract u32x4", test_extract_u32_4},
1322 // Bitwise u32x4
1323 {"xor u32x4", test_xor_u32_4},
1324 {"and u32x4", test_and_u32_4},
1325 {"or u32x4", test_or_u32_4},
1326 // Arithmetic i32x4
1327 {"add i32x4", test_add_i32_4},
1328 {"sub i32x4", test_sub_i32_4},
1329 // Shift u32x4 / i32x4
1330 {"srl u32x4", test_srl_u32_4},
1331 {"sll u32x4", test_sll_u32_4},
1332 {"sra i32x4", test_sra_i32_4},
1333 // Min/Max i32x4 (signed)
1334 {"min i32x4", test_min_i32_4},
1335 {"max i32x4", test_max_i32_4},
1336 // Fixed-Point Multiply
1337 {"mulhi i32x4 (Q16.16)", test_mulhi_i32_4},
1338 {"mulhi u32x4 (Q16.16)", test_mulhi_u32_4},
1339 {"mulhi su32x4 (Q16.16)", test_mulhi_su32_4},
1340 {"mulhi32 i32x4 (>>32)", test_mulhi32_i32_4},
1341 // Interleave / Unpack
1342 {"unpacklo u32x4", test_unpacklo_u32_4},
1343 {"unpackhi u32x4", test_unpackhi_u32_4},
1344 {"unpacklo u64 as u32x4", test_unpacklo_u64_as_u32_4},
1345 {"unpackhi u64 as u32x4", test_unpackhi_u64_as_u32_4},
1346 // Float Arithmetic
1347 {"add f32x4", test_add_f32_4},
1348 {"add f32x4 negative", test_add_f32_4_negative},
1349 {"sub f32x4", test_sub_f32_4},
1350 {"mul f32x4", test_mul_f32_4},
1351 {"mul f32x4 negative", test_mul_f32_4_negative},
1352 {"div f32x4", test_div_f32_4},
1353 {"sqrt f32x4", test_sqrt_f32_4},
1354 {"sqrt f32x4 zero", test_sqrt_f32_4_zero},
1355 {"min f32x4", test_min_f32_4},
1356 {"max f32x4", test_max_f32_4},
1357 // Cross-AutoResearch: SIMD vs Scalar Reference (adversarial patterns)
1358 {"crossval scale u8x16", test_crossval_scale_u8_16},
1359 {"crossval add_sat u8x16", test_crossval_add_sat_u8_16},
1360 {"crossval sub_sat u8x16", test_crossval_sub_sat_u8_16},
1361 {"crossval aligned load/store", test_crossval_aligned_load_store},
1362 {"crossval AND u8x16", test_crossval_and_u8_16},
1363 {"crossval OR u8x16", test_crossval_or_u8_16},
1364 {"crossval XOR u8x16", test_crossval_xor_u8_16},
1365 {"crossval ANDNOT u8x16", test_crossval_andnot_u8_16},
1366 {"crossval XOR u32x4", test_crossval_xor_u32_4},
1367 {"crossval AND u32x4", test_crossval_and_u32_4},
1368 {"crossval OR u32x4", test_crossval_or_u32_4},
1369 {"crossval broadcast u32x4", test_crossval_set1_u32_4},
1370 // Cross-AutoResearch: i32 arithmetic, shifts, min/max, multiply, u8 min/max, float
1371 {"crossval add i32x4", test_crossval_add_i32_4},
1372 {"crossval sub i32x4", test_crossval_sub_i32_4},
1373 {"crossval srl u32x4", test_crossval_srl_u32_4},
1374 {"crossval sll u32x4", test_crossval_sll_u32_4},
1375 {"crossval sra i32x4", test_crossval_sra_i32_4},
1376 {"crossval min i32x4", test_crossval_min_i32_4},
1377 {"crossval max i32x4", test_crossval_max_i32_4},
1378 {"crossval mulhi i32x4", test_crossval_mulhi_i32_4},
1379 {"crossval mulhi u32x4", test_crossval_mulhi_u32_4},
1380 {"crossval mulhi32 i32x4", test_crossval_mulhi32_i32_4},
1381 {"crossval min u8x16", test_crossval_min_u8_16},
1382 {"crossval max u8x16", test_crossval_max_u8_16},
1383 {"crossval float ops", test_crossval_float_ops},
1384 // Pipeline / Chained Operation Tests
1385 {"pipeline u8 scale+add+clamp", test_u8_pipeline_scale_add_clamp},
1386 {"pipeline i32 shift+add+mask", test_i32_pipeline_shift_add_mask},
1387 {"pipeline f32 mul+add+clamp", test_f32_pipeline_mul_add_clamp},
1388 };
1389 *out_tests = tests;
1390 *out_count = sizeof(tests) / sizeof(tests[0]);
1391}
1392
1394inline int runSimdTests() {
1395 const SimdTestEntry* tests = nullptr;
1396 int num_tests = 0;
1397 getTests(&tests, &num_tests);
1398
1399 int passed = 0;
1400 int failed = 0;
1401
1402 FL_PRINT("\n[SIMD AUTORESEARCH]");
1403 FL_PRINT("────────────────────────────────────────────────────────────────");
1404
1405#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
1406 FL_PRINT(" SIMD Backend: x86 SSE2");
1407#elif defined(__XTENSA__) && FL_XTENSA_HAS_PIE
1408 FL_PRINT(" SIMD Backend: Xtensa PIE (ESP32-S3)");
1409#elif defined(__XTENSA__)
1410 FL_PRINT(" SIMD Backend: Xtensa scalar");
1411#elif defined(__riscv)
1412 FL_PRINT(" SIMD Backend: RISC-V scalar");
1413#else
1414 FL_PRINT(" SIMD Backend: Scalar fallback");
1415#endif
1416
1417 fl::sstream ss;
1418 ss << " Running " << num_tests << " SIMD tests...\n";
1419 FL_PRINT(ss.str());
1420
1421 for (int i = 0; i < num_tests; i++) {
1422 bool ok = tests[i].func();
1423 if (ok) {
1424 passed++;
1425 ss.clear();
1426 ss << " [PASS] " << tests[i].name;
1427 FL_PRINT(ss.str());
1428 } else {
1429 failed++;
1430 ss.clear();
1431 ss << " [FAIL] " << tests[i].name;
1432 FL_ERROR(ss.str());
1433 }
1434 }
1435
1436 ss.clear();
1437 ss << "\n[SIMD RESULTS] " << passed << "/" << num_tests << " passed";
1438 if (failed > 0) {
1439 ss << ", " << failed << " FAILED";
1440 FL_ERROR(ss.str());
1441 } else {
1442 ss << " - ALL PASSED";
1443 FL_PRINT(ss.str());
1444 }
1445
1446 return failed;
1447}
1448
1449} // namespace simd_check
1450} // namespace autoresearch
fl::UISlider scale("Scale", 4,.1, 4,.1)
expected type for operations that can fail (C++23-style)
Definition expected.h:79
static constexpr FASTLED_FORCE_INLINE s16x16 from_raw(i32 raw) FL_NOEXCEPT
Definition s16x16.h:54
static constexpr FASTLED_FORCE_INLINE s8x8 from_raw(i16 raw) FL_NOEXCEPT
Definition s8x8.h:53
string str() const FL_NOEXCEPT
Definition strstream.h:43
void clear() FL_NOEXCEPT
Definition strstream.h:358
static constexpr FASTLED_FORCE_INLINE u16x16 from_raw(u32 raw) FL_NOEXCEPT
Definition u16x16.h:53
static uint32_t t
Definition Luminova.h:55
#define FL_ERROR(X)
Definition log.h:219
#define FL_PRINT(X)
Print without prefix (like FL_WARN but without "WARN: " prefix) Uses sstream for dynamic formatting (...
Definition log.h:457
uint32_t as_u32(int32_t v)
void ref_srl_u32_4(const uint32_t *v, int shift, uint32_t *out)
void ref_and_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_add_sat_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
int64_t benchSimd4(int iters, Op op)
bool compare_u32(const uint32_t *a, const uint32_t *b, size_t n)
int64_t benchS16x16_4(int iters, Op op)
void ref_max_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
void ref_max_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
int64_t benchFloat4(int iters, Op op)
void ref_scale_u8_16(const uint8_t *v, uint8_t scale, uint8_t *out)
int runSimdTests()
Run the full SIMD test suite. Returns the number of failures.
void ref_xor_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
int64_t benchS8x8_4(int iters, Op op)
void ref_andnot_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
void ref_mulhi_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_sll_u32_4(const uint32_t *v, int shift, uint32_t *out)
int32_t as_i32(uint32_t v)
void ref_and_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
void getTests(const SimdTestEntry **out_tests, int *out_count)
Get the static test table. Used by both runSimdTests() and the RPC handler.
void ref_or_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_min_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_add_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_or_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
void ref_sub_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool compare_u8(const uint8_t *a, const uint8_t *b, size_t n)
void ref_xor_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_sra_i32_4(const uint32_t *v, int shift, uint32_t *out)
void ref_min_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
int64_t benchU16x16_4(int iters, Op op)
bool compare_f32(const float *a, const float *b, size_t n, float eps=0.001f)
BenchmarkResult runMultiplyBenchmark(int iters=10000)
void ref_mulhi_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_mulhi32_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_sub_sat_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
static volatile uint32_t g_bench_sink
platforms::simd_u32x4 simd_u32x4
Definition types.h:26
platforms::simd_u8x16 simd_u8x16
Definition types.h:24
platforms::simd_f32x4 simd_f32x4
Definition types.h:27
void * memcpy(void *dest, const void *src, size_t n) FL_NOEXCEPT
fl::u32 uint32_t
Definition s16x16x4.h:219
fl::u16 uint16_t
Definition s16x16x4.h:214
fl::u32 micros()
Universal microsecond timer - returns microseconds since system startup.
fl::i32 int32_t
Definition s16x16x4.h:220
unsigned char uint8_t
Definition s16x16x4.h:209
SIMD 4-wide s0x32 fixed-point vector type.
#define FL_ALIGNAS(N)
SIMD 4-wide s16x16 fixed-point vector type.
Umbrella header for SIMD subsystem.
float operator()(float a, float b) const
fl::s16x16 operator()(fl::s16x16 a, fl::s16x16 b) const
fl::s8x8 operator()(fl::s8x8 a, fl::s8x8 b) const
fl::u16x16 operator()(fl::u16x16 a, fl::u16x16 b) const
static FASTLED_FORCE_INLINE s16x16x4 from_raw(simd::simd_u32x4 r)
Definition s16x16x4.h:24
simd::simd_u32x4 raw
Definition s16x16x4.h:20
static FASTLED_FORCE_INLINE s16x16x4 set1(s16x16 value)
Definition s16x16x4.h:41
4-wide s16x16 vector (general fixed-point) Backed by 128-bit SIMD register (4× i32 in Q16 format)
Definition s16x16x4.h:19