29 for (
size_t i = 0; i < n; i++) {
30 if (a[i] != b[i])
return false;
36 for (
size_t i = 0; i < n; i++) {
37 if (a[i] != b[i])
return false;
42inline bool compare_f32(
const float* a,
const float* b,
size_t n,
float eps = 0.001f) {
43 for (
size_t i = 0; i < n; i++) {
44 float diff = a[i] - b[i];
45 if (diff < -eps || diff > eps)
return false;
59 uint8_t input[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
62 store_u8_16(output, v);
67 uint8_t input[16] = {0,255,0,255, 0,255,0,255, 0,255,0,255, 0,255,0,255};
70 store_u8_16(output, v);
79 uint32_t input[4] = {0x12345678, 0x9ABCDEF0, 0xFEDCBA98, 0x76543210};
82 store_u32_4(output, v);
87 uint32_t input[4] = {0, 0xFFFFFFFF, 1, 0x80000000};
90 store_u32_4(output, v);
98 store_u32_4_aligned(output, v);
107 float input[4] = {1.5f, 2.5f, 3.5f, 4.5f};
108 float output[4] = {0.0f};
110 store_f32_4(output, v);
115 float input[4] = {0.0f, -0.0f, 1e30f, -1e30f};
116 float output[4] = {0.0f};
118 store_f32_4(output, v);
120 for (
int i = 0; i < 4; i++) {
121 float diff = input[i] - output[i];
122 if (diff < -0.001f || diff > 0.001f)
return false;
132 uint8_t a[16] = {100,150,200,250, 100,150,200,250, 100,150,200,250, 100,150,200,250};
133 uint8_t b[16] = {50,100,50,100, 50,100,50,100, 50,100,50,100, 50,100,50,100};
134 uint8_t expected[16] = {150,250,250,255, 150,250,250,255, 150,250,250,255, 150,250,250,255};
138 store_u8_16(output, add_sat_u8_16(va, vb));
144 uint8_t a[16] = {255,255,0,0, 128,1,254,255, 255,255,0,0, 128,1,254,255};
145 uint8_t b[16] = {255,1,0,0, 128,255,2,0, 255,1,0,0, 128,255,2,0};
146 uint8_t expected[16] = {255,255,0,0, 255,255,255,255, 255,255,0,0, 255,255,255,255};
148 store_u8_16(output, add_sat_u8_16(load_u8_16(a), load_u8_16(b)));
153 uint8_t a[16] = {100,50,200,10, 100,50,200,10, 100,50,200,10, 100,50,200,10};
154 uint8_t b[16] = {50,100,50,100, 50,100,50,100, 50,100,50,100, 50,100,50,100};
155 uint8_t expected[16] = {50,0,150,0, 50,0,150,0, 50,0,150,0, 50,0,150,0};
157 store_u8_16(output, sub_sat_u8_16(load_u8_16(a), load_u8_16(b)));
163 uint8_t a[16] = {0,255,100,0, 1,0,0,0, 0,255,100,0, 1,0,0,0};
164 uint8_t b[16] = {255,0,100,0, 0,1,255,128, 255,0,100,0, 0,1,255,128};
165 uint8_t expected[16] = {0,255,0,0, 1,0,0,0, 0,255,0,0, 1,0,0,0};
167 store_u8_16(output, sub_sat_u8_16(load_u8_16(a), load_u8_16(b)));
176 uint8_t a[16] = {255,128,64,32, 255,128,64,32, 255,128,64,32, 255,128,64,32};
178 store_u8_16(output, scale_u8_16(load_u8_16(a), 128));
179 for (
int i = 0; i < 16; i++) {
181 int diff = (int)output[i] -
expected;
182 if (diff < -1 || diff > 1)
return false;
188 uint8_t a[16] = {255,128,64,32, 255,128,64,32, 255,128,64,32, 255,128,64,32};
189 uint8_t expected[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
191 store_u8_16(output, scale_u8_16(load_u8_16(a), 0));
197 uint8_t a[16] = {255,128,64,32, 10,200,100,50, 255,128,64,32, 10,200,100,50};
199 store_u8_16(output, scale_u8_16(load_u8_16(a), 255));
200 for (
int i = 0; i < 16; i++) {
202 int diff = (int)output[i] -
expected;
203 if (diff < -1 || diff > 1)
return false;
209 uint8_t a[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
210 uint8_t b[16] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255};
212 store_u8_16(output, blend_u8_16(load_u8_16(a), load_u8_16(b), 128));
213 for (
int i = 0; i < 16; i++) {
214 int diff = (int)output[i] - 128;
215 if (diff < -2 || diff > 2)
return false;
222 uint8_t a[16] = {100,100,100,100, 100,100,100,100, 100,100,100,100, 100,100,100,100};
223 uint8_t b[16] = {200,200,200,200, 200,200,200,200, 200,200,200,200, 200,200,200,200};
226 store_u8_16(output0, blend_u8_16(load_u8_16(a), load_u8_16(b), 0));
227 store_u8_16(output255, blend_u8_16(load_u8_16(a), load_u8_16(b), 255));
228 for (
int i = 0; i < 16; i++) {
229 if (output0[i] != 100)
return false;
230 int diff = (int)output255[i] - 200;
231 if (diff < -2 || diff > 2)
return false;
241 uint8_t a[16] = {10,200,30,240, 0,255,1,254, 10,200,30,240, 0,255,1,254};
242 uint8_t b[16] = {20,100,40,120, 0,0,255,255, 20,100,40,120, 0,0,255,255};
243 uint8_t expected[16] = {10,100,30,120, 0,0,1,254, 10,100,30,120, 0,0,1,254};
245 store_u8_16(output, min_u8_16(load_u8_16(a), load_u8_16(b)));
250 uint8_t a[16] = {10,200,30,240, 0,255,1,254, 10,200,30,240, 0,255,1,254};
251 uint8_t b[16] = {20,100,40,120, 0,0,255,255, 20,100,40,120, 0,0,255,255};
252 uint8_t expected[16] = {20,200,40,240, 0,255,255,255, 20,200,40,240, 0,255,255,255};
254 store_u8_16(output, max_u8_16(load_u8_16(a), load_u8_16(b)));
259 uint8_t a[16] = {100,200,50,0, 255,0,1,254, 100,200,50,0, 255,0,1,254};
260 uint8_t b[16] = {200,100,150,0, 255,0,1,254, 200,100,150,0, 255,0,1,254};
262 store_u8_16(output, avg_u8_16(load_u8_16(a), load_u8_16(b)));
263 for (
int i = 0; i < 16; i++) {
265 int diff = (int)output[i] -
expected;
266 if (diff < -1 || diff > 1)
return false;
272 uint8_t a[16] = {101,201,51,1, 255,0,3,253, 101,201,51,1, 255,0,3,253};
273 uint8_t b[16] = {200,100,150,0, 254,1,2,252, 200,100,150,0, 254,1,2,252};
275 store_u8_16(output, avg_round_u8_16(load_u8_16(a), load_u8_16(b)));
276 for (
int i = 0; i < 16; i++) {
277 int expected = (a[i] + b[i] + 1) / 2;
278 int diff = (int)output[i] -
expected;
279 if (diff < -1 || diff > 1)
return false;
289 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0xFF,0x0F,0xF0,0xAA, 0xFF,0x0F,0xF0,0xAA, 0xFF,0x0F,0xF0,0xAA};
290 uint8_t b[16] = {0x0F,0xFF,0xFF,0x55, 0x0F,0xFF,0xFF,0x55, 0x0F,0xFF,0xFF,0x55, 0x0F,0xFF,0xFF,0x55};
291 uint8_t expected[16] = {0x0F,0x0F,0xF0,0x00, 0x0F,0x0F,0xF0,0x00, 0x0F,0x0F,0xF0,0x00, 0x0F,0x0F,0xF0,0x00};
293 store_u8_16(output, and_u8_16(load_u8_16(a), load_u8_16(b)));
298 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0x00,0x00,0xFF,0x80, 0xFF,0x0F,0xF0,0xAA, 0x00,0x00,0xFF,0x80};
299 uint8_t b[16] = {0x0F,0xFF,0xFF,0x55, 0x00,0xFF,0x00,0x01, 0x0F,0xFF,0xFF,0x55, 0x00,0xFF,0x00,0x01};
300 uint8_t expected[16] = {0xFF,0xFF,0xFF,0xFF, 0x00,0xFF,0xFF,0x81, 0xFF,0xFF,0xFF,0xFF, 0x00,0xFF,0xFF,0x81};
302 store_u8_16(output, or_u8_16(load_u8_16(a), load_u8_16(b)));
307 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0xFF,0x00,0xAB,0x01, 0xFF,0x0F,0xF0,0xAA, 0xFF,0x00,0xAB,0x01};
308 uint8_t b[16] = {0x0F,0xFF,0xFF,0x55, 0xFF,0x00,0xAB,0x01, 0x0F,0xFF,0xFF,0x55, 0xFF,0x00,0xAB,0x01};
309 uint8_t expected[16] = {0xF0,0xF0,0x0F,0xFF, 0x00,0x00,0x00,0x00, 0xF0,0xF0,0x0F,0xFF, 0x00,0x00,0x00,0x00};
311 store_u8_16(output, xor_u8_16(load_u8_16(a), load_u8_16(b)));
317 uint8_t a[16] = {0xFF,0x0F,0xF0,0xAA, 0x00,0xFF,0x55,0x80, 0xFF,0x0F,0xF0,0xAA, 0x00,0xFF,0x55,0x80};
318 uint8_t b[16] = {0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF};
319 uint8_t expected[16] = {0x00,0xF0,0x0F,0x55, 0xFF,0x00,0xAA,0x7F, 0x00,0xF0,0x0F,0x55, 0xFF,0x00,0xAA,0x7F};
321 store_u8_16(output, andnot_u8_16(load_u8_16(a), load_u8_16(b)));
331 store_u32_4(output, set1_u32_4(0xDEADBEEF));
338 store_u32_4(output, set1_u32_4(0));
345 store_u32_4(output, set_u32_4(0x11111111, 0x22222222, 0x33333333, 0x44444444));
351 float output[4] = {0.0f};
352 store_f32_4(output, set1_f32_4(3.14f));
353 float expected[4] = {3.14f, 3.14f, 3.14f, 3.14f};
363 if (extract_u32_4(v, 0) != 10)
return false;
364 if (extract_u32_4(v, 1) != 20)
return false;
365 if (extract_u32_4(v, 2) != 30)
return false;
366 if (extract_u32_4(v, 3) != 40)
return false;
375 uint32_t a[4] = {0xFFFFFFFF, 0x0F0F0F0F, 0xAAAAAAAA, 0x12345678};
376 uint32_t b[4] = {0x0F0F0F0F, 0xFFFFFFFF, 0x55555555, 0x12345678};
379 store_u32_4(output, xor_u32_4(load_u32_4(a), load_u32_4(b)));
384 uint32_t a[4] = {0xFFFF0000, 0x0F0F0F0F, 0xAAAAAAAA, 0x00000000};
385 uint32_t b[4] = {0x0000FFFF, 0xF0F0F0F0, 0xFFFFFFFF, 0xFFFFFFFF};
388 store_u32_4(output, and_u32_4(load_u32_4(a), load_u32_4(b)));
393 uint32_t a[4] = {0xFFFF0000, 0x0F0F0F0F, 0xAAAAAAAA, 0x00000000};
394 uint32_t b[4] = {0x0000FFFF, 0xF0F0F0F0, 0x55555555, 0x00000000};
397 store_u32_4(output, or_u32_4(load_u32_4(a), load_u32_4(b)));
410 store_u32_4(output, add_i32_4(load_u32_4(a), load_u32_4(b)));
411 if (
as_i32(output[0]) != 300)
return false;
412 if (
as_i32(output[1]) != -300)
return false;
414 if (output[2] != 0x80000000)
return false;
415 if (
as_i32(output[3]) != 0)
return false;
423 store_u32_4(output, sub_i32_4(load_u32_4(a), load_u32_4(b)));
424 if (
as_i32(output[0]) != 100)
return false;
425 if (
as_i32(output[1]) != -300)
return false;
426 if (
as_i32(output[2]) != 0)
return false;
427 if (
as_i32(output[3]) != 0)
return false;
436 uint32_t input[4] = {0x80000000, 0xFFFFFFFF, 0x00000010, 0x12345678};
438 store_u32_4(output, srl_u32_4(load_u32_4(input), 4));
444 uint32_t input[4] = {0x00000001, 0x0FFFFFFF, 0x80000000, 0x12345678};
446 store_u32_4(output, sll_u32_4(load_u32_4(input), 4));
455 store_u32_4(output, sra_i32_4(load_u32_4(input), 2));
456 if (
as_i32(output[0]) != -4)
return false;
457 if (
as_i32(output[1]) != 4)
return false;
458 if (
as_i32(output[2]) !=
as_i32(0xE0000000))
return false;
459 if (
as_i32(output[3]) !=
as_i32(0x1FFFFFFF))
return false;
471 store_u32_4(output, min_i32_4(load_u32_4(a), load_u32_4(b)));
472 if (
as_i32(output[0]) != 10)
return false;
473 if (
as_i32(output[1]) != -10)
return false;
475 if (
as_i32(output[3]) != -1)
return false;
483 store_u32_4(output, max_i32_4(load_u32_4(a), load_u32_4(b)));
484 if (
as_i32(output[0]) != 20)
return false;
485 if (
as_i32(output[1]) != 10)
return false;
486 if (
as_i32(output[2]) != 0x7FFFFFFF)
return false;
487 if (
as_i32(output[3]) != 0)
return false;
498 uint32_t a[4] = {0x00010000, 0x00020000,
as_u32(-0x00010000), 0x00008000};
499 uint32_t b[4] = {0x00020000, 0x00010000, 0x00020000, 0x00008000};
501 store_u32_4(output, mulhi_i32_4(load_u32_4(a), load_u32_4(b)));
502 if (output[0] != 0x00020000)
return false;
503 if (output[1] != 0x00020000)
return false;
504 if (
as_i32(output[2]) != -0x00020000)
return false;
505 if (output[3] != 0x00004000)
return false;
511 uint32_t a[4] = {0x00010000, 0x00020000, 0x00030000, 0x00008000};
512 uint32_t b[4] = {0x00020000, 0x00030000, 0x00010000, 0x00008000};
514 store_u32_4(output, mulhi_u32_4(load_u32_4(a), load_u32_4(b)));
515 if (output[0] != 0x00020000)
return false;
516 if (output[1] != 0x00060000)
return false;
517 if (output[2] != 0x00030000)
return false;
518 if (output[3] != 0x00004000)
return false;
526 uint32_t b[4] = {0x00020000, 0x00010000, 0x00010000, 0x00030000};
528 store_u32_4(output, mulhi_su32_4(load_u32_4(a), load_u32_4(b)));
529 if (
as_i32(output[0]) != -0x00020000)
return false;
530 if (output[1] != 0x00020000)
return false;
531 if (
as_i32(output[2]) != -0x00020000)
return false;
532 if (output[3] != 0x00030000)
return false;
539 uint32_t a[4] = {0x40000000,
as_u32(-0x40000000), 0x7FFFFFFF, 0x00000001};
540 uint32_t b[4] = {0x40000000, 0x40000000, 0x00000002, 0x7FFFFFFF};
542 store_u32_4(output, mulhi32_i32_4(load_u32_4(a), load_u32_4(b)));
543 if (output[0] != 0x10000000)
return false;
544 if (
as_i32(output[1]) !=
as_i32(0xF0000000))
return false;
545 if (output[2] != 0x00000000)
return false;
546 if (output[3] != 0x00000000)
return false;
559 store_u32_4(output, unpacklo_u32_4(a, b));
569 store_u32_4(output, unpackhi_u32_4(a, b));
579 store_u32_4(output, unpacklo_u64_as_u32_4(a, b));
589 store_u32_4(output, unpackhi_u64_as_u32_4(a, b));
599 float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
600 float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};
601 float expected[4] = {6.0f, 8.0f, 10.0f, 12.0f};
602 float output[4] = {0.0f};
603 store_f32_4(output, add_f32_4(load_f32_4(a), load_f32_4(b)));
608 float a[4] = {-1.0f, 2.0f, -3.0f, 0.0f};
609 float b[4] = {1.0f, -2.0f, -3.0f, 0.0f};
610 float expected[4] = {0.0f, 0.0f, -6.0f, 0.0f};
611 float output[4] = {0.0f};
612 store_f32_4(output, add_f32_4(load_f32_4(a), load_f32_4(b)));
617 float a[4] = {10.0f, 20.0f, 30.0f, 40.0f};
618 float b[4] = {1.0f, 2.0f, 3.0f, 4.0f};
619 float expected[4] = {9.0f, 18.0f, 27.0f, 36.0f};
620 float output[4] = {0.0f};
621 store_f32_4(output, sub_f32_4(load_f32_4(a), load_f32_4(b)));
626 float a[4] = {2.0f, 3.0f, 4.0f, 5.0f};
627 float b[4] = {3.0f, 4.0f, 5.0f, 6.0f};
628 float expected[4] = {6.0f, 12.0f, 20.0f, 30.0f};
629 float output[4] = {0.0f};
630 store_f32_4(output, mul_f32_4(load_f32_4(a), load_f32_4(b)));
635 float a[4] = {-2.0f, 3.0f, -4.0f, 0.0f};
636 float b[4] = {3.0f, -4.0f, -5.0f, 100.0f};
637 float expected[4] = {-6.0f, -12.0f, 20.0f, 0.0f};
638 float output[4] = {0.0f};
639 store_f32_4(output, mul_f32_4(load_f32_4(a), load_f32_4(b)));
644 float a[4] = {10.0f, 20.0f, 30.0f, 40.0f};
645 float b[4] = {2.0f, 4.0f, 5.0f, 8.0f};
646 float expected[4] = {5.0f, 5.0f, 6.0f, 5.0f};
647 float output[4] = {0.0f};
648 store_f32_4(output, div_f32_4(load_f32_4(a), load_f32_4(b)));
653 float a[4] = {4.0f, 9.0f, 16.0f, 25.0f};
654 float expected[4] = {2.0f, 3.0f, 4.0f, 5.0f};
655 float output[4] = {0.0f};
656 store_f32_4(output, sqrt_f32_4(load_f32_4(a)));
661 float a[4] = {0.0f, 1.0f, 100.0f, 0.25f};
662 float expected[4] = {0.0f, 1.0f, 10.0f, 0.5f};
663 float output[4] = {0.0f};
664 store_f32_4(output, sqrt_f32_4(load_f32_4(a)));
669 float a[4] = {1.0f, 5.0f, -3.0f, 7.0f};
670 float b[4] = {4.0f, 2.0f, 6.0f, -1.0f};
671 float expected[4] = {1.0f, 2.0f, -3.0f, -1.0f};
672 float output[4] = {0.0f};
673 store_f32_4(output, min_f32_4(load_f32_4(a), load_f32_4(b)));
678 float a[4] = {1.0f, 5.0f, -3.0f, 7.0f};
679 float b[4] = {4.0f, 2.0f, 6.0f, -1.0f};
680 float expected[4] = {4.0f, 5.0f, 6.0f, 7.0f};
681 float output[4] = {0.0f};
682 store_f32_4(output, max_f32_4(load_f32_4(a), load_f32_4(b)));
695 for (
int i = 0; i < 16; i++) out[i] = a[i] & b[i];
698 for (
int i = 0; i < 16; i++) out[i] = a[i] | b[i];
701 for (
int i = 0; i < 16; i++) out[i] = a[i] ^ b[i];
704 for (
int i = 0; i < 16; i++) out[i] = (~a[i]) & b[i];
707 for (
int i = 0; i < 4; i++) out[i] = a[i] ^ b[i];
710 for (
int i = 0; i < 4; i++) out[i] = a[i] & b[i];
713 for (
int i = 0; i < 4; i++) out[i] = a[i] | b[i];
718 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
719 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
722 store_u8_16(actual, and_u8_16(load_u8_16(a), load_u8_16(b)));
727 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
728 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
731 store_u8_16(actual, or_u8_16(load_u8_16(a), load_u8_16(b)));
736 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
737 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
740 store_u8_16(actual, xor_u8_16(load_u8_16(a), load_u8_16(b)));
745 uint8_t a[16] = {0xAA,0x55,0xFF,0x00, 0x0F,0xF0,0x81,0x7E, 0x01,0xFE,0xCC,0x33, 0xDB,0x24,0xA5,0x5A};
746 uint8_t b[16] = {0x55,0xAA,0x00,0xFF, 0xF0,0x0F,0x7E,0x81, 0xFE,0x01,0x33,0xCC, 0x24,0xDB,0x5A,0xA5};
749 store_u8_16(actual, andnot_u8_16(load_u8_16(a), load_u8_16(b)));
755 uint32_t a[4] = {0x80000001, 0x7FFFFFFE, 0xDEADBEEF, 0x00000000};
756 uint32_t b[4] = {0x80000001, 0x80000001, 0xCAFEBABE, 0xFFFFFFFF};
759 store_u32_4(actual, xor_u32_4(load_u32_4(a), load_u32_4(b)));
764 uint32_t a[4] = {0x80000001, 0x7FFFFFFE, 0xDEADBEEF, 0x00000000};
765 uint32_t b[4] = {0x80000001, 0x80000001, 0xCAFEBABE, 0xFFFFFFFF};
768 store_u32_4(actual, and_u32_4(load_u32_4(a), load_u32_4(b)));
773 uint32_t a[4] = {0x80000001, 0x7FFFFFFE, 0xDEADBEEF, 0x00000000};
774 uint32_t b[4] = {0x80000001, 0x80000001, 0xCAFEBABE, 0xFFFFFFFF};
777 store_u32_4(actual, or_u32_4(load_u32_4(a), load_u32_4(b)));
783 for (
int i = 0; i < 16; i++) {
790 uint8_t v[16] = {0,1,127,128, 254,255,0x55,0xAA, 0x0F,0xF0,0x80,0x7F, 0x01,0xFE,0xFF,0x00};
791 uint8_t scale_vals[] = {0, 1, 127, 128, 254, 255};
792 for (
int s = 0; s < 6; s++) {
795 store_u8_16(actual, scale_u8_16(load_u8_16(v), scale_vals[s]));
803 for (
int i = 0; i < 16; i++) {
805 out[i] = (sum > 255) ? 255 : (
uint8_t)sum;
809 for (
int i = 0; i < 16; i++) {
810 out[i] = (a[i] > b[i]) ? (a[i] - b[i]) : 0;
817 uint8_t a[16] = {255,254,128,127, 0,1,200,50, 0xFF,0x80,0x7F,0x01, 100,200,150,250};
818 uint8_t b[16] = {255,2,128,129, 0,255,56,206, 0x01,0x80,0x81,0xFF, 156,56,106,6};
821 store_u8_16(actual, add_sat_u8_16(load_u8_16(a), load_u8_16(b)));
826 uint8_t a[16] = {255,0,128,127, 0,1,200,50, 0xFF,0x80,0x7F,0x01, 100,200,150,250};
827 uint8_t b[16] = {255,255,128,129, 0,255,56,206, 0x01,0x80,0x81,0xFF, 156,56,106,6};
830 store_u8_16(actual, sub_sat_u8_16(load_u8_16(a), load_u8_16(b)));
836 for (
int i = 0; i < 4; i++) out[i] = a[i] + b[i];
839 for (
int i = 0; i < 4; i++) out[i] = a[i] - b[i];
844 uint32_t a[4] = {0x7FFFFFFF, 0x80000000, 0xFFFFFFFF, 0x00000001};
845 uint32_t b[4] = {0x00000001, 0x80000000, 0x00000001, 0xFFFFFFFF};
848 store_u32_4(actual, add_i32_4(load_u32_4(a), load_u32_4(b)));
853 uint32_t a[4] = {0x00000000, 0x80000000, 0x7FFFFFFF, 0x00000001};
854 uint32_t b[4] = {0x00000001, 0x7FFFFFFF, 0x80000000, 0x00000002};
857 store_u32_4(actual, sub_i32_4(load_u32_4(a), load_u32_4(b)));
863 for (
int i = 0; i < 4; i++) out[i] = v[i] >> shift;
866 for (
int i = 0; i < 4; i++) out[i] = v[i] << shift;
869 for (
int i = 0; i < 4; i++) out[i] = (
uint32_t)((
int32_t)v[i] >> shift);
873 uint32_t v[4] = {0x80000000, 0xFFFFFFFF, 0x00000001, 0xDEADBEEF};
874 for (
int shift = 0; shift <= 31; shift += 7) {
877 store_u32_4(actual, srl_u32_4(load_u32_4(v), shift));
884 uint32_t v[4] = {0x80000000, 0xFFFFFFFF, 0x00000001, 0xDEADBEEF};
885 for (
int shift = 0; shift <= 31; shift += 7) {
888 store_u32_4(actual, sll_u32_4(load_u32_4(v), shift));
895 uint32_t v[4] = {0x80000000, 0xFFFFFFFF, 0x7FFFFFFF, 0xDEADBEEF};
896 for (
int shift = 0; shift <= 31; shift += 7) {
899 store_u32_4(actual, sra_i32_4(load_u32_4(v), shift));
907 for (
int i = 0; i < 4; i++) {
909 out[i] = (
uint32_t)(ai < bi ? ai : bi);
913 for (
int i = 0; i < 4; i++) {
915 out[i] = (
uint32_t)(ai > bi ? ai : bi);
920 uint32_t a[4] = {0x80000000, 0x7FFFFFFF, 0xFFFFFFFF, 0x00000000};
921 uint32_t b[4] = {0x7FFFFFFF, 0x80000000, 0x00000000, 0xFFFFFFFF};
924 store_u32_4(actual, min_i32_4(load_u32_4(a), load_u32_4(b)));
929 uint32_t a[4] = {0x80000000, 0x7FFFFFFF, 0xFFFFFFFF, 0x00000000};
930 uint32_t b[4] = {0x7FFFFFFF, 0x80000000, 0x00000000, 0xFFFFFFFF};
933 store_u32_4(actual, max_i32_4(load_u32_4(a), load_u32_4(b)));
939 for (
int i = 0; i < 4; i++) {
945 for (
int i = 0; i < 4; i++) {
946 uint64_t prod = (uint64_t)a[i] * (uint64_t)b[i];
951 for (
int i = 0; i < 4; i++) {
958 uint32_t a[4] = {0x7FFFFFFF, 0x80000000, 0x00010000, 0xFFFF0000};
959 uint32_t b[4] = {0x00020000, 0x00020000, 0xFFFF0000, 0xFFFF0000};
962 store_u32_4(actual, mulhi_i32_4(load_u32_4(a), load_u32_4(b)));
967 uint32_t a[4] = {0xFFFFFFFF, 0x80000000, 0x00010000, 0x00000001};
968 uint32_t b[4] = {0x00000002, 0x00000002, 0x00010000, 0xFFFFFFFF};
971 store_u32_4(actual, mulhi_u32_4(load_u32_4(a), load_u32_4(b)));
976 uint32_t a[4] = {0x7FFFFFFF, 0x80000000, 0x40000000, 0xC0000000};
977 uint32_t b[4] = {0x7FFFFFFF, 0x80000000, 0x40000000, 0x40000000};
980 store_u32_4(actual, mulhi32_i32_4(load_u32_4(a), load_u32_4(b)));
986 for (
int i = 0; i < 16; i++) out[i] = a[i] < b[i] ? a[i] : b[i];
989 for (
int i = 0; i < 16; i++) out[i] = a[i] > b[i] ? a[i] : b[i];
993 uint8_t a[16] = {0,255,128,127, 1,254,0x55,0xAA, 0x0F,0xF0,0x80,0x7F, 0x01,0xFE,0xFF,0x00};
994 uint8_t b[16] = {255,0,127,128, 254,1,0xAA,0x55, 0xF0,0x0F,0x7F,0x80, 0xFE,0x01,0x00,0xFF};
997 store_u8_16(actual, min_u8_16(load_u8_16(a), load_u8_16(b)));
1002 uint8_t a[16] = {0,255,128,127, 1,254,0x55,0xAA, 0x0F,0xF0,0x80,0x7F, 0x01,0xFE,0xFF,0x00};
1003 uint8_t b[16] = {255,0,127,128, 254,1,0xAA,0x55, 0xF0,0x0F,0x7F,0x80, 0xFE,0x01,0x00,0xFF};
1006 store_u8_16(actual, max_u8_16(load_u8_16(a), load_u8_16(b)));
1012 float a[4] = {-1.5f, 0.0f, 3.14159f, 1e10f};
1013 float b[4] = {2.5f, -0.0f, -2.71828f, 1e-10f};
1016 store_f32_4(out, add_f32_4(load_f32_4(a), load_f32_4(b)));
1017 for (
int i = 0; i < 4; i++) {
1018 float diff = out[i] - (a[i] + b[i]);
1019 if (diff < -0.001f || diff > 0.001f)
return false;
1022 store_f32_4(out, sub_f32_4(load_f32_4(a), load_f32_4(b)));
1023 for (
int i = 0; i < 4; i++) {
1024 float diff = out[i] - (a[i] - b[i]);
1025 if (diff < -0.001f || diff > 0.001f)
return false;
// Multiplication cross-check: store the SIMD product and compare every lane
// against the scalar product a[i] * b[i].
1028 store_f32_4(out, mul_f32_4(load_f32_4(a), load_f32_4(b)));
1029 for (
int i = 0; i < 4; i++) {
1030 float diff = out[i] - (a[i] * b[i]);
// Fail when the lane error lies outside [-1.0f, 1.0f]. The two bounds must be
// joined with ||: no value is simultaneously below -1.0f and above 1.0f, so an
// && here would make this check unable to ever report a mismatch.
1031 if (diff < -1.0f || diff > 1.0f)
return false;
1041 store_u32_4_aligned(dst, v);
1048 uint32_t test_values[] = {0x80000000, 0xFFFFFFFF, 0x00000000, 0xAAAAAAAA, 0x55555555, 0x01010101};
1049 for (
int t = 0;
t < 6;
t++) {
1052 store_u32_4(output, set1_u32_4(val));
1053 for (
int i = 0; i < 4; i++) {
1054 if (output[i] != val)
return false;
1066 uint8_t input[16] = {200,200,200,200, 200,200,200,200, 200,200,200,200, 200,200,200,200};
1067 uint8_t bias[16] = {200,200,200,200, 200,200,200,200, 200,200,200,200, 200,200,200,200};
1070 v = scale_u8_16(v, 128);
1071 v = add_sat_u8_16(v, load_u8_16(bias));
1072 store_u8_16(output, v);
1073 for (
int i = 0; i < 16; i++) {
1074 if (output[i] < 250)
return false;
1081 uint32_t angles[4] = {0x10000000, 0x20000000, 0x30000000, 0x40000000};
1083 v = srl_u32_4(v, 16);
1084 v = add_i32_4(v, set1_u32_4(0x100));
1085 v = and_u32_4(v, set1_u32_4(0xFFFF));
1087 store_u32_4(output, v);
1094 float input[4] = {0.3f, 0.6f, 0.9f, 1.2f};
1095 float gain[4] = {2.0f, 2.0f, 2.0f, 2.0f};
1096 float output[4] = {0.0f};
1098 v = mul_f32_4(v, load_f32_4(gain));
1099 v = min_f32_4(v, set1_f32_4(1.0f));
1100 v = max_f32_4(v, set1_f32_4(0.0f));
1101 store_f32_4(output, v);
1102 float expected[4] = {0.6f, 1.0f, 1.0f, 1.0f};
1125template <
typename Op>
1127 float a0 = 1.5f, a1 = 2.3f, a2 = 0.7f, a3 = 3.1f;
1128 float b0 = 0.5f, b1 = 1.2f, b2 = 2.0f, b3 = 0.9f;
1130 for (
int i = 0; i < iters; i++) {
1131 a0 = op(a0, b0); a1 = op(a1, b1);
1132 a2 = op(a2, b2); a3 = op(a3, b3);
1133 b0 = a0 + 0.001f; b1 = a1 + 0.001f;
1134 b2 = a2 + 0.001f; b3 = a3 + 0.001f;
1139 return static_cast<int64_t
>(t1 - t0);
1143template <
typename Op>
1145 fl::s16x16 a0(1.5f), a1(2.3f), a2(0.7f), a3(3.1f);
1146 fl::s16x16 b0(0.5f), b1(1.2f), b2(2.0f), b3(0.9f);
1149 for (
int i = 0; i < iters; i++) {
1150 a0 = op(a0, b0); a1 = op(a1, b1);
1151 a2 = op(a2, b2); a3 = op(a3, b3);
1152 b0 = a0 + bump; b1 = a1 + bump;
1153 b2 = a2 + bump; b3 = a3 + bump;
1157 return static_cast<int64_t
>(t1 - t0);
1161template <
typename Op>
1171 for (
int i = 0; i < iters; i++) {
1177 return static_cast<int64_t
>(t1 - t0);
// Generic binary call operator: returns a + b for any type T that provides operator+.
1181 template<
typename T> T
operator()(T a, T b)
const {
return a + b; }
// Generic binary call operator: returns a - b for any type T that provides operator-.
1184 template<
typename T> T
operator()(T a, T b)
const {
return a - b; }
// Generic binary call operator: returns a * b for any type T that provides operator*.
1187 template<
typename T> T
operator()(T a, T b)
const {
return a * b; }
1190template <
typename Op>
1192 fl::s8x8 a0(1.5f), a1(2.3f), a2(0.7f), a3(3.1f);
1193 fl::s8x8 b0(0.5f), b1(1.2f), b2(2.0f), b3(0.9f);
1196 for (
int i = 0; i < iters; i++) {
1197 a0 = op(a0, b0); a1 = op(a1, b1);
1198 a2 = op(a2, b2); a3 = op(a3, b3);
1199 b0 = a0 + bump; b1 = a1 + bump;
1200 b2 = a2 + bump; b3 = a3 + bump;
1204 return static_cast<int64_t
>(t1 - t0);
1208template <
typename Op>
1210 fl::u16x16 a0(1.5f), a1(2.3f), a2(0.7f), a3(3.1f);
1211 fl::u16x16 b0(0.5f), b1(1.2f), b2(2.0f), b3(0.9f);
1214 for (
int i = 0; i < iters; i++) {
1215 a0 = op(a0, b0); a1 = op(a1, b1);
1216 a2 = op(a2, b2); a3 = op(a3, b3);
1217 b0 = a0 + bump; b1 = a1 + bump;
1218 b2 = a2 + bump; b3 = a3 + bump;
1222 return static_cast<int64_t
>(t1 - t0);
1390 *out_count =
sizeof(tests) /
sizeof(tests[0]);
1403 FL_PRINT(
"────────────────────────────────────────────────────────────────");
1405#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
1406 FL_PRINT(
" SIMD Backend: x86 SSE2");
1407#elif defined(__XTENSA__) && FL_XTENSA_HAS_PIE
1408 FL_PRINT(
" SIMD Backend: Xtensa PIE (ESP32-S3)");
1409#elif defined(__XTENSA__)
1410 FL_PRINT(
" SIMD Backend: Xtensa scalar");
1411#elif defined(__riscv)
1412 FL_PRINT(
" SIMD Backend: RISC-V scalar");
1414 FL_PRINT(
" SIMD Backend: Scalar fallback");
1418 ss <<
" Running " << num_tests <<
" SIMD tests...\n";
1421 for (
int i = 0; i < num_tests; i++) {
1422 bool ok = tests[i].
func();
1426 ss <<
" [PASS] " << tests[i].
name;
1431 ss <<
" [FAIL] " << tests[i].
name;
1437 ss <<
"\n[SIMD RESULTS] " << passed <<
"/" << num_tests <<
" passed";
1439 ss <<
", " << failed <<
" FAILED";
1442 ss <<
" - ALL PASSED";
fl::UISlider scale("Scale", 4,.1, 4,.1)
expected type for operations that can fail (C++23-style)
static constexpr FASTLED_FORCE_INLINE s16x16 from_raw(i32 raw) FL_NOEXCEPT
static constexpr FASTLED_FORCE_INLINE s8x8 from_raw(i16 raw) FL_NOEXCEPT
string str() const FL_NOEXCEPT
static constexpr FASTLED_FORCE_INLINE u16x16 from_raw(u32 raw) FL_NOEXCEPT
#define FL_PRINT(X)
Print without prefix (like FL_WARN but without "WARN: " prefix). Uses sstream for dynamic formatting (...
uint32_t as_u32(int32_t v)
void ref_srl_u32_4(const uint32_t *v, int shift, uint32_t *out)
bool test_mulhi32_i32_4()
bool test_extract_u32_4()
bool test_sqrt_f32_4_zero()
bool test_crossval_mulhi_u32_4()
void ref_and_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool test_crossval_max_u8_16()
bool test_u8_pipeline_scale_add_clamp()
bool test_unpacklo_u64_as_u32_4()
void ref_add_sat_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
bool test_add_sat_u8_16()
bool test_crossval_and_u32_4()
int64_t benchSimd4(int iters, Op op)
bool test_crossval_scale_u8_16()
bool compare_u32(const uint32_t *a, const uint32_t *b, size_t n)
int64_t benchS16x16_4(int iters, Op op)
void ref_max_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
bool test_crossval_sub_i32_4()
void ref_max_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool test_load_store_u32_4_boundary()
bool test_i32_pipeline_shift_add_mask()
bool test_crossval_xor_u32_4()
bool test_crossval_aligned_load_store()
bool test_scale_u8_16_zero()
bool test_crossval_mulhi32_i32_4()
int64_t benchFloat4(int iters, Op op)
bool test_crossval_set1_u32_4()
void ref_scale_u8_16(const uint8_t *v, uint8_t scale, uint8_t *out)
bool test_crossval_sll_u32_4()
int runSimdTests()
Run the full SIMD test suite. Returns the number of failures.
bool test_unpackhi_u64_as_u32_4()
void ref_xor_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
bool test_blend_u8_16_endpoints()
bool test_mul_f32_4_negative()
int64_t benchS8x8_4(int iters, Op op)
bool test_crossval_srl_u32_4()
void ref_andnot_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
bool test_crossval_add_sat_u8_16()
void ref_mulhi_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_sll_u32_4(const uint32_t *v, int shift, uint32_t *out)
int32_t as_i32(uint32_t v)
void ref_and_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
bool test_load_store_f32_4()
bool test_crossval_or_u8_16()
bool test_load_store_f32_4_special()
bool test_crossval_add_i32_4()
bool test_unpackhi_u32_4()
bool test_scale_u8_16_full()
void getTests(const SimdTestEntry **out_tests, int *out_count)
Get the static test table. Used by both runSimdTests() and the RPC handler.
bool test_load_store_u8_16_boundary()
void ref_or_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool test_crossval_sub_sat_u8_16()
void ref_min_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_add_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool test_unpacklo_u32_4()
bool test_crossval_float_ops()
void ref_or_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
void ref_sub_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool test_load_store_u32_4()
bool test_load_store_u32_4_aligned()
bool test_crossval_or_u32_4()
bool compare_u8(const uint8_t *a, const uint8_t *b, size_t n)
bool test_crossval_sra_i32_4()
bool test_crossval_andnot_u8_16()
void ref_xor_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
void ref_sra_i32_4(const uint32_t *v, int shift, uint32_t *out)
void ref_min_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
bool test_crossval_and_u8_16()
int64_t benchU16x16_4(int iters, Op op)
bool compare_f32(const float *a, const float *b, size_t n, float eps=0.001f)
BenchmarkResult runMultiplyBenchmark(int iters=10000)
bool test_sub_sat_u8_16_full_clamp()
bool test_f32_pipeline_mul_add_clamp()
void ref_mulhi_u32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool test_avg_round_u8_16()
void ref_mulhi32_i32_4(const uint32_t *a, const uint32_t *b, uint32_t *out)
bool test_add_f32_4_negative()
bool test_load_store_u8_16()
bool test_set1_u32_4_zero()
void ref_sub_sat_u8_16(const uint8_t *a, const uint8_t *b, uint8_t *out)
bool test_crossval_xor_u8_16()
bool test_crossval_mulhi_i32_4()
bool test_add_sat_u8_16_full_saturate()
static volatile uint32_t g_bench_sink
bool test_crossval_max_i32_4()
bool test_crossval_min_u8_16()
bool test_crossval_min_i32_4()
bool test_sub_sat_u8_16()
platforms::simd_u32x4 simd_u32x4
platforms::simd_u8x16 simd_u8x16
platforms::simd_f32x4 simd_f32x4
void * memcpy(void *dest, const void *src, size_t n) FL_NOEXCEPT
fl::u32 micros()
Universal microsecond timer - returns microseconds since system startup.
SIMD 4-wide s0x32 fixed-point vector type.
SIMD 4-wide s16x16 fixed-point vector type.
Umbrella header for SIMD subsystem.
T operator()(T a, T b) const
float operator()(float a, float b) const
fl::s16x16 operator()(fl::s16x16 a, fl::s16x16 b) const
fl::s8x8 operator()(fl::s8x8 a, fl::s8x8 b) const
fl::u16x16 operator()(fl::u16x16 a, fl::u16x16 b) const
T operator()(T a, T b) const
T operator()(T a, T b) const
static FASTLED_FORCE_INLINE s16x16x4 from_raw(simd::simd_u32x4 r)
static FASTLED_FORCE_INLINE s16x16x4 set1(s16x16 value)
4-wide s16x16 vector (general fixed-point) Backed by 128-bit SIMD register (4× i32 in Q16 format)