Transposes 16 input bytes (one per lane, at each of the num_bytes byte positions) into 16-way interleaved format, writing 16 output bytes per position. This function is ISR-safe: no allocations, no exceptions, minimal overhead. Inline functions are automatically placed where needed, so no IRAM_ATTR is required.
421 {
422 for (size_t byte_idx = 0; byte_idx < num_bytes; byte_idx++) {
423
425 ((
u64)lanes[0][byte_idx] << 0) |
426 ((
u64)lanes[1][byte_idx] << 8) |
427 ((
u64)lanes[2][byte_idx] << 16) |
428 ((
u64)lanes[3][byte_idx] << 24) |
429 ((
u64)lanes[4][byte_idx] << 32) |
430 ((
u64)lanes[5][byte_idx] << 40) |
431 ((
u64)lanes[6][byte_idx] << 48) |
432 ((
u64)lanes[7][byte_idx] << 56);
433
434
436 ((
u64)lanes[8][byte_idx] << 0) |
437 ((
u64)lanes[9][byte_idx] << 8) |
438 ((
u64)lanes[10][byte_idx] << 16) |
439 ((
u64)lanes[11][byte_idx] << 24) |
440 ((
u64)lanes[12][byte_idx] << 32) |
441 ((
u64)lanes[13][byte_idx] << 40) |
442 ((
u64)lanes[14][byte_idx] << 48) |
443 ((
u64)lanes[15][byte_idx] << 56);
444
445 u8* dest = &output[byte_idx * 16];
446
447
448 for (int bit = 7; bit >= 0; bit--) {
449 dest[7 - bit] =
450 ((packed_lo >> (bit + 0)) & 0x01) << 0 |
451 ((packed_lo >> (bit + 8)) & 0x01) << 1 |
452 ((packed_lo >> (bit + 16)) & 0x01) << 2 |
453 ((packed_lo >> (bit + 24)) & 0x01) << 3 |
454 ((packed_lo >> (bit + 32)) & 0x01) << 4 |
455 ((packed_lo >> (bit + 40)) & 0x01) << 5 |
456 ((packed_lo >> (bit + 48)) & 0x01) << 6 |
457 ((packed_lo >> (bit + 56)) & 0x01) << 7;
458
459 dest[15 - bit] =
460 ((packed_hi >> (bit + 0)) & 0x01) << 0 |
461 ((packed_hi >> (bit + 8)) & 0x01) << 1 |
462 ((packed_hi >> (bit + 16)) & 0x01) << 2 |
463 ((packed_hi >> (bit + 24)) & 0x01) << 3 |
464 ((packed_hi >> (bit + 32)) & 0x01) << 4 |
465 ((packed_hi >> (bit + 40)) & 0x01) << 5 |
466 ((packed_hi >> (bit + 48)) & 0x01) << 6 |
467 ((packed_hi >> (bit + 56)) & 0x01) << 7;
468 }
469 }
470}