BF1 for 2-lane Wave8.
Output: 8 symbols × 2 bytes = 16 bytes. Each byte packs 4 pulses × 2 lanes, bit-interleaved: bit pair q (q = 0..3) holds lane 1 in the even bit position (2q) and lane 0 in the odd bit position (2q + 1). The high byte (output[s*2 + 0]) holds bf1-pulse indices 0..3, with pair q carrying pulse 3 − q (q=0 → pulse 3, q=3 → pulse 0); the low byte (output[s*2 + 1]) holds bf1-pulse indices 4..7, with pair q carrying pulse 7 − q (q=0 → pulse 7, q=3 → pulse 4). This layout matches wave8_transpose_2's LUT-based packing (see FL_WAVE8_SPREAD_TO_16 / kTranspose4_16_LUT).
419 {
422 const u8 D_byte =
W0 ^
W1;
423 for (int p = 0; p < 8; ++p) {
424 const int shift = 7 - p;
425 d_mask[p] = ((D_byte >> shift) & 1) ? 0xFFu : 0x00u;
426 m0_mask[p] = ((
W0 >> shift) & 1) ? 0xFFu : 0x00u;
427 }
428 for (int s = 0; s < 8; ++s) {
429 const int bit_idx = 7 - s;
430 const u8 b0 =
static_cast<u8>((lanes[0] >> bit_idx) & 1u);
431 const u8 b1 =
static_cast<u8>((lanes[1] >> bit_idx) & 1u);
434 for (int q = 0; q < 4; ++q) {
435
436
437
438
439
440
441 const int p_hi = 3 - q;
442 const int p_lo = 7 - q;
443 const u8 m0_p_hi =
static_cast<u8>(m0_mask[p_hi] & 1u);
444 const u8 d_p_hi =
static_cast<u8>(d_mask[p_hi] & 1u);
445 const u8 m0_p_lo =
static_cast<u8>(m0_mask[p_lo] & 1u);
446 const u8 d_p_lo =
static_cast<u8>(d_mask[p_lo] & 1u);
447 const u8 v0_hi =
static_cast<u8>(m0_p_hi ^ (b0 & d_p_hi));
448 const u8 v1_hi =
static_cast<u8>(m0_p_hi ^ (b1 & d_p_hi));
449 const u8 v0_lo =
static_cast<u8>(m0_p_lo ^ (b0 & d_p_lo));
450 const u8 v1_lo =
static_cast<u8>(m0_p_lo ^ (b1 & d_p_lo));
451 byte_hi |=
static_cast<u8>((v1_hi << (2 * q)) | (v0_hi << (2 * q + 1)));
452 byte_lo |=
static_cast<u8>((v1_lo << (2 * q)) | (v0_lo << (2 * q + 1)));
453 }
454 output[s * 2 + 0] = byte_hi;
455 output[s * 2 + 1] = byte_lo;
456 }
457}