608 {
610 int i = 0;
611 for (; i + 63 < nbytes; i += 64) {
612 auto va0 = fsimd::load_u8_32(a+i), vb0 = fsimd::load_u8_32(b+i), vc0 = fsimd::load_u8_32(c+i);
613 auto va1 = fsimd::load_u8_32(a+i+32), vb1 = fsimd::load_u8_32(b+i+32), vc1 = fsimd::load_u8_32(c+i+32);
614 fsimd::store_u8_32(out+i, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va0, vc0), vb0));
615 fsimd::store_u8_32(out+i+32, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va1, vc1), vb1));
616 }
617 for (; i + 31 < nbytes; i += 32) {
618 auto va = fsimd::load_u8_32(a+i), vb = fsimd::load_u8_32(b+i), vc = fsimd::load_u8_32(c+i);
619 fsimd::store_u8_32(out+i, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va, vc), vb));
620 }
621 for (; i + 15 < nbytes; i += 16) {
622 auto va = fsimd::load_u8_16(a+i), vb = fsimd::load_u8_16(b+i), vc = fsimd::load_u8_16(c+i);
623 fsimd::store_u8_16(out+i, fsimd::avg_round_u8_16(fsimd::avg_round_u8_16(va, vc), vb));
624 }
625 for (; i < nbytes; ++i)
626 out[i] = (
u8)(((u16)a[i] + ((u16)b[i] << 1) + (u16)c[i]) >> 2);
627}