676 {
678 const auto w6w = fsimd::set1_u16_16(6), w15w = fsimd::set1_u16_16(15), w20w = fsimd::set1_u16_16(20);
679 const auto w6 = fsimd::set1_u16_8(6), w15 = fsimd::set1_u16_8(15), w20 = fsimd::set1_u16_8(20);
680 int i = 0;
681 for (; i + 31 < nbytes; i += 32) {
682 auto v0 = fsimd::load_u8_32(p0+i), v1 = fsimd::load_u8_32(p1+i), v2 = fsimd::load_u8_32(p2+i);
683 auto v3 = fsimd::load_u8_32(p3+i), v4 = fsimd::load_u8_32(p4+i), v5 = fsimd::load_u8_32(p5+i);
684 auto v6v = fsimd::load_u8_32(p6+i);
685 auto s06 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v0), fsimd::widen_lo_u8x32_to_u16(v6v));
686 auto s15 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v1), fsimd::widen_lo_u8x32_to_u16(v5));
687 auto s24 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v2), fsimd::widen_lo_u8x32_to_u16(v4));
688 auto lo = fsimd::add_u16_16(s06, fsimd::add_u16_16(fsimd::mullo_u16_16(s15, w6w),
689 fsimd::add_u16_16(fsimd::mullo_u16_16(s24, w15w), fsimd::mullo_u16_16(fsimd::widen_lo_u8x32_to_u16(v3), w20w))));
690 lo = fsimd::srli_u16_16(lo, 6);
691 auto s06h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v0), fsimd::widen_hi_u8x32_to_u16(v6v));
692 auto s15h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v1), fsimd::widen_hi_u8x32_to_u16(v5));
693 auto s24h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v2), fsimd::widen_hi_u8x32_to_u16(v4));
694 auto hi = fsimd::add_u16_16(s06h, fsimd::add_u16_16(fsimd::mullo_u16_16(s15h, w6w),
695 fsimd::add_u16_16(fsimd::mullo_u16_16(s24h, w15w), fsimd::mullo_u16_16(fsimd::widen_hi_u8x32_to_u16(v3), w20w))));
696 hi = fsimd::srli_u16_16(hi, 6);
697 fsimd::store_u8_32(out+i, fsimd::narrow_u16x16_to_u8(lo, hi));
698 }
699 for (; i + 15 < nbytes; i += 16) {
700 auto v0 = fsimd::load_u8_16(p0+i), v1 = fsimd::load_u8_16(p1+i), v2 = fsimd::load_u8_16(p2+i);
701 auto v3 = fsimd::load_u8_16(p3+i), v4 = fsimd::load_u8_16(p4+i), v5 = fsimd::load_u8_16(p5+i);
702 auto v6v = fsimd::load_u8_16(p6+i);
703 auto s06 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v0), fsimd::widen_lo_u8_to_u16(v6v));
704 auto s15 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v1), fsimd::widen_lo_u8_to_u16(v5));
705 auto s24 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v2), fsimd::widen_lo_u8_to_u16(v4));
706 auto lo = fsimd::add_u16_8(s06, fsimd::add_u16_8(fsimd::mullo_u16_8(s15, w6),
707 fsimd::add_u16_8(fsimd::mullo_u16_8(s24, w15), fsimd::mullo_u16_8(fsimd::widen_lo_u8_to_u16(v3), w20))));
708 lo = fsimd::srli_u16_8(lo, 6);
709 auto s06h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v0), fsimd::widen_hi_u8_to_u16(v6v));
710 auto s15h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v1), fsimd::widen_hi_u8_to_u16(v5));
711 auto s24h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v2), fsimd::widen_hi_u8_to_u16(v4));
712 auto hi = fsimd::add_u16_8(s06h, fsimd::add_u16_8(fsimd::mullo_u16_8(s15h, w6),
713 fsimd::add_u16_8(fsimd::mullo_u16_8(s24h, w15), fsimd::mullo_u16_8(fsimd::widen_hi_u8_to_u16(v3), w20))));
714 hi = fsimd::srli_u16_8(hi, 6);
715 fsimd::store_u8_16(out+i, fsimd::narrow_u16_to_u8(lo, hi));
716 }
717 for (; i < nbytes; ++i) {
718 u16 s06=(u16)p0[i]+(u16)p6[i], s15=(u16)p1[i]+(u16)p5[i], s24=(u16)p2[i]+(u16)p4[i];
719 out[i]=(
u8)((s06+s15*6+s24*15+(u16)p3[i]*20)>>6);
720 }
721}