FastLED 3.9.15
Loading...
Searching...
No Matches
fl::gfx::blur_detail Namespace Reference

Classes

struct  interior_row
 
struct  interior_row< 0, RGB_T, acc_t >
 
struct  interior_row< 1, RGB_T, acc_t >
 
struct  interior_row< 2, RGB_T, acc_t >
 
struct  interior_row< 3, RGB_T, acc_t >
 
struct  interior_row< 4, RGB_T, acc_t >
 
struct  pixel_ops
 
struct  pixel_ops< CRGB >
 
struct  pixel_ops< CRGB16 >
 
struct  simd_hconv_dispatch
 
struct  simd_hconv_dispatch< 0 >
 
struct  simd_hconv_dispatch< 1 >
 
struct  simd_hconv_dispatch< 2 >
 
struct  simd_hconv_dispatch< 3 >
 
struct  simd_hconv_dispatch< 4 >
 
struct  simd_vconv_dispatch
 
struct  simd_vconv_dispatch< 0 >
 
struct  simd_vconv_dispatch< 1 >
 
struct  simd_vconv_dispatch< 2 >
 
struct  simd_vconv_dispatch< 3 >
 
struct  simd_vconv_dispatch< 4 >
 
struct  vpass_pixel_kernel
 
struct  vpass_pixel_kernel< 0, RGB_T, acc_t >
 
struct  vpass_pixel_kernel< 1, RGB_T, acc_t >
 
struct  vpass_pixel_kernel< 2, RGB_T, acc_t >
 
struct  vpass_pixel_kernel< 3, RGB_T, acc_t >
 
struct  vpass_pixel_kernel< 4, RGB_T, acc_t >
 

Functions

template<typename AlphaT>
constexpr AlphaT alpha_identity ()
 
template<>
constexpr alpha16 alpha_identity< alpha16 > ()
 
template<>
constexpr alpha8 alpha_identity< alpha8 > ()
 
template<int R, typename RGB_T, typename acc_t>
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void apply_pass (const RGB_T *pad, RGB_T *out, int count, int stride)
 
template<int R, typename RGB_T, typename acc_t, typename AlphaT>
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void apply_pass_alpha (const RGB_T *pad, RGB_T *out, int count, int stride, AlphaT alpha)
 
template<int hR, int vR, typename RGB_T>
static int compute_pad_size (int w, int h)
 
template<typename RGB_T>
static fl::span< RGB_T > get_padbuf (int minSize)
 
template<int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
FL_ALWAYS_INLINE void hpass_row (RGB_T *pad, RGB_T *out, int w, AlphaT alpha)
 
static void simd_conv_121 (const u8 *FL_RESTRICT_PARAM a, const u8 *FL_RESTRICT_PARAM b, const u8 *FL_RESTRICT_PARAM c, u8 *FL_RESTRICT_PARAM out, int nbytes)
 
static void simd_conv_14641 (const u8 *p0, const u8 *p1, const u8 *p2, const u8 *p3, const u8 *p4, u8 *out, int nbytes)
 
static void simd_conv_r3 (const u8 *p0, const u8 *p1, const u8 *p2, const u8 *p3, const u8 *p4, const u8 *p5, const u8 *p6, u8 *out, int nbytes)
 
static void simd_conv_r4 (const u8 *p0, const u8 *p1, const u8 *p2, const u8 *p3, const u8 *p4, const u8 *p5, const u8 *p6, const u8 *p7, const u8 *p8, u8 *out, int nbytes)
 
template<int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
static void vpass_full (RGB_T *pixels, int w, int h, RGB_T *scratch, AlphaT alpha)
 
template<int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
static FL_OPTIMIZE_FUNCTION void vpass_rowmajor_impl (RGB_T *pixels, int w, int h, RGB_T *scratch, AlphaT alpha)
 

◆ fl::gfx::blur_detail::pixel_ops

struct fl::gfx::blur_detail::pixel_ops
+ Inheritance diagram for fl::gfx::blur_detail::pixel_ops< RGB_T >:

◆ fl::gfx::blur_detail::simd_hconv_dispatch

struct fl::gfx::blur_detail::simd_hconv_dispatch
+ Inheritance diagram for fl::gfx::blur_detail::simd_hconv_dispatch< R >:

◆ fl::gfx::blur_detail::simd_vconv_dispatch

struct fl::gfx::blur_detail::simd_vconv_dispatch
+ Inheritance diagram for fl::gfx::blur_detail::simd_vconv_dispatch< R >:

◆ fl::gfx::blur_detail::vpass_pixel_kernel

struct fl::gfx::blur_detail::vpass_pixel_kernel
+ Inheritance diagram for fl::gfx::blur_detail::vpass_pixel_kernel< R, RGB_T, acc_t >:

Function Documentation

◆ alpha_identity()

template<typename AlphaT>
AlphaT fl::gfx::blur_detail::alpha_identity ( )
constexpr

Referenced by fl::gfx::blurGaussianImpl(), and fl::gfx::blurGaussianMappedImpl().

+ Here is the caller graph for this function:

◆ alpha_identity< alpha16 >()

template<>
alpha16 fl::gfx::blur_detail::alpha_identity< alpha16 > ( )
constexpr

Definition at line 290 of file blur.cpp.hpp.

290{ return alpha16(65535); }
Unsigned 16-bit alpha / brightness — UNORM16.
Definition alpha.h:87

◆ alpha_identity< alpha8 >()

template<>
alpha8 fl::gfx::blur_detail::alpha_identity< alpha8 > ( )
constexpr

Definition at line 289 of file blur.cpp.hpp.

289{ return alpha8(255); }
Unsigned 8-bit alpha / brightness — UNORM8.
Definition alpha.h:42

◆ apply_pass()

template<int R, typename RGB_T, typename acc_t>
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void fl::gfx::blur_detail::apply_pass ( const RGB_T * pad,
RGB_T * out,
int count,
int stride )
static

Definition at line 563 of file blur.cpp.hpp.

563 {
564 constexpr int shift = 2 * R;
565 using P = pixel_ops<RGB_T>;
566 for (int i = 0; i < count; ++i) {
567 acc_t r, g, b;
568 interior_row<R, RGB_T, acc_t>::apply(pad, R + i, r, g, b);
569 *out = P::make(static_cast<acc_t>(r >> shift),
570 static_cast<acc_t>(g >> shift),
571 static_cast<acc_t>(b >> shift));
572 out += stride;
573 }
574}
FASTLED_FORCE_INLINE fl::u8 P(fl::u8 x)

References fl::P().

Referenced by hpass_row().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ apply_pass_alpha()

template<int R, typename RGB_T, typename acc_t, typename AlphaT>
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void fl::gfx::blur_detail::apply_pass_alpha ( const RGB_T * pad,
RGB_T * out,
int count,
int stride,
AlphaT alpha )
static

Definition at line 578 of file blur.cpp.hpp.

579 {
580 constexpr int shift = 2 * R;
581 using P = pixel_ops<RGB_T>;
582 for (int i = 0; i < count; ++i) {
583 acc_t r, g, b;
584 interior_row<R, RGB_T, acc_t>::apply(pad, R + i, r, g, b);
585 *out = P::make(static_cast<acc_t>(r >> shift),
586 static_cast<acc_t>(g >> shift),
587 static_cast<acc_t>(b >> shift), alpha);
588 out += stride;
589 }
590}

References fl::P().

Referenced by hpass_row().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ compute_pad_size()

template<int hR, int vR, typename RGB_T>
static int fl::gfx::blur_detail::compute_pad_size ( int w,
int h )
static

Definition at line 1049 of file blur.cpp.hpp.

1049 {
1050 int hPad = 2 * hR + w;
1051#if defined(FL_IS_AVR)
1052 int vPad = 2 * vR + h;
1053#else
1054 int vPad = vR > 0 ? (vR + 2) * w : 0;
1055#endif
1056 return hPad > vPad ? hPad : vPad;
1057}

Referenced by fl::gfx::blurGaussianImpl(), and fl::gfx::blurGaussianMappedImpl().

+ Here is the caller graph for this function:

◆ get_padbuf()

template<typename RGB_T>
static fl::span< RGB_T > fl::gfx::blur_detail::get_padbuf ( int minSize)
static

Definition at line 349 of file blur.cpp.hpp.

349 {
351 if (static_cast<int>(buf.size()) < minSize) {
352 buf.resize(minSize);
353 }
354 return buf;
355}
fl::size size() const FL_NOEXCEPT
void resize(fl::size n) FL_NOEXCEPT
Definition vector.h:593

References fl::vector< T >::resize(), and fl::vector_basic::size().

Referenced by fl::gfx::blurGaussianImpl(), and fl::gfx::blurGaussianMappedImpl().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ hpass_row()

template<int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
FL_ALWAYS_INLINE void fl::gfx::blur_detail::hpass_row ( RGB_T * pad,
RGB_T * out,
int w,
AlphaT alpha )

Definition at line 1063 of file blur.cpp.hpp.

1063 {
1064#if defined(FL_IS_AVR)
1065 if (ApplyAlpha)
1066 apply_pass_alpha_1ch<R>(pad, out, w, 1, alpha);
1067 else
1068 apply_pass_1ch<R>(pad, out, w, 1);
1069#else
1070 // SIMD fast path: u8-channel (CRGB), no alpha on this pass.
1071 if (sizeof(typename RGB_T::fp) == 1 && !ApplyAlpha) {
1072 constexpr int S = (int)sizeof(RGB_T);
1073 const int nbytes = w * S;
1074 const u8 *pb = (const u8 *)pad;
1075 u8 *ob = (u8 *)out;
1076 simd_hconv_dispatch<R>::apply(
1077 pb, S, ob, nbytes, (u8 *)(pad + 2 * R + w), w);
1078 } else if (ApplyAlpha) {
1079 apply_pass_alpha<R, RGB_T, acc_t>(pad, out, w, 1, alpha);
1080 } else {
1081 apply_pass<R, RGB_T, acc_t>(pad, out, w, 1);
1082 }
1083#endif
1084}
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void apply_pass_alpha(const RGB_T *pad, RGB_T *out, int count, int stride, AlphaT alpha)
Definition blur.cpp.hpp:578
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void apply_pass(const RGB_T *pad, RGB_T *out, int count, int stride)
Definition blur.cpp.hpp:563
unsigned char u8
Definition stdint.h:131

References apply_pass(), and apply_pass_alpha().

Referenced by fl::gfx::blurGaussianImpl().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ simd_conv_121()

static void fl::gfx::blur_detail::simd_conv_121 ( const u8 *FL_RESTRICT_PARAM a,
const u8 *FL_RESTRICT_PARAM b,
const u8 *FL_RESTRICT_PARAM c,
u8 *FL_RESTRICT_PARAM out,
int nbytes )
static

Definition at line 605 of file blur.cpp.hpp.

608 {
609 namespace fsimd = fl::simd; // ok bare using
610 int i = 0;
611 for (; i + 63 < nbytes; i += 64) {
612 auto va0 = fsimd::load_u8_32(a+i), vb0 = fsimd::load_u8_32(b+i), vc0 = fsimd::load_u8_32(c+i);
613 auto va1 = fsimd::load_u8_32(a+i+32), vb1 = fsimd::load_u8_32(b+i+32), vc1 = fsimd::load_u8_32(c+i+32);
614 fsimd::store_u8_32(out+i, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va0, vc0), vb0));
615 fsimd::store_u8_32(out+i+32, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va1, vc1), vb1));
616 }
617 for (; i + 31 < nbytes; i += 32) {
618 auto va = fsimd::load_u8_32(a+i), vb = fsimd::load_u8_32(b+i), vc = fsimd::load_u8_32(c+i);
619 fsimd::store_u8_32(out+i, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va, vc), vb));
620 }
621 for (; i + 15 < nbytes; i += 16) {
622 auto va = fsimd::load_u8_16(a+i), vb = fsimd::load_u8_16(b+i), vc = fsimd::load_u8_16(c+i);
623 fsimd::store_u8_16(out+i, fsimd::avg_round_u8_16(fsimd::avg_round_u8_16(va, vc), vb));
624 }
625 for (; i < nbytes; ++i)
626 out[i] = (u8)(((u16)a[i] + ((u16)b[i] << 1) + (u16)c[i]) >> 2);
627}

References FL_RESTRICT_PARAM.

Referenced by fl::gfx::blur_detail::simd_hconv_dispatch< 1 >::apply(), and fl::gfx::blur_detail::simd_vconv_dispatch< 1 >::apply().

+ Here is the caller graph for this function:

◆ simd_conv_14641()

static void fl::gfx::blur_detail::simd_conv_14641 ( const u8 * p0,
const u8 * p1,
const u8 * p2,
const u8 * p3,
const u8 * p4,
u8 * out,
int nbytes )
static

Definition at line 633 of file blur.cpp.hpp.

635 {
636 namespace fsimd = fl::simd; // ok bare using
637 const auto w4w = fsimd::set1_u16_16(4), w6w = fsimd::set1_u16_16(6);
638 const auto w4 = fsimd::set1_u16_8(4), w6 = fsimd::set1_u16_8(6);
639 int i = 0;
640 for (; i + 31 < nbytes; i += 32) {
641 auto v0 = fsimd::load_u8_32(p0+i), v1 = fsimd::load_u8_32(p1+i);
642 auto v2 = fsimd::load_u8_32(p2+i), v3 = fsimd::load_u8_32(p3+i), v4 = fsimd::load_u8_32(p4+i);
643 auto s04 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v0), fsimd::widen_lo_u8x32_to_u16(v4));
644 auto s13 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v1), fsimd::widen_lo_u8x32_to_u16(v3));
645 auto lo = fsimd::add_u16_16(s04, fsimd::add_u16_16(fsimd::mullo_u16_16(s13, w4w), fsimd::mullo_u16_16(fsimd::widen_lo_u8x32_to_u16(v2), w6w)));
646 lo = fsimd::srli_u16_16(lo, 4);
647 auto s04h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v0), fsimd::widen_hi_u8x32_to_u16(v4));
648 auto s13h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v1), fsimd::widen_hi_u8x32_to_u16(v3));
649 auto hi = fsimd::add_u16_16(s04h, fsimd::add_u16_16(fsimd::mullo_u16_16(s13h, w4w), fsimd::mullo_u16_16(fsimd::widen_hi_u8x32_to_u16(v2), w6w)));
650 hi = fsimd::srli_u16_16(hi, 4);
651 fsimd::store_u8_32(out+i, fsimd::narrow_u16x16_to_u8(lo, hi));
652 }
653 for (; i + 15 < nbytes; i += 16) {
654 auto v0 = fsimd::load_u8_16(p0+i), v1 = fsimd::load_u8_16(p1+i);
655 auto v2 = fsimd::load_u8_16(p2+i), v3 = fsimd::load_u8_16(p3+i), v4 = fsimd::load_u8_16(p4+i);
656 auto s04 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v0), fsimd::widen_lo_u8_to_u16(v4));
657 auto s13 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v1), fsimd::widen_lo_u8_to_u16(v3));
658 auto lo = fsimd::add_u16_8(s04, fsimd::add_u16_8(fsimd::mullo_u16_8(s13, w4), fsimd::mullo_u16_8(fsimd::widen_lo_u8_to_u16(v2), w6)));
659 lo = fsimd::srli_u16_8(lo, 4);
660 auto s04h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v0), fsimd::widen_hi_u8_to_u16(v4));
661 auto s13h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v1), fsimd::widen_hi_u8_to_u16(v3));
662 auto hi = fsimd::add_u16_8(s04h, fsimd::add_u16_8(fsimd::mullo_u16_8(s13h, w4), fsimd::mullo_u16_8(fsimd::widen_hi_u8_to_u16(v2), w6)));
663 hi = fsimd::srli_u16_8(hi, 4);
664 fsimd::store_u8_16(out+i, fsimd::narrow_u16_to_u8(lo, hi));
665 }
666 for (; i < nbytes; ++i) {
667 u16 s04 = (u16)p0[i] + (u16)p4[i];
668 u16 s13 = (u16)p1[i] + (u16)p3[i];
669 out[i] = (u8)((s04 + s13 * 4 + (u16)p2[i] * 6) >> 4);
670 }
671}

Referenced by fl::gfx::blur_detail::simd_hconv_dispatch< 2 >::apply(), and fl::gfx::blur_detail::simd_vconv_dispatch< 2 >::apply().

+ Here is the caller graph for this function:

◆ simd_conv_r3()

static void fl::gfx::blur_detail::simd_conv_r3 ( const u8 * p0,
const u8 * p1,
const u8 * p2,
const u8 * p3,
const u8 * p4,
const u8 * p5,
const u8 * p6,
u8 * out,
int nbytes )
static

Definition at line 674 of file blur.cpp.hpp.

676 {
677 namespace fsimd = fl::simd; // ok bare using
678 const auto w6w = fsimd::set1_u16_16(6), w15w = fsimd::set1_u16_16(15), w20w = fsimd::set1_u16_16(20);
679 const auto w6 = fsimd::set1_u16_8(6), w15 = fsimd::set1_u16_8(15), w20 = fsimd::set1_u16_8(20);
680 int i = 0;
681 for (; i + 31 < nbytes; i += 32) {
682 auto v0 = fsimd::load_u8_32(p0+i), v1 = fsimd::load_u8_32(p1+i), v2 = fsimd::load_u8_32(p2+i);
683 auto v3 = fsimd::load_u8_32(p3+i), v4 = fsimd::load_u8_32(p4+i), v5 = fsimd::load_u8_32(p5+i);
684 auto v6v = fsimd::load_u8_32(p6+i);
685 auto s06 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v0), fsimd::widen_lo_u8x32_to_u16(v6v));
686 auto s15 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v1), fsimd::widen_lo_u8x32_to_u16(v5));
687 auto s24 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v2), fsimd::widen_lo_u8x32_to_u16(v4));
688 auto lo = fsimd::add_u16_16(s06, fsimd::add_u16_16(fsimd::mullo_u16_16(s15, w6w),
689 fsimd::add_u16_16(fsimd::mullo_u16_16(s24, w15w), fsimd::mullo_u16_16(fsimd::widen_lo_u8x32_to_u16(v3), w20w))));
690 lo = fsimd::srli_u16_16(lo, 6);
691 auto s06h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v0), fsimd::widen_hi_u8x32_to_u16(v6v));
692 auto s15h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v1), fsimd::widen_hi_u8x32_to_u16(v5));
693 auto s24h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v2), fsimd::widen_hi_u8x32_to_u16(v4));
694 auto hi = fsimd::add_u16_16(s06h, fsimd::add_u16_16(fsimd::mullo_u16_16(s15h, w6w),
695 fsimd::add_u16_16(fsimd::mullo_u16_16(s24h, w15w), fsimd::mullo_u16_16(fsimd::widen_hi_u8x32_to_u16(v3), w20w))));
696 hi = fsimd::srli_u16_16(hi, 6);
697 fsimd::store_u8_32(out+i, fsimd::narrow_u16x16_to_u8(lo, hi));
698 }
699 for (; i + 15 < nbytes; i += 16) {
700 auto v0 = fsimd::load_u8_16(p0+i), v1 = fsimd::load_u8_16(p1+i), v2 = fsimd::load_u8_16(p2+i);
701 auto v3 = fsimd::load_u8_16(p3+i), v4 = fsimd::load_u8_16(p4+i), v5 = fsimd::load_u8_16(p5+i);
702 auto v6v = fsimd::load_u8_16(p6+i);
703 auto s06 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v0), fsimd::widen_lo_u8_to_u16(v6v));
704 auto s15 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v1), fsimd::widen_lo_u8_to_u16(v5));
705 auto s24 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v2), fsimd::widen_lo_u8_to_u16(v4));
706 auto lo = fsimd::add_u16_8(s06, fsimd::add_u16_8(fsimd::mullo_u16_8(s15, w6),
707 fsimd::add_u16_8(fsimd::mullo_u16_8(s24, w15), fsimd::mullo_u16_8(fsimd::widen_lo_u8_to_u16(v3), w20))));
708 lo = fsimd::srli_u16_8(lo, 6);
709 auto s06h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v0), fsimd::widen_hi_u8_to_u16(v6v));
710 auto s15h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v1), fsimd::widen_hi_u8_to_u16(v5));
711 auto s24h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v2), fsimd::widen_hi_u8_to_u16(v4));
712 auto hi = fsimd::add_u16_8(s06h, fsimd::add_u16_8(fsimd::mullo_u16_8(s15h, w6),
713 fsimd::add_u16_8(fsimd::mullo_u16_8(s24h, w15), fsimd::mullo_u16_8(fsimd::widen_hi_u8_to_u16(v3), w20))));
714 hi = fsimd::srli_u16_8(hi, 6);
715 fsimd::store_u8_16(out+i, fsimd::narrow_u16_to_u8(lo, hi));
716 }
717 for (; i < nbytes; ++i) {
718 u16 s06=(u16)p0[i]+(u16)p6[i], s15=(u16)p1[i]+(u16)p5[i], s24=(u16)p2[i]+(u16)p4[i];
719 out[i]=(u8)((s06+s15*6+s24*15+(u16)p3[i]*20)>>6);
720 }
721}

Referenced by fl::gfx::blur_detail::simd_hconv_dispatch< 3 >::apply(), and fl::gfx::blur_detail::simd_vconv_dispatch< 3 >::apply().

+ Here is the caller graph for this function:

◆ simd_conv_r4()

static void fl::gfx::blur_detail::simd_conv_r4 ( const u8 * p0,
const u8 * p1,
const u8 * p2,
const u8 * p3,
const u8 * p4,
const u8 * p5,
const u8 * p6,
const u8 * p7,
const u8 * p8,
u8 * out,
int nbytes )
static

Definition at line 724 of file blur.cpp.hpp.

726 {
727 namespace fsimd = fl::simd; // ok bare using
728 const auto w8w = fsimd::set1_u16_16(8), w28w = fsimd::set1_u16_16(28);
729 const auto w56w = fsimd::set1_u16_16(56), w70w = fsimd::set1_u16_16(70);
730 const auto w8 = fsimd::set1_u16_8(8), w28 = fsimd::set1_u16_8(28);
731 const auto w56 = fsimd::set1_u16_8(56), w70 = fsimd::set1_u16_8(70);
732 int i = 0;
733 for (; i + 31 < nbytes; i += 32) {
734 auto v0 = fsimd::load_u8_32(p0+i), v1 = fsimd::load_u8_32(p1+i), v2 = fsimd::load_u8_32(p2+i);
735 auto v3 = fsimd::load_u8_32(p3+i), v4 = fsimd::load_u8_32(p4+i), v5 = fsimd::load_u8_32(p5+i);
736 auto v6v = fsimd::load_u8_32(p6+i), v7 = fsimd::load_u8_32(p7+i), v8v = fsimd::load_u8_32(p8+i);
737 auto s08 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v0), fsimd::widen_lo_u8x32_to_u16(v8v));
738 auto s17 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v1), fsimd::widen_lo_u8x32_to_u16(v7));
739 auto s26 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v2), fsimd::widen_lo_u8x32_to_u16(v6v));
740 auto s35 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v3), fsimd::widen_lo_u8x32_to_u16(v5));
741 auto lo = fsimd::add_u16_16(s08, fsimd::add_u16_16(fsimd::mullo_u16_16(s17, w8w),
742 fsimd::add_u16_16(fsimd::mullo_u16_16(s26, w28w),
743 fsimd::add_u16_16(fsimd::mullo_u16_16(s35, w56w), fsimd::mullo_u16_16(fsimd::widen_lo_u8x32_to_u16(v4), w70w)))));
744 lo = fsimd::srli_u16_16(lo, 8);
745 auto s08h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v0), fsimd::widen_hi_u8x32_to_u16(v8v));
746 auto s17h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v1), fsimd::widen_hi_u8x32_to_u16(v7));
747 auto s26h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v2), fsimd::widen_hi_u8x32_to_u16(v6v));
748 auto s35h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v3), fsimd::widen_hi_u8x32_to_u16(v5));
749 auto hi = fsimd::add_u16_16(s08h, fsimd::add_u16_16(fsimd::mullo_u16_16(s17h, w8w),
750 fsimd::add_u16_16(fsimd::mullo_u16_16(s26h, w28w),
751 fsimd::add_u16_16(fsimd::mullo_u16_16(s35h, w56w), fsimd::mullo_u16_16(fsimd::widen_hi_u8x32_to_u16(v4), w70w)))));
752 hi = fsimd::srli_u16_16(hi, 8);
753 fsimd::store_u8_32(out+i, fsimd::narrow_u16x16_to_u8(lo, hi));
754 }
755 for (; i + 15 < nbytes; i += 16) {
756 auto v0 = fsimd::load_u8_16(p0+i), v1 = fsimd::load_u8_16(p1+i), v2 = fsimd::load_u8_16(p2+i);
757 auto v3 = fsimd::load_u8_16(p3+i), v4 = fsimd::load_u8_16(p4+i), v5 = fsimd::load_u8_16(p5+i);
758 auto v6v = fsimd::load_u8_16(p6+i), v7 = fsimd::load_u8_16(p7+i), v8v = fsimd::load_u8_16(p8+i);
759 auto s08 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v0), fsimd::widen_lo_u8_to_u16(v8v));
760 auto s17 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v1), fsimd::widen_lo_u8_to_u16(v7));
761 auto s26 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v2), fsimd::widen_lo_u8_to_u16(v6v));
762 auto s35 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v3), fsimd::widen_lo_u8_to_u16(v5));
763 auto lo = fsimd::add_u16_8(s08, fsimd::add_u16_8(fsimd::mullo_u16_8(s17, w8),
764 fsimd::add_u16_8(fsimd::mullo_u16_8(s26, w28),
765 fsimd::add_u16_8(fsimd::mullo_u16_8(s35, w56), fsimd::mullo_u16_8(fsimd::widen_lo_u8_to_u16(v4), w70)))));
766 lo = fsimd::srli_u16_8(lo, 8);
767 auto s08h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v0), fsimd::widen_hi_u8_to_u16(v8v));
768 auto s17h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v1), fsimd::widen_hi_u8_to_u16(v7));
769 auto s26h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v2), fsimd::widen_hi_u8_to_u16(v6v));
770 auto s35h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v3), fsimd::widen_hi_u8_to_u16(v5));
771 auto hi = fsimd::add_u16_8(s08h, fsimd::add_u16_8(fsimd::mullo_u16_8(s17h, w8),
772 fsimd::add_u16_8(fsimd::mullo_u16_8(s26h, w28),
773 fsimd::add_u16_8(fsimd::mullo_u16_8(s35h, w56), fsimd::mullo_u16_8(fsimd::widen_hi_u8_to_u16(v4), w70)))));
774 hi = fsimd::srli_u16_8(hi, 8);
775 fsimd::store_u8_16(out+i, fsimd::narrow_u16_to_u8(lo, hi));
776 }
777 for (; i < nbytes; ++i) {
778 u16 s08=(u16)p0[i]+(u16)p8[i], s17=(u16)p1[i]+(u16)p7[i];
779 u16 s26=(u16)p2[i]+(u16)p6[i], s35=(u16)p3[i]+(u16)p5[i];
780 out[i]=(u8)((s08+s17*8+s26*28+s35*56+(u16)p4[i]*70)>>8);
781 }
782}

Referenced by fl::gfx::blur_detail::simd_hconv_dispatch< 4 >::apply(), and fl::gfx::blur_detail::simd_vconv_dispatch< 4 >::apply().

+ Here is the caller graph for this function:

◆ vpass_full()

template<int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
static void fl::gfx::blur_detail::vpass_full ( RGB_T * pixels,
int w,
int h,
RGB_T * scratch,
AlphaT alpha )
static

Definition at line 1088 of file blur.cpp.hpp.

1088 {
1089#if defined(FL_IS_AVR)
1090 // AVR: column-by-column with per-channel noinline + O3.
1091 FL_BUILTIN_MEMSET(scratch, 0, R * sizeof(RGB_T));
1092 FL_BUILTIN_MEMSET(scratch + R + h, 0, R * sizeof(RGB_T));
1093
1094 for (int x = 0; x < w; ++x) {
1095 // Linearize column into padded region.
1096 {
1097 const RGB_T *src = pixels + x;
1098 RGB_T *dst = scratch + R;
1099 for (int i = 0; i < h; ++i) {
1100 *dst++ = *src;
1101 src += w;
1102 }
1103 }
1104 if (ApplyAlpha)
1105 apply_pass_alpha_1ch<R>(scratch, pixels + x, h, w, alpha);
1106 else
1107 apply_pass_1ch<R>(scratch, pixels + x, h, w);
1108 }
1109#else
1110 // Non-AVR: row-major vertical pass for cache efficiency.
1111 // Single direct V-pass with the exact radius-R kernel (e.g. [1,4,6,4,1] for R=2); no cascading of R=1 passes.
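1112 if (ApplyAlpha) {
1113 vpass_rowmajor_impl<R, RGB_T, acc_t, true, AlphaT>(
1114 pixels, w, h, scratch, alpha);
1115 } else {
1116 vpass_rowmajor_impl<R, RGB_T, acc_t, false, AlphaT>(
1117 pixels, w, h, scratch, alpha);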
1118 }
1119#endif
1120}
static FL_OPTIMIZE_FUNCTION void vpass_rowmajor_impl(RGB_T *pixels, int w, int h, RGB_T *scratch, AlphaT alpha)
Definition blur.cpp.hpp:967
#define FL_BUILTIN_MEMSET(dest, val, n)

References FL_BUILTIN_MEMSET, vpass_rowmajor_impl(), and fl::x.

Referenced by fl::gfx::blurGaussianImpl().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ vpass_rowmajor_impl()

template<int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
static FL_OPTIMIZE_FUNCTION void fl::gfx::blur_detail::vpass_rowmajor_impl ( RGB_T * pixels,
int w,
int h,
RGB_T * scratch,
AlphaT alpha )
static

Definition at line 967 of file blur.cpp.hpp.

970{
971 constexpr int shift = 2 * R;
972 using P = pixel_ops<RGB_T>;
973
974 // Ring buffer: bufs[0..R-1] = saved previous rows, bufs[R] = save slot.
975 // Extra zero_row for bottom-boundary padding.
976 RGB_T *bufs[5] = {nullptr, nullptr, nullptr, nullptr, nullptr};
977 for (int i = 0; i <= R; ++i)
978 bufs[i] = scratch + i * w;
979 RGB_T *zero_row = scratch + (R + 1) * w;
980
981 // Zero all: first R buffers (top-boundary padding) + zero_row.
982 FL_BUILTIN_MEMSET(scratch, 0, (R + 2) * w * sizeof(RGB_T));
983
984 for (int y = 0; y < h; ++y) {
985 RGB_T *out_row = pixels + y * w;
986
987 // Save current row before we overwrite it.
988 FL_BUILTIN_MEMCPY(bufs[R], out_row, w * sizeof(RGB_T));
989
990 // Forward row pointers (rows y+1 .. y+R, or zero_row if OOB).
991 const RGB_T *fwd[4] = {zero_row, zero_row, zero_row, zero_row};
992 for (int k = 0; k < R; ++k)
993 fwd[k] = (y + 1 + k < h) ? (pixels + (y + 1 + k) * w) : zero_row;
994
995 // Prefetch the furthest-ahead row needed by the NEXT iteration.
996 // At row y, next iteration needs fwd[R-1] = row y+1+R.
997 // Prefetching 2 rows ahead gives the memory subsystem time to fetch.
998 {
999 const int prefetch_y = y + R + 2;
1000 if (prefetch_y < h) {
1001 const char *pf = (const char *)(pixels + prefetch_y * w);
1002 const int row_bytes = w * (int)sizeof(RGB_T);
1003 for (int off = 0; off < row_bytes; off += 64)
1004 __builtin_prefetch(pf + off, 0, 3);
1005 }
1006 }
1007
1008 // Process all pixels in this output row.
1009 // For u8-channel types (CRGB), process as raw byte stream — all
1010 // channels use the same kernel weights, so we treat the row as a
1011 // flat u8 array of w*sizeof(RGB_T) bytes. This produces a simpler
1012 // loop that the compiler can optimize better at low -O levels.
1013 if (sizeof(typename RGB_T::fp) == 1 && !ApplyAlpha) {
1014 // Raw byte fast path (CRGB without alpha).
1015 const int nbytes = w * (int)sizeof(RGB_T);
1016 u8 *ob = (u8 *)out_row;
1017
1018 simd_vconv_dispatch<R>::apply(bufs, fwd, ob, nbytes);
1019 } else {
1020 // Generic path: per-pixel struct access (CRGB16 or alpha case).
1021 for (int x = 0; x < w; ++x) {
1022 acc_t r, g, b;
1023
1024 vpass_pixel_kernel<R, RGB_T, acc_t>::apply(bufs, fwd, x, r, g, b);
1025
1026 if (ApplyAlpha) {
1027 out_row[x] = P::make(static_cast<acc_t>(r >> shift),
1028 static_cast<acc_t>(g >> shift),
1029 static_cast<acc_t>(b >> shift), alpha);
1030 } else {
1031 out_row[x] = P::make(static_cast<acc_t>(r >> shift),
1032 static_cast<acc_t>(g >> shift),
1033 static_cast<acc_t>(b >> shift));
1034 }
1035 }
1036 }
1037
1038 // Rotate ring buffer: discard oldest, current becomes newest saved.
1039 RGB_T *recycled = bufs[0];
1040 for (int i = 0; i < R; ++i) bufs[i] = bufs[i + 1];
1041 bufs[R] = recycled;
1042 }
1043}
#define FL_BUILTIN_MEMCPY(dest, src, n)

References FL_BUILTIN_MEMCPY, FL_BUILTIN_MEMSET, fl::P(), fl::x, and fl::y.

Referenced by vpass_full().

+ Here is the call graph for this function:
+ Here is the caller graph for this function: