FastLED 3.9.15
Loading...
Searching...
No Matches

◆ vpass_rowmajor_impl()

template<int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
static FL_OPTIMIZE_FUNCTION void fl::gfx::blur_detail::vpass_rowmajor_impl ( RGB_T * pixels,
int w,
int h,
RGB_T * scratch,
AlphaT alpha )
static

Definition at line 967 of file blur.cpp.hpp.

970{
971 constexpr int shift = 2 * R; // accumulators are scaled down by >> (2*R) below; presumably the kernel weights sum to 2^(2R) — confirm against vpass_pixel_kernel
972 using P = pixel_ops<RGB_T>;
973
974 // Ring buffer of w-pixel rows carved from scratch: bufs[0..R-1] = saved previous rows, bufs[R] = save slot.
975 // zero_row (row R+1 of scratch) pads rows past the bottom edge. The fixed array sizes (bufs[5], fwd[4]) require R <= 4, and scratch must hold (R + 2) * w pixels.
976 RGB_T *bufs[5] = {nullptr, nullptr, nullptr, nullptr, nullptr};
977 for (int i = 0; i <= R; ++i)
978 bufs[i] = scratch + i * w;
979 RGB_T *zero_row = scratch + (R + 1) * w;
980
981 // Zero all R+2 scratch rows: bufs[0..R-1] serve as top-boundary padding, zero_row as bottom padding (bufs[R] is zeroed too, but is overwritten by the row save below before it is read).
982 FL_BUILTIN_MEMSET(scratch, 0, (R + 2) * w * sizeof(RGB_T));
983
984 for (int y = 0; y < h; ++y) {
985 RGB_T *out_row = pixels + y * w;
986
987 // Save the unmodified current row into the save slot before out_row is overwritten in place.
988 FL_BUILTIN_MEMCPY(bufs[R], out_row, w * sizeof(RGB_T));
989
990 // Forward row pointers (rows y+1 .. y+R, or zero_row if OOB).
991 const RGB_T *fwd[4] = {zero_row, zero_row, zero_row, zero_row};
992 for (int k = 0; k < R; ++k)
993 fwd[k] = (y + 1 + k < h) ? (pixels + (y + 1 + k) * w) : zero_row;
994
995 // Prefetch a row that upcoming iterations will read.
996 // The next iteration's furthest forward row is fwd[R-1] = row y+1+R; the prefetch
997 // targets row y+R+2, one row beyond that, to give the memory subsystem time to fetch.
998 {
999 const int prefetch_y = y + R + 2;
1000 if (prefetch_y < h) {
1001 const char *pf = (const char *)(pixels + prefetch_y * w);
1002 const int row_bytes = w * (int)sizeof(RGB_T);
1003 for (int off = 0; off < row_bytes; off += 64) // 64-byte stride, presumably one cache line — confirm for target
1004 __builtin_prefetch(pf + off, 0, 3); // rw = 0 (read), locality = 3 (high temporal locality)
1005 }
1006 }
1007
1008 // Process all pixels in this output row.
1009 // For u8-channel types (CRGB), process as raw byte stream — all
1010 // channels use the same kernel weights, so we treat the row as a
1011 // flat u8 array of w*sizeof(RGB_T) bytes. This produces a simpler
1012 // loop that the compiler can optimize better at low -O levels.
1013 if (sizeof(typename RGB_T::fp) == 1 && !ApplyAlpha) {
1014 // Raw byte fast path (1-byte channels, no alpha): the whole row is handed to the SIMD dispatcher as nbytes bytes.
1015 const int nbytes = w * (int)sizeof(RGB_T);
1016 u8 *ob = (u8 *)out_row;
1017
1018 simd_vconv_dispatch<R>::apply(bufs, fwd, ob, nbytes);
1019 } else {
1020 // Generic path: per-pixel struct access (CRGB16 or alpha case).
1021 for (int x = 0; x < w; ++x) {
1022 acc_t r, g, b;
1023
1024 vpass_pixel_kernel<R, RGB_T, acc_t>::apply(bufs, fwd, x, r, g, b);
1025
1026 if (ApplyAlpha) {
1027 out_row[x] = P::make(static_cast<acc_t>(r >> shift),
1028 static_cast<acc_t>(g >> shift),
1029 static_cast<acc_t>(b >> shift), alpha);
1030 } else {
1031 out_row[x] = P::make(static_cast<acc_t>(r >> shift),
1032 static_cast<acc_t>(g >> shift),
1033 static_cast<acc_t>(b >> shift));
1034 }
1035 }
1036 }
1037
1038 // Rotate ring buffer: drop the oldest saved row, the row just saved becomes the newest, and the freed buffer is reused as the next save slot.
1039 RGB_T *recycled = bufs[0];
1040 for (int i = 0; i < R; ++i) bufs[i] = bufs[i + 1];
1041 bufs[R] = recycled;
1042 }
1043}
unsigned char u8
Definition stdint.h:131
FASTLED_FORCE_INLINE fl::u8 P(fl::u8 x)
#define FL_BUILTIN_MEMCPY(dest, src, n)
#define FL_BUILTIN_MEMSET(dest, val, n)

References FL_BUILTIN_MEMCPY, FL_BUILTIN_MEMSET, fl::P(), fl::x, and fl::y.

Referenced by vpass_full().

+ Here is the call graph for this function:
+ Here is the caller graph for this function: