FastLED 3.9.15
Loading...
Searching...
No Matches
blur.cpp.hpp
Go to the documentation of this file.
1
2
3#include "fl/stl/stdint.h"
4
5#define FASTLED_INTERNAL
6#include "fl/system/fastled.h"
7
8#include "crgb.h"
9#include "fl/gfx/blur.h"
12#include "fl/log/log.h"
13#include "fl/math/xymap.h"
14#include "fl/math/scale8.h"
15#include "fl/stl/int.h"
16#include "fl/stl/span.h"
17#include "fl/gfx/crgb.h"
18#include "fl/gfx/crgb16.h"
19#include "fl/stl/singleton.h"
20#include "fl/stl/vector.h"
21
22// Platform-neutral SIMD for blur kernels (SSE2, NEON, Xtensa PIE, scalar).
23#if !defined(FL_IS_AVR)
24#include "fl/math/simd.h"
25#endif
26
27// Force O3 even in debug builds so blur benchmarks don't hit watchdog timeouts.
29
30// Legacy XY function. This is a weak symbol that can be overridden by the user.
31// IMPORTANT: This MUST be in the global namespace (not fl::) for backward compatibility
32// with user code from FastLED 3.7.6 that defines: uint16_t XY(uint8_t x, uint8_t y)
33fl::u16 XY(fl::u8 x, fl::u8 y) FL_LINK_WEAK;
34
38 FL_ERROR("XY function not provided - using default [0][0]. Use blur2d with XYMap instead");
39 return 0;
40}
41
42namespace fl {
43
44// make this a weak symbol
45namespace {
46fl::u16 xy_legacy_wrapper(fl::u16 x, fl::u16 y, fl::u16 width,
47 fl::u16 height) {
50 return ::XY(x, y); // Call global namespace XY
51}
52} // namespace
53
54namespace gfx {
55
56// blur1d: one-dimensional blur filter. Spreads light to 2 line neighbors.
57// blur2d: two-dimensional blur filter. Spreads light to 8 XY neighbors.
58//
59// 0 = no spread at all
60// 64 = moderate spreading
61// 172 = maximum smooth, even spreading
62//
63// 173..255 = wider spreading, but increasing flicker
64//
65// Total light is NOT entirely conserved, so many repeated
66// calls to 'blur' will also result in the light fading,
67// eventually all the way to black; this is by design so that
68// it can be used to (slowly) clear the LEDs to black.
70 const fl::u16 numLeds = static_cast<fl::u16>(leds.size());
71 fl::u8 keep = 255 - blur_amount;
72 fl::u8 seep = blur_amount >> 1;
73 CRGB carryover = CRGB::Black;
74 for (fl::u16 i = 0; i < numLeds; ++i) {
75 CRGB cur = leds[i];
76 CRGB part = cur;
77 part.nscale8(seep);
78 cur.nscale8(keep);
79 cur += carryover;
80 if (i)
81 leds[i - 1] += part;
82 leds[i] = cur;
83 carryover = part;
84 }
85}
86
92
94 // Legacy path: uses global XY() via XYMap for user-defined layouts.
95 // Keeps its own blur algorithm copy because XYMap indexing differs from
96 // the cache-coherent rectangular layout used by Canvas.
97 XYMap xyMap =
99 fl::u8 keep = 255 - blur_amount;
100 fl::u8 seep = blur_amount >> 1;
101 // blur rows
102 for (fl::u8 row = 0; row < height; ++row) {
103 CRGB carryover = CRGB::Black;
104 for (fl::u8 i = 0; i < width; ++i) {
105 CRGB cur = leds[xyMap.mapToIndex(i, row)];
106 CRGB part = cur;
107 part.nscale8(seep);
108 cur.nscale8(keep);
109 cur += carryover;
110 if (i)
111 leds[xyMap.mapToIndex(i - 1, row)] += part;
112 leds[xyMap.mapToIndex(i, row)] = cur;
113 carryover = part;
114 }
115 }
116 // blur columns
117 for (fl::u8 col = 0; col < width; ++col) {
118 CRGB carryover = CRGB::Black;
119 for (fl::u8 i = 0; i < height; ++i) {
120 CRGB cur = leds[xyMap.mapToIndex(col, i)];
121 CRGB part = cur;
122 part.nscale8(seep);
123 cur.nscale8(keep);
124 cur += carryover;
125 if (i)
126 leds[xyMap.mapToIndex(col, i - 1)] += part;
127 leds[xyMap.mapToIndex(col, i)] = cur;
128 carryover = part;
129 }
130 }
131}
132
134 fract8 blur_amount, const XYMap &xyMap) {
135 CRGB *pixels = leds.data();
136 fl::u8 keep = 255 - blur_amount;
137 fl::u8 seep = blur_amount >> 1;
138 if (xyMap.isRectangularGrid()) {
139 for (fl::u8 row = 0; row < height; ++row) {
140 CRGB carryover = CRGB::Black;
141 CRGB *rowBase = pixels + row * width;
142 for (fl::u8 col = 0; col < width; ++col) {
143 CRGB cur = rowBase[col];
144 CRGB part = cur;
145 part.nscale8(seep);
146 cur.nscale8(keep);
147 cur += carryover;
148 if (col)
149 rowBase[col - 1] += part;
150 rowBase[col] = cur;
151 carryover = part;
152 }
153 }
154 return;
155 }
156 for (fl::u8 row = 0; row < height; ++row) {
157 CRGB carryover = CRGB::Black;
158 for (fl::u8 i = 0; i < width; ++i) {
159 CRGB cur = leds[xyMap.mapToIndex(i, row)];
160 CRGB part = cur;
161 part.nscale8(seep);
162 cur.nscale8(keep);
163 cur += carryover;
164 if (i)
165 leds[xyMap.mapToIndex(i - 1, row)] += part;
166 leds[xyMap.mapToIndex(i, row)] = cur;
167 carryover = part;
168 }
169 }
170}
171
173 fract8 blur_amount, const XYMap &xyMap) {
174 CRGB *pixels = leds.data();
175 fl::u8 keep = 255 - blur_amount;
176 fl::u8 seep = blur_amount >> 1;
177 if (xyMap.isRectangularGrid()) {
178 for (fl::u8 col = 0; col < width; ++col) {
179 CRGB carryover = CRGB::Black;
180 for (fl::u8 row = 0; row < height; ++row) {
181 CRGB cur = pixels[row * width + col];
182 CRGB part = cur;
183 part.nscale8(seep);
184 cur.nscale8(keep);
185 cur += carryover;
186 if (row)
187 pixels[(row - 1) * width + col] += part;
188 pixels[row * width + col] = cur;
189 carryover = part;
190 }
191 }
192 return;
193 }
194 for (fl::u8 col = 0; col < width; ++col) {
195 CRGB carryover = CRGB::Black;
196 for (fl::u8 i = 0; i < height; ++i) {
197 CRGB cur = leds[xyMap.mapToIndex(col, i)];
198 CRGB part = cur;
199 part.nscale8(seep);
200 cur.nscale8(keep);
201 cur += carryover;
202 if (i)
203 leds[xyMap.mapToIndex(col, i - 1)] += part;
204 leds[xyMap.mapToIndex(col, i)] = cur;
205 carryover = part;
206 }
207 }
208}
209
211 const int w = canvas.width;
212 const int h = canvas.height;
213 CRGB *pixels = canvas.pixels;
214 fl::u8 keep = 255 - blur_amount;
215 fl::u8 seep = blur_amount >> 1;
216 for (int row = 0; row < h; ++row) {
217 CRGB carryover = CRGB::Black;
218 CRGB *rowBase = pixels + row * w;
219 for (int col = 0; col < w; ++col) {
220 CRGB cur = rowBase[col];
221 CRGB part = cur;
222 part.nscale8(seep);
223 cur.nscale8(keep);
224 cur += carryover;
225 if (col)
226 rowBase[col - 1] += part;
227 rowBase[col] = cur;
228 carryover = part;
229 }
230 }
231}
232
234 const int w = canvas.width;
235 const int h = canvas.height;
236 CRGB *pixels = canvas.pixels;
237 fl::u8 keep = 255 - blur_amount;
238 fl::u8 seep = blur_amount >> 1;
239 for (int col = 0; col < w; ++col) {
240 CRGB carryover = CRGB::Black;
241 for (int row = 0; row < h; ++row) {
242 CRGB cur = pixels[row * w + col];
243 CRGB part = cur;
244 part.nscale8(seep);
245 cur.nscale8(keep);
246 cur += carryover;
247 if (row)
248 pixels[(row - 1) * w + col] += part;
249 pixels[row * w + col] = cur;
250 carryover = part;
251 }
252 }
253}
254
259
260} // namespace gfx
261} // namespace fl
262
263// ── Separable Gaussian blur — two-pass binomial convolution ──────────
264//
265// Based on sutaburosu's SKIPSM Gaussian blur implementation.
266// Uses binomial coefficient weights (Pascal's triangle) for a fast and
267// flexible Gaussian blur approximation, optimized for kernel sizes up to 9×9.
268// https://people.videolan.org/~tmatth/papers/Gaussian%20blur%20using%20finite-state%20machines.pdf
269//
270// Two separable passes (horizontal then vertical), each applying the 1D
271// binomial kernel for the given radius:
272// radius 0: [1] sum = 1 shift = 0
273// radius 1: [1, 2, 1] sum = 4 shift = 2
274// radius 2: [1, 4, 6, 4, 1] sum = 16 shift = 4
275// radius 3: [1, 6, 15, 20, 15, 6, 1] sum = 64 shift = 6
276// radius 4: [1, 8, 28, 56, 70, 56, 28, 8, 1] sum = 256 shift = 8
277//
278// Out-of-bounds pixels are treated as zero (zero-padding).
279// Normalization by right-shift of 2*radius bits per pass.
280
281namespace fl {
282namespace gfx {
283
284
285namespace blur_detail {
286
287// Identity alpha value for each type (no dimming).
288template <typename AlphaT> constexpr AlphaT alpha_identity();
289template <> constexpr alpha8 alpha_identity<alpha8>() { return alpha8(255); }
290template <> constexpr alpha16 alpha_identity<alpha16>() { return alpha16(65535); }
291
292// Channel extraction and alpha-scaled pixel construction.
293template <typename RGB_T>
295
296template <>
298 FL_ALWAYS_INLINE u16 ch(u8 v) { return v; }
299 FL_ALWAYS_INLINE CRGB zero() { return CRGB(0, 0, 0); }
300
301 FL_ALWAYS_INLINE CRGB make(u16 r, u16 g, u16 b) {
302 return CRGB(static_cast<u8>(r), static_cast<u8>(g),
303 static_cast<u8>(b));
304 }
305
306 FL_ALWAYS_INLINE CRGB make(u16 r, u16 g, u16 b, alpha8 a) {
307 if (a.value == 255) return make(r, g, b);
308 u16 a1 = static_cast<u16>(a.value) + 1;
309 return CRGB(static_cast<u8>((r * a1) >> 8),
310 static_cast<u8>((g * a1) >> 8),
311 static_cast<u8>((b * a1) >> 8));
312 }
313
314 FL_ALWAYS_INLINE CRGB make(u16 r, u16 g, u16 b, alpha16 a) {
315 if (a.value >= 65535) return make(r, g, b);
316 u32 a1 = static_cast<u32>(a.value) + 1;
317 return CRGB(static_cast<u8>((r * a1) >> 16),
318 static_cast<u8>((g * a1) >> 16),
319 static_cast<u8>((b * a1) >> 16));
320 }
321};
322
323template <>
325 FL_ALWAYS_INLINE u32 ch(u8x8 v) { return v.raw(); }
326 FL_ALWAYS_INLINE CRGB16 zero() { return CRGB16(u8x8(0), u8x8(0), u8x8(0)); }
327
328 FL_ALWAYS_INLINE CRGB16 make(u32 r, u32 g, u32 b) {
329 return CRGB16(u8x8::from_raw(static_cast<u16>(r)),
330 u8x8::from_raw(static_cast<u16>(g)),
331 u8x8::from_raw(static_cast<u16>(b)));
332 }
333
334 FL_ALWAYS_INLINE CRGB16 make(u32 r, u32 g, u32 b, alpha8 a) {
335 if (a.value == 255) return make(r, g, b);
336 u32 a1 = static_cast<u32>(a.value) + 1;
337 return make((r * a1) >> 8, (g * a1) >> 8, (b * a1) >> 8);
338 }
339
340 FL_ALWAYS_INLINE CRGB16 make(u32 r, u32 g, u32 b, alpha16 a) {
341 if (a.value >= 65535) return make(r, g, b);
342 u32 a1 = static_cast<u32>(a.value) + 1;
343 return make((r * a1) >> 16, (g * a1) >> 16, (b * a1) >> 16);
344 }
345};
346
347// Thread-local padded pixel buffer for zero-padding approach.
348template <typename RGB_T>
349static fl::span<RGB_T> get_padbuf(int minSize) {
351 if (static_cast<int>(buf.size()) < minSize) {
352 buf.resize(minSize);
353 }
354 return buf;
355}
356
357// Interior row pixel — fully-unrolled, no bounds checks.
358// Also reused for vertical pass via linearized column data.
359// Template-specialized per radius for direct hardcoded weights.
360template <int R, typename RGB_T, typename acc_t>
362
363template <typename RGB_T, typename acc_t>
364struct interior_row<0, RGB_T, acc_t> {
365 FL_ALWAYS_INLINE void apply(const RGB_T *row, int x,
366 acc_t &r, acc_t &g, acc_t &b) {
367 using P = pixel_ops<RGB_T>;
368 r = P::ch(row[x].r); g = P::ch(row[x].g); b = P::ch(row[x].b);
369 }
370};
371
372template <typename RGB_T, typename acc_t>
373struct interior_row<1, RGB_T, acc_t> {
374 FL_ALWAYS_INLINE void apply(const RGB_T *row, int x,
375 acc_t &r, acc_t &g, acc_t &b) {
376 using P = pixel_ops<RGB_T>;
377 P p;
378 r = 0; g = 0; b = 0;
379 { const RGB_T &px = row[x-1]; r += p.ch(px.r); g += p.ch(px.g); b += p.ch(px.b); }
380 { const RGB_T &px = row[x]; r += p.ch(px.r) * 2; g += p.ch(px.g) * 2; b += p.ch(px.b) * 2; }
381 { const RGB_T &px = row[x+1]; r += p.ch(px.r); g += p.ch(px.g); b += p.ch(px.b); }
382 }
383};
384
385template <typename RGB_T, typename acc_t>
386struct interior_row<2, RGB_T, acc_t> {
387 FL_ALWAYS_INLINE void apply(const RGB_T *row, int x,
388 acc_t &r, acc_t &g, acc_t &b) {
389 using P = pixel_ops<RGB_T>;
390 P p;
391 // [1, 4, 6, 4, 1] — symmetric: (e0+e4) + 4*(e1+e3) + 6*e2
392 const RGB_T &pm2 = row[x-2], &pm1 = row[x-1], &pc = row[x], &pp1 = row[x+1], &pp2 = row[x+2];
393 const acc_t s04r = p.ch(pm2.r) + p.ch(pp2.r), s13r = p.ch(pm1.r) + p.ch(pp1.r);
394 const acc_t s04g = p.ch(pm2.g) + p.ch(pp2.g), s13g = p.ch(pm1.g) + p.ch(pp1.g);
395 const acc_t s04b = p.ch(pm2.b) + p.ch(pp2.b), s13b = p.ch(pm1.b) + p.ch(pp1.b);
396 r = s04r + s13r * 4 + p.ch(pc.r) * 6;
397 g = s04g + s13g * 4 + p.ch(pc.g) * 6;
398 b = s04b + s13b * 4 + p.ch(pc.b) * 6;
399 }
400};
401
402template <typename RGB_T, typename acc_t>
403struct interior_row<3, RGB_T, acc_t> {
404 FL_ALWAYS_INLINE void apply(const RGB_T *row, int x,
405 acc_t &r, acc_t &g, acc_t &b) {
406 using P = pixel_ops<RGB_T>;
407 P p;
408 // [1, 6, 15, 20, 15, 6, 1] — symmetric: (e0+e6) + 6*(e1+e5) + 15*(e2+e4) + 20*e3
409 const RGB_T &pm3 = row[x-3], &pm2 = row[x-2], &pm1 = row[x-1], &pc = row[x];
410 const RGB_T &pp1 = row[x+1], &pp2 = row[x+2], &pp3 = row[x+3];
411 const acc_t s06r = p.ch(pm3.r) + p.ch(pp3.r), s15r = p.ch(pm2.r) + p.ch(pp2.r), s24r = p.ch(pm1.r) + p.ch(pp1.r);
412 const acc_t s06g = p.ch(pm3.g) + p.ch(pp3.g), s15g = p.ch(pm2.g) + p.ch(pp2.g), s24g = p.ch(pm1.g) + p.ch(pp1.g);
413 const acc_t s06b = p.ch(pm3.b) + p.ch(pp3.b), s15b = p.ch(pm2.b) + p.ch(pp2.b), s24b = p.ch(pm1.b) + p.ch(pp1.b);
414 r = s06r + s15r * 6 + s24r * 15 + p.ch(pc.r) * 20;
415 g = s06g + s15g * 6 + s24g * 15 + p.ch(pc.g) * 20;
416 b = s06b + s15b * 6 + s24b * 15 + p.ch(pc.b) * 20;
417 }
418};
419
420template <typename RGB_T, typename acc_t>
421struct interior_row<4, RGB_T, acc_t> {
422 FL_ALWAYS_INLINE void apply(const RGB_T *row, int x,
423 acc_t &r, acc_t &g, acc_t &b) {
424 using P = pixel_ops<RGB_T>;
425 P p;
426 // [1, 8, 28, 56, 70, 56, 28, 8, 1] — symmetric: (e0+e8) + 8*(e1+e7) + 28*(e2+e6) + 56*(e3+e5) + 70*e4
427 const RGB_T &pm4 = row[x-4], &pm3 = row[x-3], &pm2 = row[x-2], &pm1 = row[x-1], &pc = row[x];
428 const RGB_T &pp1 = row[x+1], &pp2 = row[x+2], &pp3 = row[x+3], &pp4 = row[x+4];
429 const acc_t s08r = p.ch(pm4.r) + p.ch(pp4.r), s17r = p.ch(pm3.r) + p.ch(pp3.r);
430 const acc_t s26r = p.ch(pm2.r) + p.ch(pp2.r), s35r = p.ch(pm1.r) + p.ch(pp1.r);
431 const acc_t s08g = p.ch(pm4.g) + p.ch(pp4.g), s17g = p.ch(pm3.g) + p.ch(pp3.g);
432 const acc_t s26g = p.ch(pm2.g) + p.ch(pp2.g), s35g = p.ch(pm1.g) + p.ch(pp1.g);
433 const acc_t s08b = p.ch(pm4.b) + p.ch(pp4.b), s17b = p.ch(pm3.b) + p.ch(pp3.b);
434 const acc_t s26b = p.ch(pm2.b) + p.ch(pp2.b), s35b = p.ch(pm1.b) + p.ch(pp1.b);
435 r = s08r + s17r * 8 + s26r * 28 + s35r * 56 + p.ch(pc.r) * 70;
436 g = s08g + s17g * 8 + s26g * 28 + s35g * 56 + p.ch(pc.g) * 70;
437 b = s08b + s17b * 8 + s26b * 28 + s35b * 56 + p.ch(pc.b) * 70;
438 }
439};
440
441// ── AVR per-channel convolution kernels ──────────────────────────────
442// On AVR, processing one color channel at a time cuts live accumulator
443// registers from 6 (r,g,b as u16 pairs) to 2, dramatically reducing
444// register spilling in the 32-register AVR architecture.
445//
446// conv1ch<R>::apply(center): computes the 1D binomial kernel for a
447// single u8 channel, where `center` points to the center pixel's
448// channel byte and the pixel stride is sizeof(CRGB) = 3 (compile-time
449// constant).
450#if defined(FL_IS_AVR)
451
452template <int R> struct conv1ch;
453
454// All conv1ch specializations use positive-only offsets from the window
455// start (p = center - R*S). This maps cleanly to AVR's LDD instruction
456// which supports displacement 0-63. Max offset = 2*R*S = 24 for R4.
457template <> struct conv1ch<0> {
458 static inline u16 __attribute__((always_inline)) apply(const u8 *c) {
459 return (u16)c[0];
460 }
461};
462
463template <> struct conv1ch<1> {
464 static inline u16 __attribute__((always_inline)) apply(const u8 *c) {
465 constexpr int S = sizeof(CRGB); // 3
466 const u8 *p = c - S;
467 // [64, 128, 64] (rescaled from [1,2,1] so sum=256, shift-by-8 is free)
468 return (u16)p[0] * 64 + (u16)p[S] * 128 + (u16)p[2*S] * 64;
469 }
470};
471
472template <> struct conv1ch<2> {
473 static inline u16 __attribute__((always_inline)) apply(const u8 *c) {
474 constexpr int S = sizeof(CRGB);
475 const u8 *p = c - 2*S;
476 // [16, 64, 96, 64, 16] (rescaled from [1,4,6,4,1] so sum=256, shift-by-8 is free)
477 return (u16)p[0] * 16 + (u16)p[S] * 64 + (u16)p[2*S] * 96
478 + (u16)p[3*S] * 64 + (u16)p[4*S] * 16;
479 }
480};
481
482template <> struct conv1ch<3> {
483 static inline u16 __attribute__((always_inline)) apply(const u8 *c) {
484 constexpr int S = sizeof(CRGB);
485 const u8 *p = c - 3*S;
486 // [4, 24, 60, 80, 60, 24, 4] (rescaled from [1,6,15,20,15,6,1] so sum=256, shift-by-8 is free)
487 return (u16)p[0] * 4 + (u16)p[S] * 24 + (u16)p[2*S] * 60
488 + (u16)p[3*S] * 80 + (u16)p[4*S] * 60 + (u16)p[5*S] * 24
489 + (u16)p[6*S] * 4;
490 }
491};
492
493template <> struct conv1ch<4> {
494 static inline u16 __attribute__((always_inline)) apply(const u8 *c) {
495 constexpr int S = sizeof(CRGB);
496 const u8 *p = c - 4*S;
497 // [1, 8, 28, 56, 70, 56, 28, 8, 1]
498 return (u16)p[0] + (u16)p[S] * 8 + (u16)p[2*S] * 28
499 + (u16)p[3*S] * 56 + (u16)p[4*S] * 70 + (u16)p[5*S] * 56
500 + (u16)p[6*S] * 28 + (u16)p[7*S] * 8 + (u16)p[8*S];
501 }
502};
503
504// AVR noinline per-channel pass for CRGB (no alpha).
505template <int R>
507static void apply_pass_1ch(const CRGB *pad, CRGB *out, int count, int stride) {
508 constexpr int shift = (R == 0) ? 0 : 8;
509 for (int i = 0; i < count; ++i) {
510 const u8 *base = pad[R + i].raw;
511 out->r = static_cast<u8>(conv1ch<R>::apply(base + 0) >> shift);
512 out->g = static_cast<u8>(conv1ch<R>::apply(base + 1) >> shift);
513 out->b = static_cast<u8>(conv1ch<R>::apply(base + 2) >> shift);
514 out += stride;
515 }
516}
517
518// AVR noinline per-channel pass for CRGB with alpha dim.
519template <int R>
521static void apply_pass_alpha_1ch(const CRGB *pad, CRGB *out, int count,
522 int stride, alpha8 alpha) {
523 constexpr int shift = (R == 0) ? 0 : 8;
524 u16 a1 = static_cast<u16>(alpha.value) + 1;
525 for (int i = 0; i < count; ++i) {
526 const u8 *base = pad[R + i].raw;
527 u16 r = conv1ch<R>::apply(base + 0) >> shift;
528 u16 g = conv1ch<R>::apply(base + 1) >> shift;
529 u16 b = conv1ch<R>::apply(base + 2) >> shift;
530 out->r = static_cast<u8>((r * a1) >> 8);
531 out->g = static_cast<u8>((g * a1) >> 8);
532 out->b = static_cast<u8>((b * a1) >> 8);
533 out += stride;
534 }
535}
536
537// AVR noinline per-channel pass for CRGB with alpha16 dim.
538template <int R>
540static void apply_pass_alpha_1ch(const CRGB *pad, CRGB *out, int count,
541 int stride, alpha16 alpha) {
542 constexpr int shift = (R == 0) ? 0 : 8;
543 u32 a1 = static_cast<u32>(alpha.value) + 1;
544 for (int i = 0; i < count; ++i) {
545 const u8 *base = pad[R + i].raw;
546 u16 r = conv1ch<R>::apply(base + 0) >> shift;
547 u16 g = conv1ch<R>::apply(base + 1) >> shift;
548 u16 b = conv1ch<R>::apply(base + 2) >> shift;
549 out->r = static_cast<u8>((r * a1) >> 16);
550 out->g = static_cast<u8>((g * a1) >> 16);
551 out->b = static_cast<u8>((b * a1) >> 16);
552 out += stride;
553 }
554}
555
556#endif // FL_IS_AVR
557
558// Row-level kernel application — noinline on AVR to isolate register pressure.
559// On non-AVR platforms (or CRGB16 on AVR), processes all 3 channels
560// simultaneously using the interior_row kernel.
561template <int R, typename RGB_T, typename acc_t>
563static void apply_pass(const RGB_T *pad, RGB_T *out, int count, int stride) {
564 constexpr int shift = 2 * R;
565 using P = pixel_ops<RGB_T>;
566 for (int i = 0; i < count; ++i) {
567 acc_t r, g, b;
568 interior_row<R, RGB_T, acc_t>::apply(pad, R + i, r, g, b);
569 *out = P::make(static_cast<acc_t>(r >> shift),
570 static_cast<acc_t>(g >> shift),
571 static_cast<acc_t>(b >> shift));
572 out += stride;
573 }
574}
575
576template <int R, typename RGB_T, typename acc_t, typename AlphaT>
578static void apply_pass_alpha(const RGB_T *pad, RGB_T *out, int count,
579 int stride, AlphaT alpha) {
580 constexpr int shift = 2 * R;
581 using P = pixel_ops<RGB_T>;
582 for (int i = 0; i < count; ++i) {
583 acc_t r, g, b;
584 interior_row<R, RGB_T, acc_t>::apply(pad, R + i, r, g, b);
585 *out = P::make(static_cast<acc_t>(r >> shift),
586 static_cast<acc_t>(g >> shift),
587 static_cast<acc_t>(b >> shift), alpha);
588 out += stride;
589 }
590}
591
592
593// ── Platform-neutral SIMD byte-level convolution kernels ────────────────
594// Process nbytes of output using stride-S byte-level convolution.
595// For horizontal pass: S = sizeof(CRGB) = 3 (neighboring pixels).
596// For vertical pass: S = 1 (same column, consecutive row buffers).
597// The same kernel weights apply to all bytes (R, G, B treated uniformly).
598// Uses fl::simd u8x16/u16x8 operations — compiles to SSE2, NEON, PIE, or scalar.
599#if !defined(FL_IS_AVR)
600
601// Kernel: [1, 2, 1] >> 2 ≈ avg(avg(a, c), b)
602// Uses hardware byte averaging which computes (x+y+1)>>1.
603// Two nested avg ops approximate (a + 2b + c) >> 2 with at most +1 rounding
604// per channel — imperceptible for blur and ~3x fewer SIMD instructions.
605static void simd_conv_121(const u8 * FL_RESTRICT_PARAM a,
606 const u8 * FL_RESTRICT_PARAM b,
607 const u8 * FL_RESTRICT_PARAM c,
608 u8 * FL_RESTRICT_PARAM out, int nbytes) {
609 namespace fsimd = fl::simd; // ok bare using
610 int i = 0;
611 for (; i + 63 < nbytes; i += 64) {
612 auto va0 = fsimd::load_u8_32(a+i), vb0 = fsimd::load_u8_32(b+i), vc0 = fsimd::load_u8_32(c+i);
613 auto va1 = fsimd::load_u8_32(a+i+32), vb1 = fsimd::load_u8_32(b+i+32), vc1 = fsimd::load_u8_32(c+i+32);
614 fsimd::store_u8_32(out+i, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va0, vc0), vb0));
615 fsimd::store_u8_32(out+i+32, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va1, vc1), vb1));
616 }
617 for (; i + 31 < nbytes; i += 32) {
618 auto va = fsimd::load_u8_32(a+i), vb = fsimd::load_u8_32(b+i), vc = fsimd::load_u8_32(c+i);
619 fsimd::store_u8_32(out+i, fsimd::avg_round_u8_32(fsimd::avg_round_u8_32(va, vc), vb));
620 }
621 for (; i + 15 < nbytes; i += 16) {
622 auto va = fsimd::load_u8_16(a+i), vb = fsimd::load_u8_16(b+i), vc = fsimd::load_u8_16(c+i);
623 fsimd::store_u8_16(out+i, fsimd::avg_round_u8_16(fsimd::avg_round_u8_16(va, vc), vb));
624 }
625 for (; i < nbytes; ++i)
626 out[i] = (u8)(((u16)a[i] + ((u16)b[i] << 1) + (u16)c[i]) >> 2);
627}
628
// Shared pattern of the R=2, R=3 and R=4 kernels below: u8 lanes are widened
// to u16, symmetric taps are summed in pairs, and each kernel computes
// (s_sym + w*s_sym_pair + ... + wc*center) >> shift — once for the low half
// and once for the high half of every loaded byte register.
631
632// Kernel: [1, 4, 6, 4, 1] >> 4
633static void simd_conv_14641(const u8 *p0, const u8 *p1, const u8 *p2,
634 const u8 *p3, const u8 *p4,
635 u8 *out, int nbytes) {
636 namespace fsimd = fl::simd; // ok bare using
637 const auto w4w = fsimd::set1_u16_16(4), w6w = fsimd::set1_u16_16(6);
638 const auto w4 = fsimd::set1_u16_8(4), w6 = fsimd::set1_u16_8(6);
639 int i = 0;
640 for (; i + 31 < nbytes; i += 32) {
641 auto v0 = fsimd::load_u8_32(p0+i), v1 = fsimd::load_u8_32(p1+i);
642 auto v2 = fsimd::load_u8_32(p2+i), v3 = fsimd::load_u8_32(p3+i), v4 = fsimd::load_u8_32(p4+i);
643 auto s04 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v0), fsimd::widen_lo_u8x32_to_u16(v4));
644 auto s13 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v1), fsimd::widen_lo_u8x32_to_u16(v3));
645 auto lo = fsimd::add_u16_16(s04, fsimd::add_u16_16(fsimd::mullo_u16_16(s13, w4w), fsimd::mullo_u16_16(fsimd::widen_lo_u8x32_to_u16(v2), w6w)));
646 lo = fsimd::srli_u16_16(lo, 4);
647 auto s04h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v0), fsimd::widen_hi_u8x32_to_u16(v4));
648 auto s13h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v1), fsimd::widen_hi_u8x32_to_u16(v3));
649 auto hi = fsimd::add_u16_16(s04h, fsimd::add_u16_16(fsimd::mullo_u16_16(s13h, w4w), fsimd::mullo_u16_16(fsimd::widen_hi_u8x32_to_u16(v2), w6w)));
650 hi = fsimd::srli_u16_16(hi, 4);
651 fsimd::store_u8_32(out+i, fsimd::narrow_u16x16_to_u8(lo, hi));
652 }
653 for (; i + 15 < nbytes; i += 16) {
654 auto v0 = fsimd::load_u8_16(p0+i), v1 = fsimd::load_u8_16(p1+i);
655 auto v2 = fsimd::load_u8_16(p2+i), v3 = fsimd::load_u8_16(p3+i), v4 = fsimd::load_u8_16(p4+i);
656 auto s04 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v0), fsimd::widen_lo_u8_to_u16(v4));
657 auto s13 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v1), fsimd::widen_lo_u8_to_u16(v3));
658 auto lo = fsimd::add_u16_8(s04, fsimd::add_u16_8(fsimd::mullo_u16_8(s13, w4), fsimd::mullo_u16_8(fsimd::widen_lo_u8_to_u16(v2), w6)));
659 lo = fsimd::srli_u16_8(lo, 4);
660 auto s04h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v0), fsimd::widen_hi_u8_to_u16(v4));
661 auto s13h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v1), fsimd::widen_hi_u8_to_u16(v3));
662 auto hi = fsimd::add_u16_8(s04h, fsimd::add_u16_8(fsimd::mullo_u16_8(s13h, w4), fsimd::mullo_u16_8(fsimd::widen_hi_u8_to_u16(v2), w6)));
663 hi = fsimd::srli_u16_8(hi, 4);
664 fsimd::store_u8_16(out+i, fsimd::narrow_u16_to_u8(lo, hi));
665 }
666 for (; i < nbytes; ++i) {
667 u16 s04 = (u16)p0[i] + (u16)p4[i];
668 u16 s13 = (u16)p1[i] + (u16)p3[i];
669 out[i] = (u8)((s04 + s13 * 4 + (u16)p2[i] * 6) >> 4);
670 }
671}
672
673// Kernel: [1, 6, 15, 20, 15, 6, 1] >> 6
674static void simd_conv_r3(const u8 *p0, const u8 *p1, const u8 *p2,
675 const u8 *p3, const u8 *p4, const u8 *p5,
676 const u8 *p6, u8 *out, int nbytes) {
677 namespace fsimd = fl::simd; // ok bare using
678 const auto w6w = fsimd::set1_u16_16(6), w15w = fsimd::set1_u16_16(15), w20w = fsimd::set1_u16_16(20);
679 const auto w6 = fsimd::set1_u16_8(6), w15 = fsimd::set1_u16_8(15), w20 = fsimd::set1_u16_8(20);
680 int i = 0;
681 for (; i + 31 < nbytes; i += 32) {
682 auto v0 = fsimd::load_u8_32(p0+i), v1 = fsimd::load_u8_32(p1+i), v2 = fsimd::load_u8_32(p2+i);
683 auto v3 = fsimd::load_u8_32(p3+i), v4 = fsimd::load_u8_32(p4+i), v5 = fsimd::load_u8_32(p5+i);
684 auto v6v = fsimd::load_u8_32(p6+i);
685 auto s06 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v0), fsimd::widen_lo_u8x32_to_u16(v6v));
686 auto s15 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v1), fsimd::widen_lo_u8x32_to_u16(v5));
687 auto s24 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v2), fsimd::widen_lo_u8x32_to_u16(v4));
688 auto lo = fsimd::add_u16_16(s06, fsimd::add_u16_16(fsimd::mullo_u16_16(s15, w6w),
689 fsimd::add_u16_16(fsimd::mullo_u16_16(s24, w15w), fsimd::mullo_u16_16(fsimd::widen_lo_u8x32_to_u16(v3), w20w))));
690 lo = fsimd::srli_u16_16(lo, 6);
691 auto s06h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v0), fsimd::widen_hi_u8x32_to_u16(v6v));
692 auto s15h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v1), fsimd::widen_hi_u8x32_to_u16(v5));
693 auto s24h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v2), fsimd::widen_hi_u8x32_to_u16(v4));
694 auto hi = fsimd::add_u16_16(s06h, fsimd::add_u16_16(fsimd::mullo_u16_16(s15h, w6w),
695 fsimd::add_u16_16(fsimd::mullo_u16_16(s24h, w15w), fsimd::mullo_u16_16(fsimd::widen_hi_u8x32_to_u16(v3), w20w))));
696 hi = fsimd::srli_u16_16(hi, 6);
697 fsimd::store_u8_32(out+i, fsimd::narrow_u16x16_to_u8(lo, hi));
698 }
699 for (; i + 15 < nbytes; i += 16) {
700 auto v0 = fsimd::load_u8_16(p0+i), v1 = fsimd::load_u8_16(p1+i), v2 = fsimd::load_u8_16(p2+i);
701 auto v3 = fsimd::load_u8_16(p3+i), v4 = fsimd::load_u8_16(p4+i), v5 = fsimd::load_u8_16(p5+i);
702 auto v6v = fsimd::load_u8_16(p6+i);
703 auto s06 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v0), fsimd::widen_lo_u8_to_u16(v6v));
704 auto s15 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v1), fsimd::widen_lo_u8_to_u16(v5));
705 auto s24 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v2), fsimd::widen_lo_u8_to_u16(v4));
706 auto lo = fsimd::add_u16_8(s06, fsimd::add_u16_8(fsimd::mullo_u16_8(s15, w6),
707 fsimd::add_u16_8(fsimd::mullo_u16_8(s24, w15), fsimd::mullo_u16_8(fsimd::widen_lo_u8_to_u16(v3), w20))));
708 lo = fsimd::srli_u16_8(lo, 6);
709 auto s06h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v0), fsimd::widen_hi_u8_to_u16(v6v));
710 auto s15h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v1), fsimd::widen_hi_u8_to_u16(v5));
711 auto s24h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v2), fsimd::widen_hi_u8_to_u16(v4));
712 auto hi = fsimd::add_u16_8(s06h, fsimd::add_u16_8(fsimd::mullo_u16_8(s15h, w6),
713 fsimd::add_u16_8(fsimd::mullo_u16_8(s24h, w15), fsimd::mullo_u16_8(fsimd::widen_hi_u8_to_u16(v3), w20))));
714 hi = fsimd::srli_u16_8(hi, 6);
715 fsimd::store_u8_16(out+i, fsimd::narrow_u16_to_u8(lo, hi));
716 }
717 for (; i < nbytes; ++i) {
718 u16 s06=(u16)p0[i]+(u16)p6[i], s15=(u16)p1[i]+(u16)p5[i], s24=(u16)p2[i]+(u16)p4[i];
719 out[i]=(u8)((s06+s15*6+s24*15+(u16)p3[i]*20)>>6);
720 }
721}
722
723// Kernel: [1, 8, 28, 56, 70, 56, 28, 8, 1] >> 8
724static void simd_conv_r4(const u8 *p0, const u8 *p1, const u8 *p2, const u8 *p3,
725 const u8 *p4, const u8 *p5, const u8 *p6, const u8 *p7,
726 const u8 *p8, u8 *out, int nbytes) {
727 namespace fsimd = fl::simd; // ok bare using
728 const auto w8w = fsimd::set1_u16_16(8), w28w = fsimd::set1_u16_16(28);
729 const auto w56w = fsimd::set1_u16_16(56), w70w = fsimd::set1_u16_16(70);
730 const auto w8 = fsimd::set1_u16_8(8), w28 = fsimd::set1_u16_8(28);
731 const auto w56 = fsimd::set1_u16_8(56), w70 = fsimd::set1_u16_8(70);
732 int i = 0;
733 for (; i + 31 < nbytes; i += 32) {
734 auto v0 = fsimd::load_u8_32(p0+i), v1 = fsimd::load_u8_32(p1+i), v2 = fsimd::load_u8_32(p2+i);
735 auto v3 = fsimd::load_u8_32(p3+i), v4 = fsimd::load_u8_32(p4+i), v5 = fsimd::load_u8_32(p5+i);
736 auto v6v = fsimd::load_u8_32(p6+i), v7 = fsimd::load_u8_32(p7+i), v8v = fsimd::load_u8_32(p8+i);
737 auto s08 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v0), fsimd::widen_lo_u8x32_to_u16(v8v));
738 auto s17 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v1), fsimd::widen_lo_u8x32_to_u16(v7));
739 auto s26 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v2), fsimd::widen_lo_u8x32_to_u16(v6v));
740 auto s35 = fsimd::add_u16_16(fsimd::widen_lo_u8x32_to_u16(v3), fsimd::widen_lo_u8x32_to_u16(v5));
741 auto lo = fsimd::add_u16_16(s08, fsimd::add_u16_16(fsimd::mullo_u16_16(s17, w8w),
742 fsimd::add_u16_16(fsimd::mullo_u16_16(s26, w28w),
743 fsimd::add_u16_16(fsimd::mullo_u16_16(s35, w56w), fsimd::mullo_u16_16(fsimd::widen_lo_u8x32_to_u16(v4), w70w)))));
744 lo = fsimd::srli_u16_16(lo, 8);
745 auto s08h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v0), fsimd::widen_hi_u8x32_to_u16(v8v));
746 auto s17h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v1), fsimd::widen_hi_u8x32_to_u16(v7));
747 auto s26h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v2), fsimd::widen_hi_u8x32_to_u16(v6v));
748 auto s35h = fsimd::add_u16_16(fsimd::widen_hi_u8x32_to_u16(v3), fsimd::widen_hi_u8x32_to_u16(v5));
749 auto hi = fsimd::add_u16_16(s08h, fsimd::add_u16_16(fsimd::mullo_u16_16(s17h, w8w),
750 fsimd::add_u16_16(fsimd::mullo_u16_16(s26h, w28w),
751 fsimd::add_u16_16(fsimd::mullo_u16_16(s35h, w56w), fsimd::mullo_u16_16(fsimd::widen_hi_u8x32_to_u16(v4), w70w)))));
752 hi = fsimd::srli_u16_16(hi, 8);
753 fsimd::store_u8_32(out+i, fsimd::narrow_u16x16_to_u8(lo, hi));
754 }
755 for (; i + 15 < nbytes; i += 16) {
756 auto v0 = fsimd::load_u8_16(p0+i), v1 = fsimd::load_u8_16(p1+i), v2 = fsimd::load_u8_16(p2+i);
757 auto v3 = fsimd::load_u8_16(p3+i), v4 = fsimd::load_u8_16(p4+i), v5 = fsimd::load_u8_16(p5+i);
758 auto v6v = fsimd::load_u8_16(p6+i), v7 = fsimd::load_u8_16(p7+i), v8v = fsimd::load_u8_16(p8+i);
759 auto s08 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v0), fsimd::widen_lo_u8_to_u16(v8v));
760 auto s17 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v1), fsimd::widen_lo_u8_to_u16(v7));
761 auto s26 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v2), fsimd::widen_lo_u8_to_u16(v6v));
762 auto s35 = fsimd::add_u16_8(fsimd::widen_lo_u8_to_u16(v3), fsimd::widen_lo_u8_to_u16(v5));
763 auto lo = fsimd::add_u16_8(s08, fsimd::add_u16_8(fsimd::mullo_u16_8(s17, w8),
764 fsimd::add_u16_8(fsimd::mullo_u16_8(s26, w28),
765 fsimd::add_u16_8(fsimd::mullo_u16_8(s35, w56), fsimd::mullo_u16_8(fsimd::widen_lo_u8_to_u16(v4), w70)))));
766 lo = fsimd::srli_u16_8(lo, 8);
767 auto s08h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v0), fsimd::widen_hi_u8_to_u16(v8v));
768 auto s17h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v1), fsimd::widen_hi_u8_to_u16(v7));
769 auto s26h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v2), fsimd::widen_hi_u8_to_u16(v6v));
770 auto s35h = fsimd::add_u16_8(fsimd::widen_hi_u8_to_u16(v3), fsimd::widen_hi_u8_to_u16(v5));
771 auto hi = fsimd::add_u16_8(s08h, fsimd::add_u16_8(fsimd::mullo_u16_8(s17h, w8),
772 fsimd::add_u16_8(fsimd::mullo_u16_8(s26h, w28),
773 fsimd::add_u16_8(fsimd::mullo_u16_8(s35h, w56), fsimd::mullo_u16_8(fsimd::widen_hi_u8_to_u16(v4), w70)))));
774 hi = fsimd::srli_u16_8(hi, 8);
775 fsimd::store_u8_16(out+i, fsimd::narrow_u16_to_u8(lo, hi));
776 }
777 for (; i < nbytes; ++i) {
778 u16 s08=(u16)p0[i]+(u16)p8[i], s17=(u16)p1[i]+(u16)p7[i];
779 u16 s26=(u16)p2[i]+(u16)p6[i], s35=(u16)p3[i]+(u16)p5[i];
780 out[i]=(u8)((s08+s17*8+s26*28+s35*56+(u16)p4[i]*70)>>8);
781 }
782}
783
784// ── Template dispatch: SIMD vertical convolution by radius ─────────────
785// Compile-time selection eliminates runtime if-else chain on R.
786template <int R> struct simd_vconv_dispatch;
787
788template <> struct simd_vconv_dispatch<0> {
789 template <typename RGB_T>
790 static void apply(RGB_T **bufs, const RGB_T **, u8 *out, int nbytes) {
791 FL_BUILTIN_MEMCPY(out, (const u8*)bufs[0], nbytes);
792 }
793};
794
795template <> struct simd_vconv_dispatch<1> {
796 template <typename RGB_T>
797 static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes) {
798 simd_conv_121((const u8*)bufs[0], (const u8*)bufs[1],
799 (const u8*)fwd[0], out, nbytes);
800 }
801};
802
803template <> struct simd_vconv_dispatch<2> {
804 template <typename RGB_T>
805 static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes) {
806 simd_conv_14641((const u8*)bufs[0], (const u8*)bufs[1],
807 (const u8*)bufs[2], (const u8*)fwd[0],
808 (const u8*)fwd[1], out, nbytes);
809 }
810};
811
812template <> struct simd_vconv_dispatch<3> {
813 template <typename RGB_T>
814 static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes) {
815 simd_conv_r3((const u8*)bufs[0], (const u8*)bufs[1],
816 (const u8*)bufs[2], (const u8*)bufs[3],
817 (const u8*)fwd[0], (const u8*)fwd[1],
818 (const u8*)fwd[2], out, nbytes);
819 }
820};
821
822template <> struct simd_vconv_dispatch<4> {
823 template <typename RGB_T>
824 static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes) {
825 simd_conv_r4((const u8*)bufs[0], (const u8*)bufs[1],
826 (const u8*)bufs[2], (const u8*)bufs[3],
827 (const u8*)bufs[4], (const u8*)fwd[0],
828 (const u8*)fwd[1], (const u8*)fwd[2],
829 (const u8*)fwd[3], out, nbytes);
830 }
831};
832
833// ── Template dispatch: SIMD horizontal convolution by radius ───────────
834template <int R> struct simd_hconv_dispatch;
835
836template <> struct simd_hconv_dispatch<0> {
837 static void apply(const u8 *pb, int, u8 *ob, int nbytes, u8 *, int) {
838 FL_BUILTIN_MEMCPY(ob, pb, nbytes);
839 }
840};
841
842template <> struct simd_hconv_dispatch<1> {
843 static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int) {
844 simd_conv_121(pb, pb + S, pb + 2*S, ob, nbytes);
845 }
846};
847
848template <> struct simd_hconv_dispatch<2> {
849 static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int) {
850 // Direct [1,4,6,4,1] kernel — exact u16 multiply+shift, no cascaded
851 // avg_round rounding. Slightly more SIMD ops than cascaded R=1 but
852 // produces bit-exact results matching the scalar interior_row path.
853 simd_conv_14641(pb, pb+S, pb+2*S, pb+3*S, pb+4*S, ob, nbytes);
854 }
855};
856
857template <> struct simd_hconv_dispatch<3> {
858 static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int) {
859 simd_conv_r3(pb, pb+S, pb+2*S, pb+3*S, pb+4*S, pb+5*S, pb+6*S, ob, nbytes);
860 }
861};
862
863template <> struct simd_hconv_dispatch<4> {
864 static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int) {
865 simd_conv_r4(pb, pb+S, pb+2*S, pb+3*S, pb+4*S, pb+5*S, pb+6*S, pb+7*S, pb+8*S, ob, nbytes);
866 }
867};
868
869// ── Template dispatch: per-pixel vertical convolution by radius ────────
870// Used by the generic path (CRGB16 or alpha case).
// Per-pixel vertical kernel, chosen at compile time by radius R. The primary
// template is only declared; behaviour comes from the partial specializations.
871 template <int R, typename RGB_T, typename acc_t> struct vpass_pixel_kernel;
872
// Radius 0: no vertical neighbours. Each output accumulator is the
// p.ch()-converted channel of bufs[0][x]; the forward-row argument is unnamed
// and unused.
873 template <typename RGB_T, typename acc_t>
874 struct vpass_pixel_kernel<0, RGB_T, acc_t> {
// r/g/b are outputs and are fully overwritten.
875 FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **, int x,
876 acc_t &r, acc_t &g, acc_t &b) {
// NOTE(review): listing line 877 is absent from this view; it presumably
// declares `p` (a pixel_ops<RGB_T> helper) - confirm against the real file.
878 r = p.ch(bufs[0][x].r);
879 g = p.ch(bufs[0][x].g);
880 b = p.ch(bufs[0][x].b);
881 }
882 };
883
// Radius 1: weights [1,2,1] over bufs[0], bufs[1], fwd[0] at column x.
// The sums are left unnormalized (weight total 4); the caller shifts.
884 template <typename RGB_T, typename acc_t>
885 struct vpass_pixel_kernel<1, RGB_T, acc_t> {
// r/g/b are outputs and are fully overwritten.
886 FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x,
887 acc_t &r, acc_t &g, acc_t &b) {
// NOTE(review): listing line 888 is absent from this view; it presumably
// declares `p` (a pixel_ops<RGB_T> helper) - confirm against the real file.
// Outer taps are added once; the centre tap (bufs[1]) is doubled via << 1.
889 r = (p.ch(bufs[0][x].r) + p.ch(fwd[0][x].r)) + (p.ch(bufs[1][x].r) << 1);
890 g = (p.ch(bufs[0][x].g) + p.ch(fwd[0][x].g)) + (p.ch(bufs[1][x].g) << 1);
891 b = (p.ch(bufs[0][x].b) + p.ch(fwd[0][x].b)) + (p.ch(bufs[1][x].b) << 1);
892 }
893 };
894
// Radius 2: weights [1,4,6,4,1] over bufs[0], bufs[1], bufs[2], fwd[0],
// fwd[1] at column x. Sums are unnormalized (weight total 16).
895 template <typename RGB_T, typename acc_t>
896 struct vpass_pixel_kernel<2, RGB_T, acc_t> {
// r/g/b are outputs and are fully overwritten.
897 FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x,
898 acc_t &r, acc_t &g, acc_t &b) {
// NOTE(review): listing line 899 is absent from this view; it presumably
// declares `p` (a pixel_ops<RGB_T> helper) - confirm against the real file.
// Symmetric taps are pre-added so each weight is multiplied only once:
// s?04 = taps 0 and 4 (weight 1), s?13 = taps 1 and 3 (weight 4).
900 const acc_t sr04 = p.ch(bufs[0][x].r) + p.ch(fwd[1][x].r);
901 const acc_t sg04 = p.ch(bufs[0][x].g) + p.ch(fwd[1][x].g);
902 const acc_t sb04 = p.ch(bufs[0][x].b) + p.ch(fwd[1][x].b);
903 const acc_t sr13 = p.ch(bufs[1][x].r) + p.ch(fwd[0][x].r);
904 const acc_t sg13 = p.ch(bufs[1][x].g) + p.ch(fwd[0][x].g);
905 const acc_t sb13 = p.ch(bufs[1][x].b) + p.ch(fwd[0][x].b);
// Centre tap (bufs[2]) carries weight 6.
906 r = sr04 + sr13 * 4 + p.ch(bufs[2][x].r) * 6;
907 g = sg04 + sg13 * 4 + p.ch(bufs[2][x].g) * 6;
908 b = sb04 + sb13 * 4 + p.ch(bufs[2][x].b) * 6;
909 }
910 };
911
// Radius 3: weights [1,6,15,20,15,6,1] over bufs[0..3] and fwd[0..2] at
// column x. Sums are unnormalized (weight total 64).
912 template <typename RGB_T, typename acc_t>
913 struct vpass_pixel_kernel<3, RGB_T, acc_t> {
// r/g/b are outputs and are fully overwritten.
914 FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x,
915 acc_t &r, acc_t &g, acc_t &b) {
// NOTE(review): listing line 916 is absent from this view; it presumably
// declares `p` (a pixel_ops<RGB_T> helper) - confirm against the real file.
// Symmetric tap pairs: 0+6 (weight 1), 1+5 (weight 6), 2+4 (weight 15).
917 const acc_t sr06 = p.ch(bufs[0][x].r) + p.ch(fwd[2][x].r);
918 const acc_t sg06 = p.ch(bufs[0][x].g) + p.ch(fwd[2][x].g);
919 const acc_t sb06 = p.ch(bufs[0][x].b) + p.ch(fwd[2][x].b);
920 const acc_t sr15 = p.ch(bufs[1][x].r) + p.ch(fwd[1][x].r);
921 const acc_t sg15 = p.ch(bufs[1][x].g) + p.ch(fwd[1][x].g);
922 const acc_t sb15 = p.ch(bufs[1][x].b) + p.ch(fwd[1][x].b);
923 const acc_t sr24 = p.ch(bufs[2][x].r) + p.ch(fwd[0][x].r);
924 const acc_t sg24 = p.ch(bufs[2][x].g) + p.ch(fwd[0][x].g);
925 const acc_t sb24 = p.ch(bufs[2][x].b) + p.ch(fwd[0][x].b);
// Centre tap (bufs[3]) carries weight 20.
926 r = sr06 + sr15 * 6 + sr24 * 15 + p.ch(bufs[3][x].r) * 20;
927 g = sg06 + sg15 * 6 + sg24 * 15 + p.ch(bufs[3][x].g) * 20;
928 b = sb06 + sb15 * 6 + sb24 * 15 + p.ch(bufs[3][x].b) * 20;
929 }
930 };
931
// Radius 4: weights [1,8,28,56,70,56,28,8,1] over bufs[0..4] and fwd[0..3]
// at column x. Sums are unnormalized (weight total 256).
932 template <typename RGB_T, typename acc_t>
933 struct vpass_pixel_kernel<4, RGB_T, acc_t> {
// r/g/b are outputs and are fully overwritten.
934 FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x,
935 acc_t &r, acc_t &g, acc_t &b) {
// NOTE(review): listing line 936 is absent from this view; it presumably
// declares `p` (a pixel_ops<RGB_T> helper) - confirm against the real file.
// Symmetric tap pairs: 0+8 (weight 1), 1+7 (weight 8), 2+6 (weight 28),
// 3+5 (weight 56).
937 const acc_t sr08 = p.ch(bufs[0][x].r) + p.ch(fwd[3][x].r);
938 const acc_t sg08 = p.ch(bufs[0][x].g) + p.ch(fwd[3][x].g);
939 const acc_t sb08 = p.ch(bufs[0][x].b) + p.ch(fwd[3][x].b);
940 const acc_t sr17 = p.ch(bufs[1][x].r) + p.ch(fwd[2][x].r);
941 const acc_t sg17 = p.ch(bufs[1][x].g) + p.ch(fwd[2][x].g);
942 const acc_t sb17 = p.ch(bufs[1][x].b) + p.ch(fwd[2][x].b);
943 const acc_t sr26 = p.ch(bufs[2][x].r) + p.ch(fwd[1][x].r);
944 const acc_t sg26 = p.ch(bufs[2][x].g) + p.ch(fwd[1][x].g);
945 const acc_t sb26 = p.ch(bufs[2][x].b) + p.ch(fwd[1][x].b);
946 const acc_t sr35 = p.ch(bufs[3][x].r) + p.ch(fwd[0][x].r);
947 const acc_t sg35 = p.ch(bufs[3][x].g) + p.ch(fwd[0][x].g);
948 const acc_t sb35 = p.ch(bufs[3][x].b) + p.ch(fwd[0][x].b);
// Centre tap (bufs[4]) carries weight 70.
949 r = sr08 + sr17 * 8 + sr26 * 28 + sr35 * 56 + p.ch(bufs[4][x].r) * 70;
950 g = sg08 + sg17 * 8 + sg26 * 28 + sg35 * 56 + p.ch(bufs[4][x].g) * 70;
951 b = sb08 + sb17 * 8 + sb26 * 28 + sb35 * 56 + p.ch(bufs[4][x].b) * 70;
952 }
953 };
954
955#endif // !FL_IS_AVR
956
957// ── Row-major vertical pass (non-AVR) ──────────────────────────────────
958// Processes vertical convolution in row-major order for cache efficiency.
959// Instead of gathering individual columns into a linear buffer (strided
960// reads + writes), iterates row by row with sequential memory access.
961// Uses a ring buffer of R+1 saved rows to hold originals of overwritten rows.
962// scratch must have at least (R+2)*w elements.
963#if !defined(FL_IS_AVR)
964
// In-place vertical convolution of radius R over a w*h image, walking the
// image row by row.
// NOTE(review): listing lines 966-967 (attributes, return type and function
// name) are absent from this view. The file's symbol index gives the
// signature as
//   static FL_OPTIMIZE_FUNCTION void vpass_rowmajor_impl(
//       RGB_T *pixels, int w, int h, RGB_T *scratch, AlphaT alpha)
// Parameters:
//   pixels  - image; row y starts at pixels + y*w and is overwritten.
//   w, h    - image width and height in pixels.
//   scratch - workspace of at least (R+2)*w elements: R+1 ring rows followed
//             by one all-zero row.
//   alpha   - forwarded to P::make() only when ApplyAlpha is true.
// bufs[] has 5 slots and fwd[] has 4, so R must not exceed 4.
965 template <int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
968 RGB_T *pixels, int w, int h,
969 RGB_T *scratch, AlphaT alpha)
970 {
// Right shift applied to the accumulators in the generic path.
971 constexpr int shift = 2 * R;
972 using P = pixel_ops<RGB_T>;
973
974 // Ring buffer: bufs[0..R-1] = saved previous rows, bufs[R] = save slot.
975 // Extra zero_row for bottom-boundary padding.
976 RGB_T *bufs[5] = {nullptr, nullptr, nullptr, nullptr, nullptr};
977 for (int i = 0; i <= R; ++i)
978 bufs[i] = scratch + i * w;
979 RGB_T *zero_row = scratch + (R + 1) * w;
980
981 // Zero all: first R buffers (top-boundary padding) + zero_row.
982 FL_BUILTIN_MEMSET(scratch, 0, (R + 2) * w * sizeof(RGB_T));
983
984 for (int y = 0; y < h; ++y) {
985 RGB_T *out_row = pixels + y * w;
986
987 // Save current row before we overwrite it.
988 FL_BUILTIN_MEMCPY(bufs[R], out_row, w * sizeof(RGB_T));
989
990 // Forward row pointers (rows y+1 .. y+R, or zero_row if OOB).
// Forward rows are read straight from `pixels`: they have not been
// overwritten yet because rows are processed in increasing y.
991 const RGB_T *fwd[4] = {zero_row, zero_row, zero_row, zero_row};
992 for (int k = 0; k < R; ++k)
993 fwd[k] = (y + 1 + k < h) ? (pixels + (y + 1 + k) * w) : zero_row;
994
995 // Prefetch the furthest-ahead row needed by the NEXT iteration.
996 // At row y, next iteration needs fwd[R-1] = row y+1+R.
997 // Prefetching 2 rows ahead gives the memory subsystem time to fetch.
// NOTE(review): __builtin_prefetch is a GCC/Clang builtin and is used here
// without a visible compiler guard - confirm all non-AVR toolchains provide it.
998 {
999 const int prefetch_y = y + R + 2;
1000 if (prefetch_y < h) {
1001 const char *pf = (const char *)(pixels + prefetch_y * w);
1002 const int row_bytes = w * (int)sizeof(RGB_T);
// One prefetch every 64 bytes (presumably one per cache line).
1003 for (int off = 0; off < row_bytes; off += 64)
1004 __builtin_prefetch(pf + off, 0, 3);
1005 }
1006 }
1007
1008 // Process all pixels in this output row.
1009 // For u8-channel types (CRGB), process as raw byte stream — all
1010 // channels use the same kernel weights, so we treat the row as a
1011 // flat u8 array of w*sizeof(RGB_T) bytes. This produces a simpler
1012 // loop that the compiler can optimize better at low -O levels.
// Both operands of this condition are compile-time constants.
1013 if (sizeof(typename RGB_T::fp) == 1 && !ApplyAlpha) {
1014 // Raw byte fast path (CRGB without alpha).
1015 const int nbytes = w * (int)sizeof(RGB_T);
1016 u8 *ob = (u8 *)out_row;
1017
// The byte kernel writes final values directly; no shift is applied here.
1018 simd_vconv_dispatch<R>::apply(bufs, fwd, ob, nbytes);
1019 } else {
1020 // Generic path: per-pixel struct access (CRGB16 or alpha case).
1021 for (int x = 0; x < w; ++x) {
// Filled by the kernel call below before being read.
1022 acc_t r, g, b;
1023
1024 vpass_pixel_kernel<R, RGB_T, acc_t>::apply(bufs, fwd, x, r, g, b);
1025
// Normalize by >> shift, then build the pixel with or without alpha.
1026 if (ApplyAlpha) {
1027 out_row[x] = P::make(static_cast<acc_t>(r >> shift),
1028 static_cast<acc_t>(g >> shift),
1029 static_cast<acc_t>(b >> shift), alpha);
1030 } else {
1031 out_row[x] = P::make(static_cast<acc_t>(r >> shift),
1032 static_cast<acc_t>(g >> shift),
1033 static_cast<acc_t>(b >> shift));
1034 }
1035 }
1036 }
1037
1038 // Rotate ring buffer: discard oldest, current becomes newest saved.
// Only the pointers move; the old bufs[0] storage becomes the next save slot.
1039 RGB_T *recycled = bufs[0];
1040 for (int i = 0; i < R; ++i) bufs[i] = bufs[i + 1];
1041 bufs[R] = recycled;
1042 }
1043 }
1044
1045#endif // !FL_IS_AVR
1046
1047// ── Helper: pad buffer size calculation ─────────────────────────────────
// Returns the number of RGB_T elements the shared pad/scratch buffer needs so
// that it can serve both blur passes: the larger of the horizontal and the
// vertical requirement.
//
//   hR, vR - horizontal / vertical blur radius (compile-time).
//   RGB_T  - pixel type; not used in the computation.
//   w, h   - image width and height in pixels.
template <int hR, int vR, typename RGB_T>
static int compute_pad_size(int w, int h) {
    // Horizontal pass: one row plus hR padding pixels on each side.
    const int hPad = 2 * hR + w;
#if defined(FL_IS_AVR)
    // AVR vertical pass: one column plus vR padding pixels on each end.
    const int vPad = 2 * vR + h;
#else
    // Non-AVR vertical pass works on whole rows: vR+1 ring rows plus one zero
    // row, each w pixels wide. The image height does not enter the size, so
    // discard it explicitly to avoid an unused-parameter warning.
    (void)h;
    const int vPad = vR > 0 ? (vR + 2) * w : 0;
#endif
    return hPad > vPad ? hPad : vPad;
}
1058
1059// ── Helper: horizontal pass for one padded row ─────────────────────────
1060// Dispatches to the appropriate kernel based on platform and pixel type.
// Horizontal convolution of radius R for one row.
//   pad   - input row; the SIMD branch reads taps from pad up to pad + 2*R
//           pixels further on, so it must hold w + 2*R pixels.
//   out   - destination for w pixels.
//   w     - row width in pixels.
//   alpha - used only when ApplyAlpha is true.
// NOTE(review): listing line 1062 is absent from this view; the file's symbol
// index shows the function declared as FL_ALWAYS_INLINE void hpass_row(...).
1061 template <int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
1063 void hpass_row(RGB_T *pad, RGB_T *out, int w, AlphaT alpha) {
1064 #if defined(FL_IS_AVR)
// AVR: per-channel kernels; the literal 1 is the output stride argument.
1065 if (ApplyAlpha)
1066 apply_pass_alpha_1ch<R>(pad, out, w, 1, alpha);
1067 else
1068 apply_pass_1ch<R>(pad, out, w, 1);
1069 #else
1070 // SIMD fast path: u8-channel (CRGB), no alpha on this pass.
1071 if (sizeof(typename RGB_T::fp) == 1 && !ApplyAlpha) {
// S is the pixel size in bytes, passed as the spacing between kernel taps.
1072 constexpr int S = (int)sizeof(RGB_T);
1073 const int nbytes = w * S;
1074 const u8 *pb = (const u8 *)pad;
1075 u8 *ob = (u8 *)out;
// NOTE(review): listing line 1076 is absent from this view; presumably it is
// the head of the call, `simd_hconv_dispatch<R>::apply(`. The fifth argument
// points one element past the padded row (pad + 2*R + w); none of the
// simd_hconv_dispatch specializations use it.
1077 pb, S, ob, nbytes, (u8 *)(pad + 2 * R + w), w);
1078 } else if (ApplyAlpha) {
1079 apply_pass_alpha<R, RGB_T, acc_t>(pad, out, w, 1, alpha);
1080 } else {
1081 apply_pass<R, RGB_T, acc_t>(pad, out, w, 1);
1082 }
1083 #endif
1084 }
1085
1086// ── Helper: vertical pass over entire image ─────────────────────────────
// Vertical convolution of radius R over the whole w*h image, in place.
//   pixels  - image; row y starts at pixels + y*w.
//   scratch - workspace. The AVR branch uses 2*R + h elements of it.
//   alpha   - used only when ApplyAlpha is true.
1087 template <int R, typename RGB_T, typename acc_t, bool ApplyAlpha, typename AlphaT>
1088 static void vpass_full(RGB_T *pixels, int w, int h, RGB_T *scratch, AlphaT alpha) {
1089 #if defined(FL_IS_AVR)
1090 // AVR: column-by-column with per-channel noinline + O3.
// Zero the R padding pixels before and after the column once; the loop below
// only rewrites scratch[R .. R+h-1].
1091 FL_BUILTIN_MEMSET(scratch, 0, R * sizeof(RGB_T));
1092 FL_BUILTIN_MEMSET(scratch + R + h, 0, R * sizeof(RGB_T));
1093
1094 for (int x = 0; x < w; ++x) {
1095 // Linearize column into padded region.
1096 {
1097 const RGB_T *src = pixels + x;
1098 RGB_T *dst = scratch + R;
1099 for (int i = 0; i < h; ++i) {
1100 *dst++ = *src;
// Advance by one full row to reach the same column in the next row.
1101 src += w;
1102 }
1103 }
// Results go back into column x: h outputs with output stride w.
1104 if (ApplyAlpha)
1105 apply_pass_alpha_1ch<R>(scratch, pixels + x, h, w, alpha);
1106 else
1107 apply_pass_1ch<R>(scratch, pixels + x, h, w);
1108 }
1109 #else
1110 // Non-AVR: row-major vertical pass for cache efficiency.
1111 // Direct R=2 V-pass using exact [1,4,6,4,1] kernel (no cascaded R=1).
// NOTE(review): listing lines 1113 and 1116 are absent from this view; each
// is presumably the head of a vpass_rowmajor_impl<...>( call, with
// ApplyAlpha true in the first branch and false in the second. The "R=2"
// remark above looks specific to one radius while this function is generic
// in R - confirm whether it is still accurate.
1112 if (ApplyAlpha) {
1114 pixels, w, h, scratch, alpha);
1115 } else {
1117 pixels, w, h, scratch, alpha);
1118 }
1119 #endif
1120 }
1121
1122} // namespace blur_detail
1123
1124// Separable Gaussian blur: horizontal pass then vertical pass.
1125// Zero-padding approach: copy row/column to a padded buffer with zeros,
1126// then apply the fast unrolled interior_row kernel to ALL positions.
1127// This eliminates slow edge handling and reuses interior_row for both passes.
1128// Dim (alpha) is applied once at the final output.
// Shared implementation behind the Canvas blurGaussian overloads: blurs
// canvas.pixels in place, horizontal pass first, then vertical pass.
//   hRadius, vRadius - compile-time blur radii; 0 disables that pass.
//   alpha            - dim factor; skipped when equal to alpha_identity<AlphaT>().
// A canvas with non-positive width or height is left untouched.
// NOTE(review): several listing lines are absent from this view
// (1130, 1137, 1157-1158, 1173, 1176, 1184, 1187); see the notes below.
1129 template <int hRadius, int vRadius, typename RGB_T, typename AlphaT>
// NOTE(review): line 1130 missing; the symbol index shows
// `FL_OPTIMIZE_FUNCTION void blurGaussianImpl(...)`.
1131 void blurGaussianImpl(Canvas<RGB_T> &canvas, AlphaT alpha) {
1132 const int w = canvas.width;
1133 const int h = canvas.height;
1134 if (w <= 0 || h <= 0)
1135 return;
1136
// NOTE(review): line 1137 missing; presumably it introduces the alias `P`
// used below (a pixel_ops type) - confirm.
1138
1139 // Accumulator: u16 for 8-bit channels (max per-pass sum: 255*256=65280),
1140 // u32 for wider channels.
1141 using acc_t = fl::conditional_t<sizeof(typename RGB_T::fp) == 1, u16, u32>;
1142
1143 const bool applyAlpha = !(alpha == blur_detail::alpha_identity<AlphaT>());
1144
1145 // Handle no-blur case (radius 0 in both dimensions).
1146 if (hRadius == 0 && vRadius == 0) {
// With no blur the only possible work is dimming every pixel by alpha.
1147 if (applyAlpha) {
1148 RGB_T *pixels = canvas.pixels;
1149 for (int i = 0; i < w * h; ++i) {
1150 RGB_T &p = pixels[i];
1151 p = P::make(P::ch(p.r), P::ch(p.g), P::ch(p.b), alpha);
1152 }
1153 }
1154 return;
1155 }
1156
// NOTE(review): lines 1157-1158 missing; presumably they obtain `padbuf`
// (compute_pad_size / get_padbuf appear in the file's symbol index) - confirm.
1159 RGB_T *pad = padbuf.data();
1160 RGB_T *pixels = canvas.pixels;
1161
1162 // ── Horizontal pass ──────────────────────────────────────────────
1163 if (hRadius > 0) {
1164 // Zero the fixed padding regions once (reused for every row).
1165 FL_BUILTIN_MEMSET(pad, 0, hRadius * sizeof(RGB_T));
1166 FL_BUILTIN_MEMSET(pad + hRadius + w, 0, hRadius * sizeof(RGB_T));
1167
1168 for (int y = 0; y < h; ++y) {
1169 RGB_T *row = pixels + y * w;
// Copy the row between the two zeroed padding regions, then convolve it
// back into the image.
1170 FL_BUILTIN_MEMCPY(pad + hRadius, row, w * sizeof(RGB_T));
1171
// Alpha is applied in this pass only when no vertical pass follows.
// NOTE(review): lines 1173 and 1176 missing; presumably the heads of the two
// hpass_row<...>( calls (with and without alpha) - confirm.
1172 if (vRadius == 0 && applyAlpha)
1174 pad, row, w, alpha);
1175 else
1177 pad, row, w, alpha);
1178 }
1179 }
1180
1181 // ── Vertical pass ──────────────────────────────────────────────────
// The same pad buffer is reused as scratch for the vertical pass.
// NOTE(review): lines 1184 and 1187 missing; presumably the heads of the two
// vpass_full<...>( calls (with and without alpha) - confirm.
1182 if (vRadius > 0) {
1183 if (applyAlpha)
1185 pixels, w, h, pad, alpha);
1186 else
1188 pixels, w, h, pad, alpha);
1189 }
1190 }
1191
1192// ── alpha8 overload (UNORM8 dim) ─────────────────────────────────────────
1193
1194template <int hRadius, int vRadius, typename RGB_T>
1195void blurGaussian(Canvas<RGB_T> &canvas, alpha8 dimFactor) {
1196 blurGaussianImpl<hRadius, vRadius>(canvas, dimFactor);
1197}
1198
1199// ── alpha16 overload (UNORM16 dim — true 16-bit precision) ───────────────
1200
1201template <int hRadius, int vRadius, typename RGB_T>
1202void blurGaussian(Canvas<RGB_T> &canvas, alpha16 dimFactor) {
1203 blurGaussianImpl<hRadius, vRadius>(canvas, dimFactor);
1204}
1205
1206// ── CanvasMapped: XYMap-based Gaussian blur ──────────────────────────────
1207// Delegates to the optimized Canvas path when the XYMap is rectangular.
1208// Falls back to per-pixel gather/scatter through XYMap otherwise.
// Shared implementation behind the CanvasMapped blurGaussian overloads.
// Pixels are addressed through canvas.at(x, y). When the XYMap reports a
// rectangular grid the work is handed to the plain Canvas path; otherwise
// each row / column is gathered into `pad`, convolved, and scattered back.
//   hRadius, vRadius - compile-time blur radii; 0 disables that pass.
//   alpha            - dim factor; skipped when equal to alpha_identity<AlphaT>().
// A canvas with non-positive width or height is left untouched.
// NOTE(review): several listing lines are absent from this view
// (1210, 1220, 1225, 1244-1245, 1260, 1287); see the notes below.
1209 template <int hRadius, int vRadius, typename RGB_T, typename AlphaT>
// NOTE(review): line 1210 missing; the symbol index shows
// `FL_OPTIMIZE_FUNCTION void blurGaussianMappedImpl(...)`.
1211 void blurGaussianMappedImpl(CanvasMapped<RGB_T> &canvas, AlphaT alpha) {
1212 const int w = canvas.width;
1213 const int h = canvas.height;
1214 if (w <= 0 || h <= 0)
1215 return;
1216
1217 // Fast path: rectangular XYMap → delegate to optimized Canvas blur.
1218 if (canvas.xymap->isRectangularGrid()) {
1219 Canvas<RGB_T> rect(canvas.pixels, w, h);
// NOTE(review): line 1220 missing; presumably the call that blurs `rect`
// (e.g. blurGaussianImpl<hRadius, vRadius>(rect, alpha)) - confirm.
1221 return;
1222 }
1223
1224 // Slow path: non-rectangular XYMap → per-pixel gather/scatter.
// NOTE(review): line 1225 missing; presumably it introduces the alias `P`
// used below (a pixel_ops type) - confirm.
1226 using acc_t = fl::conditional_t<sizeof(typename RGB_T::fp) == 1, u16, u32>;
1227
1228 const bool applyAlpha = !(alpha == blur_detail::alpha_identity<AlphaT>());
1229
1230 // Handle no-blur case.
1231 if (hRadius == 0 && vRadius == 0) {
// With no blur the only possible work is dimming every pixel by alpha.
1232 if (applyAlpha) {
1233 for (int y = 0; y < h; ++y) {
1234 for (int x = 0; x < w; ++x) {
1235 RGB_T &p = canvas.at(x, y);
1236 P ops;
1237 p = ops.make(ops.ch(p.r), ops.ch(p.g), ops.ch(p.b), alpha);
1238 }
1239 }
1240 }
1241 return;
1242 }
1243
// NOTE(review): lines 1244-1245 missing; presumably they obtain `padbuf`.
// The code below indexes pad up to 2*hRadius + w and 2*vRadius + h elements,
// so the buffer must be at least that large - confirm the size requested.
1246 RGB_T *pad = padbuf.data();
1247
1248 // ── Horizontal pass: gather row via XYMap, convolve, scatter back ──
1249 if (hRadius > 0) {
// Zero the padding on both sides of the row once; only the middle w entries
// are rewritten per row.
1250 FL_BUILTIN_MEMSET(pad, 0, hRadius * sizeof(RGB_T));
1251 FL_BUILTIN_MEMSET(pad + hRadius + w, 0, hRadius * sizeof(RGB_T));
1252
1253 for (int y = 0; y < h; ++y) {
1254 for (int x = 0; x < w; ++x)
1255 pad[hRadius + x] = canvas.at(x, y);
1256
1257 constexpr int shift = 2 * hRadius;
1258 for (int x = 0; x < w; ++x) {
1259 acc_t r, g, b;
// NOTE(review): line 1260 missing; presumably the kernel call that fills
// r, g, b from `pad` at position x - confirm.
1261 RGB_T result;
// Alpha is applied in this pass only when no vertical pass follows.
1262 if (vRadius == 0 && applyAlpha)
1263 result = P().make(static_cast<acc_t>(r >> shift),
1264 static_cast<acc_t>(g >> shift),
1265 static_cast<acc_t>(b >> shift), alpha);
1266 else
1267 result = P().make(static_cast<acc_t>(r >> shift),
1268 static_cast<acc_t>(g >> shift),
1269 static_cast<acc_t>(b >> shift));
1270 canvas.at(x, y) = result;
1271 }
1272 }
1273 }
1274
1275 // ── Vertical pass: gather column via XYMap, convolve, scatter back ──
1276 if (vRadius > 0) {
// Same scheme as the horizontal pass, with columns of height h.
1277 FL_BUILTIN_MEMSET(pad, 0, vRadius * sizeof(RGB_T));
1278 FL_BUILTIN_MEMSET(pad + vRadius + h, 0, vRadius * sizeof(RGB_T));
1279
1280 for (int x = 0; x < w; ++x) {
1281 for (int y = 0; y < h; ++y)
1282 pad[vRadius + y] = canvas.at(x, y);
1283
1284 constexpr int shift = 2 * vRadius;
1285 for (int y = 0; y < h; ++y) {
1286 acc_t r, g, b;
// NOTE(review): line 1287 missing; presumably the kernel call that fills
// r, g, b from `pad` at position y - confirm.
1288 RGB_T result;
1289 if (applyAlpha)
1290 result = P().make(static_cast<acc_t>(r >> shift),
1291 static_cast<acc_t>(g >> shift),
1292 static_cast<acc_t>(b >> shift), alpha);
1293 else
1294 result = P().make(static_cast<acc_t>(r >> shift),
1295 static_cast<acc_t>(g >> shift),
1296 static_cast<acc_t>(b >> shift));
1297 canvas.at(x, y) = result;
1298 }
1299 }
1300 }
1301 }
1302
1303// ── CanvasMapped alpha8 overload ─────────────────────────────────────────
1304
// CanvasMapped overload taking an 8-bit (UNORM8) dim factor.
// NOTE(review): listing line 1307 (the whole function body) is absent from
// this view; presumably it forwards to blurGaussianMappedImpl - confirm.
1305 template <int hRadius, int vRadius, typename RGB_T>
1306 void blurGaussian(CanvasMapped<RGB_T> &canvas, alpha8 dimFactor) {
1308 }
1309
1310// ── CanvasMapped alpha16 overload ────────────────────────────────────────
1311
// CanvasMapped overload taking a 16-bit (UNORM16) dim factor.
// NOTE(review): listing line 1314 (the whole function body) is absent from
// this view; presumably it forwards to blurGaussianMappedImpl - confirm.
1312 template <int hRadius, int vRadius, typename RGB_T>
1313 void blurGaussian(CanvasMapped<RGB_T> &canvas, alpha16 dimFactor) {
1315 }
1316
1317// ── Explicit instantiations: alpha8 overload ─────────────────────────────
1318
// Expands to an explicit instantiation of the Canvas/alpha8 blurGaussian
// overload for horizontal radius H, vertical radius V and pixel type T.
// NOTE(review): the lines that invoke this macro are absent from this view.
1319 #define BLUR_INST_F8(H, V, T) \
1320 template void blurGaussian<H, V, T>(Canvas<T> &, alpha8);
1321
1322// CRGB — symmetric, h-only, v-only, asymmetric.
1330
1331// CRGB16 — same combos (not available on AVR due to RAM constraints).
1332#if !defined(FL_IS_AVR)
1340#endif
1341
1342#undef BLUR_INST_F8
1343
1344// ── Explicit instantiations: alpha16 overload ────────────────────────────
1345
// Expands to an explicit instantiation of the Canvas/alpha16 blurGaussian
// overload for horizontal radius H, vertical radius V and pixel type T.
// NOTE(review): the lines that invoke this macro are absent from this view.
1346 #define BLUR_INST_F16(H, V, T) \
1347 template void blurGaussian<H, V, T>(Canvas<T> &, alpha16);
1348
1349// CRGB
1357
1358// CRGB16
1359#if !defined(FL_IS_AVR)
1367#endif
1368
1369#undef BLUR_INST_F16
1370
1371// ── Explicit instantiations: CanvasMapped alpha8 ─────────────────────────
1372
// Expands to an explicit instantiation of the CanvasMapped/alpha8
// blurGaussian overload for radii H, V and pixel type T.
// NOTE(review): the lines that invoke this macro are absent from this view.
1373 #define BLUR_MAPPED_INST_F8(H, V, T) \
1374 template void blurGaussian<H, V, T>(CanvasMapped<T> &, alpha8);
1375
1383
1384#if !defined(FL_IS_AVR)
1392#endif
1393
1394#undef BLUR_MAPPED_INST_F8
1395
1396// ── Explicit instantiations: CanvasMapped alpha16 ────────────────────────
1397
// Expands to an explicit instantiation of the CanvasMapped/alpha16
// blurGaussian overload for radii H, V and pixel type T.
// NOTE(review): the lines that invoke this macro are absent from this view.
1398 #define BLUR_MAPPED_INST_F16(H, V, T) \
1399 template void blurGaussian<H, V, T>(CanvasMapped<T> &, alpha16);
1400
1408
1409#if !defined(FL_IS_AVR)
1417#endif
1418
1419#undef BLUR_MAPPED_INST_F16
1420
1421} // namespace gfx
1422} // namespace fl
1423
fl::XYMap xyMap
fl::CRGB leds[NUM_LEDS]
XYMap xymap
int y
Definition simple.h:93
int x
Definition simple.h:92
FL_OPTIMIZATION_LEVEL_O3_BEGIN fl::u16 XY(fl::u8 x, fl::u8 y) FL_LINK_WEAK
Definition blur.cpp.hpp:35
static XYMap constructWithUserFunction(u16 width, u16 height, XYFunction xyFunction, u16 offset=0) FL_NOEXCEPT
Definition xymap.cpp.hpp:27
bool isRectangularGrid() const FL_NOEXCEPT
Definition xymap.h:114
const T * data() const FL_NOEXCEPT
Definition span.h:461
constexpr u16 raw() const FL_NOEXCEPT
Definition u8x8.h:59
static constexpr FASTLED_FORCE_INLINE u8x8 from_raw(u16 raw) FL_NOEXCEPT
Definition u8x8.h:53
fl::size size() const FL_NOEXCEPT
void resize(fl::size n) FL_NOEXCEPT
Definition vector.h:593
Defines the 8-bit red, green, and blue (RGB) pixel type in the fl namespace.
Legacy compatibility header for 8-bit scaling functions.
Internal FastLED header for implementation files.
void blur2d(fl::span< CRGB > leds, fl::u8 width, fl::u8 height, fract8 blur_amount, const XYMap &xymap)
Two-dimensional blur filter (span version).
Definition blur.cpp.hpp:87
void blur1d(fl::span< CRGB > leds, fract8 blur_amount)
One-dimensional blur filter (span version).
Definition blur.cpp.hpp:69
void blurRows(fl::span< CRGB > leds, fl::u8 width, fl::u8 height, fract8 blur_amount, const XYMap &xyMap)
Perform a blur1d() on every row of a rectangular matrix (span version).
Definition blur.cpp.hpp:133
void blurColumns(fl::span< CRGB > leds, fl::u8 width, fl::u8 height, fract8 blur_amount, const XYMap &xyMap)
Perform a blur1d() on every column of a rectangular matrix (span version).
Definition blur.cpp.hpp:172
fl::CRGB CRGB
Definition crgb.h:25
#define FL_ERROR(X)
Definition log.h:219
Centralized logging categories for FastLED hardware interfaces and subsystems.
fl::u16 xy_legacy_wrapper(fl::u16 x, fl::u16 y, fl::u16 width, fl::u16 height)
Definition blur.cpp.hpp:46
unsigned char u8
Definition s16x16x4.h:132
u8 fract8
Fixed-Point Fractional Types.
Definition s16x16x4.h:161
typename conditional< B, T, F >::type conditional_t
Definition s16x16x4.h:115
static void vpass_full(RGB_T *pixels, int w, int h, RGB_T *scratch, AlphaT alpha)
static FL_OPTIMIZE_FUNCTION void vpass_rowmajor_impl(RGB_T *pixels, int w, int h, RGB_T *scratch, AlphaT alpha)
Definition blur.cpp.hpp:967
constexpr AlphaT alpha_identity()
FL_ALWAYS_INLINE void hpass_row(RGB_T *pad, RGB_T *out, int w, AlphaT alpha)
static void simd_conv_121(const u8 *FL_RESTRICT_PARAM a, const u8 *FL_RESTRICT_PARAM b, const u8 *FL_RESTRICT_PARAM c, u8 *FL_RESTRICT_PARAM out, int nbytes)
Definition blur.cpp.hpp:605
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void apply_pass_alpha(const RGB_T *pad, RGB_T *out, int count, int stride, AlphaT alpha)
Definition blur.cpp.hpp:578
static void simd_conv_r3(const u8 *p0, const u8 *p1, const u8 *p2, const u8 *p3, const u8 *p4, const u8 *p5, const u8 *p6, u8 *out, int nbytes)
Definition blur.cpp.hpp:674
static fl::span< RGB_T > get_padbuf(int minSize)
Definition blur.cpp.hpp:349
static void simd_conv_14641(const u8 *p0, const u8 *p1, const u8 *p2, const u8 *p3, const u8 *p4, u8 *out, int nbytes)
Definition blur.cpp.hpp:633
static int compute_pad_size(int w, int h)
constexpr alpha8 alpha_identity< alpha8 >()
Definition blur.cpp.hpp:289
static void simd_conv_r4(const u8 *p0, const u8 *p1, const u8 *p2, const u8 *p3, const u8 *p4, const u8 *p5, const u8 *p6, const u8 *p7, const u8 *p8, u8 *out, int nbytes)
Definition blur.cpp.hpp:724
constexpr alpha16 alpha_identity< alpha16 >()
Definition blur.cpp.hpp:290
FL_NO_INLINE_IF_AVR static FL_OPTIMIZE_FUNCTION void apply_pass(const RGB_T *pad, RGB_T *out, int count, int stride)
Definition blur.cpp.hpp:563
CRGB BLUR_INST_F8(2, 2, CRGB) BLUR_INST_F8(3
void blurGaussian(Canvas< RGB_T > &canvas, alpha8 dimFactor)
Compile-time Gaussian blur with independent H/V radii.
CRGB BLUR_INST_F16(2, 2, CRGB) BLUR_INST_F16(3
FL_OPTIMIZE_FUNCTION void blurGaussianImpl(Canvas< RGB_T > &canvas, AlphaT alpha)
CRGB BLUR_MAPPED_INST_F16(2, 2, CRGB) BLUR_MAPPED_INST_F16(3
CRGB BLUR_MAPPED_INST_F8(2, 2, CRGB) BLUR_MAPPED_INST_F8(3
FL_OPTIMIZE_FUNCTION void blurGaussianMappedImpl(CanvasMapped< RGB_T > &canvas, AlphaT alpha)
unsigned char u8
Definition stdint.h:131
fl::CRGB CRGB
Definition video.h:15
u8 u8 height
Definition blur.h:186
FASTLED_FORCE_INLINE fl::u8 P(fl::u8 x)
u8 u8 fract8 blur_amount
Definition blur.h:186
u8 width
Definition blur.h:186
expected< T, E > result
Alias for expected (Rust-style naming)
Definition result.h:31
Base definition for an LED controller.
Definition crgb.hpp:179
#define FL_OPTIMIZATION_LEVEL_O3_BEGIN
#define FL_ALWAYS_INLINE
#define FL_BUILTIN_MEMCPY(dest, src, n)
#define FL_NO_INLINE
#define FL_NO_INLINE_IF_AVR
#define FL_OPTIMIZATION_LEVEL_O3_END
#define FASTLED_UNUSED(x)
#define FL_OPTIMIZE_FUNCTION
#define FL_BUILTIN_MEMSET(dest, val, n)
#define FL_RESTRICT_PARAM
#define FL_LINK_WEAK
Umbrella header for SIMD subsystem.
CRGB & nscale8(u8 scaledown) FL_NOEXCEPT
Scale down a RGB to N/256ths of its current brightness, using "plain math" dimming rules.
Definition crgb.cpp.hpp:88
@ Black
<div style='background:#000000;width:4em;height:4em;'></div>
Definition crgb.h:510
Representation of an 8-bit RGB pixel (Red, Green, Blue)
Definition crgb.h:38
unsigned short value
Definition alpha.h:88
Unsigned 16-bit alpha / brightness — UNORM16.
Definition alpha.h:87
unsigned char value
Definition alpha.h:43
Unsigned 8-bit alpha / brightness — UNORM8.
Definition alpha.h:42
RGB_T * pixels
Definition canvas.h:72
Simple rectangular canvas for graphics operations Combines a pixel buffer with dimensions for cache-o...
Definition canvas.h:66
const XYMap * xymap
Definition canvas.h:132
fl::span< RGB_T > pixels
Definition canvas.h:131
RGB_T & at(int x, int y) FL_NOEXCEPT
Definition canvas.h:141
XYMap-backed canvas for non-rectangular or remapped layouts.
Definition canvas.h:130
FL_ALWAYS_INLINE void apply(const RGB_T *row, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:365
FL_ALWAYS_INLINE void apply(const RGB_T *row, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:374
FL_ALWAYS_INLINE void apply(const RGB_T *row, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:387
FL_ALWAYS_INLINE void apply(const RGB_T *row, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:404
FL_ALWAYS_INLINE void apply(const RGB_T *row, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:422
FL_ALWAYS_INLINE u32 ch(u8x8 v)
Definition blur.cpp.hpp:325
FL_ALWAYS_INLINE CRGB16 make(u32 r, u32 g, u32 b, alpha16 a)
Definition blur.cpp.hpp:340
FL_ALWAYS_INLINE CRGB16 make(u32 r, u32 g, u32 b, alpha8 a)
Definition blur.cpp.hpp:334
FL_ALWAYS_INLINE CRGB16 make(u32 r, u32 g, u32 b)
Definition blur.cpp.hpp:328
FL_ALWAYS_INLINE CRGB make(u16 r, u16 g, u16 b)
Definition blur.cpp.hpp:301
FL_ALWAYS_INLINE u16 ch(u8 v)
Definition blur.cpp.hpp:298
FL_ALWAYS_INLINE CRGB make(u16 r, u16 g, u16 b, alpha16 a)
Definition blur.cpp.hpp:314
FL_ALWAYS_INLINE CRGB make(u16 r, u16 g, u16 b, alpha8 a)
Definition blur.cpp.hpp:306
static void apply(const u8 *pb, int, u8 *ob, int nbytes, u8 *, int)
Definition blur.cpp.hpp:837
static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int)
Definition blur.cpp.hpp:843
static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int)
Definition blur.cpp.hpp:849
static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int)
Definition blur.cpp.hpp:858
static void apply(const u8 *pb, int S, u8 *ob, int nbytes, u8 *, int)
Definition blur.cpp.hpp:864
static void apply(RGB_T **bufs, const RGB_T **, u8 *out, int nbytes)
Definition blur.cpp.hpp:790
static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes)
Definition blur.cpp.hpp:797
static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes)
Definition blur.cpp.hpp:805
static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes)
Definition blur.cpp.hpp:814
static void apply(RGB_T **bufs, const RGB_T **fwd, u8 *out, int nbytes)
Definition blur.cpp.hpp:824
FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:875
FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:886
FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:897
FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:914
FL_ALWAYS_INLINE void apply(RGB_T **bufs, const RGB_T **fwd, int x, acc_t &r, acc_t &g, acc_t &b)
Definition blur.cpp.hpp:934