Add composite color strip source type with layer blending

Composite sources stack multiple existing color strip sources as layers with configurable blend modes (Normal, Add, Multiply, Screen) and per-layer opacity. Includes full CRUD, hot-reload, delete protection for referenced layers, and pre-allocated integer blend math at 30 FPS.

Also eliminates per-frame numpy allocations in color_strip_stream, effect_stream, and wled_target_processor (buffer pre-allocation).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 11:01:44 +03:00
parent e5a6eafd09
commit 2657f46e5d
15 changed files with 1042 additions and 144 deletions
--- a/server/src/wled_controller/core/processing/color_strip_stream.py
+++ b/server/src/wled_controller/core/processing/color_strip_stream.py
@@ -28,21 +28,36 @@ from wled_controller.utils.timer import high_resolution_timer
 logger = get_logger(__name__)


-def _apply_saturation(colors: np.ndarray, saturation: float) -> np.ndarray:
-    """Adjust saturation via luminance mixing (Rec.601 weights).
+def _apply_saturation(colors: np.ndarray, saturation: float,
+                      _i32: np.ndarray = None, _i32_gray: np.ndarray = None,
+                      _out: np.ndarray = None) -> np.ndarray:
+    """Adjust saturation via luminance mixing (Rec.601 weights, integer math).

    saturation=1.0: no change
    saturation=0.0: grayscale
    saturation=2.0: double saturation (clipped to 0-255)
+
+    Scratch buffers are allocated here when omitted: _i32 is (N, 3) int32,
+    _i32_gray is (N, 1) int32, _out is (N, 3) uint8. Returns _out.
    """
-    gray = (
-        colors[:, 0].astype(np.int32) * 299
-        + colors[:, 1].astype(np.int32) * 587
-        + colors[:, 2].astype(np.int32) * 114
-    ) // 1000
-    gray = gray[:, np.newaxis]  # (N, 1) for broadcast
-    result = gray + saturation * (colors.astype(np.int32) - gray)
-    return np.clip(result, 0, 255).astype(np.uint8)
+    n = len(colors)
+    if _i32 is None:
+        _i32 = np.empty((n, 3), dtype=np.int32)
+    if _i32_gray is None:
+        _i32_gray = np.empty((n, 1), dtype=np.int32)
+    if _out is None:
+        _out = np.empty((n, 3), dtype=np.uint8)
+
+    sat_int = int(saturation * 256)  # saturation scaled by 256, truncated toward zero
+    np.copyto(_i32, colors, casting='unsafe')  # widen first so the products cannot wrap
+    _i32_gray[:, 0] = (_i32[:, 0] * 299 + _i32[:, 1] * 587 + _i32[:, 2] * 114) // 1000
+    _i32 *= sat_int  # sat * color
+    _i32_gray *= (256 - sat_int)  # (1 - sat) * gray; negative once saturation > 1
+    _i32 += _i32_gray  # the (N, 1) gray column broadcasts over the 3 channels
+    _i32 >>= 8  # remove the x256 scaling
+    np.clip(_i32, 0, 255, out=_i32)
+    np.copyto(_out, _i32, casting='unsafe')  # colors was copied above, so _out may alias it
+    return _out


 def _build_gamma_lut(gamma: float) -> np.ndarray:
@@ -278,6 +293,45 @@ class PictureColorStripStream(ColorStripStream):
        """Background thread: poll source, process, cache colors."""
        cached_frame = None

+        # Scratch buffer pool (pre-allocated, resized when LED count changes)
+        _pool_n = 0
+        _frame_a = _frame_b = None   # double-buffered uint8 output
+        _use_a = True
+        _u16_a = _u16_b = None       # uint16 scratch for smoothing / interp blending
+        _i32 = _i32_gray = None      # int32 scratch for saturation + brightness
+
+        def _blend_u16(a, b, alpha_b, out):
+            """Blend two uint8 arrays: out = ((256-alpha_b)*a + alpha_b*b) >> 8.
+
+            Uses pre-allocated uint16 scratch buffers (_u16_a, _u16_b).
+            """
+            np.copyto(_u16_a, a, casting='unsafe')  # inputs are copied first, so out may alias a or b
+            np.copyto(_u16_b, b, casting='unsafe')
+            _u16_a *= (256 - alpha_b)  # weight of a
+            _u16_b *= alpha_b  # weight of b
+            _u16_a += _u16_b  # at most 255 * 256 = 65280 while 0 <= alpha_b <= 256
+            _u16_a >>= 8  # remove the x256 scaling
+            np.copyto(out, _u16_a, casting='unsafe')
+
+        def _apply_corrections(led_colors, frame_buf):
+            """Apply saturation, then gamma, then brightness to led_colors.
+            Saturation runs in place; gamma rebinds led_colors to the lookup
+            result; brightness writes into frame_buf. Returns the final array.
+            """
+            if self._saturation != 1.0:
+                _apply_saturation(led_colors, self._saturation, _i32, _i32_gray, led_colors)
+            if self._gamma != 1.0:
+                led_colors = self._gamma_lut[led_colors]  # rebinding only: the caller's array is not updated
+            if self._brightness != 1.0:
+                bright_int = int(self._brightness * 256)  # brightness scaled by 256, truncated
+                np.copyto(_i32, led_colors, casting='unsafe')  # widen to int32 before scaling
+                _i32 *= bright_int
+                _i32 >>= 8  # remove the x256 scaling
+                np.clip(_i32, 0, 255, out=_i32)
+                np.copyto(frame_buf, _i32, casting='unsafe')
+                led_colors = frame_buf
+            return led_colors
+
        with high_resolution_timer():
            while self._running:
                loop_start = time.perf_counter()
@@ -293,22 +347,14 @@ class PictureColorStripStream(ColorStripStream):
                            and self._frame_interpolation
                            and self._interp_from is not None
                            and self._interp_to is not None
+                            and _u16_a is not None
                        ):
+                            # Interpolate between previous and current capture
                            t = min(1.0, (loop_start - self._interp_start) / self._interp_duration)
-                            alpha = int(t * 256)
-                            led_colors = (
-                                (256 - alpha) * self._interp_from.astype(np.uint16)
-                                + alpha * self._interp_to.astype(np.uint16)
-                            ) >> 8
-                            led_colors = led_colors.astype(np.uint8)
-                            if self._saturation != 1.0:
-                                led_colors = _apply_saturation(led_colors, self._saturation)
-                            if self._gamma != 1.0:
-                                led_colors = self._gamma_lut[led_colors]
-                            if self._brightness != 1.0:
-                                led_colors = np.clip(
-                                    led_colors.astype(np.float32) * self._brightness, 0, 255
-                                ).astype(np.uint8)
+                            frame_buf = _frame_a if _use_a else _frame_b
+                            _use_a = not _use_a
+                            _blend_u16(self._interp_from, self._interp_to, int(t * 256), frame_buf)
+                            led_colors = _apply_corrections(frame_buf, frame_buf)
                            with self._colors_lock:
                                self._latest_colors = led_colors
                        elapsed = time.perf_counter() - loop_start
@@ -332,14 +378,32 @@ class PictureColorStripStream(ColorStripStream):
                    led_colors = self._pixel_mapper.map_border_to_leds(border_pixels)
                    t2 = time.perf_counter()

-                    # Pad or truncate to match the declared led_count
+                    # Ensure scratch pool is sized for this frame
                    target_count = self._led_count
-                    if target_count > 0 and len(led_colors) != target_count:
-                        if len(led_colors) < target_count:
-                            pad = np.zeros((target_count - len(led_colors), 3), dtype=np.uint8)
-                            led_colors = np.concatenate([led_colors, pad])
+                    _n = target_count if target_count > 0 else len(led_colors)
+                    if _n > 0 and _n != _pool_n:
+                        _pool_n = _n
+                        _frame_a = np.empty((_n, 3), dtype=np.uint8)
+                        _frame_b = np.empty((_n, 3), dtype=np.uint8)
+                        _u16_a = np.empty((_n, 3), dtype=np.uint16)
+                        _u16_b = np.empty((_n, 3), dtype=np.uint16)
+                        _i32 = np.empty((_n, 3), dtype=np.int32)
+                        _i32_gray = np.empty((_n, 1), dtype=np.int32)
+                        self._previous_colors = None
+
+                    # Copy/pad into double-buffered frame (avoids per-frame allocations)
+                    frame_buf = _frame_a if _use_a else _frame_b
+                    _use_a = not _use_a
+                    n_leds = len(led_colors)
+                    if _pool_n > 0:
+                        if n_leds < _pool_n:
+                            frame_buf[:n_leds] = led_colors
+                            frame_buf[n_leds:] = 0
+                        elif n_leds > _pool_n:
+                            frame_buf[:] = led_colors[:_pool_n]
                        else:
-                            led_colors = led_colors[:target_count]
+                            frame_buf[:] = led_colors
+                        led_colors = frame_buf

                    # Update interpolation buffers (raw colors, before corrections)
                    if self._frame_interpolation:
@@ -348,25 +412,22 @@ class PictureColorStripStream(ColorStripStream):
                        self._interp_start = loop_start
                        self._interp_duration = max(interval, 0.001)

-                    # Temporal smoothing
+                    # Temporal smoothing (pre-allocated uint16 scratch)
                    smoothing = self._smoothing
                    if (
                        self._previous_colors is not None
                        and smoothing > 0
                        and len(self._previous_colors) == len(led_colors)
+                        and _u16_a is not None
                    ):
-                        alpha = int(smoothing * 256)
-                        led_colors = (
-                            (256 - alpha) * led_colors.astype(np.uint16)
-                            + alpha * self._previous_colors.astype(np.uint16)
-                        ) >> 8
-                        led_colors = led_colors.astype(np.uint8)
+                        _blend_u16(led_colors, self._previous_colors,
+                                   int(smoothing * 256), led_colors)
                    t3 = time.perf_counter()

-                    # Saturation
+                    # Saturation (pre-allocated int32 scratch)
                    saturation = self._saturation
                    if saturation != 1.0:
-                        led_colors = _apply_saturation(led_colors, saturation)
+                        _apply_saturation(led_colors, saturation, _i32, _i32_gray, led_colors)
                    t4 = time.perf_counter()

                    # Gamma (LUT lookup — O(1) per pixel)
@@ -374,12 +435,16 @@ class PictureColorStripStream(ColorStripStream):
                        led_colors = self._gamma_lut[led_colors]
                    t5 = time.perf_counter()

-                    # Brightness
+                    # Brightness (integer math with pre-allocated int32 scratch)
                    brightness = self._brightness
                    if brightness != 1.0:
-                        led_colors = np.clip(
-                            led_colors.astype(np.float32) * brightness, 0, 255
-                        ).astype(np.uint8)
+                        bright_int = int(brightness * 256)
+                        np.copyto(_i32, led_colors, casting='unsafe')
+                        _i32 *= bright_int
+                        _i32 >>= 8
+                        np.clip(_i32, 0, 255, out=_i32)
+                        np.copyto(frame_buf, _i32, casting='unsafe')
+                        led_colors = frame_buf
                    t6 = time.perf_counter()

                    self._previous_colors = led_colors
@@ -913,6 +978,9 @@ class GradientColorStripStream(ColorStripStream):
        _pool_n = 0
        _buf_a = _buf_b = _scratch_u16 = None
        _use_a = True
+        _wave_i = None              # cached np.arange for wave animation
+        _wave_factors = None        # float32 scratch for wave sin result
+        _wave_u16 = None            # uint16 scratch for wave int factors

        with high_resolution_timer():
            while self._running:
@@ -940,6 +1008,9 @@ class GradientColorStripStream(ColorStripStream):
                        _buf_a = np.empty((n, 3), dtype=np.uint8)
                        _buf_b = np.empty((n, 3), dtype=np.uint8)
                        _scratch_u16 = np.empty((n, 3), dtype=np.uint16)
+                        _wave_i = np.arange(n, dtype=np.float32)
+                        _wave_factors = np.empty(n, dtype=np.float32)
+                        _wave_u16 = np.empty(n, dtype=np.uint16)

                    buf = _buf_a if _use_a else _buf_b
                    _use_a = not _use_a
@@ -963,13 +1034,17 @@ class GradientColorStripStream(ColorStripStream):

                    elif atype == "wave":
                        if n > 1:
-                            i_arr = np.arange(n, dtype=np.float32)
-                            factor = 0.5 * (1 + np.sin(
-                                2 * math.pi * i_arr / n - 2 * math.pi * speed * t * 0.25
-                            ))
-                            int_factors = np.clip(factor * 256, 0, 256).astype(np.uint16)
+                            np.sin(
+                                2 * math.pi * _wave_i / n - 2 * math.pi * speed * t * 0.25,
+                                out=_wave_factors,
+                            )
+                            _wave_factors *= 0.5
+                            _wave_factors += 0.5
+                            np.multiply(_wave_factors, 256, out=_wave_factors)
+                            np.clip(_wave_factors, 0, 256, out=_wave_factors)
+                            np.copyto(_wave_u16, _wave_factors, casting='unsafe')
                            np.copyto(_scratch_u16, base)
-                            _scratch_u16 *= int_factors[:, None]
+                            _scratch_u16 *= _wave_u16[:, None]
                            _scratch_u16 >>= 8
                            np.copyto(buf, _scratch_u16, casting='unsafe')
                            colors = buf
--- a/server/src/wled_controller/core/processing/color_strip_stream_manager.py
+++ b/server/src/wled_controller/core/processing/color_strip_stream_manager.py
@@ -100,12 +100,16 @@ class ColorStripStreamManager:

        # Non-sharable: always create a fresh per-consumer instance
        if not source.sharable:
-            stream_cls = _SIMPLE_STREAM_MAP.get(source.source_type)
-            if not stream_cls:
-                raise ValueError(
-                    f"Unsupported color strip source type '{source.source_type}' for {css_id}"
-                )
-            css_stream = stream_cls(source)
+            if source.source_type == "composite":
+                from wled_controller.core.processing.composite_stream import CompositeColorStripStream
+                css_stream = CompositeColorStripStream(source, self)
+            else:
+                stream_cls = _SIMPLE_STREAM_MAP.get(source.source_type)
+                if not stream_cls:
+                    raise ValueError(
+                        f"Unsupported color strip source type '{source.source_type}' for {css_id}"
+                    )
+                css_stream = stream_cls(source)
            css_stream.start()
            key = f"{css_id}:{consumer_id}" if consumer_id else css_id
            self._streams[key] = _ColorStripEntry(
--- a/server/src/wled_controller/core/processing/composite_stream.py
+++ b/server/src/wled_controller/core/processing/composite_stream.py
@@ -0,0 +1,313 @@
+"""Composite color strip stream — blends multiple sub-streams as layers."""
+
+import threading
+import time
+from typing import Dict, List, Optional
+
+import numpy as np
+
+from wled_controller.core.processing.color_strip_stream import ColorStripStream
+from wled_controller.utils import get_logger
+
+logger = get_logger(__name__)
+
+# Blend-mode dispatch keys
+_BLEND_NORMAL = "normal"
+_BLEND_ADD = "add"
+_BLEND_MULTIPLY = "multiply"
+_BLEND_SCREEN = "screen"
+
+
+class CompositeColorStripStream(ColorStripStream):
+    """Blends multiple ColorStripStreams as layers with blend modes and opacity.
+
+    Each layer references an existing (non-composite) ColorStripSource.
+    Sub-streams are acquired from the ColorStripStreamManager so picture
+    sources share their existing capture pipeline.
+
+    Processing runs in a background thread at 30 FPS, polling each
+    sub-stream's latest colors and blending bottom-to-top.
+    """
+
+    def __init__(self, source, css_manager):
+        self._source_id: str = source.id
+        self._layers: List[dict] = list(source.layers)
+        self._led_count: int = source.led_count
+        self._auto_size: bool = source.led_count == 0  # 0 = take the count from configure()
+        self._css_manager = css_manager
+        self._fps: int = 30
+
+        self._running = False
+        self._thread: Optional[threading.Thread] = None
+        self._latest_colors: Optional[np.ndarray] = None
+        self._colors_lock = threading.Lock()
+
+        # layer_index -> (source_id, consumer_id, stream)
+        self._sub_streams: Dict[int, tuple] = {}
+
+        # Pre-allocated scratch (rebuilt when LED count changes)
+        self._pool_n = 0
+        self._result_a: Optional[np.ndarray] = None  # double-buffered uint8 output
+        self._result_b: Optional[np.ndarray] = None
+        self._use_a = True
+        self._u16_a: Optional[np.ndarray] = None  # uint16 blend scratch
+        self._u16_b: Optional[np.ndarray] = None
+        self._resize_buf: Optional[np.ndarray] = None  # uint8 target for _resize_to_target
+
+    # ── ColorStripStream interface ──────────────────────────────
+
+    @property
+    def target_fps(self) -> int:
+        return self._fps
+
+    @property
+    def led_count(self) -> int:
+        return self._led_count
+
+    @property
+    def is_animated(self) -> bool:
+        return True
+
+    def start(self) -> None:
+        self._acquire_sub_streams()
+        self._running = True
+        self._thread = threading.Thread(
+            target=self._processing_loop, daemon=True,
+            name=f"CompositeCSS-{self._source_id[:12]}",
+        )
+        self._thread.start()
+        logger.info(
+            f"CompositeColorStripStream started: {self._source_id} "
+            f"({len(self._sub_streams)} layers, {self._led_count} LEDs)"
+        )
+
+    def stop(self) -> None:
+        self._running = False
+        if self._thread is not None:
+            self._thread.join(timeout=5.0)
+            self._thread = None
+        self._release_sub_streams()
+        logger.info(f"CompositeColorStripStream stopped: {self._source_id}")
+
+    def get_latest_colors(self) -> Optional[np.ndarray]:
+        """Return the last published frame (a shared internal buffer), or None."""
+        with self._colors_lock:
+            return self._latest_colors
+
+    def configure(self, device_led_count: int) -> None:
+        """Adopt the device LED count when auto-sizing and forward it to sub-streams."""
+        if self._auto_size and device_led_count > 0 and device_led_count != self._led_count:
+            self._led_count = device_led_count
+            for _src_id, _consumer_id, stream in self._sub_streams.values():
+                if hasattr(stream, "configure"):
+                    stream.configure(device_led_count)
+            logger.debug(f"CompositeColorStripStream auto-sized to {device_led_count} LEDs")
+
+    def update_source(self, source) -> None:
+        """Hot-update layers; re-acquire sub-streams only if their wiring changed."""
+        new_layers = list(source.layers)
+        # Opacity and blend mode are re-read from self._layers on every frame,
+        # so only source_id / enabled changes need fresh sub-streams.
+        old_wiring = [(lyr.get("source_id"), lyr.get("enabled", True)) for lyr in self._layers]
+        new_wiring = [(lyr.get("source_id"), lyr.get("enabled", True)) for lyr in new_layers]
+
+        self._layers = new_layers
+
+        # Mirror __init__: led_count == 0 means auto-size from configure()
+        self._auto_size = source.led_count == 0
+        if not self._auto_size:
+            self._led_count = source.led_count
+
+        if old_wiring != new_wiring:
+            self._release_sub_streams()
+            self._acquire_sub_streams()
+            logger.info(f"CompositeColorStripStream rebuilt sub-streams: {self._source_id}")
+
+    # ── Sub-stream lifecycle ────────────────────────────────────
+
+    def _acquire_sub_streams(self) -> None:
+        for i, layer in enumerate(self._layers):
+            if not layer.get("enabled", True):
+                continue
+            src_id = layer.get("source_id", "")
+            if not src_id:
+                continue
+            consumer_id = f"{self._source_id}__layer_{i}"  # one consumer id per layer slot
+            try:
+                stream = self._css_manager.acquire(src_id, consumer_id)
+                if hasattr(stream, "configure") and self._led_count > 0:
+                    stream.configure(self._led_count)
+                self._sub_streams[i] = (src_id, consumer_id, stream)
+            except Exception as e:
+                logger.warning(
+                    f"Composite layer {i} (source {src_id}) failed to acquire: {e}"
+                )
+
+    def _release_sub_streams(self) -> None:
+        for _idx, (src_id, consumer_id, _stream) in list(self._sub_streams.items()):
+            try:
+                self._css_manager.release(src_id, consumer_id)
+            except Exception as e:
+                logger.warning(f"Composite layer release error ({src_id}): {e}")
+        self._sub_streams.clear()
+
+    # ── Scratch pool ────────────────────────────────────────────
+
+    def _ensure_pool(self, n: int) -> None:
+        if n == self._pool_n or n <= 0:
+            return  # already sized for n, or nothing to size
+        self._pool_n = n
+        self._result_a = np.empty((n, 3), dtype=np.uint8)
+        self._result_b = np.empty((n, 3), dtype=np.uint8)
+        self._u16_a = np.empty((n, 3), dtype=np.uint16)
+        self._u16_b = np.empty((n, 3), dtype=np.uint16)
+        self._resize_buf = np.empty((n, 3), dtype=np.uint8)
+
+    # ── Resize helper ───────────────────────────────────────────
+
+    def _resize_to_target(self, colors: np.ndarray, target_n: int) -> np.ndarray:
+        """Resize (N, 3) uint8 array to (target_n, 3) via linear interpolation."""
+        n_src = len(colors)
+        if n_src == target_n:
+            return colors
+        src_x = np.linspace(0, 1, n_src)  # n_src must be >= 1: np.interp rejects empty xp
+        dst_x = np.linspace(0, 1, target_n)
+        buf = self._resize_buf  # shared buffer sized by _ensure_pool(); reused by the next call
+        for ch in range(3):
+            np.copyto(
+                buf[:, ch],
+                np.interp(dst_x, src_x, colors[:, ch]),
+                casting="unsafe",
+            )
+        return buf
+
+    # ── Blend operations (integer math, pre-allocated) ──────────
+
+    def _blend_normal(self, bottom: np.ndarray, top: np.ndarray, alpha: int,
+                      out: np.ndarray) -> None:
+        """Normal blend: out = (bottom * (256-a) + top * a) >> 8"""
+        u16a, u16b = self._u16_a, self._u16_b
+        np.copyto(u16a, bottom, casting="unsafe")  # copied first, so out may alias bottom
+        np.copyto(u16b, top, casting="unsafe")
+        u16a *= (256 - alpha)
+        u16b *= alpha
+        u16a += u16b  # at most 255 * 256 = 65280 for uint8 inputs and 0 <= alpha <= 256
+        u16a >>= 8
+        np.copyto(out, u16a, casting="unsafe")
+
+    def _blend_add(self, bottom: np.ndarray, top: np.ndarray, alpha: int,
+                   out: np.ndarray) -> None:
+        """Additive blend: out = min(255, bottom + ((top * alpha) >> 8))"""
+        u16a, u16b = self._u16_a, self._u16_b
+        np.copyto(u16a, bottom, casting="unsafe")
+        np.copyto(u16b, top, casting="unsafe")
+        u16b *= alpha
+        u16b >>= 8
+        u16a += u16b  # at most 255 + 255 for uint8 inputs; clipped on the next line
+        np.clip(u16a, 0, 255, out=u16a)
+        np.copyto(out, u16a, casting="unsafe")
+
+    def _blend_multiply(self, bottom: np.ndarray, top: np.ndarray, alpha: int,
+                        out: np.ndarray) -> None:
+        """Multiply blend: blended = (bottom*top + 255) >> 8, then lerp with alpha."""
+        u16a, u16b = self._u16_a, self._u16_b
+        np.copyto(u16a, bottom, casting="unsafe")
+        np.copyto(u16b, top, casting="unsafe")
+        u16a *= u16b  # at most 255 * 255 = 65025 for uint8 inputs, fits uint16
+        u16a += 255   # round up so 255 is an exact identity (white * white stays 255)
+        u16a >>= 8
+        # lerp: result = (bottom * (256-a) + blended * a) >> 8
+        np.copyto(u16b, bottom, casting="unsafe")
+        u16b *= (256 - alpha)
+        u16a *= alpha
+        u16a += u16b
+        u16a >>= 8
+        np.copyto(out, u16a, casting="unsafe")
+
+    def _blend_screen(self, bottom: np.ndarray, top: np.ndarray, alpha: int,
+                      out: np.ndarray) -> None:
+        """Screen blend: blended = 255 - (((255-bottom)*(255-top) + 255) >> 8), then lerp."""
+        u16a, u16b = self._u16_a, self._u16_b
+        np.copyto(u16a, bottom, casting="unsafe")
+        np.copyto(u16b, top, casting="unsafe")
+        np.subtract(255, u16a, out=u16a)  # invert in place; no per-frame temporary
+        np.subtract(255, u16b, out=u16b)
+        u16a *= u16b
+        u16a += 255  # round up: black screened over black stays 0 instead of 1
+        u16a >>= 8
+        np.subtract(255, u16a, out=u16a)
+        # lerp: result = (bottom * (256-a) + blended * a) >> 8
+        np.copyto(u16b, bottom, casting="unsafe")
+        u16b *= (256 - alpha)
+        u16a *= alpha
+        u16a += u16b
+        u16a >>= 8
+        np.copyto(out, u16a, casting="unsafe")
+
+    _BLEND_DISPATCH = {
+        _BLEND_NORMAL: "_blend_normal",
+        _BLEND_ADD: "_blend_add",
+        _BLEND_MULTIPLY: "_blend_multiply",
+        _BLEND_SCREEN: "_blend_screen",
+    }
+
+    # ── Processing loop ─────────────────────────────────────────
+
+    def _processing_loop(self) -> None:
+        """Worker-thread body: blend the enabled layers once per frame and publish."""
+        while self._running:
+            loop_start = time.perf_counter()
+            frame_time = 1.0 / self._fps
+
+            try:
+                target_n = self._led_count
+                if target_n <= 0:
+                    time.sleep(frame_time)
+                    continue
+
+                self._ensure_pool(target_n)
+
+                result_buf = self._result_a if self._use_a else self._result_b
+                self._use_a = not self._use_a
+                has_result = False
+
+                for i, layer in enumerate(self._layers):
+                    if not layer.get("enabled", True):
+                        continue
+                    # One atomic .get(): update_source() may clear the dict from
+                    # another thread between a membership test and an index.
+                    entry = self._sub_streams.get(i)
+                    if entry is None:
+                        continue
+                    _src_id, _consumer_id, stream = entry
+                    colors = stream.get_latest_colors()
+                    if colors is None or len(colors) == 0:
+                        continue  # no frame yet; np.interp() rejects empty input
+
+                    # Resize to target LED count if needed
+                    if len(colors) != target_n:
+                        colors = self._resize_to_target(colors, target_n)
+
+                    opacity = layer.get("opacity", 1.0)
+                    blend_mode = layer.get("blend_mode", _BLEND_NORMAL)
+                    alpha = max(0, min(256, int(opacity * 256)))
+
+                    if not has_result and alpha >= 256 and blend_mode == _BLEND_NORMAL:
+                        # Opaque normal first layer: plain copy, nothing to blend
+                        result_buf[:] = colors
+                    else:
+                        if not has_result:
+                            result_buf[:] = 0  # first layer blends against black
+                        blend_fn = getattr(self, self._BLEND_DISPATCH.get(blend_mode, "_blend_normal"))
+                        blend_fn(result_buf, colors, alpha, result_buf)
+                    has_result = True
+
+                if has_result:
+                    with self._colors_lock:
+                        self._latest_colors = result_buf
+
+            except Exception as e:
+                logger.error(f"CompositeColorStripStream processing error: {e}", exc_info=True)
+
+            elapsed = time.perf_counter() - loop_start
+            time.sleep(max(frame_time - elapsed, 0.001))
--- a/server/src/wled_controller/core/processing/effect_stream.py
+++ b/server/src/wled_controller/core/processing/effect_stream.py
@@ -74,34 +74,85 @@ def _build_palette_lut(name: str) -> np.ndarray:
 # ── 1-D value noise (no external deps) ──────────────────────────────────

 class _ValueNoise1D:
-    """Simple 1-D value noise with smoothstep interpolation and fractal octaves."""
+    """Simple 1-D value noise with smoothstep interpolation and fractal octaves.
+
+    Scratch buffers are lazily allocated and reused across calls to avoid
+    per-frame numpy allocations in hot loops.
+    """

    def __init__(self, seed: int = 42):
        rng = np.random.RandomState(seed)
        self._table = rng.random(512).astype(np.float32)
+        self._scratch_n = -1  # -1 = nothing allocated yet, so even n == 0 allocates once
+
+    def _ensure_scratch(self, n: int) -> None:
+        """(Re)allocate scratch buffers when array size changes."""
+        if n == self._scratch_n:
+            return
+        self._scratch_n = n
+        self._xi = np.empty(n, dtype=np.int64)  # integer lattice index
+        self._frac = np.empty(n, dtype=np.float32)  # floor(x), then the fractional part
+        self._t = np.empty(n, dtype=np.float32)  # smoothstep weight
+        self._a = np.empty(n, dtype=np.float32)  # left table sample / general scratch
+        self._b = np.empty(n, dtype=np.float32)  # right table sample, then the noise() result
+        self._oct_x = np.empty(n, dtype=np.float32)  # x * freq for the current octave
+        self._fbm_result = np.empty(n, dtype=np.float32)  # fbm() accumulator

    def noise(self, x: np.ndarray) -> np.ndarray:
-        """Single-octave smooth noise for an array of float positions."""
+        """Single-octave smooth noise for a 1-D array of float positions.
+
+        Returns an internal buffer (_b) — caller must copy if the result
+        is needed beyond the next noise() or fbm() call.
+        """
+        n = len(x)
+        self._ensure_scratch(n)
        size = len(self._table)
-        xi = np.floor(x).astype(np.int64)
-        frac = (x - xi).astype(np.float32)
-        t = frac * frac * (3.0 - 2.0 * frac)  # smoothstep
-        a = self._table[xi % size]
-        b = self._table[(xi + 1) % size]
-        return a + t * (b - a)
+        # xi = floor(x)
+        np.floor(x, out=self._frac)
+        np.copyto(self._xi, self._frac, casting='unsafe')
+        # frac = x - xi
+        np.subtract(x, self._frac, out=self._frac)
+        # t = frac * frac * (3 - 2 * frac)  (smoothstep)
+        np.multiply(self._frac, self._frac, out=self._t)
+        np.multiply(self._frac, -2.0, out=self._a)
+        self._a += 3.0
+        self._t *= self._a
+        # Table lookups (fancy indexing is unavoidable but copies into pre-allocated)
+        np.remainder(self._xi, size, out=self._xi)  # wrap into the table; non-negative for negative x too
+        self._a[:] = self._table[self._xi]
+        self._xi += 1
+        np.remainder(self._xi, size, out=self._xi)
+        self._b[:] = self._table[self._xi]
+        # result = a + t * (b - a)
+        self._b -= self._a
+        self._b *= self._t
+        self._b += self._a
+        return self._b

    def fbm(self, x: np.ndarray, octaves: int = 3) -> np.ndarray:
-        """Fractal Brownian Motion — layered noise at decreasing amplitude."""
-        result = np.zeros_like(x, dtype=np.float32)
+        """Fractal Brownian Motion — layered noise at decreasing amplitude.
+
+        Returns an internal buffer (_fbm_result) — caller must copy if the
+        result is needed beyond the next fbm() call.
+        """
+        n = len(x)
+        self._ensure_scratch(n)
+        self._fbm_result[:] = 0
        amp = 1.0
        freq = 1.0
        total_amp = 0.0
        for _ in range(octaves):
-            result += amp * self.noise(x * freq)
+            np.multiply(x, freq, out=self._oct_x)
+            self.noise(self._oct_x)
+            # noise() result is in self._b; copy to _a for accumulation
+            self._a[:] = self._b
+            self._a *= amp
+            self._fbm_result += self._a
            total_amp += amp
            amp *= 0.5
            freq *= 2.0
-        return result / total_amp
+        self._fbm_result /= total_amp  # normalise by the summed octave amplitudes
+        return self._fbm_result


 # ── Effect stream ────────────────────────────────────────────────────────
@@ -135,6 +186,17 @@ class EffectColorStripStream(ColorStripStream):
        # Fire state — allocated lazily in render loop
        self._heat: Optional[np.ndarray] = None
        self._heat_n = 0
+        # Scratch arrays (allocated in _animate_loop when LED count is known)
+        self._s_f32_a: Optional[np.ndarray] = None
+        self._s_f32_b: Optional[np.ndarray] = None
+        self._s_f32_c: Optional[np.ndarray] = None
+        self._s_i32: Optional[np.ndarray] = None
+        self._s_f32_rgb: Optional[np.ndarray] = None
+        self._s_arange: Optional[np.ndarray] = None
+        self._s_layer1: Optional[np.ndarray] = None
+        self._s_layer2: Optional[np.ndarray] = None
+        self._plasma_key = (0, 0.0)
+        self._plasma_x: Optional[np.ndarray] = None
        self._update_from_source(source)

    def _update_from_source(self, source) -> None:
@@ -232,6 +294,16 @@ class EffectColorStripStream(ColorStripStream):
                    _pool_n = n
                    _buf_a = np.empty((n, 3), dtype=np.uint8)
                    _buf_b = np.empty((n, 3), dtype=np.uint8)
+                    # Scratch arrays for render methods
+                    self._s_f32_a = np.empty(n, dtype=np.float32)
+                    self._s_f32_b = np.empty(n, dtype=np.float32)
+                    self._s_f32_c = np.empty(n, dtype=np.float32)
+                    self._s_i32 = np.empty(n, dtype=np.int32)
+                    self._s_f32_rgb = np.empty((n, 3), dtype=np.float32)
+                    self._s_arange = np.arange(n, dtype=np.float32)
+                    self._s_layer1 = np.empty(n, dtype=np.float32)
+                    self._s_layer2 = np.empty(n, dtype=np.float32)
+                    self._plasma_key = (0, 0.0)

                buf = _buf_a if _use_a else _buf_b
                _use_a = not _use_a
@@ -271,8 +343,7 @@ class EffectColorStripStream(ColorStripStream):

        # Diffuse heat upward (index 0 = bottom, index n-1 = top)
        if n >= 3:
-            # Average of neighbors, shifted upward
-            new_heat = np.empty_like(heat)
+            new_heat = self._s_f32_a
            new_heat[0] = (heat[0] + heat[1]) * 0.5
            new_heat[1:-1] = (heat[:-2] + heat[1:-1] + heat[2:]) / 3.0
            new_heat[-1] = heat[-1] * 0.5
@@ -285,9 +356,11 @@ class EffectColorStripStream(ColorStripStream):
            if np.random.random() < spark_prob:
                heat[i] = min(1.0, heat[i] + 0.4 + 0.6 * np.random.random())

-        # Map heat to palette
-        indices = np.clip((heat * 255).astype(np.int32), 0, 255)
-        buf[:] = lut[indices]
+        # Map heat to palette (pre-allocated scratch)
+        np.multiply(heat, 255, out=self._s_f32_a)
+        np.clip(self._s_f32_a, 0, 255, out=self._s_f32_a)
+        np.copyto(self._s_i32, self._s_f32_a, casting='unsafe')
+        buf[:] = lut[self._s_i32]

    # ── Meteor ───────────────────────────────────────────────────────

@@ -313,22 +386,32 @@ class EffectColorStripStream(ColorStripStream):
        decay = 0.05 + 0.25 * (1.0 - min(1.0, intensity))  # 0.05 (long) to 0.30 (short)

        # Compute brightness for each LED based on distance behind the meteor
-        indices = np.arange(n, dtype=np.float32)
+        indices = self._s_arange
+        dist = self._s_f32_a
        if mirror:
-            dist = np.abs(indices - pos)
+            np.subtract(indices, pos, out=dist)
+            np.abs(dist, out=dist)
        else:
-            # Signed distance in the direction of travel (behind = positive)
-            dist = (pos - indices) % n
+            np.subtract(pos, indices, out=dist)
+            dist %= n

-        brightness = np.exp(-dist * decay)
+        np.multiply(dist, -decay, out=self._s_f32_b)
+        np.exp(self._s_f32_b, out=self._s_f32_b)
+        brightness = self._s_f32_b

-        # Apply color
+        # Apply color using pre-allocated scratch
        r, g, b = color
-        buf[:, 0] = np.clip(brightness * r, 0, 255).astype(np.uint8)
-        buf[:, 1] = np.clip(brightness * g, 0, 255).astype(np.uint8)
-        buf[:, 2] = np.clip(brightness * b, 0, 255).astype(np.uint8)
+        np.multiply(brightness, r, out=self._s_f32_c)
+        np.clip(self._s_f32_c, 0, 255, out=self._s_f32_c)
+        np.copyto(buf[:, 0], self._s_f32_c, casting='unsafe')
+        np.multiply(brightness, g, out=self._s_f32_c)
+        np.clip(self._s_f32_c, 0, 255, out=self._s_f32_c)
+        np.copyto(buf[:, 1], self._s_f32_c, casting='unsafe')
+        np.multiply(brightness, b, out=self._s_f32_c)
+        np.clip(self._s_f32_c, 0, 255, out=self._s_f32_c)
+        np.copyto(buf[:, 2], self._s_f32_c, casting='unsafe')

-        # Bright white-ish head (within ±1 LED of position)
+        # Bright white-ish head (2-3 LEDs — small, leave allocating)
        head_mask = np.abs(indices - pos) < 1.5
        head_brightness = np.clip(1.0 - np.abs(indices - pos), 0, 1)
        buf[head_mask, 0] = np.clip(
@@ -352,8 +435,14 @@ class EffectColorStripStream(ColorStripStream):
        scale = self._scale
        lut = self._palette_lut

+        # Cache x array (only changes when n or scale change)
+        key = (n, scale)
+        if key != self._plasma_key:
+            self._plasma_key = key
+            self._plasma_x = np.linspace(0, scale * math.pi * 2, n, dtype=np.float64)
+
        phase = t * speed * 0.5
-        x = np.linspace(0, scale * math.pi * 2, n, dtype=np.float64)
+        x = self._plasma_x

        v = (
            np.sin(x + phase)
@@ -373,10 +462,15 @@ class EffectColorStripStream(ColorStripStream):
        scale = self._scale
        lut = self._palette_lut

-        positions = np.arange(n, dtype=np.float32) * scale * 0.1 + t * speed * 0.5
-        values = self._noise.fbm(positions, octaves=3)
-        indices = np.clip((values * 255).astype(np.int32), 0, 255)
-        buf[:] = lut[indices]
+        # Positions from cached arange (avoids per-frame np.arange)
+        np.multiply(self._s_arange, scale * 0.1, out=self._s_f32_a)
+        self._s_f32_a += t * speed * 0.5
+        values = self._noise.fbm(self._s_f32_a, octaves=3)
+        # Map to palette indices using pre-allocated scratch
+        np.multiply(values, 255, out=self._s_f32_b)
+        np.clip(self._s_f32_b, 0, 255, out=self._s_f32_b)
+        np.copyto(self._s_i32, self._s_f32_b, casting='unsafe')
+        buf[:] = lut[self._s_i32]

    # ── Aurora ───────────────────────────────────────────────────────

@@ -387,22 +481,39 @@ class EffectColorStripStream(ColorStripStream):
        intensity = self._intensity
        lut = self._palette_lut

-        positions = np.arange(n, dtype=np.float32) * scale * 0.08
+        # Positions from cached arange
+        np.multiply(self._s_arange, scale * 0.08, out=self._s_f32_a)

-        # Three noise layers at different speeds and offsets
-        layer1 = self._noise.fbm(positions + t * speed * 0.2, octaves=3)
-        layer2 = self._noise.fbm(positions * 1.5 + t * speed * 0.35 + 100.0, octaves=3)
-        layer3 = self._noise.fbm(positions * 0.7 + t * speed * 0.15 + 200.0, octaves=2)
+        # Three noise layers — copy results to dedicated buffers since fbm
+        # may return an internal reference that gets overwritten on the next call
+        np.add(self._s_f32_a, t * speed * 0.2, out=self._s_f32_b)
+        self._s_layer1[:] = self._noise.fbm(self._s_f32_b, octaves=3)

-        # Combine layers: layer1 drives hue, layer2 modulates brightness,
-        # layer3 adds slow undulation
-        hue = (layer1 + layer3 * 0.5) * 0.67  # 0–1 range for palette lookup
-        hue = np.clip(hue, 0.0, 1.0)
+        np.multiply(self._s_f32_a, 1.5, out=self._s_f32_b)
+        self._s_f32_b += t * speed * 0.35 + 100.0
+        self._s_layer2[:] = self._noise.fbm(self._s_f32_b, octaves=3)

-        brightness = 0.3 + 0.7 * layer2 * intensity
-        brightness = np.clip(brightness, 0.0, 1.0)
+        np.multiply(self._s_f32_a, 0.7, out=self._s_f32_b)
+        self._s_f32_b += t * speed * 0.15 + 200.0
+        layer3 = self._noise.fbm(self._s_f32_b, octaves=2)

-        indices = np.clip((hue * 255).astype(np.int32), 0, 255)
-        colors = lut[indices].astype(np.float32)
-        colors *= brightness[:, np.newaxis]
-        buf[:] = np.clip(colors, 0, 255).astype(np.uint8)
+        # Combine layers: hue from layer1 + layer3, brightness from layer2
+        hue = self._s_f32_a  # reuse (positions no longer needed)
+        np.multiply(layer3, 0.5, out=hue)
+        hue += self._s_layer1
+        hue *= 0.67
+        np.clip(hue, 0.0, 1.0, out=hue)
+
+        bright = self._s_f32_b
+        np.multiply(self._s_layer2, 0.7 * intensity, out=bright)
+        bright += 0.3
+        np.clip(bright, 0.0, 1.0, out=bright)
+
+        # Map to palette using pre-allocated scratch
+        np.multiply(hue, 255, out=hue)
+        np.copyto(self._s_i32, hue, casting='unsafe')
+        np.clip(self._s_i32, 0, 255, out=self._s_i32)
+        self._s_f32_rgb[:] = lut[self._s_i32]
+        self._s_f32_rgb *= bright[:, np.newaxis]
+        np.clip(self._s_f32_rgb, 0, 255, out=self._s_f32_rgb)
+        np.copyto(buf, self._s_f32_rgb, casting='unsafe')
--- a/server/src/wled_controller/core/processing/wled_target_processor.py
+++ b/server/src/wled_controller/core/processing/wled_target_processor.py
@@ -122,14 +122,8 @@ class WledTargetProcessor(TargetProcessor):
            self._color_strip_stream = stream
            self._resolved_display_index = stream.display_index

-            # For auto-sized static/gradient/color_cycle/effect streams (led_count == 0), size to device LED count
-            from wled_controller.core.processing.color_strip_stream import (
-                ColorCycleColorStripStream,
-                GradientColorStripStream,
-                StaticColorStripStream,
-            )
-            from wled_controller.core.processing.effect_stream import EffectColorStripStream
-            if isinstance(stream, (StaticColorStripStream, GradientColorStripStream, ColorCycleColorStripStream, EffectColorStripStream)) and device_info.led_count > 0:
+            # For auto-sized non-picture streams (led_count == 0), size to device LED count
+            if hasattr(stream, "configure") and device_info.led_count > 0:
                effective_leds = device_info.led_count - self._led_skip_start - self._led_skip_end
                stream.configure(max(1, effective_leds))

@@ -415,19 +409,20 @@ class WledTargetProcessor(TargetProcessor):
        ])
        return result

-    def _apply_led_skip(self, colors: np.ndarray) -> np.ndarray:
-        """Pad color array with black at start/end for skipped LEDs."""
-        s, e = self._led_skip_start, self._led_skip_end
-        if s <= 0 and e <= 0:
+    @staticmethod
+    def _apply_led_skip(colors: np.ndarray, buf: Optional[np.ndarray], skip_start: int) -> np.ndarray:
+        """Copy effective colors into pre-allocated buffer with black padding.
+
+        Args:
+            colors: Effective LED colors (skip-excluded)
+            buf: Pre-allocated (device_led_count, 3) buffer with black edges,
+                 or None when no skip is configured.
+            skip_start: Number of black LEDs at the start (write offset)
+        """
+        if buf is None:  # no skip buffer supplied: hand back the input array itself, no copy
            return colors
-        channels = colors.shape[1] if colors.ndim == 2 else 3
-        parts = []
-        if s > 0:
-            parts.append(np.zeros((s, channels), dtype=np.uint8))
-        parts.append(colors)
-        if e > 0:
-            parts.append(np.zeros((e, channels), dtype=np.uint8))
-        return np.vstack(parts)
+        buf[skip_start:skip_start + len(colors)] = colors  # only this window is written; NOTE(review): no bounds handling if skip_start + len(colors) > len(buf) — confirm callers guarantee the fit
+        return buf  # the caller's buffer, mutated in place; rows outside the window keep their previous contents

    async def _processing_loop(self) -> None:
        """Main processing loop — poll ColorStripStream → apply brightness → send."""
@@ -440,7 +435,58 @@ class WledTargetProcessor(TargetProcessor):
        last_send_time = 0.0
        prev_frame_time_stamp = time.perf_counter()
        loop = asyncio.get_running_loop()
-        effective_leds = max(1, (device_info.led_count if device_info else 0) - self._led_skip_start - self._led_skip_end)
+        _init_device_info = self._ctx.get_device_info(self._device_id)
+        _total_leds = _init_device_info.led_count if _init_device_info else 0
+        effective_leds = max(1, _total_leds - self._led_skip_start - self._led_skip_end)
+
+        # Pre-allocate skip buffer (reused every frame — edges stay black)
+        if (self._led_skip_start > 0 or self._led_skip_end > 0) and _total_leds > 0:
+            _skip_buf: Optional[np.ndarray] = np.zeros((_total_leds, 3), dtype=np.uint8)
+        else:
+            _skip_buf = None
+
+        # Pre-allocate resampling cache (linspace + result reused while sizes unchanged)
+        _fit_key = (0, 0)
+        _fit_src_x = _fit_dst_x = _fit_result = None
+
+        def _cached_fit(colors_in):
+            """Resample colors to effective_leds using cached linspace arrays."""
+            nonlocal _fit_key, _fit_src_x, _fit_dst_x, _fit_result
+            n_src = len(colors_in)
+            if n_src == effective_leds or effective_leds <= 0:  # already the target length (or nothing to fit to): return the input itself
+                return colors_in
+            if (n_src, effective_leds) != _fit_key:  # sizes differ from the cached pair: rebuild the sample grids and the output buffer
+                _fit_key = (n_src, effective_leds)
+                _fit_src_x = np.linspace(0, 1, n_src)
+                _fit_dst_x = np.linspace(0, 1, effective_leds)
+                _fit_result = np.empty((effective_leds, 3), dtype=np.uint8)
+            for _ch in range(3):  # each of the three channels is interpolated independently
+                np.copyto(_fit_result[:, _ch],
+                          np.interp(_fit_dst_x, _fit_src_x, colors_in[:, _ch]),  # np.interp still returns a new array per call; only the grids and output are reused
+                          casting='unsafe')  # interpolated values are converted to uint8 while being written into the reused buffer
+            return _fit_result  # shared buffer: contents are overwritten by the next resampling call
+
+        # Pre-allocate brightness scratch (uint16 intermediate + uint8 output)
+        _bright_u16: Optional[np.ndarray] = None
+        _bright_out: Optional[np.ndarray] = None
+        _bright_n = 0
+
+        def _cached_brightness(colors_in, dev_info):
+            """Apply software brightness using pre-allocated uint16 scratch."""
+            nonlocal _bright_n, _bright_u16, _bright_out
+            if not dev_info or dev_info.software_brightness >= 255:  # no device info or full brightness: return the input unchanged
+                return colors_in
+            _dn = len(colors_in)
+            if _dn != _bright_n:  # scratch is reallocated only when the row count changes
+                _bright_n = _dn
+                _bright_u16 = np.empty((_dn, 3), dtype=np.uint16)
+                _bright_out = np.empty((_dn, 3), dtype=np.uint8)
+            np.copyto(_bright_u16, colors_in, casting='unsafe')  # widen to uint16 before scaling; assumes colors_in is (N, 3) like the scratch — TODO confirm
+            _bright_u16 *= dev_info.software_brightness  # NOTE(review): assumes an integer brightness in 0..254 so the product fits in uint16 — confirm
+            _bright_u16 >>= 8  # integer divide by 256
+            np.copyto(_bright_out, _bright_u16, casting='unsafe')  # narrow back to uint8 in the reused output buffer
+            return _bright_out  # shared buffer: contents are overwritten by the next call that scales
+
        # Short re-poll interval when the animation thread hasn't produced a new
        # frame yet.  The animation thread and this loop both target the same FPS
        # but are unsynchronised; without a short re-poll the loop can miss a
@@ -502,9 +548,9 @@ class WledTargetProcessor(TargetProcessor):
                                    break
                                kc = prev_colors
                                if device_info and device_info.led_count > 0:
-                                    kc = self._fit_to_device(kc, effective_leds)
-                                    kc = self._apply_led_skip(kc)
-                                send_colors = self._apply_brightness(kc, device_info)
+                                    kc = _cached_fit(kc)
+                                    kc = self._apply_led_skip(kc, _skip_buf, self._led_skip_start)
+                                send_colors = _cached_brightness(kc, device_info)
                                if self._led_client.supports_fast_send:
                                    self._led_client.send_pixels_fast(send_colors)
                                else:
@@ -525,11 +571,11 @@ class WledTargetProcessor(TargetProcessor):

                        # Fit to effective LED count (excluding skipped) then pad with blacks
                        if device_info and device_info.led_count > 0:
-                            colors = self._fit_to_device(colors, effective_leds)
-                            colors = self._apply_led_skip(colors)
+                            colors = _cached_fit(colors)
+                            colors = self._apply_led_skip(colors, _skip_buf, self._led_skip_start)

                        # Apply device software brightness
-                        send_colors = self._apply_brightness(colors, device_info)
+                        send_colors = _cached_brightness(colors, device_info)

                        # Send to LED device
                        if not self._is_running or self._led_client is None: