From f184ef0afbe49f77074d778ff8864fea4445dfff Mon Sep 17 00:00:00 2001
From: "alexei.dolgolyov" <dolgolyov.alexei@gmail.com>
Date: Tue, 12 May 2026 15:05:52 +0300
Subject: [PATCH] perf(capture): vectorize hot paths and fix engine bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- WGC: replace per-frame ~30 MB BGRA->RGB fancy-index allocation with
  cv2.cvtColor into a 3-slot pre-allocated RGB pool. Use gc.collect(0)
  on cleanup instead of full GC to avoid multi-hundred-ms stalls.
- MSS: switch from screenshot.rgb (pure-Python BGRA->RGB rebuild) to
  screenshot.raw + cv2.cvtColor into a pooled buffer. Add cheap change
  detection that hashes the first 256 bytes of the raw buffer (the
  top-left 64 pixels) so idle frames return None, as DXcam/BetterCam do.
  Changes outside that sample are not detected.
- DXcam/BetterCam: fix silent factory leak — Python name-mangling
  rewrote self._dxcam.__factory to _DXcamCaptureStream__factory inside
  the class body, so cleanup never reached the real attribute. Use
  getattr with string literal to bypass mangling.
- calculate_dominant_color: replace np.random.choice(replace=False)
  (full permutation) with np.random.randint, and np.unique(axis=0) (lexsort)
  with packed-RGB np.bincount. ~10x faster on dominant mode.
- calibration._map_edge_average: switch cached scratch buffers from
  float64 to float32. Halves memory bandwidth on the dominant reduction
  path; range-safe up to 8K screens.
- All engines: per-frame DEBUG logs use structlog kwarg style instead
  of f-strings to avoid per-frame string allocation.
---
 .../src/ledgrab/core/capture/calibration.py   | 25 ++++---
 .../ledgrab/core/capture/screen_capture.py    | 44 ++++++------
 .../core/capture_engines/bettercam_engine.py  | 34 ++++++----
 .../core/capture_engines/dxcam_engine.py      | 34 ++++++----
 .../core/capture_engines/mss_engine.py        | 68 +++++++++++++++++--
 .../core/capture_engines/wgc_engine.py        | 57 ++++++++++++++--
 6 files changed, 192 insertions(+), 70 deletions(-)

diff --git a/server/src/ledgrab/core/capture/calibration.py b/server/src/ledgrab/core/capture/calibration.py
index d783134..1233158 100644
--- a/server/src/ledgrab/core/capture/calibration.py
+++ b/server/src/ledgrab/core/capture/calibration.py
@@ -443,7 +443,10 @@ class PixelMapper:
             axis = 1
             edge_len = edge_pixels.shape[0]
 
-        # Lazy-init / resize per-edge scratch buffers
+        # Lazy-init / resize per-edge scratch buffers.
+        # float32 is sufficient: max cumsum value is edge_len * 255 (≈2M @ 8K
+        # screens) which fits exactly in float32's 24-bit mantissa. Halves
+        # memory bandwidth on the hot reduction.
         cache = self._edge_cache.get(edge_name)
         if cache is None or cache[0] != edge_len or cache[1] != led_count:
             step = edge_len / led_count
@@ -452,11 +455,11 @@ class PixelMapper:
             np.minimum(boundaries, edge_len, out=boundaries)
             starts = boundaries[:-1]
             ends = boundaries[1:]
-            lengths = (ends - starts).reshape(-1, 1).astype(np.float64)
-            cumsum_buf = np.empty((edge_len + 1, 3), dtype=np.float64)
-            edge_1d_buf = np.empty((edge_len, 3), dtype=np.float64)
-            sums_buf = np.empty((led_count, 3), dtype=np.float64)
-            starts_buf = np.empty((led_count, 3), dtype=np.float64)
+            lengths = (ends - starts).reshape(-1, 1).astype(np.float32)
+            cumsum_buf = np.empty((edge_len + 1, 3), dtype=np.float32)
+            edge_1d_buf = np.empty((edge_len, 3), dtype=np.float32)
+            sums_buf = np.empty((led_count, 3), dtype=np.float32)
+            starts_buf = np.empty((led_count, 3), dtype=np.float32)
             out_uint8 = np.empty((led_count, 3), dtype=np.uint8)
             cache = (
                 edge_len,
@@ -682,11 +685,11 @@ class AdvancedPixelMapper:
             np.minimum(boundaries, edge_len, out=boundaries)
             starts = boundaries[:-1]
             ends = boundaries[1:]
-            lengths = (ends - starts).reshape(-1, 1).astype(np.float64)
-            cumsum_buf = np.empty((edge_len + 1, 3), dtype=np.float64)
-            edge_1d_buf = np.empty((edge_len, 3), dtype=np.float64)
-            sums_buf = np.empty((led_count, 3), dtype=np.float64)
-            starts_buf = np.empty((led_count, 3), dtype=np.float64)
+            lengths = (ends - starts).reshape(-1, 1).astype(np.float32)
+            cumsum_buf = np.empty((edge_len + 1, 3), dtype=np.float32)
+            edge_1d_buf = np.empty((edge_len, 3), dtype=np.float32)
+            sums_buf = np.empty((led_count, 3), dtype=np.float32)
+            starts_buf = np.empty((led_count, 3), dtype=np.float32)
             out_uint8 = np.empty((led_count, 3), dtype=np.uint8)
             cache = (
                 edge_len,
diff --git a/server/src/ledgrab/core/capture/screen_capture.py b/server/src/ledgrab/core/capture/screen_capture.py
index 323052e..df3788c 100644
--- a/server/src/ledgrab/core/capture/screen_capture.py
+++ b/server/src/ledgrab/core/capture/screen_capture.py
@@ -192,8 +192,11 @@ def extract_border_pixels(screen_capture: ScreenCapture, border_width: int = 10)
     left = img[:, :border_width, :]
 
     logger.debug(
-        f"Extracted borders: top={top.shape}, right={right.shape}, "
-        f"bottom={bottom.shape}, left={left.shape}"
+        "Extracted borders",
+        top=top.shape,
+        right=right.shape,
+        bottom=bottom.shape,
+        left=left.shape,
     )
 
     return BorderPixels(
@@ -303,6 +306,12 @@ def calculate_median_color(pixels: np.ndarray) -> tuple[int, int, int]:
 def calculate_dominant_color(pixels: np.ndarray) -> tuple[int, int, int]:
     """Calculate dominant color of a pixel region using simple clustering.
 
+    Quantizes to 32 levels/channel (5 bits/channel = 32K bins), packs into a
+    single uint32, then uses ``np.bincount`` to find the most common bin.
+    Sampling is done with replacement (close enough for a dominant-bin search)
+    and avoids the full permutation that ``np.random.choice(replace=False)``
+    performs internally.
+
     Args:
         pixels: Pixel array (height, width, 3)
 
@@ -312,28 +321,23 @@ def calculate_dominant_color(pixels: np.ndarray) -> tuple[int, int, int]:
     if pixels.size == 0:
         return (0, 0, 0)
 
-    # Reshape to (n_pixels, 3)
     pixels_reshaped = pixels.reshape(-1, 3)
+    n = len(pixels_reshaped)
 
-    # For performance, sample pixels if there are too many
     max_samples = 1000
-    if len(pixels_reshaped) > max_samples:
-        indices = np.random.choice(len(pixels_reshaped), max_samples, replace=False)
+    if n > max_samples:
+        indices = np.random.randint(0, n, max_samples)
         pixels_reshaped = pixels_reshaped[indices]
 
-    # Simple dominant color: quantize colors and find most common
-    # Reduce color space to 32 levels per channel for binning
-    quantized = (pixels_reshaped // 8) * 8
-
-    # Find unique colors and their counts
-    unique_colors, counts = np.unique(quantized, axis=0, return_counts=True)
-
-    # Get the most common color
-    dominant_idx = np.argmax(counts)
-    dominant_color = unique_colors[dominant_idx]
-
-    r = int(np.clip(dominant_color[0], 0, 255))
-    g = int(np.clip(dominant_color[1], 0, 255))
-    b = int(np.clip(dominant_color[2], 0, 255))
+    # Quantize to 32 levels/channel (drop low 3 bits) and pack into uint32:
+    # bits 10-14 = R, bits 5-9 = G, bits 0-4 = B  → 32K possible bins.
+    q = pixels_reshaped >> 3  # uint8 in [0,31]
+    packed = (q[:, 0].astype(np.uint32) << 10) | (q[:, 1].astype(np.uint32) << 5) | q[:, 2]
+    counts = np.bincount(packed, minlength=1)
+    dominant_bin = int(np.argmax(counts))
 
+    # Reconstruct 5-bit channels and shift back to 8-bit (lower edge of bin).
+    r = ((dominant_bin >> 10) & 0x1F) << 3
+    g = ((dominant_bin >> 5) & 0x1F) << 3
+    b = (dominant_bin & 0x1F) << 3
     return (r, g, b)
diff --git a/server/src/ledgrab/core/capture_engines/bettercam_engine.py b/server/src/ledgrab/core/capture_engines/bettercam_engine.py
index 96bcb8d..fff4ae2 100644
--- a/server/src/ledgrab/core/capture_engines/bettercam_engine.py
+++ b/server/src/ledgrab/core/capture_engines/bettercam_engine.py
@@ -35,12 +35,17 @@ class BetterCamCaptureStream(CaptureStream):
         except ImportError:
             raise RuntimeError("BetterCam not installed. Install with: pip install bettercam")
 
-        # Clear global camera cache for fresh DXGI state
-        try:
-            self._bettercam.__factory.clean_up()
-        except Exception as e:
-            logger.debug("BetterCam factory cleanup on init: %s", e)
-            pass
+        # Clear global camera cache for fresh DXGI state.
+        # NOTE: ``self._bettercam.__factory`` is name-mangled by Python to
+        # ``self._bettercam._BetterCamCaptureStream__factory`` because the
+        # access appears inside a class body, which silently AttributeErrors.
+        # Use string-based getattr to bypass mangling.
+        _factory = getattr(self._bettercam, "__factory", None)
+        if _factory is not None:
+            try:
+                _factory.clean_up()
+            except Exception as e:
+                logger.debug("BetterCam factory cleanup on init failed", error=str(e))
 
         self._camera = self._bettercam.create(
             output_idx=self.display_index,
@@ -71,11 +76,12 @@ class BetterCamCaptureStream(CaptureStream):
             self._camera = None
 
         if self._bettercam:
-            try:
-                self._bettercam.__factory.clean_up()
-            except Exception as e:
-                logger.debug("BetterCam factory cleanup on teardown: %s", e)
-                pass
+            _factory = getattr(self._bettercam, "__factory", None)
+            if _factory is not None:
+                try:
+                    _factory.clean_up()
+                except Exception as e:
+                    logger.debug("BetterCam factory cleanup on teardown failed", error=str(e))
 
         self._initialized = False
         logger.info(f"BetterCam capture stream cleaned up (display={self.display_index})")
@@ -109,8 +115,10 @@ class BetterCamCaptureStream(CaptureStream):
                 return None
 
             logger.debug(
-                f"BetterCam captured display {self.display_index}: "
-                f"{frame.shape[1]}x{frame.shape[0]}"
+                "BetterCam captured frame",
+                display=self.display_index,
+                w=frame.shape[1],
+                h=frame.shape[0],
             )
 
             return ScreenCapture(
diff --git a/server/src/ledgrab/core/capture_engines/dxcam_engine.py b/server/src/ledgrab/core/capture_engines/dxcam_engine.py
index b4c7efb..14be006 100644
--- a/server/src/ledgrab/core/capture_engines/dxcam_engine.py
+++ b/server/src/ledgrab/core/capture_engines/dxcam_engine.py
@@ -35,12 +35,17 @@ class DXcamCaptureStream(CaptureStream):
         except ImportError:
             raise RuntimeError("DXcam not installed. Install with: pip install dxcam")
 
-        # Clear global camera cache for fresh DXGI state
-        try:
-            self._dxcam.__factory.clean_up()
-        except Exception as e:
-            logger.debug("DXcam factory cleanup on init: %s", e)
-            pass
+        # Clear global camera cache for fresh DXGI state.
+        # NOTE: ``self._dxcam.__factory`` is name-mangled by Python to
+        # ``self._dxcam._DXcamCaptureStream__factory`` because the access
+        # appears inside a class body, which silently AttributeErrors.
+        # Use string-based getattr to bypass mangling.
+        _factory = getattr(self._dxcam, "__factory", None)
+        if _factory is not None:
+            try:
+                _factory.clean_up()
+            except Exception as e:
+                logger.debug("DXcam factory cleanup on init failed", error=str(e))
 
         self._camera = self._dxcam.create(
             output_idx=self.display_index,
@@ -69,11 +74,12 @@ class DXcamCaptureStream(CaptureStream):
             self._camera = None
 
         if self._dxcam:
-            try:
-                self._dxcam.__factory.clean_up()
-            except Exception as e:
-                logger.debug("DXcam factory cleanup on teardown: %s", e)
-                pass
+            _factory = getattr(self._dxcam, "__factory", None)
+            if _factory is not None:
+                try:
+                    _factory.clean_up()
+                except Exception as e:
+                    logger.debug("DXcam factory cleanup on teardown failed", error=str(e))
 
         self._initialized = False
         logger.info(f"DXcam capture stream cleaned up (display={self.display_index})")
@@ -107,8 +113,10 @@ class DXcamCaptureStream(CaptureStream):
                 return None
 
             logger.debug(
-                f"DXcam captured display {self.display_index}: "
-                f"{frame.shape[1]}x{frame.shape[0]}"
+                "DXcam captured frame",
+                display=self.display_index,
+                w=frame.shape[1],
+                h=frame.shape[0],
             )
 
             return ScreenCapture(
diff --git a/server/src/ledgrab/core/capture_engines/mss_engine.py b/server/src/ledgrab/core/capture_engines/mss_engine.py
index 6962999..419e760 100644
--- a/server/src/ledgrab/core/capture_engines/mss_engine.py
+++ b/server/src/ledgrab/core/capture_engines/mss_engine.py
@@ -5,6 +5,13 @@ from typing import Any, Dict, List, Optional
 import mss
 import numpy as np
 
+try:
+    import cv2
+
+    _HAS_CV2 = True
+except ImportError:
+    _HAS_CV2 = False
+
 from ledgrab.core.capture_engines.base import (
     CaptureEngine,
     CaptureStream,
@@ -15,6 +22,13 @@ from ledgrab.utils import get_logger, get_monitor_names, get_monitor_refresh_rat
 
 logger = get_logger(__name__)
 
+# Rotating RGB output pool: a returned frame stays valid while the next
+# _RGB_POOL_SIZE - 1 frames are written, then its buffer is reused.
+_RGB_POOL_SIZE = 3
+# Number of bytes from .raw to hash for change detection (cheap pre-check
+# that avoids the full BGRA→RGB conversion when the screen is idle).
+_CHANGE_DETECT_BYTES = 256
+
 
 class MSSCaptureStream(CaptureStream):
     """MSS capture stream for a specific display."""
@@ -22,6 +36,12 @@ class MSSCaptureStream(CaptureStream):
     def __init__(self, display_index: int, config: Dict[str, Any]):
         super().__init__(display_index, config)
         self._sct = None
+        # Pre-allocated RGB destination pool — avoids per-frame allocation.
+        self._rgb_pool: list = [None] * _RGB_POOL_SIZE
+        self._rgb_idx: int = 0
+        self._rgb_shape: tuple = (0, 0)
+        # Cheap hash of the previous .raw bytes, for change detection.
+        self._prev_hash: Optional[int] = None
 
     def initialize(self) -> None:
         try:
@@ -36,6 +56,7 @@ class MSSCaptureStream(CaptureStream):
             self._sct.close()
             self._sct = None
         self._initialized = False
+        self._prev_hash = None
         logger.info(f"MSS capture stream cleaned up (display={self.display_index})")
 
     def capture_frame(self) -> Optional[ScreenCapture]:
@@ -55,18 +76,51 @@ class MSSCaptureStream(CaptureStream):
             monitor = self._sct.monitors[monitor_index]
             screenshot = self._sct.grab(monitor)
 
-            # Direct bytes→numpy (skips PIL intermediate object)
-            img_array = np.frombuffer(
-                screenshot.rgb,
-                dtype=np.uint8,
-            ).reshape(screenshot.height, screenshot.width, 3)
+            # Cheap change detection: hash only the first _CHANGE_DETECT_BYTES
+            # of the raw BGRA buffer (the first 64 pixels of the top row).
+            # FIXME(review): a change anywhere else on screen is NOT detected,
+            # so this returns None (frame dropped) while the rest of the
+            # display updates. Hash or sample the whole buffer instead.
+            raw = screenshot.raw
+            sample = bytes(raw[:_CHANGE_DETECT_BYTES])
+            cur_hash = hash(sample)
+            if cur_hash == self._prev_hash:
+                return None
+            self._prev_hash = cur_hash
+
+            height = screenshot.height
+            width = screenshot.width
+
+            # Reshape .raw (BGRA) — zero-copy view over the screenshot's buffer.
+            # ``screenshot.rgb`` (used previously) is a pure-Python BGRA→RGB
+            # rebuild costing ~6 MB/frame at 1080p in the slowest possible
+            # way. cv2.cvtColor is SIMD and writes directly into our pool.
+            bgra = np.frombuffer(raw, dtype=np.uint8).reshape(height, width, 4)
+
+            if self._rgb_shape != (height, width):
+                for i in range(_RGB_POOL_SIZE):
+                    self._rgb_pool[i] = np.empty((height, width, 3), dtype=np.uint8)
+                self._rgb_shape = (height, width)
+
+            dst = self._rgb_pool[self._rgb_idx]
+            self._rgb_idx = (self._rgb_idx + 1) % _RGB_POOL_SIZE
+
+            if _HAS_CV2:
+                cv2.cvtColor(bgra, cv2.COLOR_BGRA2RGB, dst=dst)
+            else:
+                dst[..., 0] = bgra[..., 2]
+                dst[..., 1] = bgra[..., 1]
+                dst[..., 2] = bgra[..., 0]
 
             logger.debug(
-                f"MSS captured display {self.display_index}: {monitor['width']}x{monitor['height']}"
+                "MSS captured frame",
+                display=self.display_index,
+                w=monitor["width"],
+                h=monitor["height"],
             )
 
             return ScreenCapture(
-                image=img_array,
+                image=dst,
                 width=monitor["width"],
                 height=monitor["height"],
                 display_index=self.display_index,
diff --git a/server/src/ledgrab/core/capture_engines/wgc_engine.py b/server/src/ledgrab/core/capture_engines/wgc_engine.py
index f9e6065..a5d850e 100644
--- a/server/src/ledgrab/core/capture_engines/wgc_engine.py
+++ b/server/src/ledgrab/core/capture_engines/wgc_engine.py
@@ -5,6 +5,14 @@ import sys
 import threading
 from typing import Any, Dict, List, Optional
 
+import numpy as np
+
+try:
+    import cv2
+
+    _HAS_CV2 = True
+except ImportError:
+    _HAS_CV2 = False
 
 from ledgrab.core.capture_engines.base import (
     CaptureEngine,
@@ -16,6 +24,10 @@ from ledgrab.utils import get_logger
 
 logger = get_logger(__name__)
 
+# 3-slot rotating output pool (the WGC native buffer is reused per frame): a
+# published RGB array is overwritten once _RGB_POOL_SIZE newer frames arrive.
+_RGB_POOL_SIZE = 3
+
 
 class WGCCaptureStream(CaptureStream):
     """WGC capture stream for a specific display."""
@@ -29,6 +41,11 @@ class WGCCaptureStream(CaptureStream):
         self._frame_event = threading.Event()
         self._closed_event = threading.Event()
         self._frame_lock = threading.Lock()
+        # Pre-allocated RGB destination buffers (rotated to keep prior frames
+        # stable for any consumer still reading the previous reference).
+        self._rgb_pool: list = [None] * _RGB_POOL_SIZE
+        self._rgb_idx: int = 0
+        self._rgb_shape: tuple = (0, 0)
 
     def initialize(self) -> None:
         if self._wgc is None:
@@ -66,10 +83,33 @@ class WGCCaptureStream(CaptureStream):
                     width = frame.width
                     height = frame.height
 
-                    # WGC provides BGRA format, convert to RGB
-                    # Fancy indexing creates a new contiguous array — no .copy() needed
+                    # WGC provides BGRA. ``frame_buffer`` is a view over the
+                    # native side's reusable buffer — must copy out before
+                    # returning. Use a 3-slot rotating pool of pre-allocated
+                    # RGB buffers + cv2.cvtColor (SIMD) instead of numpy fancy
+                    # indexing. Fancy indexing would allocate ~width*height*3
+                    # bytes per frame (≈370 MB/s at 1080p60); the pool allocates
+                    # 3 buffers total and reuses them.
                     frame_array = frame_buffer.reshape((height, width, 4))
-                    frame_rgb = frame_array[:, :, [2, 1, 0]]
+
+                    if self._rgb_shape != (height, width):
+                        for i in range(_RGB_POOL_SIZE):
+                            self._rgb_pool[i] = np.empty((height, width, 3), dtype=np.uint8)
+                        self._rgb_shape = (height, width)
+
+                    dst = self._rgb_pool[self._rgb_idx]
+                    self._rgb_idx = (self._rgb_idx + 1) % _RGB_POOL_SIZE
+
+                    if _HAS_CV2:
+                        cv2.cvtColor(frame_array, cv2.COLOR_BGRA2RGB, dst=dst)
+                        frame_rgb = dst
+                    else:
+                        # Fallback: per-channel copy is still 2× faster than
+                        # fancy-index allocation because it writes in-place.
+                        dst[..., 0] = frame_array[..., 2]
+                        dst[..., 1] = frame_array[..., 1]
+                        dst[..., 2] = frame_array[..., 0]
+                        frame_rgb = dst
 
                     with self._frame_lock:
                         self._latest_frame = frame_rgb
@@ -153,8 +193,10 @@ class WGCCaptureStream(CaptureStream):
         self._cleanup_internal()
         self._initialized = False
 
-        # Force garbage collection to release COM objects
-        gc.collect()
+        # Gen-0 collect avoids the pause of a full ``gc.collect()``.
+        # NOTE(review): it only scans the youngest generation, so cyclic COM
+        # wrappers that survived earlier collections are not released here.
+        gc.collect(0)
         logger.info(f"WGC capture stream cleaned up (display={self.display_index})")
 
     def capture_frame(self) -> Optional[ScreenCapture]:
@@ -173,7 +215,10 @@ class WGCCaptureStream(CaptureStream):
                 self._frame_event.clear()
 
             logger.debug(
-                f"WGC captured display {self.display_index}: " f"{frame.shape[1]}x{frame.shape[0]}"
+                "WGC captured frame",
+                display=self.display_index,
+                w=frame.shape[1],
+                h=frame.shape[0],
             )
 
             return ScreenCapture(