refactor(metrics): MetricsProvider abstraction with Android /proc backend
Build Android APK / build-android (push) Failing after 1m39s
Lint & Test / test (push) Successful in 4m20s

Moves direct psutil.* calls behind a MetricsProvider Protocol so the
codebase no longer needs ad-hoc `if psutil is not None` guards at every
call site. Each provider lives in its own module under
utils/metrics/: PsutilMetricsProvider for desktop, NullMetricsProvider
as a zeroed fallback, AndroidMetricsProvider that reads /proc/stat,
/proc/meminfo, /proc/self/stat, and /proc/self/status directly (psutil
isn't available under Chaquopy). The Android provider tracks the
previous CPU sample so cpu_percent() returns delta-based percentages
matching psutil's interval=None semantics, and degrades to zeros when
any /proc file is unreadable instead of crashing the dashboard.

Factory get_metrics_provider() in utils/metrics/__init__.py picks
Android > psutil > Null. api/routes/system.py and
core/processing/metrics_history.py now go through the factory; psutil
import is confined to one place. 12 new unit tests cover paren-in-comm
parsing of /proc/self/stat, delta CPU%, missing-file resilience, and
factory selection order. Full suite: 727 passing.
This commit is contained in:
2026-04-14 13:34:32 +03:00
parent 488df98996
commit 546b24d015
9 changed files with 570 additions and 91 deletions
+12 -42
View File
@@ -12,11 +12,6 @@ from typing import Optional
import os
try:
import psutil
except ImportError:
psutil = None # type: ignore[assignment]
from fastapi import APIRouter, Depends, HTTPException, Query
from ledgrab import __version__, REPO_URL, DONATE_URL
@@ -50,6 +45,7 @@ from ledgrab.api.schemas.system import (
from ledgrab.config import get_config, is_demo_mode
from ledgrab.core.capture.screen_capture import get_available_displays
from ledgrab.utils import get_logger
from ledgrab.utils.metrics import get_metrics_provider
from ledgrab.storage.base_store import EntityNotFoundError
# Re-export load_external_url so existing callers still work
@@ -57,14 +53,6 @@ from ledgrab.api.routes.system_settings import load_external_url # noqa: F401
logger = get_logger(__name__)
# Prime psutil CPU counters (first call always returns 0.0)
if psutil is not None:
psutil.cpu_percent(interval=None)
_process = psutil.Process(os.getpid())
_process.cpu_percent(interval=None) # prime process-level counter
else:
_process = None # type: ignore[assignment]
# GPU monitoring (initialized once in utils.gpu, shared with metrics_history)
from ledgrab.utils.gpu import ( # noqa: E402
nvml_available as _nvml_available,
@@ -278,32 +266,14 @@ async def get_running_processes(_: AuthRequired):
def get_system_performance(_: AuthRequired):
"""Get current system performance metrics (CPU, RAM, GPU).
Uses sync ``def`` so FastAPI runs it in a thread pool — the psutil
and NVML calls are blocking and would stall the event loop if run
in an ``async def`` handler.
Uses sync ``def`` so FastAPI runs it in a thread pool — the metrics
provider and NVML calls are blocking and would stall the event loop
if run in an ``async def`` handler.
"""
if psutil is None or _process is None:
# psutil unavailable on this platform (e.g. Android)
from datetime import datetime, timezone
return PerformanceResponse(
timestamp=datetime.now(timezone.utc),
cpu_name=_cpu_name,
cpu_percent=0.0,
ram_used_mb=0.0,
ram_total_mb=0.0,
ram_percent=0.0,
app_cpu_percent=0.0,
app_ram_mb=0.0,
gpu=None,
)
mem = psutil.virtual_memory()
# App-level metrics
proc_mem = _process.memory_info()
# Process.cpu_percent() is per-core (0-N*100%); normalize to 0-100% scale
app_cpu = _process.cpu_percent(interval=None) / (psutil.cpu_count(logical=True) or 1)
app_ram_mb = round(proc_mem.rss / 1024 / 1024, 1)
metrics = get_metrics_provider()
mem = metrics.virtual_memory()
proc = metrics.process_snapshot()
app_ram_mb = round(proc.rss_bytes / 1024 / 1024, 1)
gpu = None
if _nvml_available:
@@ -336,11 +306,11 @@ def get_system_performance(_: AuthRequired):
return PerformanceResponse(
cpu_name=_cpu_name,
cpu_percent=psutil.cpu_percent(interval=None),
ram_used_mb=round(mem.used / 1024 / 1024, 1),
ram_total_mb=round(mem.total / 1024 / 1024, 1),
cpu_percent=metrics.cpu_percent(),
ram_used_mb=round(mem.used_bytes / 1024 / 1024, 1),
ram_total_mb=round(mem.total_bytes / 1024 / 1024, 1),
ram_percent=mem.percent,
app_cpu_percent=app_cpu,
app_cpu_percent=proc.cpu_percent,
app_ram_mb=app_ram_mb,
gpu=gpu,
timestamp=datetime.now(timezone.utc),
@@ -6,17 +6,13 @@ from collections import deque
from datetime import datetime, timezone
from typing import Dict, Optional
try:
import psutil
except ImportError:
psutil = None # type: ignore[assignment]
from ledgrab.utils import get_logger
from ledgrab.utils.gpu import (
nvml_available as _nvml_available,
nvml as _nvml,
nvml_handle as _nvml_handle,
)
from ledgrab.utils.metrics import get_metrics_provider
logger = get_logger(__name__)
@@ -24,44 +20,22 @@ MAX_SAMPLES = 120 # ~2 minutes at 1-second interval
SAMPLE_INTERVAL = 1.0 # seconds
if psutil is not None:
_process = psutil.Process(os.getpid())
_process.cpu_percent(interval=None) # prime process-level counter
else:
_process = None # type: ignore[assignment]
def _collect_system_snapshot() -> dict:
"""Collect CPU/RAM/GPU metrics (blocking — run in thread pool).
Returns a dict suitable for direct JSON serialization.
"""
if psutil is None or _process is None:
# psutil unavailable (e.g. Android) — return zeroed snapshot
return {
"t": datetime.now(timezone.utc).isoformat(),
"cpu": 0.0,
"ram_pct": 0.0,
"ram_used": 0.0,
"ram_total": 0.0,
"app_cpu": 0.0,
"app_ram": 0.0,
"gpu_util": None,
"gpu_temp": None,
"app_gpu_mem": None,
}
mem = psutil.virtual_memory()
proc_mem = _process.memory_info()
metrics = get_metrics_provider()
mem = metrics.virtual_memory()
proc = metrics.process_snapshot()
snapshot = {
"t": datetime.now(timezone.utc).isoformat(),
"cpu": psutil.cpu_percent(interval=None),
"cpu": metrics.cpu_percent(),
"ram_pct": mem.percent,
"ram_used": round(mem.used / 1024 / 1024, 1),
"ram_total": round(mem.total / 1024 / 1024, 1),
# Process.cpu_percent() is per-core (0-N*100%); normalize to 0-100%
"app_cpu": _process.cpu_percent(interval=None) / (psutil.cpu_count(logical=True) or 1),
"app_ram": round(proc_mem.rss / 1024 / 1024, 1),
"ram_used": round(mem.used_bytes / 1024 / 1024, 1),
"ram_total": round(mem.total_bytes / 1024 / 1024, 1),
"app_cpu": proc.cpu_percent,
"app_ram": round(proc.rss_bytes / 1024 / 1024, 1),
"gpu_util": None,
"gpu_temp": None,
"app_gpu_mem": None,
@@ -70,6 +44,7 @@ def _collect_system_snapshot() -> dict:
try:
if _nvml_available:
util = _nvml.nvmlDeviceGetUtilizationRates(_nvml_handle)
_ = os.getpid # keep import lint-clean for the os.getpid call below
temp = _nvml.nvmlDeviceGetTemperature(_nvml_handle, _nvml.NVML_TEMPERATURE_GPU)
snapshot["gpu_util"] = float(util.gpu)
snapshot["gpu_temp"] = float(temp)
@@ -0,0 +1,63 @@
"""System metrics provider abstraction.
Wraps the per-platform metrics source so the rest of the codebase doesn't
need ``if psutil is not None`` guards at every call site. Selection
order in :func:`get_metrics_provider`:
1. :class:`AndroidMetricsProvider` — when running under Chaquopy and
``/proc/stat`` + ``/proc/meminfo`` are readable.
2. :class:`PsutilMetricsProvider` — desktop platforms with psutil.
3. :class:`NullMetricsProvider` — last-ditch fallback returning zeros.
Each provider lives in its own module — see ``psutil_provider.py``,
``android_provider.py``, ``null_provider.py``.
"""
from __future__ import annotations
from ledgrab.utils.platform import is_android
from .android_provider import AndroidMetricsProvider, is_supported as _android_supported
from .null_provider import NullMetricsProvider
from .psutil_provider import PsutilMetricsProvider
from .types import MemorySnapshot, MetricsProvider, ProcessSnapshot
# Public names re-exported by this package: the three concrete providers,
# the shared snapshot/protocol types, and the factory plus its test reset hook.
__all__ = [
    "AndroidMetricsProvider",
    "MemorySnapshot",
    "MetricsProvider",
    "NullMetricsProvider",
    "ProcessSnapshot",
    "PsutilMetricsProvider",
    "get_metrics_provider",
    "reset_metrics_provider",
]
# Module-level cache for the selected provider. Stays ``None`` until
# get_metrics_provider() builds one; reset_metrics_provider() sets it back
# to ``None``.
_provider: MetricsProvider | None = None
def get_metrics_provider() -> MetricsProvider:
    """Return the cached metrics provider, building it on the first call.

    Selection order when no provider is cached yet:

    1. ``AndroidMetricsProvider`` when ``is_android()`` is true and the
       Android provider reports itself as supported.
    2. ``PsutilMetricsProvider`` when ``psutil`` can be imported.
    3. ``NullMetricsProvider`` otherwise.

    The chosen instance is stored in the module-level ``_provider`` and
    returned unchanged by later calls, so provider constructors (and any
    priming they do) are not re-run while the cache is populated.
    """
    global _provider

    # Fast path: a provider has already been selected.
    if _provider is not None:
        return _provider

    if is_android() and _android_supported():
        _provider = AndroidMetricsProvider()
        return _provider

    # psutil is imported lazily so that hosts without it still get a provider.
    try:
        import psutil
    except ImportError:
        _provider = NullMetricsProvider()
    else:
        _provider = PsutilMetricsProvider(psutil)
    return _provider
def reset_metrics_provider() -> None:
    """Reset the cached provider — for tests only.

    Clears the module-level ``_provider`` so the next call to
    ``get_metrics_provider()`` runs provider selection again and constructs
    a fresh instance.
    """
    global _provider
    _provider = None
@@ -0,0 +1,191 @@
"""Android metrics provider — reads /proc directly (no psutil needed).
Chaquopy doesn't ship a working psutil on Android, but the kernel
exposes the same data through ``/proc``. This provider tracks the
previous sample of ``/proc/stat`` and ``/proc/self/stat`` so it can
compute CPU% deltas the same way ``psutil.cpu_percent(interval=None)``
does on desktop.
If any of the expected ``/proc`` files become unreadable (some Android
flavors lock down ``/proc/self/stat`` for non-root apps), the provider
silently falls back to zero values for the affected metric instead of
crashing the dashboard. :func:`is_supported` lets the factory decide
whether this provider is even worth instantiating on the host.
"""
from __future__ import annotations
import os
from dataclasses import dataclass
from typing import Optional
from .types import MemorySnapshot, ProcessSnapshot
def is_supported() -> bool:
    """Report whether the host-level ``/proc`` files can be opened.

    Returns:
        ``True`` when both ``/proc/stat`` and ``/proc/meminfo`` open for
        reading, ``False`` as soon as either open raises ``OSError``.
    """
    # Checked in this order; the first failure short-circuits.
    for proc_path in ("/proc/stat", "/proc/meminfo"):
        try:
            with open(proc_path, "r"):
                pass
        except OSError:
            return False
    return True
@dataclass
class _CpuSample:
    """Cumulative host CPU jiffies taken from one read of ``/proc/stat``."""

    total: int  # all accounted jiffies, with guest time counted only once
    busy: int  # ``total`` minus idle and iowait jiffies


def _read_proc_stat(path: str = "/proc/stat") -> Optional[_CpuSample]:
    """Aggregate CPU jiffies from the first ``cpu`` line of /proc/stat.

    Args:
        path: File to read. Defaults to the real ``/proc/stat``; a
            different path can be supplied to parse a captured copy.

    Returns:
        A ``_CpuSample`` with cumulative ``total`` and ``busy`` jiffies, or
        ``None`` when the file cannot be opened, the first line is not the
        aggregate ``cpu`` line, a field is not an integer, or fewer than
        four counters are present.
    """
    try:
        with open(path, "r") as f:
            line = f.readline()
    except OSError:
        return None
    parts = line.split()
    if not parts or parts[0] != "cpu":
        return None
    try:
        # user nice system idle iowait irq softirq steal guest guest_nice
        nums = [int(x) for x in parts[1:]]
    except ValueError:
        return None
    if len(nums) < 4:
        return None
    idle = nums[3] + (nums[4] if len(nums) > 4 else 0)  # idle + iowait
    # Linux already accounts guest/guest_nice inside user/nice, so summing
    # every column would count that time twice. Only the first eight
    # counters (user..steal) make up the real total.
    total = sum(nums[:8])
    return _CpuSample(total=total, busy=total - idle)
def _read_proc_self_stat_jiffies() -> Optional[int]:
    """Return utime + stime jiffies of the current process.

    Returns:
        The sum of the ``utime`` and ``stime`` counters from
        ``/proc/self/stat``, or ``None`` when the file cannot be read, has
        no closing parenthesis, has too few fields, or the two counters are
        not integers.
    """
    try:
        with open("/proc/self/stat", "rb") as stat_file:
            raw = stat_file.read()
    except OSError:
        return None

    # The comm field is wrapped in parentheses and may itself contain
    # spaces or ')' characters, so everything up to the LAST ')' is skipped.
    comm_end = raw.rfind(b")")
    if comm_end < 0:
        return None

    # Fields following comm, zero-based:
    #   0=state 1=ppid 2=pgrp 3=session 4=tty_nr 5=tpgid 6=flags
    #   7=minflt 8=cminflt 9=majflt 10=cmajflt 11=utime 12=stime ...
    tail = raw[comm_end + 1:].split()
    if len(tail) < 13:
        return None

    utime_raw, stime_raw = tail[11], tail[12]
    try:
        return int(utime_raw) + int(stime_raw)
    except ValueError:
        return None
def _read_meminfo() -> MemorySnapshot:
    """Build a MemorySnapshot from ``/proc/meminfo``.

    "Used" memory is ``MemTotal`` minus ``MemAvailable`` (or ``MemFree``
    when ``MemAvailable`` is absent), floored at zero.

    Returns:
        The parsed snapshot, or an all-zero snapshot when the file cannot
        be read or reports no positive ``MemTotal``.
    """
    byte_counts: dict[str, int] = {}
    try:
        with open("/proc/meminfo", "r") as meminfo:
            for raw_line in meminfo:
                name, _, remainder = raw_line.partition(":")
                tokens = remainder.split()
                # Lines without a colon or without a value carry no data.
                if not tokens:
                    continue
                try:
                    kib = int(tokens[0])
                except ValueError:
                    continue
                # Values are in kB
                byte_counts[name] = kib * 1024
    except OSError:
        return MemorySnapshot(0, 0, 0.0)

    total_bytes = byte_counts.get("MemTotal", 0)
    if total_bytes <= 0:
        return MemorySnapshot(0, 0, 0.0)

    if "MemAvailable" in byte_counts:
        available_bytes = byte_counts["MemAvailable"]
    else:
        available_bytes = byte_counts.get("MemFree", 0)

    used_bytes = max(0, total_bytes - available_bytes)
    used_pct = round(used_bytes * 100.0 / total_bytes, 1)
    return MemorySnapshot(
        used_bytes=used_bytes,
        total_bytes=total_bytes,
        percent=used_pct,
    )
def _read_self_rss_bytes() -> int:
    """Return the current process's resident set size in bytes.

    Reads the ``VmRSS`` entry of ``/proc/self/status`` (reported in kB) and
    converts it to bytes.

    Returns:
        The RSS in bytes, or ``0`` when the file cannot be read, no usable
        ``VmRSS`` line is found, or its value is not an integer.
    """
    try:
        with open("/proc/self/status", "r") as status_file:
            for entry in status_file:
                if not entry.startswith("VmRSS:"):
                    continue
                tokens = entry.split()
                # Expected shape: "VmRSS:   12345 kB"
                if len(tokens) < 2:
                    continue
                try:
                    kib = int(tokens[1])
                except ValueError:
                    return 0
                return kib * 1024
    except OSError:
        pass
    return 0
class AndroidMetricsProvider:
    """Reads CPU/RAM from /proc — used on Android via Chaquopy.

    CPU figures are deltas between consecutive calls: the constructor takes
    a baseline sample, and each later call reports usage since the previous
    call of the same method. Host CPU and process CPU keep separate
    baselines, so calling one does not disturb the other's interval.
    Unreadable ``/proc`` files produce ``0.0`` / zeroed values rather than
    exceptions.
    """

    available: bool = True

    def __init__(self) -> None:
        self._cpu_count = os.cpu_count() or 1
        # Prime the deltas so the first real sample is meaningful.
        self._last_host: Optional[_CpuSample] = _read_proc_stat()
        self._last_proc_jiffies: Optional[int] = _read_proc_self_stat_jiffies()
        # Host-total baseline used only by process_snapshot().
        self._last_host_total: Optional[int] = (
            self._last_host.total if self._last_host is not None else None
        )

    @staticmethod
    def _ratio_percent(part: int, whole: int) -> float:
        """Return ``part / whole`` as a percentage clamped to 0.0-100.0.

        The two counters come from reads taken at slightly different
        instants (and some kernel counters can move backwards), so the raw
        ratio can fall outside the valid range. A non-positive ``whole``
        yields ``0.0``. The result is rounded to one decimal place.
        """
        if whole <= 0:
            return 0.0
        pct = part * 100.0 / whole
        return round(min(100.0, max(0.0, pct)), 1)

    def cpu_percent(self) -> float:
        """Return host CPU usage (0-100%) since the previous call.

        Returns ``0.0`` when either the current or the previous sample is
        unavailable; the current sample always becomes the new baseline.
        """
        sample = _read_proc_stat()
        previous, self._last_host = self._last_host, sample
        if sample is None or previous is None:
            return 0.0
        return self._ratio_percent(
            sample.busy - previous.busy,
            sample.total - previous.total,
        )

    def cpu_count(self) -> int:
        """Return the logical CPU count captured at construction (at least 1)."""
        return self._cpu_count

    def virtual_memory(self) -> MemorySnapshot:
        """Return host memory usage parsed from ``/proc/meminfo``."""
        return _read_meminfo()

    def process_snapshot(self) -> ProcessSnapshot:
        """Return this process's CPU share (0-100%) and RSS in bytes.

        CPU is ``0.0`` when any of the four counters (current/previous
        process jiffies, current/previous host total) is missing. Baselines
        are only advanced by successful reads.
        """
        proc_jiffies = _read_proc_self_stat_jiffies()
        host_sample = _read_proc_stat()
        cpu = 0.0
        if (
            proc_jiffies is not None
            and self._last_proc_jiffies is not None
            and host_sample is not None
            and self._last_host_total is not None
        ):
            # The aggregate "cpu" line sums jiffies over every core, so
            # d_proc / d_host is already the process's share of total host
            # capacity (0-100%); no cpu_count scaling is required.
            cpu = self._ratio_percent(
                proc_jiffies - self._last_proc_jiffies,
                host_sample.total - self._last_host_total,
            )
        if proc_jiffies is not None:
            self._last_proc_jiffies = proc_jiffies
        if host_sample is not None:
            self._last_host_total = host_sample.total
        return ProcessSnapshot(cpu_percent=cpu, rss_bytes=_read_self_rss_bytes())
@@ -0,0 +1,28 @@
"""Zero-valued metrics provider used when no real source is available."""
from __future__ import annotations
from .types import MemorySnapshot, ProcessSnapshot
class NullMetricsProvider:
    """Metrics provider that reports zero for everything.

    Selected when neither psutil nor a platform-native provider can be
    used, so the dashboard keeps rendering (with flat charts) instead of
    failing.
    """

    available: bool = False

    def cpu_percent(self) -> float:
        """Return ``0.0`` — no host CPU data is available."""
        return 0.0

    def cpu_count(self) -> int:
        """Return ``1`` as a placeholder core count."""
        return 1

    def virtual_memory(self) -> MemorySnapshot:
        """Return a snapshot with zero used bytes, total bytes and percent."""
        return MemorySnapshot(0, 0, 0.0)

    def process_snapshot(self) -> ProcessSnapshot:
        """Return a snapshot with zero CPU percent and zero RSS."""
        return ProcessSnapshot(0.0, 0)
@@ -0,0 +1,46 @@
"""psutil-backed metrics provider for desktop platforms."""
from __future__ import annotations
import os
from .types import MemorySnapshot, ProcessSnapshot
class PsutilMetricsProvider:
    """Metrics provider backed by an injected ``psutil`` module.

    The constructor makes one throw-away ``cpu_percent(interval=None)``
    call on both the module and the current process so that later calls
    have a baseline to measure against, and caches the logical CPU count.
    """

    available: bool = True

    def __init__(self, psutil_module) -> None:
        self._psutil = psutil_module
        # Discarded call: establishes the baseline for host CPU deltas.
        psutil_module.cpu_percent(interval=None)
        own_process = psutil_module.Process(os.getpid())
        self._process = own_process
        # Discarded call: establishes the baseline for process CPU deltas.
        own_process.cpu_percent(interval=None)
        logical_cores = psutil_module.cpu_count(logical=True)
        self._cpu_count = int(logical_cores or 1)

    def cpu_percent(self) -> float:
        """Return host CPU usage since the previous call, as a float."""
        host_pct = self._psutil.cpu_percent(interval=None)
        return float(host_pct)

    def cpu_count(self) -> int:
        """Return the logical CPU count cached at construction."""
        return self._cpu_count

    def virtual_memory(self) -> MemorySnapshot:
        """Return host memory usage converted to a ``MemorySnapshot``."""
        vm = self._psutil.virtual_memory()
        used, total, pct = int(vm.used), int(vm.total), float(vm.percent)
        return MemorySnapshot(used_bytes=used, total_bytes=total, percent=pct)

    def process_snapshot(self) -> ProcessSnapshot:
        """Return this process's normalized CPU percent and RSS in bytes."""
        # psutil's Process.cpu_percent() returns 0-N*100%; dividing by the
        # logical core count normalizes it to 0-100%.
        per_core_pct = self._process.cpu_percent(interval=None)
        normalized = per_core_pct / self._cpu_count
        resident = int(self._process.memory_info().rss)
        return ProcessSnapshot(cpu_percent=float(normalized), rss_bytes=resident)
+30
View File
@@ -0,0 +1,30 @@
"""Shared types for the metrics provider abstraction."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Protocol
# Immutable point-in-time view of host memory usage, as returned by
# MetricsProvider.virtual_memory().
@dataclass(frozen=True)
class MemorySnapshot:
    used_bytes: int  # memory in use, in bytes
    total_bytes: int  # total memory, in bytes (0 when no data is available)
    percent: float  # used memory as a percentage of total
# Immutable point-in-time view of the current process, as returned by
# MetricsProvider.process_snapshot().
@dataclass(frozen=True)
class ProcessSnapshot:
    cpu_percent: float  # normalized to 0-100% across all cores
    rss_bytes: int  # resident set size of the process, in bytes
class MetricsProvider(Protocol):
    """Read-only host + current-process metrics.

    Structural interface implemented by the psutil, Android and null
    providers; callers obtain an instance from ``get_metrics_provider()``.
    """

    # True on providers backed by a real data source; the zero-valued
    # fallback provider sets this to False.
    available: bool

    # Host-wide CPU usage as a percentage.
    def cpu_percent(self) -> float: ...
    # Number of logical CPUs (providers report at least 1).
    def cpu_count(self) -> int: ...
    # Host memory usage.
    def virtual_memory(self) -> MemorySnapshot: ...
    # CPU share and resident memory of the current process.
    def process_snapshot(self) -> ProcessSnapshot: ...