fix: production-readiness hardening — security, perf, a11y, observability

Security
- Default scripts_management, callbacks_management, links_management, and media_folders_management to False so a leaked token cannot escalate to RCE through admin CRUD endpoints.
- TokenSpec + scope hierarchy (read | control | admin); legacy bare-string api_tokens entries promote to admin for back-compat. Management endpoints now require admin scope.
- WebSocket subprotocol auth (Sec-WebSocket-Protocol: media-server.token.<T>) preferred over ?token= query so the token no longer lands in URL/history/Referer; query fallback retained for HA integration back-compat.
- Origin allow-list check on the WS endpoint (CSWSH defence).
- In-process token-bucket rate limiter: 5/min for failed auths, 10/min for /api/scripts/execute and /api/callbacks/execute.
- shell=False subprocess path (shlex.split) + per-parameter regex `pattern` in ScriptParameterConfig to harden shell=true scripts against parameter injection (Windows cmd.exe env-var expansion).
- CSP gains form-action, worker-src, manifest-src directives.
- Refuse cors_origins=["*"] at startup; strip token=... from uvicorn access logs; validate Gitea release tag against strict SemVer regex.
- noopener noreferrer + no-referrer referrerpolicy on every outbound link.
- icacls hardening of config.yaml on Windows (current user + SYSTEM + Administrators only); 0600 still enforced on POSIX.
- WS volume handler clamps input and never drops the socket on bad messages.

Performance
- Album-art read in windows_media gated by track key — was decoding the WinRT thumbnail twice per second regardless of track changes.
- /api/media/artwork returns content-derived ETag + Cache-Control so the browser sends If-None-Match and gets 304s on track repeats.
- Foreground-service ctypes argtypes hoisted to one-time module init (was re-declaring ~14 prototypes per probe).
- display_service _static_cache keyed by (edid_hash, ...) tuple with eviction of disappeared monitors — fixes stale capabilities on hot-plug swaps where the new topology has the same monitor count.
- Visualizer rAF loop paused on document.hidden, resumed on visible.

Reliability / bug fixes
- Lifespan rewritten as try/yield/finally so a partial-startup failure cannot orphan background tasks or executors.
- _run_callback in routes/media.py keeps a strong task ref (GC-safe) and uses the dedicated callback executor instead of the default pool.
- macos_media.set_volume() no longer always returns True.
- TrayManager._restart_requested initialised in __init__; set before signalling exit so the main thread observes it correctly.
- Missing static_dir now logs a WARNING instead of silent UI disable.

UX / accessibility / PWA
- manifest.json theme_color and background_color match the Studio Reference base (#0E0D0B); added id and scope for PWA installability.
- ARIA on mini-player icon buttons; inner SVGs marked aria-hidden.
- OS mediaSession API wired so headset / lockscreen / Bluetooth buttons drive play/pause/next/prev/seek and show track metadata + artwork.

Observability
- X-Request-ID middleware (accept upstream id if it matches a safe regex, otherwise UUID4); request_id_var added to ContextVars and included in every log line alongside the token label.
- Audit log (append-only JSONL) for every script + callback execution, including the on_play/on_pause/etc. event callbacks. Background-thread writer; queue capped; flushed in lifespan teardown.

Deployment
- proxy_headers + forwarded_allow_ips plumbed through Settings → uvicorn.Config for reverse-proxy installs.
- HTTPS support via ssl_certfile + ssl_keyfile (+ optional password); startup refuses to launch with only one of the pair set.
- Thumbnail cache moved from project-root .cache to %LOCALAPPDATA%/media-server/cache (Windows) and $XDG_CACHE_HOME/media-server/thumbnails (POSIX).

Tests
- 35 new tests across auth scopes, rate limiter, browser path traversal (../, NUL, UNC, absolute), script-param validation incl. regex, Gitea tag whitelist, config atomic write + POSIX perms. 47 passed / 4 skipped.
2026-05-22 22:25:54 +03:00
parent 450f9fe1ee
commit d131ba461c
31 changed files with 1586 additions and 204 deletions
@@ -36,12 +36,20 @@ def _spawn_background(coro) -> asyncio.Task:


 def _require_folder_management() -> None:
-    """Raise 403 if media folder management is disabled in config."""
+    """Raise 403 if media folder management is disabled OR caller lacks admin scope."""
    if not settings.media_folders_management:
        raise HTTPException(
            status_code=403,
            detail="Media folder management is disabled. Set media_folders_management: true in config.yaml to enable.",
        )
+    from ..auth import auth_enabled, token_has_scope, token_label_var
+    if auth_enabled():
+        label = token_label_var.get("unknown")
+        if not token_has_scope(label, "admin"):
+            raise HTTPException(
+                status_code=403,
+                detail=f"Token '{label}' lacks required scope: admin",
+            )


 async def _broadcast_after_open(controller, label: str, max_wait: float = 2.0) -> None:
@@ -8,12 +8,14 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any

-from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, Request, status
 from pydantic import BaseModel, Field

 from ..auth import verify_token
 from ..config import CallbackConfig, settings
 from ..config_manager import config_manager
+from ..services.rate_limit import check as ratelimit_check
+from ..services.rate_limit import get_peer

 router = APIRouter(prefix="/api/callbacks", tags=["callbacks"])
 logger = logging.getLogger(__name__)
@@ -28,6 +30,7 @@ def shutdown_callback_executor() -> None:


 def _require_callbacks_management() -> None:
+    """Authorise a callbacks-CRUD operation. Operator flag + per-token admin scope."""
    if not settings.callbacks_management:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
@@ -36,6 +39,14 @@ def _require_callbacks_management() -> None:
                " in config.yaml to enable."
            ),
        )
+    from ..auth import auth_enabled, token_has_scope, token_label_var
+    if auth_enabled():
+        label = token_label_var.get("unknown")
+        if not token_has_scope(label, "admin"):
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail=f"Token '{label}' lacks required scope: admin",
+            )


 class CallbackInfo(BaseModel):
@@ -122,6 +133,7 @@ async def list_callbacks(_: str = Depends(verify_token)) -> list[CallbackInfo]:
@router.post("/execute/{callback_name}")
 async def execute_callback(
    callback_name: str,
+    http_request: Request,
    _: str = Depends(verify_token),
 ) -> CallbackExecuteResponse:
    """Execute a callback for debugging purposes.
@@ -132,6 +144,16 @@ async def execute_callback(
    Returns:
        Execution result including stdout, stderr, and exit code
    """
+    # Rate-limit callback execution per peer (10/min) — callbacks also run
+    # subprocesses and need the same protection as scripts.
+    allowed, retry_after = ratelimit_check("execute", get_peer(http_request))
+    if not allowed:
+        raise HTTPException(
+            status_code=429,
+            detail="Too many callback executions, slow down",
+            headers={"Retry-After": str(int(retry_after or 60))},
+        )
+
    # Validate callback name
    _validate_callback_name(callback_name)

@@ -146,6 +168,8 @@ async def execute_callback(

    logger.info(f"Executing callback for debugging: {callback_name}")

+    from ..services.audit_log import record_script_execution
+
    try:
        # Execute in dedicated thread pool to not block the default executor
        loop = asyncio.get_running_loop()
@@ -159,6 +183,15 @@ async def execute_callback(
            ),
        )

+        record_script_execution(
+            kind="callback",
+            name=callback_name,
+            exit_code=result["exit_code"],
+            duration=result.get("execution_time"),
+            stdout=result.get("stdout"),
+            stderr=result.get("stderr"),
+        )
+
        return CallbackExecuteResponse(
            success=result["exit_code"] == 0,
            callback=callback_name,
@@ -170,6 +203,13 @@ async def execute_callback(

    except Exception as e:
        logger.error(f"Callback execution error: {e}")
+        record_script_execution(
+            kind="callback",
+            name=callback_name,
+            exit_code=None,
+            duration=None,
+            error=str(e),
+        )
        return CallbackExecuteResponse(
            success=False,
            callback=callback_name,
@@ -39,11 +39,20 @@ def _validate_icon(icon: str) -> str:


 def _require_links_management() -> None:
+    """Authorise a links-CRUD operation. Operator flag + per-token admin scope."""
    if not settings.links_management:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Links management is disabled. Set links_management: true in config.yaml to enable.",
        )
+    from ..auth import auth_enabled, token_has_scope, token_label_var
+    if auth_enabled():
+        label = token_label_var.get("unknown")
+        if not token_has_scope(label, "admin"):
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail=f"Token '{label}' lacks required scope: admin",
+            )


 class LinkInfo(BaseModel):
@@ -3,7 +3,16 @@
 import asyncio
 import logging

-from fastapi import APIRouter, Depends, HTTPException, Query, WebSocket, WebSocketDisconnect, status
+from fastapi import (
+    APIRouter,
+    Depends,
+    HTTPException,
+    Query,
+    Request,
+    WebSocket,
+    WebSocketDisconnect,
+    status,
+)
 from fastapi.responses import Response

 from ..auth import verify_token, verify_token_or_query
@@ -17,19 +26,28 @@ logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/media", tags=["media"])


+# Strong refs to background tasks so the asyncio GC can't drop them before
+# they run. Mirrors the pattern used in routes/browser.py.
+_background_callback_tasks: set[asyncio.Task] = set()
+
+
 def _run_callback(callback_name: str) -> None:
    """Fire-and-forget a callback if configured. Failures are logged but don't block."""
    if not settings.callbacks or callback_name not in settings.callbacks:
        return

    async def _execute():
+        # Use the dedicated callback executor (not the default loop pool) so a
+        # misbehaving callback can't starve the rest of the app's sync tasks.
+        from ..services.audit_log import record_script_execution
+        from .callbacks import _callback_executor
        from .scripts import _run_script

        try:
            callback = settings.callbacks[callback_name]
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
-                None,
+                _callback_executor,
                lambda: _run_script(
                    command=callback.command,
                    timeout=callback.timeout,
@@ -37,6 +55,14 @@ def _run_callback(callback_name: str) -> None:
                    working_dir=callback.working_dir,
                ),
            )
+            record_script_execution(
+                kind="event-callback",
+                name=callback_name,
+                exit_code=result["exit_code"],
+                duration=result.get("execution_time"),
+                stdout=result.get("stdout"),
+                stderr=result.get("stderr"),
+            )
            if result["exit_code"] != 0:
                logger.warning(
                    "Callback %s failed with exit code %s: %s",
@@ -46,8 +72,18 @@ def _run_callback(callback_name: str) -> None:
                )
        except Exception as e:
            logger.error("Callback %s error: %s", callback_name, e)
+            from ..services.audit_log import record_script_execution as _rec
+            _rec(
+                kind="event-callback",
+                name=callback_name,
+                exit_code=None,
+                duration=None,
+                error=str(e),
+            )

-    asyncio.create_task(_execute())
+    task = asyncio.create_task(_execute())
+    _background_callback_tasks.add(task)
+    task.add_done_callback(_background_callback_tasks.discard)


@router.get("/status", response_model=MediaStatus)
@@ -242,11 +278,14 @@ async def toggle(_: str = Depends(verify_token)) -> dict:


@router.get("/artwork")
-async def get_artwork(_: str = Depends(verify_token_or_query)) -> Response:
+async def get_artwork(
+    request: Request,
+    _: str = Depends(verify_token_or_query),
+) -> Response:
    """Get the current album artwork.

-    Returns:
-        The album art image as PNG/JPEG
+    Returns the bytes with a content-derived ETag so the browser can serve a
+    304 when the same track is re-requested.
    """
    art_bytes = get_current_album_art()
    if art_bytes is None:
@@ -255,16 +294,34 @@ async def get_artwork(_: str = Depends(verify_token_or_query)) -> Response:
            detail="No album artwork available",
        )

-    # Try to detect image type from magic bytes
-    content_type = "image/png"  # Default
+    # Detect image type from magic bytes
    if art_bytes[:3] == b"\xff\xd8\xff":
        content_type = "image/jpeg"
    elif art_bytes[:8] == b"\x89PNG\r\n\x1a\n":
        content_type = "image/png"
-    elif art_bytes[:4] == b"RIFF" and art_bytes[8:12] == b"WEBP":
+    elif art_bytes[:4] == b"RIFF" and len(art_bytes) > 12 and art_bytes[8:12] == b"WEBP":
        content_type = "image/webp"
+    elif art_bytes[:2] == b"BM":
+        content_type = "image/bmp"
+    else:
+        content_type = "application/octet-stream"

-    return Response(content=art_bytes, media_type=content_type)
+    # Content-derived ETag (blake2b-128 — non-crypto cache key, ruff S324-safe)
+    import hashlib
+
+    etag = '"' + hashlib.blake2b(art_bytes, digest_size=16).hexdigest() + '"'
+
+    if request.headers.get("if-none-match") == etag:
+        return Response(status_code=status.HTTP_304_NOT_MODIFIED, headers={"ETag": etag})
+
+    return Response(
+        content=art_bytes,
+        media_type=content_type,
+        headers={
+            "ETag": etag,
+            "Cache-Control": "private, max-age=0, must-revalidate",
+        },
+    )


@router.get("/visualizer/status")
@@ -323,12 +380,17 @@ async def set_visualizer_device(
@router.websocket("/ws")
 async def websocket_endpoint(
    websocket: WebSocket,
-    token: str | None = Query(None, description="API authentication token"),
+    token: str | None = Query(None, description="API authentication token (legacy)"),
 ) -> None:
    """WebSocket endpoint for real-time media status updates.

-    Authentication is done via query parameter since WebSocket
-    doesn't support custom headers in the browser.
+    Authentication is accepted from two sources, in priority order:
+      1. ``Sec-WebSocket-Protocol`` subprotocol of the form
+         ``media-server.token.<TOKEN>``. This is the preferred path because
+         the token never lands in the URL, request logs, or browser history.
+         The browser WebSocket API supports custom subprotocols natively.
+      2. ``?token=<TOKEN>`` query parameter (legacy, kept for back-compat
+         with older clients and the HA integration).

    Messages sent to client:
    - {"type": "status", "data": {...}} - Initial status on connect
@@ -339,11 +401,40 @@ async def websocket_endpoint(
    - {"type": "ping"} - Keepalive, server responds with {"type": "pong"}
    - {"type": "get_status"} - Request current status
    """
+    # Pull token from subprotocol if present. WebSocket spec lets either side
+    # negotiate exactly one subprotocol back; we accept the token one and
+    # answer with the same string so browsers consider the negotiation
+    # successful.
+    subprotocol_token: str | None = None
+    accept_subprotocol: str | None = None
+    raw_protocols = websocket.headers.get("sec-websocket-protocol", "")
+    for proto in (p.strip() for p in raw_protocols.split(",") if p.strip()):
+        if proto.startswith("media-server.token."):
+            subprotocol_token = proto[len("media-server.token."):]
+            accept_subprotocol = proto
+            break
+    effective_token = subprotocol_token or token
+    # Origin check — block CSWSH from third-party LAN pages. We accept the same
+    # set of origins as CORS plus the default localhost loopback.
+    allowed_origins = set(
+        settings.cors_origins
+        or [
+            f"http://localhost:{settings.port}",
+            f"http://127.0.0.1:{settings.port}",
+        ]
+    )
+    origin = websocket.headers.get("origin")
+    # Same-origin connections from native apps may omit Origin entirely; only
+    # reject when an Origin is present AND not in the allow-list.
+    if origin is not None and origin not in allowed_origins:
+        await websocket.close(code=4003, reason="Origin not allowed")
+        return
+
    # Verify token
    from ..auth import auth_enabled, get_token_label, token_label_var

    if auth_enabled():
-        label = get_token_label(token) if token else None
+        label = get_token_label(effective_token) if effective_token else None
        if label is None:
            await websocket.close(code=4001, reason="Invalid authentication token")
            return
@@ -351,16 +442,25 @@ async def websocket_endpoint(
    else:
        token_label_var.set("anonymous")

-    await ws_manager.connect(websocket)
+    # Accept with the negotiated subprotocol if one was used. Starlette's
+    # connect() calls accept() with no subprotocol — we need to accept first
+    # explicitly to echo the subprotocol back, then hand off to the manager.
+    if accept_subprotocol is not None:
+        await websocket.accept(subprotocol=accept_subprotocol)
+        await ws_manager.connect(websocket, already_accepted=True)
+    else:
+        await ws_manager.connect(websocket)

    try:
        while True:
            # Wait for messages from client (for keepalive/ping)
            data = await websocket.receive_json()

-            if data.get("type") == "ping":
+            msg_type = data.get("type") if isinstance(data, dict) else None
+
+            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})
-            elif data.get("type") == "get_status":
+            elif msg_type == "get_status":
                # Allow manual status request
                controller = get_media_controller()
                status_data = await controller.get_status()
@@ -368,15 +468,20 @@ async def websocket_endpoint(
                    "type": "status",
                    "data": status_data.model_dump(),
                })
-            elif data.get("type") == "volume":
-                # Low-latency volume control via WebSocket
-                volume = data.get("volume")
-                if volume is not None:
-                    controller = get_media_controller()
-                    await controller.set_volume(int(volume))
-            elif data.get("type") == "enable_visualizer":
+            elif msg_type == "volume":
+                # Low-latency volume control via WebSocket. Coerce, clamp, and
+                # never drop the socket on a single bad message — that would
+                # turn the WS into a one-shot DoS for any holder of a token.
+                try:
+                    volume = int(data.get("volume"))
+                except (TypeError, ValueError):
+                    continue
+                volume = max(0, min(100, volume))
+                controller = get_media_controller()
+                await controller.set_volume(volume)
+            elif msg_type == "enable_visualizer":
                await ws_manager.subscribe_visualizer(websocket)
-            elif data.get("type") == "disable_visualizer":
+            elif msg_type == "disable_visualizer":
                await ws_manager.unsubscribe_visualizer(websocket)

    except WebSocketDisconnect:
@@ -10,12 +10,14 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any

-from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, Request, status
 from pydantic import BaseModel, Field

 from ..auth import verify_token
 from ..config import ScriptConfig, ScriptParameterConfig, settings
 from ..config_manager import config_manager
+from ..services.rate_limit import check as ratelimit_check
+from ..services.rate_limit import get_peer
 from ..services.websocket_manager import ws_manager

 router = APIRouter(prefix="/api/scripts", tags=["scripts"])
@@ -31,6 +33,12 @@ def shutdown_script_executor() -> None:


 def _require_scripts_management() -> None:
+    """Authorise a scripts-CRUD operation.
+
+    Two gates: the operator-level `scripts_management` flag in config.yaml,
+    AND the per-token `admin` scope check (read from request-context). Either
+    failure → 403.
+    """
    if not settings.scripts_management:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
@@ -39,6 +47,14 @@ def _require_scripts_management() -> None:
                " in config.yaml to enable."
            ),
        )
+    from ..auth import auth_enabled, token_has_scope, token_label_var
+    if auth_enabled():
+        label = token_label_var.get("unknown")
+        if not token_has_scope(label, "admin"):
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail=f"Token '{label}' lacks required scope: admin",
+            )


 class ScriptExecuteRequest(BaseModel):
@@ -215,6 +231,28 @@ def _validate_params(
            # string — just convert to str
            value = str(value)

+        # Optional regex constraint, validated against the *string form* of the
+        # value. This is the only practical defence for string parameters that
+        # flow into shell=true scripts via env vars (Windows cmd.exe expands
+        # `%VAR%` after argument parsing, so embedded `&`/`|`/`%` would inject
+        # commands). Authors of shell scripts should ALWAYS define a pattern.
+        if pdef.pattern:
+            try:
+                if not re.fullmatch(pdef.pattern, str(value)):
+                    raise HTTPException(
+                        status_code=status.HTTP_400_BAD_REQUEST,
+                        detail=(
+                            f"Parameter '{pname}' value {value!r} does not match"
+                            f" required pattern: {pdef.pattern}"
+                        ),
+                    )
+            except re.error as e:
+                # Bad pattern in config — fail closed.
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail=f"Parameter '{pname}' has invalid pattern: {e}",
+                ) from e
+
        env_vars[f"SCRIPT_PARAM_{pname.upper()}"] = str(value)

    return env_vars
@@ -223,6 +261,7 @@ def _validate_params(
@router.post("/execute/{script_name}")
 async def execute_script(
    script_name: str,
+    http_request: Request,
    request: ScriptExecuteRequest | None = None,
    _: str = Depends(verify_token),
 ) -> ScriptExecuteResponse:
@@ -235,6 +274,16 @@ async def execute_script(
    Returns:
        Execution result including stdout, stderr, and exit code
    """
+    # Rate-limit script execution per peer so a leaked token can't be used to
+    # spam the shell-exec endpoint.
+    allowed, retry_after = ratelimit_check("execute", get_peer(http_request))
+    if not allowed:
+        raise HTTPException(
+            status_code=429,
+            detail="Too many script executions, slow down",
+            headers={"Retry-After": str(int(retry_after or 60))},
+        )
+
    # Check if script exists
    if script_name not in settings.scripts:
        raise HTTPException(
@@ -249,6 +298,8 @@ async def execute_script(

    logger.info(f"Executing script: {script_name}")

+    from ..services.audit_log import record_script_execution
+
    try:
        # Execute in dedicated thread pool to not block the default executor
        loop = asyncio.get_running_loop()
@@ -263,6 +314,15 @@ async def execute_script(
            ),
        )

+        record_script_execution(
+            kind="script",
+            name=script_name,
+            exit_code=result["exit_code"],
+            duration=result.get("execution_time"),
+            stdout=result.get("stdout"),
+            stderr=result.get("stderr"),
+        )
+
        return ScriptExecuteResponse(
            success=result["exit_code"] == 0,
            script=script_name,
@@ -274,6 +334,13 @@ async def execute_script(

    except Exception as e:
        logger.error(f"Script execution error: {e}")
+        record_script_execution(
+            kind="script",
+            name=script_name,
+            exit_code=None,
+            duration=None,
+            error=str(e),
+        )
        return ScriptExecuteResponse(
            success=False,
            script=script_name,
@@ -313,9 +380,21 @@ def _run_script(
    else:
        popen_kwargs["start_new_session"] = True

+    # When shell=False, the user-provided command string is split via shlex
+    # (POSIX rules — also works for Windows args without backslashes). This
+    # disables shell metacharacter expansion entirely, so SCRIPT_PARAM_* env
+    # vars referenced as $FOO / %FOO% will be treated as literal text by the
+    # process, not interpreted by a shell. Use shell=false for any script
+    # whose params come from external input.
+    if shell:
+        run_command: str | list[str] = command
+    else:
+        import shlex
+        run_command = shlex.split(command, posix=(sys.platform != "win32"))
+
    try:
        result = subprocess.run(
-            command,
+            run_command,
            shell=shell,
            cwd=working_dir,
            capture_output=True,