fix(security,perf): harden restore, CSRF, token_version + perf pass

Security
- Sign pending_restore.json (SHA256 stored in AppSetting, verified on
  startup apply) + refuse path outside data_dir, tighten to 0600.
- Require same-origin Origin/Referer on POST /api/backup/apply-restart —
  Bearer-in-localStorage is CSRF-reachable from any XSS'd admin tab.
- Bump token_version on role/username change and admin password reset so
  demoted admins lose admin in already-issued JWTs.  Guard last-admin
  TOCTOU via COUNT + post-commit re-check that rolls back a race.
- SSRF guard (validate_outbound_url) in ImmichClient.__init__ and the
  external_domain setter — admin-mutable URLs were bypassing the check
  that webhook/slack/discord paths already used.  Dev restart script now
  sets NOTIFY_BRIDGE_ALLOW_PRIVATE_URLS=1 so homelab Immich still works.
- Redact + cap Immich error bodies to ~120 chars before they flow into
  ActionExecution.error / EventLog.details (both UI-visible).
- Deny-list sensitive keys (api_key / token / secret / password /
  authorization / cookie / ...) in template-context merges so a rogue
  template can't exfiltrate provider creds via {{ api_key }}.
- Cap user-controlled Immich search params (query ≤256, person_ids ≤50,
  size ≤100) so a Telegram listener can't DoS upstream.
- Stream upload reads with running byte counter + content-length precheck
  instead of buffering the full body and then rejecting.
- Log Telegram parse_mode fallbacks instead of swallowing silently;
  template escape bugs now surface in server logs.
- Rollback partial imports on pending-restore failure (error recorded on
  a fresh session).

Performance
- Fix N+1 in _refresh_telegram_chat_titles: single IN query instead of
  session.get per chat.
- Parallelize album + shared-link fetches in test_dispatch (asyncio.gather)
  and per-receiver Telegram test sends in notifier (semaphore 5).
- Early-exit collect_scheduled_assets(limit=0) so the periodic-summary
  test path skips full per-album filter/sample (was O(album_assets)).
- Emit explicit CREATE INDEX IF NOT EXISTS for event_log user_id /
  action_id / provider_id so the first boot after upgrade isn't left
  unindexed for the dashboard query.
- Add AbortController timeout (120s) to fetchAuth so uploads/downloads
  don't hang indefinitely.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-22 02:28:55 +03:00
parent fe92b206b7
commit 56993d2ca3
13 changed files with 530 additions and 100 deletions
@@ -1,13 +1,15 @@
"""Configuration backup/restore API (admin only)."""
import asyncio
import hashlib
import json
import logging
import os
import signal
from datetime import datetime, timezone
from urllib.parse import urlparse
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile, File, Query
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, UploadFile, File, Query
from fastapi.responses import JSONResponse
from sqlmodel.ext.asyncio.session import AsyncSession
@@ -28,6 +30,11 @@ PENDING_RESTORE_PATH_KEY = "pending_restore_path"
PENDING_RESTORE_CONFLICT_KEY = "pending_restore_conflict_mode"
PENDING_RESTORE_UPLOADED_AT_KEY = "pending_restore_uploaded_at"
PENDING_RESTORE_UPLOADED_BY_KEY = "pending_restore_uploaded_by"
# SHA256 of the staged pending_restore.json, written atomically with the file.
# The startup hook refuses to apply if the on-disk file's hash does not match —
# defends against anyone dropping a tampered file into data/ between prepare
# and restart.
PENDING_RESTORE_SHA256_KEY = "pending_restore_sha256"
def _pending_restore_path():
@@ -44,6 +51,69 @@ router = APIRouter(prefix="/api/backup", tags=["backup"])
MAX_UPLOAD_SIZE = 10 * 1024 * 1024 # 10 MB
async def _read_upload_bounded(file: UploadFile, max_bytes: int = MAX_UPLOAD_SIZE) -> bytes:
"""Read an UploadFile into memory, failing fast if it exceeds ``max_bytes``.
Rejects on ``content_length`` header up-front when available; always
stream-reads with a running byte counter so we never allocate more than
the limit even when the header is missing or lies.
"""
# Fast path: reject on header before we allocate anything.
cl = file.headers.get("content-length") if hasattr(file, "headers") else None
if cl:
try:
if int(cl) > max_bytes:
raise HTTPException(status_code=400, detail="File too large (max 10 MB)")
except ValueError:
pass
chunks: list[bytes] = []
total = 0
while True:
chunk = await file.read(64 * 1024)
if not chunk:
break
total += len(chunk)
if total > max_bytes:
raise HTTPException(status_code=400, detail="File too large (max 10 MB)")
chunks.append(chunk)
return b"".join(chunks)
def _check_same_origin(request: Request) -> None:
"""Reject cross-origin admin-write POSTs (CSRF defense).
Bearer tokens in ``localStorage`` plus cookie-less CORS mean a malicious
page cannot technically submit our Authorization header from a victim's
session, BUT browser extensions and misconfigured CORS policies routinely
break this assumption. For endpoints whose blast radius is restart/RCE-
equivalent (restore apply), we additionally require the request to come
from our own origin.
"""
host = request.headers.get("host", "").lower()
if not host:
raise HTTPException(status_code=400, detail="Missing Host header")
def _host_of(u: str | None) -> str:
if not u:
return ""
try:
return (urlparse(u).netloc or "").lower()
except Exception: # noqa: BLE001
return ""
origin_host = _host_of(request.headers.get("origin"))
referer_host = _host_of(request.headers.get("referer"))
# At least one of Origin/Referer must be present and match Host.
# Legitimate browser requests to this endpoint always ship Origin.
same = (origin_host and origin_host == host) or (referer_host and referer_host == host)
if not same:
raise HTTPException(
status_code=403,
detail="Cross-origin request rejected",
)
def _backup_dir():
return app_config.data_dir / "backups"
@@ -104,9 +174,7 @@ async def validate_config(
user: User = Depends(require_admin),
):
"""Validate a backup file without importing."""
content = await file.read()
if len(content) > MAX_UPLOAD_SIZE:
raise HTTPException(status_code=400, detail="File too large (max 10 MB)")
content = await _read_upload_bounded(file)
try:
raw = json.loads(content)
@@ -129,9 +197,7 @@ async def import_config(
session: AsyncSession = Depends(get_session),
):
"""Import configuration from a backup file."""
content = await file.read()
if len(content) > MAX_UPLOAD_SIZE:
raise HTTPException(status_code=400, detail="File too large (max 10 MB)")
content = await _read_upload_bounded(file)
try:
raw = json.loads(content)
@@ -167,6 +233,7 @@ async def _clear_pending_restore_markers(session: AsyncSession) -> None:
PENDING_RESTORE_CONFLICT_KEY,
PENDING_RESTORE_UPLOADED_AT_KEY,
PENDING_RESTORE_UPLOADED_BY_KEY,
PENDING_RESTORE_SHA256_KEY,
):
row = await session.get(AppSetting, key)
if row:
@@ -185,9 +252,7 @@ async def prepare_restore(
Validates the uploaded file, writes it to ``data/pending_restore.json``,
and persists marker settings so startup will apply it atomically.
"""
content = await file.read()
if len(content) > MAX_UPLOAD_SIZE:
raise HTTPException(status_code=400, detail="File too large (max 10 MB)")
content = await _read_upload_bounded(file)
try:
raw = json.loads(content)
@@ -205,15 +270,25 @@ async def prepare_restore(
pending_path.parent.mkdir(parents=True, exist_ok=True)
# Atomic write: write to tmp then rename, so a crash mid-write never
# leaves a truncated pending_restore.json that would break startup apply.
payload = json.dumps(raw).encode("utf-8")
digest = hashlib.sha256(payload).hexdigest()
tmp_path = pending_path.with_suffix(pending_path.suffix + ".tmp")
tmp_path.write_text(json.dumps(raw), encoding="utf-8")
tmp_path.write_bytes(payload)
os.replace(tmp_path, pending_path)
# Best-effort tighten perms so a non-root local user cannot swap the file
# for one they control between prepare and restart. On Windows this is a
# no-op; on POSIX we restrict to owner-only rw.
try:
os.chmod(pending_path, 0o600)
except OSError:
pass
now_iso = datetime.now(timezone.utc).isoformat()
await _set_app_setting(session, PENDING_RESTORE_PATH_KEY, str(pending_path))
await _set_app_setting(session, PENDING_RESTORE_CONFLICT_KEY, conflict_mode.value)
await _set_app_setting(session, PENDING_RESTORE_UPLOADED_AT_KEY, now_iso)
await _set_app_setting(session, PENDING_RESTORE_UPLOADED_BY_KEY, user.username)
await _set_app_setting(session, PENDING_RESTORE_SHA256_KEY, digest)
await session.commit()
return {
@@ -292,6 +367,7 @@ def _is_supervised() -> bool:
@router.post("/apply-restart")
async def apply_and_restart(
request: Request,
background_tasks: BackgroundTasks,
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
@@ -299,7 +375,11 @@ async def apply_and_restart(
"""Trigger a graceful exit so the supervisor respawns and applies the pending restore.
Only allowed when a pending restore is staged AND the process is supervised.
Requires same-origin Origin/Referer — this endpoint's blast radius is a
full config replace + restart, so an admin token alone (vulnerable to
XSS-driven CSRF) is not enough.
"""
_check_same_origin(request)
path_row = await session.get(AppSetting, PENDING_RESTORE_PATH_KEY)
if not path_row or not path_row.value:
raise HTTPException(status_code=409, detail="No pending restore to apply")